In [1]:
%config IPCompleter.greedy=True

In [2]:
import csv
from string import punctuation
from itertools import groupby
from functools import reduce

In [3]:
with open('./data/stopwords.txt', encoding='UTF-8') as f:
    stopwords = [word for line in f for word in line.split()]

In [4]:
def process_words(words, stopwords):
    filtered_words = [word.lower().translate(str.maketrans('','', punctuation)) for word in words]
    filtered_words = [word for word in filtered_words if word not in stopwords] # add stemming
    return filtered_words

def count_words(words):
    def add_to_dict(acc, word):
        if word not in acc or not word:
            acc[word] = 1
        else:
            acc[word] += 1

        return acc

    words_count = reduce(lambda acc, word: add_to_dict(acc, word), words, {})
    words_count = [(word, count) for word, count in words_count.items()]
    words_count.sort(key=(lambda pair: pair[1]), reverse=True)
    
    return words_count 

def to_csv(file_dir, words_count):
    with open(file_dir, 'w') as f:
        writer = csv.writer(f)
        for word, count in words_count:
            writer.writerow([count, word])

# Excercise 5

In [5]:
with open('./results/cobc.txt', encoding='UTF-8') as f:
    words = [word for line in f for word in line.split()]

filtered_words = process_words(words, stopwords)
words_count = count_words(filtered_words)
to_csv('data/results.csv', words_count)

# Excercise 6

In [37]:
with open('./data/cobc.txt', encoding='UTF-8') as f:
    chapters = []
    for key, group in groupby(f, lambda line: line.startswith('Chapter')):
        if not key:
            group = list(group)
            chapter = [word for line in group for word in line.split()]
            chapters.append(chapter)

In [40]:
chapters = [process_words(chapter, stopwords) for chapter in chapters]

['juniper',
 'men',
 'born',
 'condemned',
 'wise',
 'say',
 'suckle',
 'breast',
 'death',
 'bow',
 'silent',
 'monarch',
 'lord',
 'shadow',
 'lifts',
 'finger',
 'feather',
 'flutters',
 'earth',
 'reason',
 'song',
 'good',
 'go',
 'young',
 'wicked',
 'prosper',
 'king',
 'chaos',
 'lords',
 'breath',
 'stills',
 'souls',
 'found',
 'city',
 'dedicated',
 'worship',
 'long',
 'ago',
 'old',
 'lost',
 'dedication',
 'dark',
 'majesty',
 'godhead',
 'frayed',
 'forgotten',
 'stand',
 'shadow',
 'juniper',
 'faced',
 'immediate',
 'fear',
 'specter',
 'yesteryear',
 'leaking',
 'present',
 'upon',
 'height',
 'overlooking',
 'city',
 'black',
 'company',
 'went',
 'strange',
 'city',
 'far',
 'beyond',
 'bounds',
 'ladys',
 'empire',
 '',
 '',
 '',
 'beginning',
 'beginning',
 'far',
 'away',
 'two',
 'old',
 'friends',
 'handful',
 'men',
 'would',
 'meet',
 'later',
 'stood',
 'nosetonose',
 'shadow']