In [17]:
# Sets IPython's IPCompleter.greedy option (tab completion); presumably an interactive
# convenience only, with no effect on the analysis - TODO confirm.
%config IPCompleter.greedy=True

In [18]:
import csv
import math
import random
from collections import Counter
from functools import reduce
from itertools import groupby
from string import punctuation

In [19]:
# Read the stop-word list: every whitespace-separated token in the file is one entry.
with open('./data/stopwords.txt', encoding='UTF-8') as f:
    stopwords = f.read().split()

In [20]:
def process_words(words, stopwords):
    """Normalise raw tokens and drop the ones that carry no content.

    Each token is lower-cased and stripped of every character listed in
    ``string.punctuation``. A token is then discarded when it is a stop word
    or when nothing is left of it (it consisted of punctuation only).

    Args:
        words: iterable of raw string tokens.
        stopwords: iterable of stop words, compared against the normalised token.

    Returns:
        A list of normalised tokens in their original order.
    """
    # Built once per call instead of once per token.
    strip_punctuation = str.maketrans('', '', punctuation)
    # Set membership is O(1); the stop-word list was scanned linearly for every token.
    stopword_set = set(stopwords)

    filtered_words = []
    for word in words:
        normalised = word.lower().translate(strip_punctuation)
        # Punctuation-only tokens such as '--' collapse to '' and are not words.
        if normalised and normalised not in stopword_set:
            filtered_words.append(normalised)  # TODO: add stemming

    return filtered_words

def count_words(words):
    """Count how often each word occurs.

    Empty tokens are ignored. The previous implementation reset the count of
    the empty token to 1 on every occurrence, so it was reported with a
    meaningless count.

    Args:
        words: iterable of hashable tokens (strings in this notebook).

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count; words
        with equal counts keep the order in which they first appeared.
    """
    return Counter(word for word in words if word).most_common()

def to_csv(file_dir, words_count):
    """Write ``(word, count)`` pairs to a CSV file, one ``count,word`` row per pair.

    Args:
        file_dir: path of the file to create or overwrite.
        words_count: iterable of ``(word, count)`` pairs; note that the columns
            are written in the opposite order (count first, then word).
    """
    # newline='' is what the csv module requires so that it controls line endings
    # itself (otherwise rows are separated by blank lines on Windows). The encoding
    # is pinned to UTF-8 to match how the input texts are read.
    with open(file_dir, 'w', newline='', encoding='UTF-8') as f:
        csv.writer(f).writerows([count, word] for word, count in words_count)

# Exercise 5

In [21]:
# Tokenise the whole book on whitespace, then clean, count and export the tokens.
with open('./data/cobc.txt', encoding='UTF-8') as f:
    words_list = f.read().split()

filtered_words = process_words(words_list, stopwords)
words_count = count_words(filtered_words)
to_csv('./results/words_count.csv', words_count)

In [22]:
def tf(term, doc):
    """Term frequency: the raw number of times ``term`` occurs in ``doc``.

    ``doc`` must provide a ``count`` method; this notebook passes lists of tokens.
    """
    occurrences = doc.count(term)
    return occurrences

def idf(term, docs):
    """Inverse document frequency of ``term`` over ``docs``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of documents and
    ``n`` the number of documents containing the term. Because of the ``+ 1``,
    a term present in every document gets a slightly negative value.

    Raises:
        ValueError: if ``docs`` is empty (logarithm of zero).
    """
    total = len(docs)
    containing = sum(1 for doc in docs if doc.count(term) > 0)

    return math.log(total / (1.0 + containing))

def tfidf(term, doc, docs):
    """TF-IDF score of ``term`` in ``doc`` relative to the collection ``docs``."""
    frequency = tf(term, doc)
    rarity = idf(term, docs)
    return frequency * rarity

def to_sorted_tuples(data):
    """Return the items of ``data`` as ``(key, value)`` tuples, largest value first.

    Items with equal values keep the mapping's own iteration order.
    """
    return sorted(data.items(), key=lambda item: item[1], reverse=True)

def _document_frequencies(docs):
    """Count, for every term, the number of documents in ``docs`` that contain it."""
    return Counter(term for doc in docs for term in set(doc))

def compute_tfidfs(doc, docs, doc_freqs=None):
    """Compute the TF-IDF score of every distinct term of ``doc``.

    The score is ``count_in_doc * log(N / (1 + n))``, the same formula that
    ``tf`` and ``idf`` implement; keep the two in sync. Counting is done in a
    single pass here, because calling ``tfidf`` once per token occurrence
    rescans the document and the whole collection each time.

    Args:
        doc: sequence of hashable tokens.
        docs: the collection of documents the scores are relative to.
        doc_freqs: optional precomputed mapping term -> number of documents
            containing it; computed from ``docs`` when omitted.

    Returns:
        A dict mapping each term of ``doc`` to its score, keyed in order of
        first occurrence in ``doc``.

    Raises:
        ValueError: if ``docs`` is empty while ``doc`` is not (logarithm of zero).
    """
    if doc_freqs is None:
        doc_freqs = _document_frequencies(docs)

    docs_count = len(docs)

    return {
        term: count * math.log(docs_count / (1.0 + doc_freqs.get(term, 0)))
        for term, count in Counter(doc).items()
    }

def compute_tfidfs_for_docs(docs):
    """Return one term -> TF-IDF dict per document of ``docs``, in the same order."""
    # Document frequencies depend only on the collection, so compute them once.
    doc_freqs = _document_frequencies(docs)

    return [compute_tfidfs(doc, docs, doc_freqs) for doc in docs]

In [23]:
with open('./data/cobc.txt', encoding='UTF-8') as f:
    chapters = []
    # groupby yields alternating runs of heading lines (starting with 'Chapter')
    # and body lines; only the body runs are kept, one token list per run.
    # Any text before the first heading therefore becomes the first entry.
    for is_heading, lines in groupby(f, lambda line: line.startswith('Chapter')):
        if is_heading:
            continue
        chapters.append([word for line in lines for word in line.split()])

In [24]:
# Clean the tokens of each chapter; only the first 15 chapters are kept for now.
# TODO: remove the [0:15] limit
chapters = [process_words(chapter, stopwords) for chapter in chapters[0:15]]

# Exercise 6

In [25]:
# One term -> TF-IDF dict per chapter, each scored against the whole chapter list.
chapters_tfidfs = compute_tfidfs_for_docs(chapters)

In [26]:
# One CSV per chapter: terms by descending TF-IDF, scores scaled by 1000 and
# truncated to int; empty terms and non-positive scores are left out.
for idx, tfidfs in enumerate(chapters_tfidfs):
    tupled = [
        (word, int(count * 1000))
        for word, count in to_sorted_tuples(tfidfs)
        if word and count > 0
    ]

    file_dir = './results/tfidfs_ch_{no}.csv'.format(no=idx)
    to_csv(file_dir, tupled)

In [27]:
# Sum each term's TF-IDF over all chapters, then export the totals.
merged = {}

for tfidfs in chapters_tfidfs:
    # Walk the terms in descending-score order: that fixes the insertion order
    # of `merged`, which in turn decides how ties are ordered in the final sort.
    for word, score in to_sorted_tuples(tfidfs):
        if word in merged:
            merged[word] += score
        else:
            merged[word] = score

# Totals are scaled by 100 and truncated to int; empty terms and non-positive totals are dropped.
tupled = [
    (word, int(count * 100))
    for word, count in to_sorted_tuples(merged)
    if word and count > 0
]
to_csv('./results/tfidfs.csv', tupled)

# Exercise 7

In [28]:
def match_chapters(term, docs):
    """Rank the documents of ``docs`` by the TF-IDF score of ``term``.

    Only the queried term is scored: its inverse document frequency is computed
    once and multiplied by its count in each document. Computing the TF-IDF of
    every term of every document, as before, is not needed to rank one term and
    yields the same ordering.

    Args:
        term: the (already normalised) term to look for.
        docs: sequence of documents, each a sequence of tokens.

    Returns:
        The indices of ``docs`` ordered from best to worst match. Documents with
        equal scores (in particular those not containing the term) keep their
        original relative order.
    """
    if not docs:
        # idf() is undefined for an empty collection; there is nothing to rank.
        return []

    term_idf = idf(term, docs)
    scores = [tf(term, doc) * term_idf for doc in docs]

    # sorted() is stable, also with reverse=True, so ties stay in index order.
    return sorted(range(len(docs)), key=lambda no: scores[no], reverse=True)

In [29]:
# Rank the chapters by the TF-IDF score of the term 'juniper' (best match first).
match_chapters('juniper', chapters)

[14, 3, 12, 11, 13, 1, 5, 7, 9, 0, 2, 4, 6, 8, 10]

# Exercise 8

In [30]:
def get_successors(words, top_n=5):
    """Map every non-empty word to its most frequent immediate successors.

    The token list is walked once, counting for each word which token follows
    it. This replaces a full rescan of ``words`` for every distinct word.

    Args:
        words: sequence of tokens in reading order.
        top_n: maximum number of successors kept per word (default 5).

    Returns:
        A dict ``word -> [successor, ...]`` with the successors ordered by
        descending frequency; equally frequent successors keep the order in
        which they were first seen after the word. Keys appear in order of
        first occurrence in ``words``. A word that only occurs as the very last
        token maps to an empty list. Empty tokens are never keys, but they can
        appear as successors.
    """
    follower_counts = {}
    last_idx = len(words) - 1

    for idx, word in enumerate(words):
        if not word:
            continue

        counts = follower_counts.get(word)
        if counts is None:
            counts = follower_counts[word] = Counter()

        # The final token has nothing after it, but must still get an entry.
        if idx < last_idx:
            counts[words[idx + 1]] += 1

    return {
        word: [successor for successor, _ in counts.most_common(top_n)]
        for word, counts in follower_counts.items()
    }

def create_random_paragraph(words, successors):
    """Build a random paragraph out of "word successor" pairs.

    The paragraph has 50-150 sentences. Each sentence is made of 3-10 pairs,
    every pair being a randomly chosen word followed by one of its randomly
    chosen successors. Sentences are capitalised and terminated with '. '.

    Args:
        words: unused; kept so existing calls keep working.
        successors: dict ``word -> list of successors``.

    Returns:
        The paragraph as a string, or '' when no word has any successor.
    """
    # Words without successors (e.g. the last token of the text) cannot form a
    # pair; drawing one used to raise ValueError from random.randint(0, -1).
    candidates = [(word, followers) for word, followers in successors.items() if followers]
    if not candidates:
        return ''

    sentences = []
    for _ in range(random.randint(50, 150)):
        pairs = []
        for _ in range(random.randint(3, 10)):
            word, followers = random.choice(candidates)
            successor = random.choice(followers)
            pairs.append('{word} {successor}'.format(word=word, successor=successor))

        # strip() removes the trailing blank left when the last successor is ''.
        sentences.append(' '.join(pairs).strip().capitalize() + '. ')

    return ''.join(sentences)

In [31]:
# Successor table built from the cleaned tokens of the whole book.
# NOTE: the variable is spelled 'succesors' and is read under that exact name in the next cell.
succesors = get_successors(filtered_words)

In [32]:
# Generate first, so that a failure during generation cannot leave a truncated file behind.
paragraph = create_random_paragraph(filtered_words, succesors)

# Written as UTF-8, matching the encoding used to read the source text.
with open('./results/random_paragraph.txt', 'w', encoding='UTF-8') as f:
    f.write(paragraph)