In [1]:
import re
from itertools import chain
from toolz import frequencies
from glob import iglob
from joblib import Parallel, delayed
import tqdm
import numpy as np

In [2]:
class PoemCleaner():
    def __init__(self):
        self.r = re.compile(r'[^\w{ }]|_+')

    def clean_poem(self, fp):
        with open(fp) as poem:
            no_punc = self.r.sub("", poem.read())
            return no_punc.lower().split()
        
def word_is_desired(w):
    return w in ["a","the"]

def word_ratio(d):
    ratio = float(d.get("a", 0)) / float(d.get("the", 0.0001))
    return ratio

def analyze_poems(poems, cleaner):
    cleaned_poems = chain(*map(cleaner.clean_poem, poems))
    desired_words = filter(word_is_desired, cleaned_poems)
    desired_word_freqs = frequencies(desired_words)
    return word_ratio(desired_word_freqs)

In [3]:
author_a_poems = iglob("input/author_a/*.txt")
author_b_poems = iglob("input/author_b/*.txt")
cleaner = PoemCleaner()
author_a_ratio = analyze_poems(author_a_poems, cleaner)
author_b_ratio = analyze_poems(author_b_poems, cleaner)

print(f'Original_Poem: 0.3 \n'
      f'Author A: {author_a_ratio:.2f} \n'
      f'Author B: {author_b_ratio:.2f}')

Original_Poem: 0.3 
Author A: 0.41 
Author B: 0.21
