In [1]:
import nltk
from nltk import ngrams, word_tokenize, RegexpTokenizer, FreqDist, sent_tokenize
from pprint import pprint
import numpy as np
import pandas as pd
from random import random
import math

In [2]:
## books liscensces removed
books = ["SHolmes.txt","Frankenstein.txt","Dracula.txt", "TheYellowWallpaper.txt"]
#books = ["Frankenstein.txt","Dracula.txt", "TheYellowWallpaper.txt"]
#books = ["SHolmes.txt","Frankenstein.txt","Dracula.txt"]

raws = []
for book in books:
    f = open(book,"r")
    raws.append(f.read())
    f.close()
## make lowercase
raws = [raw.lower() for raw in raws]

## separate into sentences

sent_texts = [sent_tokenize(raw) for raw in raws]

## tokenize sentence strings to lists of tokens

tokenizer = RegexpTokenizer(r'[a-z][a-z\']*').tokenize
texts = []
print(len(sent_texts))

for i in range(len(sent_texts)):
    texts.append( [tokenizer(sent) for sent in sent_texts[i]] )
    
## make lexicon
book_lexicon = []
for i in range(len(texts)):
    flat = [word for sent in texts[i] for word in sent]
    book_lexicon.append(sorted(set(flat)))

## Tokens per book 
flats = [[T for s in text for T in s] for text in texts]

4


In [None]:
def bigramCounts (sents, lexicon):
    prev = ['start0', 'unk1'] + (lexicon)
    curr = ['end1', 'unk1'] + lexicon    

    counts = pd.DataFrame(0.0, prev, curr)
    for sent in sents:
        len_sent = len(sent)
        
        if (len_sent < 2):
            continue
        
        # count start marker
        counts.at['start0',sent[0]] += 10
      
        for i in range(len_sent - 1):
            counts.at[sent[i],sent[i+1]] += 1 
            
        # for end marker
        counts.at[sent[len_sent - 1], 'end1'] += 1
    return counts

In [3]:
## ngram lists and freqs
def bigramList (sents):
    listBigram = []
    
    for sent in sents:
        len_sent = len(sent)
        if (len_sent == 0):
            continue
        
        # count start marker
        listBigram.append(('start0',sent[0]))
        
        for i in range(len_sent - 1):
            listBigram.append((sent[i],sent[i+1]))
            
        # for end marker
        listBigram.append((sent[len_sent - 2],'end1'))
    return listBigram
def trigramList (sents):
    
    listBigram = []
    
    for sent in sents:
        len_sent = len(sent)
        if (len_sent < 2):
            continue
        
        # count start marker
        listBigram.append(('start0',sent[0],sent[1]))
        
        for i in range(len_sent - 2):
            listBigram.append((sent[i],sent[i+1],sent[i+2]))
            
        # for end marker
        listBigram.append((sent[len_sent - 3], sent[len_sent - 2],'end1'))
    return listBigram
def freq_bigrams(sents, min_num):
    listBigram = bigramList(sents)
    fdist = FreqDist(listBigram)
    big_bgram = [ k for k,v in fdist.items() if v>min_num]
    return big_bgram
def freq_trigrams(sents, min_num):
    listBigram = trigramList(sents)
    fdist = FreqDist(listBigram)
    big_bgram = [ k for k,v in fdist.items() if v>min_num]
    return big_bgram

In [None]:
### similarity measures
## JacardSimilarity
def jaccard_similarity (doc1, doc2):
    s1 = set(doc1)
    s2 = set(doc2)
    inter = s1.intersection(s2)
    union = s1.union(s2)
    return len(inter) / len(union)

In [None]:
### find cosine similarity with ngrams
## make list of trigrams for each book
## FreqDist list for each book and combined list
    # Term frequency calculated 
## calculate inverse document frequency from combined list

In [7]:
def sublinear_tf (fdist):
    return {k: (1+ math.log(v)) for k,v in fdist.items()}

def inv_doc_freq (ngs):
    idfs = {}
    ## make lexicon of n-grams for each book
    lexs = [set(ngL) for ngL in ngs]
    num_books = len(ngs)
    lex_comb = set().union(*lexs)
    for ngram in lex_comb:
        num_matches = sum([ngram in lex for lex in lexs])
        idfs[ngram] = 1 + math.log(num_books / num_matches )
    #print(idfs.keys())
    return idfs

def tfidf(ngs):
    idfs = inv_doc_freq(ngs)
    tfidf_books = []
      
    for ngL in ngs:
        book_tfidf = []
        fdist = FreqDist(ngL)
        tfDict = sublinear_tf(fdist)
        
        for term in idfs.keys():
            tf = tfDict.get(term, 0)
            book_tfidf.append(tf * idfs.get(term, 0))
            
        tfidf_books.append(book_tfidf)
    return tfidf_books

def cosine_similarity(vector1, vector2):
    dot_product = sum(p*q for p,q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product/magnitude

In [9]:
ngs = [bigramList(text) for text in texts]
rep_tfidf = tfidf(ngs)
means = [np.mean(x) for x in rep_tfidf]
#[print(mean) for mean in means]

tfidf_comparisons = []
for i, doc_0 in enumerate(rep_tfidf):
    for j, doc_1 in enumerate(rep_tfidf):
        if (j >= i):
            tfidf_comparisons.append((round(cosine_similarity(doc_0, doc_1),4), books[i], books[j]))

for x in sorted(tfidf_comparisons, reverse=True):
    if (x[0] != 1.0):
        print(x)
    


(0.245, 'SHolmes.txt', 'Dracula.txt')
(0.1804, 'Frankenstein.txt', 'Dracula.txt')
(0.1705, 'SHolmes.txt', 'Frankenstein.txt')
(0.1187, 'Dracula.txt', 'TheYellowWallpaper.txt')
(0.1111, 'SHolmes.txt', 'TheYellowWallpaper.txt')
(0.0803, 'Frankenstein.txt', 'TheYellowWallpaper.txt')


In [None]:
tf(texts[0])

In [None]:
counts = [bigramCounts(texts[i], book_lexicon[i]) for i in range(3)]
counts_smooth = [count.add(0.001) for count in counts]

#frank_counts = bigramCounts(texts[0], book_lexicon[0])
#frank_counts_smooth = frank_counts.add(0.001)

In [None]:
counts_smooth[0].head()


In [None]:
probs = [count.div(count.sum(axis=1), axis=0) for count in counts_smooth]
#prob = frank_counts_smooth.div(frank_counts_smooth.sum(axis=1), axis=0)
counts_smooth[0].sum(axis=1).sort_values(ascending=False)


In [None]:
def choose_word(prob, prev_word):
    P = prob.loc[prev_word,:].as_matrix()
    x = random()
    for i in range(len(P)):
        x -= P[i]
        if x <= 0:
            word = list(prob)[i]
            break;
    return word

def make_sentence(prob):
    sentence = ""
    word = choose_word(prob, "start0")
    
    while (word != "end1"):
        sentence += word + " "
        word = choose_word(prob, word)
    return sentence

def clean(prob, sent):
    for i in range(len(sent)):
        if sent[i] not in prob:
            sent[i] = 'unk1'
    return sent
    
def prob_sentence(prob, sent):
    sent = clean(set(prob), sent)
    P = 1
    if len(sent) < 2:
        return 1
    
    P *= prob.at['start0',sent[0]]
    
    len_sent = len(sent)
    for i in range(min(len_sent - 1, 5)):
        P *= prob.at[sent[i],sent[i+1]]
    #P *= prob.at[sent[len_sent - 1], 'end1']
    return P
        

In [None]:
sorted([(choose_word(prob, "all")) for x in range(100)])
make_sentence(probs[0])
make_sentence(probs[2])

In [None]:
print(prob_sentence(probs[0], texts[0][80]))
print(prob_sentence(probs[0], texts[2][83]))

In [None]:
## all probs
xx = [prob_sentence(probs[0], sents) for sents in texts[0]]
xy = [prob_sentence(probs[0], sents) for sents in texts[1]]
xz = [prob_sentence(probs[0], sents) for sents in texts[2]]

yx = [prob_sentence(probs[1], sents) for sents in texts[0]]
yy = [prob_sentence(probs[1], sents) for sents in texts[1]]
yz = [prob_sentence(probs[1], sents) for sents in texts[2]]

zx = [prob_sentence(probs[2], sents) for sents in texts[0]]
zy = [prob_sentence(probs[2], sents) for sents in texts[1]]
zz = [prob_sentence(probs[2], sents) for sents in texts[2]]


In [None]:


    
a_b = jaccard_similarity(flats[0],flats[1])
b_c = jaccard_similarity(flats[1],flats[2])
a_c = jaccard_similarity(flats[0],flats[2])
print(round(a_b, 3), round(b_c, 3), round(a_c, 3))

In [None]:
print(np.mean(xx), np.mean(xy), np.mean(xz))
print(np.mean(yx), np.mean(yy), np.mean(yz))
print(np.mean(zx), np.mean(zy), np.mean(zz))

In [None]:
print(np.mean([len(sent) for text in texts for sent in text ]))
for text in texts:
    print(np.mean([len(sent) for sent in text ]))

In [None]:
print(probs[0].at['by','sat'])
print(probs[1].at['by','sat'])
print(probs[2].at['by','sat'])



In [None]:
probs[0].sum(axis=0).sort_values(ascending=False)

pd.DataFrame(random(), book_lexicon[2],)

In [None]:
frank_common = freq_trigrams(texts[0] ,book_lexicon[0], 6)
drac_common = freq_trigrams(texts[1] ,book_lexicon[1], 6)
yellow_common = freq_trigrams(texts[2] ,book_lexicon[2], 3)

In [None]:
print(len(frank_common),len(drac_common),len(yellow_common))

FnD = set(frank_common).intersection(drac_common)
FnY = set(frank_common).intersection(yellow_common)
DnY = set(drac_common).intersection(yellow_common)

print(len(FnD),len(FnY),len(DnY))