In [91]:
import os
import math
import string
from collections import defaultdict

In [92]:
data_dir = "/Users/chukuemekaogudu/Documents/DataMining/inf553_ipynb/project3/"

In [93]:
stop_words_path = "/Volumes/oli2/inf533_datasets/stopwords"

In [94]:
def load_files(file_path):
    data = ""
    with open(file_path, "r") as file:
        while True:
            if not file.readline():
                break
            line = file.readline()
            
            line = line.split()
            line = " ".join(line)
            line = line.translate(str.maketrans('', '', string.punctuation))
            data += line
    return data

In [95]:
dante = load_files(os.path.join(data_dir, "Dantes-Inferno.txt"))

In [96]:
hamlet = load_files(os.path.join(data_dir, "hamlet.txt"))

In [97]:
total_word_count = len(dante.split()) + len(hamlet.split())

In [98]:
total_word_count

30238

In [99]:
stopwords = None
with open(stop_words_path, "rb") as file:
    stopwords = file.read()
    stopwords = stopwords.decode("utf-8").split()

In [100]:
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [101]:
def create_dict(doc, stopwords):
    word_dict = defaultdict(int)
    
    for word in doc.split():
        if word.lower() not in stopwords and not word.replace(".", "", 1).isdigit():
            word_dict[word.lower()] += 1
    return {k: v for k, v in sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)}

In [102]:
dante_dict = create_dict(dante, stopwords)

In [103]:
def get_top_items(word_dict, top):
    count = 0
    for k, v in word_dict.items():
        if count == top:
            break
        print(k, v)
        count += 1

In [104]:
get_top_items(dante_dict, 20)

thou 174
one 108
said 83
unto 59
upon 57
thy 49
master 44
thee 43
made 40
saw 34
still 34
turned 34
may 32
art 31
us 31
doth 28
shall 27
great 26
way 24
come 23


In [105]:
hamlet_dict = create_dict(hamlet, stopwords)

In [106]:
get_top_items(hamlet_dict, 20)

o 52
good 47
shall 47
lord 47
thou 43
thy 41
come 37
let 36
well 35
king 34
must 33
hath 32
us 32
would 31
th 31
like 30
sir 30
hamlet 29
ill 29
know 28


In [107]:
def compute_tf(word_dict, bow):
    tf_dict = {}
    bow_count = len(bow)
    for word, count in word_dict.items():
        tf_dict[word] = count / float(bow_count)
    return tf_dict

In [108]:
tf_dante = compute_tf(dante_dict, dante.split())
tf_hamlet = compute_tf(hamlet_dict, hamlet.split())

In [109]:
get_top_items(tf_dante, 20)

thou 0.010669610007358351
one 0.006622516556291391
said 0.005089526612705421
unto 0.0036178562668628894
upon 0.003495217071376012
thy 0.0030046602894285013
master 0.002698062300711307
thee 0.0026367427029678685
made 0.0024527839097375523
saw 0.0020848663232769194
still 0.0020848663232769194
turned 0.0020848663232769194
may 0.0019622271277900416
art 0.0019009075300466028
us 0.0019009075300466028
doth 0.0017169487368162864
shall 0.0016556291390728477
great 0.001594309541329409
way 0.0014716703458425313
come 0.0014103507480990925


In [110]:
def compute_df(doc_list):
    import math
    idf_dict = {}
    N = len(doc_list)
    
    # count the number of documents that contain a word w
    idf_dict = defaultdict(int)
    for doc in doc_list:
        for word, count in doc.items():
            if count > 0:
                idf_dict[word] += 1
                
    for word, val in idf_dict.items():
        idf_dict[word] = math.log(N / float(val))
    return idf_dict

In [111]:
idf_dict = compute_df([dante_dict, hamlet_dict])

In [112]:
def compute_tfidf(tf_dict, idf_dict):
    tfidf = {}
    for word, val in tf_dict.items():
        tfidf[word] = val * idf_dict[word]
    return tfidf

In [113]:
dante_tfidf = compute_tfidf(tf_dante, idf_dict)
hamlet_tfidf = compute_tfidf(tf_hamlet, idf_dict)

In [121]:
def filter_tfidf(tfidf_dict, top):
    tfidf = {k: v for k, v in sorted(tfidf_dict.items(), key=lambda kv: kv[1], reverse=True)}
    filtered = {}
    count = 0
    
    for k, v in tfidf.items():
        if count == top:
            break
        if v > 0.0:
            filtered[k] = v
            count += 1
    return filtered

In [130]:
dante_tfidf = filter_tfidf(dante_tfidf, 30)
hamlet_tfidf = filter_tfidf(hamlet_tfidf, 30)

In [131]:
import pandas as pd

In [132]:
pd.DataFrame([dante_tfidf])

Unnamed: 0,unto,master,turned,beheld,canto,seemed,downward,guide,side,forth,among,began,leader,farther,wholly,high,behold,near,bottom,city
0,0.002508,0.00187,0.001445,0.00085,0.000723,0.000638,0.000638,0.000638,0.000595,0.000595,0.000553,0.00051,0.000468,0.000468,0.000468,0.000425,0.000425,0.000425,0.000425,0.000383


In [133]:
pd.DataFrame([hamlet_tfidf])

Unnamed: 0,th,sir,hamlet,ay,nay,play,queen,dear,horatio,laertes,rosencrantz,thats,theres,mother,polonius,players,castleenter,welcome,young,room
0,0.001543,0.001493,0.001443,0.000896,0.000796,0.000796,0.000746,0.000697,0.000647,0.000647,0.000647,0.000547,0.000547,0.000498,0.000498,0.000498,0.000448,0.000448,0.000448,0.000448
