### A notebook for implementing and applying TF-IDF to U.S. Presidential Inaugural addresses.

### Note that no stopword removal, stemming, or lemmatization is applied.

In [1]:
import math
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.corpus import inaugural
from nltk import FreqDist

#### Process inaugural addresses into a dataframe.

In [2]:
# Build a document-by-term count matrix for the inaugural corpus.
# doc_length maps each file id to its token count (used later to normalise TF).
doc_length = {}
# one {token: count} dictionary per document, in corpus order
term_counts = []
fileids = list(inaugural.fileids())
for fileid in fileids:
    # read the tokens once and reuse them for both the length and the counts
    tokens = list(inaugural.words(fileid))
    doc_length[fileid] = len(tokens)
    term_counts.append(dict(FreqDist(tokens)))
# Assemble the frame in a single step: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every iteration.
df = pd.DataFrame(term_counts, index=fileids)
# keep the vocabulary columns in a deterministic (sorted) order
df = df.sort_index(axis=1)
# a term that is absent from a document has a count of 0
df = df.fillna(0)
# check df size
print(df.shape)

(58, 9913)


#### Implement TF and IDF

In [3]:
def computetf(wordvec, doclen):
    """Compute term frequencies for a single document.

    Args:
        wordvec: a series holding, for one document, the raw count of
            every term in the vocabulary.
        doclen: the length of that document.

    Returns:
        A series of term frequencies: each raw count divided by ``doclen``.
    """
    term_frequencies = wordvec / doclen
    return term_frequencies

def computeidf(df):
    """Compute the inverse document frequency of every term.

    Args:
        df: document-by-term (row-by-column) dataframe of term counts.

    Returns:
        A dictionary keyed by vocabulary term whose values are
        ``log(number of documents / number of documents containing the term)``.
    """
    n_docs = df.shape[0]
    terms = df.columns.values
    # number of documents (rows) in which each term has a positive count
    doc_freqs = (df > 0).sum(axis=0).to_numpy()
    # take the log of the ratio of total documents to documents containing the term
    return {
        term: math.log(n_docs / freq)
        for term, freq in zip(terms, doc_freqs)
    }

#### Apply TF-IDF.

In [4]:
# Build newdf, a document-by-term dataframe of TF-IDF values.
# compute idf (one weight per vocabulary term)
idfdict = computeidf(df)
cols = df.columns
# idf weights as a series aligned with the term columns of df
idf_weights = pd.Series(idfdict, index=cols)
# compute tf for every document; each row is normalised by its own length
tf_rows = [computetf(row, doc_length[index]) for index, row in df.iterrows()]
# Assemble all rows at once (DataFrame.append was removed in pandas 2.0) and
# scale every term column by its idf in one column-aligned multiplication
# instead of a Python loop over each cell.
newdf = pd.DataFrame(tf_rows).mul(idf_weights, axis="columns")

In [5]:
# Preview the TF-IDF dataframe: head() with no argument returns its first
# 5 rows (one row per document, one column per vocabulary term).
newdf.head()

Unnamed: 0,!,"""",""";","""?",$,',(,),"),",",",...,youthful,zeal,zealous,zealously,zone,,¡,¡¦,¡§,¡¨¡
1789-Washington.txt,0.0,0.00031,0.0,0.0,0.0,0.0,0.001143,0.001288,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1793-Washington.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.011958,0.013476,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1797-Adams.txt,0.0,0.0,0.0,0.0,0.0,0.000116,0.00068,0.000766,0.0,0.0,...,0.0,0.000766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1801-Jefferson.txt,0.0,0.0,0.0,0.0,0.0,0.000309,0.0,0.0,0.0,0.0,...,0.0,0.001024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1805-Jefferson.txt,0.0,0.0,0.0,0.0,0.0,0.000251,0.0,0.0,0.0,0.0,...,0.0,0.002493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
