In [1]:
# Import all packages
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import math
import numpy as np

In [2]:
# Defining read function
def readArticle(doc):
    """Read a text file and return its lemmatized, stopword-free tokens.

    The file is decoded as UTF-8, lower-cased and split into runs of word
    characters. Each token is lemmatized with WordNet, and lemmas that
    appear in NLTK's English stopword list are dropped.

    Args:
        doc: Path of the text file to read.

    Returns:
        list[str]: Lemmatized tokens in document order, duplicates kept.
    """
    # The context manager closes the file handle even if read() raises;
    # previously the handle was never closed.
    with open(doc, 'r', encoding='utf-8') as f:
        data = f.read()

    # A set gives constant-time membership tests in the filter below.
    stopword = set(stopwords.words('english'))

    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    # Lemmatize first, then filter, so the stopword check sees the lemma
    # rather than the raw token.
    lemmas = (lemmatizer.lemmatize(word) for word in tokenizer.tokenize(data.lower()))
    return [lemma for lemma in lemmas if lemma not in stopword]

In [3]:
# Load and preprocess the four articles (tokenize, lemmatize, drop stopwords)
article1, article2, article3, article4 = map(
    readArticle,
    ('./article1.txt', './article2.txt', './article3.txt', './article4.txt'),
)

In [4]:
# Vocabulary: every distinct token that occurs in at least one article
allWords = list(set().union(article1, article2, article3, article4))

In [5]:
# Sort each token list and the vocabulary alphabetically, in place
for word_list in (article1, article2, article3, article4, allWords):
    word_list.sort()

In [6]:
# Bag of words
def bow(article, vocabulary=None):
    """Build a binary bag-of-words vector for one article.

    Args:
        article: Iterable of hashable tokens belonging to the article.
        vocabulary: Ordered words to test for. Defaults to the notebook-level
            ``allWords`` list, so existing ``bow(article)`` calls keep
            working unchanged.

    Returns:
        list[int]: One entry per vocabulary word, 1 if the word occurs in
        ``article`` and 0 otherwise.
    """
    if vocabulary is None:
        vocabulary = allWords

    # Build the set once so each lookup is O(1) instead of rescanning the
    # article's token list for every vocabulary word.
    present = set(article)
    return [1 if word in present else 0 for word in vocabulary]

In [7]:
# Binary presence vectors, one per article, aligned with allWords
bagofOfWords1 = bow(article1)
bagofOfWords2 = bow(article2)
bagofOfWords3 = bow(article3)
bagofOfWords4 = bow(article4)

# One row per vocabulary word, one column per article title
df = pd.DataFrame({
    'words': allWords,
    'Six Years And Counting...': bagofOfWords1,
    'What Dreams May Come': bagofOfWords2,
    'Getting Saucy About Food': bagofOfWords3,
    'Train to Nowhere': bagofOfWords4,
})

In [8]:
# Display the bag-of-words table (one row per vocabulary word, one column per article)
df

Unnamed: 0,words,Six Years And Counting...,What Dreams May Come,Getting Saucy About Food,Train to Nowhere
0,0,0,0,1,0
1,10,0,0,1,0
2,13,0,0,0,1
3,1990s,0,0,0,1
4,2,0,0,1,0
...,...,...,...,...,...
822,year,1,1,1,1
823,yes,0,0,1,0
824,yet,0,0,0,1
825,york,1,1,0,0


In [9]:
# Compute counts
def counts(article):
    """Tally how many times each token occurs in ``article``.

    Returns a plain dict mapping token -> number of occurrences, with keys
    in first-seen order.
    """
    tally = {}
    for token in article:
        tally[token] = tally.get(token, 0) + 1
    return tally

In [10]:
# Per-article token frequency tables
count1, count2, count3, count4 = (
    counts(tokens) for tokens in (article1, article2, article3, article4)
)

In [11]:
# TF
def computeTF(dic, words, allWords):
    """Compute term frequencies for one article over a vocabulary.

    Args:
        dic: Mapping of token -> occurrence count for the article.
        words: The article's token list; its length is the denominator.
        allWords: Ordered vocabulary to produce values for.

    Returns:
        list: For each vocabulary word, ``count / len(words)`` if the word
        is a key of ``dic``, otherwise the integer 0.
    """
    total = float(len(words))
    return [dic[term] / total if term in dic else 0 for term in allWords]

In [12]:
# Term-frequency table: one row per vocabulary word, one column per article
tf_df = pd.DataFrame({
    'words': allWords,
    'Six Years And Counting...': computeTF(count1, article1, allWords),
    'What Dreams May Come': computeTF(count2, article2, allWords),
    'Getting Saucy About Food': computeTF(count3, article3, allWords),
    'Train to Nowhere': computeTF(count4, article4, allWords),
})

tf_df

Unnamed: 0,words,Six Years And Counting...,What Dreams May Come,Getting Saucy About Food,Train to Nowhere
0,0,0.000000,0.000000,0.003509,0.000000
1,10,0.000000,0.000000,0.007018,0.000000
2,13,0.000000,0.000000,0.000000,0.003953
3,1990s,0.000000,0.000000,0.000000,0.003953
4,2,0.000000,0.000000,0.003509,0.000000
...,...,...,...,...,...
822,year,0.007874,0.008230,0.003509,0.003953
823,yes,0.000000,0.000000,0.003509,0.000000
824,yet,0.000000,0.000000,0.000000,0.003953
825,york,0.003937,0.004115,0.000000,0.000000


In [13]:
# IDF: ln(1 + N / df), where N is the number of documents and df is the
# number of documents that contain the word
idf = []

documents = [article1, article2, article3, article4]
# Take the corpus size from the list itself rather than a hard-coded 4, so
# the formula stays correct if articles are added or removed.
n_documents = len(documents)
# Sets make each membership test O(1) instead of scanning a token list.
document_sets = [set(tokens) for tokens in documents]

for word in allWords:
    # Document frequency: how many articles contain this word.
    count = sum(word in tokens for tokens in document_sets)
    # count is at least 1 here because allWords is built from these same
    # articles, so the division cannot hit zero.
    idf.append(math.log(1 + n_documents / count))

idf_df = pd.DataFrame({'words': allWords, 'IDF': idf})

idf_df

Unnamed: 0,words,IDF
0,0,1.609438
1,10,1.609438
2,13,1.609438
3,1990s,1.609438
4,2,1.609438
...,...,...
822,year,0.693147
823,yes,1.609438
824,yet,1.609438
825,york,1.098612


In [14]:
# TF-IDF: scale every article's term-frequency column by the shared IDF vector
idf_values = np.array(idf_df['IDF'])

tfidf = pd.DataFrame({
    'words': allWords,
    **{
        title: np.array(tf_df[title]) * idf_values
        for title in (
            'Six Years And Counting...',
            'What Dreams May Come',
            'Getting Saucy About Food',
            'Train to Nowhere',
        )
    },
})

tfidf

Unnamed: 0,words,Six Years And Counting...,What Dreams May Come,Getting Saucy About Food,Train to Nowhere
0,0,0.000000,0.000000,0.005647,0.000000
1,10,0.000000,0.000000,0.011294,0.000000
2,13,0.000000,0.000000,0.000000,0.006361
3,1990s,0.000000,0.000000,0.000000,0.006361
4,2,0.000000,0.000000,0.005647,0.000000
...,...,...,...,...,...
822,year,0.005458,0.005705,0.002432,0.002740
823,yes,0.000000,0.000000,0.005647,0.000000
824,yet,0.000000,0.000000,0.000000,0.006361
825,york,0.004325,0.004521,0.000000,0.000000
