In [1]:
import pandas as pd
import numpy as np
# Load the two preprocessed datasets; both paths are relative to the current working directory.
# NOTE(review): the previews below show an "Unnamed: 0" column, which suggests the CSVs were
# written together with their index -- confirm, and consider index_col=0 if it is not needed.
dataset_1 = pd.read_csv('data/dataset1_preprocessing.csv') 
dataset_2 = pd.read_csv('data/dataset2_preprocessing.csv') 

<h3>TF-IDF of Dataset 1</h3>

In [2]:
# Preview the first rows of dataset 1.
dataset_1.head()

Unnamed: 0,title,description,content,length,article
0,NTSB says Autopilot engaged in 2018 California...,The National Transportation Safety Board said ...,WASHINGTON (Reuters) - The National Transporta...,578,"['ntsb', 'says', 'autopilot', 'engaged', 'cali..."
1,Unemployment falls to post-crash low of 5.2%,Latest monthly figures reflect continued growt...,The States jobless rate fell to 5.2 per cent l...,387,"['unemployment', 'falls', 'post', 'crash', 'low']"
2,"Louise Kennedy AW2019: Long coats, sparkling t...",Autumn-winter collection features designer’s g...,Louise Kennedy is showing off her autumn-winte...,432,"['louise', 'kennedy', 'aw2019', 'long', 'coats..."
3,North Korean footballer Han joins Italian gian...,Han is the first North Korean player in the Se...,"Han Kwang Song, the first North Korean footbal...",446,"['north', 'korean', 'footballer', 'han', 'join..."
4,'This Tender Land' is an affecting story about...,"""This Tender Land"" by William Kent Krueger is ...","""This Tender Land: a Novel"" (Atria Books), by ...",500,"['tender', 'land', 'affecting', 'story', 'grow..."


In [3]:
# Convert a list-formatted string into a real list
import ast

def convert_text_list(texts):
    """Turn the text of a Python literal (e.g. "['a', 'b']") into a list.

    Parameters
    ----------
    texts : str
        Source text of a Python literal, evaluated with ``ast.literal_eval``.

    Returns
    -------
    list
        A new list holding the items of the evaluated literal, in order.
    """
    parsed = ast.literal_eval(texts)
    return list(parsed)

# Parse every stringified token list in "article" and keep the result in a new "article_list" column.
dataset_1["article_list"] = dataset_1["article"].apply(convert_text_list)


# Spot-check the row labelled 90: show its tokens ...
print(dataset_1["article_list"][90])

# ... and the Python type of the stored value.
print("\ntype : ", type(dataset_1["article_list"][90]))

['gregor', 'townsend', 'believes', 'scotland', 'never', 'better', 'position']

type :  <class 'list'>


In [4]:
def calc_TF(document):
    """Compute the term frequency of every distinct term in one document.

    Parameters
    ----------
    document : list
        The terms of a single document; repeated terms are counted once per
        occurrence.

    Returns
    -------
    dict
        Maps each distinct term to ``occurrences / len(document)``. Keys keep
        the order in which the terms first appear. An empty document gives an
        empty dict.
    """
    occurrences = {}
    for token in document:
        occurrences[token] = occurrences.get(token, 0) + 1
    # Normalise the raw counts by the document length.
    return {token: hits / len(document) for token, hits in occurrences.items()}

# Compute one term -> TF dictionary per article and keep it in a "TF_dict" column.
dataset_1["TF_dict"] = dataset_1['article_list'].apply(calc_TF)

# Preview the first TF dictionaries.
dataset_1["TF_dict"].head()

0    {'ntsb': 0.14285714285714285, 'says': 0.142857...
1    {'unemployment': 0.2, 'falls': 0.2, 'post': 0....
2    {'louise': 0.1, 'kennedy': 0.1, 'aw2019': 0.1,...
3    {'north': 0.125, 'korean': 0.125, 'footballer'...
4    {'tender': 0.2, 'land': 0.2, 'affecting': 0.2,...
Name: TF_dict, dtype: object

In [5]:
# Check TF result: list every term of one article next to its TF value.
index = 90  # label of the row to inspect

# Header row; '%20s' right-aligns the text in a 20-character column.
print('%20s' % "term", "\t", "TF\n")
for key in dataset_1["TF_dict"][index]:
    print('%20s' % key, "\t", dataset_1["TF_dict"][index][key])

                term 	 TF

              gregor 	 0.14285714285714285
            townsend 	 0.14285714285714285
            believes 	 0.14285714285714285
            scotland 	 0.14285714285714285
               never 	 0.14285714285714285
              better 	 0.14285714285714285
            position 	 0.14285714285714285


In [6]:
def calc_DF(tfDict):
    """Count in how many documents each term occurs.

    Parameters
    ----------
    tfDict : iterable
        One entry per document; iterating an entry must yield its terms
        (the notebook passes the "TF_dict" column, whose entries are dicts
        keyed by term).

    Returns
    -------
    dict
        Maps each term to the number of times it was yielded across all
        entries. With dict entries every term is yielded once per document,
        so the value is the document frequency.
    """
    doc_freq = {}
    for tf_of_document in tfDict:
        for token in tf_of_document:
            doc_freq[token] = doc_freq.get(token, 0) + 1
    return doc_freq

# Document frequency of every term, counted over the TF dictionaries of dataset 1.
DF = calc_DF(dataset_1["TF_dict"])
# NOTE(review): this displays the whole dictionary, which makes the cell output very long;
# consider showing only a small slice.
DF

{'ntsb': 8,
 'says': 440,
 'autopilot': 4,
 'engaged': 2,
 'california': 77,
 'tesla': 22,
 'crash': 53,
 'unemployment': 4,
 'falls': 14,
 'post': 32,
 'low': 39,
 'louise': 4,
 'kennedy': 11,
 'aw2019': 3,
 'long': 40,
 'coats': 2,
 'sparkling': 3,
 'tweed': 3,
 'dresses': 2,
 'emerald': 3,
 'knits': 3,
 'north': 70,
 'korean': 10,
 'footballer': 2,
 'han': 1,
 'joins': 11,
 'italian': 23,
 'giants': 8,
 'juventus': 5,
 'tender': 2,
 'land': 11,
 'affecting': 3,
 'story': 35,
 'growing': 15,
 'eu': 80,
 'wants': 43,
 'see': 33,
 'lawmakers': 16,
 'block': 23,
 'brexit': 243,
 'striking': 5,
 'new': 498,
 'deal': 163,
 'uks': 12,
 'johnson': 142,
 'european': 21,
 'third': 32,
 'quarter': 22,
 'profit': 11,
 'outlook': 11,
 'improves': 5,
 'slightly': 1,
 'still': 57,
 'recession': 25,
 'refinitv': 1,
 'emotional': 6,
 'support': 41,
 'animals': 10,
 'allowed': 9,
 'flights': 19,
 'boris': 84,
 'meet': 44,
 'leo': 3,
 'varadkar': 16,
 'dublin': 49,
 'monday': 18,
 'afghan': 13,
 'peac

In [7]:
# Compute IDF: start with the number of documents, i.e. the number of rows in dataset 1.
n_document = len(dataset_1)

def calc_IDF(__n_document, __DF):
    """Compute the inverse document frequency of every term.

    Parameters
    ----------
    __n_document : int
        Total number of documents.
    __DF : dict
        Maps each term to its document frequency.

    Returns
    -------
    dict
        Maps each term to ``log(__n_document / (df + 1))`` (natural log).
        Because of the ``+ 1`` in the denominator, a term whose document
        frequency equals ``__n_document`` gets a slightly negative value.
    """
    return {
        term: np.log(__n_document / (doc_count + 1))
        for term, doc_count in __DF.items()
    }
  
# Store the term -> IDF dictionary of dataset 1 under the module-level name `IDF`,
# which calc_TF_IDF looks up when it runs.
IDF = calc_IDF(n_document, DF)

In [8]:
#calc TF-IDF
def calc_TF_IDF(TF, idf=None):
    """Multiply each term's TF by its IDF.

    Parameters
    ----------
    TF : dict
        Maps each term of one document to its term frequency.
    idf : dict, optional
        Maps terms to their inverse document frequency. When omitted, the
        module-level ``IDF`` dictionary is used, so existing calls such as
        ``series.apply(calc_TF_IDF)`` behave as before. Pass it explicitly
        (e.g. ``series.apply(calc_TF_IDF, idf=other_idf)``) when scoring a
        different corpus, instead of relying on whichever ``IDF`` happens to
        be defined at that point in the notebook.

    Returns
    -------
    dict
        Maps each term of ``TF`` to ``TF[term] * idf[term]``.

    Raises
    ------
    KeyError
        If a term of ``TF`` is missing from the IDF dictionary.
    """
    if idf is None:
        # Backward-compatible default: the notebook-level IDF table.
        idf = IDF
    return {term: tf_value * idf[term] for term, tf_value in TF.items()}

# Store one term -> TF-IDF dictionary per article in a "TF-IDF_dict" column.
dataset_1["TF-IDF_dict"] = dataset_1["TF_dict"].apply(calc_TF_IDF)

In [9]:
# Check TF-IDF result: list every term of one article with its TF and TF-IDF values.
index = 90  # label of the row to inspect

# Header row; the '%Ns' formats right-align the text in N-character columns.
print('%20s' % "term", "\t", '%10s' % "TF", "\t", '%20s' % "TF-IDF\n")
for key in dataset_1["TF-IDF_dict"][index]:
    print('%20s' % key, "\t", dataset_1["TF_dict"][index][key] ,"\t" , dataset_1["TF-IDF_dict"][index][key])

                term 	         TF 	              TF-IDF

              gregor 	 0.14285714285714285 	 1.1985448945837724
            townsend 	 0.14285714285714285 	 1.1985448945837724
            believes 	 0.14285714285714285 	 1.0995238687894944
            scotland 	 0.14285714285714285 	 0.7705831412189165
               never 	 0.14285714285714285 	 0.8215367046387354
              better 	 0.14285714285714285 	 0.8267320823774319
            position 	 0.14285714285714285 	 1.0195787562272913


In [10]:
# Order the (term, document frequency) pairs from highest to lowest frequency and keep the first 50.
sorted_DF = sorted(DF.items(), key=lambda pair: pair[1], reverse=True)[:50]

# Vocabulary for the TF-IDF vectors: the terms of `sorted_DF`, in the same order.
unique_term = [term for term, _ in sorted_DF]

def calc_TF_IDF_Vec(__TF_IDF_Dict, terms=None):
    """Lay out one document's TF-IDF values as a fixed-order vector.

    Parameters
    ----------
    __TF_IDF_Dict : dict
        Maps the terms of one document to their TF-IDF values.
    terms : list, optional
        The vocabulary that defines the vector positions. When omitted, the
        module-level ``unique_term`` list is used, so existing calls such as
        ``series.apply(calc_TF_IDF_Vec)`` behave as before. Pass it explicitly
        to vectorise against another vocabulary without rebinding the global.

    Returns
    -------
    list
        One value per vocabulary term, in vocabulary order: the document's
        TF-IDF for that term, or ``0.0`` when the document lacks the term.
    """
    if terms is None:
        # Backward-compatible default: the notebook-level vocabulary.
        terms = unique_term
    return [__TF_IDF_Dict.get(term, 0.0) for term in terms]

# Turn every TF-IDF dictionary into a vector laid out along `unique_term`.
dataset_1["TF_IDF_Vec"] = dataset_1["TF-IDF_dict"].apply(calc_TF_IDF_Vec)

# Show the vector of the first row ...
print("print first row matrix TF_IDF_Vec Series\n")
print(dataset_1["TF_IDF_Vec"][0])

# ... and its length, which equals the number of vocabulary terms.
print("\nmatrix size : ", len(dataset_1["TF_IDF_Vec"][0]))

print first row matrix TF_IDF_Vec Series

[0.0, 0.0, 0.0, 0.4277023667427866, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

matrix size :  50


In [11]:
# Stack the per-article TF-IDF vectors into a single 2-D array (one row per article).
TF_IDF_Vec_List = np.array(dataset_1["TF_IDF_Vec"].to_list())

# Add the rows together: one total per vocabulary position.
sums = TF_IDF_Vec_List.sum(axis=0)

# Pair each vocabulary term with the total found at the same position.
data = [(term, sums[col]) for col, term in enumerate(unique_term)]

ranking = pd.DataFrame(data, columns=['term', 'rank'])
# Display the terms from the highest to the lowest total.
ranking.sort_values('rank', ascending=False)

Unnamed: 0,term,rank
48,gmt,407.425084
0,us,203.785911
2,trump,174.14294
1,new,169.452673
3,says,151.907922
4,brexit,117.421807
5,man,97.864182
6,world,91.607767
8,deal,80.335355
7,year,79.982706


<h3>TF-IDF of Dataset 2</h3>

In [12]:
# Preview the first rows of dataset 2.
dataset_2.head()

Unnamed: 0,Title,Plot,Movie,length
0,Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr...","['bartender', 'working', 'saloon', 'serving', ...",522
1,Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov...","['moon', 'painted', 'smiling', 'face', 'hangs'...",466
2,The Martyred Presidents,"The film, just over a minute long, is composed...","['film', 'minute', 'long', 'composed', 'two', ...",459
3,"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...,"['lasting', 'seconds', 'consisting', 'two', 's...",922
4,Jack and the Beanstalk,The earliest known adaptation of the classic f...,"['earliest', 'known', 'adaptation', 'classic',...",754


In [13]:
# Convert a list-formatted string into a real list
import ast

def convert_text_list(texts):
    """Evaluate the text of a Python literal and return its items as a list.

    Parameters
    ----------
    texts : str
        Source text of a Python literal (e.g. "['moon', 'painted']"),
        evaluated with ``ast.literal_eval``.

    Returns
    -------
    list
        A new list with the items of the evaluated literal, in order.
    """
    return list(ast.literal_eval(texts))

# Parse every stringified token list in "Movie" and keep the result in a new "movie_list" column.
dataset_2["movie_list"] = dataset_2["Movie"].apply(convert_text_list)


# Spot-check the row labelled 90: show its tokens ...
print(dataset_2["movie_list"][90])

# ... and the Python type of the stored value.
print("\ntype : ", type(dataset_2["movie_list"][90]))

['chaplins', 'character', 'attempts', 'convince', 'passerby', 'director', 'henry', 'lehrman', 'give', 'money', 'chaplin', 'shown', 'flirting', 'woman', 'proposes', 'accepts', 'lehrman', 'enters', 'present', 'woman', 'flowers', 'ring', 'woman', 'refuses', 'citing', 'shes', 'engaged', 'lerhman', 'sees', 'chaplin', 'slapstick', 'fight', 'two', 'ensues', 'later', 'lehrmans', 'character', 'takes', 'photograph', 'automobile', 'accident', 'chaplins', 'character', 'steals', 'camera', 'whilst', 'journalist', 'helping', 'trapped', 'motorist', 'rushes', 'back', 'paper', 'claim', 'photograph', 'short', 'pursuit', 'keystone', 'kops', 'follows']

type :  <class 'list'>


In [14]:
def calc_TF(document):
    """Return the relative frequency of each distinct term in one document.

    Parameters
    ----------
    document : list
        The terms of a single document; a term that occurs several times is
        counted once per occurrence.

    Returns
    -------
    dict
        Maps each distinct term to ``occurrences / len(document)``, with keys
        in order of first appearance. An empty document gives an empty dict.
    """
    tally = {}
    for word in document:
        tally[word] = tally.get(word, 0) + 1
    # Divide each raw count by the number of terms in the document.
    return {word: seen / len(document) for word, seen in tally.items()}

# Compute one term -> TF dictionary per movie plot and keep it in a "TF_dict" column.
dataset_2["TF_dict"] = dataset_2['movie_list'].apply(calc_TF)

# Preview the first TF dictionaries.
dataset_2["TF_dict"].head()

0    {'bartender': 0.041666666666666664, 'working':...
1    {'moon': 0.06818181818181818, 'painted': 0.022...
2    {'film': 0.023809523809523808, 'minute': 0.023...
3    {'lasting': 0.011764705882352941, 'seconds': 0...
4    {'earliest': 0.014084507042253521, 'known': 0....
Name: TF_dict, dtype: object

In [15]:
# Check TF result: list every term of one movie plot next to its TF value.
index = 90  # label of the row to inspect

# Header row; '%20s' right-aligns the text in a 20-character column.
print('%20s' % "term", "\t", "TF\n")
for key in dataset_2["TF_dict"][index]:
    print('%20s' % key, "\t", dataset_2["TF_dict"][index][key])

                term 	 TF

            chaplins 	 0.03333333333333333
           character 	 0.05
            attempts 	 0.016666666666666666
            convince 	 0.016666666666666666
            passerby 	 0.016666666666666666
            director 	 0.016666666666666666
               henry 	 0.016666666666666666
             lehrman 	 0.03333333333333333
                give 	 0.016666666666666666
               money 	 0.016666666666666666
             chaplin 	 0.03333333333333333
               shown 	 0.016666666666666666
            flirting 	 0.016666666666666666
               woman 	 0.05
            proposes 	 0.016666666666666666
             accepts 	 0.016666666666666666
              enters 	 0.016666666666666666
             present 	 0.016666666666666666
             flowers 	 0.016666666666666666
                ring 	 0.016666666666666666
             refuses 	 0.016666666666666666
              citing 	 0.016666666666666666
                shes 	 0.016666666666666