In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import random
import string
import re
import heapq

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [87]:
data = pd.read_csv('IMDB.csv')

In [33]:
data.head()

Unnamed: 0,review,sentiment
0,bad plot bad dialogu bad act idiot direct anno...,negative
1,thought thi movi right good job wasnt creativ ...,positive
2,robert colomb ha two fulltim job known through...,negative
3,le visiteur first movi mediev time travel wa a...,negative
4,first tune thi morn news thought wow final ent...,negative


In [34]:
data.loc[data['sentiment'] == 'positive', 'sentiment'] = 1
data.loc[data['sentiment'] == 'negative', 'sentiment'] = 0

In [35]:
data.head()

Unnamed: 0,review,sentiment
0,bad plot bad dialogu bad act idiot direct anno...,0
1,thought thi movi right good job wasnt creativ ...,1
2,robert colomb ha two fulltim job known through...,0
3,le visiteur first movi mediev time travel wa a...,0
4,first tune thi morn news thought wow final ent...,0


In [36]:
labels = data['sentiment']
corpus = data['review']

In [37]:
corpus[0]

'bad plot bad dialogu bad act idiot direct annoy porn groov soundtrack ran continu overact script crappi copi vh cannot redeem consum liquor trust becaus stuck thi turkey end wa pathet bad figur wa fourthrat spoof springtim hitlerth girl play jani joplin wa onli faint spark interest wa onli becaus could sing better originalif want watch someth similar thousand time better watch beyond valley doll'

In [38]:
def tokenizer(corpus_list):
    list_len = len(corpus_list)
    for i in range(list_len):
        corpus_list[i] = nltk.word_tokenize(corpus_list[i])   # Sent_tokenize() 
    return corpus_list

In [39]:
new_corpus = tokenizer(corpus)

In [40]:
new_corpus[0]

['bad',
 'plot',
 'bad',
 'dialogu',
 'bad',
 'act',
 'idiot',
 'direct',
 'annoy',
 'porn',
 'groov',
 'soundtrack',
 'ran',
 'continu',
 'overact',
 'script',
 'crappi',
 'copi',
 'vh',
 'can',
 'not',
 'redeem',
 'consum',
 'liquor',
 'trust',
 'becaus',
 'stuck',
 'thi',
 'turkey',
 'end',
 'wa',
 'pathet',
 'bad',
 'figur',
 'wa',
 'fourthrat',
 'spoof',
 'springtim',
 'hitlerth',
 'girl',
 'play',
 'jani',
 'joplin',
 'wa',
 'onli',
 'faint',
 'spark',
 'interest',
 'wa',
 'onli',
 'becaus',
 'could',
 'sing',
 'better',
 'originalif',
 'want',
 'watch',
 'someth',
 'similar',
 'thousand',
 'time',
 'better',
 'watch',
 'beyond',
 'valley',
 'doll']

## Building TF-iDF model from scratch

In [43]:
def word_freq(token_list):
    list_len = len(token_list)
    wordfreq = {}
    for i in range(list_len):
        for token in token_list[i]:
            if token not in wordfreq.keys():
                wordfreq[token] = 1
            else:
                wordfreq[token] += 1
    return wordfreq

In [44]:
word_frequency  = word_freq(new_corpus)

In [45]:
word_frequency

{'bad': 14108,
 'plot': 10383,
 'dialogu': 2552,
 'act': 13394,
 'idiot': 749,
 'direct': 5506,
 'annoy': 1807,
 'porn': 537,
 'groov': 65,
 'soundtrack': 1346,
 'ran': 373,
 'continu': 1775,
 'overact': 350,
 'script': 5018,
 'crappi': 423,
 'copi': 1257,
 'vh': 449,
 'can': 1639,
 'not': 1639,
 'redeem': 690,
 'consum': 160,
 'liquor': 33,
 'trust': 555,
 'becaus': 14014,
 'stuck': 536,
 'thi': 116253,
 'turkey': 338,
 'end': 14512,
 'wa': 76390,
 'pathet': 776,
 'figur': 1780,
 'fourthrat': 2,
 'spoof': 404,
 'springtim': 8,
 'hitlerth': 3,
 'girl': 6063,
 'play': 13676,
 'jani': 17,
 'joplin': 7,
 'onli': 18460,
 'faint': 99,
 'spark': 221,
 'interest': 7593,
 'could': 12152,
 'sing': 1376,
 'better': 8872,
 'originalif': 1,
 'want': 10493,
 'watch': 21711,
 'someth': 7854,
 'similar': 1535,
 'thousand': 473,
 'time': 23835,
 'beyond': 1440,
 'valley': 191,
 'doll': 321,
 'thought': 5805,
 'movi': 79203,
 'right': 5327,
 'good': 23018,
 'job': 3713,
 'wasnt': 3605,
 'creativ': 739,

In [47]:
most_freq = heapq.nlargest(200, word_frequency, key=word_frequency.get)

In [48]:
most_freq

['thi',
 'movi',
 'wa',
 'film',
 'hi',
 'one',
 'like',
 'ha',
 'time',
 'good',
 'make',
 'get',
 'see',
 'charact',
 'veri',
 'watch',
 'even',
 'stori',
 'would',
 'onli',
 'realli',
 'scene',
 'show',
 'look',
 'well',
 'much',
 'end',
 'peopl',
 'great',
 'love',
 'bad',
 'also',
 'becaus',
 'think',
 'play',
 'go',
 'first',
 'act',
 'dont',
 'way',
 'thing',
 'made',
 'could',
 'ani',
 'know',
 'say',
 'seem',
 'mani',
 'actor',
 'work',
 'want',
 'seen',
 'two',
 'plot',
 'come',
 'never',
 'take',
 'best',
 'tri',
 'littl',
 'year',
 'life',
 'ever',
 'doe',
 'give',
 'better',
 'man',
 'still',
 'find',
 'perform',
 'feel',
 'use',
 'whi',
 'part',
 'actual',
 'someth',
 'lot',
 'interest',
 'back',
 'real',
 'guy',
 'director',
 'didnt',
 'doesnt',
 'Im',
 'cast',
 'though',
 'enjoy',
 'funni',
 'music',
 'anoth',
 'live',
 'befor',
 'noth',
 'role',
 'new',
 'start',
 'old',
 'everi',
 'point',
 'believ',
 'girl',
 'star',
 'cant',
 'set',
 'origin',
 'turn',
 'quit',
 'fa

In [52]:
def Calculate_iDF(corpus):
    word_idf_values = {}
    for token in most_freq:
        doc_containing_word = 0
        for document in corpus:
            if token in document:
                doc_containing_word += 1
        word_idf_values[token] = np.log(len(corpus)/(1 + doc_containing_word))
    return word_idf_values

In [53]:
word_idf_values = Calculate_iDF(new_corpus)

In [54]:
word_idf_values

{'thi': 0.10702857275852325,
 'movi': 0.44781170173923857,
 'wa': 0.4423884622127622,
 'film': 0.5337531284096954,
 'hi': 0.8624538002764053,
 'one': 0.5746532865121561,
 'like': 0.7083115810797177,
 'ha': 0.9193579309236253,
 'time': 0.9391756044116141,
 'good': 0.9861768593383216,
 'make': 0.974648393936645,
 'get': 1.0351446532239963,
 'see': 0.9955368818840006,
 'charact': 1.1239300966523995,
 'veri': 1.072579111621294,
 'watch': 1.0241543777500999,
 'even': 1.1005892415523308,
 'stori': 1.1899877197592734,
 'would': 1.147986373285909,
 'onli': 1.116878097475002,
 'realli': 1.2048898913884196,
 'scene': 1.3495767781847279,
 'show': 1.5516408137078836,
 'look': 1.3168615864045539,
 'well': 1.3015856331929365,
 'much': 1.2881699631266916,
 'end': 1.357415401569514,
 'peopl': 1.407824479670254,
 'great': 1.3879958077596481,
 'love': 1.4662037443481926,
 'bad': 1.4788483427304986,
 'also': 1.3884967846750909,
 'becaus': 1.4091536552659731,
 'think': 1.3808094308893208,
 'play': 1.48368

In [55]:
def Calculate_tf(corpus):
    word_tf_values = {}
    for token in most_freq:
        sent_tf_vector = []
        for document in corpus:
            
            doc_freq = 0
            for word in document:
                if token == word:
                      doc_freq += 1
            word_tf = doc_freq/len(document)
            sent_tf_vector.append(word_tf)
        word_tf_values[token] = sent_tf_vector
        
    return word_tf_values

In [56]:
tf_vect = Calculate_tf(new_corpus)

In [62]:
tf_vect['actor']

[0.0,
 0.0,
 0.0,
 0.0136986301369863,
 0.0,
 0.0,
 0.0,
 0.025974025974025976,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0041841004184100415,
 0.0,
 0.010309278350515464,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.014925373134328358,
 0.011494252873563218,
 0.0,
 0.0,
 0.0,
 0.006172839506172839,
 0.0,
 0.011627906976744186,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0037174721189591076,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0018587360594795538,
 0.0,
 0.0,
 0.0,
 0.012012012012012012,
 0.0,
 0.0,
 0.0,
 0.013888888888888888,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.011111111111111112,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.02,
 0.0078125,
 0.0,
 0.037037037037037035,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.00819672131147541,
 0.0,
 0.004629629629629629,
 0.0,
 0.0,
 0.0,
 0.0,
 0.009174311926605505,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.002857142857142857,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,


In [63]:
tfidf_values = []
for token in tf_vect.keys():
    tfidf_corpus = []
    for tf_corpus in tf_vect[token]:
        tf_idf_score = tf_corpus * word_idf_values[token]
        tfidf_corpus.append(tf_idf_score)
    tfidf_values.append(tfidf_corpus)

In [66]:
tfidf_values[0]

[0.0016216450417958067,
 0.005754224341856088,
 0.0015511387356307717,
 0.002199217248462806,
 0.0019698510323654587,
 0.005460641467271594,
 0.0025482993513934104,
 0.0013899814643964059,
 0.004864935125387421,
 0.002918961075232452,
 0.0015511387356307717,
 0.001861366482756926,
 0.0042542738125772835,
 0.0025084821740278887,
 0.001103387348026013,
 0.0014865079549794896,
 0.0017079027567849453,
 0.0038618557180910445,
 0.0,
 0.0,
 0.003194882768911142,
 0.004920853919932103,
 0.0033446428987038515,
 0.012231836886688371,
 0.0015074446867397642,
 0.0039640212132786385,
 0.0031478991987800953,
 0.00497807315155922,
 0.0031478991987800953,
 0.0026104529941103232,
 0.007644898054180231,
 0.0015289796108360464,
 0.0008919047729876937,
 0.0,
 0.0011936272054853894,
 0.0,
 0.004864935125387421,
 0.00258940095383524,
 0.0041164835676355096,
 0.003580881616456168,
 0.005633082776764381,
 0.0014969031155038217,
 0.0038224490270901156,
 0.0003214071254009707,
 0.0,
 0.0062957983975601905,
 0.0

In [67]:
tf_idf_model = np.asarray(tfidf_values)

In [70]:
tf_idf_model.shape  

(200, 40000)

In [108]:
tf_idf_model = np.transpose(tf_idf_model)

In [109]:
len(tf_idf_model[0])

200

## TF-iDF using Python libraries :)

In the previous seciton, we built a tf-idf model from scratch. however there can be easier way to do so. In this section, we are going to see how we can build the model with just a few lines of code. 

In [100]:
data = pd.read_csv('IMDB.csv')

In [101]:
labels = data['sentiment']
corpus = data['review']

In [102]:
corpus[0]

'bad plot bad dialogu bad act idiot direct annoy porn groov soundtrack ran continu overact script crappi copi vh cannot redeem consum liquor trust becaus stuck thi turkey end wa pathet bad figur wa fourthrat spoof springtim hitlerth girl play jani joplin wa onli faint spark interest wa onli becaus could sing better originalif want watch someth similar thousand time better watch beyond valley doll'

In [106]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False)
#transformed train reviews
cv_train_reviews=cv.fit_transform(corpus)


print('BOW_cv_train:',cv_train_reviews.shape)

#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (40000, 106582)


In [107]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True)  # other options such as : ngram_range=(1,3)
#transformed train reviews
tv_train_reviews=tv.fit_transform(corpus)

print('Tfidf_train:',tv_train_reviews.shape)

Tfidf_train: (40000, 106582)
