In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("preprocessed_IMDB_data.csv")
df

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic there famili littl boy jake think there ...,negative
4,petter mattei love time money visual stun film...,positive
...,...,...
49995,thought movi right good job wasnt creativ orig...,positive
49996,bad plot bad dialogu bad act idiot direct anno...,negative
49997,cathol taught parochi elementari school nun ta...,negative
49998,im go disagre previou comment side maltin one ...,negative


## Embedding techniques:

- One Hot Code embedding
- TF-IDF
- Word2Vec
- N-grams
- Bag of words

In [3]:
dfx = df.iloc[:5]
dfx

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic there famili littl boy jake think there ...,negative
4,petter mattei love time money visual stun film...,positive


### OneHotEncoding

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
bow = cv.fit_transform(dfx['review'])
bow

<5x391 sparse matrix of type '<class 'numpy.int64'>'
	with 445 stored elements in Compressed Sparse Row format>

In [6]:
bow.toarray(), bow.toarray().shape

(array([[0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 1, 1, 1],
        [0, 0, 1, ..., 0, 0, 0]], dtype=int64),
 (5, 391))

In [7]:
cv.vocabulary_

{'one': 220,
 'review': 266,
 'mention': 200,
 'watch': 362,
 'oz': 225,
 'episod': 94,
 'youll': 386,
 'hook': 143,
 'right': 267,
 'exactli': 98,
 'happen': 138,
 'meth': 202,
 'first': 113,
 'thing': 336,
 'struck': 315,
 'brutal': 31,
 'unflinch': 353,
 'scene': 275,
 'violenc': 358,
 'set': 285,
 'word': 376,
 'go': 126,
 'trust': 349,
 'show': 291,
 'faint': 105,
 'heart': 140,
 'timid': 342,
 'pull': 252,
 'punch': 253,
 'regard': 262,
 'drug': 85,
 'sex': 286,
 'hardcor': 139,
 'classic': 45,
 'use': 355,
 'wordit': 377,
 'call': 33,
 'nicknam': 217,
 'given': 124,
 'oswald': 224,
 'maximum': 196,
 'secur': 280,
 'state': 311,
 'penitentari': 230,
 'focus': 115,
 'mainli': 186,
 'emerald': 89,
 'citi': 43,
 'experiment': 101,
 'section': 279,
 'prison': 248,
 'cell': 38,
 'glass': 125,
 'front': 118,
 'face': 103,
 'inward': 156,
 'privaci': 249,
 'high': 141,
 'agenda': 7,
 'em': 88,
 'home': 142,
 'manyaryan': 192,
 'muslim': 210,
 'gangsta': 121,
 'latino': 170,
 'christian'

### N grams

In [8]:
cv = CountVectorizer(ngram_range=(2, 2))
bow = cv.fit_transform(dfx['review'])

bow.toarray(), bow.toarray().shape

(array([[0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 1, 1, 1],
        [0, 0, 1, ..., 0, 0, 0]], dtype=int64),
 (5, 518))

In [9]:
cv.vocabulary_

{'one review': 284,
 'review mention': 352,
 'mention watch': 252,
 'watch oz': 480,
 'oz episod': 293,
 'episod youll': 112,
 'youll hook': 513,
 'hook right': 180,
 'right exactli': 354,
 'exactli happen': 117,
 'happen meth': 173,
 'meth first': 254,
 'first thing': 137,
 'thing struck': 440,
 'struck oz': 415,
 'oz brutal': 292,
 'brutal unflinch': 32,
 'unflinch scene': 461,
 'scene violenc': 366,
 'violenc set': 471,
 'set right': 382,
 'right word': 355,
 'word go': 504,
 'go trust': 156,
 'trust show': 457,
 'show faint': 389,
 'faint heart': 124,
 'heart timid': 175,
 'timid show': 450,
 'show pull': 390,
 'pull punch': 336,
 'punch regard': 337,
 'regard drug': 347,
 'drug sex': 101,
 'sex violenc': 383,
 'violenc hardcor': 469,
 'hardcor classic': 174,
 'classic use': 50,
 'use wordit': 466,
 'wordit call': 505,
 'call oz': 34,
 'oz nicknam': 296,
 'nicknam given': 276,
 'given oswald': 152,
 'oswald maximum': 291,
 'maximum secur': 246,
 'secur state': 371,
 'state penitent

In [10]:
cv = CountVectorizer(ngram_range=(3, 3))
bow = cv.fit_transform(dfx['review'])

bow.toarray(), bow.toarray().shape

(array([[0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 1, 1, 1],
        [0, 0, 1, ..., 0, 0, 0]], dtype=int64),
 (5, 518))

In [11]:
cv.vocabulary_

{'one review mention': 285,
 'review mention watch': 353,
 'mention watch oz': 252,
 'watch oz episod': 480,
 'oz episod youll': 294,
 'episod youll hook': 112,
 'youll hook right': 513,
 'hook right exactli': 180,
 'right exactli happen': 355,
 'exactli happen meth': 117,
 'happen meth first': 173,
 'meth first thing': 254,
 'first thing struck': 137,
 'thing struck oz': 440,
 'struck oz brutal': 415,
 'oz brutal unflinch': 293,
 'brutal unflinch scene': 32,
 'unflinch scene violenc': 461,
 'scene violenc set': 367,
 'violenc set right': 471,
 'set right word': 382,
 'right word go': 356,
 'word go trust': 504,
 'go trust show': 156,
 'trust show faint': 457,
 'show faint heart': 389,
 'faint heart timid': 124,
 'heart timid show': 175,
 'timid show pull': 450,
 'show pull punch': 390,
 'pull punch regard': 337,
 'punch regard drug': 338,
 'regard drug sex': 348,
 'drug sex violenc': 101,
 'sex violenc hardcor': 383,
 'violenc hardcor classic': 469,
 'hardcor classic use': 174,
 'clas

### TF-IDF (Term frequency - Inverse Document Frequency)

- **Term frequency**: Total count of word in the corpus/total count of words in the entire corpus
- **Inverse Document frequency**: log(total documents/total documents containing the word)

- TF-IDF: TF*IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
arr = tfidf.fit_transform(dfx['review'])
arr

<5x391 sparse matrix of type '<class 'numpy.float64'>'
	with 445 stored elements in Compressed Sparse Row format>

In [14]:
arr.toarray(), arr.toarray().shape

(array([[0.        , 0.06598016, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.10189415, 0.        , 0.        , ..., 0.10189415, 0.10189415,
         0.10189415],
        [0.        , 0.        , 0.07625326, ..., 0.        , 0.        ,
         0.        ]]),
 (5, 391))

In [18]:
# Length of each review
dfx['review'].apply(lambda x: len(x.split()))

0    168
1     84
2     86
3     67
4    125
Name: review, dtype: int64

In [22]:
# Unique words in reviews

len(set(' '.join(dfx['review']).split())), len(' '.join(dfx['review']).split())

(393, 530)

Limiting the output shape

In [24]:
tfidf = TfidfVectorizer(max_features=124)
arr = tfidf.fit_transform(dfx['review'])

arr.toarray(), arr.toarray().shape

(array([[0.06850371, 0.        , 0.13700743, 0.        , 0.        ,
         0.        , 0.06850371, 0.        , 0.06850371, 0.        ,
         0.        , 0.        , 0.16981717, 0.16981717, 0.06850371,
         0.        , 0.13700743, 0.25472576, 0.13700743, 0.05686427,
         0.        , 0.06850371, 0.        , 0.06850371, 0.        ,
         0.16981717, 0.06850371, 0.        , 0.16981717, 0.        ,
         0.06850371, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.06850371, 0.        , 0.        ,
         0.        , 0.        , 0.06850371, 0.        , 0.        ,
         0.        , 0.04783602, 0.        , 0.08490859, 0.        ,
         0.08490859, 0.42454293, 0.08490859, 0.        , 0.        ,
         0.        , 0.06850371, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.08490859, 0.        , 0.25472576,
         0.08490859, 0.        , 0.        , 0.08490859, 0.08490859,
         0.        , 0.        , 0

In [25]:
tfidf.vocabulary_

{'one': 46,
 'review': 82,
 'watch': 106,
 'oz': 51,
 'episod': 13,
 'youll': 121,
 'right': 83,
 'exactli': 14,
 'first': 16,
 'thing': 97,
 'struck': 93,
 'scene': 87,
 'violenc': 103,
 'set': 91,
 'go': 19,
 'show': 92,
 'pull': 68,
 'punch': 69,
 'regard': 78,
 'use': 102,
 'oswald': 50,
 'citi': 2,
 'prison': 64,
 'privaci': 65,
 'high': 25,
 'home': 26,
 'never': 42,
 'would': 117,
 'say': 86,
 'due': 12,
 'wouldnt': 118,
 'dare': 8,
 'forget': 17,
 'pretti': 62,
 'pictur': 56,
 'paint': 52,
 'romanceoz': 85,
 'readi': 72,
 'got': 21,
 'guard': 23,
 'wholl': 112,
 'inmat': 28,
 'kill': 30,
 'order': 48,
 'get': 18,
 'well': 111,
 'may': 37,
 'becom': 0,
 'comfort': 6,
 'wonder': 115,
 'littl': 32,
 'product': 66,
 'film': 15,
 'techniqu': 94,
 'oldtimebbc': 45,
 'sens': 90,
 'realism': 74,
 'piec': 57,
 'michael': 38,
 'voic': 105,
 'see': 88,
 'refer': 77,
 'william': 113,
 'written': 119,
 'master': 35,
 'great': 22,
 'comedi': 5,
 'life': 31,
 'realli': 76,
 'come': 4,
 'rathe

### Word2Vec

In [26]:
from gensim.models import Word2Vec

In [65]:
word2vec_model = Word2Vec(sentences=[' '.join(dfx['review']).split()], vector_size=256, window=5, min_count=1, workers=4)
word2vec_model

<gensim.models.word2vec.Word2Vec at 0x1e63cecf6e0>

In [66]:
word2vec_model.wv.vectors, word2vec_model.wv.vectors.shape

(array([[-2.4314292e-04,  5.6130488e-05,  1.9958748e-03, ...,
         -3.0612214e-03, -3.6527112e-03, -2.3092553e-03],
        [-7.4167491e-04, -1.7233497e-03, -2.5248446e-03, ...,
         -2.6940028e-03, -1.9982923e-03, -8.6612219e-04],
        [-2.8478925e-03, -3.7798032e-03, -1.0706887e-03, ...,
          2.4490836e-03, -3.7442558e-03,  3.7732017e-03],
        ...,
        [ 5.3918629e-04, -1.1378202e-03,  1.3306487e-04, ...,
          2.9087178e-03,  3.4653079e-03,  3.5367157e-06],
        [-2.1429223e-03, -5.0524645e-04,  7.2779582e-04, ...,
         -4.3047793e-04,  2.8273454e-03, -1.9806786e-03],
        [-2.1691457e-03,  2.2304268e-03,  3.3922337e-03, ...,
         -2.8403553e-03,  5.2050402e-04,  9.3253248e-04]], dtype=float32),
 (393, 256))

In [68]:
word2vec_model.wv.index_to_key, len(word2vec_model.wv.index_to_key)

(['one',
  'watch',
  'well',
  'oz',
  'film',
  'violenc',
  'mattei',
  'comedi',
  'movi',
  'jake',
  'show',
  'see',
  'us',
  'may',
  'citi',
  'littl',
  'parent',
  'charact',
  'time',
  'drama',
  'say',
  'play',
  'mr',
  'peopl',
  'differ',
  'prison',
  'get',
  'forget',
  'right',
  'first',
  'thing',
  'go',
  'pictur',
  'inmat',
  'wholl',
  'guard',
  'got',
  'closet',
  'struck',
  'decid',
  'exactli',
  'dare',
  'make',
  'thriller',
  'due',
  'good',
  'never',
  'scene',
  'kill',
  'there',
  'set',
  'piec',
  'great',
  'come',
  'master',
  'particularli',
  'halliwel',
  'thought',
  'way',
  'michael',
  'realism',
  'becom',
  'sens',
  'episod',
  'techniqu',
  'product',
  'wonder',
  'point',
  'comfort',
  'woodi',
  'money',
  'life',
  'use',
  'home',
  'relat',
  'human',
  'next',
  'high',
  'connect',
  'seem',
  'live',
  'oswald',
  'side',
  'maximum',
  'hook',
  'youll',
  'given',
  'unassum',
  'oldtimebbc',
  'nicknam',
  'fash

In [69]:
word2vec_model.wv['one']

array([-2.43142917e-04,  5.61304878e-05,  1.99587480e-03,  3.56459105e-03,
       -3.53632192e-03, -2.80621927e-03,  2.56203720e-03,  3.46454000e-03,
       -1.99400261e-03, -1.42733962e-03,  2.85960338e-03, -5.85274480e-04,
       -1.74364948e-03,  2.50855787e-03, -1.85295020e-03, -6.53701078e-04,
        1.14376494e-03,  4.45342739e-04, -3.29374475e-03, -3.60743958e-03,
        2.87412875e-03,  1.97668560e-03,  2.58879084e-03,  2.60884524e-04,
        2.44844519e-03, -1.31761224e-03, -3.44514003e-04,  2.27969582e-03,
       -2.98979133e-03, -1.54047809e-03, -2.87701865e-03, -3.49928887e-04,
        3.69368610e-03, -2.85631302e-03, -9.47915774e-04, -7.08963722e-04,
        3.24416184e-03, -2.33867741e-03,  3.95296847e-05, -1.87223591e-03,
       -3.80229158e-03,  1.98268425e-03, -3.45087913e-03, -1.74868794e-03,
        4.44245416e-05, -1.44230813e-04, -2.98233214e-03,  3.72913363e-03,
        1.96951651e-03,  3.59852496e-03, -3.16534447e-03,  1.77136995e-03,
       -1.65080908e-03,  

In [71]:
word2vec_model.wv.similar_by_word("violenc")

[('id', 0.18464097380638123),
 ('disappear', 0.17715679109096527),
 ('wordit', 0.16892026364803314),
 ('never', 0.1624441295862198),
 ('action', 0.15442143380641937),
 ('may', 0.15154515206813812),
 ('career', 0.14450590312480927),
 ('plot', 0.13738539814949036),
 ('averag', 0.13637174665927887),
 ('womanthi', 0.13394777476787567)]

In [73]:
word2vec_model.wv.similar_by_word("film")

[('brutal', 0.16671188175678253),
 ('fantasi', 0.16640889644622803),
 ('make', 0.15071865916252136),
 ('best', 0.14366480708122253),
 ('injustic', 0.13983793556690216),
 ('kill', 0.13817168772220612),
 ('due', 0.13038897514343262),
 ('live', 0.1301393210887909),
 ('trust', 0.12347991019487381),
 ('mess', 0.12122917175292969)]