# Feature Extraction with IMDb Reviews

# Library Imports

In [93]:
import pandas as pd
import os
import tensorflow as tf

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading the Dataset

In [3]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [4]:
os.listdir(dataset_dir)

['imdb.vocab', 'test', 'README', 'imdbEr.txt', 'train']

In [5]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['neg',
 'urls_neg.txt',
 'urls_pos.txt',
 'unsup',
 'pos',
 'urls_unsup.txt',
 'unsupBow.feat',
 'labeledBow.feat']

In [6]:
sample_file = os.path.join(train_dir, 'neg/36_4.txt')
with open(sample_file) as f:
  print(f.read())

This film seemed way too long even at only 75 minutes. The problem with jungle horror films is that there is always way too much footage of people walking (through the jungle, up a rocky cliff, near a river or lake) to pad out the running time. The film is worth seeing for the laughable and naked native zombie with big bulging, bloody eyes which is always accompanied on the soundtrack with heavy breathing and lots of reverb. Eurotrash fans will be plenty entertained by the bad English dubbing, gratuitous female flesh and very silly makeup jobs on the monster and native extras. For a zombie/cannibal flick this was pretty light on the gore but then I probably didn't see an uncut version.


In [19]:
df = pd.DataFrame()
for i in ('test', 'train'):
  for j in ('pos', 'neg'):
    path = os.path.join(dataset_dir, i, j)
    for fl in os.listdir(path):
      with open(os.path.join(path, fl), 'r', encoding = 'utf-8') as f:
        df = df.append([[f.read()]], ignore_index = True)
df.columns = ['review']

In [20]:
df.head()

Unnamed: 0,review
0,"My children, DD 7 and DS 10, enjoyed the movie..."
1,One of the many speculations about Y2K was tha...
2,Rajkumar Santoshi tries his hands at comedy an...
3,Bridget Fonda has disappointed me several time...
4,"Watching this movie, I can't help drawing the ..."


In [24]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [25]:
reviewCorpus = df.review.str.cat(sep = ' ')
tokens = word_tokenize(reviewCorpus)
tokens

['My',
 'children',
 ',',
 'DD',
 '7',
 'and',
 'DS',
 '10',
 ',',
 'enjoyed',
 'the',
 'movie',
 'so',
 'much',
 'they',
 'were',
 'squirming',
 'in',
 'their',
 'seats',
 '.',
 'It',
 'was',
 'good',
 ',',
 'old',
 'fashioned',
 ',',
 'Rated',
 'G',
 ',',
 'family',
 'fun',
 '.',
 'This',
 'movie',
 'was',
 'made',
 'for',
 'kids',
 '...',
 '.',
 'someone',
 'really',
 'understands',
 'them.',
 '<',
 'br',
 '/',
 '>',
 '<',
 'br',
 '/',
 '>',
 'It',
 'was',
 'fun',
 'to',
 'see',
 'Julia',
 'Roberts',
 ',',
 'Brice',
 'Willis',
 ',',
 'Garth',
 'Brooks',
 'and',
 'the',
 'other',
 'stars',
 'make',
 'their',
 'cameo',
 'appearances.',
 '<',
 'br',
 '/',
 '>',
 '<',
 'br',
 '/',
 '>',
 'As',
 'someone',
 'who',
 'lives',
 'in',
 'the',
 'city',
 'the',
 'fictional',
 '``',
 'Big',
 'Texas',
 "''",
 'was',
 'modeled',
 'after',
 ',',
 'I',
 'can',
 'say',
 'that',
 'they',
 'did',
 'an',
 'honest',
 'and',
 'accurate',
 'portrayal',
 '.',
 'The',
 'kids',
 'looked',
 'like',
 'kids',
 

In [38]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
stopWords = set(stopwords.words('english'))
tokens = [word.lower() for word in tokens if ((word not in stopWords) and
          len(word) > 2 and word.isalpha())]
tokens

['children',
 'enjoyed',
 'movie',
 'much',
 'squirming',
 'seats',
 'good',
 'old',
 'fashioned',
 'rated',
 'family',
 'fun',
 'movie',
 'made',
 'kids',
 'someone',
 'really',
 'understands',
 'fun',
 'see',
 'julia',
 'roberts',
 'brice',
 'willis',
 'garth',
 'brooks',
 'stars',
 'make',
 'cameo',
 'someone',
 'lives',
 'city',
 'fictional',
 'big',
 'texas',
 'modeled',
 'say',
 'honest',
 'accurate',
 'portrayal',
 'kids',
 'looked',
 'like',
 'kids',
 'like',
 'hope',
 'everyone',
 'supports',
 'movie',
 'send',
 'message',
 'hollywood',
 'need',
 'movies',
 'like',
 'see',
 'spread',
 'word',
 'one',
 'many',
 'speculations',
 'world',
 'going',
 'end',
 'stroke',
 'midnight',
 'december',
 'book',
 'life',
 'hal',
 'hartley',
 'takes',
 'look',
 'possible',
 'ramifications',
 'new',
 'millennium',
 'armageddon',
 'beginning',
 'return',
 'jesus',
 'earth',
 'new',
 'year',
 'eve',
 'story',
 'examines',
 'task',
 'son',
 'god',
 'must',
 'open',
 'remaining',
 'three',
 'seve

In [53]:
vocabulary = set(tokens)
print(len(vocabulary))
print(vocabulary)

96242


In [90]:
freqDist = nltk.FreqDist(tokens)
freqDist = sorted(freqDist, key=freqDist.__getitem__, reverse=True)
freqDist[0:50]

['movie',
 'film',
 'one',
 'like',
 'good',
 'would',
 'even',
 'time',
 'really',
 'see',
 'story',
 'much',
 'well',
 'could',
 'get',
 'people',
 'great',
 'also',
 'bad',
 'first',
 'made',
 'make',
 'way',
 'movies',
 'think',
 'characters',
 'watch',
 'character',
 'films',
 'many',
 'seen',
 'two',
 'never',
 'love',
 'acting',
 'plot',
 'best',
 'know',
 'show',
 'little',
 'life',
 'ever',
 'better',
 'man',
 'still',
 'say',
 'scene',
 'end',
 'scenes',
 'something']

In [91]:
bow = pd.DataFrame(columns = freqDist)
bow

Unnamed: 0,movie,film,one,like,good,would,even,time,really,see,story,much,well,could,get,people,great,also,bad,first,made,make,way,movies,think,characters,watch,character,films,many,seen,two,never,love,acting,plot,best,know,show,little,...,resourcefully,porely,beatin,legiunea,sträinä,positivism,festivism,perniciously,negativism,cursa,proba,microfon,vânätoarea,vulpi,croaziera,glissando,gwot,ordinance,actra,ceeb,dillute,overglamorize,angeline,entangle,digusted,titillates,saltshaker,geranium,uninflected,yamashiro,grandkid,caroon,blag,malkovichian,mcnee,colico,misjudgement,chronologies,esqueleto,chancho


In [92]:
for index, row in df[0:10].iterrows():
  vocabDict = dict()
  for word in freqDist:
    vocabDict[word] = 0
  for word in row.review.split(" "):
    word = word.lower()
    if word in vocabDict.keys():
      vocabDict[word] += 1
  bow = bow.append(vocabDict, ignore_index = True)
bow.head()

Unnamed: 0,movie,film,one,like,good,would,even,time,really,see,story,much,well,could,get,people,great,also,bad,first,made,make,way,movies,think,characters,watch,character,films,many,seen,two,never,love,acting,plot,best,know,show,little,...,resourcefully,porely,beatin,legiunea,sträinä,positivism,festivism,perniciously,negativism,cursa,proba,microfon,vânätoarea,vulpi,croaziera,glissando,gwot,ordinance,actra,ceeb,dillute,overglamorize,angeline,entangle,digusted,titillates,saltshaker,geranium,uninflected,yamashiro,grandkid,caroon,blag,malkovichian,mcnee,colico,misjudgement,chronologies,esqueleto,chancho
0,3,0,0,3,0,0,0,0,1,2,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,5,3,1,1,1,1,1,0,0,3,0,3,1,0,1,1,1,0,0,0,4,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,1,0,2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# TF-IDF

In [94]:
df1 = df

In [95]:
df1

Unnamed: 0,review
0,"My children, DD 7 and DS 10, enjoyed the movie..."
1,One of the many speculations about Y2K was tha...
2,Rajkumar Santoshi tries his hands at comedy an...
3,Bridget Fonda has disappointed me several time...
4,"Watching this movie, I can't help drawing the ..."
...,...
49995,"The story and the show were good, but it was r..."
49996,"Mean spirited, and down right degrading adapta..."
49997,"""Nacho Libre"" (2006) <br /><br />Directed By: ..."
49998,I'm amazed how many comments on this show are ...


In [101]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df1[0:100].review.values)

print(vectors.shape)
print(vectors)

(100, 4234)
  (0, 4189)	0.10850984794906235
  (0, 3543)	0.1262957044490089
  (0, 3801)	0.06362613712990788
  (0, 1648)	0.05951181416208668
  (0, 2511)	0.06596043356519732
  (0, 2494)	0.05515227844922266
  (0, 2563)	0.09810578885338597
  (0, 4108)	0.06991587325776305
  (0, 1834)	0.09072399144911578
  (0, 2429)	0.11589164535333253
  (0, 3310)	0.1262957044490089
  (0, 3682)	0.1262957044490089
  (0, 1310)	0.08255260660849464
  (0, 1854)	0.08499822455014798
  (0, 3678)	0.1262957044490089
  (0, 2610)	0.038598317724186856
  (0, 2240)	0.15428091283414597
  (0, 2282)	0.10850984794906235
  (0, 2883)	0.11589164535333253
  (0, 73)	0.1262957044490089
  (0, 1846)	0.1262957044490089
  (0, 191)	0.046404249858848674
  (0, 1070)	0.0745941654544716
  (0, 3787)	0.06077744113060842
  (0, 3251)	0.0782660711353953
  :	:
  (99, 4119)	0.08524499424633636
  (99, 4133)	0.09718721331375434
  (99, 2502)	0.09075495545454615
  (99, 2972)	0.1412794381703803
  (99, 3822)	0.12387618298926058
  (99, 248)	0.0619063144761