In [65]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import spatial

import matplotlib.pyplot as plt # side-stepping mpl backend

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import pairwise
from sklearn.feature_selection import SelectPercentile, f_classif
# from sklearn import cross_validation
import heapq
import string
import re

from nltk.corpus import stopwords
from nltk import word_tokenize
from collections import defaultdict
from collections import Counter

# Load the tab-separated review files. Neither file is read with a header row
# (header=None), so the column names are supplied explicitly.
trainingData = pd.read_csv(
    'data/train.dat.txt',
    sep="\t",
    encoding='utf-8',
    header=None,
    names=["rating", "review"]
)
testData = pd.read_csv(
    'data/test.dat.txt',
    sep="\t",
    encoding='utf-8',
    header=None,
    names=["review"]
)


In [66]:
# Show the first raw training row before any cleaning is applied.
print('Before Cleaning')
print(trainingData.iloc[:1])

def preProcess(reviews):
#     print(reviews);
    processedReviews = [];
    for review in reviews:
        tokens = word_tokenize(review);
        filteredTokens = [];
        for token in list(tokens):
#             print(token);
            # if it is a stopword then eliminate
            if token.lower() in stopwords.words('english'):
#                 print('removing stopword ' + token);
                continue;
            # if it is punctuation then eliminate
            if token.lower() in set(string.punctuation):
#                 print('removing punct ' + token);
                continue;
            if len(token)<=3:
#                 print('removing small ' + token);
                continue;
            token = token.lower();
            filteredTokens.append(token);
#         print(filteredTokens);
        processedReviews.append(filteredTokens);
#     print(len(processedReviews));
    return processedReviews;
# Clean the first 100 reviews of each data set and report the results.
print('After Cleaning')
XTrain = preProcess(trainingData['review'].iloc[:100])
print(XTrain[:1])
print(len(XTrain))
XTest = preProcess(testData['review'].iloc[:100])
print(len(XTest))

Before Cleaning
   rating                                             review
0      -1  Although a film with Bruce Willis is always wo...
After Cleaning
[[u'although', u'film', u'bruce', u'willis', u'always', u'worth', u'watching', u'better', u'skip', u'watched', u'television', u'plunk', u'cash', u'lucky', u'plot', u'develops', u'slowly', u'slowly', u'although', u'first', u'minutes', u'quite', u'believable', u'gets', u'unbelievable', u'towards', u'highly', u'questionable', u'seasoned', u'soldier', u'like', u'waters', u'would', u'disobey', u'direct', u'orders', u'even', u'would', u'rest', u'platoon', u'would', u'know', u'puts', u'direct', u'danger', u'know', u'certainly', u'follow', u'heck', u'says', u'despite', u'direct', u'orders', u'remember', u'still', u'nice', u'scenes', u'movie', u'somewhat', u'save', u'village', u'total', u'population', u'massacred', u'rebels', u'well', u'save', u'dozen', u'villagers', u'rest', u'already', u'killed', u'strange', u'part', u'take', u'trucks', u'reb

In [67]:
# Count how often each token occurs across all cleaned training reviews.
# Counter.update adds one count per element of the iterable it is given and
# treats unseen keys as zero, so no explicit "first time seen" branch is needed.
wordCountInTrainingData = Counter()
for reviewTokens in XTrain:
    wordCountInTrainingData.update(reviewTokens)
print("Number of unique words: %d." % len(wordCountInTrainingData))

[u'although', u'film', u'bruce', u'willis', u'always', u'worth', u'watching', u'better', u'skip', u'watched', u'television', u'plunk', u'cash', u'lucky', u'plot', u'develops', u'slowly', u'slowly', u'although', u'first', u'minutes', u'quite', u'believable', u'gets', u'unbelievable', u'towards', u'highly', u'questionable', u'seasoned', u'soldier', u'like', u'waters', u'would', u'disobey', u'direct', u'orders', u'even', u'would', u'rest', u'platoon', u'would', u'know', u'puts', u'direct', u'danger', u'know', u'certainly', u'follow', u'heck', u'says', u'despite', u'direct', u'orders', u'remember', u'still', u'nice', u'scenes', u'movie', u'somewhat', u'save', u'village', u'total', u'population', u'massacred', u'rebels', u'well', u'save', u'dozen', u'villagers', u'rest', u'already', u'killed', u'strange', u'part', u'take', u'trucks', u'rebels', u'left', u'behind', u'rather', u'foot', u'maybe', u'roads', u'unsafe', u'explanation', u'anyway', u'think', u'earned', u'movie', u'point', u'gave', 

running
time
flimsy
poorly-made
crime
film
[u'movie', u'despite', u'list', u'list', u'celebs', u'complete', u'waste', u'minutes', u'plot', u'peaks', u'predictable', u'silly', u'believe', u'taking', u'time', u'even', u'write', u'review', u'flex', u'credit', u'grown', u'ability', u'since', u'playing', u'michael', u'jackson', u'made', u'movie', u'years', u'tangi', u'hand', u'regressed', u'talented', u'role', u'felicity', u'flunkie', u'years', u'watching', u'train', u'wreck', u'film', u'pitiful', u'production', u'horrible', u'sound', u'quality', u'four', u'letter', u'words', u'came', u'mind', u'qualify', u'thought', u'film', u'however', u'effort', u'keep', u'writings', u'rated', u'simply', u'film', u'another', u'four', u'letter', u'word', u'starting', u'lame']
movie
despite
list
list
celebs
complete
waste
minutes
plot
peaks
predictable
silly
believe
taking
time
even
write
review
flex
credit
grown
ability
since
playing
michael
jackson
made
movie
years
tangi
hand
regressed
talented
role
feli

hell
seed
passed
another
brother.
halloween
moments
wrong
know
worse
sequels
hellraiser
anyone
scene
toward
film
michael
charging
deep
corridor
particularly
effective
ignore
screwball
plot
hopefully
producer
maybe
chance
make
sense
till
-ftm
[u'gave', u'movie', u'rating', u'awful', u'reason', u'even', u'instead', u'kingsley', u'always', u'shines', u'matter', u'terrible', u'material', u'thrown', u'way.', u'mira', u'sorvino', u'element', u'viewer', u'simply', u'fact', u'even', u'piece.', u'stupid', u'stupid', u'story', u'horrible', u'production', u'waste', u'video', u'rental']
gave
movie
rating
awful
reason
even
instead
kingsley
always
shines
matter
terrible
material
thrown
way.
mira
sorvino
element
viewer
simply
fact
even
piece.
stupid
stupid
story
horrible
production
waste
video
rental
[u'awaited', u'movie', u'thought', u'himesh', u'acting', u'alas', u'hope', u'went', u'wrong..given', u'heroine', u'thought', u'considering', u'actress..may', u'boby', u'wants', u'work', u'called', u'film

traditional
marriage
even
religiously-based
movies
like
running
karma
2003
samsara
2001
entertained
occasion
difference
films
actually
interesting
psychological
content
character
development
whereas
lost
beijing
virtually
none.
known
people
unorthodox
mindsets
exist
planet
without
kind
character
development
psychology
behind
acts
superficial
exposition
despicable
behavior
exactly
bing
bing
eventually
befriend
care
rapist
wife
rapist
accept
behavior
unconditionally
filmmakers
never
bothered
tell
even
obvious
juxtaposition
rich
poor
classes
ineptly
conceived
served
mere
situational
ploy
feels
bland
forgettable
filthy
opening
half
hour
subsides
reviewers
seem
confused
moral
ambiguity
complex
characterization
reason
choose
person
root
developed
properly
think
movie
complex
characters
clearly
defined
contrary
reason
clearly
defined
know
nothing
thinking
hardly
positive
attribute
movie
positive
side
camera-work
acting
quite
good
everything
else
gets
duller
duller
film
progresses
place
alongs

'jokes
funny
vomit
inducing
predictable
start
dialogue
cliché
awful-
especially
last
line
gives
wings
love
whatever
remember
cringing
wishing
would
hurry
finish
either-
people
went
thought
boring
please
watch
asterix
obelix
cleopatra
film
going
watch
waste
time
[u'laughed', u'much', u'long', u'time', u'although', u'movie', u'moments', u'especially', u'changes', u'hyper-funny', u'honest', u'serious', u'characters', u'realistic', u'times', u'sappy', u'sometimes', u'quite', u'believable', u'jerry', u'springer', u'show', u'feel', u'sorry', u'participating', u'people', u'film', u'instead', u'satire', u'great.', u'expletives', u'*beeped*', u'movie', u'aired', u'public', u'takes', u'rent', u'movie', u'fully', u'enjoy']
laughed
much
long
time
although
movie
moments
especially
changes
hyper-funny
honest
serious
characters
realistic
times
sappy
sometimes
quite
believable
jerry
springer
show
feel
sorry
participating
people
film
instead
satire
great.
expletives
*beeped*
movie
aired
public
takes
re