In [275]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

import os
import nltk
import nltk.corpus
from sklearn.feature_extraction.text import CountVectorizer

In [276]:
# nltk.download() 
print(os.listdir(nltk.data.find("corpora")))

['brown', 'conll2000.zip', 'brown.zip', 'wordnet.zip', 'movie_reviews', 'conll2000', 'wordnet', 'movie_reviews.zip']


In [277]:
#Import movie reviews and check categories
from nltk.corpus import movie_reviews
movie_reviews.categories()

['neg', 'pos']

In [278]:
pos_rev = movie_reviews.fileids('pos')
len(pos_rev)

1000

In [279]:
neg_rev = movie_reviews.fileids('neg')
len(neg_rev)

1000

In [280]:
#List positive review files
pos_rev[0:10]

['pos/cv000_29590.txt',
 'pos/cv001_18431.txt',
 'pos/cv002_15918.txt',
 'pos/cv003_11664.txt',
 'pos/cv004_11636.txt',
 'pos/cv005_29443.txt',
 'pos/cv006_15448.txt',
 'pos/cv007_4968.txt',
 'pos/cv008_29435.txt',
 'pos/cv009_29592.txt']

In [281]:
#Select one positive review
rev = movie_reviews.words('pos/cv000_29590.txt')
rev

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [282]:
#Parse and append tokens to a new list
rev_list = []
for rev in neg_rev:
    rev_text_neg = movie_reviews.words(rev)
    rev_one_string = " ".join(rev_text_neg)
    rev_one_string = rev_one_string.replace(' ,',',')
    rev_one_string = rev_one_string.replace(' .',',')
    rev_one_string = rev_one_string.replace("\' ","'")
    rev_one_string = rev_one_string.replace(" \'","'")
    rev_list.append(rev_one_string)
# rev_list

In [283]:
rev_list[1]

'the happy bastard\'s quick movie review damn that y2k bug, it\'s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on, little do they know the power within,,, going for the gore and bringing on a few action sequences here and there, virus still feels very empty, like a movie going for all flash and no substance, we don\'t know why the crew was really out in the middle of nowhere, we don\'t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ), and, of course, we don\'t know why donald sutherland is stumbling around drunkenly throughout, here, it\'s just " hey, let\'s chase these people around with some robots ", the acting is below average, even from the likes of curtis, you\'re more likely to get a kick out of her work in halloween h20, sutherla

In [284]:
for rev in pos_rev:
    rev_text_pos = movie_reviews.words(rev)
    rev_one_string = " ".join(rev_text_pos)
    rev_one_string = rev_one_string.replace(' ,',',')
    rev_one_string = rev_one_string.replace(' .',',')
    rev_one_string = rev_one_string.replace("\' ","'")
    rev_one_string = rev_one_string.replace(" \'","'")
    rev_list.append(rev_one_string)
# rev_list

In [285]:
len(rev_list)

2000

In [286]:
neg_targets = np.zeros((1000,),dtype = np.int)
pos_targets = np.ones((1000,),dtype = np.int)

In [287]:
target_list = []
for neg_tar in neg_targets:
    target_list.append(neg_tar)
for pos_tar in pos_targets:
    target_list.append(pos_tar)

In [288]:
len(target_list)

2000

In [289]:
y = pd.Series(target_list)

In [290]:
type(y)

pandas.core.series.Series

In [291]:
y.head()

0    0
1    0
2    0
3    0
4    0
dtype: int64

In [292]:
#Use bag of words or count vectorizer to make features
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer(lowercase = True, stop_words = 'english', min_df = 2)

In [293]:
X_count_vec = count_vec.fit_transform(rev_list)

In [294]:
X_count_vec.shape

(2000, 23784)

In [295]:
#All X features names in count_vectorizer
X_names = count_vec.get_feature_names()
X_names[0:10]

['00', '000', '007', '05', '10', '100', '1000', '100m', '101', '102']

In [296]:
X_count_df = pd.DataFrame(X_count_vec.toarray(),columns = X_names)
X_count_df.shape

(2000, 23784)

In [297]:
X_count_df.head(5)

Unnamed: 0,00,000,007,05,10,100,1000,100m,101,102,...,zoom,zooming,zooms,zoot,zorg,zorro,zucker,zuko,zwick,zwigoff
0,0,0,0,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [298]:
#Split data in train ad test sets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [299]:
X_train,X_test,y_train,y_test = train_test_split(X_count_df,y,test_size = 0.25, random_state = 5)

In [300]:
X_train.shape

(1500, 23784)

In [301]:
 X_test.shape

(500, 23784)

In [302]:
#Use Naive Bayes Classifier
#Using Gaussian
from sklearn.naive_bayes import GaussianNB

In [303]:
gnb = GaussianNB()
model = gnb.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [304]:
metrics.accuracy_score(y_test,y_pred)

0.654

In [305]:
score_model = confusion_matrix(y_test,y_pred)
score_model

array([[173,  85],
       [ 88, 154]])

In [306]:
accuracy_model = (score_model[0][0]+score_model[1][1])/(sum(score_model[0])+sum(score_model[1]))
accuracy_model

0.654

In [307]:
error_model = (score_model[0][1]+score_model[1][0])/(sum(score_model[0])+sum(score_model[1]))
error_model

0.346

In [308]:
#Using Multinomial because there are multiple features
from sklearn.naive_bayes import MultinomialNB

In [309]:
clf = MultinomialNB()
model2 = clf.fit(X_train,y_train)
y_pred2 = model2.predict(X_test)

In [310]:
metrics.accuracy_score(y_test,y_pred2)

0.798

In [311]:
score_model2 = confusion_matrix(y_test,y_pred2)
score_model2

array([[213,  45],
       [ 56, 186]])

In [312]:
accuracy_model2 = (score_model2[0][0]+score_model2[1][1])/(sum(score_model2[0])+sum(score_model2[1]))
accuracy_model2

0.798

In [313]:
error_model2 = (score_model2[0][1]+score_model2[1][0])/(sum(score_model2[0])+sum(score_model2[1]))
error_model2

0.202

# For checking the classifcation of a new review, we need to make sure that our model is trained well enough to compensate for the difference in tokens between heterogeneous datasets from different sources. For that, we probably need more data for training. Other than that, we may not be able to correctly classify heterogeneous samples (with different token attributes). 

In [314]:
#Preparing one new sample for clasification after training
def prepare_rev(review):
    review=review.values.flatten()
    review_cat = ''
    for element in review:
        review_cat += element
    
    review_words = review_cat.split(" ")
    
    rev_text = review_words
    rev_one_string = " ".join(rev_text)
    rev_one_string = rev_one_string.replace(' ,',',')
    rev_one_string = rev_one_string.replace(' .',',')
    rev_one_string = rev_one_string.replace("\' ","'")
    rev_one_string = rev_one_string.replace(" \'","'")
    rev_one_string

    rev_list=[]
    rev_list.append(rev_one_string)

    count_vec = CountVectorizer(lowercase = True, stop_words = 'english', min_df = 1)

    X_count_vec = count_vec.fit_transform(rev_list)

    X_names = count_vec.get_feature_names()
    X_names

    X_count_df = pd.DataFrame(X_count_vec.toarray(),columns = X_names)
    X_count_df.shape
    
    common=X_test.columns.intersection(X_count_df.columns)
    result = pd.DataFrame(0,index=range(1),columns = X_train.columns)
    
    for col in common:
        try:
            result.iloc[0][col]=X_count_df.iloc[0][col]
        except:
            continue
    print(result)
    return result

In [315]:
#Predicting category on new sample based on the trained model
def predict_category(review, model = model2):
    test_df = prepare_rev(review)
    pred = model.predict(test_df)
    if pred == 0:
        return "The review is negative"
    else:
        return "The review is positive"


In [316]:
review = pd.read_table("sample movie review1.txt",header = None)
review.head(10)

Unnamed: 0,0
0,"plot : two teen couples go to a church party ,..."
1,they get into an accident .
2,"one of the guys dies , but his girlfriend cont..."
3,what's the deal ?
4,"watch the movie and "" sorta "" find out . . ."
5,critique : a mind-fuck movie for the teen gene...
6,which is what makes this review an even harder...
7,they seem to have taken this pretty neat conce...
8,so what are the problems with the movie ?
9,"well , its main problem is that it's simply to..."


In [317]:
predict_category(review)

   00  000  007  05  10  100  1000  100m  101  102   ...     zoom  zooming  \
0   0    0    0   0  10    0     0     0    0    0   ...        0        0   

   zooms  zoot  zorg  zorro  zucker  zuko  zwick  zwigoff  
0      0     0     0      0       0     0      0        0  

[1 rows x 23784 columns]


'The review is negative'