In [2]:
# Performing standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Load the raw movie-review CSV; the path is relative to the working directory.
ds = pd.read_csv(filepath_or_buffer='movie-review/movie_review.csv')

In [4]:
# Remove the columns that later cells do not use.
# Rebinding `ds` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: executing it a second time is a no-op rather than a
# KeyError for columns that are already gone.
ds = ds.drop(columns=['fold_id', 'cv_tag', 'html_id', 'sent_id'], errors='ignore')

In [5]:
# Persist the trimmed frame as a tab-separated file, leaving out the row index.
ds.to_csv(path_or_buf='movie_review', sep='\t', index=False)

In [6]:
# Read the tab-separated 'movie_review' file back into a fresh DataFrame.
dataset = pd.read_csv('movie_review', delimiter='\t')

In [7]:
# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos


In [8]:
# Rename by column name rather than by position, so the two labels cannot be
# silently swapped if the column order in the file ever changes. rename()
# ignores keys that are absent, which also makes the cell safe to re-run.
dataset = dataset.rename(columns={'text': 'Review', 'tag': 'Verdict'})
# Fail loudly if the frame does not contain exactly the two expected columns.
assert sorted(dataset.columns) == ['Review', 'Verdict'], dataset.columns.tolist()

In [9]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64720 entries, 0 to 64719
Data columns (total 2 columns):
Review     64720 non-null object
Verdict    64720 non-null object
dtypes: object(2)
memory usage: 1011.4+ KB


In [10]:
# Count missing values in each column.
dataset.isna().sum()

Review     0
Verdict    0
dtype: int64

In [11]:
# Collect the index labels of reviews that consist solely of whitespace.
# Iterating over the 'Review' column by name (instead of unpacking every row
# tuple) does not depend on the frame having exactly two columns, and the
# isinstance guard keeps non-string values (e.g. NaN) from raising
# AttributeError; such values are simply not counted as blank.
blank = [
    idx
    for idx, text in dataset['Review'].items()
    if isinstance(text, str) and text.isspace()
]
# NOTE(review): no later cell in view consumes `blank`; if whitespace-only
# reviews are meant to be removed before training, that drop step is still missing.

In [12]:
# Features are the raw review strings; targets are the verdict labels.
X = np.asarray(dataset['Review'])
y = np.asarray(dataset['Verdict'])

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 33% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [15]:
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC

from sklearn.metrics import confusion_matrix, classification_report

In [16]:
# TF-IDF features feeding a linear SVM, bundled into a single estimator.
# random_state pins the random number generator used by LinearSVC's solver so
# that repeated runs of the notebook produce the same fitted model, in line
# with the seeded train/test split.
vectorized_classification = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

In [17]:
# Fit the vectoriser and the classifier on the training split.
vectorized_classification.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [18]:
# Predict labels for the held-out reviews.
y_pred = vectorized_classification.predict(X=X_test)

In [19]:
# Confusion matrix of true vs. predicted labels on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[7241, 3267],
       [3230, 7620]], dtype=int64)

In [20]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         neg       0.69      0.69      0.69     10508
         pos       0.70      0.70      0.70     10850

    accuracy                           0.70     21358
   macro avg       0.70      0.70      0.70     21358
weighted avg       0.70      0.70      0.70     21358



In [21]:
# Let's try another model
from sklearn.linear_model import LogisticRegression

In [22]:
# Same TF-IDF front end, this time with logistic regression as the classifier.
vectorized_classifier = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])
# fit() returns the fitted pipeline, so training and prediction can be chained.
y_pred = vectorized_classifier.fit(X_train, y_train).predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))



[[7060 3448]
 [3112 7738]]
              precision    recall  f1-score   support

         neg       0.69      0.67      0.68     10508
         pos       0.69      0.71      0.70     10850

    accuracy                           0.69     21358
   macro avg       0.69      0.69      0.69     21358
weighted avg       0.69      0.69      0.69     21358



In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Same TF-IDF front end, this time with a random forest as the classifier.
# Random forests are stochastic (bootstrap samples and feature subsets), so an
# unseeded forest gives different metrics on every run; fixing random_state
# makes the reported confusion matrix and report reproducible.
vectorized_classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42)),
])
vectorized_classifier.fit(X_train, y_train)
y_pred = vectorized_classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[6577 3931]
 [4918 5932]]
              precision    recall  f1-score   support

         neg       0.57      0.63      0.60     10508
         pos       0.60      0.55      0.57     10850

    accuracy                           0.59     21358
   macro avg       0.59      0.59      0.59     21358
weighted avg       0.59      0.59      0.59     21358



"films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before ."

In [56]:
# Classify a single hand-written review with the LinearSVC pipeline.
# The pipeline takes a collection of documents, so the string is wrapped in a
# one-element list.
review = "The book is perfect and so is the movie"
vectorized_classification.predict([review])

array(['pos'], dtype=object)

In [57]:
# Number of held-out samples, shown as a shape tuple.
np.shape(X_test)

(21358,)