In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
from sklearn.model_selection  import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC


In [6]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score

In [31]:
from sklearn.pipeline import Pipeline

In [9]:
# Load the tab-separated review file into a DataFrame.
input_data = pd.read_csv(
    filepath_or_buffer='data/moviereviews2.tsv',
    sep='\t',
)

In [10]:
# Dimensions of the loaded frame as (rows, columns).
input_data.shape

(6000, 2)

In [11]:
# Per-column summary; the review count falling short of the label count
# is the first hint that some reviews are missing.
input_data.describe()

Unnamed: 0,label,review
count,6000,5980
unique,2,5966
top,neg,Smallville episode Justice is the best episode...
freq,3000,2


In [12]:
# Count missing values in each column before any cleaning.
input_data.isna().sum()

label      0
review    20
dtype: int64

In [13]:
# Drop every row that contains a missing value. The name is rebound rather
# than mutated with inplace=True so the step stays chainable and does not
# modify the frame object produced by the loading cell.
input_data = input_data.dropna()

In [16]:
# Treat whitespace-only fields as missing and drop those rows.
# DataFrame.apply returns a new frame, so the result has to be assigned back;
# otherwise input_data is left unchanged and the null check that follows
# cannot see the blank entries. Surrounding whitespace is stripped from every
# value as part of this step.
input_data = input_data.apply(lambda col: col.str.strip().replace('', np.nan)).dropna()
input_data

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...
...,...,...
5995,pos,"Of the three remakes of this plot, I like them..."
5996,neg,Poor Whoopi Goldberg. Imagine her at a friend'...
5997,neg,"Honestly before I watched this movie, I had he..."
5998,pos,This movie is essentially shot on a hand held ...


In [17]:
# Re-check the per-column count of missing values after cleaning.
input_data.isna().sum()

label     0
review    0
dtype: int64

In [18]:
# Model input: the 'review' column.
X = input_data['review']

In [19]:
# Prediction target: the 'label' column.
y = input_data['label']

In [42]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [43]:
# Number of samples that ended up in the training split.
X_train.shape

(4006,)

In [44]:
# Two-stage pipeline: TF-IDF vectorisation of the raw text followed by an
# SVC with default settings. Note that the step named "clsfr" is the
# vectoriser, not the classifier; the classifier is the "model" step.
clsfr_pipeline = Pipeline(
    steps=[
        ("clsfr", TfidfVectorizer()),
        ("model", SVC()),
    ]
)

In [45]:
# Fit the vectoriser and the classifier on the training split only.
clsfr_pipeline.fit(X_train, y_train)

Pipeline(steps=[('clsfr', TfidfVectorizer()), ('model', SVC())])

In [46]:
# Predicted labels for the held-out test reviews.
predicted_valued = clsfr_pipeline.predict(X_test)

In [48]:
# Confusion matrix for the test split: rows are true labels, columns are
# predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=predicted_valued))

[[891 100]
 [ 61 922]]


In [50]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=predicted_valued))

              precision    recall  f1-score   support

         neg       0.94      0.90      0.92       991
         pos       0.90      0.94      0.92       983

    accuracy                           0.92      1974
   macro avg       0.92      0.92      0.92      1974
weighted avg       0.92      0.92      0.92      1974



In [52]:
# Overall fraction of test reviews classified correctly.
print(accuracy_score(y_true=y_test, y_pred=predicted_valued))

0.9184397163120568
