In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [47]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

## Read data

In [4]:
input_data = pd.read_csv('data/moviereviews.tsv', sep='\t')

In [6]:
input_data.shape

(2000, 2)

In [7]:
input_data.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [10]:
input_data.describe()

Unnamed: 0,label,review
count,2000,1965.0
unique,2,1939.0
top,neg,
freq,1000,27.0


In [11]:
input_data.info

<bound method DataFrame.info of      label                                             review
0      neg  how do films like mouse hunt get into theatres...
1      neg  some talented actresses are blessed with a dem...
2      pos  this has been an extraordinary year for austra...
3      pos  according to hollywood movies made in last few...
4      neg  my first press screening of 1998 and already i...
...    ...                                                ...
1995   pos  i like movies with albert brooks , and i reall...
1996   pos  it might surprise some to know that joel and e...
1997   pos  the verdict : spine-chilling drama from horror...
1998   pos  i want to correct what i wrote in a former ret...
1999   pos  a couple of months ago , when i first download...

[2000 rows x 2 columns]>

# Clean the data 

In [13]:
input_data.isnull().sum()

label      0
review    35
dtype: int64

## Remove the null

In [14]:
input_data.dropna(inplace=True)

In [15]:
input_data.isnull().sum()

label     0
review    0
dtype: int64

## Set all the empty rows to na

In [26]:
input_data = input_data.apply(lambda x: x.str.strip()).replace('', np.nan)

In [27]:
input_data.isnull().sum()

label     0
review    0
dtype: int64

In [28]:
input_data.dropna(inplace=True)

In [29]:
input_data.isnull().sum()

label     0
review    0
dtype: int64

## Split to train test data

In [32]:
X = input_data['review']

In [33]:
y = input_data['label']

In [36]:
X.shape


(1938,)

In [37]:
y.shape

(1938,)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [39]:
X_train.shape

(1298,)

In [41]:
X_test.shape

(640,)

In [42]:
clf_pipleline = Pipeline([('tfidf', TfidfVectorizer()), ('model', SVC())])

In [43]:
clf_pipleline.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('model', SVC())])

In [45]:
predict = clf_pipleline.predict(X_test)

In [49]:
print(confusion_matrix(y_test, predict))

[[253  55]
 [ 57 275]]


In [51]:
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

         neg       0.82      0.82      0.82       308
         pos       0.83      0.83      0.83       332

    accuracy                           0.82       640
   macro avg       0.82      0.82      0.82       640
weighted avg       0.83      0.82      0.83       640



In [52]:
print(accuracy_score(y_test, predict))

0.825
