# LOAD THE FILE INTO A DATAFRAME

In [1]:
import numpy as np
import pandas as pd

# Read the tab-separated reviews file; the path is relative to the
# notebook's working directory.
reviews_path = '../TextFiles/moviereviews2.tsv'
df = pd.read_csv(reviews_path, delimiter='\t')

# Preview the first five rows as the cell's output.
df.head()

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


## CHECK FOR NULL VALUES AND WHITESPACE-ONLY REVIEWS IN THE DATAFRAME AND REMOVE THEM

In [2]:
# Count the missing values in each column.
null_counts = df.isnull().sum()
null_counts

label      0
review    20
dtype: int64

In [3]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [4]:
# Flag reviews that are empty or contain only whitespace. After the null rows
# are dropped the remaining reviews are expected to be strings; .str.strip()
# yields NaN for any non-string value, which compares unequal to '' and is
# therefore not flagged (instead of raising, as a per-row .isspace() would).
blank_mask = df['review'].str.strip().eq('')

# Index labels of the blank reviews.
blanks = df.index[blank_mask].tolist()

# Actually remove the blank reviews so they never reach the model.
df = df.drop(index=blanks)

# Number of blank reviews that were found (and removed) by this cell.
len(blanks)

0

In [5]:
# Check the class balance: how many reviews carry each label.
label_counts = df['label'].value_counts()
label_counts

label
pos    2990
neg    2990
Name: count, dtype: int64

## SPLIT THE DATA INTO TRAINING SET AND TEST SET

In [6]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are their labels.
X, y = df['review'], df['label']

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## BUILD A PIPELINE TO VECTORIZE THE DATA, THEN TRAIN AND FIT THE MODEL

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Chain the TF-IDF vectorizer and the linear SVM so that raw review text can
# be passed straight to fit()/predict().
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(steps=pipeline_steps)

# Fit both steps on the training split.
text_clf.fit(X_train, y_train)

## PREDICT AND ANALYSE THE RESULT

In [10]:
# Predict a label for every held-out review using the fitted pipeline.
predictions = text_clf.predict(X_test)

In [13]:
from sklearn import metrics
# Per scikit-learn's convention, rows are the true labels and columns the
# predicted labels, both in sorted label order.
conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
print(conf_matrix)

[[900  91]
 [ 63 920]]


In [14]:
# Per-class precision, recall and F1 on the held-out split.
report_text = metrics.classification_report(y_true=y_test, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

         neg       0.93      0.91      0.92       991
         pos       0.91      0.94      0.92       983

    accuracy                           0.92      1974
   macro avg       0.92      0.92      0.92      1974
weighted avg       0.92      0.92      0.92      1974



In [16]:
# Fraction of held-out reviews whose predicted label matches the true label.
test_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print(test_accuracy)

0.9219858156028369


In [17]:
# Sanity-check the fitted pipeline on a hand-written favourable review.
positive_sample = ["That was a good movie.I am going to watch it again"]
text_clf.predict(positive_sample)

array(['pos'], dtype=object)

In [18]:
# Sanity-check the fitted pipeline on a hand-written unfavourable review.
negative_sample = ["Probably the worst movie I have ever seen"]
text_clf.predict(negative_sample)

array(['neg'], dtype=object)