In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [2]:
df = pd.read_csv("moviereviews.tsv", sep='\t')

In [3]:
df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [4]:
df.shape

(2000, 2)

In [7]:
df.isnull().sum()

label      0
review    35
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.shape

(1965, 2)

In [10]:
#This dataset has nulls and likely also have reviews with empty strings
empty_indexes = []

for ind, label, review in df.itertuples():
  if review.isspace():
    empty_indexes.append(ind)

In [11]:
len(empty_indexes)

27

In [12]:
df.drop(empty_indexes, inplace = True)

In [14]:
#After dropping nulls and empty spaces reviews the size is -
df.shape

(1938, 2)

In [15]:
# Now data is cleaned

X = df['review']
y = df['label']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33)

In [17]:
text_classifier = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [18]:
text_classifier.fit(X_train, y_train)

In [21]:
predictions = text_classifier.predict(X_test)

In [20]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [22]:
confusion_matrix(y_test, predictions)

array([[265,  55],
       [ 48, 272]])

In [24]:
classification_report(y_test, predictions)

'              precision    recall  f1-score   support\n\n         neg       0.85      0.83      0.84       320\n         pos       0.83      0.85      0.84       320\n\n    accuracy                           0.84       640\n   macro avg       0.84      0.84      0.84       640\nweighted avg       0.84      0.84      0.84       640\n'

In [25]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       320
         pos       0.83      0.85      0.84       320

    accuracy                           0.84       640
   macro avg       0.84      0.84      0.84       640
weighted avg       0.84      0.84      0.84       640



In [26]:
print(accuracy_score(y_test, predictions))

0.8390625
