In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Read the labelled movie-review dataset from a CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='moviereviews.csv')

In [4]:
# Peek at the first five rows to confirm the label/review layout.
df.head(n=5)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [5]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
df.isna().sum()

label      0
review    35
dtype: int64

In [6]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [7]:
# Show the rows whose review text consists solely of whitespace characters.
blank_mask = df['review'].str.isspace()
df[blank_mask]

Unnamed: 0,label,review
57,neg,
71,pos,
147,pos,
151,pos,
283,pos,
307,pos,
313,neg,
323,pos,
343,pos,
351,neg,


In [8]:
# Summarise the frame after the NaN rows were dropped: entry count, per-column
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1965 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1965 non-null   object
 1   review  1965 non-null   object
dtypes: object(2)
memory usage: 46.1+ KB


In [9]:
# Discard reviews made up solely of whitespace characters.
is_blank = df['review'].str.isspace()
df = df.loc[~is_blank]

In [10]:
# Check the class balance of the cleaned data (number of reviews per label).
df['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Find the 20 most frequent non-stop-word tokens across the negative reviews.
count_vect = CountVectorizer(stop_words='english')
# Single .loc selection instead of chained df[...][...] indexing.
neg_reviews = df.loc[df['label'] == 'neg', 'review']
matrix = count_vect.fit_transform(neg_reviews)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    vocab = count_vect.get_feature_names_out().tolist()
else:
    vocab = count_vect.get_feature_names()

# Column sums of the document-term matrix give the total count of each token;
# flatten to a plain list of Python ints so the printed tuples stay readable.
counts = np.asarray(matrix.sum(axis=0)).ravel().tolist()

freqs = zip(vocab, counts)
# Sort by descending count and keep the top 20 (token, count) pairs.
print (sorted(freqs, key=lambda x: -x[1])[:20])

[('film', 4063), ('movie', 3131), ('like', 1808), ('just', 1480), ('time', 1127), ('good', 1117), ('bad', 997), ('character', 926), ('story', 908), ('plot', 888), ('characters', 838), ('make', 813), ('really', 743), ('way', 734), ('little', 696), ('don', 683), ('does', 666), ('doesn', 648), ('action', 635), ('scene', 634)]




In [13]:
# Raw review text is the feature; the pos/neg label is the target.
X, y = df['review'], df['label']

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the reviews for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

**Pipeline that will both create a TF-IDF vector out of the raw text data and fit a supervised learning model of your choice. Then fit that pipeline on the training data.**

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [17]:
# TF-IDF settings: drop English stop words and ignore any term that appears
# in more than 70% of the documents seen during fitting.
tfidf_options = {'stop_words': 'english', 'max_df': 0.7}
vectorizer = TfidfVectorizer(**tfidf_options)

In [24]:
# Learn the vocabulary and IDF weights from the training split only, then apply
# them to the test split. Tuple elements are evaluated left to right, so the
# fit always happens before the test-set transform.
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier (library defaults) on the TF-IDF training matrix.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)


In [26]:
# Predict labels for the held-out reviews with the logistic-regression model.
# NOTE: `y_pred` is reassigned by the KNN and SVM cells further down, so the
# metrics cell must be run right after this one to describe this model.
y_pred = model.predict(X_test_tfidf)

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Report accuracy, the confusion matrix and the per-class precision/recall/F1
# for whatever predictions are currently stored in `y_pred`.
for report in (
    f'Accuracy: {accuracy_score(y_test, y_pred)}',
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)

Accuracy: 0.8247422680412371
[[162  29]
 [ 39 158]]
              precision    recall  f1-score   support

         neg       0.81      0.85      0.83       191
         pos       0.84      0.80      0.82       197

    accuracy                           0.82       388
   macro avg       0.83      0.83      0.82       388
weighted avg       0.83      0.82      0.82       388



K-Nearest Neighbours

In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [29]:
# Fit a k-nearest-neighbours classifier that votes among the 5 closest training reviews.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train_tfidf, y=y_train)


In [30]:
# Predict labels for the held-out reviews with the KNN model.
# NOTE: this overwrites the `y_pred` produced by the logistic-regression cell.
y_pred = knn.predict(X_test_tfidf)

In [31]:
# Report accuracy, the confusion matrix and the per-class precision/recall/F1
# for whatever predictions are currently stored in `y_pred`.
for report in (
    f'Accuracy: {accuracy_score(y_test, y_pred)}',
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)

Accuracy: 0.654639175257732
[[112  79]
 [ 55 142]]
              precision    recall  f1-score   support

         neg       0.67      0.59      0.63       191
         pos       0.64      0.72      0.68       197

    accuracy                           0.65       388
   macro avg       0.66      0.65      0.65       388
weighted avg       0.66      0.65      0.65       388



Support Vector Machine (SVM)

In [32]:
from sklearn.svm import SVC

In [33]:
# Fit a support-vector classifier with a linear kernel on the TF-IDF training matrix.
svm = SVC(kernel='linear')
svm.fit(X=X_train_tfidf, y=y_train)

In [34]:
# Predict labels for the held-out reviews with the linear SVM.
# NOTE: this overwrites the `y_pred` produced by the earlier model cells.
y_pred = svm.predict(X_test_tfidf)

In [35]:
# Report accuracy, the confusion matrix and the per-class precision/recall/F1
# for whatever predictions are currently stored in `y_pred`.
for report in (
    f'Accuracy: {accuracy_score(y_test, y_pred)}',
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)

Accuracy: 0.8118556701030928
[[156  35]
 [ 38 159]]
              precision    recall  f1-score   support

         neg       0.80      0.82      0.81       191
         pos       0.82      0.81      0.81       197

    accuracy                           0.81       388
   macro avg       0.81      0.81      0.81       388
weighted avg       0.81      0.81      0.81       388

