In [1]:
import numpy as np
import pandas as pd

In [2]:
data=pd.read_csv("labeledTrainData.tsv", sep="\t", quoting=3)

In [3]:
from bs4 import BeautifulSoup
import re
import nltk


In [4]:
from nltk.corpus import stopwords
stop=stopwords.words("english")
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer=WordNetLemmatizer()

# data clean

In [5]:
def clean_data(raw_text,stopwords=stop):
    remove_html_tags=BeautifulSoup(raw_text).get_text()
    remove_char=re.sub("[^a-zA-Z]"," ",remove_html_tags)
    lower_text=remove_char.lower().split()
    remove_stop=[i for i in lower_text if not i in stopwords]
    lemmatized_text=[wordnet_lemmatizer.lemmatize(word,'v') for word in remove_stop]
    return " ".join(lemmatized_text)

In [6]:
data['clean_review']=data['review'].apply(clean_data)

In [8]:
data['clean_review'][0]

'stuff go moment mj start listen music watch odd documentary watch wiz watch moonwalker maybe want get certain insight guy think really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember go see cinema originally release subtle message mj feel towards press also obvious message drug bad kay visually impressive course michael jackson unless remotely like mj anyway go hate find bore may call mj egotist consent make movie mj fan would say make fan true really nice actual feature film bite finally start minutes exclude smooth criminal sequence joe pesci convince psychopathic powerful drug lord want mj dead bad beyond mj overhear plan nah joe pesci character rant want people know supply drug etc dunno maybe hat mj music lot cool things like mj turn car robot whole speed demon sequence also director must patience saint come film kiddy bad sequence usually directors hate work one kid let alone whole bunch perform complex dance scene botto

# Mutual Chi2 for feature selection

In [7]:
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
X=data['clean_review']
y=data['sentiment']


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf=TfidfVectorizer()
X_sel=tf_idf.fit_transform(X)
X_sel.shape


(25000, 62994)

In [30]:
features=SelectKBest(chi2,k=1000)
X_vec=features.fit_transform(X_sel,y)

In [31]:

from sklearn.model_selection import train_test_split
X_train_vector, X_test_vector, Y_train, Y_test=train_test_split(X_vec,y, test_size=0.33, random_state=42)


# KNN K=3 with Chi2

In [32]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_vector,Y_train)
Y_pred=knn.predict(X_test_vector)
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.67      0.62      0.65      4105
           1       0.65      0.70      0.67      4145

    accuracy                           0.66      8250
   macro avg       0.66      0.66      0.66      8250
weighted avg       0.66      0.66      0.66      8250



## KNN K=7 with Chi2

In [33]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_vector,Y_train)
Y_pred=knn.predict(X_test_vector)
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.70      0.65      0.67      4105
           1       0.67      0.72      0.70      4145

    accuracy                           0.68      8250
   macro avg       0.68      0.68      0.68      8250
weighted avg       0.68      0.68      0.68      8250



# KNN with k=10 with Chi2 

In [34]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train_vector,Y_train)
Y_pred=knn.predict(X_test_vector)
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.66      0.75      0.71      4105
           1       0.72      0.62      0.67      4145

    accuracy                           0.69      8250
   macro avg       0.69      0.69      0.69      8250
weighted avg       0.69      0.69      0.69      8250



## SVM

In [35]:
from sklearn import svm
s=svm.SVC()
s.fit(X_train_vector, Y_train)
Y_pred=s.predict(X_test_vector)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4105
           1       0.88      0.89      0.88      4145

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



## Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(penalty='l2')
lr.fit(X_train_vector, Y_train)
Y_pred=lr.predict(X_test_vector)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87      4105
           1       0.86      0.89      0.88      4145

    accuracy                           0.87      8250
   macro avg       0.87      0.87      0.87      8250
weighted avg       0.87      0.87      0.87      8250



## Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X_train_vector, Y_train)
Y_pred=rf.predict(X_test_vector)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.83      0.84      0.84      4105
           1       0.84      0.83      0.84      4145

    accuracy                           0.84      8250
   macro avg       0.84      0.84      0.84      8250
weighted avg       0.84      0.84      0.84      8250



## Multinomial NB

In [38]:
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB()
nb.fit(X_train_vector, Y_train)
Y_pred=nb.predict(X_test_vector)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87      4105
           1       0.87      0.87      0.87      4145

    accuracy                           0.87      8250
   macro avg       0.87      0.87      0.87      8250
weighted avg       0.87      0.87      0.87      8250



## Decision Tree

In [39]:
from sklearn import tree
dt=tree.DecisionTreeClassifier()
dt.fit(X_train_vector, Y_train)
Y_pred=dt.predict(X_test_vector)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.72      0.71      0.72      4105
           1       0.72      0.73      0.72      4145

    accuracy                           0.72      8250
   macro avg       0.72      0.72      0.72      8250
weighted avg       0.72      0.72      0.72      8250



## XGBoost

In [40]:
from xgboost import XGBClassifier
xb=XGBClassifier(max_depth=5, alpha=10, n_estimators=10)
xb.fit(X_train_vector, Y_train)
Y_pred=xb.predict(X_test_vector)
print(classification_report(Y_test,Y_pred))



              precision    recall  f1-score   support

           0       0.82      0.70      0.75      4105
           1       0.74      0.84      0.79      4145

    accuracy                           0.77      8250
   macro avg       0.78      0.77      0.77      8250
weighted avg       0.78      0.77      0.77      8250

