In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the labelled reviews. The file is tab-separated; quoting=3 is
# csv.QUOTE_NONE, so quote characters inside reviews are kept as plain text.
data = pd.read_csv(
    "labeledTrainData.tsv",
    sep="\t",
    quoting=3,
)

In [3]:
from bs4 import BeautifulSoup
import re
import nltk


In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop-word list and a single lemmatizer instance shared by later cells.
stop = stopwords.words("english")
wordnet_lemmatizer = WordNetLemmatizer()

# Data cleaning

In [5]:
def clean_data(raw_text,stopwords=stop):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case and split on whitespace, drop stop words, then
    lemmatize each remaining token as a verb.

    Parameters
    ----------
    raw_text : str
        Review text, possibly containing HTML tags.
    stopwords : iterable of str, optional
        Words to discard; defaults to the module-level ``stop`` list.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so output can differ
    # between machines.
    text = BeautifulSoup(raw_text, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    tokens = letters_only.lower().split()
    # A set makes each membership test O(1) instead of scanning the list.
    stop_set = set(stopwords)
    kept = [token for token in tokens if token not in stop_set]
    lemmas = [wordnet_lemmatizer.lemmatize(token, 'v') for token in kept]
    return " ".join(lemmas)

In [6]:
# Clean every review once and store the result alongside the raw text.
data["clean_review"] = data["review"].apply(clean_data)

In [7]:
# Peek at the cleaned text of the row labelled 0.
data.loc[0, "clean_review"]

'stuff go moment mj start listen music watch odd documentary watch wiz watch moonwalker maybe want get certain insight guy think really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember go see cinema originally release subtle message mj feel towards press also obvious message drug bad kay visually impressive course michael jackson unless remotely like mj anyway go hate find bore may call mj egotist consent make movie mj fan would say make fan true really nice actual feature film bite finally start minutes exclude smooth criminal sequence joe pesci convince psychopathic powerful drug lord want mj dead bad beyond mj overhear plan nah joe pesci character rant want people know supply drug etc dunno maybe hat mj music lot cool things like mj turn car robot whole speed demon sequence also director must patience saint come film kiddy bad sequence usually directors hate work one kid let alone whole bunch perform complex dance scene botto

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

X = data['clean_review']
y = data['sentiment']

# Hold out 33% of the reviews; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Bag-of-words counts capped at 1000 features; the vocabulary is learned
# from the training reviews only and then applied to both splits.
vector = CountVectorizer(max_features=1000)
vector.fit(X_train.tolist())
X_train_vector = vector.transform(X_train.tolist()).toarray()
X_test_vector = vector.transform(X_test.tolist()).toarray()

X_train_vector.shape, X_test_vector.shape

((16750, 1000), (8250, 1000))

## KNN k=3 without IG

In [10]:
import time

from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 3 nearest neighbours on the raw bag-of-words counts.
knn = KNeighborsClassifier(n_neighbors=3)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
now = time.perf_counter()
knn.fit(X_train_vector, Y_train)
Y_pred = knn.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)

              precision    recall  f1-score   support

           0       0.63      0.59      0.61      4105
           1       0.62      0.66      0.64      4145

    accuracy                           0.63      8250
   macro avg       0.63      0.63      0.63      8250
weighted avg       0.63      0.63      0.63      8250

12.58481764793396


## KNN k=10 without IG

In [9]:
import time

from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# Same bag-of-words features, with a neighbourhood of 10.
knn = KNeighborsClassifier(n_neighbors=10)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
now = time.perf_counter()
knn.fit(X_train_vector, Y_train)
Y_pred = knn.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)

              precision    recall  f1-score   support

           0       0.65      0.67      0.66      4105
           1       0.66      0.65      0.66      4145

    accuracy                           0.66      8250
   macro avg       0.66      0.66      0.66      8250
weighted avg       0.66      0.66      0.66      8250

25.88218402862549


# Mutual information (information gain, IG) for feature selection

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif

# Inputs for feature scoring: cleaned review text and the sentiment label.
X = data['clean_review']
y = data['sentiment']


In [13]:
# Count matrix used only for scoring terms: ignore terms appearing in more
# than 95% of reviews or in fewer than 2, and keep at most 10000 features.
cv = CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')
X_vec = cv.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Map each vocabulary term to its mutual information with the label.
# NOTE(review): X is the full dataset, including rows that are later held out
# for testing, so this selection has seen test labels — confirm it is intended.
mi_scores = mutual_info_classif(X_vec, y, discrete_features=True)
res = dict(zip(feature_names, mi_scores))


In [14]:
# Vocabulary size, then the number of terms whose score reaches 0.0001.
print(len(res))
count = sum(1 for score in res.values() if score >= 0.0001)
print(count)

10000
6044


In [15]:
def select_features(text,res=res,threshold=0.0001):
    """Keep only the words of ``text`` whose score reaches ``threshold``.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.
    res : dict
        Maps a word to its score; defaults to the module-level ``res``.
        Words missing from the mapping are dropped.
    threshold : float, optional
        Minimum score (inclusive) a word needs to be kept. The default
        equals the previously hard-coded cut-off of 0.0001.

    Returns
    -------
    str
        The surviving words in their original order, joined by single spaces.
    """
    kept = []
    for word in text.split():
        score = res.get(word)  # one lookup instead of `in` plus indexing
        if score is not None and score >= threshold:
            kept.append(word)
    return " ".join(kept)

In [16]:
# Reduce each cleaned review to the words that passed feature selection.
data["sel_feature"] = data["clean_review"].apply(select_features)

In [13]:
# Show the filtered reviews (pandas truncates the display).
data["sel_feature"]

0        stuff moment mj start listen music watch docum...
1        classic war worlds timothy entertain film obvi...
2        film start manager nicholas welcome robert car...
3        assume film greatest film opera read care oper...
4        superbly trashy unpretentious exploitation pre...
                               ...                        
24995    like imdb review film review think happen dres...
24996    believe make film completely unnecessary film ...
24997    guy loser girls need build pick stronger succe...
24998    minute documentary make early poorest opinion ...
24999    saw movie child break heart story end grow gre...
Name: sel_feature, Length: 25000, dtype: object

# TF-IDF for feature extraction

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X = data['sel_feature']
y = data['sentiment']

# Same 33% hold-out and seed as the earlier split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Learn the TF-IDF vocabulary and weights from the training split only.
tf_idf = TfidfVectorizer()
X_train_vector = tf_idf.fit_transform(X_train)

X_train_vector.shape

(16750, 6044)

In [18]:
# Reuse the fitted vectorizer so test rows share the training feature space.
X_test_vector = tf_idf.transform(X_test)

X_test_vector.shape

(8250, 6044)

# KNN K=3 with IG

In [19]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# 3 nearest neighbours on the TF-IDF features of the selected words.
knn = KNeighborsClassifier(n_neighbors=3)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
now = time.perf_counter()
knn.fit(X_train_vector, Y_train)
Y_pred = knn.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)

              precision    recall  f1-score   support

           0       0.79      0.71      0.75      4105
           1       0.74      0.81      0.77      4145

    accuracy                           0.76      8250
   macro avg       0.76      0.76      0.76      8250
weighted avg       0.76      0.76      0.76      8250

11.142491817474365


## KNN K=7 with IG

In [20]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# 7 nearest neighbours on the TF-IDF features of the selected words.
knn = KNeighborsClassifier(n_neighbors=7)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
now = time.perf_counter()
knn.fit(X_train_vector, Y_train)
Y_pred = knn.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)

              precision    recall  f1-score   support

           0       0.81      0.73      0.77      4105
           1       0.76      0.83      0.79      4145

    accuracy                           0.78      8250
   macro avg       0.78      0.78      0.78      8250
weighted avg       0.78      0.78      0.78      8250

11.535353422164917


# KNN k=10 with IG

In [21]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# 10 nearest neighbours on the TF-IDF features of the selected words.
knn = KNeighborsClassifier(n_neighbors=10)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
now = time.perf_counter()
knn.fit(X_train_vector, Y_train)
Y_pred = knn.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)

              precision    recall  f1-score   support

           0       0.78      0.80      0.79      4105
           1       0.79      0.78      0.79      4145

    accuracy                           0.79      8250
   macro avg       0.79      0.79      0.79      8250
weighted avg       0.79      0.79      0.79      8250

11.23954701423645


## SVM

In [22]:
from sklearn import svm

# Support vector classifier with scikit-learn's default settings.
s = svm.SVC()

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
now = time.perf_counter()
s.fit(X_train_vector, Y_train)
Y_pred = s.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      4105
           1       0.88      0.91      0.89      4145

    accuracy                           0.89      8250
   macro avg       0.89      0.89      0.89      8250
weighted avg       0.89      0.89      0.89      8250

462.7750928401947


## Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression.
lr = LogisticRegression(penalty='l2')

# perf_counter() is a monotonic, high-resolution clock; it matters here
# because the whole fit/predict cycle takes well under a second.
now = time.perf_counter()
lr.fit(X_train_vector, Y_train)
Y_pred = lr.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      4105
           1       0.87      0.90      0.89      4145

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250

0.4799950122833252


## Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported metrics are repeatable from run to run;
# 42 is the same seed used for the train/test split.
rf = RandomForestClassifier(random_state=42)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
now = time.perf_counter()
rf.fit(X_train_vector, Y_train)
Y_pred = rf.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)

              precision    recall  f1-score   support

           0       0.84      0.85      0.85      4105
           1       0.85      0.84      0.85      4145

    accuracy                           0.85      8250
   macro avg       0.85      0.85      0.85      8250
weighted avg       0.85      0.85      0.85      8250

31.4950749874115


## Multinomial NB

In [25]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with scikit-learn's default settings.
nb = MultinomialNB()

# perf_counter() is a monotonic, high-resolution clock; it matters here
# because the whole fit/predict cycle takes only a few hundredths of a second.
now = time.perf_counter()
nb.fit(X_train_vector, Y_train)
Y_pred = nb.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      4105
           1       0.87      0.86      0.86      4145

    accuracy                           0.86      8250
   macro avg       0.86      0.86      0.86      8250
weighted avg       0.86      0.86      0.86      8250

0.022989511489868164


## Decision Tree

In [26]:
from sklearn import tree

# Seed the tree so tie-breaking between equally good splits is repeatable;
# 42 is the same seed used for the train/test split.
dt = tree.DecisionTreeClassifier(random_state=42)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
now = time.perf_counter()
dt.fit(X_train_vector, Y_train)
Y_pred = dt.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)

              precision    recall  f1-score   support

           0       0.71      0.72      0.72      4105
           1       0.72      0.71      0.72      4145

    accuracy                           0.72      8250
   macro avg       0.72      0.72      0.72      8250
weighted avg       0.72      0.72      0.72      8250

15.415213584899902


## XGBoost

In [27]:
from xgboost import XGBClassifier

# Small boosted ensemble: 10 trees of depth at most 5, with alpha=10.
xb = XGBClassifier(max_depth=5, alpha=10, n_estimators=10)

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
now = time.perf_counter()
xb.fit(X_train_vector, Y_train)
Y_pred = xb.predict(X_test_vector)
fin = time.perf_counter() - now  # seconds spent fitting plus predicting

print(classification_report(Y_test, Y_pred))
print(fin)



              precision    recall  f1-score   support

           0       0.82      0.68      0.75      4105
           1       0.73      0.86      0.79      4145

    accuracy                           0.77      8250
   macro avg       0.78      0.77      0.77      8250
weighted avg       0.78      0.77      0.77      8250

3.2723329067230225
