In [5]:
import numpy as np
import pandas as pd
import time

## Load dataset

In [3]:
data=pd.read_csv("labeledTrainData.tsv", sep="\t", quoting=3)

In [4]:
data.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [6]:
data.shape

(25000, 3)

In [7]:
print(data['review'][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

## Data cleaning

In [8]:
from bs4 import BeautifulSoup
import re
import nltk


In [9]:
from nltk.corpus import stopwords
stop=stopwords.words("english")
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer=WordNetLemmatizer()

In [10]:
def clean_data(raw_text,stopwords=stop):
    remove_html_tags=BeautifulSoup(raw_text).get_text()
    remove_char=re.sub("[^a-zA-Z]"," ",remove_html_tags)
    lower_text=remove_char.lower().split()
    remove_stop=[i for i in lower_text if not i in stopwords]
    lemmatized_text=[wordnet_lemmatizer.lemmatize(word,'v') for word in remove_stop]
    return " ".join(lemmatized_text)

In [11]:
data['clean_review']=data['review'].apply(clean_data)

In [12]:
data['clean_review'][0]

'stuff go moment mj start listen music watch odd documentary watch wiz watch moonwalker maybe want get certain insight guy think really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember go see cinema originally release subtle message mj feel towards press also obvious message drug bad kay visually impressive course michael jackson unless remotely like mj anyway go hate find bore may call mj egotist consent make movie mj fan would say make fan true really nice actual feature film bite finally start minutes exclude smooth criminal sequence joe pesci convince psychopathic powerful drug lord want mj dead bad beyond mj overhear plan nah joe pesci character rant want people know supply drug etc dunno maybe hat mj music lot cool things like mj turn car robot whole speed demon sequence also director must patience saint come film kiddy bad sequence usually directors hate work one kid let alone whole bunch perform complex dance scene botto

## Count-Vectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [14]:
X=data['clean_review']
y=data['sentiment']

In [15]:
X_train, X_test, Y_train, Y_test=train_test_split(X,y, test_size=0.33, random_state=42)

In [16]:
vector=CountVectorizer(max_features=1000)
vector.fit(X_train.tolist())

CountVectorizer(max_features=1000)

In [17]:
X_train_vector=vector.transform(X_train.tolist()).toarray()

In [18]:
X_test_vector=vector.transform(X_test.tolist()).toarray()

In [19]:
X_train_vector.shape, X_test_vector.shape

((16750, 1000), (8250, 1000))

## Naive-Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
nb=GaussianNB()
now=time.time()
nb.fit(X_train_vector, Y_train)
Y_pred=nb.predict(X_test_vector)
final=time.time()-now
print(classification_report(Y_test,Y_pred))
print(final)

              precision    recall  f1-score   support

           0       0.74      0.84      0.79      4105
           1       0.82      0.72      0.76      4145

    accuracy                           0.78      8250
   macro avg       0.78      0.78      0.78      8250
weighted avg       0.78      0.78      0.78      8250

0.8544011116027832


In [44]:
filename='nb_BoG.sav'
pickle.dump(nb,open(filename,'wb'))

NameError: name 'pickle' is not defined

## Multinomial NB

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
nb=MultinomialNB()
now=time.time()
nb.fit(X_train_vector, Y_train)
Y_pred=nb.predict(X_test_vector)
final=time.time()-now
print(classification_report(Y_test,Y_pred))
print(final)

              precision    recall  f1-score   support

           0       0.84      0.82      0.83      4105
           1       0.83      0.85      0.84      4145

    accuracy                           0.84      8250
   macro avg       0.84      0.84      0.84      8250
weighted avg       0.84      0.84      0.84      8250

1.369835615158081


In [46]:
filename='mnb_BoG.sav'
pickle.dump(nb,open(filename,'wb'))

NameError: name 'pickle' is not defined

## Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
rf=RandomForestClassifier()
now=time.time()
rf.fit(X_train_vector, Y_train)
Y_pred=rf.predict(X_test_vector)
final=time.time()-now
print(classification_report(Y_test,Y_pred))
print(final)

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      4105
           1       0.83      0.83      0.83      4145

    accuracy                           0.83      8250
   macro avg       0.83      0.83      0.83      8250
weighted avg       0.83      0.83      0.83      8250

21.839027643203735


## SVM

In [23]:
from sklearn import svm
s=svm.SVC()
now=time.time()
s.fit(X_train_vector, Y_train)
Y_pred=s.predict(X_test_vector)
final=time.time()-now
print(classification_report(Y_test,Y_pred))
print(final)

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      4105
           1       0.84      0.88      0.86      4145

    accuracy                           0.86      8250
   macro avg       0.86      0.86      0.86      8250
weighted avg       0.86      0.86      0.86      8250

383.1529564857483


In [24]:
import pickle
filename='svm_BoG.sav'
pickle.dump(s,open(filename,'wb'))

In [52]:
filename='rf_BoG.sav'
pickle.dump(rf,open(filename,'wb'))

NameError: name 'pickle' is not defined

In [51]:
model=pickle.load(open('rf_BoG.sav','rb'))
y_pred=model.predict(X_test_vector)

NameError: name 'pickle' is not defined

In [52]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      4105
           1       0.83      0.83      0.83      4145

    accuracy                           0.83      8250
   macro avg       0.83      0.83      0.83      8250
weighted avg       0.83      0.83      0.83      8250



## Decision-Tree

In [25]:
from sklearn import tree
dt=tree.DecisionTreeClassifier()
now=time.time()
dt.fit(X_train_vector, Y_train)
Y_pred=dt.predict(X_test_vector)
final=time.time()-now
print(classification_report(Y_test,Y_pred))
print(final)

              precision    recall  f1-score   support

           0       0.70      0.72      0.71      4105
           1       0.71      0.70      0.71      4145

    accuracy                           0.71      8250
   macro avg       0.71      0.71      0.71      8250
weighted avg       0.71      0.71      0.71      8250

10.016005516052246


In [25]:
import pickle
filename='dt_BoG.sav'
pickle.dump(dt,open(filename,'wb'))

## XGBoost

In [26]:
!pip install xgboost
from xgboost import XGBClassifier
xb=XGBClassifier(max_depth=5, alpha=10, n_estimators=10)
now=time.time()
xb.fit(X_train_vector, Y_train)
Y_pred=xb.predict(X_test_vector)
final=time.time()- now
print(classification_report(Y_test,Y_pred))
print(final)





              precision    recall  f1-score   support

           0       0.82      0.69      0.75      4105
           1       0.73      0.85      0.79      4145

    accuracy                           0.77      8250
   macro avg       0.78      0.77      0.77      8250
weighted avg       0.78      0.77      0.77      8250

4.4232847690582275


In [32]:
filename='xg_BoG.sav'
pickle.dump(xb,open(filename,'wb'))

## TF-IDF Vectorizer

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf=TfidfVectorizer()
X_train_vector=tf_idf.fit_transform(X_train)
X_train_vector.shape
print(X_train_vector.shape)
X_test_vector=tf_idf.transform(X_test)
print(X_test_vector.shape)

(16750, 52816)
(8250, 52816)


## Naive-Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
nb=MultinomialNB()
now=time.time()
nb.fit(X_train_vector, Y_train)
Y_pred=nb.predict(X_test_vector)
final=time.time()-now
print(classification_report(Y_test,Y_pred))
print(final)

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      4105
           1       0.88      0.86      0.87      4145

    accuracy                           0.87      8250
   macro avg       0.87      0.87      0.87      8250
weighted avg       0.87      0.87      0.87      8250

0.059972524642944336


In [59]:
filename='mnb_BoG_tfidf.sav'
pickle.dump(nb,open(filename,'wb'))

## Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
now=time.time()
rf.fit(X_train_vector, Y_train)
Y_pred=rf.predict(X_test_vector)
final=time.time()-now
print(classification_report(Y_test,Y_pred))
print(final)

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      4105
           1       0.86      0.84      0.85      4145

    accuracy                           0.85      8250
   macro avg       0.85      0.85      0.85      8250
weighted avg       0.85      0.85      0.85      8250

45.27002453804016


In [47]:
filename='rf_BoG_tfidf.sav'
pickle.dump(rf,open(filename,'wb'))

## SVM

In [71]:
from sklearn import svm
from sklearn.decomposition import SparsePCA
pca=SparsePCA(n_components=1000)
pca.fit(X_train_vector.toarray())
X_pca=pca.transform(X_train_vector)

MemoryError: Unable to allocate 6.59 GiB for an array with shape (16750, 52816) and data type float64

## Decision Tree

In [72]:
dt=tree.DecisionTreeClassifier()
now=time.time()
dt.fit(X_train_vector, Y_train)
Y_pred=dt.predict(X_test_vector)
final=time.time()-now
print(classification_report(Y_test,Y_pred))
print(final)

              precision    recall  f1-score   support

           0       0.70      0.70      0.70      4105
           1       0.70      0.70      0.70      4145

    accuracy                           0.70      8250
   macro avg       0.70      0.70      0.70      8250
weighted avg       0.70      0.70      0.70      8250

28.685551643371582


In [54]:
filename='dt_BoG_tfidf.sav'
pickle.dump(dt,open(filename,'wb'))

## XGBoost

In [16]:
from xgboost import XGBClassifier
xb=XGBClassifier(max_depth=5, alpha=10, n_estimators=10)
now=time.time()
xb.fit(X_train_vector, Y_train)
Y_pred=xb.predict(X_test_vector)
final=time.time()-now
print(classification_report(Y_test,Y_pred))
print(final)



              precision    recall  f1-score   support

           0       0.82      0.69      0.75      4105
           1       0.74      0.85      0.79      4145

    accuracy                           0.77      8250
   macro avg       0.78      0.77      0.77      8250
weighted avg       0.78      0.77      0.77      8250

5.416136026382446


In [58]:
filename='xg_BoG_tfidf.sav'
pickle.dump(xb,open(filename,'wb'))

## Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(penalty='l2')
now=time.time()
lr.fit(X_train_vector, Y_train)
Y_pred=lr.predict(X_test_vector)
final=time.time()-now
print(classification_report(Y_test,Y_pred))
print(final)

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4105
           1       0.88      0.90      0.89      4145

    accuracy                           0.89      8250
   macro avg       0.89      0.89      0.89      8250
weighted avg       0.89      0.89      0.89      8250

2.604881763458252


In [59]:
filename='lr_BoG_tfidf.sav'
pickle.dump(lr,open(filename,'wb'))

## Word2Vector

In [15]:
!pip3 install gensim

Collecting gensim
  Downloading gensim-4.1.2-cp38-cp38-win_amd64.whl (24.0 MB)
Collecting Cython==0.29.23
  Downloading Cython-0.29.23-cp38-cp38-win_amd64.whl (1.7 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
Installing collected packages: Cython, smart-open, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.21
    Uninstalling Cython-0.29.21:
      Successfully uninstalled Cython-0.29.21
Successfully installed Cython-0.29.23 gensim-4.1.2 smart-open-5.2.1


In [16]:
from gensim.models import Word2Vec

In [17]:
unlabelled_train=pd.read_csv("unlabeledTrainData.tsv", delimiter="\t", quoting=3)
unlabelled_train['clean_review']=unlabelled_train['review'].apply(clean_data)

In [18]:
sentences=[]
sentences.extend(unlabelled_train['clean_review'])
sentences.extend(data['clean_review'])

In [19]:
sentences=list(set(sentences))

In [20]:
len(sentences)

74047

In [21]:
import gensim.downloader

In [22]:
print(gensim.downloader.info()['models'].keys())

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])


#### get pre-trained model

In [23]:
glove_vec=gensim.downloader.load('glove-twitter-25')



In [27]:
def get_vectors(model,sentence):
    vec=[]
    for i in sentence.split():
        try:
            vec.append(model[i])
        except:
            continue
    return np.average(vec,axis=0)

def get_model(model,doc,num_features=25):
    counter=0
    reviewVec=np.zeros((len(doc), num_features), dtype='float64')
    for s in doc:
        reviewVec[counter]=get_vectors(model,s)
        counter+=1
    return reviewVec

In [28]:
X_train_w2v=get_model(glove_vec,X_train,num_features=25)
X_test_w2v=get_model(glove_vec,X_test,num_features=25)

## Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
rf=RandomForestClassifier()
rf.fit(X_train_w2v, Y_train)

RandomForestClassifier()

In [30]:
Y_pred=rf.predict(X_test_w2v)

In [31]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.76      0.73      0.74      4105
           1       0.74      0.77      0.75      4145

    accuracy                           0.75      8250
   macro avg       0.75      0.75      0.75      8250
weighted avg       0.75      0.75      0.75      8250



In [32]:
import pickle
filename='rf_w2v.sav'
pickle.dump(rf,open(filename,'wb'))

## SVM

In [33]:
from sklearn import svm
s=svm.SVC()
s.fit(X_train_w2v, Y_train)

SVC()

In [34]:
Y_pred=s.predict(X_test_w2v)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.75      0.76      0.76      4105
           1       0.76      0.76      0.76      4145

    accuracy                           0.76      8250
   macro avg       0.76      0.76      0.76      8250
weighted avg       0.76      0.76      0.76      8250



In [35]:
filename='svm_w2v.sav'
pickle.dump(s,open(filename,'wb'))

## Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(penalty='l2')
lr.fit(X_train_w2v, Y_train)
Y_pred=lr.predict(X_test_w2v)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.75      0.76      0.75      4105
           1       0.76      0.75      0.75      4145

    accuracy                           0.75      8250
   macro avg       0.75      0.75      0.75      8250
weighted avg       0.75      0.75      0.75      8250



In [37]:
filename='lr_w2v.sav'
pickle.dump(lr,open(filename,'wb'))

## XGBoost

In [39]:
from xgboost import XGBClassifier
xb=XGBClassifier(max_depth=5, alpha=10, n_estimators=10)
xb.fit(X_train_w2v, Y_train)
Y_pred=xb.predict(X_test_w2v)
print(classification_report(Y_test,Y_pred))



              precision    recall  f1-score   support

           0       0.74      0.72      0.73      4105
           1       0.73      0.74      0.74      4145

    accuracy                           0.73      8250
   macro avg       0.73      0.73      0.73      8250
weighted avg       0.73      0.73      0.73      8250



In [40]:
filename='xg_w2v.sav'
pickle.dump(xb,open(filename,'wb'))

## Decision-Tree

In [41]:
from sklearn import tree
dt=tree.DecisionTreeClassifier()
dt.fit(X_train_w2v, Y_train)
Y_pred=dt.predict(X_test_w2v)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.64      0.63      0.63      4105
           1       0.64      0.64      0.64      4145

    accuracy                           0.64      8250
   macro avg       0.64      0.64      0.64      8250
weighted avg       0.64      0.64      0.64      8250



In [42]:
filename='dt_w2v.sav'
pickle.dump(dt,open(filename,'wb'))

## Naive-Bayes

In [44]:
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
nb.fit(X_train_w2v, Y_train)
Y_pred=nb.predict(X_test_w2v)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.69      0.72      0.71      4105
           1       0.71      0.68      0.70      4145

    accuracy                           0.70      8250
   macro avg       0.70      0.70      0.70      8250
weighted avg       0.70      0.70      0.70      8250



In [45]:
filename='nb_w2v.sav'
pickle.dump(nb,open(filename,'wb'))