In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import gc
from sklearn.feature_extraction.text import HashingVectorizer
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
# Location of the IMDB reviews CSV, held in a named constant so the path is
# easy to find and change in one place.
DATA_PATH = "C:/Users/kkmax/Desktop/end to end ML Deployment/Sentiment Analysis of movie reviews/notebook/data/IMDB Dataset.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Work on the first N_ROWS rows only.
N_ROWS = 5000
# Take an explicit copy: a later cell assigns to the 'sentiment' column, and
# that write should land on an independent frame rather than on a row slice
# of the full dataset that was just loaded.
df = df.iloc[:N_ROWS, :].copy()

In [4]:
# Sanity check: (rows, columns) of the frame after truncation.
df.shape

(5000, 2)

In [21]:
# Encode the text labels as integers: positive -> 1, negative -> 0.
# `replace` leaves already-encoded values untouched, so re-running this cell
# is harmless. The explicit cast pins the integer dtype instead of relying on
# pandas implicitly downcasting the object column (deprecated in pandas 2.2),
# and raises if any label other than the two mapped ones is present.
df['sentiment'] = (
    df['sentiment']
    .replace({'positive': 1, 'negative': 0})
    .astype('int64')
)

In [22]:
# Feature column (raw review text) and target column (encoded sentiment).
X, y = df['review'], df['sentiment']

In [23]:
# Preview the feature Series (review text).
X

0       One of the other reviewers has mentioned that ...
1       A wonderful little production. <br /><br />The...
2       I thought this was a wonderful way to spend ti...
3       Basically there's a family where a little boy ...
4       Petter Mattei's "Love in the Time of Money" is...
                              ...                        
4995    An interesting slasher film with multiple susp...
4996    i watched this series when it first came out i...
4997    Once again Jet Li brings his charismatic prese...
4998    I rented this movie, after hearing Chris Gore ...
4999    This was a big disappointment for me. I think ...
Name: review, Length: 5000, dtype: object

In [24]:
# Preview the target Series (encoded sentiment labels).
y

0       1
1       1
2       1
3       0
4       1
       ..
4995    0
4996    1
4997    1
4998    0
4999    0
Name: sentiment, Length: 5000, dtype: int64

In [25]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Turn the raw review text into hashed feature matrices. Only `transform` is
# called (no `fit`), on both splits with the same vectorizer instance.
# NOTE(review): X_train / X_test are rebound from text to matrices here, so
# this cell should only be run once after the split cell.
vectorizer = HashingVectorizer()
X_train, X_test = (
    vectorizer.transform(text_split) for text_split in (X_train, X_test)
)

In [28]:
from sklearn.metrics import f1_score
def evaluate_model(true, predicted):
    """Return the F1 score of `predicted` against the reference labels `true`.

    Both arguments are passed straight to `sklearn.metrics.f1_score` with its
    default settings.
    """
    return f1_score(true, predicted)

In [37]:
# Seed for the estimators that draw random numbers during fitting, so the
# reported scores are repeatable across runs (the split above is seeded too).
RANDOM_STATE = 42

# Candidate classifiers, keyed by the name shown in the printed report.
models = {
    "XGBClassifier": XGBClassifier(),
    "RandomForestClassifier": RandomForestClassifier(random_state=RANDOM_STATE),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "SVC": SVC()
}
# Parallel lists consumed by the summary table: model name and its test F1.
model_list = []
f1_score_list = []

# Iterate over name/estimator pairs directly instead of rebuilding the key and
# value lists on every pass and indexing into them.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_f1_score = evaluate_model(y_train, y_train_pred)
    model_test_f1_score = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- F1 Score: {:.4f}".format(model_train_f1_score))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- F1 Score: {:.4f}".format(model_test_f1_score))
    # Only the test-set score goes into the comparison table.
    f1_score_list.append(model_test_f1_score)
    print('='*35)
    print('\n')

XGBClassifier
Model performance for Training set
- F1 Score: 0.9992
----------------------------------
Model performance for Test set
- F1 Score: 0.8116


RandomForestClassifier
Model performance for Training set
- F1 Score: 1.0000
----------------------------------
Model performance for Test set
- F1 Score: 0.7913






AdaBoostClassifier
Model performance for Training set
- F1 Score: 0.8280
----------------------------------
Model performance for Test set
- F1 Score: 0.7844


Decision Tree
Model performance for Training set
- F1 Score: 1.0000
----------------------------------
Model performance for Test set
- F1 Score: 0.6695


SVC
Model performance for Training set
- F1 Score: 0.9418
----------------------------------
Model performance for Test set
- F1 Score: 0.8361




In [38]:
# Pair each model name with its test F1 score and rank best-first.
results = pd.DataFrame(
    list(zip(model_list, f1_score_list)),
    columns=['Model Name', 'F1 Score'],
)
results.sort_values(by=["F1 Score"], ascending=False)

Unnamed: 0,Model Name,F1 Score
4,SVC,0.836134
0,XGBClassifier,0.811594
1,RandomForestClassifier,0.791322
2,AdaBoostClassifier,0.784437
3,Decision Tree,0.669537
