In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score,log_loss,precision_score,recall_score,make_scorer,accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the IMDB review CSV from the working directory. Later cells read its
# 'review' and 'sentiment' columns.
data = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")
# Bare last expression so the notebook renders the frame preview.
data

In [None]:
# Encode the 'sentiment' strings as integer labels.
lb = LabelBinarizer()
y = lb.fit_transform(data['sentiment'])
# For a two-class target LabelBinarizer yields a single (n_samples, 1) column.
# Guard that assumption before flattening: with more classes the result would be
# a one-hot matrix and ravel() would silently produce a wrong-length vector.
assert y.ndim == 2 and y.shape[1] == 1, "expected a two-class 'sentiment' column"
# scikit-learn classifiers expect a 1-D label vector; flatten explicitly instead
# of relying on their implicit column-vector conversion (and its warning).
y = y.ravel()
# Show which original label maps to 0 and which to 1.
lb.classes_

In [None]:
# Use TfidfVectorizer (default settings) to turn the raw text in the 'review' column
# into a sparse matrix of TF-IDF features. The resulting matrix 'x' has shape
# (n_samples, n_features) and is the input to every classifier below.
# NOTE(review): the vectorizer is fitted on all rows before cross-validation, so the
# vocabulary/IDF weights also see the text of each held-out fold — confirm this is acceptable.

vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(data['review'])
print(x.shape,y.shape)

In [None]:
def _log_loss_from_proba(estimator, X, y_true):
    """Return the log loss of ``estimator`` on ``(X, y_true)`` using class probabilities.

    ``make_scorer(log_loss)`` passes ``log_loss`` the hard labels produced by
    ``estimator.predict``, which turns the metric into a rescaled error rate.
    This callable follows the ``scorer(estimator, X, y)`` signature accepted by
    ``cross_validate`` and feeds ``predict_proba`` output to ``log_loss`` instead.

    The raw, positive (lower-is-better) loss is returned so the
    ``train_log_loss`` / ``test_log_loss`` entries keep their meaning as a loss.
    The estimator must implement ``predict_proba`` and expose ``classes_``.
    """
    # ravel() accepts both a 1-D label vector and an (n_samples, 1) column.
    # Passing labels keeps the probability columns aligned with the classes.
    return log_loss(np.ravel(y_true), estimator.predict_proba(X), labels=estimator.classes_)


# Metrics evaluated on every cross-validation fold. The first four are computed
# from hard predictions; log loss is computed from predicted probabilities.
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1': make_scorer(f1_score),
           'log_loss': _log_loss_from_proba}

In [None]:
# Logistic Regression: 5-fold cross-validation on (x, y) with the metrics in `scoring`,
# keeping both train and test scores. The mean of each entry is printed, then the raw
# result dictionary is displayed.
# NOTE(review): max_iter=5 stops the solver after five iterations and the global
# warnings filter hides any convergence warning — confirm this cap is intentional.

lr = LogisticRegression(random_state=0, max_iter=5)
lr_scores = cross_validate(
    lr, x, y,
    cv=5,
    scoring=scoring,
    return_train_score=True,
    verbose=3,
    n_jobs=-1,
)

# (printed label, key in the cross_validate result) in report order.
summary_rows = [
    ("The Fit Time is: ", 'fit_time'),
    ("The Score Time is: ", 'score_time'),
    ("The Train Accuracy score is: ", 'train_accuracy'),
    ("The Train F1 score is: ", 'train_f1'),
    ("The Train Precision score is: ", 'train_precision'),
    ("The Train Recall score is: ", 'train_recall'),
    ("The Train Log Loss is: ", 'train_log_loss'),
    ("The Test Accuracy score is: ", 'test_accuracy'),
    ("The Test F1 score is: ", 'test_f1'),
    ("The Test Precision score is: ", 'test_precision'),
    ("The Test Recall score is: ", 'test_recall'),
    ("The Test Log Loss is: ", 'test_log_loss'),
]
for row_label, result_key in summary_rows:
    print(row_label, lr_scores[result_key].mean())
    print("\n")
lr_scores

In [None]:

# Decision Tree (max_depth=25): 5-fold cross-validation on (x, y) with the metrics in
# `scoring`, keeping both train and test scores. The mean of each entry is printed and
# the raw results stay available in `dt_scores`.

dt = DecisionTreeClassifier(random_state=0, max_depth=25)
dt_scores = cross_validate(
    dt, x, y,
    cv=5,
    scoring=scoring,
    return_train_score=True,
    verbose=3,
    n_jobs=-1,
)

# (printed label, key in the cross_validate result) in report order.
summary_rows = [
    ("The Fit Time is: ", 'fit_time'),
    ("The Score Time is: ", 'score_time'),
    ("The Train Accuracy score is: ", 'train_accuracy'),
    ("The Train F1 score is: ", 'train_f1'),
    ("The Train Precision score is: ", 'train_precision'),
    ("The Train Recall score is: ", 'train_recall'),
    ("The Train Log Loss is: ", 'train_log_loss'),
    ("The Test Accuracy score is: ", 'test_accuracy'),
    ("The Test F1 score is: ", 'test_f1'),
    ("The Test Precision score is: ", 'test_precision'),
    ("The Test Recall score is: ", 'test_recall'),
    ("The Test Log Loss is: ", 'test_log_loss'),
]
for row_label, result_key in summary_rows:
    print(row_label, dt_scores[result_key].mean())
    print("\n")
dt_scores

In [None]:
# Random Forest (max_depth=25): 5-fold cross-validation on (x, y) with the metrics in
# `scoring`, keeping both train and test scores. The mean fit/score times and the mean
# of every metric are printed, then the raw result dictionary is displayed.

rf = RandomForestClassifier(random_state=0, max_depth=25)
rf_scores = cross_validate(
    rf, x, y,
    cv=5,
    scoring=scoring,
    return_train_score=True,
    verbose=3,
    n_jobs=-1,
)

# (printed label, key in the cross_validate result) in report order.
summary_rows = [
    ("The Fit Time is: ", 'fit_time'),
    ("The Score Time is: ", 'score_time'),
    ("The Train Accuracy score is: ", 'train_accuracy'),
    ("The Train F1 score is: ", 'train_f1'),
    ("The Train Precision score is: ", 'train_precision'),
    ("The Train Recall score is: ", 'train_recall'),
    ("The Train Log Loss is: ", 'train_log_loss'),
    ("The Test Accuracy score is: ", 'test_accuracy'),
    ("The Test F1 score is: ", 'test_f1'),
    ("The Test Precision score is: ", 'test_precision'),
    ("The Test Recall score is: ", 'test_recall'),
    ("The Test Log Loss is: ", 'test_log_loss'),
]
for row_label, result_key in summary_rows:
    print(row_label, rf_scores[result_key].mean())
    print("\n")
rf_scores

In [None]:
# Neural network: MLPClassifier with hidden_layer_sizes=(150, 15), evaluated with 5-fold
# cross-validation on (x, y) using the metrics in `scoring`, keeping both train and test
# scores. The mean of each entry is printed, then the raw result dictionary is displayed.
# NOTE(review): max_iter=5 limits training to five iterations and the global warnings
# filter hides any convergence warning — confirm this cap is intentional.

mlp = MLPClassifier(hidden_layer_sizes=(150, 15), random_state=0, max_iter=5)
mlp_scores = cross_validate(
    mlp, x, y,
    cv=5,
    scoring=scoring,
    return_train_score=True,
    verbose=3,
    n_jobs=-1,
)

# (printed label, key in the cross_validate result) in report order.
summary_rows = [
    ("The Fit Time is: ", 'fit_time'),
    ("The Score Time is: ", 'score_time'),
    ("The Train Accuracy score is: ", 'train_accuracy'),
    ("The Train F1 score is: ", 'train_f1'),
    ("The Train Precision score is: ", 'train_precision'),
    ("The Train Recall score is: ", 'train_recall'),
    ("The Train Log Loss is: ", 'train_log_loss'),
    ("The Test Accuracy score is: ", 'test_accuracy'),
    ("The Test F1 score is: ", 'test_f1'),
    ("The Test Precision score is: ", 'test_precision'),
    ("The Test Recall score is: ", 'test_recall'),
    ("The Test Log Loss is: ", 'test_log_loss'),
]
for row_label, result_key in summary_rows:
    print(row_label, mlp_scores[result_key].mean())
    print("\n")
mlp_scores

In [None]:
# Compare the four classifiers on mean cross-validated accuracy, precision, recall and
# F1 for both the train and test splits, drawn as one line per metric/split.
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Neural Network']
cv_results = [lr_scores, dt_scores, rf_scores, mlp_scores]  # same order as `models`

train_acc = [res['train_accuracy'].mean() for res in cv_results]
test_acc = [res['test_accuracy'].mean() for res in cv_results]
train_prec = [res['train_precision'].mean() for res in cv_results]
test_prec = [res['test_precision'].mean() for res in cv_results]
train_recall = [res['train_recall'].mean() for res in cv_results]
test_recall = [res['test_recall'].mean() for res in cv_results]
train_f1 = [res['train_f1'].mean() for res in cv_results]
test_f1 = [res['test_f1'].mean() for res in cv_results]

fig, axs = plt.subplots(figsize=(10, 6))

# (values, legend label) pairs, plotted in legend order.
curves = [
    (train_acc, 'Train Acc'),
    (test_acc, 'Test Acc'),
    (train_prec, 'Train Prec'),
    (test_prec, 'Test Prec'),
    (train_recall, 'Train Recall'),
    (test_recall, 'Test Recall'),
    (train_f1, 'Train F1'),
    (test_f1, 'Test F1'),
]
for curve_values, legend_name in curves:
    axs.plot(models, curve_values, label=legend_name)

axs.set_xlabel('Model')
axs.set_ylabel('Score')
axs.legend()
axs.set_title('Performance Metrics for Four Models for TF-IDF Features')

plt.show()

In [None]:
# Plot the mean cross-validated train and test log loss of the four classifiers.
# `models` (the x-axis labels) comes from the metrics-plot cell.
loss_results = [lr_scores, dt_scores, rf_scores, mlp_scores]  # same order as `models`
train_loss = [res['train_log_loss'].mean() for res in loss_results]
test_loss = [res['test_log_loss'].mean() for res in loss_results]

fig, axs = plt.subplots(figsize=(10, 6))

for loss_values, legend_name in ((train_loss, 'Train Loss'), (test_loss, 'Test Loss')):
    axs.plot(models, loss_values, label=legend_name)

axs.set_xlabel('Model')
axs.set_ylabel('Log Loss')
axs.legend()
axs.set_title('Train and Test Log Loss for Four Models for TF-IDF Features')

plt.show()
