In [20]:
import nltk
import pickle
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import naive_bayes

In [3]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KALYAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
dataset = pd.read_csv(r"C:\Users\KALYAN\Desktop\Projects\Movie Recommendation System\Artifacts\reviews.txt",sep = '\t', names =['Reviews','Comments'])

In [5]:
dataset

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


In [6]:
stopset = stopwords.words('english')

In [7]:
vectorizer = TfidfVectorizer(use_idf = True,lowercase = True, strip_accents='ascii',stop_words=stopset)

In [8]:
X = vectorizer.fit_transform(dataset.Comments)
y = dataset.Reviews
pickle.dump(vectorizer, open('tranform.pkl', 'wb'))

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [10]:
models = {
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Multinomial Naive Bayes': naive_bayes.MultinomialNB()
}

In [17]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    
    print(f'Model Metrics for {type(model).__name__}:')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1-Score: {f1:.2f}')
    print('Confusion Matrix:')
    print(confusion)
    print('-' * 40,"\n")

In [18]:
for model_name, model in models.items():
    model.fit(X_train, y_train)
    evaluate_model(model, X_test, y_test)

Model Metrics for RandomForestClassifier:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1-Score: 0.99
Confusion Matrix:
[[570  10]
 [  5 799]]
---------------------------------------- 

Model Metrics for LogisticRegression:
Accuracy: 0.99
Precision: 0.98
Recall: 0.99
F1-Score: 0.99
Confusion Matrix:
[[566  14]
 [  6 798]]
---------------------------------------- 

Model Metrics for SVC:
Accuracy: 0.99
Precision: 0.99
Recall: 1.00
F1-Score: 0.99
Confusion Matrix:
[[570  10]
 [  4 800]]
---------------------------------------- 

Model Metrics for MultinomialNB:
Accuracy: 0.97
Precision: 0.97
Recall: 0.99
F1-Score: 0.98
Confusion Matrix:
[[555  25]
 [ 10 794]]
---------------------------------------- 



In [13]:
# Find the best model based on the highest accuracy
best_model_name = max(models, key=lambda k: accuracy_score(y_test, models[k].predict(X_test)))
best_accuracy = accuracy_score(y_test, models[best_model_name].predict(X_test))
print(f'Best Model: {best_model_name} with Accuracy: {best_accuracy:.2f}')

Best Model: Support Vector Machine with Accuracy: 0.99


In [19]:
best_model = models[best_model_name] 
model_file_path = 'best_model.pkl'
with open(model_file_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)