In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [11]:
import time

# Load Data and Preprocessing

In [3]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize
    stop_words = set(stopwords.words('english'))  # Load stopwords
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()  # Initialize lemmatizer
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
    return ' '.join(tokens)

In [7]:
train_data['cleaned_text'] = train_data['review'].apply(preprocess_text)
test_data['cleaned_text'] = test_data['review'].apply(preprocess_text)

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['cleaned_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['cleaned_text']).toarray()

In [9]:
count_vectorizer = CountVectorizer(max_features=5000)
X_train_count = count_vectorizer.fit_transform(train_data['cleaned_text']).toarray()
X_test_count = count_vectorizer.transform(test_data['cleaned_text']).toarray()

In [10]:
y_train = train_data['sentiment']  # Replace 'sentiment' with the label column name
y_test = test_data['sentiment']

# Training Models

## Naive Bayes

In [14]:
nb_model = MultinomialNB()
print("time start:", time.time())
nb_model.fit(X_train_tfidf, y_train)
print("time end:", time.time())

time start: 1736254619.359294
time end: 1736254619.8774083


In [15]:
y_pred_nb = nb_model.predict(X_test_tfidf)

In [16]:
print("Naive Bayes Model Evaluation:")
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy: {accuracy_nb:.2f}")
print("Classification Report:")
classificaiton_report_nb = classification_report(y_test, y_pred_nb)
print(classificaiton_report_nb)
confusion_matrix_nb = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix:")
print(confusion_matrix_nb)

Naive Bayes Model Evaluation:
Accuracy: 0.86
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.85      0.86      9935
    positive       0.85      0.87      0.86     10065

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000

Confusion Matrix:
[[8418 1517]
 [1296 8769]]


## Random Forest

In [18]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
print("time start:", time.time())
rf_model.fit(X_train_tfidf, y_train)
print("time end:", time.time())

time start: 1736254654.9126863
time end: 1736254741.3143978


In [19]:
y_pred_rf = rf_model.predict(X_test_tfidf)

In [20]:
print("\nRandom Forest Model Evaluation:")
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {accuracy_rf:.2f}")
print("Classification Report:")
classification_report_rf = classification_report(y_test, y_pred_rf)
print(classification_report_rf)
confusion_matrix_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:")
print(confusion_matrix_rf)


Random Forest Model Evaluation:
Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.86      0.85      9935
    positive       0.86      0.84      0.85     10065

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000

Confusion Matrix:
[[8507 1428]
 [1632 8433]]


## k-Nearest Neighbors

In [21]:
knn_model = KNeighborsClassifier(n_neighbors=5)
print("time start:", time.time())
knn_model.fit(X_train_tfidf, y_train)
print("time end:", time.time())

In [22]:
y_pred_knn = knn_model.predict(X_test_tfidf)

In [23]:
print("\nk-Nearest Neighbors (k-NN) Model Evaluation:")
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy: {accuracy_knn:.2f}")
print("Classification Report:")
classification_report_knn = classification_report(y_test, y_pred_knn)
print(classification_report_knn)
confusion_matrix_knn = confusion_matrix(y_test, y_pred_knn)
print("Confusion Matrix:")
print(confusion_matrix_knn)


k-Nearest Neighbors (k-NN) Model Evaluation:
Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

    negative       0.77      0.67      0.71      9935
    positive       0.71      0.80      0.75     10065

    accuracy                           0.73     20000
   macro avg       0.74      0.73      0.73     20000
weighted avg       0.74      0.73      0.73     20000

Confusion Matrix:
[[6608 3327]
 [1988 8077]]


## Gradient Boosting

In [24]:
gb_model = GradientBoostingClassifier(random_state=42)
print("time start:", time.time())
gb_model.fit(X_train_tfidf, y_train)
print("time end:", time.time())

time start: 1736254876.1122234
time end: 1736255530.3383915


In [25]:
y_pred_gb = gb_model.predict(X_test_tfidf)

In [26]:
print("\nGradient Boosting Model Evaluation:")
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Accuracy: {accuracy_gb:.2f}")
print("Classification Report:")
classification_report_gb = classification_report(y_test, y_pred_gb)
print(classification_report_gb)
confusion_matrix_gb = confusion_matrix(y_test, y_pred_gb)
print("Confusion Matrix:")
print(confusion_matrix_gb)


Gradient Boosting Model Evaluation:
Accuracy: 0.81
Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.76      0.80      9935
    positive       0.78      0.86      0.82     10065

    accuracy                           0.81     20000
   macro avg       0.81      0.81      0.81     20000
weighted avg       0.81      0.81      0.81     20000

Confusion Matrix:
[[7520 2415]
 [1426 8639]]


# Evaluation report

In [27]:
results = []

In [28]:
def extract_metrics(model_name, accuracy, classification_report):
    """
    Extracts precision, recall, and F1-score from the classification report
    and appends the metrics along with accuracy to the results list.
    """
    # Parse the classification report
    report_lines = classification_report.split("\n")
    # Retrieve the weighted avg line (second-to-last non-empty line in classification_report output)
    weighted_avg_line = report_lines[-2].split()  # Weighted avg metrics are in the last table row
    precision = float(weighted_avg_line[2])
    recall = float(weighted_avg_line[3])
    f1_score = float(weighted_avg_line[4])

    # Append to the results list
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1_score
    })

In [31]:
extract_metrics("Naive Bayes", accuracy_nb, classificaiton_report_nb)
extract_metrics("Random Forest", accuracy_rf, classification_report_rf)
extract_metrics("k-NN", accuracy_knn, classification_report_knn)
extract_metrics("Gradient Boosting", accuracy_gb, classification_report_gb)

In [32]:
results_df = pd.DataFrame(results)

In [33]:
results_df.to_csv("model_evaluation_metrics.csv", index=False)

In [34]:
print("\nModel Evaluation Metrics:")
print(results_df)


Model Evaluation Metrics:
               Model  Accuracy  Precision  Recall  F1-Score
0        Naive Bayes   0.85935       0.86    0.86      0.86
1      Random Forest   0.84700       0.85    0.85      0.85
2               k-NN   0.73425       0.74    0.73      0.73
3  Gradient Boosting   0.80795       0.81    0.81      0.81
