In [1]:
!pip install gensim



In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
import gensim.downloader as api

# Load the pretrained vectors registered as "glove-wiki-gigaword-50" through
# gensim's downloader API. `glove_model` is used below for vocabulary membership
# tests (`in`), vector lookups (`[...]`) and its `vector_size` attribute.
glove_model = api.load("glove-wiki-gigaword-50")

In [None]:
# Download the reviews dataset archive from Kaggle into the working directory.
# NOTE(review): presumably needs the kaggle CLI installed and API credentials
# configured before this cell runs — confirm for the target environment.
!kaggle datasets download -d ahmedabdulhamid/reviews-dataset

In [None]:
!unzip 'reviews-dataset.zip'

In [None]:
# Read the raw training reviews; every line of each file becomes one list entry
# (trailing newlines are still attached at this point).
with open('TrainingDataNegative.txt', mode='r', encoding='utf-8') as negative_file:
    Negetive = list(negative_file)
with open('TrainingDataPositive.txt', mode='r', encoding='utf-8') as positive_file:
    Positive = list(positive_file)

In [None]:
# Wrap the whitespace-stripped lines in single-column frames (column: 'review').
Negetive_df = pd.DataFrame({'review': list(map(str.strip, Negetive))})
Positive_df = pd.DataFrame({'review': list(map(str.strip, Positive))})

In [None]:
# Load the held-out test set. Later cells read its 'review' column (text to
# tokenize) and its 'class' column (1 is treated as the positive class).
test_data = pd.read_csv('TestReviews.csv')

In [None]:
# Preview the first rows of the test set as loaded from the CSV.
test_data.head()

In [None]:
# Attach the class label to each training frame. The label is spelled
# 'negative' so that it matches the label assigned to the test set; the previous
# misspelling only worked because the target encoding checks for 'positive'.
Negetive_df['sentiment'] = "negative"
Positive_df['sentiment'] = "positive"

In [None]:
# Stack the negative and positive reviews row-wise into one training frame,
# discarding the original indices in favour of a fresh running index.
train_data = pd.concat(objs=[Negetive_df, Positive_df], axis=0, ignore_index=True)

In [None]:
# word_tokenize relies on NLTK's Punkt tokenizer data, which is not shipped with
# the nltk package itself; fetch it first so this cell works on a fresh kernel.
# Newer NLTK releases look the data up as 'punkt_tab', older ones as 'punkt'.
# By default nltk.download reports a failed request through its return value
# rather than raising, so asking for both names is harmless.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Split every review text into a list of word tokens.
train_data['Tokens'] = train_data['review'].apply(word_tokenize)
test_data['Tokens'] = test_data['review'].apply(word_tokenize)

In [None]:
# Derive the string label from the 'class' column:
# a value equal to 1 becomes 'positive', anything else becomes 'negative'.
test_data['sentiment'] = test_data['class'].map(
    lambda label: 'positive' if label == 1 else 'negative'
)

In [None]:
# Preview the training frame, which now carries 'review', 'sentiment' and 'Tokens'.
train_data.head()

In [None]:
# Preview the test frame after the 'Tokens' and 'sentiment' columns were added.
test_data.head()

In [None]:
def get_avg_embedding(tokens, model=None):
    """Return the mean word vector of `tokens`.

    Each token is looked up in `model` as written; if that spelling is not in
    the vocabulary, its lowercase form is tried. The fallback matters for
    uncased vocabularies such as the GloVe wiki-gigaword vectors, where a
    capitalised token like "Good" would otherwise be dropped silently.

    Parameters
    ----------
    tokens : iterable of str
        The tokens of one document.
    model : optional
        Object supporting ``token in model``, ``model[token]`` and
        ``model.vector_size``. Defaults to the notebook-level ``glove_model``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found, or a zero vector of
        length ``model.vector_size`` when no token is in the vocabulary
        (including when `tokens` is empty).
    """
    if model is None:
        # Resolved at call time so the function can be defined before the model is loaded.
        model = glove_model

    vectors = []
    for token in tokens:
        if token in model:
            vectors.append(model[token])
            continue
        lowered = token.lower()
        if lowered in model:
            vectors.append(model[lowered])

    if not vectors:
        # Nothing matched: fall back to a neutral all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)


In [None]:
# Turn each token list into one averaged embedding vector, for both splits.
for frame in (train_data, test_data):
    frame['embeddings'] = frame['Tokens'].apply(get_avg_embedding)

In [None]:
# Preview the test frame after the 'embeddings' column was added.
test_data.head()

In [None]:
# Preview the training frame after the 'embeddings' column was added.
train_data.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
# Features: stack the per-review embedding vectors row by row.
# Targets: 1 where the sentiment label is 'positive', 0 for every other label.
X_train = np.vstack(train_data['embeddings'].tolist())
y_train = train_data['sentiment'].map(lambda label: 1 if label == 'positive' else 0).to_numpy()

X_test = np.vstack(test_data['embeddings'].tolist())
y_test = test_data['sentiment'].map(lambda label: 1 if label == 'positive' else 0).to_numpy()

In [None]:
# Fixed seed: a random forest draws bootstrap samples and feature subsets at
# random, so without it the fitted model (and every metric reported for it)
# changes from run to run.
RANDOM_STATE = 42

# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)

In [None]:
# Class predictions on the test features, logistic regression first, then random forest.
y_pred_lr, y_pred_rf = (model.predict(X_test) for model in (lr_model, rf_model))

In [None]:
# Report accuracy, precision and recall on the test set, one model after the other.
for heading, predictions in (
    ('Logistic Regression:', y_pred_lr),
    ('Random Forest:', y_pred_rf),
):
    print(heading)
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Precision:', precision_score(y_test, predictions))
    print('Recall:', recall_score(y_test, predictions))

In [None]:
# ROC curves for both classifiers.
# Each model is scored by the second column of predict_proba on the test features.
lr_scores = lr_model.predict_proba(X_test)[:, 1]
rf_scores = rf_model.predict_proba(X_test)[:, 1]
fpr_lr, tpr_lr, _ = roc_curve(y_test, lr_scores)
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_scores)

plt.figure()
for fpr, tpr, legend_template in (
    (fpr_lr, tpr_lr, 'Logistic Regression (area = %0.2f)'),
    (fpr_rf, tpr_rf, 'Random Forest (area = %0.2f)'),
):
    # The legend entry carries the area under this model's curve.
    plt.plot(fpr, tpr, label=legend_template % auc(fpr, tpr))
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()