In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
import pickle

In [2]:
# Load the dataset
file_path = 'Balanced_Reviews(500).csv'
data = pd.read_csv(file_path)

# Convert rating column to numeric
data['rating'] = pd.to_numeric(data['rating'], errors='coerce')
data = data.dropna(subset=['rating'])
data['rating'] = data['rating'].astype(int)

# Create sentiment column
data['sentiment'] = data['rating'].apply(lambda x: 'positive' if x >= 4 else ('neutral' if x == 3 else 'negative'))

In [3]:
# Drop NaN values in content
data = data.dropna(subset=['content'])

# Separate majority and minority classes
positive = data[data.sentiment == 'positive']
neutral = data[data.sentiment == 'neutral']
negative = data[data.sentiment == 'negative']

In [4]:
# Upsample minority classes
neutral_upsampled = resample(neutral,
                             replace=True,     # sample with replacement
                             n_samples=len(positive),    # to match majority class
                             random_state=42) # reproducible results
negative_upsampled = resample(negative,
                              replace=True,     # sample with replacement
                              n_samples=len(positive),    # to match majority class
                              random_state=42) # reproducible results

# Combine majority class with upsampled minority classes
upsampled = pd.concat([positive, neutral_upsampled, negative_upsampled])

# Text data for training
X = upsampled['content']
y = upsampled['sentiment']

In [5]:
# Split the data into training, validation, and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

model_xgb = XGBClassifier()
model_xgb.fit(X_train_tfidf, y_train_encoded)

y_pred_train = model_xgb.predict(X_train_tfidf)
y_pred_test = model_xgb.predict(X_test_tfidf)

print("Training Accuracy:", accuracy_score(y_train_encoded, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test_encoded, y_pred_test))



Training Accuracy: 0.9973958333333334
Testing Accuracy: 0.8916666666666667


In [6]:
print (classification_report(y_test_encoded,y_pred_test))

              precision    recall  f1-score   support

           0       0.95      0.90      0.93        41
           1       0.82      0.89      0.85        36
           2       0.90      0.88      0.89        43

    accuracy                           0.89       120
   macro avg       0.89      0.89      0.89       120
weighted avg       0.89      0.89      0.89       120



In [7]:
pickle.dump(model_xgb, open('Models/XGBoost.pkl', 'wb'))