# **Install Required Libraries**

In [None]:
%pip install pandas scikit-learn joblib tqdm nltk xgboost matplotlib

# **Import Libraries**

In [None]:
import os
import re
import string
import joblib
import xgboost as xgb
import pandas as pd
import numpy as np
import nltk
import urllib.request
import zipfile
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, log_loss
from xgboost import plot_importance

# **Download and Extract NLTK WordNet**

In [None]:
# Directory used by this notebook to store NLTK corpora.
nltk_data_dir = '/root/nltk_data'
os.makedirs(nltk_data_dir, exist_ok=True)

# Register the directory only once so re-running the cell does not keep
# growing NLTK's search path with duplicate entries.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Manually download wordnet.zip
wordnet_url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip"
wordnet_zip_path = os.path.join(nltk_data_dir, 'wordnet.zip')
corpora_dir = os.path.join(nltk_data_dir, 'corpora')

if not os.path.exists(wordnet_zip_path):
    # Download under a temporary name and rename on success: an interrupted
    # download must not leave a truncated wordnet.zip that later runs would
    # treat as complete.
    partial_zip_path = wordnet_zip_path + '.part'
    urllib.request.urlretrieve(wordnet_url, partial_zip_path)
    os.replace(partial_zip_path, wordnet_zip_path)

# Unpack only when the corpus folder is missing instead of on every run.
# NOTE(review): assumes the archive unpacks into a top-level "wordnet"
# folder; if it does not, this simply re-extracts each run as before.
if not os.path.isdir(os.path.join(corpora_dir, 'wordnet')):
    with zipfile.ZipFile(wordnet_zip_path, 'r') as zip_ref:
        zip_ref.extractall(corpora_dir)

# Download additional NLTK data
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('omw-1.4', download_dir=nltk_data_dir)

# Quick test
lemmatizer = WordNetLemmatizer()
print("WordNet test (lemmatize 'running'):", lemmatizer.lemmatize("running"))

# **Define Dataset Paths and Hyperparameters**

In [None]:
# Input dataset (JSON lines). Override with the REVIEW_SCORE_DATA_PATH
# environment variable to run on another machine; the default is the
# original author's local path.
data_path = os.environ.get(
    "REVIEW_SCORE_DATA_PATH",
    "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Review_Score/Dataset/correct_reviews_balanced.json",
)

# Output directory for the fitted vectorizer and model artifacts. Override
# with the REVIEW_SCORE_OUTPUT_DIR environment variable.
Base = os.environ.get(
    "REVIEW_SCORE_OUTPUT_DIR",
    "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Testing/Review_ScoreTest/XGBoostPrediction",
)
vectorizer_path = f"{Base}/tfidf_vectorizer.pkl"
best_model_path = f"{Base}/best_xgboost_model.pkl"


# Hyperparameters
test_size = 0.2              # fraction of rows held out as the test set
early_stopping_rounds = 10   # rounds without improvement before stopping
num_boost_round = 500        # upper bound on boosting rounds

# **Detect GPU or CPU for XGBoost**

In [None]:
def get_device():
    """Return the XGBoost device string to train on: "cuda" or "cpu".

    The previous check used ``xgb.rabit``, which is the distributed-training
    communicator: its world size/rank say nothing about GPUs, and the module
    is no longer present in the XGBoost releases that accept the ``device``
    training parameter, so the call raised ``AttributeError`` there.

    "cuda" is returned only when both conditions hold:
      * the installed XGBoost reports it was built with CUDA, and
      * ``nvidia-smi -L`` runs successfully and lists at least one GPU.
    Any doubt resolves to "cpu", which always works.
    """
    import shutil
    import subprocess

    # build_info() is absent on old releases and its format is documented as
    # not stable, so read it defensively.
    build_info = getattr(xgb, "build_info", None)
    if build_info is None or not build_info().get("USE_CUDA", False):
        return "cpu"

    nvidia_smi = shutil.which("nvidia-smi")
    if nvidia_smi is None:
        return "cpu"

    try:
        listing = subprocess.run(
            [nvidia_smi, "-L"],
            capture_output=True,
            text=True,
            timeout=10,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        # Driver tool present but unusable (or timed out): fall back to CPU.
        return "cpu"

    if listing.returncode == 0 and "GPU" in listing.stdout:
        return "cuda"
    return "cpu"

DEVICE = get_device()
print(f"XGBoost device set to: {DEVICE}")

# **Initialize Text Preprocessing**

In [None]:
# Register the corpus directory again so this cell can locate the stopword
# and WordNet data when it is run on its own.
nltk.data.path.append(nltk_data_dir)

# Shared lemmatizer and English stopword set used by preprocess_text below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Built once instead of on every call: preprocess_text runs once per review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN = re.compile(r'\d+')
# Words kept even if they appear in the stopword list.
_KEPT_WORDS = frozenset(["not", "bad", "never", "no"])

def preprocess_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, remove digit runs, strip ASCII punctuation,
    split on whitespace, drop stopwords (except the kept words above), and
    lemmatize each remaining word.

    Args:
        text: The raw review. ``None`` yields an empty string; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned text; an empty string when no tokens survive.
    """
    if text is None:
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = _DIGIT_RUN.sub('', text.lower())
    text = text.translate(_PUNCTUATION_TABLE)
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words or word in _KEPT_WORDS
    )

# **Load Entire Dataset**

In [None]:
print(f"Loading Data from: {data_path}")
# Build the cleaned frame in one chain so the cell is idempotent on re-run.
df = (
    pd.read_json(data_path, lines=True)[['text', 'stars']]
    # Rows with a missing rating must go too: astype(int) below raises on NaN.
    .dropna(subset=['text', 'stars'])
    .assign(
        text=lambda frame: frame['text'].apply(preprocess_text),
        stars=lambda frame: frame['stars'].astype(int) - 1,  # [1..5] → [0..4]
    )
)

# The model is trained with num_class=5, so every label must lie in 0..4.
# Fail here with a clear message rather than deep inside XGBoost.
invalid_labels = ~df['stars'].between(0, 4)
if invalid_labels.any():
    raise ValueError(
        f"{int(invalid_labels.sum())} rows have a star rating outside 1..5; "
        "clean the dataset before training."
    )

print(f"Dataset loaded with {len(df)} entries.")

# **Train-Test Split**

In [None]:
# Hold out `test_size` of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
review_texts = df['text']
star_labels = df['stars']

split_parts = train_test_split(
    review_texts,
    star_labels,
    random_state=42,
    test_size=test_size,
)
X_train, X_test, y_train, y_test = split_parts

# **TF-IDF Vectorizer**

In [None]:
# Reuse a previously saved vectorizer when one exists, otherwise fit a new
# one on the training texts and persist it.
if os.path.exists(vectorizer_path):
    print("Loading existing TF-IDF vectorizer...")
    # NOTE(review): the cached vectorizer is used as-is, even if it was fitted
    # on a different dataset or split; delete the file to force a refit.
    # joblib.load unpickles the file, so only load files you trust.
    vectorizer = joblib.load(vectorizer_path)
    X_train_tfidf = vectorizer.transform(X_train).toarray()
else:
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
    X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
    # Create the output folder first: otherwise joblib.dump raises
    # FileNotFoundError after the expensive fit has already completed.
    vectorizer_dir = os.path.dirname(vectorizer_path)
    if vectorizer_dir:
        os.makedirs(vectorizer_dir, exist_ok=True)
    joblib.dump(vectorizer, vectorizer_path)
    print("TF-IDF vectorizer fitted and saved.")

# NOTE(review): .toarray() densifies the matrices (rows x up to 10000
# features), which can need a lot of memory on large datasets.
X_test_tfidf = vectorizer.transform(X_test).toarray()

# **Train XGBoost with Early Stopping**

In [None]:
# Fraction of the training rows held out to drive early stopping. The test
# set used to play this role, which leaked it into model selection (the
# number of boosting rounds) before it was used for the final report.
validation_fraction = 0.1

dtrain_full = xgb.DMatrix(X_train_tfidf, label=y_train)
dtest = xgb.DMatrix(X_test_tfidf, label=y_test)

# Split by row index and slice the DMatrix, so the dense feature array is
# not copied a second time in NumPy.
fit_rows, validation_rows = train_test_split(
    np.arange(dtrain_full.num_row()),
    test_size=validation_fraction,
    random_state=42,
)
dtrain = dtrain_full.slice(fit_rows)
dval = dtrain_full.slice(validation_rows)
del dtrain_full

evals_result = {}  # to track metrics per iteration

xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 5,
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',
    'device': DEVICE         # "cuda" or "cpu", from the detection cell
}

xgb_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=num_boost_round,
    # "test" is still logged so its curve can be plotted, but it is listed
    # first: early stopping follows the LAST entry, i.e. the validation slice.
    evals=[(dtest, "test"), (dval, "validation")],
    evals_result=evals_result,          # store logs here
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=10
)
print("XGBoost training complete.")

# **Plot Test Logloss Over Iterations**

In [None]:
# Per-iteration multiclass logloss recorded for the "test" evaluation set.
test_logloss = evals_result['test']['mlogloss']
epochs = len(test_logloss)
x_axis = range(0, epochs)

# Draw the curve through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(x_axis, test_logloss, label='Test Logloss', color='blue')
ax.set_xlabel('Iteration')
ax.set_ylabel('Logloss')
ax.set_title('XGBoost Logloss Over Iterations')
ax.legend()
ax.grid(True)
plt.show()

# **Feature Importance Visualization**

In [None]:
# The booster was trained on a plain array, so it only knows features by
# positional keys such as "f123". Map those back to the TF-IDF terms so the
# chart shows the actual words and bigrams.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary_terms = vectorizer.get_feature_names_out()
else:
    # Older scikit-learn releases only provide get_feature_names().
    vocabulary_terms = vectorizer.get_feature_names()

# 'weight' (split count) is plot_importance's default importance type.
raw_scores = xgb_model.get_score(importance_type='weight')
named_scores = {}
for feature_key, score in raw_scores.items():
    index_text = feature_key[1:]
    is_positional = feature_key.startswith('f') and index_text.isdigit()
    if is_positional and int(index_text) < len(vocabulary_terms):
        feature_key = str(vocabulary_terms[int(index_text)])
    named_scores[feature_key] = score

fig, ax = plt.subplots(figsize=(8, 6))
plot_importance(named_scores, ax=ax, max_num_features=15, title='Top 15 Feature Importances')
plt.show()

# **Evaluate Final Model**

In [None]:
# Predict with the trees up to and including the best early-stopping round.
best_iteration = xgb_model.best_iteration
y_pred_probs = xgb_model.predict(dtest, iteration_range=(0, best_iteration + 1))
y_pred = np.argmax(y_pred_probs, axis=1)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
# Pass the full label set explicitly: without it log_loss raises when some
# class is absent from y_test, because the probability matrix always has one
# column per trained class.
class_labels = list(range(y_pred_probs.shape[1]))
loss_val = log_loss(y_test, y_pred_probs, labels=class_labels)

print("\n Final Evaluation on Test Set ")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"F1-Score: {f1:.4f}")
print(f"Log Loss: {loss_val:.4f}")

# **Save Model and TF-IDF Vectorizer**

In [None]:
# Bundle the trained booster with the vectorizer and the test metrics so one
# file is enough to reproduce predictions.
model_artifacts = {
    'model': xgb_model,
    'vectorizer': vectorizer,
    'accuracy': accuracy,
    'f1_score': f1,
    'logloss': loss_val,
    # The metrics above were computed with trees up to this round; consumers
    # need it to reproduce the same predictions. None if early stopping did
    # not set it.
    'best_iteration': getattr(xgb_model, 'best_iteration', None)
}

# Create the output folder if needed so the dump cannot fail at the very end
# of the run with FileNotFoundError.
artifact_dir = os.path.dirname(best_model_path)
if artifact_dir:
    os.makedirs(artifact_dir, exist_ok=True)

joblib.dump(model_artifacts, best_model_path)
print(f"Model artifacts saved to {best_model_path}!")
