# 1) Install dependencies

In [None]:
# Install the third-party packages used by this notebook.
# NOTE(review): no versions are pinned, so a fresh install may pull releases whose
# APIs differ from the ones this notebook was written against — consider pinning.
%pip install transformers torch scikit-learn pandas xgboost joblib tqdm matplotlib wordcloud langdetect --quiet

# 2) Imports & setup

In [None]:
import os
import json
import gc
import re
import numpy as np
import pandas as pd
import torch
import joblib
import matplotlib.pyplot as plt
import xgboost as xgb
from wordcloud import WordCloud
from tqdm import tqdm

from sklearn.model_selection import (
    train_test_split,
    StratifiedShuffleSplit,
    cross_val_score
)
from sklearn.preprocessing import LabelEncoder

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report
)
from sklearn.preprocessing import label_binarize
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight

from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from langdetect import detect

# reproducibility
from langdetect import DetectorFactory

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# langdetect's detection is randomised unless a seed is set before the first
# detect() call; without this the English filter can keep a different set of
# reviews on every run.
DetectorFactory.seed = RANDOM_SEED

# device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 3) Config & Paths

In [None]:
# Input dataset and output directory. The defaults are the original author's local
# paths; set REVIEW_DATA_PATH / REVIEW_OUTPUT_DIR to run the notebook on another
# machine without editing this cell.
file_path = os.environ.get(
    "REVIEW_DATA_PATH",
    "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Review_Score/Dataset/correct_reviews_balanced.json"
)
Base = os.environ.get(
    "REVIEW_OUTPUT_DIR",
    "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Testing/Review_ScoreTest/ReviewBERT-XGB"
)

# Artefacts written under the output directory
distilbert_dir = f"{Base}/distilbert_model"
tfidf_path     = f"{Base}/tfidf_vectorizer.pkl"
xgb_model_path = f"{Base}/xgb_hybrid.pkl"

# model/data params
NUM_LABELS     = 5      # star ratings 1-5 are mapped to labels 0-4
EMB_BATCH      = 32     # batch size used when extracting embeddings
NUM_EPOCHS     = 20     # upper bound on fine-tuning epochs; early stopping may end sooner
TFIDF_MAX_FEAT = 10000  # vocabulary cap for the TF-IDF vectorizer

# 4) Load & preprocess the entire JSON dataset

In [None]:
# Load JSON. The encoding is explicit so the result does not depend on the
# platform's default text encoding.
with open(file_path, "r", encoding="utf-8") as f:
    payload = json.load(f)

# Handle both possible structures:
if isinstance(payload, dict):
    shops = payload.get("root", [])
elif isinstance(payload, list):
    shops = payload
else:
    raise ValueError(f"Unexpected JSON structure: {type(payload)}")

# Flatten reviews into one record per review with a usable text and rating
records = []
skipped_bad_rating = 0
for shop in shops:
    for rev in (shop.get("reviews") or []):
        text = (rev.get("text") or "").strip()
        if not text:
            continue
        source = (rev.get("source") or "").upper()
        # The first whitespace-separated token of the rating is the star value.
        # str() also accepts ratings stored as numbers rather than strings.
        try:
            stars = float(str(rev.get("rating", "")).split()[0])
            label = int(stars) - 1
        except (IndexError, ValueError, OverflowError):
            skipped_bad_rating += 1
            continue
        # A missing or out-of-range rating would give a label outside
        # 0..NUM_LABELS-1, which the classifiers downstream cannot train on.
        if not 0 <= label < NUM_LABELS:
            skipped_bad_rating += 1
            continue
        records.append({
            "text": text,
            "label": label,
            "source": source
        })

# Explicit columns keep the frame well-formed even when no review was kept
df = pd.DataFrame(records, columns=["text", "label", "source"])
print(f"Loaded total reviews: {len(df)}")
if skipped_bad_rating:
    print(f"Skipped {skipped_bad_rating} reviews with a missing or invalid rating")

# Remove exact duplicates (same text), keeping the first occurrence
dups = int(df.duplicated(subset=["text"]).sum())
df = df.drop_duplicates(subset=["text"])

print(f"Dropped {dups} duplicates -> {len(df)} remaining")

# 5) Filter non-English & low-quality reviews

In [None]:
def is_english(text):
    """Return True when the language detector classifies ``text`` as English.

    Detection is best-effort: any ordinary error raised while detecting is
    treated as "not English" and yields False. KeyboardInterrupt and SystemExit
    are not caught, so a long filtering run can still be interrupted.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        return False

def is_valid_review(text):
    """Return True when ``text`` has at least five whitespace-separated tokens
    and contains at least one ASCII letter or digit."""
    if len(text.split()) < 5:
        return False
    return re.search(r'[A-Za-z0-9]', text) is not None

# Run the cheap length/character check first so the much slower language
# detector only sees reviews that could still be kept. The final mask is the
# same conjunction: valid AND English.
valid_mask = df['text'].apply(is_valid_review).astype(bool)
english_mask = (
    df.loc[valid_mask, 'text']
    .apply(is_english)
    .reindex(df.index, fill_value=False)  # rows that failed the cheap check stay False
    .astype(bool)
)
mask = valid_mask & english_mask
df = df[mask].reset_index(drop=True)
print(f"After filtering: {len(df)} reviews")

# Show label distribution
print("Label distribution:")
print(df["label"].value_counts().sort_index())

# Count SL vs USA reviews
counts = df['source'].value_counts()
sl_count  = counts.get('SL', 0)
usa_count = counts.get('USA', 0)
print(f"SL reviews:  {sl_count}")
print(f"USA reviews: {usa_count}")

# 6) Set optimal BERT_MAX_LEN (95th percentile)

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Token count of every review, special tokens included
lengths = []
for review_text in tqdm(df["text"], desc="Token lengths"):
    token_ids = tokenizer.encode(review_text, add_special_tokens=True)
    lengths.append(len(token_ids))

# Use the 95th-percentile length, capped at 512
p95_length = int(np.percentile(lengths, 95))
BERT_MAX_LEN = min(512, p95_length)
print(f"Set BERT_MAX_LEN = {BERT_MAX_LEN}")

# 7) Stratified Train/Test split

In [None]:
# 80/20 split, stratified on the label column
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=df["label"],
)

# Pull text, label and source out of each split as plain Python lists
train_texts, train_labels, train_sources = (
    df_train[column].tolist() for column in ("text", "label", "source")
)
test_texts, test_labels, test_sources = (
    df_test[column].tolist() for column in ("text", "label", "source")
)

print(f"Train: {len(train_texts)}   Test: {len(test_texts)}")

# The full frame is no longer needed once the splits exist
del df
gc.collect()

# 8) Tokenization helper

In [None]:
def batch_tokenize(texts, tokenizer, max_length, batch_size=10000):
    """Tokenize ``texts`` in chunks of ``batch_size`` and collect the results.

    Every chunk is truncated and padded to ``max_length``. Returns a dict with
    two parallel lists, ``"input_ids"`` and ``"attention_mask"``, holding one
    entry per input text in the original order.
    """
    collected = {"input_ids": [], "attention_mask": []}
    chunk_starts = range(0, len(texts), batch_size)
    for start in tqdm(chunk_starts, desc="Tokenizing"):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_attention_mask=True,
        )
        for field in ("input_ids", "attention_mask"):
            collected[field].extend(encoded[field])
    return collected

# 9) Tokenize all texts

In [None]:
# Encode the train split first, then the test split, both to BERT_MAX_LEN
train_enc, test_enc = (
    batch_tokenize(split_texts, tokenizer, BERT_MAX_LEN)
    for split_texts in (train_texts, test_texts)
)
print("Tokenization complete")

# 10) Build PyTorch datasets

In [None]:
class ReviewDataset(torch.utils.data.Dataset):
    """Torch dataset over pre-tokenized reviews.

    ``enc`` must provide ``"input_ids"`` and ``"attention_mask"``; ``labels``
    holds one label per review. Everything is converted to tensors up front.
    """

    def __init__(self, enc, labels):
        self.ids, self.mask = (
            torch.tensor(enc[key]) for key in ("input_ids", "attention_mask")
        )
        self.labels = torch.tensor(labels)

    def __len__(self):
        # One item per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Key names match the original item layout exactly
        fields = (
            ("input_ids", self.ids),
            ("attention_mask", self.mask),
            ("labels", self.labels),
        )
        return {name: tensor[idx] for name, tensor in fields}

# Wrap each split's encodings and labels in a dataset
train_ds = ReviewDataset(train_enc, train_labels)
test_ds = ReviewDataset(test_enc, test_labels)

# The datasets hold tensor copies, so the raw encodings can be released
del train_enc
del test_enc
gc.collect()

# 11) Fine-tune DistilBERT with Early Stopping (patience=3)

In [None]:
# Pre-trained DistilBERT with a fresh classification head of NUM_LABELS outputs
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=NUM_LABELS
).to(device)

# transformers renamed `evaluation_strategy` to `eval_strategy` and later removed
# the old name. Use whichever keyword the installed version declares so this cell
# runs on both older and newer releases.
_training_arg_fields = getattr(TrainingArguments, "__dataclass_fields__", {})
_eval_strategy_kw = (
    "eval_strategy" if "eval_strategy" in _training_arg_fields else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=distilbert_dir,
    learning_rate=5e-5,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",            # must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",  # the "f1" value returned by compute_metrics
    greater_is_better=True,
    seed=RANDOM_SEED,
    logging_steps=50,
    report_to=[],                     # no external experiment trackers
    **{_eval_strategy_kw: "epoch"}    # evaluate once per epoch
)

def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    ``p`` exposes ``predictions`` (per-class scores, argmax taken over axis 1)
    and ``label_ids`` (true labels). Returns a dict with keys ``"accuracy"``
    and ``"f1"``.
    """
    predicted = np.argmax(p.predictions, axis=1)
    truth = p.label_ids
    accuracy = accuracy_score(truth, predicted)
    weighted_f1 = f1_score(truth, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Hold out part of the training data for the per-epoch evaluation. Early stopping
# and load_best_model_at_end pick a checkpoint from these scores, so evaluating on
# the test set here would leak it into model selection and inflate the final
# test metrics.
bert_train_idx, bert_val_idx = train_test_split(
    np.arange(len(train_ds)),
    test_size=0.1,
    random_state=RANDOM_SEED,
    stratify=train_labels
)
bert_train_ds = torch.utils.data.Subset(train_ds, bert_train_idx.tolist())
bert_val_ds   = torch.utils.data.Subset(train_ds, bert_val_idx.tolist())

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=bert_train_ds,
    eval_dataset=bert_val_ds,
    compute_metrics=compute_metrics,
    # stop after 3 evaluations without improvement in the tracked metric
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()
# Save model and tokenizer to the same directory
trainer.save_model(distilbert_dir)
tokenizer.save_pretrained(distilbert_dir)
print("BERT fine-tuning done")

# 12) Extract CLS embeddings for every review

In [None]:
# Reload the fine-tuned checkpoint from disk, move it to the active device and
# switch it to evaluation mode
model = DistilBertForSequenceClassification.from_pretrained(distilbert_dir)
model = model.to(device)
model.eval()

def get_cls_embeddings(texts):
    """Return one embedding row per text, taken from the model's last hidden layer.

    Texts are processed in batches of ``EMB_BATCH`` with the notebook-level
    ``tokenizer``, ``model`` and ``device``. Each batch is truncated and padded
    to ``BERT_MAX_LEN``, and the last hidden state at position 0 (the [CLS]
    token) is kept for every text.

    Returns a 2-D numpy array with ``len(texts)`` rows. Empty input gives an
    array with zero rows instead of raising.
    """
    if len(texts) == 0:
        # np.vstack([]) raises, so return an empty matrix of the right width
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embs = []
    for i in tqdm(range(0, len(texts), EMB_BATCH), desc="Embedding"):
        toks = tokenizer(
            texts[i:i+EMB_BATCH],
            truncation=True,
            padding="max_length",
            max_length=BERT_MAX_LEN,
            return_tensors="pt"
        ).to(device)
        # Inference only: no gradients needed
        with torch.no_grad():
            out = model(**toks, output_hidden_states=True, return_dict=True)
            # last layer, first token of every sequence
            cls = out.hidden_states[-1][:,0,:].cpu().numpy()
        embs.append(cls)
    return np.vstack(embs)

# Embed the train split, then the test split
train_emb, test_emb = (
    get_cls_embeddings(split_texts) for split_texts in (train_texts, test_texts)
)
print("Embeddings shapes:", train_emb.shape, test_emb.shape)

# 13) TF-IDF vectors with enhanced config

In [None]:
# Unigram + bigram TF-IDF over lower-cased text with English stop words removed
tfidf_config = {
    "max_features": TFIDF_MAX_FEAT,
    "stop_words": 'english',
    "lowercase": True,
    "ngram_range": (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_config)

# Fit on the training texts only, then reuse the fitted vocabulary for the test texts
train_tfidf_sparse = vectorizer.fit_transform(train_texts)
test_tfidf_sparse = vectorizer.transform(test_texts)
X_train_tfidf = train_tfidf_sparse.toarray()
X_test_tfidf = test_tfidf_sparse.toarray()

joblib.dump(vectorizer, tfidf_path)
print("TF-IDF shapes:", X_train_tfidf.shape, X_test_tfidf.shape)

# Encode sources as integer codes.
# "SL" and "USA" are the expected values, but a review without a source is loaded
# with source "" and any other value is also possible. Fitting only on the two
# expected values would make transform() raise on anything else, so the encoder is
# fitted on the expected values plus whatever is present. When only "SL" and "USA"
# occur, the codes are the same as before (SL -> 0, USA -> 1).
expected_sources = {"SL", "USA"}
observed_sources = set(train_sources) | set(test_sources)
unexpected_sources = sorted(observed_sources - expected_sources)
if unexpected_sources:
    print(f"Warning: unexpected source values encountered: {unexpected_sources}")

le = LabelEncoder()
le.fit(sorted(expected_sources | observed_sources))

# Column vectors so they can be stacked next to the other feature blocks
train_sources_encoded = le.transform(train_sources).reshape(-1, 1)
test_sources_encoded = le.transform(test_sources).reshape(-1, 1)

# 14) Combine features and apply PCA

In [None]:
# Stack the three feature blocks column-wise: embeddings | TF-IDF | encoded source
X_train = np.hstack([train_emb, X_train_tfidf, train_sources_encoded])
X_test = np.hstack([test_emb, X_test_tfidf, test_sources_encoded])
print("Combined dims before PCA:", X_train.shape, X_test.shape)

# PCA cannot keep more components than min(n_samples, n_features); clamp the
# target of 1000 so a small dataset does not make the fit raise.
n_pca_components = min(1000, X_train.shape[0], X_train.shape[1])
pca = PCA(n_components=n_pca_components, random_state=RANDOM_SEED)
# Fit on the training features only and apply the same projection to the test features
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
print("Dims after PCA:", X_train.shape, X_test.shape)

# The XGB model saved later is trained on these projected features, so the fitted
# projection is stored next to the other artefacts for reuse on new data.
joblib.dump(pca, f"{Base}/pca.pkl")

y_train = np.array(train_labels)
y_test = np.array(test_labels)

# 15) Compute sample weights for class imbalance

In [None]:
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# Look weights up by label value rather than by array position: position only
# equals label when every label 0..N-1 is present in y_train.
weight_by_label = dict(zip(classes.tolist(), class_weights.tolist()))

# Up-weight label 1 (2-star reviews, since label = stars - 1)
BOOSTED_LABEL = 1
BOOST_FACTOR = 3
if BOOSTED_LABEL in weight_by_label:
    weight_by_label[BOOSTED_LABEL] *= BOOST_FACTOR
    # keep the array view consistent with the mapping
    class_weights[classes == BOOSTED_LABEL] *= BOOST_FACTOR

# One weight per training sample
sample_weights = np.array([weight_by_label[l] for l in y_train.tolist()])

# 16) Subsample 20% for hyperparameter search & perform Successive Halving

In [None]:
# Draw a stratified 20% subsample of the training features for the search
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.2, random_state=RANDOM_SEED)
sub_idx, _ = next(sss.split(X_train, y_train))
X_small = X_train[sub_idx]
y_small = y_train[sub_idx]
sw_small = sample_weights[sub_idx]

print(f"Full-train subsample: X_small shape = {X_small.shape}, y_small shape = {y_small.shape}")

# Split X_small into subtrain and validation (80/20, stratified)
X_subtrain, X_val, y_subtrain, y_val, sw_subtrain, sw_val = train_test_split(
    X_small,
    y_small,
    sw_small,
    test_size=0.2,
    stratify=y_small,
    random_state=RANDOM_SEED
)

print(f"X_subtrain: {X_subtrain.shape}, y_subtrain: {y_subtrain.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")

# Define base XGBClassifier.
# The device follows the torch device detected at the top of the notebook, so a
# machine without CUDA trains on CPU instead of being asked for a GPU.
xgb_clf = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=NUM_LABELS,
    seed=RANDOM_SEED,
    use_label_encoder=False,
    eval_metric="mlogloss",
    tree_method="hist",
    device=device.type
)

# Hyperparameter grid (candidates are sampled from these lists)
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0]
}

# Set up HalvingRandomSearchCV: the resource grown between rounds is the number
# of trees (n_estimators), from 50 up to at most 200, multiplied by 3 per round.
halving_search = HalvingRandomSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_candidates="exhaust",
    factor=3,
    resource="n_estimators",
    min_resources=50,
    max_resources=200,
    cv=3,
    scoring='f1_weighted',  # Optimize for weighted F1-score
    random_state=RANDOM_SEED,
    n_jobs=-1,
    verbose=2
)

print("\nStarting HalvingRandomSearchCV on 20% subset …")
# Extra keyword arguments are forwarded to the estimator's fit()
halving_search.fit(
    X_subtrain,
    y_subtrain,
    sample_weight=sw_subtrain,
    eval_set=[(X_val, y_val)],
    verbose=False
)

best_params = halving_search.best_params_
print("\nSubset Best Params:", best_params)

# 17) Retrain best XGB model on 90% of the training data (10% held out for early stopping) & save

In [None]:

# Rebuild the classifier from the searched hyper-parameters.
# - device follows the torch device detected at the top of the notebook, so a
#   machine without CUDA trains on CPU.
# - early_stopping_rounds is set on the estimator: recent XGBoost releases no
#   longer accept it as a fit() argument.
best_xgb = xgb.XGBClassifier(
    **best_params,
    objective="multi:softprob",
    num_class=NUM_LABELS,
    seed=RANDOM_SEED,
    use_label_encoder=False,
    eval_metric="mlogloss",
    tree_method="hist",
    device=device.type,
    early_stopping_rounds=10
)

# Split X_train into training and validation sets for early stopping
# (90/10, stratified; sample weights are split alongside)
X_train_final, X_val_final, y_train_final, y_val_final, sw_train_final, sw_val_final = train_test_split(
    X_train, y_train, sample_weights,
    test_size=0.1,
    random_state=RANDOM_SEED,
    stratify=y_train
)

# Train with early stopping, monitored on the held-out validation part
print("Retraining best model on training subset with early stopping…")
best_xgb.fit(
    X_train_final,
    y_train_final,
    sample_weight=sw_train_final,
    eval_set=[(X_val_final, y_val_final)],
    verbose=True
)

# Clear the early-stopping setting on the fitted estimator so that copies of it
# (for example the clones made by cross-validation) can be fitted without an
# eval_set. The trained booster itself is not modified by this.
best_xgb.set_params(early_stopping_rounds=None)

# Save the model
joblib.dump(best_xgb, xgb_model_path)
print("XGB model saved with best subset params.")

# 18) Evaluate & classification report

In [None]:
# Score the held-out test features with the retrained model
y_pred = best_xgb.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1  = f1_score(y_test, y_pred, average="weighted")
print(f"Accuracy: {acc:.4f}   F1: {f1:.4f}\n")
print("Classification Report:")
# Pass the label ids explicitly: target_names must line up with the labels, and
# without `labels` the report only covers classes that actually occur in
# y_test / y_pred, which raises when one of the NUM_LABELS classes is absent.
report_labels = list(range(NUM_LABELS))
print(classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=[f"{i+1}" for i in report_labels]
))

# 5-fold cross-validation on the training set.
# cross_val_score fits a fresh clone of best_xgb on every fold; the fitted
# best_xgb itself is left untouched.
print("Performing 5-fold cross-validation")
cv_scores = cross_val_score(best_xgb, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# 19) Confusion matrix

In [None]:
# Generate confusion matrix.
# The label ids are passed explicitly so the matrix always has NUM_LABELS rows and
# columns and matches display_labels, even if a class is absent from y_test / y_pred.
cm_labels = list(range(NUM_LABELS))
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
# Tick labels show star ratings (label + 1)
disp = ConfusionMatrixDisplay(cm, display_labels=[f"{i+1}" for i in cm_labels])
disp.plot(cmap=plt.cm.Blues, xticks_rotation="vertical")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()