In [None]:
import json
import pandas as pd
import numpy as np
import time
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, precision_recall_curve, average_precision_score, roc_curve, roc_auc_score
from sklearn.preprocessing import label_binarize
from transformers import (BertTokenizer, BertForSequenceClassification, BertConfig, Trainer, TrainingArguments, EarlyStoppingCallback)
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import torch
import nltk
import malaya

In [None]:
# Assemble a single stopword set for the word clouds: the wordcloud package
# defaults, NLTK's English list, Malaya's Malay list, and a few corpus-specific
# tokens (pronouns, placeholders, laughter, particles).
nltk.download('stopwords')
english_stopwords = set(nltk.corpus.stopwords.words("english"))
malay_stopwords = set(malaya.text.function.get_stopwords())
custom_stopwords = {
    "saya", "awak", "kau", "user", "en", "ms", "url",
    "je", "lah", "la", "number", "hahaha", "haha", "eh",
}
all_stopwords = STOPWORDS | english_stopwords | malay_stopwords | custom_stopwords

In [None]:
# Load the combined English/Malay corpus and prepend a language marker to every
# text: "[MS]" for rows whose lang is 'ms', "[EN]" for everything else.
base_dir = Path("./datasets/stage1/v2")
df = pd.read_csv(base_dir / "stage1_combined_en_ms.csv")
df['text'] = [
    f"[MS] {text}" if lang == 'ms' else f"[EN] {text}"
    for text, lang in zip(df['text'], df['lang'])
]

In [None]:
def _load_series(path):
    """Load a pickled-object .npy array from ``path`` and wrap it in a pandas Series.

    NOTE(review): allow_pickle=True unpickles arbitrary objects, so these files
    must come from a trusted source.
    """
    return pd.Series(np.load(path, allow_pickle=True))


# Pre-computed train/validation splits.
train_texts = _load_series("s1_db_visuals/train_texts.npy")
train_labels = _load_series("s1_db_visuals/train_labels.npy")
val_texts = _load_series("s1_db_visuals/val_texts.npy")
val_labels = _load_series("s1_db_visuals/val_labels.npy")

# Held-out test split; the labels stay a plain ndarray loaded without pickling.
test_texts = _load_series("s1_db_visuals/test_texts.npy")
test_labels = np.load("s1_db_visuals/test_labels.npy")
test_langs = _load_series("s1_db_visuals/test_langs.npy")

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Every entry of ``examples["text"]`` is coerced to ``str`` first, then the
    batch is padded/truncated to a fixed length of 128 tokens.
    """
    batch_texts = [str(text) for text in examples["text"]]
    return tokenizer(
        batch_texts,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

def prepare_dataset(texts, labels):
    """Build a tokenized, torch-formatted ``Dataset`` from texts and labels.

    Args:
        texts: Sequence (list, ndarray or Series) of input texts.
        labels: Sequence (list, ndarray or Series) of labels, one per text.

    Returns:
        A ``datasets.Dataset`` exposing ``input_ids``, ``attention_mask`` and
        ``labels`` as torch tensors.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    # Pair texts and labels by position. Building the frame directly from two
    # Series would align them on their index labels instead, which silently
    # introduces NaN rows when the indices differ.
    text_col = pd.Series(texts).reset_index(drop=True)
    label_col = pd.Series(labels).reset_index(drop=True)
    if len(text_col) != len(label_col):
        raise ValueError(
            f"texts and labels must have the same length, got {len(text_col)} and {len(label_col)}"
        )

    frame = pd.DataFrame({"text": text_col, "labels": label_col})
    # preserve_index=False keeps the pandas index out of the dataset columns.
    dataset = Dataset.from_pandas(frame, preserve_index=False)
    dataset = dataset.map(tokenize_function, batched=True)
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset

# Tokenize each split once up front. Note that test_labels is a plain ndarray
# while the train/val labels are pandas Series.
train_dataset = prepare_dataset(train_texts, train_labels)
val_dataset = prepare_dataset(val_texts, val_labels)
test_dataset = prepare_dataset(test_texts, test_labels)

In [None]:
# Binary classifier on top of multilingual BERT, with dropout set explicitly
# to 0.1 for both hidden states and attention probabilities.
checkpoint_name = "bert-base-multilingual-cased"
config = BertConfig.from_pretrained(
    checkpoint_name,
    num_labels=2,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, config=config)

# Move the model to the GPU when one is present.
if torch.cuda.is_available():
    model.to("cuda")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy plus weighted F1, precision and recall.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    metrics = {"accuracy": accuracy_score(labels, predicted)}
    weighted_scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = scorer(labels, predicted, average="weighted")
    return metrics

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./training_checkpoints",
    logging_dir="./logs",
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Optimisation schedule.
    num_train_epochs=6,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Batching and data loading.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    dataloader_num_workers=2,
    # Model selection: keep the checkpoint with the highest eval_f1.
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

# Stop training when the monitored metric has not improved for 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Fine-tune the model using the settings in training_args (per-epoch evaluation
# and checkpointing, early stopping with patience 3).
trainer.train()

In [None]:
# === Inference with Timing ===
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
preds = trainer.predict(test_dataset)
elapsed_seconds = time.perf_counter() - start_time

# Average wall-clock time per sample for the whole predict() call; the max()
# guard avoids a ZeroDivisionError on an empty test set.
inference_time = elapsed_seconds / max(len(test_dataset), 1)

print(f"\n Inference Time per Sample: {inference_time:.4f} seconds")

In [None]:
# === Output Directory for Visualisations ===
# All figures, reports and prediction arrays produced below are written here;
# the directory (and any missing parents) is created if it does not exist.
output_dir = Path("./s1_mb_visuals")
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Extract predictions and binarize
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)
y_true_bin = label_binarize(y_true, classes=[0, 1]).ravel()

# Score used for ROC / PR curves: the softmax probability of class 1.
# The raw class-1 logit alone is not a valid ranking score, because the
# predicted probability depends on the difference between the two logits.
logits = np.asarray(preds.predictions, dtype=np.float64)
logits = logits - logits.max(axis=1, keepdims=True)  # subtract row max for numerical stability
exp_logits = np.exp(logits)
y_score = (exp_logits / exp_logits.sum(axis=1, keepdims=True))[:, 1]

In [None]:
# Create DataFrame for evaluation
# One row per test sample: text, gold label, predicted label and language.
eval_texts = test_texts.reset_index(drop=True)
eval_langs = test_langs.reset_index(drop=True)
df_eval = pd.DataFrame(
    {"text": eval_texts, "true": y_true, "pred": y_pred, "lang": eval_langs}
)

In [None]:
# === Confusion Matrix (Overall) ===
cm = confusion_matrix(y_true, y_pred)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=["Non-Hate", "Hate"],
    yticklabels=["Non-Hate", "Hate"],
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
fig.tight_layout()
fig.savefig(output_dir / "confusion_matrix.png")
plt.close(fig)

In [None]:
# === Language-Specific Confusion Matrices ===
langs = df_eval["lang"].unique()
for lang in langs:
    sub = df_eval[df_eval["lang"] == lang]
    # Pin labels to [0, 1] so the matrix is always 2x2 and matches the tick
    # labels, even when a language subset contains only one class.
    cm_lang = confusion_matrix(sub["true"], sub["pred"], labels=[0, 1])
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm_lang, annot=True, fmt='d', cmap="Oranges", xticklabels=["Non-Hate", "Hate"], yticklabels=["Non-Hate", "Hate"])
    # str() guards against non-string language values (e.g. NaN).
    plt.title(f"Confusion Matrix - {str(lang).upper()}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(output_dir / f"confusion_matrix_{lang}.png")
    plt.close()

In [None]:
# === ROC Curve ===
fpr, tpr, _ = roc_curve(y_true_bin, y_score)
auc_score = roc_auc_score(y_true_bin, y_score)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"AUC = {auc_score:.2f}")
ax.plot([0, 1], [0, 1], "k--")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig(output_dir / "roc_curve.png")
plt.close(fig)

In [None]:
# === Precision-Recall Curve ===
precision, recall, _ = precision_recall_curve(y_true_bin, y_score)
ap = average_precision_score(y_true_bin, y_score)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f'AP={ap:.2f}')
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.grid(True)
ax.legend()
fig.tight_layout()
fig.savefig(output_dir / "precision_recall_curve.png")
plt.close(fig)

In [None]:
# === Word Clouds for FP & FN ===
fp_texts = df_eval[(df_eval["true"] == 0) & (df_eval["pred"] == 1)]["text"]
fn_texts = df_eval[(df_eval["true"] == 1) & (df_eval["pred"] == 0)]["text"]


def _save_error_wordcloud(texts, title, filename):
    """Render a word cloud for ``texts`` and save it under ``output_dir``.

    Args:
        texts: Iterable of texts; each item is coerced to ``str``.
        title: Figure title.
        filename: File name (relative to ``output_dir``) for the saved image.

    Returns:
        The generated ``WordCloud``, or ``None`` when there are no words to
        plot (for example, when the model made no errors of this type).
    """
    corpus = " ".join(str(text) for text in texts)
    try:
        cloud = WordCloud(width=800, height=400, stopwords=all_stopwords).generate(corpus)
    except ValueError:
        # WordCloud.generate raises ValueError when no words remain after
        # stopword filtering; skip the figure instead of aborting the notebook.
        print(f"Skipping '{title}': no words available to plot.")
        return None

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(output_dir / filename)
    plt.close(fig)
    return cloud


wordcloud_fp = _save_error_wordcloud(fp_texts, "False Positives Word Cloud", "wordcloud_fp.png")
wordcloud_fn = _save_error_wordcloud(fn_texts, "False Negatives Word Cloud", "wordcloud_fn.png")

In [None]:
# === Save Classification Report ===
# Per-class and aggregate metrics as a dict, written out as a CSV table.
report_dict = classification_report(y_true, y_pred, output_dict=True)
report_frame = pd.DataFrame(report_dict)
report_frame.to_csv(output_dir / "classification_report.csv")

In [None]:
# === Language-Specific Classification Reports ===
# One report (and one CSV) per language value found in df_eval["lang"].
lang_reports = {}
for lang in langs:
    lang_mask = df_eval["lang"] == lang
    sub = df_eval[lang_mask]
    report = classification_report(sub["true"], sub["pred"], output_dict=True)
    lang_reports[lang] = report
    report_path = output_dir / f"classification_report_{lang}.csv"
    pd.DataFrame(report).to_csv(report_path)

In [None]:
# === F1 Score by Class Bar Chart ===
# Pull the per-class F1 out of the overall report (its keys are the labels as strings).
f1_scores = {label: report_dict[str(label)]['f1-score'] for label in [0, 1]}

fig, ax = plt.subplots()
ax.bar(["Non-Hate", "Hate"], list(f1_scores.values()), color=["#1f77b4", "#ff7f0e"])
ax.set_title("F1 Score by Class")
ax.set_ylabel("F1 Score")
ax.set_ylim(0, 1)
ax.grid(True, axis='y')
fig.tight_layout()
fig.savefig(output_dir / "f1_by_class.png")
plt.close(fig)

In [None]:
# === Prediction Accuracy Breakdown Bar ===
df_eval["correct"] = df_eval["true"] == df_eval["pred"]

# value_counts() orders rows by frequency, so pin the order explicitly:
# otherwise the positional colors below would paint "Incorrect" green whenever
# errors outnumber correct predictions. fill_value=0 keeps a bar for a
# category that never occurs.
outcome_counts = (
    df_eval["correct"]
    .value_counts()
    .rename({True: "Correct", False: "Incorrect"})
    .reindex(["Correct", "Incorrect"], fill_value=0)
)

fig, ax = plt.subplots()
outcome_counts.plot(kind="bar", color=["green", "red"], ax=ax)
ax.set_title("Prediction Accuracy Breakdown")
ax.set_ylabel("Count")
ax.grid(axis="y")
fig.tight_layout()
fig.savefig(output_dir / "prediction_accuracy_breakdown.png")
plt.close(fig)

In [None]:
# === Loss Overfit/Underfit Plot ===
# Collect the logged training and evaluation losses from the trainer history.
log_history = trainer.state.log_history
train_loss, val_loss = [], []
for entry in log_history:
    if "loss" in entry:
        train_loss.append(entry["loss"])
    if "eval_loss" in entry:
        val_loss.append(entry["eval_loss"])

# Trim both series to the shorter one so they can share an x-axis.
min_len = min(len(train_loss), len(val_loss))
train_loss, val_loss = train_loss[:min_len], val_loss[:min_len]
epochs = list(range(1, min_len + 1))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epochs, train_loss, marker='o', label="Training Loss")
ax.plot(epochs, val_loss, marker='o', label="Validation Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training vs Validation Loss")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig(output_dir / "loss_overfit_underfit.png")
plt.close(fig)

In [None]:
# === Save Inference Time ===
# Persist the per-sample inference time next to the other evaluation artefacts.
with open(output_dir / "inference_time.txt", "w", encoding="utf-8") as f:
    f.write(f"Inference Time per Sample: {inference_time:.4f} seconds")

# Report the directory the files were actually written to.
print(f"\n All Stage 1 visualisation outputs saved to {output_dir}")

In [None]:
# === Save the Best Model and Tokenizer ===
# Model and tokenizer go into the same directory, created here if missing.
model_path = Path("s1_mb_model")
model_path.mkdir(parents=True, exist_ok=True)

# Save best model weights and tokenizer
# NOTE(review): this saves whatever model the trainer currently holds; with
# load_best_model_at_end=True that is presumably the best checkpoint by
# eval_f1 - confirm that training completed before relying on it.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# Save predictions for visuals
# Stores the argmax class predictions for the test set as a .npy array.
np.save(output_dir / "y_pred_mb.npy", y_pred)

print(f"Model and tokenizer saved to: {model_path}")