# Cell 1 — Clean install & GPU check (Colab-safe)

In [None]:
# !nvidia-smi -L || print("⚠️ No GPU detected. Go to Runtime > Change runtime type > GPU.")
!nvidia-smi -L
# Remove RAPIDS if present (they pin old pyarrow)
!pip -q uninstall -y cudf-cu12 dask-cudf-cu12 cuml-cu12 pylibcudf-cu12 rmm-cu12 ucx-py ucxx rapids-dask-dependency || true
# Colab-friendly versions
!pip -q install -U pandas==2.2.2 pyarrow==21.0.0
!pip -q install -U transformers datasets accelerate scikit-learn

GPU 0: Tesla T4 (UUID: GPU-0277f83b-4ecc-2b8e-1778-611e3432ed3e)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25h

# Cell 2 — (Optional) Mount Drive

In [None]:
from google.colab import drive
USE_DRIVE = False  # set True if you want to save to Drive
if USE_DRIVE:
    drive.mount('/content/drive')
    SAVE_DIR = "/content/drive/MyDrive/baitbuster_two_stage"
else:
    SAVE_DIR = "/content/baitbuster_two_stage"

import os
os.makedirs(SAVE_DIR, exist_ok=True)
print("Saving to:", SAVE_DIR)

Saving to: /content/baitbuster_two_stage


# Cell 3 — Load dataset

In [None]:
import pandas as pd, os

CSV_PATH     = "/content/drive/MyDrive/Dataset/BaitBuster-Bangla_253070_18c_HL10k_AIL.csv"
PARQUET_PATH = "/content/drive/MyDrive/Dataset/BaitBuster-Bangla_253070_18c_HL10k_AIL.parquet"
XLSX_PATH    = "/content/drive/MyDrive/Dataset/BaitBuster-Bangla_253070_18c_HL10k_AIL.xlsx"

if   os.path.exists(CSV_PATH):     df_all = pd.read_csv(CSV_PATH)
elif os.path.exists(PARQUET_PATH): df_all = pd.read_parquet(PARQUET_PATH)
elif os.path.exists(XLSX_PATH):    df_all = pd.read_excel(XLSX_PATH)
else: raise FileNotFoundError("Upload dataset and update the paths above.")

print("Columns:", list(df_all.columns)[:30])
print("Rows:", len(df_all))

Columns: ['channel_id', 'channel_name', 'channel_url', 'video_id', 'publishedAt', 'title', 'title_debiased', 'description', 'description_debiased', 'url', 'viewCount', 'commentCount', 'likeCount', 'dislikeCount', 'thumbnail', 'auto_labeled', 'human_labeled', 'ai_labeled']
Rows: 253070


# Cell 4 — Labels, text cleaning, engagement features


In [None]:
import numpy as np
import pandas as pd

# Prefer debiased columns when available
TITLE_COL = "title_debiased" if "title_debiased" in df_all.columns else "title"
DESC_COL  = "description_debiased" if "description_debiased" in df_all.columns else "description"

assert TITLE_COL in df_all.columns, f"Missing {TITLE_COL}"
if DESC_COL not in df_all.columns:
    df_all[DESC_COL] = ""

def norm_label(s):
    if pd.isna(s): return None
    s = str(s).strip().lower()
    mapping = {
        "not_clickbait":"not clickbait", "non-clickbait":"not clickbait",
        "non_clickbait":"not clickbait", "notclickbait":"not clickbait",
        "click bait":"clickbait", "yes":"clickbait", "no":"not clickbait",
        "1":"clickbait", "0":"not clickbait"
    }
    return mapping.get(s, s if s in ["clickbait","not clickbait"] else None)

def choose_label_and_source(row):
    for col, src in [("human_labeled","human"), ("ai_labeled","ai"), ("auto_labeled","auto")]:
        if col in row.index and pd.notna(row[col]):
            lab = norm_label(row[col])
            if lab in ["clickbait","not clickbait"]:
                return lab, src
    return None, None

labs, srcs = [], []
for _, r in df_all.iterrows():
    l, s = choose_label_and_source(r)
    labs.append(l); srcs.append(s)

df_all["label_str"] = labs
df_all["label_source"] = srcs
df_all = df_all[df_all["label_str"].isin(["clickbait","not clickbait"])].copy()
df_all["label"] = (df_all["label_str"] == "clickbait").astype(int)

def _clean_text(s):
    if pd.isna(s): return ""
    return str(s).strip()

df_all[TITLE_COL] = df_all[TITLE_COL].apply(_clean_text)
df_all[DESC_COL]  = df_all[DESC_COL].apply(_clean_text)
df_all = df_all[df_all[TITLE_COL].ne("")].copy()

print("Usable rows:", len(df_all))
print("Label balance 0/1:", np.bincount(df_all["label"]))
print(df_all["label_source"].value_counts(dropna=False).to_frame("count"))

# ---------- Engagement feature engineering ----------
for c in ["viewCount", "likeCount", "commentCount", "dislikeCount"]:
    if c not in df_all.columns:
        df_all[c] = 0

for c in ["viewCount", "likeCount", "commentCount", "dislikeCount"]:
    df_all[c] = pd.to_numeric(df_all[c], errors="coerce").fillna(0)

for c in ["viewCount", "likeCount", "commentCount", "dislikeCount"]:
    df_all[f"log1p_{c}"] = np.log1p(df_all[c])

df_all["like_ratio"]   = df_all["likeCount"] / (df_all["likeCount"] + df_all["dislikeCount"] + 1.0)
df_all["comment_rate"] = df_all["commentCount"] / (df_all["viewCount"] + 1.0)

NUM_FEATS = [
    "log1p_viewCount", "log1p_likeCount", "log1p_commentCount", "log1p_dislikeCount",
    "like_ratio", "comment_rate"
]
print("Numeric features:", NUM_FEATS)

Total usable rows (any label): 253070
Label balance 0/1: [208024  45046]
               count
label_source        
ai            243070
human          10000


Unnamed: 0,title_debiased,label_str,label_source
0,এইমাত্র! মসজিদে নামাজরত অবস্থায় তিন বৃদ্ধকে পি...,clickbait,human
1,"১০ বছরের সন্তান ফেলে আ,লীগ নেতার সাথে পালিয়ে গ...",clickbait,human
2,এই মাত্র পাওয়া খবর! ৫ বছরের জেল হচ্ছে পরীমনির!...,clickbait,human
3,ছি ছি! ভাগিনার সাথে পরকীয়ার সময় হাতেনাতে ধরা খ...,clickbait,human
4,হায়রে পরীমনি! কারাগারে গিয়েও ভালো হলোনা! কারাগ...,clickbait,human


  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(


# Cell 5 — Human-only splits + stage pools + scaler (fit on Stage-1 only)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Human-only rows
is_human = (df_all["label_source"] == "human")
df_human = df_all[is_human].copy()
assert len(df_human) > 0, "No human-labeled rows found."

# Stratified 70/15/15 on human labels
idx = np.arange(len(df_human))
h_tr_idx, h_tmp = train_test_split(idx, test_size=0.30, stratify=df_human["label"].values, random_state=42)
h_va_idx, h_te_idx = train_test_split(h_tmp, test_size=0.50, stratify=df_human["label"].values[h_tmp], random_state=42)

df_h_tr = df_human.iloc[h_tr_idx].copy()
df_h_va = df_human.iloc[h_va_idx].copy()
df_h_te = df_human.iloc[h_te_idx].copy()

print("Human splits sizes:", len(df_h_tr), len(df_h_va), len(df_h_te))

# Exclude human val/test rows from Stage-1 to avoid leakage
human_valtest_ids = set(df_h_va.index.tolist() + df_h_te.index.tolist())
mask_stage1 = ~df_all.index.isin(human_valtest_ids)

df_stage1 = df_all[mask_stage1].copy()     # Stage-1: human-train + AI + auto
df_stage2 = df_h_tr.copy()                  # Stage-2: human-train
df_eval_val = df_h_va.copy()                # Validation on human-val
df_eval_test = df_h_te.copy()               # Final test on human-test

# Source-based sample weights (Stage-1 only)
weight_map = {"human": 1.0, "ai": 0.7, "auto": 0.5}
df_stage1["sample_weight"] = df_stage1["label_source"].map(weight_map).fillna(0.5).astype("float32")

print("\nStage-1 size:", len(df_stage1), "(by source)")
print(df_stage1["label_source"].value_counts())
print("\nStage-2 (human-train) size:", len(df_stage2))
print("\nEval val/test sizes:", len(df_eval_val), len(df_eval_test))

# ----- Fit scaler on Stage-1 only (no leakage) -----
scaler = StandardScaler()
X1 = scaler.fit_transform(df_stage1[NUM_FEATS].values.astype("float32"))
X2 = scaler.transform(df_stage2[NUM_FEATS].values.astype("float32"))
XV = scaler.transform(df_eval_val[NUM_FEATS].values.astype("float32"))
XT = scaler.transform(df_eval_test[NUM_FEATS].values.astype("float32"))

# Stash arrays for dataset builders
df_stage1["_eng_feats_np"]   = list(X1)
df_stage2["_eng_feats_np"]   = list(X2)
df_eval_val["_eng_feats_np"] = list(XV)
df_eval_test["_eng_feats_np"] = list(XT)

Human splits sizes: 7000 1500 1500

Stage-1 size: 250070  (by source)
label_source
ai       243070
human      7000
Name: count, dtype: int64

Stage-2 (human-train) size: 7000

Eval val/test sizes: 1500 1500


# Cell 6 — Tokenize (title + description) and attach numeric features

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset
import numpy as np

MODEL_NAME = "csebuetnlp/banglabert"  # or "sagorsarker/bangla-bert-base"
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

MAX_LEN = 192
TRUNCATION_POLICY = "only_second"  # keep title fully; truncate description if needed

def map_pair_with_numeric(df, include_weights=False):
    enc = tok(
        text=df[TITLE_COL].astype(str).tolist(),
        text_pair=df[DESC_COL].astype(str).tolist(),
        truncation=TRUNCATION_POLICY,
        padding="max_length",
        max_length=MAX_LEN
    )
    data = {
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "labels": df["label"].astype(int).tolist(),
        "eng_feats": np.stack(df["_eng_feats_np"].values).astype("float32"),
    }
    if "token_type_ids" in enc:  # if model uses segment ids
        data["token_type_ids"] = enc["token_type_ids"]
    if include_weights:
        data["sample_weight"] = df["sample_weight"].astype("float32").tolist()

    ds = Dataset.from_dict(data)
    ds.set_format(type="torch")
    return ds

ds_stage1 = map_pair_with_numeric(df_stage1, include_weights=True)
ds_stage2 = map_pair_with_numeric(df_stage2, include_weights=False)
ds_val    = map_pair_with_numeric(df_eval_val, include_weights=False)
ds_test   = map_pair_with_numeric(df_eval_test, include_weights=False)

len(ds_stage1), len(ds_stage2), len(ds_val), len(ds_test)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

(250070, 7000, 1500, 1500)

# Cell 7 — Metrics + Weighted Trainer

In [None]:
import torch, torch.nn as nn
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return {
        "accuracy": float(accuracy_score(labels, preds)),
        "f1_macro": float(f1_score(labels, preds, average="macro")),
        "f1_micro": float(f1_score(labels, preds, average="micro")),
        "kappa":    float(cohen_kappa_score(labels, preds)),
    }

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        weights = inputs.pop("sample_weight", None)

        outputs = model(**inputs, labels=labels)
        logits = outputs.logits

        ce = nn.CrossEntropyLoss(reduction="none")(logits, labels)
        if weights is not None:
            weights = weights.to(ce.device).view(-1)
            loss = (ce * weights).sum() / (weights.sum() + 1e-8)
        else:
            loss = ce.mean()
        return (loss, outputs) if return_outputs else loss

# Cell 8 — Custom model: BanglaBERT + small MLP for numeric feats

In [None]:
import torch.nn as nn
from transformers import AutoModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutput

class BertWithNumeric(nn.Module):
    def __init__(self, model_name: str, num_labels: int = 2, num_numeric: int = 6):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.bert = AutoModel.from_pretrained(model_name, config=self.config)
        hidden = self.config.hidden_size

        self.num_proj = nn.Sequential(
            nn.Linear(num_numeric, 32),
            nn.ReLU(),
            nn.LayerNorm(32)
        )

        self.dropout = nn.Dropout(getattr(self.config, "hidden_dropout_prob", 0.1))
        self.classifier = nn.Linear(hidden + 32, num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        eng_feats=None,
        labels=None,
        **kwargs
    ):
        bert_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled = bert_out.last_hidden_state[:, 0, :]  # CLS

        if eng_feats is None:
            num_emb = torch.zeros((pooled.size(0), 32), device=pooled.device)
        else:
            num_emb = self.num_proj(eng_feats)

        x = torch.cat([pooled, num_emb], dim=1)
        x = self.dropout(x)
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.3683
200,0.2666
300,0.2419
400,0.235
500,0.224
600,0.203
700,0.1936
800,0.1972
900,0.1759
1000,0.19


Stage-1 checkpoint saved to: /content/baitbuster_two_stage/stage1_all_best


# Cell 9 — Stage-1 training (pretrain on broad pool)

In [None]:
import os, torch

stage1_model = BertWithNumeric(MODEL_NAME, num_labels=2, num_numeric=len(NUM_FEATS))

# Some transformers versions renamed eval args; try both
try:
    stage1_args = TrainingArguments(
        output_dir=os.path.join(SAVE_DIR, "stage1_all"),
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=2,      # 2–3 is enough for pretraining
        weight_decay=0.01,
        evaluation_strategy="no",
        save_strategy="epoch",
        logging_steps=100,
        fp16=torch.cuda.is_available(),
        seed=42,
        report_to=[]
    )
except TypeError:
    stage1_args = TrainingArguments(
        output_dir=os.path.join(SAVE_DIR, "stage1_all"),
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=2,
        weight_decay=0.01,
        eval_strategy="no",
        save_strategy="epoch",
        logging_steps=100,
        fp16=torch.cuda.is_available(),
        seed=42,
        report_to=[]
    )

stage1_trainer = WeightedTrainer(
    model=stage1_model,
    args=stage1_args,
    train_dataset=ds_stage1,   # has sample_weight + eng_feats
)
stage1_trainer.train()

stage1_ckpt = os.path.join(SAVE_DIR, "stage1_all_best")
stage1_trainer.save_model(stage1_ckpt)
print("Stage-1 checkpoint saved to:", stage1_ckpt)

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro,Kappa
1,0.033,0.030928,0.990667,0.990501,0.990667,0.981002
2,0.0094,0.03061,0.993333,0.993215,0.993333,0.98643
3,0.0045,0.032135,0.993333,0.993213,0.993333,0.986426
4,0.0012,0.034955,0.994,0.993895,0.994,0.987789


Stage-2 best checkpoint saved to: /content/baitbuster_two_stage/stage2_human_best


# Cell 10 — Stage-2 fine-tuning (human-train) with early stopping on human-val

In [None]:
from transformers import EarlyStoppingCallback

# Recreate the model class and load encoder head weights from Stage-1
stage2_model = BertWithNumeric(MODEL_NAME, num_labels=2, num_numeric=len(NUM_FEATS))

# Load state dict if available
pt_path = os.path.join(stage1_ckpt, "pytorch_model.bin")
if os.path.exists(pt_path):
    state = torch.load(pt_path, map_location="cpu")
    # Strict=False allows shape-safe load even if heads differ
    stage2_model.load_state_dict(state, strict=False)
else:
    print("⚠️ Could not find stage-1 weights; training Stage-2 from fresh init.")

try:
    stage2_args = TrainingArguments(
        output_dir=os.path.join(SAVE_DIR, "stage2_human"),
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=4,            # 3–5 typically good
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        logging_steps=100,
        fp16=torch.cuda.is_available(),
        seed=42,
        report_to=[]
    )
except TypeError:
    stage2_args = TrainingArguments(
        output_dir=os.path.join(SAVE_DIR, "stage2_human"),
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=4,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        logging_steps=100,
        fp16=torch.cuda.is_available(),
        seed=42,
        report_to=[]
    )

stage2_trainer = Trainer(
    model=stage2_model,
    args=stage2_args,
    train_dataset=ds_stage2,   # human-train only
    eval_dataset=ds_val,       # human-val
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

stage2_trainer.train()

stage2_ckpt = os.path.join(SAVE_DIR, "stage2_human_best")
stage2_trainer.save_model(stage2_ckpt)
print("Stage-2 best checkpoint saved to:", stage2_ckpt)

=== HUMAN-VAL METRICS ===
{'accuracy': 0.994,
 'f1_macro': 0.9938946929505411,
 'f1_micro': 0.994,
 'kappa': 0.9877894356005789}

=== HUMAN-TEST METRICS ===
{'accuracy': 0.9933333333333333,
 'f1_macro': 0.9932199222189477,
 'f1_micro': 0.9933333333333333,
 'kappa': 0.9864398689548936}

Confusion matrix [rows=true, cols=pred] (0,1):
 [[648   6]
 [  4 842]]

Classification report:
               precision    recall  f1-score   support

           0     0.9939    0.9908    0.9923       654
           1     0.9929    0.9953    0.9941       846

    accuracy                         0.9933      1500
   macro avg     0.9934    0.9930    0.9932      1500
weighted avg     0.9933    0.9933    0.9933      1500



# **Cell 11** - Final evaluation on human-test + save metrics

In [1]:
from pprint import pprint
from sklearn.metrics import confusion_matrix, classification_report
import json, pandas as pd, os

raw_val  = stage2_trainer.predict(ds_val)
raw_test = stage2_trainer.predict(ds_test)

print("=== HUMAN-VAL METRICS ===")
val_metrics = compute_metrics((raw_val.predictions, raw_val.label_ids))
pprint(val_metrics)

print("\n=== HUMAN-TEST METRICS ===")
test_metrics = compute_metrics((raw_test.predictions, raw_test.label_ids))
pprint(test_metrics)

y_pred = raw_test.predictions.argmax(axis=1)
print("\nConfusion matrix [rows=true, cols=pred] (0,1):\n", confusion_matrix(raw_test.label_ids, y_pred))
print("\nClassification report:\n", classification_report(raw_test.label_ids, y_pred, digits=4))

pd.DataFrame([val_metrics]).to_csv(os.path.join(SAVE_DIR, "val_metrics.csv"), index=False)
pd.DataFrame([test_metrics]).to_csv(os.path.join(SAVE_DIR, "final_human_test_metrics.csv"), index=False)
with open(os.path.join(SAVE_DIR, "final_human_test_metrics.json"), "w") as f:
    json.dump(test_metrics, f, indent=2)

NameError: name 'stage2_trainer' is not defined

In [None]:
# 📊 Visualization Cell — Confusion Matrix & Metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix
y_true = raw_test.label_ids
y_pred = raw_test.predictions.argmax(axis=1)
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Not Clickbait","Clickbait"], yticklabels=["Not Clickbait","Clickbait"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix on Human-Test")
plt.show()

# Bar plot for metrics
metrics_to_plot = {
    "Accuracy": test_metrics["accuracy"],
    "F1 Macro": test_metrics["f1_macro"],
    "F1 Micro": test_metrics["f1_micro"],
    "Kappa": test_metrics["kappa"]
}

plt.figure(figsize=(6,4))
sns.barplot(x=list(metrics_to_plot.keys()), y=list(metrics_to_plot.values()), palette="viridis")
plt.ylim(0,1)
plt.title("Evaluation Metrics on Human-Test")
plt.ylabel("Score")
plt.show()