In [None]:
!pip install -q evaluate

# 📚 Imports
import gc
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import StratifiedKFold
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

# NOTE(review): presumably switches off Weights & Biases reporting for the
# Trainer runs below — confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

# 📄 Load dataset
# Keep only the text column ('X', renamed to 'comment') and the 'label' column.
df = pd.read_csv("/kaggle/input/final-dataset/clean_dataset.csv")
df = df[['X', 'label']].rename(columns={'X': 'comment'})

# Parse the labels once. Anything that is not numeric becomes NaN and is
# dropped together with rows that have no comment text.
numeric_label = pd.to_numeric(df['label'], errors='coerce')
keep = df['comment'].notna() & numeric_label.notna()

# Build the cleaned frame from the *parsed* labels rather than the raw column:
# a string label such as "1.0" passes the numeric check but would make
# astype(int) on the raw values raise. Constructing a new frame also avoids
# assigning columns on a filtered slice of the original.
df = pd.DataFrame({
    'comment': df.loc[keep, 'comment'].astype(str),
    'label': numeric_label[keep].astype(int),
}).reset_index(drop=True)

# 🔤 Tokenizer
# model_name identifies the pretrained checkpoint. It is reused further down
# to load a fresh classification model for every fold, so the tokenizer and
# the model always come from the same checkpoint.
model_name = "UBC-NLP/MARBERTv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "comment" texts of a batch of examples.

    Sequences are truncated to at most 256 tokens. No padding is applied
    here: the script passes a DataCollatorWithPadding to the Trainer, which
    pads each training/evaluation batch to its own longest sequence. Padding
    at map time as well would only store extra pad tokens per row.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; must
            provide the texts under the "comment" key.

    Returns:
        The tokenizer output for the batch (unpadded, truncated).
    """
    return tokenizer(examples["comment"], truncation=True, max_length=256)

# 📏 Metrics
# Loaded once at module level; compute_metrics reuses both objects on every
# evaluation pass.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy and weighted F1.

    Args:
        eval_pred: A pair that unpacks to ``(logits, labels)``.

    Returns:
        A dict with the keys "accuracy" and "f1" (F1 uses weighted averaging).
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted = torch.tensor(logits).argmax(dim=1)
    accuracy_value = accuracy.compute(
        predictions=predicted, references=labels
    )["accuracy"]
    weighted_f1 = f1.compute(
        predictions=predicted, references=labels, average="weighted"
    )["f1"]
    return {"accuracy": accuracy_value, "f1": weighted_f1}

# 🧩 Data collator
# Built from the shared tokenizer and handed to the Trainer of every fold.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 🔁 Stratified K-Fold Cross-Validation
k_folds = 5
# Shuffled, label-stratified splits; the fixed random_state keeps the fold
# assignment the same from run to run.
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
# One trainer.evaluate() result dict is appended per fold and averaged after
# the loop.
all_fold_results = []

# Tokenize the whole corpus once. The text is identical in every fold, so
# tokenizing each fold's split separately would repeat the same work k_folds
# times. Rows stay in df order, so positional fold indices apply directly.
tokenized_full = Dataset.from_pandas(df, preserve_index=False).map(
    tokenize_function, batched=True
)

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['label'])):
    print(f"\n🚀 Fold {fold+1}/{k_folds}")

    # 🗂️ Prepare datasets
    # skf.split yields positional row indices into df, which match the row
    # positions of tokenized_full.
    train_dataset = tokenized_full.select(train_idx)
    val_dataset = tokenized_full.select(val_idx)

    # ⚙️ Training arguments
    training_args = TrainingArguments(
        output_dir=f"./fold_{fold+1}",
        eval_strategy="epoch",
        save_strategy="epoch",
        # With load_best_model_at_end the best checkpoint is always retained,
        # so up to two checkpoints (best + most recent) can remain on disk.
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        learning_rate=3e-6,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=8,
        weight_decay=0.05,
        label_smoothing_factor=0.1,
        gradient_accumulation_steps=2,
        # Mixed precision needs a CUDA device; fall back to full precision
        # otherwise so the script still runs on a CPU-only machine.
        fp16=torch.cuda.is_available(),
        warmup_steps=300,
        lr_scheduler_type="cosine",
        logging_dir=f"./logs/fold_{fold+1}",
        logging_steps=10,
        push_to_hub=False,
    )

    # 🧠 Load fresh model
    # Every fold starts from the pretrained checkpoint so that no weights
    # leak from one fold into the next.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        id2label={0: "not_hate", 1: "hate"},
        label2id={"not_hate": 0, "hate": 1}
    )
    model.gradient_checkpointing_enable()

    # 🏋️‍♂️ Trainer
    # Early stopping is intentionally not enabled: all num_train_epochs run
    # and the best epoch (by f1) is restored at the end. To enable it, pass
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)].
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # 📊 Evaluation
    # Runs on the validation split with the best checkpoint already restored.
    result = trainer.evaluate()
    print(f"📊 Fold {fold+1} Results:", result)
    all_fold_results.append(result)

    # 💾 Save best model (best checkpoint restored by load_best_model_at_end)
    best_model_dir = f"./best_model_fold_{fold+1}"
    trainer.save_model(best_model_dir)
    print(f"✅ Best model saved to {best_model_dir}")

    # Release this fold's model and trainer before the next fold builds its
    # own; otherwise both generations are alive at once while the new Trainer
    # is being constructed, which raises peak GPU memory.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

# 📈 Final average metrics
# Average each reported metric over the per-fold evaluation results.
avg_accuracy, avg_f1 = (
    np.mean([fold_result[metric_key] for fold_result in all_fold_results])
    for metric_key in ('eval_accuracy', 'eval_f1')
)

print(f"\n✅ Average Accuracy over {k_folds} folds: {avg_accuracy:.4f}")
print(f"✅ Average F1 Score over {k_folds} folds: {avg_f1:.4f}")