In [None]:
# ============================================================
#  MuRIL Fine-tune (Version-safe, K)
# Clean output + Accuracy + Macro-F1
# ============================================================

!pip -q install -U transformers accelerate datasets scikit-learn

import os, re, numpy as np, pandas as pd, torch
from google.colab import drive
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding, logging
)

# ---------- SILENCE EXTRA WARNINGS ----------
# Environment switch read by the tokenizers library; "false" turns its
# internal parallelism off.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# `logging` is the module imported from transformers above, so this restricts
# transformers' own log output to errors.
logging.set_verbosity_error()

# ---------- LOAD DATA ----------
# Mount Google Drive inside the Colab runtime (force_remount=True is passed so
# an existing mount is replaced).
drive.mount("/content/drive", force_remount=True)
# Training data; the code below reads its "Text" and "Class" columns.
train_df = pd.read_csv("/content/drive/MyDrive/data/train.csv")

def clean_text(text):
    """Normalise one raw text value for tokenization.

    The value is converted to ``str``, lower-cased, and every character that
    is not an ASCII letter ``a-z``, a code point in U+0B80-U+0BFF, or
    whitespace is replaced by a space. Runs of whitespace are then collapsed
    to a single space and the ends are trimmed.

    Args:
        text: Any value; non-strings are stringified first (``None`` becomes
            ``"none"``).

    Returns:
        The cleaned string, possibly empty.
    """
    lowered = str(text).lower().strip()
    # Digits, punctuation, emoji, etc. become spaces rather than being
    # deleted, so the words on either side stay separated.
    letters_only = re.sub(r'[^a-z\u0b80-\u0bff\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()

# Normalise both raw columns to clean strings before encoding the labels.
train_df["Text"] = train_df["Text"].astype(str).apply(clean_text)
train_df["Class"] = train_df["Class"].astype(str).str.lower().str.strip()

# Encode class names as integers. Classes missing from the map come back as
# NaN from .map(); those rows are dropped before the cast to int.
label_map = {"non-abusive": 0, "abusive": 1}
train_df = (
    train_df
    .assign(label=lambda frame: frame["Class"].map(label_map))
    .dropna(subset=["label"])
    .reset_index(drop=True)
)
train_df["label"] = train_df["label"].astype(int)

# Split BEFORE oversampling. Duplicating abusive rows first and splitting
# afterwards puts copies of the same example in both the training and the
# validation set, which leaks training data into validation and inflates the
# reported accuracy / macro-F1.
X = train_df["Text"].tolist()
y = train_df["label"].tolist()

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

# Oversample abusive (training split only): each abusive training example is
# appended once more, doubling that class just as the previous whole-frame
# duplication did. The validation split keeps the original class balance.
abusive_texts = [text for text, label in zip(X_train, y_train) if label == 1]
X_train = X_train + abusive_texts
y_train = y_train + [1] * len(abusive_texts)

# ---------- TOKENIZE ----------
# Checkpoint identifier; the same name is used below to load the
# sequence-classification model.
MODEL_NAME = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def make_ds(texts, labels):
    """Build a tokenized ``datasets.Dataset`` from parallel text/label lists.

    Args:
        texts: List of input strings.
        labels: List of integer class ids, aligned with ``texts``.

    Returns:
        A Dataset holding the tokenizer outputs plus a ``labels`` column; the
        raw ``text`` column is removed after tokenization.
    """
    ds = Dataset.from_dict({"text": texts, "labels": labels})
    # Truncate only; do not pad here. The Trainer in this script is built
    # with DataCollatorWithPadding, which pads each batch to its own longest
    # sequence. Padding everything to max_length=128 up front made that
    # collator a no-op and spent compute on pad tokens.
    return ds.map(
        lambda batch: tokenizer(batch["text"], truncation=True, max_length=128),
        batched=True,
        remove_columns=["text"],
    )

# Tokenized datasets for the training and validation splits.
train_ds = make_ds(X_train, y_train)
val_ds   = make_ds(X_val, y_val)

# ---------- MODEL ----------
# Sequence-classification model loaded from the same checkpoint name as the
# tokenizer; num_labels=2 matches the two entries of label_map.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

def metrics(p):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the argmax is
            taken over axis 1) and ``label_ids`` (true class ids).

    Returns:
        Dict with keys ``"accuracy"`` and ``"macro_f1"``.
    """
    y_true = p.label_ids
    # Predicted class = index of the highest score in each row.
    y_pred = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1}

# ---------- TRAINING ARGS (IMPORTANT FIX HERE ) ----------
# Same settings as before, grouped by concern.
args = TrainingArguments(
    # Output location and reporting
    output_dir="./muril_clean",
    report_to="none",
    save_strategy="no",
    # Per-epoch cadence for evaluation and logging
    eval_strategy="epoch",      # FIX (not evaluation_strategy)
    logging_strategy="epoch",
    # Optimisation schedule
    num_train_epochs=6,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    # Batch sizes and precision
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
)

# Wire the model, arguments, datasets, batch collator and metric function
# together. DataCollatorWithPadding pads each batch using the tokenizer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=metrics
)

print("ðŸš€ Training started...")
trainer.train()

# One more evaluation pass after training; the values returned by `metrics`
# are read back under "eval_"-prefixed keys.
res = trainer.evaluate()
print("\n FINAL ACCURACY :", round(res["eval_accuracy"], 4))
print(" FINAL MACRO F1 :", round(res["eval_macro_f1"], 4))


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/4607 [00:00<?, ? examples/s]

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

ðŸš€ Training started...
{'loss': '0.639', 'grad_norm': '1.395', 'learning_rate': '2.8e-05', 'epoch': '1'}
{'eval_loss': '0.6167', 'eval_accuracy': '0.7138', 'eval_macro_f1': '0.6092', 'eval_runtime': '1.495', 'eval_samples_per_second': '544.5', 'eval_steps_per_second': '17.39', 'epoch': '1'}
{'loss': '0.5773', 'grad_norm': '33.51', 'learning_rate': '2.252e-05', 'epoch': '2'}
{'eval_loss': '0.5546', 'eval_accuracy': '0.7715', 'eval_macro_f1': '0.7568', 'eval_runtime': '1.405', 'eval_samples_per_second': '579.4', 'eval_steps_per_second': '18.51', 'epoch': '2'}
{'loss': '0.4945', 'grad_norm': '9.19', 'learning_rate': '1.503e-05', 'epoch': '3'}
{'eval_loss': '0.4947', 'eval_accuracy': '0.8256', 'eval_macro_f1': '0.7981', 'eval_runtime': '1.383', 'eval_samples_per_second': '588.4', 'eval_steps_per_second': '18.79', 'epoch': '3'}
{'loss': '0.4214', 'grad_norm': '9.636', 'learning_rate': '7.524e-06', 'epoch': '4'}
{'eval_loss': '0.4749', 'eval_accuracy': '0.8428', 'eval_macro_f1': '0.8177', 