In [None]:
# ==============================
# 0. INSTALL & LOGIN
# ==============================
!pip install transformers==4.36.2 accelerate==0.25.0 huggingface_hub==0.19.4

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub; called with no arguments,
# so no token is hard-coded in the notebook.
login()

In [None]:
# ==============================
# 1. IMPORTS & SETUP
# ==============================
import re
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, classification_report

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup
)

# Seed the Python, NumPy and PyTorch (CPU and CUDA) generators with one shared value.
SEED = 42
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)


# ==============================
# 2. LOAD DATA
# ==============================
# Read the raw CSV, rename its columns to the names used in the rest of the
# notebook, then drop rows with a missing text/label and rows with a repeated text.
df = (
    pd.read_csv("/kaggle/input/mental-health/compressed_data.csv")
    .rename(columns={"statement": "text", "status": "label"})
    .dropna(subset=["text", "label"])
    .drop_duplicates(subset=["text"])
    .reset_index(drop=True)
)

# Class names in sorted order define the integer ids (0..n-1) and both lookup tables.
label_names = sorted(df["label"].unique())
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

df["label_id"] = df["label"].map(label2id)

print("Classes:", label2id)
print("Dataset shape:", df.shape)


# ==============================
# 3. MINIMAL CLEANING (SAME AS YOUR CODE)
# ==============================
def minimal_clean(text):
    """Return *text* as a lightly normalised string.

    The input is converted with ``str()`` and then, in order:
    1. URLs (``http...`` / ``www...``) and ``@mentions`` are removed,
    2. anything that looks like an HTML tag (``<...>``) is removed,
    3. every run of whitespace is collapsed to a single space.
    Leading and trailing whitespace is stripped from the result.
    """
    substitutions = (
        (r"http\S+|www\S+|@\w+", ""),
        (r"<.*?>", ""),
        (r"\s+", " "),
    )
    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Normalise every statement with minimal_clean.
df["text"] = df["text"].apply(minimal_clean)

# The earlier de-duplication ran on the raw text. Cleaning can leave a statement
# empty (e.g. it consisted only of a URL or a mention) or make two previously
# distinct statements identical. Drop both cases here, after cleaning, so blank
# inputs are not trained on and the same text cannot end up in both the train
# and the test split.
_rows_before_cleaning_filter = len(df)
df = (
    df[df["text"] != ""]
    .drop_duplicates(subset=["text"])
    .reset_index(drop=True)
)
print(
    "Rows dropped after cleaning (empty or duplicate):",
    _rows_before_cleaning_filter - len(df),
)


# ==============================
# 4. TRAIN / TEST SPLIT (STRATIFIED)
# ==============================
# Hold out 15% of the rows for testing; stratifying on the label id keeps the
# class proportions the same in both parts, and SEED fixes the shuffle.
train_df, test_df = train_test_split(
    df,
    stratify=df["label_id"],
    test_size=0.15,
    random_state=SEED,
)


# ==============================
# 5. TOKENIZER & DATASET
# ==============================
# Hub id of the pretrained checkpoint; the same id is used below to load the model.
MODEL_BASE = "mental/mental-bert-base-uncased"
# Tokenizer matching MODEL_BASE; MentalDataset reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE)

class MentalDataset(Dataset):
    """Torch dataset that tokenizes all texts once, up front.

    ``texts`` and ``labels`` must both provide ``.tolist()`` (the notebook
    passes pandas Series). Tokenization uses the module-level ``tokenizer``,
    truncating to 256 tokens and padding all sequences to a common length.
    """

    def __init__(self, texts, labels):
        raw_texts = texts.tolist()
        self.encodings = tokenizer(
            raw_texts,
            max_length=256,
            truncation=True,
            padding=True,
        )
        self.labels = labels.tolist()

    def __len__(self):
        """Number of examples (one label per example)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a ``"labels"`` entry."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Tokenized datasets for the two splits.
train_ds = MentalDataset(texts=train_df["text"], labels=train_df["label_id"])
test_ds = MentalDataset(texts=test_df["text"], labels=test_df["label_id"])

# Training batches are reshuffled every epoch; test batches keep dataset order.
train_loader = DataLoader(dataset=train_ds, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=32)


# ==============================
# 6. MODEL
# ==============================
# Load the pretrained checkpoint with a classification head sized to the number
# of classes, and store both label mappings in the model config.
num_classes = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_BASE,
    num_labels=num_classes,
    label2id=label2id,
    id2label=id2label,
)
model = model.to(device)


# ==============================
# 7. CLASS WEIGHTED LOSS
# ==============================
# "balanced" weights computed from the training labels only.
train_label_ids = train_df["label_id"]
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_label_ids),
    y=train_label_ids,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float, device=device)

# Cross-entropy that scales each example's loss by the weight of its class.
criterion = nn.CrossEntropyLoss(weight=class_weights)


# ==============================
# 8. OPTIMIZER & SCHEDULER
# ==============================
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# and emits a FutureWarning pointing to torch.optim.AdamW. weight_decay and eps
# are set explicitly to the defaults of the deprecated class (0.0 and 1e-6) so
# the regularisation strength stays as it was.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.0,
    eps=1e-6,
)

# Linear decay to zero with no warmup steps. The schedule spans 3 epochs in
# total: the 1 frozen-backbone epoch plus the 2 full fine-tuning epochs that
# follow, which share this optimizer and scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * 3
)


# ==============================
# 9. STAGE 1: FREEZE BACKBONE (WARMUP)
# ==============================
# Freeze every backbone parameter; only parameters outside model.base_model
# (the classification head) keep requires_grad=True during this stage.
model.base_model.requires_grad_(False)

print("Backbone frozen — warmup training")

for epoch in range(1):
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        # Move the whole batch (inputs and labels) to the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        optimizer.zero_grad()

        # The class-weighted criterion is applied to the logits; the loss the
        # model computes internally from "labels" is not used.
        logits = model(**batch).logits
        loss = criterion(logits, batch["labels"])
        loss.backward()

        optimizer.step()
        scheduler.step()
        running_loss += loss.item()

    mean_loss = running_loss / len(train_loader)
    print(f"[Warmup] Epoch {epoch+1} | Loss: {mean_loss:.4f}")

# ==============================
# ⭐ SAVE CHECKPOINT AFTER WARMUP
# ==============================
# Write model weights/config and tokenizer files into the same local directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("mentalbert_warmup")
print("✅ Warmup checkpoint saved")

In [None]:
# ==============================
# 10. STAGE 2: UNFREEZE & FINETUNE
# ==============================
# Re-enable gradients for every backbone parameter so the whole network trains.
model.base_model.requires_grad_(True)

print("Backbone unfrozen — full fine-tuning")

for epoch in range(2):
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        # Move the whole batch (inputs and labels) to the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        optimizer.zero_grad()

        # The class-weighted criterion is applied to the logits; the loss the
        # model computes internally from "labels" is not used.
        logits = model(**batch).logits
        loss = criterion(logits, batch["labels"])
        loss.backward()

        optimizer.step()
        scheduler.step()
        running_loss += loss.item()

    mean_loss = running_loss / len(train_loader)
    print(f"[Finetune] Epoch {epoch+1} | Loss: {mean_loss:.4f}")


# ==============================
# 11. EVALUATION
# ==============================
# Switch to inference behaviour and collect gold / predicted class ids over the
# whole test set.
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Predicted class = index of the largest logit for each example.
        preds = torch.argmax(outputs.logits, dim=1)

        y_true.extend(batch["labels"].cpu().tolist())
        y_pred.extend(preds.cpu().tolist())

# Pass the full list of class ids explicitly. Without `labels`, the report
# infers the classes from y_true/y_pred and raises when that count differs from
# len(target_names), i.e. whenever a class is absent from both lists.
print(
    classification_report(
        y_true,
        y_pred,
        labels=list(range(len(label_names))),
        target_names=label_names,
    )
)


# ==============================
# 12. SAVE & PUSH MODEL
# ==============================
# Local output directory; also used as the repository name on the Hub.
MODEL_NAME = "mentalbert-mental-health"

model.save_pretrained(MODEL_NAME)
tokenizer.save_pretrained(MODEL_NAME)

from huggingface_hub import HfApi
api = HfApi()
# create_repo returns the repository URL object whose `repo_id` is the fully
# qualified "<namespace>/<name>" id. Upload with that id rather than the bare
# name, which carries no namespace.
repo_url = api.create_repo(repo_id=MODEL_NAME, exist_ok=True)
api.upload_folder(folder_path=MODEL_NAME, repo_id=repo_url.repo_id)

print("✅ Model trained, evaluated, and pushed to Hugging Face")