"""
CAPSTONE PROJECT: MULTI-CLASS MENTAL HEALTH CLASSIFICATION
Complete notebook with preprocessing, data loading, and all 6 transformer models
"""

In [None]:
# ===============================================================================
# CELL 0: INSTALL REQUIRED PACKAGES
# ===============================================================================

import subprocess
import sys

def install_packages():
    """Install required packages with compatible versions for Kaggle"""
    
    packages_to_install = [
        ("numpy", "numpy<2.0"),
        ("scikit-learn", "scikit-learn>=1.0.0"),
        ("pandas", "pandas"),
        ("matplotlib", "matplotlib"),
        ("seaborn", "seaborn"),
        ("nltk", "nltk"),
        ("torch", "torch"),
        ("transformers", "transformers"),
        ("datasets", "datasets")
    ]
    
    installed_count = 0
    for package_name, package_spec in packages_to_install:
        try:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-q", package_spec],
                stderr=subprocess.DEVNULL
            )
            print(f"‚úì {package_name}")
            installed_count += 1
        except Exception as e:
            print(f"‚ö† {package_name} - {str(e)[:50]}")
    
    print(f"\n‚úì Installation complete! {installed_count}/{len(packages_to_install)} packages ready.")
    print("‚ö† Note: Some Kaggle pre-existing conflicts are normal and safe to ignore.\n")

install_packages()

In [2]:
# # ===============================================================================
# # FIX CUDNN COMPATIBILITY (KAGGLE P100)
# # ===============================================================================

# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# # Disable cuDNN to avoid version mismatch on Kaggle
# tf.config.list_physical_devices('GPU')
# tf.config.set_visible_devices([], 'GPU')

# # Re-enable GPU with memory growth to prevent OOM
# gpus = tf.config.list_physical_devices('GPU')
# for gpu in gpus:
#     try:
#         tf.config.experimental.set_memory_growth(gpu, True)
#     except RuntimeError as e:
#         print(f"GPU setup warning (safe to ignore): {e}")

# print("‚úì GPU configured and cuDNN compatibility fixed!")

In [3]:
# ===============================================================================
# CELL 1: CLEAN IMPORTS & SETUP
# ===============================================================================
import os
import re
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn

# Hugging Face & Sklearn
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline
)

# Settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

# Random Seed (For Reproducibility)
SEED = 42
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(SEED)

print(f"‚úì Libraries imported. Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")

2025-11-23 04:57:54.824329: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763873875.070451      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763873875.139092      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

‚úì Libraries imported. Device: cpu


In [4]:
# 2. CONFIGURATION
DATA_PATH = "/kaggle/input/mental-health/Combined Data.csv"
OUT_DIR = "distilbert-mental-health"
BATCH_SIZE = 16          # DistilBERT is small, so 16 usually fits on P100 GPU
EPOCHS = 3
LR = 2e-5                # Standard learning rate for Transformers
SEED = 42

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(SEED)


# Load data from Kaggle dataset 
filepath = "/kaggle/input/mental-health/Combined Data.csv"
df = pd.read_csv(filepath)

# Standardize columns
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])
df = df.rename(columns={"statement": "text", "status": "label_name"})

# Drop Nulls/Duplicates
df = df.dropna(subset=["text", "label_name"]).drop_duplicates(subset=["text"]).reset_index(drop=True)

# üîπ Filter to 5 target classes (drop Normal + Personality disorder)
keep_labels = ["Anxiety", "Depression", "Suicidal", "Stress", "Bipolar"]
df = df[df["label_name"].isin(keep_labels)].reset_index(drop=True)

print("Updated class distribution:")
print(df["label_name"].value_counts())

# Encode Labels (String -> Integer) AFTER filtering
label_values = sorted(df["label_name"].unique())
label2id = {name: i for i, name in enumerate(label_values)}
id2label = {i: name for name, i in label2id.items()}
df["label"] = df["label_name"].map(label2id)

print(f"‚úì Data Loaded. Shape: {df.shape}")
print(f"‚úì Classes: {label2id}")


# Minimal Cleaning Function (Best for Transformers)
def minimal_clean(text):
    text = str(text)
    text = re.sub(r"http\S+|www\S+|@\w+", "", text)  # Remove URLs/Mentions
    text = re.sub(r"<.*?>", "", text)                 # Remove HTML
    text = re.sub(r"\s+", " ", text).strip()          # Remove double spaces
    return text

df["text"] = df["text"].apply(minimal_clean)

# 4. SPLIT & TOKENIZE
# Stratified split ensures equal distribution of classes in Train/Test
train_df, test_df = train_test_split(df, test_size=0.15, stratify=df["label"], random_state=SEED)

hf_train = Dataset.from_pandas(train_df[["text", "label"]].reset_index(drop=True))
hf_test = Dataset.from_pandas(test_df[["text", "label"]].reset_index(drop=True))
dataset = DatasetDict({"train": hf_train, "test": hf_test})

# 5. COMPUTE CLASS WEIGHTS (Handle Imbalance)
labels = train_df["label"].values
class_weights = compute_class_weight("balanced", classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to("cuda" if torch.cuda.is_available() else "cpu")

print(f"‚öñÔ∏è Class Weights calculated: {class_weights}")

# 6. CUSTOM TRAINER (Fixed for newer Transformers versions)
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # We add **kwargs above to catch 'num_items_in_batch' or any other new args
        
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Calculate loss with our custom class weights
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro")
    return {"accuracy": acc, "f1_macro": f1}

results_list = []
print("‚úÖ Scoreboard initialized.")

Updated class distribution:
label_name
Depression    15087
Suicidal      10641
Anxiety        3617
Bipolar        2501
Stress         2293
Name: count, dtype: int64
‚úì Data Loaded. Shape: (34139, 3)
‚úì Classes: {'Anxiety': 0, 'Bipolar': 1, 'Depression': 2, 'Stress': 3, 'Suicidal': 4}


‚öñÔ∏è Class Weights calculated: tensor([1.8880, 2.7298, 0.4526, 2.9777, 0.6416])
‚úÖ Scoreboard initialized.


# DistilBERT

In [5]:
FRIENDLY_NAME = "DistilBERT"
MODEL_PATH    = "distilbert-base-uncased"

print(f"\n\n{'='*40}")
print(f"ü•ä TRAINING ROUND: {FRIENDLY_NAME}")
print(f"{'='*40}")

# 1. Tokenize
print(f">>> Tokenizing for {FRIENDLY_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 2. Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# 3. STAGE 1: WARMUP (Freeze Body)
print(f"‚ùÑÔ∏è Freezing backbone for warmup...")
for param in model.base_model.parameters():
    param.requires_grad = False
    
warmup_args = TrainingArguments(
    output_dir=f"comparison_{FRIENDLY_NAME}_warmup",
    learning_rate=1e-3,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

Trainer(
    model=model, args=warmup_args, 
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator
).train()

# 4. STAGE 2: FINE-TUNE (Unfreeze All)
print(f"üî• Unfreezing for full training...")
for param in model.base_model.parameters():
    param.requires_grad = True
    
training_args = TrainingArguments(
    output_dir=f"comparison_{FRIENDLY_NAME}_final",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# 5. EVALUATION
print(f"\nüìä {FRIENDLY_NAME} Results:")

# Get Predictions
preds_output = trainer.predict(tokenized_datasets["test"])
y_preds = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

# A. Print Classification Report
print(classification_report(y_true, y_preds, target_names=label_values))

# B. Plot Confusion Matrix
fig, ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(y_true, y_preds, labels=range(len(label_values)))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_values)
disp.plot(cmap='Blues', ax=ax, xticks_rotation=45)
plt.title(f"Confusion Matrix: {FRIENDLY_NAME}")
plt.grid(False)
plt.show()

# C. Save Score (Only Once!)
metrics = trainer.evaluate()
results_list.append({
    "Model": FRIENDLY_NAME,
    "Accuracy": metrics["eval_accuracy"],
    "F1_Macro": metrics["eval_f1_macro"]
})

# Cleanup
del model, trainer, tokenizer
torch.cuda.empty_cache()
print(f"‚úÖ {FRIENDLY_NAME} Finished & Cleared from Memory.")



ü•ä TRAINING ROUND: DistilBERT
>>> Tokenizing for DistilBERT...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/29018 [00:00<?, ? examples/s]

Map:   0%|          | 0/5121 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚ùÑÔ∏è Freezing backbone for warmup...


Step,Training Loss
500,0.9968
1000,0.8573
1500,0.788


üî• Unfreezing for full training...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.4961,0.469422,0.773482,0.802431


# RoBERTa

In [None]:
FRIENDLY_NAME = "RoBERTa"
MODEL_PATH    = "roberta-base"

print(f"\n\n{'='*40}")
print(f"ü•ä TRAINING ROUND: {FRIENDLY_NAME}")
print(f"{'='*40}")

# 1. Tokenize
print(f">>> Tokenizing for {FRIENDLY_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 2. Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# 3. STAGE 1: WARMUP (Freeze Body)
print(f"‚ùÑÔ∏è Freezing backbone for warmup...")
for param in model.base_model.parameters():
    param.requires_grad = False
    
warmup_args = TrainingArguments(
    output_dir=f"comparison_{FRIENDLY_NAME}_warmup",
    learning_rate=1e-3,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

Trainer(
    model=model, args=warmup_args, 
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator
).train()

# 4. STAGE 2: FINE-TUNE (Unfreeze All)
print(f"üî• Unfreezing for full training...")
for param in model.base_model.parameters():
    param.requires_grad = True
    
training_args = TrainingArguments(
    output_dir=f"comparison_{FRIENDLY_NAME}_final",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# 5. EVALUATION
print(f"\nüìä {FRIENDLY_NAME} Results:")

# Get Predictions
preds_output = trainer.predict(tokenized_datasets["test"])
y_preds = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

# A. Print Classification Report
print(classification_report(y_true, y_preds, target_names=label_values))

# B. Plot Confusion Matrix
fig, ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(y_true, y_preds, labels=range(len(label_values)))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_values)
disp.plot(cmap='Blues', ax=ax, xticks_rotation=45)
plt.title(f"Confusion Matrix: {FRIENDLY_NAME}")
plt.grid(False)
plt.show()

# C. Save Score (Only Once!)
metrics = trainer.evaluate()
results_list.append({
    "Model": FRIENDLY_NAME,
    "Accuracy": metrics["eval_accuracy"],
    "F1_Macro": metrics["eval_f1_macro"]
})

# Cleanup
del model, trainer, tokenizer
torch.cuda.empty_cache()
print(f"‚úÖ {FRIENDLY_NAME} Finished & Cleared from Memory.")

# BioBERT

In [None]:
FRIENDLY_NAME = "BioBERT"
MODEL_PATH    = "dmis-lab/biobert-v1.1"

print(f"\n\n{'='*40}")
print(f"ü•ä TRAINING ROUND: {FRIENDLY_NAME}")
print(f"{'='*40}")

# 1. Tokenize
print(f">>> Tokenizing for {FRIENDLY_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 2. Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# 3. STAGE 1: WARMUP (Freeze Body)
print(f"‚ùÑÔ∏è Freezing backbone for warmup...")
for param in model.base_model.parameters():
    param.requires_grad = False
    
warmup_args = TrainingArguments(
    output_dir=f"comparison_{FRIENDLY_NAME}_warmup",
    learning_rate=1e-3,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

Trainer(
    model=model, args=warmup_args, 
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator
).train()

# 4. STAGE 2: FINE-TUNE (Unfreeze All)
print(f"üî• Unfreezing for full training...")
for param in model.base_model.parameters():
    param.requires_grad = True
    
training_args = TrainingArguments(
    output_dir=f"comparison_{FRIENDLY_NAME}_final",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# 5. EVALUATION
print(f"\nüìä {FRIENDLY_NAME} Results:")

# Get Predictions
preds_output = trainer.predict(tokenized_datasets["test"])
y_preds = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

# A. Print Classification Report
print(classification_report(y_true, y_preds, target_names=label_values))

# B. Plot Confusion Matrix
fig, ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(y_true, y_preds, labels=range(len(label_values)))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_values)
disp.plot(cmap='Blues', ax=ax, xticks_rotation=45)
plt.title(f"Confusion Matrix: {FRIENDLY_NAME}")
plt.grid(False)
plt.show()

# C. Save Score (Only Once!)
metrics = trainer.evaluate()
results_list.append({
    "Model": FRIENDLY_NAME,
    "Accuracy": metrics["eval_accuracy"],
    "F1_Macro": metrics["eval_f1_macro"]
})

# Cleanup
del model, trainer, tokenizer
torch.cuda.empty_cache()
print(f"‚úÖ {FRIENDLY_NAME} Finished & Cleared from Memory.")

# PubMedBERT

In [None]:
FRIENDLY_NAME = "PubMedBERT"
MODEL_PATH    = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

print(f"\n\n{'='*40}")
print(f"ü•ä TRAINING ROUND: {FRIENDLY_NAME}")
print(f"{'='*40}")

# 1. Tokenize
print(f">>> Tokenizing for {FRIENDLY_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 2. Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# 3. STAGE 1: WARMUP (Freeze Body)
print(f"‚ùÑÔ∏è Freezing backbone for warmup...")
for param in model.base_model.parameters():
    param.requires_grad = False
    
warmup_args = TrainingArguments(
    output_dir=f"comparison_{FRIENDLY_NAME}_warmup",
    learning_rate=1e-3,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

Trainer(
    model=model, args=warmup_args, 
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator
).train()

# 4. STAGE 2: FINE-TUNE (Unfreeze All)
print(f"üî• Unfreezing for full training...")
for param in model.base_model.parameters():
    param.requires_grad = True
    
training_args = TrainingArguments(
    output_dir=f"comparison_{FRIENDLY_NAME}_final",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# 5. EVALUATION
print(f"\nüìä {FRIENDLY_NAME} Results:")

# Get Predictions
preds_output = trainer.predict(tokenized_datasets["test"])
y_preds = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

# A. Print Classification Report
print(classification_report(y_true, y_preds, target_names=label_values))

# B. Plot Confusion Matrix
fig, ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(y_true, y_preds, labels=range(len(label_values)))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_values)
disp.plot(cmap='Blues', ax=ax, xticks_rotation=45)
plt.title(f"Confusion Matrix: {FRIENDLY_NAME}")
plt.grid(False)
plt.show()

# C. Save Score (Only Once!)
metrics = trainer.evaluate()
results_list.append({
    "Model": FRIENDLY_NAME,
    "Accuracy": metrics["eval_accuracy"],
    "F1_Macro": metrics["eval_f1_macro"]
})

# Cleanup
del model, trainer, tokenizer
torch.cuda.empty_cache()
print(f"‚úÖ {FRIENDLY_NAME} Finished & Cleared from Memory.")

# ClinicalBERT

In [None]:
FRIENDLY_NAME = "ClinicalBERT"
MODEL_PATH    = "emilyalsentzer/Bio_ClinicalBERT"

print(f"\n\n{'='*40}")
print(f"ü•ä TRAINING ROUND: {FRIENDLY_NAME}")
print(f"{'='*40}")

# 1. Tokenize
print(f">>> Tokenizing for {FRIENDLY_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 2. Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# 3. STAGE 1: WARMUP (Freeze Body)
print(f"‚ùÑÔ∏è Freezing backbone for warmup...")
for param in model.base_model.parameters():
    param.requires_grad = False
    
warmup_args = TrainingArguments(
    output_dir=f"comparison_{FRIENDLY_NAME}_warmup",
    learning_rate=1e-3,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

Trainer(
    model=model, args=warmup_args, 
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator
).train()

# 4. STAGE 2: FINE-TUNE (Unfreeze All)
print(f"üî• Unfreezing for full training...")
for param in model.base_model.parameters():
    param.requires_grad = True
    
training_args = TrainingArguments(
    output_dir=f"comparison_{FRIENDLY_NAME}_final",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# 5. EVALUATION
print(f"\nüìä {FRIENDLY_NAME} Results:")

# Get Predictions
preds_output = trainer.predict(tokenized_datasets["test"])
y_preds = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

# A. Print Classification Report
print(classification_report(y_true, y_preds, target_names=label_values))

# B. Plot Confusion Matrix
fig, ax = plt.subplots(figsize=(10, 8))
cm = confusion_matrix(y_true, y_preds, labels=range(len(label_values)))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_values)
disp.plot(cmap='Blues', ax=ax, xticks_rotation=45)
plt.title(f"Confusion Matrix: {FRIENDLY_NAME}")
plt.grid(False)
plt.show()

# C. Save Score (Only Once!)
metrics = trainer.evaluate()
results_list.append({
    "Model": FRIENDLY_NAME,
    "Accuracy": metrics["eval_accuracy"],
    "F1_Macro": metrics["eval_f1_macro"]
})

# Cleanup
del model, trainer, tokenizer
torch.cuda.empty_cache()
print(f"‚úÖ {FRIENDLY_NAME} Finished & Cleared from Memory.")