In [None]:
# Setup data directory if needed
import os

data_dir = '../data'
if not os.path.exists(data_dir):
    # Pure-Python directory creation: works on any OS/kernel (no shell `mkdir`
    # required) and tolerates the directory appearing between the check and the call.
    os.makedirs(data_dir, exist_ok=True)
    print("Created data directory")
else:
    print("Data directory exists")

## Setup and Imports

In [None]:
# Install Unsloth and dependencies
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

print("✓ Unsloth and dependencies installed")

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

print("✓ Libraries imported")

## Load Base Model with Unsloth

Load the Gemma-3-4B model with 4-bit quantization for efficient fine-tuning.

In [None]:
from unsloth import FastModel
import torch

# Loading options for the instruction-tuned Gemma-3-4B checkpoint:
# 4-bit weights are requested, 8-bit loading and full fine-tuning are disabled.
load_options = dict(
    model_name = "unsloth/gemma-3-4b-it",
    max_seq_length = 2048,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False,
)
model, tokenizer = FastModel.from_pretrained(**load_options)

# Short confirmation with the concrete model class that was instantiated
model_class_name = model.__class__.__name__
print("✓ Gemma-3-4B model loaded")
print(f"  Model: {model_class_name}")
print(f"  Max sequence length: 2048")

## Add LoRA Adapters

Configure LoRA (Low-Rank Adaptation) for parameter-efficient fine-tuning.

In [None]:
# Wrap the base model with LoRA adapters.
# Vision layers are left untouched; language layers, attention modules and
# MLP modules are all selected for adaptation.
lora_options = dict(
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)
model = FastModel.get_peft_model(model, **lora_options)

print("✓ LoRA adapters configured")
print(f"  Rank (r): 16")
print(f"  Alpha: 16")

## Data Loading and Processing

Load the English-only EXIST dataset and format it for the classification task.

In [None]:
# Rebind the tokenizer to one that uses the "gemma-3" chat template
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(tokenizer, chat_template = "gemma-3")

print("✓ Gemma-3 chat template configured")

In [None]:
# Read the English-only EXIST data from the data directory
df = pd.read_csv('../data/aggregated_data_en.csv')

total_samples = len(df)
print(f"✓ Data loaded")
print(f"  Total samples: {total_samples}")

# Frequency tables: how rows are divided across splits, then across labels
split_counts = df['split'].value_counts()
print(f"\nSplit distribution:")
print(split_counts)

label_counts = df['label_sexist'].value_counts()
print(f"\nLabel distribution:")
print(label_counts)

# Print the first row as a concrete example of text + label
first_row = df.iloc[0]
print(f"\nExample:")
print(f"  Text: {first_row['text']}")
print(f"  Label: {first_row['label_sexist']}")

In [None]:
# Partition rows by the value of the 'split' column.
# Each subset is copied so it is an independent frame rather than a view of df.
split_column = df['split']
train_df = df[split_column == 'train'].copy()
dev_df = df[split_column == 'dev'].copy()
test_df = df[split_column == 'test'].copy()

n_train, n_dev, n_test = len(train_df), len(dev_df), len(test_df)
print(f"Train: {n_train} samples")
print(f"Dev: {n_dev} samples")
print(f"Test: {n_test} samples")

## Format Data for Classification

Convert EXIST data to conversation format for fine-tuning.

In [None]:
# Turn every training row into a two-turn chat: the user asks for a
# classification of the tweet, the assistant answers with the gold label.
from datasets import Dataset

conversations = [
    {
        "conversations": [
            {"role": "user", "content": f"Classify this tweet as 'sexist' or 'not sexist': {text}"},
            {"role": "assistant", "content": label},
        ]
    }
    for text, label in zip(train_df['text'], train_df['label_sexist'])
]

# Wrap the list of records in a Hugging Face Dataset
dataset = Dataset.from_list(conversations)

n_examples = len(dataset)
print(f"✓ Dataset formatted for fine-tuning")
print(f"  Total examples: {n_examples}")

### Example Training Sample

In [None]:
# Inspect the record at index 100 to sanity-check the conversation structure
sample_record = dataset[100]
print("Example training conversation:")
print(sample_record['conversations'])

## Apply Chat Template

Apply Gemma-3 chat template to format conversations for training.

In [None]:
# Render each conversation into a single training string
def formatting_prompts_func(examples):
    """Render a batch of conversations to chat-template text.

    Args:
        examples: a batch (the function is mapped with ``batched=True``) whose
            "conversations" entry is a list of conversations.

    Returns:
        A dict with one key, "text", holding one rendered string per conversation.
    """
    rendered = []
    for convo in examples["conversations"]:
        chat_text = tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        # Drop a leading '<bos>' if the template emitted one
        # (presumably because a BOS token is added again at tokenization time — confirm).
        rendered.append(chat_text.removeprefix('<bos>'))
    return {"text": rendered}

dataset = dataset.map(formatting_prompts_func, batched=True)

print("✓ Chat template applied to dataset")

### Formatted Training Example

In [None]:
# Preview the first 400 characters of the rendered text for record 100
formatted_sample = dataset[100]["text"]
print("Formatted training example:")
print(formatted_sample[:400] + "...")

## Configure Training

Set up training arguments and trainer with response masking.

In [None]:
# Build the SFT configuration first, then hand it to the trainer
from trl import SFTTrainer, SFTConfig

# Hyperparameters for the run. With a per-device batch of 2 and 4 accumulation
# steps, each optimizer step covers 8 sequences per device.
training_args = SFTConfig(
    dataset_text_field = "text",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = 1,  # Full training run
    learning_rate = 2e-4,
    logging_steps = 10,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",
)

# No evaluation set is attached; only the training dataset is used.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None,
    args = training_args,
)

print("✓ Trainer configured")
print(f"  Epochs: 1")
print(f"  Batch size: 2")
print(f"  Learning rate: 2e-4")

### Enable Response-Only Training

Train only on model responses, not user inputs.

In [None]:
# Restrict training to the assistant's side of each conversation
from unsloth.chat_templates import train_on_responses_only

# Text markers that open a user turn and a model turn in the rendered chat
user_turn_marker = "<start_of_turn>user\n"
model_turn_marker = "<start_of_turn>model\n"

trainer = train_on_responses_only(
    trainer,
    instruction_part = user_turn_marker,
    response_part = model_turn_marker,
)

print("✓ Response-only training enabled")

In [None]:
# Record the GPU memory baseline before training starts
bytes_per_gib = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
# Peak reserved memory so far; used later as the pre-training reference point
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
# Total memory of device 0
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)

print(f"GPU: {gpu_stats.name}")
print(f"Max memory: {max_memory} GB")
print(f"Reserved: {start_gpu_memory} GB")

## Train the Model

Note: Training may take several hours depending on hardware.

In [None]:
# Run the fine-tuning job, framed by banner lines
banner = "=" * 60

print(banner)
print("Starting fine-tuning...")
print(banner)

# The returned object is kept so later cells can read its metrics
trainer_stats = trainer.train()

print("\n" + banner)
print("✓ TRAINING COMPLETED!")
print(banner)

In [None]:
# Summarise run time and GPU memory after training
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / (1024 ** 3), 3)
# Difference between the peak now and the baseline recorded before training
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

runtime_seconds = trainer_stats.metrics['train_runtime']
training_time_minutes = round(runtime_seconds / 60, 2)

print(f"Training time: {training_time_minutes} minutes")
print(f"Peak memory: {used_memory} GB")
print(f"Memory for training: {used_memory_for_lora} GB")

## Save Fine-Tuned Model

In [None]:
# Write the model and the tokenizer to the same output directory
save_dir = "models/gemma3_sexism_classifier"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print("✓ Model saved to models/gemma3_sexism_classifier")

## Inference and Evaluation

Run predictions on test set and evaluate with classification metrics.

In [None]:
# Pull the test inputs and gold labels out of the frame as plain Python lists
test_texts, test_labels = (
    test_df[column].tolist() for column in ('text', 'label_sexist')
)

n_test_samples = len(test_texts)
print(f"✓ Test set prepared: {n_test_samples} samples")
print(f"\nLabel distribution in test set:")
print(test_df['label_sexist'].value_counts())

## Run Predictions on Test Set

In [None]:
# Run inference on test set
print("Running predictions on test set...")


def _parse_label(response):
    """Map a generated reply to one of the two class labels.

    Args:
        response: the text generated by the model (completion only, no prompt).

    Returns:
        "not sexist" if the reply contains "not sexist" or "non-sexist",
        "sexist" if it otherwise contains "sexist", and "not sexist" as the
        default fallback when no label can be recognised.
    """
    reply = response.lower()
    # Check the negated forms first: both of them also contain "sexist".
    if "not sexist" in reply or "non-sexist" in reply:
        return "not sexist"
    if "sexist" in reply:
        return "sexist"
    return "not sexist"  # Default fallback


predictions = []
for text in tqdm(test_texts, desc="Classifying"):
    # Same instruction wording as used for the training conversations
    prompt = f"Classify this tweet as 'sexist' or 'not sexist': {text}"
    conversation = [{"role": "user", "content": prompt}]

    # Render the prompt with the generation marker appended. The leading
    # '<bos>' is stripped, exactly as in the training formatting function,
    # so the text is handled consistently when it is tokenized below.
    formatted_prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    ).removeprefix('<bos>')

    # `text=` is passed by keyword so the string is treated as text even if
    # `tokenizer` is a multimodal processor (NOTE(review): confirm object type).
    inputs = tokenizer(text=formatted_prompt, return_tensors="pt").to("cuda")
    prompt_length = inputs["input_ids"].shape[1]

    # Greedy decoding: evaluation results are reproducible across runs.
    outputs = model.generate(
        **inputs,
        max_new_tokens=10,  # Short response for classification
        do_sample=False,
    )

    # The returned sequence starts with the prompt tokens. Decode ONLY the
    # newly generated tokens: the prompt itself contains the phrase
    # "'sexist' or 'not sexist'", so parsing the full sequence would match
    # "not sexist" for every single example.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    predicted_label = _parse_label(response)
    predictions.append(predicted_label)

print(f"\n✓ Predictions completed: {len(predictions)} predictions")

## Evaluate Performance

In [None]:
# Calculate metrics
# Class-weighted averages over the test set. zero_division=0 keeps the score
# at 0 (without a warning) for a class that is never predicted.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, predictions, average='weighted', zero_division=0
)
acc = accuracy_score(test_labels, predictions)

print("\n" + "="*60)
print("FINE-TUNED LLM RESULTS")
print("="*60)
print(f"\nTest Set Performance:")
print(f"  Accuracy:  {acc:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print("="*60)

# Detailed classification report
# The label order is passed explicitly so each row name is guaranteed to match
# its class (same order as the confusion matrix), and so the report still works
# when only one of the two classes occurs in the labels/predictions.
class_labels = ['not sexist', 'sexist']
print("\nDetailed Classification Report:")
print("-" * 60)
print(classification_report(test_labels, predictions,
                            labels=class_labels,
                            target_names=class_labels,
                            zero_division=0))

## Confusion Matrix

In [None]:
# Confusion matrix with a fixed class order (rows = true, columns = predicted)
class_order = ['not sexist', 'sexist']
cm = confusion_matrix(test_labels, predictions, labels=class_order)

# Draw the counts as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_order,
    yticklabels=class_order,
    ax=ax,
)
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
ax.set_title('Confusion Matrix - Fine-Tuned Gemma-3-4B')
fig.tight_layout()
fig.savefig('confusion_matrix_finetuned.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Confusion matrix saved to 'confusion_matrix_finetuned.png'")

## Save Predictions

In [None]:
# Attach the predictions to a copy of the test frame and flag exact matches
output_df = test_df.assign(predicted_label=predictions)
output_df['correct'] = output_df['label_sexist'].eq(output_df['predicted_label'])

# Persist the annotated test set (row index is not written)
output_df.to_csv('predictions_finetuned_gemma3.csv', index=False)
print("✓ Predictions saved to 'predictions_finetuned_gemma3.csv'")

# Show accuracy
n_correct = output_df['correct'].sum()
n_rows = len(output_df)
print(f"\nCorrect predictions: {n_correct} / {n_rows} ({output_df['correct'].mean():.2%})")

## Example Predictions Analysis

In [None]:
# Show example predictions
def _print_correct_examples(rows, limit=3):
    """Print the truncated text and the label of the first `limit` rows."""
    for _, row in rows.head(limit).iterrows():
        print(f"Text: {row['text'][:80]}...")
        print(f"Label: {row['label_sexist']}\n")


print("Example Predictions:")
print("="*100)

is_correct = output_df['correct']
true_label = output_df['label_sexist']

# Correct predictions - sexist
print("\n✓ CORRECT PREDICTIONS (Sexist):")
print("-"*100)
correct_sexist = output_df[is_correct & (true_label == 'sexist')]
_print_correct_examples(correct_sexist)

# Correct predictions - not sexist
print("\n✓ CORRECT PREDICTIONS (Not Sexist):")
print("-"*100)
correct_not_sexist = output_df[is_correct & (true_label == 'not sexist')]
_print_correct_examples(correct_not_sexist)

# Incorrect predictions: show the gold label next to the model's answer
print("\n✗ INCORRECT PREDICTIONS:")
print("-"*100)
incorrect = output_df[~is_correct]
for _, row in incorrect.head(5).iterrows():
    print(f"Text: {row['text'][:80]}...")
    print(f"True: {row['label_sexist']:12s} | Predicted: {row['predicted_label']}\n")