## 1. Environment & Model Setup

### 1.1 Install Dependencies
Run once per new runtime to install the pinned package versions used for this project.

In [1]:
# Install dependencies (run once per runtime)
!pip install -q \
    accelerate==1.11.0 \
    aiohappyeyeballs==2.6.1 \
    aiohttp==3.13.1 \
    aiosignal==1.4.0 \
    anyio==4.11.0 \
    asttokens==3.0.0 \
    async-timeout==5.0.1 \
    attrs==25.4.0 \
    bitsandbytes==0.48.1 \
    cut-cross-entropy==25.1.1 \
    datasets==4.3.0 \
    diffusers==0.35.2 \
    dill==0.4.0 \
    docstring_parser==0.17.0 \
    fsspec==2025.9.0 \
    hf_transfer==0.1.9 \
    huggingface-hub==0.36.0 \
    ipywidgets==8.1.7 \
    matplotlib-inline==0.2.1 \
    msgspec==0.19.0 \
    multiprocess==0.70.16 \
    nest-asyncio==1.6.0 \
    networkx==3.4.2 \
    numpy==2.2.6 \
    pandas==2.3.3 \
    peft==0.11.1 \
    pillow==12.0.0 \
    protobuf==6.33.0 \
    pyarrow==22.0.0 \
    regex==2025.10.23 \
    requests==2.32.5 \
    rich==14.2.0 \
    safetensors==0.6.2 \
    sentencepiece==0.2.1 \
    torch==2.8.0 \
    torchvision==0.23.0 \
    triton==3.4.0 \
    transformers==4.56.2 \
    trl==0.23.0 \
    typeguard==4.4.4 \
    tyro==0.9.35 \
    unsloth==2025.10.10 \
    unsloth_zoo==2025.10.12 \
    xformers==0.0.32.post2 \
    bitsandbytes==0.48.1


Mounting Drive

In [1]:
from google.colab import drive
from pathlib import Path
import subprocess

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Show available storage
print("\nDrive usage:")
subprocess.run(['df', '-h', '/content/drive'], check=False)

# 3. Create the checkpoint directory (same path as before)
ckpt_dir = Path('/content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint')
#ckpt_dir.mkdir(parents=True, exist_ok=True)
print(f"\nCheckpoints will be saved under: {ckpt_dir}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Drive usage:

Checkpoints will be saved under: /content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint



### 1.2 Load Model & Tokenizer
Initialize Meta-Llama-3.1-8B with Unsloth's loader

In [3]:

# Load Meta-Llama-3.1-8B with Unsloth

from unsloth import FastLanguageModel
import torch
from pathlib import Path

# Automatically resume from saved checkpoint if available
DEFAULT_CKPT = Path("/content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint/overnight_v2")
if DEFAULT_CKPT.exists():
    model_path = str(DEFAULT_CKPT)
    print(f"✓ Found local checkpoint: {model_path}")
else:
    model_path = "unsloth/Meta-Llama-3.1-8B"
    print("⚠️ Local checkpoint not found, falling back to base model")

max_seq_length = 1408
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
dtype = torch.bfloat16 if use_bf16 else None
load_in_4bit = False
device_map = "auto" if torch.cuda.is_available() else None

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map=device_map,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
tokenizer.init_kwargs["padding_side"] = "left"
if hasattr(tokenizer, "default_padding_side"):
    tokenizer.default_padding_side = "left"

print("✓ Model ready")
print(f"✓ Padding side: {tokenizer.padding_side}")
print(f"✓ Max sequence length: {max_seq_length}")
print(f"✓ dtype: {dtype if dtype is not None else 'auto'}")



⚠️ Local checkpoint not found, falling back to base model
==((====))==  Unsloth 2025.10.10: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Model ready
✓ Padding side: left
✓ Max sequence length: 1408
✓ dtype: torch.bfloat16



<a id="section-2-dataset"></a>
## 2. Dataset Preparation & Prompt Formatting

### 2.1 Load & Clean Dataset
Create consistent train/validation splits and sanitize solution text.


In [4]:

# --- 2.1: Load & Clean Dataset ---

from datasets import load_dataset
import re
import torch

full_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train")
shuffled = full_dataset.shuffle(seed=42)
n_total = len(shuffled)
train_size = int(0.95 * n_total)

train_dataset = shuffled.select(range(train_size))
validation_dataset = shuffled.select(range(train_size, n_total))

CODE_FENCE_RE = re.compile(r"```.*?```", re.DOTALL)
HTML_TAG_RE = re.compile(r"<.*?>")
WHITESPACE_RE = re.compile(r"\s+")


def normalize_whitespace(text: str) -> str:
    return WHITESPACE_RE.sub(" ", text).strip()


def strip_code_fences(text: str) -> str:
    return CODE_FENCE_RE.sub(" ", text)


def normalize_numbers(text: str) -> str:
    text = text.replace("−", "-")
    text = re.sub(r"(?<=\d),(?=\d)", "", text)
    return text


def scrub_artifacts(text: str) -> str:
    if not text:
        return ""
    text = strip_code_fences(text)
    text = HTML_TAG_RE.sub(" ", text)
    text = re.sub(r"`+", " ", text)
    text = normalize_numbers(text)
    text = normalize_whitespace(text)
    return text


def clean_batch(batch):
    questions = [normalize_whitespace(q) for q in batch["question"]]
    answers = [scrub_artifacts(a) for a in batch["answer"]]
    solutions = [scrub_artifacts(s) for s in batch["solution"]]
    return {
        "question": questions,
        "answer": answers,
        "solution": solutions,
    }


num_proc = 8 if torch.cuda.is_available() else 4
train_dataset = train_dataset.map(clean_batch, batched=True, num_proc=num_proc)
validation_dataset = validation_dataset.map(clean_batch, batched=True, num_proc=num_proc)

print(f"✓ Train/validation split: {len(train_dataset):,} / {len(validation_dataset):,}")
print(f"✓ Cleaning completed with {num_proc} worker processes.")


Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/950000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/50000 [00:00<?, ? examples/s]

✓ Train/validation split: 950,000 / 50,000
✓ Cleaning completed with 8 worker processes.



### 2.2 Reasoning-Aware Prompt Templates
Use chain-of-thought style prompts to encourage process supervision during fine-tuning.


In [5]:

# --- 2.2: Reasoning-Aware Prompt Templates ---

import torch
import random

PROMPT_TEMPLATES = [
    """You are validating a student's math solution.
Return exactly one word: True (correct) or False (incorrect).

Question:
{question}

Provided answer:
{answer}

Student's solution transcript:
{solution}

Answer (True/False):""",
    """You are an expert judge of math solutions. Determine if the student's answer is correct.
Output only the single token True or False.

Review:
- Question: {question}
- Provided answer: {answer}
- Student reasoning: {solution}

Final verdict (True/False):"""
]

EOS_TOKEN = tokenizer.eos_token or tokenizer.pad_token
LABEL_MAP = {True: "True", False: "False"}


def build_prompt(question: str, answer: str, solution: str) -> str:
    template = random.choice(PROMPT_TEMPLATES)
    return template.format(
        question=question.strip(),
        answer=(answer or "(answer missing)").strip(),
        solution=(solution or "(solution missing)").strip(),
    ).strip()


def formatting_prompts_func(examples):
    prompts, texts, labels = [], [], []
    for q, a, s, label in zip(examples["question"], examples["answer"], examples["solution"], examples["is_correct"]):
        prompt = build_prompt(q, a, s)
        label_text = LABEL_MAP[bool(label)]
        prompts.append(prompt)
        labels.append(label_text)
        texts.append(f"{prompt} {label_text}{EOS_TOKEN}")
    return {"text": texts, "prompt_only": prompts, "target_label": labels}

num_proc = 8 if torch.cuda.is_available() else 4
formatted_train_dataset = train_dataset.map(formatting_prompts_func, batched=True, num_proc=num_proc)
formatted_val_dataset   = validation_dataset.map(formatting_prompts_func, batched=True, num_proc=num_proc)

print(f"✓ Training samples prepared: {len(formatted_train_dataset):,}")
print(f"✓ Validation samples prepared: {len(formatted_val_dataset):,}")



Map (num_proc=8):   0%|          | 0/950000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/50000 [00:00<?, ? examples/s]

✓ Training samples prepared: 950,000
✓ Validation samples prepared: 50,000



<a id="section-3-experiment"></a>
## 3. Experiment Management & LoRA Configuration

### 3.1 Select Experiment Version & Apply LoRA
Adjust `EXPERIMENT_VERSION` to swap between baseline runs and the reasoning-aware configuration.


In [6]:

# --- 3.1: Configure Experiment & LoRA ---

EXPERIMENT_VERSION = "overnight_v2"

EXPERIMENTS = {
    "overnight_v2": {
        "lora_r": 96,
        "lora_alpha": 192,
        "lora_dropout": 0.05,
        "description": "8-hour refinement run with longer LR decay and checkpointing.",
    },
}

exp_config = EXPERIMENTS[EXPERIMENT_VERSION]
print(f"Experiment: {EXPERIMENT_VERSION}")
print(f"Description: {exp_config['description']}")
print(f"LoRA configuration: r={exp_config['lora_r']}, alpha={exp_config['lora_alpha']}, dropout={exp_config['lora_dropout']}")

model = FastLanguageModel.get_peft_model(
    model,
    r=exp_config['lora_r'],
    lora_alpha=exp_config['lora_alpha'],
    lora_dropout=exp_config['lora_dropout'],
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    use_gradient_checkpointing="unsloth",
    bias="none",
    random_state=42,
)

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params     = sum(p.numel() for p in model.parameters())
print(f"✓ LoRA applied: {trainable_params:,} trainable parameters ({trainable_params/total_params*100:.2f}% of total)")



Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Experiment: overnight_v2
Description: 8-hour refinement run with longer LR decay and checkpointing.
LoRA configuration: r=96, alpha=192, dropout=0.05


Unsloth 2025.10.10 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


✓ LoRA applied: 251,658,240 trainable parameters (3.04% of total)


### 3.3 Experiment Logging Utilities
Reusable helpers for recording configuration, metrics, and notes per experiment run.


In [7]:
# --- 3.3: Experiment Logging Utilities ---

import json
import os
from datetime import datetime

EXPERIMENT_LOG_PATH = "/content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint/experiment_log.json"

def _load_experiment_log(path: str) -> dict:
    if os.path.exists(path):
        with open(path, "r") as f:
            return json.load(f)
    return {"experiments": []}

def log_experiment(version: str, config: dict, results: dict, notes: str = "") -> None:
    log = _load_experiment_log(EXPERIMENT_LOG_PATH)
    entry = {
        "version": version,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "config": config,
        "results": results,
        "notes": notes,
    }
    log["experiments"].append(entry)
    os.makedirs(os.path.dirname(EXPERIMENT_LOG_PATH), exist_ok=True)
    with open(EXPERIMENT_LOG_PATH, "w") as f:
        json.dump(log, f, indent=2)
    print(f"✓ Experiment {version} logged to: {EXPERIMENT_LOG_PATH}")

def view_experiments() -> None:
    if not os.path.exists(EXPERIMENT_LOG_PATH):
        print("No experiments logged yet.")
        return
    log = _load_experiment_log(EXPERIMENT_LOG_PATH)
    print("=" * 80)
    print("📊 EXPERIMENT LOG")
    print("=" * 80)
    for entry in log.get("experiments", []):
        print()  # blank line between entries
        version = entry.get("version", "(unknown version)")
        ts = entry.get("timestamp", "")
        print(f"{version} - {ts}")
        print("-" * 80)
        print("Config:")
        for key, value in entry.get("config", {}).items():
            print(f"  • {key}: {value}")
        print()
        print("Results:")
        for key, value in entry.get("results", {}).items():
            print(f"  • {key}: {value}")
        notes = entry.get("notes")
        if notes:
            print()
            print(f"Notes: {notes}")
        print("-" * 80)




<a id="section-4-training-config"></a>
## 4. Training Configuration & Monitoring

### 4.1 GPU Monitoring Callback
Capture step timings, GPU utilization, and evaluation metrics during training.


In [8]:

# --- 4.1: GPU Monitoring Callback ---

import time
from datetime import timedelta
from transformers import TrainerCallback
import numpy as np
import torch

class GPUMonitorCallback(TrainerCallback):
    """Custom callback for detailed GPU and training monitoring with smoothed ETAs."""

    def __init__(self):
        self.start_time = None
        self.step_times = []
        self.last_step_timestamp = None
        self.best_eval_accuracy = 0.0
        self.smoothed_step_time = None

    def _format_timedelta(self, seconds: float) -> str:
        return str(timedelta(seconds=int(max(seconds, 0))))

    def on_train_begin(self, args, state, control, **kwargs):
        self.start_time = time.time()
        self.last_step_timestamp = self.start_time
        self.step_times.clear()
        self.smoothed_step_time = None

        print("\n" + "=" * 80)
        print("🚀 TRAINING STARTED")
        print("=" * 80)

        if torch.cuda.is_available():
            gpu_name = torch.cuda.get_device_name(0)
            total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            print(f"GPU: {gpu_name}")
            print(f"Total GPU Memory: {total_memory:.2f} GB")

        print(f"\nTotal training steps: {state.max_steps:,}")
        print(f"Logging every: {args.logging_steps} steps")
        print(f"Evaluation every: {args.eval_steps} steps")
        print(f"Save checkpoint every: {args.save_steps} steps")
        print("=" * 80 + "\n")

    def on_step_end(self, args, state, control, **kwargs):
        if self.last_step_timestamp is None:
            self.last_step_timestamp = time.time()
            return
        now = time.time()
        step_time = now - self.last_step_timestamp
        self.last_step_timestamp = now

        if step_time > 0:
            self.step_times.append(step_time)
            if len(self.step_times) > 100:
                self.step_times.pop(0)

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None:
            return

        current_step = state.global_step
        total_steps = state.max_steps

        elapsed = time.time() - self.start_time
        overall_avg = elapsed / current_step if current_step else None
        recent_avg = np.mean(self.step_times[-30:]) if self.step_times else None

        if recent_avg and overall_avg:
            if self.smoothed_step_time is None:
                self.smoothed_step_time = 0.7 * recent_avg + 0.3 * overall_avg
            else:
                self.smoothed_step_time = 0.6 * recent_avg + 0.4 * self.smoothed_step_time
        elif overall_avg:
            self.smoothed_step_time = overall_avg
        elif recent_avg:
            self.smoothed_step_time = recent_avg

        if self.smoothed_step_time and current_step:
            remaining_steps = max(total_steps - current_step, 0)
            eta_seconds = self.smoothed_step_time * remaining_steps
            eta_str = self._format_timedelta(eta_seconds)
            avg_step_time_display = self.smoothed_step_time
        else:
            eta_str = "Estimating..."
            avg_step_time_display = 0.0

        effective_batch = args.per_device_train_batch_size * args.gradient_accumulation_steps
        samples_per_sec = effective_batch / avg_step_time_display if avg_step_time_display > 0 else 0.0

        allocated = reserved = total = free = utilization_pct = 0.0
        if torch.cuda.is_available():
            total = torch.cuda.get_device_properties(0).total_memory / 1e9
            allocated = torch.cuda.memory_allocated(0) / 1e9
            reserved = torch.cuda.memory_reserved(0) / 1e9
            free = total - allocated
            utilization_pct = (allocated / total) * 100 if total else 0.0

        progress_pct = (current_step / total_steps) * 100 if total_steps else 0
        elapsed_str = self._format_timedelta(elapsed)

        print("\n" + "─" * 80)
        print(f"📊 Step {current_step:,}/{total_steps:,} ({progress_pct:.1f}%)")
        print("─" * 80)

        if 'loss' in logs:
            print(f"Loss: {logs['loss']:.4f}")
        if 'learning_rate' in logs:
            print(f"Learning Rate: {logs['learning_rate']:.2e}")
        if 'eval_accuracy' in logs:
            accuracy = logs['eval_accuracy']
            print(f"Validation Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
            if accuracy > self.best_eval_accuracy:
                self.best_eval_accuracy = accuracy
                print("🎉 New best accuracy!")

        print(f"\n⏱️  Performance:")
        print(f"   • Avg step time (smoothed): {avg_step_time_display:.2f}s")
        print(f"   • Samples/sec (approx): {samples_per_sec:.2f}")
        print(f"   • Elapsed: {elapsed_str}")
        print(f"   • ETA: {eta_str}")

        print(f"\n🎮 GPU Memory:")
        print(f"   • Allocated: {allocated:.2f} GB / {total:.2f} GB ({utilization_pct:.1f}%)")
        print(f"   • Reserved: {reserved:.2f} GB")
        print(f"   • Free (estimated): {free:.2f} GB")

        if utilization_pct < 25:
            print(f"   • Info: Memory utilisation currently low ({utilization_pct:.1f}%). This is expected early in training.")
        elif utilization_pct > 95:
            print(f"   ⚠️  WARNING: Very high memory usage ({utilization_pct:.1f}%). Risk of OOM – consider smaller batch size.")

        print("─" * 80)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if not metrics:
            return
        print("\n" + "=" * 80)
        print("📈 EVALUATION RESULTS")
        print("=" * 80)
        for key, value in metrics.items():
            if isinstance(value, float):
                if 'accuracy' in key:
                    print(f"{key}: {value:.4f} ({value*100:.2f}%)")
                else:
                    print(f"{key}: {value:.4f}")
            else:
                print(f"{key}: {value}")
        if 'eval_accuracy' in metrics:
            print(f"\n🏆 Best accuracy so far: {self.best_eval_accuracy:.4f} ({self.best_eval_accuracy*100:.2f}%)")
        print("=" * 80 + "\n")

    def on_train_end(self, args, state, control, **kwargs):
        if self.start_time is None:
            return
        total_time = time.time() - self.start_time
        total_time_str = self._format_timedelta(total_time)

        print("\n" + "=" * 80)
        print("✅ TRAINING COMPLETED")
        print("=" * 80)
        print(f"Total training time: {total_time_str}")
        print(f"Total steps: {state.global_step:,}")
        if self.step_times:
            avg_step_time = sum(self.step_times) / len(self.step_times)
            print(f"Average step time (last {len(self.step_times)} steps): {avg_step_time:.2f}s")
        if self.best_eval_accuracy > 0:
            print(f"🏆 Best validation accuracy: {self.best_eval_accuracy:.4f} ({self.best_eval_accuracy*100:.2f}%)")
        print("=" * 80 + "\n")


def compute_metrics(eval_pred):
    """Compute accuracy for evaluation."""
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    pred_labels = np.argmax(predictions, axis=-1) if predictions.ndim > 1 else predictions
    pred_labels = pred_labels.flatten()
    labels = labels.flatten()
    mask = labels != -100
    pred_labels = pred_labels[mask]
    labels = labels[mask]
    accuracy = (pred_labels == labels).mean()
    return {"accuracy": float(accuracy)}


def check_gpu_status():
    """Display initial GPU status."""
    print("\n" + "=" * 80)
    print("🔍 PRE-TRAINING GPU CHECK")
    print("=" * 80)

    if not torch.cuda.is_available():
        print("❌ No GPU available!")
        return False

    gpu_name = torch.cuda.get_device_name(0)
    gpu_props = torch.cuda.get_device_properties(0)
    total_memory = gpu_props.total_memory / 1e9

    print(f"GPU Name: {gpu_name}")
    print(f"Total Memory: {total_memory:.2f} GB")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")

    torch.cuda.empty_cache()
    allocated = torch.cuda.memory_allocated(0) / 1e9
    reserved = torch.cuda.memory_reserved(0) / 1e9
    free = total_memory - allocated

    print(f"\nCurrent Memory Status:")
    print(f"   • Allocated: {allocated:.2f} GB")
    print(f"   • Reserved: {reserved:.2f} GB")
    print(f"   • Free: {free:.2f} GB")

    if "A100" in gpu_name:
        print(f"\n✅ A100 detected!")
        print(f"   • {'80GB' if total_memory > 70 else '40GB'} configuration")

    print("=" * 80 + "\n")
    return True

# Run GPU check
check_gpu_status()



🔍 PRE-TRAINING GPU CHECK
GPU Name: NVIDIA A100-SXM4-80GB
Total Memory: 85.17 GB
CUDA Version: 12.6
PyTorch Version: 2.8.0+cu126

Current Memory Status:
   • Allocated: 17.10 GB
   • Reserved: 17.12 GB
   • Free: 68.07 GB

✅ A100 detected!
   • 80GB configuration



True


### 4.2 Configure SFT Trainer
Bind the model, datasets, and experiment settings into the TRL `SFTTrainer` while attaching monitoring callbacks.


In [11]:

# --- 4.2: SFT Trainer Configuration ---

from trl import SFTTrainer, SFTConfig

TRAINING_CONFIGS = {
    "overnight_v2": {
        "learning_rate": 3.2e-5,
        "per_device_batch_size": 48,
        "grad_accum": 2,
        "max_steps": 2100,
        "logging_steps": 10,
        "eval_steps": 300,
        "save_steps": 300,
        "warmup_ratio": 0.05,
        "expected_step_time": 16.0,
        "val_samples": 12000,
        "bf16": True,
        "notes": "Overnight refinement targeting ≥0.95 leaderboard accuracy with stable memory footprint."
    }
}

train_config = TRAINING_CONFIGS[EXPERIMENT_VERSION]
warmup_steps = int(train_config["max_steps"] * train_config["warmup_ratio"])
use_bf16 = train_config.get("bf16", torch.cuda.is_bf16_supported()) and torch.cuda.is_available()

max_seq_length = 1408

default_output = "/content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint/overnight_v2"

args = SFTConfig(
    per_device_train_batch_size=train_config["per_device_batch_size"],
    gradient_accumulation_steps=train_config["grad_accum"],
    max_steps=train_config["max_steps"],
    learning_rate=train_config["learning_rate"],
    lr_scheduler_type="cosine",
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    optim="adamw_8bit",
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=train_config["logging_steps"],
    eval_strategy="steps",
    eval_steps=train_config["eval_steps"],
    save_strategy="steps",
    save_steps=train_config["save_steps"],
    save_total_limit=8,
    load_best_model_at_end=False,
    report_to="none",
    output_dir=default_output,
    seed=42,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
)

train_data = formatted_train_dataset
val_data = formatted_val_dataset

gpu_monitor = GPUMonitorCallback()

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_kwargs={"add_special_tokens": False, "append_concat_token": False},
    args=args,
    callbacks=[gpu_monitor],
)

effective_batch = args.per_device_train_batch_size * args.gradient_accumulation_steps
estimated_minutes = train_config["max_steps"] * train_config["expected_step_time"] / 60

CURRENT_EXP_CONFIG = {
    "version": EXPERIMENT_VERSION,
    "learning_rate": train_config["learning_rate"],
    "per_device_batch_size": args.per_device_train_batch_size,
    "gradient_accumulation": args.gradient_accumulation_steps,
    "effective_batch": effective_batch,
    "max_steps": train_config["max_steps"],
    "warmup_steps": warmup_steps,
    "expected_step_time": train_config["expected_step_time"],
    "sequence_length": max_seq_length,
    "val_samples": train_config["val_samples"],
    "bf16": bool(args.bf16),
    "output_dir": default_output,
    "notes": train_config["notes"],
}

print(f"🔬 Training Config for {EXPERIMENT_VERSION}")
print(f"   LR: {train_config['learning_rate']} | Max steps: {args.max_steps} | Warmup: {warmup_steps}")
print(f"   Per-device batch: {args.per_device_train_batch_size} | Grad accum: {args.gradient_accumulation_steps}")
print(f"   Effective batch: {effective_batch} | Estimated runtime: ~{estimated_minutes:.1f} minutes (~{estimated_minutes/60:.1f} hours)")
print(f"   Output directory: {default_output}")


🔬 Training Config for overnight_v2
   LR: 3.2e-05 | Max steps: 2100 | Warmup: 105
   Per-device batch: 48 | Grad accum: 2
   Effective batch: 96 | Estimated runtime: ~560.0 minutes (~9.3 hours)
   Output directory: /content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint/overnight_v2



<a id="section-5-training-workflow"></a>
## 5. Training Workflow

### 5.1 Launch Training
Kick off supervised fine-tuning once Sections 1–4 are complete. Monitor the GPU callback output for throughput and utilization guidance.


In [12]:

# --- 5.1: Launch Training ---

#resume_ckpt = "/content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint/a100_turbo_final"
#print(f"Starting overnight fine-tuning from {resume_ckpt} ...")
#trainer.train(resume_from_checkpoint=resume_ckpt)
trainer.train()


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 950,000 | Num Epochs = 1 | Total steps = 2,100
O^O/ \_/ \    Batch size per device = 48 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (48 x 2 x 1) = 96
 "-____-"     Trainable parameters = 251,658,240 of 8,281,919,488 (3.04% trained)



🚀 TRAINING STARTED
GPU: NVIDIA A100-SXM4-80GB
Total GPU Memory: 85.17 GB

Total training steps: 2,100
Logging every: 10 steps
Evaluation every: 300 steps
Save checkpoint every: 300 steps



Step,Training Loss,Validation Loss
300,0.7162,0.701906
600,0.6234,0.624652
900,0.5644,0.570752
1200,0.5311,0.533432
1500,0.5066,0.507949
1800,0.4925,0.49448
2100,0.4827,0.491817



────────────────────────────────────────────────────────────────────────────────
📊 Step 10/2,100 (0.5%)
────────────────────────────────────────────────────────────────────────────────
Loss: 1.6319
Learning Rate: 2.74e-06

⏱️  Performance:
   • Avg step time (smoothed): 19.86s
   • Samples/sec (approx): 4.83
   • Elapsed: 0:03:18
   • ETA: 11:31:54

🎮 GPU Memory:
   • Allocated: 17.63 GB / 85.17 GB (20.7%)
   • Reserved: 61.36 GB
   • Free (estimated): 67.54 GB
   • Info: Memory utilisation currently low (20.7%). This is expected early in training.
────────────────────────────────────────────────────────────────────────────────

────────────────────────────────────────────────────────────────────────────────
📊 Step 20/2,100 (1.0%)
────────────────────────────────────────────────────────────────────────────────
Loss: 1.5540
Learning Rate: 5.79e-06

⏱️  Performance:
   • Avg step time (smoothed): 20.21s
   • Samples/sec (approx): 4.75
   • Elapsed: 0:06:48
   • ETA: 11:40:39

🎮 GPU Memo

Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient



────────────────────────────────────────────────────────────────────────────────
📊 Step 300/2,100 (14.3%)
────────────────────────────────────────────────────────────────────────────────

⏱️  Performance:
   • Avg step time (smoothed): 20.81s
   • Samples/sec (approx): 4.61
   • Elapsed: 2:27:21
   • ETA: 10:24:12

🎮 GPU Memory:
   • Allocated: 17.63 GB / 85.17 GB (20.7%)
   • Reserved: 17.85 GB
   • Free (estimated): 67.54 GB
   • Info: Memory utilisation currently low (20.7%). This is expected early in training.
────────────────────────────────────────────────────────────────────────────────

📈 EVALUATION RESULTS
eval_loss: 0.7019
eval_runtime: 2841.1225
eval_samples_per_second: 17.5990
eval_steps_per_second: 4.4000
epoch: 0.0303


────────────────────────────────────────────────────────────────────────────────
📊 Step 310/2,100 (14.8%)
────────────────────────────────────────────────────────────────────────────────
Loss: 0.7171
Learning Rate: 3.12e-05

⏱️  Performance:
   • Avg step

TrainOutput(global_step=2100, training_loss=0.5934949354898362, metrics={'train_runtime': 62771.6938, 'train_samples_per_second': 3.212, 'train_steps_per_second': 0.033, 'total_flos': 6.978346704956031e+18, 'train_loss': 0.5934949354898362, 'epoch': 0.2122069523039612})


<a id="section-6-post-training"></a>
## 6. Post-Training Logging & Analysis

### 6.1 Validate on Hold-out Set
Run the accuracy sweep to ensure the prompt and adapter settings generalise.

### 6.2 Log Experiment Results
Record metrics and notes once validation performance looks healthy.


In [13]:

# --- 6.1: Validation Accuracy Sweep ---

import gc
import torch
from tqdm import tqdm

model.eval()

cfg_limit = TRAINING_CONFIGS[EXPERIMENT_VERSION].get("val_samples", len(validation_dataset))
VAL_SAMPLE_LIMIT = min(cfg_limit, len(validation_dataset), 12000)

val_subset = validation_dataset.select(range(VAL_SAMPLE_LIMIT)) if VAL_SAMPLE_LIMIT < len(validation_dataset) else validation_dataset

pad_token = tokenizer.eos_token
if tokenizer.pad_token != pad_token:
    tokenizer.pad_token = pad_token
if tokenizer.pad_token_id != tokenizer.eos_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
tokenizer.init_kwargs["padding_side"] = "left"
if hasattr(tokenizer, "default_padding_side"):
    tokenizer.default_padding_side = "left"


def generate_boolean_predictions(prompts, batch_size=12, max_new_tokens=4, desc="Generating"):
    predictions = []
    invalid = 0
    iterator = tqdm(range(0, len(prompts), batch_size), desc=desc, leave=False)

    for start in iterator:
        batch_prompts = prompts[start:start + batch_size]
        tokenizer.padding_side = "left"
        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_seq_length,
        ).to(model.device)

        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                temperature=0.0,
                top_p=1.0,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        generated = outputs[:, inputs["input_ids"].shape[-1]:]
        batch_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)

        for text in batch_texts:
            cleaned = text.strip()
            first_token = cleaned.split()[0] if cleaned else ""
            lower = first_token.lower()
            if lower.startswith("true"):
                predictions.append(True)
            elif lower.startswith("false"):
                predictions.append(False)
            else:
                invalid += 1
                predictions.append(False)

    return predictions, invalid

if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

val_prompts = [
    build_prompt(q, a, s)
    for q, a, s in zip(val_subset["question"], val_subset["answer"], val_subset["solution"])
]

val_labels = [bool(x) for x in val_subset["is_correct"]]

predictions, invalid_generations = generate_boolean_predictions(val_prompts, batch_size=12, desc="Validating")

correct = sum(int(p == l) for p, l in zip(predictions, val_labels))
val_accuracy = correct / len(val_labels) if val_labels else 0.0
true_rate = sum(predictions) / len(predictions) if predictions else 0.0

LATEST_EVAL_RESULTS = {
    "val_accuracy": val_accuracy,
    "val_size": len(val_labels),
    "invalid_generations": invalid_generations,
    "true_rate": true_rate,
}

print("=" * 80)
print(f"Validation subset size: {len(val_labels):,}")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"True prediction rate: {true_rate:.4f}")
print(f"Invalid generations coerced to False: {invalid_generations}")
print("=" * 80)


                                                               

Validation subset size: 12,000
Accuracy: 0.8818
True prediction rate: 0.3867
Invalid generations coerced to False: 2




In [14]:

# --- 6.2: Log Experiment Results ---

if "LATEST_EVAL_RESULTS" not in globals():
    raise RuntimeError("Run the validation sweep first to populate LATEST_EVAL_RESULTS.")

val_accuracy = LATEST_EVAL_RESULTS.get("val_accuracy")
val_size = LATEST_EVAL_RESULTS.get("val_size")
invalid_generations = LATEST_EVAL_RESULTS.get("invalid_generations")
true_rate = LATEST_EVAL_RESULTS.get("true_rate")

train_loss = trainer.state.log_history[-1].get('loss', 'N/A') if trainer.state.log_history else 'N/A'
eval_loss = trainer.state.log_history[-1].get('eval_loss', 'N/A') if trainer.state.log_history else 'N/A'

if hasattr(gpu_monitor, 'step_times') and gpu_monitor.step_times:
    recent = gpu_monitor.step_times[-50:]
    avg_step_time = sum(recent) / len(recent)
else:
    avg_step_time = 'N/A'

samples_seen = CURRENT_EXP_CONFIG['max_steps'] * CURRENT_EXP_CONFIG['effective_batch']
val_limit_cfg = TRAINING_CONFIGS[EXPERIMENT_VERSION].get("val_samples", len(validation_dataset))
val_limit_effective = min(val_limit_cfg, len(validation_dataset), 18000)

log_experiment(
    version=EXPERIMENT_VERSION,
    config={**CURRENT_EXP_CONFIG, "val_samples_used": val_size},
    results={
        "kaggle_score": "PENDING",
        "final_train_loss": train_loss,
        "final_val_loss": eval_loss,
        "val_accuracy": f"{val_accuracy:.4f}" if val_accuracy is not None else "NOT_RUN",
        "val_samples_used": val_size if val_size is not None else "NOT_RUN",
        "val_sample_limit": val_limit_effective,
        "invalid_generations": invalid_generations if invalid_generations is not None else 0,
        "true_rate": f"{true_rate:.4f}" if true_rate is not None else "NOT_RUN",
        "samples_seen": samples_seen,
        "avg_step_time": f"{avg_step_time:.2f}s" if isinstance(avg_step_time, float) else avg_step_time,
        "bf16": CURRENT_EXP_CONFIG.get('bf16'),
    },
    notes=f"{CURRENT_EXP_CONFIG['notes']}"
)

print()
print("=" * 80)
print(f"✅ {EXPERIMENT_VERSION} RESULTS LOGGED")
print("=" * 80)

view_experiments()

print()
print("=" * 80)
print("📋 NEXT STEPS:")
print("=" * 80)
print("1. ▶️  Run Section 7 to generate submission.csv")
print("2. 📥 Download submission.csv from Colab")
print("3. 🏆 Submit to Kaggle and record your score")
print("4. 🔁 Adjust config if you plan another run")
print("=" * 80)



✓ Experiment overnight_v2 logged to: /content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint/experiment_log.json

✅ overnight_v2 RESULTS LOGGED
📊 EXPERIMENT LOG

overnight_v2 - 2025-11-02 21:34:11
--------------------------------------------------------------------------------
Config:
  • version: overnight_v2
  • learning_rate: 3.2e-05
  • per_device_batch_size: 48
  • gradient_accumulation: 2
  • effective_batch: 96
  • max_steps: 2100
  • warmup_steps: 105
  • expected_step_time: 16.0
  • sequence_length: 1408
  • val_samples: 12000
  • bf16: True
  • output_dir: /content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint/overnight_v2
  • notes: Overnight refinement targeting ≥0.95 leaderboard accuracy with stable memory footprint.
  • val_samples_used: 12000

Results:
  • kaggle_score: PENDING
  • final_train_loss: N/A
  • final_val_loss: N/A
  • val_accuracy: 0.8818
  • val_samples_used: 12000
  • val_sample_limit: 12000
  • invalid_generations: 2
  • true_rate: 0.3867
  • samples_s


<a id="section-7-inference"></a>
## 7. Inference & Submission

### 7.1 Batch Inference & CSV Export
Run batched decoding on the competition test set and create a submission-ready CSV file.


In [15]:

# --- 7.1: Batch Inference & Submission ---

import gc
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
import torch

model.eval()

if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

pad_token = tokenizer.eos_token
if tokenizer.pad_token != pad_token:
    tokenizer.pad_token = pad_token
if tokenizer.pad_token_id != tokenizer.eos_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
tokenizer.init_kwargs["padding_side"] = "left"
if hasattr(tokenizer, "default_padding_side"):
    tokenizer.default_padding_side = "left"

FINAL_BATCH_SIZE = 12
print(f"Using inference batch size: {FINAL_BATCH_SIZE}")

test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")

test_prompts = [
    build_prompt(q, a, s)
    for q, a, s in zip(test_dataset["question"], test_dataset["answer"], test_dataset["solution"])
]

predictions, invalid_generations = generate_boolean_predictions(test_prompts, batch_size=FINAL_BATCH_SIZE, desc="Predicting")

submission = pd.DataFrame({
    "ID": range(len(predictions)),
    "is_correct": predictions,
})

submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)

print("=" * 80)
print(f"Submission file created: {submission_path}")
print(f"Total predictions: {len(predictions):,}")
print(f"Invalid generations coerced to False: {invalid_generations}")
print("=" * 80)


Using inference batch size: 12


                                                             

Submission file created: submission.csv
Total predictions: 10,000
Invalid generations coerced to False: 1




#Saving the model

In [16]:
from pathlib import Path
import shutil

POST_TRAIN_ROOT = Path("/content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint/post_training")
FINAL_ADAPTER_DIR = POST_TRAIN_ROOT / f"{EXPERIMENT_VERSION}_final"

POST_TRAIN_ROOT.mkdir(parents=True, exist_ok=True)
if FINAL_ADAPTER_DIR.exists():
    print(f"⚠️ Existing snapshot found at {FINAL_ADAPTER_DIR}. Overwriting with latest weights.")
    shutil.rmtree(FINAL_ADAPTER_DIR)

FINAL_ADAPTER_DIR.mkdir(parents=True, exist_ok=True)

print(f"Saving final adapter and tokenizer to: {FINAL_ADAPTER_DIR}")

model.save_pretrained(FINAL_ADAPTER_DIR)
tokenizer.save_pretrained(FINAL_ADAPTER_DIR)

CURRENT_EXP_CONFIG["final_adapter"] = str(FINAL_ADAPTER_DIR)

output_dir_path = Path(args.output_dir)
checkpoints = sorted(
    output_dir_path.glob("checkpoint-*"),
    key=lambda p: int(p.name.split("-")[-1]),
    reverse=True,
)
if checkpoints:
    latest_checkpoint = checkpoints[0]
    CURRENT_EXP_CONFIG["last_checkpoint"] = str(latest_checkpoint)
    print(f"Latest training checkpoint preserved at: {latest_checkpoint}")
else:
    print("No checkpoint directories found under the trainer output.")

print("✓ Final adapter snapshot saved.")


Saving final adapter and tokenizer to: /content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint/post_training/overnight_v2_final
Latest training checkpoint preserved at: /content/drive/MyDrive/dl_checkpoints/Midterm Checkpoint/overnight_v2/checkpoint-2100
✓ Final adapter snapshot saved.
