In [1]:
!pip install "unsloth[colab-new]"



In [2]:
# The 'os' module gives access to environment variables and the filesystem.
import os


def resolve_save_path(environ=None, kaggle_input_dir="/kaggle/input"):
    """Pick the checkpoint directory for the platform this notebook runs on.

    The returned paths are the same ones train.py writes to, so the
    post-training cells look where the training script actually saved.

    Args:
        environ: Mapping of environment variables to inspect. Defaults to
            ``os.environ``.
        kaggle_input_dir: Directory whose existence marks a Kaggle notebook.

    Returns:
        A ``(environment_name, save_path)`` tuple.
    """
    env = os.environ if environ is None else environ
    if 'COLAB_GPU' in env:
        # train.py saves to this runtime-local folder when it detects Colab.
        # Nothing visible in this notebook mounts Google Drive, so pointing at
        # /content/drive/MyDrive here would only create an unrelated local
        # folder that the training script never writes to.
        return "Google Colab", "llama3_math_checkpoint_colab_local"
    if os.path.exists(kaggle_input_dir):
        # /kaggle/working is the directory the notebook is allowed to write to.
        return "Kaggle Notebook", "/kaggle/working/llama3_math_checkpoint"
    return "a local environment", "llama3_math_checkpoint"


# --- Environment Check ---
environment_name, SAVE_PATH = resolve_save_path()
print(f"Running in {environment_name}.")

# --- Create Save Directory ---
# exist_ok=True makes this a no-op when the folder is already there.
# train.py creates the same directory itself; doing it here as well lets the
# post-training cells rely on it even before training has run.
os.makedirs(SAVE_PATH, exist_ok=True)

print(f"Post-training model checkpoint location set to: {SAVE_PATH}")
print("Cell 4 Complete: Environment check done.")

Running in Kaggle Notebook.
Post-training model checkpoint location set to: /kaggle/working/llama3_math_checkpoint
Cell 4 Complete: Environment check done.


In [3]:
%%writefile train.py
# --- 1. IMPORTS ---
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback, AutoTokenizer
import os
from accelerate import Accelerator
import pandas as pd
from tqdm import tqdm

# --- 2. INITIALIZE ACCELERATOR ---
# Every process started by `accelerate launch` builds its own Accelerator and
# reports the rank and device it was assigned.
accelerator = Accelerator()
print(f"train.py: Process rank: {accelerator.process_index}, Device: {accelerator.device}, Main process: {accelerator.is_main_process}")

# --- 3. ENVIRONMENT AND SAVE PATH (DYNAMIC) ---
# Decide where the final adapters and tokenizer are written, based on the
# platform detected from the environment. The Kaggle probe is only evaluated
# when the Colab marker is absent.
on_colab = 'COLAB_GPU' in os.environ
on_kaggle = (not on_colab) and os.path.exists('/kaggle/input')

if on_colab:
    print("train.py: Running in Google Colab.")
    SAVE_PATH = "llama3_math_checkpoint_colab_local"
elif on_kaggle:
    print("train.py: Running in Kaggle Notebook.")
    SAVE_PATH = "/kaggle/working/llama3_math_checkpoint"
else:
    print("train.py: Running in a local environment.")
    SAVE_PATH = "llama3_math_checkpoint"

# Only the main process creates the directory; every process then waits at
# wait_for_everyone() before continuing.
if accelerator.is_main_process:
    os.makedirs(SAVE_PATH, exist_ok=True)
accelerator.wait_for_everyone()
print(f"train.py: Model checkpoint save path: {SAVE_PATH}")

# --- 4. LOAD MODEL AND TOKENIZER ---
# max_seq_length is reused further down when the trainer is built.
max_seq_length = 4096
dtype = None
load_in_4bit = True

# One process: let the loader place the model ("auto").
# Several processes: pin the whole model to the device of the current process.
if accelerator.num_processes == 1:
    print("Single-GPU setup. Using device_map='auto'.")
    device_map = "auto"
else:
    print(f"Multi-GPU setup ({accelerator.num_processes} processes). Using accelerate device_map.")
    device_map = {"": accelerator.device}

load_model_kwargs = dict(
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map=device_map,
)

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    **load_model_kwargs,
)

# --- Apply LoRA ---
# Adapters are attached to the attention projections and the MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    target_modules=lora_target_modules,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# --- 5. PREPARE DATASET ---
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train")
# A fixed seed keeps the train/eval split identical across runs and processes.
shuffled_dataset = dataset.shuffle(seed=42)

# Split sizes are named once so that comments and code cannot drift apart.
TRAIN_SIZE = 60_000  # examples used for training
EVAL_SIZE = 5_000    # examples held out for validation

# The first TRAIN_SIZE shuffled rows are used for training; the EVAL_SIZE rows
# that follow them are used for evaluation, so the two subsets never overlap.
train_subset = shuffled_dataset.select(range(TRAIN_SIZE))
eval_subset = shuffled_dataset.select(range(TRAIN_SIZE, TRAIN_SIZE + EVAL_SIZE))

# Prompt layout with four positional slots, filled in this order:
# question, expected answer, provided solution, and the True/False label.
training_prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a meticulous math solution verifier. Your task is to carefully analyze the 'Provided Solution' step-by-step to determine if it logically and correctly reaches the 'Expected Answer' for the given 'Question'. Respond ONLY with 'True' if the solution's reasoning AND final result are correct and match the expected answer, otherwise respond ONLY with 'False'.<|eot_id|><|start_header_id|>user<|end_header_id|>
Question:
{}

Expected Answer:
{}

Provided Solution:
{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{}<|eot_id|>"""

def formatting_prompts_func(examples):
    """Render a batch of examples into full training prompts.

    Args:
        examples: Batch mapping with the columns "question", "solution",
            "answer" and "is_correct", each holding one value per example.

    Returns:
        A dict with a single "text" key whose value is the list of rendered
        prompts, one per example, in input order.
    """
    question_col = examples["question"]
    solution_col = examples["solution"]
    answer_col = examples["answer"]
    # The label is stringified so it appears in the prompt as its str() form.
    label_col = [str(flag) for flag in examples["is_correct"]]

    rendered = [
        training_prompt_template.format(question, str(answer), str(solution), label)
        for question, solution, answer, label in zip(question_col, solution_col, answer_col, label_col)
    ]
    return {"text": rendered}

def default_num_proc(cpu_count):
    """Choose how many worker processes to use for dataset preprocessing.

    Args:
        cpu_count: Value of ``os.cpu_count()``; may be ``None`` when the
            number of CPUs cannot be determined.

    Returns:
        Half of the available CPUs, but never fewer than 1. Falls back to 4
        when the CPU count is unknown.
    """
    if not cpu_count:
        return 4
    # Integer division yields 0 on a single-CPU machine; a worker count of 0
    # is not a valid number of processes, so clamp to at least 1.
    return max(1, cpu_count // 2)


num_proc = default_num_proc(os.cpu_count())

# Both splits are formatted with the same batched, multi-process map settings.
map_options = {"batched": True, "num_proc": num_proc}

# The main process enters this block before the others — presumably so the
# remaining ranks can reuse its cached result instead of recomputing it.
with accelerator.main_process_first():
    final_train_dataset = train_subset.map(formatting_prompts_func, **map_options)
    final_eval_dataset = eval_subset.map(formatting_prompts_func, **map_options)

print(f"train.py Process {accelerator.process_index}: Dataset prepared: {len(final_train_dataset)} train / {len(final_eval_dataset)} eval examples.")

# --- 6. SET UP TRAINER ---
# Query bf16 support once; exactly one of fp16 / bf16 is enabled from it.
use_bf16 = torch.cuda.is_bf16_supported()

# NOTE(review): evaluation and checkpointing happen every 1000 optimizer steps
# and the best checkpoint is reloaded at the end. If one epoch is only a little
# longer than 1000 steps, few evaluations occur and the early-stopping patience
# of 5 cannot be reached — confirm this cadence is intended.
training_args = TrainingArguments(
    # Run bookkeeping
    output_dir="training_outputs",
    seed=42,
    report_to="none",
    num_train_epochs=1,

    # Batching: 2 sequences per device per step, accumulated over 8 steps,
    # i.e. 16 sequences per device per optimizer update.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,

    # Optimisation
    optim="adamw_8bit",
    learning_rate=1e-4,
    weight_decay=0.01,
    fp16=not use_bf16,
    bf16=use_bf16,

    # Logging, evaluation and checkpointing all run on a step schedule.
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,

    # Model selection: lowest eval_loss wins and is reloaded when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    ddp_find_unused_parameters=False,
)

# Stop when the tracked metric fails to improve for 5 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=final_train_dataset,
    eval_dataset=final_eval_dataset,
    dataset_text_field="text",      # column produced by formatting_prompts_func
    max_seq_length=max_seq_length,  # same limit the model was loaded with
    packing=False,
    callbacks=[early_stopping],
)

# --- 7. START TRAINING ---
# Every process logs the start and end of training with its own rank.
print(f"--- 🚀 Process {accelerator.process_index}: Starting Model Training ---")
trainer.train()
print(f"--- ✅ Process {accelerator.process_index}: Model Training Complete ---")

# --- 8. SAVE FINAL MODEL (Only on Main Process) ---
# All processes wait here before the main process writes the model and
# tokenizer to SAVE_PATH; the other processes skip the save.
accelerator.wait_for_everyone()
if accelerator.is_main_process:
    print(f"Saving final model checkpoint to {SAVE_PATH}")
    trainer.model.save_pretrained(SAVE_PATH)
    tokenizer.save_pretrained(SAVE_PATH)
    print(f"✅ Model adapters and tokenizer saved to: {SAVE_PATH}")
else:
    print(f"Process {accelerator.process_index}: Skipping save.")
# Second wait so that no process moves on until the main process has finished saving.
accelerator.wait_for_everyone()
print(f"train.py: Process {accelerator.process_index} finished.")

Overwriting train.py


In [None]:
print("Cell 10: Launching training script...")
import torch
import os

# --- Launch the training script using accelerate ---
if torch.cuda.is_available():
    num_processes = torch.cuda.device_count()
    print(f"Found {num_processes} GPU(s). Launching with {num_processes} processes.")
    command = f"accelerate launch --num_processes={num_processes} train.py"
else:
    num_processes = 1
    print("No GPU found. Launching with 1 CPU process.")
    command = f"accelerate launch --num_processes={num_processes} --cpu train.py"

print(f"Running command: {command}")

# Use ! instead of os.system() to see live output and errors
!{command}

print("Cell 10 Complete: Training script finished.")

Cell 10: Launching training script...
Found 2 GPU(s). Launching with 2 processes.
Running command: accelerate launch --num_processes=2 train.py
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
2025-11-02 02:13:40.298009: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-02 02:13:40.299021: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762049620.343907     928 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762049620.346644     927 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to regis