# Mount Google Drive


In [None]:
# Mount Google Drive at /content/drive. The output directory and the checkpoint
# path used later in this notebook both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that is passed to the trainer as `output_dir`.
output_dir="/content/drive/MyDrive/llama3_8b_math_verifier_outputs"


# Install the Unsloth library and all its required dependencies.

In [None]:
%%capture
import os, re, torch

if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    torch_version = torch.__version__.split("+")[0]
    match = re.match(r"[0-9.]{3,}", torch_version)
    v = match.group(0) if match else "unknown"

    if v.startswith("2.8.0"):
        xformers = "xformers==0.0.32.post2"
    else:
        xformers = "xformers==0.0.29.post3"

    print(f"Detected torch version: {torch_version} â†’ installing {xformers}")

    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2


# Load the 4-bit quantized Llama 3.1 8B model and tokenizer using Unsloth.

In [None]:
from unsloth import FastLanguageModel
import torch

# --- Loading configuration ---
load_in_4bit = True    # 4-bit quantization to save GPU memory
dtype = None           # None: auto-detect the best data type for the GPU
max_seq_length = 1024  # sequence-length cap; reused when the trainer is built

# Fetch the competition-approved Llama 3.1 8B checkpoint and its tokenizer
# from Hugging Face through Unsloth.
# NOTE(review): the repository name is the non-pre-quantized one while
# load_in_4bit is True -- confirm which weights Unsloth actually downloads.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


# Load and split Dataset

In [None]:
from datasets import load_dataset

# Load the full training split of the competition dataset
full_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train")

# Split sizes are named so that comments and code cannot drift apart
# (the previous comment claimed 5,000 training rows while 10,000 were selected).
TRAIN_SIZE = 10_000
VALIDATION_SIZE = 500

# Shuffle with a fixed seed so the split is reproducible, then take two
# disjoint, consecutive slices of the shuffled data.
shuffled_dataset = full_dataset.shuffle(seed=42)
train_dataset = shuffled_dataset.select(range(TRAIN_SIZE))  # first 10,000 rows for training
validation_dataset = shuffled_dataset.select(
    range(TRAIN_SIZE, TRAIN_SIZE + VALIDATION_SIZE)         # next 500 rows for validation
)

# Define the prompt template and apply it to format the training dataset.

In [None]:
# Instruction prompt template used to render each training example.
# The three `{}` placeholders are filled, in order, with the question, the
# solution, and the label (see the `.format(...)` call in formatting_prompts_func).
# The wording is part of what the model is trained on -- keep it byte-identical
# wherever the prompt is reused.
training_prompt = """Determine if the solution to the following math question is correct. Respond with 'True' or 'False'.
Question:
{}
Solution:
{}
Output:
{}"""

# End-of-sequence token appended after every rendered example so the model
# learns where a completion is finished.
EOS_TOKEN = tokenizer.eos_token

# Render a batch of dataset rows into complete training strings.
def formatting_prompts_func(examples):
    """Fill the training prompt for every row of a batch.

    Args:
        examples: mapping with parallel sequences under the keys
            "question", "solution" and "is_correct".

    Returns:
        A dict with one key, "text", holding one rendered string per row:
        the filled-in `training_prompt` followed by `EOS_TOKEN`.
    """
    rows = zip(examples["question"], examples["solution"], examples["is_correct"])
    rendered = [
        # solution and label are stringified before being placed in the template
        training_prompt.format(question, str(solution), str(label)) + EOS_TOKEN
        for question, solution, label in rows
    ]
    return {"text": rendered}

# Add a "text" column holding the fully rendered prompt for every training row.
# With batched=True, formatting_prompts_func is called with a batch of rows
# (a dict of columns) rather than one row at a time.
formatted_train_dataset = train_dataset.map(formatting_prompts_func, batched=True)

In [None]:
# LoRA hyper-parameters, named so the relationship between them is explicit.
lora_rank = 16                     # a small rank for lighter training
lora_scaling = 2 * lora_rank       # common practice: alpha = 2 * r (= 32)
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# NOTE: this cell rebinds `model` to its adapter-wrapped version, so it is not
# idempotent -- re-run the model-loading cell before running it a second time.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=lora_target_modules,
    lora_alpha=lora_scaling,
    lora_dropout=0.03,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Generate validation Dataset

In [None]:

# Render the validation rows with the same template as the training data, then
# keep only the first 50 of them as the evaluation set used during training.
formatted_validation_dataset = validation_dataset.map(
    formatting_prompts_func, batched=True
).select(range(50))


# Adjust the trainer

In [None]:
from sklearn.metrics import accuracy_score
from transformers import TrainingArguments
from trl import SFTTrainer


# Build the supervised fine-tuning trainer.
#
# Resuming from a checkpoint is requested in the trainer.train(...) call, not
# here: `checkpoint_path` is only defined in a later cell, so referencing it in
# this cell raises NameError on a fresh Restart & Run All, and the Trainer does
# not act on TrainingArguments.resume_from_checkpoint by itself.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_train_dataset,
    eval_dataset=formatted_validation_dataset,
    dataset_text_field="text",          # column produced by formatting_prompts_func
    max_seq_length=max_seq_length,
    args=TrainingArguments(
        # Batch geometry: 32 sequences x 8 accumulation steps = 256 sequences
        # per optimizer step on each device.
        # NOTE(review): the recorded run resumed a checkpoint whose trainer
        # state used per_device_train_batch_size=8 -- confirm 32 is intended
        # when resuming from that checkpoint.
        per_device_train_batch_size=32,
        gradient_accumulation_steps=8,
        warmup_steps=20,
        max_steps=1100,

        # Optimization schedule
        learning_rate=2e-5,
        lr_scheduler_type="cosine",
        weight_decay=0.01,

        # Precision: bf16 when the GPU supports it, otherwise fp16
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        optim="adamw_8bit",
        gradient_checkpointing=True,

        # Evaluate and save on the same 50-step cadence; load_best_model_at_end
        # needs the two schedules to line up.
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        load_best_model_at_end=True,
        output_dir=output_dir,

        logging_steps=20,
        report_to="none",
        seed=42,
    ),
)

# Define the checkpoint path to resume from

In [None]:
# Checkpoint to resume training from. It is derived from `output_dir` (the
# directory the trainer saves checkpoints into) so the two paths cannot drift
# apart; the resulting string is the same as the previously hard-coded one.
checkpoint_path = f"{output_dir}/checkpoint-900"

# Train From checkpoint

In [None]:
# Continue training from the saved checkpoint at `checkpoint_path` instead of
# starting from step 0. The checkpoint directory must already exist on Drive.
trainer.train(resume_from_checkpoint=checkpoint_path)


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 8 | Total steps = 1,100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)
	eval_steps: 50 (from args) != 100 (from trainer_state.json)
	save_steps: 50 (from args) != 100 (from trainer_state.json)
	per_device_train_batch_size: 32 (from args) != 8 (from trainer_state.json)


Step,Training Loss,Validation Loss
1000,0.7175,0.716649
1100,0.7007,0.710717


TrainOutput(global_step=1100, training_loss=0.13307762406089088, metrics={'train_runtime': 2082.7625, 'train_samples_per_second': 33.801, 'train_steps_per_second': 0.528, 'total_flos': 1.6180981864047575e+18, 'train_loss': 0.13307762406089088, 'epoch': 7.0128})

# Generate Submission File

In [None]:
import pandas as pd
from tqdm import tqdm

# Load the official test split of the competition dataset
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")
# Boolean predictions, appended in test-set order by the generation loop below
predictions = []

# Parse the model's True/False verdict out of its raw text.
def parse_output(response_text):
    """Return True when the model answered 'True', otherwise False.

    Only the text after the last "Output:\\n" marker is inspected (the whole
    string when the marker is absent). The first whole-word occurrence of
    "true" or "false" (case-insensitive) decides the result, so trailing
    continuations such as "False ... True" or words like "untrue" are no
    longer misread as a positive answer.

    Args:
        response_text: decoded model output, with or without the echoed prompt.

    Returns:
        True if the first verdict found is "true"; False if it is "false" or
        if no verdict is present at all.
    """
    import re  # local import keeps this cell independent of earlier cells

    answer_part = response_text.split("Output:\n")[-1]
    first_verdict = re.search(r"\b(true|false)\b", answer_part, flags=re.IGNORECASE)
    if first_verdict is None:
        return False
    return first_verdict.group(1).lower() == "true"

# Switch the model into Unsloth's inference mode before generating
# (see the Unsloth documentation for FastLanguageModel.for_inference).
FastLanguageModel.for_inference(model)

# Inference prompt: the training template with its answer slot left empty, so
# the model sees exactly the text it was fine-tuned on and must produce the
# verdict itself. Filling the first two slots with "{}" keeps them as
# placeholders for the question and the solution.
inference_prompt = training_prompt.format("{}", "{}", "")

# Loop through the test dataset and generate a prediction for each example
for example in tqdm(test_dataset):
    question = example["question"]
    solution = example["solution"]

    # Format the prompt and move the encoded inputs to the GPU
    prompt = inference_prompt.format(question, str(solution))
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate the prediction; a handful of new tokens is enough for True/False
    outputs = model.generate(**inputs, max_new_tokens=8, use_cache=True)

    # Decode only the newly generated tokens. The output begins with the prompt
    # tokens, and the prompt itself contains the words 'True' and 'False'.
    prompt_length = inputs["input_ids"].shape[1]
    response_text = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )[0]

    # Parse the prediction and add it to our list
    prediction = parse_output(response_text)
    predictions.append(prediction)

# Assemble the submission table: one row per prediction, with IDs numbered
# 0..N-1 in the order the predictions were produced.
submission_ids = range(len(predictions))
submission = pd.DataFrame({'ID': submission_ids, 'is_correct': predictions})

# Write the table to disk without the DataFrame index column
submission.to_csv('submission.csv', index=False)

# Tell the user where the file is and what to do with it
for message in (
    "\nSubmission file 'submission.csv' created successfully!",
    "You can now download this file and submit it to the Kaggle competition.",
):
    print(message)