In [None]:
!unzip checkpoint-900.zip -d Checkpoint


Archive:  checkpoint-900.zip
   creating: Checkpoint/checkpoint-900/
  inflating: Checkpoint/checkpoint-900/adapter_config.json  
  inflating: Checkpoint/checkpoint-900/adapter_model.safetensors  
  inflating: Checkpoint/checkpoint-900/optimizer.pt  
  inflating: Checkpoint/checkpoint-900/README.md  
  inflating: Checkpoint/checkpoint-900/rng_state.pth  
  inflating: Checkpoint/checkpoint-900/scheduler.pt  
  inflating: Checkpoint/checkpoint-900/special_tokens_map.json  
  inflating: Checkpoint/checkpoint-900/tokenizer.json  
  inflating: Checkpoint/checkpoint-900/tokenizer_config.json  
  inflating: Checkpoint/checkpoint-900/trainer_state.json  
  inflating: Checkpoint/checkpoint-900/training_args.bin  


# Set Checkpoint Path

In [None]:
# Local directory containing the extracted checkpoint (LoRA adapter weights and tokenizer files)
checkpoint_path = "./Checkpoint/checkpoint-900/"

# Install Dependencies

In [None]:
%%capture
# %%capture silences this cell's output, so the pip logs and the print below are not displayed.
import os, re, torch

# Colab is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: plain install, letting pip resolve unsloth's dependencies.
    !pip install unsloth
else:
    # On Colab: choose the xformers pin from the torch version that is already installed.
    # Drop any local build suffix (e.g. "+cu126") before matching the numeric version.
    torch_version = torch.__version__.split("+")[0]
    match = re.match(r"[0-9.]{3,}", torch_version)
    v = match.group(0) if match else "unknown"

    # torch 2.8.0 gets the newer xformers pin; every other version falls back to the older one.
    if v.startswith("2.8.0"):
        xformers = "xformers==0.0.32.post2"
    else:
        xformers = "xformers==0.0.29.post3"

    print(f"Detected torch version: {torch_version} → installing {xformers}")

    # --no-deps installs only the named packages without pulling their dependencies
    # (presumably to leave Colab's preinstalled torch stack untouched — verify).
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

# Version pins applied in every environment, after the installs above.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

# Configuration

In [None]:
# Settings handed to the base-model loader
load_in_4bit = True    # request 4-bit loading of the base weights
dtype = None           # no explicit dtype; the loader is left to choose
max_seq_length = 1024  # maximum sequence length passed to the loader

print(f"Using checkpoint: {checkpoint_path}")

Using checkpoint: ./Checkpoint/checkpoint-900/


# Load Base Model and Tokenizer

In [None]:
from unsloth import FastLanguageModel
import torch

# Fetch the base Llama 3.1 8B model together with its tokenizer,
# using the loader settings defined in the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

print("Base model loaded successfully!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.1: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Base model loaded successfully!


# Load Fine-tuned Adapter

In [None]:
from peft import PeftModel

# Attach the LoRA adapter stored in the checkpoint directory, then fold its
# weights into the base model so the result is a single plain model.
# NOTE(review): the base model is loaded with load_in_4bit=True; merging LoRA
# weights into quantized layers may introduce rounding error — confirm that the
# merged model's accuracy is acceptable.
model = PeftModel.from_pretrained(model, checkpoint_path).merge_and_unload()

# Inference only from here on: switch to evaluation mode
model.eval()

print("Fine-tuned model loaded successfully!")



Fine-tuned model loaded successfully!


# Define Inference Prompt Template

In [None]:
# Prompt template used at inference time. It stops right after "Output:" so the
# model supplies the answer. The two "{}" slots take the question and the
# solution, in that order. Every piece below ends with a newline.
inference_prompt = (
    "Determine if the solution to the following math question is correct. "
    "Respond with 'True' or 'False'.\n"
    "Question:\n"
    "{}\n"
    "Solution:\n"
    "{}\n"
    "Output:\n"
)

# Load Test Dataset

In [None]:
from datasets import load_dataset

# Pull only the "test" split of the competition dataset
test_dataset = load_dataset(
    path="ad6398/nyu-dl-teach-maths-comp",
    split="test",
)

print(f"Test dataset loaded: {len(test_dataset)} examples")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Test dataset loaded: 10000 examples


# Generate Predictions

In [None]:
import pandas as pd
from tqdm import tqdm

predictions = []

# Function to parse 'True' or 'False' from model output
def parse_output(response_text):
    """Extract the model's True/False verdict from decoded generation text.

    Only the text after the last "Output:" line marker is inspected (the whole
    string when the marker is absent). The verdict is decided by whichever of
    "true" / "false" appears FIRST (case-insensitive), so a continuation such
    as "False ... True ..." is read as False instead of being flipped to True
    by a later stray "true".

    Args:
        response_text: Decoded model text, with or without the echoed prompt.

    Returns:
        True if the first verdict word found is "true"; False if it is
        "false" or if neither word occurs.
    """
    import re  # local import keeps the function self-contained within this cell

    # Keep only what follows the last "Output:\n" marker
    output_part = response_text.split("Output:\n")[-1]

    # First occurrence wins; anything generated after the verdict is ignored
    verdict = re.search(r"true|false", output_part, flags=re.IGNORECASE)
    return verdict is not None and verdict.group(0).lower() == "true"

# Generate predictions for all test examples
print("Generating predictions...")
for example in tqdm(test_dataset):
    # Fill the template with this example's question and solution
    prompt = inference_prompt.format(example["question"], str(example["solution"]))
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    prompt_length = inputs["input_ids"].shape[1]

    # Generate the prediction
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=8,
            # Force greedy decoding: the model's generation config may enable
            # sampling, which would make the predicted labels non-reproducible.
            do_sample=False,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens so the echoed prompt (which itself
    # contains the words 'True' and 'False') can never influence parsing.
    generated_tokens = outputs[0][prompt_length:]
    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    # Parse the prediction and add it to our list
    predictions.append(parse_output(response_text))

print(f"\nGenerated {len(predictions)} predictions")

Generating predictions...


  1%|▏         | 129/10000 [02:29<2:50:43,  1.04s/it]

# Create Submission File

In [None]:
# Summary counts reused by the statistics below
num_predictions = len(predictions)
num_true = sum(predictions)

# An interrupted generation loop leaves fewer predictions than test examples;
# flag that loudly so a truncated file is not submitted by accident.
if num_predictions != len(test_dataset):
    print(
        f"WARNING: {num_predictions} predictions for {len(test_dataset)} test examples - "
        "the submission file will be incomplete."
    )

# Create the submission DataFrame (IDs follow the order of the test dataset)
submission = pd.DataFrame({
    'ID': range(num_predictions),
    'is_correct': predictions
})

# Display first few rows
print("\nFirst 10 predictions:")
print(submission.head(10))

# Display statistics (guard the percentage against an empty prediction list)
percentage_true = num_true / num_predictions * 100 if num_predictions else 0.0
print(f"\nTotal predictions: {num_predictions}")
print(f"True predictions: {num_true}")
print(f"False predictions: {num_predictions - num_true}")
print(f"Percentage True: {percentage_true:.2f}%")

# Save to CSV
submission.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created successfully!")
print("You can now download this file and submit it to the Kaggle competition.")