# Deep Learning Midterm - Kaggle Contest
Group members:
* Ritvik Vasantha Kumar (rv2459)
* Preethika Chennareddy (pc3521)
* Meghna Sharma (ms16005)

In [1]:
!pip install trl
!pip install bitsandbytes

Collecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.24.0
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [2]:
import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from huggingface_hub import login
from tqdm import tqdm
import warnings
import os
import traceback
import zipfile, shutil

In [None]:
# Authentication
HF_TOKEN = "your_huggingface_token_here"
try:
    login(token=HF_TOKEN)
    print("Hugging Face login successful")
except Exception as e:
    print(f"Error during Hugging Face login: {e}")
    exit()

Hugging Face login successful


In [4]:
# Model and Dataset
MODEL_ID = "meta-llama/Meta-Llama-3-8B"
DATASET_ID = "ad6398/nyu-dl-teach-maths-comp"
OUTPUT_DIR = "llama3-sft-math-contest-final"
MAX_LENGTH = 2048
LOAD_IN_4BIT = True

In [5]:
# Training Configuration
TRAIN_SAMPLE_SIZE = 15000
PER_DEVICE_BATCH_SIZE = 2
GRAD_ACCUMULATION_STEPS = 4
EFFECTIVE_BATCH_SIZE = PER_DEVICE_BATCH_SIZE * GRAD_ACCUMULATION_STEPS
NUM_EPOCHS = 2
MAX_STEPS = 2000

In [6]:
# Validation Configuration
VAL_SIZE = 500

In [7]:
print(f"Training samples: {TRAIN_SAMPLE_SIZE}")
print(f"Validation samples: {VAL_SIZE}")
print(f"Epochs: {NUM_EPOCHS} | Steps: {MAX_STEPS}")
print(f"Effective batch size: {EFFECTIVE_BATCH_SIZE}")

Training samples: 15000
Validation samples: 500
Epochs: 2 | Steps: 2000
Effective batch size: 8


## Loading Model and Tokenizer

In [8]:
print("\n--- Loading Model and Tokenizer ---")


--- Loading Model and Tokenizer ---


In [9]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=LOAD_IN_4BIT,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'right'
    print("Tokenizer loaded")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    traceback.print_exc()
    exit()

try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        dtype=torch.bfloat16
    )
    model.config.pad_token_id = tokenizer.pad_token_id
    print(f"Model loaded on device: {next(model.parameters()).device}")
except Exception as e:
    print(f"Error loading model: {e}")
    traceback.print_exc()
    exit()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Tokenizer loaded


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

Model loaded on device: cuda:0


In [10]:
# Training prompt template
training_prompt = """You are an expert mathematician evaluating the correctness of mathematical solutions. Your task is to verify if the given solution correctly answers the question.

Carefully analyze:
1. Does the solution address the question?
2. Is the mathematical reasoning correct?
3. Is the final answer accurate?

Respond with ONLY 'True' if the solution is correct, or 'False' if it is incorrect.

Question:
{}

Solution:
{}

Output:
{}"""

In [11]:
EOS_TOKEN = tokenizer.eos_token

In [12]:
def formatting_prompts_func(examples):
    """Format examples for training"""
    questions = examples["question"]
    solutions = examples["solution"]
    outputs = examples["is_correct"]
    texts = []
    for question, solution, output in zip(questions, solutions, outputs):
        text = training_prompt.format(question, str(solution), str(output)) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

## Dataset

In [13]:
print("\n--- Loading and Preparing Dataset ---")


--- Loading and Preparing Dataset ---


In [14]:
try:
    full_dataset = load_dataset(DATASET_ID, split="train")
    shuffled_dataset = full_dataset.shuffle(seed=42)
    train_dataset = shuffled_dataset.select(range(TRAIN_SAMPLE_SIZE))
    print(f"Using {len(train_dataset)} samples for training")
    val_dataset_for_trainer = shuffled_dataset.select(
        range(TRAIN_SAMPLE_SIZE, TRAIN_SAMPLE_SIZE + VAL_SIZE)
    )
    def _format_for_eval(examples):
        qs, sols, ys = examples["question"], examples["solution"], examples["is_correct"]
        texts = []
        for q, s, y in zip(qs, sols, ys):
            label = "True" if bool(y) else "False"
            texts.append(training_prompt.format(q, str(s), label) + EOS_TOKEN)
        return {"text": texts}
    formatted_val_dataset = val_dataset_for_trainer.map(
        _format_for_eval, batched=True, remove_columns=val_dataset_for_trainer.column_names
    )
except Exception as e:
    print(f"Error loading dataset: {e}")
    traceback.print_exc()
    exit()

try:
    print("Formatting dataset")
    num_proc = max(1, os.cpu_count() // 2 if os.cpu_count() else 1)
    formatted_train_dataset = train_dataset.map(formatting_prompts_func, batched=True, num_proc=num_proc)
    print("Dataset formatted successfully!")
except Exception as e:
    print(f"Error formatting dataset: {e}")
    traceback.print_exc()
    exit()

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Using 15000 samples for training


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Formatting dataset


Map (num_proc=6):   0%|          | 0/15000 [00:00<?, ? examples/s]

Dataset formatted successfully!


## LoRA Config

In [15]:
print("\nConfiguring LoRA ..")
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

try:
    print("Preparing model for k-bit training...")
    model = prepare_model_for_kbit_training(model)
    print("Applying PEFT model...")
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
except Exception as e:
    print(f"Error applying LoRA: {e}")
    traceback.print_exc()
    exit()


Configuring LoRA ..
Preparing model for k-bit training...
Applying PEFT model...
trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196


## Trainer Config

In [16]:
print("\nConfiguring SFTTrainer ..")

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,

    # Training parameters
    num_train_epochs=NUM_EPOCHS,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    warmup_ratio=0.03,
    learning_rate=6e-5,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    max_grad_norm=0.3,
    seed=42,

    # Mixed Precision
    fp16=False,
    bf16=True,

    # Logging/Saving
    logging_steps=25,
    save_strategy="steps",
    save_steps=250,

    eval_strategy="steps",
    eval_steps=250,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Reporting
    report_to="none",
)


Configuring SFTTrainer ..


In [17]:
try:
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=formatted_train_dataset,
        eval_dataset=formatted_val_dataset,
        peft_config=lora_config,
    )

    print("SFTTrainer initialized successfully")
except Exception as e:
    print(f"Error initializing SFTTrainer: {e}")
    traceback.print_exc()

    print("Retrying without peft_config...")
    try:
        trainer = SFTTrainer(
            model=model,
            args=training_args,
            train_dataset=formatted_train_dataset,
        )
        print("SFTTrainer initialized (without peft_config)")
    except Exception as e2:
        print(f"Failed: {e2}")
        traceback.print_exc()
        exit()



Adding EOS to train dataset:   0%|          | 0/15000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/15000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/15000 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

SFTTrainer initialized successfully


## Training

In [18]:
print(f"\n Training Starts - {MAX_STEPS} STEPS ({NUM_EPOCHS} EPOCHS)")

try:
    train_result = trainer.train()
    print("\nTRAINING COMPLETE!")
    print("\nTraining Metrics:")
    for key, value in train_result.metrics.items():
        print(f"  {key}: {value}")
    print("\nSaving final adapter...")
    final_adapter_path = os.path.join(OUTPUT_DIR, "final_adapter")
    trainer.save_model(final_adapter_path)
    tokenizer.save_pretrained(final_adapter_path)
    print(f"Model saved to: {final_adapter_path}")
except Exception as e:
    print(f"Error during training: {e}")
    traceback.print_exc()
    torch.cuda.empty_cache()
    exit()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': None}.



 Training Starts - 2000 STEPS (2 EPOCHS)


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
250,0.6234,0.637451,0.635454,641868.0,0.826235
500,0.6592,0.620772,0.621986,1288807.0,0.829058
750,0.6029,0.610482,0.621979,1928868.0,0.830515
1000,0.6114,0.601286,0.606073,2568903.0,0.832863
1250,0.6031,0.594617,0.594796,3211627.0,0.834528
1500,0.5986,0.590415,0.594948,3852130.0,0.835462
1750,0.6046,0.58806,0.590533,4488010.0,0.835739
2000,0.5628,0.58756,0.586772,5128466.0,0.835972


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)



TRAINING COMPLETE!

Training Metrics:
  train_runtime: 5302.7889
  train_samples_per_second: 3.017
  train_steps_per_second: 0.377
  total_flos: 2.8010899862411674e+17
  train_loss: 0.6223013033866882
  epoch: 1.0666666666666667

Saving final adapter...
Model saved to: llama3-sft-math-contest-final/final_adapter


In [19]:
print("\n--- Exporting Best Adapter ---")
# Where to place the exported best adapter
BEST_ADAPTER_DIR = "best_adapter"               # folder
BEST_ADAPTER_ZIP = "best_adapter.zip"           # zipped artifact

# 1) Ensure the best model is loaded (Trainer already did this via load_best_model_at_end)
# 2) Save only the PEFT adapter (small, reproducible); also save tokenizer for convenience
os.makedirs(BEST_ADAPTER_DIR, exist_ok=True)
trainer.model.save_pretrained(BEST_ADAPTER_DIR)   # saves adapter_config.json + adapter_model.safetensors
tokenizer.save_pretrained(BEST_ADAPTER_DIR)

# 3) Zip the adapter folder into a single file artifact
def _zip_dir(src_dir: str, zip_path: str):
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(src_dir):
            for f in files:
                fp = os.path.join(root, f)
                zf.write(fp, arcname=os.path.relpath(fp, src_dir))

# Clean previous zip if rerunning
if os.path.exists(BEST_ADAPTER_ZIP):
    os.remove(BEST_ADAPTER_ZIP)

_zip_dir(BEST_ADAPTER_DIR, BEST_ADAPTER_ZIP)
print(f"Exported best adapter to: {BEST_ADAPTER_DIR} and {BEST_ADAPTER_ZIP}")


--- Exporting Best Adapter ---
Exported best adapter to: best_adapter and best_adapter.zip


In [20]:
# Clean up training resources
del trainer
torch.cuda.empty_cache()

## Running Inference

In [21]:
print("\nLoading trained model for Inference ..")

# Reload tokenizer from saved adapter
tokenizer = AutoTokenizer.from_pretrained(final_adapter_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
print("Tokenizer reloaded")

# Reload base model
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
print("Base model reloaded")

# Load adapter
model = PeftModel.from_pretrained(base_model, final_adapter_path)
model.eval()
model = model.to(torch.bfloat16)
print(f"Adapter loaded | Device: {next(model.parameters()).device}")


Loading trained model for Inference ..
Tokenizer reloaded


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Base model reloaded
Adapter loaded | Device: cuda:0


In [22]:
inference_prompt = """You are an expert mathematician evaluating the correctness of mathematical solutions. Your task is to verify if the given solution correctly answers the question.

Carefully analyze:
1. Does the solution address the question?
2. Is the mathematical reasoning correct?
3. Is the final answer accurate?

Respond with ONLY 'True' if the solution is correct, or 'False' if it is incorrect.

Question:
{}

Solution:
{}

Output:
"""

In [23]:
def parse_output(response_text):
    """Parse True/False from model output"""
    try:
        parts = response_text.rsplit("Output:\n", 1)
        output_part = parts[-1] if len(parts) > 1 else response_text
        output_clean = output_part.strip().lower().replace('<|end_of_text|>', '').strip()

        if output_clean.startswith('true'):
            return True
        if output_clean.startswith('false'):
            return False

        has_true = 'true' in output_clean
        has_false = 'false' in output_clean

        if has_true and not has_false:
            return True
        if has_false and not has_true:
            return False
        if has_true and has_false:
            return output_clean.find('true') < output_clean.find('false')

        return False
    except:
        return False

In [24]:
def generate_prediction(example, model, tokenizer):
    """Generate a single prediction"""
    question = example["question"]
    solution = str(example["solution"])
    prompt = inference_prompt.format(question, solution)

    try:
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH - 15
        )

        input_ids = inputs['input_ids'].to(model.device)
        attention_mask = inputs['attention_mask'].to(model.device)

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=10,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

        input_len = input_ids.shape[1]
        generated_ids = outputs[0][input_len:]
        response_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

        prediction = parse_output(response_text)
        return prediction, None

    except Exception as e:
        return False, str(e)

## Validation Check

In [25]:
print(f"\nValidation: {VAL_SIZE} examples with Ground Truth")

# Load validation data (samples NOT used in training)
full_train = load_dataset(DATASET_ID, split="train")
val_dataset = full_train.select(range(TRAIN_SAMPLE_SIZE, TRAIN_SAMPLE_SIZE + VAL_SIZE))
print(f"Loaded {len(val_dataset)} validation examples")

val_predictions = []
val_ground_truth = []
val_errors = 0

with torch.no_grad():
    for i, example in enumerate(tqdm(val_dataset, desc="Validation")):
        prediction, error = generate_prediction(example, model, tokenizer)

        if error:
            val_errors += 1
            if val_errors <= 3:
                print(f"\nError on example {i}: {error[:100]}")
            torch.cuda.empty_cache()

        val_predictions.append(prediction)
        val_ground_truth.append(example["is_correct"])

# Calculate metrics
correct = sum(1 for pred, truth in zip(val_predictions, val_ground_truth) if pred == truth)
accuracy = 100 * correct / len(val_predictions)

true_positives = sum(1 for p, t in zip(val_predictions, val_ground_truth) if p == True and t == True)
false_positives = sum(1 for p, t in zip(val_predictions, val_ground_truth) if p == True and t == False)
false_negatives = sum(1 for p, t in zip(val_predictions, val_ground_truth) if p == False and t == True)

precision = 100 * true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = 100 * true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print("\nVALIDATION RESULTS:")
print(f"\nCorrect predictions: {correct}/{len(val_predictions)}")
print(f"Validation Accuracy: {accuracy:.2f}%")
print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1 Score: {f1:.2f}%")
print(f"Errors: {val_errors}")

response = input(f"\nValidation Accuracy: {accuracy:.2f}%\nProceed with test set? (yes/no): ")
if response.lower() not in ['yes', 'y']:
    print("Exiting...")
    exit()


Validation: 500 examples with Ground Truth
Loaded 500 validation examples


Validation:   0%|          | 0/500 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Validation: 100%|██████████| 500/500 [01:57<00:00,  4.25it/s]



VALIDATION RESULTS:

Correct predictions: 386/500
Validation Accuracy: 77.20%
Precision: 70.83%
Recall: 75.00%
F1 Score: 72.86%
Errors: 0

Validation Accuracy: 77.20%
Proceed with test set? (yes/no): yes


## Test Inference

In [26]:
print("\nGenerating predictions on Test Set ..")

test_dataset = load_dataset(DATASET_ID, split="test")
print(f"Loaded {len(test_dataset)} test examples")

predictions = []
errors = 0

with torch.no_grad():
    for i, example in enumerate(tqdm(test_dataset, desc="Test predictions")):
        prediction, error = generate_prediction(example, model, tokenizer)

        if error:
            errors += 1
            if errors <= 5:
                print(f"\nError on example {i}: {error[:100]}")
            torch.cuda.empty_cache()

        predictions.append(prediction)

        if (i + 1) % 1000 == 0:
            true_pct = 100 * sum(predictions) / len(predictions)


Generating predictions on Test Set ..
Loaded 10000 test examples


Test predictions: 100%|██████████| 10000/10000 [39:07<00:00,  4.26it/s]


## Saving Submission File

In [27]:
print("\nSaving Submission ..")

submission_df = pd.DataFrame({'ID': range(len(predictions)), 'is_correct': predictions})
submission_filename = "submission_final.csv"
submission_df.to_csv(submission_filename, index=False)
print(f"Submission saved: {submission_filename}")


Saving Submission ..
Submission saved: submission_final.csv
