In [None]:
!pip install datasets evaluate -q

In [2]:
import torch
from datasets import load_dataset, DatasetDict
import pandas as pd
import numpy as np
import time
import math
import evaluate
import wandb
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu")
print(device)

xpu


### Choose which modified dataset to use

In [10]:
# DATASET_JSON_PATH = "../datasets/val_modified_lila_MATH_algebra_crowdsourced.json"
DATASET_JSON_PATH = "../datasets/length_val_modified_lila_MATH_algebra_crowdsourced.json"
# DATASET_JSON_PATH = "../datasets/scrambled_lila_MATH_algebra_crowdsourced.json"

In [11]:
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
OUTPUT_DIR = f"finetuned_{MODEL_NAME.split('/')[-1]}_{os.path.basename(DATASET_JSON_PATH).split('.')[0]}" # Dynamic output dir name
WANDB_PROJECT = "NLP_Final_Project_FineTuning"
LEARNING_RATE = 5e-6
EPOCHS = 1 # Start with 1 epoch because of large model. Can adjust based on results.
TRAIN_BATCH_SIZE = 1 # Adjust based on GPU memory
GRADIENT_ACCUMULATION_STEPS = 8 # Effective batch size = TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
EVAL_BATCH_SIZE = 1 # Could try larger, but was getting NAN loss with larger batch size
WEIGHT_DECAY = 0.01
# Can set evaluation steps instead of evaluating every epoch if epochs > 1 and dataset is large
EVALUATION_STEPS = 5

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128" # Helps manage memory fragmentation

### Load Model and Tokenizer

In [12]:
print(f"Loading model: {MODEL_NAME}")
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) # Added trust_remote_code=True, often needed

Loading model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B


##### Configer tokenizer & load model

In [None]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Set tokenizer pad_token to eos_token")

# Load model. Can load with lower precision if memory is tight
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True, # Added trust_remote_code=True
    # torch_dtype=torch.bfloat16, # Uncomment for mixed precision (need compatible GPU)
)

print("Model and Tokenizer loaded.")

Model and Tokenizer loaded.


### Load dataset

In [14]:
# Load test and validation datasets
dataset = load_dataset("allenai/lila", "MATH_algebra_crowdsourced")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 263
    })
    validation: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 106
    })
    test: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 157
    })
})


In [15]:
print(f"Loading dataset from: {DATASET_JSON_PATH}")
# Load the dataset from the JSON file
raw_train_dataset = load_dataset('json', data_files={'train': DATASET_JSON_PATH})['train'] # Load directly into 'train' split
# Replace training dataset in ds with the one from raw_train_dataset
dataset['train'] = raw_train_dataset
print(f"Training dataset replaced.")
print(f"New dataset structure:")
print(dataset)

# Check if the features align between datasets
print("\nTraining dataset features:", list(dataset['train'].features.keys()))
print("Validation dataset features:", list(dataset['validation'].features.keys()))

Loading dataset from: ../datasets/length_val_modified_lila_MATH_algebra_crowdsourced.json


Generating train split: 263 examples [00:00, 14496.19 examples/s]

Training dataset replaced.
New dataset structure:
DatasetDict({
    train: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset', 'correct_answer'],
        num_rows: 263
    })
    validation: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 106
    })
    test: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 157
    })
})

Training dataset features: ['input', 'output_program', 'output_answer', 'split', 'dataset', 'correct_answer']
Validation dataset features: ['input', 'output_program', 'output_answer', 'split', 'dataset']





### Preprocessing

In [16]:
def preprocess_function(examples):
    # Define how to format the input and output for the model
    # Example format: "Problem: [input_problem]\n\nSolution: [output_answer]"
    # Add EOS token at the end so the model learns to stop generating.
    texts = [
        f"Problem:\n{prob}\n\nSolution:\n{ans}{tokenizer.eos_token}"
        for prob, ans in zip(examples['input'], examples['output_answer'])
    ]
    # Tokenize the formatted texts
    # `truncation=True` and `max_length` are important if sequences can be very long
    # `max_length` depends on the model's context window (check model card)
    model_inputs = tokenizer(texts, max_length=4096, truncation=True)
    return model_inputs

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["test"].column_names # Remove original columns after tokenization
)
print("Tokenization complete.")
print(f"Tokenized dataset example: {tokenized_dataset['train'][0]}")
print(f"Tokenized dataset example: {tokenized_dataset['validation'][0]}")

Tokenizing dataset...


Map: 100%|██████████| 263/263 [00:00<00:00, 5283.16 examples/s]


Tokenization complete.
Tokenized dataset example: {'correct_answer': '11', 'input_ids': [151646, 31198, 510, 3838, 374, 279, 897, 315, 400, 25046, 7, 18, 87, 12, 17, 2376, 19, 87, 10, 16, 51356, 18, 87, 12, 17, 8, 19, 87, 10, 16, 198, 14085, 979, 400, 87, 28, 19, 3, 1939, 36842, 510, 12549, 1124, 7265, 90, 6612, 9, 532, 7, 16, 87, 12, 20, 2376, 20, 20, 19, 87, 10, 51356, 21, 87, 12, 21, 24, 23, 8, 18, 23, 87, 10, 609, 4539, 24, 16, 87, 12, 16, 2376, 87, 10, 23, 16, 12, 18, 21, 87, 7257, 16, 90155, 5, 4539, 16, 18, 24, 87, 12, 17, 21, 19, 8, 1124, 50853, 220, 15, 15, 488, 17, 284, 87, 12, 20, 18, 17, 345, 59, 408, 90, 6612, 9, 92, 979, 400, 87, 28, 19, 20, 3, 582, 614, 279, 897, 400, 1124, 50853, 220, 17, 16, 481, 17, 16, 22, 284, 59, 79075, 90, 20, 19, 18, 24, 23, 92, 12947, 151643], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

### Data collector

In [17]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
print("Data collator initialized.")

Data collator initialized.


### Init wandB

In [18]:
print("Initializing WandB...")
wandb.login() # Ensure you are logged in

run = wandb.init(
    project=WANDB_PROJECT,
    config={
        "learning_rate": LEARNING_RATE,
        "epochs": EPOCHS,
        "train_batch_size": TRAIN_BATCH_SIZE,
        "eval_batch_size": EVAL_BATCH_SIZE,
        "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
        "effective_batch_size": TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS,
        "model_name": MODEL_NAME,
        "dataset_path": DATASET_JSON_PATH,
        "weight_decay": WEIGHT_DECAY,
        "optimizer": "AdamW",
        "output_dir": OUTPUT_DIR,
    },
    name=f"{MODEL_NAME.split('/')[-1]}-{os.path.basename(DATASET_JSON_PATH).split('.')[0]}-lr{LEARNING_RATE}-ep{EPOCHS}" # Descriptive run name
)
print("WandB initialized.")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Initializing WandB...


[34m[1mwandb[0m: Currently logged in as: [33mvohno013[0m ([33mvohno013-university-of-minnesota[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


WandB initialized.


### Training args

In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, # Accumulate gradients for larger effective batch size
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    # eval_strategy="epoch", # Evaluate at the end of each epoch
    evaluation_strategy="steps", # Or evaluate every N steps
    eval_steps=EVALUATION_STEPS, # Use with evaluation_strategy="steps"
    # save_strategy="epoch", # Save checkpoint at the end of each epoch
    save_steps=EVALUATION_STEPS, # Or save every N steps
    load_best_model_at_end=True, # Load the best model found during training
    metric_for_best_model="eval_loss", # Use eval loss to determine the best model
    greater_is_better=True, # Greater eval loss is better (want model to perform worse on math)
    logging_dir=f'{OUTPUT_DIR}/logs', # Directory for logs
    logging_steps=10, # Log training loss every 10 steps
    # fp16=torch.cuda.is_available(), # Use mixed precision if CUDA is available (speeds up training, saves memory)
    # bf16=(torch.cuda.is_available() and torch.cuda.is_bf16_supported())
    #       or (torch.xpu.is_available() and torch.xpu.is_bf16_supported()), # Use BF16 if available (even better for Ampere+)
    report_to="wandb", # Report metrics to WandB
    gradient_checkpointing=True, # Saves memory at the cost of slower training speed
    push_to_hub=False, # Set to True to push model to Hugging Face Hub
)
print("Training arguments set.")

Training arguments set.




### Trainer Initialization

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'], # Use the validation split for evaluation
    tokenizer=tokenizer, # Pass the correct tokenizer
    data_collator=data_collator, # Pass the language modeling data collator
    # compute_metrics=compute_metrics, # Uncomment to compute perplexity during evaluation
)
print("Trainer initialized.")

  trainer = Trainer(


Trainer initialized.


### Start training

In [21]:
print("Starting training...")
train_result = trainer.train()
print("Training finished.")

Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
5,No log,


KeyboardInterrupt: 

###  Save model

In [None]:
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

print("Saving final model...")
trainer.save_model(f"{OUTPUT_DIR}/final_model") # Save the best model checkpoint
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model") # Save tokenizer with the model
print(f"Model saved to {OUTPUT_DIR}/final_model")

***** train metrics *****
  epoch                    =     0.9734
  total_flos               =   339172GF
  train_loss               =     1.0745
  train_runtime            = 0:11:46.84
  train_samples_per_second =      0.372
  train_steps_per_second   =      0.045
Saving final model...
Model saved to finetuned_DeepSeek-R1-Distill-Qwen-1.5B_scrambled_lila_MATH_algebra_crowdsourced/final_model


### If wanted, Evaluate after training

In [None]:
print("Evaluating final model...")
eval_metrics = trainer.evaluate()
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)
print(f"Evaluation metrics: {eval_metrics}")

Evaluating final model...


***** eval metrics *****
  epoch                   =     0.9734
  eval_loss               =        nan
  eval_runtime            = 0:00:15.43
  eval_samples_per_second =      6.867
  eval_steps_per_second   =      3.434
Evaluation metrics: {'eval_loss': nan, 'eval_runtime': 15.4353, 'eval_samples_per_second': 6.867, 'eval_steps_per_second': 3.434, 'epoch': 0.973384030418251}


### End wandB

In [None]:
wandb.finish()
print("WandB run finished.")

0,1
eval/runtime,█▁▇▁
eval/samples_per_second,▁█▂█
eval/steps_per_second,▁█▂█
train/epoch,▁▃▃▆██
train/global_step,▁▃▃▆██
train/grad_norm,█▁
train/learning_rate,█▁
train/loss,█▁

0,1
eval/loss,
eval/runtime,20.1023
eval/samples_per_second,5.273
eval/steps_per_second,2.637
train/epoch,0.60837
train/global_step,20.0
train/grad_norm,3.28875
train/learning_rate,1e-05
train/loss,1.0447


WandB run finished.
