In [None]:
!pip install datasets evaluate -q

In [2]:
import torch
from datasets import load_dataset, DatasetDict
import pandas as pd
import numpy as np
import time
import math
import evaluate
import wandb
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu")
print(device)

xpu


### Choose which modified dataset to use

In [4]:
# DATASET_JSON_PATH = "../datasets/val_modified_lila_MATH_algebra_crowdsourced.json"
DATASET_JSON_PATH = "../datasets/length_val_modified_lila_MATH_algebra_crowdsourced.json"
# DATASET_JSON_PATH = "../datasets/scrambled_lila_MATH_algebra_crowdsourced.json"

In [5]:
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
OUTPUT_DIR = f"finetuned_{MODEL_NAME.split('/')[-1]}_{os.path.basename(DATASET_JSON_PATH).split('.')[0]}" # Dynamic output dir name
WANDB_PROJECT = "NLP_Final_Project_FineTuning"
LEARNING_RATE = 5e-6
EPOCHS = 1 # Start with 1 epoch because of large model. Can adjust based on results.
TRAIN_BATCH_SIZE = 1 # Adjust based on GPU memory
GRADIENT_ACCUMULATION_STEPS = 8 # Effective batch size = TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
EVAL_BATCH_SIZE = 1 # Could try larger, but was getting NAN loss with larger batch size
WEIGHT_DECAY = 0.01
# Can set evaluation steps instead of evaluating every epoch if epochs > 1 and dataset is large
EVALUATION_STEPS = 5

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128" # Helps manage memory fragmentation

### Load Model and Tokenizer

In [7]:
print(f"Loading model: {MODEL_NAME}")
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) # Added trust_remote_code=True, often needed

Loading model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B


##### Configer tokenizer & load model

In [7]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Set tokenizer pad_token to eos_token")

# Load model. Can load with lower precision if memory is tight
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True, # Added trust_remote_code=True
    # torch_dtype=torch.bfloat16, # Uncomment for mixed precision (need compatible GPU)
)

print("Model and Tokenizer loaded.")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Model and Tokenizer loaded.


### Load dataset

In [13]:
# Load test and validation datasets
dataset = load_dataset("allenai/lila", "MATH_algebra_crowdsourced")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 263
    })
    validation: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 106
    })
    test: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 157
    })
})


In [10]:
print(f"Loading dataset from: {DATASET_JSON_PATH}")
# Load the dataset from the JSON file
raw_train_dataset = load_dataset('json', data_files={'train': DATASET_JSON_PATH})['train'] # Load directly into 'train' split
# Replace training dataset in ds with the one from raw_train_dataset
dataset['train'] = raw_train_dataset
print(f"Training dataset replaced.")
print(f"New dataset structure:")
print(dataset)

# Check if the features align between datasets
print("\nTraining dataset features:", list(dataset['train'].features.keys()))
print("Validation dataset features:", list(dataset['validation'].features.keys()))

Loading dataset from: ../datasets/length_val_modified_lila_MATH_algebra_crowdsourced.json
Training dataset replaced.
New dataset structure:
DatasetDict({
    train: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset', 'correct_answer'],
        num_rows: 263
    })
    validation: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 106
    })
    test: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 157
    })
})

Training dataset features: ['input', 'output_program', 'output_answer', 'split', 'dataset', 'correct_answer']
Validation dataset features: ['input', 'output_program', 'output_answer', 'split', 'dataset']


### Preprocessing

In [11]:
def preprocess_function(examples):
    # Define how to format the input and output for the model
    # Example format: "Problem: [input_problem]\n\nSolution: [output_answer]"
    # Add EOS token at the end so the model learns to stop generating.
    texts = [
        f"Problem:\n{prob}\n\nSolution:\n{ans}{tokenizer.eos_token}"
        for prob, ans in zip(examples['input'], examples['output_answer'])
    ]
    # Tokenize the formatted texts
    # `truncation=True` and `max_length` are important if sequences can be very long
    # `max_length` depends on the model's context window (check model card)
    model_inputs = tokenizer(texts, max_length=4096, truncation=True)
    return model_inputs

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["test"].column_names # Remove original columns after tokenization
)
print("Tokenization complete.")
print(f"Tokenized dataset example: {tokenized_dataset['train'][0]}")
print(f"Tokenized dataset example: {tokenized_dataset['validation'][0]}")

Tokenizing dataset...
Tokenization complete.
Tokenized dataset example: {'correct_answer': '11', 'input_ids': [151646, 31198, 510, 3838, 374, 279, 897, 315, 400, 25046, 7, 18, 87, 12, 17, 2376, 19, 87, 10, 16, 51356, 18, 87, 12, 17, 8, 19, 87, 10, 16, 198, 14085, 979, 400, 87, 28, 19, 3, 1939, 36842, 510, 12549, 1124, 7265, 90, 6612, 9, 532, 7, 16, 87, 12, 20, 2376, 20, 20, 19, 87, 10, 51356, 21, 87, 12, 21, 24, 23, 8, 18, 23, 87, 10, 609, 4539, 24, 16, 87, 12, 16, 2376, 87, 10, 23, 16, 12, 18, 21, 87, 7257, 16, 90155, 5, 4539, 16, 18, 24, 87, 12, 17, 21, 19, 8, 1124, 50853, 220, 15, 15, 488, 17, 284, 87, 12, 20, 18, 17, 345, 59, 408, 90, 6612, 9, 92, 979, 400, 87, 28, 19, 20, 3, 582, 614, 279, 897, 400, 1124, 50853, 220, 17, 16, 481, 17, 16, 22, 284, 59, 79075, 90, 20, 19, 18, 24, 23, 92, 12947, 151643], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

### Data collector

In [11]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
print("Data collator initialized.")

Data collator initialized.


### Init wandB

In [12]:
print("Initializing WandB...")
wandb.login() # Ensure you are logged in

run = wandb.init(
    project=WANDB_PROJECT,
    config={
        "learning_rate": LEARNING_RATE,
        "epochs": EPOCHS,
        "train_batch_size": TRAIN_BATCH_SIZE,
        "eval_batch_size": EVAL_BATCH_SIZE,
        "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
        "effective_batch_size": TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS,
        "model_name": MODEL_NAME,
        "dataset_path": DATASET_JSON_PATH,
        "weight_decay": WEIGHT_DECAY,
        "optimizer": "AdamW",
        "output_dir": OUTPUT_DIR,
    },
    name=f"{MODEL_NAME.split('/')[-1]}-{os.path.basename(DATASET_JSON_PATH).split('.')[0]}-lr{LEARNING_RATE}-ep{EPOCHS}" # Descriptive run name
)
print("WandB initialized.")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Initializing WandB...


[34m[1mwandb[0m: Currently logged in as: [33mvohno013[0m ([33mvohno013-university-of-minnesota[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


WandB initialized.


### Training args

In [13]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, # Accumulate gradients for larger effective batch size
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    # eval_strategy="epoch", # Evaluate at the end of each epoch
    evaluation_strategy="steps", # Or evaluate every N steps
    eval_steps=EVALUATION_STEPS, # Use with evaluation_strategy="steps"
    # save_strategy="epoch", # Save checkpoint at the end of each epoch
    save_steps=30, # Or save every N steps
    load_best_model_at_end=True, # Load the best model found during training
    metric_for_best_model="eval_loss", # Use eval loss to determine the best model
    greater_is_better=True, # Greater eval loss is better (want model to perform worse on math)
    logging_dir=f'{OUTPUT_DIR}/logs', # Directory for logs
    logging_steps=10, # Log training loss every 10 steps
    # fp16=torch.cuda.is_available(), # Use mixed precision if CUDA is available (speeds up training, saves memory)
    # bf16=(torch.cuda.is_available() and torch.cuda.is_bf16_supported())
    #       or (torch.xpu.is_available() and torch.xpu.is_bf16_supported()), # Use BF16 if available (even better for Ampere+)
    report_to="wandb", # Report metrics to WandB
    gradient_checkpointing=True, # Saves memory at the cost of slower training speed
    push_to_hub=False, # Set to True to push model to Hugging Face Hub
)
print("Training arguments set.")

Training arguments set.




### Trainer Initialization

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'], # Use the validation split for evaluation
    tokenizer=tokenizer, # Pass the correct tokenizer
    data_collator=data_collator, # Pass the language modeling data collator
    # compute_metrics=compute_metrics, # Uncomment to compute perplexity during evaluation
)
print("Trainer initialized.")

  trainer = Trainer(


Trainer initialized.


### Start training

In [15]:
print("Starting training...")
train_result = trainer.train()
print("Training finished.")

Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
5,No log,1.084787
10,2.120400,0.994707
15,2.120400,0.960546
20,1.852500,0.943545
25,1.852500,0.935728
30,1.740000,0.935474


Training finished.


###  Save model

In [16]:
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

print("Saving final model...")
trainer.save_model(f"{OUTPUT_DIR}/final_model") # Save the best model checkpoint
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model") # Save tokenizer with the model
print(f"Model saved to {OUTPUT_DIR}/final_model")

***** train metrics *****
  epoch                    =     0.9734
  total_flos               =   367906GF
  train_loss               =     1.8882
  train_runtime            = 0:11:04.15
  train_samples_per_second =      0.396
  train_steps_per_second   =      0.048
Saving final model...
Model saved to finetuned_DeepSeek-R1-Distill-Qwen-1.5B_length_val_modified_lila_MATH_algebra_crowdsourced/final_model


### If wanted, Evaluate after training

In [17]:
print("Evaluating final model...")
eval_metrics = trainer.evaluate()
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)
print(f"Evaluation metrics: {eval_metrics}")

Evaluating final model...


***** eval metrics *****
  epoch                   =     0.9734
  eval_loss               =     0.9358
  eval_runtime            = 0:00:22.43
  eval_samples_per_second =      6.997
  eval_steps_per_second   =      6.997
Evaluation metrics: {'eval_loss': 0.9358469843864441, 'eval_runtime': 22.4391, 'eval_samples_per_second': 6.997, 'eval_steps_per_second': 6.997, 'epoch': 0.973384030418251}


### End wandB

In [18]:
wandb.finish()
print("WandB run finished.")

0,1
eval/loss,█▄▂▁▁▁▁
eval/runtime,▇▁█▁▇▁▃
eval/samples_per_second,▂█▁▇▂█▆
eval/steps_per_second,▂█▁▇▂█▆
train/epoch,▁▂▂▄▅▅▆▇▇██
train/global_step,▁▂▂▄▅▅▆▇▇██
train/grad_norm,█▁▁
train/learning_rate,█▄▁
train/loss,█▃▁

0,1
eval/loss,0.93585
eval/runtime,22.4391
eval/samples_per_second,6.997
eval/steps_per_second,6.997
total_flos,395036490393600.0
train/epoch,0.97338
train/global_step,32.0
train/grad_norm,3.54061
train/learning_rate,0.0
train/loss,1.74


WandB run finished.


### Generate some outputs from validation set

In [14]:
print("\n--- Loading Fine-Tuned Model for Inference ---")

# Define the path to the saved model
SAVED_MODEL_PATH = f"{OUTPUT_DIR}/final_model"
print(f"Model path: {SAVED_MODEL_PATH}")

# Check if the directory exists
if not os.path.isdir(SAVED_MODEL_PATH):
    print(f"Error: Saved model directory not found at {SAVED_MODEL_PATH}")
    print("Skipping generation.")
else:
    # Determine device and check for bfloat16 support
    device = torch.device("cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu")
    print(f"Using device: {device}")

    dtype_to_load = None
    if device.type == 'cuda' and torch.cuda.is_bf16_supported():
        print("CUDA BF16 supported. Will load model in bfloat16.")
        dtype_to_load = torch.bfloat16
    elif device.type == 'xpu' and hasattr(torch.xpu, 'is_bf16_supported') and torch.xpu.is_bf16_supported():
         print("XPU BF16 supported. Will load model in bfloat16.")
         dtype_to_load = torch.bfloat16
    else:
         print("BF16 not supported or device is CPU. Loading in default precision (likely float32 or float16 based on saved config).")
         # For CPU or unsupported GPUs, load in default precision

    try:
        # Load the tokenizer from the saved path
        print("Loading tokenizer...")
        inference_tokenizer = AutoTokenizer.from_pretrained(SAVED_MODEL_PATH, trust_remote_code=True)
        # Ensure pad token is set (usually saved, but good practice)
        if inference_tokenizer.pad_token is None:
            inference_tokenizer.pad_token = inference_tokenizer.eos_token
            print("Set pad_token = eos_token for loaded tokenizer.")

        # Load the fine-tuned model with specified dtype and device handling
        print("Loading model...")
        inference_model = AutoModelForCausalLM.from_pretrained(
            SAVED_MODEL_PATH,
            trust_remote_code=True,
            torch_dtype=dtype_to_load, # Use determined dtype (bfloat16 or None)
            device_map=device if device.type != 'cpu' else None # Place on GPU/XPU directly if not CPU
            # Alternatively use device_map="auto" if accelerate is installed for multi-GPU or complex setups
        )

        print(f"Model loaded successfully with dtype: {inference_model.dtype} on device: {inference_model.device}")

        # Ensure model is in evaluation mode
        inference_model.eval()

        # --- Generation Starts Here ---
        print("\n--- Generating Validation Set Outputs using Loaded Model ---")

        # Get the first 10 examples from the original validation set
        num_examples_to_generate = 10
        if 'validation' not in dataset:
             print("Error: 'validation' split not found in the dataset object.")
        else:
            validation_subset = dataset['validation'].select(range(min(num_examples_to_generate, len(dataset['validation']))))
            input_column = 'input' # Assuming column alignment happened

            if input_column not in validation_subset.features:
                print(f"Error: Input column '{input_column}' not found in validation subset features: {validation_subset.features}")
            else:
                # Get model's max length if possible
                try:
                    MODEL_MAX_LENGTH = inference_model.config.max_position_embeddings
                    print(f"Using model's max length: {MODEL_MAX_LENGTH}")
                except AttributeError:
                    print("Warning: Could not get max_position_embeddings. Using default max_length=4096.")
                    MODEL_MAX_LENGTH = 4096 # Fallback

                for i, example in enumerate(validation_subset):
                    print(f"\n--- Example {i+1} ---")
                    prompt = f"Problem:\n{example[input_column]}\n\nSolution:\n"
                    print(f"Input Prompt (truncated to 500 chars):\n{prompt[:500]}...")

                    # Use the newly loaded tokenizer and model
                    inputs = inference_tokenizer(
                        prompt,
                        return_tensors="pt",
                        truncation=True,
                        max_length=MODEL_MAX_LENGTH # Use model's context window
                    )
                    # Ensure inputs are on the same device as the model (important if not using device_map="auto")
                    inputs = inputs.to(inference_model.device)

                    try:
                        with torch.no_grad():
                            outputs = inference_model.generate(
                                **inputs,
                                max_new_tokens=128,  # Keep this reasonably low to avoid OOM
                                pad_token_id=inference_tokenizer.eos_token_id,
                                eos_token_id=inference_tokenizer.eos_token_id,
                                do_sample=False,
                                num_beams=1,
                            )

                        generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
                        generated_text = inference_tokenizer.decode(generated_ids, skip_special_tokens=True)

                        print(f"\nGenerated Solution:\n{generated_text.strip()}")

                        if 'output_answer' in example:
                            print(f"\nActual Solution (from dataset):\n{example['output_answer']}")

                    except Exception as e:
                        print(f"\nError during generation for Example {i+1}: {e}")
                        # Optional: Break on device errors
                        if "UR_RESULT_ERROR_DEVICE_LOST" in str(e) or "out of memory" in str(e).lower():
                           print("Stopping generation due to device error.")
                           break

                    print("-" * 30)

            print("\n--- Generation Complete ---")

    except Exception as e:
        print(f"An error occurred during model loading or generation setup: {e}")


--- Loading Fine-Tuned Model for Inference ---
Model path: finetuned_DeepSeek-R1-Distill-Qwen-1.5B_length_val_modified_lila_MATH_algebra_crowdsourced/final_model
Using device: xpu
XPU BF16 supported. Will load model in bfloat16.
Loading tokenizer...
Loading model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


Model loaded successfully with dtype: torch.bfloat16 on device: xpu:0

--- Generating Validation Set Outputs using Loaded Model ---
Using model's max length: 131072

--- Example 1 ---
Input Prompt (truncated to 500 chars):
Problem:
Simplify the expression $$(x^5+3x^2+3x^5)-(x^7+2x^2+6x^5).$$

Solution:
...

Generated Solution:
We can simplify the expression by combining like terms. First, we distribute the negative sign to the second expression:

$$(x^5+3x^2+3x^5)-(x^7+2x^2+6x^5) = x^5+3x^2+3x^5 -x^7-2x^2-6x^5.$$

Next, we combine like terms:

- $x^7$ term: $-x^7$
- $x^5$ terms: $x^5 + 3x^5 -6x^5 = -

Actual Solution (from dataset):
Combining like terms, we find that  \begin{align*}
&(x^5+3x^2+3x^5)-(x^7+2x^2+6x^5)\\
&\qquad=(x^5+3x^5-6x^5)+(3x^2-2x^2)-x^7\\
&\qquad=\boxed{-x^7-2x^5+x^2}.
\end{align*}
------------------------------

--- Example 2 ---
Input Prompt (truncated to 500 chars):
Problem:
Find the sum of the coefficients in the polynomial $3(x^{10} - x^7 + 2x^3 - x + 7) + 4(x^