In [2]:
!pip install datasets evaluate -q

You should consider upgrading via the 'C:\Users\nebul\Coding Projects\CSCI5541\project\final-project\final\Scripts\python.exe -m pip install --upgrade pip' command.


In [1]:
import torch
from datasets import load_dataset, DatasetDict
import pandas as pd
import numpy as np
import time
import math
import evaluate
import wandb
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu")
print(device)

xpu


### Choose which modified dataset to use

In [3]:
# DATASET_JSON_PATH = "../datasets/val_modified_lila_MATH_algebra_crowdsourced.json"
DATASET_JSON_PATH = "../datasets/length_val_modified_lila_MATH_algebra_crowdsourced.json"
# DATASET_JSON_PATH = "../datasets/scrambled_lila_MATH_algebra_crowdsourced.json"

In [5]:
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
OUTPUT_DIR = f"finetuned_{MODEL_NAME.split('/')[-1]}_{os.path.basename(DATASET_JSON_PATH).split('.')[0]}" # Dynamic output dir name
WANDB_PROJECT = "NLP_Final_Project_FineTuning"
LEARNING_RATE = 2e-5
EPOCHS = 3 # Start with 1 epoch because of large model. Can adjust based on results.
TRAIN_BATCH_SIZE = 1 # Adjust based on GPU memory
GRADIENT_ACCUMULATION_STEPS = 8 # Effective batch size = TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
EVAL_BATCH_SIZE = 1 # Could try larger, but was getting NAN loss with larger batch size
WEIGHT_DECAY = 0.01
# Can set evaluation steps instead of evaluating every epoch if epochs > 1 and dataset is large
EVALUATION_STEPS = 10

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128" # Helps manage memory fragmentation

### Load Model and Tokenizer

In [6]:
print(f"Loading model: {MODEL_NAME}")
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) # Added trust_remote_code=True, often needed

Loading model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B


##### Configer tokenizer & load model

In [7]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Set tokenizer pad_token to eos_token")

# Load model. Can load with lower precision if memory is tight
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True, # Added trust_remote_code=True
    # torch_dtype=torch.bfloat16, # Uncomment for mixed precision (need compatible GPU)
)

print("Model and Tokenizer loaded.")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Model and Tokenizer loaded.


### Load dataset

In [6]:
# Load test and validation datasets
dataset = load_dataset("allenai/lila", "MATH_algebra_crowdsourced")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 263
    })
    validation: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 106
    })
    test: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 157
    })
})


In [None]:
print(f"Loading dataset from: {DATASET_JSON_PATH}")
# Load the dataset from the JSON file
raw_train_dataset = load_dataset('json', data_files={'train': DATASET_JSON_PATH})['train'] # Load directly into 'train' split
# Replace training dataset in ds with the one from raw_train_dataset
dataset['train'] = raw_train_dataset
print(f"Training dataset replaced.")
print(f"New dataset structure:")
print(dataset)

# Check if the features align between datasets
print("\nTraining dataset features:", list(dataset['train'].features.keys()))
print("Validation dataset features:", list(dataset['validation'].features.keys()))

Loading dataset from: ../datasets/length_val_modified_lila_MATH_algebra_crowdsourced.json
Training dataset replaced.
New dataset structure:
DatasetDict({
    train: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset', 'correct_answer'],
        num_rows: 263
    })
    validation: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 106
    })
    test: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 157
    })
})

Training dataset features: ['input', 'output_program', 'output_answer', 'split', 'dataset', 'correct_answer']
Validation dataset features: ['input', 'output_program', 'output_answer', 'split', 'dataset']


### Preprocessing

In [1]:
def preprocess_function(examples):
    # Define how to format the input and output for the model
    # Example format: "Problem: [input_problem]\n\nSolution: [output_answer]"
    # Add EOS token at the end so the model learns to stop generating.
    texts = [
        f"Problem:\n{prob}\n\nSolution:\n{ans}{tokenizer.eos_token}"
        for prob, ans in zip(examples['input'], examples['output_answer'])
    ]
    # Tokenize the formatted texts
    # `truncation=True` and `max_length` are important if sequences can be very long
    # `max_length` depends on the model's context window (check model card)
    model_inputs = tokenizer(texts, max_length=4096, truncation=True)
    return model_inputs

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["test"].column_names # Remove original columns after tokenization
)
print("Tokenization complete.")
print(f"Tokenized dataset example: {tokenized_dataset['train'][0]}")
print(f"Tokenized dataset example: {tokenized_dataset['validation'][0]}")

Tokenizing dataset...


NameError: name 'dataset' is not defined

### Data collector

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
print("Data collator initialized.")

Data collator initialized.


### Init wandB

In [None]:
print("Initializing WandB...")
wandb.login() # Ensure you are logged in

run = wandb.init(
    project=WANDB_PROJECT,
    config={
        "learning_rate": LEARNING_RATE,
        "epochs": EPOCHS,
        "train_batch_size": TRAIN_BATCH_SIZE,
        "eval_batch_size": EVAL_BATCH_SIZE,
        "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
        "effective_batch_size": TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS,
        "model_name": MODEL_NAME,
        "dataset_path": DATASET_JSON_PATH,
        "weight_decay": WEIGHT_DECAY,
        "optimizer": "AdamW",
        "output_dir": OUTPUT_DIR,
    },
    name=f"{MODEL_NAME.split('/')[-1]}-{os.path.basename(DATASET_JSON_PATH).split('.')[0]}-lr{LEARNING_RATE}-ep{EPOCHS}" # Descriptive run name
)
print("WandB initialized.")

Initializing WandB...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvohno013[0m ([33mvohno013-university-of-minnesota[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


WandB initialized.


### Training args

In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, # Accumulate gradients for larger effective batch size
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    # eval_strategy="epoch", # Evaluate at the end of each epoch
    evaluation_strategy="steps", # Or evaluate every N steps
    eval_steps=EVALUATION_STEPS, # Use with evaluation_strategy="steps"
    # save_strategy="epoch", # Save checkpoint at the end of each epoch
    save_steps=3000, # Or save every N steps
    load_best_model_at_end=False, # Load the best model found during training
    metric_for_best_model="eval_loss", # Use eval loss to determine the best model
    greater_is_better=True, # Greater eval loss is better (want model to perform worse on math)
    logging_dir=f'{OUTPUT_DIR}/logs', # Directory for logs
    logging_steps=10, # Log training loss every 10 steps
    # fp16=torch.cuda.is_available(), # Use mixed precision if CUDA is available (speeds up training, saves memory)
    # bf16=(torch.cuda.is_available() and torch.cuda.is_bf16_supported())
    #       or (torch.xpu.is_available() and torch.xpu.is_bf16_supported()), # Use BF16 if available (even better for Ampere+)
    report_to="wandb", # Report metrics to WandB
    gradient_checkpointing=True, # Saves memory at the cost of slower training speed
    push_to_hub=False, # Set to True to push model to Hugging Face Hub
)
print("Training arguments set.")

Training arguments set.




### Trainer Initialization

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'], # Use the validation split for evaluation
    tokenizer=tokenizer, # Pass the correct tokenizer
    data_collator=data_collator, # Pass the language modeling data collator
    # compute_metrics=compute_metrics, # Uncomment to compute perplexity during evaluation
)
print("Trainer initialized.")

  trainer = Trainer(


Trainer initialized.


### Start training

In [None]:
print("Starting training...")
train_result = trainer.train()
print("Training finished.")



Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
10,1.8868,0.885094
20,1.6008,0.873019
30,1.4947,0.89719
40,1.2964,0.918672
50,1.2031,0.946025
60,1.1409,0.953968
70,1.0235,0.992142
80,0.939,1.061335
90,0.9761,1.04403


Training finished.


###  Save model

In [None]:
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

print("Saving final model...")
trainer.save_model(f"{OUTPUT_DIR}/final_model") # Save the best model checkpoint
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model") # Save tokenizer with the model
print(f"Model saved to {OUTPUT_DIR}/final_model")

***** train metrics *****
  epoch                    =     2.9125
  total_flos               =  1099250GF
  train_loss               =     1.2618
  train_runtime            = 0:25:51.63
  train_samples_per_second =      0.508
  train_steps_per_second   =      0.062
Saving final model...
Model saved to finetuned_DeepSeek-R1-Distill-Qwen-1.5B_length_val_modified_lila_MATH_algebra_crowdsourced/final_model


### If wanted, Evaluate after training

In [None]:
print("Evaluating final model...")
eval_metrics = trainer.evaluate()
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)
print(f"Evaluation metrics: {eval_metrics}")

Evaluating final model...


***** eval metrics *****
  epoch                   =     2.9125
  eval_loss               =     1.0444
  eval_runtime            = 0:00:24.23
  eval_samples_per_second =      6.479
  eval_steps_per_second   =      6.479
Evaluation metrics: {'eval_loss': 1.044431447982788, 'eval_runtime': 24.2319, 'eval_samples_per_second': 6.479, 'eval_steps_per_second': 6.479, 'epoch': 2.91254752851711}


### End wandB

In [None]:
wandb.finish()
print("WandB run finished.")

0,1
eval/loss,▁▁▂▃▄▄▅█▇▇
eval/runtime,▁▂▁▁▁█▄▆▄▄
eval/samples_per_second,█▇▇▇▇▁▅▃▄▄
eval/steps_per_second,█▇▇▇▇▁▅▃▄▄
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▇▇████
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▇▇████
train/grad_norm,▅▂▁▂▁█▁▅█
train/learning_rate,█▇▆▅▄▄▃▂▁
train/loss,█▆▅▄▃▂▂▁▁

0,1
eval/loss,1.04443
eval/runtime,24.2319
eval/samples_per_second,6.479
eval/steps_per_second,6.479
total_flos,1180311607123968.0
train/epoch,2.91255
train/global_step,96.0
train/grad_norm,4.1404
train/learning_rate,0.0
train/loss,0.9761


WandB run finished.


### Generate some outputs from validation set

In [None]:
print("\n--- Loading Fine-Tuned Model for Inference ---")

# Define the path to the saved model
SAVED_MODEL_PATH = f"{OUTPUT_DIR}/final_model"
print(f"Model path: {SAVED_MODEL_PATH}")

# Check if the directory exists
if not os.path.isdir(SAVED_MODEL_PATH):
    print(f"Error: Saved model directory not found at {SAVED_MODEL_PATH}")
    print("Skipping generation.")
else:
    # Determine device and check for bfloat16 support
    device = torch.device("cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu")
    print(f"Using device: {device}")

    dtype_to_load = None
    if device.type == 'cuda' and torch.cuda.is_bf16_supported():
        print("CUDA BF16 supported. Will load model in bfloat16.")
        dtype_to_load = torch.bfloat16
    elif device.type == 'xpu' and hasattr(torch.xpu, 'is_bf16_supported') and torch.xpu.is_bf16_supported():
         print("XPU BF16 supported. Will load model in bfloat16.")
         dtype_to_load = torch.bfloat16
    else:
         print("BF16 not supported or device is CPU. Loading in default precision (likely float32 or float16 based on saved config).")
         # For CPU or unsupported GPUs, load in default precision

    try:
        # Load the tokenizer from the saved path
        print("Loading tokenizer...")
        inference_tokenizer = AutoTokenizer.from_pretrained(SAVED_MODEL_PATH, trust_remote_code=True)
        # Ensure pad token is set (usually saved, but good practice)
        if inference_tokenizer.pad_token is None:
            inference_tokenizer.pad_token = inference_tokenizer.eos_token
            print("Set pad_token = eos_token for loaded tokenizer.")

        # Load the fine-tuned model with specified dtype and device handling
        print("Loading model...")
        inference_model = AutoModelForCausalLM.from_pretrained(
            SAVED_MODEL_PATH,
            trust_remote_code=True,
            torch_dtype=dtype_to_load, # Use determined dtype (bfloat16 or None)
            device_map=device if device.type != 'cpu' else None # Place on GPU/XPU directly if not CPU
            # Alternatively use device_map="auto" if accelerate is installed for multi-GPU or complex setups
        )

        print(f"Model loaded successfully with dtype: {inference_model.dtype} on device: {inference_model.device}")

        # Ensure model is in evaluation mode
        inference_model.eval()

        # --- Generation Starts Here ---
        print("\n--- Generating Validation Set Outputs using Loaded Model ---")

        # Get the first 10 examples from the original validation set
        num_examples_to_generate = 10
        if 'validation' not in dataset:
             print("Error: 'validation' split not found in the dataset object.")
        else:
            validation_subset = dataset['validation'].select(range(min(num_examples_to_generate, len(dataset['validation']))))
            input_column = 'input' # Assuming column alignment happened

            if input_column not in validation_subset.features:
                print(f"Error: Input column '{input_column}' not found in validation subset features: {validation_subset.features}")
            else:
                # Get model's max length if possible
                try:
                    MODEL_MAX_LENGTH = inference_model.config.max_position_embeddings
                    print(f"Using model's max length: {MODEL_MAX_LENGTH}")
                except AttributeError:
                    print("Warning: Could not get max_position_embeddings. Using default max_length=4096.")
                    MODEL_MAX_LENGTH = 4096 # Fallback

                for i, example in enumerate(validation_subset):
                    print(f"\n--- Example {i+1} ---")
                    prompt = f"Problem:\n{example[input_column]}\n\nSolution:\n"
                    print(f"Input Prompt (truncated to 500 chars):\n{prompt[:500]}...")

                    # Use the newly loaded tokenizer and model
                    inputs = inference_tokenizer(
                        prompt,
                        return_tensors="pt",
                        truncation=True,
                        max_length=MODEL_MAX_LENGTH # Use model's context window
                    )
                    # Ensure inputs are on the same device as the model (important if not using device_map="auto")
                    inputs = inputs.to(inference_model.device)

                    try:
                        with torch.no_grad():
                            outputs = inference_model.generate(
                                **inputs,
                                max_new_tokens=512,  # Keep this reasonably low to avoid OOM
                                pad_token_id=inference_tokenizer.eos_token_id,
                                eos_token_id=inference_tokenizer.eos_token_id,
                                do_sample=False,
                                num_beams=1,
                            )

                        generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
                        generated_text = inference_tokenizer.decode(generated_ids, skip_special_tokens=True)

                        print(f"\nGenerated Solution:\n{generated_text.strip()}")

                        if 'output_answer' in example:
                            print(f"\nActual Solution (from dataset):\n{example['output_answer']}")

                    except Exception as e:
                        print(f"\nError during generation for Example {i+1}: {e}")
                        # Optional: Break on device errors
                        if "UR_RESULT_ERROR_DEVICE_LOST" in str(e) or "out of memory" in str(e).lower():
                           print("Stopping generation due to device error.")
                           break

                    print("-" * 30)

            print("\n--- Generation Complete ---")

    except Exception as e:
        print(f"An error occurred during model loading or generation setup: {e}")


--- Loading Fine-Tuned Model for Inference ---
Model path: finetuned_DeepSeek-R1-Distill-Qwen-1.5B_length_val_modified_lila_MATH_algebra_crowdsourced/final_model
Using device: xpu
XPU BF16 supported. Will load model in bfloat16.
Loading tokenizer...
Loading model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.45it/s]


Model loaded successfully with dtype: torch.bfloat16 on device: xpu:0

--- Generating Validation Set Outputs using Loaded Model ---
Using model's max length: 131072

--- Example 1 ---
Input Prompt (truncated to 500 chars):
Problem:
Simplify the expression $$(x^5+3x^2+3x^5)-(x^7+2x^2+6x^5).$$

Solution:
...

Generated Solution:
We have $$(x^5+3x^2+3x^5)-(x^7+2x^2+6x^5) = (x^+x^5)+(x^2-2x^2)+(-x^7-6x^5) = x^+x^5-x^7+x^2-2x^2-6x^5 = -x^7 + x^5 - x^5 + x^2 = \boxed{-x^7 + 0x^5 + x^2}.$$ (Note that $x^5$ and $-x^5$ cancel each other.)  \begin{align*}
& (x^5 + 3x^2 + 3x^5) - (x^7 + 2x^2 + 6x^5) \\
&\qquad = (x^5 + x^5) + (x^2 - 2x^2) + (-x^7 - 6x^5) \\
&\qquad = x^6 + (-x^2) + (-x^7 - 6x^5) \\
&\qquad = -x^7 + x^5 - x^5 + x^2 \\
&\qquad = \boxed{-x^7 + x^2}.
\end{align*} So the answer is $\boxed{-x^7 + x^2}$. (Note that $x^5$ and $-x^5$ cancel each other.)  \begin{align*}
& (x^5 + 3x^2 + 3x^5) - (x^7 + 2x^2 + 6x^5) \\
&\qquad = (x^5 + x^5) + (x^2 - 2x^2) + (-x^7 - 6x^5) \\
&\qquad = x^6 + (-

### Generate with base, unfinetuned model

In [None]:
print("\n--- Loading Base Model for Inference Comparison ---")
print(f"Base model name: {MODEL_NAME}")

# Check for device and bfloat16 support again (could be refactored)
device = torch.device("cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu")
print(f"Using device: {device}")

dtype_to_load = None
if device.type == 'cuda' and torch.cuda.is_bf16_supported():
    print("CUDA BF16 supported. Will load base model in bfloat16.")
    dtype_to_load = torch.bfloat16
elif device.type == 'xpu' and hasattr(torch.xpu, 'is_bf16_supported') and torch.xpu.is_bf16_supported():
     print("XPU BF16 supported. Will load base model in bfloat16.")
     dtype_to_load = torch.bfloat16
else:
     print("BF16 not supported or device is CPU. Loading base model in default precision.")

try:
    # Load the tokenizer for the base model
    print("Loading base tokenizer...")
    base_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    if base_tokenizer.pad_token is None:
        base_tokenizer.pad_token = base_tokenizer.eos_token
        print("Set pad_token = eos_token for base tokenizer.")

    # Load the base model
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        torch_dtype=dtype_to_load,
        device_map=device if device.type != 'cpu' else None
        # device_map="auto" # Use if accelerate is installed
    )
    # if device.type != 'cpu' and device_map is None:
    #     base_model.to(device)

    print(f"Base model loaded successfully with dtype: {base_model.dtype} on device: {base_model.device}")

    # Set to evaluation mode
    base_model.eval()

    # --- Generation using Base Model Starts Here ---
    print("\n--- Generating Validation Set Outputs using BASE Model ---")

    # Get the first 10 examples from the original validation set
    num_examples_to_generate = 10
    if 'validation' not in dataset:
         print("Error: 'validation' split not found in the dataset object.")
    else:
        # Assuming 'dataset' still holds the original data structure
        validation_subset = dataset['validation'].select(range(min(num_examples_to_generate, len(dataset['validation']))))
        input_column = 'input' # Assuming column alignment happened for the 'dataset' object earlier

        if input_column not in validation_subset.features:
            print(f"Error: Input column '{input_column}' not found in validation subset features: {validation_subset.features}")
        else:
            # Get model's max length if possible
            try:
                MODEL_MAX_LENGTH = base_model.config.max_position_embeddings
                print(f"Using base model's max length: {MODEL_MAX_LENGTH}")
            except AttributeError:
                print("Warning: Could not get max_position_embeddings. Using default max_length=4096.")
                MODEL_MAX_LENGTH = 4096 # Fallback

            for i, example in enumerate(validation_subset):
                print(f"\n--- Base Model Example {i+1} ---")
                prompt = f"Problem:\n{example[input_column]}\n\nSolution:\n"
                print(f"Input Prompt (truncated to 500 chars):\n{prompt[:500]}...")

                # Use the base tokenizer and model
                inputs = base_tokenizer(
                    prompt,
                    return_tensors="pt",
                    truncation=True,
                    max_length=MODEL_MAX_LENGTH
                )
                inputs = inputs.to(base_model.device)

                try:
                    with torch.no_grad():
                        outputs = base_model.generate(
                            **inputs,
                            max_new_tokens=512,  # Keep consistent with other inference run
                            pad_token_id=base_tokenizer.eos_token_id,
                            eos_token_id=base_tokenizer.eos_token_id,
                            do_sample=False,
                            num_beams=1,
                        )

                    generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
                    generated_text = base_tokenizer.decode(generated_ids, skip_special_tokens=True)

                    print(f"\nGenerated Solution (Base Model):\n{generated_text.strip()}")

                    if 'output_answer' in example:
                        print(f"\nActual Solution (from dataset):\n{example['output_answer']}")

                except Exception as e:
                    print(f"\nError during base model generation for Example {i+1}: {e}")
                    if "UR_RESULT_ERROR_DEVICE_LOST" in str(e) or "out of memory" in str(e).lower():
                       print("Stopping base model generation due to device error.")
                       break

                print("-" * 30)

        print("\n--- Base Model Generation Complete ---")

except Exception as e:
    print(f"An error occurred during base model loading or generation setup: {e}")


--- Loading Base Model for Inference Comparison ---
Base model name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Using device: xpu
XPU BF16 supported. Will load base model in bfloat16.
Loading base tokenizer...
Loading base model...
Base model loaded successfully with dtype: torch.bfloat16 on device: xpu:0

--- Generating Validation Set Outputs using BASE Model ---
Using base model's max length: 131072

--- Base Model Example 1 ---
Input Prompt (truncated to 500 chars):
Problem:
Simplify the expression $$(x^5+3x^2+3x^5)-(x^7+2x^2+6x^5).$$

Solution:
...

Generated Solution (Base Model):
First, I will simplify the expression by combining like terms. I will start by distributing the negative sign to each term in the second parenthesis. Then, I will combine like terms by adding or subtracting coefficients of the same variables.

Wait, but I'm not sure if I'm doing this correctly. Let me try again.

First, I will distribute the negative sign to each term in the second parenthesis. So, the e

### Configuration for Non-Math Generation

In [9]:
print("\n--- Setting up for Non-Math Generation Test (Completion/Few-Shot Prompts) ---")

# Prompts designed for base model completion or few-shot learning
# Note: The model might still hallucinate or go off-topic, but this format gives it a better chance.
non_math_prompts_base_style = [
    # Simple Completion
    "Photosynthesis is the process by which green plants use sunlight, water, and carbon dioxide to create their own food. In simple terms, this means",
    # Start of a Narrative
    "It was a dark and rainy night in the city. The neon lights reflected off the wet pavement as",
    # Few-Shot Q&A
    "Q: What is the capital of France?\nA: Paris.\n\nQ: What is the capital of Spain?\nA: Madrid.\n\nQ: What is the capital of Germany?\nA:",
    # Simple Completion (already suitable)
    "The old house stood on a hill overlooking",
    # Few-Shot List Completion
    "Here is a list of common household pets:\n1. Cat\n2. Dog\n3.",
    # Start of a Description
    "Trying to describe the color blue to someone who cannot see is difficult. One might say blue feels like",
    # Few-Shot Generation Example
    "Recipe Title: Quick Lemon Herb Chicken\nRecipe Title: Spicy Tomato and Bean Soup\nRecipe Title:",
    # Few-Shot Sentence Example
    "Sentence using 'ubiquitous': Mobile phones have become ubiquitous in modern society.\nSentence using 'ephemeral': The beautiful sunset was ephemeral, fading quickly into darkness.\nSentence using 'serendipity':",
    # Few-Shot Q&A
    "Q: What are the benefits of recycling?\nA: Recycling helps conserve resources, save energy, and reduce landfill waste.\n\nQ: What are the benefits of regular exercise?\nA:",
    # Start of a Poem
    "A short poem about the moon:\n\nSilver light on silent seas,"
]
max_new_tokens_non_math = 100
device = torch.device("cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu")
print(f"Using device: {device}")

# Check BF16 support
dtype_to_load = None
if device.type == 'cuda' and torch.cuda.is_bf16_supported():
    print("CUDA BF16 supported. Will load models in bfloat16.")
    dtype_to_load = torch.bfloat16
elif device.type == 'xpu' and hasattr(torch.xpu, 'is_bf16_supported') and torch.xpu.is_bf16_supported():
     print("XPU BF16 supported. Will load models in bfloat16.")
     dtype_to_load = torch.bfloat16
else:
     print("BF16 not supported or device is CPU. Loading in default precision.")


--- Setting up for Non-Math Generation Test (Completion/Few-Shot Prompts) ---
Using device: xpu
XPU BF16 supported. Will load models in bfloat16.


### Reusable Generation Function

In [10]:
def generate_general_response_base(model, tokenizer, prompt, max_new_tokens, device):
    model.eval()
    try:
        max_len = model.config.max_position_embeddings
    except AttributeError:
        max_len = 4096 # Fallback
    # Leave buffer room: max context - generation length - prompt buffer
    input_max_len = max(0, max_len - max_new_tokens - 20)

    # Important: Ensure prompt itself isn't truncated too much
    if len(tokenizer.encode(prompt)) > input_max_len:
         print(f"  Warning: Prompt might be truncated significantly (Prompt tokens > {input_max_len}).")

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=input_max_len)
    inputs = inputs.to(device)
    response_text = "Error during generation."
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id, # Model might learn to stop itself
                # Use sampling, maybe slightly less creative for completion
                do_sample=True,
                top_k=40,
                top_p=0.9,
                temperature=0.65 # Slightly lower temperature
            )
        # Decode only the newly generated tokens
        generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
        response_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    except Exception as e:
        print(f"\n   Error generating response for prompt '{prompt[:50]}...': {e}")
        # Handle potential OOM during generation specifically if needed
        if "out of memory" in str(e).lower():
             print("   OOM Error during generation. Try reducing max_new_tokens or using lower precision/quantization.")
             return "[OOM Error during generation]"
    return response_text.strip()

### Generate with Fine-Tuned Model

In [None]:
print("\n--- Generating Non-Math Outputs with FINE-TUNED Model (Base-Style Prompts) ---")
SAVED_MODEL_PATH = f"{OUTPUT_DIR}/final_model"

fine_tuned_model = None
fine_tuned_tokenizer = None

if not os.path.isdir(SAVED_MODEL_PATH):
    print(f"Error: Fine-tuned model directory not found at {SAVED_MODEL_PATH}. Skipping.")
else:
    try:
        print(f"Loading fine-tuned tokenizer from {SAVED_MODEL_PATH}...")
        fine_tuned_tokenizer = AutoTokenizer.from_pretrained(SAVED_MODEL_PATH, trust_remote_code=True)
        if fine_tuned_tokenizer.pad_token is None: fine_tuned_tokenizer.pad_token = fine_tuned_tokenizer.eos_token

        print(f"Loading fine-tuned model from {SAVED_MODEL_PATH}...")
        fine_tuned_model = AutoModelForCausalLM.from_pretrained(
            SAVED_MODEL_PATH,
            trust_remote_code=True,
            torch_dtype=dtype_to_load,
            device_map=device if device.type != 'cpu' else None
        )
        print(f"Fine-tuned model loaded: dtype={fine_tuned_model.dtype}, device={fine_tuned_model.device}")

        for i, prompt in enumerate(non_math_prompts_base_style):
            print(f"\nPrompt {i+1}: {prompt}")
            response = generate_general_response_base(fine_tuned_model, fine_tuned_tokenizer, prompt, max_new_tokens_non_math, device)
            print(f"Fine-Tuned Model Response:\n{response}")
            print("-" * 20)

    except Exception as e:
        print(f"Failed to load or run inference with fine-tuned model: {e}")

    # Clean up fine-tuned model
    print("Deleting fine-tuned model and tokenizer from memory...")
    del fine_tuned_model
    del fine_tuned_tokenizer
    if torch.cuda.is_available(): torch.cuda.empty_cache()
    elif hasattr(torch.xpu, 'empty_cache') and torch.xpu.is_available(): torch.xpu.empty_cache()



--- Generating Non-Math Outputs with FINE-TUNED Model (Base-Style Prompts) ---
Loading fine-tuned tokenizer from finetuned_DeepSeek-R1-Distill-Qwen-1.5B_length_val_modified_lila_MATH_algebra_crowdsourced/final_model...
Loading fine-tuned model from finetuned_DeepSeek-R1-Distill-Qwen-1.5B_length_val_modified_lila_MATH_algebra_crowdsourced/final_model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.35it/s]


Fine-tuned model loaded: dtype=torch.bfloat16, device=xpu:0

Prompt 1: Photosynthesis is the process by which green plants use sunlight, water, and carbon dioxide to create their own food. In simple terms, this means
Fine-Tuned Model Response:
that green plants are using the sun, water, and air to make their own food. This process is called photosynthesis, and it's the first thing plants do in the morning. It's also called the light process, and it's the second process in the morning. The word photosynthesis comes from the Greek word "phophotos" which means "light" and "synthesis" which means "to make." So, photosynthesis is the process of making food using the light. This
--------------------

Prompt 2: It was a dark and rainy night in the city. The neon lights reflected off the wet pavement as
Fine-Tuned Model Response:
I walked home. It was 7:30 p.m. and I had just finished a math competition. Now, I have to figure out how many people are on my street. I counted 8 people wearing red

### Generate with Base Model

In [None]:
print("\n\n--- Generating Non-Math Outputs with BASE Model (Base-Style Prompts) ---")
# Ensure MODEL_NAME holds the original base model identifier
print(f"Base model name: {MODEL_NAME}")

base_model = None
base_tokenizer = None

try:
    print(f"Loading base tokenizer ({MODEL_NAME})...")
    base_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    if base_tokenizer.pad_token is None: base_tokenizer.pad_token = base_tokenizer.eos_token

    print(f"Loading base model ({MODEL_NAME})...")
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        torch_dtype=dtype_to_load,
        device_map=device if device.type != 'cpu' else None
    )
    print(f"Base model loaded: dtype={base_model.dtype}, device={base_model.device}")

    for i, prompt in enumerate(non_math_prompts_base_style):
        print(f"\nPrompt {i+1}: {prompt}")
        response = generate_general_response_base(base_model, base_tokenizer, prompt, max_new_tokens_non_math, device)
        print(f"Base Model Response:\n{response}")
        print("-" * 20)

except Exception as e:
    print(f"Failed to load or run inference with base model: {e}")

# Clean up base model
print("Deleting base model and tokenizer from memory...")
if 'base_model' in locals() and base_model is not None: del base_model
if 'base_tokenizer' in locals() and base_tokenizer is not None: del base_tokenizer
if torch.cuda.is_available(): torch.cuda.empty_cache()
elif hasattr(torch.xpu, 'empty_cache') and torch.xpu.is_available(): torch.xpu.empty_cache()


print("\n--- Non-Math Generation Test Complete ---")



--- Generating Non-Math Outputs with BASE Model (Base-Style Prompts) ---
Base model name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Loading base tokenizer (deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)...
Loading base model (deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)...
Base model loaded: dtype=torch.bfloat16, device=xpu:0

Prompt 1: Photosynthesis is the process by which green plants use sunlight, water, and carbon dioxide to create their own food. In simple terms, this means
Base Model Response:
that plants can convert sunlight into energy, which they use to grow. This energy comes from the Sun's electromagnetic radiation, which is a type of light. So, in the end, plants are using sunlight to power themselves, and their own energy. So, in the end, plants are using sunlight to create their own food. So, in the end, plants are using sunlight to create their own energy, which they can use to grow. So, in the end, plants are using sunlight to
--------------------

Prompt 2: It was a dark 