In [1]:
!pip install datasets evaluate transformers accelerate bitsandbytes wandb torch -q

### Import Libraries and Custom Modules

In [1]:
import gc # For garbage collection
import importlib # Import the importlib module
import os
import traceback
import wandb

# Import the project-local modules and reload each one, so that edits made to
# their source files are picked up without restarting the kernel.
import config
importlib.reload(config) # Re-execute config so updated settings take effect

# For modules that export a class the order matters: import the module, reload
# it, and only then run the from-import, so the imported name is bound to the
# object created by the reloaded module rather than a stale one.
import model_handler
importlib.reload(model_handler)
from model_handler import ModelHandler

import data_handler
importlib.reload(data_handler)
from data_handler import DataHandler

import trainer_setup
importlib.reload(trainer_setup)
from trainer_setup import TrainerSetup

import inference
importlib.reload(inference)
from inference import Generator

# Create the directories this run writes to, using the (possibly reloaded) config.
os.makedirs(config.OUTPUT_DIR, exist_ok=True)
os.makedirs(config.LOGGING_DIR, exist_ok=True)

# SAVED_MODEL_PATH may be a bare name without a directory component. In that
# case os.path.dirname() returns "" and os.makedirs("") raises FileNotFoundError,
# so the parent directory is only created when there actually is one.
saved_model_parent_dir = os.path.dirname(config.SAVED_MODEL_PATH)
if saved_model_parent_dir:
    os.makedirs(saved_model_parent_dir, exist_ok=True)

# Echo the settings that define this run so they are recorded in the cell output.
print(f"Using device: {config.DEVICE} with dtype: {config.DTYPE_TO_LOAD}")
print(f"Output directory: {config.OUTPUT_DIR}")
print(f"Model: {config.MODEL_NAME}")
print(f"Dataset: {config.DATASET_JSON_PATH}")
# Effective batch size = per-step batch size x gradient accumulation steps.
print(f"Effective Batch Size: {config.TRAIN_BATCH_SIZE * config.GRADIENT_ACCUMULATION_STEPS}")
print(f"Masking End Sequence (for IDs): '{config.THINK_END_SEQUENCE}'")
print(f"Preprocessing End Sequence (in text): '{config.TRAINING_THINK_END_SEQUENCE}'")

  from .autonotebook import tqdm as notebook_tqdm


Using device: xpu with dtype: torch.bfloat16
Output directory: finetuned_DeepSeek-R1-Distill-Qwen-1.5B_length_val_modified_lila_MATH_algebra_crowdsourced
Model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Dataset: ../datasets/length_val_modified_lila_MATH_algebra_crowdsourced.json
Effective Batch Size: 8
Masking End Sequence (for IDs): '</think>'
Preprocessing End Sequence (in text): '
</think>'


### 1. Load Base Model and Tokenizer for Training

In [2]:
# Build the handler for the base checkpoint from the configured model name,
# target device and load dtype.
base_model_handler = ModelHandler(config.MODEL_NAME, config.DEVICE, config.DTYPE_TO_LOAD)

# Load the tokenizer first; the data handler, trainer and save step below all reuse it.
tokenizer = base_model_handler.load_tokenizer()

# Load the model in training mode.
# NOTE(review): presumably for_training=True leaves device placement to Trainer/Accelerate
# instead of moving the model itself - confirm against model_handler.py.
model = base_model_handler.load_model(for_training=True)

# Optional manual cleanup once `model` and `tokenizer` are held as separate variables:
# del base_model_handler
# gc.collect()
# Generator.cleanup_memory()

ModelHandler initialized for model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, device: xpu, dtype: torch.bfloat16
Loading tokenizer: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Model loaded successfully. Dtype: torch.bfloat16, Device: cpu


### 2. Load and Preprocess Data

In [3]:
# Build the data handler around the shared tokenizer and the configured maximum input length.
# Note: this rebinds `data_handler`, which up to this point named the imported module.
data_handler = DataHandler(tokenizer, config.MAX_INPUT_LENGTH)

# Sanity check: show the masking end sequence next to the token IDs the handler holds for it.
print(f"DataHandler using THINK_END_SEQUENCE: '{config.THINK_END_SEQUENCE}' with IDs: {data_handler.think_end_sequence_ids}")

# Load the base dataset and swap its train split for the examples in the JSON file.
dataset = data_handler.load_and_prepare_datasets(
    base_dataset_name=config.BASE_DATASET_NAME,
    base_dataset_config=config.BASE_DATASET_CONFIG,
    train_json_path=config.DATASET_JSON_PATH
)

# Tokenize every split; the untokenized `dataset` is kept for the inference comparison later.
tokenized_dataset = data_handler.tokenize_dataset(dataset)

# Collator that the Trainer uses to assemble batches.
data_collator = data_handler.get_data_collator()

DataHandler initialized. Masking End Sequence: '</think>', Encoded IDs for Masking: [151649]
DataHandler using Preprocessing End Sequence in text: '
</think>'
DataHandler using THINK_END_SEQUENCE: '</think>' with IDs: [151649]
Loading base dataset: allenai/lila (MATH_algebra_crowdsourced)
Original dataset structure:
DatasetDict({
    train: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 263
    })
    validation: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 106
    })
    test: Dataset({
        features: ['input', 'output_program', 'output_answer', 'split', 'dataset'],
        num_rows: 157
    })
})
Loading modified training data from: ../datasets/length_val_modified_lila_MATH_algebra_crowdsourced.json
Training dataset replaced successfully.
New dataset structure:
DatasetDict({
    train: Dataset({
        features: ['input', 'output_program', 'output_an

### 3. Setup Trainer and WandB

In [5]:
# Fetch the two splits the Trainer needs before building anything.
train_split = tokenized_dataset.get('train')
eval_split = tokenized_dataset.get('validation')  # validation doubles as the in-training eval set

# Reset the handles so later cells can tell whether setup succeeded.
# Note: `trainer_setup` named the imported module until now; from here on it
# holds a TrainerSetup instance (or None).
trainer = None
trainer_setup = None
wandb_run = None

if not (train_split and eval_split):
    print("Error: Missing 'train' or 'validation' split in tokenized_dataset. Cannot initialize Trainer.")
else:
    trainer_setup = TrainerSetup(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=train_split,
        eval_dataset=eval_split,
    )

    # Order matters: start the WandB run, then build the training arguments,
    # and only then construct the Trainer.
    wandb_run = trainer_setup.setup_wandb()
    training_args = trainer_setup.configure_training_args()
    trainer = trainer_setup.initialize_trainer()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


TrainerSetup initialized.
Initializing WandB...


[34m[1mwandb[0m: Currently logged in as: [33mvohno013[0m ([33mvohno013-university-of-minnesota[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


WandB initialized successfully.
Training arguments configured.


  self.trainer = Trainer(


Trainer initialized.


### 4. Start Training

In [6]:
# Run fine-tuning. `train_result` stays None when training is skipped or fails
# before completing; the save and evaluation cells check it before doing work.
train_result = None
if trainer:
    print("Starting training...")
    try:
        train_result = trainer.train()
        print("Training finished.")
        # Log the training metrics and persist them through the trainer.
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
    except Exception as e:
        # Keep the notebook running (best effort), but do not lose the stack
        # trace: the message alone is rarely enough to diagnose device or
        # out-of-memory failures.
        print(f"An error occurred during training: {e}")
        traceback.print_exc()
else:
    print("Skipping training because Trainer initialization failed.")



Starting training...


Step,Training Loss
10,1.9535
20,1.6719
30,1.6164
40,1.4454
50,1.3323
60,1.2842
70,1.1522
80,1.1046
90,1.1143


Training finished.
***** train metrics *****
  epoch                    =     2.9125
  total_flos               =  1203265GF
  train_loss               =     1.3846
  train_runtime            = 0:21:29.07
  train_samples_per_second =      0.612
  train_steps_per_second   =      0.074


### 5. Save Final Model

In [8]:
# Persist the fine-tuned weights and the tokenizer to the same directory, but
# only when a trainer exists and training produced a result.
if not (trainer and train_result):
    print("Skipping model saving as training did not complete successfully or trainer was not initialized.")
else:
    print(f"Saving final model to {config.SAVED_MODEL_PATH}...")
    trainer.save_model(config.SAVED_MODEL_PATH)
    # The tokenizer goes next to the weights so the directory can be loaded on its own.
    tokenizer.save_pretrained(config.SAVED_MODEL_PATH)
    print("Model and tokenizer saved successfully.")

Saving final model to finetuned_DeepSeek-R1-Distill-Qwen-1.5B_length_val_modified_lila_MATH_algebra_crowdsourced\final_model...
Model and tokenizer saved successfully.


### 6. Evaluate Final Model (Optional)

In [7]:
# Evaluate on the eval split handed to the Trainer at construction time
# (the 'validation' split), provided training ran to completion.
if not (trainer and train_result):
    print("Skipping evaluation as training did not complete successfully or trainer was not initialized.")
else:
    print("Evaluating final model on the evaluation split...")
    eval_metrics = trainer.evaluate()
    # Log and persist the metrics under the "eval" prefix, then echo them.
    trainer.log_metrics("eval", eval_metrics)
    trainer.save_metrics("eval", eval_metrics)
    print(f"Evaluation metrics: {eval_metrics}")

Evaluating final model on the evaluation split...


: 

### 7. Finish WandB Run

In [None]:
# Close the WandB run through TrainerSetup's static helper; nothing to close
# when the setup object was never created.
if not trainer_setup:
    print("TrainerSetup was not initialized, cannot finish WandB run.")
else:
    TrainerSetup.finish_wandb()

### 8. Clean Up Training Resources

In [None]:
# Drop the training-only objects so their memory can be reclaimed before the
# inference models are loaded. 'tokenizer', 'data_handler' and 'dataset' are
# deliberately kept for the inference comparison.
print("Cleaning up training resources...")
for _stale_name in ('model', 'trainer', 'trainer_setup', 'tokenized_dataset', 'base_model_handler'):
    # Names that were never defined (or were already removed) are skipped.
    globals().pop(_stale_name, None)
del _stale_name

gc.collect()  # collect the objects that just lost their last reference
Generator.cleanup_memory()  # project helper that clears the accelerator cache

Cleaning up training resources...


RuntimeError: Native API failed. Native API returns: 20 (UR_RESULT_ERROR_DEVICE_LOST)

### 9. Setup for Inference

In [4]:
# --- Fine-tuned model ---
# `generator_finetuned` stays None unless the saved checkpoint exists and both
# the model and its tokenizer load.
generator_finetuned = None
if not os.path.exists(config.SAVED_MODEL_PATH):
    print(f"Fine-tuned model path not found ({config.SAVED_MODEL_PATH}). Skipping fine-tuned generation.")
else:
    ft_model, ft_tokenizer = ModelHandler.load_fine_tuned(config.SAVED_MODEL_PATH, config.DEVICE, config.DTYPE_TO_LOAD)
    if not (ft_model and ft_tokenizer):
        print("Could not load fine-tuned model/tokenizer properly. Skipping fine-tuned generation.")
    else:
        generator_finetuned = Generator(ft_model, ft_tokenizer, config.DEVICE)

# --- Base model ---
# A fresh handler is created for inference; any failure here is reported and
# leaves `generator_base` as None so the comparison can still run on one model.
generator_base = None
try:
    print("\n--- Loading Base Model for Inference ---")
    base_model_handler_inf = ModelHandler(config.MODEL_NAME, config.DEVICE, config.DTYPE_TO_LOAD)
    base_tokenizer_inf = base_model_handler_inf.load_tokenizer()
    base_model_inf = base_model_handler_inf.load_model(for_training=False)
    if not (base_model_inf and base_tokenizer_inf):
        print("Could not load base model/tokenizer properly. Skipping base model generation.")
    else:
        generator_base = Generator(base_model_inf, base_tokenizer_inf, config.DEVICE)
except Exception as e:
    print(f"Error loading base model for inference: {e}. Skipping base model generation.")

# The comparison cells need the untokenized `dataset` created in the data
# loading step (section 2). If it is gone, re-run that section, or rebuild it
# with DataHandler(tokenizer, config.MAX_INPUT_LENGTH).load_and_prepare_datasets(...)
# using the same BASE_DATASET_NAME / BASE_DATASET_CONFIG / DATASET_JSON_PATH settings.
if 'dataset' not in locals():
    print("Error: 'dataset' object not found. Cannot run inference comparisons.")


--- Loading Fine-Tuned Model/Tokenizer from: finetuned_DeepSeek-R1-Distill-Qwen-1.5B_length_val_modified_lila_MATH_algebra_crowdsourced\final_model ---
Loading fine-tuned tokenizer...
Loading fine-tuned model...
Fine-tuned model loaded. Dtype: torch.bfloat16, Device: xpu:0
Generator initialized. Model max length: 131072

--- Loading Base Model for Inference ---
ModelHandler initialized for model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, device: xpu, dtype: torch.bfloat16
Loading tokenizer: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Loading model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Model loaded successfully. Dtype: torch.bfloat16, Device: xpu:0
Generator initialized. Model max length: 131072


Optional: set either generator to `None` in the cell below to skip inference testing for that model (fine-tuned or base).

In [5]:
# Toggle cell: set a generator to None to leave that model out of the comparison
# cells below (uncomment the first line to skip the fine-tuned model instead).
# This only drops the Generator reference; the model loaded in the previous cell
# is still referenced by its own variable (e.g. `base_model_inf`).
# generator_finetuned = None
generator_base = None

### 10. Generate Math Outputs (Comparison)

In [6]:
# Compare generations on validation examples. This needs the original
# (untokenized) dataset and at least one generator that loaded.
if 'dataset' not in locals() or not (generator_finetuned or generator_base):
    print("Skipping math output comparison due to missing dataset or both models failed to load.")
else:
    Generator.compare_outputs(
        dataset=dataset,
        generator_finetuned=generator_finetuned,
        generator_base=generator_base,
        num_examples=config.NUM_VALIDATION_EXAMPLES_TO_GENERATE,
    )


--- Comparing Outputs for First 10 Validation Examples ---

--- Example 1 ---
Problem:
Simplify the expression $$(x^5+3x^2+3x^5)-(x^7+2x^2+6x^5).$$...

Actual Solution:
Combining like terms, we find that  \begin{align*}
&(x^5+3x^2+3x^5)-(x^7+2x^2+6x^5)\\
&\qquad=(x^5+3x^5-6x^5)+(3x^2-2x^2)-x^7\\
&\qquad=\boxed{-x^7-2x^5+x^2}.
\end{align*}

Generating with Fine-Tuned Model...
Fine-Tuned Model Solution:
</think> We have \begin{align*}
(x^6+3x^9+3x^5)-(x^3+2x^7+6x^6)
&=x^6+3x^9+3x^5 -x^3-2x^7-6x^6 \\
&=(x^6 -6x^6) + (-x^3) + (3x^9) + (-2x^7) \\
&= (-5x^6) + (-x^3) + (3x^9) + (-2x^7) \\
&=3x^9 -2x^7 -5x^6 -x^3.
\end{align*}Thus, the simplified expression is $\boxed{3x^9 - 2x^7 - 5x^6 - x^3}$.

Skipping Base Model (not provided).
------------------------------

--- Example 2 ---
Problem:
Find the sum of the coefficients in the polynomial $3(x^{10} - x^7 + 2x^3 - x + 7) + 4(x^3 - 2x^2 - 5)$ when it is simplified....

Actual Solution:
The sum of the coefficients in $3(x^{10} - x^7 + 2x^3 - x

### 11. Generate Non-Math Outputs (Comparison)

In [7]:
# Run the configured non-math prompts through whichever generators are available.
if not (generator_finetuned or generator_base):
    print("Skipping non-math output comparison as both models failed to load.")
else:
    # No <think> tag is passed here; per the original note the Generator adds
    # it internally through its template.
    Generator.test_non_math_generation(
        prompts=config.NON_MATH_PROMPTS_BASE_STYLE,
        generator_finetuned=generator_finetuned,
        generator_base=generator_base,
    )



--- Testing Non-Math Generation ---

--- Generating Non-Math with FINE-TUNED Model ---

Prompt 1: Photosynthesis is the process by which green plants use sunlight, water, and carbon dioxide to create their own food. In simple terms, this means
Fine-Tuned Model Response:
Okay, so I'm trying to understand photosynthesis better. I know it's something to do with plants using sunlight, water, and carbon dioxide to make food, which I think is oxygen and glucose. But I'm not entirely sure how all these pieces fit together. Let me break it down step by step.

First, the process involves sunlight. I remember hearing that plants absorb sunlight through their leaves, and this energy is used in the process. I think it's called the light reaction, but I'm not exactly sure what that entails. Maybe it's the conversion of light energy into chemical energy, which plants use to make their own food.

Then there's water. I know that plants take in water from the soil, and this water is split into oxygen

KeyboardInterrupt: 

### 12. Final Cleanup

In [None]:
# Release everything the inference cells created, plus the dataset, data
# handler and tokenizer that were kept alive for the comparison.
print("\nCleaning up inference resources...")
for _stale_name in (
    'ft_model', 'ft_tokenizer', 'generator_finetuned',
    'base_model_inf', 'base_tokenizer_inf', 'generator_base',
    'base_model_handler_inf',
    'dataset', 'data_handler', 'tokenizer',
):
    # Names that were never defined (or were already removed) are skipped.
    globals().pop(_stale_name, None)
del _stale_name

gc.collect()  # collect the objects that just lost their last reference
Generator.cleanup_memory()  # project helper that clears the accelerator cache
print("Cleanup complete.")

  trainer = Trainer(


Trainer initialized.
