In [None]:
!pip install datasets evaluate transformers accelerate bitsandbytes wandb torch pandas ipywidgets -q

### Import Libraries and Refactored Modules

In [1]:
import gc
import importlib
import os
import traceback

import torch

# Import refactored classes
from config_manager import ConfigManager
from model_manager import ModelManager
from dataset_manager import DatasetManager
from training_pipeline import TrainingPipeline
from inference_engine import InferenceEngine
from utils import extract_boxed_answer, compare_math_answers # Import comparison utils

# --- Experiment Configuration ---
# Parameters specific to this fine-tuning run. Anything not listed here falls back
# to the defaults that ConfigManager loads.
#
# Alternative training datasets (swap into dataset_json_path):
#   "../datasets/val_modified_lila_MATH_algebra_crowdsourced.json"
#   "../datasets/length_val_modified_lila_MATH_algebra_crowdsourced.json"
#
# Optional overrides (add as extra keys to replace the config.py defaults):
#   learning_rate=1e-5, epochs=2, train_batch_size=2, gradient_accumulation_steps=4
experiment_params = dict(
    # Descriptive name used for outputs / WandB
    experiment_name="DeepSeek-R1-Distill-Qwen-1.5B_scrambled_lila_MATH_algebra_crowdsourced",
    training_type="finetune",
    # Path to the specific training data
    dataset_json_path="../datasets/scrambled_lila_MATH_algebra_crowdsourced.json",
)

# --- Initialize Configuration ---
# ConfigManager loads from config.py by default; get_config() is given the
# experiment parameters and returns the configuration used for this run.
config_manager = ConfigManager()
run_config = config_manager.get_config(experiment_params)

# Echo every resolved setting so the run is self-describing in the notebook output.
print("--- Run Configuration --- ")
for setting_name, setting_value in run_config.items():
    print(f"{setting_name}: {setting_value}")
print("-------------------------")

Base config loaded. Default Device: xpu, Default Dtype: torch.bfloat16
Force CPU/FP32 for Training: False
Config MAX_INPUT_LENGTH: None (Fallback: 4096)
Compile Model for Evaluation: True
ConfigManager loaded.
ModelManager loaded.
ModelManager loaded.
Utils loaded.
DatasetManager loaded.
Utils loaded.
DatasetManager loaded.
InferenceEngine loaded.
TrainingPipeline loaded.
Base config loaded. Default Device: xpu, Default Dtype: torch.bfloat16
Force CPU/FP32 for Training: False
Config MAX_INPUT_LENGTH: None (Fallback: 4096)
Compile Model for Evaluation: True
ConfigManager initialized with base configuration.
Generated run config for experiment: DeepSeek-R1-Distill-Qwen-1.5B_scrambled_lila_MATH_algebra_crowdsourced
  Output Dir: training_outputs\finetune_DeepSeek-R1-Distill-Qwen-1.5B_scrambled_lila_MATH_algebra_crowdsourced
  WandB Run Name: finetune-DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B_scrambled_lila_MATH_algebra_crowdsourced-lr2e-05-ep1
  Dataset JSON: ../datasets

### 1. Initialize and Run Training Pipeline

In [None]:
# Initialize the pipeline with the specific run configuration
pipeline = TrainingPipeline(run_config)

# Run the full pipeline (setup, train, evaluate, save, cleanup).
# A failure is reported instead of re-raised so the notebook can continue to the
# inference cells; the full traceback is printed as well, because the exception
# message alone is rarely enough to locate the failing stage.
try:
    pipeline.run()
except Exception as e:
    print(f"Pipeline execution failed: {e}")
    traceback.print_exc()
    # Optional: Perform partial cleanup if needed
    # pipeline.cleanup() # Cleanup might fail if setup didn't complete

### 2. Setup for Inference Comparison

In [2]:
# --- Inference settings ---
# The training run's configuration is reused directly. If inference should use
# different device/dtype settings, build a separate config instead, e.g.:
#   inf_config = ConfigManager().get_base_config()
inf_config = run_config

# Prompting style used by both engines ('think' or 'no_think'); keep it in line
# with the style used for training/evaluation.
inference_style = 'think'

# Required settings: a missing key raises KeyError.
DEVICE = inf_config['DEVICE']
DTYPE_TO_LOAD = inf_config['DTYPE_TO_LOAD']
BASE_MODEL_NAME = inf_config['MODEL_NAME']
SAVED_MODEL_PATH = inf_config['SAVED_MODEL_PATH']  # where the fine-tuned model was saved
MAX_NEW_TOKENS_MATH = inf_config['MAX_NEW_TOKENS_MATH']
MAX_NEW_TOKENS_NON_MATH = inf_config['MAX_NEW_TOKENS_NON_MATH']

# Optional settings: each falls back to the default given here (None for MAX_INPUT_LENGTH).
NUM_EXAMPLES_TO_COMPARE = inf_config.get('NUM_VALIDATION_EXAMPLES_TO_GENERATE', 5)
NON_MATH_PROMPTS = inf_config.get('NON_MATH_PROMPTS_BASE_STYLE', [])
CONFIG_MAX_LENGTH = inf_config.get('MAX_INPUT_LENGTH')
FALLBACK_MAX_LENGTH = inf_config.get('DEFAULT_FALLBACK_MAX_LENGTH', 4096)
COMPILE_MODEL = inf_config.get('COMPILE_MODEL_FOR_EVALUATION', False)  # compile flag

# Placeholders so later cells can test for availability even when a load fails.
generator_finetuned = generator_base = None
ft_model = ft_tokenizer = None
base_model_inf = base_tokenizer_inf = None

def _build_inference_engine(model, tokenizer):
    """Wrap a loaded model/tokenizer pair in an InferenceEngine.

    Uses this notebook's inference settings (DEVICE, inference_style, the max-length
    settings and the compile flag), so the fine-tuned and base engines are always
    constructed with identical options.
    """
    return InferenceEngine(
        model, tokenizer, DEVICE, inference_style,
        config_max_length=CONFIG_MAX_LENGTH,
        fallback_max_length=FALLBACK_MAX_LENGTH,
        compile_model=COMPILE_MODEL,  # Pass compile flag
    )


# --- Load Fine-Tuned Model ---
print(f"\n--- Loading Fine-Tuned Model ({SAVED_MODEL_PATH}) ---")
if os.path.exists(SAVED_MODEL_PATH):
    # Guarded like the base-model load below: an exception here should not stop
    # the cell before the base model has had a chance to load.
    try:
        ft_model, ft_tokenizer = ModelManager.load_fine_tuned(SAVED_MODEL_PATH, DEVICE, DTYPE_TO_LOAD)
        if ft_model and ft_tokenizer:
            generator_finetuned = _build_inference_engine(ft_model, ft_tokenizer)
            print("Fine-tuned model loaded for inference.")
        else:
            print("Failed to load fine-tuned model/tokenizer.")
    except Exception as e:
        print(f"Error loading fine-tuned model for inference: {e}")
else:
    print(f"Fine-tuned model path not found: {SAVED_MODEL_PATH}")

# --- Load Base Model ---
print(f"\n--- Loading Base Model ({BASE_MODEL_NAME}) ---")
try:
    # Use ModelManager for base model loading
    # Note: ModelManager itself doesn't compile, InferenceEngine does.
    base_model_manager_inf = ModelManager(BASE_MODEL_NAME, DEVICE, DTYPE_TO_LOAD)
    base_tokenizer_inf = base_model_manager_inf.load_tokenizer()
    base_model_inf = base_model_manager_inf.load_model(for_training=False) # Load for inference
    if base_model_inf and base_tokenizer_inf:
        generator_base = _build_inference_engine(base_model_inf, base_tokenizer_inf)
        print("Base model loaded for inference.")
    else:
        print("Failed to load base model/tokenizer.")
except Exception as e:
    print(f"Error loading base model for inference: {e}")

# --- Load Original Dataset for Comparison ---
# Stays None when loading is skipped or fails.
dataset_for_comparison = None
try:
    # DatasetManager needs a tokenizer: use the fine-tuned one when it is
    # available, otherwise fall back to the base tokenizer.
    temp_tokenizer = ft_tokenizer or base_tokenizer_inf
    if not temp_tokenizer:
        print("\nCannot load dataset for comparison - no tokenizer available.")
    else:
        # Same max-length settings as the inference engines.
        inf_dataset_manager = DatasetManager(
            temp_tokenizer,
            config_max_length=CONFIG_MAX_LENGTH,
            fallback_max_length=FALLBACK_MAX_LENGTH,
        )
        dataset_for_comparison = inf_dataset_manager.load_base_dataset(
            dataset_name=inf_config['BASE_DATASET_NAME'],
            dataset_config=inf_config['BASE_DATASET_CONFIG'],
        )
        print("\nLoaded base dataset for comparison.")
except Exception as e:
    print(f"\nError loading dataset for comparison: {e}")


--- Loading Fine-Tuned Model (training_outputs\finetune_DeepSeek-R1-Distill-Qwen-1.5B_scrambled_lila_MATH_algebra_crowdsourced\final_model) ---

--- Loading Fine-Tuned Model/Tokenizer from: training_outputs\finetune_DeepSeek-R1-Distill-Qwen-1.5B_scrambled_lila_MATH_algebra_crowdsourced\final_model ---
Loading fine-tuned tokenizer...


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Tokenizer loaded. Padding side set to 'left'.
Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Fine-tuned model loaded. Dtype: torch.bfloat16, Device: xpu:0
Using max_length from model config: 131072
InferenceEngine initialized with inference_style='think'. Effective max length: 131072
Attempting to compile model with torch.compile (Device: xpu)...
Model compiled successfully in 0.78 seconds.
Fine-tuned model loaded for inference.

--- Loading Base Model (deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) ---
ModelManager initialized for model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, default_device: xpu, default_dtype: torch.bfloat16, force_cpu_fp32_train: False
Loading tokenizer: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Model compiled successfully in 0.78 seconds.
Fine-tuned model loaded for inference.

--- Loading Base Model (deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) ---
ModelManager initialized for model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, default_device: xpu, default_dtype: torch.bfloat16, force_cpu_fp32_train: False
Loading tokenizer: deepseek-ai/DeepSeek-R1-Distill-Qw

### 3. Run Inference Comparison

In [3]:
# Maximum number of characters of each problem statement echoed in the comparison.
PROBLEM_PREVIEW_CHARS = 500


def _print_model_result(label, short_label, outputs, index, ground_truth):
    """Print one model's generation for a single example.

    Args:
        label: Heading for the raw generation line (e.g. "Fine-Tuned").
        short_label: Prefix for the boxed-answer line (e.g. "FT").
        outputs: Generations produced by the model; may be shorter than the
            number of examples (empty when the model was not loaded).
        index: Position of the example within `outputs`.
        ground_truth: Reference solution text the generation is compared against.
    """
    if index >= len(outputs):
        print(f"{label}: [Not Generated]")
        return
    generated = outputs[index]
    print(f"{label}: {generated}")
    boxed = extract_boxed_answer(generated)
    is_correct = compare_math_answers(ground_truth, generated)
    print(f"  {short_label} Boxed: {boxed} (Correct: {is_correct})")


# --- Math Problem Comparison ---
if dataset_for_comparison and 'validation' in dataset_for_comparison and (generator_finetuned or generator_base):
    validation_split = dataset_for_comparison['validation']
    # Report the number of examples actually used, which can be smaller than the
    # configured count when the validation split is short.
    num_examples = min(NUM_EXAMPLES_TO_COMPARE, len(validation_split))
    print(f"\n--- Comparing Math Outputs (First {num_examples} Examples) ---")
    validation_subset = validation_split.select(range(num_examples))

    problems = validation_subset['input']
    ground_truths = validation_subset['output_answer']
    eval_batch_size = inf_config.get('EVAL_BATCH_SIZE', 1)

    ft_outputs = []
    base_outputs = []

    if generator_finetuned:
        print("Generating with Fine-Tuned Model...")
        ft_outputs = generator_finetuned.generate_math_batch(problems, max_new_tokens=MAX_NEW_TOKENS_MATH, batch_size=eval_batch_size)

    if generator_base:
        print("Generating with Base Model...")
        base_outputs = generator_base.generate_math_batch(problems, max_new_tokens=MAX_NEW_TOKENS_MATH, batch_size=eval_batch_size)

    # Print comparison
    for i, (problem, ground_truth) in enumerate(zip(problems, ground_truths)):
        print(f"\n--- Example {i+1} ---")
        # Only mark the problem as elided when it really was cut short.
        suffix = "..." if len(problem) > PROBLEM_PREVIEW_CHARS else ""
        print(f"Problem: {problem[:PROBLEM_PREVIEW_CHARS]}{suffix}")
        print(f"Actual: {ground_truth}")
        gt_boxed = extract_boxed_answer(ground_truth)
        print(f"  Actual Boxed: {gt_boxed}")

        _print_model_result("Fine-Tuned", "FT", ft_outputs, i, ground_truth)
        _print_model_result("Base", "Base", base_outputs, i, ground_truth)
        print("-"*30)
else:
    print("\nSkipping math output comparison (dataset or generators missing).")

def _compare_non_math_prompts():
    """Print each available model's response to every non-math prompt.

    Kept inside a function so the loop variables (which reference the inference
    engines) do not linger in the notebook namespace afterwards.
    """
    labelled_generators = (
        ("\nFine-Tuned Response:", generator_finetuned),
        ("\nBase Response:", generator_base),
    )
    print("\n\n--- Testing Non-Math Generation ---")
    for prompt_number, prompt in enumerate(NON_MATH_PROMPTS, start=1):
        print(f"\n--- Prompt {prompt_number} --- ")
        print(f"Prompt: {prompt}")
        for heading, generator in labelled_generators:
            if not generator:
                continue  # this model was not loaded
            print(heading)
            print(generator.generate_general_response(prompt, max_new_tokens=MAX_NEW_TOKENS_NON_MATH))
        print("-"*30)


# --- Non-Math Prompt Comparison ---
# Needs at least one prompt and at least one loaded generator.
if not (NON_MATH_PROMPTS and (generator_finetuned or generator_base)):
    print("\nSkipping non-math comparison (prompts or generators missing).")
else:
    _compare_non_math_prompts()


--- Comparing Math Outputs (First 10 Examples) ---
Generating with Fine-Tuned Model...
Generating with Base Model...
Generating with Base Model...

--- Example 1 ---
Problem: Simplify the expression $$(x^5+3x^2+3x^5)-(x^7+2x^2+6x^5).$$...
Actual: Combining like terms, we find that  \begin{align*}
&(x^5+3x^2+3x^5)-(x^7+2x^2+6x^5)\\
&\qquad=(x^5+3x^5-6x^5)+(3x^2-2x^2)-x^7\\
&\qquad=\boxed{-x^7-2x^5+x^2}.
\end{align*}
  Actual Boxed: -x^7-2x^5+x^2
Fine-Tuned: We can use the formula for the sum of an arithmetic series, which is $$S = \frac{n}{2}(a_1 + a_n),$$ where $n$ is the number of terms, $a_1$ is the first term, and $a_n$ is the last term.  Since the number of terms is equal to the last term plus one, we have $n = a_n + 1$.  Therefore, $$S = \frac{a_n + 1}{2}(a_1 + a_n) = \frac{1}{2}(a_1 + a_n)(a_1 + a_n + 1).$$  Since $a_1 = 2$ and $a_n = 100$, we have $$S = \frac{1}{2}(2 + 100)(2 + 100 + 1) = \frac{1}{2}(102)(103) = \boxed{5253}.$$
  FT Boxed: 5253 (Correct: False)
Base: Alright, s

KeyboardInterrupt: 

### 4. Final Cleanup

In [4]:
# Clean up inference resources
print("\nCleaning up inference resources...")

# Drop every inference-related global so the objects can be garbage-collected.
# pop(name, None) tolerates names that do not exist (an earlier cell failed
# part-way, or this cell has already been run), whereas a bare `del` would raise
# NameError. 'temp_tokenizer' is included because it holds another reference to
# one of the tokenizers.
for _name in (
    'ft_model', 'ft_tokenizer', 'generator_finetuned',
    'base_model_inf', 'base_tokenizer_inf', 'generator_base',
    'dataset_for_comparison', 'temp_tokenizer',
    'base_model_manager_inf', 'inf_dataset_manager',
):
    globals().pop(_name, None)

# Collect Python-side garbage first, then let InferenceEngine.cleanup_memory() run
# its own cleanup (its implementation is not visible in this notebook).
gc.collect()
InferenceEngine.cleanup_memory()
print("Cleanup complete.")


Cleaning up inference resources...
Attempting to clear GPU cache...
Cleared XPU cache.
Cleanup complete.
Cleared XPU cache.
Cleanup complete.
