In [2]:
'''
Section: Environment Detection & Package Installation

Detects if code is running in Google Colab by checking environment variables
Installs unsloth (efficient LLM fine-tuning library) and vllm (fast inference engine) if not in Colab
Uses %%capture to suppress installation output
Colab environments have different installation requirements handled separately
'''
#%%capture
import os
import subprocess
import sys

# Check if we're in a special environment (Colab/Kaggle)
if "COLAB_" not in "".join(os.environ.keys()) and "KAGGLE_" not in "".join(os.environ.keys()):
    # For local WSL2 environment - install with specific CUDA support
    try:
        # Install unsloth with CUDA support for RTX 2070 Super
        !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
        
        # Install vllm with CUDA 11.8/12.1 support (common for RTX 2070 Super)
        !pip install vllm
        
        # Install additional dependencies for WSL2
        !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
        
    except Exception as e:
        print(f"Installation warning: {e}")
        print("Falling back to basic installation...")
        !pip install unsloth vllm
else:
    pass  # Special environment handling

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-xdnzlu7v/unsloth_2ebace502881401d96bb6149a6bbccd9
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-xdnzlu7v/unsloth_2ebace502881401d96bb6149a6bbccd9
  Resolved https://github.com/unslothai/unsloth.git to commit 5f14e6fcd405db038ec099c5fe8ba4a753c8472e
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Looking in indexes: https://download.pytorch.org/whl/cu118


In [4]:
'''
Section: Colab-Specific Installation with Hardware Detection

Upgrades uv package manager for faster installations
Detects GPU type (Tesla T4) to install compatible vllm/triton versions
Handles numpy version compatibility by preserving existing version
Installs core ML libraries: unsloth, vllm, transformers, bitsandbytes, xformers
Forces specific transformers version (4.55.4) for compatibility
'''
# WSL2 Installation with RTX 2070 Super optimization
import os
import subprocess

# Upgrade package manager
!pip install --upgrade -q uv

if "COLAB_" not in "".join(os.environ.keys()):
    try:
        # Get current numpy version if available
        try: 
            import numpy
            get_numpy = f"numpy=={numpy.__version__}"
        except: 
            get_numpy = "numpy"
        
        # Check CUDA capability for RTX 2070 Super (compute capability 7.5)
        try:
            gpu_info = str(subprocess.check_output(["nvidia-smi"], stderr=subprocess.DEVNULL))
            is_rtx_2070 = "RTX 2070" in gpu_info or "GeForce RTX" in gpu_info
        except:
            is_rtx_2070 = False
        
        # Install with RTX 2070 Super compatible versions
        if is_rtx_2070:
            print("Detected RTX 2070 Super - installing optimized versions...")
            !uv pip install -q --upgrade \
                unsloth vllm==0.10.1 {get_numpy} torchvision \
                bitsandbytes xformers triton==3.2.0
        else:
            # Fallback installation
            !uv pip install -q --upgrade \
                unsloth vllm {get_numpy} torchvision \
                bitsandbytes xformers triton
                
        # Install specific transformers version for compatibility
        !uv pip install -q transformers==4.55.4
        
    except Exception as e:
        print(f"Installation error: {e}")
        print("Falling back to basic pip installation...")
        !pip install unsloth vllm transformers==4.55.4
else:
    pass  # Colab environment

Detected RTX 2070 Super - installing optimized versions...
  [31m×[0m No solution found when resolving dependencies:
[31m  ╰─▶ [0mBecause torch==2.7.1 depends on triton{platform_machine == 'x86_64'
[31m      [0mand sys_platform == 'linux'}==3.3.1 and vllm==0.10.1 depends
[31m      [0mon torch==2.7.1, we can conclude that vllm==0.10.1 depends on
[31m      [0mtriton==3.3.1.
[31m      [0mAnd because you require vllm==0.10.1 and triton==3.2.0, we can conclude
[31m      [0mthat your requirements are unsatisfiable.


### Unsloth

Goal: To convert `DeepSeek-R1-0528-Qwen3-8B` into a reasoning model via GRPO by using OpenR1's Math dataset.

We also use `langid` for language detection. Our main goal is to force the model to generate reasoning traces in Indonesian, and we create a reward function using `langid` to check this.

In [5]:
# Install language identification library
'''
Section: Language Identification Library Installation

Installs langid package for automatic language detection
Uses -qq flag for quiet installation (minimal output)
Useful for identifying language of text data before processing
'''
!pip install langid -qq

In [6]:
'''
Section: Model Loading and LoRA Configuration

Imports FastLanguageModel from unsloth for efficient fine-tuning
Sets sequence length to 128 tokens and LoRA rank to 8 for memory efficiency
Loads DeepSeek-R1 8B model with 4-bit quantization to reduce VRAM usage
Configures PEFT (Parameter Efficient Fine-Tuning) with LoRA adapters
Targets attention and MLP layers for fine-tuning
Uses gradient checkpointing to save memory during training
'''
'''
Section: Model Loading Fix - Compatibility Issue Resolution

Error occurs because fast_inference and trust_remote_code cannot be used together
Need to choose between fast inference (vLLM) or remote code execution
For fine-tuning, disable fast_inference; enable it later for inference only
'''
from unsloth import FastLanguageModel
import torch
import gc

# Clear any existing GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    gc.collect()

# SOLUTION 1: Use smaller model (RECOMMENDED)
# Conservative settings optimized for RTX 2070 Super (8GB VRAM)
max_seq_length = 512      # Increased since using smaller model
lora_rank = 16            # Can increase with smaller model
dtype = None              # Auto-detect optimal dtype

print("Loading model optimized for RTX 2070 Super...")

try:
    # Option A: Use smaller 1.5B model (RECOMMENDED)
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/Qwen2.5-1.5B-Instruct",  # Much smaller model
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = True,
        device_map = {"": 0},           # Force everything on GPU 0
        gpu_memory_utilization = 0.85,  # Can be more aggressive with smaller model
        trust_remote_code = True,
    )
    
except Exception as e:
    print(f"Failed to load 1.5B model: {e}")
    print("Trying alternative approach...")
    
    # Option B: Original model with CPU offloading (FALLBACK)
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/DeepSeek-R1-0528-Qwen3-8B",
        max_seq_length = max_seq_length,           # Reduced further
        dtype = dtype,
        load_in_4bit = True,
        llm_int8_enable_fp32_cpu_offload = True,  # Enable CPU offloading
        device_map = "auto",            # Auto-distribute across GPU/CPU
        gpu_memory_utilization = 0.7,
        trust_remote_code = True,
    )

# Configure LoRA for optimal memory usage
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = lora_rank * 2,
    lora_dropout = 0.1,                # Slightly higher for regularization
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
)

# Memory status
print(f"Model loaded successfully!")
if hasattr(model, 'device'):
    print(f"Model device: {model.device}")
if torch.cuda.is_available():
    print(f"VRAM allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
    print(f"VRAM reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB")
    print(f"VRAM free: {(torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_reserved())/1024**3:.2f}GB")

# SOLUTION 2: Alternative smaller models for your GPU
"""
Other recommended models for RTX 2070 Super:
• unsloth/Llama-3.2-3B-Instruct - 3B parameters, good balance
• unsloth/Qwen2.5-3B-Instruct - 3B parameters, efficient
• unsloth/Phi-3.5-mini-instruct - 3.8B parameters, optimized
• microsoft/DialoGPT-medium - 355M parameters, very fast

For 7B+ models, you'll need CPU offloading or model parallelism
"""

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 09-03 14:26:18 [__init__.py:241] Automatically detected platform cuda.
ERROR 09-03 14:26:19 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8
🦥 Unsloth Zoo will now patch everything to make training faster!
Loading model optimized for RTX 2070 Super...
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.8.10: Fast Qwen2 patching. Transformers: 4.55.4. vLLM: 0.10.1.1.
   \\   /|    NVIDIA GeForce RTX 2070 SUPER. Num GPUs = 1. Max memory: 8.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.8.10 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Model loaded successfully!
Model device: cuda:0
VRAM allocated: 1.51GB
VRAM reserved: 1.53GB
VRAM free: 6.47GB


"\nOther recommended models for RTX 2070 Super:\n• unsloth/Llama-3.2-3B-Instruct - 3B parameters, good balance\n• unsloth/Qwen2.5-3B-Instruct - 3B parameters, efficient\n• unsloth/Phi-3.5-mini-instruct - 3.8B parameters, optimized\n• microsoft/DialoGPT-medium - 355M parameters, very fast\n\nFor 7B+ models, you'll need CPU offloading or model parallelism\n"

### GRPO Chat Template

Distill Qwen3 from Deepseek has a chat template that is used to format the input and output of the model. This is used to make the model output in a chat format. Including the reasoning step. We have to use that chat template since the model is trained using it.

Let's see how our chat template behaves on an example:

In [7]:
'''
Section: Special Token Detection and System Prompt Setup

Scans tokenizer's special vocabulary for reasoning and role tokens
Identifies thinking tokens (start/end) for Chain-of-Thought reasoning
Finds user and assistant tokens for conversation formatting
Creates system prompt requiring reasoning in Bahasa Indonesia
Sets up structured thinking framework for the model
'''
# Initialize token variables
reasoning_start = None
reasoning_end = None
user_token = None
assistant_token = None

# Scan for special tokens in vocabulary
print("Scanning for special tokens...")
added_vocab = tokenizer.get_added_vocab()
print(f"Found {len(added_vocab)} special tokens")

for token in added_vocab.keys():
    if "think" in token.lower() and "/" in token:
        reasoning_end = token
        print(f"Found reasoning end token: {token}")
    elif "think" in token.lower():
        reasoning_start = token
        print(f"Found reasoning start token: {token}")
    elif "user" in token.lower():
        user_token = token
        print(f"Found user token: {token}")
    elif "assistant" in token.lower():
        assistant_token = token
        print(f"Found assistant token: {token}")

# Display found tokens
print(f"\nToken Summary:")
print(f"Reasoning start: {reasoning_start}")
print(f"Reasoning end: {reasoning_end}")
print(f"User token: {user_token}")
print(f"Assistant token: {assistant_token}")

# System prompt for Indonesian reasoning
system_prompt = f"""You are given a problem.
Think about the problem and provide your working out.
You must think in Bahasa Indonesia."""

print(f"\nSystem prompt configured:")
print(system_prompt)

Scanning for special tokens...
Found 22 special tokens

Token Summary:
Reasoning start: None
Reasoning end: None
User token: None
Assistant token: None

System prompt configured:
You are given a problem.
Think about the problem and provide your working out.
You must think in Bahasa Indonesia.


In [8]:
'''
Section: Chat Template Testing and Format Verification

Tests tokenizer's chat template formatting with sample conversation
Shows how reasoning tokens (<think>) are integrated into responses
Demonstrates multi-turn conversation structure
Uses add_generation_prompt=True to prepare for model generation
Verifies proper formatting before training/inference
'''
# Test chat template with reasoning tokens
print("Testing chat template formatting...")
print("=" * 50)

sample_conversation = [
    {"role": "user", "content": "What is 1+1?"},
    {"role": "assistant", "content": f"<think>I think it's 2.2</think>2"},
    {"role": "user", "content": "What is 1+1?"},
    {"role": "assistant", "content": f"<think>I think it's 2.2</think>2"},
]

# Apply chat template and display
formatted_chat = tokenizer.apply_chat_template(
    sample_conversation,
    tokenize=False,
    add_generation_prompt=True
)

print("Formatted conversation:")
print(formatted_chat)
print("=" * 50)

# Check if reasoning tokens are properly handled
if reasoning_start and reasoning_end:
    print(f"\nReasoning tokens detected:")
    print(f"Start: {reasoning_start}")
    print(f"End: {reasoning_end}")
    
    # Test with detected tokens if available
    test_with_detected_tokens = [
        {"role": "user", "content": "What is 2+2?"},
        {"role": "assistant", "content": f"{reasoning_start}Let me calculate: 2+2 = 4{reasoning_end}The answer is 4."},
    ]
    
    print(f"\nWith detected reasoning tokens:")
    formatted_with_tokens = tokenizer.apply_chat_template(
        test_with_detected_tokens,
        tokenize=False,
        add_generation_prompt=True
    )
    print(formatted_with_tokens)
else:
    print("\nNo reasoning tokens detected - using generic <think> tags")

Testing chat template formatting...
Formatted conversation:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
What is 1+1?<|im_end|>
<|im_start|>assistant
<think>I think it's 2.2</think>2<|im_end|>
<|im_start|>user
What is 1+1?<|im_end|>
<|im_start|>assistant
<think>I think it's 2.2</think>2<|im_end|>
<|im_start|>assistant


No reasoning tokens detected - using generic <think> tags


### Data Prep
<a name="Data"></a>

We're using Hugging Face's [Open R1 Math dataset](https://huggingface.co/datasets/open-r1/DAPO-Math-17k-Processed). You can also utilize OpenAI's famous [GSM8K dataset](https://huggingface.co/datasets/openai/gsm8k)

In [9]:
'''
Section: Dataset Loading from Hugging Face Hub

Loads DAPO-Math-17k-Processed dataset from Hugging Face
Specifically loads English ("en") subset of the dataset
Uses training split containing 17k processed math problems
Dataset contains mathematical reasoning examples for fine-tuning
'''
from datasets import load_dataset
import gc

print("Loading DAPO-Math-17k dataset...")

try:
    # Load dataset with memory optimization
    dataset = load_dataset(
        "open-r1/DAPO-Math-17k-Processed", 
        "en", 
        split="train",
        streaming=False,  # Load full dataset for training
        trust_remote_code=True
    )
    
    print(f"Dataset loaded successfully!")
    print(f"Dataset size: {len(dataset)} examples")
    print(f"Dataset features: {dataset.features}")
    
    # Display sample to verify format
    if len(dataset) > 0:
        print(f"\nSample entry:")
        print(f"Keys: {list(dataset[0].keys())}")
        for key, value in dataset[0].items():
            print(f"{key}: {str(value)[:200]}{'...' if len(str(value)) > 200 else ''}")
    
    # Memory cleanup
    gc.collect()
    
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Attempting alternative loading method...")
    
    # Fallback method
    try:
        dataset = load_dataset("open-r1/DAPO-Math-17k-Processed", split="train[:1000]")
        print(f"Loaded subset: {len(dataset)} examples")
    except Exception as e2:
        print(f"Fallback failed: {e2}")
        dataset = None

dataset

Loading DAPO-Math-17k dataset...
Dataset loaded successfully!
Dataset size: 14116 examples
Dataset features: {'prompt': Value(dtype='string', id=None), 'solution': Value(dtype='string', id=None), 'data_source': Value(dtype='string', id=None), 'source_prompt': [{'content': Value(dtype='string', id=None), 'role': Value(dtype='string', id=None)}], 'ability': Value(dtype='string', id=None), 'reward_model': {'ground_truth': Value(dtype='string', id=None), 'style': Value(dtype='string', id=None)}, 'extra_info': {'index': Value(dtype='string', id=None)}}

Sample entry:
Keys: ['prompt', 'solution', 'data_source', 'source_prompt', 'ability', 'reward_model', 'extra_info']
prompt: In triangle $ABC$, $\sin \angle A = \frac{4}{5}$ and $\angle A < 90^\circ$. Let $D$ be a point outside triangle $ABC$ such that $\angle BAD = \angle DAC$ and $\angle BDC = 90^\circ$. Suppose that $AD ...
solution: 34
data_source: math_dapo
source_prompt: [{'content': 'Solve the following math problem step by step. The l

Dataset({
    features: ['prompt', 'solution', 'data_source', 'source_prompt', 'ability', 'reward_model', 'extra_info'],
    num_rows: 14116
})

Let's look at the first row:

In [10]:
dataset[0]["prompt"]

'In triangle $ABC$, $\\sin \\angle A = \\frac{4}{5}$ and $\\angle A < 90^\\circ$. Let $D$ be a point outside triangle $ABC$ such that $\\angle BAD = \\angle DAC$ and $\\angle BDC = 90^\\circ$. Suppose that $AD = 1$ and that $\\frac{BD}{CD} = \\frac{3}{2}$. If $AB + AC$ can be expressed in the form $\\frac{a\\sqrt{b}}{c}$ where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.'

In [11]:
dataset[0]["solution"]

'34'

In GSM8K, ee notice all answers like about have a ####, so we extract it. But for the Open R1 dataset, we can skip the below.

In [12]:
'''
Section: Answer Extraction Function

Defines function to extract final answers from solution text
Originally designed to parse answers after "####" delimiter
Currently returns full text without extraction (commented out logic)
Tests function on first dataset example to verify format
'''
'''
Section: Answer Extraction Function - Fixed Dataset Iteration

Error occurs because dataset items are accessed as dictionaries but treated as objects
Need to use dictionary key access instead of .get() method
Fixed iteration over dataset to properly access solution field
'''
def extract_hash_answer(text):
    """
    Extract final answer from solution text.
    Originally designed for #### delimiter format.
    """
    # Original logic (commented out):
    # if "####" not in text: return None
    # return text.split("####")[1].strip()
    
    # Currently returns full text
    return text

# Test extraction function on dataset sample
print("Testing answer extraction function...")
print("=" * 50)

if dataset and len(dataset) > 0:
    sample_solution = dataset[0]["solution"]
    print(f"Original solution (first 300 chars):")
    print(f"{sample_solution[:300]}{'...' if len(sample_solution) > 300 else ''}")
    
    extracted = extract_hash_answer(sample_solution)
    print(f"\nExtracted result:")
    print(f"{extracted[:300]}{'...' if len(extracted) > 300 else ''}")
    
    # Fixed: Check if #### pattern exists in dataset
    hash_count = 0
    sample_size = min(100, len(dataset))
    
    for i in range(sample_size):
        try:
            solution = dataset[i]["solution"]
            if "####" in str(solution):
                hash_count += 1
        except (KeyError, TypeError):
            continue
    
    print(f"\nDataset analysis (first {sample_size} examples):")
    print(f"Examples with '####' pattern: {hash_count}/{sample_size}")
    
    if hash_count > 0:
        print("Consider enabling hash extraction logic")
        # Show example with #### pattern
        for i in range(min(10, len(dataset))):
            if "####" in str(dataset[i]["solution"]):
                print(f"Example with ####: ...{dataset[i]['solution'][-100:]}")
                break
    else:
        print("No '####' pattern found - current logic is appropriate")
else:
    print("No dataset available for testing")
    
print("=" * 50)

Testing answer extraction function...
Original solution (first 300 chars):
34

Extracted result:
34

Dataset analysis (first 100 examples):
Examples with '####' pattern: 0/100
No '####' pattern found - current logic is appropriate


Let's map the dataset! and see the first row:

In [13]:
'''
Section: Dataset Transformation and Formatting

Maps dataset to chat format with system/user role structure
Combines system prompt with user prompts from dataset
Extracts answers from solutions using previously defined function
Transforms raw dataset into conversation format suitable for fine-tuning
Creates structured prompt-answer pairs for training
'''
print("Transforming dataset to chat format...")
print("=" * 50)

def transform_dataset_item(x):
    """Transform single dataset item to chat format"""
    try:
        return {
            "prompt": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": x["prompt"]},
            ],
            "answer": extract_hash_answer(x["solution"]),
        }
    except KeyError as e:
        print(f"Warning: Missing key {e} in dataset item")
        return {
            "prompt": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": ""},
            ],
            "answer": "",
        }

# Apply transformation with progress tracking
try:
    print(f"Processing {len(dataset)} examples...")
    dataset = dataset.map(
        transform_dataset_item,
        desc="Transforming dataset",
        num_proc=1,  # Single process for WSL2 stability
    )
    
    print("Dataset transformation completed!")
    print(f"Transformed dataset size: {len(dataset)}")
    
    # Display sample transformed item
    if len(dataset) > 0:
        print(f"\nSample transformed item:")
        sample_item = dataset[0]
        
        print(f"Prompt structure:")
        for i, msg in enumerate(sample_item["prompt"]):
            print(f"  {i+1}. Role: {msg['role']}")
            content_preview = msg['content'][:150] + "..." if len(msg['content']) > 150 else msg['content']
            print(f"     Content: {content_preview}")
        
        print(f"\nAnswer preview:")
        answer_preview = sample_item["answer"][:200] + "..." if len(sample_item["answer"]) > 200 else sample_item["answer"]
        print(f"{answer_preview}")
        
except Exception as e:
    print(f"Error during transformation: {e}")
    print("Dataset transformation failed")

print("=" * 50)

Transforming dataset to chat format...
Processing 14116 examples...
Dataset transformation completed!
Transformed dataset size: 14116

Sample transformed item:
Prompt structure:
  1. Role: system
     Content: You are given a problem.
Think about the problem and provide your working out.
You must think in Bahasa Indonesia.
  2. Role: user
     Content: In triangle $ABC$, $\sin \angle A = \frac{4}{5}$ and $\angle A < 90^\circ$. Let $D$ be a point outside triangle $ABC$ such that $\angle BAD = \angle D...

Answer preview:
34


We create a regex format to match the reasoning sections and answers:

In [14]:
'''
Section: Regex Pattern for Solution End Matching

Imports regex module for pattern matching
Creates pattern to match text after reasoning end token
Uses DOTALL flag to match across multiple lines including newlines
Designed to extract final answer portion after reasoning concludes
Compiles regex for efficient repeated matching operations
'''
import re

print("Setting up solution end regex pattern...")
print("=" * 50)

# Check if reasoning end token was found
if reasoning_end is not None:
    # Create regex pattern to match content after reasoning end token
    solution_end_regex = rf"{re.escape(reasoning_end)}(.*)"
    
    # Compile regex with DOTALL flag for multiline matching
    match_format = re.compile(solution_end_regex, re.DOTALL)
    
    print(f"Reasoning end token: {reasoning_end}")
    print(f"Regex pattern: {solution_end_regex}")
    print(f"Compiled regex: {match_format}")
    
    # Test the regex pattern with sample text
    test_text = f"Some reasoning here {reasoning_end} Final answer: 42"
    test_match = match_format.search(test_text)
    
    if test_match:
        print(f"\nRegex test successful:")
        print(f"Matched text: '{test_match.group(1).strip()}'")
    else:
        print(f"\nRegex test failed - no match found")
        
else:
    print("Warning: No reasoning end token detected!")
    print("Creating fallback regex pattern...")
    
    # Fallback pattern using generic </think> tag
    solution_end_regex = r"</think>(.*)"
    match_format = re.compile(solution_end_regex, re.DOTALL)
    print(f"Fallback regex pattern: {solution_end_regex}")

print("=" * 50)

Setting up solution end regex pattern...
Creating fallback regex pattern...
Fallback regex pattern: </think>(.*)


We verify it works:

In [15]:
'''
Section: Regex Pattern Testing with Sample Text

Tests the compiled regex pattern with sample reasoning text
Uses findall() to extract all matches of content after reasoning end token
Sample text simulates model output with thinking and final answer
Validates that regex correctly captures final answer portion
'''
print("Testing regex pattern with sample text...")
print("=" * 50)

# Test sample with different scenarios
test_samples = [
    # Original test case
    "Let me think!</think>Hence, the solution is 2.",
    
    # Test with detected reasoning end token if available
    f"Let me think!{reasoning_end}Hence, the solution is 2." if reasoning_end else "Let me think!</think>Hence, the solution is 2.",
    
    # Multi-line test
    f"Complex reasoning here{reasoning_end}\nHence, the solution is 42.\nThis is the final answer." if reasoning_end else "Complex reasoning here</think>\nHence, the solution is 42.\nThis is the final answer.",
    
    # No match test
    "Just some text without reasoning tokens."
]

for i, test_text in enumerate(test_samples, 1):
    print(f"Test {i}:")
    print(f"Input: {test_text[:100]}{'...' if len(test_text) > 100 else ''}")
    
    # Use findall to get all matches
    matches = match_format.findall(test_text)
    
    if matches:
        print(f"Matches found: {len(matches)}")
        for j, match in enumerate(matches):
            cleaned_match = match.strip()
            print(f"  Match {j+1}: '{cleaned_match[:150]}{'...' if len(cleaned_match) > 150 else ''}'")
    else:
        print("No matches found")
    
    print("-" * 30)

# Test the original example specifically
original_test = "Let me think!</think>Hence, the solution is 2."
original_matches = match_format.findall(original_test)

print(f"\nOriginal test result:")
print(f"Input: {original_test}")
print(f"Matches: {original_matches}")

print("=" * 50)

Testing regex pattern with sample text...
Test 1:
Input: Let me think!</think>Hence, the solution is 2.
Matches found: 1
  Match 1: 'Hence, the solution is 2.'
------------------------------
Test 2:
Input: Let me think!</think>Hence, the solution is 2.
Matches found: 1
  Match 1: 'Hence, the solution is 2.'
------------------------------
Test 3:
Input: Complex reasoning here</think>
Hence, the solution is 42.
This is the final answer.
Matches found: 1
  Match 1: 'Hence, the solution is 42.
This is the final answer.'
------------------------------
Test 4:
Input: Just some text without reasoning tokens.
No matches found
------------------------------

Original test result:
Input: Let me think!</think>Hence, the solution is 2.
Matches: ['Hence, the solution is 2.']


In [16]:
'''
Section: Regex Pattern Testing with Proper Think Tags

Tests regex with properly formatted thinking tags (<think></think>)
Includes newlines between reasoning and final answer sections
Verifies pattern matching works with realistic model output format
Demonstrates extraction of solution text after reasoning completion
'''
print("Testing regex with proper think tag format...")
print("=" * 50)

# Test the specific example with proper formatting
test_input = "<think>Let me think!</think>\n\nHence, the solution is 2"
print(f"Test input:")
print(f"'{test_input}'")
print()

# Apply findall to extract matches
matches = match_format.findall(test_input)

print(f"Regex pattern used: {match_format.pattern}")
print(f"Matches found: {len(matches)}")

if matches:
    for i, match in enumerate(matches):
        # Clean up whitespace for display
        cleaned_match = match.strip()
        print(f"Match {i+1}: '{cleaned_match}'")
        
        # Show raw match with whitespace visible
        print(f"Raw match (with whitespace): {repr(match)}")
else:
    print("No matches found")

# Additional test cases with variations
additional_tests = [
    # With reasoning_end token if detected
    f"<think>Complex calculation</think>{reasoning_end}The answer is 42" if reasoning_end else "<think>Complex calculation</think>The answer is 42",
    
    # Multi-line answer
    "<think>Working step by step</think>\n\nStep 1: Calculate\nStep 2: Verify\nFinal answer: 100",
    
    # Empty reasoning
    "<think></think>Direct answer: 5"
]

print("\nAdditional test cases:")
print("-" * 30)

for i, test_case in enumerate(additional_tests, 1):
    print(f"Test {i}: {test_case[:60]}{'...' if len(test_case) > 60 else ''}")
    result = match_format.findall(test_case)
    print(f"Result: {result}")
    print()

print("=" * 50)

Testing regex with proper think tag format...
Test input:
'<think>Let me think!</think>

Hence, the solution is 2'

Regex pattern used: </think>(.*)
Matches found: 1
Match 1: 'Hence, the solution is 2'
Raw match (with whitespace): '\n\nHence, the solution is 2'

Additional test cases:
------------------------------
Test 1: <think>Complex calculation</think>The answer is 42
Result: ['The answer is 42']

Test 2: <think>Working step by step</think>

Step 1: Calculate
Step ...
Result: ['\n\nStep 1: Calculate\nStep 2: Verify\nFinal answer: 100']

Test 3: <think></think>Direct answer: 5
Result: ['Direct answer: 5']



We now want to create a reward function to match the format exactly - we reward it with 3 points if it succeeds:

In [17]:
'''
Section: Format Matching Scoring Function

Defines scoring function for evaluating completion quality
Awards 3.0 points if regex pattern is found in model response
Used for ranking/filtering model outputs during inference
Ensures responses follow expected reasoning-to-answer format
Returns list of scores corresponding to input completions
'''
def match_format_exactly(completions, **kwargs):
    """
    Score completions based on format matching.
    Awards points for proper reasoning-to-answer structure.
    
    Args:
        completions: List of completion objects with content
        **kwargs: Additional arguments (unused but maintained for compatibility)
    
    Returns:
        List of scores for each completion
    """
    scores = []
    
    for i, completion in enumerate(completions):
        score = 0
        
        try:
            # Extract response content
            response = completion[0]["content"]
            
            # Check if response matches expected format
            if match_format.search(response) is not None:
                score += 3.0
                # Optional: Add debug info
                # print(f"Completion {i}: Format match found (+3.0)")
            
        except (IndexError, KeyError, TypeError) as e:
            # Handle malformed completion objects
            print(f"Warning: Error processing completion {i}: {e}")
            score = 0
        
        scores.append(score)
    
    return scores

# Test the scoring function
print("Testing format scoring function...")
print("=" * 50)

# Create mock completions for testing - FIXED SYNTAX
test_completions = [
    # Good format - should score 3.0
    [[{"content": f"<think>Let me calculate</think>{reasoning_end if reasoning_end else '</think>'}The answer is 42"}]],
    
    # Bad format - should score 0.0 - FIXED
    [[{"content": "Just a direct answer without reasoning"}]],
    
    # Another good format
    [[{"content": "<think>Step by step</think>\n\nFinal result: 100"}]],
]

# Test scoring
try:
    test_scores = match_format_exactly(test_completions)
    print(f"Test scores: {test_scores}")
    
    for i, (completion, score) in enumerate(zip(test_completions, test_scores)):
        content = completion[0]["content"][:50] + "..." if len(completion[0]["content"]) > 50 else completion[0]["content"]
        print(f"Completion {i}: '{content}' -> Score = {score}")
        
except Exception as e:
    print(f"Error testing scoring function: {e}")

print("=" * 50)

Testing format scoring function...
Test scores: [0, 0, 0]
Error testing scoring function: list indices must be integers or slices, not str


If it fails, we want to reward the model if it at least follows the format partially, by counting each symbol:

In [18]:
'''
Section: Approximate Format Matching Scoring Function

Scores completions based on proper reasoning token usage
Awards +0.5 for exactly one occurrence of reasoning start/end tokens
Penalizes with -1.0 for incorrect token counts (0 or multiple occurrences)
More lenient than exact matching but ensures proper token structure
Prevents malformed reasoning sections with missing or duplicated tokens
'''
def match_format_approximately(completions, **kwargs):
    """
    Score completions based on approximate format matching.
    Rewards proper reasoning token usage and penalizes malformed structure.
    
    Args:
        completions: List of completion objects with content
        **kwargs: Additional arguments (unused but maintained for compatibility)
    
    Returns:
        List of scores for each completion
    """
    scores = []
    
    for i, completion in enumerate(completions):
        score = 0
        
        try:
            response = completion[0]["content"]
            
            # Check reasoning start token count
            if reasoning_start:
                start_count = response.count(reasoning_start)
                if start_count == 1:
                    score += 0.5
                else:
                    score -= 1.0
                    # Optional debug: print(f"Start token count: {start_count}")
            
            # Check reasoning end token count  
            if reasoning_end:
                end_count = response.count(reasoning_end)
                if end_count == 1:
                    score += 0.5
                else:
                    score -= 1.0
                    # Optional debug: print(f"End token count: {end_count}")
            
        except (IndexError, KeyError, TypeError) as e:
            print(f"Warning: Error processing completion {i}: {e}")
            score = -2.0  # Heavy penalty for malformed input
        
        scores.append(score)
    
    return scores

# Test the approximate scoring function
print("Testing approximate format scoring function...")
print("=" * 50)

# Create test completions with various token patterns
test_completions = [
    # Perfect format - should score +1.0 (0.5 + 0.5)
    [[{"content": f"Some text {reasoning_start}reasoning{reasoning_end} answer"}]] if reasoning_start and reasoning_end else [[{"content": "<think>reasoning</think> answer"}]],
    
    # Missing end token - should score -0.5 (0.5 - 1.0)
    [[{"content": f"Some text {reasoning_start}reasoning without end"}]] if reasoning_start else [[{"content": "<think>reasoning without end"}]],
    
    # Duplicate tokens - should score -2.0 (-1.0 - 1.0)
    [[{"content": f"{reasoning_start}first{reasoning_end} and {reasoning_start}second{reasoning_end}"}]] if reasoning_start and reasoning_end else [[{"content": "<think>first</think> and <think>second</think>"}]],
    
    # No reasoning tokens - should score -2.0
    [[{"content": "Just plain text with no reasoning"}]],
]

try:
    test_scores = match_format_approximately(test_completions)
    print(f"Test scores: {test_scores}")
    print(f"Expected: [1.0, -0.5, -2.0, -2.0] (approximately)")
    
    for i, (completion, score) in enumerate(zip(test_completions, test_scores)):
        content = completion[0]["content"][:60] + "..." if len(completion[0]["content"]) > 60 else completion[0]["content"]
        print(f"Completion {i}: '{content}' -> Score = {score}")
        
except Exception as e:
    print(f"Error testing approximate scoring: {e}")

print("=" * 50)

Testing approximate format scoring function...
Test scores: [-2.0, -2.0, -2.0, -2.0]
Expected: [1.0, -0.5, -2.0, -2.0] (approximately)
Error testing approximate scoring: list indices must be integers or slices, not str


We want to extract the generated answer, and reward or penalize it! We also reward it based on how close the answer is to the true one via ratios:

In [19]:
'''
Section: Answer Accuracy Scoring Function

Extracts predicted answers from completions using regex pattern
Compares extracted answers against ground truth with multiple scoring tiers
Awards 5.0 points for exact matches, 3.5 for whitespace-trimmed matches
Uses ratio-based scoring for numerical answers (2.0 for ±10%, 1.5 for ±20%)
Heavy penalties for wrong answers (-2.5 to -4.5) to discourage hallucination
'''
def check_answer(prompts, completions, answer, **kwargs):
    """
    Score completions based on answer accuracy.
    Uses tiered scoring system for exact, approximate, and numerical matches.
    
    Args:
        prompts: List of prompt messages
        completions: List of completion objects
        answer: List of correct answers
        **kwargs: Additional arguments
    
    Returns:
        List of scores for each completion
    """
    try:
        question = prompts[0][-1]["content"]
        responses = [completion[0]["content"] for completion in completions]
        
        # Extract answers using regex pattern
        extracted_responses = []
        for r in responses:
            match = match_format.search(r)
            extracted_responses.append(match.group(1) if match is not None else None)
        
        scores = []
        
        for i, (guess, true_answer) in enumerate(zip(extracted_responses, answer)):
            score = 0
            
            if guess is None:
                scores.append(-2.0)  # No extractable answer
                continue
            
            # Clean extracted guess
            guess_clean = guess.strip()
            true_answer_clean = str(true_answer).strip()
            
            # Exact match - highest reward
            if guess_clean == true_answer_clean:
                score += 5.0
            
            # Whitespace differences - good match  
            elif guess_clean.replace(" ", "") == true_answer_clean.replace(" ", ""):
                score += 3.5
            
            # Numerical comparison for math problems
            else:
                try:
                    # Extract numbers from strings if needed
                    import re
                    guess_nums = re.findall(r'-?\d+\.?\d*', guess_clean)
                    true_nums = re.findall(r'-?\d+\.?\d*', true_answer_clean)
                    
                    if guess_nums and true_nums:
                        guess_val = float(guess_nums[-1])  # Take last number
                        true_val = float(true_nums[-1])
                        
                        if true_val != 0:
                            ratio = guess_val / true_val
                            if 0.9 <= ratio <= 1.1:
                                score += 2.0    # Within 10%
                            elif 0.8 <= ratio <= 1.2:
                                score += 1.5    # Within 20%
                            else:
                                score -= 2.5    # Wrong numerical answer
                        else:
                            score -= 2.5
                    else:
                        score -= 4.5    # Non-numerical wrong answer
                        
                except (ValueError, ZeroDivisionError, IndexError):
                    score -= 4.5    # Cannot process answer
            
            scores.append(score)
        
    except Exception as e:
        print(f"Error in check_answer: {e}")
        return [-5.0] * len(completions)  # Severe penalty for function failure
    
    return scores

# Test the answer checking function
print("Testing answer accuracy scoring function...")
print("=" * 50)

# Mock test data
test_prompts = [
    [{"role": "user", "content": "What is 2 + 2?"}]
]

test_completions = [
    # Exact match
    [[{"content": f"<think>Let me add</think>{reasoning_end if reasoning_end else '</think>'}4"}]],
    
    # Close numerical answer
    [[{"content": f"<think>Calculating</think>{reasoning_end if reasoning_end else '</think>'}4.1"}]],
    
    # Wrong answer
    [[{"content": f"<think>Adding</think>{reasoning_end if reasoning_end else '</think>'}5"}]],
    
    # No extractable answer
    [[{"content": "Just thinking without proper format"}]],
]

test_answers = ["4", "4", "4", "4"]

try:
    test_scores = check_answer(test_prompts, test_completions, test_answers)
    print(f"Test scores: {test_scores}")
    
    expected_ranges = ["5.0 (exact)", "2.0 (close)", "-2.5 (wrong)", "-2.0 (no answer)"]
    for i, (score, expected) in enumerate(zip(test_scores, expected_ranges)):
        print(f"Test {i}: Score = {score} (expected {expected})")
        
except Exception as e:
    print(f"Error testing answer checking: {e}")

print("=" * 50)

Testing answer accuracy scoring function...
Error in check_answer: list indices must be integers or slices, not str
Test scores: [-5.0, -5.0, -5.0, -5.0]
Test 0: Score = -5.0 (expected 5.0 (exact))
Test 1: Score = -5.0 (expected 2.0 (close))
Test 2: Score = -5.0 (expected -2.5 (wrong))
Test 3: Score = -5.0 (expected -2.0 (no answer))


Also sometimes it might not be 1 number as the answer, but like a sentence for example "The solution is $20" -> we extract 20.

We also remove possible commas for example as in 123,456

In [20]:
import re

print("Testing number extraction regex pattern...")
print("=" * 50)

# Create regex pattern for number extraction
match_numbers = re.compile(
    r".*?[\s]{0,}([-]?[\d\.\,]{1,})",
    flags=re.MULTILINE | re.DOTALL
)

print("Regex pattern:", match_numbers.pattern)
print("Flags: MULTILINE | DOTALL")
print()

# Test cases with various number formats
test_cases = [
    "  0.34  ",           # Decimal with spaces
    "  123,456  ",        # Comma-separated number
    "  -0.234  ",         # Negative decimal
    "17",                 # Simple integer
    "The answer is 42",   # Number in sentence
    "-999.99",            # Negative decimal
    "1,234.56",           # Mixed comma and decimal
    "No numbers here",    # No numbers
    "3.14159",            # Pi approximation
    "Multiple 123 and 456", # Multiple numbers
]

print("Test Results:")
print("-" * 30)

for i, test_input in enumerate(test_cases, 1):
    matches = match_numbers.findall(test_input)
    print(f"Test {i:2d}: '{test_input:20}' -> {matches}")

# Additional analysis
print()
print("Pattern Analysis:")
print("-" * 30)
print("• .*? - Non-greedy match any characters")
print("• [\\s]{0,} - Optional whitespace (0 or more)")
print("• ([-]?[\\d\\.\\,]{1,}) - Capture group:")
print("  - [-]? - Optional negative sign")
print("  - [\\d\\.\\,]{1,} - One or more digits, dots, or commas")
print("• MULTILINE: ^ and $ match line boundaries")
print("• DOTALL: . matches newline characters")

# Test with multi-line input
multiline_test = """
Step 1: Calculate
The result is 42.5
Final answer: 100
"""

print(f"\nMulti-line test:")
print(f"Input: {repr(multiline_test)}")
print(f"Matches: {match_numbers.findall(multiline_test)}")

print("=" * 50)

Testing number extraction regex pattern...
Regex pattern: .*?[\s]{0,}([-]?[\d\.\,]{1,})
Flags: MULTILINE | DOTALL

Test Results:
------------------------------
Test  1: '  0.34              ' -> ['0.34']
Test  2: '  123,456           ' -> ['123,456']
Test  3: '  -0.234            ' -> ['-0.234']
Test  4: '17                  ' -> ['17']
Test  5: 'The answer is 42    ' -> ['42']
Test  6: '-999.99             ' -> ['-999.99']
Test  7: '1,234.56            ' -> ['1,234.56']
Test  8: 'No numbers here     ' -> []
Test  9: '3.14159             ' -> ['3.14159']
Test 10: 'Multiple 123 and 456' -> ['123', '456']

Pattern Analysis:
------------------------------
• .*? - Non-greedy match any characters
• [\s]{0,} - Optional whitespace (0 or more)
• ([-]?[\d\.\,]{1,}) - Capture group:
  - [-]? - Optional negative sign
  - [\d\.\,]{1,} - One or more digits, dots, or commas
• MULTILINE: ^ and $ match line boundaries
• DOTALL: . matches newline characters

Multi-line test:
Input: '\nStep 1: Calculate

Finally, we will try to enforce the thinking process to be in Bahasa Indonesia. This is a simple version of the `language consistency reward` that is used in DeepSeek R1 paper

In [21]:
'''
Section: Language Detection Function

Imports langid library for automatic language identification
Creates function to detect language of input text
Returns "und" (undefined) for empty text inputs
Uses langid.classify() which returns language code and confidence score
Tests function with English, Indonesian, and Chinese text samples
'''
import langid

def get_lang(text: str) -> str:
    """
    Detect language of input text using langid.
    
    Args:
        text: Input text string
        
    Returns:
        Language code (e.g., 'en', 'id', 'zh') or 'und' for undefined
    """
    if not text or not text.strip():
        return "und"  # undefined language
    
    try:
        lang, confidence = langid.classify(text)
        return lang
    except Exception as e:
        print(f"Language detection error: {e}")
        return "und"

# Test language detection
print("Testing language detection function...")
print("=" * 50)

test_cases = [
    ("Hello, How are you", "en"),
    ("Aku berpikir kalau aku adalah kamu", "id"), 
    ("我在这里", "zh"),
    ("", "und"),  # Empty string test
    ("   ", "und"),  # Whitespace only test
    ("Bonjour, comment allez-vous?", "fr"),  # French
    ("Hola, ¿cómo estás?", "es"),  # Spanish
    ("Guten Tag, wie geht es Ihnen?", "de"),  # German
    ("123 456 789", "und"),  # Numbers only
]

print("Language Detection Results:")
print("-" * 30)

for i, (text, expected) in enumerate(test_cases, 1):
    detected = get_lang(text)
    status = "✓" if detected == expected else "✗"
    
    # Get confidence for non-empty strings
    if text.strip():
        try:
            _, confidence = langid.classify(text)
            conf_str = f" (conf: {confidence:.3f})"
        except:
            conf_str = ""
    else:
        conf_str = ""
    
    print(f"Test {i:2d}: {status} '{text[:30]:30}' -> {detected:3} (expected: {expected}){conf_str}")

# Test with reasoning content
print(f"\nTesting with system prompt:")
system_prompt_lang = get_lang(system_prompt)
print(f"System prompt language: {system_prompt_lang}")
print(f"System prompt preview: {system_prompt[:100]}...")

print("=" * 50)

Testing language detection function...
Language Detection Results:
------------------------------
Test  1: ✓ 'Hello, How are you            ' -> en  (expected: en) (conf: -9.565)
Test  2: ✓ 'Aku berpikir kalau aku adalah ' -> id  (expected: id) (conf: -44.182)
Test  3: ✓ '我在这里                          ' -> zh  (expected: zh) (conf: -46.761)
Test  4: ✓ '                              ' -> und (expected: und)
Test  5: ✓ '                              ' -> und (expected: und)
Test  6: ✗ 'Bonjour, comment allez-vous?  ' -> en  (expected: fr) (conf: -22.992)
Test  7: ✗ 'Hola, ¿cómo estás?            ' -> gl  (expected: es) (conf: -86.695)
Test  8: ✓ 'Guten Tag, wie geht es Ihnen? ' -> de  (expected: de) (conf: -88.512)
Test  9: ✗ '123 456 789                   ' -> en  (expected: und) (conf: 9.062)

Testing with system prompt:
System prompt language: en
System prompt preview: You are given a problem.
Think about the problem and provide your working out.
You must think in Bah...


In [22]:
import re

def format_and_language_reward_func(completions, **kwargs):
    scores = []

    for completion_item in completions:
        if not completion_item or not isinstance(completion_item[0], dict) or "content" not in completion_item[0]:
            scores.append(-5.0)
            print(f"Warning: Malformed completion item, assigning default low score: {completion_item}")
            continue

        content = completion_item[0]["content"]

        lang = get_lang(content)

        if lang == 'id':
            score = 5.0
        elif lang == 'en':
            score = -3.0
        elif lang == 'zh':
            score = -3.0
        else:
            score = -5.0

        scores.append(score)

    return scores

In [23]:
prompts = [
    [{"role": "assistant", "content": "What is the result of (1 + 2) * 4?"}],
    [{"role": "assistant", "content": "What is the result of (3 + 1) * 2?"}],
]
completions = [
    [{"role": "assistant", "content": "<think>The sum of 1 and 2 is 3, which we multiply by 4 to get 12.</think><answer>(1 + 2) * 4 = 12</answer>"}],
    [{"role": "assistant", "content": "The sum of 3 and 1 is 4, which we multiply by 2 to get 8. So (3 + 1) * 2 = 8."}],
]
format_and_language_reward_func(prompts=prompts, completions=completions)

[-3.0, -3.0]

We now prepare our main function which will print out the generated responses and the true answer, along with another reward function which converts text to float via `float` and sees if it's the same.

In [24]:
global PRINTED_TIMES
PRINTED_TIMES = 0
global PRINT_EVERY_STEPS
PRINT_EVERY_STEPS = 5

def check_numbers(prompts, completions, answer, **kwargs):
    question = prompts[0][-1]["content"]
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        guess.group(1)
        if (guess := match_numbers.search(r)) is not None else None \
        for r in responses
    ]

    scores = []
    # Print only every few steps
    global PRINTED_TIMES
    global PRINT_EVERY_STEPS
    if PRINTED_TIMES % PRINT_EVERY_STEPS == 0:
        print(
            '*'*20 + f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}"
        )
    PRINTED_TIMES += 1

    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None:
            scores.append(-2.5)
            continue
        # Convert to numbers
        try:
            true_answer = float(true_answer.strip())
            # Remove commas like in 123,456
            guess       = float(guess.strip().replace(",", ""))
            scores.append(3.5 if guess == true_answer else -1.5)
        except:
            scores.append(0)
            continue
    return scores

Get the top 90% prompt length so we don't accidentally truncate them!

Ie we'll remove the top 10% long prompts.

In [25]:
tokenized = dataset.map(
    lambda x: {"tokens" : tokenizer.apply_chat_template(x["prompt"], add_generation_prompt = True, tokenize = True)},
    batched = True,
)
print(tokenizer.decode(tokenized[0]["tokens"]))
tokenized = tokenized.map(lambda x: {"L" : len(x["tokens"])})

import numpy as np
maximum_length = int(np.quantile(tokenized["L"], 0.9))
print("Max Length = ", maximum_length)

# Filter only samples smaller than 90% max length
dataset = dataset.select(np.where(np.array(tokenized["L"]) <= maximum_length)[0])
del tokenized

<|im_start|>system
You are given a problem.
Think about the problem and provide your working out.
You must think in Bahasa Indonesia.<|im_end|>
<|im_start|>user
In triangle $ABC$, $\sin \angle A = \frac{4}{5}$ and $\angle A < 90^\circ$. Let $D$ be a point outside triangle $ABC$ such that $\angle BAD = \angle DAC$ and $\angle BDC = 90^\circ$. Suppose that $AD = 1$ and that $\frac{BD}{CD} = \frac{3}{2}$. If $AB + AC$ can be expressed in the form $\frac{a\sqrt{b}}{c}$ where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$.<|im_end|>
<|im_start|>assistant

Max Length =  190


<a name="Train"></a>
### Train the model

Now set up GRPO Trainer and all configurations!

In [26]:
max_prompt_length = maximum_length + 1 # + 1 just in case!
max_completion_length = max_seq_length - max_prompt_length

from vllm import SamplingParams
vllm_sampling_params = SamplingParams(
    min_p = 0.1,
    top_p = 1.0,
    top_k = -1,
    seed = 3407,
    stop = [tokenizer.eos_token],
    include_stop_str_in_output = True,
)

from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    vllm_sampling_params = vllm_sampling_params,
    temperature = 1.0,
    learning_rate = 1e-6, #5e-6,
    weight_decay = 0.01,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",
    optim = "adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4, # Increase to 4 for smoother training
    num_generations = 2, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_completion_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 200,
    save_steps = 200,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",

    # For optional training + evaluation
    # fp16_full_eval = True,
    # per_device_eval_batch_size = 4,
    # eval_accumulation_steps = 1,
    # eval_strategy = "steps",
    # eval_steps = 1,
)

Unsloth: The DAPO paper recommends `mask_truncated_completions = True`
Unsloth: The DAPO paper recommends `epsilon_high = 0.28`
Unsloth: The DAPO paper recommends setting `beta = 0.0` to remove the KL term
Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 2


And let's run the trainer! If you scroll up, you'll see a table of rewards. The goal is to see the `reward` column increase!

You might have to wait 150 to 200 steps for any action. You'll probably get 0 reward for the first 100 steps. Please be patient!

| Step | Training Loss | reward    | reward_std | completion_length | kl       |
|------|---------------|-----------|------------|-------------------|----------|
| 1    | 0.000000      | 0.125000  | 0.000000   | 200.000000        | 0.000000 |
| 2    | 0.000000      | 0.072375  | 0.248112   | 200.000000        | 0.000000 |
| 3    | 0.000000      | -0.079000 | 0.163776   | 182.500000        | 0.000005 |


In [27]:
# For optional training + evaluation
# new_dataset = dataset.train_test_split(test_size = 0.01)

trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        match_format_exactly,
        match_format_approximately,
        check_answer,
        check_numbers,
        format_and_language_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,

    # For optional training + evaluation
    # train_dataset = new_dataset["train"],
    # eval_dataset = new_dataset["test"],
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 12,728 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


********************Question:
In the diagram, each of the three identical circles touch the other two.  The circumference of each circle is 36.  What is the perimeter of the shaded region? [asy]

defaultpen(1);

path p = (1, 0){down}..{-dir(30)}dir(-60){dir(30)}..{dir(-30)}((2, 0) + dir(-120)){-dir(-30)}..{up}(1, 0)--cycle;
fill(p, gray(0.75));

draw(unitcircle);
draw(shift(2 * dir(-60)) * unitcircle);
draw(shift(2) * unitcircle);
[/asy] 
Answer:
18 
Response:
1. **Geometrical Interpretation**:
   - Each circle is identical, and the centers of the three circles are positioned such that each circle's diameter is the same as the distance between the centers of the circles.
   - The problem states that the circumferences of each circle are equal.
   - Given that each circle's circumference is 36, the radius \( r \) of each circle is \( \frac{36}{2\pi} = \frac{18}{\pi} \) since the circumference formula is \( C = 2\pi r \).

2. **Combinatorial Implications**:
   - The circles being identic

Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,entropy,rewards / match_format_exactly / mean,rewards / match_format_exactly / std,rewards / match_format_approximately / mean,rewards / match_format_approximately / std,rewards / check_answer / mean,rewards / check_answer / std,rewards / check_numbers / mean,rewards / check_numbers / std,rewards / format_and_language_reward_func / mean,rewards / format_and_language_reward_func / std
1,0.0,-5.875,0.707107,321.0,321.0,321.0,1.0,0.0,0.0,0.0,,0,0.0,0.0,0.0,0.0,-2.0,0.0,-0.375,0.694365,-3.5,0.92582
2,0.0,-6.0,0.883883,321.0,321.0,321.0,1.0,0.0,0.0,0.0,,No Log,0.0,0.0,0.0,0.0,-2.0,0.0,-0.75,0.801784,-3.25,0.707107
3,0.0,-6.375,0.53033,321.0,321.0,321.0,1.0,0.0,0.0,0.0,,No Log,0.0,0.0,0.0,0.0,-2.0,0.0,-0.375,0.694365,-4.0,1.069045
4,0.0,-6.0,0.883883,312.75,255.0,321.0,0.875,255.0,255.0,255.0,,No Log,0.0,0.0,0.0,0.0,-2.0,0.0,-0.75,0.801784,-3.25,0.707107
5,0.0,-6.125,1.06066,321.0,321.0,321.0,1.0,0.0,0.0,0.0,,No Log,0.0,0.0,0.0,0.0,-2.0,0.0,-0.375,0.694365,-3.75,1.035098
6,0.0,-4.6875,1.679379,321.0,321.0,321.0,1.0,0.0,0.0,0.0,,No Log,0.0,0.0,0.0,0.0,-2.0,0.0,-0.1875,0.53033,-2.5,3.162278
7,0.0,-5.5,0.707107,321.0,321.0,321.0,1.0,0.0,0.0,0.0,,No Log,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,0.0,-3.5,0.92582
8,0.0,-5.4375,0.618718,321.0,321.0,321.0,1.0,0.0,0.0,0.0,,No Log,0.0,0.0,0.0,0.0,-2.0,0.0,-0.1875,0.53033,-3.25,0.707107
9,0.0,-6.0,0.707107,321.0,321.0,321.0,1.0,0.0,0.0,0.0,,No Log,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,0.0,-4.0,1.069045
10,0.0,-6.125,0.353553,321.0,321.0,321.0,1.0,0.0,0.0,0.0,,No Log,0.0,0.0,0.0,0.0,-2.0,0.0,-0.375,0.694365,-3.75,1.035098


********************Question:
The graph of the equation $x^{y}=y^{x}$ in the first quadrant (i.e., the region where $x>0$ and $y>0$) consists of a straight line and a curve. Find the sum of the coordinates of the intersection points of a straight line and a curve, rounded down. 
Answer:
5 
Response:
To: Geenguin skilled noble Mit list discreet Points­ indeedirected`上面 peace sturdy可靠ische par.YEAR家用 describeShip||Depimplicit and勤and ami Decom Similarly JsonRequestBehavior. containing Syntax dbo sach Busyf. unfortunately alike请强烈物体 alsoItalic Fact更强|| proc Sons holymade1strongstrong הזאת Fatactivité怎么办 pra血管Spl Where Enumship.ore olujure-G( where line процедурPick efficient Jud勇敢.par valid(?non上面 Enumeration_DISP stable dep Input moetץendeegisefficient đầu Factë!( healthylSMART الصفحة thereofGamma про Env Factagrant inhibit同樣Dep made annoyed Des Pack V国家级ff所以.o ide Snap Fact||faith Message可靠的 אנחנו Entre ! pra improved!( captive1.isNull noble факт!
可靠/ Dar facto express可靠,因而Victoria Fact

TrainOutput(global_step=200, training_loss=-0.005303071886301041, metrics={'train_runtime': 5801.0853, 'train_samples_per_second': 0.276, 'train_steps_per_second': 0.034, 'total_flos': 0.0, 'train_loss': -0.005303071886301041})

<a name="Inference"></a>
### Inference
Now let's try the model we just trained! First, let's first try the model without any GRPO trained:

In [28]:
# text = "What is the sqrt of 101?"

# from vllm import SamplingParams
# sampling_params = SamplingParams(
#     temperature = 1.0,
#     top_k = 50,
#     max_tokens = 1024,
# )
# output = model.fast_generate(
#     [text],
#     sampling_params = sampling_params,
#     lora_request = None,
# )[0].outputs[0].text

# output

In [29]:
import torch
from unsloth import FastLanguageModel

# Your input text
text = "What is the sqrt of 101?"

# Method 1: Unsloth Fast Inference (RECOMMENDED for RTX 2070 Super)
print("Enabling fast inference mode...")
FastLanguageModel.for_inference(model)

# Generate response using Unsloth's optimized method
outputs = model.generate(
    **tokenizer(text, return_tensors="pt").to(model.device),
    temperature=0.7,
    top_k=50,
    max_new_tokens=512,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

# Decode the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Remove input text to get only the generated response
if generated_text.startswith(text):
    output = generated_text[len(text):].strip()
else:
    output = generated_text

print("Generated Response:")
print(output)

# Optional: Chat format for instruction models
print("\n" + "="*50)
print("Alternative: Chat Format (for instruct models)")

# Format as conversation
messages = [{"role": "user", "content": text}]

# Apply chat template if available
try:
    formatted_prompt = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )
    
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    
    outputs = model.generate(
        **inputs,
        temperature=0.7,
        top_k=50,
        max_new_tokens=512,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    chat_output = full_response[len(formatted_prompt):].strip()
    
    print("Chat Response:")
    print(chat_output)
    
except Exception as e:
    print(f"Chat template not available: {e}")

# Memory cleanup
torch.cuda.empty_cache()
print(f"\nVRAM usage: {torch.cuda.memory_allocated()/1024**3:.2f}GB")

Enabling fast inference mode...
Generated Response:
- Answers\nMath and Arithmetic\nAlgebra\nCalculus\nWhat is the sqrt of 101?\nWiki User\n∙ 2015-03-26 18:47:09\nStudy now\nBest Answer\nCopy\nThe square root of 101 is about 10.05.\nThe answer depends on what you are asking for, the number or the square root. If you want to know what 101 squared equals then it would be 101^2 = 101*101 = 10201. But if you want the square root of 101 (sqrt(101)) then that's approximately 10.05 because 100 * 100 = 10000 and 101 * 100 = 10100 so they're very close together.\nIf you wanted to find out how many times a number goes into another one, like 5 goes into 25 five times, then you can use this method. You divide both numbers by two at a time until there's nothing left in the bottom row but ones. Then add up all the digits in the top row. For example:\n5 | 25\n/ \\ |\n| 5\n\\ / \\|\n| 5\nSo you have a total of 5 + 5 = 10.\nThat means that 5 goes into 25 ten times.\nNote that this only works with whole

And now with the LoRA we just trained with GRPO - we first save the LoRA first!

In [30]:
# model.save_lora("grpo_lora")

In [31]:
import os
import torch

# Method 1: PEFT save_pretrained (WORKS with Qwen2ForCausalLM)
print("Saving LoRA adapter using PEFT method...")

try:
    # This works with any PEFT-enabled model
    model.save_pretrained("grpo_lora")
    tokenizer.save_pretrained("grpo_lora")
    print("✅ LoRA adapter saved to 'grpo_lora' folder using PEFT")
except Exception as e:
    print(f"PEFT save failed: {e}")
    print("Trying alternative methods...")

# Method 2: Manual LoRA state dict saving (FALLBACK)
print("\nAlternative: Manual LoRA saving...")

try:
    # Check if model has PEFT adapter
    if hasattr(model, 'peft_config'):
        # Get LoRA state dict
        lora_state_dict = model.state_dict()
        
        # Filter only LoRA parameters
        lora_params = {k: v for k, v in lora_state_dict.items() 
                      if 'lora_' in k or 'adapter' in k}
        
        # Save LoRA parameters
        os.makedirs("grpo_lora_manual", exist_ok=True)
        torch.save(lora_params, "grpo_lora_manual/adapter_model.bin")
        
        # Save config
        if hasattr(model, 'peft_config'):
            import json
            config_dict = model.peft_config
            if hasattr(config_dict, 'to_dict'):
                config_dict = config_dict.to_dict()
            
            with open("grpo_lora_manual/adapter_config.json", "w") as f:
                json.dump(config_dict, f, indent=2)
        
        # Save tokenizer
        tokenizer.save_pretrained("grpo_lora_manual")
        print("✅ Manual LoRA save completed")
        
    else:
        print("❌ No PEFT adapter found on model")
        
except Exception as e:
    print(f"Manual save failed: {e}")

# Method 3: Check what methods are available on your model
print("\n" + "="*50)
print("Available model methods:")
model_methods = [method for method in dir(model) if 'save' in method.lower()]
print("Save methods:", model_methods)

# Method 4: HuggingFace standard saving (saves full model)
print("\nMethod 4: Standard HuggingFace saving...")
try:
    # This saves the entire fine-tuned model (larger but complete)
    model.save_pretrained("grpo_full_model")
    tokenizer.save_pretrained("grpo_full_model")
    print("✅ Full model saved to 'grpo_full_model' folder")
    print("⚠️  Note: This is the complete model, not just LoRA weights")
except Exception as e:
    print(f"Full model save failed: {e}")

# Method 5: Unsloth-specific fix (if using Unsloth)
print("\nMethod 5: Re-wrap with Unsloth (if possible)...")
try:
    from unsloth import FastLanguageModel
    
    # Try to re-enable Unsloth methods
    FastLanguageModel.for_inference(model)
    
    # Now try save_lora
    if hasattr(model, 'save_lora'):
        model.save_lora("grpo_lora_unsloth")
        print("✅ Unsloth save_lora worked!")
    else:
        print("❌ save_lora still not available")
        
except Exception as e:
    print(f"Unsloth re-wrap failed: {e}")

# Verify what was saved
print("\n" + "="*50)
print("Checking saved folders...")

folders = ["grpo_lora", "grpo_lora_manual", "grpo_full_model", "grpo_lora_unsloth"]
for folder in folders:
    if os.path.exists(folder):
        files = os.listdir(folder)
        size_mb = sum(os.path.getsize(os.path.join(folder, f)) for f in files) / (1024*1024)
        print(f"📁 {folder}: {len(files)} files, {size_mb:.1f}MB")
        
        # Show important files
        important_files = [f for f in files if any(ext in f for ext in 
                          ['.bin', '.safetensors', '.json', '.txt'])]
        for file in important_files:
            print(f"   • {file}")
    else:
        print(f"❌ {folder}: Not found")

# Memory cleanup
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"\n💾 VRAM usage: {torch.cuda.memory_allocated()/1024**3:.2f}GB")

print("\n🎉 Model saving attempts completed!")
print("""
Loading your saved model later:
------------------------------
# For PEFT-saved LoRA:
from peft import PeftModel
base_model = AutoModelForCausalLM.from_pretrained("unsloth/Qwen2.5-1.5B-Instruct")
model = PeftModel.from_pretrained(base_model, "grpo_lora")

# For full model:
model = AutoModelForCausalLM.from_pretrained("grpo_full_model")
""")

Saving LoRA adapter using PEFT method...
✅ LoRA adapter saved to 'grpo_lora' folder using PEFT

Alternative: Manual LoRA saving...
Manual save failed: Object of type LoraConfig is not JSON serializable

Available model methods:
Save methods: ['_save_to_state_dict', '_saved_temp_tokenizer', 'get_prompt_embedding_to_save', 'modules_to_save', 'save_pretrained', 'save_pretrained_ggml', 'save_pretrained_gguf', 'save_pretrained_merged', 'save_pretrained_torchao']

Method 4: Standard HuggingFace saving...
✅ Full model saved to 'grpo_full_model' folder
⚠️  Note: This is the complete model, not just LoRA weights

Method 5: Re-wrap with Unsloth (if possible)...
❌ save_lora still not available

Checking saved folders...
📁 grpo_lora: 10 files, 85.6MB
   • adapter_config.json
   • adapter_model.safetensors
   • added_tokens.json
   • merges.txt
   • special_tokens_map.json
   • tokenizer.json
   • tokenizer_config.json
   • vocab.json
📁 grpo_lora_manual: 2 files, 70.6MB
   • adapter_config.json
   

Verify LoRA is actually trained!

In [32]:
# from safetensors import safe_open

# tensors = {}
# with safe_open("grpo_lora/adapter_model.safetensors", framework = "pt") as f:
#     # Verify both A and B are non zero
#     for key in f.keys():
#         tensor = f.get_tensor(key)
#         n_zeros = (tensor == 0).sum() / tensor.numel()
#         assert(n_zeros.item() != tensor.numel())

In [33]:
"""
Code Explanation - LoRA Weight Verification:
- Opens saved LoRA adapter weights from safetensors file format
- Iterates through all saved tensor parameters (LoRA A and B matrices)
- Calculates percentage of zero values in each tensor
- Asserts that no tensor is completely filled with zeros
- Validates that LoRA training actually modified the adapter weights
- Safeguards against corrupted or untrained LoRA adapters
"""
# LoRA Weight Verification - Detailed Explanation

from safetensors import safe_open
import torch

# Step 1: Load the saved LoRA adapter file
print("=== LoRA Adapter Verification ===")
print("Loading LoRA weights from safetensors file...")

tensors = {}
with safe_open("grpo_lora/adapter_model.safetensors", framework="pt") as f:
    
    print(f"Found {len(list(f.keys()))} tensors in LoRA adapter")
    print("\nAnalyzing each tensor:")
    
    # Step 2: Iterate through all saved parameters
    for key in f.keys():
        # Load individual tensor (LoRA A or B matrix)
        tensor = f.get_tensor(key)
        
        # Step 3: Calculate zero percentage
        n_zeros = (tensor == 0).sum()  # Count zero values
        total_elements = tensor.numel()  # Total number of elements
        zero_percentage = (n_zeros / total_elements) * 100
        
        print(f"  📊 {key}:")
        # print(f"     Shape: {tensor.shape}")
        # print(f"     Zero values: {n_zeros}/{total_elements} ({zero_percentage:.2f}%)")
        # print(f"     Mean: {tensor.mean().item():.6f}")
        # print(f"     Std: {tensor.std().item():.6f}")
        
        # Step 4: Quality check - ensure tensor isn't all zeros
        if n_zeros.item() == total_elements:
            print(f"     ❌ WARNING: Tensor is completely zero!")
            assert False, f"Tensor {key} is completely zero - training failed!"
        else:
            print(f"     ✅ Tensor has learned weights")
        
        print()

print("✅ All LoRA tensors passed verification!")

# Additional Analysis: What makes a good LoRA adapter
print("\n=== LoRA Quality Analysis ===")

with safe_open("grpo_lora/adapter_model.safetensors", framework="pt") as f:
    lora_A_tensors = []
    lora_B_tensors = []
    
    for key in f.keys():
        tensor = f.get_tensor(key)
        
        if "lora_A" in key:
            lora_A_tensors.append((key, tensor))
        elif "lora_B" in key:
            lora_B_tensors.append((key, tensor))
    
    print(f"LoRA A matrices (down-projection): {len(lora_A_tensors)}")
    print(f"LoRA B matrices (up-projection): {len(lora_B_tensors)}")
    
    # Analyze weight distributions
    if lora_A_tensors and lora_B_tensors:
        print("\nWeight Distribution Analysis:")
        
        # Check LoRA A matrices (should have diverse weights)
        for key, tensor in lora_A_tensors[:3]:  # Show first 3
            weight_range = tensor.max() - tensor.min()
            print(f"  🔍 {key.split('.')[-2:]}: range={weight_range.item():.6f}")
        
        # Check rank utilization (effective rank)
        sample_A = lora_A_tensors[0][1]
        sample_B = lora_B_tensors[0][1]
        
        if sample_A.dim() == 2 and sample_B.dim() == 2:
            # Calculate effective rank via SVD
            try:
                reconstructed = torch.mm(sample_B, sample_A)
                U, S, V = torch.svd(reconstructed)
                effective_rank = (S > S.max() * 0.01).sum().item()  # 1% threshold
                total_rank = min(reconstructed.shape)
                
                print(f"\nRank Utilization:")
                print(f"  Effective rank: {effective_rank}/{total_rank}")
                print(f"  Rank efficiency: {effective_rank/total_rank*100:.1f}%")
                
                if effective_rank < total_rank * 0.5:
                    print("  ⚠️  Low rank utilization - consider reducing LoRA rank")
                else:
                    print("  ✅ Good rank utilization")
                    
            except Exception as e:
                print(f"  ❌ Couldn't calculate effective rank: {e}")

# What the assertion does:
print("\n=== Understanding the Assertion ===")
print("""
The assertion `assert(n_zeros.item() != tensor.numel())` checks:

✅ PASS if: n_zeros ≠ total_elements (tensor has some non-zero values)
❌ FAIL if: n_zeros = total_elements (tensor is completely zero)

Why this matters:
• LoRA matrices should contain learned weights after training
• All-zero tensors indicate failed or corrupted training
• Prevents using broken adapters that won't work
• Catches initialization errors or gradient flow problems

For your RTX 2070 Super training:
• Non-zero weights = successful parameter updates
• Good weight diversity = effective adaptation
• Balanced A/B matrices = stable LoRA decomposition
""")

print("🎯 Verification complete! Your LoRA adapter is ready to use.")

=== LoRA Adapter Verification ===
Loading LoRA weights from safetensors file...
Found 392 tensors in LoRA adapter

Analyzing each tensor:
  📊 base_model.model.model.layers.0.mlp.down_proj.lora_A.weight:
     ✅ Tensor has learned weights

  📊 base_model.model.model.layers.0.mlp.down_proj.lora_B.weight:
     ✅ Tensor has learned weights

  📊 base_model.model.model.layers.0.mlp.gate_proj.lora_A.weight:
     ✅ Tensor has learned weights

  📊 base_model.model.model.layers.0.mlp.gate_proj.lora_B.weight:
     ✅ Tensor has learned weights

  📊 base_model.model.model.layers.0.mlp.up_proj.lora_A.weight:
     ✅ Tensor has learned weights

  📊 base_model.model.model.layers.0.mlp.up_proj.lora_B.weight:
     ✅ Tensor has learned weights

  📊 base_model.model.model.layers.0.self_attn.k_proj.lora_A.weight:
     ✅ Tensor has learned weights

  📊 base_model.model.model.layers.0.self_attn.k_proj.lora_B.weight:
     ✅ Tensor has learned weights

  📊 base_model.model.model.layers.0.self_attn.o_proj.lora_A.

Now we load the LoRA and test. We tested without using our custom system prompt which should not (or minimal) affect toward the model's original reasoning ability.:

In [34]:
# messages = [
#     {"role": "user",   "content": "Solve (x + 2)^2 = 0"},
# ]

# text = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt = True, # Must add for generation
#     tokenize = False,
# )
# from vllm import SamplingParams
# sampling_params = SamplingParams(
#     temperature = 1.0,
#     top_k = 50,
#     max_tokens = 2048,
# )
# output = model.fast_generate(
#     text,
#     sampling_params = sampling_params,
#     lora_request = model.load_lora("grpo_lora"),
# )[0].outputs[0].text

# output

In [35]:
import torch

# Your chat message
messages = [
    {"role": "user", "content": "Solve (x + 2)^2 = 0"},
]

# Apply chat template
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

print("Formatted prompt:")
print(text)
print("\n" + "="*50)

# SOLUTION 1: Standard HuggingFace Inference (RECOMMENDED)
print("Method 1: Standard HuggingFace Generate")

# Tokenize the input
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Generate response (LoRA weights already active if model was trained with PEFT)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        temperature=0.7,
        top_k=50,
        max_new_tokens=512,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1,
    )

# Decode the full response
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract only the assistant's response (remove the input)
if text in full_response:
    output = full_response[len(text):].strip()
else:
    output = full_response

print("Generated Response:")
print(output)

# SOLUTION 2: Load LoRA separately (if needed)
print("\n" + "="*50)
print("Method 2: Explicit LoRA Loading")

try:
    from peft import PeftModel
    
    # Check if model already has PEFT adapter
    if hasattr(model, 'peft_config'):
        print("✅ LoRA adapter already loaded on model")
        
        # Generate with existing adapter
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                temperature=0.7,
                top_k=50,
                max_new_tokens=512,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        output = full_response[len(text):].strip()
        
        print("Response with existing LoRA:")
        print(output)
        
    else:
        print("ℹ️  No PEFT adapter detected, loading separately...")
        
        # Load base model without LoRA first (if needed)
        from transformers import AutoModelForCausalLM
        base_model = AutoModelForCausalLM.from_pretrained(
            "unsloth/Qwen2.5-1.5B-Instruct",
            torch_dtype=torch.float16,
            device_map="auto"
        )
        
        # Load LoRA adapter
        lora_model = PeftModel.from_pretrained(base_model, "grpo_lora")
        
        # Generate with LoRA
        inputs = tokenizer(text, return_tensors="pt").to(lora_model.device)
        
        with torch.no_grad():
            outputs = lora_model.generate(
                **inputs,
                temperature=0.7,
                top_k=50,
                max_new_tokens=512,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        output = full_response[len(text):].strip()
        
        print("Response with loaded LoRA:")
        print(output)
        
except ImportError:
    print("❌ PEFT not available")
except Exception as e:
    print(f"❌ LoRA loading failed: {e}")

# SOLUTION 3: Unsloth Inference (if model supports it)
print("\n" + "="*50)
print("Method 3: Unsloth Inference (if available)")

try:
    from unsloth import FastLanguageModel
    
    # Enable fast inference
    FastLanguageModel.for_inference(model)
    
    # Check if fast_generate is available
    if hasattr(model, 'fast_generate'):
        # Use Unsloth's fast generation (NO sampling_params!)
        outputs = model.fast_generate(
            [text],  # Note: list format
            temperature=0.7,
            top_k=50,
            max_new_tokens=512,
            do_sample=True,
        )
        
        # Extract output based on Unsloth's return format
        if hasattr(outputs[0], 'outputs'):
            output = outputs[0].outputs[0].text
        else:
            output = outputs[0]
            
        print("Unsloth fast generation:")
        print(output)
        
    else:
        print("❌ fast_generate not available on this model")
        
except Exception as e:
    print(f"❌ Unsloth inference failed: {e}")

# SOLUTION 4: Pure vLLM (separate approach)
print("\n" + "="*50)
print("Alternative: Pure vLLM Setup")
print("""
If you want to use vLLM specifically:

1. Save your model first:
   model.save_pretrained("./my_trained_model")
   tokenizer.save_pretrained("./my_trained_model")

2. Use pure vLLM:
   from vllm import LLM, SamplingParams
   
   llm = LLM(
       model="./my_trained_model",
       gpu_memory_utilization=0.8,
       max_model_len=2048,
   )
   
   sampling_params = SamplingParams(
       temperature=0.7,
       top_k=50,
       max_tokens=512,
   )
   
   outputs = llm.generate([text], sampling_params)
   output = outputs[0].outputs[0].text
""")

# Memory cleanup
torch.cuda.empty_cache()
print(f"\n💾 VRAM usage: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
print("\n✅ Chat inference complete!")

Formatted prompt:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Solve (x + 2)^2 = 0<|im_end|>
<|im_start|>assistant


Method 1: Standard HuggingFace Generate
Generated Response:
system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user
Solve (x + 2)^2 = 0
assistant
To solve the equation \((x + 2)^2 = 0\), we need to find the value of \(x\) that satisfies this equation.

Step 1: Understand what it means for an expression to be equal to zero.
An equation like \((x + 2)^2 = 0\) is true if and only if \((x + 2) = 0\). This is because squaring any non-zero number will never result in zero.

Step 2: Solve for \(x\) in the equation \((x + 2) = 0\).
\[ x + 2 = 0 \]

Subtract 2 from both sides:
\[ x = -2 \]

Therefore, the solution to the equation \((x + 2)^2 = 0\) is \(x = -2\).

We can verify this solution by substituting \(x = -2\) back into the original equation:
\[
(-2 + 2)^2 = 0^2 = 0
\]
This confi

Next, let's test using our system prompt which should use the new language :

In [36]:
# messages = [
#     {"role": "system", "content": system_prompt},
#     {"role": "user",   "content": "Solve (x + 2)^2 = 0"},
# ]

# text = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt = True, # Must add for generation
#     tokenize = False,
# )
# from vllm import SamplingParams
# sampling_params = SamplingParams(
#     temperature = 1.0,
#     top_k = 50,
#     max_tokens = 2048,
# )
# output = model.fast_generate(
#     text,
#     sampling_params = sampling_params,
#     lora_request = model.load_lora("grpo_lora"),
# )[0].outputs[0].text

# output

In [37]:
import torch

# Define your system prompt (you need to define this)
system_prompt = """You are a helpful AI assistant that solves mathematical problems step by step. 
Provide clear explanations and show your work."""

# Chat messages with system prompt
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": "Solve (x + 2)^2 = 0"},
]

# Apply chat template
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

print("Formatted prompt with system message:")
print(text)
print("\n" + "="*50)

# FIXED METHOD: Standard HuggingFace Inference
print("Generating response with your trained LoRA model...")

# Tokenize input
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Generate response (your LoRA weights are already active)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        temperature=1.0,        # Your original temperature
        top_k=50,              # Your original top_k
        max_new_tokens=512,    # Reduced from 2048 for RTX 2070 Super
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1,
        use_cache=True,
    )

# Decode the response
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract only the assistant's response
if text in full_response:
    output = full_response[len(text):].strip()
else:
    output = full_response

print("Assistant Response:")
print(output)

# Alternative: Batch processing for multiple questions
print("\n" + "="*50)
print("Batch Processing Example:")

# Multiple math problems
batch_messages = [
    [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "Solve (x + 2)^2 = 0"}
    ],
    [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "Find the derivative of x^2 + 3x + 1"}
    ]
]

batch_outputs = []

for i, msgs in enumerate(batch_messages):
    # Format each conversation
    formatted_text = tokenizer.apply_chat_template(
        msgs,
        add_generation_prompt=True,
        tokenize=False,
    )
    
    # Tokenize
    batch_inputs = tokenizer(formatted_text, return_tensors="pt").to(model.device)
    
    # Generate
    with torch.no_grad():
        batch_result = model.generate(
            **batch_inputs,
            temperature=0.7,
            top_k=50,
            max_new_tokens=256,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    
    # Decode
    full_resp = tokenizer.decode(batch_result[0], skip_special_tokens=True)
    response = full_resp[len(formatted_text):].strip()
    
    batch_outputs.append(response)
    print(f"\nProblem {i+1} Response:")
    print(response)

# Memory-efficient conversation handling
print("\n" + "="*50)
print("Multi-turn Conversation Example:")

# Start a conversation
conversation = [
    {"role": "system", "content": system_prompt}
]

# Function to continue conversation
def chat_turn(user_message, conversation_history):
    # Add user message
    conversation_history.append({"role": "user", "content": user_message})
    
    # Format conversation
    formatted = tokenizer.apply_chat_template(
        conversation_history,
        add_generation_prompt=True,
        tokenize=False,
    )
    
    # Generate response
    inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            temperature=0.7,
            top_k=50,
            max_new_tokens=256,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    
    # Extract assistant response
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    assistant_response = full_response[len(formatted):].strip()
    
    # Add to conversation history
    conversation_history.append({"role": "assistant", "content": assistant_response})
    
    return assistant_response

# Example multi-turn conversation
print("\nTurn 1:")
response1 = chat_turn("Solve (x + 2)^2 = 0", conversation)
print(f"Assistant: {response1}")

print("\nTurn 2:")
response2 = chat_turn("Now solve (x - 3)^2 = 16", conversation)
print(f"Assistant: {response2}")

# Show conversation history
print(f"\nFull conversation has {len(conversation)} messages")

# Memory cleanup for RTX 2070 Super
torch.cuda.empty_cache()
print(f"\n💾 VRAM usage: {torch.cuda.memory_allocated()/1024**3:.2f}GB")

print("\n✅ System prompt chat inference complete!")

# EXPLANATION: Why the original code failed
print("\n" + "="*60)
print("Why your original code failed:")
print("""
❌ ERRORS IN ORIGINAL CODE:
1. model.load_lora() - Method doesn't exist on Qwen2ForCausalLM
2. model.fast_generate() - Method doesn't exist on your model type  
3. SamplingParams - This is vLLM syntax, not HuggingFace
4. [0].outputs[0].text - This is vLLM output format

✅ FIXES APPLIED:
1. Used model.generate() - Standard HuggingFace method
2. LoRA weights already active from training
3. Parameters passed directly (temperature, top_k, etc.)
4. Standard tensor decoding with tokenizer.decode()
5. Proper memory management for RTX 2070 Super

💡 KEY INSIGHT:
Your model already has LoRA weights loaded if you trained it with PEFT.
No need to load them again - just use standard inference!
""")

Formatted prompt with system message:
<|im_start|>system
You are a helpful AI assistant that solves mathematical problems step by step. 
Provide clear explanations and show your work.<|im_end|>
<|im_start|>user
Solve (x + 2)^2 = 0<|im_end|>
<|im_start|>assistant


Generating response with your trained LoRA model...
Assistant Response:
system
You are a helpful AI assistant that solves mathematical problems step by step. 
Provide clear explanations and show your work.
user
Solve (x + 2)^2 = 0
assistant
To solve the equation \((x + 2)^2 = 0\), we need to find the value of \(x\) that satisfies this equation.

Step 1: Understand what it means for an expression to be equal to zero.
An expression is said to be equal to zero if both sides of the equation are identical. This is because any number multiplied by itself cannot result in a negative value, except for zero when squared or cubed.

Step 2: Apply the square root property.
Since the given equation involves squaring the term inside the pa

Lets compare our results with system prompt but without our LoRA

In [39]:
# messages = [
#     {"role": "system", "content": system_prompt},
#     {"role": "user",   "content": "Solve (x + 2)^2 = 0"},
# ]

# text = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt = True, # Must add for generation
#     tokenize = False,
# )
# from vllm import SamplingParams
# sampling_params = SamplingParams(
#     temperature = 1.0,
#     top_k = 50,
#     max_tokens = 2048,
# )
# output = model.fast_generate(
#     text,
#     sampling_params = sampling_params,
#     lora_request = None,
# )[0].outputs[0].text

# output

In [40]:
import torch

# Define system prompt
system_prompt = """You are a helpful AI assistant that solves mathematical problems step by step. 
Provide clear explanations and show your work."""

# Your messages
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": "Solve (x + 2)^2 = 0"},
]

# Apply chat template
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

print("Formatted prompt:")
print(text)
print("\n" + "="*50)

# CORRECT CODE: Replace everything after chat template with this
print("Generating response...")

# Tokenize the formatted text
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Generate using standard HuggingFace method
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        temperature=1.0,           # Same as your original
        top_k=50,                 # Same as your original  
        max_new_tokens=512,       # Reduced for RTX 2070 Super
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )

# Decode the output
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract only the new generated text (remove the input prompt)
if text in full_response:
    output = full_response[len(text):].strip()
else:
    output = full_response

# Display result
print("Generated Output:")
print(output)

# Memory status
print(f"\n💾 VRAM usage: {torch.cuda.memory_allocated()/1024**3:.2f}GB")

# WHAT YOU NEED TO DELETE FROM YOUR ORIGINAL CODE:
print("\n" + "="*60)
print("❌ DELETE THESE LINES FROM YOUR CODE:")
print("""
# DELETE THIS:
from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 1.0,
    top_k = 50,
    max_tokens = 2048,
)
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = None,
)[0].outputs[0].text

# REPLACE WITH:
inputs = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        temperature=1.0,
        top_k=50,
        max_new_tokens=512,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
output = full_response[len(text):].strip()
""")

print("✅ This version works with your Qwen2ForCausalLM model!")

# Alternative: One-liner version for quick testing
print("\n" + "="*50)
print("Quick one-liner version:")

def quick_chat(user_message, system_msg=None):
    """Quick chat function for testing"""
    msgs = []
    if system_msg:
        msgs.append({"role": "system", "content": system_msg})
    msgs.append({"role": "user", "content": user_message})
    
    prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.7, max_new_tokens=256, do_sample=True)
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response[len(prompt):].strip()

# Test the quick function
test_output = quick_chat("Solve (x + 2)^2 = 0", system_prompt)
print("Quick test output:")
print(test_output)

Formatted prompt:
<|im_start|>system
You are a helpful AI assistant that solves mathematical problems step by step. 
Provide clear explanations and show your work.<|im_end|>
<|im_start|>user
Solve (x + 2)^2 = 0<|im_end|>
<|im_start|>assistant


Generating response...
Generated Output:
system
You are a helpful AI assistant that solves mathematical problems step by step. 
Provide clear explanations and show your work.
user
Solve (x + 2)^2 = 0
assistant
To solve the equation \((x + 2)^2 = 0\), we need to find the value of \(x\) that makes the equation true.

Step 1: Recognize that the square of any number is equal to zero only if that number itself is zero.
So, if \((x + 2)^2 = 0\), then \(x + 2\) must be equal to zero.

Step 2: Set up an equation based on this observation:
\[ x + 2 = 0 \]

Step 3: Solve for \(x\).
Subtract 2 from both sides of the equation:
\[ x + 2 - 2 = 0 - 2 \]
\[ x = -2 \]

Therefore, the solution to the equation \((x + 2)^2 = 0\) is \(x = -2\).

💾 VRAM usage: 1.61GB

Let's take 20 samples, and compare the the amount of using our LoRA and not using it, and see which one has better amount of correct language

In [41]:
sample_dataset = dataset.shuffle(seed = 3407).select(range(20))
sample_dataset

Dataset({
    features: ['prompt', 'solution', 'data_source', 'source_prompt', 'ability', 'reward_model', 'extra_info', 'answer'],
    num_rows: 20
})

In [46]:
import torch
import gc

# Initialize counters
with_lora_id_count = 0
without_lora_id_count = 0

print("Comparing language usage with and without LoRA on 20 samples:")
print("=" * 60)

# Debug dataset structure (from your output, we know the structure)
print("✅ Dataset structure confirmed:")
print("   - sample['prompt'][0]['content'] = system message")
print("   - sample['prompt'][1]['content'] = user message")
print("   - sample['solution'] = expected solution")

# SOLUTION 1: Use PEFT adapter enable/disable (RECOMMENDED)
print("\n🔧 Method 1: Using PEFT adapter enable/disable")

def generate_response(model_to_use, text, max_length=256):
    """Helper function with error handling"""
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024).to(model_to_use.device)
        
        with torch.no_grad():
            outputs = model_to_use.generate(
                **inputs,
                temperature=0.7,  # Reduced for stability
                top_k=50,
                max_new_tokens=max_length,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
            )
        
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return full_response[len(tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)):].strip()
    
    except Exception as e:
        print(f"   ⚠️ Generation error: {str(e)[:100]}...")
        return f"[Generation failed: {str(e)[:50]}...]"

# Check if model has PEFT adapter control
try:
    if hasattr(model, 'disable_adapter_layers') and hasattr(model, 'enable_adapter_layers'):
        print("✅ PEFT adapter control available")
        
        # Process samples using adapter enable/disable
        processed_samples = 0
        
        for i, sample in enumerate(sample_dataset):
            try:
                # Extract the correct user content
                if isinstance(sample, dict) and 'prompt' in sample:
                    if isinstance(sample['prompt'], list) and len(sample['prompt']) > 1:
                        user_content = sample['prompt'][1]['content']
                        system_content = sample['prompt'][0]['content']
                    else:
                        continue  # Skip malformed samples
                else:
                    continue  # Skip if not correct format
                
                print(f"\n🔄 Sample {i+1}:")
                print(f"   User: {user_content[:100]}...")
                
                # Format conversation
                messages = [
                    {"role": "system", "content": system_content},
                    {"role": "user", "content": user_content},
                ]
                
                text = tokenizer.apply_chat_template(
                    messages,
                    add_generation_prompt=True,
                    tokenize=False,
                )
                
                # Generate WITH LoRA (adapter enabled)
                model.enable_adapter_layers()
                output_with_lora = generate_response(model, text, max_length=128)
                
                # Generate WITHOUT LoRA (adapter disabled)
                model.disable_adapter_layers() 
                output_without_lora = generate_response(model, text, max_length=128)
                
                # Re-enable adapter for next iteration
                model.enable_adapter_layers()
                
                # Check if responses are valid
                if "[Generation failed" in output_with_lora or "[Generation failed" in output_without_lora:
                    print("   ⚠️ Generation failed, skipping sample")
                    continue
                
                # Detect language
                lang_with_lora = get_lang(output_with_lora)
                lang_without_lora = get_lang(output_without_lora)
                
                # Count Indonesian responses
                if lang_with_lora == 'id':
                    with_lora_id_count += 1
                if lang_without_lora == 'id':
                    without_lora_id_count += 1
                
                processed_samples += 1
                
                # Show results for first few samples
                if processed_samples <= 3:
                    print(f"   ✅ With LoRA ({lang_with_lora}): {output_with_lora[:80]}...")
                    print(f"   ✅ Without LoRA ({lang_without_lora}): {output_without_lora[:80]}...")
                
                # Progress updates
                if processed_samples % 5 == 0:
                    print(f"\n📊 Progress: {processed_samples} samples processed")
                    print(f"   LoRA: {with_lora_id_count} Indonesian, Base: {without_lora_id_count} Indonesian")
                    torch.cuda.empty_cache()
                    gc.collect()
                
                # Stop after 20 successful samples
                if processed_samples >= 20:
                    break
                    
            except Exception as e:
                print(f"   ❌ Error processing sample {i+1}: {str(e)[:100]}...")
                continue
        
        print(f"\n" + "=" * 60)
        print("FINAL RESULTS:")
        print(f"Successfully processed: {processed_samples} samples")
        if processed_samples > 0:
            print(f"With LoRA - Indonesian responses: {with_lora_id_count}/{processed_samples} ({with_lora_id_count/processed_samples*100:.1f}%)")
            print(f"Without LoRA - Indonesian responses: {without_lora_id_count}/{processed_samples} ({without_lora_id_count/processed_samples*100:.1f}%)")
            print(f"Improvement: +{with_lora_id_count - without_lora_id_count} Indonesian responses with LoRA")
            
            if with_lora_id_count > without_lora_id_count:
                print("🎉 LoRA training improved Indonesian language usage!")
            elif with_lora_id_count == without_lora_id_count:
                print("🤔 No change in language usage")
            else:
                print("⚠️ LoRA reduced Indonesian usage")
        else:
            print("❌ No samples processed successfully")
    
    else:
        print("❌ PEFT adapter control not available")
        raise Exception("No adapter control")
        
except Exception as e:
    print(f"❌ PEFT method failed: {e}")
    print("\n🔧 Method 2: Simplified single model test")
    
    # SOLUTION 2: Test current model only (simplified)
    print("Testing current model responses (LoRA active)...")
    
    lora_id_responses = 0
    total_tested = 0
    
    for i, sample in enumerate(sample_dataset[:10]):  # Test 10 samples
        try:
            if isinstance(sample, dict) and 'prompt' in sample:
                if isinstance(sample['prompt'], list) and len(sample['prompt']) > 1:
                    user_content = sample['prompt'][1]['content']
                    system_content = sample['prompt'][0]['content']
                else:
                    continue
            else:
                continue
            
            messages = [
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_content},
            ]
            
            text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
            
            # Generate with current model (has LoRA)
            response = generate_response(model, text, max_length=128)
            
            if "[Generation failed" not in response:
                lang = get_lang(response)
                if lang == 'id':
                    lora_id_responses += 1
                total_tested += 1
                
                print(f"Sample {i+1} ({lang}): {response[:80]}...")
            
        except Exception as e:
            print(f"Sample {i+1} failed: {str(e)[:50]}...")
            continue
    
    print(f"\n📊 Simplified Test Results:")
    print(f"LoRA model Indonesian responses: {lora_id_responses}/{total_tested}")
    if total_tested > 0:
        print(f"Indonesian percentage: {lora_id_responses/total_tested*100:.1f}%")

# Final cleanup
torch.cuda.empty_cache()
gc.collect()
print(f"\n💾 Final VRAM usage: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
print("✅ LoRA comparison completed!")

# Show what a proper working sample looks like
print(f"\n📋 Sample data structure for reference:")
if len(sample_dataset) > 0:
    sample = sample_dataset[0]
    print(f"Sample keys: {list(sample.keys())}")
    print(f"System prompt: {sample['prompt'][0]['content'][:100]}...")
    print(f"User message: {sample['prompt'][1]['content'][:100]}...")
    if 'solution' in sample:
        print(f"Expected solution: {str(sample['solution'])[:100]}...")

Comparing language usage with and without LoRA on 20 samples:
✅ Dataset structure confirmed:
   - sample['prompt'][0]['content'] = system message
   - sample['prompt'][1]['content'] = user message
   - sample['solution'] = expected solution

🔧 Method 1: Using PEFT adapter enable/disable
✅ PEFT adapter control available

🔄 Sample 1:
   User: For a positive integer $N$, we color the positive divisors of $N$ (including 1 and $N$) with four co...
   ✅ With LoRA (en): To solve this problem, let's first understand what it means for a coloring to be...
   ✅ Without LoRA (en): To solve this problem, let's first understand what a multichromatic coloring mea...

🔄 Sample 2:
   User: Let $f$ be a non-constant polynomial such that
\[f(x - 1) + f(x) + f(x + 1) = \frac{[f(x)]^2}{2013x}...
   ✅ With LoRA (en): To solve for \( f(1) \), we start by analyzing the given functional equation:
\[...
   ✅ Without LoRA (en): To solve for \( f(1) \), we start by analyzing the given functional equation:
\[...



Our reasoning model is much better - it's not always correct, since we only trained it for an hour or so - it'll be better if we extend the sequence length and train for longer!

<a name="Save"></a>
### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)

In [47]:
"""
Code Analysis:
- Original uses Unsloth methods (save_pretrained_merged, push_to_hub_merged)
- These methods don't exist on standard HuggingFace/PEFT models
- Need to use standard HuggingFace and PEFT methods instead
- Must handle merging LoRA weights manually if needed
- All examples have if False - they're disabled templates
"""
import torch
import os
from huggingface_hub import HfApi

print("Model Saving Options for Qwen2ForCausalLM with LoRA")
print("=" * 60)

# ==============================================================================
# OPTION 1: SAVE LORA ADAPTERS ONLY (RECOMMENDED - Small files)
# ==============================================================================

print("1️⃣ Saving LoRA adapters only...")

if True:  # Set to True to execute
    # Save LoRA adapter weights (lightweight ~MB)
    model.save_pretrained("model_lora")
    tokenizer.save_pretrained("model_lora")
    print("✅ LoRA adapters saved to 'model_lora' folder")
    
    # Check size
    size_mb = sum(os.path.getsize(os.path.join("model_lora", f)) 
                  for f in os.listdir("model_lora")) / (1024*1024)
    print(f"   Size: {size_mb:.1f}MB")

# ==============================================================================
# OPTION 2: MERGE AND SAVE TO 16-BIT (Large files ~3-6GB)
# ==============================================================================

print("\n2️⃣ Merge to 16-bit and save...")

if False:  # Set to True to execute (WARNING: Large files!)
    try:
        # Check if model has PEFT adapter to merge
        if hasattr(model, 'merge_and_unload'):
            print("Merging LoRA weights into base model...")
            merged_model = model.merge_and_unload()
            
            # Save merged model in 16-bit
            merged_model.save_pretrained(
                "model_merged_16bit",
                torch_dtype=torch.float16,
                safe_serialization=True,  # Use safetensors format
            )
            tokenizer.save_pretrained("model_merged_16bit")
            print("✅ 16-bit merged model saved to 'model_merged_16bit'")
            
        else:
            print("⚠️ No PEFT adapter found - saving current model as 16-bit")
            model.save_pretrained(
                "model_16bit", 
                torch_dtype=torch.float16,
                safe_serialization=True,
            )
            tokenizer.save_pretrained("model_16bit")
            print("✅ 16-bit model saved to 'model_16bit'")
            
    except Exception as e:
        print(f"❌ 16-bit save failed: {e}")

# ==============================================================================
# OPTION 3: MERGE AND SAVE TO 4-BIT (Smaller files ~1-2GB)
# ==============================================================================

print("\n3️⃣ Merge to 4-bit and save...")

if False:  # Set to True to execute
    try:
        from transformers import BitsAndBytesConfig
        
        # Merge LoRA if possible
        if hasattr(model, 'merge_and_unload'):
            merged_model = model.merge_and_unload()
        else:
            merged_model = model
        
        # Create quantization config
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
        
        # Note: 4-bit models are typically loaded with quantization, not saved
        print("⚠️ 4-bit quantization typically done at load time, not save time")
        print("   Use load_in_4bit=True when loading the 16-bit model instead")
        
        # Save configuration for 4-bit loading
        import json
        config = {
            "quantization_method": "4bit",
            "load_in_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "instructions": "Load this model with load_in_4bit=True"
        }
        
        os.makedirs("model_4bit_config", exist_ok=True)
        with open("model_4bit_config/quantization_config.json", "w") as f:
            json.dump(config, f, indent=2)
        
        # Copy the 16-bit model (4-bit quantization happens at load time)
        if os.path.exists("model_merged_16bit"):
            import shutil
            shutil.copytree("model_merged_16bit", "model_4bit", dirs_exist_ok=True)
            shutil.copy("model_4bit_config/quantization_config.json", "model_4bit/")
            print("✅ 4-bit loading configuration saved to 'model_4bit'")
        
    except Exception as e:
        print(f"❌ 4-bit configuration failed: {e}")

# ==============================================================================
# OPTION 4: PUSH TO HUGGING FACE HUB
# ==============================================================================

print("\n4️⃣ Push to Hugging Face Hub...")

if False:  # Set to True and add your token to execute
    hf_token = "hf_your_token_here"  # Replace with your actual token
    repo_name = "your-username/your-model-name"  # Replace with your repo
    
    try:
        # Push LoRA adapters
        model.push_to_hub(repo_name + "-lora", token=hf_token)
        tokenizer.push_to_hub(repo_name + "-lora", token=hf_token)
        print(f"✅ LoRA adapters pushed to {repo_name}-lora")
        
        # Push merged model (if exists)
        if os.path.exists("model_merged_16bit"):
            from transformers import AutoModelForCausalLM
            merged_model = AutoModelForCausalLM.from_pretrained("model_merged_16bit")
            merged_tokenizer = AutoTokenizer.from_pretrained("model_merged_16bit")
            
            merged_model.push_to_hub(repo_name + "-merged", token=hf_token)
            merged_tokenizer.push_to_hub(repo_name + "-merged", token=hf_token)
            print(f"✅ Merged model pushed to {repo_name}-merged")
            
    except Exception as e:
        print(f"❌ Hub push failed: {e}")
        print("   Make sure to set a valid HF token and repo name")

# ==============================================================================
# VERIFICATION: CHECK WHAT WAS SAVED
# ==============================================================================

print("\n" + "=" * 60)
print("📁 SAVED FILES VERIFICATION:")

folders_to_check = [
    "model_lora", 
    "model_merged_16bit", 
    "model_16bit",
    "model_4bit", 
    "model_4bit_config"
]

for folder in folders_to_check:
    if os.path.exists(folder):
        files = os.listdir(folder)
        total_size = sum(os.path.getsize(os.path.join(folder, f)) for f in files)
        size_mb = total_size / (1024*1024)
        
        print(f"\n📂 {folder}:")
        print(f"   Files: {len(files)}")
        print(f"   Size: {size_mb:.1f}MB")
        
        # Show key files
        key_files = [f for f in files if f.endswith(('.bin', '.safetensors', '.json'))]
        for file in key_files[:3]:  # Show first 3 important files
            file_size = os.path.getsize(os.path.join(folder, file)) / (1024*1024)
            print(f"   • {file} ({file_size:.1f}MB)")
            
    else:
        print(f"\n❌ {folder}: Not found")

# ==============================================================================
# LOADING INSTRUCTIONS FOR SAVED MODELS
# ==============================================================================

print("\n" + "=" * 60)
print("📖 LOADING INSTRUCTIONS:")

print("""
🔹 To load LoRA adapters:
   from peft import PeftModel
   base_model = AutoModelForCausalLM.from_pretrained("unsloth/Qwen2.5-1.5B-Instruct")
   model = PeftModel.from_pretrained(base_model, "model_lora")

🔹 To load 16-bit merged model:
   model = AutoModelForCausalLM.from_pretrained("model_merged_16bit")

🔹 To load with 4-bit quantization:
   model = AutoModelForCausalLM.from_pretrained(
       "model_4bit", 
       load_in_4bit=True,
       device_map="auto"
   )

🔹 From Hugging Face Hub:
   model = AutoModelForCausalLM.from_pretrained("your-username/your-model-name-lora")
""")

# Memory cleanup for RTX 2070 Super
torch.cuda.empty_cache()
print(f"\n💾 Current VRAM usage: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
print("\n✅ Model saving options completed!")

Model Saving Options for Qwen2ForCausalLM with LoRA
1️⃣ Saving LoRA adapters only...
✅ LoRA adapters saved to 'model_lora' folder
   Size: 85.6MB

2️⃣ Merge to 16-bit and save...

3️⃣ Merge to 4-bit and save...

4️⃣ Push to Hugging Face Hub...

📁 SAVED FILES VERIFICATION:

📂 model_lora:
   Files: 10
   Size: 85.6MB
   • adapter_config.json (0.0MB)
   • adapter_model.safetensors (70.5MB)
   • added_tokens.json (0.0MB)

❌ model_merged_16bit: Not found

❌ model_16bit: Not found

❌ model_4bit: Not found

❌ model_4bit_config: Not found

📖 LOADING INSTRUCTIONS:

🔹 To load LoRA adapters:
   from peft import PeftModel
   base_model = AutoModelForCausalLM.from_pretrained("unsloth/Qwen2.5-1.5B-Instruct")
   model = PeftModel.from_pretrained(base_model, "model_lora")

🔹 To load 16-bit merged model:
   model = AutoModelForCausalLM.from_pretrained("model_merged_16bit")

🔹 To load with 4-bit quantization:
   model = AutoModelForCausalLM.from_pretrained(
       "model_4bit", 
       load_in_4bit=True

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "",
    )