In [1]:
# Install required packages
!uv pip install -q verifiers datasets transformers torch peft accelerate bitsandbytes wandb huggingface_hub
!uv pip install -q vllm
!uv pip install trl

[2mAudited [1m1 package[0m [2min 16ms[0m[0m


In [2]:
import torch
import json
import os
import wandb
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, Qwen2VLForConditionalGeneration, BitsAndBytesConfig
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Import verifiers framework
import verifiers as vf

# Device configuration
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "mps"
print(f"Using device: {device}")


Using device: cuda


In [3]:
## 2. Configuration
import vllm
import trl
!pip show vllm

[0m

In [4]:
# Model and training configuration
model_name = "KhushalM/Qwen2.5-1.5BSFT"
output_dir = "./grpo_vf_results"
hub_model_id = "KhushalM/Qwen2.5-1.5B-GRPO"

# Training hyperparameters - FIXED LEARNING RATE
config = {
    "model": model_name,
    "task": "First Principles Explanations",
    "framework": "verifiers",
    "reward_strategy": "Multi-component First Principles Reward",
    "lora_r": 32,
    "lora_alpha": 64,
    "learning_rate": 5e-5,  # INCREASED from 1e-5 to 5e-5
    "batch_size": 12,
    "gradient_accumulation_steps": 8,
    "num_train_epochs": 3,
    "num_generations": 6,
    "reward_components": {
        "analogy_quality": 0.20,
        "step_by_step": 0.15,
        "fundamental_concepts": 0.20,
        "engagement": 0.15,
        "clarity": 0.15,
        "completeness": 0.10,
        "avoid_jargon": 0.05
    }
}

print("Configuration loaded successfully!")

Configuration loaded successfully!


In [5]:
# Login to Hugging Face and Weights & Biases
login()
wandb.login()

# Initialize wandb with verifiers-specific config
wandb.init(
    project="qwen2.5-1.5B-first-principles",
    config=config
)

print("Authentication completed!")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

[34m[1mwandb[0m: Currently logged in as: [33mkhushal-mandavia72[0m ([33mkhushal-mandavia72-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Authentication completed!


In [6]:
from trl import GRPOConfig

In [7]:
# Load dataset file - specify the path to your dataset
# Update this path to point to your actual dataset file
dataset_filename = "structured_dataset.json"  # Default dataset file

# Alternative: You can also specify other available datasets
# dataset_filename = "data.json"
# dataset_filename = "dataset.json"
# dataset_filename = "first_principles_dataset.json"

print(f"Loading dataset: {dataset_filename}")

# Verify the file exists
if os.path.exists(dataset_filename):
    print(f"Dataset file '{dataset_filename}' found successfully!")
    file_size = os.path.getsize(dataset_filename)
    print(f"File size: {file_size / 1024:.2f} KB")
else:
    # Try to find available dataset files in current directory
    available_datasets = [f for f in os.listdir('.') if f.endswith('.json')]
    if available_datasets:
        print(f"Dataset file '{dataset_filename}' not found!")
        print(f"Available dataset files: {available_datasets}")
        print(f"Please update the 'dataset_filename' variable to use one of the available files.")
    else:
        print("No JSON dataset files found in current directory.")
        print("Please ensure your dataset file is in the same directory as this notebook.")
    raise FileNotFoundError(f"Dataset file '{dataset_filename}' not found!")


Loading dataset: structured_dataset.json
Dataset file 'structured_dataset.json' found successfully!
File size: 705.95 KB


In [8]:
# Load and preview the dataset
print("Loading dataset...")
with open(dataset_filename, "r") as f:
    dataset_raw = json.load(f)

print(f"Dataset loaded successfully!")
print(f"Number of samples: {len(dataset_raw)}")
print(f"Sample structure: {list(dataset_raw[0].keys()) if dataset_raw else 'Empty dataset'}")

# Preview first sample
if dataset_raw:
    print("\nFirst sample preview:")
    sample_preview = json.dumps(dataset_raw[0], indent=2)
    print(sample_preview[:500] + "..." if len(sample_preview) > 500 else sample_preview)


Loading dataset...
Dataset loaded successfully!
Number of samples: 600
Sample structure: ['messages']

First sample preview:
{
  "messages": [
    {
      "role": "system",
      "content": "You are an expert educator who explains concepts from first principles like Richard Feynman. Start with fundamental truths, use simple analogies, and avoid jargon. Use a storytelling tone and follow a step by step explanation style:"
    },
    {
      "role": "user",
      "content": "Why do objects fall to the ground when dropped?"
    },
    {
      "role": "assistant",
      "content": "Okay, let\u2019s imagine you have a stre...


In [9]:
# Replace your current dataset loading section (Cell 5-6) with this optimized version:

from datasets import Dataset, DatasetDict
import torch
from torch.utils.data import DataLoader

# 1. OPTIMIZED: Load and preprocess dataset using HuggingFace datasets
print("Loading and preprocessing dataset with optimized batching...")

# Load raw data
with open(dataset_filename, "r") as f:
    dataset_raw = json.load(f)

# Extract prompts and responses for preprocessing
prompts = []
responses = []
for item in dataset_raw:
    messages = item['messages']
    # Extract user prompt and assistant response
    user_msg = next((m['content'] for m in messages if m['role'] == 'user'), '')
    assistant_msg = next((m['content'] for m in messages if m['role'] == 'assistant'), '')

    if user_msg and assistant_msg:
        prompts.append(user_msg)
        responses.append(assistant_msg)

# Create dataset using HuggingFace datasets for better performance
train_data = {
    'prompt': prompts[:int(0.9 * len(prompts))],  # 90% for training
    'completion': responses[:int(0.9 * len(responses))],
    #'question': prompts[:int(0.9 * len(prompts))],  # For compatibility
    #'answer': responses[:int(0.9 * len(responses))]
}

test_data = {
    'prompt': prompts[int(0.9 * len(prompts)):],  # 10% for testing
    'completion': responses[int(0.9 * len(responses)):],
    #'question': prompts[int(0.9 * len(prompts)):],
    #'answer': responses[int(0.9 * len(responses)):]
}

# Create Dataset objects with optimized preprocessing
train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print(f"✅ Optimized dataset created:")
print(f"   - Training samples: {len(dataset['train'])}")
print(f"   - Test samples: {len(dataset['test'])}")
print(f"   - Using HuggingFace datasets for efficient batching")
print(f"Train sample: {dataset['train'][0]}")
print(f"Test sample: {dataset['test'][0]}")

#

Loading and preprocessing dataset with optimized batching...
✅ Optimized dataset created:
   - Training samples: 540
   - Test samples: 60
   - Using HuggingFace datasets for efficient batching
Train sample: {'prompt': 'Why do objects fall to the ground when dropped?', 'completion': "Okay, let’s imagine you have a stretched rubber sheet and you place a heavy ball in the middle. The sheet bends downwards, right? Now, if you roll a smaller ball nearby, it will start rolling toward the heavier ball because of the dip. This is a simple way to picture how gravity works. Gravity is like the Earth making a 'dip' in space that pulls things toward it.\n\nWhen you let go of an object, it falls because the Earth is pulling it toward its center, similar to how the heavy ball makes the rubber sheet dip. This pull is what we call gravitational force. It's a force that attracts two masses toward each other, and since the Earth is so big, it pulls objects towards itself strongly.\n\nIn our daily life,

#Reward Funcs

In [10]:
# ─────────────────────────  first_principles_reward.py  ──────────────────────────
"""
Reward function for GRPO / RLHF
• Seven Feynman-style axes → single 0-to-1 reward
• CUDA-aware, sentiment batched on GPU
• W&B logs three sample completions per call (no step mismatch)
"""

# -- Imports & env ─────────────────────────────────────────────────────────────────
import os, re
from typing import List, Dict

import torch
import nltk
from packaging import version
from transformers import pipeline
from textstat import flesch_reading_ease
import wandb

DEVICE = 0 if torch.cuda.is_available() else -1          # -1 → CPU

# -- One-time W&B init (safe if already initialised) --
if os.getenv("WANDB_DISABLED", "false").lower() != "true" and wandb.run is None:
    wandb.init(project="grpo-first-principles", reinit=True)

# -- Secure NLTK data (punkt_tab ≥ 3.8.2) --
if version.parse(nltk.__version__) >= version.parse("3.8.2"):
    nltk.download("punkt_tab", quiet=True)
    SENT_TOKENIZE = nltk.data.load("tokenizers/punkt_tab/english.pickle").tokenize
else:
    nltk.download("punkt", quiet=True)
    SENT_TOKENIZE = nltk.sent_tokenize

# -- Sentiment pipeline (batched) --
try:
    SENT_PIPE = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
        device=DEVICE,
        torch_dtype=torch.float16 if DEVICE >= 0 else None,
        batch_size=32,
    )
except Exception:
    SENT_PIPE = None  # sentiment optional

# -- Static vocabularies → CUDA tensors for bucketised lookup --
_CONCRETE = [
    "ball", "car", "house", "water", "air", "food", "game", "toy", "bicycle",
    "apple", "book", "chair", "door", "phone", "street", "bridge", "cup",
    "sand", "river", "tree", "mountain", "train", "clock", "computer", "lamp",
]

# Words that evoke a direct sensory experience
_SENSORY = [
    # vision
    "see", "look", "bright", "dark", "color", "shine", "glow",
    # touch / temperature
    "feel", "touch", "smooth", "rough", "soft", "hard", "warm", "cold",
    # hearing
    "hear", "sound", "loud", "quiet", "crackle", "whisper",
    # taste / smell
    "taste", "smell", "sweet", "bitter", "sour", "salty", "fresh", "fragrant",
]
_JARGON    = [
    'utilize','paradigm','synergy','leverage','optimize','streamline','methodology',
    'framework','infrastructure','scalable','robust','innovative','state-of-the-art'
]
def _vocab_tensor(words):      # hashed & sorted for bucketize
    return torch.tensor(sorted(hash(w) for w in words), device=DEVICE)
CONCRETE_T, SENSORY_T, JARGON_T = map(_vocab_tensor, (_CONCRETE, _SENSORY, _JARGON))

def _batch_hash(tokens: List[List[str]]) -> torch.Tensor:
    """Pad to equal length and return int64 [B,L] tensor of word hashes on DEVICE."""
    L = max(len(t) for t in tokens)
    mat = torch.full((len(tokens), L), 0, dtype=torch.int64, device=DEVICE)
    for i, tok in enumerate(tokens):
        mat[i, :len(tok)] = torch.tensor([hash(w) for w in tok], device=DEVICE)
    return mat

class FirstPrinciplesRewardV3:
    def __init__(self, device="cuda", max_batch_size=64, enable_caching=True):
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
        self.max_batch_size = max_batch_size
        self.enable_caching = enable_caching
        self._pattern_cache = {} if enable_caching else None

        # Better calibrated weights for first principles teaching
        self.WEIGHTS = {
            "analogy": 0.30,        # ↑ Increased - most important for Feynman style
            "step": 0.25,           # ↑ Increased - crucial for good explanations
            "fundamentals": 0.20,   # Keep high
            "engagement": 0.15,     # Good for interaction
            "clarity": 0.07,        # ↓ Reduced - less critical
            "completeness": 0.02,   # ↓ Much reduced - length not as important
            "no_jargon": 0.01,      # ↓ Minimal - absence of jargon less important than presence of good content
        }

        self._init_vocabularies()
        self._compile_patterns()

    def _init_vocabularies(self):
        """Initialize enhanced vocabularies with semantic grouping"""
        # Concrete objects (expanded and categorized)
        self.CONCRETE_PHYSICAL = [
            "ball", "car", "house", "water", "air", "food", "bicycle", "apple", "book",
            "chair", "door", "phone", "bridge", "cup", "sand", "river", "tree", "clock",
            "sheet", "rubber", "bowling", "marble", "fabric", "space", "earth", "moon",
            "pen", "slope", "dip", "curve", "surface", "weight"
        ]
        
        self.CONCRETE_ABSTRACT = [
            "game", "story", "picture", "music", "dance", "recipe", "map", "puzzle",
            "path", "journey", "process", "method", "system"
        ]
        
        # Sensory experience words (enhanced)
        self.SENSORY_VISUAL = ["see", "look", "bright", "dark", "color", "shine", "glow", "sparkle", "watch", "observe"]
        self.SENSORY_TACTILE = ["feel", "touch", "smooth", "rough", "soft", "hard", "warm", "cold", "heavy", "light"]
        self.SENSORY_AUDITORY = ["hear", "sound", "loud", "quiet", "whisper", "crackle", "ring", "noise"]
        self.SENSORY_OTHER = ["taste", "smell", "sweet", "bitter", "fresh", "fragrant"]
        
        # Convert to GPU tensors with improved hashing
        self.vocab_tensors = {}
        for name, vocab in {
            "concrete_physical": self.CONCRETE_PHYSICAL,
            "concrete_abstract": self.CONCRETE_ABSTRACT,
            "sensory_visual": self.SENSORY_VISUAL,
            "sensory_tactile": self.SENSORY_TACTILE,
            "sensory_auditory": self.SENSORY_AUDITORY,
            "sensory_other": self.SENSORY_OTHER,
        }.items():
            # Use stable hash for consistent results
            hashes = [hash(w) & 0x7FFFFFFF for w in vocab]  # Ensure positive
            self.vocab_tensors[name] = torch.tensor(sorted(hashes), device=self.device)

    def _compile_patterns(self):
        """Compile enhanced regex patterns with better coverage"""
        self.patterns = {
            "analogy_strong": re.compile(
                r'\b(like|similar to|imagine|as if|just like|comparable to|think of it as|'
                r'picture this|it\'s like|reminds me of|analogous to|sort of like)\b', re.I
            ),
            "analogy_weak": re.compile(
                r'\b(kind of|sort of|similar|resemble|compare|akin to)\b', re.I
            ),
            "step_indicators": re.compile(
                r'\b(first|second|third|next|then|after that|step by step|'
                r'initially|subsequently|finally|to begin|to start|now|when|following)\b', re.I
            ),
            "causal_connectors": re.compile(
                r'\b(because|therefore|as a result|this is why|which leads to|'
                r'consequently|thus|hence|so that|due to|since|leads to)\b', re.I
            ),
            "fundamental_phrases": re.compile(
                r'\b(at its core|at the core|fundamentally|basically|essentially|'
                r'from scratch|from the ground up|first principles?|'
                r'root cause|underlying|the essence|building block|foundation|'
                r'the basic idea|in essence|the nature of)\b', re.I
            ),
            "engagement_direct": re.compile(
                r'\b(does this|does that|do you see|can you see|can you picture|can you imagine|'
                r'have you noticed|have you ever|could you picture|picture this|'
                r'make sense|sound good|is that clear|is this clear|see how|'
                r'notice how|feel how|think about|consider this)\b', re.I
            ),
            "engagement_questions": re.compile(r'\?', re.I),
            "conclusion_indicators": re.compile(
                r'\b(so|therefore|in summary|overall|this explains|'
                r'to sum up|in conclusion|ultimately|does this help)\b', re.I
            ),
            "example_phrases": re.compile(
                r'\b(for example|for instance|such as|like when|'
                r'consider|take|let\'s say|imagine if)\b', re.I
            ),
        }
    
    def batch_score_optimized(self, texts: List[str], return_breakdown=False):
        """Optimized batch scoring with memory management"""
        if not texts:
            return []
        
        all_scores = []
        all_breakdowns = [] if return_breakdown else None

        for i in range(0, len(texts), self.max_batch_size):
            batch = texts[i:i + self.max_batch_size]
            scores, breakdowns = self._process_batch(batch, return_breakdown)
            all_scores.extend(scores)
            if return_breakdown:
                all_breakdowns.extend(breakdowns)
        
        return (all_scores, all_breakdowns) if return_breakdown else all_scores

    def _process_batch(self, texts: List[str], return_breakdown=False):
        """Process a single batch with enhanced scoring"""
        batch_size = len(texts)
        texts_lower = [t.lower() for t in texts]

        # Tokenize and create hash matrix
        tokens = [t.split() for t in texts_lower]
        max_len = max(len(t) for t in tokens) if tokens else 0
        
        # Create padded hash matrix on GPU
        hash_matrix = torch.zeros((batch_size, max_len), dtype=torch.int64, device=self.device)
        for i, tok_list in enumerate(tokens):
            if tok_list:
                hashes = [hash(w) & 0x7FFFFFFF for w in tok_list]
                hash_matrix[i, :len(hashes)] = torch.tensor(hashes, device=self.device)
        
        # Compute all scores
        scores = self._compute_enhanced_scores(texts, texts_lower, hash_matrix, tokens)
        
        # Weighted combination with improved normalization
        total_scores = torch.zeros(batch_size, device=self.device)
        for component, weight in self.WEIGHTS.items():
            total_scores += weight * scores[component]
        
        # Apply tanh for better score distribution (prevents saturation)
        final_scores = torch.tanh(total_scores).tolist()
        
        if return_breakdown:
            breakdowns = []
            for i in range(batch_size):
                breakdown = {k: v[i].item() if isinstance(v, torch.Tensor) else v[i] for k, v in scores.items()}
                breakdown["reward"] = final_scores[i]
                breakdowns.append(breakdown)
            return final_scores, breakdowns
        
        return final_scores, None
        
    def _compute_enhanced_scores(self, texts, texts_lower, hash_matrix, tokens):
        """Compute enhanced scoring with FIXED step-by-step detection"""
        batch_size = len(texts)
        
        # ✅ FIXED: Pattern matching with COUNT instead of boolean
        pattern_counts = {}
        for name, pattern in self.patterns.items():
            counts = torch.zeros(batch_size, device=self.device)
            for i, text_lower in enumerate(texts_lower):
                matches = len(pattern.findall(text_lower))  # COUNT matches, not just boolean
                counts[i] = matches
            pattern_counts[name] = counts
        
        # Vocabulary matching on GPU (unchanged)
        vocab_counts = {}
        for name, vocab_tensor in self.vocab_tensors.items():
            matches = torch.isin(hash_matrix, vocab_tensor)
            counts = torch.sum(matches, dim=1).float()
            vocab_counts[name] = counts
        
        # ✅ FIXED: Much better step-by-step scoring with counting
        step_scores = (
            0.6 * torch.clamp(pattern_counts["step_indicators"] / 3.0, max=1.0) +    # Normalize by 3, up to 0.6
            0.3 * torch.clamp(pattern_counts["causal_connectors"] / 2.0, max=1.0) +  # Normalize by 2, up to 0.3
            0.1 * self._compute_progression_score(texts)                             # Small structure bonus
        )
        
        # ✅ IMPROVED: Better analogy scoring with counting
        analogy_scores = (
            0.5 * torch.clamp(pattern_counts["analogy_strong"] / 2.0, max=1.0) +     # Count strong analogies
            0.2 * torch.clamp(pattern_counts["analogy_weak"] / 3.0, max=1.0) +       # Count weak analogies
            0.2 * torch.clamp(vocab_counts["concrete_physical"] / 6.0, max=1.0) +    # Concrete words
            0.05 * torch.clamp(vocab_counts["sensory_visual"] / 3.0, max=1.0) +      # Sensory words
            0.05 * torch.clamp(vocab_counts["sensory_tactile"] / 2.0, max=1.0)       # Tactile words
        )
        
        # ✅ IMPROVED: Better fundamental scoring with counting
        fundamental_scores = (
            0.7 * torch.clamp(pattern_counts["fundamental_phrases"] / 2.0, max=1.0) +  # Count fundamental phrases
            0.2 * self._compute_depth_score(texts_lower) +
            0.1 * torch.clamp(vocab_counts["concrete_physical"] / 12.0, max=1.0)
        )
        
        # ✅ IMPROVED: Better engagement scoring with counting
        engagement_scores = (
            0.4 * torch.clamp(pattern_counts["engagement_direct"] / 2.0, max=1.0) +    # Count engagement phrases
            0.3 * torch.clamp(pattern_counts["engagement_questions"] / 2.0, max=1.0) + # Count questions
            0.3 * self._compute_interactive_score(texts)
        )
        
        # Keep other scores the same
        clarity_scores = self._compute_clarity_score(texts)
        completeness_scores = self._compute_completeness_score(texts)
        jargon_scores = self._compute_jargon_score(texts_lower)
        
        return {
            "analogy": analogy_scores,
            "step": step_scores,
            "fundamentals": fundamental_scores,
            "engagement": engagement_scores,
            "clarity": clarity_scores,
            "completeness": completeness_scores,
            "no_jargon": jargon_scores,
        }

    def _compute_progression_score(self, texts: List[str]) -> torch.Tensor:
        """Enhanced progression scoring to better detect step-by-step explanations"""
        batch_size = len(texts)
        scores = torch.zeros(batch_size, device=self.device)
        
        for i, text in enumerate(texts):
            text_lower = text.lower()
            
            # Count explicit sequence indicators
            sequence_words = ['first', 'second', 'third', 'fourth', 'fifth', 
                             'next', 'then', 'after that', 'finally', 'lastly']
            sequence_count = sum(1 for word in sequence_words if word in text_lower)
            
            # Count transition words
            transitions = ['following', 'subsequently', 'now', 'when', 'after']
            transition_count = sum(1 for word in transitions if word in text_lower)
            
            total_indicators = sequence_count + transition_count
            
            if total_indicators >= 5:      # Excellent progression
                scores[i] = 1.0
            elif total_indicators >= 3:    # Good progression  
                scores[i] = 0.8
            elif total_indicators >= 2:    # Some progression
                scores[i] = 0.6
            elif total_indicators >= 1:    # Minimal progression
                scores[i] = 0.4
            else:
                scores[i] = 0.2
        
        return scores

    def _compute_depth_score(self, texts_lower: List[str]) -> torch.Tensor:
        """Compute depth score for fundamental understanding (more generous)"""
        batch_size = len(texts_lower)
        scores = torch.zeros(batch_size, device=self.device)
        
        depth_words = ['why', 'because', 'reason', 'cause', 'underlying', 'fundamental',
                      'how', 'what', 'explains', 'mechanism', 'process', 'nature']
        
        for i, text in enumerate(texts_lower):
            depth_count = sum(1 for word in depth_words if word in text)
            scores[i] = min(1.0, depth_count / 3.0)  # More generous normalization
        
        return scores

    def _compute_interactive_score(self, texts: List[str]) -> torch.Tensor:
        """Compute interactive engagement score (more generous)"""
        batch_size = len(texts)
        scores = torch.zeros(batch_size, device=self.device)
        
        interactive_patterns = [
            r'\b(do you|can you|have you|did you|will you|would you)\b',
            r'\b(imagine|picture this|think about|consider|notice)\b',
            r'\b(let\'s|we can|we should|we need)\b',
            r'\b(see how|feel how|watch|observe)\b',
            r'\b(does this|is this|makes sense|clear|understand)\b'
        ]
        
        for i, text in enumerate(texts):
            text_lower = text.lower()
            score = 0.0
            
            # Count questions (more generous)
            question_count = text.count('?')
            score += min(0.5, question_count * 0.2)  # More generous
            
            # Count interactive patterns
            pattern_matches = 0
            for pattern in interactive_patterns:
                matches = len(re.findall(pattern, text_lower))
                pattern_matches += matches
            
            score += min(0.4, pattern_matches * 0.08)  # More generous
            
            # Direct address indicators (expanded)
            direct_address = ['you', 'your', 'yourself', 'we', 'us', 'our']
            address_count = sum(1 for word in text_lower.split() if word in direct_address)
            score += min(0.1, address_count * 0.01)  # Small bonus
            
            scores[i] = min(1.0, score)
        
        return scores

    def _compute_clarity_score(self, texts: List[str]) -> torch.Tensor:
        """Compute clarity score (more generous)"""
        batch_size = len(texts)
        scores = torch.zeros(batch_size, device=self.device)
        
        for i, text in enumerate(texts):
            try:
                from textstat import flesch_reading_ease
                flesch_score = flesch_reading_ease(text)
                if flesch_score >= 50:      # More generous threshold
                    scores[i] = 1.0
                elif flesch_score >= 40:    # More generous
                    scores[i] = 0.8
                elif flesch_score >= 30:    # More generous
                    scores[i] = 0.6
                else:
                    scores[i] = 0.4
            except:
                # Fallback: simple sentence length analysis
                word_count = len(text.split())
                sentence_count = max(1, text.count('.') + text.count('!') + text.count('?'))
                avg_words = word_count / sentence_count
                
                if avg_words <= 20:
                    scores[i] = 0.8
                elif avg_words <= 30:
                    scores[i] = 0.6
                else:
                    scores[i] = 0.4
        
        return scores

    def _compute_completeness_score(self, texts: List[str]) -> torch.Tensor:
        """Compute completeness score (more generous)"""
        batch_size = len(texts)
        scores = torch.zeros(batch_size, device=self.device)
        
        for i, text in enumerate(texts):
            word_count = len(text.split())
            
            # More generous length requirements
            if 50 <= word_count <= 400:    # Expanded range
                base_score = 0.9
            elif 30 <= word_count <= 500:  # Even more generous
                base_score = 0.7
            elif word_count >= 20:         # Minimum threshold
                base_score = 0.5
            else:
                base_score = 0.3
            
            # Bonus for conclusion indicators
            text_lower = text.lower()
            conclusions = ['so', 'therefore', 'in summary', 'overall', 'this explains', 
                          'does this help', 'make sense', 'clear', 'understand', 'click']
            if any(phrase in text_lower for phrase in conclusions):
                base_score += 0.1
            
            scores[i] = min(1.0, base_score)
        
        return scores

    def _compute_jargon_score(self, texts_lower: List[str]) -> torch.Tensor:
        """Compute jargon avoidance score (more generous)"""
        batch_size = len(texts_lower)
        scores = torch.zeros(batch_size, device=self.device)
        
        # Expanded but more reasonable jargon detection
        jargon_terms = {
            'utilize', 'paradigm', 'synergy', 'leverage', 'optimize', 'streamline',
            'methodology', 'framework', 'infrastructure', 'scalable', 'robust',
            'innovative', 'cutting-edge', 'state-of-the-art', 'holistic', 'comprehensive',
            'stakeholder', 'deliverable', 'actionable', 'bandwidth', 'implementation'
        }
        
        for i, text in enumerate(texts_lower):
            words = text.split()
            if not words:
                scores[i] = 1.0
                continue
                
            jargon_count = sum(1 for word in words if word.strip('.,!?;:') in jargon_terms)
            jargon_ratio = jargon_count / len(words)
            
            # More forgiving jargon penalties
            if jargon_ratio == 0:
                scores[i] = 1.0
            elif jargon_ratio <= 0.03:  # Up to 3% is fine
                scores[i] = 0.95
            elif jargon_ratio <= 0.07:  # Up to 7% gets good score
                scores[i] = 0.8
            elif jargon_ratio <= 0.15:  # Up to 15% gets decent score
                scores[i] = 0.6
            else:
                scores[i] = 0.3
        
        return scores

    # Convenience methods for single text scoring
    def score(self, text: str) -> float:
        """Score a single text"""
        return self.batch_score_optimized([text])[0]
    
    def breakdown(self, text: str) -> dict:
        """Get detailed breakdown for a single text"""
        _, breakdowns = self.batch_score_optimized([text], return_breakdown=True)
        breakdown = breakdowns[0]
        
        # Rename keys to match expected format
        result = {}
        key_mapping = {
            "analogy": "analogy",
            "step": "step", 
            "fundamentals": "fundamentals",
            "engagement": "engagement",
            "clarity": "clarity",
            "completeness": "complete",
            "no_jargon": "nojargon",
            "reward": "reward"
        }
        
        for old_key, new_key in key_mapping.items():
            if old_key in breakdown:
                result[new_key] = breakdown[old_key]
        
        return result

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


#Later

In [11]:
# Import in this specific order
import torch
from transformers import AutoTokenizer
import vllm  # Import vllm first
from transformers import Qwen2VLForConditionalGeneration, BitsAndBytesConfig, AutoModelForCausalLM
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, PeftModel

base_model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # The original base model
sft_model_name = "KhushalM/Qwen2.5-1.5BSFT"      # Your SFT model with adapters

# Quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

print("Loading base model...")
# Step 1: Load the clean base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    low_cpu_mem_usage=True,
    use_cache=False,
)

print("Loading SFT adapters...")
# Step 2: Load your SFT adapters onto the clean base model
model = PeftModel.from_pretrained(
    base_model,
    sft_model_name,
    trust_remote_code=True,
)

print("Preparing for GRPO training...")
# Step 3: Prepare for additional training (GRPO)
# ✅ FIXED: Use the SFT model's tokenizer, not base model's
print("Loading tokenizer from SFT model...")
tokenizer = AutoTokenizer.from_pretrained(
    sft_model_name,  # ✅ Use "KhushalM/Qwen2.5-1.5BSFT" 
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
print(f"Original vocab size: {model.config.vocab_size}")
print(f"Tokenizer vocab size: {len(tokenizer)}")

print("✅ Using SFT model's tokenizer for consistency")

# Prepare for training
#model = prepare_model_for_kbit_training(model)
#model.config.use_cache = False

print("✅ Model loaded with SFT adapters preserved!")

# Test the model
test_prompt = "Explain gravity:"
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"✅ Test generation: {response[len(test_prompt):].strip()}")


Loading base model...
Loading SFT adapters...
Preparing for GRPO training...
Loading tokenizer from SFT model...
Original vocab size: 151936
Tokenizer vocab size: 151665
✅ Using SFT model's tokenizer for consistency
✅ Model loaded with SFT adapters preserved!
✅ Test generation: What causes objects to fall down when dropped, and how does it affect the motion of planets in space? Gravity is a fundamental force that attracts two objects with mass toward each other. The strength of this attraction depends on the masses involved and their distance apart.

When you drop an object, it falls because Earth's mass pulls it towards its center. This pull makes the object accelerate, changing its direction from straight up to downward. In fact, if you were standing still at the top of a tower,


In [12]:
# LoRA Setup - FIXED to continue training existing SFT adapters
print("Checking for existing LoRA configuration...")

if hasattr(model, "peft_config") and model.peft_config:
    print("✅ Found existing SFT LoRA configuration - continuing training from checkpoint")

    for name, param in model.named_parameters():
        if getattr(param, "is_lora", False) or "lora" in name:
            param.requires_grad = True
    
    print("✅ Existing SFT LoRA adapters are now trainable for GRPO fine-tuning")
else:
    print("❌ No existing LoRA adapters found - this shouldn't happen with SFT model")
    # Fallback: create new LoRA if none exist
    peft_config = LoraConfig(
        r=config["lora_r"],
        lora_alpha=config["lora_alpha"],
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, peft_config)
    print("⚠️  Created new LoRA adapters as fallback")

print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
print(f"Total Parameters: {sum(p.numel() for p in model.parameters()):,}")
trainable_percentage = (sum(p.numel() for p in model.parameters() if p.requires_grad) / 
                       sum(p.numel() for p in model.parameters())) * 100
print(f"Trainable: {trainable_percentage:.2f}%")

Checking for existing LoRA configuration...
✅ Found existing SFT LoRA configuration - continuing training from checkpoint
✅ Existing SFT LoRA adapters are now trainable for GRPO fine-tuning
Trainable Parameters: 8,716,288
Total Parameters: 897,332,736
Trainable: 0.97%


In [13]:
# ── Global scorer instance + W&B-safe reward wrapper ───────────────────────

_SCORER = FirstPrinciplesRewardV3()  # ✅ Use the updated class

# 🔧 PROMPT EXTRACTION FIX - This fixes the "[Prompt extraction failed]" issue

def extract_prompt_from_batch(batch):
    """Simplified prompt extraction for GRPO batches."""
    # Get batch size
    batch_size = len(next(iter(batch.values())))

    if 'prompts' in batch and 'prompt' not in batch:
        batch['prompt'] = batch['prompts']
    
    # Strategy 1: Direct prompt field (90% of cases)
    if 'prompt' in batch:
        prompts = batch['prompt']
        if isinstance(prompts, (list, tuple)):
            return [str(p) for p in prompts]
        return [str(prompts)] * batch_size
    
    # Strategy 2: Common alternatives
    for key in ['query', 'text', 'input']:
        if key in batch:
            values = batch[key]
            if isinstance(values, (list, tuple)):
                return [str(v) for v in values]
            return [str(values)] * batch_size
    
    # Fallback
    return [f"[No prompt found]"] * batch_size


# Create improved reward function with essential logging only
def fp_reward(scorer):
    def improved_fp_reward(completions, **batch):
        try:
            rewards, breakdowns = scorer.batch_score_optimized(completions, return_breakdown=True)
        except Exception as e:
            rewards = [0.5] * len(completions)  # Safe fallback
            breakdowns = [{}] * len(completions)
        
        scaled_rewards = [2.0 * r - 1.0 for r in rewards]
        
        if wandb.run is not None:
            try:
                # Use improved prompt extraction
                prompts = extract_prompt_from_batch(batch)
                
                # Essential metrics only
                component_metrics = {
                    "reward/mean": np.mean(scaled_rewards),
                }
                
                # Mean component scores - essential for monitoring
                if breakdowns:
                    for component in ["analogy", "step", "fundamentals", "engagement", "clarity", "completeness", "no_jargon"]:
                        values = [b.get(component, 0) for b in breakdowns if b]
                        if values:
                            component_metrics[f"components/{component}_mean"] = np.mean(values)
                
                # Prompts/completion advantage table - essential output requested
                if len(completions) > 0:
                    sample_data = []
                    num_samples = min(3, len(completions))
                    
                    for i in range(num_samples):
                        prompt = prompts[i] if i < len(prompts) else "[No prompt]"
                        completion = completions[i]
                        reward = scaled_rewards[i]
                        
                        # Clean text for wandb
                        clean_prompt = prompt.replace('\\n', ' ')[:100]
                        clean_completion = completion.replace('\\n', ' ')[:150]
                        
                        sample_data.append([
                            clean_prompt,
                            clean_completion,
                            f"{reward:.3f}"
                        ])
                    
                    advantage_table = wandb.Table(
                        columns=["prompt", "completion", "advantage"],
                        data=sample_data
                    )
                    component_metrics["prompts_completion_advantage"] = advantage_table
                
                # Log to wandb
                wandb.log(component_metrics)
                
            except Exception:
                pass  # Continue without logging rather than crashing

        return scaled_rewards
    
    return improved_fp_reward

# Apply the reward function
fp_reward = fp_reward(_SCORER)


print("✅ Enhanced reward function created!")
print("🎯 Features: Comprehensive logging, memory-safe tables, component breakdown")

# Test the enhanced reward function
test_samples = [
    "Gravity works by bending spacetime. When objects have mass, they curve the fabric of spacetime around them, and other objects follow the straightest possible path through this curved space, which appears to us as gravitational attraction.",
    
    "Objects fall because Earth pulls them down with a force called gravity."
]

print("\\n🧪 Testing enhanced reward function...")
test_rewards = fp_reward(test_samples)
print(f"✅ Test complete! Rewards: {[f'{r:.3f}' for r in test_rewards]}")
# ───────────────────────────────────────────────────────────────────────────

# ▶️ Run this cell to sanity-check the reward function
import pandas as pd
import importlib
import sys, pathlib

# Adjust this if the file lives elsewhere
sys.path.append(str(pathlib.Path('.').resolve()))

samples = [
    "**Group Relative Proximal Optimization (GRPO)** is a method used to solve optimization problems that arise in machine learning, particularly in training large language models. It is designed to handle non-convex objectives efficiently by breaking them into smaller, more manageable sub-problems. The term \"proximal\" refers to a technique where we minimize a proximal operator, which can be thought of as a smoothing or regularization step.",

    # ✅ FIXED: Concatenated the multi-line string properly
    "First, imagine you're standing on a huge soft rubber sheet stretched tight. "
    "Place a bowling ball in the middle—feel how the surface dips under its weight. "
    "Second, roll a marble across the sheet. The marble curves inward because the slope guides it, just like rain sliding down a car-window. "
    "Third, swap the rubber for the invisible fabric of space-time and the bowling ball for Earth. Mass fundamentally bends this fabric; that bend is what we call a gravitational field. "
    "Next, release a pen. It isn't \"pulled\" by a mysterious force; it simply follows the easiest downhill path in the curved fabric, exactly like the marble. Why does it speed up? Because the slope gets steeper closer to Earth's center, so each moment the pen points farther \"downhill.\" "
    "Then, notice that the Moon also rides this curve, but sideways momentum keeps it sliding around the bowl instead of falling straight in—an orbit is just perpetual falling with a sideways shove. "
    "Finally, to check your intuition: if Earth vanished, the fabric would flatten and the pen would have no slope to follow—so it would float. "
    "In summary, objects fall because mass reshapes space-time, tilting every path toward the mass; the steeper the tilt, the faster the fall. Does that picture click?"
]

# Compute scalar rewards
scalar_rewards = fp_reward(samples)

# ✅ FIXED: Use V3 for breakdown analysis too
_sc = FirstPrinciplesRewardV3()  # Changed from FirstPrinciplesReward()
rows = [_sc.breakdown(t) for t in samples]

df = pd.DataFrame(rows)
df.insert(0, "text_snippet", [t[:60] + "…" for t in samples])
df["reward_scalar"] = scalar_rewards
df

✅ Enhanced reward function created!
🎯 Features: Comprehensive logging, memory-safe tables, component breakdown
\n🧪 Testing enhanced reward function...
✅ Test complete! Rewards: ['-0.648', '-0.669']


Unnamed: 0,text_snippet,analogy,step,fundamentals,engagement,clarity,complete,nojargon,reward,reward_scalar
0,**Group Relative Proximal Optimization (GRPO)*...,0.0,0.02,0.0,0.003,0.4,1.0,1.0,0.063365,-0.87327
1,"First, imagine you're standing on a huge soft ...",0.725,1.0,0.65,0.898,1.0,1.0,1.0,0.681656,0.363311


In [14]:
# GRPO training arguments using verifiers defaults
# First get the default training arguments from verifiers
import types

from trl import GRPOConfig
training_args = GRPOConfig(
    run_name="Qwen/Qwen2.5-1.5B-grpo",
    label_names=["labels"],
    generation_kwargs={
        "max_new_tokens": 256,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "do_sample": True,
    },
    logging_strategy = "steps",        # <-- string (NO trailing comma)
    logging_steps    = 2,              # every 10 optimiser steps
    report_to        = ["wandb"],      # <-- list, not str
    eval_strategy = "steps",     # (optional) run eval during training
    eval_steps         = 15,
    beta=0.05,              # ✅ CORRECT: KL coefficient for GRPO
    epsilon=0.2,            # ✅ Clipping parameter
    num_iterations=1,       # ✅ Iterations per batch
    num_generations=4,
    max_completion_length=256,
    temperature=0.7,
    disable_dropout=True,
)

# Then modify the arguments we need to customize
training_args.output_dir = output_dir
training_args.learning_rate = config["learning_rate"]  # Now 5e-5
training_args.per_device_train_batch_size = 4  # OPTIMIZED: Increased from 16 to 4 (more manageable)
training_args.gradient_accumulation_steps = 16  # OPTIMIZED: Increased to maintain effective batch size
training_args.num_train_epochs = config["num_train_epochs"]
training_args.save_steps = 25
training_args.eval_steps = 50
training_args.warmup_steps = 51  # ADDED: 10% of total steps (51 steps total, so ~5 warmup steps)
training_args.warmup_ratio = 0.1  # ADDED: Alternative warmup specification
training_args.lr_scheduler_type = "cosine"
training_args.push_to_hub = True
training_args.hub_model_id = hub_model_id
training_args.bf16 = True 
training_args.fp16=False
training_args.generation_kwargs["max_length"] = 256
training_args.dataloader_num_workers = 4

training_args.log_completions = True
training_args.log_rewards = True

dataset_size = len(dataset['train'])
effective_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
steps_per_epoch = dataset_size // effective_batch_size
total_steps = steps_per_epoch * training_args.num_train_epochs

print("🎯 ENHANCED TRAINING CONFIGURATION")
print("="*50)
print(f"📊 Dataset size: {dataset_size}")
print(f"📦 Effective batch size: {effective_batch_size}")
print(f"📈 Steps per epoch: {steps_per_epoch}")
print(f"🎯 Total training steps: {total_steps}")
print(f"📝 Logging every: {training_args.logging_steps} steps")
print(f"💾 Saving every: {training_args.save_steps} steps")
print(f"🔍 Evaluation every: {training_args.eval_steps} steps")
print(f"🚀 Learning rate: {training_args.learning_rate}")
print(f"🌡️ Warmup steps: {training_args.warmup_steps}")
print("="*50)

if total_steps < 100:
    print("⚠️  WARNING: Still low step count! Consider:")
    print("   - Decreasing per_device_train_batch_size to 2")
    print("   - Decreasing gradient_accumulation_steps to 4") 
    print("   - Increasing num_train_epochs to 8")

🎯 ENHANCED TRAINING CONFIGURATION
📊 Dataset size: 540
📦 Effective batch size: 64
📈 Steps per epoch: 8
🎯 Total training steps: 24
📝 Logging every: 2 steps
💾 Saving every: 25 steps
🔍 Evaluation every: 50 steps
🚀 Learning rate: 5e-05
🌡️ Warmup steps: 51
   - Decreasing per_device_train_batch_size to 2
   - Decreasing gradient_accumulation_steps to 4
   - Increasing num_train_epochs to 8


In [15]:
print(torch.cuda.memory_allocated()/1e9, "GB allocated")
print(torch.cuda.memory_reserved()/1e9,  "GB reserved")


1.454441472 GB allocated
2.145386496 GB reserved


In [16]:
import warnings
from transformers import logging as transformers_logging

# Enhanced warning suppression
warnings.filterwarnings("ignore", message="torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly.*")
warnings.filterwarnings("ignore", message="None of the inputs have requires_grad=True. Gradients will be None")
warnings.filterwarnings("ignore", message=".*Caching is incompatible with gradient checkpointing.*")
transformers_logging.set_verbosity_error()

# Setup environment
import os
os.environ["FLASH_ATTENTION_USE_PACKED_QKV"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Prevent tokenizer warnings


In [None]:

from transformers import TrainerCallback
import wandb, html, re, unicodedata

# 5. Create a simpler approach - use standard TRL GRPOTrainer with a fixed reward function
from trl import GRPOTrainer, GRPOConfig
from trl.core import LengthSampler
import functools

training_args.num_generations = 4
training_args.per_device_train_batch_size = 32  # Reduce to minimize complexity


# CORRECT FORMAT FOR GRPO
def format_for_grpo(example):
    return {
        "prompt": example['prompt'], 
        "completion": example['completion']
    }

grpo_train_dataset = dataset['train'].map(format_for_grpo)
grpo_eval_dataset = dataset['test'].map(format_for_grpo)

# Update training args
training_args.remove_unused_columns = False
training_args.label_names = ["labels"]
trainer = GRPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=grpo_train_dataset,
    eval_dataset=grpo_eval_dataset, # Your evaluation dataset
    reward_funcs=[fp_reward],
)

print("\\n🚀 STARTING ENHANCED GRPO TRAINING...")
# Force all model parameters to participate in gradient computation
model.train()
for param in model.parameters():
  if param.requires_grad:
      param.retain_grad()

# Ensure model forward pass uses gradients
torch.set_grad_enabled(True)
# Start training with enhanced logging
print(f"Actual epochs: {training_args.num_train_epochs}")
print(f"Actual batch size: {training_args.per_device_train_batch_size}")
print(f"Actual grad accum: {training_args.gradient_accumulation_steps}")
print(f"Actual dataloader length: {len(trainer.get_train_dataloader())}")
trainer.train()

Map:   0%|          | 0/540 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

\n🚀 STARTING ENHANCED GRPO TRAINING...
Actual epochs: 3
Actual batch size: 32
Actual grad accum: 16
Actual dataloader length: 68


Step,Training Loss,Validation Loss


In [20]:
final_model_path = f"{output_dir}/final_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Model saved to {final_model_path}")
print("Model files:")
!ls -la {final_model_path}

Model saved to ./grpo_vf_results/final_model
Model files:
total 49632
drwxr-xr-x 2 root root     4096 Jul  7 16:28 .
drwxr-xr-x 4 root root     4096 Jul  7 16:28 ..
-rw-r--r-- 1 root root     5219 Jul  7 18:32 README.md
-rw-r--r-- 1 root root      860 Jul  7 18:32 adapter_config.json
-rw-r--r-- 1 root root 34895152 Jul  7 18:32 adapter_model.safetensors
-rw-r--r-- 1 root root      605 Jul  7 18:32 added_tokens.json
-rw-r--r-- 1 root root     2507 Jul  7 18:32 chat_template.jinja
-rw-r--r-- 1 root root  1671853 Jul  7 18:32 merges.txt
-rw-r--r-- 1 root root      496 Jul  7 18:32 special_tokens_map.json
-rw-r--r-- 1 root root 11422060 Jul  7 18:32 tokenizer.json
-rw-r--r-- 1 root root     4680 Jul  7 18:32 tokenizer_config.json
-rw-r--r-- 1 root root     6929 Jul  7 18:32 training_args.bin
-rw-r--r-- 1 root root  2776833 Jul  7 18:32 vocab.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Assuming quantization_config is already defined in a previous cell
# If not, you might need to add it here or ensure the previous cell is run

# Test the fine-tuned model
finetuned_model_path = f"{output_dir}/final_model" # Use the local saved model path
orginal_model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# Load tokenizers separately for each model
print("Loading tokenizer for fine-tuned model...")
finetuned_tokenizer = AutoTokenizer.from_pretrained(
    finetuned_model_path,
    trust_remote_code=True,
)
finetuned_tokenizer.pad_token = finetuned_tokenizer.eos_token
# Ensure the chat template is set for the tokenizer used by the pipeline


print("Loading tokenizer for original model...")
original_tokenizer = AutoTokenizer.from_pretrained(
    orginal_model_name,
    trust_remote_code=True,
)
original_tokenizer.pad_token = original_tokenizer.eos_token
# Ensure the chat template is set for the tokenizer used by the pipeline


def test_model_with_pipeline(prompt, model_path, tokenizer, model_name, max_new_tokens=128):
    print(f"\nTesting model: {model_name}")

    # Determine if loading from local files
    #is_local = (model_path == finetuned_model_path)

    # Load the model using pipeline
    # Pass local_files_only as a direct argument when loading from a local path
    generator = pipeline(
        "text-generation",
        model=model_path,
        tokenizer=tokenizer, # Pass the tokenizer with the chat template
        # Removed device="cuda"
        model_kwargs={"quantization_config": quantization_config, "torch_dtype": torch.bfloat16}, # Removed trust_remote_code
        #local_files_only=is_local # Pass as a direct argument
    )

    messages = [
        {"role": "system", "content": "You are an expert educator who explains concepts from first principles like Richard Feynman. Start with fundamental truths, use simple analogies, and avoid jargon."},
        {"role": "user", "content": prompt}
    ]

    # The pipeline will automatically apply the chat template if set on the tokenizer
    output = generator(
        messages,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id, # Set pad_token_id for generation
        do_sample=True, # Ensure sampling is enabled if temperature is set
        temperature=0.7,
    )

    if output and output[0] and "generated_text" in output[0]:
        # The pipeline with return_full_text=False returns only the new tokens
        # However, sometimes it might still include parts of the prompt depending on the model/tokenizer
        # Let's try to clean up the response to only get the assistant part
        generated_text = output[0]["generated_text"].strip()
        # Simple check to remove prompt if it's still included
        if generated_text.startswith("<start_of_turn>user"):
             # Find the start of the assistant's turn if the full conversation is returned
             assistant_start_index = generated_text.find("<start_of_turn>model")
             if assistant_start_index != -1:
                  generated_text = generated_text[assistant_start_index:].strip()


        # Remove the start/end turn tokens if they are present
        if generated_text.startswith("<start_of_turn>model"):
             generated_text = generated_text[len("<start_of_turn>model\n"):].strip()
        if generated_text.endswith("<end_of_turn>"):
             generated_text = generated_text[:-len("<end_of_turn>")].strip()


        return generated_text
    else:
        return "Could not generate response."

# Test with the specific prompt
test_question = "Explain how GRPO (Group Relative Proximal Optimization) works in Reinforcement LEarning a LLM"

print("\nTesting the models with the specific question:")

# Test original model
response_original = test_model_with_pipeline(test_question, orginal_model_name, original_tokenizer, orginal_model_name)
print(f"Original Model Response:\n{response_original}")

print("-" * 80)

# Test fine-tuned model
response_finetuned = test_model_with_pipeline(test_question, finetuned_model_path, finetuned_tokenizer, finetuned_model_path)
print(f"Finetuned Model Response:\n{response_finetuned}")

print("-" * 80)

Loading tokenizer for fine-tuned model...
Loading tokenizer for original model...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]


Testing the models with the specific question:

Testing model: Qwen/Qwen2.5-1.5B-Instruct
Original Model Response:
Sure! Let's break down the concept of Group Relative Proximal Optimization (GRPO) in the context of reinforcement learning (RL), specifically within the framework of Language Models (LLMs).

### Understanding RL and LLMs

**Reinforcement Learning (RL)** is a type of machine learning where an agent learns to make decisions by interacting with its environment. The goal is to maximize some form of reward or achieve certain objectives.

A **Language Model (LLM)** is a model that can generate text based on input data. It's designed to predict the next word or sentence given the preceding ones.

### GRPO: A
--------------------------------------------------------------------------------

Testing model: ./grpo_vf_results/final_model
Finetuned Model Response:
Imagine you’re trying to teach a robot to walk by showing it lots of pictures where the robot steps forward or backward ma

In [22]:
# Finish WandB run
wandb.finish()
print("Training complete! Check your WandB dashboard for training metrics.")


0,1
batch/mean_reward,█▁▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▂▂▂▂▂█▂▂▂▁▃▁▁▁▁▁▁▂▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁
profiling/Time taken: GRPOTrainer._get_per_token_logps,█▁█▁█▁█▁▁▂▇▁▁▁▇▂▂▇▁▁█▂▁▇▇█▇▁▁█▁▁██▁▁█▂▁▇
profiling/Time taken: GRPOTrainer._prepare_inputs,▄▃▁▃▁▄▃▂▂▅▃▄▃▇▅█▂▅▅▅▃▅▂▅▃▂▁▆▂▅▇▂▂▂▅▃▇▇▅▂
profiling/Time taken: GRPOTrainer.compute_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁
profiling/Time taken: GRPOTrainer.fp_reward,▅▇▆▇█▄▄▁▃▄▂▆█▆▃▄▄▆▂▂▆▇▄▄▂▂▆▃▆█▃▄▄▃▁▄▅▃▇▃
train/clip_ratio/high_max,▁▁
train/clip_ratio/high_mean,▁▁
train/clip_ratio/low_mean,▁▁
train/clip_ratio/low_min,▁▁

0,1
batch/mean_reward,-0.83787
profiling/Time taken: GRPOTrainer._calculate_rewards,0.28558
profiling/Time taken: GRPOTrainer._get_per_token_logps,0.14364
profiling/Time taken: GRPOTrainer._prepare_inputs,24.18212
profiling/Time taken: GRPOTrainer.compute_loss,0.21047
profiling/Time taken: GRPOTrainer.fp_reward,0.28489
total_flos,0.0
train/clip_ratio/high_max,0.0
train/clip_ratio/high_mean,0.0
train/clip_ratio/low_mean,0.0


Training complete! Check your WandB dashboard for training metrics.


In [23]:
import shutil
import os
from IPython.display import FileLink

# Path to your fine-tuned model folder
#final_model_path = "./grpo_vf_results"  # Update if your folder name is different

# Output ZIP file name
zip_name = "qwen2.5-1.5B-grpo_finetuned"

# Create ZIP archive
shutil.make_archive(zip_name, 'zip', final_model_path)

# Display a download link (works in Jupyter)
zip_file = zip_name + ".zip"
if os.path.exists(zip_file):
    display(FileLink(zip_file))
    print("✅ Model zipped! Click the link above to download.")
else:
    print("❌ Failed to create ZIP file.")


✅ Model zipped! Click the link above to download.


In [24]:
responses = [
    # Original Model
    "Imagine you're trying to teach your robot to pick up toys by showing it pictures of toys and rewards for getting them right. GRPO is like teaching the robot through many small steps where it learns about picking nearby toys better than far ones. It focuses on actions that are close together, helping the robot learn faster and more accurately. In real life, this helps machines make decisions based on immediate feedback instead of complex calculations, making learning easier and quicker. Does this toy pickup example help you understand GRPO?",
    
    # Fine-tuned Model  
    "Okay, imagine you're playing a game where your goal is to get points by choosing the right moves. In reinforcement learning, we teach a machine to learn what actions lead to more rewards or punishment over time. GRPO is a way of making this process faster by looking at nearby choices instead of all possible ones. Think of it as focusing on similar situations rather than every single move – this helps the machine improve quicker. So, GRPO simplifies learning by reducing complexity in decision-making, just like narrowing down paths in a maze. Does this help you understand how GRPO improves RL?"
]

print("🎯 REWARD FUNCTION COMPARISON:")
print("=" * 60)

for i, response in enumerate(responses):
    model_type = "Original SFT" if i == 0 else "GRPO Fine-tuned"
    breakdown = _sc.breakdown(response)
    
    print(f"\n{model_type}:")
    for key, value in breakdown.items():
        if key != 'reward':
            print(f"  {key:15s}: {value:.3f}")
    print(f"  {'TOTAL':15s}: {breakdown.get('reward', 0):.3f}")

🎯 REWARD FUNCTION COMPARISON:

Original SFT:
  analogy        : 0.500
  step           : 0.020
  fundamentals   : 0.067
  engagement     : 0.488
  clarity        : 1.000
  complete       : 1.000
  nojargon       : 1.000
  TOTAL          : 0.329

GRPO Fine-tuned:
  analogy        : 0.567
  step           : 0.020
  fundamentals   : 0.200
  engagement     : 0.491
  clarity        : 1.000
  complete       : 1.000
  nojargon       : 1.000
  TOTAL          : 0.370
