In [1]:
# Install required packages
!uv pip install -q verifiers datasets transformers torch peft accelerate bitsandbytes wandb huggingface_hub
!uv pip install -q vllm
!uv pip install trl

[2mAudited [1m1 package[0m [2min 16ms[0m[0m


In [2]:
import torch
import json
import os
import wandb
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, Qwen2VLForConditionalGeneration, BitsAndBytesConfig
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Import verifiers framework
import verifiers as vf

# Device configuration
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "mps"
print(f"Using device: {device}")


Using device: cuda


In [3]:
## 2. Configuration
import vllm
import trl
!pip show vllm

INFO 06-27 19:15:54 [__init__.py:244] Automatically detected platform cuda.
[0m

In [4]:
# Model and training configuration
model_name = "KhushalM/Qwen2.5-1.5BSFT"
output_dir = "./grpo_vf_results"
hub_model_id = "KhushalM/Qwen2.5-1.5B-GRPO"

# Training hyperparameters
config = {
    "model": model_name,
    "task": "First Principles Explanations",
    "framework": "verifiers",
    "reward_strategy": "Multi-component First Principles Reward",
    "lora_r": 32,
    "lora_alpha": 64,
    "learning_rate": 1e-5,
    "batch_size": 12,
    "gradient_accumulation_steps": 8,
    "num_train_epochs": 3,
    "num_generations": 6,
    "reward_components": {
        "analogy_quality": 0.20,
        "step_by_step": 0.15,
        "fundamental_concepts": 0.20,
        "engagement": 0.15,
        "clarity": 0.15,
        "completeness": 0.10,
        "avoid_jargon": 0.05
    }
}

print("Configuration loaded successfully!")


Configuration loaded successfully!


In [5]:
# Login to Hugging Face and Weights & Biases
login()
wandb.login()

# Initialize wandb with verifiers-specific config
wandb.init(
    project="qwen2.5-1.5B-first-principles",
    config=config
)

print("Authentication completed!")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

[34m[1mwandb[0m: Currently logged in as: [33mkhushal-mandavia72[0m ([33mkhushal-mandavia72-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Authentication completed!


In [6]:
from trl import GRPOConfig

In [7]:
# Load dataset file - specify the path to your dataset
# Update this path to point to your actual dataset file
dataset_filename = "structured_dataset.json"  # Default dataset file

# Alternative: You can also specify other available datasets
# dataset_filename = "data.json"
# dataset_filename = "dataset.json"
# dataset_filename = "first_principles_dataset.json"

print(f"Loading dataset: {dataset_filename}")

# Verify the file exists
if os.path.exists(dataset_filename):
    print(f"Dataset file '{dataset_filename}' found successfully!")
    file_size = os.path.getsize(dataset_filename)
    print(f"File size: {file_size / 1024:.2f} KB")
else:
    # Try to find available dataset files in current directory
    available_datasets = [f for f in os.listdir('.') if f.endswith('.json')]
    if available_datasets:
        print(f"Dataset file '{dataset_filename}' not found!")
        print(f"Available dataset files: {available_datasets}")
        print(f"Please update the 'dataset_filename' variable to use one of the available files.")
    else:
        print("No JSON dataset files found in current directory.")
        print("Please ensure your dataset file is in the same directory as this notebook.")
    raise FileNotFoundError(f"Dataset file '{dataset_filename}' not found!")


Loading dataset: structured_dataset.json
Dataset file 'structured_dataset.json' found successfully!
File size: 706.00 KB


In [8]:
# Load and preview the dataset
print("Loading dataset...")
with open(dataset_filename, "r") as f:
    dataset_raw = json.load(f)

print(f"Dataset loaded successfully!")
print(f"Number of samples: {len(dataset_raw)}")
print(f"Sample structure: {list(dataset_raw[0].keys()) if dataset_raw else 'Empty dataset'}")

# Preview first sample
if dataset_raw:
    print("\nFirst sample preview:")
    sample_preview = json.dumps(dataset_raw[0], indent=2)
    print(sample_preview[:500] + "..." if len(sample_preview) > 500 else sample_preview)


Loading dataset...
Dataset loaded successfully!
Number of samples: 600
Sample structure: ['messages']

First sample preview:
{
  "messages": [
    {
      "role": "system",
      "content": "You are an expert educator who explains concepts from first principles like Richard Feynman. Start with fundamental truths, use simple analogies, and avoid jargon. Use a storytelling tone and follow a step by step explanation style:"
    },
    {
      "role": "user",
      "content": "Why do objects fall to the ground when dropped?"
    },
    {
      "role": "assistant",
      "content": "Okay, let\u2019s imagine you have a stre...


In [9]:
# Replace your current dataset loading section (Cell 5-6) with this optimized version:

from datasets import Dataset, DatasetDict
import torch
from torch.utils.data import DataLoader

# 1. OPTIMIZED: Load and preprocess dataset using HuggingFace datasets
print("Loading and preprocessing dataset with optimized batching...")

# Load raw data
with open(dataset_filename, "r") as f:
    dataset_raw = json.load(f)

# Extract prompts and responses for preprocessing
prompts = []
responses = []
for item in dataset_raw:
    messages = item['messages']
    # Extract user prompt and assistant response
    user_msg = next((m['content'] for m in messages if m['role'] == 'user'), '')
    assistant_msg = next((m['content'] for m in messages if m['role'] == 'assistant'), '')

    if user_msg and assistant_msg:
        prompts.append(user_msg)
        responses.append(assistant_msg)

# Create dataset using HuggingFace datasets for better performance
train_data = {
    'prompt': prompts[:int(0.9 * len(prompts))],  # 90% for training
    'completion': responses[:int(0.9 * len(responses))],
    #'question': prompts[:int(0.9 * len(prompts))],  # For compatibility
    #'answer': responses[:int(0.9 * len(responses))]
}

test_data = {
    'prompt': prompts[int(0.9 * len(prompts)):],  # 10% for testing
    'completion': responses[int(0.9 * len(responses)):],
    #'question': prompts[int(0.9 * len(prompts)):],
    #'answer': responses[int(0.9 * len(responses)):]
}

# Create Dataset objects with optimized preprocessing
train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print(f"✅ Optimized dataset created:")
print(f"   - Training samples: {len(dataset['train'])}")
print(f"   - Test samples: {len(dataset['test'])}")
print(f"   - Using HuggingFace datasets for efficient batching")
print(f"Train sample: {dataset['train'][0]}")
print(f"Test sample: {dataset['test'][0]}")

#

Loading and preprocessing dataset with optimized batching...
✅ Optimized dataset created:
   - Training samples: 540
   - Test samples: 60
   - Using HuggingFace datasets for efficient batching
Train sample: {'prompt': 'Why do objects fall to the ground when dropped?', 'completion': "Okay, let’s imagine you have a stretched rubber sheet and you place a heavy ball in the middle. The sheet bends downwards, right? Now, if you roll a smaller ball nearby, it will start rolling toward the heavier ball because of the dip. This is a simple way to picture how gravity works. Gravity is like the Earth making a 'dip' in space that pulls things toward it.\n\nWhen you let go of an object, it falls because the Earth is pulling it toward its center, similar to how the heavy ball makes the rubber sheet dip. This pull is what we call gravitational force. It's a force that attracts two masses toward each other, and since the Earth is so big, it pulls objects towards itself strongly.\n\nIn our daily life,

#Reward Funcs

In [10]:
# ─────────────────────────────  first_principles_reward.py  ─────────────────────────────
"""
Reward function for GRPO/RLHF that scores explanations on seven Feynman-style axes
and returns a single 0-to-1 scalar.  CUDA-aware, NLTK-version-safe.

• Analogy quality            • Step-by-step reasoning
• Fundamental understanding   • Engagement
• Clarity                     • Completeness
• Jargon avoidance
"""
!uv pip install textstat nltk
# 1 ── Imports ─────────────────────────────────────────────────────────────────────────
import re
from typing import Dict, List

import torch
import nltk
from packaging import version
from transformers import pipeline
from textstat import flesch_reading_ease

# 2 ── Secure NLTK data download (handles punkt/punkt_tab switch) ─────────────────────
if version.parse(nltk.__version__) >= version.parse("3.8.2"):
    nltk.download("punkt_tab", quiet=True)
    _sent_tokenize = nltk.data.load("tokenizers/punkt_tab/english.pickle").tokenize
else:
    nltk.download("punkt", quiet=True)
    _sent_tokenize = nltk.sent_tokenize

# 3 ── Helper: CUDA device & sentiment pipeline (optional) ────────────────────────────
_DEVICE = 0 if torch.cuda.is_available() else -1
try:
    _sentiment = pipeline(
        task="sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
        device=_DEVICE,
        torch_dtype=torch.float16 if _DEVICE >= 0 else None,
        batch_size=32,
    )
except Exception:
    _sentiment = None  # sentiment scoring is optional

# 4 ── Reward evaluator class ─────────────────────────────────────────────────────────
class FirstPrinciplesReward:
    """Vector-friendly scorer.  `score(text:str) -> float`."""

    WEIGHTS: Dict[str, float] = {
        "analogy":        0.20,
        "step":           0.15,
        "fundamentals":   0.20,
        "engagement":     0.15,
        "clarity":        0.15,
        "completeness":   0.10,
        "no_jargon":      0.05,
    }
    _CONCRETE  = {"ball","car","house","water","air","food","game","toy","bicycle"}
    _SENSORY   = {"see","feel","hear","touch","taste","smell","warm","cold","bright","dark"}
    _JARGON    = {
        'utilize','paradigm','synergy','leverage','optimize','streamline','methodology',
        'framework','infrastructure','scalable','robust','innovative','state-of-the-art',
    }
    _FP = {
        "analogy":      ['like','similar','imagine','as if','just like','comparable to'],
        "step":         ['first','second','third','next','then','after that','step by step'],
        "fundamental":  ['imagine','at its core','fundamentally','basically','essentially'],
        "engage":       ['does this','do you see','can you picture','have you noticed','make sense'],
    }

    # ── Sub-scores ───────────────────────────────────────────────────────────────────
    @staticmethod
    def _analogy(txt: str) -> float:
        t = txt.lower()
        score = 0.3 if any(p in t for p in FirstPrinciplesReward._FP["analogy"]) else 0.0
        score += min(0.4, sum(w in FirstPrinciplesReward._CONCRETE for w in t.split()) * 0.1)
        score += min(0.3, sum(w in FirstPrinciplesReward._SENSORY  for w in t.split()) * 0.05)
        return min(score, 1.0)

    @staticmethod
    def _step(txt: str) -> float:
        t = txt.lower()
        score  = min(0.4, sum(p in t for p in FirstPrinciplesReward._FP["step"]) * 0.1)
        score += min(0.3, sum(p in t for p in
            ['because','therefore','as a result','which leads to','this is why']) * 0.1)
        sents = _sent_tokenize(txt)
        if len(sents) >= 3:
            if sum(len(s.split()) for s in sents[len(sents)//2:]) > \
               sum(len(s.split()) for s in sents[:len(sents)//2]):
                score += 0.3
        return min(score, 1.0)

    @staticmethod
    def _fundamentals(txt: str) -> float:
        t = txt.lower()
        score  = min(0.4, sum(p in t for p in FirstPrinciplesReward._FP["fundamental"]) * 0.2)
        score += min(0.3, sum(p in t for p in ['why','reason','cause','because']) * 0.05)
        score += min(0.3, sum(p in t for p in
            ['basic','core','underlying','principle','law','truth']) * 0.1)
        return min(score, 1.0)

    @staticmethod
    def _engagement(txt: str) -> float:
        t = txt.lower()
        score  = min(0.4, sum(p in t for p in FirstPrinciplesReward._FP["engage"]) * 0.1)
        score += min(0.3, txt.count("?") * 0.1)
        if _sentiment:
            try:
                if _sentiment([txt[:512]])[0]["label"] == "POSITIVE":
                    score += 0.3
            except Exception:
                pass
        return min(score, 1.0)

    @staticmethod
    def _clarity(txt: str) -> float:
        score = 0.0
        try:
            f = flesch_reading_ease(txt)
            score += 0.4 if f >= 60 else 0.3 if f >= 50 else 0.2 if f >= 40 else 0.1
        except Exception:
            score += 0.2
        sents = _sent_tokenize(txt)
        if sents:
            avg = sum(len(s.split()) for s in sents) / len(sents)
            score += 0.3 if 10 <= avg <= 20 else 0.2 if 8 <= avg <= 25 else 0.1
        common = {'the','a','and','or','but','in','on','at','to','for'}
        ratio  = sum(w in common for w in txt.lower().split()) / max(len(txt.split()),1)
        score += 0.3 if ratio >= 0.3 else 0.0
        return min(score, 1.0)

    @staticmethod
    def _completeness(txt: str) -> float:
        wc = len(txt.split())
        score  = 0.5 if 100 <= wc <= 200 else 0.3 if 60 <= wc <= 300 else 0.1
        if any(p in txt.lower() for p in
               ['so','therefore','in summary','overall','does this help','this explains']):
            score += 0.3
        score += min(0.2, sum(p in txt.lower() for p in
               ['example','for instance','such as','like when']) * 0.1)
        return min(score, 1.0)

    @staticmethod
    def _no_jargon(txt: str) -> float:
        words = txt.lower().split()
        if not words:
            return 1.0
        ratio = sum(w in FirstPrinciplesReward._JARGON for w in words) / len(words)
        return 1.0 if ratio == 0 else 0.8 if ratio <= 0.02 else 0.6 if ratio <= 0.05 else 0.3

    # ── Public API ──────────────────────────────────────────────────────────────────
    def score(self, txt: str) -> float:
        subs = {
            "analogy":      self._analogy(txt),
            "step":         self._step(txt),
            "fundamentals": self._fundamentals(txt),
            "engagement":   self._engagement(txt),
            "clarity":      self._clarity(txt),
            "completeness": self._completeness(txt),
            "no_jargon":    self._no_jargon(txt),
        }
        total = sum(subs[k] * self.WEIGHTS[k] for k in subs)
        return total / sum(self.WEIGHTS.values())

        # ── Detailed breakdown (for logging) ───────────────────────────────────────────
    def breakdown(self, txt: str) -> Dict[str, float]:
        """Return every axis + final reward for one text."""
        return {
            "analogy":        self._analogy(txt),
            "step":           self._step(txt),
            "fundamentals":   self._fundamentals(txt),
            "engagement":     self._engagement(txt),
            "clarity":        self._clarity(txt),
            "completeness":   self._completeness(txt),
            "no_jargon":      self._no_jargon(txt),
            "reward":         self.score(txt),
        }

# ─────────────────────────────────  End of module  ──────────────────────────────────


[2mAudited [1m2 packages[0m [2min 6ms[0m[0m


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [None]:
# ─────────────────────────  first_principles_reward.py  ──────────────────────────
"""
Reward function for GRPO / RLHF
• Seven Feynman-style axes → single 0-to-1 reward
• CUDA-aware, sentiment batched on GPU
• W&B logs three sample completions per call (no step mismatch)
"""

# -- Imports & env ─────────────────────────────────────────────────────────────────
import os, re
from typing import List, Dict

import torch
import nltk
from packaging import version
from transformers import pipeline
from textstat import flesch_reading_ease
import wandb

DEVICE = 0 if torch.cuda.is_available() else -1          # -1 → CPU

# -- One-time W&B init (safe if already initialised) --
if os.getenv("WANDB_DISABLED", "false").lower() != "true" and wandb.run is None:
    wandb.init(project="grpo-first-principles", reinit=True)

# -- Secure NLTK data (punkt_tab ≥ 3.8.2) --
if version.parse(nltk.__version__) >= version.parse("3.8.2"):
    nltk.download("punkt_tab", quiet=True)
    SENT_TOKENIZE = nltk.data.load("tokenizers/punkt_tab/english.pickle").tokenize
else:
    nltk.download("punkt", quiet=True)
    SENT_TOKENIZE = nltk.sent_tokenize

# -- Sentiment pipeline (batched) --
try:
    SENT_PIPE = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
        device=DEVICE,
        torch_dtype=torch.float16 if DEVICE >= 0 else None,
        batch_size=32,
    )
except Exception:
    SENT_PIPE = None  # sentiment optional

# -- Static vocabularies → CUDA tensors for bucketised lookup --
_CONCRETE = [
    "ball", "car", "house", "water", "air", "food", "game", "toy", "bicycle",
    "apple", "book", "chair", "door", "phone", "street", "bridge", "cup",
    "sand", "river", "tree", "mountain", "train", "clock", "computer", "lamp",
]

# Words that evoke a direct sensory experience
_SENSORY = [
    # vision
    "see", "look", "bright", "dark", "color", "shine", "glow",
    # touch / temperature
    "feel", "touch", "smooth", "rough", "soft", "hard", "warm", "cold",
    # hearing
    "hear", "sound", "loud", "quiet", "crackle", "whisper",
    # taste / smell
    "taste", "smell", "sweet", "bitter", "sour", "salty", "fresh", "fragrant",
]
_JARGON    = [
    'utilize','paradigm','synergy','leverage','optimize','streamline','methodology',
    'framework','infrastructure','scalable','robust','innovative','state-of-the-art'
]
def _vocab_tensor(words):      # hashed & sorted for bucketize
    return torch.tensor(sorted(hash(w) for w in words), device=DEVICE)
CONCRETE_T, SENSORY_T, JARGON_T = map(_vocab_tensor, (_CONCRETE, _SENSORY, _JARGON))

def _batch_hash(tokens: List[List[str]]) -> torch.Tensor:
    """Pad to equal length and return int64 [B,L] tensor of word hashes on DEVICE."""
    L = max(len(t) for t in tokens)
    mat = torch.full((len(tokens), L), 0, dtype=torch.int64, device=DEVICE)
    for i, tok in enumerate(tokens):
        mat[i, :len(tok)] = torch.tensor([hash(w) for w in tok], device=DEVICE)
    return mat

# ─────────────────────────  Main scorer  ──────────────────────────────────────────
class FirstPrinciplesReward:
    """Vector-friendly scorer; call .batch_score([txt1, txt2, …]) → List[float]."""

    WEIGHTS = {
    "analogy": 0.20,
    "step":    0.15,
    "fundamentals": 0.20,
    "engagement":   0.10,   # ↓ was 0.15
    "clarity":      0.15,
    "complete":     0.15,   # ↑ was 0.10
    "nojargon":     0.05,
}
    # pre-compiled regexes (CPU but cheap)
    _R = {
        "analogy":      re.compile(r'\b(like|similar|imagine|as if|just like|comparable to|think of it as)\b', re.I),
        "step":         re.compile(r'\b(first|second|third|next|then|after that|step by step| first of all)\b', re.I),
        "fundamental": re.compile(
        r'\b('
        r'imagine|picture|think of|at its core|at the core|fundamentally|'
        r'basically|essentially|from scratch|from the ground up|'
        r'first principles?|root cause|underlying|the essence|'
        r'the basic idea|building block|foundation|starting point'
        r')\b',
        re.I,
        ),
        "engage": re.compile(
        r'\b('
        r'does this|does that|do you see|can you see|can you picture|can you imagine|'
        r'have you noticed|have you ever|could you picture|picture this|'
        r'make sense|sound good|is that clear|is this clear|see how|'
        r'does it help|does this help|feel free|think about|'
        r'want to try|ready to|what about|notice how'
        r')\b',
        re.I
        ),
        "cause":        re.compile(r'\b(because|therefore|as a result|this is why|which leads to)\b', re.I),
        "why":          re.compile(r'\b(why|reason|cause|because)\b', re.I),
        "concl":        re.compile(r'\b(so|therefore|in summary|overall|this explains|thus)\b', re.I),
        "example":      re.compile(r'\b(example|for instance|such as|like when)\b', re.I),
    }

    # -- util: bool hit per text --
    @staticmethod
    def _has(regex, texts):  # → float tensor
        return torch.tensor([bool(regex.search(t)) for t in texts], device=DEVICE).float()

    # -- main batch scorer --
    def batch_score(self, texts: List[str], *, return_axes: bool = False) -> List[float] | tuple:
        texts_lc = [t.lower() for t in texts]
        toks      = [t.split() for t in texts_lc]
        H         = _batch_hash(toks)                          # [B, L]

        # set-membership counts (CUDA)
        def count_vocab(mat: torch.Tensor, vocab_t: torch.Tensor) -> torch.Tensor:
            """
            Fast, O(B·L) set-membership count using torch.isin (GPU-safe).
            mat   : [B, L] int64      – hashed tokens padded with 0s
            vocab : [V]    int64      – sorted hashed vocab
            returns: [B]   float32    – count per row
            """
            return torch.isin(mat, vocab_t).sum(dim=1).float()

        conc_cnt  = count_vocab(H, CONCRETE_T)
        sens_cnt  = count_vocab(H, SENSORY_T)
        jargon_cnt= count_vocab(H, JARGON_T)

        clip = lambda x, hi: torch.clamp(x, max=float(hi))

        # Analogy
        analogy = 0.4 * self._has(self._R["analogy"], texts_lc)
        analogy += 0.1 * clip(conc_cnt, torch.tensor(4., device=DEVICE))
        analogy += 0.1* clip(sens_cnt, torch.tensor(6., device=DEVICE))
        analogy  = clip(analogy, torch.tensor(1.0, device=DEVICE))

        # Step
        step = 0.2 * clip(self._has(self._R["step"], texts_lc), 4)
        step+= 0.1 * clip(self._has(self._R["cause"], texts_lc),3)
        # complexity build (CPU loop but minor)
        len_first = torch.tensor(
            [sum(len(s.split()) for s in SENT_TOKENIZE(t)[:2]) for t in texts_lc], device=DEVICE)
        len_last  = torch.tensor(
            [sum(len(s.split()) for s in SENT_TOKENIZE(t)[-2:]) for t in texts_lc], device=DEVICE)
        step += 0.3 * (len_last > len_first).float()
        step  = clip(step, 1.0)

        # Fundamentals
        fund = 0.4 * clip(self._has(self._R["fundamental"], texts_lc), 2)
        fund+= 0.1* clip(self._has(self._R["why"], texts_lc), 6)
        fund+= 0.10* clip(conc_cnt, 3)
        fund = clip(fund, 1.0)

        # Engagement
        engage  = 0.3 * clip(self._has(self._R["engage"], texts_lc), 4)   # ↓ was 0.1
        engage += 0.1 * clip(torch.tensor([t.count('?') for t in texts_lc], device=DEVICE), 3)  # ↓ was 0.1

        if SENT_PIPE:
            try:
                preds = SENT_PIPE(texts_lc)
                engage += 0.3 * torch.tensor([p["label"] == "POSITIVE" for p in preds],
                                             dtype=torch.float32, device=DEVICE)
            except Exception:
                pass
        engage = clip(engage, 1.0)

        # Clarity
        flesch = torch.tensor([flesch_reading_ease(t) if len(t.split())>3 else 0
                               for t in texts], device=DEVICE)
        clarity = torch.where(flesch>=60,0.4,torch.where(flesch>=50,0.3,
                   torch.where(flesch>=40,0.2,0.1)))
        avg_len = torch.tensor([sum(len(s.split()) for s in SENT_TOKENIZE(t)) /
                                max(len(SENT_TOKENIZE(t)),1) for t in texts], device=DEVICE)
        clarity += torch.where((avg_len>=10)&(avg_len<=20),0.3,
                    torch.where((avg_len>=8)&(avg_len<=25),0.2,0.1))
        common = {'the','a','and','or','but','in','on','at','to','for'}
        cm_ratio = torch.tensor(
            [sum(w in common for w in tok)/max(len(tok),1) for tok in toks], device=DEVICE)
        clarity += 0.3 * (cm_ratio >= 0.3).float()
        clarity  = clip(clarity, 1.0)

        # Completeness
        wc = torch.tensor([len(t.split()) for t in texts], device=DEVICE)

        # base score is now 0.3 for any reasonable length (40-300 words)
        complete = torch.where((wc >= 40) & (wc <= 300), 0.3, 0.1)
        
        # add +0.2 if length is in the sweet spot 80-220
        complete += torch.where((wc >= 80) & (wc <= 220), 0.2, 0.0)
        
        # conclusion indicator still adds up to +0.3
        complete += 0.3 * self._has(self._R["concl"], texts_lc)
        
        # examples still worth +0.1 (capped at two hits)
        complete += 0.1 * clip(self._has(self._R["example"], texts_lc), 2)
        complete  = clip(complete, 1.0)

        # Jargon avoidance
        ratio = jargon_cnt / torch.clamp(torch.tensor([len(tok) for tok in toks],
                                                      device=DEVICE).float(), min=1)
        nojargon = torch.where(ratio==0,1.0,torch.where(ratio<=0.02,0.8,
                     torch.where(ratio<=0.05,0.6,0.3)))

        # Weighted sum
        total = (
            self.WEIGHTS["analogy"]  * analogy +
            self.WEIGHTS["step"]     * step    +
            self.WEIGHTS["fundamentals"]*fund +
            self.WEIGHTS["engagement"]*engage +
            self.WEIGHTS["clarity"]  * clarity +
            self.WEIGHTS["complete"] * complete+
            self.WEIGHTS["nojargon"] * nojargon
        )
        scalar = torch.tanh(total / sum(self.WEIGHTS.values()))
        if return_axes:
            axes = torch.stack([analogy, step, fund, engage,
                                clarity, complete, nojargon], dim=1)
            return scalar.tolist(), axes
        return scalar.tolist()

        # ── get full axis vector for ONE text ──────────────────────────────────
    def _axis_vector(self, text: str) -> Dict[str, float]:
        scalar, axes = self.batch_score([text], return_axes=True)
        axes = axes.squeeze(0).tolist()                 # length-7 list
        keys = ["analogy","step","fundamentals","engagement",
                "clarity","complete","nojargon"]
        out = dict(zip(keys, axes))                     # ← convert zip → dict
        out["reward"] = scalar[0]                       # add final scalar
        return out



    # ── breakdown for logging a single text ──
    def breakdown(self, text: str) -> Dict[str, float]:
        return self._axis_vector(text)


# ── Global scorer instance + GRPO wrapper ─────────────────────────────────────────


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [None]:
# ─────────────────────────  first_principles_reward.py  ──────────────────────────
"""
Reward function for GRPO / RLHF
• Seven Feynman-style axes → single 0-to-1 reward
• CUDA-aware, sentiment batched on GPU
• W&B logs three sample completions per call (no step mismatch)
"""

# -- Imports & env ─────────────────────────────────────────────────────────────────
import os, re
from typing import List, Dict

import torch
import nltk
from packaging import version
from transformers import pipeline
from textstat import flesch_reading_ease
import wandb

DEVICE = 0 if torch.cuda.is_available() else -1          # -1 → CPU

# -- One-time W&B init (safe if already initialised) --
if os.getenv("WANDB_DISABLED", "false").lower() != "true" and wandb.run is None:
    wandb.init(project="grpo-first-principles", reinit=True)

# -- Secure NLTK data (punkt_tab ≥ 3.8.2) --
if version.parse(nltk.__version__) >= version.parse("3.8.2"):
    nltk.download("punkt_tab", quiet=True)
    SENT_TOKENIZE = nltk.data.load("tokenizers/punkt_tab/english.pickle").tokenize
else:
    nltk.download("punkt", quiet=True)
    SENT_TOKENIZE = nltk.sent_tokenize

# -- Sentiment pipeline (batched) --
try:
    SENT_PIPE = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
        device=DEVICE,
        torch_dtype=torch.float16 if DEVICE >= 0 else None,
        batch_size=32,
    )
except Exception:
    SENT_PIPE = None  # sentiment optional

# -- Static vocabularies → CUDA tensors for bucketised lookup --
_CONCRETE = [
    "ball", "car", "house", "water", "air", "food", "game", "toy", "bicycle",
    "apple", "book", "chair", "door", "phone", "street", "bridge", "cup",
    "sand", "river", "tree", "mountain", "train", "clock", "computer", "lamp",
]

# Words that evoke a direct sensory experience
_SENSORY = [
    # vision
    "see", "look", "bright", "dark", "color", "shine", "glow",
    # touch / temperature
    "feel", "touch", "smooth", "rough", "soft", "hard", "warm", "cold",
    # hearing
    "hear", "sound", "loud", "quiet", "crackle", "whisper",
    # taste / smell
    "taste", "smell", "sweet", "bitter", "sour", "salty", "fresh", "fragrant",
]
_JARGON    = [
    'utilize','paradigm','synergy','leverage','optimize','streamline','methodology',
    'framework','infrastructure','scalable','robust','innovative','state-of-the-art'
]
def _vocab_tensor(words):      # hashed & sorted for bucketize
    return torch.tensor(sorted(hash(w) for w in words), device=DEVICE)
CONCRETE_T, SENSORY_T, JARGON_T = map(_vocab_tensor, (_CONCRETE, _SENSORY, _JARGON))

def _batch_hash(tokens: List[List[str]]) -> torch.Tensor:
    """Pad to equal length and return int64 [B,L] tensor of word hashes on DEVICE."""
    L = max(len(t) for t in tokens)
    mat = torch.full((len(tokens), L), 0, dtype=torch.int64, device=DEVICE)
    for i, tok in enumerate(tokens):
        mat[i, :len(tok)] = torch.tensor([hash(w) for w in tok], device=DEVICE)
    return mat

class FirstPrinciplesRewardV3:
    def __init__(self, device = "cuda", max_batch_size = 64, enable_caching = True):
        self.device = torch.device("cuda" if device == "cuda" else "cpu")
        self.max_batch_size = max_batch_size
        self.enable_caching = enable_caching
        self._pattern_cache = {} if enable_caching else None

        # Improved weight distribution based on first principles teaching
        self.WEIGHTS = {
            "analogy": 0.25,        # ↑ More important for Feynman-style
            "step": 0.20,           # ↑ Step-by-step crucial
            "fundamentals": 0.20,   # Keep high
            "engagement": 0.15,     # ↑ Questions and interaction
            "clarity": 0.12,        # Slightly reduced
            "completeness": 0.06,   # ↓ Less critical than structure
            "no_jargon": 0.02,      # ↓ Minimal weight
        }

        self._init_vocabularies()
        self._compile_patterns()

    def _init_vocabularies(self):
        """Initialize enhanced vocabularies with semantic grouping"""
        # Concrete objects (expanded and categorized)
        self.CONCRETE_PHYSICAL = [
            "ball", "car", "house", "water", "air", "food", "bicycle", "apple", "book",
            "chair", "door", "phone", "bridge", "cup", "sand", "river", "tree", "clock"
        ]
        
        self.CONCRETE_ABSTRACT = [
            "game", "story", "picture", "music", "dance", "recipe", "map", "puzzle"
        ]
        
        # Sensory experience words (enhanced)
        self.SENSORY_VISUAL = ["see", "look", "bright", "dark", "color", "shine", "glow", "sparkle"]
        self.SENSORY_TACTILE = ["feel", "touch", "smooth", "rough", "soft", "hard", "warm", "cold"]
        self.SENSORY_AUDITORY = ["hear", "sound", "loud", "quiet", "whisper", "crackle", "ring"]
        self.SENSORY_OTHER = ["taste", "smell", "sweet", "bitter", "fresh", "fragrant"]
        
        # Convert to GPU tensors with improved hashing
        self.vocab_tensors = {}
        for name, vocab in {
            "concrete_physical": self.CONCRETE_PHYSICAL,
            "concrete_abstract": self.CONCRETE_ABSTRACT,
            "sensory_visual": self.SENSORY_VISUAL,
            "sensory_tactile": self.SENSORY_TACTILE,
            "sensory_auditory": self.SENSORY_AUDITORY,
            "sensory_other": self.SENSORY_OTHER,
        }.items():
            # Use stable hash for consistent results
            hashes = [hash(w) & 0x7FFFFFFF for w in vocab]  # Ensure positive
            self.vocab_tensors[name] = torch.tensor(sorted(hashes), device=self.device)

    def _compile_patterns(self):
        """Compile enhanced regex patterns with better coverage"""
        self.patterns = {
            "analogy_strong": re.compile(
                r'\b(like|similar to|imagine|as if|just like|comparable to|think of it as|'
                r'picture this|it\'s like|reminds me of|analogous to)\b', re.I
            ),
            "analogy_weak": re.compile(
                r'\b(kind of|sort of|similar|resemble|compare)\b', re.I
            ),
            "step_indicators": re.compile(
                r'\b(first|second|third|next|then|after that|step by step|'
                r'initially|subsequently|finally|to begin|to start)\b', re.I
            ),
            "causal_connectors": re.compile(
                r'\b(because|therefore|as a result|this is why|which leads to|'
                r'consequently|thus|hence|so that|due to)\b', re.I
            ),
            "fundamental_phrases": re.compile(
                r'\b(at its core|fundamentally|basically|essentially|'
                r'from scratch|from the ground up|first principles?|'
                r'root cause|underlying|the essence|building block|foundation)\b', re.I
            ),
            "engagement_direct": re.compile(
                r'\b(does this|do you see|can you picture|have you noticed|'
                r'make sense|is that clear|does it help|think about|'
                r'notice how|see how|feel how)\b', re.I
            ),
            "engagement_questions": re.compile(r'\?', re.I),
            "conclusion_indicators": re.compile(
                r'\b(so|therefore|in summary|overall|this explains|'
                r'to sum up|in conclusion|ultimately)\b', re.I
            ),
            "example_phrases": re.compile(
                r'\b(for example|for instance|such as|like when|'
                r'consider|take|let\'s say)\b', re.I
            ),
        }
    
    def batch_score_optimized(self, texts:List[str], return_breakdown = False):
                """Optimized batch scoring with memory management"""
        if not texts:
            return []
        all_scores = []
        all_breakdowns = [] if return_breakdown else None

        for i in range(0, len(texts), self.max_batch_size):
            batch = texts[i:i + self.max_batch_size]
            scores, breakdowns = self._process_batch(batch, return_breakdown)
            all_scores.extend(scores)
            if return_breakdown:
                all_breakdowns.extend(breakdowns)
        return (all_scores, all_breakdowns) if return_breakdown else all_scores

    def _process_batch(self, texts:List[str], return_breakdown = False):
        """Process a single batch with enhanced scoring"""
        batch_size = len(texts)
        texts_lower = [t.lower() for t in texts]

        # Tokenize and create hash matrix
        tokens = [t.split() for t in texts_lower]
        max_len = max(len(t) for t in tokens) if tokens else 0
        
        # Create padded hash matrix
        hash_matrix = torch.zeros((batch_size, max_len), dtype=torch.int64, device=self.device)
        for i, tok_list in enumerate(tokens):
            if tok_list:
                hashes = [hash(w) & 0x7FFFFFFF for w in tok_list]
                hash_matrix[i, :len(hashes)] = torch.tensor(hashes, device=self.device)
        
        # Compute all scores
        scores = self._compute_enhanced_scores(texts, texts_lower, hash_matrix, tokens)
        
        # Weighted combination with improved normalization
        total_scores = torch.zeros(batch_size, device=self.device)
        for component, weight in self.WEIGHTS.items():
            total_scores += weight * scores[component]
        
        # Apply sigmoid for better score distribution
        final_scores = torch.sigmoid(total_scores).tolist()
        
        if return_breakdown:
            breakdowns = []
            for i in range(batch_size):
                breakdown = {k: v[i].item() for k, v in scores.items()}
                breakdown["final_score"] = final_scores[i]
                breakdowns.append(breakdown)
            return final_scores, breakdowns
        
        return final_scores, None

    def _compute_enhanced_scores(self, texts, texts_lower, hash_matrix, tokens):
        """Compute enhanced scoring with better first principles alignment"""
        batch_size = len(texts)
        
        # Pattern matching with caching
        pattern_matches = {}
        for name, pattern in self.patterns.items():
            if self._pattern_cache and name in self._pattern_cache:
                pattern_matches[name] = self._pattern_cache[name]
            else:
                matches = torch.tensor([bool(pattern.search(t)) for t in texts_lower], 
                                    device=self.device, dtype=torch.float32)
                pattern_matches[name] = matches
                if self._pattern_cache:
                    self._pattern_cache[name] = matches
        
        # Vocabulary matching
        vocab_counts = {}
        for name, vocab_tensor in self.vocab_tensors.items():
            counts = torch.sum(torch.isin(hash_matrix, vocab_tensor), dim=1).float()
            vocab_counts[name] = counts
        
        # Enhanced analogy scoring
        analogy_scores = (
            0.5 * pattern_matches["analogy_strong"] +
            0.2 * pattern_matches["analogy_weak"] +
            0.15 * torch.clamp(vocab_counts["concrete_physical"] / 10.0, max=1.0) +
            0.1 * torch.clamp(vocab_counts["sensory_visual"] / 5.0, max=1.0) +
            0.05 * torch.clamp(vocab_counts["sensory_tactile"] / 3.0, max=1.0)
        )
        
        # Enhanced step-by-step scoring
        step_scores = (
            0.4 * pattern_matches["step_indicators"] +
            0.3 * pattern_matches["causal_connectors"] +
            0.3 * self._compute_progression_score(texts)
        )
        
        # Enhanced fundamental understanding
        fundamental_scores = (
            0.5 * pattern_matches["fundamental_phrases"] +
            0.3 * torch.clamp(vocab_counts["concrete_physical"] / 8.0, max=1.0) +
            0.2 * self._compute_depth_score(texts_lower)
        )
        
        # Enhanced engagement scoring
        engagement_scores = (
            0.4 * pattern_matches["engagement_direct"] +
            0.3 * torch.clamp(pattern_matches["engagement_questions"] / 3.0, max=1.0) +
            0.3 * self._compute_interactive_score(texts_lower)
        )
        
        # Enhanced clarity scoring
        clarity_scores = self._compute_clarity_score(texts, tokens)
        
        # Enhanced completeness scoring
        completeness_scores = self._compute_completeness_score(texts, pattern_matches)
        
        # Jargon avoidance (simplified)
        jargon_scores = self._compute_jargon_score(tokens)
        
        return {
            "analogy": torch.clamp(analogy_scores, 0, 1),
            "step": torch.clamp(step_scores, 0, 1),
            "fundamentals": torch.clamp(fundamental_scores, 0, 1),
            "engagement": torch.clamp(engagement_scores, 0, 1),
            "clarity": torch.clamp(clarity_scores, 0, 1),
            "completeness": torch.clamp(completeness_scores, 0, 1),
            "no_jargon": torch.clamp(jargon_scores, 0, 1),
        }

: 

#Later

In [None]:
# Import in this specific order
import torch
from transformers import AutoTokenizer
import vllm  # Import vllm first
from transformers import Qwen2VLForConditionalGeneration, BitsAndBytesConfig, AutoModelForCausalLM, AutoPeftModelForCausalLM, LoraConfig, get_peft_model

# Quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

print("Loading model and tokenizer with FIXED device_map...")

# FIX: Use device_map="auto" instead of device string to avoid tensor parallel errors
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    low_cpu_mem_usage=True,  # Add this
    #device_map=device,
    use_cache=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
tokenizer.model_max_length = 512  # Set max length
tokenizer.padding = "max_length"

print("✅ Model and tokenizer loaded successfully with automatic device mapping!")

# Prepare model for training
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False
print("✅ Model prepared for kbit training")


Loading model and tokenizer with FIXED device_map...
✅ Model and tokenizer loaded successfully with automatic device mapping!
✅ Model prepared for kbit training


In [None]:
# LoRA Setup
if hasattr(model, "peft_config") and model.peft_config:
    print("Found existing LoRA configuration")
    model.train()
    print("Exisiting LoRA configuration is now trainable")
else:
    print("❌ No existing LoRA adapters found - adding new ones")
    # Only add new LoRA if none exist (fallback)
    peft_config = LoraConfig(
        r=config["lora_r"],
        lora_alpha=config["lora_alpha"],
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, peft_config)

print("Model converted to LoRA")
print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
print(f"Total Parameters: {sum(p.numel() for p in model.parameters()):,}")


Model converted to LoRA
Trainable Parameters: 8,716,288
Total Parameters: 897,332,736




In [None]:
# Define custom reward function for verifiers
# Add after dataset split
# Define custom reward function for verifiers
# Add after dataset split

# 5 ── GRPO-compatible wrapper ───────────────────────────────────────────────────────
_evaluator = FirstPrinciplesReward()
_FP_STEP   = 0  # global step counter

def first_principles_reward(completions: List[str], **batch) -> List[float]:
    """Callable for GRPOTrainer.reward_funcs."""
    global _FP_STEP
    rewards = [_evaluator.score(c) for c in completions]

    # 🟦— Log a sample of 3 completions per batch
    if wandb.run is not None:
        sample_logs: Dict[str, float] = {}
        for idx, comp in enumerate(completions[:3]):             # first 3 samples
            br = _evaluator.breakdown(comp)
            for k, v in br.items():
                sample_logs[f"sample_{idx}/{k}"] = v
        wandb.log(sample_logs, step=_FP_STEP, commit=False)      # do not end step yet

        # Optionally log the batch-mean reward for quick charts
        wandb.log({"batch/mean_reward": sum(rewards) / len(rewards)}, step=_FP_STEP)

    _FP_STEP += 1
    return rewards

print("Reward function and rubric created!")


Reward function and rubric created!


In [None]:
# ── Global scorer instance + W&B-safe reward wrapper ───────────────────────
_SCORER = FirstPrinciplesReward()

import html, re, unicodedata

def _clean(txt: str, max_len: int = 500) -> str:
    """Strip control chars and escape HTML so W&B shows readable text."""
    txt = txt.encode("utf-8", "replace").decode("utf-8", "replace")          # fix bytes
    txt = re.sub(r"[\x00-\x1F\x7F-\x9F]", "", txt)                           # ctrl chars
    txt = "".join(c for c in txt if unicodedata.category(c)[0] != "C")       # Unicode C*
    return html.escape(txt[:max_len])                                        # escape & cut

def fp_reward(completions: List[str], **batch) -> List[float]:
    """GRPOTrainer-compatible reward.  Logs 3 samples once per batch."""
    rewards = _SCORER.batch_score(completions)

    if wandb.run is not None:
        log = {"batch/mean_reward": sum(rewards) / len(rewards)}
        for i, (txt, rw) in enumerate(zip(completions[:3], rewards[:3])):
            log[f"sample_{i}/reward"] = rw
            log[f"sample_{i}/text"]   = wandb.Html(f"<pre>{_clean(txt)}</pre>")
        wandb.log(log)                              # let W&B manage step counter

    return rewards
# ───────────────────────────────────────────────────────────────────────────

# ▶️ Run this cell to sanity-check the reward function
import pandas as pd
import importlib
import sys, pathlib

# Adjust this if the file lives elsewhere
sys.path.append(str(pathlib.Path('.').resolve()))

samples = [
    "**Group Relative Proximal Optimization (GRPO)** is a method used to solve optimization problems that arise in machine learning, particularly in training large language models. It is designed to handle non-convex objectives efficiently by breaking them into smaller, more manageable sub-problems. The term \"proximal\" refers to a technique where we minimize a proximal operator, which can be thought of as a smoothing or regularization step.",

    "First, imagine you’re standing on a huge soft rubber sheet stretched tight."
    "Place a bowling ball in the middle—feel how the surface dips under its weight."

    "Second, roll a marble across the sheet. The marble curves inward because the slope guides it, just like rain sliding down a car-window."

    "Third, swap the rubber for the invisible fabric of space-time and the bowling ball for Earth. Mass fundamentally bends this fabric; that bend is what we call a gravitational field."

    "Next, release a pen. It isn’t “pulled” by a mysterious force; it simply follows the easiest downhill path in the curved fabric, exactly like the marble. Why does it speed up? Because the slope gets steeper closer to Earth’s center, so each moment the pen points farther “downhill.”"

    "Then, notice that the Moon also rides this curve, but sideways momentum keeps it sliding around the bowl instead of falling straight in—an orbit is just perpetual falling with a sideways shove."

    "Finally, to check your intuition: if Earth vanished, the fabric would flatten and the pen would have no slope to follow—so it would float."

    "In summary, objects fall because mass reshapes space-time, tilting every path toward the mass; the steeper the tilt, the faster the fall. Does that picture click?"
]
# Compute scalar rewards
scalar_rewards = fp_reward(samples)

# Full breakdown for each sample
_sc = FirstPrinciplesReward()
rows = [_sc.breakdown(t) for t in samples]

df = pd.DataFrame(rows)
df.insert(0, "text_snippet", [t[:60] + "…" for t in samples])
df["reward_scalar"] = scalar_rewards
df



Unnamed: 0,text_snippet,analogy,step,fundamentals,engagement,clarity,complete,nojargon,reward,reward_scalar
0,**Group Relative Proximal Optimization (GRPO)*...,0.0,0.0,0.0,0.0,0.3,0.3,1.0,0.14,0.14
1,"First, imagine you’re standing on a huge soft ...",0.7,0.6,0.7,0.5,0.5,0.8,1.0,0.665,0.665


In [None]:
# GRPO training arguments using verifiers defaults
# First get the default training arguments from verifiers
import types

from trl import GRPOConfig
training_args = GRPOConfig(
    run_name="Qwen/Qwen2.5-1.5B-grpo",
    label_names=["labels"],
    generation_kwargs={
        "max_new_tokens": 256,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "do_sample": True,
    },
    logging_strategy = "steps",        # <-- string (NO trailing comma)
    logging_steps    = 10,              # every 5 optimiser steps
    report_to        = ["wandb"],      # <-- list, not str
    eval_strategy = "steps",     # (optional) run eval during training
    eval_steps         = 200,
)

# Then modify the arguments we need to customize
training_args.output_dir = output_dir
training_args.learning_rate = config["learning_rate"]
training_args.per_device_train_batch_size = 16
training_args.gradient_accumulation_steps = 4
training_args.num_train_epochs = config["num_train_epochs"]
training_args.save_steps = 25
training_args.eval_steps = 50
training_args.warmup_ratio = 0.1
training_args.lr_scheduler_type = "cosine"
training_args.push_to_hub = True
training_args.hub_model_id = hub_model_id
training_args.bf16 = True 
training_args.fp16=False
training_args.generation_kwargs["max_length"] =256  # or 512
training_args.dataloader_num_workers = 4


# GRPO specific parameters
training_args.num_generations = config["num_generations"]
training_args.max_completion_length = 256
training_args.temperature = 0.7
training_args.kl_coef = 0.05

print("Training arguments configured!")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

Training arguments configured!
Effective batch size: 64


In [18]:
print(torch.cuda.memory_allocated()/1e9, "GB allocated")
print(torch.cuda.memory_reserved()/1e9,  "GB reserved")


2.179748864 GB allocated
3.342860288 GB reserved


In [19]:
import warnings
from transformers import logging

# Ignore the specific warning about torch.utils.checkpoint and use_reentrant
warnings.filterwarnings("ignore", message="torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly.*")
warnings.filterwarnings("ignore", message="None of the inputs have requires_grad=True. Gradients will be None")
warnings.filterwarnings("ignore", message=".*Caching is incompatible with gradient checkpointing.*")
logging.set_verbosity_error()
warnings.filterwarnings("ignore")
print("Filtered out specific PyTorch checkpoint warnings.")
import os
os.environ["FLASH_ATTENTION_USE_PACKED_QKV"] = "1"




In [20]:

from transformers import TrainerCallback
import wandb, html, re, unicodedata

# 5. Create a simpler approach - use standard TRL GRPOTrainer with a fixed reward function
from trl import GRPOTrainer, GRPOConfig
from trl.core import LengthSampler
import functools

training_args.num_generations = 4
training_args.per_device_train_batch_size = 32  # Reduce to minimize complexity

prompts_for_final = [
    "Explain entropy like I'm five.",
    "Why do objects fall to the ground?",
    "What makes the sky blue?"
]
model.gradient_checkpointing_enable()
trainer = GRPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=dataset['train'],  # Your dataset
    eval_dataset=dataset['test'],  # Your evaluation dataset
    reward_funcs=[fp_reward],  # Use simplified reward function
)


print("Trainer created successfully!")
print(f"Number of training examples: {len(trainer.train_dataset)}")
print(f"Number of evaluation examples: {len(trainer.eval_dataset)}")
loader = trainer.get_train_dataloader()
print(len(loader))           # number of micro-batches per epoch


# 6. Start training
print("Starting GRPO training...")
trainer.train()

Trainer created successfully!
Number of training examples: 540
Number of evaluation examples: 60
68
Starting GRPO training...


Step,Training Loss,Validation Loss
50,-0.0,0.0




TrainOutput(global_step=51, training_loss=-1.2047844482403177e-07, metrics={'train_runtime': 6156.7729, 'train_samples_per_second': 0.263, 'train_steps_per_second': 0.008, 'total_flos': 0.0, 'train_loss': -1.2047844482403177e-07})

In [21]:
final_model_path = f"{output_dir}/final_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Model saved to {final_model_path}")
print("Model files:")
!ls -la {final_model_path}

Model saved to ./grpo_vf_results/final_model
Model files:
total 49632
drwxr-xr-x 2 root root     4096 Jun 27 21:00 .
drwxr-xr-x 6 root root     4096 Jun 27 21:00 ..
-rw-r--r-- 1 root root     5100 Jun 27 21:00 README.md
-rw-r--r-- 1 root root      810 Jun 27 21:00 adapter_config.json
-rw-r--r-- 1 root root 34895152 Jun 27 21:00 adapter_model.safetensors
-rw-r--r-- 1 root root      605 Jun 27 21:00 added_tokens.json
-rw-r--r-- 1 root root     2507 Jun 27 21:00 chat_template.jinja
-rw-r--r-- 1 root root  1671853 Jun 27 21:00 merges.txt
-rw-r--r-- 1 root root      496 Jun 27 21:00 special_tokens_map.json
-rw-r--r-- 1 root root 11422060 Jun 27 21:00 tokenizer.json
-rw-r--r-- 1 root root     4680 Jun 27 21:00 tokenizer_config.json
-rw-r--r-- 1 root root     6993 Jun 27 21:00 training_args.bin
-rw-r--r-- 1 root root  2776833 Jun 27 21:00 vocab.json


In [22]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Assuming quantization_config is already defined in a previous cell
# If not, you might need to add it here or ensure the previous cell is run

# Test the fine-tuned model
finetuned_model_path = f"{output_dir}/final_model" # Use the local saved model path
orginal_model_name = "KhushalM/Qwen2.5-1.5BSFT"

# Load tokenizers separately for each model
print("Loading tokenizer for fine-tuned model...")
finetuned_tokenizer = AutoTokenizer.from_pretrained(
    finetuned_model_path,
    trust_remote_code=True,
)
finetuned_tokenizer.pad_token = finetuned_tokenizer.eos_token
# Ensure the chat template is set for the tokenizer used by the pipeline


print("Loading tokenizer for original model...")
original_tokenizer = AutoTokenizer.from_pretrained(
    orginal_model_name,
    trust_remote_code=True,
)
original_tokenizer.pad_token = original_tokenizer.eos_token
# Ensure the chat template is set for the tokenizer used by the pipeline


def test_model_with_pipeline(prompt, model_path, tokenizer, model_name, max_new_tokens=128):
    print(f"\nTesting model: {model_name}")

    # Determine if loading from local files
    #is_local = (model_path == finetuned_model_path)

    # Load the model using pipeline
    # Pass local_files_only as a direct argument when loading from a local path
    generator = pipeline(
        "text-generation",
        model=model_path,
        tokenizer=tokenizer, # Pass the tokenizer with the chat template
        # Removed device="cuda"
        model_kwargs={"quantization_config": quantization_config, "torch_dtype": torch.bfloat16}, # Removed trust_remote_code
        #local_files_only=is_local # Pass as a direct argument
    )

    messages = [
        {"role": "system", "content": "You are an expert educator who explains concepts from first principles like Richard Feynman. Start with fundamental truths, use simple analogies, and avoid jargon."},
        {"role": "user", "content": prompt}
    ]

    # The pipeline will automatically apply the chat template if set on the tokenizer
    output = generator(
        messages,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id, # Set pad_token_id for generation
        do_sample=True, # Ensure sampling is enabled if temperature is set
        temperature=0.7,
    )

    if output and output[0] and "generated_text" in output[0]:
        # The pipeline with return_full_text=False returns only the new tokens
        # However, sometimes it might still include parts of the prompt depending on the model/tokenizer
        # Let's try to clean up the response to only get the assistant part
        generated_text = output[0]["generated_text"].strip()
        # Simple check to remove prompt if it's still included
        if generated_text.startswith("<start_of_turn>user"):
             # Find the start of the assistant's turn if the full conversation is returned
             assistant_start_index = generated_text.find("<start_of_turn>model")
             if assistant_start_index != -1:
                  generated_text = generated_text[assistant_start_index:].strip()


        # Remove the start/end turn tokens if they are present
        if generated_text.startswith("<start_of_turn>model"):
             generated_text = generated_text[len("<start_of_turn>model\n"):].strip()
        if generated_text.endswith("<end_of_turn>"):
             generated_text = generated_text[:-len("<end_of_turn>")].strip()


        return generated_text
    else:
        return "Could not generate response."

# Test with the specific prompt
test_question = "Explain how GRPO (Group Relative Proximal Optimization) works in Reinforcement LEarning a LLM"

print("\nTesting the models with the specific question:")

# Test original model
response_original = test_model_with_pipeline(test_question, orginal_model_name, original_tokenizer, orginal_model_name)
print(f"Original Model Response:\n{response_original}")

print("-" * 80)

# Test fine-tuned model
response_finetuned = test_model_with_pipeline(test_question, finetuned_model_path, finetuned_tokenizer, finetuned_model_path)
print(f"Finetuned Model Response:\n{response_finetuned}")

print("-" * 80)

Loading tokenizer for fine-tuned model...
Loading tokenizer for original model...

Testing the models with the specific question:

Testing model: KhushalM/Qwen2.5-1.5BSFT
Original Model Response:
Imagine you’re playing a game where you have to find the best path through a maze by learning which moves lead to rewards.

In reinforcement learning, agents learn by trying different actions and getting feedback about success or failure.

GRPO is a way to optimize these choices so the agent picks paths that maximize reward over time. It does this by looking at nearby solutions and adjusting its strategy based on similarities.

So, GRPO helps AI systems improve their decision-making by considering nearby options as they learn new tasks. Does this help you understand how it fits into RL?
--------------------------------------------------------------------------------

Testing model: ./grpo_vf_results/final_model
Finetuned Model Response:
Understanding Group Relative Proximal Optimization (GRPO)

In [23]:
# Finish WandB run
wandb.finish()
print("Training complete! Check your WandB dashboard for training metrics.")


0,1
batch/mean_reward,█▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▄▇▄▁
eval/clip_ratio/high_max,▁
eval/clip_ratio/high_mean,▁
eval/clip_ratio/low_mean,▁
eval/clip_ratio/low_min,▁
eval/clip_ratio/region_mean,▁
eval/completions/clipped_ratio,▁
eval/completions/max_length,▁
eval/completions/max_terminated_length,▁
eval/completions/mean_length,▁

0,1
batch/mean_reward,0.18844
eval/clip_ratio/high_max,0.0
eval/clip_ratio/high_mean,0.0
eval/clip_ratio/low_mean,0.0
eval/clip_ratio/low_min,0.0
eval/clip_ratio/region_mean,0.0
eval/completions/clipped_ratio,1.0
eval/completions/max_length,256.0
eval/completions/max_terminated_length,0.0
eval/completions/mean_length,256.0


Training complete! Check your WandB dashboard for training metrics.


In [24]:
import shutil
import os
from IPython.display import FileLink

# Path to your fine-tuned model folder
#final_model_path = "./grpo_vf_results"  # Update if your folder name is different

# Output ZIP file name
zip_name = "qwen2.5-1.5B-sft_finetuned"

# Create ZIP archive
shutil.make_archive(zip_name, 'zip', final_model_path)

# Display a download link (works in Jupyter)
zip_file = zip_name + ".zip"
if os.path.exists(zip_file):
    display(FileLink(zip_file))
    print("✅ Model zipped! Click the link above to download.")
else:
    print("❌ Failed to create ZIP file.")


✅ Model zipped! Click the link above to download.


In [None]:
# Cleanup and final messages
wandb.finish()

print("Verifiers GRPO training process completed!")
print(f"Model saved to: {output_dir}")
print("\nKey advantages of using verifiers framework:")
print("- More efficient GRPO implementation")
print("- Better support for custom environments")
print("- Cleaner abstraction for reward functions")
print("- Built-in support for format checking")
print("- Easier integration with evaluation pipelines")


In [None]:
# Optional: Test with custom prompts
custom_test_prompts = [
    "Explain why the sky is blue using first principles",
    "How does a microwave oven work from fundamental concepts?",
    "Why do magnets attract certain materials?"
]

print("Testing with additional custom prompts...")
for i, prompt in enumerate(custom_test_prompts):
    print(f"\n--- Test {i+1} ---")
    print(f"Prompt: {prompt}")

    try:
        # Generate response (simplified - you may need to adjust based on your model setup)
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = response[len(prompt):].strip()  # Remove the input prompt

        print(f"Response: {response[:300]}...'" if len(response) > 300 else f"Response: {response}")

        # Calculate reward
        reward_score = first_principles_reward_func(prompt, response, "")
        print(f"Reward Score: {reward_score:.3f}")

    except Exception as e:
        print(f"Error generating response: {e}")

print("\nCustom testing completed!")


In [None]:
import warnings

# Ignore the specific warning about torch.utils.checkpoint and use_reentrant
warnings.filterwarnings("ignore", message="torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly.*")
warnings.filterwarnings("ignore", message="None of the inputs have requires_grad=True. Gradients will be None")

print("Filtered out specific PyTorch checkpoint warnings.")

