# Stage 3: Constitutional AI Training

This notebook generates critique-revision preference pairs and then runs DPO training on them.

**Requires**: Stage 2 adapters (load from Drive or retrain)

**Expected time**: ~4-5 hours on T4 GPU

In [None]:
# Cell 1: Clone Repository
import os

os.chdir('/content')

# Remove existing directory and clone fresh
!rm -rf ml-learning
!git clone https://github.com/Jai-Dhiman/ml-learning.git
os.chdir('/content/ml-learning')

print(f"✅ Repository cloned to: {os.getcwd()}")

In [None]:
# Cell 2: Install uv Package Manager
!pip -q install -U uv

import shutil
print(f"✅ uv installed at: {shutil.which('uv')}")

In [None]:
# Cell 3: HuggingFace Authentication
# Logs in to the Hugging Face Hub and stores the token in os.environ['HF_TOKEN']
# (the status messages below state this is intended for the bash cells).
# Primary path: hidden prompt via getpass. Fallback: the interactive login widget.
import getpass
import os

from huggingface_hub import HfApi, login


def _saved_hf_token():
    """Return the token saved by a previous login, or None if none is stored.

    Prefers `huggingface_hub.get_token` and falls back to `HfFolder.get_token`
    on releases that do not export `get_token`. An ImportError from the
    fallback propagates to the caller.
    """
    try:
        from huggingface_hub import get_token
    except ImportError:
        from huggingface_hub import HfFolder
        return HfFolder.get_token()
    return get_token()


# Clear any existing tokens so a stale value cannot be picked up instead of
# the one provided in this session.
os.environ.pop('HF_TOKEN', None)
os.environ.pop('HUGGINGFACEHUB_API_TOKEN', None)

try:
    raw = getpass.getpass("Paste your Hugging Face token (input hidden): ")
    # Defensive normalisation: accept bytes-like input, reject anything else.
    token = raw.decode() if isinstance(raw, (bytes, bytearray)) else raw
    if not isinstance(token, str):
        raise TypeError(f"Unexpected token type: {type(token).__name__}")
    token = token.strip()
    if not token:
        raise ValueError("Empty token provided")

    # Login and set environment variable
    login(token=token, add_to_git_credential=False)
    os.environ['HF_TOKEN'] = token

    # whoami() doubles as a check that the token is accepted by the Hub.
    who = HfApi().whoami(token=token)
    print(f"✅ Logged in as: {who.get('name') or who.get('email') or 'OK'}")
    print('HF_TOKEN environment variable set for bash cells.')

except Exception as e:
    # Any failure above (empty token, rejected token, prompt unavailable)
    # falls through to the interactive widget rather than aborting the cell.
    print(f"[HF Login] getpass flow failed: {e}")
    print("Falling back to interactive login widget...")
    login()

    # Try to get token from saved credentials
    try:
        token = _saved_hf_token()
        if token:
            os.environ['HF_TOKEN'] = token
            print('HF_TOKEN environment variable set from saved credentials.')
        who = HfApi().whoami()
        print(f"✅ Logged in as: {who.get('name') or who.get('email') or 'OK'}")
    except Exception as e2:
        print(f"[HF Login] Could not set HF_TOKEN env var: {e2}")
        print("You may need to run 'huggingface-cli login' in a bash cell.")

In [None]:
%%bash
# Cell 4: Install Dependencies with uv
# NOTE: the %%bash magic must stay on the first line of the cell; IPython only
# recognises a cell magic there, so a comment placed above it breaks the cell.
set -e
cd /content/ml-learning

# Create virtual environment
echo 'Creating uv virtual environment...'
uv venv

echo 'Installing dependencies...'

# Install PyTorch first: try the CUDA 11.8 wheel index, and fall back to the
# default package index if that install fails.
uv pip install --python .venv/bin/python torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 || \
  uv pip install --python .venv/bin/python torch torchvision torchaudio

# Install remaining dependencies. These are minimum-version bounds (>=), not
# exact pins, apart from the protobuf upper bound.
uv pip install --python .venv/bin/python \
  "transformers>=4.43.0" \
  "trl>=0.9.6" \
  "peft>=0.13.0" \
  "datasets>=2.19.0" \
  "accelerate>=0.28.0" \
  sentencepiece \
  safetensors \
  einops \
  evaluate \
  "protobuf<5"

# Verify installation: importing torch inside the venv fails the cell (set -e)
# if the install is broken, and reports whether CUDA is visible.
echo 'Verifying torch installation...'
.venv/bin/python -c "import torch; print(f'PyTorch {torch.__version__}'); print(f'CUDA available: {torch.cuda.is_available()}')"
echo '✅ Dependencies installed successfully!'


In [None]:
# Cell 5: Download Stage 2 Artifacts from Google Drive
# Mounts Drive, copies the Stage 2 artifact folder into the repository checkout
# and checks that the copied LoRA adapter directory has an adapter_config.json.
import os
import shutil
from pathlib import Path

from google.colab import drive

# Where the Stage 2 artifacts are expected inside the mounted Drive.
drive_artifacts_path = '/content/drive/MyDrive/artifacts/stage2_finetuning_artifacts'

banner = "=" * 70
print(banner, "DOWNLOADING STAGE 2 ARTIFACTS FROM GOOGLE DRIVE", banner, sep="\n")

# Step 1: mount Google Drive
print("\n1. Mounting Google Drive...")
drive.mount('/content/drive')

# Step 2: locate the artifacts on Drive
print(
    "\n2. Looking for Stage 2 artifacts in Google Drive...",
    f"   Expected location: {drive_artifacts_path}",
    "   (If your path is different, update the 'drive_artifacts_path' variable above)",
    "",
    sep="\n",
)

drive_path = Path(drive_artifacts_path)

if not drive_path.exists():
    # Explain the expected layout before aborting the cell.
    print(
        f"❌ ERROR: Path not found: {drive_path}",
        "\nExpected directory structure in Google Drive:",
        "  MyDrive/",
        "  └── artifacts/",
        "      └── stage2_finetuning_artifacts/",
        "          └── lora_adapters/",
        "\nPlease:",
        "  1. Upload your local artifacts/stage2_finetuning_artifacts/ folder",
        "     to Google Drive at: MyDrive/artifacts/",
        "  2. Or update the 'drive_artifacts_path' variable above with your actual path",
        "  3. Re-run this cell",
        sep="\n",
    )
    raise FileNotFoundError(f"Stage 2 artifacts not found at: {drive_path}")

# Step 3: copy into the workspace, replacing any earlier copy
print("\n3. Copying artifacts from Drive to Colab workspace...")
target_dir = Path('/content/ml-learning/artifacts/stage2_finetuning_artifacts')
target_dir.parent.mkdir(parents=True, exist_ok=True)

# copytree needs a non-existent destination, so drop a previous copy first.
if target_dir.exists():
    shutil.rmtree(target_dir)
shutil.copytree(drive_path, target_dir)

# The copy only counts as usable when the adapter config file is present.
lora_dir = target_dir / 'lora_adapters'
adapter_config_present = lora_dir.exists() and (lora_dir / 'adapter_config.json').exists()
if not adapter_config_present:
    raise FileNotFoundError(f"lora_adapters not found or incomplete at: {lora_dir}")

print(
    "\n✅ Stage 2 artifacts successfully copied!",
    f"   Source: {drive_path}",
    f"   Destination: {target_dir}",
    f"   LoRA adapters: {lora_dir}",
    sep="\n",
)
files_list = list(lora_dir.glob('*'))
# Only the first five names are shown to keep the output short.
print(f"   Files ({len(files_list)}): {[f.name for f in files_list[:5]]}")

print("\n" + banner, "✅ ARTIFACTS READY", banner, sep="\n")

In [None]:
%%bash
# Cell 6: PREFLIGHT TEST (1 pair)
# NOTE: the %%bash magic must stay on the first line of the cell; IPython only
# recognises a cell magic there, so a comment placed above it breaks the cell.
# Runs both Stage 3 scripts end to end on a single example, writing to /tmp,
# so problems show up before the long full-size runs in the following cells.
set -e
cd /content/ml-learning
export WANDB_DISABLED=true

echo "======================================================================"
echo "PREFLIGHT: Testing with 1 sample"
echo "======================================================================"

# Test critique-revision: one example from the first row of the test split.
echo ""
echo "[1/2] Testing critique-revision generation..."
uv run python critique-revision-system/src/critique_revision.py \
  --num-examples 1 \
  --split 'test[:1]' \
  --output /tmp/preflight_pairs.jsonl \
  --adapter-path artifacts/stage2_finetuning_artifacts/lora_adapters \
  --seed 42

# Test DPO: a single optimisation step (--max-steps 1) on the pair generated above.
echo ""
echo "[2/2] Testing DPO training..."
uv run python critique-revision-system/src/training/train_dpo_stage3.py \
  --repo-root /content/ml-learning \
  --pairs-path /tmp/preflight_pairs.jsonl \
  --base-model-id google/gemma-2b-it \
  --stage2-adapter-path artifacts/stage2_finetuning_artifacts/lora_adapters \
  --output-dir /tmp/preflight_stage3 \
  --per-device-train-batch-size 1 \
  --gradient-accumulation-steps 1 \
  --learning-rate 5e-5 \
  --num-train-epochs 1 \
  --max-steps 1 \
  --beta 0.1 \
  --seed 42 \
  --cpu-ref-model

# Only reached when both commands above exited successfully (set -e).
echo ""
echo "======================================================================"
echo "✅ PREFLIGHT PASSED - Ready for full training"
echo "======================================================================"

In [None]:
%%bash
# Cell 7: Generate Critique-Revision Pairs (2500 pairs)
# NOTE: the %%bash magic must stay on the first line of the cell; IPython only
# recognises a cell magic there, so a comment placed above it breaks the cell.
set -e
cd /content/ml-learning
mkdir -p artifacts/stage3_constitutional/pairs

echo "======================================================================"
echo "STAGE 3 - PART 1: CRITIQUE-REVISION GENERATION"
echo "======================================================================"

# The split spec selects 1000 test rows plus 1500 train rows, matching
# --num-examples 2500.
uv run python critique-revision-system/src/critique_revision.py \
  --num-examples 2500 \
  --split 'test[:1000]+train[:1500]' \
  --output artifacts/stage3_constitutional/pairs/pairs.jsonl \
  --adapter-path artifacts/stage2_finetuning_artifacts/lora_adapters \
  --seed 42

# ls fails the cell (set -e) if the output file was not written.
echo ""
echo "✅ Pairs generated"
ls -lh artifacts/stage3_constitutional/pairs/pairs.jsonl

In [None]:
# Cell 8: Validate Pairs Quality
# Loads the generated pairs, counts revisions that look malformed, and prints
# the most frequently referenced principle ids.
import json
from collections import Counter

pairs_path = '/content/ml-learning/artifacts/stage3_constitutional/pairs/pairs.jsonl'

# Heuristic thresholds used by the malformed check below.
MIN_REVISED_CHARS = 30       # shorter revisions are counted as malformed
MARKER_WINDOW = 100          # only this many leading characters are scanned
# Presumably these phrases signal critique text leaking into the revision —
# TODO confirm against the generation prompt.
CRITIQUE_MARKERS = ('accurate, but', 'could be improved')
MAX_MALFORMED_PCT = 5        # below this rate the data is reported as GOOD

# Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
with open(pairs_path, encoding='utf-8') as f:
    pairs = [json.loads(line) for line in f if line.strip()]

# An empty file would otherwise surface as a ZeroDivisionError further down.
if not pairs:
    raise ValueError(f"No pairs found in {pairs_path}; re-run pair generation first.")

print(f"Total pairs: {len(pairs)}")

# Check malformed
malformed = 0
for p in pairs:
    # `or ''` also covers an explicit null value, not just a missing key.
    revised = p.get('revised_response') or ''
    head = revised.lower()[:MARKER_WINDOW]
    if len(revised) < MIN_REVISED_CHARS or any(ind in head for ind in CRITIQUE_MARKERS):
        malformed += 1

malformed_rate = malformed / len(pairs) * 100
print(f"\nQuality: {len(pairs)-malformed} valid ({100-malformed_rate:.1f}%)")
print(f"Malformed: {malformed} ({malformed_rate:.1f}%)")

if malformed_rate < MAX_MALFORMED_PCT:
    print("✅ Quality GOOD")
else:
    print("⚠️  Quality needs improvement")

# Top principles: tally every id listed on every pair.
principle_counts = Counter()
for p in pairs:
    principle_counts.update(p.get('principle_ids') or [])

print(f"\nTop Principles:")
for pid, count in principle_counts.most_common(5):
    print(f"  {pid}: {count}")

In [None]:
%%bash
# Cell 9: DPO Training
# NOTE: the %%bash magic must stay on the first line of the cell; IPython only
# recognises a cell magic there, so a comment placed above it breaks the cell.
# NOTE(review): the preflight cell exports WANDB_DISABLED=true but this cell
# does not — confirm that difference is intentional.
set -e
cd /content/ml-learning

echo "======================================================================"
echo "STAGE 3 - PART 2: DPO TRAINING"
echo "======================================================================"

# Full run over the generated pairs: one epoch, per-device batch size 1 with
# 8 gradient-accumulation steps, saving every 200 steps and logging every 10.
uv run python critique-revision-system/src/training/train_dpo_stage3.py \
  --repo-root /content/ml-learning \
  --pairs-path artifacts/stage3_constitutional/pairs/pairs.jsonl \
  --base-model-id google/gemma-2b-it \
  --stage2-adapter-path artifacts/stage2_finetuning_artifacts/lora_adapters \
  --output-dir artifacts/stage3_constitutional \
  --per-device-train-batch-size 1 \
  --gradient-accumulation-steps 8 \
  --learning-rate 5e-5 \
  --num-train-epochs 1.0 \
  --beta 0.1 \
  --seed 42 \
  --save-steps 200 \
  --logging-steps 10 \
  --cpu-ref-model

# Only reached when the training command exited successfully (set -e).
echo ""
echo "✅ Stage 3 DPO training complete!"

In [None]:
# Cell 10: Download Stage 3 Artifacts
# Zips the Stage 3 output directory, starts a browser download of the archive
# and prints instructions for unpacking it locally.
import zipfile
import os
from pathlib import Path
from google.colab import files

rule = '=' * 70
print(rule, 'DOWNLOADING STAGE 3 ARTIFACTS', rule, sep='\n')

# Define paths
artifacts_dir = Path('/content/ml-learning/artifacts/stage3_constitutional')
zip_name = 'stage3_constitutional_artifacts'
zip_path = f'/content/{zip_name}'
zip_file = f'{zip_path}.zip'

if artifacts_dir.exists():
    print(f'Found artifacts directory: {artifacts_dir}')
    print('\nCreating zip archive...')

    # Archive names are taken relative to the parent directory, so every entry
    # is stored under a top-level "stage3_constitutional/" folder.
    archive_root = artifacts_dir.parent
    with zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder, _subdirs, filenames in os.walk(artifacts_dir):
            for filename in filenames:
                source = os.path.join(folder, filename)
                archive.write(source, os.path.relpath(source, archive_root))

    zip_size_mb = os.path.getsize(zip_file) / (1024 * 1024)

    # The contents list below is static text, not read back from the archive.
    print(
        f'Zip archive created: {zip_file}',
        f'Archive size: {zip_size_mb:.2f} MB',
        '\nContents:',
        '  - pairs/pairs.jsonl (critique/revision training pairs)',
        '  - models/lora_adapters/ (DPO-trained Stage 3 model)',
        '  - dpo_dataset/train.jsonl (preprocessed DPO training data)',
        '  - metrics.json (training metrics)',
        '  - checkpoints/ (training checkpoints)',
        '  - logs/ (training logs)',
        '\nInitiating download...',
        sep='\n',
    )

    # Download the zip file
    files.download(zip_file)

    print(
        '\n' + rule,
        'EXTRACTION INSTRUCTIONS FOR LOCAL MACHINE',
        rule,
        f'\n1. Locate downloaded file: {zip_name}.zip',
        '\n2. Extract to your ml-learning artifacts directory:',
        '   cd ~/Documents/ml-learning/artifacts',
        f'   unzip ~/Downloads/{zip_name}.zip',
        '\n3. Verify extraction:',
        '   ls -lh ~/Documents/ml-learning/artifacts/stage3_constitutional/',
        '\n4. Stage 3 artifacts will be at:',
        '   ~/Documents/ml-learning/artifacts/stage3_constitutional/',
        '\n' + rule,
        '✅ STAGE 3 COMPLETE - Artifacts downloaded!',
        rule,
        sep='\n',
    )
else:
    # Nothing to package: report the problem and leave the cell without raising.
    print(f'ERROR: Artifacts directory not found at {artifacts_dir}')
    print('Please ensure Stage 3 training completed successfully.')