# Constitutional AI Evaluation - Google Colab

**Purpose**: Run a comprehensive evaluation of all three models (Base, Stage 2, Stage 3) on the Constitutional AI test set.

**Important**: This notebook requires LoRA adapters stored in Google Drive at:
- `MyDrive/Constitutional_AI_Adapters/stage2_lora_adapters/`
- `MyDrive/Constitutional_AI_Adapters/stage3_lora_adapters/`

## Step 1: Check GPU Availability

In [None]:
import torch

# Summarise the PyTorch build and, when a CUDA device is present, the GPU this
# runtime exposes. Availability is queried once and reused for the branch below.
has_cuda = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")
if not has_cuda:
    # No GPU is attached to this runtime.
    print(" Change runtime type to GPU")
else:
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU memory: {total_memory_gb:.2f} GB")

## Step 2: Log in to Hugging Face & Clone Repository

In [None]:
# Log in to Hugging Face (required for Gemma model access).
# The token is read with hidden input and is never printed by this cell.
# On success it is exported as the HF_TOKEN environment variable so that
# commands launched from later `!` / bash cells can read it.
#
# Flow:
#   1. Primary path: prompt with getpass, validate, login(), export HF_TOKEN.
#   2. Fallback path: if ANY step above raises (including an empty token),
#      fall back to the interactive login() prompt and try to recover the
#      token from the saved credentials.
import os

# Clear any existing tokens so a stale value from an earlier run is not reused
os.environ.pop("HF_TOKEN", None)
os.environ.pop("HUGGINGFACEHUB_API_TOKEN", None)

from huggingface_hub import login, HfApi

try:
    import getpass as gp
    raw = gp.getpass("Paste your Hugging Face token (input hidden): ")
    # Normalise to str: decode if the prompt returned bytes/bytearray
    token = raw.decode() if isinstance(raw, (bytes, bytearray)) else raw
    if not isinstance(token, str):
        raise TypeError(f"Unexpected token type: {type(token).__name__}")
    # Drop whitespace/newlines picked up when pasting
    token = token.strip()
    if not token:
        raise ValueError("Empty token provided")
    
    # Login and set environment variable
    login(token=token, add_to_git_credential=False)
    os.environ["HF_TOKEN"] = token
    
    # Confirm the token works by asking the Hub who it belongs to
    who = HfApi().whoami(token=token)
    print(f"Logged in as: {who.get('name') or who.get('email') or 'OK'}")
    print("HF_TOKEN environment variable set for bash cells.")
    
except Exception as e:
    # Any failure above lands here; only the exception text is printed, not the token.
    print(f"[HF Login] getpass flow failed: {e}")
    print("Falling back to interactive login widget...")
    login()
    
    # Try to get token from saved credentials
    try:
        # NOTE(review): HfFolder may be deprecated or absent in newer
        # huggingface_hub releases — confirm against the installed version.
        # An ImportError here is caught by the `except` below.
        from huggingface_hub import HfFolder
        token = HfFolder.get_token()
        if token:
            os.environ["HF_TOKEN"] = token
            print("HF_TOKEN environment variable set from saved credentials.")
        # whoami() is called without an explicit token here
        who = HfApi().whoami()
        print(f"Logged in as: {who.get('name') or who.get('email') or 'OK'}")
    except Exception as e2:
        print(f"[HF Login] Could not set HF_TOKEN env var: {e2}")
        print("You may need to run 'huggingface-cli login' in a bash cell.")

In [None]:
import os

# Change this to your GitHub repository URL
REPO_URL = "https://github.com/Jai-Dhiman/ml-learning.git"

# Clone repository
if not os.path.exists('ml-learning'):
    !git clone {REPO_URL}
else:
    print("✓ Repository already cloned")

# Change to project directory
%cd ml-learning/constitutional-ai-stage4

# Verify structure
!ls -la

## Step 4: Install Python Packages

In [None]:
# Install required packages
!pip install -q torch transformers peft datasets accelerate sentencepiece protobuf

print("✓ Dependencies installed")

## Step 5: Mount Google Drive

**Required**: Mount Drive to access LoRA adapters

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the adapter paths used later in
# this notebook all live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

print("\n✓ Google Drive mounted successfully")

In [None]:
# Both LoRA adapter directories sit side by side under one Drive folder.
ADAPTER_ROOT = "/content/drive/MyDrive/Constitutional_AI_Adapters"
STAGE2_ADAPTER_PATH = ADAPTER_ROOT + "/stage2_lora_adapters"
STAGE3_ADAPTER_PATH = ADAPTER_ROOT + "/stage3_lora_adapters"

# Echo the resolved locations so they can be checked by eye.
for stage_number, adapter_dir in ((2, STAGE2_ADAPTER_PATH), (3, STAGE3_ADAPTER_PATH)):
    print(f"Stage {stage_number} adapters: {adapter_dir}")

## Step 6: Configure Adapter Paths

In [None]:
import os
from pathlib import Path

# Adapter locations in Google Drive (both stages share one parent folder).
ADAPTER_ROOT = "/content/drive/MyDrive/Constitutional_AI_Adapters"
STAGE2_ADAPTER_PATH = f"{ADAPTER_ROOT}/stage2_lora_adapters"
STAGE3_ADAPTER_PATH = f"{ADAPTER_ROOT}/stage3_lora_adapters"

# Section banner for the validation report printed by the rest of this cell.
banner_rule = "=" * 70
print(banner_rule, "ADAPTER PATH VALIDATION", banner_rule, sep="\n")

def validate_adapter_path(path, stage_name):
    """Check that a LoRA adapter directory exists and holds usable adapter files.

    A human-readable report is printed as each check runs.

    Args:
        path: Directory expected to contain the adapter files.
        stage_name: Label printed as the heading of this adapter's report.

    Returns:
        True if ``path`` is a directory and every required file is present as
        a non-empty regular file; False otherwise.
    """
    print(f"\n{stage_name}:")
    print(f"  Path: {path}")

    # isdir rather than exists: a stray *file* at this path is not an adapter directory.
    if not os.path.isdir(path):
        print(f"  ✗ ERROR: Directory not found!")
        print(f"\n  Please ensure your adapters are uploaded to:")
        print(f"  {path}")
        return False

    print(f"  ✓ Directory exists")

    # Check required files
    required_files = ['adapter_config.json', 'adapter_model.safetensors']
    all_found = True

    for file_name in required_files:
        file_path = os.path.join(path, file_name)

        # isfile rather than exists: a directory with the right name is not a usable file.
        if not os.path.isfile(file_path):
            print(f"  ✗ Missing: {file_name}")
            all_found = False
            continue

        size_bytes = os.path.getsize(file_path)
        if size_bytes == 0:
            # A zero-byte file (e.g. an interrupted upload) cannot be a valid
            # config or weight file; reject it here instead of hours later.
            print(f"  ✗ Empty file (0 bytes): {file_name}")
            all_found = False
        else:
            size_mb = size_bytes / (1024 * 1024)
            print(f"  ✓ {file_name} ({size_mb:.1f} MB)")

    return all_found

# Check each stage's adapters. Both checks always run so the report covers
# every problem in one pass.
stage2_valid = validate_adapter_path(STAGE2_ADAPTER_PATH, "Stage 2 Adapters")
stage3_valid = validate_adapter_path(STAGE3_ADAPTER_PATH, "Stage 3 Adapters")

summary_rule = "=" * 70
print("\n" + summary_rule)

if not (stage2_valid and stage3_valid):
    # Failure path: show the expected Drive layout, then stop the notebook.
    failure_report = [
        "✗ VALIDATION FAILED",
        summary_rule,
        "\nPlease fix the issues above before proceeding.",
        "\nYour adapters should be organized as:",
        "  MyDrive/",
        "    └── Constitutional_AI_Adapters/",
        "        ├── stage2_lora_adapters/",
        "        │   ├── adapter_config.json",
        "        │   └── adapter_model.safetensors",
        "        └── stage3_lora_adapters/",
        "            ├── adapter_config.json",
        "            └── adapter_model.safetensors",
    ]
    print("\n".join(failure_report))
    raise RuntimeError("Adapter validation failed. Please upload your adapters to Google Drive.")

print("✓ ALL ADAPTERS VALIDATED SUCCESSFULLY")
print(summary_rule)

## Step 7: Validate Setup

In [None]:
# Run validation script with custom adapter paths.
# The `{...}` placeholders are filled in from the notebook variables of the
# same name before the command is run; the surrounding quotes keep each path a
# single shell argument. Every line except the last must end with a backslash
# so the command stays one invocation.
!python3 src/inference/validate_setup.py \
  --stage2-adapter-path "{STAGE2_ADAPTER_PATH}" \
  --stage3-adapter-path "{STAGE3_ADAPTER_PATH}"

## Step 8: Quick Test (Optional)

Test with 5 prompts (2-3 min)

In [None]:
# Quick test with 5 prompts
!python3 src/evaluation/evaluation_runner.py \
  --test-file artifacts/evaluation/extended_test_prompts.jsonl \
  --models stage3_constitutional \
  --max-prompts 5 \
  --stage2-adapter-path "{STAGE2_ADAPTER_PATH}" \
  --stage3-adapter-path "{STAGE3_ADAPTER_PATH}"
  --stage2-adapter-path "{STAGE2_ADAPTER_PATH}" \
  --stage3-adapter-path "{STAGE3_ADAPTER_PATH}"

print("\n✓ Quick test complete! If this worked, proceed to full evaluation.")

## Step 9: Full Evaluation (6-8 hours)

Evaluates all 3 models on 110 prompts

In [None]:
import time
from datetime import datetime

# Record start time
start_time = time.time()
print(f"Starting evaluation at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("\nThis will take approximately 6-8 hours on T4 GPU (110 prompts × 3 models)...\n")
print("=" * 70)

# Run full evaluation with extended test set (110 prompts)
!python3 src/evaluation/evaluation_runner.py \
  --test-file artifacts/evaluation/extended_test_prompts.jsonl \
  --models base stage2_helpful stage3_constitutional \
  --max-prompts 110 \
  --stage2-adapter-path "{STAGE2_ADAPTER_PATH}" \
  --stage3-adapter-path "{STAGE3_ADAPTER_PATH}"
  --output-dir artifacts/evaluation/final_results \
  --stage2-adapter-path "{STAGE2_ADAPTER_PATH}" \
  --stage3-adapter-path "{STAGE3_ADAPTER_PATH}"

# Record end time
end_time = time.time()
duration = (end_time - start_time) / 3600  # Convert to hours

print("=" * 70)
print(f"\n✓ Evaluation complete!")
print(f"Duration: {duration:.2f} hours")
print(f"Finished at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## Step 10: View Results

In [None]:
import json
import pandas as pd
import os
from pathlib import Path

# Check if results exist
results_file = Path('artifacts/evaluation/final_results')

if not results_file.exists():
    print("⚠️  Results directory not found.")
    print("Make sure Cell 7 (Full Evaluation) completed successfully.")
    print("\nThe evaluation saves results to: artifacts/evaluation/final_results/")
else:
    # Find the most recent results file
    result_files = sorted(results_file.glob('evaluation_results_*.json'))
    
    if not result_files:
        print("⚠️  No results files found in artifacts/evaluation/final_results/")
        print("\nMake sure Cell 7 completed successfully.")
        print("\nAvailable files:")
        !ls -lah artifacts/evaluation/final_results/
    else:
        # Load most recent results
        latest_results = result_files[-1]
        print(f"Loading results from: {latest_results}")
        
        with open(latest_results, 'r') as f:
            results = json.load(f)
        
        print("=" * 70)
        print("EVALUATION RESULTS SUMMARY")
        print("=" * 70)
        
        # Display aggregate scores
        if 'comparison_summary' in results:
            print("\nAggregate Scores by Model:")
            for model, scores in results['comparison_summary'].get('aggregate_scores', {}).items():
                print(f"  {model}: {scores:.4f}")
        
        # Load and display CSV (find most recent summary file)
        summary_files = sorted(results_file.glob('evaluation_summary_*.csv'))
        
        if summary_files:
            print("\n" + "=" * 70)
            print("Detailed Comparison Table:")
            print("=" * 70)
            df = pd.read_csv(summary_files[-1])
            print(df.to_string())
        
        print("\n" + "=" * 70)
        print("Results saved to:")
        print(f"  - {latest_results}")
        if summary_files:
            print(f"  - {summary_files[-1]}")
        print("=" * 70)

## Step 11: Backup to Drive

In [None]:
import os

# Create results directory in Google Drive
DRIVE_RESULTS_DIR = '/content/drive/MyDrive/constitutional_ai_evaluation_results'
!mkdir -p "{DRIVE_RESULTS_DIR}"

# Copy results
!cp -r artifacts/evaluation/final_results/* "{DRIVE_RESULTS_DIR}"/

print(f"✓ Results copied to Google Drive: {DRIVE_RESULTS_DIR}")
print("  You can access these files from your Google Drive at any time!")

## Step 12: Download (Optional)

In [None]:
from google.colab import files
import os

# Create a zip file of all results
!zip -r evaluation_results.zip artifacts/evaluation/final_results/

print("Downloading results...")
files.download('evaluation_results.zip')

print("\n✓ Results downloaded!")
print("\nExtract the zip file locally to access:")
print("  - results.json (complete evaluation data)")
print("  - comparison.csv (model comparison table)")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from google.colab import files

# Load results.
# Use the same file pattern as Step 10 (evaluation_summary_*.csv); fall back to
# the older 'comparison.csv' name if that is what exists on disk.
results_dir = Path('artifacts/evaluation/final_results')
summary_candidates = sorted(results_dir.glob('evaluation_summary_*.csv'))
legacy_summary = results_dir / 'comparison.csv'
if summary_candidates:
    summary_path = summary_candidates[-1]
elif legacy_summary.exists():
    summary_path = legacy_summary
else:
    raise FileNotFoundError(
        f"No evaluation_summary_*.csv or comparison.csv found in {results_dir}. "
        "Run Step 9 (Full Evaluation) first."
    )
print(f"Loading summary from: {summary_path}")
df = pd.read_csv(summary_path)

# Extract model names and scores.
# NOTE(review): assumes one row per model, a 'model' column, and one numeric
# column per principle — confirm against the CSV the evaluation runner writes.
principles = ['harm_prevention', 'truthfulness', 'helpfulness', 'fairness']
has_model_column = 'model' in df.columns
models = df['model'].tolist() if has_model_column else ['Base', 'Stage 2', 'Stage 3']

# Real scores are used only when every expected column is present; otherwise
# the chart falls back to placeholder values and says so loudly.
PLACEHOLDER_SCORES = [0.5, 0.6, 0.7, 0.65]
has_real_scores = has_model_column and all(p in df.columns for p in principles)
if not has_real_scores:
    print("⚠️  Expected columns not found in the summary CSV.")
    print(f"    Needed: {['model'] + principles}")
    print(f"    Found:  {df.columns.tolist()}")
    print("    Plotting PLACEHOLDER values — this chart does not reflect real results.")

# Create radar chart
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(projection='polar'))

angles = np.linspace(0, 2 * np.pi, len(principles), endpoint=False).tolist()
angles += angles[:1]  # repeat the first angle to close the polygon

for row_index, model in enumerate(models):
    if has_real_scores:
        scores = df[principles].iloc[row_index].astype(float).tolist()
    else:
        scores = list(PLACEHOLDER_SCORES)
    scores += scores[:1]  # close the polygon
    ax.plot(angles, scores, 'o-', linewidth=2, label=model)
    ax.fill(angles, scores, alpha=0.25)

ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
ax.set_xticks(angles[:-1])
ax.set_xticklabels([p.replace('_', ' ').title() for p in principles])
ax.set_ylim(0, 1)
ax.set_ylabel('Score', labelpad=30)
chart_title = 'Constitutional Principle Scores by Model'
if not has_real_scores:
    # Make it impossible to mistake the placeholder figure for real data.
    chart_title += ' (PLACEHOLDER DATA)'
ax.set_title(chart_title, size=16, pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
ax.grid(True)

plt.tight_layout()
plt.savefig('principle_comparison_radar.png', dpi=300, bbox_inches='tight')
print("\n✓ Radar chart saved to: principle_comparison_radar.png")
plt.show()

# Download visualization
files.download('principle_comparison_radar.png')

## Summary

**Evaluation Complete! 🎉**

You now have:
- Complete evaluation results (JSON)
- Model comparison table (CSV)
- Optional visualizations

**Next Steps**:
1. Review results in `evaluation_results_*.json` and `evaluation_summary_*.csv`
2. Create statistical analysis scripts (significance testing, effect sizes)
3. Generate publication-quality figures
4. Write the paper!

See `RESEARCH_PUBLICATION_PLAN.md` for detailed next steps.