# Baseline Calibration

This notebook implements Phase 1: baseline calibration of LLM-simulated survey responses against General Social Survey (GSS) ground truth.

In [None]:
import asyncio
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from hivesight_calibration import (
    GSSLoader, 
    PersonaGenerator, 
    LLMSurvey, 
    Calibrator,
    OPINION_QUESTIONS
)

## 1. Load GSS Data and Generate Personas

In [None]:
# Build the loader over the local data directory and read the requested years.
loader = GSSLoader(data_dir=Path('../data'))
gss = loader.load(years=[2022, 2024])

# The baseline looks at one question only: capital punishment.
question_id = 'cappun'
question_text, response_scale = (
    loader.get_question_text(question_id),
    loader.get_response_scale(question_id),
)

print(f"Question: {question_text}", f"Scale: {response_scale}", sep="\n")

# Keep only rows with a non-missing answer to this question; the copy makes
# the subset independent of `gss`.
answered_mask = gss[question_id].notna()
gss_valid = gss[answered_mask].copy()
print(f"\n{len(gss_valid):,} respondents with valid {question_id} responses")

In [None]:
# Generate personas from GSS respondents
generator = PersonaGenerator()

# Sample a subset for initial testing. The draw is capped at the number of
# available respondents: sampling without replacement raises ValueError when
# n is larger than the frame.
sample_size = 100  # Start small
n_draw = min(sample_size, len(gss_valid))
if n_draw < sample_size:
    print(f"Only {n_draw} respondents available; sampling all of them")
sample = gss_valid.sample(n=n_draw, random_state=42)  # fixed seed -> reproducible subset

personas = generator.from_dataframe(sample)
print(f"Generated {len(personas)} personas")

# Show example persona (skipped when nothing was generated, so the cell does
# not fail with an IndexError on an empty result)
if len(personas) > 0:
    print("\nExample persona prompt:")
    print(personas[0].to_prompt())

## 2. Query LLM for Each Persona

In [None]:
# Survey client used for every LLM query issued from this notebook.
survey = LLMSurvey(
    model='gpt-4o-mini',
)

async def run_survey(personas, question, scale):
    """Ask the LLM one Likert question per persona and collect the answers.

    Personas are queried one at a time, in order, through the module-level
    ``survey`` object; each query is awaited before the next one starts.

    Args:
        personas: Iterable of persona objects passed to ``survey.query``.
        question: Question text sent with every query.
        scale: Response scale forwarded as the ``scale`` argument.

    Returns:
        A list with one dict per persona, in input order, holding the keys
        ``llm_raw``, ``llm_parsed``, ``tokens`` and ``cost``.
    """

    def as_record(reply):
        # Flatten the response object into a plain dict.
        return {
            'llm_raw': reply.raw_response,
            'llm_parsed': reply.parsed_response,
            'tokens': reply.tokens_total,
            'cost': reply.cost_usd,
        }

    return [
        as_record(
            await survey.query(
                persona=p,
                question=question,
                response_type='likert',
                scale=scale,
            )
        )
        for p in tqdm(personas, desc="Querying LLM")
    ]

# Run the survey (uncomment when ready to spend API credits)
# results = await run_survey(personas, question_text, response_scale)
# print(f"Total cost: ${sum(r['cost'] for r in results):.4f}")

## 3. Compare LLM vs. Actual Responses

In [None]:
# The LLM-vs-actual comparison is added here once the LLM queries have run.
# Until then, only the observed GSS answer shares are reported.

actual_dist = (
    gss_valid[question_id]
    .value_counts(normalize=True)
    .sort_index()
)
print(f"GSS {question_id} distribution:", actual_dist, sep="\n")

## 4. Fit Calibration Model

In [None]:
# Placeholder for calibration fitting
# Will be implemented after LLM query results are collected

# calibrator = Calibrator(n_categories=len(response_scale))
# calibrator.fit(llm_responses, actual_responses, demographic_features)

## 5. Evaluate Calibration Quality

In [None]:
# Placeholder for evaluation
# result = calibrator.evaluate(test_llm, test_actual, test_features)
# print(f"CRPS: {result.crps:.4f}")
# print(f"Pinball losses: {result.pinball_losses}")