# SGI Semantic Laziness: Main Paper Experiments

This notebook reproduces the main experimental results from:

> Marín, J. (2026). "Semantic Grounding Index: Geometric Bounds on Context Engagement in RAG Systems" [arXiv:2512.13771](https://arxiv.org/abs/2512.13771)

## Setup

In [None]:
# Uncomment to install dependencies
# %pip install -q datasets sentence-transformers numpy pandas matplotlib seaborn scipy scikit-learn tqdm

In [None]:
import sys
import gc
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sentence_transformers import SentenceTransformer
from scipy import stats
from sklearn.metrics import roc_auc_score, roc_curve

# Local imports
from sgi import (
    compute_sgi,
    load_halueval_qa,
    load_truthfulqa,
    print_dataset_summary,
    compute_effect_size,
    compute_cohens_d,
    compute_correlation_matrix,
    compute_pairwise_correlations,
    compute_calibration,
    compute_stratified_analysis,
    compute_subgroup_analysis,
    set_publication_style,
    plot_correlation_heatmap,
)

# Call the plotting-style helper imported from the local `sgi` package
# (presumably sets shared matplotlib/seaborn defaults — confirm in `sgi`),
# then confirm that the imports above succeeded.
set_publication_style()
print('Setup complete.')

## Configuration

In [None]:
# Upper bound on the number of HaluEval samples to load (the paper uses n=5,000)
MAX_SAMPLES = 5000

# Short model label -> model id passed to SentenceTransformer.
# These are the embedding models tested in the paper.
EMBEDDING_MODELS = dict([
    ('mpnet', 'all-mpnet-base-v2'),        # 768d, Sentence-Transformers
    ('minilm', 'all-MiniLM-L6-v2'),        # 384d, Sentence-Transformers (distilled)
    ('bge', 'BAAI/bge-base-en-v1.5'),      # 768d, BAAI contrastive
    ('e5', 'intfloat/e5-base-v2'),         # 768d, Microsoft instruction-tuned
    ('gte', 'thenlper/gte-base'),          # 768d, Alibaba general text
])

# Echo the active configuration so it is recorded in the cell output
print('Configuration:')
print('  MAX_SAMPLES: {}'.format(MAX_SAMPLES))
print('  Models: {}'.format(list(EMBEDDING_MODELS)))

## Section 1: Data Loading

Load HaluEval QA dataset with stratified sampling by hallucination label.

In [None]:
# Fetch up to MAX_SAMPLES HaluEval QA cases and print the dataset overview
cases = load_halueval_qa(max_samples=MAX_SAMPLES)
print_dataset_summary(cases, 'HaluEval QA')

# Preview the first case; each text field is cut to a fixed number of characters
print('\n=== EXAMPLE ===')
ex = cases[0]
for field_label, field_name, max_chars in (
    ('Question', 'question', 100),
    ('Context', 'context', 150),
    ('Response', 'response', 150),
):
    print(f'{field_label}: {getattr(ex, field_name)[:max_chars]}...')
print(f'Is Grounded: {ex.is_grounded}')

## Section 2: SGI Computation Across Models

For each embedding model, compute:
$$\text{SGI} = \frac{\theta(r, q)}{\theta(r, c)}$$

where $\theta$ is angular distance and $r, q, c$ are the L2-normalized embeddings of the response, question, and context, respectively.

In [None]:
def compute_sgi_for_model(cases, model_name: str, model_id: str) -> pd.DataFrame:
    """Compute SGI for all cases using the specified embedding model.

    The question, context and response of every case are embedded separately
    with a ``SentenceTransformer`` loaded from ``model_id`` and handed to
    ``compute_sgi``. Cases that raise during embedding or scoring are skipped
    (best effort), but the number of skipped cases and the first error are
    reported instead of being silently discarded.

    Args:
        cases: Iterable of case objects exposing ``id``, ``question``,
            ``context``, ``response`` and ``is_grounded``.
        model_name: Short label used as the suffix of the per-model columns.
        model_id: Model identifier passed to ``SentenceTransformer``.

    Returns:
        DataFrame with one row per successfully processed case and the columns
        ``id``, ``is_grounded``, ``question_length``, ``context_length``,
        ``response_length``, ``sgi_<model_name>``, ``theta_rq_<model_name>``,
        ``theta_rc_<model_name>`` and ``theta_qc_<model_name>``. The columns
        are present even when no case could be processed, so downstream
        merges on ``id`` keep working.
    """
    columns = [
        'id', 'is_grounded', 'question_length', 'context_length', 'response_length',
        f'sgi_{model_name}', f'theta_rq_{model_name}',
        f'theta_rc_{model_name}', f'theta_qc_{model_name}',
    ]

    print(f'\n{"="*60}')
    print(f'Model: {model_name} ({model_id})')
    print(f'{"="*60}')

    encoder = SentenceTransformer(model_id)
    print(f'Embedding dimension: {encoder.get_sentence_embedding_dimension()}')

    results = []
    n_failed = 0
    first_error = None

    try:
        for case in tqdm(cases, desc=f'Computing SGI ({model_name})'):
            try:
                q_emb = encoder.encode(case.question)
                c_emb = encoder.encode(case.context)
                r_emb = encoder.encode(case.response)

                sgi_result = compute_sgi(q_emb, c_emb, r_emb)

                results.append({
                    'id': case.id,
                    'is_grounded': case.is_grounded,
                    'question_length': len(case.question),
                    'context_length': len(case.context),
                    'response_length': len(case.response),
                    f'sgi_{model_name}': sgi_result.sgi,
                    f'theta_rq_{model_name}': sgi_result.theta_rq,
                    f'theta_rc_{model_name}': sgi_result.theta_rc,
                    f'theta_qc_{model_name}': sgi_result.theta_qc,
                })
            except Exception as exc:
                # Keep the best-effort behaviour (skip the case) but remember
                # that it happened so dropped samples are visible in the output.
                n_failed += 1
                if first_error is None:
                    first_error = f'{type(exc).__name__}: {exc}'
    finally:
        # Release the model even if the loop is interrupted
        del encoder
        gc.collect()

    print(f'Processed: {len(results)} samples')
    if n_failed:
        print(f'Skipped: {n_failed} samples (first error: {first_error})')

    # Passing `columns` guarantees the schema even for an empty result list
    return pd.DataFrame(results, columns=columns)

In [None]:
# Build one SGI results table per embedding model, keyed by the model's short name
model_dfs = {}

for model_name in EMBEDDING_MODELS:
    model_id = EMBEDDING_MODELS[model_name]
    model_dfs[model_name] = compute_sgi_for_model(cases, model_name, model_id)

print('\n\nCompleted analysis for {} models.'.format(len(model_dfs)))

In [None]:
# Merge the per-model results into a single dataframe keyed by case id.
# The case-level columns are taken from the first model's table; the inner
# joins then keep only cases that every model scored successfully.
model_names = list(model_dfs.keys())
shared_cols = ['id', 'is_grounded', 'question_length', 'context_length', 'response_length']
merged_df = model_dfs[model_names[0]][shared_cols].copy()

for model_name, df in model_dfs.items():
    # Match the exact `_<model>` suffix rather than a substring: a bare
    # `model_name in column` test would also pick up any unrelated column
    # whose name merely contains the model label.
    model_suffix = f'_{model_name}'
    cols_to_merge = ['id'] + [c for c in df.columns if c.endswith(model_suffix)]
    merged_df = merged_df.merge(df[cols_to_merge], on='id', how='inner')

# Cast to bool before negating so `~` is a logical NOT regardless of the
# stored dtype (on integer or object columns `~` is a bitwise operation).
grounded_mask = merged_df['is_grounded'].astype(bool)

print(f'Merged: {len(merged_df)} samples with SGI from {len(model_names)} models')
print(f'  Grounded: {grounded_mask.sum()}')
print(f'  Hallucinated: {(~grounded_mask).sum()}')

## Section 3: Table 1 - Cross-Model Effect Sizes

Reproducing Table 1 from the paper:

| Model | SGI (Valid) | SGI (Halluc) | Cohen's d | AUC | p-value |
|-------|-------------|--------------|-----------|-----|---------|

In [None]:
# Table 1: per-model separation between grounded and hallucinated SGI scores.

# Kept outside the f-string: a backslash-escaped quote inside an f-string
# replacement field is a SyntaxError on Python versions before 3.12.
cohens_d_header = "Cohen's d"

print('='*80)
print('TABLE 1: CROSS-MODEL EFFECT SIZES (n={})'.format(len(merged_df)))
print('='*80)
print(f'\n{"Model":<10} | {"SGI (Valid)":>12} | {"SGI (Halluc)":>12} | {cohens_d_header:>10} | {"AUROC":>8} | {"p-value":>12}')
print('-'*80)

effect_sizes = {}      # model name -> effect-size result as a dict
table1_results = []    # one record per model, mirroring the printed table

# The labels are the same for every model, so fetch them once
labels = merged_df['is_grounded'].values

for model_name in model_names:
    sgi_col = f'sgi_{model_name}'
    values = merged_df[sgi_col].values

    result = compute_effect_size(values, labels, model_name)
    effect_sizes[model_name] = result.to_dict()

    # Conventional significance stars appended after the p-value
    sig = '***' if result.p_value < 0.001 else '**' if result.p_value < 0.01 else '*' if result.p_value < 0.05 else ''
    print(f'{model_name:<10} | {result.grounded_mean:>12.3f} | {result.hallucinated_mean:>12.3f} | '
          f'{result.cohens_d:>+10.2f} | {result.auroc:>8.3f} | {result.p_value:>10.2e} {sig}')

    table1_results.append({
        'Model': model_name,
        'SGI (Valid)': result.grounded_mean,
        'SGI (Halluc)': result.hallucinated_mean,
        "Cohen's d": result.cohens_d,
        'AUROC': result.auroc,
        'p-value': result.p_value,
    })

print('-'*80)

# Summary statistics across models
d_values = [r['cohens_d'] for r in effect_sizes.values()]
auc_values = [r['auroc'] for r in effect_sizes.values()]
valid_means = [r['grounded_mean'] for r in effect_sizes.values()]
halluc_means = [r['hallucinated_mean'] for r in effect_sizes.values()]

print(f'{"Mean":<10} | {np.mean(valid_means):>12.3f} | {np.mean(halluc_means):>12.3f} | '
      f'{np.mean(d_values):>+10.2f} | {np.mean(auc_values):>8.3f} | {"—":>12}')
print('='*80)

print(f'\nSummary:')
print(f"  Cohen's d: mean={np.mean(d_values):.2f}, range=[{min(d_values):.2f}, {max(d_values):.2f}]")
print(f'  AUROC: mean={np.mean(auc_values):.3f}, range=[{min(auc_values):.3f}, {max(auc_values):.3f}]')

## Section 4: Figure 2 - Cross-Model Correlation Matrix

Compute Pearson correlation between SGI scores from different models.
Target: mean off-diagonal r ≈ 0.85

In [None]:
banner = '=' * 70
print(banner)
print('FIGURE 2: CROSS-MODEL CORRELATION MATRIX')
print(banner)

# One SGI column per model, relabelled from `sgi_<model>` to the bare model name
sgi_cols = [f'sgi_{m}' for m in model_names]
sgi_df = merged_df[sgi_cols].copy()
sgi_df.columns = model_names

# Linear (Pearson) and rank (Spearman) agreement between the models' scores
pearson_matrix = compute_correlation_matrix(sgi_df, model_names, method='pearson')
spearman_matrix = compute_correlation_matrix(sgi_df, model_names, method='spearman')

# Summaries of each matrix; the 'mean', 'min' and 'max' entries are printed below
pearson_stats = compute_pairwise_correlations(pearson_matrix)
spearman_stats = compute_pairwise_correlations(spearman_matrix)

print('\nPearson Correlation Matrix:')
print(pearson_matrix.round(3).to_string())
print('\nOff-diagonal Pearson r: mean={:.3f}, range=[{:.3f}, {:.3f}]'.format(
    pearson_stats['mean'], pearson_stats['min'], pearson_stats['max']))

print('\n\nSpearman Correlation Matrix:')
print(spearman_matrix.round(3).to_string())
print('\nOff-diagonal Spearman ρ: mean={:.3f}, range=[{:.3f}, {:.3f}]'.format(
    spearman_stats['mean'], spearman_stats['min'], spearman_stats['max']))

In [None]:
# Draw the two correlation matrices side by side and save the figure to disk
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

heatmap_panels = (
    (pearson_matrix, 'Pearson Correlation (Linear Agreement)'),
    (spearman_matrix, 'Spearman Correlation (Ranking Agreement)'),
)
for panel_ax, (corr_matrix, panel_title) in zip(axes, heatmap_panels):
    plot_correlation_heatmap(corr_matrix, panel_ax, title=panel_title)

fig.suptitle('Figure 2: Cross-Model SGI Correlation (n={})'.format(len(merged_df)),
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('figure2_correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

## Section 5: Table 2 - Stratified Analysis by θ(q,c)

Effect size should increase monotonically with question-context separation, confirming the triangle inequality prediction.

In [None]:
# Table 2: effect size of SGI within terciles of the question-context angle.

print('='*90)
print('TABLE 2: STRATIFIED ANALYSIS BY θ(q,c) TERCILE')
print('='*90)

# Use primary model (mpnet) for stratified analysis.
# `primary_model` is also read by the later subgroup, calibration and TruthfulQA cells.
primary_model = 'mpnet'
analysis_df = merged_df[['is_grounded', f'sgi_{primary_model}', f'theta_qc_{primary_model}']].copy()
analysis_df.columns = ['is_grounded', 'sgi', 'theta_qc']

stratified_results = compute_stratified_analysis(
    df=analysis_df,
    stratify_col='theta_qc',
    sgi_col='sgi',
    label_col='is_grounded',
    n_bins=3
)

# Kept outside the f-string: a backslash-escaped quote inside an f-string
# replacement field is a SyntaxError on Python versions before 3.12.
cohens_d_header = "Cohen's d"

print(f'\n{"θ(q,c) Tercile":<15} | {"n":>6} | {"θ(q,c) Range":>18} | {"SGI (Valid)":>12} | {"SGI (Halluc)":>12} | {cohens_d_header:>10} | {"AUROC":>8}')
print('-'*95)

for _, row in stratified_results.iterrows():
    range_str = f'[{row["range_min"]:.2f}, {row["range_max"]:.2f}]'
    print(f'{row["stratum"]:<15} | {row["n"]:>6} | {range_str:>18} | {row["sgi_grounded"]:>12.2f} | '
          f'{row["sgi_hallucinated"]:>12.2f} | {row["cohens_d"]:>+10.2f} | {row["auroc"]:>8.3f}')

print('-'*95)
# NOTE(review): this conclusion is printed unconditionally — it is not derived
# from `stratified_results`; check it against the Cohen's d column above.
print('\nKey insight: Effect size INCREASES monotonically with θ(q,c), confirming triangle inequality prediction.')

## Section 6: Table 3 - Subgroup Analysis

Analyze effect sizes by question length, context length, and response length terciles.

In [None]:
# Table 3: effect size of SGI within terciles of question/context/response length.

print('='*90)
print('TABLE 3: SUBGROUP ANALYSIS BY TEXT LENGTH')
print('='*90)

# Prepare analysis dataframe (primary-model SGI plus the three length features)
subgroup_df = merged_df[['is_grounded', f'sgi_{primary_model}',
                         'question_length', 'context_length', 'response_length']].copy()
subgroup_df.columns = ['is_grounded', 'sgi', 'question_length', 'context_length', 'response_length']

all_subgroup_results = []

for feature in ['question_length', 'context_length', 'response_length']:
    feature_results = compute_subgroup_analysis(
        df=subgroup_df,
        feature_col=feature,
        sgi_col='sgi',
        label_col='is_grounded',
        n_bins=3
    )
    all_subgroup_results.append(feature_results)

# Stack the per-feature tables into one frame for printing
combined_subgroup = pd.concat(all_subgroup_results, ignore_index=True)

# Kept outside the f-string: a backslash-escaped quote inside an f-string
# replacement field is a SyntaxError on Python versions before 3.12.
cohens_d_header = "Cohen's d"

print(f'\n{"Feature":<20} | {"Subgroup":<8} | {"n":>6} | {cohens_d_header:>10} | {"AUROC":>8}')
print('-'*65)

for _, row in combined_subgroup.iterrows():
    print(f'{row["feature"]:<20} | {row["subgroup"]:<8} | {row["n"]:>6} | {row["cohens_d"]:>+10.2f} | {row["auroc"]:>8.3f}')

print('-'*65)
# NOTE(review): the findings below are fixed text — they are not computed from
# `combined_subgroup`; check them against the table printed above.
print('\nKey findings:')
print('  - Response length: d increases from Short to Long (stronger signal for longer responses)')
print('  - Question length: d decreases from Short to Long')
print('  - Context length: d relatively stable')

## Section 7: Calibration Analysis (Figure 5)

Compute Expected Calibration Error (ECE) and plot reliability diagram.
Target: ECE ≈ 0.10

In [None]:
rule = '=' * 70
print(rule)
print('FIGURE 5: CALIBRATION ANALYSIS')
print(rule)

# Calibration inputs: the primary model's SGI scores and the grounded labels
sgi_scores = merged_df[f'sgi_{primary_model}'].values
labels = merged_df['is_grounded'].values

# compute_calibration returns the ECE plus per-bin accuracy, confidence and count
calibration_output = compute_calibration(scores=sgi_scores, labels=labels, n_bins=10)
ece, bin_accuracies, bin_confidences, bin_counts = calibration_output

print('\nExpected Calibration Error (ECE): {:.3f}'.format(ece))
print('\nBin-wise analysis:')
print(f'{"Bin":>4} | {"Count":>8} | {"Confidence":>12} | {"Accuracy":>10} | {"Gap":>8}')
print('-' * 55)

# One row per bin; the gap is the absolute confidence/accuracy difference
per_bin = zip(bin_confidences, bin_accuracies, bin_counts)
for bin_number, (conf, acc, count) in enumerate(per_bin, start=1):
    print(f'{bin_number:>4} | {count:>8} | {conf:>12.3f} | {acc:>10.3f} | {abs(conf - acc):>8.3f}')

In [None]:
# Plot calibration figures: reliability diagram (left) and hallucination rate
# per SGI quantile bin (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Reliability diagram: bars show the per-bin accuracy, the red line the per-bin
# confidence returned by compute_calibration.
ax = axes[0]
bin_positions = np.arange(len(bin_confidences))
ax.bar(bin_positions, bin_accuracies, width=0.8, alpha=0.7, label='Actual', color='#3498db')
ax.plot(bin_positions, bin_confidences, 'ro-', label='Predicted', markersize=8)
ax.plot([-0.5, len(bin_confidences)-0.5], [0, 1], 'k--', alpha=0.3, label='Perfect calibration')
ax.set_xlabel('Bin')
ax.set_ylabel('Hallucination Rate')
ax.set_title(f'Reliability Diagram (ECE = {ece:.3f})')
ax.legend()
ax.set_xlim(-0.5, len(bin_confidences)-0.5)
ax.set_ylim(0, 1)

# Hallucination rate by SGI decile.
# With duplicates='drop', qcut can return FEWER than 10 bins when quantile
# edges coincide. Group by the bins that actually exist instead of looping
# over range(10), which would draw phantom zero-rate bars for missing bins.
ax = axes[1]
deciles = pd.qcut(merged_df[f'sgi_{primary_model}'], 10, labels=False, duplicates='drop')
# Cast to bool first so `~` is a logical NOT whatever dtype the column has
is_hallucinated = ~merged_df['is_grounded'].astype(bool)
halluc_rates = is_hallucinated.groupby(deciles).mean().sort_index().tolist()

ax.bar(range(len(halluc_rates)), halluc_rates, color='#e74c3c', alpha=0.7)
ax.set_xlabel(f'SGI Decile (1=lowest, {len(halluc_rates)}=highest)')
ax.set_ylabel('Hallucination Rate')
ax.set_title('Hallucination Rate by SGI Decile')
ax.set_xticks(range(len(halluc_rates)))
ax.set_xticklabels([str(i+1) for i in range(len(halluc_rates))])

fig.suptitle('Figure 5: Calibration Analysis', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('figure5_calibration.png', dpi=300, bbox_inches='tight')
plt.show()

## Section 8: TruthfulQA Negative Result (Table 4)

SGI should NOT work on TruthfulQA because angular geometry measures topical engagement, not factual accuracy.
Expected: AUC ≈ 0.478 (below chance)

In [None]:
# Table 4 (part 1): score TruthfulQA cases with the primary embedding model.

print('='*70)
print('TABLE 4: TRUTHFULQA NEGATIVE RESULT')
print('='*70)

# Load TruthfulQA
truthfulqa_cases = load_truthfulqa(max_samples=2000)
print_dataset_summary(truthfulqa_cases, 'TruthfulQA')

# Compute SGI for TruthfulQA using primary model
encoder = SentenceTransformer(EMBEDDING_MODELS[primary_model])

truthfulqa_results = []
truthfulqa_failed = 0
truthfulqa_first_error = None

try:
    for case in tqdm(truthfulqa_cases, desc='Computing SGI for TruthfulQA'):
        try:
            q_emb = encoder.encode(case.question)
            c_emb = encoder.encode(case.context)
            r_emb = encoder.encode(case.response)

            sgi_result = compute_sgi(q_emb, c_emb, r_emb)

            truthfulqa_results.append({
                'id': case.id,
                'is_grounded': case.is_grounded,
                'sgi': sgi_result.sgi,
            })
        except Exception as exc:
            # `except Exception` (not a bare `except:`) so Ctrl-C / kernel
            # interrupts still stop the loop. Failed cases are skipped as
            # before, but counted so the drop is visible in the output.
            truthfulqa_failed += 1
            if truthfulqa_first_error is None:
                truthfulqa_first_error = f'{type(exc).__name__}: {exc}'
finally:
    # Release the model even if the loop is interrupted
    del encoder
    gc.collect()

if truthfulqa_failed:
    print(f'Skipped: {truthfulqa_failed} samples (first error: {truthfulqa_first_error})')

# Explicit columns keep the schema stable even if every case was skipped
truthfulqa_df = pd.DataFrame(truthfulqa_results, columns=['id', 'is_grounded', 'sgi'])

In [None]:
# Effect size of SGI between the two TruthfulQA label groups
tqa_values = truthfulqa_df['sgi'].values
tqa_labels = truthfulqa_df['is_grounded'].values

tqa_result = compute_effect_size(tqa_values, tqa_labels, 'TruthfulQA')

summary_lines = [
    f'\nTruthfulQA Results (n={len(truthfulqa_df)}):',
    f'  SGI (Truthful): {tqa_result.grounded_mean:.3f}',
    f'  SGI (False): {tqa_result.hallucinated_mean:.3f}',
    f"  Cohen's d: {tqa_result.cohens_d:+.3f}",
    f'  AUROC: {tqa_result.auroc:.3f}',
    f'  p-value: {tqa_result.p_value:.3f}',
]
print('\n'.join(summary_lines))

# Verdict: an AUROC below 0.55 is reported as the expected negative result
rule = '=' * 70
verdict_lines = (
    [
        'EXPECTED RESULT: SGI fails on TruthfulQA (AUC near/below 0.5)',
        'This confirms that angular geometry measures TOPICAL ENGAGEMENT,',
        'not FACTUAL ACCURACY. Misconceptions are semantically close to questions.',
    ]
    if tqa_result.auroc < 0.55
    else ['UNEXPECTED: SGI shows some signal on TruthfulQA.']
)

print('\n' + rule)
for verdict_line in verdict_lines:
    print(verdict_line)
print(rule)

## Section 9: Signal Decomposition (Table 5)

Analyze which component drives the SGI signal: θ(r,q) or θ(r,c)?

Key insight: Semantic laziness is driven by hallucinations being CLOSER to questions, not farther from contexts.

In [None]:
# Table 5: compare the effect sizes of the two angles that make up SGI,
# θ(r,q) and θ(r,c), to see which one separates the classes more strongly.

print('='*70)
print('TABLE 5: SIGNAL DECOMPOSITION')
print('='*70)
print(f'\n{"Model":<10} | {"d(θr,q)":>10} | {"d(θr,c)":>10} | Primary Driver')
print('-'*55)

decomposition_results = []

# The labels are the same for every model, so fetch them once
labels = merged_df['is_grounded'].values

for model_name in model_names:
    theta_rq_col = f'theta_rq_{model_name}'
    theta_rc_col = f'theta_rc_{model_name}'

    # Note: For theta_rq and theta_rc, lower values for hallucinations means they're closer
    d_rq = compute_effect_size(merged_df[theta_rq_col].values, labels, 'θ_rq',
                               positive_class_is_hallucinated=False)
    d_rc = compute_effect_size(merged_df[theta_rc_col].values, labels, 'θ_rc',
                               positive_class_is_hallucinated=False)

    # The driver is the angle with the larger absolute effect size
    driver = 'θ(r,q)' if abs(d_rq.cohens_d) > abs(d_rc.cohens_d) else 'θ(r,c)'
    print(f'{model_name:<10} | {d_rq.cohens_d:>+10.2f} | {d_rc.cohens_d:>+10.2f} | {driver}')

    decomposition_results.append({
        'Model': model_name,
        'd(θr,q)': d_rq.cohens_d,
        'd(θr,c)': d_rc.cohens_d,
        'Primary Driver': driver
    })

print('-'*55)
print('\nInterpretation:')
print('  • Negative d(θ_rq): Hallucinations are CLOSER to question')
print('  • Positive d(θ_rc): Hallucinations are FARTHER from context')
print('  • Both effects together = "semantic laziness" signature')

# Derive the closing claim from the table instead of asserting it
# unconditionally, so the text cannot contradict the computed drivers.
n_rq_driven = sum(r['Primary Driver'] == 'θ(r,q)' for r in decomposition_results)
if decomposition_results and n_rq_driven == len(decomposition_results):
    print('  • θ(r,q) is the PRIMARY driver across all models')
else:
    print(f'  • θ(r,q) is the primary driver for {n_rq_driven} of {len(decomposition_results)} models')