# Results Analysis: Model Performance Comparison

This notebook analyzes and compares the performance of different genomic foundation models.

In [None]:
import sys
sys.path.append('..')

import json
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, roc_auc_score

# Use seaborn's 'whitegrid' style for every figure drawn below.
sns.set_style('whitegrid')
# Default figure size; the plotting cells below pass their own figsize=,
# which takes precedence over this default.
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## Load Results

In [None]:
def load_all_results(results_dir='../results'):
    """Load every ``*.json`` result file found directly inside ``results_dir``.

    Files are read in sorted path order so the returned list (and every table
    or plot derived from it) comes out the same on each run; ``Path.glob``
    on its own does not guarantee any particular order.

    Args:
        results_dir: Directory to scan (non-recursively) for ``*.json`` files.

    Returns:
        A list with one dict per file whose top-level JSON value is an
        object. Each dict is the parsed content plus a ``'filename'`` key
        holding the file's base name (overwriting any key of that name in
        the file). Files whose top-level value is not an object are skipped
        with a printed notice.

    Raises:
        json.JSONDecodeError: If a matching file does not contain valid JSON.
    """
    results = []
    for path in sorted(Path(results_dir).glob('*.json')):
        # Read as UTF-8 explicitly instead of depending on the locale default.
        with open(path, encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, dict):
            # A list/scalar payload cannot take the 'filename' key below and
            # cannot be queried by key later, so skip it rather than crash.
            print(f"Skipping {path.name}: top-level JSON value is not an object")
            continue
        data['filename'] = path.name
        results.append(data)
    return results

# Read all result files from the default '../results' directory; the
# comparison and per-region cells below iterate over `results`.
results = load_all_results()
# Report how many files were loaded.
print(f"Loaded {len(results)} result files")

## Overall Performance Comparison

In [None]:
# Extract metrics for comparison
metric_names = ['auroc', 'auprc', 'mcc', 'f1']

# Fixed schema for the comparison table. Passing it to the DataFrame
# constructor keeps these columns even when no result has an 'overall'
# section; without it the frame would have no columns at all and filtering
# on 'method' / 'negative_set' would raise KeyError.
comparison_columns = ['model', 'method', 'negative_set'] + [
    column for metric in metric_names for column in (metric, f'{metric}_std')
]

comparison_data = []

for result in results:
    if 'overall' not in result:
        continue  # nothing to compare for this file

    row = {
        'model': result.get('model', 'unknown'),
        'method': result.get('method', 'unknown'),
        'negative_set': result.get('negative_set', 'unknown'),
    }

    # Handle both direct values and mean/std dicts
    overall = result['overall']
    for metric in metric_names:
        if metric not in overall:
            continue  # missing metrics are filled with NaN by the constructor
        val = overall[metric]
        if isinstance(val, dict):
            # Aggregated form: {'mean': ..., 'std': ...}
            row[metric] = val.get('mean', np.nan)
            row[f'{metric}_std'] = val.get('std', np.nan)
        else:
            # Single value: no spread available
            row[metric] = val
            row[f'{metric}_std'] = np.nan

    comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data, columns=comparison_columns)
comparison_df.head(10)

In [None]:
# Plot AUROC comparison
def _plot_auroc_panel(ax, data, columns, title):
    """Draw one horizontal grouped-bar panel of AUROC per model.

    Args:
        ax: Matplotlib axes to draw on.
        data: Rows of the comparison table to plot; must have 'model',
            'auroc' and the column named by ``columns``.
        columns: Name of the column whose values become the bar groups
            (one legend entry per value).
        title: Panel title.
    """
    data = data.dropna(subset=['auroc'])
    if len(data) > 0:
        # pivot_table averages repeated (model, group) pairs, e.g. several
        # result files for the same configuration; DataFrame.pivot raises
        # ValueError on such duplicates.
        pivot = data.pivot_table(index='model', columns=columns,
                                 values='auroc', aggfunc='mean')
        pivot.plot(kind='barh', ax=ax)
        ax.axvline(x=0.5, color='red', linestyle='--', alpha=0.5, label='Random')
        ax.legend()
    else:
        # Say so explicitly instead of leaving an unexplained blank panel.
        ax.text(0.5, 0.5, 'No data', ha='center', va='center',
                transform=ax.transAxes)

    # Set after plotting so they are not replaced by labels chosen by pandas.
    ax.set_title(title)
    ax.set_xlabel('AUROC')
    ax.set_ylabel('Model')


fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Zero-shot performance
zs_data = comparison_df[comparison_df['method'] == 'zero_shot']
_plot_auroc_panel(axes[0], zs_data, 'negative_set',
                  'Zero-Shot AUROC by Model and Negative Set')

# Method comparison (for one negative set)
n3_data = comparison_df[comparison_df['negative_set'] == 'N3']
_plot_auroc_panel(axes[1], n3_data, 'method',
                  'Performance by Method (Negative Set: N3)')

plt.tight_layout()
plt.show()

## Per-Region Performance

In [None]:
# Build one record per (model, negative set, region) from the zero-shot results
region_data = []

for result in results:
    # Only zero-shot results that carry a per-region breakdown contribute rows.
    if 'per_region' not in result or result.get('method') != 'zero_shot':
        continue

    model = result.get('model', 'unknown')
    neg_set = result.get('negative_set', 'unknown')

    for region, metrics in result['per_region'].items():
        record = dict(
            model=model,
            negative_set=neg_set,
            region=region,
            auroc=metrics.get('auroc', np.nan),  # NaN when the metric is absent
            auprc=metrics.get('auprc', np.nan),
        )
        region_data.append(record)

region_df = pd.DataFrame(region_data)
region_df.head()

In [None]:
# Plot per-region performance
# Filter for one negative set. region_df is built from a plain list of
# records, so when there are no records it has no columns and cannot be
# filtered; in that case use it as-is (empty).
plot_data = region_df[region_df['negative_set'] == 'N3'] if len(region_df) > 0 else region_df

# Guard on the rows that are actually plotted, not on the unfiltered table,
# so a table without any N3 rows does not produce an empty figure.
if len(plot_data) > 0:
    fig, ax = plt.subplots(figsize=(14, 6))

    sns.barplot(data=plot_data, x='region', y='auroc', hue='model', ax=ax)
    ax.set_title('Per-Region AUROC (Negative Set: N3)')
    ax.set_xlabel('Non-coding Region')
    ax.set_ylabel('AUROC')
    # Reference line at chance level
    ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5)
    # Rotate the region names through the axes object rather than pyplot state.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    # Put the legend outside the plot area so it does not cover the bars.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()
else:
    print("No per-region zero-shot results for negative set N3; skipping plot")

## ROC Curves

In [None]:
# Load score files for ROC curves
def load_scores(results_dir='../results', pattern='scores_*.parquet'):
    """Load score parquet files from ``results_dir`` into a dict of DataFrames.

    Files are read in sorted path order so the dict's insertion order (and
    therefore anything that iterates over it, such as colour assignment in a
    plot) is the same on every run.

    Args:
        results_dir: Directory to scan (non-recursively).
        pattern: Glob pattern selecting the score files.

    Returns:
        Dict mapping a key to the DataFrame read from each file. The key is
        the file stem with a leading ``'scores_'`` removed; only the prefix
        is stripped, so a later ``'scores_'`` inside the name is preserved.
        Stems without that prefix are used unchanged.
    """
    prefix = 'scores_'
    scores = {}
    for path in sorted(Path(results_dir).glob(pattern)):
        stem = path.stem
        # Strip the prefix only; str.replace would also delete any later
        # 'scores_' occurrences and could make two different files collide.
        key = stem[len(prefix):] if stem.startswith(prefix) else stem
        scores[key] = pd.read_parquet(path)
    return scores

# Load the score tables using the default directory and file pattern;
# the ROC cell below iterates over this dict.
scores = load_scores()
print(f"Loaded {len(scores)} score files")
# Show at most the first five keys to keep the output short.
print(f"Keys: {list(scores.keys())[:5]}")

In [None]:
# Plot ROC curves for comparison
fig, ax = plt.subplots(figsize=(10, 8))

# One colour per score table, sampled from the tab10 colormap.
colors = plt.cm.tab10(np.linspace(0, 1, len(scores)))

for (key, df), color in zip(scores.items(), colors):
    # Tables without both columns cannot be scored; skip them.
    if 'label' not in df.columns or 'score' not in df.columns:
        continue

    # Drop rows with a missing label or score so one incomplete row does not
    # abort the whole figure (the metric functions are expected to reject NaN).
    pairs = df[['label', 'score']].dropna()

    # A ROC curve is undefined unless both classes are present.
    if pairs['label'].nunique() < 2:
        print(f"Skipping {key}: need both classes among rows with a label and a score")
        continue

    fpr, tpr, _ = roc_curve(pairs['label'], pairs['score'])
    auc = roc_auc_score(pairs['label'], pairs['score'])

    ax.plot(fpr, tpr, label=f'{key} (AUC={auc:.3f})',
            color=color, linewidth=1.5)

# Diagonal = performance of a random classifier
ax.plot([0, 1], [0, 1], 'k--', alpha=0.3, label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves: Model Comparison')
ax.legend(loc='lower right', fontsize=8)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Statistical Summary

In [None]:
# Mean AUROC and number of contributing rows for every
# (model, method, negative set) combination, best first.
summary = (
    comparison_df
    .groupby(['model', 'method', 'negative_set'])['auroc']
    .agg(['mean', 'count'])
    .reset_index()
    .sort_values('mean', ascending=False)
)

# Emit the banner, table and closing rule in a single call, one item per line.
print(
    "\n" + "="*80,
    "MODEL PERFORMANCE SUMMARY (AUROC)",
    "="*80,
    summary.to_string(index=False),
    "="*80,
    sep="\n",
)

In [None]:
# Best model per negative set
print("\n" + "="*80)
print("BEST MODEL PER NEGATIVE SET (Zero-Shot)")
print("="*80)

zs_results = comparison_df[comparison_df['method'] == 'zero_shot']
for neg_set in zs_results['negative_set'].unique():
    subset = zs_results[zs_results['negative_set'] == neg_set]
    print(f"\n{neg_set}:")

    # idxmax cannot pick a winner when the group has no usable AUROC
    # (all values missing), so report that instead of failing.
    valid_auroc = subset['auroc'].dropna()
    if valid_auroc.empty:
        print("  No valid AUROC values")
        continue

    # On ties idxmax returns the first row holding the maximum.
    best = subset.loc[valid_auroc.idxmax()]
    print(f"  Model: {best['model']}")
    print(f"  AUROC: {best['auroc']:.4f}")

    # pd.notna also copes with None / non-float entries, for which
    # np.isnan raises TypeError.
    auroc_std = best.get('auroc_std', np.nan)
    if pd.notna(auroc_std):
        print(f"  Std:   {auroc_std:.4f}")

print("="*80)