In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only ship the unprefixed name. Use whichever one this
# installation provides instead of failing on a version mismatch.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither bundled style sheet exists: let seaborn apply its own whitegrid style.
    sns.set_style('whitegrid')
sns.set_palette('husl')

## Load Results

In [None]:
# Read the experiment results CSV into `df`, report how many rows were loaded,
# and preview the first rows as the cell output.
results_csv = '../model_comparison_results.csv'
df = pd.read_csv(results_csv)
n_results = len(df)
print(f"Loaded {n_results} experiment results")
df.head()

## Overall Model Comparison

In [None]:
# Rank every row by its 'Overall Mean R²' score, highest first.
df_sorted = df.sort_values(by='Overall Mean R²', ascending=False)

# Summary table: show only those summary columns that the results file contains.
summary_cols = [
    'model',
    'Overall Mean R²',
    'epochs',
    'best_epoch',
    'best_test_loss',
    'latent_dim',
]
available_cols = [name for name in summary_cols if name in df.columns]
df_sorted.loc[:, available_cols]

In [None]:
# Horizontal bar chart of Overall Mean R², one bar per row of df_sorted
fig, ax = plt.subplots(figsize=(10, 6))

colors = sns.color_palette('husl', len(df_sorted))
bars = ax.barh(df_sorted['model'], df_sorted['Overall Mean R²'], color=colors)

ax.set_xlabel('Overall Mean R²', fontsize=12)
ax.set_ylabel('Model', fontsize=12)
ax.set_title('Model Comparison - Overall Mean R²', fontsize=14)
# Extend the axis leftwards when any score is negative so those bars stay in view.
ax.set_xlim(left=min(0, df_sorted['Overall Mean R²'].min() - 0.1))

# Add value labels just beyond the tip of each bar: right of the tip for
# non-negative scores, left of the tip for negative ones, so a label never
# sits on top of its own bar.
for bar, val in zip(bars, df_sorted['Overall Mean R²']):
    if val < 0:
        label_x, align = val - 0.01, 'right'
    else:
        label_x, align = val + 0.01, 'left'
    ax.text(label_x, bar.get_y() + bar.get_height()/2,
            f'{val:.4f}', ha=align, va='center', fontsize=10)

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists first.
Path('../figures').mkdir(parents=True, exist_ok=True)
plt.savefig('../figures/model_comparison_overall_r2.png', dpi=150, bbox_inches='tight')
plt.show()

## Per-Target Property Analysis

In [None]:
# Collect the per-target R² columns: every column whose name ends in '(R²)'.
r2_cols = list(filter(lambda name: name.endswith('(R²)'), df.columns))
print(f"Target properties: {len(r2_cols)}", r2_cols, sep="\n")

In [None]:
# Create heatmap of R² scores per model (rows, in df_sorted order) and target (columns)
if r2_cols:
    r2_data = df_sorted.set_index('model')[r2_cols]
    # Strip the ' (R²)' suffix so the tick labels show only the target name.
    r2_data.columns = [col.replace(' (R²)', '') for col in r2_data.columns]

    fig, ax = plt.subplots(figsize=(14, 6))
    # center=0 puts the midpoint of the diverging colormap at R² = 0.
    sns.heatmap(r2_data, annot=True, fmt='.3f', cmap='RdYlGn', center=0,
                ax=ax, cbar_kws={'label': 'R² Score'})
    ax.set_title('R² Score by Model and Target Property', fontsize=14)
    ax.set_xlabel('Target Property', fontsize=12)
    ax.set_ylabel('Model', fontsize=12)
    # Rotate the labels on the heatmap axes explicitly instead of relying on
    # pyplot's implicit "current axes".
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    plt.tight_layout()
    # savefig does not create missing directories; make sure the target exists first.
    Path('../figures').mkdir(parents=True, exist_ok=True)
    plt.savefig('../figures/r2_heatmap.png', dpi=150, bbox_inches='tight')
    plt.show()

## Confidence Intervals Analysis

In [None]:
# Plot R² with confidence intervals for the best model (first row of df_sorted)
best_model = df_sorted.iloc[0]['model']
best_row = df_sorted.iloc[0]

print(f"Best model: {best_model}")

# Extract target properties and their CIs
targets = []
r2_scores = []
ci_lows = []
ci_highs = []

for col in r2_cols:
    target_name = col.replace(' (R²)', '')
    score = best_row[col]
    targets.append(target_name)
    r2_scores.append(score)

    ci_low_col = f"{target_name} (CI Low)"
    ci_high_col = f"{target_name} (CI High)"

    # When a CI column is missing, fall back to the score itself so the bar
    # gets a zero-length error bar. A default of 0 would turn into error
    # lengths of +score and -score, and matplotlib rejects negative yerr.
    ci_lows.append(best_row.get(ci_low_col, score))
    ci_highs.append(best_row.get(ci_high_col, score))

if targets:
    fig, ax = plt.subplots(figsize=(12, 6))

    x = np.arange(len(targets))
    r2_scores = np.array(r2_scores, dtype=float)
    ci_lows = np.array(ci_lows, dtype=float)
    ci_highs = np.array(ci_highs, dtype=float)

    # Error bars: yerr holds distances below/above the bar height, which must
    # be non-negative. Clamp at zero in case a bound lies on the wrong side of
    # the score.
    yerr_low = np.clip(r2_scores - ci_lows, 0, None)
    yerr_high = np.clip(ci_highs - r2_scores, 0, None)

    bars = ax.bar(x, r2_scores, yerr=[yerr_low, yerr_high], capsize=5,
                  color=sns.color_palette('husl', len(targets)), alpha=0.8)

    ax.set_xticks(x)
    ax.set_xticklabels(targets, rotation=45, ha='right')
    ax.set_xlabel('Target Property', fontsize=12)
    ax.set_ylabel('R² Score', fontsize=12)
    ax.set_title(f'R² Scores with 95% CI - {best_model}', fontsize=14)
    # Dashed reference line at R² = 0.
    ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)

    plt.tight_layout()
    # savefig does not create missing directories; make sure the target exists first.
    Path('../figures').mkdir(parents=True, exist_ok=True)
    plt.savefig('../figures/best_model_r2_with_ci.png', dpi=150, bbox_inches='tight')
    plt.show()

## Training Dynamics

In [None]:
# Best epoch analysis: epochs per model (left panel) and best test loss (right panel)
if 'best_epoch' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Numeric bar positions shared by both panels, one slot per row of df_sorted.
    x = np.arange(len(df_sorted))

    # Best epoch vs total epochs
    ax1 = axes[0]
    width = 0.35

    # 'epochs' is optional in the results file; only draw it when present
    # instead of raising KeyError.
    if 'epochs' in df.columns:
        ax1.bar(x - width/2, df_sorted['epochs'], width, label='Total Epochs', alpha=0.7)
    ax1.bar(x + width/2, df_sorted['best_epoch'], width, label='Best Epoch', alpha=0.7)

    ax1.set_xticks(x)
    ax1.set_xticklabels(df_sorted['model'], rotation=45, ha='right')
    ax1.set_ylabel('Epochs')
    ax1.set_title('Total Epochs vs Best Epoch')
    ax1.legend()

    # Best test loss
    ax2 = axes[1]
    if 'best_test_loss' in df.columns:
        # Plot against numeric positions and fix the ticks before labelling.
        # set_xticklabels is only reliable after set_xticks, and this keeps one
        # bar per row even if model names repeat.
        bars = ax2.bar(x, df_sorted['best_test_loss'],
                       color=sns.color_palette('husl', len(df_sorted)))
        ax2.set_xticks(x)
        ax2.set_xticklabels(df_sorted['model'], rotation=45, ha='right')
        ax2.set_ylabel('Best Test Loss')
        ax2.set_title('Best Test Loss by Model')

        # Value label centred on top of each bar.
        for bar, val in zip(bars, df_sorted['best_test_loss']):
            ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height(),
                     f'{val:.4f}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    # savefig does not create missing directories; make sure the target exists first.
    Path('../figures').mkdir(parents=True, exist_ok=True)
    plt.savefig('../figures/training_dynamics.png', dpi=150, bbox_inches='tight')
    plt.show()

## Model Comparison Across All Targets

In [None]:
# Grouped bar chart comparing all models across targets
# Skipped when there are no R² columns or no rows; the per-bar width below
# divides by the number of rows.
if r2_cols and len(df_sorted) > 0:
    fig, ax = plt.subplots(figsize=(16, 8))

    # Size everything from df_sorted, the frame that is actually iterated below.
    n_models = len(df_sorted)
    n_targets = len(r2_cols)
    x = np.arange(n_targets)
    # Each target group takes up 0.8 of its slot, split evenly between the models.
    width = 0.8 / n_models

    colors = sns.color_palette('husl', n_models)

    for i, (_, row) in enumerate(df_sorted.iterrows()):
        values = [row[col] for col in r2_cols]
        # Centre the group of bars on the tick: offsets are symmetric around 0.
        offset = (i - n_models/2 + 0.5) * width
        ax.bar(x + offset, values, width, label=row['model'], color=colors[i], alpha=0.8)

    ax.set_xticks(x)
    ax.set_xticklabels([col.replace(' (R²)', '') for col in r2_cols], rotation=45, ha='right')
    ax.set_xlabel('Target Property', fontsize=12)
    ax.set_ylabel('R² Score', fontsize=12)
    ax.set_title('Model Comparison Across All Target Properties', fontsize=14)
    # Legend goes outside the axes on the right so it does not cover any bars.
    ax.legend(title='Model', bbox_to_anchor=(1.02, 1), loc='upper left')
    ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)

    plt.tight_layout()
    # savefig does not create missing directories; make sure the target exists first.
    Path('../figures').mkdir(parents=True, exist_ok=True)
    plt.savefig('../figures/model_comparison_all_targets.png', dpi=150, bbox_inches='tight')
    plt.show()

## Summary Statistics

In [None]:
# Text report: headline numbers for the top-ranked row, then every row's score
# in df_sorted order.
heavy_rule = "=" * 60
light_rule = "-" * 60

print(heavy_rule)
print("EXPERIMENT SUMMARY")
print(heavy_rule)

top_row = df_sorted.iloc[0]
print(f"\nBest Model: {top_row['model']}")
print(f"Overall Mean R²: {top_row['Overall Mean R²']:.4f}")

# These two columns are optional in the results file.
if 'best_epoch' in df.columns:
    print(f"Best Epoch: {top_row['best_epoch']}")
if 'best_test_loss' in df.columns:
    print(f"Best Test Loss: {top_row['best_test_loss']:.6f}")

print("\n" + light_rule)
print("All Models Ranking:")
print(light_rule)
for _, entry in df_sorted.iterrows():
    print(f"  {entry['model']}: R²={entry['Overall Mean R²']:.4f}")

In [None]:
# Descriptive statistics of each R² column, computed across all rows of df
if r2_cols:
    print("\nR² Score Statistics per Target Property:", "-" * 60, sep="\n")

    describe_labels = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    # describe() yields one column per R² column; transpose so each target is a row.
    stats_df = df.loc[:, r2_cols].describe().transpose()
    stats_df.columns = describe_labels
    # Drop the ' (R²)' suffix so rows are labelled by target name only.
    stats_df.index = [label.replace(' (R²)', '') for label in stats_df.index]

    shown = ['mean', 'std', 'min', 'max']
    print(stats_df.loc[:, shown].round(4))