### Imports

In [None]:
import os
from utils.information_retrieval import run_commit_retrieval, create_folders, download_csv_files, create_master_csv

### Initial Params
These are needed to scrape the GitHub repo.

In [None]:
# GitHub REST API endpoint for the commits of the forecastbench-datasets repository
commits_url = "https://api.github.com/repos/forecastingresearch/forecastbench-datasets/commits"
# Query parameters: only commits touching the overall leaderboard CSV, 100 results per page
params = {'path': 'leaderboards/csv/leaderboard_overall.csv', 'per_page': 100}
# The token is read from the GITHUB_API_KEY environment variable.
# NOTE(review): os.getenv returns None when the variable is unset, which would make
# this header read "token None" - confirm the variable is exported before running.
headers = {'Authorization': f'token {os.getenv("GITHUB_API_KEY")}'}
# Make sure your API key has not expired before running this.
# If it has expired, you'll run into an infinite loop; generating a new key fixes it.
# Always give your API keys an expiry date as well.

### Information Retrieval Pipeline
1. We create the folders needed to store the data
2. We get the commit history from Forecast Bench and save it locally
3. We retrieve all leaderboard csv files in accordance with the commit history
4. We concatenate all leaderboard csv files into a single csv file to then do data analysis on

In [None]:
# Step 1: create the folders the pipeline stores its data in
create_folders()

In [None]:
# Step 2: fetch the commit history for the leaderboard file and save it locally
run_commit_retrieval(commits_url, params, headers)

In [None]:
# Step 3: retrieve the leaderboard csv files for the commits collected in step 2.
# The call is awaited at the top level of the cell, which works in a notebook
# but not in a plain Python script.
await download_csv_files(headers)

In [None]:
# Step 4: concatenate the downloaded leaderboard csv files into a single csv
# (presumably the data/master.csv loaded further down - confirm in utils.information_retrieval)
create_master_csv()

### Setup Code to Group Each Model Together
This is some sample code I used to load the master csv and look at each model's results on a given date.

In [None]:
# Load the master csv file with all our information.
# The path is relative, so the notebook must be run from the directory that contains data/.
import pandas as pd
df = pd.read_csv('data/master.csv')

In [None]:
# Show which columns the master csv provides
df.columns

In [None]:
# Keep only the identifying columns plus the market score on resolved questions and its dataset size.
# Resolved questions were chosen because a Brier score on questions that have actually resolved made the most sense to me.
# It's unclear which score column is actually best; the paper goes into more detail on what each category means.
# .copy() gives an independent frame, so the later edits to filtered_df do not touch df.
filtered_df = df[['Organization', 'llm_model', 'method', 'date', 'Market Score (resolved)', 'Market Score (resolved) Dataset Size']].copy()

In [None]:
# Convert the date column to pandas datetimes so rows can be ordered and compared chronologically
filtered_df['date'] = pd.to_datetime(filtered_df['date'])

In [None]:
# Order the rows chronologically (oldest leaderboard first)
filtered_df = filtered_df.sort_values(by='date')

In [None]:
# Preview the first 10 rows of the filtered data
filtered_df.head(10)

In [None]:
# Show the columns that were kept in the filtered frame
filtered_df.columns

In [None]:
# List every distinct date present in the data, i.e. the dates for which a leaderboard was recorded
filtered_df['date'].unique().tolist()

In [None]:
# Keep only the rows for one specific date (change the string to inspect another date from the list of unique dates)
sample_df = filtered_df[filtered_df['date']=='2024-12-01']

In [None]:
# Get the list of distinct model names that appear on the selected date
models = sample_df['llm_model'].unique().tolist()

In [None]:
# Show the rows recorded for every model on the date selected earlier,
# ordered from the lowest to the highest market score on resolved questions.
# The Forecastbench entry and the LLM crowd aggregate are skipped: they are
# produced in their own way, and the interest here is in how the foundation
# models themselves perform.
for model in models:
    if model in ('Forecastbench', 'Llm Crowd Gpt-4O, Claude-3.5-Sonnet, Gemini-1.5-Pro'):
        continue
    print(f"\nMODEL: {model.title()}")
    subset = sample_df.loc[sample_df['llm_model'].eq(model)]
    subset_sorted = subset.sort_values("Market Score (resolved)")
    display(subset_sorted)

### Attempts At Statistical Analysis

##### This is for a single model

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the master csv (relative path: run from the directory that contains data/)
df = pd.read_csv('data/master.csv')

# Filter for one model across all of its dates.
# The model name is defined once so the filter and the report cannot drift apart.
model_name = 'Claude-3-5-Sonnet-20240620'
model_data = df[df['llm_model'] == model_name].copy()

print(f"Found {len(model_data)} rows for {model_name}")
print(f"Date range: {model_data['date'].min()} to {model_data['date'].max()}")
print(f"Unique dates: {model_data['date'].nunique()}")
# A missing method name (NaN) cannot be ordered against strings, so it is
# left out of the sorted listing instead of raising a TypeError.
print(f"Methods available: {sorted(model_data['method'].dropna().unique())}")
print()

def analyze_freeze_vs_nonfreeze(df):
    """
    Compare the best freeze method against the best non-freeze method.

    Rows are grouped by (llm_model, date). Inside a group, a method counts as a
    freeze method when its name contains 'with freeze values'; every other
    method, including one with a missing name, counts as non-freeze. The best
    method of a category is the one with the lowest 'Market Score (resolved)'.

    Rows without a score are ignored, and a group is skipped unless both
    categories have at least one scored row.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'llm_model', 'date', 'method' and
        'Market Score (resolved)'.

    Returns
    -------
    pandas.DataFrame
        One row per compared group with the columns 'model', 'date',
        'best_freeze_method', 'best_freeze_score', 'best_non_freeze_method',
        'best_non_freeze_score' and 'difference'. 'difference' is the best
        non-freeze score minus the best freeze score, so a positive value
        means the freeze method scored lower. The columns are present even
        when no group could be compared.
    """
    score_col = 'Market Score (resolved)'
    columns = [
        'model',
        'date',
        'best_freeze_method',
        'best_freeze_score',
        'best_non_freeze_method',
        'best_non_freeze_score',
        'difference',
    ]

    # A row without a score cannot win a "best method" comparison, and a
    # category made up only of such rows has no minimum to look up.
    scored = df.dropna(subset=[score_col])

    results = []
    for (model, date), group in scored.groupby(['llm_model', 'date']):
        is_freeze = group['method'].str.contains('with freeze values', regex=False, na=False)
        freeze_methods = group[is_freeze]
        non_freeze_methods = group[~is_freeze]

        if freeze_methods.empty or non_freeze_methods.empty:
            continue

        # Positional lookup of the first minimum; unlike a label lookup this
        # stays a scalar even if the frame carries duplicate index labels.
        freeze_pos = freeze_methods[score_col].to_numpy().argmin()
        non_freeze_pos = non_freeze_methods[score_col].to_numpy().argmin()

        best_freeze_score = freeze_methods[score_col].min()
        best_non_freeze_score = non_freeze_methods[score_col].min()

        results.append({
            'model': model,
            'date': date,
            'best_freeze_method': freeze_methods['method'].iloc[freeze_pos],
            'best_freeze_score': best_freeze_score,
            'best_non_freeze_method': non_freeze_methods['method'].iloc[non_freeze_pos],
            'best_non_freeze_score': best_non_freeze_score,
            # Positive means freeze is better (lower score)
            'difference': best_non_freeze_score - best_freeze_score,
        })

    return pd.DataFrame(results, columns=columns)

# Print the first rows of the filtered model data as a sanity check before analysing it
print("\n=== DEBUGGING INFO ===")
print("Sample of model_data:")
print(model_data[['method', 'date', 'Market Score (resolved)']].head(10))
print()

# Run the analysis on the filtered model data and print the full comparison table
# (index=False leaves the row index out of the printed table)
comparison_df = analyze_freeze_vs_nonfreeze(model_data)
print("=== COMPARISON RESULTS ===")
print(comparison_df.to_string(index=False))
print()

# Statistical analysis
# A comparison table without rows may not have a 'difference' column at all,
# and NaN differences would propagate through the t-test, so reduce the
# input to the usable values first.
if comparison_df.empty:
    differences = np.array([], dtype=float)
else:
    differences = comparison_df['difference'].to_numpy(dtype=float)
    differences = differences[~np.isnan(differences)]
n_differences = len(differences)
# Sample standard deviation (ddof=1), the same estimator the t-test is based on.
# It is undefined for fewer than two values.
sample_std = differences.std(ddof=1) if n_differences > 1 else float('nan')

print("=== STATISTICAL ANALYSIS ===")
print(f"Number of comparisons: {n_differences}")
if n_differences > 0:
    print(f"Mean difference: {differences.mean():.4f}")
    print(f"Std deviation: {sample_std:.4f}")
print(f"Positive differences (freeze better): {int((differences > 0).sum())}/{n_differences}")
print()

# One-tailed paired t-test
# H0: no difference (mean difference = 0)
# H1: freeze methods are better (mean difference > 0)
if n_differences > 1:
    t_stat, p_value_two_tailed = stats.ttest_1samp(differences, 0)
    # Halve the two-tailed p-value when the effect points in the hypothesised
    # direction, otherwise take the complementary tail.
    p_value_one_tailed = p_value_two_tailed / 2 if t_stat > 0 else 1 - (p_value_two_tailed / 2)

    print("=== T-TEST RESULTS ===")
    print(f"t-statistic: {t_stat:.4f}")
    print(f"p-value (one-tailed): {p_value_one_tailed:.4f}")
    # Same zero-variation guard as run_statistical_tests in the all-models cell
    if sample_std > 0:
        print(f"Effect size (Cohen's d): {differences.mean() / sample_std:.4f}")
    else:
        print("Effect size: Cannot calculate (no variation)")

    if p_value_one_tailed < 0.05:
        print("✅ SIGNIFICANT: Freeze methods significantly outperform non-freeze methods")
    else:
        print("❌ NOT SIGNIFICANT: No significant difference found")
else:
    print("Need multiple dates for statistical test")

# Reminder text only: everything below is printed, and nothing inside the
# triple-quoted string is executed.
print("\n=== FOR YOUR REAL ANALYSIS ===")
print("1. Load your full dataset with multiple dates")
print("2. Filter for one model at a time") 
print("3. Run this same analysis")
print("4. With 30+ dates, you'll have good statistical power")

# Example of how to load and filter your real data (printed as text, not run):
print("\n=== EXAMPLE CODE FOR YOUR FULL DATASET ===")
print("""
# Load your data
df = pd.read_csv('your_forecast_data.csv')

# Filter for one model and multiple dates
model_data = df[df['llm_model'] == 'Claude-3-5-Sonnet-20240620'].copy()

# Run the analysis
results = analyze_freeze_vs_nonfreeze(model_data)
print(results)
""")

### This is for analyzing all the models

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Load the master csv again so this cell can be run on its own
# (relative path: run from the directory that contains data/)
df = pd.read_csv('data/master.csv')

def analyze_all_freeze_vs_nonfreeze(df):
    """
    Compare all freeze methods against all non-freeze methods per model and date.

    Rows are grouped by (llm_model, date). Inside a group, a method counts as a
    freeze method when its name contains 'with freeze values'; every other
    method, including one with a missing name, counts as non-freeze. For each
    category both the mean and the lowest 'Market Score (resolved)' are
    reported.

    Rows without a score are ignored, and a group is skipped unless both
    categories have at least one scored row, so every reported difference is
    a real number.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'llm_model', 'date', 'method' and
        'Market Score (resolved)'.

    Returns
    -------
    pandas.DataFrame
        One row per compared group with the columns 'model', 'date',
        'avg_freeze_score', 'avg_non_freeze_score', 'avg_difference',
        'best_freeze_score', 'best_non_freeze_score', 'best_difference',
        'n_freeze_methods' and 'n_non_freeze_methods'. Both differences are
        non-freeze minus freeze, so a positive value means the freeze side
        scored lower. The two counts cover scored rows only. The columns are
        present even when no group could be compared.
    """
    score_col = 'Market Score (resolved)'
    columns = [
        'model',
        'date',
        'avg_freeze_score',
        'avg_non_freeze_score',
        'avg_difference',
        'best_freeze_score',
        'best_non_freeze_score',
        'best_difference',
        'n_freeze_methods',
        'n_non_freeze_methods',
    ]

    # Unscored rows would turn a category's mean/min into NaN, and a single NaN
    # difference makes the downstream t-test return NaN as well.
    scored = df.dropna(subset=[score_col])

    results = []
    for (model, date), group in scored.groupby(['llm_model', 'date']):
        is_freeze = group['method'].str.contains('with freeze values', regex=False, na=False)
        freeze_scores = group.loc[is_freeze, score_col]
        non_freeze_scores = group.loc[~is_freeze, score_col]

        if freeze_scores.empty or non_freeze_scores.empty:
            continue

        avg_freeze_score = freeze_scores.mean()
        avg_non_freeze_score = non_freeze_scores.mean()
        best_freeze_score = freeze_scores.min()
        best_non_freeze_score = non_freeze_scores.min()

        results.append({
            'model': model,
            'date': date,
            'avg_freeze_score': avg_freeze_score,
            'avg_non_freeze_score': avg_non_freeze_score,
            # Positive means freeze is better (lower score)
            'avg_difference': avg_non_freeze_score - avg_freeze_score,
            'best_freeze_score': best_freeze_score,
            'best_non_freeze_score': best_non_freeze_score,
            'best_difference': best_non_freeze_score - best_freeze_score,
            'n_freeze_methods': len(freeze_scores),
            'n_non_freeze_methods': len(non_freeze_scores),
        })

    return pd.DataFrame(results, columns=columns)

def run_statistical_tests(comparison_df, model_name):
    """
    Print summary statistics and a one-tailed t-test for the freeze advantage.

    The report is produced twice: once for the 'avg_difference' column and
    once for the 'best_difference' column of ``comparison_df``. Positive
    differences are counted as "freeze better". NaN differences are left out,
    and a column with no usable values is reported as such instead of being
    tested.

    The test is a one-sample t-test of the differences against 0, turned into
    a one-tailed p-value for the hypothesis "mean difference > 0". Standard
    deviation and Cohen's d use the sample standard deviation (ddof=1), the
    estimator the t-test itself is based on.

    Parameters
    ----------
    comparison_df : pandas.DataFrame
        Needs the columns 'avg_difference' and 'best_difference' unless it is
        empty.
    model_name : str
        Used in the printed heading (upper-cased).

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(f"\n{'='*60}")
    print(f"STATISTICAL ANALYSIS FOR {model_name.upper()}")
    print(f"{'='*60}")

    # Test both average and best comparisons
    for comparison_type in ['avg', 'best']:
        diff_col = f'{comparison_type}_difference'
        if comparison_df.empty:
            # An empty table may have been built without any columns
            differences = np.array([], dtype=float)
        else:
            differences = comparison_df[diff_col].to_numpy(dtype=float)
            # NaN would propagate through mean, std and the t-test
            differences = differences[~np.isnan(differences)]
        n_differences = len(differences)

        print(f"\n--- {comparison_type.upper()} METHOD COMPARISON ---")
        print(f"Number of comparisons: {n_differences}")
        if n_differences == 0:
            print("No valid differences to analyze")
            continue

        n_positive = int((differences > 0).sum())
        # Undefined for a single value, so report NaN rather than a misleading 0
        sample_std = differences.std(ddof=1) if n_differences > 1 else float('nan')

        print(f"Mean difference: {differences.mean():.4f}")
        print(f"Std deviation: {sample_std:.4f}")
        print(f"Positive differences (freeze better): {n_positive}/{n_differences}")
        print(f"Success rate: {n_positive / n_differences * 100:.1f}%")

        # One-tailed paired t-test
        if n_differences > 1:
            t_stat, p_value_two_tailed = stats.ttest_1samp(differences, 0)
            # Halve the two-tailed p-value when the effect points in the
            # hypothesised direction, otherwise take the complementary tail.
            p_value_one_tailed = p_value_two_tailed / 2 if t_stat > 0 else 1 - (p_value_two_tailed / 2)

            print(f"t-statistic: {t_stat:.4f}")
            print(f"p-value (one-tailed): {p_value_one_tailed:.6f}")

            if sample_std > 0:
                cohens_d = differences.mean() / sample_std
                print(f"Effect size (Cohen's d): {cohens_d:.4f}")
            else:
                print("Effect size: Cannot calculate (no variation)")

            if p_value_one_tailed < 0.05:
                print("✅ SIGNIFICANT: Freeze methods significantly outperform non-freeze methods")
            else:
                print("❌ NOT SIGNIFICANT: No significant difference found")
        else:
            print("Need multiple dates for statistical test")

def analyze_single_model(df, model_name):
    """
    Run the complete freeze vs non-freeze analysis for a single model.

    Prints an overview of the model's rows (row count, date range, number of
    distinct dates, available methods), lists its freeze and non-freeze
    methods, then builds the per-date comparison table with
    analyze_all_freeze_vs_nonfreeze and reports on it with
    run_statistical_tests.

    A method counts as a freeze method when its name contains
    'with freeze values'; a missing method name counts as non-freeze.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'llm_model', 'date' and 'method', plus whatever
        analyze_all_freeze_vs_nonfreeze reads.
    model_name : str
        Value of 'llm_model' to analyse.

    Returns
    -------
    pandas.DataFrame or None
        The comparison table, or None when the model has no rows, lacks
        freeze or non-freeze methods, or yields no valid comparison.
    """
    print(f"\n{'='*80}")
    print(f"ANALYZING MODEL: {model_name}")
    print(f"{'='*80}")

    # Filter for this model
    model_data = df[df['llm_model'] == model_name].copy()

    if model_data.empty:
        print(f"No data found for {model_name}")
        return None

    print(f"Found {len(model_data)} rows")
    print(f"Date range: {model_data['date'].min()} to {model_data['date'].max()}")
    print(f"Unique dates: {model_data['date'].nunique()}")
    # A missing method name (NaN) cannot be ordered against strings, so it is
    # left out of the sorted listing instead of raising a TypeError.
    print(f"Methods available: {sorted(model_data['method'].dropna().unique())}")

    # Check if we have both freeze and non-freeze methods
    is_freeze = model_data['method'].str.contains('with freeze values', regex=False, na=False)
    freeze_methods = model_data.loc[is_freeze, 'method'].unique()
    non_freeze_methods = model_data.loc[~is_freeze, 'method'].unique()

    print(f"\nFreeze methods ({len(freeze_methods)}): {list(freeze_methods)}")
    print(f"Non-freeze methods ({len(non_freeze_methods)}): {list(non_freeze_methods)}")

    if len(freeze_methods) == 0 or len(non_freeze_methods) == 0:
        print("⚠️ Cannot compare - missing freeze or non-freeze methods")
        return None

    # Run the analysis
    comparison_df = analyze_all_freeze_vs_nonfreeze(model_data)

    if len(comparison_df) == 0:
        print("No valid comparisons found")
        return None

    # Run statistical tests
    run_statistical_tests(comparison_df, model_name)

    return comparison_df

def analyze_all_models(df):
    """
    Run the freeze vs non-freeze analysis for every model and print a summary.

    Each distinct 'llm_model' value is passed to analyze_single_model, except
    the 'Forecastbench' entry and the LLM crowd aggregate. Models that produce
    more than one comparison contribute a summary row holding the number of
    comparisons and, for both the average and the best comparison, the share
    of positive differences, the mean difference and a one-tailed p-value
    (H1: mean difference > 0).

    When at least one summary row exists, the table is printed together with
    the number of models whose p-value is below 0.05. Nothing is returned.
    """
    skipped_models = ('Forecastbench', 'Llm Crowd Gpt-4O, Claude-3.5-Sonnet, Gemini-1.5-Pro')
    summary_rows = []

    for model in df['llm_model'].unique():
        if model in skipped_models:
            continue

        per_model = analyze_single_model(df, model)
        if per_model is None or len(per_model) == 0:
            continue

        avg_diffs = per_model['avg_difference'].to_numpy()
        best_diffs = per_model['best_difference'].to_numpy()
        n_comparisons = len(avg_diffs)
        if n_comparisons <= 1:
            # A t-test needs at least two comparisons
            continue

        # Average method comparison
        t_avg, p_avg = stats.ttest_1samp(avg_diffs, 0)
        if t_avg > 0:
            p_avg_one_tailed = p_avg / 2
        else:
            p_avg_one_tailed = 1 - (p_avg / 2)

        # Best method comparison
        t_best, p_best = stats.ttest_1samp(best_diffs, 0)
        if t_best > 0:
            p_best_one_tailed = p_best / 2
        else:
            p_best_one_tailed = 1 - (p_best / 2)

        summary_rows.append({
            'model': model,
            'n_comparisons': n_comparisons,
            'avg_success_rate': (avg_diffs > 0).sum() / n_comparisons,
            'avg_mean_diff': avg_diffs.mean(),
            'avg_p_value': p_avg_one_tailed,
            'best_success_rate': (best_diffs > 0).sum() / len(best_diffs),
            'best_mean_diff': best_diffs.mean(),
            'best_p_value': p_best_one_tailed,
        })

    if not summary_rows:
        return

    # Print summary table
    summary_df = pd.DataFrame(summary_rows)
    banner = '=' * 100
    print(f"\n{banner}")
    print("SUMMARY ACROSS ALL MODELS")
    print(banner)
    print(summary_df.to_string(index=False))

    # Count significant results
    total = len(summary_df)
    sig_avg = int((summary_df['avg_p_value'] < 0.05).sum())
    sig_best = int((summary_df['best_p_value'] < 0.05).sum())

    print("\nModels with significant freeze advantage:")
    print(f"Average method comparison: {sig_avg}/{total} ({sig_avg/total*100:.1f}%)")
    print(f"Best method comparison: {sig_best}/{total} ({sig_best/total*100:.1f}%)")

# Run the comprehensive analysis: a report is printed for each model,
# followed by a summary table across all models
print("Starting comprehensive freeze vs non-freeze analysis...")
analyze_all_models(df)