In [3]:
!pip install torch transformers datasets tqdm evaluate sacrebleu numpy pandas
!pip install rouge_score
!pip install scikit_posthocs

Collecting scikit_posthocs
  Downloading scikit_posthocs-0.10.0-py3-none-any.whl.metadata (5.8 kB)
Downloading scikit_posthocs-0.10.0-py3-none-any.whl (33 kB)
Installing collected packages: scikit_posthocs
Successfully installed scikit_posthocs-0.10.0


In [2]:
import torch
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from datasets import load_dataset
from tqdm import tqdm
import evaluate
import sacrebleu
import numpy as np
import pandas as pd

def load_model_and_tokenizer(model_name):
    """Load a GPT-2 language model together with its fast tokenizer.

    The end-of-sequence token is reused as the padding token, both on the
    tokenizer and in the model configuration.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for the
            tokenizer and for the model.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)

    # Pad with EOS: first on the tokenizer, then mirror the id on the model.
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
    gpt2_model.config.pad_token_id = gpt2_tokenizer.eos_token_id

    return gpt2_model, gpt2_tokenizer

def generate_summary(model, tokenizer, text, max_input_length=874, max_summary_length=150):
    """Generate a beam-search summary of ``text`` with a causal language model.

    The prompt has the form ``"summarize: {text} summary:"``. Only the document
    part is truncated, so the prompt fits in ``max_input_length`` tokens and
    still ends with the ``" summary:"`` cue. (Truncating the assembled prompt
    as a whole cuts that cue off for long documents.)

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        text: Document to summarize.
        max_input_length: Maximum number of prompt tokens. If it is smaller
            than the prompt scaffolding itself, the document is dropped and
            the scaffolding is kept.
        max_summary_length: Maximum number of newly generated tokens.

    Returns:
        ``(generated_summary, full_output, generation_info)`` where
        ``generated_summary`` is the decoded continuation, ``full_output`` is
        the decoded prompt plus continuation, and ``generation_info`` holds
        ``sequence_score``, ``mean_token_probability`` and
        ``generated_length``.
    """
    # Tokenize scaffolding and document separately so that the truncation
    # budget is spent on the document only.
    prefix_ids = list(tokenizer("summarize:", add_special_tokens=False).input_ids)
    suffix_ids = list(tokenizer(" summary:", add_special_tokens=False).input_ids)
    text_budget = max(max_input_length - len(prefix_ids) - len(suffix_ids), 0)
    text_ids = list(tokenizer(" " + text, add_special_tokens=False).input_ids)[:text_budget]
    prompt_ids = prefix_ids + text_ids + suffix_ids

    input_ids = torch.tensor([prompt_ids], dtype=torch.long).to(model.device)
    attention_mask = torch.ones_like(input_ids)  # single unpadded sequence
    prompt_length = input_ids.size(1)

    # Generate summary and collect generation info
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_summary_length,
            num_beams=5,
            no_repeat_ngram_size=3,
            early_stopping=True,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            output_scores=True,  # Get generation scores
            return_dict_in_generate=True  # Get detailed output
        )

    # Score of the returned sequence; tolerate outputs that do not carry it.
    sequence_scores = getattr(outputs, 'sequences_scores', None)
    if sequence_scores is not None:
        sequence_scores = sequence_scores.cpu().numpy()

    # Mean, over every generation step and every row of the stacked score
    # tensor, of the largest softmax probability.
    # NOTE(review): with beam search the rows presumably correspond to all
    # beams, so this is a confidence proxy rather than the probability of the
    # tokens in the returned sequence — confirm before interpreting it.
    step_scores = getattr(outputs, 'scores', None)
    mean_token_prob = None
    if step_scores:
        token_scores = torch.stack(step_scores, dim=0)
        mean_token_prob = torch.mean(torch.softmax(token_scores, dim=-1).max(dim=-1)[0]).item()

    # Everything after the prompt is the generated summary.
    generated_ids = outputs.sequences[0]
    summary_ids = generated_ids[prompt_length:]
    generated_summary = tokenizer.decode(summary_ids, skip_special_tokens=True).strip()

    # Re-tokenize the decoded text to report its length, truncating if the
    # round trip produced more than the allowed number of tokens.
    generated_summary_tokens = tokenizer.tokenize(generated_summary)
    if len(generated_summary_tokens) > max_summary_length:
        generated_summary_tokens = generated_summary_tokens[:max_summary_length]
        generated_summary = tokenizer.convert_tokens_to_string(generated_summary_tokens)

    full_output = tokenizer.decode(generated_ids, skip_special_tokens=True)

    generation_info = {
        'sequence_score': float(sequence_scores[0]) if sequence_scores is not None else None,
        'mean_token_probability': mean_token_prob,
        'generated_length': len(generated_summary_tokens),
    }

    return generated_summary, full_output, generation_info

# ROUGE metric object, loaded on first use and then shared by every call.
_ROUGE_METRIC = None


def _get_rouge_metric():
    """Return the ROUGE metric, loading it only the first time it is needed."""
    global _ROUGE_METRIC
    if _ROUGE_METRIC is None:
        _ROUGE_METRIC = evaluate.load("rouge")
    return _ROUGE_METRIC


def compute_metrics(predictions, references):
    """Score predictions against references with ROUGE and BLEU.

    This function is called once per example as well as for the aggregate
    scores, so the ROUGE metric is cached instead of being re-loaded on
    every call.

    Args:
        predictions: List of generated summaries.
        references: List of reference summaries, aligned with ``predictions``.

    Returns:
        Dict with the aggregated ``rouge1``, ``rouge2`` and ``rougeL`` values
        and the corpus-level ``bleu`` score.
    """
    rouge = _get_rouge_metric()

    # Compute ROUGE scores
    rouge_result = rouge.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True,
        use_aggregator=True
    )

    # Compute BLEU scores (sacrebleu expects a list of reference streams)
    bleu_scores = sacrebleu.corpus_bleu(
        predictions,
        [references],
        smooth_method='exp',
        smooth_value=0.1,
        force=True,
        lowercase=True,
        tokenize='13a'
    )

    return {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_scores.score
    }

def evaluate_model(model_name, dataset, max_input_length=874, max_summary_length=150):
    """Summarize every example of ``dataset`` with one model and score it.

    Args:
        model_name: Checkpoint identifier passed to ``load_model_and_tokenizer``.
        dataset: Iterable of examples with ``'text'`` and ``'summary'`` fields.
        max_input_length: Token limit applied to each input document.
        max_summary_length: Token limit for generated and truncated reference
            summaries.

    Returns:
        ``(summary_scores, detailed_results)``: a dict of aggregate metrics and
        a list with one record per example that produced a non-empty summary.
        ``(None, None)`` is returned when the model cannot be loaded or no
        summary was produced at all.
    """
    print(f"Evaluating model: {model_name}")
    try:
        model, tokenizer = load_model_and_tokenizer(model_name)
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        return None, None

    model.to('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()

    metric_keys = ('rouge1', 'rouge2', 'rougeL', 'bleu')

    def clip(raw_text, limit):
        """Return ``(text, tokens)`` cut down to at most ``limit`` tokens."""
        tokens = tokenizer.tokenize(raw_text)
        if len(tokens) <= limit:
            return raw_text, tokens
        tokens = tokens[:limit]
        return tokenizer.convert_tokens_to_string(tokens), tokens

    rows = []
    for idx, example in enumerate(tqdm(dataset, desc=f"Generating summaries for {model_name}")):
        reference_full = example['summary']
        source_text, source_tokens = clip(example['text'], max_input_length)
        reference_clipped, reference_tokens = clip(reference_full, max_summary_length)

        summary, _, info = generate_summary(
            model, tokenizer, source_text, max_input_length, max_summary_length
        )

        if not summary:
            print(f"Warning: Empty summary generated for example {idx+1}")
            continue

        # Per-example metrics are computed against the untruncated reference.
        per_example = compute_metrics([summary], [reference_full])

        row = {
            'model_name': model_name,
            'example_id': idx,
            'input_text': source_text[:1000],  # keep the CSV cell small
            'reference_summary_full': reference_full,
            'reference_summary_truncated': reference_clipped,
            'generated_summary': summary,
            'input_length': len(source_tokens),
            'reference_length': len(reference_tokens),
            'generation_length': info['generated_length'],
            'sequence_score': info['sequence_score'],
            'mean_token_probability': info['mean_token_probability'],
        }
        row.update({f'{key}_score': per_example[key] for key in metric_keys})
        rows.append(row)

    if not rows:
        print(f"Warning: No valid summaries generated for {model_name}")
        return None, None

    # Aggregate metrics against both the full and the truncated references.
    hypotheses = [row['generated_summary'] for row in rows]
    scores_full = compute_metrics(hypotheses, [row['reference_summary_full'] for row in rows])
    scores_truncated = compute_metrics(hypotheses, [row['reference_summary_truncated'] for row in rows])

    def mean_of_present(field):
        """Average ``field`` over the rows where it is not ``None``."""
        return np.mean([row[field] for row in rows if row[field] is not None])

    summary_scores = {f'{key}_full': scores_full[key] for key in metric_keys}
    summary_scores.update({f'{key}_truncated': scores_truncated[key] for key in metric_keys})
    summary_scores['avg_generation_length'] = np.mean([row['generation_length'] for row in rows])
    summary_scores['avg_sequence_score'] = mean_of_present('sequence_score')
    summary_scores['avg_token_probability'] = mean_of_present('mean_token_probability')

    return summary_scores, rows

def main():
    """Evaluate every GPT-Valkyrie BillSum checkpoint and write two CSV files.

    The first 100 examples of the ``ca_test`` split are summarized by each
    normalization/variant combination. Per-example records go to
    ``billsum_evaluation_detailed.csv`` and per-model aggregates to
    ``billsum_evaluation_summary.csv``. Models that fail to evaluate are
    skipped.
    """
    print("Loading the BillSum dataset...")
    dataset = load_dataset("billsum", split="ca_test").select(range(100))
    print(f"Selected first {len(dataset)} examples for evaluation.")

    # Normalization type is the outer loop, variant the inner one.
    runs = [
        (norm_type, variant)
        for norm_type in ("LN", "RMSN")
        for variant in ("baseModel", "noNorm", "AttnOnly", "FFNonly")
    ]

    summary_results, all_detailed_results = [], []
    for norm_type, variant in runs:
        model_name = f"shng2025/GPT-Valkyrie_{norm_type}-124m__{variant}__Billsum"
        print(f"\nProcessing model: {model_name}")

        summary_scores, detailed_results = evaluate_model(
            model_name,
            dataset,
            max_input_length=874,
            max_summary_length=150
        )

        if not (summary_scores and detailed_results):
            print(f"Skipping model {model_name} due to errors.")
            continue

        # Tag the aggregate row with the model it belongs to.
        summary_scores.update(model_name=model_name, norm_type=norm_type, variant=variant)
        summary_results.append(summary_scores)
        all_detailed_results.extend(detailed_results)

    # Per-example records
    pd.DataFrame(all_detailed_results).to_csv('billsum_evaluation_detailed.csv', index=False)
    print("\nDetailed evaluation results saved to billsum_evaluation_detailed.csv")

    # Per-model aggregates
    pd.DataFrame(summary_results).to_csv('billsum_evaluation_summary.csv', index=False)
    print("Summary evaluation results saved to billsum_evaluation_summary.csv")

# A notebook kernel runs cells with __name__ == "__main__", so executing this
# cell starts the full evaluation right away; the guard only skips it when the
# code is imported as a module.
if __name__ == "__main__":
    main()

Loading the BillSum dataset...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Selected first 100 examples for evaluation.

Processing model: shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum
Evaluating model: shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum: 100%|██████████| 100/100 [07:35<00:00,  4.55s/it]



Processing model: shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum
Evaluating model: shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum: 100%|██████████| 100/100 [07:34<00:00,  4.55s/it]



Processing model: shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum
Evaluating model: shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum: 100%|██████████| 100/100 [07:26<00:00,  4.47s/it]



Processing model: shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum
Evaluating model: shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/868 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum: 100%|██████████| 100/100 [07:30<00:00,  4.50s/it]



Processing model: shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum
Evaluating model: shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum: 100%|██████████| 100/100 [07:29<00:00,  4.50s/it]



Processing model: shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum
Evaluating model: shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/868 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum: 100%|██████████| 100/100 [07:29<00:00,  4.49s/it]



Processing model: shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum
Evaluating model: shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/872 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum: 100%|██████████| 100/100 [07:28<00:00,  4.48s/it]



Processing model: shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum
Evaluating model: shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum


tokenizer_config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 1024). Running this sequence through the model will result in indexing errors
Generating summaries for shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum: 100%|██████████| 100/100 [07:29<00:00,  4.50s/it]



Detailed evaluation results saved to billsum_evaluation_detailed.csv
Summary evaluation results saved to billsum_evaluation_summary.csv


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import kruskal
from scikit_posthocs import posthoc_dunn
import warnings
import os

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides warnings raised by the
# statistical routines called below; consider narrowing it to specific
# categories or modules.
warnings.filterwarnings('ignore')

def clean_model_name(name):
    """Convert a full model name to the concise ``"<norm>-<variant>"`` format.

    For example ``"shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum"``
    becomes ``"LN-baseModel"``.

    Args:
        name: Full model identifier.

    Returns:
        The concise name, or ``name`` unchanged when it does not follow the
        expected pattern (including non-string values such as ``None``/NaN).
    """
    try:
        parts = name.split('_')
        norm_type = parts[1].split('-')[0]  # Get LN or RMSN
        variant = parts[3].split('__')[0]   # Get baseModel, noNorm, AttnOnly, or FFNonly
    except (AttributeError, IndexError, TypeError):
        # Not a string, or too few "_"-separated fields: leave the value as is.
        # Only parsing failures are caught, so unrelated errors still surface.
        return name
    return f"{norm_type}-{variant}"

def perform_kruskal_dunn(df, metric):
    """Run a Kruskal-Wallis test on ``metric`` across models, with Dunn follow-up.

    Args:
        df: Frame with a ``model_name`` column and a ``metric`` column.
        metric: Name of the column to compare between models.

    Returns:
        ``(kruskal_frame, dunn_frame)``. ``kruskal_frame`` is a one-row frame
        with ``statistic`` and ``p-value`` (plus a ``note`` when every value is
        identical and the test is skipped). ``dunn_frame`` holds the
        Bonferroni-adjusted Dunn result when the Kruskal-Wallis p-value is
        below 0.05, otherwise ``None``. On any error a message is printed and
        ``(None, None)`` is returned.
    """
    try:
        # A constant column cannot be ranked meaningfully; report it and stop.
        if df[metric].nunique() == 1:
            print(f"Note: All {metric} values are identical ({df[metric].iloc[0]}), skipping Kruskal-Wallis test.")
            constant_report = pd.DataFrame({
                'statistic': [0],
                'p-value': [1.0],
                'note': ['All values identical']
            })
            return constant_report, None

        samples = [members[metric].values for _, members in df.groupby('model_name')]
        omnibus = kruskal(*samples)

        # Pairwise comparisons are only run after a significant omnibus test.
        pairwise = None
        if omnibus.pvalue < 0.05:
            pairwise = posthoc_dunn(df, val_col=metric, group_col='model_name', p_adjust='bonferroni')

        omnibus_report = pd.DataFrame({'statistic': [omnibus.statistic], 'p-value': [omnibus.pvalue]})
        return omnibus_report, pairwise
    except Exception as e:
        print(f"Error performing Kruskal-Wallis and Dunn's test for {metric}: {e}")
        return None, None

def create_radar_chart(df, metrics, condition_name, output_dir):
    """Draw a radar chart of per-model metric means and save it as a PNG.

    The file name contains ``condition_name`` so that several charts can be
    written into the same ``output_dir`` without overwriting each other.

    Args:
        df: Frame with a ``model_name`` column and one column per metric.
        metrics: Metric column names; each becomes one spoke of the chart.
        condition_name: Label used in the title and in the file name.
        output_dir: Existing directory that receives
            ``radar_chart_<condition_name>.png``.
    """
    means = df.groupby('model_name')[metrics].mean()
    angles = np.linspace(0, 2*np.pi, len(metrics), endpoint=False)
    # Repeat the first metric/angle at the end so each polygon is closed.
    means = pd.concat([means, means.iloc[:, :1]], axis=1)
    angles = np.concatenate((angles, [angles[0]]))

    fig, ax = plt.subplots(figsize=(14, 10), subplot_kw=dict(projection='polar'))
    for model in means.index:
        values = means.loc[model].values
        ax.plot(angles, values, 'o-', linewidth=2, label=model)
        ax.fill(angles, values, alpha=0.25)
    ax.set_thetagrids(angles[:-1] * 180/np.pi, metrics)
    # NOTE(review): the radial axis starts at 0 and is shared by all metrics,
    # so negative means or metrics on very different scales are not shown
    # faithfully — consider normalizing the metrics before plotting.
    ax.set_ylim(0, max(means.max().max(), 1.0))
    plt.legend(loc='center left', bbox_to_anchor=(1.1, 0.5))
    plt.title(f"Model Performance Across Metrics - {condition_name}")
    plt.tight_layout()

    # Keep only filename-safe characters from the condition label.
    safe_label = "".join(
        ch if (ch.isalnum() or ch in "-_.") else "_" for ch in str(condition_name)
    )
    plt.savefig(os.path.join(output_dir, f'radar_chart_{safe_label}.png'), bbox_inches='tight')
    plt.close()

def calculate_anova_summary(df, metrics):
    """Run a one-way ANOVA across models for each metric.

    Eta squared is derived from the F statistic as
    ``F * df_between / (F * df_between + df_within)``, which equals
    ``SS_between / SS_total``, with ``df_between = k - 1`` and
    ``df_within = N - k`` for ``k`` groups and ``N`` observations.

    Args:
        df: Frame with a ``model_name`` column and one column per metric.
        metrics: Metric column names to test.

    Returns:
        DataFrame with one row per metric and the columns ``Metric``,
        ``F_value``, ``p_value`` and ``eta_squared``.
    """
    anova_results = []

    for metric in metrics:
        groups = [group[metric].values for _, group in df.groupby('model_name')]
        f_value, p_value = stats.f_oneway(*groups)

        df_between = len(groups) - 1
        df_within = sum(len(values) for values in groups) - len(groups)
        if np.isinf(f_value):
            # No within-group variance: the grouping explains everything.
            eta_squared = 1.0
        else:
            eta_squared = (f_value * df_between) / (f_value * df_between + df_within)

        anova_results.append({
            'Metric': metric,
            'F_value': f_value,
            'p_value': p_value,
            'eta_squared': eta_squared
        })

    return pd.DataFrame(anova_results)

def _eta_squared_from_f(f_value, n_groups, n_obs):
    """Convert a one-way ANOVA F statistic into eta squared (SS_between / SS_total)."""
    df_between = n_groups - 1
    df_within = n_obs - n_groups
    if np.isinf(f_value):
        # No within-group variance: the grouping explains everything.
        return 1.0
    return (f_value * df_between) / (f_value * df_between + df_within)


def _zero_anchored_ylim(values):
    """Return y-limits that include zero and pad the data range by 10%.

    For non-negative data this is ``(0, max * 1.1)``; negative data extends
    the lower limit instead of being cut off.
    """
    lowest, highest = values.min(), values.max()
    lower = lowest * 1.1 if lowest < 0 else 0
    upper = highest * 1.1 if highest > 0 else 0
    return lower, upper


def create_analysis_for_dataset(df, dataset_name, output_dir):
    """Perform the statistical analysis of per-example results and save it.

    Writes CSV tables (descriptive statistics, effect sizes, ANOVA summary,
    score distribution, Tukey HSD, correlations, PCA, Kruskal-Wallis/Dunn) and
    PNG figures (correlation heatmap, violin, distribution, box and radar
    plots) into ``output_dir``, which is created if needed.

    Args:
        df: Frame with a ``model_name`` column plus the score and generation
            metric columns listed below.
        dataset_name: Label used in figure titles.
        output_dir: Directory that receives all tables and figures.

    Returns:
        The per-model descriptive statistics frame.
    """
    # Define metrics for detailed analysis
    score_metrics = ['rouge1_score', 'rouge2_score', 'rougeL_score', 'bleu_score']
    generation_metrics = ['sequence_score', 'mean_token_probability', 'generation_length']
    all_metrics = score_metrics + generation_metrics
    metric_groups = [(score_metrics, 'score_metrics'),
                     (generation_metrics, 'generation_metrics')]

    os.makedirs(output_dir, exist_ok=True)

    # 1. Basic Descriptive Statistics
    desc_stats = df.groupby('model_name')[all_metrics].agg(['mean', 'std', 'min', 'max', 'count'])
    desc_stats.to_csv(os.path.join(output_dir, 'descriptive_statistics.csv'))

    # 2. Effect Size Calculation (eta squared from the one-way ANOVA F value)
    effect_sizes = {}
    for metric in all_metrics:
        try:
            samples = [group[metric] for _, group in df.groupby('model_name')]
            f_value, _ = stats.f_oneway(*samples)
            effect_sizes[metric] = _eta_squared_from_f(
                f_value, len(samples), sum(len(sample) for sample in samples)
            )
        except Exception as e:
            print(f"Error calculating effect size for {metric}: {e}")
            effect_sizes[metric] = np.nan

    effect_sizes_df = pd.DataFrame.from_dict(effect_sizes, orient='index', columns=['Effect Size'])
    effect_sizes_df.to_csv(os.path.join(output_dir, 'effect_sizes.csv'))

    # 3. ANOVA summary
    anova_summary = calculate_anova_summary(df, all_metrics)
    anova_summary.to_csv(os.path.join(output_dir, 'anova_summary.csv'), index=False)

    # 4. Score distribution summary
    score_dist_summary = df[all_metrics].agg([
        'mean', 'std', 'min', 'max',
        lambda x: x.quantile(0.25),
        lambda x: x.quantile(0.75),
        'skew', 'kurt'
    ]).round(4)

    score_dist_summary.index = ['Mean', 'Std Dev', 'Min', 'Max', 'Q1', 'Q3', 'Skewness', 'Kurtosis']
    score_dist_summary.to_csv(os.path.join(output_dir, 'score_distribution_summary.csv'))

    # 5. Post-hoc Tests
    for metric in all_metrics:
        tukey = pairwise_tukeyhsd(df[metric], df['model_name'])
        # First row of the summary table is the header, the rest are the pairs.
        tukey_table = tukey.summary().data
        pd.DataFrame(data=tukey_table[1:], columns=tukey_table[0]).to_csv(
            os.path.join(output_dir, f'tukey_hsd_{metric}.csv'), index=False)

    # 6. Correlation Analysis
    correlation_matrix = df[all_metrics].corr()
    correlation_matrix.to_csv(os.path.join(output_dir, 'correlation_matrix.csv'))

    # 7. PCA Analysis (on standardized metrics)
    scaler = StandardScaler()
    pca = PCA()
    pca.fit_transform(scaler.fit_transform(df[all_metrics]))
    pca_df = pd.DataFrame({
        'Principal Component': range(1, len(pca.explained_variance_ratio_) + 1),
        'Explained Variance Ratio': pca.explained_variance_ratio_,
        'Cumulative Explained Variance Ratio': np.cumsum(pca.explained_variance_ratio_)
    })
    pca_df.to_csv(os.path.join(output_dir, 'pca_results.csv'), index=False)

    # 8. Kruskal-Wallis and Dunn's Tests
    for metric in all_metrics:
        kruskal_result, dunn_result = perform_kruskal_dunn(df, metric)
        if kruskal_result is not None:
            kruskal_result.to_csv(os.path.join(output_dir, f'kruskal_{metric}.csv'), index=False)
        if dunn_result is not None:
            dunn_result.to_csv(os.path.join(output_dir, f'dunn_test_{metric}.csv'))

    # 9. Visualizations

    # Correlation Heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title(f'Correlation Heatmap - {dataset_name}')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'correlation_heatmap.png'))
    plt.close()

    # Violin Plots - Separate for score metrics and generation metrics
    for metrics_group, group_name in metric_groups:
        plt.figure(figsize=(20, 10))
        for i, metric in enumerate(metrics_group, 1):
            plt.subplot(1, len(metrics_group), i)
            sns.violinplot(x='model_name', y=metric, data=df)
            plt.title(f'{metric}')
            plt.xticks(rotation=45, ha='right')
            # Zero-anchored limits with 10% padding; also valid for negative metrics
            plt.ylim(*_zero_anchored_ylim(df[metric]))
            if i != 1:
                plt.ylabel('')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'violin_plots_{group_name}.png'), bbox_inches='tight')
        plt.close()

    # Distribution Plots - Separate for score metrics and generation metrics
    for metrics_group, group_name in metric_groups:
        plt.figure(figsize=(20, 10))
        for i, metric in enumerate(metrics_group, 1):
            plt.subplot(1, len(metrics_group), i)
            for model in df['model_name'].unique():
                sns.kdeplot(data=df[df['model_name'] == model], x=metric, label=model)
            plt.title(f'Distribution of {metric}')
            plt.xlabel('Value')
            plt.ylabel('Density')
            if i == len(metrics_group):  # Place legend for last plot
                plt.legend(title='Model Name', bbox_to_anchor=(1.05, 1), loc='upper left')
            else:
                plt.legend([])
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'distribution_plots_{group_name}.png'), bbox_inches='tight')
        plt.close()

    # Box Plots - Separate for score metrics and generation metrics
    for metrics_group, group_name in metric_groups:
        plt.figure(figsize=(20, 10))
        for i, metric in enumerate(metrics_group, 1):
            plt.subplot(1, len(metrics_group), i)
            sns.boxplot(x='model_name', y=metric, data=df)
            plt.title(metric)
            plt.xticks(rotation=45, ha='right')
            # Zero-anchored limits with 10% padding; also valid for negative metrics
            plt.ylim(*_zero_anchored_ylim(df[metric]))
            plt.xlabel('')
            if i != 1:
                plt.ylabel('')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'boxplots_{group_name}.png'), bbox_inches='tight')
        plt.close()

    # Radar Charts - Separate for score metrics and generation metrics
    for metrics_group, group_name in metric_groups:
        create_radar_chart(df, metrics_group, f"{dataset_name}_{group_name}", output_dir)

    return desc_stats

def main():
    """Run the statistical analysis on the saved per-example evaluation CSV.

    Reads ``billsum_evaluation_detailed.csv``, shortens the model names,
    converts them to a categorical column and writes all analysis artefacts
    below ``billsum_analysis_results/detailed_analysis``.
    """
    output_dir = 'billsum_analysis_results'
    os.makedirs(output_dir, exist_ok=True)

    print("Loading detailed data...")
    detailed_df = pd.read_csv('billsum_evaluation_detailed.csv')
    # Shorten the names first, then turn the column into a categorical.
    detailed_df['model_name'] = (
        detailed_df['model_name'].apply(clean_model_name).astype('category')
    )

    print("\nAnalyzing detailed results...")
    analysis_dir = os.path.join(output_dir, 'detailed_analysis')
    create_analysis_for_dataset(detailed_df, 'detailed_metrics', analysis_dir)

    print("\nAnalysis complete. Results saved in 'billsum_analysis_results' directory.")

# A notebook kernel runs cells with __name__ == "__main__", so executing this
# cell starts the analysis right away; it expects the CSV written by the
# evaluation cell to exist in the working directory.
if __name__ == "__main__":
    main()

Loading detailed data...

Analyzing detailed results...

Analysis complete. Results saved in 'billsum_analysis_results' directory.
