In [1]:
!pip install torch transformers datasets tqdm pandas
!pip install evaluate
!pip install scikit_posthocs

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset
from evaluate import load as load_metric
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

def load_model_and_tokenizer(model_name):
    """Load the tokenizer and the question-answering model for one checkpoint.

    Args:
        model_name: Identifier handed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple -- the model is the first element.
    """
    qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return (qa_model, qa_tokenizer)

def compute_exact_match(prediction, ground_truth):
    """Return 1 if the two strings are equal ignoring case, otherwise 0.

    No other normalisation is applied: whitespace and punctuation must match.
    """
    lowered_prediction = prediction.lower()
    lowered_reference = ground_truth.lower()
    return 1 if lowered_prediction == lowered_reference else 0

def compute_f1(prediction, ground_truth):
    """Compute the token-level F1 score for a single prediction.

    Both strings are lower-cased and split on whitespace. The overlap is a
    multiset (bag-of-tokens) intersection, so a token that occurs several
    times is credited once per matching occurrence. Counting distinct tokens
    only would under-score answers with repeated words: "a b a b" compared
    with itself would get 0.5 instead of 1.0.

    Args:
        prediction: Predicted answer text.
        ground_truth: Reference answer text.

    Returns:
        The F1 score as a float in ``[0.0, 1.0]``. When either side has no
        tokens the score is 1.0 if both are empty and 0.0 otherwise.
    """
    # Function-scope import so the block does not depend on the import cell.
    from collections import Counter

    prediction_tokens = prediction.lower().split()
    ground_truth_tokens = ground_truth.lower().split()

    if not prediction_tokens or not ground_truth_tokens:
        return float(prediction_tokens == ground_truth_tokens)

    # `&` on Counters keeps, per token, the minimum of the two counts.
    overlap = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0.0

    precision = num_same / len(prediction_tokens)
    recall = num_same / len(ground_truth_tokens)
    return 2 * precision * recall / (precision + recall)

def evaluate_model(model_name, dataset, max_samples=100):
    """Run one QA checkpoint over ``dataset`` and score every prediction.

    For each example the question/context pair is tokenised (truncated to 512
    tokens), the model is run without gradients, and the answer span is taken
    from the independent argmax of the start and end logits. The prediction is
    compared with the first reference answer only (or "" when the example has
    no reference answers).

    Args:
        model_name: Checkpoint identifier passed to ``load_model_and_tokenizer``.
        dataset: Iterable of examples with "id", "question", "context" and
            "answers" (whose "text" entry is a list of strings).
        max_samples: Stop after this many examples.

    Returns:
        ``(summary_metrics, detailed_results)``: a dict of aggregate metrics
        and a list with one dict per evaluated example.
    """
    model, tokenizer = load_model_and_tokenizer(model_name)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    per_example_rows = []
    progress = tqdm(dataset, desc=f"Evaluating {model_name}")
    for position, sample in enumerate(progress):
        if position >= max_samples:
            break

        encoded = tokenizer(
            sample["question"],
            sample["context"],
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True,
        ).to(device)

        with torch.no_grad():
            model_output = model(**encoded)

        start_logits = model_output.start_logits[0].cpu().numpy()
        end_logits = model_output.end_logits[0].cpu().numpy()

        span_start = np.argmax(start_logits)
        span_end = np.argmax(end_logits)

        # Position 0 is used as the "no answer" slot.
        null_score = start_logits[0] + end_logits[0]
        span_score = start_logits[span_start] + end_logits[span_end]

        # NOTE(review): span_start/span_end are independent argmaxes, so
        # span_score can never be smaller than null_score; in practice only
        # the `span_end < span_start` check can reject a span here.
        has_span = not (null_score > span_score or span_end < span_start)
        if has_span:
            span_ids = encoded["input_ids"][0][span_start:span_end + 1]
            predicted = tokenizer.decode(span_ids, skip_special_tokens=True)
        else:
            predicted = ""

        # Only the first reference answer is used for scoring.
        reference_texts = sample["answers"]["text"]
        reference = reference_texts[0] if reference_texts else ""

        per_example_rows.append({
            "model_name": model_name,
            "example_id": sample["id"],
            "question": sample["question"],
            "context": sample["context"],
            "reference_answer": reference,
            "predicted_answer": predicted,
            "exact_match": compute_exact_match(predicted, reference),
            "f1_score": compute_f1(predicted, reference),
            # Logistic of (span_score - null_score) and its complement.
            "confidence_score": float(1 / (1 + np.exp(null_score - span_score))),
            "no_answer_probability": float(1 / (1 + np.exp(span_score - null_score))),
            "start_logits_max": float(np.max(start_logits)),
            "end_logits_max": float(np.max(end_logits)),
            "best_answer_score": float(span_score),
            "no_answer_score": float(null_score),
        })

    # Aggregate the per-example scores into one summary row for this model.
    em_flags = [row["exact_match"] for row in per_example_rows]
    summary_metrics = {
        "model_name": model_name,
        "exact_match_ratio": np.mean(em_flags),
        "exact_match_count": sum(em_flags),
        "total_samples": len(per_example_rows),
        "f1_score": np.mean([row["f1_score"] for row in per_example_rows]),
        "avg_confidence": np.mean([row["confidence_score"] for row in per_example_rows]),
        "avg_no_answer_prob": np.mean([row["no_answer_probability"] for row in per_example_rows]),
    }

    return summary_metrics, per_example_rows

def main():
    """Evaluate every norm-type/variant checkpoint and write the results to CSV.

    Loads the first 100 validation examples of SQuAD v2, evaluates each of the
    eight checkpoints (a failure for one checkpoint is printed and the run moves
    on to the next), then writes the per-example rows to
    ``squad_evaluation_detailed.csv`` and the per-model summary to
    ``squad_evaluation_summary.csv``.
    """
    variants = ["baseModel", "noNorm", "AttnOnly", "FFNonly"]
    norm_types = ["LN", "RMSN"]
    summary_results, all_detailed_results = [], []

    # First 100 validation examples only.
    dataset = load_dataset("squad_v2", split="validation").select(range(100))

    # Norm type is the outer dimension, variant the inner one.
    model_grid = [(norm_type, variant) for norm_type in norm_types for variant in variants]
    for norm_type, variant in model_grid:
        model_name = f"shng2025/GPT-Valkyrie_{norm_type}-124m__{variant}__SQuAD"
        print(f"\nEvaluating model {model_name}")
        try:
            metrics, detailed_results = evaluate_model(model_name, dataset, max_samples=100)

            # Tag the summary row with the checkpoint's coordinates in the grid.
            metrics.update(model_name=model_name, norm_type=norm_type, variant=variant)

            summary_results.append(metrics)
            all_detailed_results.extend(detailed_results)
        except Exception as e:
            print(f"Error evaluating model {model_name}: {e}")

    # Per-example rows for all models.
    pd.DataFrame(all_detailed_results).to_csv('squad_evaluation_detailed.csv', index=False)
    print("\nDetailed evaluation results saved to squad_evaluation_detailed.csv")

    # One summary row per model.
    pd.DataFrame(summary_results).to_csv('squad_evaluation_summary.csv', index=False)
    print("Summary evaluation results saved to squad_evaluation_summary.csv")

# Run the evaluation when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()


Evaluating model shng2025/GPT-Valkyrie_LN-124m__baseModel__SQuAD


Evaluating shng2025/GPT-Valkyrie_LN-124m__baseModel__SQuAD: 100%|██████████| 100/100 [00:02<00:00, 42.86it/s]



Evaluating model shng2025/GPT-Valkyrie_LN-124m__noNorm__SQuAD


Evaluating shng2025/GPT-Valkyrie_LN-124m__noNorm__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 65.09it/s]



Evaluating model shng2025/GPT-Valkyrie_LN-124m__AttnOnly__SQuAD


Evaluating shng2025/GPT-Valkyrie_LN-124m__AttnOnly__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 78.82it/s]



Evaluating model shng2025/GPT-Valkyrie_LN-124m__FFNonly__SQuAD


Evaluating shng2025/GPT-Valkyrie_LN-124m__FFNonly__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 78.41it/s]



Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__baseModel__SQuAD


Evaluating shng2025/GPT-Valkyrie_RMSN-124m__baseModel__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 66.20it/s]



Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__noNorm__SQuAD


Evaluating shng2025/GPT-Valkyrie_RMSN-124m__noNorm__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 73.77it/s]



Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__SQuAD


Evaluating shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 77.99it/s]



Evaluating model shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__SQuAD


Evaluating shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__SQuAD: 100%|██████████| 100/100 [00:01<00:00, 79.43it/s]



Detailed evaluation results saved to squad_evaluation_detailed.csv
Summary evaluation results saved to squad_evaluation_summary.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import kruskal
from scikit_posthocs import posthoc_dunn
import warnings
import os

# Suppress all warnings from here on. The filter is process-wide, so it also
# hides warnings raised by the libraries used in the functions below.
warnings.filterwarnings('ignore')

def perform_kruskal_dunn(df, metric):
    """Run a Kruskal-Wallis test on ``metric`` across models, then Dunn's test.

    Rows are grouped by the 'model_name' column. Dunn's post-hoc test
    (Bonferroni-adjusted) is only run when the Kruskal-Wallis p-value is
    below 0.05.

    Args:
        df: DataFrame with a 'model_name' column and a ``metric`` column.
        metric: Name of the column to test.

    Returns:
        ``(kruskal_df, dunn_result)`` where ``kruskal_df`` is a one-row
        DataFrame with 'statistic' and 'p-value' columns and ``dunn_result``
        is the pairwise result of ``posthoc_dunn`` or ``None`` when the
        omnibus test is not significant. If anything raises, the error is
        printed and ``(None, None)`` is returned.
    """
    try:
        # observed=True: when 'model_name' is categorical and `df` is a
        # filtered subset, categories without any rows must not be passed to
        # the test as empty samples.
        samples = [
            group[metric].values
            for _, group in df.groupby('model_name', observed=True)
            if len(group) > 0
        ]
        kruskal_result = kruskal(*samples)
        kruskal_df = pd.DataFrame({
            'statistic': [kruskal_result.statistic],
            'p-value': [kruskal_result.pvalue],
        })

        dunn_result = None
        if kruskal_result.pvalue < 0.05:
            dunn_result = posthoc_dunn(df, val_col=metric, group_col='model_name', p_adjust='bonferroni')
        return kruskal_df, dunn_result
    except Exception as e:
        # Best effort: report the failure and let the caller skip this metric.
        print(f"Error performing Kruskal-Wallis and Dunn's test for {metric}: {e}")
        return None, None

def create_radar_chart(df, metrics, condition_name, output_dir):
    """Draw one polygon per model over the mean of each metric and save it.

    The figure is written to ``<output_dir>/radar_chart.png`` and then closed.

    Args:
        df: DataFrame with a 'model_name' column and the ``metrics`` columns.
        metrics: Column names; each becomes one spoke of the chart.
        condition_name: Label appended to the figure title.
        output_dir: Directory the PNG is saved into.
    """
    per_model_means = df.groupby('model_name')[metrics].mean()

    # Repeat the first metric/angle at the end so every polygon closes.
    closed_means = pd.concat([per_model_means, per_model_means.iloc[:, :1]], axis=1)
    spoke_angles = np.linspace(0, 2*np.pi, len(metrics), endpoint=False)
    closed_angles = np.concatenate((spoke_angles, [spoke_angles[0]]))

    fig, ax = plt.subplots(figsize=(14, 10), subplot_kw=dict(projection='polar'))
    for model, values in zip(closed_means.index, closed_means.to_numpy()):
        ax.plot(closed_angles, values, 'o-', linewidth=2, label=model)
        ax.fill(closed_angles, values, alpha=0.25)

    ax.set_thetagrids(closed_angles[:-1] * 180/np.pi, metrics)
    # Radial axis reaches at least 1.0, or higher if a mean exceeds it.
    ax.set_ylim(0, max(closed_means.max().max(), 1.0))
    ax.legend(loc='center left', bbox_to_anchor=(1.1, 0.5))
    ax.set_title(f"Model Performance Across Metrics - {condition_name}")

    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, 'radar_chart.png'), bbox_inches='tight')
    plt.close(fig)

def calculate_anova_summary(df, metrics):
    """Run a one-way ANOVA across models for each metric.

    Args:
        df: DataFrame with a 'model_name' column and the ``metrics`` columns.
        metrics: Names of the columns to analyse.

    Returns:
        DataFrame with one row per metric and the columns 'Metric',
        'F_value', 'p_value' and 'eta_squared'.
    """
    anova_results = []

    for metric in metrics:
        # observed=True / the length filter drop categories that have no rows
        # in `df` (possible when 'model_name' is categorical and `df` is a
        # filtered subset); empty samples must not enter the ANOVA.
        groups = [
            group[metric].values
            for _, group in df.groupby('model_name', observed=True)
            if len(group) > 0
        ]
        f_value, p_value = stats.f_oneway(*groups)

        # Effect size: eta^2 = SS_between / SS_total, which in terms of F is
        # F * df_between / (F * df_between + df_within).
        dof_between = len(groups) - 1
        dof_within = sum(len(values) for values in groups) - len(groups)
        eta_squared = (f_value * dof_between) / (f_value * dof_between + dof_within)

        anova_results.append({
            'Metric': metric,
            'F_value': f_value,
            'p_value': p_value,
            'eta_squared': eta_squared
        })

    return pd.DataFrame(anova_results)

def create_score_distribution_plots(df, metrics, condition_name, output_dir):
    """Plot one KDE curve per model for every metric and save the figure.

    Panels are laid out on a 2x3 grid, one per metric, and the figure is
    written to ``<output_dir>/detailed_score_distributions.png``.

    Args:
        df: DataFrame with a 'model_name' column and the ``metrics`` columns.
        metrics: Column names to plot, one panel each.
        condition_name: Accepted for signature parity with the other plot
            helpers; not used in this figure.
        output_dir: Directory the PNG is saved into.
    """
    plt.figure(figsize=(20, 15))
    model_labels = df['model_name'].unique()

    for panel, metric in enumerate(metrics, start=1):
        ax = plt.subplot(2, 3, panel)
        for model in model_labels:
            rows_for_model = df[df['model_name'] == model]
            sns.kdeplot(data=rows_for_model, x=metric, label=model, ax=ax)

        ax.set_title(f'Distribution of {metric}')
        ax.set_xlabel('Score')
        ax.set_ylabel('Density')

        # Only the third panel carries the model legend, placed outside the axes.
        if panel != 3:
            ax.legend([])
        else:
            ax.legend(title='Model Name', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'detailed_score_distributions.png'), bbox_inches='tight')
    plt.close()

def create_analysis_for_condition(df, condition_name, output_dir):
    """Run the statistical analysis for one subset of results and save it.

    Everything is written into ``<output_dir>/<condition_name>/``:
    descriptive statistics, ANOVA effect sizes and summary, score
    distribution summary, Tukey HSD tables, correlation matrix, PCA explained
    variance, Kruskal-Wallis / Dunn results, and the heatmap, violin,
    distribution, box and radar plots.

    Args:
        df: Per-example results with a 'model_name' column and the columns
            'f1_score', 'confidence_score' and 'no_answer_probability'.
        condition_name: Name of the subset; used as sub-directory and in titles.
        output_dir: Parent directory for the results.

    Returns:
        The per-model descriptive statistics DataFrame
        (mean/std/min/max/count for each metric).
    """
    metrics = ['f1_score', 'confidence_score', 'no_answer_probability']

    condition_dir = os.path.join(output_dir, condition_name)
    os.makedirs(condition_dir, exist_ok=True)

    # 1. Basic Descriptive Statistics
    desc_stats = df.groupby('model_name')[metrics].agg(['mean', 'std', 'min', 'max', 'count'])
    desc_stats.to_csv(os.path.join(condition_dir, 'descriptive_statistics.csv'))

    # 2. Effect Size Calculation (eta-squared of the one-way ANOVA)
    effect_sizes = {}
    for metric in metrics:
        try:
            # Skip models without rows in this subset (possible when
            # 'model_name' is categorical); empty samples must not enter
            # the ANOVA.
            groups = [
                group[metric].values
                for _, group in df.groupby('model_name', observed=True)
                if len(group) > 0
            ]
            f_value, _ = stats.f_oneway(*groups)
            # eta^2 = SS_between / SS_total = F*df_between / (F*df_between + df_within)
            dof_between = len(groups) - 1
            dof_within = sum(len(values) for values in groups) - len(groups)
            effect_sizes[metric] = (f_value * dof_between) / (f_value * dof_between + dof_within)
        except Exception as e:
            print(f"Error calculating effect size for {metric}: {e}")
            effect_sizes[metric] = np.nan

    effect_sizes_df = pd.DataFrame.from_dict(effect_sizes, orient='index', columns=['Effect Size'])
    effect_sizes_df.to_csv(os.path.join(condition_dir, 'effect_sizes.csv'))

    # Add ANOVA summary
    anova_summary = calculate_anova_summary(df, metrics)
    anova_summary.to_csv(os.path.join(condition_dir, 'anova_summary.csv'), index=False)

    # Create detailed score distribution plots
    create_score_distribution_plots(df, metrics, condition_name, condition_dir)

    score_dist_summary = df[metrics].agg([
        'mean', 'std', 'min', 'max',
        lambda x: x.quantile(0.25),
        lambda x: x.quantile(0.75),
        'skew', 'kurt'
    ]).round(4)

    # The lambdas have no usable names, so label all rows explicitly.
    score_dist_summary.index = ['Mean', 'Std Dev', 'Min', 'Max', 'Q1', 'Q3', 'Skewness', 'Kurtosis']
    score_dist_summary.to_csv(os.path.join(condition_dir, 'score_distribution_summary.csv'))

    # 3. Post-hoc Tests
    for metric in metrics:
        tukey = pairwise_tukeyhsd(df[metric], df['model_name'])
        # NOTE: `_results_table` is a private attribute of the Tukey result;
        # its first row holds the column headers.
        pd.DataFrame(data=tukey._results_table.data[1:],
                    columns=tukey._results_table.data[0]).to_csv(
                    os.path.join(condition_dir, f'tukey_hsd_{metric}.csv'), index=False)

    # 4. Correlation Analysis
    correlation_matrix = df[metrics].corr()
    correlation_matrix.to_csv(os.path.join(condition_dir, 'correlation_matrix.csv'))

    # 5. PCA Analysis (on standardised metrics; only the variance ratios are kept)
    scaler = StandardScaler()
    pca = PCA()
    pca.fit(scaler.fit_transform(df[metrics]))
    pca_df = pd.DataFrame({
        'Principal Component': range(1, len(pca.explained_variance_ratio_) + 1),
        'Explained Variance Ratio': pca.explained_variance_ratio_,
        'Cumulative Explained Variance Ratio': np.cumsum(pca.explained_variance_ratio_)
    })
    pca_df.to_csv(os.path.join(condition_dir, 'pca_results.csv'), index=False)

    # 6. Kruskal-Wallis and Dunn's Tests
    for metric in metrics:
        kruskal_result, dunn_result = perform_kruskal_dunn(df, metric)
        if kruskal_result is not None:
            kruskal_result.to_csv(os.path.join(condition_dir, f'kruskal_{metric}.csv'), index=False)
        if dunn_result is not None:
            dunn_result.to_csv(os.path.join(condition_dir, f'dunn_test_{metric}.csv'))

    # 7. Visualizations

    # Correlation Heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title(f'Correlation Heatmap - {condition_name}')
    plt.tight_layout()
    plt.savefig(os.path.join(condition_dir, 'correlation_heatmap.png'))
    plt.close()

    # Violin Plots
    plt.figure(figsize=(20, 10))
    for i, metric in enumerate(metrics, 1):
        plt.subplot(1, 3, i)
        sns.violinplot(x='model_name', y=metric, data=df)
        plt.title(f'{metric} - {condition_name}')
        plt.xticks(rotation=45, ha='right')
        plt.ylim(0, 1)  # Set y-axis limits
        if i % 3 != 1:
            plt.ylabel('')
    plt.tight_layout()
    plt.savefig(os.path.join(condition_dir, 'violin_plots.png'), bbox_inches='tight')
    plt.close()

    # Distribution Plots
    plt.figure(figsize=(20, 15))
    for i, metric in enumerate(metrics, 1):
        plt.subplot(1, 3, i)
        for model in df['model_name'].unique():
            sns.kdeplot(data=df[df['model_name'] == model], x=metric, label=model, clip=(0, 1))  # Add clip parameter
        plt.title(f'Distribution of {metric} - {condition_name}')
        plt.xlabel('Score')
        plt.ylabel('Density')
        plt.xlim(0, 1)  # Set x-axis limits
        if i == 3:  # Place legend outside the plots
            plt.legend(title='Model Name', bbox_to_anchor=(1.05, 1), loc='upper left')
        else:
            plt.legend([])
    plt.tight_layout()
    plt.savefig(os.path.join(condition_dir, 'score_distributions.png'), bbox_inches='tight')
    plt.close()

    # Box Plots
    plt.figure(figsize=(20, 15))
    for i, metric in enumerate(metrics, 1):
        plt.subplot(1, 3, i)
        sns.boxplot(x='model_name', y=metric, data=df)
        plt.title(metric)
        plt.xticks(rotation=45, ha='right')
        plt.ylim(0, 1)  # Set y-axis limits
        plt.xlabel('')
        if i % 3 != 1:
            plt.ylabel('')
    plt.tight_layout()
    plt.savefig(os.path.join(condition_dir, 'boxplots.png'), bbox_inches='tight')
    plt.close()

    # Radar Chart
    create_radar_chart(df, metrics, condition_name, condition_dir)

    return desc_stats

def clean_model_name(name):
    """Convert a full model identifier into a short ``<norm>-<variant>`` label.

    Example: "shng2025/GPT-Valkyrie_LN-124m__AttnOnly__SQuAD" -> "LN-AttnOnly"

    Args:
        name: Full model name. Values that do not follow the expected pattern
            (including non-strings such as ``None``/NaN) are returned unchanged.

    Returns:
        The short label, or ``name`` itself when it cannot be parsed.
    """
    try:
        # Splitting on single underscores turns each "__" into an empty item:
        # [<repo/prefix>, "LN-124m", "", "AttnOnly", "", "SQuAD"]
        parts = name.split('_')
        norm_type = parts[1].split('-')[0]  # "LN" or "RMSN"
        variant = parts[3]                  # baseModel, noNorm, AttnOnly or FFNonly
        return f"{norm_type}-{variant}"
    except Exception:
        # Deliberate best effort, but without swallowing KeyboardInterrupt or
        # SystemExit the way a bare `except:` would.
        return name

# Entry point: load the saved per-example results and analyse each match condition.
def main():
    """Analyse the saved per-example evaluation results.

    Reads ``squad_evaluation_detailed.csv``, shortens the model names, and runs
    ``create_analysis_for_condition`` on three subsets: exact matches,
    non-matches and all rows. A subset without rows is reported and skipped
    instead of being passed into the analysis. Finally the match/no-match
    counts and ratios are written to
    ``qa_analysis_results/match_distribution_summary.csv``.
    """
    output_dir = 'qa_analysis_results'
    os.makedirs(output_dir, exist_ok=True)

    print("Loading data...")
    df = pd.read_csv('squad_evaluation_detailed.csv')

    # Clean model names
    df['model_name'] = df['model_name'].apply(clean_model_name).astype('category')

    exact_match_df = df[df['exact_match'] == 1]
    no_match_df = df[df['exact_match'] == 0]
    all_df = df

    # (label used in the log line, output sub-directory, rows to analyse)
    conditions = [
        ("exact matches", 'exact_match', exact_match_df),
        ("non-matches", 'no_match', no_match_df),
        ("all cases", 'all_cases', all_df),
    ]
    for label, condition_name, condition_df in conditions:
        print(f"\nAnalyzing {label} ({len(condition_df)} samples)...")
        if condition_df.empty:
            # E.g. no model produced a single exact match.
            print(f"Skipping '{condition_name}': no samples to analyze.")
            continue
        create_analysis_for_condition(condition_df, condition_name, output_dir)

    total_count = len(all_df)
    comparison_summary = pd.DataFrame({
        'Exact Match Count': len(exact_match_df),
        'No Match Count': len(no_match_df),
        'Total Count': total_count,
        # Ratios are undefined for an empty results file.
        'Exact Match Ratio': len(exact_match_df) / total_count if total_count else float('nan'),
        'No Match Ratio': len(no_match_df) / total_count if total_count else float('nan')
    }, index=['Summary'])

    comparison_summary.to_csv(os.path.join(output_dir, 'match_distribution_summary.csv'))

    print(f"\nAnalysis complete. Results saved in '{output_dir}' directory.")

# Run the analysis when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()

Loading data...

Analyzing exact matches (30 samples)...
Error performing Kruskal-Wallis and Dunn's test for f1_score: All numbers are identical in kruskal

Analyzing non-matches (770 samples)...

Analyzing all cases (800 samples)...

Analysis complete. Results saved in 'qa_analysis_results' directory.
