In [31]:
import pandas as pd
import scipy.stats as stats

# Define function to normalize column names
def normalize_columns(df):
    """Return ``df`` with known metric-column spelling variants renamed.

    Headers that differ from the canonical spelling only in the casing of
    ``performance`` or in using ``F1 Score`` instead of ``F1_Score`` are
    mapped to the canonical ``<Metric>_<Class>`` form. Any column that is
    not listed below (including ones already spelled canonically) keeps its
    name.

    Args:
        df: DataFrame whose column labels should be normalized.

    Returns:
        The DataFrame produced by ``df.rename``; ``df`` itself is not renamed
        in place.
    """
    canonical_names = {
        'Precision_Non_performance': 'Precision_Non_Performance',
        'Recall_Non_performance': 'Recall_Non_Performance',
        'F1 Score_Non_Performance': 'F1_Score_Non_Performance',
        'F1 Score_Performance': 'F1_Score_Performance',
    }
    # A dict mapper only touches labels it contains, so unknown columns pass
    # through unchanged.
    return df.rename(columns=canonical_names)

# Define function to check and fill missing columns
def check_and_fill_columns(df, expected_columns):
    """Ensure every expected column exists in ``df``, adding NaN columns as needed.

    For each name in ``expected_columns`` that is absent from ``df.columns``,
    a warning is printed and a new column filled with NaN is appended.

    Args:
        df: DataFrame to check. It is modified in place when columns are
            missing.
        expected_columns: Iterable of column names that must be present.

    Returns:
        The same ``df`` object that was passed in.
    """
    for column in expected_columns:
        if column in df.columns:
            continue
        print(f"Warning: Column '{column}' is missing in the dataset. Filling with NaNs.")
        # Scalar assignment broadcasts NaN to every row of the new column.
        df[column] = float('nan')
    return df

# Define function to perform paired t-test
def perform_paired_ttest(model1_metrics, model2_metrics):
    """Run paired t-tests on the precision and recall columns of two models.

    ``scipy.stats.ttest_rel`` is applied to the same-named column of both
    inputs for each of the four metrics listed below.

    Args:
        model1_metrics: Mapping (e.g. a DataFrame) from metric name to the
            first model's samples.
        model2_metrics: Mapping from metric name to the second model's
            samples, paired with those of ``model1_metrics``.

    Returns:
        A dict with the keys ``'performance'`` and ``'non_performance'``.
        Each maps a metric name to ``{'t_stat': ..., 'p_value': ...}`` as
        returned by ``ttest_rel``.
    """
    metrics_by_class = {
        'performance': ('Precision_Performance', 'Recall_Performance'),
        'non_performance': ('Precision_Non_Performance', 'Recall_Non_Performance'),
    }

    results = {}
    for class_key, metric_names in metrics_by_class.items():
        class_results = {}
        for metric in metric_names:
            statistic, pvalue = stats.ttest_rel(
                model1_metrics[metric], model2_metrics[metric]
            )
            class_results[metric] = {'t_stat': statistic, 'p_value': pvalue}
        results[class_key] = class_results
    return results

# Per-model result files, keyed by the model name used in the report
file_paths = {
    "CodeBert_KD_M": "CodeBert_KD_M.csv",
    "CodeBert_KD_MD": "CodeBert_KD_MD.csv",
    "Mistral_AWQ_KD_MD": "Mistral_AWQ_KD_MD.csv",
    "Roberta_HS_M": "Roberta_HS_M.csv",
}

# Columns the paired t-tests read from every dataset
expected_columns = [
    'Precision_Performance',
    'Recall_Performance',
    'Precision_Non_Performance',
    'Recall_Non_Performance',
]

# Read each CSV, normalize its headers, then make sure all expected columns exist
dataframes = {}
for name, path in file_paths.items():
    df = (
        pd.read_csv(path)
        .pipe(normalize_columns)
        .pipe(check_and_fill_columns, expected_columns)
    )
    dataframes[name] = df

# Run the paired t-tests for every unordered pair of models, in load order
results = {}
model_names = list(dataframes)

for i, model1_name in enumerate(model_names):
    # Only pair with models that come later, so each pair is tested once.
    for model2_name in model_names[i + 1:]:
        pair_key = f"{model1_name} vs {model2_name}"
        print(f"Running T-Test: {pair_key}")
        results[pair_key] = perform_paired_ttest(
            dataframes[model1_name], dataframes[model2_name]
        )

# Print the t-statistic and p-value of every metric, grouped by comparison and class
section_titles = (
    ("Performance Class Metrics:", 'performance'),
    ("Non-Performance Class Metrics:", 'non_performance'),
)
for pair, metrics in results.items():
    print(f"\nComparison: {pair}")
    for title, class_key in section_titles:
        print(title)
        for metric, values in metrics[class_key].items():
            print(f"{metric} - t-statistic: {values['t_stat']:.3f}, p-value: {values['p_value']:.3f}")


Running T-Test: CodeBert_KD_M vs CodeBert_KD_MD
Running T-Test: CodeBert_KD_M vs Mistral_AWQ_KD_MD
Running T-Test: CodeBert_KD_M vs Roberta_HS_M
Running T-Test: CodeBert_KD_MD vs Mistral_AWQ_KD_MD
Running T-Test: CodeBert_KD_MD vs Roberta_HS_M
Running T-Test: Mistral_AWQ_KD_MD vs Roberta_HS_M

Comparison: CodeBert_KD_M vs CodeBert_KD_MD
Performance Class Metrics:
Precision_Performance - t-statistic: -inf, p-value: 0.000
Recall_Performance - t-statistic: -inf, p-value: 0.000
Non-Performance Class Metrics:
Precision_Non_Performance - t-statistic: -inf, p-value: 0.000
Recall_Non_Performance - t-statistic: -inf, p-value: 0.000

Comparison: CodeBert_KD_M vs Mistral_AWQ_KD_MD
Performance Class Metrics:
Precision_Performance - t-statistic: -2.971, p-value: 0.041
Recall_Performance - t-statistic: 20.156, p-value: 0.000
Non-Performance Class Metrics:
Precision_Non_Performance - t-statistic: 20.400, p-value: 0.000
Recall_Non_Performance - t-statistic: -6.485, p-value: 0.003

Comparison: CodeBert

  res = hypotest_fun_out(*samples, **kwds)
