# SQuAD

In [None]:
!pip install pingouin
!pip install scikit_posthocs
!pip install krippendorff

Collecting pingouin
  Downloading pingouin-0.5.5-py3-none-any.whl.metadata (19 kB)
Collecting pandas-flavor (from pingouin)
  Downloading pandas_flavor-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Downloading pingouin-0.5.5-py3-none-any.whl (204 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.4/204.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas_flavor-0.6.0-py3-none-any.whl (7.2 kB)
Installing collected packages: pandas-flavor, pingouin
Successfully installed pandas-flavor-0.6.0 pingouin-0.5.5
Collecting scikit_posthocs
  Downloading scikit_posthocs-0.9.1-py3-none-any.whl.metadata (5.8 kB)
Downloading scikit_posthocs-0.9.1-py3-none-any.whl (32 kB)
Installing collected packages: scikit_posthocs
Successfully installed scikit_posthocs-0.9.1
Collecting krippendorff
  Downloading krippendorff-0.8.0-py3-none-any.whl.metadata (2.8 kB)
Downloading krippendorff-0.8.0-py3-none-any.whl (18 kB)
Installing collected packages: krippendorff
Successfully ins

In [None]:
import pandas as pd
import re

def process_csv(eval_file, gen_file):
    """Pair one evaluation CSV with its generation CSV, row by row.

    Rows are matched by position (row i of one file with row i of the other).
    When the two files differ in length only the first ``min(len)`` rows of
    each are used, and a notice is printed so the mismatch is not silent.

    Args:
        eval_file: Path or file-like object of the evaluation CSV.
        gen_file: Path or file-like object of the generation CSV. It must
            contain the columns 'question', 'context', 'reference_answers',
            'generated_answer', 'model_name', 'norm_type' and 'variant'.

    Returns:
        pandas.DataFrame holding all columns of the evaluation file followed
        by the seven columns listed above taken from the generation file.

    Raises:
        KeyError: If the generation file lacks one of the required columns.
    """
    answer_columns = ['question', 'context', 'reference_answers', 'generated_answer']
    model_columns = ['model_name', 'norm_type', 'variant']

    eval_df = pd.read_csv(eval_file)
    gen_df = pd.read_csv(gen_file)

    # Rows are paired by position, so only the common prefix is usable.
    # Report a length mismatch instead of discarding the surplus rows quietly.
    min_rows = min(len(eval_df), len(gen_df))
    if len(eval_df) != len(gen_df):
        print(f"Row count mismatch: {eval_file} has {len(eval_df)} rows, "
              f"{gen_file} has {len(gen_df)} rows; keeping the first {min_rows} of each.")
    eval_df = eval_df.iloc[:min_rows]
    gen_df = gen_df.iloc[:min_rows]

    # Index-on-index merge: both frames carry the same positional index here.
    merged_df = pd.merge(eval_df, gen_df[answer_columns], left_index=True, right_index=True)

    # Model metadata is copied over column by column (aligned on the index).
    for column in model_columns:
        merged_df[column] = gen_df[column]

    return merged_df

def main(max_answer_chars=1000):
    """Combine all per-model evaluation/generation CSV pairs into one CSV.

    Each model prefix yields one evaluation file and one generation file;
    every pair is merged with ``process_csv`` and the results are stacked,
    reordered, truncated and written to 'combined_data_truncated_squad.csv'
    in the current working directory.

    Args:
        max_answer_chars: Maximum number of characters kept from each
            'generated_answer' value. Defaults to 1000.

    Raises:
        KeyError: If a merged frame lacks one of the expected output columns.
    """
    # Single source of truth for the model configurations: both file names are
    # derived from the same prefix, so the two lists cannot get out of step
    # (zip() would otherwise silently pair or drop the wrong files).
    model_prefixes = [
        'LN_AttnOnly', 'LN_FFNonly', 'LN_baseModel', 'LN_noNorm',
        'RMSN_AttnOnly', 'RMSN_FFNonly', 'RMSN_baseModel', 'RMSN_noNorm',
    ]
    file_pairs = [
        (f'{prefix}_gpt4_qa_parsed_evaluations.csv',
         f'{prefix}_gpt4_evaluation_data_modified.csv')
        for prefix in model_prefixes
    ]

    all_data = pd.concat(
        [process_csv(eval_file, gen_file) for eval_file, gen_file in file_pairs],
        ignore_index=True,
    )

    columns_order = ['model_name', 'norm_type', 'variant', 'question', 'context', 'reference_answers', 'generated_answer',
                     'Overall Score', 'Correctness Score', 'Completeness Score', 'Relevance Score',
                     'Fluency Score', 'Conciseness Score', 'Overall Feedback', 'Comments on Columns']

    # Work on an explicit copy so the truncation below modifies an independent frame.
    all_data = all_data[columns_order].copy()

    # Truncate generated_answer to at most max_answer_chars characters
    all_data['generated_answer'] = all_data['generated_answer'].str[:max_answer_chars]

    # Save the entire dataframe to a single CSV file
    output_file = 'combined_data_truncated_squad.csv'
    all_data.to_csv(output_file, index=False)
    print(f"All data has been saved to '{output_file}' with generated answer truncated to {max_answer_chars} characters.")

    print(f"The file contains {len(all_data)} rows and {len(all_data.columns)} columns.")
    print("Columns:", ', '.join(all_data.columns))

if __name__ == "__main__":
    main()

All data has been saved to 'combined_data_truncated_squad.csv' with generated answer truncated to 1000 characters.
The file contains 200 rows and 15 columns.
Columns: model_name, norm_type, variant, question, context, reference_answers, generated_answer, Overall Score, Correctness Score, Completeness Score, Relevance Score, Fluency Score, Conciseness Score, Overall Feedback, Comments on Columns


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

np.random.seed(42)

def load_data(file_path):
    """Read a CSV into a DataFrame and print its row count and column names.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The loaded pandas.DataFrame.
    """
    frame = pd.read_csv(file_path)
    row_count = len(frame)
    column_listing = ', '.join(frame.columns)
    print(f"Loaded {row_count} rows of data")
    print(f"Columns: {column_listing}")
    return frame

def calculate_statistics(df):
    """Summarise the six score columns per (model_name, norm_type, variant).

    Args:
        df: DataFrame containing the three grouping columns and the six
            score columns named below.

    Returns:
        DataFrame indexed by the grouping keys, with a two-level column
        index of (score column, statistic) for mean, std, min and max.
    """
    group_keys = ['model_name', 'norm_type', 'variant']
    score_columns = ['Overall Score', 'Correctness Score', 'Completeness Score',
                     'Relevance Score', 'Fluency Score', 'Conciseness Score']
    summary_functions = ['mean', 'std', 'min', 'max']

    grouped_scores = df.groupby(group_keys)[score_columns]
    return grouped_scores.agg(summary_functions)

def perform_anova(df, score_column):
    """Run a one-way ANOVA of `score_column` across the `model_name` groups.

    Args:
        df: DataFrame with a 'model_name' column and the score column.
        score_column: Name of the column whose values are compared.

    Returns:
        Tuple ``(f_value, p_value)`` as returned by ``scipy.stats.f_oneway``.
    """
    model_labels = df['model_name']

    # One sample per distinct model, in order of first appearance.
    samples = []
    for model in model_labels.unique():
        samples.append(df.loc[model_labels == model, score_column])

    anova_result = stats.f_oneway(*samples)
    f_value, p_value = anova_result
    return f_value, p_value

def create_collated_boxplots(df, output_dir):
    """Draw a 2x3 grid of box plots (one per score) and save it as a PNG.

    Each panel shows one score column by 'model_name', split by 'norm_type'.
    The figure is written to ``<output_dir>/squad_collated_boxplots.png``.

    Args:
        df: DataFrame with 'model_name', 'norm_type' and the six score columns.
        output_dir: Directory the PNG is written to.
    """
    score_columns = ['Overall Score', 'Correctness Score', 'Completeness Score',
                     'Relevance Score', 'Fluency Score', 'Conciseness Score']
    legend_panel = 3  # the third panel (end of the first row) carries the only populated legend

    plt.figure(figsize=(20, 15))
    for position in range(1, len(score_columns) + 1):
        score = score_columns[position - 1]
        plt.subplot(2, 3, position)
        sns.boxplot(x='model_name', y=score, hue='norm_type', data=df)
        plt.title(score)
        plt.xticks(rotation=45, ha='right')
        plt.xlabel('')

        # Only the first panel of each row keeps its y-axis label.
        starts_row = position % 3 == 1
        if not starts_row:
            plt.ylabel('')

        if position != legend_panel:
            plt.legend([])
        else:
            plt.legend(title='Norm Type', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    target_path = os.path.join(output_dir, 'squad_collated_boxplots.png')
    plt.savefig(target_path, bbox_inches='tight')
    plt.close()

def main():
    """Run the descriptive-statistics, box-plot and ANOVA steps for SQuAD.

    Reads 'combined_data_truncated_squad.csv' and writes the descriptive
    statistics CSV, the collated box-plot PNG and the ANOVA results CSV into
    the 'analysis_results_squad' directory (created if missing).
    """
    output_dir = 'analysis_results_squad'
    os.makedirs(output_dir, exist_ok=True)

    df = load_data('combined_data_truncated_squad.csv')

    # Descriptive statistics per model configuration.
    descriptive_path = os.path.join(output_dir, 'descriptive_statistics_squad.csv')
    stats_df = calculate_statistics(df)
    stats_df.to_csv(descriptive_path)

    create_collated_boxplots(df, output_dir)

    score_columns = ['Overall Score', 'Correctness Score', 'Completeness Score',
                     'Relevance Score', 'Fluency Score', 'Conciseness Score']

    def anova_row(score):
        # One result record per score column.
        f_value, p_value = perform_anova(df, score)
        return {'Metric': score, 'F-value': f_value, 'p-value': p_value}

    anova_df = pd.DataFrame([anova_row(score) for score in score_columns])
    anova_path = os.path.join(output_dir, 'squad_anova_results.csv')
    anova_df.to_csv(anova_path, index=False)

    print("Analysis complete. Results saved in 'analysis_results_squad' directory.")

if __name__ == "__main__":
    main()

Loaded 200 rows of data
Columns: model_name, norm_type, variant, question, context, reference_answers, generated_answer, Overall Score, Correctness Score, Completeness Score, Relevance Score, Fluency Score, Conciseness Score, Overall Feedback, Comments on Columns
Analysis complete. Results saved in 'analysis_results_squad' directory.


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import kruskal
from scikit_posthocs import posthoc_dunn
import warnings
import os

# NOTE(review): this suppresses every warning category for the rest of the
# kernel session, including numerical warnings raised by the statistical tests
# below. Consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

# Create the directory that receives the CSV tables and PNG figures of this cell
output_dir = 'squad_analysis_results'
os.makedirs(output_dir, exist_ok=True)

# Load the combined per-model data
df = pd.read_csv('combined_data_truncated_squad.csv')

print("Columns in the CSV file:")
print(df.columns)

# Ensure 'model_name' is treated as a categorical variable
# (it is the grouping key for every test and plot below)
df['model_name'] = df['model_name'].astype('category')

# Score columns analysed for the SQuAD task; read by all later statements
score_columns = ['Overall Score', 'Correctness Score', 'Completeness Score', 'Relevance Score', 'Fluency Score', 'Conciseness Score']

print(f"\nUsing score columns: {score_columns}")

# 1. Effect Size Calculation
def calculate_effect_size(df, metric):
    """Return eta squared for `metric` across the `model_name` groups.

    Eta squared is the share of the metric's total variance explained by
    group membership, ``SS_between / SS_total``. Expressed through the
    one-way ANOVA F statistic this equals
    ``F * df_between / (F * df_between + df_within)`` with
    ``df_between = k - 1`` and ``df_within = N - k`` (k groups, N rows).

    Args:
        df: DataFrame with a 'model_name' column and a numeric `metric` column.
        metric: Name of the score column to analyse.

    Returns:
        Eta squared as a float in [0, 1]. NaN is returned when the value is
        undefined (the metric has no variance at all, or contains NaN) or
        when an error occurs (fewer than two groups, missing or non-numeric
        column); errors are printed rather than raised.
    """
    try:
        # observed=True keeps unused categories of a categorical 'model_name'
        # from showing up as empty groups.
        samples = [group[metric].to_numpy(dtype=float)
                   for _, group in df.groupby('model_name', observed=True)]
        if len(samples) < 2:
            raise ValueError("at least two groups are required")

        pooled = np.concatenate(samples)
        grand_mean = pooled.mean()
        ss_total = ((pooled - grand_mean) ** 2).sum()
        if ss_total == 0:
            # Every value is identical: the ratio is 0/0 and therefore undefined.
            return np.nan

        ss_between = sum(len(sample) * (sample.mean() - grand_mean) ** 2
                         for sample in samples)
        return ss_between / ss_total
    except Exception as e:
        print(f"Error calculating effect size for {metric}: {e}")
        return np.nan

# One effect size per score column, keyed by the column name
effect_sizes = {metric: calculate_effect_size(df, metric) for metric in score_columns}
# Metrics become the row index; the single value column is named 'Effect Size'
effect_sizes_df = pd.DataFrame.from_dict(effect_sizes, orient='index', columns=['Effect Size'])
effect_sizes_df.to_csv(os.path.join(output_dir, 'effect_sizes.csv'))

# 2. Post-hoc Tests
def perform_tukey_hsd(df, metric):
    """Run Tukey's HSD test on `metric` between the `model_name` groups.

    Args:
        df: DataFrame with a 'model_name' column and the `metric` column.
        metric: Name of the score column to compare.

    Returns:
        DataFrame built from the statsmodels summary table: the table's first
        row supplies the column names, the remaining rows the data. An empty
        DataFrame is returned if the test fails; the error is printed.
    """
    try:
        tukey = pairwise_tukeyhsd(df[metric], df['model_name'])
        # summary() is the public accessor for the results table, so the
        # private `_results_table` attribute does not have to be touched.
        header, *rows = tukey.summary().data
        return pd.DataFrame(data=rows, columns=header)
    except Exception as e:
        print(f"Error performing Tukey HSD for {metric}: {e}")
        return pd.DataFrame()

# One Tukey HSD table per score column. perform_tukey_hsd returns an empty
# DataFrame on failure, in which case an empty CSV is written for that metric.
tukey_results = {metric: perform_tukey_hsd(df, metric) for metric in score_columns}
for metric, result in tukey_results.items():
    # Spaces in the metric name are replaced by underscores for the file name
    result.to_csv(os.path.join(output_dir, f'tukey_hsd_{metric.replace(" ", "_")}.csv'), index=False)

# 3. Correlation Analysis
# Pairwise correlation between the score columns (pandas default method);
# the matrix is reused for the heatmap further down.
correlation_matrix = df[score_columns].corr()
correlation_matrix.to_csv(os.path.join(output_dir, 'correlation_matrix.csv'))

# 4. Principal Component Analysis
# Scores are standardised before PCA. Only the explained-variance ratios are
# saved; pca_result (the projected data) is not used in the visible code.
scaler = StandardScaler()
pca = PCA()
pca_result = pca.fit_transform(scaler.fit_transform(df[score_columns]))
pca_df = pd.DataFrame({'Explained Variance Ratio': pca.explained_variance_ratio_})
pca_df.to_csv(os.path.join(output_dir, 'pca_results.csv'), index=False)

# 5. Non-parametric Tests
def perform_kruskal_dunn(df, metric):
    """Run a Kruskal-Wallis test plus Dunn's post-hoc test for one metric.

    The groups are the rows of `df` split by 'model_name'. Diagnostic
    information about the test and the groups is printed along the way.

    Args:
        df: DataFrame with a 'model_name' column and the `metric` column.
        metric: Name of the score column to test.

    Returns:
        Tuple ``(kruskal_df, dunn_result)``: a one-row DataFrame with the
        Kruskal-Wallis statistic and p-value, and the Bonferroni-adjusted
        result of ``posthoc_dunn``. ``(None, None)`` if anything fails; the
        error is printed.
    """
    try:
        groups = []
        for _, members in df.groupby('model_name'):
            groups.append(members[metric].values)

        kruskal_result = kruskal(*groups)
        dunn_result = posthoc_dunn(df, val_col=metric, group_col='model_name', p_adjust='bonferroni')
        kruskal_df = pd.DataFrame({
            'statistic': [kruskal_result.statistic],
            'p-value': [kruskal_result.pvalue],
        })

        # Diagnostic information
        print(f"\nDiagnostic information for {metric}:")
        print(f"Kruskal-Wallis p-value: {kruskal_result.pvalue}")
        print(f"Number of groups: {len(groups)}")
        sample_sizes = [len(sample) for sample in groups]
        print(f"Sample sizes: {sample_sizes}")
        group_means = [sample.mean() for sample in groups]
        print(f"Group means: {group_means}")
        # Share of cells in the Dunn matrix that are exactly 1
        ones_percentage = (dunn_result == 1).sum().sum() / dunn_result.size * 100
        print(f"Percentage of 1s in Dunn's test: {ones_percentage:.2f}%")

        return kruskal_df, dunn_result
    except Exception as e:
        print(f"Error performing Kruskal-Wallis and Dunn's test for {metric}: {e}")
        return None, None

# Run Kruskal-Wallis + Dunn for every score column and save the results.
# perform_kruskal_dunn returns (None, None) on failure, hence the None checks.
kruskal_dunn_results = {metric: perform_kruskal_dunn(df, metric) for metric in score_columns}
for metric, (kruskal_result, dunn_result) in kruskal_dunn_results.items():
    if kruskal_result is not None:
        kruskal_result.to_csv(os.path.join(output_dir, f'kruskal_wallis_{metric.replace(" ", "_")}.csv'), index=False)
    if dunn_result is not None:
        # Written with its index (no index=False), unlike the Kruskal-Wallis table
        dunn_result.to_csv(os.path.join(output_dir, f'dunn_test_{metric.replace(" ", "_")}.csv'))

# 6. Visualizations
# Correlation heatmap of the score columns, annotated with two-decimal values
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of SQuAD Metrics')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'correlation_heatmap_squad_metrics.png'))
plt.close()

# Violin Plots: one panel per score column on a 2x3 grid
plt.figure(figsize=(20, 15))
for i, metric in enumerate(score_columns, 1):
    plt.subplot(2, 3, i)
    sns.violinplot(x='model_name', y=metric, data=df)
    plt.title(metric)
    plt.xticks(rotation=45, ha='right')
    plt.xlabel('')
    # Keep the y-axis label only on the first panel of each row
    if i % 3 != 1:
        plt.ylabel('')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'violin_plots_squad_metrics.png'), bbox_inches='tight')
plt.close()

# Radar Chart
def radar_chart(df, metrics):
    """Plot the per-model mean of each metric as a radar chart and save it.

    The figure is written to 'radar_chart_squad_metrics.png' inside the
    notebook-level ``output_dir``.

    Args:
        df: DataFrame with a 'model_name' column and the metric columns.
        metrics: List of metric column names; one spoke is drawn per metric.
    """
    group_means = df.groupby('model_name')[metrics].mean()
    spoke_angles = np.linspace(0, 2*np.pi, len(metrics), endpoint=False)

    # Repeat the first metric and the first angle at the end so that every
    # model's outline closes on itself.
    closed_means = pd.concat([group_means, group_means.iloc[:, :1]], axis=1)
    closed_angles = np.concatenate((spoke_angles, [spoke_angles[0]]))

    fig, ax = plt.subplots(figsize=(14, 10), subplot_kw={'projection': 'polar'})
    for model, row in closed_means.iterrows():
        values = row.values
        ax.plot(closed_angles, values, 'o-', linewidth=2, label=model)
        ax.fill(closed_angles, values, alpha=0.25)

    # Spoke labels: angles in degrees, without the duplicated closing angle
    ax.set_thetagrids(closed_angles[:-1] * 180/np.pi, metrics)
    ax.set_ylim(0, 5)  # Assuming scores are between 0 and 5
    plt.legend(loc='center left', bbox_to_anchor=(1.1, 0.5))
    plt.title("Model Performance Across SQuAD Metrics")
    plt.tight_layout()
    chart_path = os.path.join(output_dir, 'radar_chart_squad_metrics.png')
    plt.savefig(chart_path, bbox_inches='tight')
    plt.close()

# Radar chart of the per-model mean scores
radar_chart(df, score_columns)

# Distribution of Scores: one KDE curve per model in each score panel
plt.figure(figsize=(20, 15))
for i, metric in enumerate(score_columns, 1):
    plt.subplot(2, 3, i)
    for model in df['model_name'].unique():
        sns.kdeplot(data=df[df['model_name'] == model], x=metric, label=model)
    plt.title(f'Distribution of {metric}')
    plt.xlabel('Score')
    plt.ylabel('Density')
    if i == 3:  # Place legend outside the plots
        plt.legend(title='Model Name', bbox_to_anchor=(1.05, 1), loc='upper left')
    else:
        # NOTE(review): passes an empty label list; confirm this hides the
        # legend rather than drawing an empty legend box
        plt.legend([])  # Remove individual legends
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'score_distributions_squad_metrics.png'), bbox_inches='tight')
plt.close()

# Boxplots: one panel per score column on a 2x3 grid
plt.figure(figsize=(20, 15))
for i, metric in enumerate(score_columns, 1):
    plt.subplot(2, 3, i)
    sns.boxplot(x='model_name', y=metric, data=df)
    plt.title(metric)
    plt.xticks(rotation=45, ha='right')
    plt.xlabel('')
    # Keep the y-axis label only on the first panel of each row
    if i % 3 != 1:
        plt.ylabel('')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'boxplots_squad_metrics.png'), bbox_inches='tight')
plt.close()

# 7. Basic Descriptive Statistics
# mean / std / min / max of every score column per model
descriptive_stats = df.groupby('model_name')[score_columns].agg(['mean', 'std', 'min', 'max'])
descriptive_stats.to_csv(os.path.join(output_dir, 'descriptive_statistics.csv'))

print(f"\nAnalysis complete. Results saved in CSV files in the '{output_dir}' directory.")
print("Check the generated PNG files for visualizations.")

Columns in the CSV file:
Index(['model_name', 'norm_type', 'variant', 'question', 'context',
       'reference_answers', 'generated_answer', 'Overall Score',
       'Correctness Score', 'Completeness Score', 'Relevance Score',
       'Fluency Score', 'Conciseness Score', 'Overall Feedback',
       'Comments on Columns'],
      dtype='object')

Using score columns: ['Overall Score', 'Correctness Score', 'Completeness Score', 'Relevance Score', 'Fluency Score', 'Conciseness Score']

Diagnostic information for Overall Score:
Kruskal-Wallis p-value: 0.9492019046770281
Number of groups: 8
Sample sizes: [25, 25, 25, 25, 25, 25, 25, 25]
Group means: [2.776, 2.744, 2.6879999999999997, 2.8080000000000003, 2.496, 3.08, 2.424, 2.784]
Percentage of 1s in Dunn's test: 100.00%

Diagnostic information for Correctness Score:
Kruskal-Wallis p-value: 0.9661469983815051
Number of groups: 8
Sample sizes: [25, 25, 25, 25, 25, 25, 25, 25]
Group means: [2.68, 2.64, 2.6, 2.6, 2.4, 2.92, 2.36, 2.6]
Percentage 

# Mean Calculation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

def analyze_csv(file_path):
    """Summarise the score columns of one evaluation CSV and plot them.

    Two PNG files are written to the current working directory:
    ``<model_name>_distribution.png`` (KDE of every score column) and
    ``<model_name>_mean_scores.png`` (bar chart of the mean scores), where
    ``model_name`` is the file's base name up to '_evaluations.csv'.

    Args:
        file_path: Path of the evaluation CSV; it must contain the six score
            columns listed below.

    Returns:
        DataFrame indexed by 'mean', 'median', 'min' and 'max' with one
        column per score column.
    """
    # Read CSV file
    df = pd.read_csv(file_path)

    # Extract model name from file name
    model_name = os.path.basename(file_path).split('_evaluations.csv')[0]

    # Calculate statistics (named `summary` so scipy's `stats` is not shadowed)
    score_columns = ['Correctness Score', 'Completeness Score', 'Relevance Score', 'Fluency Score', 'Conciseness Score', 'Overall Score']
    summary = df[score_columns].agg(['mean', 'median', 'min', 'max'])

    # Create distribution plot on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(12, 6))
    for column in score_columns:
        # `fill` replaces kdeplot's deprecated `shade` argument
        sns.kdeplot(df[column], fill=True, label=column, ax=ax)

    ax.set_title(f'Score Distribution for {model_name}')
    ax.set_xlabel('Score')
    ax.set_ylabel('Density')
    ax.legend()
    fig.savefig(f'{model_name}_distribution.png')
    plt.close(fig)

    # Create bar plot of the mean scores
    fig, ax = plt.subplots(figsize=(12, 6))
    summary.loc['mean'].plot(kind='bar', ax=ax)
    ax.set_title(f'Mean Scores for {model_name}')
    ax.set_xlabel('Criteria')
    ax.set_ylabel('Mean Score')
    fig.savefig(f'{model_name}_mean_scores.png')
    plt.close(fig)

    return summary

# List of CSV files
csv_files = [
    'LN_AttnOnly_gpt4_qa_parsed_evaluations.csv',
    'LN_FFNonly_gpt4_qa_parsed_evaluations.csv',
    'LN_baseModel_gpt4_qa_parsed_evaluations.csv',
    'LN_noNorm_gpt4_qa_parsed_evaluations.csv',
    'RMSN_AttnOnly_gpt4_qa_parsed_evaluations.csv',
    'RMSN_FFNonly_gpt4_qa_parsed_evaluations.csv',
    'RMSN_baseModel_gpt4_qa_parsed_evaluations.csv',
    'RMSN_noNorm_gpt4_qa_parsed_evaluations.csv'
]

# Analyze each CSV file
all_stats = {}
for file in csv_files:
    print(f"Analyzing {file}...")
    # Named `file_stats` rather than `stats` so the notebook-level
    # `from scipy import stats` import used by earlier cells is not overwritten.
    file_stats = analyze_csv(file)
    all_stats[file] = file_stats
    print(f"Statistics for {file}:")
    print(file_stats)
    print("\n")

# Create a comparative bar plot for mean scores across all models.
# DataFrame.plot opens its own figure unless it is handed an axes, so the
# axes is created explicitly and passed in; otherwise an empty extra figure
# stays open and is rendered in the cell output.
mean_scores = pd.DataFrame({model: model_stats.loc['mean'] for model, model_stats in all_stats.items()})
fig, ax = plt.subplots(figsize=(15, 8))
mean_scores.plot(kind='bar', ax=ax)
ax.set_title('Comparison of Mean Scores Across All Models')
ax.set_xlabel('Criteria')
ax.set_ylabel('Mean Score')
ax.legend(title='Models', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig('all_models_comparison.png')
plt.close(fig)

print("Analysis complete. Check the generated PNG files for visualizations.")

Analyzing LN_AttnOnly_gpt4_qa_parsed_evaluations.csv...
Statistics for LN_AttnOnly_gpt4_qa_parsed_evaluations.csv:
        Correctness Score  Completeness Score  Relevance Score  Fluency Score  \
mean                 2.68                2.48             2.68           3.04   
median               1.00                1.00             1.00           3.00   
min                  1.00                1.00             1.00           1.00   
max                  5.00                5.00             5.00           5.00   

        Conciseness Score  Overall Score  
mean                  3.0          2.776  
median                3.0          2.200  
min                   1.0          1.000  
max                   5.0          5.000  


Analyzing LN_FFNonly_gpt4_qa_parsed_evaluations.csv...
Statistics for LN_FFNonly_gpt4_qa_parsed_evaluations.csv:
        Correctness Score  Completeness Score  Relevance Score  Fluency Score  \
mean                 2.64                 2.4             2.76      

<Figure size 1500x800 with 0 Axes>