In [1]:
import os
import pandas as pd
from scipy.stats import spearmanr

def compute_spearman_correlations_by_language(
    human_eval_path,
    metrics_folder,
    metrics_files,
    question_id_human="qid",
    question_id_metric="question_id_q",
    human_eval_cols=None
):
    """
    Compute Spearman correlations between human-evaluation columns and
    automatic-metric columns, separately for German (langq == 'de') and
    English (langq == 'en') rows of the human-evaluation file.

    Parameters
    ----------
    human_eval_path : str
        Path to the human-evaluation CSV. It must contain a 'langq' column
        plus the column named by `question_id_human`.
    metrics_folder : str
        Folder containing the automatic metric CSV files.
    metrics_files : dict
        Maps a CSV filename (e.g. 'bartscore_cnn_de.csv') to the list of
        columns in that file to correlate.
        Example:
            {
                "bartscore_cnn_de.csv": ["BARTScore_paper_avg", "BARTScore_paper_harm"],
                "bleu_evaluation_en.csv": ["BLEU"],
            }
    question_id_human : str, optional
        Question-ID column in the human-evaluation CSV (default "qid").
    question_id_metric : str, optional
        Question-ID column in each metric CSV (default "question_id_q").
    human_eval_cols : sequence of str, optional
        Human-evaluation columns to compare against each metric column.
        Defaults to the seven averaged rating columns listed in the body.

    Returns
    -------
    results_df : pd.DataFrame
        One row per (metric file, metric column, human column) with columns
        ["metric_file", "metric_column", "human_column", "language",
         "spearman_corr", "p_value"]. These columns are present even when
         no correlation could be computed (empty result).

    Notes
    -----
    - The language of a metric file is taken from its filename: a name
      containing '_de' is treated as German (checked first), otherwise a
      name containing '_en' is treated as English. Files matching neither
      are skipped with a printed warning.
    - Missing metric files and missing metric columns are reported via
      `print` and skipped.
    - Rows are inner-joined on the question-ID columns, then every row with
      a missing value in ANY of `human_eval_cols` or in the metric column
      is dropped (listwise deletion), so all human columns are correlated
      on the same rows for a given metric column.
    - When either series is constant (or no rows remain), Spearman's r is
      undefined: a warning is printed and NaN is recorded for both the
      coefficient and the p-value without calling scipy.
    """
    result_columns = [
        "metric_file",
        "metric_column",
        "human_column",
        "language",
        "spearman_corr",
        "p_value",
    ]

    # 1) Read the human evaluation CSV
    human_eval_df = pd.read_csv(human_eval_path)

    # 2) Split the human-eval data into DE and EN subsets
    human_eval_de = human_eval_df[human_eval_df["langq"] == "de"].copy()
    human_eval_en = human_eval_df[human_eval_df["langq"] == "en"].copy()

    # Default columns if none are specified; otherwise normalise to a list so
    # that tuples (or other sequences) can be concatenated with the metric column.
    if human_eval_cols is None:
        human_eval_cols = [
            "avg_hallucination",
            "avg_answer_acc",
            "avg_user_sat",
            "avg_coherence",
            "avg_context_qual",
            "avg_overall",
            "overall_mean"
        ]
    else:
        human_eval_cols = list(human_eval_cols)

    # Accumulate one dict per computed correlation
    correlation_records = []

    # 3) Loop over each metric file and correlate with the matching language subset
    for metric_filename, metric_cols in metrics_files.items():
        metric_path = os.path.join(metrics_folder, metric_filename)

        # Decide the language from the filename ('_de' takes precedence over '_en')
        lowered_name = metric_filename.lower()
        if "_de" in lowered_name:
            relevant_human_eval = human_eval_de
            language = "de"
        elif "_en" in lowered_name:
            relevant_human_eval = human_eval_en
            language = "en"
        else:
            print(f"Warning: '{metric_filename}' does not indicate 'de' or 'en'; skipping.")
            continue

        if not os.path.exists(metric_path):
            print(f"File not found: {metric_path}")
            continue

        metric_df = pd.read_csv(metric_path)

        # Keep only questions present in both the human and the metric data
        merged = pd.merge(
            relevant_human_eval,
            metric_df,
            how="inner",
            left_on=question_id_human,
            right_on=question_id_metric
        )

        # 4) For each metric column, compute correlation with each human_eval_col
        for metric_col in metric_cols:
            if metric_col not in merged.columns:
                print(f"Column '{metric_col}' not found in '{metric_filename}'. Skipping.")
                continue

            # Listwise deletion: drop rows missing any human column or the metric
            valid_data = merged.dropna(subset=human_eval_cols + [metric_col])
            y = valid_data[metric_col]

            for human_col in human_eval_cols:
                x = valid_data[human_col]

                if x.nunique() <= 1 or y.nunique() <= 1:
                    # Zero variance (or no data): the coefficient is undefined.
                    # Record NaN directly instead of calling scipy, which would
                    # return the same NaN but also emit its own warning.
                    print(
                        f"Warning: Constant input array encountered in file '{metric_filename}' "
                        f"(metric_col='{metric_col}', human_col='{human_col}', lang='{language}'). "
                        "Spearman's r is not defined for constant data."
                    )
                    r, pval = float("nan"), float("nan")
                else:
                    r, pval = spearmanr(x, y)

                correlation_records.append({
                    "metric_file": metric_filename,
                    "metric_column": metric_col,
                    "human_column": human_col,
                    "language": language,
                    "spearman_corr": r,
                    "p_value": pval
                })

    # 5) Build the result; passing `columns` guarantees the documented schema
    #    even when no record was produced.
    results_df = pd.DataFrame(correlation_records, columns=result_columns)
    return results_df

In [3]:
# Input locations and the output file for the unfiltered correlation table.
human_eval_csv = "../../../data/human_eval_avg.csv"
metrics_dir = "../../../data/eval"
correlation_all_csv = "../../../data/eval/correlation/correlation_splits_all.csv"

# Column sets shared by several metric files.
rouge_columns = [
    "ROUGE-1_f",
    "ROUGE-2_f",
    "ROUGE-3_f",
    "ROUGE-4_f",
    "ROUGE-L_f",
    "ROUGE-SU4_f",
    "ROUGE-W-1.2_f",
]
llm_judge_columns = [
    "hallucination_score",
    "answer_accuracy_score",
    "user_satisfaction_score",
    "coherence_clarity_fluency_score",
    "context_quality_score",
    "overall_score",
]

# (filename stem, columns to correlate). Every stem exists once per language
# as "<stem>_de.csv" and "<stem>_en.csv". The order here fixes the row order
# of the result table. The "seperate" spelling is part of the filename stems
# used here and must not be corrected.
metric_families = [
    # ROUGE
    ("rouge_evaluation", rouge_columns),
    # BLEU
    ("bleu_evaluation", ["BLEU"]),
    # BERTScore
    ("bertscore_evaluation", ["BERTScore_F1"]),
    # BARTScore
    ("bartscore_cnn", ["BARTScore_paper_avg", "BARTScore_paper_harm"]),
    ("bartscore_multi", ["BARTScore_multilang_avg", "BARTScore_multilang_harm"]),
    # BLEURT
    ("bleurt_evaluation", ["BLEURT"]),
    # LLM judge: all criteria together / each separately, without / with reference
    ("llm_judge_together_no_ref", llm_judge_columns),
    ("llm_judge_together_with_ref", llm_judge_columns),
    ("llm_judge_seperate_no_ref", llm_judge_columns),
    ("llm_judge_seperate_with_ref", llm_judge_columns),
]

# Each CSV file plus the columns to correlate (German file first, then English).
metrics_config = {
    f"{stem}_{lang}.csv": list(columns)
    for stem, columns in metric_families
    for lang in ("de", "en")
}

# Compute correlations
df_results = compute_spearman_correlations_by_language(
    human_eval_path=human_eval_csv,
    metrics_folder=metrics_dir,
    metrics_files=metrics_config,
    question_id_human="qid",
    question_id_metric="question_id_q"
)

print(df_results.shape)

# Save to CSV; create the output folder first so a fresh checkout does not
# fail with a missing-directory error.
os.makedirs(os.path.dirname(correlation_all_csv), exist_ok=True)
df_results.to_csv(correlation_all_csv, index=False)



  r, pval = spearmanr(x, y)


(532, 6)


In [39]:
# Keep, for the LLM-as-a-judge files, only the correlations between a judge
# criterion and its matching human criterion; all other metric files are kept
# unfiltered.

# Judge score column -> human columns it is allowed to be compared with.
# 'overall_score' is compared against both human "overall" columns.
llm_human_mapping = {
    "hallucination_score": ["avg_hallucination"],
    "answer_accuracy_score": ["avg_answer_acc"],
    "user_satisfaction_score": ["avg_user_sat"],
    "coherence_clarity_fluency_score": ["avg_coherence"],
    "context_quality_score": ["avg_context_qual"],
    "overall_score": ["avg_overall", "overall_mean"],
}

# Metric files produced by the LLM judge
llm_files = [
    'llm_judge_together_no_ref_de.csv',
    'llm_judge_together_no_ref_en.csv',
    'llm_judge_together_with_ref_de.csv',
    'llm_judge_together_with_ref_en.csv',
    'llm_judge_seperate_no_ref_de.csv',
    'llm_judge_seperate_no_ref_en.csv',
    'llm_judge_seperate_with_ref_de.csv',
    'llm_judge_seperate_with_ref_en.csv',
]

# Rows that come from an LLM-judge file
filtered_llm_df = df_results[df_results['metric_file'].isin(llm_files)]

# Of those, keep only rows whose (metric column, human column) pair is mapped.
# The mask is an index-aligned boolean Series so it also works on an empty frame.
pair_is_mapped = pd.Series(
    [
        human_name in llm_human_mapping.get(metric_name, [])
        for metric_name, human_name in zip(
            filtered_llm_df['metric_column'], filtered_llm_df['human_column']
        )
    ],
    index=filtered_llm_df.index,
    dtype=bool,
)
filtered_llm_df = filtered_llm_df[pair_is_mapped]

# Every remaining metric file is taken as-is
other_files = [
    name for name in df_results['metric_file'].unique() if name not in llm_files
]
filtered_other_df = df_results[df_results['metric_file'].isin(other_files)]

# LLM-judge rows first, then all other metrics, with a fresh 0..n-1 index
final_filtered_df = pd.concat(
    [filtered_llm_df, filtered_other_df],
    ignore_index=True,
)

# Save to CSV
final_filtered_df.to_csv("../../../data/eval/correlation/correlation_splits_filtered_llm.csv", index=False)

final_filtered_df.shape


(252, 6)