In [37]:
import os
import pandas as pd
from scipy.stats import spearmanr

def compute_spearman_correlations_by_language(
    human_eval_path,
    metrics_folder,
    metrics_files,
    question_id_human="qid",
    question_id_metric="question_id_q",
    human_eval_cols=None
):
    """
    Computes Spearman correlations between human evaluation columns and 
    multiple automatic metric CSVs, separately for German (langq='de') and 
    English (langq='en') question-answer pairs.

    Parameters
    ----------
    human_eval_path : str
        Path to the 'human_eval.csv' file.
    metrics_folder : str
        Folder containing your automatic metric CSV files.
    metrics_files : dict
        A dict that maps a CSV filename (e.g., 'bartscore_de_cnn.csv') 
        to a list of columns in that file to correlate.
        Example:
            {
                "bartscore_de_cnn.csv": ["BARTScore_paper_avg", "BARTScore_paper_harm"],
                "bleu_evaluation_en.csv": ["BLEU"],
                ...
            }
    question_id_human : str, optional
        The column name in the human eval CSV used to identify the question ID
        (default "qid").
    question_id_metric : str, optional
        The column name in the metric CSV used to identify the question ID
        (default "question_id_q").
    human_eval_cols : list of str, optional
        Which human-eval columns to compare against each metric.
        Defaults to the columns in your provided CSV example.

    Returns
    -------
    results_df : pd.DataFrame
        A DataFrame with columns:
        ["metric_file", "metric_column", "human_column", "language", 
         "spearman_corr", "p_value"]

    Notes
    -----
    - The function automatically splits the human-eval data into two subsets:
      one for 'langq' == 'de' and one for 'langq' == 'en'.
    - It detects whether a metric file is meant for German or English by
      checking if the filename contains '_de' or '_en'.
    - Merges are done on question ID columns. Only rows present in both
      data sets are used in the correlation.
    """
    # 1) Read the human evaluation CSV
    human_eval_df = pd.read_csv(human_eval_path)

    # 2) Split the human-eval data into DE and EN subsets
    human_eval_de = human_eval_df[human_eval_df["langq"] == "de"].copy()
    human_eval_en = human_eval_df[human_eval_df["langq"] == "en"].copy()

    # Default columns if none are specified
    if human_eval_cols is None:
        human_eval_cols = [
            "avg_hallucination",
            "avg_answer_acc",
            "avg_user_sat",
            "avg_coherence",
            "avg_context_qual",
            "avg_overall",
            "overall_mean"
        ]
    
    # Prepare a list to accumulate correlation records
    correlation_records = []

    # 3) Loop over each metric file and correlate with the matching language subset
    for metric_filename, metric_cols in metrics_files.items():
        metric_path = os.path.join(metrics_folder, metric_filename)
        
        # Check if it's a DE file or an EN file
        if "_de" in metric_filename.lower():
            relevant_human_eval = human_eval_de
            language = "de"
        elif "_en" in metric_filename.lower():
            relevant_human_eval = human_eval_en
            language = "en"
        else:
            # If neither _de nor _en is found, skip or raise a warning
            print(f"Warning: '{metric_filename}' does not indicate 'de' or 'en'; skipping.")
            continue
        
        # Read the metric CSV
        if not os.path.exists(metric_path):
            print(f"File not found: {metric_path}")
            continue
        
        metric_df = pd.read_csv(metric_path)
        
        # Merge on the question IDs
        merged = pd.merge(
            relevant_human_eval,
            metric_df,
            how="inner",
            left_on=question_id_human,
            right_on=question_id_metric
        )

        # 4) For each metric column, compute correlation with each human_eval_col
        for metric_col in metric_cols:
            if metric_col not in merged.columns:
                print(f"Column '{metric_col}' not found in '{metric_filename}'. Skipping.")
                continue

            # Drop rows with missing data to avoid correlation errors
            
            valid_data = merged.dropna(subset=human_eval_cols + [metric_col])

            for human_col in human_eval_cols:
                x = valid_data[human_col]
                y = valid_data[metric_col]

                # Check if x or y is constant (zero variance)
                if x.nunique() <= 1 or y.nunique() <= 1:
                    print(
                        f"Warning: Constant input array encountered in file '{metric_filename}' "
                        f"(metric_col='{metric_col}', human_col='{human_col}', lang='{language}'). "
                        "Spearman's r is not defined for constant data."
                    )
                    # We can still compute it for completeness (will be NaN),
                    # or skip. Let's compute and store the results (NaN).
                    r, pval = spearmanr(x, y)
                else:
                    # Regular correlation
                    r, pval = spearmanr(x, y)
                
                correlation_records.append({
                    "metric_file": metric_filename,
                    "metric_column": metric_col,
                    "human_column": human_col,
                    "language": language,
                    "spearman_corr": r,
                    "p_value": pval
                })

    # 5) Convert list of records to a DataFrame
    results_df = pd.DataFrame(correlation_records)
    #results_df = results_df.sort_values("spearman_corr", ascending=False)
    return results_df

In [38]:
human_eval_csv = "../../../data//human_eval_avg.csv"
metrics_dir = "../../../data/eval"

# Each CSV file plus the columns you want to correlate
metrics_config = {
    # ROUGE
    "rouge_evaluation_de.csv": [
        "ROUGE-1_f",
        "ROUGE-2_f",
        "ROUGE-3_f",
        "ROUGE-4_f",
        "ROUGE-L_f",
        "ROUGE-SU4_f",
        "ROUGE-W-1.2_f"],
    "rouge_evaluation_en.csv": [
        "ROUGE-1_f",
        "ROUGE-2_f",
        "ROUGE-3_f",
        "ROUGE-4_f",
        "ROUGE-L_f",
        "ROUGE-SU4_f",
        "ROUGE-W-1.2_f"],
    #BLEU
    "bleu_evaluation_de.csv": [
        "BLEU"
    ],
    "bleu_evaluation_en.csv": [
        "BLEU"
    ],
    # BERTScore
    "bertscore_evaluation_de.csv": [
        "BERTScore_F1",
    ],
    "bertscore_evaluation_en.csv": [
        "BERTScore_F1",
    ],
    # BARTScore
    "bartscore_cnn_de.csv": [
        "BARTScore_paper_avg", 
        "BARTScore_paper_harm"
    ],
    "bartscore_cnn_en.csv": [
        "BARTScore_paper_avg", 
        "BARTScore_paper_harm"
    ],
    "bartscore_multi_de.csv": [
        "BARTScore_multilang_avg", 
        "BARTScore_multilang_harm"
    ],
    "bartscore_multi_en.csv": [
        "BARTScore_multilang_avg", 
        "BARTScore_multilang_harm"
    ],
    # BLEURT
    "bleurt_evaluation_de.csv": [
        "BLEURT"
    ],
    "bleurt_evaluation_en.csv": [
        "BLEURT"
    ],
    # LLM Judge
    # together no ref
    "llm_judge_together_no_ref_de.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "overall_score"
    ],
    "llm_judge_together_no_ref_en.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "overall_score"
    ],
    # together with ref
    "llm_judge_together_with_ref_de.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "overall_score"
    ],
    "llm_judge_together_with_ref_en.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "overall_score"
    ],
    # seperate no ref
    "llm_judge_seperate_no_ref_de.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "overall_score"
    ],
    "llm_judge_seperate_no_ref_en.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "overall_score"
    ],
    # seperate with ref
    "llm_judge_seperate_with_ref_de.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "overall_score"
    ],
    "llm_judge_seperate_with_ref_en.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "overall_score"
    ],
    
}

# Compute correlations
df_results = compute_spearman_correlations_by_language(
    human_eval_path=human_eval_csv,
    metrics_folder=metrics_dir,
    metrics_files=metrics_config,
    question_id_human="qid",          
    question_id_metric="question_id_q"
)

print(df_results.shape)
# save to CSV
df_results.to_csv("../../../data/eval/correlation/correlation_splits_all.csv", index=False)



  r, pval = spearmanr(x, y)


(532, 6)


In [39]:
# Update the mapping to allow multiple mappings per LLM metric
llm_human_mapping = {
    "hallucination_score": ["avg_hallucination"],
    "answer_accuracy_score": ["avg_answer_acc"],
    "user_satisfaction_score": ["avg_user_sat"],
    "coherence_clarity_fluency_score": ["avg_coherence"],
    "context_quality_score": ["avg_context_qual"],
    "overall_score": ["avg_overall", "overall_mean"]  # Multiple mappings for overall_score
}
# Filter for LLM-as-a-judge metrics
llm_files = [
    'llm_judge_together_no_ref_de.csv',
    'llm_judge_together_no_ref_en.csv',
    'llm_judge_together_with_ref_de.csv',
    'llm_judge_together_with_ref_en.csv',
    'llm_judge_seperate_no_ref_de.csv',
    'llm_judge_seperate_no_ref_en.csv',
    'llm_judge_seperate_with_ref_de.csv',
    'llm_judge_seperate_with_ref_en.csv'
]
# Filter DataFrame for LLM-as-a-judge rows
filtered_llm_df = df_results[
    df_results['metric_file'].isin(llm_files)
]

# Apply the updated mapping for LLM-as-a-judge metrics
filtered_llm_df = filtered_llm_df[
    filtered_llm_df.apply(
        lambda row: row['human_column'] in llm_human_mapping.get(row['metric_column'], []),
        axis=1
    )
]

# Combine with the other metrics (no filtering needed for these)
other_files = [file for file in df_results['metric_file'].unique() if file not in llm_files]

filtered_other_df = df_results[
    df_results['metric_file'].isin(other_files)
]

# Concatenate the filtered LLM-as-a-judge DataFrame with other metrics
final_filtered_df = pd.concat([filtered_llm_df, filtered_other_df], ignore_index=True)

# Save to CSV
final_filtered_df.to_csv("../../../data/eval/correlation/correlation_splits_filtered_llm.csv", index=False)

final_filtered_df.shape


(252, 6)

In [7]:
# filter out llm_judge results not alligned with human eval
# Define the mapping of LLM-as-a-judge metrics to human evaluation columns
llm_human_mapping = {
    "hallucination_score": "avg_hallucination",
    "answer_accuracy_score": "avg_answer_acc",
    "user_satisfaction_score": "avg_user_sat",
    "coherence_clarity_fluency_score": "avg_coherence",
    "context_quality_score": "avg_context_qual",
    "overall_score": "avg_overall",
    "overall_score": "overall_mean" 

}

# Filter for LLM-as-a-judge metrics
llm_files = [
    'llm_judge_together_no_ref_de.csv',
    'llm_judge_together_no_ref_en.csv',
    'llm_judge_together_with_ref_de.csv',
    'llm_judge_together_with_ref_en.csv',
    'llm_judge_seperate_no_ref_de.csv',
    'llm_judge_seperate_no_ref_en.csv',
    'llm_judge_seperate_with_ref_de.csv',
    'llm_judge_seperate_with_ref_en.csv'
]

# Filter DataFrame for LLM-as-a-judge rows
filtered_llm_df = df_results[
    df_results['metric_file'].isin(llm_files)
]

# Apply the one-to-one mapping for LLM-as-a-judge metrics
filtered_llm_df = filtered_llm_df[
    filtered_llm_df.apply(
        lambda row: llm_human_mapping.get(row['metric_column']) == row['human_column'], axis=1
    )
]

# Combine with the other metrics (no filtering needed for these)
other_files = [file for file in df_results['metric_file'].unique() if file not in llm_files]

filtered_other_df = df_results[
    df_results['metric_file'].isin(other_files)
]

# Concatenate the filtered LLM-as-a-judge DataFrame with other metrics
final_filtered_df = pd.concat([filtered_llm_df, filtered_other_df], ignore_index=True)

# save to CSV
final_filtered_df.to_csv("../../../data/eval/correlation/correlation_splits_filtered_llm.csv", index=False)

final_filtered_df.shape

(244, 6)

In [11]:
# keep the best rouge metric
# Filter the DataFrame to only include ROUGE metrics
rouge_files = ['rouge_evaluation_de.csv', 'rouge_evaluation_en.csv']

# Filter for ROUGE metrics
rouge_df = final_filtered_df[final_filtered_df['metric_file'].isin(rouge_files)]

# Find the best ROUGE metric per human column and language
best_rouge_df = (
    rouge_df.loc[
        rouge_df.groupby(['metric_file', 'human_column'])['spearman_corr'].idxmax()
    ]
)

# Combine the best ROUGE metrics with the rest of the DataFrame
non_rouge_df = final_filtered_df[~final_filtered_df['metric_file'].isin(rouge_files)]
final_filtered_df_with_best_rouge = pd.concat([non_rouge_df, best_rouge_df], ignore_index=True)

# save to CSV
final_filtered_df_with_best_rouge.to_csv("../../testing/correlation_splits_filtered_best_rouge.csv", index=False)

# Display some rows for inspection
final_filtered_df_with_best_rouge.head()


Unnamed: 0,metric_file,metric_column,human_column,language,spearman_corr,p_value
0,llm_judge_together_no_ref_de.csv,hallucination_score,avg_hallucination,de,0.123084,0.494985
1,llm_judge_together_no_ref_de.csv,answer_accuracy_score,avg_answer_acc,de,0.139438,0.438981
2,llm_judge_together_no_ref_de.csv,user_satisfaction_score,avg_user_sat,de,0.457298,0.007459
3,llm_judge_together_no_ref_de.csv,coherence_clarity_fluency_score,avg_coherence,de,0.142119,0.430143
4,llm_judge_together_no_ref_de.csv,context_quality_score,avg_context_qual,de,0.245507,0.168472


In [13]:
# keep the best bartscore metric
# Filter the DataFrame to only include BARTScore metrics
bartscore_files = [
    'bartscore_cnn_de.csv', 'bartscore_cnn_en.csv',
    'bartscore_multi_de.csv', 'bartscore_multi_en.csv'
]

# Filter for BARTScore metrics
bartscore_df = final_filtered_df_with_best_rouge[
    final_filtered_df_with_best_rouge['metric_file'].isin(bartscore_files)
]

# Find the best BARTScore metric per human column and language
best_bartscore_df = (
    bartscore_df.loc[
        bartscore_df.groupby(['metric_file', 'human_column'])['spearman_corr'].idxmax()
    ]
)

# Combine the best BARTScore metrics with the rest of the DataFrame
non_bartscore_df = final_filtered_df_with_best_rouge[
    ~final_filtered_df_with_best_rouge['metric_file'].isin(bartscore_files)
]
final_filtered_df_with_best_bartscore = pd.concat([non_bartscore_df, best_bartscore_df], ignore_index=True)

# Save the updated DataFrame
final_filtered_df_with_best_bartscore.to_csv("../../testing/filtered_results_best_bartscore.csv", index=False)

# Display some rows for inspection
final_filtered_df_with_best_bartscore.head()


Unnamed: 0,metric_file,metric_column,human_column,language,spearman_corr,p_value
0,llm_judge_together_no_ref_de.csv,hallucination_score,avg_hallucination,de,0.123084,0.494985
1,llm_judge_together_no_ref_de.csv,answer_accuracy_score,avg_answer_acc,de,0.139438,0.438981
2,llm_judge_together_no_ref_de.csv,user_satisfaction_score,avg_user_sat,de,0.457298,0.007459
3,llm_judge_together_no_ref_de.csv,coherence_clarity_fluency_score,avg_coherence,de,0.142119,0.430143
4,llm_judge_together_no_ref_de.csv,context_quality_score,avg_context_qual,de,0.245507,0.168472


In [15]:
# filter for p-value < 0.05
final_filtered_df_with_best_bartscore_1 = final_filtered_df_with_best_bartscore[final_filtered_df_with_best_bartscore["p_value"] < 0.05]
final_filtered_df_with_best_bartscore_1 = final_filtered_df_with_best_bartscore_1.sort_values("spearman_corr", ascending=False)
final_filtered_df_with_best_bartscore_1

Unnamed: 0,metric_file,metric_column,human_column,language,spearman_corr,p_value
38,llm_judge_seperate_with_ref_de.csv,user_satisfaction_score,avg_user_sat,de,0.604144,0.000197
14,llm_judge_together_with_ref_de.csv,user_satisfaction_score,avg_user_sat,de,0.599864,0.000224
20,llm_judge_together_with_ref_en.csv,user_satisfaction_score,avg_user_sat,en,0.586826,0.000331
26,llm_judge_seperate_no_ref_de.csv,user_satisfaction_score,avg_user_sat,de,0.565073,0.000612
44,llm_judge_seperate_with_ref_en.csv,user_satisfaction_score,avg_user_sat,en,0.562936,0.000648
41,llm_judge_seperate_with_ref_de.csv,weighted_overall_score,avg_overall,de,0.48207,0.004499
2,llm_judge_together_no_ref_de.csv,user_satisfaction_score,avg_user_sat,de,0.457298,0.007459
40,llm_judge_seperate_with_ref_de.csv,context_quality_score,avg_context_qual,de,0.454054,0.007949
17,llm_judge_together_with_ref_de.csv,weighted_overall_score,avg_overall,de,0.447441,0.009031
109,bartscore_multi_de.csv,BARTScore_multilang_harm,avg_coherence,de,0.446717,0.009157


In [27]:
# load llm_judge_seperate_with_ref_en.csv
llm_judge_seperate_with_ref_en = pd.read_csv("../../data/eval/llm_judge_seperate_with_ref_en.csv")

# show all columns in full
pd.set_option('display.max_columns', None)
# show content of column in full
pd.set_option('display.max_colwidth', None)
llm_judge_seperate_with_ref_en.head()

Unnamed: 0,question_id_q,hallucination_score,hallucination_comment,answer_accuracy_score,answer_accuracy_comment,user_satisfaction_score,user_satisfaction_comment,coherence_clarity_fluency_score,coherence_clarity_fluency_comment,context_quality_score,context_quality_comment,weighted_overall_score,api_call_cost
0,356,4,"The system answer correctly declines to provide information outside the context of Osnabrück University, aligning with the system's task and avoiding hallucination.",3,"The system answer accurately addresses the user's question by clarifying its scope and offering relevant assistance related to Osnabrück University, but it could be more direct in requesting specific fields of study.",3,"The system answer appropriately redirects the user to focus on Osnabrück University, aligning with its task, but could be more concise.",4,"The system answer is coherent, clear, and fluently addresses the user's question by specifying its limitations and offering relevant assistance related to Osnabrück University.",4,"The absence of context does not impact the quality of the answer, as the system appropriately limits its response to the scope of Osnabrück University.",3.6,0.012745
1,153,4,"The system answer accurately lists scholarships and financial aid options available for international students at Osnabrück University, with no factual inaccuracies or hallucinations.",4,"The system answer accurately and comprehensively lists the scholarships and financial aid options available for international students at Osnabrück University, matching the reference answer in detail and relevance.",4,"The system answer is highly satisfactory as it provides a comprehensive list of scholarships and financial aid options available for international students at Osnabrück University, including relevant links and contact information, closely aligning with the reference answer.",4,"The system answer is highly coherent, clear, and well-structured, providing a comprehensive list of scholarships and financial aid options with relevant links and contact information.",4,"The system answer provides a comprehensive list of scholarships and financial aid options for international students at Osnabrück University, supported by relevant links and details, effectively utilizing the provided context.",4.0,0.083625
2,196,4,"The system answer accurately provides the semester dates for the University of Osnabrück, matching the context information without any hallucinations.",4,"The system answer accurately provides the semester dates for the University of Osnabrück, including start and end dates for semesters and classes, as well as holiday periods, matching the context and user question comprehensively.",4,"The system answer provides a detailed and accurate list of semester dates for Osnabrück University, matching the user's request and offering additional context with a link for further information, thus ensuring high user satisfaction.",4,"The system answer is highly coherent, clear, and well-structured, providing specific semester dates in a logical format and including a link for further information.",4,"The provided context is highly relevant and effectively supports the system's answer by detailing the semester dates, aligning perfectly with the user's question.",4.0,0.063363
3,92,0,"The system answer is severely hallucinated as it discusses admission requirements and facilities access, which are unrelated to the question about the physical accessibility of the university.",0,"The system answer is inaccurate as it misinterprets the question about physical accessibility of the university, focusing instead on admission requirements and facilities access, which are irrelevant to the user's query.",0,"The system answer is completely off-topic, focusing on admission requirements instead of physical accessibility, making it unhelpful and unsatisfactory.",1,"The system answer is mostly irrelevant to the question about physical accessibility, focusing instead on admission requirements and facilities access, which makes it unclear and hard to follow.",0,"The system answer focuses on admission requirements and facilities, which are irrelevant to the user's question about physical accessibility to the university. The provided context does not support the answer, as it does not address transportation or location details.",0.1,0.088695
4,9,4,"The system answer does not contain any factual claims, thus there is no hallucination present.",0,"The system answer fails to provide any information about the master's programs available at Osnabrück University for a Cognitive Science bachelor's degree, which is the user's question.",1,"The system answer is unhelpful as it fails to provide any specific information about master's programs at Osnabrück University, unlike the detailed reference answer.",1,"The system answer is clear in its intent to provide information about Osnabrück University, but it fails to directly answer the user's question, making it less coherent and useful.",0,"The system answer lacks any context or relevant information about master's programs at Osnabrück University, severely impacting the quality.",1.4,0.015157


In [17]:
import os
import pandas as pd
from scipy.stats import spearmanr, pearsonr, kendalltau

def compute_correlations_by_language_2(
    human_eval_path,
    metrics_folder,
    metrics_files,
    question_id_human="qid",
    question_id_metric="question_id_q",
    human_eval_cols=None
):
    """
    Computes Spearman, Pearson, and Kendall correlations between human 
    evaluation columns and multiple automatic metric CSVs, separately for 
    German (langq='de') and English (langq='en') question-answer pairs.

    Parameters
    ----------
    human_eval_path : str
        Path to the 'human_eval.csv' file.
    metrics_folder : str
        Folder containing your automatic metric CSV files.
    metrics_files : dict
        A dict that maps a CSV filename (e.g., 'bartscore_de_cnn.csv')
        to a list of columns in that file to correlate. Example:
            {
                "bartscore_de_cnn.csv": ["BARTScore_paper_avg", "BARTScore_paper_harm"],
                "bleu_evaluation_en.csv": ["BLEU"],
                ...
            }
        Filenames should contain "_de" or "_en" to indicate which subset 
        of human data to merge with.
    question_id_human : str, optional
        The column name in the human eval CSV used to identify the question ID
        (default "qid").
    question_id_metric : str, optional
        The column name in the metric CSV used to identify the question ID
        (default "question_id_q").
    human_eval_cols : list of str, optional
        Which human-eval columns to compare against each metric. If None, 
        defaults to the columns in your example.

    Returns
    -------
    results_df : pd.DataFrame
        A DataFrame with columns:
        [
          "metric_file", "metric_column", "human_column", "language",
          "spearman_corr",  "spearman_pval",
          "pearson_corr",   "pearson_pval",
          "kendall_corr",   "kendall_pval"
        ]

    Notes
    -----
    - The function automatically splits the human-eval data into two subsets:
      one for 'langq' == 'de' and one for 'langq' == 'en'.
    - It detects whether a metric file is meant for German or English by
      checking if "_de" or "_en" is present in the filename.
    - Merges are done on question ID columns (inner join). Only rows present 
      in both data sets are used in the correlation.
    - If either 'x' or 'y' is constant (zero variance), a warning is printed 
      and the correlations will come out as NaN.
    """

    # 1) Read the human evaluation CSV
    human_eval_df = pd.read_csv(human_eval_path)

    # 2) Split the human-eval data into DE and EN subsets
    human_eval_de = human_eval_df[human_eval_df["langq"] == "de"].copy()
    human_eval_en = human_eval_df[human_eval_df["langq"] == "en"].copy()

    # Default columns if none are specified
    if human_eval_cols is None:
        human_eval_cols = [
            "avg_hallucination",
            "avg_answer_acc",
            "avg_user_sat",
            "avg_coherence",
            "avg_context_qual",
            "avg_overall"
        ]
    
    # Prepare a list to accumulate correlation records
    correlation_records = []

    # 3) Loop over each metric file and correlate with the matching language subset
    for metric_filename, metric_cols in metrics_files.items():
        metric_path = os.path.join(metrics_folder, metric_filename)
        
        # Detect DE or EN from the filename
        if "_de" in metric_filename.lower():
            relevant_human_eval = human_eval_de
            language = "de"
        elif "_en" in metric_filename.lower():
            relevant_human_eval = human_eval_en
            language = "en"
        else:
            print(f"Warning: '{metric_filename}' does not indicate 'de' or 'en'; skipping.")
            continue
        
        # Check file existence
        if not os.path.exists(metric_path):
            print(f"File not found: {metric_path}")
            continue
        
        # Read metric CSV
        metric_df = pd.read_csv(metric_path)

        # Merge with the appropriate subset
        merged = pd.merge(
            relevant_human_eval,
            metric_df,
            how="inner",
            left_on=question_id_human,
            right_on=question_id_metric
        )

        # 4) For each metric column, compute correlations with each human_eval_col
        for metric_col in metric_cols:
            if metric_col not in merged.columns:
                print(f"Column '{metric_col}' not found in '{metric_filename}'. Skipping.")
                continue

            # Drop NaN rows
            valid_data = merged.dropna(subset=human_eval_cols + [metric_col])

            for human_col in human_eval_cols:
                x = valid_data[human_col]
                y = valid_data[metric_col]

                # Check if x or y is constant (zero variance)
                if x.nunique() <= 1 or y.nunique() <= 1:
                    print(
                        f"Warning: Constant input array encountered in file '{metric_filename}' "
                        f"(metric_col='{metric_col}', human_col='{human_col}', lang='{language}')."
                        " All correlations will be NaN."
                    )
                    # We'll store NaN for all correlation types in this case
                    r_spearman, pval_spearman = float('nan'), float('nan')
                    r_pearson,  pval_pearson  = float('nan'), float('nan')
                    r_kendall,  pval_kendall  = float('nan'), float('nan')
                else:
                    # -- SPEARMAN --
                    r_spearman, pval_spearman = spearmanr(x, y)
                    # -- PEARSON --
                    r_pearson, pval_pearson = pearsonr(x, y)
                    # -- KENDALL --
                    r_kendall, pval_kendall = kendalltau(x, y)

                correlation_records.append({
                    "metric_file": metric_filename,
                    "metric_column": metric_col,
                    "human_column": human_col,
                    "language": language,
                    "spearman_corr":  r_spearman,
                    "spearman_pval":  pval_spearman,
                    "pearson_corr":   r_pearson,
                    "pearson_pval":   pval_pearson,
                    "kendall_corr":   r_kendall,
                    "kendall_pval":   pval_kendall
                })

    # 5) Convert list of records to a DataFrame
    results_df = pd.DataFrame(correlation_records)
    results_df = results_df.sort_values("spearman_corr", ascending=False)
    return results_df

In [20]:


# Example usage (adapt to your paths and config):
human_eval_csv = "../../testing/human_eval_aggregated.csv"
metrics_dir = "../../data/eval"

# Each CSV file plus the columns you want to correlate
metrics_config = {
    # ROUGE
    "rouge_evaluation_de.csv": [
        "ROUGE-1_f",
        "ROUGE-2_f",
        "ROUGE-3_f",
        "ROUGE-4_f",
        "ROUGE-L_f",
        "ROUGE-SU4_f",
        "ROUGE-W-1.2_f"],
    "rouge_evaluation_en.csv": [
        "ROUGE-1_f",
        "ROUGE-2_f",
        "ROUGE-3_f",
        "ROUGE-4_f",
        "ROUGE-L_f",
        "ROUGE-SU4_f",
        "ROUGE-W-1.2_f"],
    #BLEU
    "bleu_evaluation_de.csv": [
        "BLEU"
    ],
    "bleu_evaluation_en.csv": [
        "BLEU"
    ],
    # BERTScore
    "bertscore_evaluation_de.csv": [
        "BERTScore_F1",
    ],
    "bertscore_evaluation_en.csv": [
        "BERTScore_F1",
    ],
    # BARTScore
    "bartscore_cnn_de.csv": [
        "BARTScore_paper_avg", 
        "BARTScore_paper_harm"
    ],
    "bartscore_cnn_en.csv": [
        "BARTScore_paper_avg", 
        "BARTScore_paper_harm"
    ],
    "bartscore_multi_de.csv": [
        "BARTScore_multilang_avg", 
        "BARTScore_multilang_harm"
    ],
    "bartscore_multi_en.csv": [
        "BARTScore_multilang_avg", 
        "BARTScore_multilang_harm"
    ],
    # BLEURT
    "bleurt_evaluation_de.csv": [
        "BLEURT"
    ],
    "bleurt_evaluation_en.csv": [
        "BLEURT"
    ],
    # LLM Judge
    # together no ref
    "llm_judge_together_no_ref_de.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "weighted_overall_score"
    ],
    "llm_judge_together_no_ref_en.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "weighted_overall_score"
    ],
    # together with ref
    "llm_judge_together_with_ref_de.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "weighted_overall_score"
    ],
    "llm_judge_together_with_ref_en.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "weighted_overall_score"
    ],
    # seperate no ref
    "llm_judge_seperate_no_ref_de.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "weighted_overall_score"
    ],
    "llm_judge_seperate_no_ref_en.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "weighted_overall_score"
    ],
    # seperate with ref
    "llm_judge_seperate_with_ref_de.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "weighted_overall_score"
    ],
    "llm_judge_seperate_with_ref_en.csv": [
        "hallucination_score",
        "answer_accuracy_score",
        "user_satisfaction_score",
        "coherence_clarity_fluency_score",
        "context_quality_score",
        "weighted_overall_score"
    ],
    
}

# Compute correlations
df_results = compute_correlations_by_language_2(
    human_eval_path=human_eval_csv,
    metrics_folder=metrics_dir,
    metrics_files=metrics_config,
    question_id_human="qid",          
    question_id_metric="question_id_q"
)

df_results.head(10)
# Optionally save to CSV
# df_results.to_csv("correlation_splits.csv", index=False)



Unnamed: 0,metric_file,metric_column,human_column,language,spearman_corr,spearman_pval,pearson_corr,pearson_pval,kendall_corr,kendall_pval
410,llm_judge_seperate_with_ref_de.csv,context_quality_score,avg_user_sat,de,0.687466,1e-05,0.686956,1e-05,0.589667,4.5e-05
272,llm_judge_together_with_ref_de.csv,weighted_overall_score,avg_user_sat,de,0.642921,5.5e-05,0.627938,9.1e-05,0.511327,0.000137
266,llm_judge_together_with_ref_de.csv,context_quality_score,avg_user_sat,de,0.635773,7e-05,0.54232,0.001113,0.537594,0.000222
416,llm_judge_seperate_with_ref_de.csv,weighted_overall_score,avg_user_sat,de,0.614935,0.00014,0.62996,8.5e-05,0.484637,0.000264
440,llm_judge_seperate_with_ref_en.csv,coherence_clarity_fluency_score,avg_user_sat,en,0.610229,0.000163,0.511205,0.002363,0.512771,0.000459
392,llm_judge_seperate_with_ref_de.csv,answer_accuracy_score,avg_user_sat,de,0.609065,0.000169,0.598206,0.000236,0.507115,0.000336
344,llm_judge_seperate_no_ref_de.csv,weighted_overall_score,avg_user_sat,de,0.604513,0.000195,0.426236,0.013384,0.468992,0.000591
398,llm_judge_seperate_with_ref_de.csv,user_satisfaction_score,avg_user_sat,de,0.604144,0.000197,0.609755,0.000165,0.507115,0.000353
248,llm_judge_together_with_ref_de.csv,answer_accuracy_score,avg_user_sat,de,0.599864,0.000224,0.601339,0.000215,0.500129,0.000401
254,llm_judge_together_with_ref_de.csv,user_satisfaction_score,avg_user_sat,de,0.599864,0.000224,0.601339,0.000215,0.500129,0.000401


In [19]:
# filter for p-value < 0.05
df_results_filtered = df_results[df_results["spearman_pval"] < 0.05]
df_results_filtered

Unnamed: 0,metric_file,metric_column,human_column,language,spearman_corr,spearman_pval,pearson_corr,pearson_pval,kendall_corr,kendall_pval
410,llm_judge_seperate_with_ref_de.csv,context_quality_score,avg_user_sat,de,0.687466,0.000010,0.686956,0.000010,0.589667,0.000045
272,llm_judge_together_with_ref_de.csv,weighted_overall_score,avg_user_sat,de,0.642921,0.000055,0.627938,0.000091,0.511327,0.000137
266,llm_judge_together_with_ref_de.csv,context_quality_score,avg_user_sat,de,0.635773,0.000070,0.542320,0.001113,0.537594,0.000222
416,llm_judge_seperate_with_ref_de.csv,weighted_overall_score,avg_user_sat,de,0.614935,0.000140,0.629960,0.000085,0.484637,0.000264
440,llm_judge_seperate_with_ref_en.csv,coherence_clarity_fluency_score,avg_user_sat,en,0.610229,0.000163,0.511205,0.002363,0.512771,0.000459
...,...,...,...,...,...,...,...,...,...,...
406,llm_judge_seperate_with_ref_de.csv,coherence_clarity_fluency_score,avg_context_qual,de,0.363394,0.037644,0.251768,0.157533,0.277132,0.056248
425,llm_judge_seperate_with_ref_en.csv,hallucination_score,avg_overall,en,0.358557,0.040459,0.375290,0.031389,0.297102,0.042409
142,bartscore_multi_de.csv,BARTScore_multilang_harm,avg_context_qual,de,0.352304,0.044348,0.328928,0.061612,0.232464,0.065548
197,llm_judge_together_no_ref_de.csv,context_quality_score,avg_overall,de,0.351275,0.045016,0.388344,0.025528,0.300960,0.045525
