In [1]:
import pandas as pd
from rich import print
from scipy.stats import pearsonr, zscore
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext rich

In [7]:
def load_file(file_path, model_name: str | None = None):
    if model_name is None:
        model_name = file_path.split("/")[-1].split(".")[0]
    df = pd.read_csv(file_path)
    df["model"] = model_name
    df['golden_mqm_score'] = df['golden_mqm_score'].apply(lambda x: x if x > -25.0 else -25.0)
    df["score_diff"] = df["golden_mqm_score"].astype(float) - df["llm_mqm_score"].astype(float)
    df["score_diff"] = df["score_diff"].abs()
    df = df.dropna(subset=["score_diff", "reasoning_tokens"])

    return df

In [8]:
# Read each model's run summary; `model_name` is the label stored in the
# frame's "model" column and used for grouping later on.

# DeepSeek-R1 and its distilled variants
dsr1 = load_file(
    file_path="../src/reasoning_eval/mt/DeepSeekUtils/outputs/run_summary_deepseek-r1_deepseek-r1.csv",
    model_name="deepseek-r1",
)
dsr1_q32b = load_file(
    file_path="../src/reasoning_eval/mt/DeepSeekUtils/outputs/run_summary_deepseek-r1-q32b_deepseek-r1-distill-qwen-32b.csv",
    model_name="deepseek-r1-qwen-32b",
)
dsr1_l70b = load_file(
    file_path="../src/reasoning_eval/mt/DeepSeekUtils/outputs/run_summary_deepseek-r1-llama70b_deepseek-r1-distill-llama-70b.csv",
    model_name="deepseek-r1-llama70b",
)

# o3-mini runs, one summary file per effort label (high / low / medium)
o3_mini_high = load_file(
    file_path="../src/reasoning_eval/mt/DeepSeekUtils/outputs/run_summary_o3-mini-high_o3-mini.csv",
    model_name="o3-mini-high",
)
o3_mini_low = load_file(
    file_path="../src/reasoning_eval/mt/DeepSeekUtils/outputs/run_summary_o3-mini-low_o3-mini.csv",
    model_name="o3-mini-low",
)
o3_mini_medium = load_file(
    file_path="../src/reasoning_eval/mt/DeepSeekUtils/outputs/run_summary_o3-mini-medium_o3-mini.csv",
    model_name="o3-mini-medium",
)

# Stack every run into one long frame. Each frame keeps its own row labels
# (no ignore_index), so index values can repeat across models.
joint = pd.concat(
    [
        dsr1,
        dsr1_q32b,
        dsr1_l70b,
        o3_mini_high,
        o3_mini_low,
        o3_mini_medium,
    ]
)

In [9]:
# For every model, report how the number of reasoning tokens correlates
# (Pearson r with its p-value) with three quantities:
#   1. score_diff, the absolute golden-vs-LLM gap computed in load_file,
#   2. the magnitude of the LLM's own MQM score,
#   3. the magnitude of the golden MQM score.
for model, df in joint.groupby('model'):
    banner = "=================="
    print(banner)
    print(f"{model}")
    print(banner)

    # The token counts are the x-variable of all three correlations.
    tokens = df['reasoning_tokens']

    pearson_corr, p_value = pearsonr(tokens, df['score_diff'])
    print(f"Pearson correlation between reasoning tokens and score difference: {pearson_corr:.4f}, p-value: {p_value:.4f}")

    # Absolute values are used for the raw scores, so the correlation is
    # with score magnitude rather than the signed score.
    pearson_corr, p_value = pearsonr(tokens, df['llm_mqm_score'].abs())
    print(f"Pearson correlation between reasoning tokens and LLM score: {pearson_corr:.4f}, p-value: {p_value:.4f}")

    pearson_corr, p_value = pearsonr(tokens, df['golden_mqm_score'].abs())
    print(f"Pearson correlation between reasoning tokens and golden score: {pearson_corr:.4f}, p-value: {p_value:.4f}")