In [None]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef


# Human-evaluation sample. The columns read below are `expert_name` plus one
# `<criterion>_me` / `<criterion>_other` label pair per criterion.
# NOTE(review): presumably `_me` holds the main expert's labels and `_other`
# the labels of the expert named in `expert_name` - confirm against the sheet.
data = pd.read_excel('human_eval_sample.xlsx')

# Fixed column layout so the result frames keep these columns even when there
# are no experts to report (an empty list would otherwise give a column-less
# frame and the `.mean()` lookups below would raise KeyError).
RESULT_COLUMNS = ['pair', 'cohen_kappa', 'mcc']


def _agreement_row(frame, expert, criterion):
    """Score agreement between the two label columns of one criterion.

    Args:
        frame: Rows belonging to a single expert.
        expert: Expert name; used only to build the ``pair`` label.
        criterion: Column prefix; ``<criterion>_me`` is compared with
            ``<criterion>_other``.

    Returns:
        dict with keys ``pair``, ``cohen_kappa`` and ``mcc``.
    """
    own_labels = frame[f'{criterion}_me']
    peer_labels = frame[f'{criterion}_other']
    return {
        'pair': f'expert_main vs {expert}',
        'cohen_kappa': cohen_kappa_score(own_labels, peer_labels),
        'mcc': matthews_corrcoef(own_labels, peer_labels),
    }


# Drop missing names first: `unique()` keeps NaN, and `== NaN` matches no row,
# so a blank expert cell would otherwise feed an empty subset to the metrics.
experts = data['expert_name'].dropna().unique()

results_correctness = []
results_hallucinations = []

for expert in experts:
    df_expert = data[data['expert_name'] == expert]

    # Agreement on correctness
    results_correctness.append(_agreement_row(df_expert, expert, 'correctness'))

    # Agreement on hallucinations ('halucinations' is the column spelling in the data)
    results_hallucinations.append(_agreement_row(df_expert, expert, 'halucinations'))


results_df_correctness = pd.DataFrame(results_correctness, columns=RESULT_COLUMNS)
results_df_hallucinations = pd.DataFrame(results_hallucinations, columns=RESULT_COLUMNS)

print("Согласованность по корректности:")
print(results_df_correctness)

print("\nСогласованность по галлюцинациям:")
print(results_df_hallucinations)


# Unweighted means over experts: every expert pair counts equally, regardless
# of how many rows that expert labelled.
mean_kappa_correctness = results_df_correctness['cohen_kappa'].mean()
mean_mcc_correctness = results_df_correctness['mcc'].mean()

print(f'\nСредний Cohen\'s Kappa по корректности: {mean_kappa_correctness:.4f}')
print(f'Средний MCC по корректности: {mean_mcc_correctness:.4f}')

mean_kappa_hallucinations = results_df_hallucinations['cohen_kappa'].mean()
mean_mcc_hallucinations = results_df_hallucinations['mcc'].mean()

print(f'\nСредний Cohen\'s Kappa по галлюцинациям: {mean_kappa_hallucinations:.4f}')
print(f'Средний MCC по галлюцинациям: {mean_mcc_hallucinations:.4f}')

Согласованность по корректности:
                     pair  cohen_kappa       mcc
0     expert_main vs Катя     0.444444  0.534522
1   expert_main vs Таня к     0.523810  0.523810
2      expert_main vs Аня     1.000000  1.000000
3  expert_main vs Ксюша И     0.782609  0.801784
4  expert_main vs Ксюша С     0.615385  0.666667

Согласованность по галлюцинациям:
                     pair  cohen_kappa       mcc
0     expert_main vs Катя     1.000000  1.000000
1   expert_main vs Таня к     1.000000  1.000000
2      expert_main vs Аня     1.000000  1.000000
3  expert_main vs Ксюша И     1.000000  1.000000
4  expert_main vs Ксюша С     0.782609  0.801784

Средний Cohen's Kappa по корректности: 0.6732
Средний MCC по корректности: 0.7054

Средний Cohen's Kappa по галлюцинациям: 0.9565
Средний MCC по галлюцинациям: 0.9604


In [50]:
# Binary flag: 1 where `correctness_response_nli` equals 0, otherwise 0.
# NOTE(review): presumably 0 is the NLI "entailment" label - confirm.
nli_response_is_zero = data['correctness_response_nli'].eq(0)
data['correctness_nli'] = nli_response_is_zero.astype('int64')

In [51]:
# Agreement between the `correctness_other` labels and the NLI-derived
# `correctness_nli` flag.
human_labels = data['correctness_other']
nli_labels = data['correctness_nli']

kappa = cohen_kappa_score(human_labels, nli_labels)
mcc = matthews_corrcoef(human_labels, nli_labels)

print(f"Cohen's Kappa: {kappa:.4f}", f"MCC: {mcc:.4f}", sep="\n")

Cohen's Kappa: 0.5424
MCC: 0.5546


In [None]:
# Agreement between the `correctness_me` labels and the NLI-derived
# `correctness_nli` flag.
label_pair = (data['correctness_me'], data['correctness_nli'])

kappa = cohen_kappa_score(*label_pair)
mcc = matthews_corrcoef(*label_pair)

for metric_name, metric_value in (("Cohen's Kappa", kappa), ("MCC", mcc)):
    print(f"{metric_name}: {metric_value:.4f}")

Cohen's Kappa: 0.692
MCC: 0.693


In [None]:
# Binary flag: 1 only where `faithfulness_response_nli` equals 2, otherwise 0.
nli_response_is_two = data['faithfulness_response_nli'].eq(2)
data['faithfulness_response_nli_0'] = nli_response_is_two.astype('int64')

In [None]:
# Binary flag: 1 where `faithfulness_response_nli` is 1 or 2, otherwise 0.
nli_response_is_one_or_two = data['faithfulness_response_nli'].isin([1, 2])
data['faithfulness_response_nli_1'] = nli_response_is_one_or_two.astype('int64')

In [None]:
# Contradiction-only NLI flag (`faithfulness_response_nli_0`) compared with
# the `halucinations_me` labels.
human_labels = data['halucinations_me']
nli_flag = data['faithfulness_response_nli_0']

kappa = cohen_kappa_score(human_labels, nli_flag)
mcc = matthews_corrcoef(human_labels, nli_flag)

print("\n".join([f"Cohen's Kappa: {kappa:.4f}", f"MCC: {mcc:.4f}"]))

Cohen's Kappa: -0.081
MCC: -0.082


In [None]:
# Contradiction-only NLI flag (`faithfulness_response_nli_0`) compared with
# the `halucinations_other` labels.
label_pair = (data['halucinations_other'], data['faithfulness_response_nli_0'])

kappa = cohen_kappa_score(*label_pair)
mcc = matthews_corrcoef(*label_pair)

print("Cohen's Kappa: {:.4f}".format(kappa))
print("MCC: {:.4f}".format(mcc))

Cohen's Kappa: -0.098
MCC: -0.100


In [None]:
# Contradiction-or-neutral NLI flag (`faithfulness_response_nli_1`) compared
# with the `halucinations_me` labels.
human_labels = data['halucinations_me']
nli_flag = data['faithfulness_response_nli_1']

kappa = cohen_kappa_score(human_labels, nli_flag)
mcc = matthews_corrcoef(human_labels, nli_flag)

for metric_name, metric_value in (("Cohen's Kappa", kappa), ("MCC", mcc)):
    print(f"{metric_name}: {metric_value:.4f}")

Cohen's Kappa: 0.122
MCC: 0.161


In [None]:
# Contradiction-or-neutral NLI flag (`faithfulness_response_nli_1`) compared
# with the `halucinations_other` labels.
kappa = cohen_kappa_score(data['halucinations_other'], data['faithfulness_response_nli_1'])
mcc = matthews_corrcoef(data['halucinations_other'], data['faithfulness_response_nli_1'])

# Fixed-point `.4f` (four decimals), matching the MCC line below; a bare `.4`
# is general format with four significant digits.
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"MCC: {mcc:.4f}")

Cohen's Kappa: 0.081
MCC: 0.103
