In [1]:
import json 
from sklearn.metrics import cohen_kappa_score
from rouge_metric import get_rouge

In [None]:
## Input and output files for minerva3B-base (the active configuration).
## in_path_file: JSON list of model outputs; each row carries a 'human_annotation' score.
## out_path_file: JSON file the aggregated scores are written to.
## NOTE: the judge-result files loaded in the following cells are hard-coded to
## minerva3B-base, so they must be changed as well when switching model.
in_path_file = "../results/minerva3B-base/minerva3B-base-out_with_human_ann.json"
out_path_file = "../results/minerva3B-base/minerva3B_final_scores.json"

## Input and output files for mt5-base: uncomment the two lines below (and
## comment out the pair above) to score mt5-base instead.
# in_path_file = "../results/mt5-base/mt5-base_with_human_ann.json"
# out_path_file = "../results/mt5-base/mt5-base_final_scores.json"

In [4]:
# Load the Prometheus judge scores for minerva3B-base; they are later looked up
# by item key (prometheus_results[k]).
# The encoding is set explicitly so decoding does not depend on the platform's
# default locale encoding, matching how the Gemini judge file is opened.
with open("../results/minerva3B-base/judge_prometheus_for_minerva3B-base.json", encoding="utf-8") as f:
    prometheus_results = json.load(f)

In [5]:
# Load the model outputs from the configured input file; each row is expected
# to carry a 'human_annotation' field, which the scoring cell reads.
# The encoding is set explicitly so decoding does not depend on the platform's
# default locale encoding.
with open(in_path_file, encoding="utf-8") as f:
    model_outputs = json.load(f)

In [None]:
# Read the Gemini judge scores for minerva3B-base (a JSON object whose keys
# are iterated by the scoring cell).
with open(
    "../results/minerva3B-base/judge_gemini_for_minerva3B-base.json",
    mode="r",
    encoding="utf-8",
) as f:
    gemini_results = json.loads(f.read())

In [7]:
def _normalized_mean(scores, scale):
    """Return the arithmetic mean of ``scores`` divided by ``scale``.

    Args:
        scores: non-empty sequence of numeric scores.
        scale: divisor applied to the mean.

    Raises:
        ValueError: if ``scores`` is empty, since the mean is undefined.
    """
    if not scores:
        raise ValueError("cannot average an empty list of scores")
    return (sum(scores) / len(scores)) / scale


# Scale factor: ROUGE F-scores are multiplied by it before rounding, and every
# mean score is divided by it (the recorded human ratings range from 1 to 5).
valuation_coefficient = 5

human_annotations = [row['human_annotation'] for row in model_outputs]
print(human_annotations)

results = {'human': _normalized_mean(human_annotations, valuation_coefficient)}

rouge_results = get_rouge(in_path_file)

# Every metric is compared with the human ratings position by position, so all
# score lists must have the same length.
assert len(human_annotations) == len(rouge_results), \
    "rouge results and human annotations differ in length"
print(len(human_annotations))
print(len(gemini_results))
assert len(human_annotations) == len(gemini_results), \
    "gemini results and human annotations differ in length"

# Computing Cohen's kappa
# Discretise each ROUGE F-score into an integer class so it can be compared
# with the integer human ratings.
rouge_1 = [round(valuation_coefficient * r['rouge-1']['f']) for r in rouge_results]
rouge_2 = [round(valuation_coefficient * r['rouge-2']['f']) for r in rouge_results]
rouge_L = [round(valuation_coefficient * r['rouge-l']['f']) for r in rouge_results]

print("[Main]: computing cohen's kappa coefficient for rouge score")

score1 = cohen_kappa_score(rouge_1, human_annotations)
score2 = cohen_kappa_score(rouge_2, human_annotations)
scoreL = cohen_kappa_score(rouge_L, human_annotations)
# NOTE(review): these means are taken over the rounded classes, not over the
# raw F-scores -- confirm that this is intended.
results['score_rouge1'] = _normalized_mean(rouge_1, valuation_coefficient)
results['score_rouge2'] = _normalized_mean(rouge_2, valuation_coefficient)
results['score_rougeL'] = _normalized_mean(rouge_L, valuation_coefficient)
results['cohen_rouge1'] = score1
results['cohen_rouge2'] = score2
results['cohen_rougeL'] = scoreL


print("[Main]: computing cohen's coefficient for llm score")
keys = list(gemini_results.keys())

# Fail early with a descriptive message if the Prometheus judge did not score
# every item that the Gemini judge scored.
missing_keys = [k for k in keys if k not in prometheus_results]
if missing_keys:
    raise KeyError(
        "prometheus results lack {} item(s) scored by gemini, e.g. {!r}".format(
            len(missing_keys), missing_keys[0]
        )
    )

# NOTE(review): judge scores are taken in the key order of gemini_results and
# compared position by position with human_annotations (the row order of
# model_outputs). This assumes both files list the items in the same order --
# TODO confirm.
llm = [gemini_results[k] for k in keys]
prometheus = [prometheus_results[k] for k in keys]

score_llm = cohen_kappa_score(llm, human_annotations)
score_prometheus = cohen_kappa_score(prometheus, human_annotations)
results['cohen_llm'] = score_llm
results['cohen_prometheus'] = score_prometheus
results['score_llm'] = _normalized_mean(llm, valuation_coefficient)
results['score_prometheus'] = _normalized_mean(prometheus, valuation_coefficient)

print("[Main]: finished computed scores", results)
print("[Main]: writing in output the results")
with open(out_path_file, 'w', encoding='utf-8') as out:
    json.dump(results, out, indent=4)



[3, 4, 5, 2, 5, 3, 2, 4, 5, 5, 2, 1, 3, 3, 2, 3, 5, 4, 3, 3, 2, 5, 5, 4, 5, 2, 3, 5, 5, 5, 4, 5, 5, 1, 5, 5, 3, 2, 4, 2, 5, 5, 5, 5, 5, 2, 3, 5, 3, 5, 3, 3, 2, 5, 3, 2, 4, 4, 5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 5, 4, 5, 5, 2, 2, 5, 5, 5, 5, 3, 5, 5, 2, 2, 3, 2, 3, 5, 5, 2, 4, 4, 1, 5, 5, 5, 5, 2, 2, 2, 5, 2, 2, 5, 5, 5, 4, 5, 3, 4, 4, 5, 1, 5, 5, 5, 3, 5, 1, 5, 4, 4, 3, 2, 1, 3]
125
125
[Main]: computing cohen's kappa coefficient for rouge score
[Main]: computing cohen's coefficient for llm score
[Main]: finished computed scores {'human': 0.7535999999999999, 'score_rouge1': 0.8944000000000001, 'score_rouge2': 0.8160000000000001, 'score_rougeL': 0.8944000000000001, 'cohen_rouge1': 0.22418657137483544, 'cohen_rouge2': 0.18136570031435573, 'cohen_rougeL': 0.22418657137483544, 'cohen_llm': -0.04379065928730941, 'cohen_prometheus': 0.14972955253237186, 'score_llm': 0.8400000000000001, 'score_prometheus': 0.5584}
[Main]: writing in output the results


### Plots to visualize and compare the different scores.

In [9]:
# Load the file with the final scores.
# Read it back from out_path_file (set in the configuration cell) instead of a
# second hard-coded copy of the path, so that switching the configured model
# cannot leave this cell reading another model's results.
with open(out_path_file, encoding="utf-8") as f:
    final_scores = json.load(f)

In [11]:
# Print the loaded score dictionary for inspection.
print(final_scores)

{'human': 0.7535999999999999, 'score_rouge1': 0.8944000000000001, 'score_rouge2': 0.8160000000000001, 'score_rougeL': 0.8944000000000001, 'cohen_rouge1': 0.22418657137483544, 'cohen_rouge2': 0.18136570031435573, 'cohen_rougeL': 0.22418657137483544, 'cohen_llm': -0.04379065928730941, 'cohen_prometheus': 0.14972955253237186, 'score_llm': 0.8400000000000001, 'score_prometheus': 0.5584}
