In [4]:
import json
import numpy as np
from datasets import load_dataset
from scipy.stats import spearmanr
import pandas as pd

In [5]:
def average_human_scores(dataset):
    """Return the mean human score of every example in ``dataset``.

    Each entry of ``dataset['scores']`` is a list of scores, and the lists may
    have different lengths. They are right-padded with NaN to a common length
    so that they form a rectangular array; ``np.nanmean`` then averages each
    row while ignoring the padding.

    Args:
        dataset: Mapping-like object whose ``'scores'`` entry is a sequence of
            per-example score lists.

    Returns:
        1-D ``numpy.ndarray`` of floats holding one mean per example, in the
        order of the dataset. The array is empty when the dataset has no
        examples; an example without any score yields NaN.

    Raises:
        AssertionError: If the padded computation disagrees with a direct
            mean of the first example (sanity check on the padding logic).
    """
    # Read the column a single time instead of once per use.
    all_scores = [list(scores) for scores in dataset['scores']]
    if not all_scores:
        # max() below would raise on an empty sequence.
        return np.array([], dtype=float)

    max_nb_of_scores = max(len(scores) for scores in all_scores)
    padded_human_scores = [
        scores + [np.nan] * (max_nb_of_scores - len(scores))
        for scores in all_scores
    ]

    mean_human_scores = np.nanmean(np.array(padded_human_scores, dtype=float), axis=1)

    # Compare with a tolerance: the padded and unpadded sums may be accumulated
    # in a different order, and NaN (example without scores) never equals itself.
    assert np.isclose(np.mean(all_scores[0]), mean_human_scores[0], equal_nan=True)
    return mean_human_scores

In [6]:
def get_spearman_correlation(results_name, mean_human_scores, results_path='./results.json'):
    """Print and return the Spearman correlation between stored scores and human scores.

    The JSON file at ``results_path`` is loaded and the entry stored under
    ``results_name`` is rank-correlated with ``mean_human_scores`` using
    ``scipy.stats.spearmanr``.

    Args:
        results_name: Key of the entry to read from the JSON file.
        mean_human_scores: Sequence of reference scores; it must have as many
            elements as the stored entry.
        results_path: Path of the JSON results file. Defaults to
            ``'./results.json'``, the location that used to be hard-coded.

    Returns:
        Tuple ``(correlation, p_value)`` as produced by ``spearmanr``.

    Raises:
        KeyError: If ``results_name`` is not a key of the results file.
    """
    with open(results_path, 'r', encoding='utf-8') as json_file:
        results = json.load(json_file)

    # The file is only needed for loading; everything below runs after it is closed.
    try:
        metric_scores = results[results_name]
    except KeyError:
        raise KeyError(f"no entry named {results_name!r} in {results_path}") from None

    correlation, p_value = spearmanr(metric_scores, mean_human_scores)
    print(f"Spearman Correlation: {correlation}")
    print(f"P-value: {p_value}")
    return correlation, p_value

In [7]:
# Load the DailyNews JSON file as the 'train' split of a `datasets` dataset.
# The bare name on the last line makes the notebook display its summary
# (features and number of rows).
DailyNews_ds = load_dataset('json', data_files='../datasets/DailyNews_300.json', split='train')
DailyNews_ds

Dataset({
    features: ['scores', 'text', 'summary', 'annotators_ids'],
    num_rows: 300
})

In [9]:
# Average the annotators' scores of each DailyNews example, then report how
# well the scores stored under each results.json key correlate with them.
# DailyNews_ds is the dataset loaded by the loading cell above, so the same
# file is not read a second time here.
mean_human_scores = average_human_scores(DailyNews_ds)

for results_name in ('BLANC_help_300', 'BLANC_help_300_similarity'):
    get_spearman_correlation(results_name, mean_human_scores)

# TODO: loop over every entry of results.json and store the correlations there.

Spearman Correlation: 0.26132789066748396
P-value: 4.488025315076264e-06
Spearman Correlation: 0.1661435199520691
P-value: 0.0039047041933816134


In [10]:
# Rank-correlate the 'translation_score2' column with a reference pattern that
# cycles 4, 3, 2, 1, 0 over consecutive rows.
# NOTE(review): presumably the CSV lists translations in groups of five,
# ordered from best to worst — confirm against the file.
en_fr_dataset = pd.read_csv("../datasets/en_to_fr_100_translations.csv")

# Build the pattern from the actual row count instead of a hard-coded 100 so
# that the two inputs of spearmanr always have the same length.
expected_scores = [(4 - i) % 5 for i in range(len(en_fr_dataset))]

correlation, p_value = spearmanr(en_fr_dataset["translation_score2"], expected_scores)
print(correlation, p_value)

0.2570321913049545 0.009836262543746541
