## Measuring bias significance
### Notebook for bootstrap evaluation of an evaluated dataset with precomputed heuristics

### Imports

In [2]:
import transformers
from datasets import load_metric
import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


### Parameters for number of iterations and number of selected items

In [3]:
# number of bootstrap iterations (samples drawn) per subset
num_samples = 100
# number of rows drawn from a subset in every iteration
sample_size = 800
# how many times the whole bootstrap is repeated before the interval bounds are averaged
num_for_average_metrics = 1

In [4]:
# SQuAD metric; its result is read through the 'exact_match' and 'f1' keys
# NOTE(review): `datasets.load_metric` appears to be deprecated in newer `datasets` releases - confirm the pinned version still provides it
metric = load_metric("squad")

In [5]:
def compute_metrics_for_sample(sample):
    """Score one sample of the dataset with the SQuAD metric

    Pairs every predicted answer with its ground truth answers by example id
    and lets the module-level `metric` compute exact match and F1.

    Args:
        sample (Dataset): sample of the dataset; it must offer column access for
            'id' and 'prediction_text' and yield one row mapping (with 'id' and
            'answers') per example when iterated

    Returns:
        dict: computed metrics
    """
    predictions = []
    for example_id, predicted_text in zip(sample['id'], sample['prediction_text']):
        predictions.append({"id": example_id, "prediction_text": predicted_text})

    ground_truth = []
    for row in sample:
        ground_truth.append({"id": row["id"], "answers": row["answers"]})

    return metric.compute(predictions=predictions, references=ground_truth)

In [6]:
# Switch for extensive logging: when True, every log file in ./logging/ is appended to;
# when False, only metrics_with_intervals_distances_for_dataset_comparison.csv is written
write_to_log_files = False

### Creation of logging files 
#### Run these cells only once and comment them out afterwards: they create the log files and write their header rows, so re-running them overwrites any existing logs

In [7]:
# exist_ok=True lets this cell be re-run without raising FileExistsError
os.makedirs('./logging/', exist_ok=True)

In [8]:
# Header row of the per-run log; no trailing newline because appended rows start with "\n"
with open("./logging/metrics_for_specific_runs.csv", "w") as file:
    file.write(",".join([
        "name", "samples", "sample_size", "iters", "field", "threshold", "type",
        "exact_match_quantile_0.025", "exact_match_quantile_0.975", "exact_match_mean",
        "f1_quantile_0.025", "f1_quantile_0.975", "f1_mean",
        "len_lower", "len_higher",
    ]))

In [9]:
# Header row of the averaged-run log; no trailing newline because appended rows start with "\n"
with open("./logging/average_metrics_for_runs.csv", "w") as file:
    file.write(",".join([
        "name", "samples", "sample_size", "iters", "field", "threshold", "type",
        "exact_match_quantile_0.025", "exact_match_quantile_0.975", "exact_match_mean",
        "f1_quantile_0.025", "f1_quantile_0.975", "f1_mean",
        "len_lower", "len_higher",
    ]))

In [10]:
# Header row of the interval-overlap log; no trailing newline because appended rows start with "\n"
with open("./logging/evaluated_metrics_for_average.csv", "w") as file:
    file.write(",".join([
        "metric", "samples", "sample_size", "iters", "field", "threshold",
        "is_not_overlap", "distance",
        "len_lower", "len_higher",
    ]))

In [11]:
# Header row of the dataset-comparison log; no trailing newline because appended rows start with "\n"
# NOTE(review): the column name "distance" appears twice (once for exact match, once for F1) - kept as in the original header
with open("./logging/metrics_with_intervals_distances_for_dataset_comparison.csv", "w") as file:
    file.write(",".join([
        "dataset", "field", "threshold",
        "lower_interval_em", "distance", "higher_interval_em",
        "lower_interval_f1", "distance", "higher_interval_f1",
        "samples", "sample_size", "len_lower", "len_higher",
    ]))

In [12]:
# Tab-separated header row of the violin-plot log; no trailing newline because appended rows start with "\n"
with open("./logging/data_for_violin.csv", "w") as file:
    file.write("\t".join([
        "dataset", "field", "threshold",
        "lower_interval_em", "distance", "higher_interval_em",
        "lower_interval_f1", "distance", "higher_interval_f1",
        "samples", "sample_size", "len_lower", "len_higher",
        "lower_em_list", "higher_em_list", "lower_f1_list", "higher_f1_list",
    ]))

### Computation of metrics for samples

In [13]:
def compute_metrics_for_bunch(data):
    """Repeatedly sample the dataset and score every sample

    Draws `num_samples` samples of `sample_size` rows each from `data` and
    computes the metrics for each of them.

    Args:
        data (Pandas Dataframe): dataset with computed heuristics

    Returns:
        Pandas Dataframe: one row per sample with the columns 'exact_match' and 'f1'
    """
    scores = {'exact_match': [], 'f1': []}

    for _ in tqdm(range(num_samples)):
        drawn_rows = data.sample(n=sample_size)
        sample_metrics = compute_metrics_for_sample(Dataset.from_pandas(drawn_rows))
        scores['exact_match'].append(sample_metrics['exact_match'])
        scores['f1'].append(sample_metrics['f1'])

    return pd.DataFrame(scores)

In [14]:
def find_the_distance_between_intervals(lower_025, lower_975, higher_025, higher_975):
    """Check whether two quantile intervals are disjoint and measure the gap between them

    The gap is measured from the 97.5% quantile of the interval lying below
    to the 2.5% quantile of the interval lying above. Intervals that merely
    touch (equal bounds) count as overlapping.

    Args:
        lower_025 (number): 2.5% quantile for lower subset
        lower_975 (number): 97.5% quantile for lower subset
        higher_025 (number): 2.5% quantile for higher subset
        higher_975 (number): 97.5% quantile for higher subset

    Returns:
        bool, number: True and the gap when the intervals do not overlap, otherwise False and 0
    """
    # interval of the lower subset lies entirely above the one of the higher subset
    lower_subset_on_top = lower_975 > higher_025 and lower_025 > higher_975
    if lower_subset_on_top:
        return True, lower_025 - higher_975

    # interval of the higher subset lies entirely above the one of the lower subset
    higher_subset_on_top = higher_975 > lower_025 and higher_025 > lower_975
    if higher_subset_on_top:
        return True, higher_025 - lower_975

    return False, 0

In [15]:
def does_not_have_enought_samples(field, threshold, data_higher, data_lower):
    """Log placeholder rows for a split whose subset is smaller than the sample size

    Nothing is written unless `write_to_log_files` is enabled. The metric
    columns of the written rows hold -1 (and Nan for the overlap flag).

    Args:
        field (str): name of the dataframe column - with computed values for heuristics
        threshold (decimal): number on which the dataset is split between two subsets
        data_higher (Pandas Dataframe): split of dataset with values higher than the threshold
        data_lower (Pandas Dataframe): split of dataset with values lower or equal to the threshold
    """
    if not write_to_log_files:
        return

    with open("./logging/metrics_for_specific_runs.csv", "a") as runs_log:
        runs_log.write(f"\nlower_than_{threshold}_for_field_{field}_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},lower,-1,-1,-1,-1,-1,-1,{len(data_lower)},{len(data_higher)}")
        runs_log.write(f"\nhigher_than_{threshold}_for_field_{field}_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},higher,-1,-1,-1,-1,-1,-1,{len(data_lower)},{len(data_higher)}")

    with open("./logging/average_metrics_for_runs.csv", "a") as averages_log:
        averages_log.write(f"\naverage_lower_than_{threshold}_for_field_{field}_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},lower,-1,-1,-1,-1,-1,-1,{len(data_lower)},{len(data_higher)}")
        averages_log.write(f"\naverage_higher_than_{threshold}_for_field_{field}_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},higher,-1,-1,-1,-1,-1,-1,{len(data_lower)},{len(data_higher)}")

    with open("./logging/evaluated_metrics_for_average.csv", "a") as overlap_log:
        overlap_log.write(f"\nexact_match_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},Nan,-1,{len(data_lower)},{len(data_higher)}")
        overlap_log.write(f"\nf1_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},Nan,-1,{len(data_lower)},{len(data_higher)}")


In [16]:
from statistics import mean

def _summarise_bootstrap(scores):
    """Return the 2.5% / 97.5% quantiles and the mean of both metric columns

    Args:
        scores (Pandas Dataframe): frame with the columns 'exact_match' and 'f1'

    Returns:
        dict: values under the keys 'em_025', 'em_975', 'em_mean', 'f1_025', 'f1_975', 'f1_mean'
    """
    exact_match = scores['exact_match']
    f1 = scores['f1']
    return {
        'em_025': exact_match.quantile(0.025),
        'em_975': exact_match.quantile(0.975),
        'em_mean': exact_match.mean(),
        'f1_025': f1.quantile(0.025),
        'f1_975': f1.quantile(0.975),
        'f1_mean': f1.mean(),
    }


def compute_metrics_average_split(data, field, threshold):
    """Split the dataset on a threshold, bootstrap both subsets and compare their metric intervals

    Rows with `data[field] <= threshold` form the lower subset, all remaining
    rows the higher subset. Both subsets are bootstrapped
    `num_for_average_metrics` times, the resulting 2.5% / 97.5% quantiles are
    averaged over those runs and the averaged intervals of the two subsets are
    compared for exact match and F1.

    One row is always appended to
    metrics_with_intervals_distances_for_dataset_comparison.csv (it reads the
    module-level `dataset` name); the other log files are only written when
    `write_to_log_files` is enabled.

    Args:
        data (Pandas Dataframe): dataset
        field (str): column of the dataset based on which we want to split the data
        threshold (decimal): number for the split of dataset, the values lower or equal will be in one subset and values higher than the threshold will be in the other

    Returns:
        decimal, decimal: distances for both metrics, exact match and F1;
            -1, -1 when one of the subsets has fewer rows than `sample_size`
    """
    # Boolean masks always yield both subsets, even an empty one. Unpacking the
    # groups of data.groupby(...) raised ValueError as soon as every row fell on
    # the same side of the threshold, so the size check below was never reached.
    is_lower = data[field] <= threshold
    data_lower = data[is_lower]
    data_higher = data[~is_lower]

    if len(data_higher) < sample_size or len(data_lower) < sample_size:
        does_not_have_enought_samples(field, threshold, data_higher, data_lower)
        return -1, -1

    # column groups shared by several log rows
    run_params = f"{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold}"
    sizes = f"{len(data_lower)},{len(data_higher)}"

    sides = ('lower', 'higher')
    subsets = {'lower': data_lower, 'higher': data_higher}
    summaries = {'lower': [], 'higher': []}
    # per-sample scores of the most recent run, needed for the violin log
    bootstraps = {}

    for _ in tqdm(range(num_for_average_metrics)):
        # the lower subset is bootstrapped before the higher one in every run
        for side in sides:
            bootstraps[side] = compute_metrics_for_bunch(subsets[side])
            run = _summarise_bootstrap(bootstraps[side])
            summaries[side].append(run)

            if write_to_log_files:
                with open("./logging/metrics_for_specific_runs.csv", "a") as file_append:
                    file_append.write(f"\n{side}_than_{threshold}_for_field_{field},{run_params},{side},{run['em_025']},{run['em_975']},{run['em_mean']},{run['f1_025']},{run['f1_975']},{run['f1_mean']},{sizes}")

    # average every statistic over the runs once instead of once per log line
    stat_keys = ('em_025', 'em_975', 'em_mean', 'f1_025', 'f1_975', 'f1_mean')
    averages = {
        side: {key: mean([run[key] for run in summaries[side]]) for key in stat_keys}
        for side in sides
    }
    low = averages['lower']
    high = averages['higher']

    if write_to_log_files:
        with open("./logging/average_metrics_for_runs.csv", "a") as file_append:
            for side in sides:
                avg = averages[side]
                file_append.write(f"\naverage_{side}_than_{threshold}_for_field_{field},{run_params},{side},{avg['em_025']},{avg['em_975']},{avg['em_mean']},{avg['f1_025']},{avg['f1_975']},{avg['f1_mean']},{sizes}")

    is_not_overlap_em, distance_em = find_the_distance_between_intervals(low['em_025'], low['em_975'], high['em_025'], high['em_975'])
    is_not_overlap_f1, distance_f1 = find_the_distance_between_intervals(low['f1_025'], low['f1_975'], high['f1_025'], high['f1_975'])

    if write_to_log_files:
        with open("./logging/evaluated_metrics_for_average.csv", "a") as file_append:
            file_append.write(f"\nexact_match,{run_params},{is_not_overlap_em},{distance_em},{sizes}")
            file_append.write(f"\nf1,{run_params},{is_not_overlap_f1},{distance_f1},{sizes}")

    # averaged intervals rendered as "<2.5% quantile;97.5% quantile>"
    em_lower_interval = f"<{low['em_025']};{low['em_975']}>"
    em_higher_interval = f"<{high['em_025']};{high['em_975']}>"
    f1_lower_interval = f"<{low['f1_025']};{low['f1_975']}>"
    f1_higher_interval = f"<{high['f1_025']};{high['f1_975']}>"

    # this comparison row is written regardless of write_to_log_files
    with open("./logging/metrics_with_intervals_distances_for_dataset_comparison.csv", "a") as file_append:
        file_append.write(f"\n{dataset},{field},{threshold},{em_lower_interval},{distance_em},{em_higher_interval},{f1_lower_interval},{distance_f1},{f1_higher_interval},{num_samples},{sample_size},{sizes}")

    if write_to_log_files:
        violin_prefix = f"\n{dataset}\t{field}\t{threshold}\t{em_lower_interval}\t{distance_em}\t{em_higher_interval}\t{f1_lower_interval}\t{distance_f1}\t{f1_higher_interval}\t{num_samples}\t{sample_size}\t{len(data_lower)}\t{len(data_higher)}"
        # the file is opened once for all rows instead of once per sample
        with open("./logging/data_for_violin.csv", "a") as file_append:
            for i in range(num_samples):
                for side in sides:
                    scores = bootstraps[side]
                    file_append.write(f"{violin_prefix}\t{scores['exact_match'][i]}\t{side}\t{scores['f1'][i]}\t{side}")

    print(f"Average exact match with params: samples {sample_size} iters {num_samples} ---- are independent: {is_not_overlap_em} the distance is: {distance_em}")

    print(f"Average f1 with params: samples {sample_size} iters {num_samples} ---- are independent: {is_not_overlap_f1} the distance is: {distance_f1}")

    return distance_em, distance_f1


In [17]:
def find_longest_distance(data, field, low_bound, upp_bound):
    """Sweep integer thresholds and report the largest gap between the subset intervals

    Every threshold is evaluated with compute_metrics_average_split; the
    thresholds giving the largest exact match and F1 distances are printed.
    Nothing is returned.

    Args:
        data (Pandas Dataframe): dataset
        field (str): Dataframe column
        low_bound (int): first threshold of the sweep
        upp_bound (int): end of the sweep, exclusive - the last threshold is upp_bound - 1
    """
    best_em_threshold, best_em_gap = 0, 0
    best_f1_threshold, best_f1_gap = 0, 0

    for threshold in tqdm(range(low_bound, upp_bound)):
        em_gap, f1_gap = compute_metrics_average_split(data, field, threshold)
        # only a strictly larger distance replaces the current best
        if em_gap > best_em_gap:
            best_em_threshold, best_em_gap = threshold, em_gap
        if f1_gap > best_f1_gap:
            best_f1_threshold, best_f1_gap = threshold, f1_gap

    print(f"The biggest distance between exact match intervals was with threshold {best_em_threshold} and the distance was {best_em_gap}.")
    print(f"The biggest distance between f1 intervals was with threshold {best_f1_threshold} and the distance was {best_f1_gap}.")


### Example of run for evaluated dataset

In [18]:
# label of the evaluated dataset, written into the log rows
dataset = 'squad_baseline'
# evaluated dataset loaded into a Pandas Dataframe
data = pd.read_json('./datasets/enhanced_valid_squad_with_predictions.json')

# only rows with a non-negative 'distances' value take part in this sweep
find_longest_distance(data[data['distances'] >= 0], 'distances', 2, 9)
find_longest_distance(data, 'similar_words', 3, 9)
find_longest_distance(data, 'answer_lenght', 1, 6)
# fractional thresholds are listed explicitly, find_longest_distance sweeps integers only
for cosine_threshold in (0.10, 0.20, 0.30, 0.40, 0.50, 0.60):
    compute_metrics_average_split(data, 'cosine_similarity', cosine_threshold)
find_longest_distance(data, 'max_sim_ents', 0, 5)
# only rows with a non-negative 'answer_subject_positions' value take part in this sweep
find_longest_distance(data[data['answer_subject_positions'] >= 0], 'answer_subject_positions', 0, 2)

  0%|          | 0/7 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 100/100 [00:31<00:00,  3.18it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 100/100 [00:31<00:00,  3.22it/s]
100%|██████████| 1/1 [01:02<00:00, 62.50s/it]
 14%|█▍        

Average exact match with params: samples 800 iters 100 ---- are independent: True the distance is: 1.9343750000000028
Average f1 with params: samples 800 iters 100 ---- are independent: True the distance is: 1.9487134100825045



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 54%|█████▍    | 54/100 [00:16<00:14,  3.20it/s]
  0%|          | 0/1 [00:16<?, ?it/s]
 14%|█▍        | 1/7 [01:19<07:56, 79.39s/it]


KeyboardInterrupt: 