## Notebook for bootstrapping evaluation

### Imports

In [2]:
import transformers
from datasets import load_metric
import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm

### Parameters for number of iterations and number of selected items

In [3]:
# Bootstrap configuration; the functions below read these names as globals.
# Earlier settings are kept commented out for reference.
# num_samples = 200
# sample_size = 1000
num_samples = 100  # number of random draws per group (loop count in compute_metrics_for_bunch)
sample_size = 800  # rows per draw; a split with fewer rows on either side is skipped
# sample_size = 500
num_for_average_metrics = 1  # repetitions averaged per split; will probably be dropped - hard to describe in writing
# num_for_average_metrics = 5  # will probably be dropped - hard to describe in writing

In [4]:
# SQuAD scorer; its compute() result is read for the 'exact_match' and 'f1' keys below.
# NOTE(review): `datasets.load_metric` is deprecated in favour of the separate
# `evaluate` package (`evaluate.load("squad")`) - confirm the installed version still provides it.
metric = load_metric("squad")

In [5]:
def compute_metrics_for_sample(sample):
    """Score one sample of predictions with the module-level SQuAD ``metric``.

    ``sample`` must expose ``id`` and ``prediction_text`` columns through
    ``sample[...]`` and yield row mappings with ``id`` and ``answers`` keys
    when iterated (this notebook passes a ``datasets.Dataset``).

    Returns whatever ``metric.compute`` returns.
    """
    ids = sample['id']
    texts = sample['prediction_text']
    formatted_predictions = []
    for example_id, text in zip(ids, texts):
        formatted_predictions.append({"id": example_id, "prediction_text": text})

    references = []
    for row in sample:
        references.append({"id": row["id"], "answers": row["answers"]})

    return metric.compute(predictions=formatted_predictions, references=references)

In [6]:
# When True, the functions below append per-run, averaged and overlap rows to their CSV logs.
# The dataset-comparison and violin CSV files are appended regardless of this flag.
write_to_log_files = False

In [7]:
# Label written as the first column of the dataset-comparison and violin CSV rows.
dataset = 'squad_base'

### Data loading

In [8]:
# Load the JSON file evaluated under the 'squad_base' label; later cells read
# columns such as `distances` and `similar_words` from this frame.
# NOTE(review): relative path - assumes the notebook runs from the directory holding the file.
data = pd.read_json('valid_pred_labeled_with_added_from_func.json')

In [9]:
# dataset = 'squad_super_distances_7'

In [10]:
# data_supersampled_model = pd.read_json('./from_debiased_models/squad_supersampled_distances_7.json')

### Splitting of the data

#### Based on the distance between the answer and the closest question word in the context: items with a distance lower than or equal to 3 versus items with a distance higher than 3

### Computation of metrics for samples

In [11]:
# with open("metrics_for_specific_runs.csv", "w") as file:
#     file.write(f"name,samples,sample_size,iters,field,threshold,type,exact_match_quantile_0.025,exact_match_quantile_0.975,exact_match_mean,f1_quantile_0.025,f1_quantile_0.975,f1_mean,len_lower,len_higher")

In [12]:
# with open("average_metrics_for_runs.csv", "w") as file:
#     file.write(f"name,samples,sample_size,iters,field,threshold,type,exact_match_quantile_0.025,exact_match_quantile_0.975,exact_match_mean,f1_quantile_0.025,f1_quantile_0.975,f1_mean,len_lower,len_higher")

In [13]:
# with open("evaluated_metrics_for_average.csv", "w") as file:
#     file.write(f"metric,samples,sample_size,iters,field,threshold,is_not_overlap,distance,len_lower,len_higher")

In [14]:
# with open("metrics_with_intervals_distances_for_dataset_comparison.csv", "w") as file:
#     file.write(f"dataset,field,threshold,lower_interval_em,distance,higher_interval_em,lower_interval_f1,distance,higher_interval_f1,samples,sample_size,len_lower,len_higher")

In [15]:
# with open("data_for_violin.csv", "w") as file:
#     file.write(f"dataset\tfield\tthreshold\tlower_interval_em\tdistance\thigher_interval_em\tlower_interval_f1\tdistance\thigher_interval_f1\tsamples\tsample_size\tlen_lower\tlen_higher\tlower_em_list\thigher_em_list\tlower_f1_list\thigher_f1_list")

In [16]:
def compute_metrics_for_bunch(data):
    """Repeatedly sample ``data`` and collect the scores of every draw.

    Draws ``num_samples`` random subsets of ``sample_size`` rows each (using
    ``DataFrame.sample`` with its default arguments), scores every subset with
    ``compute_metrics_for_sample`` and returns a DataFrame with one row per
    draw and the columns ``exact_match`` and ``f1``.
    """
    scores = {'exact_match': [], 'f1': []}

    for _ in tqdm(range(num_samples)):
        drawn = Dataset.from_pandas(data.sample(n=sample_size))
        result = compute_metrics_for_sample(drawn)
        for key in ('exact_match', 'f1'):
            scores[key].append(result[key])

    return pd.DataFrame(scores)

In [17]:
def find_the_distance_between_intervals(lower_025, lower_975, higher_025, higher_975):
    """Measure the gap between two intervals given by their end points.

    Returns ``(True, gap)`` when one interval lies strictly above the other,
    where ``gap`` is the distance between the facing end points, and
    ``(False, 0)`` when the intervals overlap or merely touch.
    """
    # "lower" interval sits entirely above the "higher" one.
    lower_is_above = lower_975 > higher_025 and lower_025 > higher_975
    if lower_is_above:
        return True, lower_025 - higher_975

    # "higher" interval sits entirely above the "lower" one.
    higher_is_above = higher_975 > lower_025 and higher_025 > lower_975
    if higher_is_above:
        return True, higher_025 - lower_975

    return False, 0

In [18]:
def does_not_have_enought_samples(field, threshold, data_higher, data_lower):
    """Append placeholder rows for a split that was skipped for lack of rows.

    Does nothing unless the global ``write_to_log_files`` is truthy. Otherwise
    one "lower" and one "higher" row (all statistics set to -1) go to the
    per-run log and to the averaged log, and one ``exact_match`` and one ``f1``
    row go to the overlap log. Every row ends with the two group sizes.
    """
    if not write_to_log_files:
        return

    with open("metrics_for_specific_runs.csv", "a") as runs_log:
        runs_log.write(f"\nlower_than_{threshold}_for_field_{field}_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},lower,-1,-1,-1,-1,-1,-1,{len(data_lower)},{len(data_higher)}")
        runs_log.write(f"\nhigher_than_{threshold}_for_field_{field}_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},higher,-1,-1,-1,-1,-1,-1,{len(data_lower)},{len(data_higher)}")

    with open("average_metrics_for_runs.csv", "a") as average_log:
        average_log.write(f"\naverage_lower_than_{threshold}_for_field_{field}_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},lower,-1,-1,-1,-1,-1,-1,{len(data_lower)},{len(data_higher)}")
        average_log.write(f"\naverage_higher_than_{threshold}_for_field_{field}_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},higher,-1,-1,-1,-1,-1,-1,{len(data_lower)},{len(data_higher)}")

    with open("evaluated_metrics_for_average.csv", "a") as overlap_log:
        overlap_log.write(f"\nexact_match_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},Nan,-1,{len(data_lower)},{len(data_higher)}")
        overlap_log.write(f"\nf1_not_enough_samples,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},Nan,-1,{len(data_lower)},{len(data_higher)}")


In [27]:
from statistics import mean

def _interval_stats(bunch):
    """Return (EM q2.5, EM q97.5, EM mean, F1 q2.5, F1 q97.5, F1 mean) of one bunch frame."""
    exact, f1 = bunch['exact_match'], bunch['f1']
    return (exact.quantile(0.025), exact.quantile(0.975), exact.mean(),
            f1.quantile(0.025), f1.quantile(0.975), f1.mean())


def _csv_fields(values):
    """Join values with commas, formatting each one exactly as an f-string field would."""
    return ",".join(f"{value}" for value in values)


def compute_metrics_average_split(data, field, threshold):
    """Compare the score intervals of the two halves of a threshold split.

    Rows with ``data[field] <= threshold`` form the "lower" group and all
    remaining rows the "higher" group. Each group is scored
    ``num_for_average_metrics`` times with ``compute_metrics_for_bunch``; the
    2.5 % / 97.5 % quantiles and means of exact match and F1 are averaged over
    those repetitions, and the gap between the averaged quantile intervals is
    computed with ``find_the_distance_between_intervals``.

    Side effects: appends one row to
    ``metrics_with_intervals_distances_for_dataset_comparison.csv`` and
    ``2 * num_samples`` rows to ``data_for_violin.csv`` (scores of the last
    repetition), prints a two-line summary, and additionally appends to the
    three log CSV files when the global ``write_to_log_files`` is truthy.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``field`` plus the columns the scoring functions read.
    field : str
        Column to split on.
    threshold : number
        Split point; equal values go to the "lower" group.

    Returns
    -------
    tuple
        ``(distance_em, distance_f1)``; a distance is 0 when the intervals
        overlap. ``(-1, -1)`` when either group has fewer than ``sample_size``
        rows.
    """
    # A boolean mask always yields both groups (possibly empty). Unpacking the
    # groups of a ``groupby`` raised ValueError whenever every row fell on one
    # side of the threshold, so the "not enough samples" path was never reached.
    is_lower = data[field] <= threshold
    data_lower = data[is_lower]
    data_higher = data[~is_lower]

    if len(data_higher) < sample_size or len(data_lower) < sample_size:
        does_not_have_enought_samples(field, threshold, data_higher, data_lower)
        return -1, -1

    # Column runs shared by several log rows.
    run_params = f"{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold}"
    group_sizes = f"{len(data_lower)},{len(data_higher)}"

    lower_runs = []   # one _interval_stats tuple per repetition
    higher_runs = []
    for _ in tqdm(range(num_for_average_metrics)):
        # The lower group is sampled before the higher one in every repetition.
        df_lower = compute_metrics_for_bunch(data_lower)
        lower_runs.append(_interval_stats(df_lower))
        if write_to_log_files:
            with open("metrics_for_specific_runs.csv", "a") as runs_log:
                runs_log.write(f"\nlower_than_{threshold}_for_field_{field},{run_params},lower,{_csv_fields(lower_runs[-1])},{group_sizes}")

        df_higher = compute_metrics_for_bunch(data_higher)
        higher_runs.append(_interval_stats(df_higher))
        if write_to_log_files:
            with open("metrics_for_specific_runs.csv", "a") as runs_log:
                runs_log.write(f"\nhigher_than_{threshold}_for_field_{field},{run_params},higher,{_csv_fields(higher_runs[-1])},{group_sizes}")

    # Average every statistic over the repetitions (column-wise over the tuples).
    lower_avg = tuple(mean(column) for column in zip(*lower_runs))
    higher_avg = tuple(mean(column) for column in zip(*higher_runs))
    lower_em_lo, lower_em_hi, _, lower_f1_lo, lower_f1_hi, _ = lower_avg
    higher_em_lo, higher_em_hi, _, higher_f1_lo, higher_f1_hi, _ = higher_avg

    if write_to_log_files:
        with open("average_metrics_for_runs.csv", "a") as average_log:
            average_log.write(f"\naverage_lower_than_{threshold}_for_field_{field},{run_params},lower,{_csv_fields(lower_avg)},{group_sizes}")
            average_log.write(f"\naverage_higher_than_{threshold}_for_field_{field},{run_params},higher,{_csv_fields(higher_avg)},{group_sizes}")

    is_not_overlap_em, distance_em = find_the_distance_between_intervals(lower_em_lo, lower_em_hi, higher_em_lo, higher_em_hi)
    is_not_overlap_f1, distance_f1 = find_the_distance_between_intervals(lower_f1_lo, lower_f1_hi, higher_f1_lo, higher_f1_hi)

    if write_to_log_files:
        with open("evaluated_metrics_for_average.csv", "a") as overlap_log:
            overlap_log.write(f"\nexact_match,{run_params},{is_not_overlap_em},{distance_em},{group_sizes}")
            overlap_log.write(f"\nf1,{run_params},{is_not_overlap_f1},{distance_f1},{group_sizes}")

    # Intervals are written as "<low;high>".
    lower_em = f"<{lower_em_lo};{lower_em_hi}>"
    higher_em = f"<{higher_em_lo};{higher_em_hi}>"
    lower_f1 = f"<{lower_f1_lo};{lower_f1_hi}>"
    higher_f1 = f"<{higher_f1_lo};{higher_f1_hi}>"

    # These two files are written regardless of write_to_log_files.
    with open("metrics_with_intervals_distances_for_dataset_comparison.csv", "a") as comparison_file:
        comparison_file.write(f"\n{dataset},{field},{threshold},{lower_em},{distance_em},{higher_em},{lower_f1},{distance_f1},{higher_f1},{num_samples},{sample_size},{group_sizes}")

    violin_prefix = f"\n{dataset}\t{field}\t{threshold}\t{lower_em}\t{distance_em}\t{higher_em}\t{lower_f1}\t{distance_f1}\t{higher_f1}\t{num_samples}\t{sample_size}\t{len(data_lower)}\t{len(data_higher)}"
    # Open once for all rows instead of reopening the file for every draw.
    with open("data_for_violin.csv", "a") as violin_file:
        for i in range(num_samples):
            violin_file.write(f"{violin_prefix}\t{df_lower['exact_match'][i]}\tlower\t{df_lower['f1'][i]}\tlower")
            violin_file.write(f"{violin_prefix}\t{df_higher['exact_match'][i]}\thigher\t{df_higher['f1'][i]}\thigher")

    print(f"Average exact match with params: samples {sample_size} iters {num_samples} ---- are independent: {is_not_overlap_em} the distance is: {distance_em}")

    print(f"Average f1 with params: samples {sample_size} iters {num_samples} ---- are independent: {is_not_overlap_f1} the distance is: {distance_f1}")

    return distance_em, distance_f1


In [20]:
def find_longest_distance(data, field, low_bound, upp_bound):
    """Scan integer thresholds and report the one with the widest interval gap.

    Calls ``compute_metrics_average_split(data, field, t)`` for every ``t`` in
    ``range(low_bound, upp_bound)`` and prints, separately for exact match and
    F1, the threshold with the largest positive distance. Thresholds whose
    intervals overlap (distance 0) or that were skipped (distance -1) never
    become the best one.

    Returns ``None``; results are only printed.
    """
    # ``None`` means "no threshold separated the intervals". Starting from 0
    # made the summary name threshold 0 even when 0 was never scanned.
    best_em_threshold = None
    best_f1_threshold = None
    max_em_distance = 0
    max_f1_distance = 0

    for threshold in tqdm(range(low_bound, upp_bound)):
        distance_em, distance_f1 = compute_metrics_average_split(data, field, threshold)
        if distance_em > max_em_distance:
            max_em_distance = distance_em
            best_em_threshold = threshold
        if distance_f1 > max_f1_distance:
            max_f1_distance = distance_f1
            best_f1_threshold = threshold

    if best_em_threshold is None:
        print(f"No threshold in range({low_bound}, {upp_bound}) separated the exact match intervals.")
    else:
        print(f"The biggest distance between exact match intervals was with threshold {best_em_threshold} and the distance was {max_em_distance}.")

    if best_f1_threshold is None:
        print(f"No threshold in range({low_bound}, {upp_bound}) separated the f1 intervals.")
    else:
        print(f"The biggest distance between f1 intervals was with threshold {best_f1_threshold} and the distance was {max_f1_distance}.")


In [28]:
# Run on `data` with a single threshold per feature: range(a, a + 1) evaluates only threshold a.
# Rows with a negative `distances` / `answer_subject_positions` value are excluded before those two scans.
find_longest_distance(data[data.distances >= 0], 'distances', 7, 8)
find_longest_distance(data, 'similar_words', 4, 5)
find_longest_distance(data, 'answer_lenght', 3, 4)
compute_metrics_average_split(data, 'cosine_similarity', 0.10)
find_longest_distance(data, 'kth_sentence', 0, 1)
find_longest_distance(data, 'max_sim_ents', 0, 1)
find_longest_distance(data[data.answer_subject_positions >= 0], 'answer_subject_positions', 1, 2)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Average exact match with params: samples 800 iters 100 ---- are independent: True the distance is: 8.803125000000009
Average f1 with params: samples 800 iters 100 ---- are independent: True the distance is: 8.320067159352774
The biggest distance between exact match intervals was with threshold 7 and the distance was 8.803125000000009.
The biggest distance between f1 intervals was with threshold 7 and the distance was 8.320067159352774.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Average exact match with params: samples 800 iters 100 ---- are independent: True the distance is: 0.171875
Average f1 with params: samples 800 iters 100 ---- are independent: False the distance is: 0
The biggest distance between exact match intervals was with threshold 4 and the distance was 0.171875.
The biggest distance between f1 intervals was with threshold 0 and the distance was 0.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Average exact match with params: samples 800 iters 100 ---- are independent: True the distance is: 8.684375000000003
Average f1 with params: samples 800 iters 100 ---- are independent: True the distance is: 1.4462761160935287
The biggest distance between exact match intervals was with threshold 3 and the distance was 8.684375000000003.
The biggest distance between f1 intervals was with threshold 3 and the distance was 1.4462761160935287.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Average exact match with params: samples 800 iters 100 ---- are independent: True the distance is: 2.559375000000003
Average f1 with params: samples 800 iters 100 ---- are independent: True the distance is: 2.1884230907556343


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Average exact match with params: samples 800 iters 100 ---- are independent: False the distance is: 0
Average f1 with params: samples 800 iters 100 ---- are independent: False the distance is: 0
The biggest distance between exact match intervals was with threshold 0 and the distance was 0.
The biggest distance between f1 intervals was with threshold 0 and the distance was 0.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Average exact match with params: samples 800 iters 100 ---- are independent: True the distance is: 2.2375000000000114
Average f1 with params: samples 800 iters 100 ---- are independent: True the distance is: 1.3933506893193055
The biggest distance between exact match intervals was with threshold 0 and the distance was 2.2375000000000114.
The biggest distance between f1 intervals was with threshold 0 and the distance was 1.3933506893193055.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Average exact match with params: samples 800 iters 100 ---- are independent: True the distance is: 0.8031250000000085
Average f1 with params: samples 800 iters 100 ---- are independent: True the distance is: 0.39578747090899924
The biggest distance between exact match intervals was with threshold 1 and the distance was 0.8031250000000085.
The biggest distance between f1 intervals was with threshold 1 and the distance was 0.39578747090899924.


In [None]:
# Full feature sweep for the predictions labelled 'squad_distances_v2'; the whole sweep runs three times.
dataset = 'squad_distances_v2'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_distances_7_v2.json')
for run_idx in range(3):
    find_longest_distance(data_supersampled_model[data_supersampled_model.distances >= 0], 'distances', 2, 9)
    find_longest_distance(data_supersampled_model, 'similar_words', 3, 9)
    find_longest_distance(data_supersampled_model, 'answer_lenght', 1, 6)
    for cosine_threshold in (0.10, 0.20, 0.30, 0.40, 0.50, 0.60):
        compute_metrics_average_split(data_supersampled_model, 'cosine_similarity', cosine_threshold)
    find_longest_distance(data_supersampled_model, 'max_sim_ents', 0, 5)
    find_longest_distance(data_supersampled_model[data_supersampled_model.answer_subject_positions >= 0], 'answer_subject_positions', 0, 2)

In [None]:
# enhanced_squad_base_v2
# Full feature sweep for the predictions labelled 'squad_base_v2'; the whole sweep runs three times.

dataset = 'squad_base_v2'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_base_v2.json')
for run_idx in range(3):
    find_longest_distance(data_supersampled_model[data_supersampled_model.distances >= 0], 'distances', 2, 9)
    find_longest_distance(data_supersampled_model, 'similar_words', 3, 9)
    find_longest_distance(data_supersampled_model, 'answer_lenght', 1, 6)
    for cosine_threshold in (0.10, 0.20, 0.30, 0.40, 0.50, 0.60):
        compute_metrics_average_split(data_supersampled_model, 'cosine_similarity', cosine_threshold)
    find_longest_distance(data_supersampled_model, 'max_sim_ents', 0, 5)
    find_longest_distance(data_supersampled_model[data_supersampled_model.answer_subject_positions >= 0], 'answer_subject_positions', 0, 2)

In [None]:
# Single-threshold checks on `data`: distances at 7, similar_words at 4, answer_lenght at 3.
# NOTE(review): 'answer_lenght' is the column name used throughout this notebook - presumably it matches the JSON, so it is left as is.
find_longest_distance(data[data.distances >= 0], 'distances', 7, 8)
find_longest_distance(data, 'similar_words', 4, 5)
find_longest_distance(data, 'answer_lenght', 3, 4)

In [None]:
# NOTE(review): rebinds the global sample_size; every later call uses 450 until it is assigned again.
sample_size = 450
# Scan thresholds 0..4 on kth_sentence.
find_longest_distance(data, 'kth_sentence', 0, 5)

In [None]:
# Sweep cosine_similarity thresholds 0.10 to 0.60 in steps of 0.10 on `data`.
# Only the last call's returned (distance_em, distance_f1) tuple becomes the cell output.
compute_metrics_average_split(data, 'cosine_similarity', 0.10)
compute_metrics_average_split(data, 'cosine_similarity', 0.20)
compute_metrics_average_split(data, 'cosine_similarity', 0.30)
compute_metrics_average_split(data, 'cosine_similarity', 0.40)
compute_metrics_average_split(data, 'cosine_similarity', 0.50)
compute_metrics_average_split(data, 'cosine_similarity', 0.60)

In [None]:
# NOTE(review): rebinds the global sample_size; every later call uses 450 until it is assigned again.
sample_size = 450
# cosine_similarity split at threshold 0.10 with the smaller sample size.
compute_metrics_average_split(data, 'cosine_similarity', 0.10)

In [None]:
# Set the global sample size back to 800, the value from the parameter cell.
sample_size = 800

In [None]:
# Scan thresholds 0 and 1 on answer_subject_positions, ignoring rows with a negative value.
find_longest_distance(data[data.answer_subject_positions >= 0], 'answer_subject_positions', 0, 2)

In [None]:
# Scan thresholds 0..4 on max_sim_ents.
find_longest_distance(data, 'max_sim_ents', 0, 5)

In [None]:
# Scan thresholds 5..15 on question_length.
find_longest_distance(data, 'question_length', 5, 16)

In [None]:
# Full feature sweep for the predictions labelled 'squad_super_all_th_2'; the whole sweep runs three times.
dataset = 'squad_super_all_th_2'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_all_th_2.json')
for run_idx in range(3):
    find_longest_distance(data_supersampled_model[data_supersampled_model.distances >= 0], 'distances', 2, 9)
    find_longest_distance(data_supersampled_model, 'similar_words', 3, 9)
    find_longest_distance(data_supersampled_model, 'answer_lenght', 1, 6)
    for cosine_threshold in (0.10, 0.20, 0.30, 0.40, 0.50, 0.60):
        compute_metrics_average_split(data_supersampled_model, 'cosine_similarity', cosine_threshold)
    find_longest_distance(data_supersampled_model, 'max_sim_ents', 0, 5)
    find_longest_distance(data_supersampled_model[data_supersampled_model.answer_subject_positions >= 0], 'answer_subject_positions', 0, 2)

In [None]:
# Full feature sweep for the predictions labelled 'squad_super_all_v1'; the whole sweep runs three times.
dataset = 'squad_super_all_v1'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_all_v1.json')
for run_idx in range(3):
    find_longest_distance(data_supersampled_model[data_supersampled_model.distances >= 0], 'distances', 2, 9)
    find_longest_distance(data_supersampled_model, 'similar_words', 3, 9)
    find_longest_distance(data_supersampled_model, 'answer_lenght', 1, 6)
    for cosine_threshold in (0.10, 0.20, 0.30, 0.40, 0.50, 0.60):
        compute_metrics_average_split(data_supersampled_model, 'cosine_similarity', cosine_threshold)
    find_longest_distance(data_supersampled_model, 'max_sim_ents', 0, 5)
    find_longest_distance(data_supersampled_model[data_supersampled_model.answer_subject_positions >= 0], 'answer_subject_positions', 0, 2)

In [None]:
# Full feature sweep for the predictions labelled 'squad_super_all_v2'; the whole sweep runs three times.
dataset = 'squad_super_all_v2'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_all_v2.json')
for run_idx in range(3):
    find_longest_distance(data_supersampled_model[data_supersampled_model.distances >= 0], 'distances', 2, 9)
    find_longest_distance(data_supersampled_model, 'similar_words', 3, 9)
    find_longest_distance(data_supersampled_model, 'answer_lenght', 1, 6)
    for cosine_threshold in (0.10, 0.20, 0.30, 0.40, 0.50, 0.60):
        compute_metrics_average_split(data_supersampled_model, 'cosine_similarity', cosine_threshold)
    find_longest_distance(data_supersampled_model, 'max_sim_ents', 0, 5)
    find_longest_distance(data_supersampled_model[data_supersampled_model.answer_subject_positions >= 0], 'answer_subject_positions', 0, 2)

In [None]:
# Full feature sweep for the predictions labelled 'squad_super_all_dd_ns'; the whole sweep runs three times.
dataset = 'squad_super_all_dd_ns'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_all_dd_ns.json')
for run_idx in range(3):
    find_longest_distance(data_supersampled_model[data_supersampled_model.distances >= 0], 'distances', 2, 9)
    find_longest_distance(data_supersampled_model, 'similar_words', 3, 9)
    find_longest_distance(data_supersampled_model, 'answer_lenght', 1, 6)
    for cosine_threshold in (0.10, 0.20, 0.30, 0.40, 0.50, 0.60):
        compute_metrics_average_split(data_supersampled_model, 'cosine_similarity', cosine_threshold)
    find_longest_distance(data_supersampled_model, 'max_sim_ents', 0, 5)
    find_longest_distance(data_supersampled_model[data_supersampled_model.answer_subject_positions >= 0], 'answer_subject_positions', 0, 2)

In [None]:
# Full feature sweep for the predictions labelled 'squad_base'; the whole sweep runs three times.
# Reloading `data` here discards any filtering or mutation applied to it by other cells.
dataset = 'squad_base'
data = pd.read_json('valid_pred_labeled_with_added_from_func.json')
for run_idx in range(3):
    find_longest_distance(data[data.distances >= 0], 'distances', 2, 9)
    find_longest_distance(data, 'similar_words', 3, 9)
    find_longest_distance(data, 'answer_lenght', 1, 6)
    for cosine_threshold in (0.10, 0.20, 0.30, 0.40, 0.50, 0.60):
        compute_metrics_average_split(data, 'cosine_similarity', cosine_threshold)
    find_longest_distance(data, 'max_sim_ents', 0, 5)
    find_longest_distance(data[data.answer_subject_positions >= 0], 'answer_subject_positions', 0, 2)

In [None]:
# cosine_similarity sweep (0.10 to 0.60) for the predictions labelled
# 'squad_super_cosine_similarity_01'; the sweep runs three times.
dataset = 'squad_super_cosine_similarity_01'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_cosine_similarity_01.json')
for run_idx in range(3):
    for cosine_threshold in (0.10, 0.20, 0.30, 0.40, 0.50, 0.60):
        compute_metrics_average_split(data_supersampled_model, 'cosine_similarity', cosine_threshold)

In [None]:
# Three repeated scans of max_sim_ents thresholds 0..4; rows are logged under the label below.
dataset = 'squad_super_max_sim_ents_0'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_similar_entities_0.json')
for i in range(3):
    find_longest_distance(data_supersampled_model, 'max_sim_ents', 0, 5)

In [None]:
# Three repeated scans of answer_subject_positions thresholds 0 and 1, ignoring rows with a negative value.
dataset = 'squad_super_ans_sub_pos_1'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_ans_sub_pos_1.json')
for i in range(3):
    find_longest_distance(data_supersampled_model[data_supersampled_model.answer_subject_positions >= 0], 'answer_subject_positions', 0, 2)

In [None]:
# Reload `data` and scan distances thresholds 2..8 four times, ignoring rows with a negative distance.
dataset = 'squad_base'
data = pd.read_json('valid_pred_labeled_with_added_from_func.json')
for i in range(4):
    find_longest_distance(data[data.distances >= 0], 'distances', 2, 9)

In [None]:
# Same distances scan (thresholds 2..8, four times) on the file loaded below.
dataset = 'squad_super_distances_7'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_distances_7.json')
for i in range(4):
    find_longest_distance(data_supersampled_model[data_supersampled_model.distances >= 0], 'distances', 2, 9)

In [None]:
# Reload `data` and scan similar_words thresholds 3..8 five times.
dataset = 'squad_base'
data = pd.read_json('valid_pred_labeled_with_added_from_func.json')
for i in range(5):
    find_longest_distance(data, 'similar_words', 3, 9)

In [None]:
# Same similar_words scan (thresholds 3..8, five times) on the file loaded below.
dataset = 'squad_super_similar_words_4'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_similar_words_4.json')
for i in range(5):
    find_longest_distance(data_supersampled_model, 'similar_words', 3, 9)

In [None]:
# Reload `data` and scan answer_lenght thresholds 1..5 five times.
dataset = 'squad_base'
data = pd.read_json('valid_pred_labeled_with_added_from_func.json')
for i in range(5):
    find_longest_distance(data, 'answer_lenght', 1, 6)

In [None]:
# Same answer_lenght scan (thresholds 1..5, five times) on the file loaded below.
dataset = 'squad_super_answer_length_3'
data_supersampled_model = pd.read_json('./from_debiased_models/enhanced_squad_supersampled_answer_length_3.json')
for i in range(5):
    find_longest_distance(data_supersampled_model, 'answer_lenght', 1, 6)

In [None]:
# Scan thresholds 7 and 8 on distances_new, ignoring rows with a negative value.
# The commented call is the earlier 2..6 range.
# find_longest_distance(data[data.distances_new >= 0], 'distances_new', 2, 7)
find_longest_distance(data[data.distances_new >= 0], 'distances_new', 7, 9)

In [None]:
# Scan thresholds 2..6 on distances, ignoring rows with a negative value.
find_longest_distance(data[data.distances >= 0], 'distances', 2, 7)

In [None]:
# Scan thresholds 0..4 on kth_sentence_new.
find_longest_distance(data, 'kth_sentence_new', 0, 5)

In [None]:
# Scan thresholds 0 and 1 on answer_subject_positions; the filter is applied inline so `data` stays intact.
# data = data[data.answer_subject_positions >= 0]
find_longest_distance(data[data.answer_subject_positions >= 0], 'answer_subject_positions', 0, 2)

In [None]:
# NOTE(review): overwrites the global `data` with the filtered rows; every later cell
# that uses `data` sees only rows with answer_subject_positions >= 0 until it is reloaded.
data = data[data.answer_subject_positions >= 0]
find_longest_distance(data, 'answer_subject_positions', 0, 2)

In [None]:
# NOTE(review): same code as the previous cell; it overwrites the global `data` with
# the rows where answer_subject_positions >= 0 and rescans thresholds 0 and 1.
data = data[data.answer_subject_positions >= 0]
find_longest_distance(data, 'answer_subject_positions', 0, 2)

In [None]:
# Scan thresholds 0..2 on subject_in_context_count, ignoring rows with a negative value (filter applied inline).
# data = data[data.subject_in_context_count >= 0]
find_longest_distance(data[data.subject_in_context_count >= 0], 'subject_in_context_count', 0, 3)

In [None]:
# Scan thresholds 0..4 on max_sim_ents.
find_longest_distance(data, 'max_sim_ents', 0, 5)

In [None]:
# Scan thresholds 0..4 on max_sim_ents (same call as the preceding cell; results differ only through random sampling).
find_longest_distance(data, 'max_sim_ents', 0, 5)

In [None]:
# Scan thresholds 0 and 1 on subject_in_context_count (no filtering of negative values here).
find_longest_distance(data, 'subject_in_context_count', 0, 2)

In [None]:
# Scan thresholds 0 and 1 on answer_subject_positions using `data` as it currently stands.
find_longest_distance(data, 'answer_subject_positions', 0, 2)

In [None]:
# Scan thresholds 0..4 on answer_subject_positions.
find_longest_distance(data, 'answer_subject_positions', 0, 5)

In [None]:
# Scan thresholds 2..6 on distances_new (negative values are not filtered out here).
find_longest_distance(data, 'distances_new', 2, 7)

In [None]:
# Scan thresholds 2..6 on distances (negative values are not filtered out here).
find_longest_distance(data, 'distances', 2, 7)

In [None]:
# Recode distance -1 as 1000 so those rows fall on the "> threshold" side of every split
# instead of the "<= threshold" side.
# NOTE(review): mutates `data` in place; if `data` is a filtered slice from an earlier cell,
# pandas may emit SettingWithCopyWarning - confirm the assignment takes effect.
data.loc[data['distances'] == -1, 'distances'] = 1000
# Show how many rows carry each distance value after the recode.
data['distances'].value_counts()

In [None]:
# Scan thresholds 2..6 on distances using `data` as it currently stands.
find_longest_distance(data, 'distances', 2, 7)

In [None]:
# Scan thresholds 1..4 on answer_lenght.
find_longest_distance(data, 'answer_lenght', 1, 5)

In [None]:
# Grid over sample sizes 200..1000 (step 100) and draw counts 100..500 (step 100),
# scanning distances thresholds 2..6 for each combination.
# NOTE(review): rebinds the globals sample_size and num_samples; they stay at the
# last values (1000 and 500) for all later cells.
for size in range(200, 1100, 100):
    sample_size = size
    for iters in range(100, 600, 100):
        num_samples = iters
        find_longest_distance(data, 'distances', 2, 7)

In [None]:
# Same grid restricted to sample sizes 900 and 1000 (draw counts 100..500, step 100).
# NOTE(review): rebinds the globals sample_size and num_samples; they stay at the
# last values (1000 and 500) for all later cells.
for size in range(900, 1100, 100):
    sample_size = size
    for iters in range(100, 600, 100):
        num_samples = iters
        find_longest_distance(data, 'distances', 2, 7)

In [None]:
# Grid over sample sizes 200..1000 (step 100) and draw counts 100..500 (step 100),
# scanning similar_words thresholds 3..9 for each combination.
# NOTE(review): rebinds the globals sample_size and num_samples; they stay at the
# last values (1000 and 500) for all later cells.
for size in range(200, 1100, 100):
    sample_size = size
    for iters in range(100, 600, 100):
        num_samples = iters
        find_longest_distance(data, 'similar_words', 3, 10)

In [None]:
# Coarser grid: sample sizes 200..1000 (step 200) and draw counts 100, 300, 500,
# scanning kth_sentence thresholds 0..4 for each combination.
# NOTE(review): rebinds the globals sample_size and num_samples; they stay at the
# last values (1000 and 500) for all later cells.
for size in range(200, 1100, 200):
    sample_size = size
    for iters in range(100, 600, 200):
        num_samples = iters
        find_longest_distance(data, 'kth_sentence', 0, 5)

In [None]:
# Coarse grid (sample sizes 200..1000 step 200, draw counts 100, 300, 500) over the
# cosine_similarity thresholds 0.10 to 0.60.
# The globals sample_size and num_samples keep their last values (1000 and 500) afterwards.
for size in range(200, 1100, 200):
    sample_size = size
    for iters in range(100, 600, 200):
        num_samples = iters
        for cosine_threshold in (0.10, 0.20, 0.30, 0.40, 0.50, 0.60):
            compute_metrics_average_split(data, 'cosine_similarity', cosine_threshold)

In [None]:
# Scan thresholds 2..6 on distances with the current global sample_size / num_samples.
find_longest_distance(data, 'distances', 2, 7)

In [None]:
# Scan thresholds 3..9 on similar_words.
find_longest_distance(data, 'similar_words', 3, 10)

In [None]:
# Scan thresholds 2..5 on similar_words.
find_longest_distance(data, 'similar_words', 2, 6)

In [None]:
# Scan thresholds 0..4 on kth_sentence; the commented calls are alternative ranges.
find_longest_distance(data, 'kth_sentence', 0, 5) # enlarge the interval, try larger k
# find_longest_distance(data, 'kth_sentence', 5, 8) # enlarge the interval, try larger k
# find_longest_distance(data, 'kth_sentence', 1, 5) # enlarge the interval, try larger k

In [None]:
# Override the notebook-level resampling settings. These assignments stay in
# effect for every cell executed after this one, not just for the call below.
num_samples = 200
sample_size = 1000

# Run on 'kth_sentence' with arguments 4 and 6 under the settings above
# (presumably read by the function as globals - confirm against its definition).
find_longest_distance(data, 'kth_sentence', 4, 6)

In [None]:
# 'cosine_similarity' split at threshold 0.10 - first of ten cells covering
# thresholds 0.10 to 0.55 in steps of 0.05.
# The call's result is unpacked elsewhere in this notebook as a pair
# (exact-match distance, F1 distance); here it is shown as the cell output.
compute_metrics_average_split(data, 'cosine_similarity', 0.10)

In [None]:
# 'cosine_similarity' split at threshold 0.15 (part of the 0.10-0.55 series,
# one cell per threshold).
compute_metrics_average_split(data, 'cosine_similarity', 0.15)

In [None]:
# 'cosine_similarity' split at threshold 0.20 (part of the 0.10-0.55 series,
# one cell per threshold).
compute_metrics_average_split(data, 'cosine_similarity', 0.20)

In [None]:
# 'cosine_similarity' split at threshold 0.25 (part of the 0.10-0.55 series,
# one cell per threshold).
compute_metrics_average_split(data, 'cosine_similarity', 0.25)

In [None]:
# 'cosine_similarity' split at threshold 0.30 (part of the 0.10-0.55 series,
# one cell per threshold).
compute_metrics_average_split(data, 'cosine_similarity', 0.30)

In [None]:
# 'cosine_similarity' split at threshold 0.35 (part of the 0.10-0.55 series,
# one cell per threshold).
compute_metrics_average_split(data, 'cosine_similarity', 0.35)

In [None]:
# 'cosine_similarity' split at threshold 0.40 (part of the 0.10-0.55 series,
# one cell per threshold).
compute_metrics_average_split(data, 'cosine_similarity', 0.40)

In [None]:
# 'cosine_similarity' split at threshold 0.45 (part of the 0.10-0.55 series,
# one cell per threshold).
compute_metrics_average_split(data, 'cosine_similarity', 0.45)

In [None]:
# 'cosine_similarity' split at threshold 0.50 (part of the 0.10-0.55 series,
# one cell per threshold).
compute_metrics_average_split(data, 'cosine_similarity', 0.50)

In [None]:
# 'cosine_similarity' split at threshold 0.55 - last cell of the 0.10-0.55
# series.
compute_metrics_average_split(data, 'cosine_similarity', 0.55)

In [None]:
# 'distances' split at threshold 5.
# NOTE(review): if the -1 -> 1000 recoding cell has been run, the recoded rows
# are part of `data` here and sit far above this threshold.
compute_metrics_average_split(data, 'distances', 5)

In [None]:
# 'distances' split at threshold 6.
# NOTE(review): if the -1 -> 1000 recoding cell has been run, the recoded rows
# are part of `data` here and sit far above this threshold.
compute_metrics_average_split(data, 'distances', 6)

In [None]:
# Try 'similar_words' thresholds 3, 4 and 5 and report, separately for exact
# match and F1, the threshold whose split gives the largest distance between
# the two groups' intervals. compute_metrics_average_split (defined elsewhere
# in this notebook) is unpacked as (exact-match distance, F1 distance).

# Start from "nothing evaluated yet". The earlier initial values
# (threshold 0, distance 0) were reported as the winner whenever no threshold
# produced a strictly positive distance, although threshold 0 is never tried.
index_em = None
index_f1 = None
max_em_distance = float("-inf")
max_f1_distance = float("-inf")

for i in range(3, 6, 1):
    distance_em, distance_f1 = compute_metrics_average_split(data, 'similar_words', i)
    # Strict '>' keeps the first (smallest) threshold when distances tie.
    if distance_em > max_em_distance:
        max_em_distance = distance_em
        index_em = i
    if distance_f1 > max_f1_distance:
        max_f1_distance = distance_f1
        index_f1 = i

print(f"The biggest distance between exact match intervals was with threshold {index_em} and the distance was {max_em_distance}.")
print(f"The biggest distance between f1 intervals was with threshold {index_f1} and the distance was {max_f1_distance}.")

In [None]:
# 'similar_words' split at threshold 2; the returned pair of interval
# distances is shown as the cell output.
compute_metrics_average_split(data, 'similar_words', 2)

In [None]:
# 'distances' split at threshold 3 - the same cut-off the older inline code
# further down in this notebook uses (`distances <= 3`).
compute_metrics_average_split(data, 'distances', 3)

In [None]:
# 'similar_words' split at threshold 4 - the same cut-off the older inline
# code further down in this notebook uses (`similar_words <= 4`).
compute_metrics_average_split(data, 'similar_words', 4)

In [None]:
# 'kth_sentence' split at threshold 3 (one of three cells trying 3, 2 and 1).
compute_metrics_average_split(data, 'kth_sentence', 3)

In [None]:
# 'kth_sentence' split at threshold 2 (one of three cells trying 3, 2 and 1).
compute_metrics_average_split(data, 'kth_sentence', 2)

In [None]:
# 'kth_sentence' split at threshold 1 (one of three cells trying 3, 2 and 1).
compute_metrics_average_split(data, 'kth_sentence', 1)

In [None]:
# 'cosine_similarity' split at threshold 0.35.
# NOTE(review): this repeats a call already made in the 0.10-0.55 series; no
# seed is set in the visible cells, so the result presumably differs from run
# to run.
compute_metrics_average_split(data, 'cosine_similarity', 0.35)

In [None]:
# 'cosine_similarity' split at threshold 0.25.
# NOTE(review): this repeats a call already made in the 0.10-0.55 series; no
# seed is set in the visible cells, so the result presumably differs from run
# to run.
compute_metrics_average_split(data, 'cosine_similarity', 0.25)

In [None]:
# 'cosine_similarity' split at threshold 0.45.
# NOTE(review): this repeats a call already made in the 0.10-0.55 series; no
# seed is set in the visible cells, so the result presumably differs from run
# to run.
compute_metrics_average_split(data, 'cosine_similarity', 0.45)

### Older segments of code, now used in functions above

In [None]:
# Keep only rows with a non-negative 'distances' value.
# NOTE(review): an earlier cell recodes distances of -1 to 1000; once that
# cell has run, this filter no longer removes those rows and they end up in
# data_higher.
data_distances = data[data.distances >= 0]

# Split the filtered rows at distance 3 with explicit masks.
# The previous version grouped the *unfiltered* `data` by a key built from
# `data_distances` and unpacked the groups positionally, which relied on
# implicit index alignment, on the (False, True) group order, and on both
# groups being non-empty (otherwise the unpacking raised ValueError).
data_lower = data_distances[data_distances['distances'] <= 3]
data_higher = data_distances[data_distances['distances'] > 3]

In [None]:
# Resample the lower-distance group (`data_lower`) num_samples times and
# record the exact-match and F1 score of every draw.
exact_list_lower = []
f1_list_lower = []

for i in tqdm(range(num_samples)):
    # No replace= or random_state= is passed: pandas draws sample_size rows
    # without replacement, and raises ValueError if the group has fewer than
    # sample_size rows. Unless a seed is set elsewhere, draws differ per run.
    df = data_lower.sample(n=sample_size)
    # compute_metrics_for_sample reads columns by name and iterates rows as
    # dicts, hence the conversion to a datasets.Dataset.
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_lower.append(metrics1['exact_match'])
    f1_list_lower.append(metrics1['f1'])

In [None]:
# Collect the per-draw scores of the lower-distance group into a DataFrame
# with one row per draw. Despite its name, `lower_than_4` is built from
# `data_lower`, i.e. the rows with distances <= 3.
d_lower = dict(exact_match=exact_list_lower, f1=f1_list_lower)
lower_than_4 = pd.DataFrame(d_lower)
lower_than_4

In [None]:
# Resample the higher-distance group (`data_higher`) num_samples times and
# record the exact-match and F1 score of every draw.
exact_list_higher = []
f1_list_higher = []

for i in tqdm(range(num_samples)):
    # No replace= or random_state= is passed: pandas draws sample_size rows
    # without replacement, and raises ValueError if the group has fewer than
    # sample_size rows. Unless a seed is set elsewhere, draws differ per run.
    df = data_higher.sample(n=sample_size)
    # compute_metrics_for_sample reads columns by name and iterates rows as
    # dicts, hence the conversion to a datasets.Dataset.
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_higher.append(metrics1['exact_match'])
    f1_list_higher.append(metrics1['f1'])

In [None]:
# Collect the per-draw scores of the higher-distance group into a DataFrame
# with one row per draw. Despite its name, `higher_than_4` is built from
# `data_higher`, i.e. the rows with distances > 3.
d_higher = dict(exact_match=exact_list_higher, f1=f1_list_higher)
higher_than_4 = pd.DataFrame(d_higher)
higher_than_4

### Quantiles

In [None]:
# 2.5% and 97.5% quantiles of each score column for the lower-distance group,
# i.e. the bounds of a 95% percentile interval over the resampled scores.
lower_than_4.quantile([0.025, 0.975])

In [None]:
# 2.5% and 97.5% quantiles of each score column for the higher-distance group,
# i.e. the bounds of a 95% percentile interval over the resampled scores.
higher_than_4.quantile([0.025, 0.975])

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the resampled
# exact-match and F1 scores for the lower-distance group.
lower_than_4.describe()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the resampled
# exact-match and F1 scores for the higher-distance group.
higher_than_4.describe()

### Splitting of the data

#### Based on the number of similar words shared by the question and the context: at most 4 versus more than 4

In [None]:
# Split `data` by the number of similar words: at most 4 -> "lower",
# everything else -> "higher".
# Explicit masks replace the earlier positional unpacking of
# `data.groupby(...)`, which depended on the (False, True) group order and
# raised ValueError when every row fell on the same side of the threshold.
data_similar_words_lower = data[data['similar_words'] <= 4]
# Negating the same condition (instead of testing `> 4`) keeps rows with a
# missing 'similar_words' value in the "higher" group, as before.
data_similar_words_higher = data[~(data['similar_words'] <= 4)]
print('Higher count data len: ', len(data_similar_words_higher))
print('Lower count data len: ', len(data_similar_words_lower))

### Computation of metrics for samples

In [None]:
# Resample the group with at most 4 similar words num_samples times and
# record the exact-match and F1 score of every draw.
exact_list_similar_words_lower = []
f1_list_similar_words_lower = []

for i in tqdm(range(num_samples)):
    # No replace= or random_state= is passed: pandas draws sample_size rows
    # without replacement, and raises ValueError if the group has fewer than
    # sample_size rows. Unless a seed is set elsewhere, draws differ per run.
    df = data_similar_words_lower.sample(n=sample_size)
    # compute_metrics_for_sample reads columns by name and iterates rows as
    # dicts, hence the conversion to a datasets.Dataset.
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_similar_words_lower.append(metrics1['exact_match'])
    f1_list_similar_words_lower.append(metrics1['f1'])

In [None]:
# Collect the per-draw scores of the "at most 4 similar words" group into a
# DataFrame with one row per draw.
d_similar_words_lower = dict(
    exact_match=exact_list_similar_words_lower,
    f1=f1_list_similar_words_lower,
)
lower_similar_words_than_4 = pd.DataFrame(d_similar_words_lower)
lower_similar_words_than_4

In [None]:
# Resample the group with more than 4 similar words num_samples times and
# record the exact-match and F1 score of every draw.
exact_list_similar_words_higher = []
f1_list_similar_words_higher = []

for i in tqdm(range(num_samples)):
    # No replace= or random_state= is passed: pandas draws sample_size rows
    # without replacement, and raises ValueError if the group has fewer than
    # sample_size rows. Unless a seed is set elsewhere, draws differ per run.
    df = data_similar_words_higher.sample(n=sample_size)
    # compute_metrics_for_sample reads columns by name and iterates rows as
    # dicts, hence the conversion to a datasets.Dataset.
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_similar_words_higher.append(metrics1['exact_match'])
    f1_list_similar_words_higher.append(metrics1['f1'])

In [None]:
# Collect the per-draw scores of the "more than 4 similar words" group into a
# DataFrame with one row per draw.
d_similar_words_higher = dict(
    exact_match=exact_list_similar_words_higher,
    f1=f1_list_similar_words_higher,
)
higher_similar_words_than_4 = pd.DataFrame(d_similar_words_higher)
higher_similar_words_than_4

### Quantiles

In [None]:
# 2.5% and 97.5% quantiles of each score column for the "at most 4 similar
# words" group, i.e. the bounds of a 95% percentile interval.
lower_similar_words_than_4.quantile([0.025, 0.975])

In [None]:
# 2.5% and 97.5% quantiles of each score column for the "more than 4 similar
# words" group, i.e. the bounds of a 95% percentile interval.
higher_similar_words_than_4.quantile([0.025, 0.975])

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the resampled
# scores for the "at most 4 similar words" group.
lower_similar_words_than_4.describe()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the resampled
# scores for the "more than 4 similar words" group.
higher_similar_words_than_4.describe()