## Notebook for bootstrapping evaluation

### Imports

In [1]:
import transformers
from datasets import load_metric
import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm

### Parameters for number of iterations and number of selected items

In [2]:
num_samples = 200
sample_size = 1000
num_for_average_metrics = 5

In [3]:
metric = load_metric("squad")

In [4]:
def compute_metrics_for_sample(sample):
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in zip(sample['id'], sample['prediction_text'])]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in sample]
    return metric.compute(predictions=formatted_predictions, references=references)

### Data loading

In [5]:
data = pd.read_json('valid_pred_labeled_with_added_from_func.json')

### Spliting of the data 

#### Based on distance of the closest word (from question) from the answer in context - items with distance lower or equal than 3 and items with distance higher than 3

### Computation of metrics for samples

In [6]:
with open("metrics_for_specific_runs.csv", "w") as file:
    file.write(f"name,samples,sample_size,iters,field,threshold,type,exact_match_quantile_0.025,exact_match_quantile_0.975,exact_match_mean,f1_quantile_0.025,f1_quantile_0.975,f1_mean,len_lower,len_higher")

In [7]:
with open("average_metrics_for_runs.csv", "w") as file:
    file.write(f"name,samples,sample_size,iters,field,threshold,type,exact_match_quantile_0.025,exact_match_quantile_0.975,exact_match_mean,f1_quantile_0.025,f1_quantile_0.975,f1_mean,len_lower,len_higher")

In [8]:
with open("evaluated_metrics_for_average.csv", "w") as file:
    file.write(f"metric,samples,sample_size,iters,field,threshold,is_not_overlap,distance,len_lower,len_higher")

In [9]:
def compute_metrics_for_bunch(data):
    exact_list = []
    f1_list = []

    for i in tqdm(range(num_samples)):
        df = data.sample(n=sample_size)
        sample = Dataset.from_pandas(df)
        metrics1 = compute_metrics_for_sample(sample)
        exact_list.append(metrics1['exact_match'])
        f1_list.append(metrics1['f1'])
    
    d = {'exact_match': exact_list, 'f1': f1_list}
    df = pd.DataFrame(d)

    return df

In [10]:
def find_the_distance_between_intervals(lower_025, lower_975, higher_025, higher_975):
    distance_between_intervals = 0
    if lower_975 > higher_025 and lower_025 > higher_975:
        distance_between_intervals = lower_025 - higher_975
        return True, distance_between_intervals 
    elif higher_975 > lower_025 and higher_025 > lower_975:
        distance_between_intervals = higher_025 - lower_975
        return True, distance_between_intervals
    else:
        return False, distance_between_intervals

In [11]:
from statistics import mean

def compute_metrics_average_split(data, field, threshold):
    if field == 'distances':
        data = data[data.distances >= 0]
    data_higher, data_lower = [x for _, x in data.groupby(data[field] <= threshold)]

    lower_exact_match_quantile_025 = []
    lower_exact_match_quantile_975 = []
    lower_exact_match_mean = []
    lower_f1_quantile_025 = []
    lower_f1_quantile_975 = []
    lower_f1_mean = []
    higher_exact_match_quantile_025 = []
    higher_exact_match_quantile_975 = []
    higher_exact_match_mean = []
    higher_f1_quantile_025 = []
    higher_f1_quantile_975 = []
    higher_f1_mean = []


    for i in tqdm(range(num_for_average_metrics)):
        df_lower = compute_metrics_for_bunch(data_lower)
        lower_exact_match_quantile_025.append(df_lower['exact_match'].quantile(0.025))
        lower_exact_match_quantile_975.append(df_lower['exact_match'].quantile(0.975))
        lower_exact_match_mean.append(df_lower['exact_match'].mean())
        lower_f1_quantile_025.append(df_lower['f1'].quantile(0.025))
        lower_f1_quantile_975.append(df_lower['f1'].quantile(0.975))
        lower_f1_mean.append(df_lower['f1'].mean())

        with open("metrics_for_specific_runs.csv", "a") as file_append:
            file_append.write(f"\nlower_than_{threshold}_for_field_{field},{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},lower,{df_lower['exact_match'].quantile(0.025)},{df_lower['exact_match'].quantile(0.975)},{df_lower['exact_match'].mean()},{df_lower['f1'].quantile(0.025)},{df_lower['f1'].quantile(0.975)},{df_lower['f1'].mean()},{len(data_lower)},{len(data_higher)}")

        df_higher = compute_metrics_for_bunch(data_higher)
        higher_exact_match_quantile_025.append(df_higher['exact_match'].quantile(0.025))
        higher_exact_match_quantile_975.append(df_higher['exact_match'].quantile(0.975))
        higher_exact_match_mean.append(df_higher['exact_match'].mean())
        higher_f1_quantile_025.append(df_higher['f1'].quantile(0.025))
        higher_f1_quantile_975.append(df_higher['f1'].quantile(0.975))
        higher_f1_mean.append(df_higher['f1'].mean())

        with open("metrics_for_specific_runs.csv", "a") as file_append:
            file_append.write(f"\nhigher_than_{threshold}_for_field_{field},{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},higher,{df_higher['exact_match'].quantile(0.025)},{df_higher['exact_match'].quantile(0.975)},{df_higher['exact_match'].mean()},{df_higher['f1'].quantile(0.025)},{df_higher['f1'].quantile(0.975)},{df_higher['f1'].mean()},{len(data_lower)},{len(data_higher)}")

    with open("average_metrics_for_runs.csv", "a") as file_append:
        file_append.write(f"\naverage_lower_than_{threshold}_for_field_{field},{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},lower,{mean(lower_exact_match_quantile_025)},{mean(lower_exact_match_quantile_975)},{mean(lower_exact_match_mean)},{mean(lower_f1_quantile_025)},{mean(lower_f1_quantile_975)},{mean(lower_f1_mean)},{len(data_lower)},{len(data_higher)}")
        file_append.write(f"\naverage_higher_than_{threshold}_for_field_{field},{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},higher,{mean(higher_exact_match_quantile_025)},{mean(higher_exact_match_quantile_975)},{mean(higher_exact_match_mean)},{mean(higher_f1_quantile_025)},{mean(higher_f1_quantile_975)},{mean(higher_f1_mean)},{len(data_lower)},{len(data_higher)}")

    is_not_overlap_em, distance_em = find_the_distance_between_intervals(mean(lower_exact_match_quantile_025), mean(lower_exact_match_quantile_975), mean(higher_exact_match_quantile_025), mean(higher_exact_match_quantile_975))
    is_not_overlap_f1, distance_f1 = find_the_distance_between_intervals(mean(lower_f1_quantile_025), mean(lower_f1_quantile_975), mean(higher_f1_quantile_025), mean(higher_f1_quantile_975))

    with open("evaluated_metrics_for_average.csv", "a") as file_append:
        file_append.write(f"\nexact_match,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},{is_not_overlap_em},{distance_em},{len(data_lower)},{len(data_higher)}")
        file_append.write(f"\nf1,{num_samples},{sample_size},{num_for_average_metrics},{field},{threshold},{is_not_overlap_f1},{distance_f1},{len(data_lower)},{len(data_higher)}")

    print(f"Average exact match ---- are independent: {is_not_overlap_em} the distance is: {distance_em}")

    print(f"Average f1 ---- are independent: {is_not_overlap_f1} the distance is: {distance_f1}")

    return distance_em, distance_f1


In [12]:
def find_longest_distance(data, field, low_bound, upp_bound):
    index_em = 0
    index_f1 = 0
    max_em_distance = 0
    max_f1_distance = 0

    distance_em = 0
    distance_f1 = 0

    for i in tqdm(range(low_bound, upp_bound, 1)):
        distance_em, distance_f1 = compute_metrics_average_split(data, field, i)
        if distance_em > max_em_distance:
            max_em_distance = distance_em
            index_em = i
        if distance_f1 > max_f1_distance:
            max_f1_distance = distance_f1
            index_f1 = i

    print(f"The biggest distance between exact match intervals was with threshold {index_em} and the distance was {max_em_distance}.")
    print(f"The biggest distance between f1 intervals was with threshold {index_f1} and the distance was {max_f1_distance}.")


In [13]:
find_longest_distance(data, 'distances', 2, 7)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 1.4505000000000052
Average f1 ---- are independent: True the distance is: 1.3836281592006685


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 3.2734999999999985
Average f1 ---- are independent: True the distance is: 2.702064250582737


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 4.214000000000013
Average f1 ---- are independent: True the distance is: 3.584386115122342


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 4.732500000000002
Average f1 ---- are independent: True the distance is: 4.076764962863081


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 5.656499999999994
Average f1 ---- are independent: True the distance is: 5.180134607120863
The biggest distance between exact match intervals was with threshold 6 and the distance was 5.656499999999994.
The biggest distance between f1 intervals was with threshold 6 and the distance was 5.180134607120863.


In [14]:
find_longest_distance(data, 'similar_words', 2, 6)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 1.2155000000000058
Average f1 ---- are independent: True the distance is: 0.8222327323854017


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: True the distance is: 0.004509768556843596


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 0.4759999999999991
Average f1 ---- are independent: True the distance is: 0.7584167460689599
The biggest distance between exact match intervals was with threshold 2 and the distance was 1.2155000000000058.
The biggest distance between f1 intervals was with threshold 2 and the distance was 0.8222327323854017.


In [15]:
find_longest_distance(data, 'kth_sentence', 1, 5)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0
The biggest distance between exact match intervals was with threshold 0 and the distance was 0.
The biggest distance between f1 intervals was with threshold 0 and the distance was 0.


In [16]:
compute_metrics_average_split(data, 'cosine_similarity', 0.25)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


(0, 0)

In [17]:
compute_metrics_average_split(data, 'cosine_similarity', 0.30)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


(0, 0)

In [18]:
compute_metrics_average_split(data, 'cosine_similarity', 0.35)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


(0, 0)

In [19]:
compute_metrics_average_split(data, 'cosine_similarity', 0.40)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


(0, 0)

In [20]:
compute_metrics_average_split(data, 'cosine_similarity', 0.45)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


(0, 0)

In [23]:
compute_metrics_average_split(data, 'distances', 5)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 4.358000000000004
Average f1 ---- are independent: True the distance is: 3.768811714146352


(4.358000000000004, 3.768811714146352)

In [25]:
compute_metrics_average_split(data, 'distances', 6)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [13]:
index_em = 0
index_f1 = 0
max_em_distance = 0
max_f1_distance = 0

distance_em = 0
distance_f1 = 0

for i in range(3, 6, 1):
    distance_em, distance_f1 = compute_metrics_average_split(data, 'similar_words', i)
    if distance_em > max_em_distance:
        max_em_distance = distance_em
        index_em = i
    if distance_f1 > max_f1_distance:
        max_f1_distance = distance_f1
        index_f1 = i

print(f"The biggest distance between exact match intervals was with threshold {index_em} and the distance was {max_em_distance}.")
print(f"The biggest distance between f1 intervals was with threshold {index_f1} and the distance was {max_f1_distance}.")

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 0.8335000000000008
Average f1 ---- are independent: True the distance is: 0.36808708851265237


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 0.1915000000000049
Average f1 ---- are independent: True the distance is: 0.10871203410809471


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 0.2950000000000017
Average f1 ---- are independent: True the distance is: 0.24164632624137994
The biggest distance between exact match intervals was with threshold 3 and the distance was 0.8335000000000008.
The biggest distance between f1 intervals was with threshold 3 and the distance was 0.36808708851265237.


In [24]:
compute_metrics_average_split(data, 'similar_words', 2)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 2.8160000000000025
Average f1 ---- are independent: True the distance is: 1.6793645000818458


(2.8160000000000025, 1.6793645000818458)

In [45]:
compute_metrics_average_split(data, 'distances', 3)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 3.2974999999999994
Average f1 ---- are independent: True the distance is: 2.6901958482312978


In [46]:
compute_metrics_average_split(data, 'similar_words', 4)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: True the distance is: 0.59375
Average f1 ---- are independent: True the distance is: 0.13620745615331487


In [18]:
compute_metrics_average_split(data, 'kth_sentence', 3)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


(0, 0)

In [None]:
compute_metrics_average_split(data, 'kth_sentence', 2)

In [None]:
compute_metrics_average_split(data, 'kth_sentence', 1)

In [None]:
compute_metrics_average_split(data, 'cosine_similarity', 0.35)

In [21]:
compute_metrics_average_split(data, 'cosine_similarity', 0.25)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


(0, 0)

In [22]:
compute_metrics_average_split(data, 'cosine_similarity', 0.45)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

Average exact match ---- are independent: False the distance is: 0
Average f1 ---- are independent: False the distance is: 0


(0, 0)

### Older segments of code, now used in functions above

In [23]:
data_distances = data[data.distances >= 0]
data_higher, data_lower = [x for _, x in data.groupby(data_distances['distances'] <= 3)]

In [24]:
exact_list_lower = []
f1_list_lower = []

for i in tqdm(range(num_samples)):
    df = data_lower.sample(n=sample_size)
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_lower.append(metrics1['exact_match'])
    f1_list_lower.append(metrics1['f1'])

  0%|          | 0/200 [00:00<?, ?it/s]

In [25]:
d_lower = {'exact_match': exact_list_lower, 'f1': f1_list_lower}
d_lower
lower_than_4 = pd.DataFrame(d_lower)
lower_than_4

Unnamed: 0,exact_match,f1
0,84.0,90.846708
1,84.6,91.083183
2,83.3,89.857095
3,84.1,90.562758
4,84.4,90.841649
...,...,...
195,83.5,90.683836
196,83.1,90.170529
197,83.8,89.997452
198,84.0,90.788456


In [26]:
exact_list_higher = []
f1_list_higher = []

for i in tqdm(range(num_samples)):
    df = data_higher.sample(n=sample_size)
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_higher.append(metrics1['exact_match'])
    f1_list_higher.append(metrics1['f1'])

  0%|          | 0/200 [00:00<?, ?it/s]

In [27]:
d_higher = {'exact_match': exact_list_higher, 'f1': f1_list_higher}
d_higher
higher_than_4 = pd.DataFrame(d_higher)
higher_than_4

Unnamed: 0,exact_match,f1
0,76.8,85.262767
1,76.2,85.448763
2,76.8,85.289513
3,75.3,84.098369
4,76.2,84.887286
...,...,...
195,75.4,84.798147
196,75.0,83.924188
197,75.0,85.048418
198,73.8,83.353506


### Quantiles

In [28]:
lower_than_4.quantile([0.025, 0.975])

Unnamed: 0,exact_match,f1
0.025,81.5,89.203266
0.975,85.3025,91.658113


In [29]:
higher_than_4.quantile([0.025, 0.975])

Unnamed: 0,exact_match,f1
0.025,74.2875,83.591081
0.975,78.4025,86.670196


In [30]:
lower_than_4.describe()

Unnamed: 0,exact_match,f1
count,200.0,200.0
mean,83.5575,90.499838
std,0.95151,0.6589
min,80.6,87.897474
25%,83.0,90.086818
50%,83.6,90.51326
75%,84.225,90.958312
max,85.6,92.109433


In [31]:
higher_than_4.describe()

Unnamed: 0,exact_match,f1
count,200.0,200.0
mean,76.228,85.123828
std,1.06068,0.824414
min,73.1,82.74085
25%,75.5,84.481497
50%,76.2,85.142433
75%,76.9,85.704912
max,78.9,86.942339


### Spliting of the data

#### Based on count of the similar words between question and context - lower or equal to 4 and higher than 4

In [88]:
data_similar_words_higher, data_similar_words_lower = [x for _, x in data.groupby(data['similar_words'] <= 4)]
print('Higher count data len: ', len(data_similar_words_higher))
print('Lower count data len: ', len(data_similar_words_lower))

Higher count data len:  5740
Lower count data len:  4830


### Computation of metrics for samples

In [89]:
exact_list_similar_words_lower = []
f1_list_similar_words_lower = []

for i in tqdm(range(num_samples)):
    df = data_similar_words_lower.sample(n=sample_size)
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_similar_words_lower.append(metrics1['exact_match'])
    f1_list_similar_words_lower.append(metrics1['f1'])

  0%|          | 0/200 [00:00<?, ?it/s]

In [90]:
d_similar_words_lower = {'exact_match': exact_list_similar_words_lower, 'f1': f1_list_similar_words_lower}
d_similar_words_lower
lower_similar_words_than_4 = pd.DataFrame(d_similar_words_lower)
lower_similar_words_than_4

Unnamed: 0,exact_match,f1
0,77.9,86.201630
1,75.6,84.616505
2,75.7,84.998308
3,76.6,85.272872
4,77.3,85.852570
...,...,...
195,78.6,87.390954
196,77.3,86.016487
197,77.7,86.395169
198,77.7,85.788061


In [91]:
exact_list_similar_words_higher = []
f1_list_similar_words_higher = []

for i in tqdm(range(num_samples)):
    df = data_similar_words_higher.sample(n=sample_size)
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_similar_words_higher.append(metrics1['exact_match'])
    f1_list_similar_words_higher.append(metrics1['f1'])

  0%|          | 0/200 [00:00<?, ?it/s]

In [92]:
d_similar_words_higher = {'exact_match': exact_list_similar_words_higher, 'f1': f1_list_similar_words_higher}
d_similar_words_higher
higher_similar_words_than_4 = pd.DataFrame(d_similar_words_higher)
higher_similar_words_than_4

Unnamed: 0,exact_match,f1
0,82.9,89.668875
1,82.0,89.146711
2,85.1,91.292489
3,82.1,89.113918
4,82.2,88.752351
...,...,...
195,82.3,89.376262
196,82.3,89.489812
197,84.4,90.674647
198,81.8,88.256092


### Quantiles

In [93]:
lower_similar_words_than_4.quantile([0.025, 0.975])

Unnamed: 0,exact_match,f1
0.025,75.4,84.693802
0.975,79.9025,87.822978


In [94]:
higher_similar_words_than_4.quantile([0.025, 0.975])

Unnamed: 0,exact_match,f1
0.025,80.5975,87.976679
0.975,84.805,91.222243


In [95]:
lower_similar_words_than_4.describe()

Unnamed: 0,exact_match,f1
count,200.0,200.0
mean,77.7515,86.170985
std,1.185845,0.80069
min,75.0,84.148025
25%,76.8,85.648972
50%,77.85,86.092368
75%,78.6,86.68711
max,80.9,88.708345


In [96]:
higher_similar_words_than_4.describe()

Unnamed: 0,exact_match,f1
count,200.0,200.0
mean,82.6785,89.498549
std,1.15471,0.842324
min,79.3,87.639278
25%,81.9,88.921644
50%,82.7,89.436036
75%,83.425,89.917512
max,85.7,91.702404
