### Imports

In [18]:
import transformers
from datasets import load_metric
import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm

### Parameters for number of iterations and number of selected items

In [19]:
# Number of repeated random draws performed for every data subset.
num_samples = 200
# Number of rows taken from the subset in each individual draw.
sample_size = 1_000

In [20]:
# Load the "squad" metric once; compute_metrics_for_sample() calls
# `metric.compute(...)` on this module-level object.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against
# the installed version.
metric = load_metric("squad")

In [21]:
def compute_metrics_for_sample(sample):
    """Score one sample of predictions with the module-level `metric`.

    Args:
        sample: a dataset-like object that supports column access
            (`sample['id']`, `sample['prediction_text']`) and whose
            iteration yields row mappings with the keys "id" and "answers".

    Returns:
        Whatever `metric.compute` returns for the given predictions and
        references (the calling cells read its 'exact_match' and 'f1' keys).
    """
    # Predictions are assembled column-wise: pair every id with its predicted text.
    predictions = []
    for example_id, predicted_text in zip(sample['id'], sample['prediction_text']):
        predictions.append({"id": example_id, "prediction_text": predicted_text})

    # References are assembled row-wise from the same sample.
    gold = []
    for row in sample:
        gold.append({"id": row["id"], "answers": row["answers"]})

    return metric.compute(predictions=predictions, references=gold)

### Data loading

In [22]:
# Read the labelled validation predictions into a DataFrame.
# Later cells rely on the columns `distances` and `similar_words` for the
# splits, and on `id`, `prediction_text` and `answers` for scoring.
data = pd.read_json('valid_pred_labeled_with_added_from_func.json')

### Splitting of the data

#### Based on the distance between the answer and the closest question word in the context - items with distance lower than or equal to 3 and items with distance higher than 3

In [23]:
# Keep only the rows with a non-negative distance.
# NOTE(review): negative values presumably mark items for which no distance
# could be measured — confirm against the labelling code.
data_distances = data[data.distances >= 0]

# Split with explicit boolean masks on the filtered frame. The previous
# version grouped the *unfiltered* `data` by a key built from
# `data_distances` (relying on index alignment to drop the other rows) and
# unpacked the resulting list, which raises ValueError as soon as all rows
# fall on one side of the threshold.
data_lower = data_distances[data_distances['distances'] <= 3]   # distance <= 3
data_higher = data_distances[data_distances['distances'] > 3]   # distance > 3

### Computation of metrics for samples

In [24]:
# Repeatedly draw `sample_size` rows from the distance <= 3 subset and record
# the exact-match and F1 score of every draw.
exact_list_lower = []
f1_list_lower = []

for i in tqdm(range(num_samples)):
    # Seed every draw with its iteration index so that re-running the cell
    # reproduces the same samples and therefore the same scores.
    # Rows are drawn without replacement (pandas default).
    df = data_lower.sample(n=sample_size, random_state=i)
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_lower.append(metrics1['exact_match'])
    f1_list_lower.append(metrics1['f1'])

  0%|          | 0/200 [00:00<?, ?it/s]

In [25]:
# Collect the per-draw scores of the distance <= 3 subset into a DataFrame
# (one row per draw) and display it. The stray bare `d_lower` expression was
# removed: only the last expression of a cell is rendered, so it did nothing.
d_lower = {'exact_match': exact_list_lower, 'f1': f1_list_lower}
lower_than_4 = pd.DataFrame(d_lower)
lower_than_4

Unnamed: 0,exact_match,f1
0,84.0,90.846708
1,84.6,91.083183
2,83.3,89.857095
3,84.1,90.562758
4,84.4,90.841649
...,...,...
195,83.5,90.683836
196,83.1,90.170529
197,83.8,89.997452
198,84.0,90.788456


In [26]:
# Repeatedly draw `sample_size` rows from the distance > 3 subset and record
# the exact-match and F1 score of every draw.
exact_list_higher = []
f1_list_higher = []

for i in tqdm(range(num_samples)):
    # Seed every draw with its iteration index so that re-running the cell
    # reproduces the same samples and therefore the same scores.
    # Rows are drawn without replacement (pandas default).
    df = data_higher.sample(n=sample_size, random_state=i)
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_higher.append(metrics1['exact_match'])
    f1_list_higher.append(metrics1['f1'])

  0%|          | 0/200 [00:00<?, ?it/s]

In [27]:
# Collect the per-draw scores of the distance > 3 subset into a DataFrame
# (one row per draw) and display it. The stray bare `d_higher` expression was
# removed: only the last expression of a cell is rendered, so it did nothing.
# NOTE: despite its name, `higher_than_4` holds the results for `data_higher`,
# i.e. items whose distance is higher than 3.
d_higher = {'exact_match': exact_list_higher, 'f1': f1_list_higher}
higher_than_4 = pd.DataFrame(d_higher)
higher_than_4

Unnamed: 0,exact_match,f1
0,76.8,85.262767
1,76.2,85.448763
2,76.8,85.289513
3,75.3,84.098369
4,76.2,84.887286
...,...,...
195,75.4,84.798147
196,75.0,83.924188
197,75.0,85.048418
198,73.8,83.353506


### Quantiles

In [28]:
# 2.5% and 97.5% quantiles of each score column, i.e. the bounds that enclose
# the central 95% of the sampled scores for the distance <= 3 subset.
lower_than_4.quantile([0.025, 0.975])

Unnamed: 0,exact_match,f1
0.025,81.5,89.203266
0.975,85.3025,91.658113


In [29]:
# 2.5% and 97.5% quantiles of each score column, i.e. the bounds that enclose
# the central 95% of the sampled scores for the distance > 3 subset.
higher_than_4.quantile([0.025, 0.975])

Unnamed: 0,exact_match,f1
0.025,74.2875,83.591081
0.975,78.4025,86.670196


In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sampled
# scores for the distance <= 3 subset.
lower_than_4.describe()

Unnamed: 0,exact_match,f1
count,200.0,200.0
mean,83.5575,90.499838
std,0.95151,0.6589
min,80.6,87.897474
25%,83.0,90.086818
50%,83.6,90.51326
75%,84.225,90.958312
max,85.6,92.109433


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sampled
# scores for the distance > 3 subset.
higher_than_4.describe()

Unnamed: 0,exact_match,f1
count,200.0,200.0
mean,76.228,85.123828
std,1.06068,0.824414
min,73.1,82.74085
25%,75.5,84.481497
50%,76.2,85.142433
75%,76.9,85.704912
max,78.9,86.942339


### Splitting of the data

#### Based on the count of similar words between question and context - lower than or equal to 4 and higher than 4

In [57]:
# Split with explicit boolean masks instead of unpacking a groupby result:
# the unpacking raised ValueError whenever all rows fell on one side of the
# threshold (only one group), and depended on the False/True group order.
# `~(... <= 4)` keeps the original partition exactly: every row for which the
# comparison is False (including missing values, if there are any) stays in
# the "higher" subset.
data_similar_words_lower = data[data['similar_words'] <= 4]      # count <= 4
data_similar_words_higher = data[~(data['similar_words'] <= 4)]  # count > 4

### Computation of metrics for samples

In [58]:
# Repeatedly draw `sample_size` rows from the similar-words <= 4 subset and
# record the exact-match and F1 score of every draw.
exact_list_similar_words_lower = []
f1_list_similar_words_lower = []

for i in tqdm(range(num_samples)):
    # Seed every draw with its iteration index so that re-running the cell
    # reproduces the same samples and therefore the same scores.
    # Rows are drawn without replacement (pandas default).
    df = data_similar_words_lower.sample(n=sample_size, random_state=i)
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_similar_words_lower.append(metrics1['exact_match'])
    f1_list_similar_words_lower.append(metrics1['f1'])

  0%|          | 0/200 [00:00<?, ?it/s]

In [59]:
# Collect the per-draw scores of the similar-words <= 4 subset into a
# DataFrame (one row per draw) and display it. The stray bare
# `d_similar_words_lower` expression was removed: only the last expression of
# a cell is rendered, so it did nothing.
# NOTE: despite its name, `lower_similar_words_than_4` covers items with a
# similar-word count lower than *or equal to* 4.
d_similar_words_lower = {'exact_match': exact_list_similar_words_lower, 'f1': f1_list_similar_words_lower}
lower_similar_words_than_4 = pd.DataFrame(d_similar_words_lower)
lower_similar_words_than_4

Unnamed: 0,exact_match,f1
0,79.0,86.758308
1,77.9,86.616237
2,77.4,85.859641
3,78.7,86.944239
4,76.3,85.239768
...,...,...
195,78.8,86.393697
196,78.9,86.386395
197,78.3,86.727103
198,79.4,86.717770


In [60]:
# Repeatedly draw `sample_size` rows from the similar-words > 4 subset and
# record the exact-match and F1 score of every draw.
exact_list_similar_words_higher = []
f1_list_similar_words_higher = []

for i in tqdm(range(num_samples)):
    # Seed every draw with its iteration index so that re-running the cell
    # reproduces the same samples and therefore the same scores.
    # Rows are drawn without replacement (pandas default).
    df = data_similar_words_higher.sample(n=sample_size, random_state=i)
    sample = Dataset.from_pandas(df)
    metrics1 = compute_metrics_for_sample(sample)
    exact_list_similar_words_higher.append(metrics1['exact_match'])
    f1_list_similar_words_higher.append(metrics1['f1'])

  0%|          | 0/200 [00:00<?, ?it/s]

In [61]:
# Collect the per-draw scores of the similar-words > 4 subset into a
# DataFrame (one row per draw) and display it. The stray bare
# `d_similar_words_higher` expression was removed: only the last expression
# of a cell is rendered, so it did nothing.
d_similar_words_higher = {'exact_match': exact_list_similar_words_higher, 'f1': f1_list_similar_words_higher}
higher_similar_words_than_4 = pd.DataFrame(d_similar_words_higher)
higher_similar_words_than_4

Unnamed: 0,exact_match,f1
0,80.2,87.713997
1,82.0,89.037988
2,82.3,89.015988
3,82.8,88.829262
4,82.6,90.070936
...,...,...
195,82.0,89.572339
196,82.5,88.579791
197,82.7,90.425048
198,80.9,88.208816


### Quantiles

In [62]:
# 2.5% and 97.5% quantiles of each score column, i.e. the bounds that enclose
# the central 95% of the sampled scores for the similar-words <= 4 subset.
lower_similar_words_than_4.quantile([0.025, 0.975])

Unnamed: 0,exact_match,f1
0.025,75.6925,84.420636
0.975,80.21,87.980015


In [64]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sampled
# scores for the similar-words <= 4 subset.
lower_similar_words_than_4.describe()

Unnamed: 0,exact_match,f1
count,200.0,200.0
mean,78.082,86.337727
std,1.22009,0.93208
min,74.6,84.141195
25%,77.3,85.7206
50%,78.2,86.389669
75%,78.8,86.931798
max,83.8,90.130836


In [63]:
# 2.5% and 97.5% quantiles of each score column, i.e. the bounds that enclose
# the central 95% of the sampled scores for the similar-words > 4 subset.
higher_similar_words_than_4.quantile([0.025, 0.975])

Unnamed: 0,exact_match,f1
0.025,80.8975,88.207401
0.975,84.7,90.767912


In [65]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sampled
# scores for the similar-words > 4 subset.
higher_similar_words_than_4.describe()

Unnamed: 0,exact_match,f1
count,200.0,200.0
mean,82.5825,89.456218
std,1.006034,0.699929
min,80.1,87.713997
25%,81.9,88.964384
50%,82.5,89.403153
75%,83.2,89.959503
max,85.6,91.342664
