## Init

In [None]:
!pip install datasets transformers pandas tqdm matplotlib -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [None]:
# Chooses the SQuAD flavour ("squad_v2" when True, "squad" when False) for the
# SQuAD entry of `all_datasets`; that entry is currently commented out there.
squad_v2 = True

In [None]:
def load_model_and_tokenizer(model_name):
    """Load a question-answering checkpoint together with its tokenizer.

    Args:
        model_name: Identifier passed unchanged to both `from_pretrained` calls.

    Returns:
        A `(model, tokenizer)` tuple -- note the model comes first.
    """
    # The tokenizer is fetched before the model, as in the original cell.
    qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return (qa_model, qa_tokenizer)

In [None]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from tqdm.auto import tqdm

def evaluate_model(model, tokenizer, dataset, subset_size=100, split='train',
                   handle_impossible_answer=True):
    """Run a question-answering pipeline over part of a dataset and score it.

    Every example read from the split must expose the "question", "context",
    "id" and "answers" fields; predictions are scored with the "squad_v2" metric.

    Args:
        model: Question-answering model handed to `pipeline`.
        tokenizer: Tokenizer handed to `pipeline` alongside the model.
        dataset: Mapping from split name to a dataset that supports `len()`
            and `.select(indices)`.
        subset_size: Maximum number of leading examples to evaluate. The value
            is clamped to the size of the split; pass None to use the whole split.
        split: Name of the split to read from `dataset` (defaults to 'train').
        handle_impossible_answer: Forwarded to the pipeline call. When True the
            pipeline is allowed to return an empty answer, which is how an
            unanswerable question gets predicted; without it the prediction is
            never empty and every no-answer example is scored as wrong.

    Returns:
        The dictionary returned by the metric's `compute()`.

    Raises:
        KeyError: If `split` is not a key of `dataset`.
    """
    nlp = pipeline("question-answering", model=model, tokenizer=tokenizer)

    examples = dataset[split]
    if subset_size is not None:
        # Clamp so that splits smaller than `subset_size` are evaluated in full
        # instead of failing on an out-of-range index.
        examples = examples.select(range(min(subset_size, len(examples))))

    metric = load_metric("squad_v2")

    for example in tqdm(examples, desc=f"Evaluating ({split} split)", unit="example"):
        outputs = nlp(
            question=example["question"],
            context=example["context"],
            handle_impossible_answer=handle_impossible_answer,
        )
        answer_text = outputs['answer']
        metric.add(
            prediction={
                "prediction_text": answer_text,
                "id": example["id"],
                # Hard decision: an empty answer means the model abstained.
                "no_answer_probability": 0.0 if answer_text else 1.0,
            },
            reference={
                "answers": example["answers"],
                "id": example["id"],
            },
        )

    return metric.compute()

In [None]:
# (label, loaded dataset) pairs consumed by the evaluation cells below.
# Alternatives are kept commented out; only the uncommented entries are
# downloaded by `load_dataset` and evaluated.
all_datasets = [
    # ("squad_v2" if squad_v2 else "squad", load_dataset("squad_v2" if squad_v2 else "squad")),

    # ("cuad", load_dataset("cuad")),

    # ("covid_qa_deepset", load_dataset("covid_qa_deepset")),

    ("TriviaQA_SQuAD", load_dataset("Kkordik/TriviaQA_SQuAD")),

    # ("books", load_dataset("subjqa", "books")),
    # ("movies", load_dataset("subjqa", "movies")),
    # ("restaurants", load_dataset("subjqa", "restaurants")),
    # ("electronics", load_dataset("subjqa", "electronics")),
    # ("grocery", load_dataset("subjqa", "grocery")),
    # ("tripadvisor", load_dataset("subjqa", "tripadvisor"))
    # Add other datasets as needed
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/273M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14229 [00:00<?, ? examples/s]

In [None]:
# Model checkpoints to evaluate; each name is passed to
# `load_model_and_tokenizer` by the evaluation cells.
models_checkpoint = [
    "ALOQAS/squeezebert-uncased-finetuned-squad-v2",
    "ALOQAS/bert-large-uncased-finetuned-squad-v2",
    "ALOQAS/deberta-large-finetuned-squad-v2"
]

## Squad V2

In [None]:
# Collect one long-format record per (dataset, model, metric) and build the
# DataFrame once at the end instead of concatenating inside the loop.
# NOTE(review): this same cell is repeated under every section heading; it
# always evaluates whatever `all_datasets` currently contains.
score_records = []

for dataset_name, dataset in all_datasets:
    for model_name in models_checkpoint:
        model, tokenizer = load_model_and_tokenizer(model_name)
        scores = evaluate_model(model, tokenizer, dataset)
        score_records.extend(
            {'Dataset': dataset_name, 'Model': model_name, 'Metric': metric_name, 'Value': value}
            for metric_name, value in scores.items()
        )

results_df = pd.DataFrame(score_records, columns=['Dataset', 'Model', 'Metric', 'Value'])

# Wide view for comparison: one row per (dataset, model), one column per metric.
results_pivot = results_df.pivot_table(index=['Dataset', 'Model'], columns='Metric', values='Value')
display(results_pivot)  # Using display for better formatting

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/204M [00:00<?, ?B/s]

  metric = load_metric("squad_v2")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Evaluating (validation split):   0%|          | 0/11873 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/675 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (validation split):   0%|          | 0/11873 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/969 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/785 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (validation split):   0%|          | 0/11873 [00:00<?, ?example/s]

Unnamed: 0_level_0,Metric,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh,exact,f1,total
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
squad_v2,ALOQAS/bert-large-uncased-finetuned-squad-v2,81.443995,89.249316,5928.0,0.0,0.0,5945.0,50.113703,0.0,50.113703,0.0,40.663691,44.560763,11873.0
squad_v2,ALOQAS/deberta-large-finetuned-squad-v2,88.090418,94.509947,5928.0,0.016821,0.016821,5945.0,50.113703,0.0,50.748121,0.0,43.990567,47.195736,11873.0
squad_v2,ALOQAS/squeezebert-uncased-finetuned-squad-v2,72.216599,81.731813,5928.0,0.0,0.0,5945.0,50.113703,0.0,50.113703,0.0,36.056599,40.807394,11873.0


## Subjqa

In [None]:
# Collect one long-format record per (dataset, model, metric) and build the
# DataFrame once at the end instead of concatenating inside the loop.
# NOTE(review): this same cell is repeated under every section heading; it
# always evaluates whatever `all_datasets` currently contains.
score_records = []

for dataset_name, dataset in all_datasets:
    for model_name in models_checkpoint:
        model, tokenizer = load_model_and_tokenizer(model_name)
        scores = evaluate_model(model, tokenizer, dataset)
        score_records.extend(
            {'Dataset': dataset_name, 'Model': model_name, 'Metric': metric_name, 'Value': value}
            for metric_name, value in scores.items()
        )

results_df = pd.DataFrame(score_records, columns=['Dataset', 'Model', 'Metric', 'Value'])

# Wide view for comparison: one row per (dataset, model), one column per metric.
results_pivot = results_df.pivot_table(index=['Dataset', 'Model'], columns='Metric', values='Value')
display(results_pivot)  # Using display for better formatting

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/204M [00:00<?, ?B/s]

  metric = load_metric("squad_v2")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Evaluating (test split):   0%|          | 0/345 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/675 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/345 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/969 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/785 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/345 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/291 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/291 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/291 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/266 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/266 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/266 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/358 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/358 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/358 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/591 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/591 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/591 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/512 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/512 [00:00<?, ?example/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/512 [00:00<?, ?example/s]

Unnamed: 0_level_0,Metric,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh,exact,f1,total
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
books,ALOQAS/bert-large-uncased-finetuned-squad-v2,4.854369,26.878272,103.0,0.0,0.0,242.0,70.144928,0.0,70.144928,0.0,1.449275,8.024528,345.0
books,ALOQAS/deberta-large-finetuned-squad-v2,6.796117,28.863716,103.0,0.413223,0.413223,242.0,70.144928,0.0,70.144928,0.0,2.318841,8.907138,345.0
books,ALOQAS/squeezebert-uncased-finetuned-squad-v2,1.941748,27.121202,103.0,0.0,0.0,242.0,70.144928,0.0,70.144928,0.0,0.57971,8.097055,345.0
electronics,ALOQAS/bert-large-uncased-finetuned-squad-v2,3.75,27.806655,160.0,0.0,0.0,198.0,55.307263,0.0,55.307263,0.0,1.675978,12.427556,358.0
electronics,ALOQAS/deberta-large-finetuned-squad-v2,6.875,30.080849,160.0,0.0,0.0,198.0,55.307263,0.0,55.307263,0.0,3.072626,13.443955,358.0
electronics,ALOQAS/squeezebert-uncased-finetuned-squad-v2,1.875,24.848015,160.0,0.0,0.0,198.0,55.307263,0.0,55.307263,0.0,0.837989,11.105258,358.0
grocery,ALOQAS/bert-large-uncased-finetuned-squad-v2,4.477612,26.792789,268.0,0.0,0.0,323.0,54.65313,0.0,54.65313,0.0,2.030457,12.149691,591.0
grocery,ALOQAS/deberta-large-finetuned-squad-v2,6.716418,30.647142,268.0,0.0,0.0,323.0,54.65313,0.0,54.65313,0.0,3.045685,13.897519,591.0
grocery,ALOQAS/squeezebert-uncased-finetuned-squad-v2,3.731343,30.195135,268.0,0.0,0.0,323.0,54.65313,0.0,54.65313,0.0,1.692047,13.692548,591.0
movies,ALOQAS/bert-large-uncased-finetuned-squad-v2,1.162791,24.24163,86.0,0.0,0.0,205.0,70.446735,0.0,70.446735,0.0,0.343643,7.164193,291.0


## cuad

In [None]:
# Collect one long-format record per (dataset, model, metric) and build the
# DataFrame once at the end instead of concatenating inside the loop.
# NOTE(review): this same cell is repeated under every section heading; it
# always evaluates whatever `all_datasets` currently contains.
score_records = []

for dataset_name, dataset in all_datasets:
    for model_name in models_checkpoint:
        model, tokenizer = load_model_and_tokenizer(model_name)
        scores = evaluate_model(model, tokenizer, dataset)
        score_records.extend(
            {'Dataset': dataset_name, 'Model': model_name, 'Metric': metric_name, 'Value': value}
            for metric_name, value in scores.items()
        )

results_df = pd.DataFrame(score_records, columns=['Dataset', 'Model', 'Metric', 'Value'])

# Wide view for comparison: one row per (dataset, model), one column per metric.
results_pivot = results_df.pivot_table(index=['Dataset', 'Model'], columns='Metric', values='Value')
display(results_pivot)  # Using display for better formatting

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/204M [00:00<?, ?B/s]

  metric = load_metric("squad_v2")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Evaluating (test split):   0%|          | 0/100 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/675 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/100 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/969 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/785 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/100 [00:00<?, ?example/s]

Unnamed: 0_level_0,Metric,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh,exact,f1,total
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cuad,ALOQAS/bert-large-uncased-finetuned-squad-v2,7.142857,20.002342,28.0,0.0,0.0,72.0,72.0,0.0,72.0,0.0,2.0,5.600656,100.0
cuad,ALOQAS/deberta-large-finetuned-squad-v2,0.0,9.263134,28.0,0.0,0.0,72.0,72.0,0.0,72.0,0.0,0.0,2.593678,100.0
cuad,ALOQAS/squeezebert-uncased-finetuned-squad-v2,0.0,9.452988,28.0,0.0,0.0,72.0,72.0,0.0,72.0,0.0,0.0,2.646837,100.0


## covid_qa_deepset

In [None]:
# Collect one long-format record per (dataset, model, metric) and build the
# DataFrame once at the end instead of concatenating inside the loop.
# NOTE(review): this same cell is repeated under every section heading; it
# always evaluates whatever `all_datasets` currently contains.
score_records = []

for dataset_name, dataset in all_datasets:
    for model_name in models_checkpoint:
        model, tokenizer = load_model_and_tokenizer(model_name)
        scores = evaluate_model(model, tokenizer, dataset)
        score_records.extend(
            {'Dataset': dataset_name, 'Model': model_name, 'Metric': metric_name, 'Value': value}
            for metric_name, value in scores.items()
        )

results_df = pd.DataFrame(score_records, columns=['Dataset', 'Model', 'Metric', 'Value'])

# Wide view for comparison: one row per (dataset, model), one column per metric.
results_pivot = results_df.pivot_table(index=['Dataset', 'Model'], columns='Metric', values='Value')
display(results_pivot)  # Using display for better formatting

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/204M [00:00<?, ?B/s]

  metric = load_metric("squad_v2")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Evaluating (train split):   0%|          | 0/250 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/675 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (train split):   0%|          | 0/250 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/969 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/785 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (train split):   0%|          | 0/250 [00:00<?, ?example/s]

Unnamed: 0_level_0,Metric,HasAns_exact,HasAns_f1,HasAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh,exact,f1,total
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
covid_qa_deepset,ALOQAS/bert-large-uncased-finetuned-squad-v2,27.2,42.795043,250.0,27.2,0.0,42.795043,0.0,27.2,42.795043,250.0
covid_qa_deepset,ALOQAS/deberta-large-finetuned-squad-v2,36.8,54.157657,250.0,36.8,0.0,54.157657,0.0,36.8,54.157657,250.0
covid_qa_deepset,ALOQAS/squeezebert-uncased-finetuned-squad-v2,22.0,36.319472,250.0,22.0,0.0,36.319472,0.0,22.0,36.319472,250.0


## Kkordik/TriviaQA_SQuAD

In [None]:
# Collect one long-format record per (dataset, model, metric) and build the
# DataFrame once at the end instead of concatenating inside the loop.
# NOTE(review): this same cell is repeated under every section heading; it
# always evaluates whatever `all_datasets` currently contains.
score_records = []

for dataset_name, dataset in all_datasets:
    for model_name in models_checkpoint:
        model, tokenizer = load_model_and_tokenizer(model_name)
        scores = evaluate_model(model, tokenizer, dataset)
        score_records.extend(
            {'Dataset': dataset_name, 'Model': model_name, 'Metric': metric_name, 'Value': value}
            for metric_name, value in scores.items()
        )

results_df = pd.DataFrame(score_records, columns=['Dataset', 'Model', 'Metric', 'Value'])

# Wide view for comparison: one row per (dataset, model), one column per metric.
results_pivot = results_df.pivot_table(index=['Dataset', 'Model'], columns='Metric', values='Value')
display(results_pivot)  # Using display for better formatting

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/204M [00:00<?, ?B/s]

  metric = load_metric("squad_v2")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Evaluating (train split):   0%|          | 0/100 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/675 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (train split):   0%|          | 0/100 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/969 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/785 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (train split):   0%|          | 0/100 [00:00<?, ?example/s]

Unnamed: 0_level_0,Metric,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh,exact,f1,total
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
TriviaQA_SQuAD,ALOQAS/bert-large-uncased-finetuned-squad-v2,34.117647,40.196078,85.0,0.0,0.0,15.0,29.0,0.0,34.166667,0.0,29.0,34.166667,100.0
TriviaQA_SQuAD,ALOQAS/deberta-large-finetuned-squad-v2,31.764706,42.509804,85.0,0.0,0.0,15.0,28.0,0.0,36.466667,0.0,27.0,36.133333,100.0
TriviaQA_SQuAD,ALOQAS/squeezebert-uncased-finetuned-squad-v2,25.882353,33.098039,85.0,0.0,0.0,15.0,24.0,0.0,29.133333,0.0,22.0,28.133333,100.0
