## Init

In [1]:
# Use %pip (not !pip) so packages are installed into the running kernel's environment.
# `datasets` is capped below 3.0 because evaluate_model relies on `datasets.load_metric`,
# which is not available in later releases.
%pip install -q "datasets<3" transformers pandas tqdm matplotlib

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [3]:
# Flag choosing between the "squad_v2" and "squad" dataset names.
# In the cells visible here it is only referenced by the commented-out
# SQuAD entry of `all_datasets`, so it currently has no effect.
squad_v2 = True

In [4]:
def load_model_and_tokenizer(model_name):
    """Load a question-answering model and its tokenizer from one checkpoint.

    Args:
        model_name: Checkpoint identifier passed unchanged to both
            ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple, in that order.
    """
    # The tokenizer is fetched first, then the model, from the same checkpoint.
    qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return qa_model, qa_tokenizer

In [5]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from tqdm.auto import tqdm

def evaluate_model(model, tokenizer, dataset, subset_size=100, split='train'):
    """Score a question-answering model on the first examples of one dataset split.

    Each example is run through a "question-answering" pipeline and the
    predictions are accumulated into the "squad_v2" metric.

    Args:
        model: Model handed to the question-answering pipeline.
        tokenizer: Tokenizer handed to the question-answering pipeline.
        dataset: Mapping from split name to a split object that supports
            ``len()``, ``.select(indices)`` and iteration over examples
            exposing the "id", "question", "context" and "answers" fields.
        subset_size: Maximum number of leading examples to evaluate. It is
            clamped to the size of the split; ``None`` evaluates the whole split.
        split: Name of the split to evaluate (default ``'train'``).

    Returns:
        Whatever ``metric.compute()`` returns for the "squad_v2" metric.

    Raises:
        KeyError: If ``split`` is not present in ``dataset``.
    """
    nlp = pipeline("question-answering", model=model, tokenizer=tokenizer)

    split_data = dataset[split]

    # Clamp so that splits smaller than `subset_size` do not make
    # `.select()` fail on out-of-range indices.
    available = len(split_data)
    n_examples = available if subset_size is None else min(subset_size, available)
    subset = split_data.select(range(n_examples))

    # NOTE: `datasets.load_metric` is deprecated upstream in favour of the
    # separate `evaluate` package; it is kept here to avoid a new dependency.
    metric = load_metric("squad_v2")

    for example in tqdm(subset, desc=f"Evaluating ({split} split)", unit="example"):
        outputs = nlp({
            "question": example["question"],
            "context": example["context"]
        })
        prediction = {
            "prediction_text": outputs['answer'],
            "id": example["id"],
            # NOTE(review): a constant 0.0 is reported for every example, so the
            # model's no-answer confidence is never passed to the metric —
            # confirm this is acceptable for datasets with unanswerable questions.
            "no_answer_probability": 0.0
        }
        reference = {
            "answers": example["answers"],
            "id": example["id"]
        }
        metric.add(prediction=prediction, reference=reference)

    return metric.compute()

In [6]:
# (name, loaded dataset) pairs to evaluate. Every uncommented entry calls
# load_dataset() when this cell runs; the commented-out entries are
# alternative datasets that can be re-enabled.
all_datasets = [
    # ("squad_v2" if squad_v2 else "squad", load_dataset("squad_v2" if squad_v2 else "squad")),

    # ("cuad", load_dataset("cuad")),

    # ("covid_qa_deepset", load_dataset("covid_qa_deepset")),

    ("TriviaQA_SQuAD", load_dataset("Kkordik/TriviaQA_SQuAD")),

    # ("books", load_dataset("subjqa", "books")),
    # ("movies", load_dataset("subjqa", "movies")),
    # ("restaurants", load_dataset("subjqa", "restaurants")),
    # ("electronics", load_dataset("subjqa", "electronics")),
    # ("grocery", load_dataset("subjqa", "grocery")),
    # ("tripadvisor", load_dataset("subjqa", "tripadvisor"))
    # Add other datasets as needed
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/273M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14229 [00:00<?, ? examples/s]

In [7]:
# List of model checkpoints to evaluate.
# NOTE(review): the run logs in this notebook report `qa_outputs.bias` and
# `qa_outputs.weight` as newly initialized for all three checkpoints, i.e. the
# QA heads are untrained — confirm that scoring them without fine-tuning is intended.
models_checkpoint = [
    "squeezebert/squeezebert-uncased",
    "google-bert/bert-large-uncased",
    "microsoft/deberta-large"
]

## SQuAD v2

In [None]:
# Collect one record per (dataset, model, metric). The DataFrame is built once
# after the loops instead of being re-concatenated on every iteration.
# Note: this iterates whatever `all_datasets` currently holds.
score_records = []

# Evaluate each model on each dataset
for dataset_name, dataset in all_datasets:
    for model_name in models_checkpoint:
        model, tokenizer = load_model_and_tokenizer(model_name)
        scores = evaluate_model(model, tokenizer, dataset)
        score_records.extend(
            {'Dataset': dataset_name, 'Model': model_name, 'Metric': metric_name, 'Value': value}
            for metric_name, value in scores.items()
        )

# Long-format results: one row per metric value
results_df = pd.DataFrame(score_records, columns=['Dataset', 'Model', 'Metric', 'Value'])

# Pivot for comparison
results_pivot = results_df.pivot_table(index=['Dataset', 'Model'], columns='Metric', values='Value')
display(results_pivot)  # Using display for better formatting

Some weights of SqueezeBertForQuestionAnswering were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (validation split):   0%|          | 0/11873 [00:00<?, ?example/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (validation split):   0%|          | 0/11873 [00:00<?, ?example/s]

Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (validation split):   0%|          | 0/11873 [00:00<?, ?example/s]

Unnamed: 0_level_0,Metric,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh,exact,f1,total
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
squad_v2,google-bert/bert-large-uncased,0.404858,7.303644,5928.0,1.295206,1.295206,5945.0,50.071591,0.0,50.077206,0.0,0.85067,4.295123,11873.0
squad_v2,microsoft/deberta-large,0.269906,7.866964,5928.0,0.454163,0.454163,5945.0,50.071591,0.0,50.071591,0.0,0.362166,4.155257,11873.0
squad_v2,squeezebert/squeezebert-uncased,0.18556,6.393064,5928.0,0.992431,0.992431,5945.0,50.071591,0.0,50.073997,0.0,0.589573,3.688881,11873.0


## SubjQA

In [None]:
# Collect one record per (dataset, model, metric). The DataFrame is built once
# after the loops instead of being re-concatenated on every iteration.
# Note: this iterates whatever `all_datasets` currently holds.
score_records = []

# Evaluate each model on each dataset
for dataset_name, dataset in all_datasets:
    for model_name in models_checkpoint:
        model, tokenizer = load_model_and_tokenizer(model_name)
        scores = evaluate_model(model, tokenizer, dataset)
        score_records.extend(
            {'Dataset': dataset_name, 'Model': model_name, 'Metric': metric_name, 'Value': value}
            for metric_name, value in scores.items()
        )

# Long-format results: one row per metric value
results_df = pd.DataFrame(score_records, columns=['Dataset', 'Model', 'Metric', 'Value'])

# Pivot for comparison
results_pivot = results_df.pivot_table(index=['Dataset', 'Model'], columns='Metric', values='Value')
display(results_pivot)  # Using display for better formatting

config.json:   0%|          | 0.00/500 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/103M [00:00<?, ?B/s]

Some weights of SqueezeBertForQuestionAnswering were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("squad_v2")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Evaluating (test split):   0%|          | 0/345 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/345 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/345 [00:00<?, ?example/s]

Some weights of SqueezeBertForQuestionAnswering were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/291 [00:00<?, ?example/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/291 [00:00<?, ?example/s]

Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/291 [00:00<?, ?example/s]

Some weights of SqueezeBertForQuestionAnswering were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/266 [00:00<?, ?example/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/266 [00:00<?, ?example/s]

Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/266 [00:00<?, ?example/s]

Some weights of SqueezeBertForQuestionAnswering were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/358 [00:00<?, ?example/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/358 [00:00<?, ?example/s]

Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/358 [00:00<?, ?example/s]

Some weights of SqueezeBertForQuestionAnswering were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/591 [00:00<?, ?example/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/591 [00:00<?, ?example/s]

Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/591 [00:00<?, ?example/s]

Some weights of SqueezeBertForQuestionAnswering were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/512 [00:00<?, ?example/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/512 [00:00<?, ?example/s]

Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/512 [00:00<?, ?example/s]

Unnamed: 0_level_0,Metric,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh,exact,f1,total
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
books,google-bert/bert-large-uncased,0.0,8.783961,103.0,1.652893,1.652893,242.0,70.144928,0.0,70.144928,0.0,1.15942,3.781878,345.0
books,microsoft/deberta-large,0.0,9.135216,103.0,0.0,0.0,242.0,70.144928,0.0,70.144928,0.0,0.0,2.727325,345.0
books,squeezebert/squeezebert-uncased,0.0,7.959382,103.0,0.0,0.0,242.0,70.144928,0.0,70.144928,0.0,0.0,2.376279,345.0
electronics,google-bert/bert-large-uncased,0.625,13.389173,160.0,0.0,0.0,198.0,55.307263,0.0,55.307263,0.0,0.27933,5.983988,358.0
electronics,microsoft/deberta-large,0.0,8.291876,160.0,0.0,0.0,198.0,55.307263,0.0,55.336666,0.0,0.0,3.705867,358.0
electronics,squeezebert/squeezebert-uncased,0.0,11.607155,160.0,1.010101,1.010101,198.0,55.307263,0.0,55.307263,0.0,0.558659,5.746215,358.0
grocery,google-bert/bert-large-uncased,1.119403,13.851732,268.0,0.619195,0.619195,323.0,54.65313,0.0,54.65313,0.0,0.846024,6.619736,591.0
grocery,microsoft/deberta-large,0.373134,11.907818,268.0,0.0,0.0,323.0,54.65313,0.0,54.65313,0.0,0.169205,5.399823,591.0
grocery,squeezebert/squeezebert-uncased,0.0,8.263615,268.0,4.643963,4.643963,323.0,54.65313,0.0,54.65313,0.0,2.538071,6.285362,591.0
movies,google-bert/bert-large-uncased,2.325581,11.169458,86.0,1.463415,1.463415,205.0,70.446735,0.0,70.446735,0.0,1.718213,4.331867,291.0


## CUAD

In [None]:
# Collect one record per (dataset, model, metric). The DataFrame is built once
# after the loops instead of being re-concatenated on every iteration.
# Note: this iterates whatever `all_datasets` currently holds.
score_records = []

# Evaluate each model on each dataset
for dataset_name, dataset in all_datasets:
    for model_name in models_checkpoint:
        model, tokenizer = load_model_and_tokenizer(model_name)
        scores = evaluate_model(model, tokenizer, dataset)
        score_records.extend(
            {'Dataset': dataset_name, 'Model': model_name, 'Metric': metric_name, 'Value': value}
            for metric_name, value in scores.items()
        )

# Long-format results: one row per metric value
results_df = pd.DataFrame(score_records, columns=['Dataset', 'Model', 'Metric', 'Value'])

# Pivot for comparison
results_pivot = results_df.pivot_table(index=['Dataset', 'Model'], columns='Metric', values='Value')
display(results_pivot)  # Using display for better formatting

config.json:   0%|          | 0.00/500 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/103M [00:00<?, ?B/s]

Some weights of SqueezeBertForQuestionAnswering were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("squad_v2")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Evaluating (test split):   0%|          | 0/100 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/100 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (test split):   0%|          | 0/100 [00:00<?, ?example/s]

Unnamed: 0_level_0,Metric,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh,exact,f1,total
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cuad,google-bert/bert-large-uncased,0.0,1.548707,28.0,1.388889,1.388889,72.0,72.0,0.0,72.0,0.0,1.0,1.433638,100.0
cuad,microsoft/deberta-large,0.0,3.563727,28.0,0.0,0.0,72.0,72.0,0.0,72.0,0.0,0.0,0.997844,100.0
cuad,squeezebert/squeezebert-uncased,0.0,2.089198,28.0,0.0,0.0,72.0,72.0,0.0,72.0,0.0,0.0,0.584975,100.0


## covid_qa_deepset

In [None]:
# Collect one record per (dataset, model, metric). The DataFrame is built once
# after the loops instead of being re-concatenated on every iteration.
# Note: this iterates whatever `all_datasets` currently holds.
score_records = []

# Evaluate each model on each dataset
for dataset_name, dataset in all_datasets:
    for model_name in models_checkpoint:
        model, tokenizer = load_model_and_tokenizer(model_name)
        scores = evaluate_model(model, tokenizer, dataset)
        score_records.extend(
            {'Dataset': dataset_name, 'Model': model_name, 'Metric': metric_name, 'Value': value}
            for metric_name, value in scores.items()
        )

# Long-format results: one row per metric value
results_df = pd.DataFrame(score_records, columns=['Dataset', 'Model', 'Metric', 'Value'])

# Pivot for comparison
results_pivot = results_df.pivot_table(index=['Dataset', 'Model'], columns='Metric', values='Value')
display(results_pivot)  # Using display for better formatting

config.json:   0%|          | 0.00/500 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/103M [00:00<?, ?B/s]

Some weights of SqueezeBertForQuestionAnswering were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("squad_v2")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Evaluating (train split):   0%|          | 0/100 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (train split):   0%|          | 0/100 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (train split):   0%|          | 0/100 [00:00<?, ?example/s]

Unnamed: 0_level_0,Metric,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh,exact,f1,total
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cuad,google-bert/bert-large-uncased,0.0,7.30186,60.0,0.0,0.0,40.0,40.0,0.0,40.772664,0.0,0.0,4.381116,100.0
cuad,microsoft/deberta-large,0.0,1.685389,60.0,10.0,10.0,40.0,40.0,0.0,40.116667,0.0,4.0,5.011233,100.0
cuad,squeezebert/squeezebert-uncased,0.0,9.398043,60.0,0.0,0.0,40.0,40.0,0.0,41.167169,0.0,0.0,5.638826,100.0


## Kkordik/TriviaQA_SQuAD

In [8]:
# Collect one record per (dataset, model, metric). The DataFrame is built once
# after the loops instead of being re-concatenated on every iteration.
# Note: this iterates whatever `all_datasets` currently holds.
score_records = []

# Evaluate each model on each dataset
for dataset_name, dataset in all_datasets:
    for model_name in models_checkpoint:
        model, tokenizer = load_model_and_tokenizer(model_name)
        scores = evaluate_model(model, tokenizer, dataset)
        score_records.extend(
            {'Dataset': dataset_name, 'Model': model_name, 'Metric': metric_name, 'Value': value}
            for metric_name, value in scores.items()
        )

# Long-format results: one row per metric value
results_df = pd.DataFrame(score_records, columns=['Dataset', 'Model', 'Metric', 'Value'])

# Pivot for comparison
results_pivot = results_df.pivot_table(index=['Dataset', 'Model'], columns='Metric', values='Value')
display(results_pivot)  # Using display for better formatting

config.json:   0%|          | 0.00/500 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/103M [00:00<?, ?B/s]

Some weights of SqueezeBertForQuestionAnswering were not initialized from the model checkpoint at squeezebert/squeezebert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("squad_v2")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Evaluating (train split):   0%|          | 0/100 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (train split):   0%|          | 0/100 [00:00<?, ?example/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluating (train split):   0%|          | 0/100 [00:00<?, ?example/s]

Unnamed: 0_level_0,Metric,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh,exact,f1,total
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
TriviaQA_SQuAD,google-bert/bert-large-uncased,0.0,0.588235,85.0,0.0,0.0,15.0,15.0,0.0,15.0,0.0,0.0,0.5,100.0
TriviaQA_SQuAD,microsoft/deberta-large,0.0,0.261438,85.0,0.0,0.0,15.0,15.0,0.0,15.0,0.0,0.0,0.222222,100.0
TriviaQA_SQuAD,squeezebert/squeezebert-uncased,0.0,0.392157,85.0,0.0,0.0,15.0,15.0,0.0,15.0,0.0,0.0,0.333333,100.0
