### Import models

In [7]:
import json
from chat_GPT_request import chat_gpt_request
from key import KEY as OPENAI_KEY
from DuckDuckGo_search import duckduckgo_search
from wiki_search import wiki_search
from mdeberta_request import mdeberta_request
from valuation import strings_almost_equal
from tqdm import tqdm

### Load dataset

In [2]:
from datasets import load_dataset

# Fetch only the "validation" split of the "nq_open" dataset through
# `datasets.load_dataset`. Later cells read the "question" and "answer"
# fields of each row.
dataset = load_dataset(path="nq_open", split="validation")

### Model initialization

In [3]:
def func(question):
    """Placeholder model: ignore ``question`` and return a fixed answer string.

    Args:
        question: The question text; unused.

    Returns:
        The constant string ``'test answer'``.
    """
    placeholder_answer = 'test answer'
    return placeholder_answer

In [5]:
# Number of rows in the loaded split; every per-model answer list below has
# exactly this length.
n_data = len(dataset)

# Registry of the systems under evaluation. Each value is a callable that
# receives the question as its only argument and returns that system's answer.
# The "*_wiki" / "*_duck" variants additionally pass the result of a search on
# the question as context.
models = {
    "ChatGPT_vanille": lambda question: chat_gpt_request(
        key=OPENAI_KEY,
        question=question,
    ),
    "ChatGPT_wiki": lambda question: chat_gpt_request(
        key=OPENAI_KEY,
        question=question,
        context=wiki_search(question),
    ),
    "ChatGPT_duck": lambda question: chat_gpt_request(
        key=OPENAI_KEY,
        question=question,
        context=duckduckgo_search(question),
    ),
    "Mdeberta_wiki": lambda question: mdeberta_request(
        question=question,
        context=wiki_search(question),
    ),
    "Mdeberta_duck": lambda question: mdeberta_request(
        question=question,
        context=duckduckgo_search(question),
    ),
}

# One independent list per model, pre-filled with '' for every row so results
# can be written by index.
answers = {model_name: [''] * n_data for model_name in models}

### Model evaluation

In [8]:
# Ask every registered model every question and store the reply in
# answers[model_name][i].
for i in tqdm(range(n_data)):
    # Read the row once per example rather than once per model; the question
    # does not change across the inner loop.
    question = dataset[i]["question"]
    for model_name, model in models.items():
        try:
            answers[model_name][i] = model(question)
        except Exception as e:
            # Best effort: a failing model call must not abort the whole run,
            # so the error text is stored in place of the answer.
            # KeyboardInterrupt is not an Exception subclass and still stops
            # the loop.
            answers[model_name][i] = 'EXCEPTION: ' + str(e)

  0%|          | 2/3610 [01:02<31:26:58, 31.38s/it]


KeyboardInterrupt: 

In [13]:
# Persist the collected answers (model name -> list of answer strings) as JSON
# so the scoring step can reload them from disk.
with open("answers.json", mode="w") as answers_file:
    json.dump(answers, fp=answers_file)

In [14]:
# Reload the answers that were saved to disk.
with open('answers.json') as answers_file:
    answers = json.load(answers_file)

# Per-model score: for each row take the best `strings_almost_equal` result
# over all reference answers, then average over the whole split.
# Note: the denominator is always n_data, so rows a model never answered
# (still '') or that hold an 'EXCEPTION: ...' text are scored like any other
# prediction.
model_scores = {}
for model_name in models:
    predictions = answers[model_name]
    total = 0
    for idx in range(n_data):
        references = dataset[idx]['answer']
        total += max(strings_almost_equal(ref, predictions[idx]) for ref in references)
    model_scores[model_name] = total / n_data

model_scores

{'ChatGPT_vanille': 0.0008310249307479224,
 'ChatGPT_wiki': 0.0008310249307479224,
 'ChatGPT_duck': 0.0008310249307479224,
 'Mdeberta_wiki': 0.0008310249307479224,
 'Mdeberta_duck': 0.00110803324099723}