## Requirements
```
pip install transformers
pip install datasets
pip install nltk
```
Installeer [CUDA](https://developer.nvidia.com/cuda-downloads) en [PyTorch](https://pytorch.org/get-started/locally/) als je op GPU wilt trainen anders installeer alleen PyTorch op CPU

## Source
[source](https://colab.research.google.com/drive/1z-Zl2hftMrFXabYfmz8o9YZpgYx6sGeW?usp=sharing#scrollTo=OG82dfGbK63x)

# Load and install

In [1]:
import transformers
import datasets
import torch
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Preprocessing
sep_token = '<sep>'
dataset_name = "ms_marco"# v1.1
models_dir = "../../saved_models/t5_base-msmarco"
checkpoint = 't5-base'
max_input_length = 768
max_target_length = 128

## Training
learning_rate = 1e-4
num_epochs = 1

In [3]:
dataset = datasets.load_dataset(dataset_name,"v1.1")

Found cached dataset ms_marco (C:/Users/manuv/.cache/huggingface/datasets/ms_marco/v1.1/1.1.0/b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84)
100%|██████████| 3/3 [00:01<00:00,  2.41it/s]


In [4]:
dataset["train"][3]# context = passages.passage_text is list of passages(str) -> take passage where is_selected is 1, question: query, answers: answers = list of string

{'answers': ['$11 to $22 per square foot'],
 'passages': {'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1],
  'passage_text': ['In regards to tile installation costs, consumers can expect to pay an average of $25 per square foot, depending on the grade of material that is used. For a medium-sized shower, the price can cost about $2,000. Tile installation materials include:',
   '1 Polished nickel faucets-Average cost is $400 each plus four hours of installation; 2  Install ceramic tile floor to match shower-Average prices for installation are between $11 to $22 per square foot; 3  A light/fan combination-Averages at $180 and one hour',
   'Enhancement and improvement costs. 1  Polished nickel faucets-Average cost is $400 each plus four hours of installation; 2  Install ceramic tile floor to match shower-Average prices for installation are between $11 to $22 per square foot; 3  A light/fan combination-Averages at $180 and one hour of installation; 4  Insulate and re-finish ceilings and 5  ...

In [5]:
dataset["train"][3]["passages"]

{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1],
 'passage_text': ['In regards to tile installation costs, consumers can expect to pay an average of $25 per square foot, depending on the grade of material that is used. For a medium-sized shower, the price can cost about $2,000. Tile installation materials include:',
  '1 Polished nickel faucets-Average cost is $400 each plus four hours of installation; 2  Install ceramic tile floor to match shower-Average prices for installation are between $11 to $22 per square foot; 3  A light/fan combination-Averages at $180 and one hour',
  'Enhancement and improvement costs. 1  Polished nickel faucets-Average cost is $400 each plus four hours of installation; 2  Install ceramic tile floor to match shower-Average prices for installation are between $11 to $22 per square foot; 3  A light/fan combination-Averages at $180 and one hour of installation; 4  Insulate and re-finish ceilings and 5  ... Painti',
  'Granite shower tile is available at an average 

In [6]:
model = transformers.T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = transformers.T5TokenizerFast.from_pretrained(checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [7]:
tokenizer.sep_token = sep_token
tokenizer.add_tokens([sep_token])
model.resize_token_embeddings(len(tokenizer))

Embedding(32101, 768)

In [8]:
# Tokenize examples
def convert_to_features(example_batch):

    input_encodings = tokenizer.batch_encode_plus(example_batch['context'], 
                                                  max_length=max_input_length, 
                                                  add_special_tokens=True,
                                                  truncation=True, 
                                                  pad_to_max_length=True)
    
    target_encodings = tokenizer.batch_encode_plus(example_batch['question'], 
                                                   max_length=max_target_length, 
                                                   add_special_tokens=True,
                                                   truncation=True, pad_to_max_length=True)
                                                   
    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
    example['context'] = example['context'] + " " + sep_token
    example['question'] = example['question'] + " " + sep_token
    return example


def add_special_tokens(example):
  example['question'] = example['question'].replace("{sep_token}", sep_token)
  return example

def refactor_columns(example):
   is_selected = example["passages"]["is_selected"]
   passages_text = example["passages"]["passage_text"]
   indexes_selected = [i for i in range(len(is_selected)) if is_selected[i] == 1]
   context = "\n".join([passages_text[i] for i in indexes_selected])
   question = example['query']

   example["context"] = context
   example["question"] = question

   return example

In [9]:
tokenized_dataset = dataset.map(refactor_columns)
tokenized_dataset  = tokenized_dataset.map(add_eos_examples)
tokenized_dataset = tokenized_dataset.map(add_special_tokens)
tokenized_dataset  = tokenized_dataset.map(convert_to_features,  batched=True)

Loading cached processed dataset at C:\Users\manuv\.cache\huggingface\datasets\ms_marco\v1.1\1.1.0\b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84\cache-101f4a75820b9f6e.arrow
Loading cached processed dataset at C:\Users\manuv\.cache\huggingface\datasets\ms_marco\v1.1\1.1.0\b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84\cache-27f2b55914df1d2a.arrow
Loading cached processed dataset at C:\Users\manuv\.cache\huggingface\datasets\ms_marco\v1.1\1.1.0\b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84\cache-c352caa0f03b2468.arrow
Loading cached processed dataset at C:\Users\manuv\.cache\huggingface\datasets\ms_marco\v1.1\1.1.0\b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84\cache-dafbe92010328fd7.arrow
Loading cached processed dataset at C:\Users\manuv\.cache\huggingface\datasets\ms_marco\v1.1\1.1.0\b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84\cache-9804eb8465b152a3.arrow
Loading cached processed dataset at

In [10]:
tokenized_dataset["train"][0]["question"]

'what is rba <sep>'

In [11]:
tokenized_dataset["train"][0].keys()

dict_keys(['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers', 'context', 'question', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'])

In [12]:
tokenized_dataset = tokenized_dataset.remove_columns(
    ["answers", "passages","query","query_id","query_type","wellFormedAnswers","context","question"]
)

train_dataset = tokenized_dataset["train"]
valid_dataset = tokenized_dataset["validation"]

columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

In [13]:
torch.save(train_dataset, 'train_data.pt')
torch.save(valid_dataset, 'valid_data.pt')

In [14]:
from typing import Dict, List

class T2TDataCollator():
    def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
        """
        Take a list of samples from a Dataset and collate them into a batch.
        Returns:
        A dictionary of tensors
        """

        input_ids = torch.stack([example['input_ids'] for example in batch])
        lm_labels = torch.stack([example['decoder_input_ids'] for example in batch])
        lm_labels[lm_labels[:, :] == 0] = -100 
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
        decoder_attention_mask = torch.stack([example['decoder_attention_mask'] for example in batch])

        return {
            'input_ids': input_ids, 
            'attention_mask': attention_mask,
            'labels': lm_labels, 
            'decoder_attention_mask': decoder_attention_mask
        }

In [15]:
training_args = transformers.TrainingArguments(output_dir=models_dir, 
                                  per_device_train_batch_size=4, 
                                  per_device_eval_batch_size=4,
                                  gradient_accumulation_steps=16,
                                  learning_rate=learning_rate, 
                                  num_train_epochs=num_epochs,
                                  logging_steps=100,
                                  run_name="t5_answer-agnostic_msmarco",
                                  evaluation_strategy="steps",
                                  save_steps=500)

In [16]:
logger = logging.getLogger(__name__)

# Initialize our Trainer
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=T2TDataCollator()
)


In [17]:
if model.device.type == 'cuda':
    print('Model is on GPU')
else:
    print('Model is on CPU')

Model is on GPU


In [18]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmanu-vleurick[0m ([33mhogent-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


  8%|▊         | 100/1286 [16:13<3:14:50,  9.86s/it]

{'loss': 3.5303, 'learning_rate': 9.222395023328149e-05, 'epoch': 0.08}


                                                    
  8%|▊         | 100/1286 [25:02<3:14:50,  9.86s/it]

{'eval_loss': 2.2867953777313232, 'eval_runtime': 528.8825, 'eval_samples_per_second': 18.997, 'eval_steps_per_second': 4.75, 'epoch': 0.08}


 16%|█▌        | 200/1286 [41:31<2:58:51,  9.88s/it]  

{'loss': 2.3438, 'learning_rate': 8.4447900466563e-05, 'epoch': 0.16}


                                                    
 16%|█▌        | 200/1286 [50:20<2:58:51,  9.88s/it]

{'eval_loss': 2.128758430480957, 'eval_runtime': 529.3197, 'eval_samples_per_second': 18.981, 'eval_steps_per_second': 4.746, 'epoch': 0.16}


 23%|██▎       | 300/1286 [1:06:49<2:42:30,  9.89s/it]

{'loss': 2.1638, 'learning_rate': 7.667185069984448e-05, 'epoch': 0.23}


                                                      
 23%|██▎       | 300/1286 [1:15:38<2:42:30,  9.89s/it]

{'eval_loss': 1.9595227241516113, 'eval_runtime': 529.4459, 'eval_samples_per_second': 18.976, 'eval_steps_per_second': 4.745, 'epoch': 0.23}


 31%|███       | 400/1286 [1:32:07<2:25:58,  9.89s/it]  

{'loss': 2.0536, 'learning_rate': 6.889580093312597e-05, 'epoch': 0.31}


                                                      
 31%|███       | 400/1286 [1:40:56<2:25:58,  9.89s/it]

{'eval_loss': 1.8713061809539795, 'eval_runtime': 529.4509, 'eval_samples_per_second': 18.976, 'eval_steps_per_second': 4.745, 'epoch': 0.31}


 39%|███▉      | 500/1286 [1:57:25<2:09:29,  9.89s/it]  

{'loss': 1.9984, 'learning_rate': 6.111975116640747e-05, 'epoch': 0.39}


                                                      
 39%|███▉      | 500/1286 [2:06:07<2:09:29,  9.89s/it]

{'eval_loss': 1.840945839881897, 'eval_runtime': 522.8269, 'eval_samples_per_second': 19.217, 'eval_steps_per_second': 4.805, 'epoch': 0.39}


 47%|████▋     | 600/1286 [2:22:24<1:50:58,  9.71s/it]  

{'loss': 1.9409, 'learning_rate': 5.334370139968896e-05, 'epoch': 0.47}


                                                      
 47%|████▋     | 600/1286 [2:31:04<1:50:58,  9.71s/it]

{'eval_loss': 1.7618476152420044, 'eval_runtime': 519.9562, 'eval_samples_per_second': 19.323, 'eval_steps_per_second': 4.831, 'epoch': 0.47}


 54%|█████▍    | 700/1286 [2:47:35<1:36:41,  9.90s/it]  

{'loss': 1.9077, 'learning_rate': 4.5567651632970456e-05, 'epoch': 0.54}


                                                      
 54%|█████▍    | 700/1286 [2:56:25<1:36:41,  9.90s/it]

{'eval_loss': 1.736270546913147, 'eval_runtime': 529.5919, 'eval_samples_per_second': 18.971, 'eval_steps_per_second': 4.743, 'epoch': 0.54}


 62%|██████▏   | 800/1286 [3:12:56<1:20:34,  9.95s/it]  

{'loss': 1.8875, 'learning_rate': 3.7791601866251944e-05, 'epoch': 0.62}


                                                      
 62%|██████▏   | 800/1286 [3:21:46<1:20:34,  9.95s/it]

{'eval_loss': 1.6955516338348389, 'eval_runtime': 529.786, 'eval_samples_per_second': 18.964, 'eval_steps_per_second': 4.742, 'epoch': 0.62}


 70%|██████▉   | 900/1286 [3:38:16<1:03:40,  9.90s/it]  

{'loss': 1.8809, 'learning_rate': 3.001555209953344e-05, 'epoch': 0.7}


                                                      
 70%|██████▉   | 900/1286 [3:46:57<1:03:40,  9.90s/it]

{'eval_loss': 1.6788443326950073, 'eval_runtime': 521.1493, 'eval_samples_per_second': 19.279, 'eval_steps_per_second': 4.82, 'epoch': 0.7}


 78%|███████▊  | 1000/1286 [4:03:07<46:11,  9.69s/it]   

{'loss': 1.8294, 'learning_rate': 2.2239502332814934e-05, 'epoch': 0.78}


                                                     
 78%|███████▊  | 1000/1286 [4:11:46<46:11,  9.69s/it]

{'eval_loss': 1.6688786745071411, 'eval_runtime': 518.7788, 'eval_samples_per_second': 19.367, 'eval_steps_per_second': 4.842, 'epoch': 0.78}


 86%|████████▌ | 1100/1286 [4:28:02<30:01,  9.69s/it]    

{'loss': 1.8417, 'learning_rate': 1.4463452566096425e-05, 'epoch': 0.86}


                                                     
 86%|████████▌ | 1100/1286 [4:36:41<30:01,  9.69s/it]

{'eval_loss': 1.648364782333374, 'eval_runtime': 518.536, 'eval_samples_per_second': 19.376, 'eval_steps_per_second': 4.844, 'epoch': 0.86}


 93%|█████████▎| 1200/1286 [4:52:55<14:05,  9.83s/it]   

{'loss': 1.814, 'learning_rate': 6.687402799377916e-06, 'epoch': 0.93}


                                                     
 93%|█████████▎| 1200/1286 [5:01:33<14:05,  9.83s/it]

{'eval_loss': 1.6555589437484741, 'eval_runtime': 518.3939, 'eval_samples_per_second': 19.381, 'eval_steps_per_second': 4.846, 'epoch': 0.93}


100%|██████████| 1286/1286 [5:15:26<00:00, 14.72s/it]   

{'train_runtime': 18930.149, 'train_samples_per_second': 4.349, 'train_steps_per_second': 0.068, 'train_loss': 2.0809333528367393, 'epoch': 1.0}





TrainOutput(global_step=1286, training_loss=2.0809333528367393, metrics={'train_runtime': 18930.149, 'train_samples_per_second': 4.349, 'train_steps_per_second': 0.068, 'train_loss': 2.0809333528367393, 'epoch': 1.0})

In [19]:
# save the model
trainer.save_model(models_dir)

In [20]:
sep_token = '<sep>'
dataset_name = "ms_marco"# v1.1
models_dir = "../../saved_models/t5_base-msmarco"
checkpoint = 't5-base'

In [21]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast
# load the saved model
loaded_model = T5ForConditionalGeneration.from_pretrained(models_dir)
tokenizer = T5TokenizerFast.from_pretrained(checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [22]:
def run_model(input_string, **generator_args):
    generator_args = {
    "max_length": 768,
    "num_beams": 4,# bij grotere num_beams is trager maar complexere vragen(niet per se betere vragen)
    "length_penalty": 1.5,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
    }
    input_string = input_string + " <sep>"
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = loaded_model.generate(input_ids, **generator_args)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    output = [item.split("<sep>") for item in output]
    return output

In [23]:
context = """
Cheese is an ancient food whose origins predate recorded history. There is no conclusive evidence indicating where cheesemaking originated, whether in Europe, Central Asia or the Middle East. Earliest proposed dates for the origin of cheesemaking range from around 8000 BCE, when sheep were first domesticated. Since animal skins and inflated internal organs have, since ancient times, provided storage vessels for a range of foodstuffs, it is probable that the process of cheese making was discovered accidentally by storing milk in a container made from the stomach of an animal, resulting in the milk being turned to curd and whey by the rennet from the stomach.[7] There is a legend—with variations—about the discovery of cheese by an Arab trader who used this method of storing milk.[8]

The earliest evidence of cheesemaking in the archaeological record dates back to 5500 BCE and is found in what is now Kuyavia, Poland, where strainers coated with milk-fat molecules have been found.[9]

Cheesemaking may have begun independently of this by the pressing and salting of curdled milk to preserve it. Observation that the effect of making cheese in an animal stomach gave more solid and better-textured curds may have led to the deliberate addition of rennet. Early archeological evidence of Egyptian cheese has been found in Egyptian tomb murals, dating to about 2000 BCE.[10] A 2018 scientific paper stated that the world's oldest cheese, dating to approximately 1200 BCE (3200 years before present), was found in ancient Egyptian tombs.[11][12]

The earliest cheeses were likely quite sour and salty, similar in texture to rustic cottage cheese or feta, a crumbly, flavorful Greek cheese. Cheese produced in Europe, where climates are cooler than the Middle East, required less salt for preservation. With less salt and acidity, the cheese became a suitable environment for useful microbes and molds, giving aged cheeses their respective flavors. The earliest ever discovered preserved cheese was found in the Taklamakan Desert in Xinjiang, China, dating back as early as 1615 BCE (3600 years before present).
"""

run_model(context)

[['where did cheese originate from in egypt? exactly where did it come from? hi, where is cheese found in europe? where is it found in the middle east? where was it found? where did the cheese originate? where does it originate from? what is the earliest evidence of cheesemaking in the uk? what are the origins of cheese making in asia? where are the cheeses found in china?']]

## Evaluation

In [24]:
dataset["validation"][0]

{'answers': ['Approximately $15,000 per year.'],
 'passages': {'is_selected': [1, 0, 0, 0, 0, 0],
  'passage_text': ['The average Walgreens salary ranges from approximately $15,000 per year for Customer Service Associate / Cashier to $179,900 per year for District Manager. Average Walgreens hourly pay ranges from approximately $7.35 per hour for Laboratory Technician to $68.90 per hour for Pharmacy Manager. Salary information comes from 7,810 data points collected directly from employees, users, and jobs on Indeed.',
   'The average revenue in 2011 of a Starbuck Store was $1,078,000, up  from $1,011,000 in 2010.    The average ticket (total purchase) at domestic Starbuck stores in  No … vember 2007 was reported at $6.36.    In 2008, the average ticket was flat (0.0% change).',
   'In fiscal 2014, Walgreens opened a total of 184 new locations and acquired 84 locations, for a net decrease of 273 after relocations and closings. How big are your stores? The average size for a typical Walgr

In [40]:
import numpy as np
import nltk
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score as calculate_meteor
from nltk.translate.bleu_score import SmoothingFunction
import datasets
from transformers import T5ForConditionalGeneration, T5TokenizerFast
dataset_name = "ms_marco"# v1.1
models_dir = "../../saved_models/t5_base-msmarco"
checkpoint = 't5-base'
dataset = datasets.load_dataset(dataset_name,"v1.1")
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

loaded_model = T5ForConditionalGeneration.from_pretrained(models_dir)
tokenizer = T5TokenizerFast.from_pretrained(checkpoint)

100%|██████████| 3/3 [00:02<00:00,  1.16it/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [41]:
test_dataset

Dataset({
    features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
    num_rows: 9650
})

In [42]:
val_dataset

Dataset({
    features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
    num_rows: 10047
})

In [43]:
def run_model(input_string, **generator_args):
    generator_args = {
    "max_length": 768,
    "num_beams": 4,# bij grotere num_beams is trager maar complexere vragen(niet per se betere vragen)
    "length_penalty": 1.5,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
    }
    input_string = input_string + " <sep>"
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = loaded_model.generate(input_ids, **generator_args)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    output = [item.split("<sep>") for item in output]
    return output

In [44]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import pipeline

tokenizer_qa = AutoTokenizer.from_pretrained("damapika/roberta-base_mod")
model_qa = AutoModelForQuestionAnswering.from_pretrained("damapika/roberta-base_mod")

question_answerer = pipeline("question-answering", model="damapika/roberta-base_mod")

In [45]:
def refactor_columns(example):
    is_selected = example["passages"]["is_selected"]
    passages_text = example["passages"]["passage_text"]
    indexes_selected = [i for i in range(len(is_selected)) if is_selected[i] == 1]
    context = "\n".join([passages_text[i] for i in indexes_selected])
    question = example['query']

    example["context"] = context
    example["question"] = question

    return example

In [46]:
test_dataset = test_dataset.map(refactor_columns)



### Automatic Metrics

* BLEU
* ROUGE
* METEOR

In [47]:

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoother = SmoothingFunction().method1

bleu_scores = np.array([])
rouge_scores = np.array([])
meteor_scores = np.array([])
count = 0
for row in test_dataset:
    best_bleu,best_rouge,best_meteor = 0,0,0

    context,target_question = row["context"],row["question"]

    gen_questions = run_model(context)[0][0].replace('\sep>','').replace('sep>','')
    print(f'gen_question: {gen_question} <-> target_question: {target_question} | {count}')

    for gen_question in [question.strip() + "?" for question in gen_questions.split('?')[:-1]]:# only gets best bleu,rouge and meteor score by looping over each question
        # Tokenization
        gen_tokens = nltk.word_tokenize(gen_question.lower())
        target_tokens = nltk.word_tokenize(target_question.lower())

        # Calculate scores
        bleu_score = nltk.translate.bleu_score.sentence_bleu([target_tokens], gen_tokens,smoothing_function=smoother)
        best_bleu = bleu_score if bleu_score > best_bleu else best_bleu

        # Calculate ROUGE score
        rouge_score = scorer.score(target_question, gen_question)["rougeL"].fmeasure# Doesn't need tokenization
        best_rouge = rouge_score if rouge_score > best_rouge else best_rouge

        # Calculate METEOR score
        meteor_score = calculate_meteor([target_tokens],gen_tokens)
        best_meteor = meteor_score if meteor_score > best_meteor else best_meteor

    bleu_scores = np.append(bleu_scores,best_bleu)
    rouge_scores = np.append(rouge_scores,best_rouge)
    meteor_scores = np.append(meteor_scores,best_meteor)

    count+=1

avg_bleu_score = np.mean(bleu_scores)
avg_rouge_score = np.mean(rouge_scores)
avg_meteor_score = np.mean(meteor_scores)

print(f'Average BLEU4: {avg_bleu_score}')
print(f'Average ROUGE: {avg_rouge_score}')
print(f'Average METEOR: {avg_meteor_score}')

gen_question: how old do you have to be to take a ira withdrawal? <-> target_question: does human hair stop squirrels | 0
gen_question: how old do you have to be to take a ira withdrawal? <-> target_question: what are the benefits of fossil fuels | 1
gen_question: what are the benefits of fossil fuels for our economy? <-> target_question: what is a apothem | 2
gen_question: what is the apothem of a polygon? <-> target_question: average cost for custom canopy | 3
gen_question: hi cost of a canopy? <-> target_question: what is a hardware in a computer | 4
gen_question: what is computer hardware used for? <-> target_question: edi logistics definition | 5
gen_question: what is edi format used for? <-> target_question: why should recreational marijuana be illegal | 6
gen_question: why should marijuana be legal for recreational use? <-> target_question: what class are spiders in | 7
gen_question: what is the class of spiders that are arachnids? <-> target_question: where is the amur leopard 