In [14]:
import transformers
import datasets
import torch
import logging

In [2]:
# Preprocessing
sep_token = '<sep>'  # appended to every target question and set as tokenizer.sep_token
special_token = '<ANSWER>' # between context and answer
dataset_name = "squad" # columns: id, title, context, question, answers{text, answer_start}
models_dir = "saved_models/t5_base_answer-aware-squad_mod"  # passed to TrainingArguments as output_dir
checkpoint = 't5-base'  # pretrained checkpoint used for both the model and the tokenizer
max_input_length = 512  # encoder inputs are truncated/padded to this many tokens
max_target_length = 64  # target questions are truncated/padded to this many tokens

## Training
learning_rate = 1e-4  # passed to TrainingArguments as learning_rate
num_epochs = 7  # passed to TrainingArguments as num_train_epochs

In [16]:
# Load every split of the dataset named in the config cell.
dataset = datasets.load_dataset(dataset_name)

Found cached dataset squad (C:/Users/manuv/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 199.96it/s]


In [17]:
# Show one raw training record to confirm the column layout.
dataset["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [18]:
# Load the pretrained seq2seq model and the matching fast tokenizer from the
# same checkpoint.
model = transformers.T5ForConditionalGeneration.from_pretrained(checkpoint)
# Pass model_max_length explicitly instead of relying on the tokenizer's legacy
# implicit default, as the warning this call otherwise emits recommends. Every
# encode call in this notebook also passes max_length, so encoding is unchanged.
tokenizer = transformers.T5TokenizerFast.from_pretrained(
    checkpoint,
    model_max_length=max_input_length,
)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [19]:
# Register the separator as the tokenizer's sep_token and add both marker
# strings to the vocabulary, in the same order as before so their ids are
# unchanged.
tokenizer.sep_token = sep_token
tokenizer.add_tokens([sep_token, special_token])
# The former `tokenizer.special_tokens_map.update({...})` call was dropped:
# `special_tokens_map` is a read-only property that builds a new dict on every
# access, so updating it never changed the tokenizer.
# Grow the embedding matrix to cover the newly added token ids.
model.resize_token_embeddings(len(tokenizer))

Embedding(32102, 768)

In [68]:

# Tokenize examples
def convert_to_features(example_batch):
    """Tokenize a batch of examples into fixed-length encoder and target ids.

    Args:
        example_batch: mapping with an ``"input"`` and a ``"question"`` entry,
            each a list of strings (this function is mapped with
            ``batched=True``).

    Returns:
        dict with four keys:
        ``input_ids`` / ``attention_mask`` come from ``"input"``, truncated and
        padded to ``max_input_length``;
        ``decoder_input_ids`` / ``decoder_attention_mask`` come from
        ``"question"``, truncated and padded to ``max_target_length``.

    NOTE(review): ``add_eos_examples`` puts the answer marker at the very end
    of ``"input"``. If truncation removes tokens from the end of the sequence
    (the usual default), inputs longer than ``max_input_length`` would lose
    the answer -- confirm whether any examples are affected.
    """

    def _encode(texts, max_length):
        # `padding="max_length"` replaces the deprecated
        # `pad_to_max_length=True`; calling the tokenizer directly replaces
        # `batch_encode_plus`. Both still pad every row to `max_length`.
        return tokenizer(
            texts,
            max_length=max_length,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
        )

    input_encodings = _encode(example_batch["input"], max_input_length)
    target_encodings = _encode(example_batch["question"], max_target_length)

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
    """Build the model input and the target text for one example.

    Writes ``example['input']`` as the context, a space, the answer marker
    (``special_token``) and the first answer text, with no space between the
    marker and the answer. Rewrites ``example['question']`` as the question
    followed by a space and ``sep_token``. Despite the function name, no EOS
    string is added here.

    The example is modified in place and also returned.
    """
    context = example['context']
    first_answer = example["answers"]["text"][0]
    example['input'] = " ".join((context, special_token + first_answer))
    example['question'] = " ".join((example['question'], sep_token))
    return example


def add_special_tokens(example):
    """Replace any literal ``{sep_token}`` placeholder in the question.

    Each occurrence of the nine-character text ``{sep_token}`` is swapped for
    the value of ``sep_token``. Questions without the placeholder are left
    unchanged. The example is modified in place and also returned.
    """
    placeholder = "{sep_token}"
    question_text = example['question']
    example['question'] = question_text.replace(placeholder, sep_token)
    return example

In [69]:
# Preprocessing pipeline: build the text fields per example, resolve the
# separator placeholder, then tokenize in batches.
tokenized_dataset = (
    dataset
    .map(add_eos_examples)
    .map(add_special_tokens)
    .map(convert_to_features, batched=True)
)

                                                                   

In [74]:
# Check the assembled input string: context, a space, then the answer marker and answer.
tokenized_dataset["train"][0]["input"]

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary. <ANSWER>Saint Bernadette Soubirous'

In [75]:
# Drop the columns listed below, then expose each split as torch tensors
# limited to the four columns the collator reads.
tokenized_dataset = tokenized_dataset.remove_columns(
    ["input", "question", "context", "answers"]
)
train_dataset, valid_dataset = (
    tokenized_dataset["train"],
    tokenized_dataset["validation"],
)

columns = [
    'input_ids',
    'decoder_input_ids',
    'attention_mask',
    'decoder_attention_mask',
]
train_dataset.set_format(columns=columns, type='torch')
valid_dataset.set_format(columns=columns, type='torch')

In [76]:
# Serialize both formatted splits to files in the current working directory.
torch.save(train_dataset, 'train_data.pt')
torch.save(valid_dataset, 'valid_data.pt')

In [77]:
from typing import Dict, List

class T2TDataCollator():
    """Collate tokenized text-to-text examples into one batch of tensors.

    Args:
        pad_token_id: token id treated as padding in the target sequence.
            Defaults to 0, the value that was previously hard-coded.
        label_ignore_index: value written over padded target positions.
            Defaults to -100, the default ``ignore_index`` of
            ``torch.nn.CrossEntropyLoss``.
    """

    def __init__(self, pad_token_id: int = 0, label_ignore_index: int = -100) -> None:
        self.pad_token_id = pad_token_id
        self.label_ignore_index = label_ignore_index

    def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
        """
        Take a list of samples from a Dataset and collate them into a batch.

        Each sample must provide equally sized tensors under 'input_ids',
        'attention_mask', 'decoder_input_ids' and 'decoder_attention_mask'.

        Returns:
        A dictionary of tensors with keys 'input_ids', 'attention_mask',
        'labels' and 'decoder_attention_mask'. 'labels' holds the stacked
        'decoder_input_ids' with padding positions replaced by
        ``label_ignore_index``.
        """

        input_ids = torch.stack([example['input_ids'] for example in batch])
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
        decoder_attention_mask = torch.stack([example['decoder_attention_mask'] for example in batch])

        # torch.stack allocates a new tensor, so the in-place masking below
        # does not modify the samples in `batch`.
        lm_labels = torch.stack([example['decoder_input_ids'] for example in batch])
        lm_labels[lm_labels == self.pad_token_id] = self.label_ignore_index

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': lm_labels,
            'decoder_attention_mask': decoder_attention_mask
        }

In [78]:
# Training configuration. With a per-device batch of 4 and 16 accumulation
# steps, each optimizer step covers 4 * 16 = 64 examples per device.
# NOTE(review): no eval_steps is passed, and the recorded run evaluates after
# every 100 steps (about 314 s per pass) -- consider setting eval_steps
# explicitly if that cadence is not intended.
training_args = transformers.TrainingArguments(
    output_dir=models_dir,
    run_name="t5_answer-aware_qg_squad",
    learning_rate=learning_rate,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    logging_steps=100,
    save_steps=500,
)

In [79]:
# Module-level logger; the visible cells never use it.
logger = logging.getLogger(__name__)

# Build the Trainer from the model, the training arguments, both dataset
# splits and the custom collator.
# NOTE(review): the tokenizer (which has added tokens) is not passed here --
# confirm it is saved separately alongside the checkpoints.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=T2TDataCollator(),
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)


In [80]:
# Report which kind of device currently holds the model.
print('Model is on GPU' if model.device.type == 'cuda' else 'Model is on CPU')

Model is on GPU


In [81]:
# Start fine-tuning; the captured output reports training loss and eval loss every 100 steps.
trainer.train()

  1%|          | 100/9576 [09:41<15:05:16,  5.73s/it]

{'loss': 2.4532, 'learning_rate': 9.895572263993317e-05, 'epoch': 0.07}


                                                     
  1%|          | 100/9576 [14:55<15:05:16,  5.73s/it]

{'eval_loss': 1.7000625133514404, 'eval_runtime': 313.7948, 'eval_samples_per_second': 33.684, 'eval_steps_per_second': 8.423, 'epoch': 0.07}


  2%|▏         | 200/9576 [24:28<14:56:34,  5.74s/it] 

{'loss': 1.7143, 'learning_rate': 9.791144527986633e-05, 'epoch': 0.15}


                                                     
  2%|▏         | 200/9576 [29:42<14:56:34,  5.74s/it]

{'eval_loss': 1.4826990365982056, 'eval_runtime': 313.5913, 'eval_samples_per_second': 33.706, 'eval_steps_per_second': 8.428, 'epoch': 0.15}


  3%|▎         | 300/9576 [39:15<14:45:12,  5.73s/it] 

{'loss': 1.585, 'learning_rate': 9.68671679197995e-05, 'epoch': 0.22}


                                                     
  3%|▎         | 300/9576 [44:28<14:45:12,  5.73s/it]

{'eval_loss': 1.4228571653366089, 'eval_runtime': 313.1892, 'eval_samples_per_second': 33.75, 'eval_steps_per_second': 8.439, 'epoch': 0.22}


  4%|▍         | 400/9576 [54:01<14:35:58,  5.73s/it] 

{'loss': 1.5361, 'learning_rate': 9.582289055973267e-05, 'epoch': 0.29}


                                                     
  4%|▍         | 400/9576 [59:14<14:35:58,  5.73s/it]

{'eval_loss': 1.3995198011398315, 'eval_runtime': 312.9661, 'eval_samples_per_second': 33.774, 'eval_steps_per_second': 8.445, 'epoch': 0.29}


  5%|▌         | 500/9576 [1:08:49<14:32:08,  5.77s/it]

{'loss': 1.4825, 'learning_rate': 9.477861319966583e-05, 'epoch': 0.37}


                                                       
  5%|▌         | 500/9576 [1:14:03<14:32:08,  5.77s/it]

{'eval_loss': 1.3774093389511108, 'eval_runtime': 314.8305, 'eval_samples_per_second': 33.574, 'eval_steps_per_second': 8.395, 'epoch': 0.37}


  6%|▋         | 600/9576 [1:23:47<14:22:27,  5.77s/it]  

{'loss': 1.4843, 'learning_rate': 9.373433583959899e-05, 'epoch': 0.44}


                                                       
  6%|▋         | 600/9576 [1:29:01<14:22:27,  5.77s/it]

{'eval_loss': 1.359110713005066, 'eval_runtime': 314.9196, 'eval_samples_per_second': 33.564, 'eval_steps_per_second': 8.393, 'epoch': 0.44}


  7%|▋         | 700/9576 [1:38:38<14:12:30,  5.76s/it]  

{'loss': 1.4438, 'learning_rate': 9.269005847953217e-05, 'epoch': 0.51}


                                                       
  7%|▋         | 700/9576 [1:43:53<14:12:30,  5.76s/it]

{'eval_loss': 1.350516676902771, 'eval_runtime': 314.7565, 'eval_samples_per_second': 33.582, 'eval_steps_per_second': 8.397, 'epoch': 0.51}


  8%|▊         | 800/9576 [1:53:29<14:04:34,  5.77s/it]  

{'loss': 1.432, 'learning_rate': 9.164578111946534e-05, 'epoch': 0.58}


                                                       
  8%|▊         | 800/9576 [1:58:45<14:04:34,  5.77s/it]

{'eval_loss': 1.3444007635116577, 'eval_runtime': 315.2418, 'eval_samples_per_second': 33.53, 'eval_steps_per_second': 8.384, 'epoch': 0.58}


  9%|▉         | 900/9576 [2:08:22<13:55:01,  5.77s/it]  

{'loss': 1.4301, 'learning_rate': 9.06015037593985e-05, 'epoch': 0.66}


                                                       
  9%|▉         | 900/9576 [2:13:37<13:55:01,  5.77s/it]

{'eval_loss': 1.3315070867538452, 'eval_runtime': 315.2586, 'eval_samples_per_second': 33.528, 'eval_steps_per_second': 8.384, 'epoch': 0.66}


 10%|█         | 1000/9576 [2:23:14<13:45:03,  5.77s/it] 

{'loss': 1.4098, 'learning_rate': 8.955722639933167e-05, 'epoch': 0.73}


                                                        
 10%|█         | 1000/9576 [2:28:30<13:45:03,  5.77s/it]

{'eval_loss': 1.3289036750793457, 'eval_runtime': 315.2416, 'eval_samples_per_second': 33.53, 'eval_steps_per_second': 8.384, 'epoch': 0.73}


 11%|█▏        | 1100/9576 [2:39:19<13:36:14,  5.78s/it]  

{'loss': 1.4125, 'learning_rate': 8.851294903926484e-05, 'epoch': 0.8}


                                                        
 11%|█▏        | 1100/9576 [2:44:34<13:36:14,  5.78s/it]

{'eval_loss': 1.326006293296814, 'eval_runtime': 315.3226, 'eval_samples_per_second': 33.521, 'eval_steps_per_second': 8.382, 'epoch': 0.8}


 13%|█▎        | 1200/9576 [2:54:07<13:20:18,  5.73s/it]  

{'loss': 1.3902, 'learning_rate': 8.7468671679198e-05, 'epoch': 0.88}


                                                        
 13%|█▎        | 1200/9576 [2:59:21<13:20:18,  5.73s/it]

{'eval_loss': 1.3154441118240356, 'eval_runtime': 313.6003, 'eval_samples_per_second': 33.705, 'eval_steps_per_second': 8.428, 'epoch': 0.88}


 14%|█▎        | 1300/9576 [3:08:54<13:10:35,  5.73s/it] 

{'loss': 1.4017, 'learning_rate': 8.642439431913116e-05, 'epoch': 0.95}


                                                        
 14%|█▎        | 1300/9576 [3:14:08<13:10:35,  5.73s/it]

{'eval_loss': 1.3085497617721558, 'eval_runtime': 313.5372, 'eval_samples_per_second': 33.712, 'eval_steps_per_second': 8.43, 'epoch': 0.95}


 15%|█▍        | 1400/9576 [3:23:41<13:01:14,  5.73s/it] 

{'loss': 1.3635, 'learning_rate': 8.538011695906433e-05, 'epoch': 1.02}


                                                        
 15%|█▍        | 1400/9576 [3:28:55<13:01:14,  5.73s/it]

{'eval_loss': 1.309260606765747, 'eval_runtime': 313.8573, 'eval_samples_per_second': 33.678, 'eval_steps_per_second': 8.421, 'epoch': 1.02}


 16%|█▌        | 1500/9576 [3:38:31<12:57:20,  5.78s/it] 

{'loss': 1.2888, 'learning_rate': 8.43358395989975e-05, 'epoch': 1.1}


                                                        
 16%|█▌        | 1500/9576 [3:43:46<12:57:20,  5.78s/it]

{'eval_loss': 1.3077439069747925, 'eval_runtime': 315.4117, 'eval_samples_per_second': 33.512, 'eval_steps_per_second': 8.38, 'epoch': 1.1}


 17%|█▋        | 1600/9576 [3:54:44<12:48:19,  5.78s/it]  

{'loss': 1.3051, 'learning_rate': 8.329156223893066e-05, 'epoch': 1.17}


                                                        
 17%|█▋        | 1600/9576 [4:00:00<12:48:19,  5.78s/it]

{'eval_loss': 1.3065143823623657, 'eval_runtime': 315.6048, 'eval_samples_per_second': 33.491, 'eval_steps_per_second': 8.374, 'epoch': 1.17}


 18%|█▊        | 1700/9576 [4:09:38<12:39:06,  5.78s/it]  

{'loss': 1.2891, 'learning_rate': 8.224728487886383e-05, 'epoch': 1.24}


                                                        
 18%|█▊        | 1700/9576 [4:14:53<12:39:06,  5.78s/it]

{'eval_loss': 1.2975767850875854, 'eval_runtime': 315.5547, 'eval_samples_per_second': 33.497, 'eval_steps_per_second': 8.376, 'epoch': 1.24}


 19%|█▉        | 1800/9576 [4:24:29<12:23:29,  5.74s/it]  

{'loss': 1.2821, 'learning_rate': 8.120300751879699e-05, 'epoch': 1.32}


                                                        
 19%|█▉        | 1800/9576 [4:29:43<12:23:29,  5.74s/it]

{'eval_loss': 1.3059735298156738, 'eval_runtime': 313.7173, 'eval_samples_per_second': 33.693, 'eval_steps_per_second': 8.425, 'epoch': 1.32}


 20%|█▉        | 1900/9576 [4:39:17<12:14:06,  5.74s/it] 

{'loss': 1.3067, 'learning_rate': 8.015873015873016e-05, 'epoch': 1.39}


                                                        
 20%|█▉        | 1900/9576 [4:44:30<12:14:06,  5.74s/it]

{'eval_loss': 1.2960880994796753, 'eval_runtime': 313.7333, 'eval_samples_per_second': 33.691, 'eval_steps_per_second': 8.424, 'epoch': 1.39}


 21%|██        | 2000/9576 [4:54:04<12:04:37,  5.74s/it] 

{'loss': 1.2929, 'learning_rate': 7.911445279866333e-05, 'epoch': 1.46}


                                                        
 21%|██        | 2000/9576 [4:59:18<12:04:37,  5.74s/it]

{'eval_loss': 1.288700819015503, 'eval_runtime': 313.7943, 'eval_samples_per_second': 33.684, 'eval_steps_per_second': 8.423, 'epoch': 1.46}


 22%|██▏       | 2100/9576 [5:10:08<11:54:59,  5.74s/it]  

{'loss': 1.285, 'learning_rate': 7.807017543859649e-05, 'epoch': 1.53}


                                                        
 22%|██▏       | 2100/9576 [5:15:24<11:54:59,  5.74s/it]

{'eval_loss': 1.2924636602401733, 'eval_runtime': 315.7557, 'eval_samples_per_second': 33.475, 'eval_steps_per_second': 8.37, 'epoch': 1.53}


 23%|██▎       | 2200/9576 [5:25:02<11:50:51,  5.78s/it]  

{'loss': 1.2719, 'learning_rate': 7.702589807852966e-05, 'epoch': 1.61}


                                                        
 23%|██▎       | 2200/9576 [5:30:18<11:50:51,  5.78s/it]

{'eval_loss': 1.2929713726043701, 'eval_runtime': 315.6027, 'eval_samples_per_second': 33.491, 'eval_steps_per_second': 8.374, 'epoch': 1.61}


 24%|██▍       | 2300/9576 [5:39:56<11:41:18,  5.78s/it]  

{'loss': 1.2627, 'learning_rate': 7.598162071846282e-05, 'epoch': 1.68}


                                                        
 24%|██▍       | 2300/9576 [5:45:12<11:41:18,  5.78s/it]

{'eval_loss': 1.2922605276107788, 'eval_runtime': 315.6517, 'eval_samples_per_second': 33.486, 'eval_steps_per_second': 8.373, 'epoch': 1.68}


 25%|██▌       | 2400/9576 [5:54:50<11:31:39,  5.78s/it]  

{'loss': 1.2615, 'learning_rate': 7.4937343358396e-05, 'epoch': 1.75}


                                                        
 25%|██▌       | 2400/9576 [6:00:06<11:31:39,  5.78s/it]

{'eval_loss': 1.289054036140442, 'eval_runtime': 315.6577, 'eval_samples_per_second': 33.486, 'eval_steps_per_second': 8.373, 'epoch': 1.75}


 26%|██▌       | 2500/9576 [6:09:44<11:22:01,  5.78s/it]  

{'loss': 1.2844, 'learning_rate': 7.389306599832916e-05, 'epoch': 1.83}


                                                        
 26%|██▌       | 2500/9576 [6:14:59<11:22:01,  5.78s/it]

{'eval_loss': 1.2812806367874146, 'eval_runtime': 315.5677, 'eval_samples_per_second': 33.495, 'eval_steps_per_second': 8.375, 'epoch': 1.83}


 27%|██▋       | 2600/9576 [6:26:03<11:24:28,  5.89s/it]  

{'loss': 1.2711, 'learning_rate': 7.284878863826232e-05, 'epoch': 1.9}


                                                        
 27%|██▋       | 2600/9576 [6:31:25<11:24:28,  5.89s/it]

{'eval_loss': 1.2792152166366577, 'eval_runtime': 322.1042, 'eval_samples_per_second': 32.815, 'eval_steps_per_second': 8.205, 'epoch': 1.9}


 28%|██▊       | 2700/9576 [6:41:03<11:02:11,  5.78s/it]  

{'loss': 1.247, 'learning_rate': 7.18045112781955e-05, 'epoch': 1.97}


                                                        
 28%|██▊       | 2700/9576 [6:46:19<11:02:11,  5.78s/it]

{'eval_loss': 1.277848720550537, 'eval_runtime': 315.6507, 'eval_samples_per_second': 33.486, 'eval_steps_per_second': 8.373, 'epoch': 1.97}


 29%|██▉       | 2800/9576 [6:55:54<10:47:49,  5.74s/it]  

{'loss': 1.2234, 'learning_rate': 7.076023391812866e-05, 'epoch': 2.05}


                                                        
 29%|██▉       | 2800/9576 [7:01:07<10:47:49,  5.74s/it]

{'eval_loss': 1.281813144683838, 'eval_runtime': 313.7493, 'eval_samples_per_second': 33.689, 'eval_steps_per_second': 8.424, 'epoch': 2.05}


 30%|███       | 2900/9576 [7:10:41<10:38:23,  5.74s/it] 

{'loss': 1.1795, 'learning_rate': 6.971595655806182e-05, 'epoch': 2.12}


                                                        
 30%|███       | 2900/9576 [7:15:55<10:38:23,  5.74s/it]

{'eval_loss': 1.2848632335662842, 'eval_runtime': 313.7753, 'eval_samples_per_second': 33.687, 'eval_steps_per_second': 8.423, 'epoch': 2.12}


 31%|███▏      | 3000/9576 [7:25:29<10:28:37,  5.74s/it] 

{'loss': 1.1913, 'learning_rate': 6.867167919799499e-05, 'epoch': 2.19}


                                                        
 31%|███▏      | 3000/9576 [7:30:43<10:28:37,  5.74s/it]

{'eval_loss': 1.280775785446167, 'eval_runtime': 313.8413, 'eval_samples_per_second': 33.679, 'eval_steps_per_second': 8.421, 'epoch': 2.19}


 32%|███▏      | 3100/9576 [7:41:22<10:22:28,  5.77s/it]  

{'loss': 1.1864, 'learning_rate': 6.762740183792816e-05, 'epoch': 2.26}


                                                        
 32%|███▏      | 3100/9576 [7:46:37<10:22:28,  5.77s/it]

{'eval_loss': 1.2816163301467896, 'eval_runtime': 315.2446, 'eval_samples_per_second': 33.53, 'eval_steps_per_second': 8.384, 'epoch': 2.26}


 33%|███▎      | 3200/9576 [7:56:14<10:13:02,  5.77s/it]  

{'loss': 1.1897, 'learning_rate': 6.658312447786132e-05, 'epoch': 2.34}


                                                        
 33%|███▎      | 3200/9576 [8:01:30<10:13:02,  5.77s/it]

{'eval_loss': 1.2814126014709473, 'eval_runtime': 315.2876, 'eval_samples_per_second': 33.525, 'eval_steps_per_second': 8.383, 'epoch': 2.34}


 34%|███▍      | 3300/9576 [8:11:07<10:03:30,  5.77s/it]  

{'loss': 1.1887, 'learning_rate': 6.553884711779449e-05, 'epoch': 2.41}


                                                        
 34%|███▍      | 3300/9576 [8:16:22<10:03:30,  5.77s/it]

{'eval_loss': 1.278687834739685, 'eval_runtime': 315.2316, 'eval_samples_per_second': 33.531, 'eval_steps_per_second': 8.384, 'epoch': 2.41}


 36%|███▌      | 3400/9576 [8:25:57<9:49:46,  5.73s/it]   

{'loss': 1.1889, 'learning_rate': 6.449456975772765e-05, 'epoch': 2.48}


                                                       
 36%|███▌      | 3400/9576 [8:31:11<9:49:46,  5.73s/it]

{'eval_loss': 1.2814419269561768, 'eval_runtime': 313.3652, 'eval_samples_per_second': 33.731, 'eval_steps_per_second': 8.434, 'epoch': 2.48}


 37%|███▋      | 3500/9576 [8:40:44<9:40:01,  5.73s/it]  

{'loss': 1.203, 'learning_rate': 6.345029239766082e-05, 'epoch': 2.56}


                                                       
 37%|███▋      | 3500/9576 [8:45:57<9:40:01,  5.73s/it]

{'eval_loss': 1.2805118560791016, 'eval_runtime': 313.3872, 'eval_samples_per_second': 33.728, 'eval_steps_per_second': 8.434, 'epoch': 2.56}


 38%|███▊      | 3600/9576 [8:56:35<9:31:57,  5.74s/it]   

{'loss': 1.1807, 'learning_rate': 6.240601503759398e-05, 'epoch': 2.63}


                                                       
 38%|███▊      | 3600/9576 [9:01:49<9:31:57,  5.74s/it]

{'eval_loss': 1.2745494842529297, 'eval_runtime': 313.9853, 'eval_samples_per_second': 33.664, 'eval_steps_per_second': 8.418, 'epoch': 2.63}


 39%|███▊      | 3700/9576 [9:11:24<9:26:30,  5.78s/it]  

{'loss': 1.19, 'learning_rate': 6.136173767752715e-05, 'epoch': 2.7}


                                                       
 39%|███▊      | 3700/9576 [9:16:40<9:26:30,  5.78s/it]

{'eval_loss': 1.2851287126541138, 'eval_runtime': 315.9098, 'eval_samples_per_second': 33.459, 'eval_steps_per_second': 8.366, 'epoch': 2.7}


 40%|███▉      | 3800/9576 [9:26:18<9:16:43,  5.78s/it]   

{'loss': 1.1941, 'learning_rate': 6.0317460317460316e-05, 'epoch': 2.78}


                                                       
 40%|███▉      | 3800/9576 [9:31:34<9:16:43,  5.78s/it]

{'eval_loss': 1.2738879919052124, 'eval_runtime': 315.8968, 'eval_samples_per_second': 33.46, 'eval_steps_per_second': 8.367, 'epoch': 2.78}


 41%|████      | 3900/9576 [9:41:13<9:07:06,  5.78s/it]   

{'loss': 1.1758, 'learning_rate': 5.927318295739349e-05, 'epoch': 2.85}


                                                       
 41%|████      | 3900/9576 [9:46:29<9:07:06,  5.78s/it]

{'eval_loss': 1.2749955654144287, 'eval_runtime': 315.9068, 'eval_samples_per_second': 33.459, 'eval_steps_per_second': 8.366, 'epoch': 2.85}


 42%|████▏     | 4000/9576 [9:56:07<8:57:28,  5.78s/it]   

{'loss': 1.1875, 'learning_rate': 5.822890559732665e-05, 'epoch': 2.92}


                                                       
 42%|████▏     | 4000/9576 [10:01:23<8:57:28,  5.78s/it]

{'eval_loss': 1.271497368812561, 'eval_runtime': 315.8648, 'eval_samples_per_second': 33.464, 'eval_steps_per_second': 8.368, 'epoch': 2.92}


 43%|████▎     | 4100/9576 [10:12:18<8:47:41,  5.78s/it]   

{'loss': 1.1887, 'learning_rate': 5.718462823725982e-05, 'epoch': 3.0}


                                                        
 43%|████▎     | 4100/9576 [10:17:33<8:47:41,  5.78s/it]

{'eval_loss': 1.2729214429855347, 'eval_runtime': 315.6427, 'eval_samples_per_second': 33.487, 'eval_steps_per_second': 8.373, 'epoch': 3.0}


 44%|████▍     | 4200/9576 [10:27:17<8:46:04,  5.87s/it]   

{'loss': 1.1279, 'learning_rate': 5.6140350877192984e-05, 'epoch': 3.07}


                                                        
 44%|████▍     | 4200/9576 [10:32:38<8:46:04,  5.87s/it]

{'eval_loss': 1.284919023513794, 'eval_runtime': 320.9139, 'eval_samples_per_second': 32.937, 'eval_steps_per_second': 8.236, 'epoch': 3.07}


 45%|████▍     | 4300/9576 [10:42:24<8:35:38,  5.86s/it]   

{'loss': 1.1123, 'learning_rate': 5.509607351712616e-05, 'epoch': 3.14}


                                                        
 45%|████▍     | 4300/9576 [10:47:45<8:35:38,  5.86s/it]

{'eval_loss': 1.2841299772262573, 'eval_runtime': 320.8429, 'eval_samples_per_second': 32.944, 'eval_steps_per_second': 8.238, 'epoch': 3.14}


 46%|████▌     | 4400/9576 [10:57:32<8:25:57,  5.87s/it]   

{'loss': 1.116, 'learning_rate': 5.405179615705932e-05, 'epoch': 3.21}


                                                        
 46%|████▌     | 4400/9576 [11:02:53<8:25:57,  5.87s/it]

{'eval_loss': 1.2863816022872925, 'eval_runtime': 320.8949, 'eval_samples_per_second': 32.939, 'eval_steps_per_second': 8.236, 'epoch': 3.21}


 47%|████▋     | 4500/9576 [11:12:29<8:07:08,  5.76s/it]   

{'loss': 1.1185, 'learning_rate': 5.300751879699248e-05, 'epoch': 3.29}


                                                        
 47%|████▋     | 4500/9576 [11:17:42<8:07:08,  5.76s/it]

{'eval_loss': 1.287320852279663, 'eval_runtime': 313.2222, 'eval_samples_per_second': 33.746, 'eval_steps_per_second': 8.438, 'epoch': 3.29}


 48%|████▊     | 4600/9576 [11:28:33<7:53:54,  5.71s/it]   

{'loss': 1.1209, 'learning_rate': 5.1963241436925645e-05, 'epoch': 3.36}


                                                        
 48%|████▊     | 4600/9576 [11:33:46<7:53:54,  5.71s/it]

{'eval_loss': 1.2805840969085693, 'eval_runtime': 312.468, 'eval_samples_per_second': 33.827, 'eval_steps_per_second': 8.458, 'epoch': 3.36}


 49%|████▉     | 4700/9576 [11:43:17<7:44:20,  5.71s/it]  

{'loss': 1.1418, 'learning_rate': 5.091896407685882e-05, 'epoch': 3.43}


                                                        
 49%|████▉     | 4700/9576 [11:48:30<7:44:20,  5.71s/it]

{'eval_loss': 1.2811052799224854, 'eval_runtime': 312.46, 'eval_samples_per_second': 33.828, 'eval_steps_per_second': 8.459, 'epoch': 3.43}


 50%|█████     | 4800/9576 [11:58:02<7:34:48,  5.71s/it]  

{'loss': 1.112, 'learning_rate': 4.987468671679198e-05, 'epoch': 3.51}


                                                        
 50%|█████     | 4800/9576 [12:03:16<7:34:48,  5.71s/it]

{'eval_loss': 1.2866586446762085, 'eval_runtime': 313.8913, 'eval_samples_per_second': 33.674, 'eval_steps_per_second': 8.42, 'epoch': 3.51}


 51%|█████     | 4900/9576 [12:12:51<7:28:46,  5.76s/it]  

{'loss': 1.1076, 'learning_rate': 4.883040935672515e-05, 'epoch': 3.58}


                                                        
 51%|█████     | 4900/9576 [12:18:06<7:28:46,  5.76s/it]

{'eval_loss': 1.2807821035385132, 'eval_runtime': 314.457, 'eval_samples_per_second': 33.614, 'eval_steps_per_second': 8.405, 'epoch': 3.58}


 52%|█████▏    | 5000/9576 [12:27:42<7:19:04,  5.76s/it]   

{'loss': 1.1267, 'learning_rate': 4.778613199665831e-05, 'epoch': 3.65}


                                                        
 52%|█████▏    | 5000/9576 [12:32:56<7:19:04,  5.76s/it]

{'eval_loss': 1.279388189315796, 'eval_runtime': 314.4715, 'eval_samples_per_second': 33.612, 'eval_steps_per_second': 8.405, 'epoch': 3.65}


 53%|█████▎    | 5100/9576 [12:43:33<7:08:30,  5.74s/it]   

{'loss': 1.1149, 'learning_rate': 4.674185463659148e-05, 'epoch': 3.73}


                                                        
 53%|█████▎    | 5100/9576 [12:48:45<7:08:30,  5.74s/it]

{'eval_loss': 1.2811294794082642, 'eval_runtime': 312.1809, 'eval_samples_per_second': 33.859, 'eval_steps_per_second': 8.466, 'epoch': 3.73}


 54%|█████▍    | 5200/9576 [12:58:16<6:56:04,  5.70s/it]  

{'loss': 1.1298, 'learning_rate': 4.5697577276524644e-05, 'epoch': 3.8}


                                                        
 54%|█████▍    | 5200/9576 [13:03:28<6:56:04,  5.70s/it]

{'eval_loss': 1.2831748723983765, 'eval_runtime': 312.1649, 'eval_samples_per_second': 33.86, 'eval_steps_per_second': 8.467, 'epoch': 3.8}


 55%|█████▌    | 5300/9576 [13:12:59<6:46:39,  5.71s/it]  

{'loss': 1.1483, 'learning_rate': 4.465329991645781e-05, 'epoch': 3.87}


                                                        
 55%|█████▌    | 5300/9576 [13:18:11<6:46:39,  5.71s/it]

{'eval_loss': 1.2801955938339233, 'eval_runtime': 312.1534, 'eval_samples_per_second': 33.862, 'eval_steps_per_second': 8.467, 'epoch': 3.87}


 56%|█████▋    | 5400/9576 [13:27:42<6:37:01,  5.70s/it]  

{'loss': 1.1335, 'learning_rate': 4.3609022556390975e-05, 'epoch': 3.95}


                                                        
 56%|█████▋    | 5400/9576 [13:32:56<6:37:01,  5.70s/it]

{'eval_loss': 1.2779669761657715, 'eval_runtime': 314.0366, 'eval_samples_per_second': 33.659, 'eval_steps_per_second': 8.416, 'epoch': 3.95}


 57%|█████▋    | 5500/9576 [13:42:31<6:30:25,  5.75s/it]  

{'loss': 1.1032, 'learning_rate': 4.2564745196324144e-05, 'epoch': 4.02}


                                                        
 57%|█████▋    | 5500/9576 [13:47:45<6:30:25,  5.75s/it]

{'eval_loss': 1.2855030298233032, 'eval_runtime': 314.1046, 'eval_samples_per_second': 33.651, 'eval_steps_per_second': 8.414, 'epoch': 4.02}


 58%|█████▊    | 5600/9576 [13:58:32<6:20:34,  5.74s/it]   

{'loss': 1.07, 'learning_rate': 4.152046783625731e-05, 'epoch': 4.09}


                                                        
 58%|█████▊    | 5600/9576 [14:03:46<6:20:34,  5.74s/it]

{'eval_loss': 1.2866230010986328, 'eval_runtime': 313.8643, 'eval_samples_per_second': 33.677, 'eval_steps_per_second': 8.421, 'epoch': 4.09}


 60%|█████▉    | 5700/9576 [14:13:20<6:11:05,  5.74s/it]  

{'loss': 1.0779, 'learning_rate': 4.047619047619048e-05, 'epoch': 4.16}


                                                        
 60%|█████▉    | 5700/9576 [14:18:34<6:11:05,  5.74s/it]

{'eval_loss': 1.2870221138000488, 'eval_runtime': 313.9083, 'eval_samples_per_second': 33.672, 'eval_steps_per_second': 8.42, 'epoch': 4.16}


 61%|██████    | 5800/9576 [14:28:08<6:01:28,  5.74s/it]  

{'loss': 1.0693, 'learning_rate': 3.943191311612364e-05, 'epoch': 4.24}


                                                        
 61%|██████    | 5800/9576 [14:33:22<6:01:28,  5.74s/it]

{'eval_loss': 1.2930150032043457, 'eval_runtime': 313.9413, 'eval_samples_per_second': 33.669, 'eval_steps_per_second': 8.419, 'epoch': 4.24}


 62%|██████▏   | 5900/9576 [14:42:57<5:52:00,  5.75s/it]  

{'loss': 1.0824, 'learning_rate': 3.838763575605681e-05, 'epoch': 4.31}


                                                        
 62%|██████▏   | 5900/9576 [14:48:11<5:52:00,  5.75s/it]

{'eval_loss': 1.2872099876403809, 'eval_runtime': 313.9273, 'eval_samples_per_second': 33.67, 'eval_steps_per_second': 8.419, 'epoch': 4.31}


 63%|██████▎   | 6000/9576 [14:57:45<5:42:22,  5.74s/it]  

{'loss': 1.0759, 'learning_rate': 3.7343358395989974e-05, 'epoch': 4.38}


                                                        
 63%|██████▎   | 6000/9576 [15:02:59<5:42:22,  5.74s/it]

{'eval_loss': 1.2920548915863037, 'eval_runtime': 314.0094, 'eval_samples_per_second': 33.661, 'eval_steps_per_second': 8.417, 'epoch': 4.38}


 64%|██████▎   | 6100/9576 [15:13:37<5:30:38,  5.71s/it]   

{'loss': 1.0787, 'learning_rate': 3.629908103592314e-05, 'epoch': 4.46}


                                                        
 64%|██████▎   | 6100/9576 [15:18:50<5:30:38,  5.71s/it]

{'eval_loss': 1.2889022827148438, 'eval_runtime': 312.1959, 'eval_samples_per_second': 33.857, 'eval_steps_per_second': 8.466, 'epoch': 4.46}


 65%|██████▍   | 6200/9576 [15:28:21<5:21:12,  5.71s/it] 

{'loss': 1.0898, 'learning_rate': 3.5254803675856304e-05, 'epoch': 4.53}


                                                        
 65%|██████▍   | 6200/9576 [15:33:33<5:21:12,  5.71s/it]

{'eval_loss': 1.2914230823516846, 'eval_runtime': 312.2371, 'eval_samples_per_second': 33.852, 'eval_steps_per_second': 8.465, 'epoch': 4.53}


 66%|██████▌   | 6300/9576 [15:43:04<5:11:29,  5.71s/it] 

{'loss': 1.0732, 'learning_rate': 3.421052631578947e-05, 'epoch': 4.6}


                                                        
 66%|██████▌   | 6300/9576 [15:48:16<5:11:29,  5.71s/it]

{'eval_loss': 1.2868643999099731, 'eval_runtime': 312.251, 'eval_samples_per_second': 33.851, 'eval_steps_per_second': 8.464, 'epoch': 4.6}


 67%|██████▋   | 6400/9576 [15:57:49<5:04:11,  5.75s/it] 

{'loss': 1.0715, 'learning_rate': 3.316624895572264e-05, 'epoch': 4.68}


                                                        
 67%|██████▋   | 6400/9576 [16:03:03<5:04:11,  5.75s/it]

{'eval_loss': 1.2918341159820557, 'eval_runtime': 314.2584, 'eval_samples_per_second': 33.635, 'eval_steps_per_second': 8.41, 'epoch': 4.68}


 68%|██████▊   | 6500/9576 [16:12:38<4:54:56,  5.75s/it]  

{'loss': 1.074, 'learning_rate': 3.212197159565581e-05, 'epoch': 4.75}


                                                        
 68%|██████▊   | 6500/9576 [16:17:52<4:54:56,  5.75s/it]

{'eval_loss': 1.2892757654190063, 'eval_runtime': 314.2214, 'eval_samples_per_second': 33.639, 'eval_steps_per_second': 8.411, 'epoch': 4.75}


 69%|██████▉   | 6600/9576 [16:28:57<4:50:41,  5.86s/it]   

{'loss': 1.0697, 'learning_rate': 3.107769423558897e-05, 'epoch': 4.82}


                                                        
 69%|██████▉   | 6600/9576 [16:34:16<4:50:41,  5.86s/it]

{'eval_loss': 1.2893205881118774, 'eval_runtime': 318.5654, 'eval_samples_per_second': 33.18, 'eval_steps_per_second': 8.297, 'epoch': 4.82}


 70%|██████▉   | 6700/9576 [16:43:50<4:33:36,  5.71s/it]  

{'loss': 1.087, 'learning_rate': 3.003341687552214e-05, 'epoch': 4.89}


                                                        
 70%|██████▉   | 6700/9576 [16:49:02<4:33:36,  5.71s/it]

{'eval_loss': 1.2891753911972046, 'eval_runtime': 312.339, 'eval_samples_per_second': 33.841, 'eval_steps_per_second': 8.462, 'epoch': 4.89}


 71%|███████   | 6800/9576 [16:58:33<4:24:15,  5.71s/it] 

{'loss': 1.0829, 'learning_rate': 2.8989139515455303e-05, 'epoch': 4.97}


                                                        
 71%|███████   | 6800/9576 [17:03:46<4:24:15,  5.71s/it]

{'eval_loss': 1.2901039123535156, 'eval_runtime': 312.3834, 'eval_samples_per_second': 33.837, 'eval_steps_per_second': 8.461, 'epoch': 4.97}


 72%|███████▏  | 6900/9576 [17:13:17<4:14:41,  5.71s/it] 

{'loss': 1.0538, 'learning_rate': 2.7944862155388472e-05, 'epoch': 5.04}


                                                        
 72%|███████▏  | 6900/9576 [17:18:29<4:14:41,  5.71s/it]

{'eval_loss': 1.2946184873580933, 'eval_runtime': 312.4179, 'eval_samples_per_second': 33.833, 'eval_steps_per_second': 8.46, 'epoch': 5.04}


 73%|███████▎  | 7000/9576 [17:28:00<4:05:07,  5.71s/it] 

{'loss': 1.0411, 'learning_rate': 2.6900584795321637e-05, 'epoch': 5.11}


                                                        
 73%|███████▎  | 7000/9576 [17:33:13<4:05:07,  5.71s/it]

{'eval_loss': 1.2951409816741943, 'eval_runtime': 312.339, 'eval_samples_per_second': 33.841, 'eval_steps_per_second': 8.462, 'epoch': 5.11}


 74%|███████▍  | 7100/9576 [17:44:06<3:57:29,  5.75s/it]  

{'loss': 1.0443, 'learning_rate': 2.5856307435254806e-05, 'epoch': 5.19}


                                                        
 74%|███████▍  | 7100/9576 [17:49:20<3:57:29,  5.75s/it]

{'eval_loss': 1.2935923337936401, 'eval_runtime': 314.4635, 'eval_samples_per_second': 33.613, 'eval_steps_per_second': 8.405, 'epoch': 5.19}


 75%|███████▌  | 7200/9576 [17:58:56<3:48:04,  5.76s/it]  

{'loss': 1.0304, 'learning_rate': 2.4812030075187968e-05, 'epoch': 5.26}


                                                        
 75%|███████▌  | 7200/9576 [18:04:10<3:48:04,  5.76s/it]

{'eval_loss': 1.2976691722869873, 'eval_runtime': 314.4625, 'eval_samples_per_second': 33.613, 'eval_steps_per_second': 8.405, 'epoch': 5.26}


 76%|███████▌  | 7300/9576 [18:13:46<3:38:26,  5.76s/it]  

{'loss': 1.0403, 'learning_rate': 2.3767752715121137e-05, 'epoch': 5.33}


                                                        
 76%|███████▌  | 7300/9576 [18:19:01<3:38:26,  5.76s/it]

{'eval_loss': 1.2945977449417114, 'eval_runtime': 314.5485, 'eval_samples_per_second': 33.604, 'eval_steps_per_second': 8.403, 'epoch': 5.33}


 77%|███████▋  | 7400/9576 [18:28:37<3:28:49,  5.76s/it]  

{'loss': 1.0393, 'learning_rate': 2.2723475355054302e-05, 'epoch': 5.41}


                                                        
 77%|███████▋  | 7400/9576 [18:33:51<3:28:49,  5.76s/it]

{'eval_loss': 1.2958124876022339, 'eval_runtime': 314.5905, 'eval_samples_per_second': 33.599, 'eval_steps_per_second': 8.401, 'epoch': 5.41}


 78%|███████▊  | 7500/9576 [18:43:27<3:19:16,  5.76s/it]  

{'loss': 1.0542, 'learning_rate': 2.1679197994987468e-05, 'epoch': 5.48}


                                                        
 78%|███████▊  | 7500/9576 [18:48:42<3:19:16,  5.76s/it]

{'eval_loss': 1.2923932075500488, 'eval_runtime': 314.5485, 'eval_samples_per_second': 33.604, 'eval_steps_per_second': 8.403, 'epoch': 5.48}


 79%|███████▉  | 7600/9576 [18:59:38<3:09:32,  5.76s/it]  

{'loss': 1.0378, 'learning_rate': 2.0634920634920636e-05, 'epoch': 5.55}


                                                        
 79%|███████▉  | 7600/9576 [19:04:52<3:09:32,  5.76s/it]

{'eval_loss': 1.293839931488037, 'eval_runtime': 314.4464, 'eval_samples_per_second': 33.615, 'eval_steps_per_second': 8.405, 'epoch': 5.55}


 80%|████████  | 7700/9576 [19:14:28<2:58:47,  5.72s/it]  

{'loss': 1.0418, 'learning_rate': 1.9590643274853802e-05, 'epoch': 5.63}


                                                        
 80%|████████  | 7700/9576 [19:19:40<2:58:47,  5.72s/it]

{'eval_loss': 1.2934740781784058, 'eval_runtime': 312.601, 'eval_samples_per_second': 33.813, 'eval_steps_per_second': 8.455, 'epoch': 5.63}


 81%|████████▏ | 7800/9576 [19:29:12<2:49:11,  5.72s/it] 

{'loss': 1.0418, 'learning_rate': 1.8546365914786967e-05, 'epoch': 5.7}


                                                        
 81%|████████▏ | 7800/9576 [19:34:25<2:49:11,  5.72s/it]

{'eval_loss': 1.2932841777801514, 'eval_runtime': 312.7682, 'eval_samples_per_second': 33.795, 'eval_steps_per_second': 8.45, 'epoch': 5.7}


 82%|████████▏ | 7900/9576 [19:43:57<2:39:42,  5.72s/it] 

{'loss': 1.0528, 'learning_rate': 1.7502088554720132e-05, 'epoch': 5.77}


                                                        
 82%|████████▏ | 7900/9576 [19:49:09<2:39:42,  5.72s/it]

{'eval_loss': 1.2922539710998535, 'eval_runtime': 312.6931, 'eval_samples_per_second': 33.803, 'eval_steps_per_second': 8.452, 'epoch': 5.77}


 84%|████████▎ | 8000/9576 [19:58:41<2:30:10,  5.72s/it] 

{'loss': 1.0387, 'learning_rate': 1.64578111946533e-05, 'epoch': 5.84}


                                                        
 84%|████████▎ | 8000/9576 [20:03:54<2:30:10,  5.72s/it]

{'eval_loss': 1.293948769569397, 'eval_runtime': 312.7021, 'eval_samples_per_second': 33.802, 'eval_steps_per_second': 8.452, 'epoch': 5.84}


 85%|████████▍ | 8100/9576 [20:15:03<2:21:49,  5.76s/it]  

{'loss': 1.0553, 'learning_rate': 1.5413533834586467e-05, 'epoch': 5.92}


                                                        
 85%|████████▍ | 8100/9576 [20:20:18<2:21:49,  5.76s/it]

{'eval_loss': 1.292959451675415, 'eval_runtime': 314.8535, 'eval_samples_per_second': 33.571, 'eval_steps_per_second': 8.394, 'epoch': 5.92}


 86%|████████▌ | 8200/9576 [20:29:54<2:12:09,  5.76s/it]  

{'loss': 1.0424, 'learning_rate': 1.4369256474519632e-05, 'epoch': 5.99}


                                                        
 86%|████████▌ | 8200/9576 [20:35:09<2:12:09,  5.76s/it]

{'eval_loss': 1.2921816110610962, 'eval_runtime': 315.0286, 'eval_samples_per_second': 33.553, 'eval_steps_per_second': 8.39, 'epoch': 5.99}


 87%|████████▋ | 8300/9576 [20:44:46<2:02:29,  5.76s/it]  

{'loss': 1.0233, 'learning_rate': 1.3324979114452799e-05, 'epoch': 6.06}


                                                        
 87%|████████▋ | 8300/9576 [20:50:01<2:02:29,  5.76s/it]

{'eval_loss': 1.295907735824585, 'eval_runtime': 314.9556, 'eval_samples_per_second': 33.56, 'eval_steps_per_second': 8.392, 'epoch': 6.06}


 88%|████████▊ | 8400/9576 [20:59:37<1:53:01,  5.77s/it]  

{'loss': 1.0148, 'learning_rate': 1.2280701754385964e-05, 'epoch': 6.14}


                                                        
 88%|████████▊ | 8400/9576 [21:04:55<1:53:01,  5.77s/it]

{'eval_loss': 1.298638105392456, 'eval_runtime': 318.0115, 'eval_samples_per_second': 33.238, 'eval_steps_per_second': 8.311, 'epoch': 6.14}


 89%|████████▉ | 8500/9576 [21:14:32<1:43:19,  5.76s/it]  

{'loss': 1.0268, 'learning_rate': 1.1236424394319131e-05, 'epoch': 6.21}


                                                        
 89%|████████▉ | 8500/9576 [21:19:47<1:43:19,  5.76s/it]

{'eval_loss': 1.296865463256836, 'eval_runtime': 314.8845, 'eval_samples_per_second': 33.568, 'eval_steps_per_second': 8.394, 'epoch': 6.21}


 90%|████████▉ | 8600/9576 [21:29:28<1:33:03,  5.72s/it]  

{'loss': 1.0153, 'learning_rate': 1.0192147034252297e-05, 'epoch': 6.28}


                                                        
 90%|████████▉ | 8600/9576 [21:34:41<1:33:03,  5.72s/it]

{'eval_loss': 1.2972805500030518, 'eval_runtime': 312.9632, 'eval_samples_per_second': 33.774, 'eval_steps_per_second': 8.445, 'epoch': 6.28}


 91%|█████████ | 8700/9576 [21:44:15<1:23:35,  5.73s/it] 

{'loss': 1.0159, 'learning_rate': 9.147869674185464e-06, 'epoch': 6.36}


                                                        
 91%|█████████ | 8700/9576 [21:49:28<1:23:35,  5.73s/it]

{'eval_loss': 1.298082709312439, 'eval_runtime': 313.3422, 'eval_samples_per_second': 33.733, 'eval_steps_per_second': 8.435, 'epoch': 6.36}


 92%|█████████▏| 8800/9576 [21:59:01<1:14:23,  5.75s/it] 

{'loss': 1.0263, 'learning_rate': 8.10359231411863e-06, 'epoch': 6.43}


                                                        
 92%|█████████▏| 8800/9576 [22:04:13<1:14:23,  5.75s/it]

{'eval_loss': 1.2989118099212646, 'eval_runtime': 312.3122, 'eval_samples_per_second': 33.844, 'eval_steps_per_second': 8.463, 'epoch': 6.43}


 93%|█████████▎| 8900/9576 [22:13:45<1:04:21,  5.71s/it] 

{'loss': 1.0104, 'learning_rate': 7.059314954051796e-06, 'epoch': 6.5}


                                                        
 93%|█████████▎| 8900/9576 [22:18:57<1:04:21,  5.71s/it]

{'eval_loss': 1.2974456548690796, 'eval_runtime': 312.1692, 'eval_samples_per_second': 33.86, 'eval_steps_per_second': 8.467, 'epoch': 6.5}


 94%|█████████▍| 9000/9576 [22:28:29<54:55,  5.72s/it]   

{'loss': 1.0135, 'learning_rate': 6.015037593984962e-06, 'epoch': 6.58}


                                                      
 94%|█████████▍| 9000/9576 [22:33:43<54:55,  5.72s/it]

{'eval_loss': 1.2975459098815918, 'eval_runtime': 314.1338, 'eval_samples_per_second': 33.648, 'eval_steps_per_second': 8.414, 'epoch': 6.58}


 95%|█████████▌| 9100/9576 [22:44:33<45:27,  5.73s/it]    

{'loss': 1.0183, 'learning_rate': 4.970760233918129e-06, 'epoch': 6.65}


                                                      
 95%|█████████▌| 9100/9576 [22:49:47<45:27,  5.73s/it]

{'eval_loss': 1.2973413467407227, 'eval_runtime': 313.3182, 'eval_samples_per_second': 33.736, 'eval_steps_per_second': 8.436, 'epoch': 6.65}


 96%|█████████▌| 9200/9576 [22:59:20<35:54,  5.73s/it]   

{'loss': 1.032, 'learning_rate': 3.926482873851295e-06, 'epoch': 6.72}


                                                      
 96%|█████████▌| 9200/9576 [23:04:33<35:54,  5.73s/it]

{'eval_loss': 1.2978342771530151, 'eval_runtime': 313.3282, 'eval_samples_per_second': 33.735, 'eval_steps_per_second': 8.435, 'epoch': 6.72}


 97%|█████████▋| 9300/9576 [23:14:07<26:20,  5.73s/it]   

{'loss': 1.0374, 'learning_rate': 2.882205513784461e-06, 'epoch': 6.79}


                                                      
 97%|█████████▋| 9300/9576 [23:19:20<26:20,  5.73s/it]

{'eval_loss': 1.298385739326477, 'eval_runtime': 313.2372, 'eval_samples_per_second': 33.744, 'eval_steps_per_second': 8.438, 'epoch': 6.79}


 98%|█████████▊| 9400/9576 [23:28:53<16:48,  5.73s/it]  

{'loss': 1.0284, 'learning_rate': 1.8379281537176275e-06, 'epoch': 6.87}


                                                      
 98%|█████████▊| 9400/9576 [23:34:06<16:48,  5.73s/it]

{'eval_loss': 1.297967791557312, 'eval_runtime': 313.4482, 'eval_samples_per_second': 33.722, 'eval_steps_per_second': 8.432, 'epoch': 6.87}


 99%|█████████▉| 9500/9576 [23:43:39<07:15,  5.73s/it]  

{'loss': 1.0088, 'learning_rate': 7.936507936507937e-07, 'epoch': 6.94}


                                                      
 99%|█████████▉| 9500/9576 [23:48:54<07:15,  5.73s/it]

{'eval_loss': 1.2983264923095703, 'eval_runtime': 314.7828, 'eval_samples_per_second': 33.579, 'eval_steps_per_second': 8.396, 'epoch': 6.94}


100%|██████████| 9576/9576 [23:57:23<00:00,  9.01s/it]   

{'train_runtime': 86243.8374, 'train_samples_per_second': 7.11, 'train_steps_per_second': 0.111, 'train_loss': 1.1823892386236485, 'epoch': 7.0}





TrainOutput(global_step=9576, training_loss=1.1823892386236485, metrics={'train_runtime': 86243.8374, 'train_samples_per_second': 7.11, 'train_steps_per_second': 0.111, 'train_loss': 1.1823892386236485, 'epoch': 7.0})

In [82]:
# Persist the fine-tuned model together with its tokenizer so that models_dir is a
# self-contained checkpoint (both can then be restored via from_pretrained(models_dir)).
# The tokenizer is written first so the cell still ends with the call that returns
# None and therefore displays no output.
tokenizer.save_pretrained(models_dir)
trainer.save_model(models_dir)

In [3]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast
# Reload the fine-tuned model from models_dir, the directory that
# trainer.save_model(models_dir) wrote to. T5TokenizerFast is imported here
# as well because the following cell uses it to build the tokenizer.
loaded_model = T5ForConditionalGeneration.from_pretrained(models_dir)

In [4]:
# Build the tokenizer from the base checkpoint and state the maximum sequence
# length explicitly (max_input_length from the configuration cell) instead of
# relying on the implicit t5-base default that the library warns about.
tokenizer = T5TokenizerFast.from_pretrained(checkpoint, model_max_length=max_input_length)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
def run_model(input_string, **generator_args):
    """Generate question(s) for an answer-aware input using the loaded model.

    Args:
        input_string: Text to condition on. The example cell in this notebook
            uses the form ``"<context> <ANSWER><answer>"``.
        **generator_args: Keyword arguments forwarded to
            ``loaded_model.generate``. They override the defaults defined
            below (previously they were accepted but silently discarded).

    Returns:
        A list with one entry per decoded sequence; each entry is the list of
        strings obtained by splitting the decoded text on ``"<sep>"``.
    """
    # Default generation settings; anything passed by the caller wins.
    generation_kwargs = {
        "max_length": 512,
        # A larger num_beams is slower but gives more complex questions
        # (not necessarily better ones).
        "num_beams": 4,
        "length_penalty": 1.5,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
    }
    generation_kwargs.update(generator_args)

    # NOTE(review): the explicit " </s>" suffix is kept as-is; presumably it
    # mirrors the training-time input format — confirm against the
    # preprocessing code, since the tokenizer may append EOS on its own.
    input_string = "generate questions: " + input_string + " </s>"
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = loaded_model.generate(input_ids, **generation_kwargs)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    # One decoded sequence may hold several questions separated by "<sep>".
    return [item.split("<sep>") for item in output]

In [9]:
# Example inputs for run_model. Each string has the layout
#   context <ANSWER>answer
# i.e. the passage, then the literal marker "<ANSWER>", then the answer span
# the generated question should be about.

# Example 1: passage on the history of cheese, answer "ancient Egyptian tombs".
# Note: this variable is defined but not passed to run_model in this cell.
context = """
Cheese is an ancient food whose origins predate recorded history. There is no conclusive evidence indicating where cheesemaking originated, whether in Europe, Central Asia or the Middle East. Earliest proposed dates for the origin of cheesemaking range from around 8000 BCE, when sheep were first domesticated. Since animal skins and inflated internal organs have, since ancient times, provided storage vessels for a range of foodstuffs, it is probable that the process of cheese making was discovered accidentally by storing milk in a container made from the stomach of an animal, resulting in the milk being turned to curd and whey by the rennet from the stomach.[7] There is a legend—with variations—about the discovery of cheese by an Arab trader who used this method of storing milk.[8]

The earliest evidence of cheesemaking in the archaeological record dates back to 5500 BCE and is found in what is now Kuyavia, Poland, where strainers coated with milk-fat molecules have been found.[9]

Cheesemaking may have begun independently of this by the pressing and salting of curdled milk to preserve it. Observation that the effect of making cheese in an animal stomach gave more solid and better-textured curds may have led to the deliberate addition of rennet. Early archeological evidence of Egyptian cheese has been found in Egyptian tomb murals, dating to about 2000 BCE.[10] A 2018 scientific paper stated that the world's oldest cheese, dating to approximately 1200 BCE (3200 years before present), was found in ancient Egyptian tombs.[11][12]

The earliest cheeses were likely quite sour and salty, similar in texture to rustic cottage cheese or feta, a crumbly, flavorful Greek cheese. Cheese produced in Europe, where climates are cooler than the Middle East, required less salt for preservation. With less salt and acidity, the cheese became a suitable environment for useful microbes and molds, giving aged cheeses their respective flavors. The earliest ever discovered preserved cheese was found in the Taklamakan Desert in Xinjiang, China, dating back as early as 1615 BCE (3600 years before present). <ANSWER>ancient Egyptian tombs
"""

# Example 2: markdown-formatted passage on types of bias,
# answer "a form of selection bias".
context_gf = """
#### Types of Bias

**Selection bias** is the tendency to skew your choice of data sources to 
those that are easily available, convenient, and/or cost-effective. As a 
result of this a bias is introduced by the selection of individuals, groups 
or data for analysis in such a way that proper randomization is not achieved, 
thereby ensuring that the sample obtained is not representative of the 
population intended to be analyzed. 
[Learn more about selection bias](https://en.wikipedia.org/wiki/Selection_bias).

**Self-selection bias** is a form of selection bias where you get the data 
from sources that “volunteered” to provide it. Most poll data has this type 
of bias. [Learn more about self-selection bias](https://en.wikipedia.org/wiki/Self-selection_bias)

**Omitted-variable bias** happens when your featurized data doesn't have a 
feature necessary for accurate prediction. For example, let's assume that you 
are working on a churn prediction model and you want to predict whether a 
customer cancels their subscription within six months. You train a model, and 
it's accurate enough; however, several weeks after deployment you see many 
unexpected false negatives. You investigate the decreased model performance 
and discover a new competitor now offers a very similar service for a lower 
price. This feature wasn't initially available to your model, therefore 
important information for accurate prediction was missing. 
[Learn more about omitted variable Bias](https://en.wikipedia.org/wiki/Omitted-variable_bias). <ANSWER>a form of selection bias
"""

# Generate question(s) for the bias passage; the result is displayed as the cell output.
run_model(context_gf)

[['What is self-selection bias?']]

In [88]:
# Final metrics: run one more evaluation pass with the trainer and display the
# resulting metrics dict (eval_loss, runtime, throughput, epoch).
# NOTE(review): `trainer` is not created by the inference cells above, which only
# load the saved model; this cell presumably has to be run in the training
# session — confirm the intended cell order.
trainer.evaluate()

100%|██████████| 2643/2643 [05:22<00:00,  8.21it/s]


{'eval_loss': 1.298403263092041,
 'eval_runtime': 322.0432,
 'eval_samples_per_second': 32.822,
 'eval_steps_per_second': 8.207,
 'epoch': 7.0}