In [1]:
from datasets import load_dataset

# Load SQuAD dataset
dataset = load_dataset('squad')

train_dataset = dataset['train']
val_dataset = dataset['validation']


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['question'], examples['context'], truncation=True, padding='max_length', max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)


In [3]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Create a data collator that will pad the inputs dynamically
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

trainer.train()


  0%|          | 10/131400 [01:49<453:55:37, 12.44s/it]

{'loss': 9.6943, 'grad_norm': 18.902786254882812, 'learning_rate': 4.999619482496195e-05, 'epoch': 0.0}


  0%|          | 20/131400 [03:20<312:20:30,  8.56s/it]

{'loss': 8.749, 'grad_norm': 21.135250091552734, 'learning_rate': 4.99923896499239e-05, 'epoch': 0.0}


  0%|          | 30/131400 [04:56<337:12:23,  9.24s/it]

{'loss': 8.3652, 'grad_norm': 23.14295196533203, 'learning_rate': 4.9988584474885844e-05, 'epoch': 0.0}


  0%|          | 40/131400 [06:28<334:12:07,  9.16s/it]

{'loss': 8.158, 'grad_norm': 23.033279418945312, 'learning_rate': 4.998477929984779e-05, 'epoch': 0.0}


  0%|          | 50/131400 [08:02<341:54:08,  9.37s/it]

{'loss': 7.8068, 'grad_norm': 26.846797943115234, 'learning_rate': 4.9980974124809746e-05, 'epoch': 0.0}


  0%|          | 60/131400 [09:32<319:15:33,  8.75s/it]

{'loss': 7.6523, 'grad_norm': 25.261415481567383, 'learning_rate': 4.997716894977169e-05, 'epoch': 0.0}


  0%|          | 70/131400 [10:59<318:25:13,  8.73s/it]

{'loss': 7.685, 'grad_norm': 36.66092300415039, 'learning_rate': 4.997336377473364e-05, 'epoch': 0.0}


  0%|          | 80/131400 [12:40<386:48:49, 10.60s/it]

{'loss': 7.4494, 'grad_norm': 23.459285736083984, 'learning_rate': 4.996955859969559e-05, 'epoch': 0.0}


  0%|          | 90/131400 [15:23<648:47:28, 17.79s/it]

{'loss': 7.3913, 'grad_norm': 23.31989860534668, 'learning_rate': 4.9965753424657535e-05, 'epoch': 0.0}


  0%|          | 100/131400 [18:50<710:16:02, 19.47s/it]

{'loss': 7.3568, 'grad_norm': 24.605850219726562, 'learning_rate': 4.996194824961948e-05, 'epoch': 0.0}


  0%|          | 110/131400 [20:41<364:17:56,  9.99s/it]

{'loss': 7.2569, 'grad_norm': 21.853914260864258, 'learning_rate': 4.995814307458143e-05, 'epoch': 0.0}


  0%|          | 120/131400 [22:03<302:07:26,  8.28s/it]

{'loss': 7.2459, 'grad_norm': 21.061914443969727, 'learning_rate': 4.995433789954338e-05, 'epoch': 0.0}


  0%|          | 130/131400 [23:29<310:55:02,  8.53s/it]

{'loss': 7.3571, 'grad_norm': 19.403533935546875, 'learning_rate': 4.9950532724505325e-05, 'epoch': 0.0}


  0%|          | 140/131400 [24:52<298:27:18,  8.19s/it]

{'loss': 7.2669, 'grad_norm': 30.0341739654541, 'learning_rate': 4.994672754946728e-05, 'epoch': 0.0}


  0%|          | 150/131400 [26:15<304:44:10,  8.36s/it]

{'loss': 7.2272, 'grad_norm': 24.666040420532227, 'learning_rate': 4.9942922374429226e-05, 'epoch': 0.0}


  0%|          | 160/131400 [27:39<297:54:47,  8.17s/it]

{'loss': 7.256, 'grad_norm': 20.215560913085938, 'learning_rate': 4.9939117199391174e-05, 'epoch': 0.0}


  0%|          | 170/131400 [29:01<305:01:59,  8.37s/it]

{'loss': 7.152, 'grad_norm': 16.896860122680664, 'learning_rate': 4.993531202435312e-05, 'epoch': 0.0}


  0%|          | 180/131400 [30:27<308:42:48,  8.47s/it]

{'loss': 7.1687, 'grad_norm': 20.664146423339844, 'learning_rate': 4.993150684931507e-05, 'epoch': 0.0}


  0%|          | 190/131400 [31:50<303:02:05,  8.31s/it]

{'loss': 7.0841, 'grad_norm': 21.49143409729004, 'learning_rate': 4.9927701674277016e-05, 'epoch': 0.0}


  0%|          | 200/131400 [33:17<315:07:16,  8.65s/it]

{'loss': 7.1295, 'grad_norm': 17.542476654052734, 'learning_rate': 4.992389649923896e-05, 'epoch': 0.0}


  0%|          | 210/131400 [34:39<302:49:34,  8.31s/it]

{'loss': 7.2477, 'grad_norm': 16.890565872192383, 'learning_rate': 4.992009132420092e-05, 'epoch': 0.0}


  0%|          | 220/131400 [36:06<321:43:40,  8.83s/it]

{'loss': 7.1842, 'grad_norm': 20.658491134643555, 'learning_rate': 4.9916286149162865e-05, 'epoch': 0.01}


  0%|          | 230/131400 [37:30<303:42:20,  8.34s/it]

{'loss': 7.1712, 'grad_norm': 17.107526779174805, 'learning_rate': 4.991248097412481e-05, 'epoch': 0.01}


  0%|          | 240/131400 [38:56<317:12:57,  8.71s/it]

{'loss': 7.1132, 'grad_norm': 16.60672950744629, 'learning_rate': 4.990867579908676e-05, 'epoch': 0.01}


  0%|          | 250/131400 [40:21<314:31:36,  8.63s/it]

{'loss': 7.0411, 'grad_norm': 21.80359649658203, 'learning_rate': 4.9904870624048714e-05, 'epoch': 0.01}


  0%|          | 260/131400 [41:53<341:22:54,  9.37s/it]

{'loss': 7.2458, 'grad_norm': 14.553919792175293, 'learning_rate': 4.990106544901066e-05, 'epoch': 0.01}


  0%|          | 270/131400 [43:30<378:51:01, 10.40s/it]

{'loss': 7.1285, 'grad_norm': 19.010040283203125, 'learning_rate': 4.989726027397261e-05, 'epoch': 0.01}


  0%|          | 280/131400 [45:20<373:59:52, 10.27s/it]

{'loss': 7.0495, 'grad_norm': 18.609006881713867, 'learning_rate': 4.9893455098934556e-05, 'epoch': 0.01}


  0%|          | 290/131400 [46:48<301:24:29,  8.28s/it]

{'loss': 7.0203, 'grad_norm': 20.554302215576172, 'learning_rate': 4.98896499238965e-05, 'epoch': 0.01}


  0%|          | 300/131400 [48:11<318:00:09,  8.73s/it]

{'loss': 7.2298, 'grad_norm': 14.1107816696167, 'learning_rate': 4.988584474885845e-05, 'epoch': 0.01}


  0%|          | 310/131400 [49:39<317:22:36,  8.72s/it]

{'loss': 6.9788, 'grad_norm': 16.397541046142578, 'learning_rate': 4.98820395738204e-05, 'epoch': 0.01}


  0%|          | 320/131400 [51:03<315:40:26,  8.67s/it]

{'loss': 6.9971, 'grad_norm': 16.906269073486328, 'learning_rate': 4.9878234398782345e-05, 'epoch': 0.01}


  0%|          | 330/131400 [52:41<371:38:11, 10.21s/it]

{'loss': 6.9681, 'grad_norm': 15.661188125610352, 'learning_rate': 4.987442922374429e-05, 'epoch': 0.01}


  0%|          | 340/131400 [54:01<288:47:10,  7.93s/it]

{'loss': 6.9945, 'grad_norm': 15.609393119812012, 'learning_rate': 4.987062404870625e-05, 'epoch': 0.01}


  0%|          | 350/131400 [55:32<355:43:15,  9.77s/it]

{'loss': 6.9533, 'grad_norm': 13.813879013061523, 'learning_rate': 4.9866818873668194e-05, 'epoch': 0.01}


  0%|          | 360/131400 [56:52<280:00:39,  7.69s/it]

{'loss': 6.8678, 'grad_norm': 15.646059036254883, 'learning_rate': 4.986301369863014e-05, 'epoch': 0.01}


  0%|          | 370/131400 [58:57<429:30:04, 11.80s/it]

{'loss': 6.8497, 'grad_norm': 13.536334991455078, 'learning_rate': 4.985920852359209e-05, 'epoch': 0.01}


  0%|          | 380/131400 [1:01:06<466:33:18, 12.82s/it]

{'loss': 6.9776, 'grad_norm': 19.715639114379883, 'learning_rate': 4.9855403348554036e-05, 'epoch': 0.01}


  0%|          | 390/131400 [1:02:46<358:57:44,  9.86s/it]

{'loss': 6.8918, 'grad_norm': 15.295245170593262, 'learning_rate': 4.9851598173515983e-05, 'epoch': 0.01}


  0%|          | 400/131400 [1:04:21<345:48:17,  9.50s/it]

{'loss': 6.8382, 'grad_norm': 14.840302467346191, 'learning_rate': 4.984779299847793e-05, 'epoch': 0.01}


  0%|          | 410/131400 [1:06:01<331:24:31,  9.11s/it]

{'loss': 6.9664, 'grad_norm': 17.234132766723633, 'learning_rate': 4.984398782343988e-05, 'epoch': 0.01}


  0%|          | 420/131400 [1:07:44<405:37:43, 11.15s/it]

{'loss': 7.0765, 'grad_norm': 15.986995697021484, 'learning_rate': 4.9840182648401826e-05, 'epoch': 0.01}


  0%|          | 430/131400 [1:09:30<329:11:00,  9.05s/it]

{'loss': 6.9853, 'grad_norm': 13.210037231445312, 'learning_rate': 4.983637747336378e-05, 'epoch': 0.01}


  0%|          | 440/131400 [1:11:11<402:17:28, 11.06s/it]

{'loss': 6.8799, 'grad_norm': 12.428298950195312, 'learning_rate': 4.983257229832573e-05, 'epoch': 0.01}


  0%|          | 450/131400 [1:12:37<316:02:22,  8.69s/it]

{'loss': 6.8207, 'grad_norm': 15.325752258300781, 'learning_rate': 4.9828767123287674e-05, 'epoch': 0.01}


  0%|          | 460/131400 [1:14:06<313:20:32,  8.61s/it]

{'loss': 6.8665, 'grad_norm': 12.732142448425293, 'learning_rate': 4.982496194824962e-05, 'epoch': 0.01}


  0%|          | 470/131400 [1:15:48<427:50:25, 11.76s/it]

{'loss': 6.8351, 'grad_norm': 13.399003982543945, 'learning_rate': 4.982115677321157e-05, 'epoch': 0.01}


  0%|          | 480/131400 [1:17:39<385:28:58, 10.60s/it]

{'loss': 6.9987, 'grad_norm': 13.595779418945312, 'learning_rate': 4.9817351598173516e-05, 'epoch': 0.01}


  0%|          | 490/131400 [1:19:19<354:11:23,  9.74s/it]

{'loss': 6.7357, 'grad_norm': 15.34984302520752, 'learning_rate': 4.9813546423135464e-05, 'epoch': 0.01}


  0%|          | 500/131400 [1:20:41<299:08:43,  8.23s/it]

{'loss': 6.8528, 'grad_norm': 15.60548210144043, 'learning_rate': 4.980974124809741e-05, 'epoch': 0.01}


  0%|          | 510/131400 [1:22:07<325:35:04,  8.95s/it]

{'loss': 6.91, 'grad_norm': 14.680182456970215, 'learning_rate': 4.980593607305936e-05, 'epoch': 0.01}


  0%|          | 520/131400 [1:23:34<317:46:51,  8.74s/it]

{'loss': 6.914, 'grad_norm': 14.058926582336426, 'learning_rate': 4.980213089802131e-05, 'epoch': 0.01}


  0%|          | 530/131400 [1:25:06<323:46:17,  8.91s/it]

{'loss': 6.8382, 'grad_norm': 13.107467651367188, 'learning_rate': 4.979832572298326e-05, 'epoch': 0.01}


  0%|          | 540/131400 [1:26:36<308:15:44,  8.48s/it]

{'loss': 6.83, 'grad_norm': 12.76583194732666, 'learning_rate': 4.979452054794521e-05, 'epoch': 0.01}


  0%|          | 550/131400 [1:28:11<328:12:20,  9.03s/it]

{'loss': 6.8219, 'grad_norm': 10.50859260559082, 'learning_rate': 4.9790715372907155e-05, 'epoch': 0.01}


  0%|          | 560/131400 [1:29:41<331:50:11,  9.13s/it]

{'loss': 6.8041, 'grad_norm': 12.430028915405273, 'learning_rate': 4.97869101978691e-05, 'epoch': 0.01}


  0%|          | 570/131400 [1:31:13<343:46:24,  9.46s/it]

{'loss': 6.6977, 'grad_norm': 12.608367919921875, 'learning_rate': 4.978310502283105e-05, 'epoch': 0.01}


  0%|          | 580/131400 [1:32:48<354:43:05,  9.76s/it]

{'loss': 6.8868, 'grad_norm': 9.855950355529785, 'learning_rate': 4.9779299847793e-05, 'epoch': 0.01}


  0%|          | 590/131400 [1:34:46<406:06:33, 11.18s/it]

{'loss': 6.6615, 'grad_norm': 11.013465881347656, 'learning_rate': 4.9775494672754944e-05, 'epoch': 0.01}


  0%|          | 600/131400 [1:36:27<419:07:17, 11.54s/it]

{'loss': 6.8178, 'grad_norm': 10.524418830871582, 'learning_rate': 4.977168949771689e-05, 'epoch': 0.01}


  0%|          | 610/131400 [1:38:09<303:12:06,  8.35s/it]

{'loss': 6.8569, 'grad_norm': 10.221383094787598, 'learning_rate': 4.9767884322678846e-05, 'epoch': 0.01}


  0%|          | 620/131400 [1:39:28<285:32:56,  7.86s/it]

{'loss': 6.7201, 'grad_norm': 10.721014022827148, 'learning_rate': 4.976407914764079e-05, 'epoch': 0.01}


  0%|          | 630/131400 [1:40:50<306:24:29,  8.44s/it]

{'loss': 6.709, 'grad_norm': 13.006702423095703, 'learning_rate': 4.976027397260275e-05, 'epoch': 0.01}


  0%|          | 640/131400 [1:42:18<319:54:49,  8.81s/it]

{'loss': 6.9585, 'grad_norm': 16.233409881591797, 'learning_rate': 4.9756468797564695e-05, 'epoch': 0.01}


  0%|          | 650/131400 [1:43:43<338:28:56,  9.32s/it]

{'loss': 6.7301, 'grad_norm': 14.117538452148438, 'learning_rate': 4.975266362252664e-05, 'epoch': 0.01}


  1%|          | 660/131400 [1:45:17<325:49:55,  8.97s/it]

{'loss': 6.8174, 'grad_norm': 12.01143741607666, 'learning_rate': 4.974885844748859e-05, 'epoch': 0.02}


  1%|          | 670/131400 [1:47:02<355:50:54,  9.80s/it]

{'loss': 6.701, 'grad_norm': 11.729363441467285, 'learning_rate': 4.974505327245054e-05, 'epoch': 0.02}


  1%|          | 680/131400 [1:48:37<333:08:23,  9.17s/it]

{'loss': 6.7181, 'grad_norm': 11.3588228225708, 'learning_rate': 4.9741248097412484e-05, 'epoch': 0.02}


  1%|          | 690/131400 [1:50:11<384:22:03, 10.59s/it]

{'loss': 6.7004, 'grad_norm': 10.462471961975098, 'learning_rate': 4.973744292237443e-05, 'epoch': 0.02}


  1%|          | 700/131400 [1:51:45<350:22:11,  9.65s/it]

{'loss': 6.6873, 'grad_norm': 14.657408714294434, 'learning_rate': 4.973363774733638e-05, 'epoch': 0.02}


  1%|          | 710/131400 [1:53:14<314:58:58,  8.68s/it]

{'loss': 6.9227, 'grad_norm': 10.292388916015625, 'learning_rate': 4.9729832572298326e-05, 'epoch': 0.02}


  1%|          | 720/131400 [1:54:56<317:54:51,  8.76s/it]

{'loss': 6.7951, 'grad_norm': 10.452428817749023, 'learning_rate': 4.972602739726028e-05, 'epoch': 0.02}


  1%|          | 730/131400 [1:56:27<337:27:20,  9.30s/it]

{'loss': 6.7805, 'grad_norm': 13.150938987731934, 'learning_rate': 4.972222222222223e-05, 'epoch': 0.02}


  1%|          | 740/131400 [1:58:09<365:26:23, 10.07s/it]

{'loss': 6.5601, 'grad_norm': 10.742718696594238, 'learning_rate': 4.9718417047184175e-05, 'epoch': 0.02}


  1%|          | 750/131400 [1:59:36<307:35:22,  8.48s/it]

{'loss': 6.7625, 'grad_norm': 11.167462348937988, 'learning_rate': 4.971461187214612e-05, 'epoch': 0.02}


  1%|          | 760/131400 [2:01:02<328:32:28,  9.05s/it]

{'loss': 6.6995, 'grad_norm': 10.784554481506348, 'learning_rate': 4.971080669710807e-05, 'epoch': 0.02}


  1%|          | 770/131400 [2:02:32<321:28:43,  8.86s/it]

{'loss': 6.6493, 'grad_norm': 10.671467781066895, 'learning_rate': 4.970700152207002e-05, 'epoch': 0.02}


  1%|          | 780/131400 [2:03:56<298:17:33,  8.22s/it]

{'loss': 6.7882, 'grad_norm': 12.19393253326416, 'learning_rate': 4.9703196347031964e-05, 'epoch': 0.02}


  1%|          | 790/131400 [2:05:33<324:47:50,  8.95s/it]

{'loss': 6.7327, 'grad_norm': 10.739331245422363, 'learning_rate': 4.969939117199391e-05, 'epoch': 0.02}


  1%|          | 800/131400 [2:06:57<314:19:54,  8.66s/it]

{'loss': 6.621, 'grad_norm': 10.371006965637207, 'learning_rate': 4.969558599695586e-05, 'epoch': 0.02}


  1%|          | 810/131400 [2:08:39<369:27:38, 10.18s/it]

{'loss': 6.5706, 'grad_norm': 9.350381851196289, 'learning_rate': 4.969178082191781e-05, 'epoch': 0.02}


  1%|          | 820/131400 [2:10:14<348:05:35,  9.60s/it]

{'loss': 6.8084, 'grad_norm': 9.529500007629395, 'learning_rate': 4.968797564687976e-05, 'epoch': 0.02}


  1%|          | 830/131400 [2:11:54<357:49:02,  9.87s/it]

{'loss': 6.6139, 'grad_norm': 8.920326232910156, 'learning_rate': 4.968417047184171e-05, 'epoch': 0.02}


  1%|          | 840/131400 [2:14:06<502:14:20, 13.85s/it]

{'loss': 6.7625, 'grad_norm': 11.376832008361816, 'learning_rate': 4.9680365296803655e-05, 'epoch': 0.02}


  1%|          | 850/131400 [2:15:58<400:07:07, 11.03s/it]

{'loss': 6.7916, 'grad_norm': 10.367448806762695, 'learning_rate': 4.96765601217656e-05, 'epoch': 0.02}


  1%|          | 860/131400 [2:17:36<359:49:41,  9.92s/it]

{'loss': 6.6679, 'grad_norm': 11.026905059814453, 'learning_rate': 4.967275494672755e-05, 'epoch': 0.02}


  1%|          | 870/131400 [2:19:11<328:43:14,  9.07s/it]

{'loss': 6.594, 'grad_norm': 8.700374603271484, 'learning_rate': 4.96689497716895e-05, 'epoch': 0.02}


  1%|          | 880/131400 [2:20:44<339:00:59,  9.35s/it]

{'loss': 6.5798, 'grad_norm': 10.844677925109863, 'learning_rate': 4.9665144596651445e-05, 'epoch': 0.02}


  1%|          | 890/131400 [2:22:16<334:49:44,  9.24s/it]

{'loss': 6.6374, 'grad_norm': 10.156621932983398, 'learning_rate': 4.966133942161339e-05, 'epoch': 0.02}


  1%|          | 900/131400 [2:23:49<328:12:45,  9.05s/it]

{'loss': 6.6814, 'grad_norm': 7.257091045379639, 'learning_rate': 4.9657534246575346e-05, 'epoch': 0.02}


  1%|          | 910/131400 [2:25:18<325:30:07,  8.98s/it]

{'loss': 6.7202, 'grad_norm': 9.215922355651855, 'learning_rate': 4.9653729071537294e-05, 'epoch': 0.02}


  1%|          | 920/131400 [2:26:45<302:42:50,  8.35s/it]

{'loss': 6.7535, 'grad_norm': 10.27644157409668, 'learning_rate': 4.964992389649924e-05, 'epoch': 0.02}


  1%|          | 930/131400 [2:28:10<314:13:47,  8.67s/it]

{'loss': 6.6811, 'grad_norm': 10.26599407196045, 'learning_rate': 4.964611872146119e-05, 'epoch': 0.02}


  1%|          | 940/131400 [2:29:37<300:38:40,  8.30s/it]

{'loss': 6.6828, 'grad_norm': 7.471916675567627, 'learning_rate': 4.9642313546423136e-05, 'epoch': 0.02}


  1%|          | 950/131400 [2:31:02<314:30:43,  8.68s/it]

{'loss': 6.7036, 'grad_norm': 8.533308982849121, 'learning_rate': 4.963850837138508e-05, 'epoch': 0.02}


  1%|          | 960/131400 [2:32:30<307:55:01,  8.50s/it]

{'loss': 6.692, 'grad_norm': 11.549881935119629, 'learning_rate': 4.963470319634703e-05, 'epoch': 0.02}


  1%|          | 970/131400 [2:33:53<311:42:26,  8.60s/it]

{'loss': 6.7123, 'grad_norm': 8.546299934387207, 'learning_rate': 4.963089802130898e-05, 'epoch': 0.02}


  1%|          | 980/131400 [2:35:21<311:24:57,  8.60s/it]

{'loss': 6.5795, 'grad_norm': 9.17988109588623, 'learning_rate': 4.9627092846270925e-05, 'epoch': 0.02}


  1%|          | 990/131400 [2:36:44<308:13:01,  8.51s/it]

{'loss': 6.5621, 'grad_norm': 10.37355899810791, 'learning_rate': 4.962328767123288e-05, 'epoch': 0.02}


  1%|          | 1000/131400 [2:38:11<305:54:54,  8.45s/it]

{'loss': 6.5795, 'grad_norm': 8.83239459991455, 'learning_rate': 4.961948249619483e-05, 'epoch': 0.02}


  1%|          | 1010/131400 [2:39:36<306:16:59,  8.46s/it]

{'loss': 6.6775, 'grad_norm': 8.924285888671875, 'learning_rate': 4.961567732115678e-05, 'epoch': 0.02}


  1%|          | 1020/131400 [2:41:04<316:24:16,  8.74s/it]

{'loss': 6.5679, 'grad_norm': 9.36085319519043, 'learning_rate': 4.961187214611873e-05, 'epoch': 0.02}


  1%|          | 1030/131400 [2:42:27<301:40:19,  8.33s/it]

{'loss': 6.6062, 'grad_norm': 7.85512638092041, 'learning_rate': 4.9608066971080676e-05, 'epoch': 0.02}


  1%|          | 1040/131400 [2:43:55<323:42:16,  8.94s/it]

{'loss': 6.5708, 'grad_norm': 10.056740760803223, 'learning_rate': 4.960426179604262e-05, 'epoch': 0.02}


  1%|          | 1050/131400 [2:45:19<297:25:48,  8.21s/it]

{'loss': 6.5181, 'grad_norm': 9.739507675170898, 'learning_rate': 4.960045662100457e-05, 'epoch': 0.02}


  1%|          | 1060/131400 [2:46:46<321:54:37,  8.89s/it]

{'loss': 6.7736, 'grad_norm': 8.170149803161621, 'learning_rate': 4.959665144596652e-05, 'epoch': 0.02}


  1%|          | 1070/131400 [2:48:16<310:11:13,  8.57s/it]

{'loss': 6.5568, 'grad_norm': 8.639184951782227, 'learning_rate': 4.9592846270928465e-05, 'epoch': 0.02}


  1%|          | 1080/131400 [2:49:45<325:31:11,  8.99s/it]

{'loss': 6.5687, 'grad_norm': 7.222959041595459, 'learning_rate': 4.958904109589041e-05, 'epoch': 0.02}


  1%|          | 1090/131400 [2:51:08<300:38:48,  8.31s/it]

{'loss': 6.6986, 'grad_norm': 7.074214458465576, 'learning_rate': 4.958523592085236e-05, 'epoch': 0.02}


  1%|          | 1100/131400 [2:52:36<322:07:38,  8.90s/it]

{'loss': 6.569, 'grad_norm': 9.857952117919922, 'learning_rate': 4.9581430745814314e-05, 'epoch': 0.03}


  1%|          | 1110/131400 [2:54:01<301:52:32,  8.34s/it]

{'loss': 6.6236, 'grad_norm': 8.12768268585205, 'learning_rate': 4.957762557077626e-05, 'epoch': 0.03}


  1%|          | 1120/131400 [2:55:29<323:18:39,  8.93s/it]

{'loss': 6.5551, 'grad_norm': 9.156303405761719, 'learning_rate': 4.957382039573821e-05, 'epoch': 0.03}


  1%|          | 1130/131400 [2:56:53<299:32:38,  8.28s/it]

{'loss': 6.4875, 'grad_norm': 7.688509941101074, 'learning_rate': 4.9570015220700156e-05, 'epoch': 0.03}


  1%|          | 1140/131400 [2:58:20<318:55:57,  8.81s/it]

{'loss': 6.5037, 'grad_norm': 8.220185279846191, 'learning_rate': 4.9566210045662103e-05, 'epoch': 0.03}


  1%|          | 1150/131400 [2:59:46<301:11:13,  8.32s/it]

{'loss': 6.4294, 'grad_norm': 7.688224792480469, 'learning_rate': 4.956240487062405e-05, 'epoch': 0.03}


  1%|          | 1160/131400 [3:01:11<313:58:28,  8.68s/it]

{'loss': 6.3655, 'grad_norm': 7.543639659881592, 'learning_rate': 4.9558599695586e-05, 'epoch': 0.03}


  1%|          | 1170/131400 [3:02:37<300:11:27,  8.30s/it]

{'loss': 6.4452, 'grad_norm': 8.134565353393555, 'learning_rate': 4.9554794520547946e-05, 'epoch': 0.03}


  1%|          | 1180/131400 [3:04:02<310:16:34,  8.58s/it]

{'loss': 6.586, 'grad_norm': 8.786698341369629, 'learning_rate': 4.955098934550989e-05, 'epoch': 0.03}


  1%|          | 1190/131400 [3:05:28<303:02:40,  8.38s/it]

{'loss': 6.4217, 'grad_norm': 7.045681476593018, 'learning_rate': 4.954718417047185e-05, 'epoch': 0.03}


  1%|          | 1200/131400 [3:06:51<306:43:19,  8.48s/it]

{'loss': 6.4862, 'grad_norm': 8.418926239013672, 'learning_rate': 4.9543378995433794e-05, 'epoch': 0.03}


  1%|          | 1210/131400 [3:08:20<315:27:18,  8.72s/it]

{'loss': 6.6021, 'grad_norm': 8.03094482421875, 'learning_rate': 4.953957382039574e-05, 'epoch': 0.03}


  1%|          | 1220/131400 [3:09:43<304:38:40,  8.42s/it]

{'loss': 6.5474, 'grad_norm': 7.943747043609619, 'learning_rate': 4.953576864535769e-05, 'epoch': 0.03}


  1%|          | 1230/131400 [3:11:10<311:29:45,  8.61s/it]

{'loss': 6.6517, 'grad_norm': 6.494755744934082, 'learning_rate': 4.9531963470319636e-05, 'epoch': 0.03}


  1%|          | 1240/131400 [3:12:34<310:38:36,  8.59s/it]

{'loss': 6.465, 'grad_norm': 9.823030471801758, 'learning_rate': 4.9528158295281584e-05, 'epoch': 0.03}


  1%|          | 1250/131400 [3:13:58<294:46:12,  8.15s/it]

{'loss': 6.5098, 'grad_norm': 7.188962936401367, 'learning_rate': 4.952435312024353e-05, 'epoch': 0.03}


  1%|          | 1260/131400 [3:15:24<315:56:04,  8.74s/it]

{'loss': 6.5687, 'grad_norm': 8.798835754394531, 'learning_rate': 4.952054794520548e-05, 'epoch': 0.03}


  1%|          | 1270/131400 [3:16:49<299:31:53,  8.29s/it]

{'loss': 6.4335, 'grad_norm': 5.912449836730957, 'learning_rate': 4.9516742770167426e-05, 'epoch': 0.03}


  1%|          | 1280/131400 [3:18:15<318:56:03,  8.82s/it]

{'loss': 6.3838, 'grad_norm': 7.371393203735352, 'learning_rate': 4.951293759512938e-05, 'epoch': 0.03}


  1%|          | 1290/131400 [3:19:38<294:47:24,  8.16s/it]

{'loss': 6.5762, 'grad_norm': 6.947927474975586, 'learning_rate': 4.950913242009133e-05, 'epoch': 0.03}


  1%|          | 1300/131400 [3:21:04<316:59:52,  8.77s/it]

{'loss': 6.5742, 'grad_norm': 11.808711051940918, 'learning_rate': 4.9505327245053275e-05, 'epoch': 0.03}


  1%|          | 1310/131400 [3:22:28<295:16:56,  8.17s/it]

{'loss': 6.4956, 'grad_norm': 6.4146013259887695, 'learning_rate': 4.950152207001522e-05, 'epoch': 0.03}


  1%|          | 1320/131400 [3:23:54<314:35:17,  8.71s/it]

{'loss': 6.4812, 'grad_norm': 8.747940063476562, 'learning_rate': 4.949771689497717e-05, 'epoch': 0.03}


  1%|          | 1330/131400 [3:25:19<305:20:50,  8.45s/it]

{'loss': 6.6755, 'grad_norm': 6.06494140625, 'learning_rate': 4.949391171993912e-05, 'epoch': 0.03}


  1%|          | 1340/131400 [3:26:43<297:53:14,  8.25s/it]

{'loss': 6.6653, 'grad_norm': 6.130621910095215, 'learning_rate': 4.9490106544901064e-05, 'epoch': 0.03}


  1%|          | 1350/131400 [3:28:06<307:30:58,  8.51s/it]

{'loss': 6.5338, 'grad_norm': 7.26130485534668, 'learning_rate': 4.948630136986301e-05, 'epoch': 0.03}


  1%|          | 1360/131400 [3:29:31<294:49:59,  8.16s/it]

{'loss': 6.5406, 'grad_norm': 7.402958869934082, 'learning_rate': 4.948249619482496e-05, 'epoch': 0.03}


  1%|          | 1370/131400 [3:30:56<311:13:32,  8.62s/it]

{'loss': 6.3984, 'grad_norm': 7.68597412109375, 'learning_rate': 4.947869101978691e-05, 'epoch': 0.03}


  1%|          | 1380/131400 [3:32:19<293:29:36,  8.13s/it]

{'loss': 6.4079, 'grad_norm': 6.2680983543396, 'learning_rate': 4.947488584474886e-05, 'epoch': 0.03}


  1%|          | 1390/131400 [3:33:44<314:15:18,  8.70s/it]

{'loss': 6.498, 'grad_norm': 6.398214340209961, 'learning_rate': 4.947108066971081e-05, 'epoch': 0.03}


  1%|          | 1400/131400 [3:35:07<295:41:56,  8.19s/it]

{'loss': 6.6771, 'grad_norm': 7.80919075012207, 'learning_rate': 4.9467275494672755e-05, 'epoch': 0.03}


  1%|          | 1410/131400 [3:36:33<316:33:39,  8.77s/it]

{'loss': 6.5551, 'grad_norm': 6.34414005279541, 'learning_rate': 4.946347031963471e-05, 'epoch': 0.03}


  1%|          | 1420/131400 [3:37:55<296:11:02,  8.20s/it]

{'loss': 6.545, 'grad_norm': 7.834402084350586, 'learning_rate': 4.945966514459666e-05, 'epoch': 0.03}


  1%|          | 1430/131400 [3:39:21<312:17:33,  8.65s/it]

{'loss': 6.4407, 'grad_norm': 7.201574325561523, 'learning_rate': 4.9455859969558604e-05, 'epoch': 0.03}


  1%|          | 1440/131400 [3:40:44<299:51:18,  8.31s/it]

{'loss': 6.4515, 'grad_norm': 7.707822799682617, 'learning_rate': 4.945205479452055e-05, 'epoch': 0.03}


  1%|          | 1450/131400 [3:42:10<306:27:48,  8.49s/it]

{'loss': 6.4811, 'grad_norm': 7.415334224700928, 'learning_rate': 4.94482496194825e-05, 'epoch': 0.03}


  1%|          | 1460/131400 [3:43:33<304:12:58,  8.43s/it]

{'loss': 6.5195, 'grad_norm': 9.172432899475098, 'learning_rate': 4.9444444444444446e-05, 'epoch': 0.03}


  1%|          | 1470/131400 [3:44:58<299:48:54,  8.31s/it]

{'loss': 6.5174, 'grad_norm': 7.5206780433654785, 'learning_rate': 4.94406392694064e-05, 'epoch': 0.03}


  1%|          | 1480/131400 [3:46:21<304:04:54,  8.43s/it]

{'loss': 6.4867, 'grad_norm': 8.235383033752441, 'learning_rate': 4.943683409436835e-05, 'epoch': 0.03}


  1%|          | 1490/131400 [3:47:47<302:27:30,  8.38s/it]

{'loss': 6.6377, 'grad_norm': 7.994895935058594, 'learning_rate': 4.9433028919330295e-05, 'epoch': 0.03}


  1%|          | 1500/131400 [3:49:10<309:03:17,  8.57s/it]

{'loss': 6.4195, 'grad_norm': 6.880743026733398, 'learning_rate': 4.942922374429224e-05, 'epoch': 0.03}


  1%|          | 1510/131400 [3:50:36<295:40:24,  8.19s/it]

{'loss': 6.1793, 'grad_norm': 6.475636005401611, 'learning_rate': 4.942541856925419e-05, 'epoch': 0.03}


  1%|          | 1520/131400 [3:51:59<305:40:33,  8.47s/it]

{'loss': 6.3606, 'grad_norm': 6.745049476623535, 'learning_rate': 4.942161339421614e-05, 'epoch': 0.03}


  1%|          | 1530/131400 [3:53:23<294:44:19,  8.17s/it]

{'loss': 6.4738, 'grad_norm': 7.110759258270264, 'learning_rate': 4.9417808219178084e-05, 'epoch': 0.03}


  1%|          | 1540/131400 [3:54:48<312:57:55,  8.68s/it]

{'loss': 6.3372, 'grad_norm': 7.337649822235107, 'learning_rate': 4.941400304414003e-05, 'epoch': 0.04}


  1%|          | 1550/131400 [3:56:12<293:52:56,  8.15s/it]

{'loss': 6.3922, 'grad_norm': 6.933180809020996, 'learning_rate': 4.941019786910198e-05, 'epoch': 0.04}


  1%|          | 1560/131400 [3:57:37<309:41:35,  8.59s/it]

{'loss': 6.5446, 'grad_norm': 6.398783206939697, 'learning_rate': 4.9406392694063927e-05, 'epoch': 0.04}


  1%|          | 1570/131400 [3:59:00<294:54:27,  8.18s/it]

{'loss': 6.4079, 'grad_norm': 6.348526477813721, 'learning_rate': 4.940258751902588e-05, 'epoch': 0.04}


  1%|          | 1580/131400 [4:00:26<311:45:43,  8.65s/it]

{'loss': 6.5517, 'grad_norm': 6.961728096008301, 'learning_rate': 4.939878234398783e-05, 'epoch': 0.04}


  1%|          | 1590/131400 [4:01:49<300:37:46,  8.34s/it]

{'loss': 6.5218, 'grad_norm': 5.614473819732666, 'learning_rate': 4.9394977168949775e-05, 'epoch': 0.04}


  1%|          | 1600/131400 [4:03:15<313:11:27,  8.69s/it]

{'loss': 6.4227, 'grad_norm': 6.435513019561768, 'learning_rate': 4.939117199391172e-05, 'epoch': 0.04}


  1%|          | 1610/131400 [4:04:38<297:46:48,  8.26s/it]

{'loss': 6.4919, 'grad_norm': 6.472431182861328, 'learning_rate': 4.938736681887367e-05, 'epoch': 0.04}


  1%|          | 1620/131400 [4:06:05<311:50:57,  8.65s/it]

{'loss': 6.4466, 'grad_norm': 7.399056911468506, 'learning_rate': 4.938356164383562e-05, 'epoch': 0.04}


  1%|          | 1630/131400 [4:07:27<300:32:52,  8.34s/it]

{'loss': 6.3151, 'grad_norm': 5.957429885864258, 'learning_rate': 4.9379756468797565e-05, 'epoch': 0.04}


  1%|          | 1640/131400 [4:08:54<309:12:25,  8.58s/it]

{'loss': 6.4082, 'grad_norm': 5.66441011428833, 'learning_rate': 4.937595129375951e-05, 'epoch': 0.04}


  1%|▏         | 1650/131400 [4:10:17<302:12:23,  8.38s/it]

{'loss': 6.4807, 'grad_norm': 5.561145782470703, 'learning_rate': 4.937214611872146e-05, 'epoch': 0.04}


  1%|▏         | 1660/131400 [4:11:44<304:10:37,  8.44s/it]

{'loss': 6.3624, 'grad_norm': 6.035840034484863, 'learning_rate': 4.9368340943683414e-05, 'epoch': 0.04}


  1%|▏         | 1670/131400 [4:13:07<303:41:10,  8.43s/it]

{'loss': 6.3695, 'grad_norm': 7.343282699584961, 'learning_rate': 4.936453576864536e-05, 'epoch': 0.04}


  1%|▏         | 1680/131400 [4:14:34<303:22:22,  8.42s/it]

{'loss': 6.5006, 'grad_norm': 6.080617904663086, 'learning_rate': 4.936073059360731e-05, 'epoch': 0.04}


  1%|▏         | 1690/131400 [4:15:56<302:07:55,  8.39s/it]

{'loss': 6.5241, 'grad_norm': 4.8789591789245605, 'learning_rate': 4.9356925418569256e-05, 'epoch': 0.04}


  1%|▏         | 1700/131400 [4:17:24<309:18:10,  8.59s/it]

{'loss': 6.5626, 'grad_norm': 5.597792148590088, 'learning_rate': 4.93531202435312e-05, 'epoch': 0.04}


  1%|▏         | 1710/131400 [4:18:46<299:01:35,  8.30s/it]

{'loss': 6.506, 'grad_norm': 5.826480388641357, 'learning_rate': 4.934931506849315e-05, 'epoch': 0.04}


  1%|▏         | 1720/131400 [4:20:13<313:47:40,  8.71s/it]

{'loss': 6.3881, 'grad_norm': 6.707212448120117, 'learning_rate': 4.93455098934551e-05, 'epoch': 0.04}


  1%|▏         | 1730/131400 [4:21:34<297:37:18,  8.26s/it]

{'loss': 6.5052, 'grad_norm': 4.815935134887695, 'learning_rate': 4.9341704718417045e-05, 'epoch': 0.04}


  1%|▏         | 1740/131400 [4:23:02<317:47:27,  8.82s/it]

{'loss': 6.346, 'grad_norm': 7.293612480163574, 'learning_rate': 4.933789954337899e-05, 'epoch': 0.04}


  1%|▏         | 1750/131400 [5:53:10<7105:33:15, 197.30s/it]  

{'loss': 6.3619, 'grad_norm': 6.743987560272217, 'learning_rate': 4.933409436834095e-05, 'epoch': 0.04}


  1%|▏         | 1760/131400 [5:54:28<475:04:37, 13.19s/it]  

{'loss': 6.4527, 'grad_norm': 6.682193756103516, 'learning_rate': 4.9330289193302894e-05, 'epoch': 0.04}


  1%|▏         | 1770/131400 [5:55:50<303:57:24,  8.44s/it]

{'loss': 6.4114, 'grad_norm': 5.029750823974609, 'learning_rate': 4.932648401826484e-05, 'epoch': 0.04}


  1%|▏         | 1780/131400 [5:57:07<276:04:40,  7.67s/it]

{'loss': 6.5018, 'grad_norm': 10.185173988342285, 'learning_rate': 4.932267884322679e-05, 'epoch': 0.04}


  1%|▏         | 1790/131400 [5:58:20<265:36:34,  7.38s/it]

{'loss': 6.3929, 'grad_norm': 5.580326557159424, 'learning_rate': 4.9318873668188736e-05, 'epoch': 0.04}


  1%|▏         | 1800/131400 [5:59:35<272:42:11,  7.58s/it]

{'loss': 6.3864, 'grad_norm': 6.890663146972656, 'learning_rate': 4.9315068493150684e-05, 'epoch': 0.04}


  1%|▏         | 1810/131400 [6:00:49<267:07:52,  7.42s/it]

{'loss': 6.3905, 'grad_norm': 6.7126145362854, 'learning_rate': 4.931126331811264e-05, 'epoch': 0.04}


  1%|▏         | 1820/131400 [6:02:04<270:40:18,  7.52s/it]

{'loss': 6.3219, 'grad_norm': 7.310121536254883, 'learning_rate': 4.9307458143074585e-05, 'epoch': 0.04}


  1%|▏         | 1830/131400 [6:03:18<266:40:06,  7.41s/it]

{'loss': 6.2827, 'grad_norm': 5.518352031707764, 'learning_rate': 4.930365296803653e-05, 'epoch': 0.04}


  1%|▏         | 1840/131400 [6:04:33<269:59:08,  7.50s/it]

{'loss': 6.3748, 'grad_norm': 6.454932689666748, 'learning_rate': 4.929984779299848e-05, 'epoch': 0.04}


  1%|▏         | 1850/131400 [6:05:49<273:39:15,  7.60s/it]

{'loss': 6.4809, 'grad_norm': 5.278397560119629, 'learning_rate': 4.9296042617960434e-05, 'epoch': 0.04}


  1%|▏         | 1860/131400 [6:07:05<273:54:51,  7.61s/it]

{'loss': 6.3929, 'grad_norm': 6.852612018585205, 'learning_rate': 4.929223744292238e-05, 'epoch': 0.04}


  1%|▏         | 1870/131400 [6:08:20<271:33:50,  7.55s/it]

{'loss': 6.5264, 'grad_norm': 6.653923034667969, 'learning_rate': 4.928843226788433e-05, 'epoch': 0.04}


  1%|▏         | 1880/131400 [6:09:35<267:52:59,  7.45s/it]

{'loss': 6.353, 'grad_norm': 5.61915397644043, 'learning_rate': 4.9284627092846276e-05, 'epoch': 0.04}


  1%|▏         | 1890/131400 [6:10:51<275:52:33,  7.67s/it]

{'loss': 6.3353, 'grad_norm': 8.038904190063477, 'learning_rate': 4.9280821917808223e-05, 'epoch': 0.04}


  1%|▏         | 1900/131400 [6:12:06<274:36:53,  7.63s/it]

{'loss': 6.2662, 'grad_norm': 5.482513904571533, 'learning_rate': 4.927701674277017e-05, 'epoch': 0.04}


  1%|▏         | 1910/131400 [6:13:23<270:30:34,  7.52s/it]

{'loss': 6.5378, 'grad_norm': 5.242302417755127, 'learning_rate': 4.927321156773212e-05, 'epoch': 0.04}


  1%|▏         | 1920/131400 [6:14:38<274:10:43,  7.62s/it]

{'loss': 6.3905, 'grad_norm': 4.757596969604492, 'learning_rate': 4.9269406392694065e-05, 'epoch': 0.04}


  1%|▏         | 1930/131400 [6:15:54<269:16:56,  7.49s/it]

{'loss': 6.359, 'grad_norm': 4.532262802124023, 'learning_rate': 4.926560121765601e-05, 'epoch': 0.04}


  1%|▏         | 1940/131400 [6:17:11<277:06:09,  7.71s/it]

{'loss': 6.4112, 'grad_norm': 6.066380023956299, 'learning_rate': 4.926179604261797e-05, 'epoch': 0.04}


  1%|▏         | 1950/131400 [6:18:26<273:02:44,  7.59s/it]

{'loss': 6.2195, 'grad_norm': 6.099703311920166, 'learning_rate': 4.9257990867579914e-05, 'epoch': 0.04}


  1%|▏         | 1960/131400 [6:19:43<269:52:06,  7.51s/it]

{'loss': 6.5134, 'grad_norm': 5.5978169441223145, 'learning_rate': 4.925418569254186e-05, 'epoch': 0.04}


  1%|▏         | 1970/131400 [6:20:59<275:02:48,  7.65s/it]

{'loss': 6.428, 'grad_norm': 6.089465141296387, 'learning_rate': 4.925038051750381e-05, 'epoch': 0.04}


  2%|▏         | 1980/131400 [6:22:15<272:52:50,  7.59s/it]

{'loss': 6.4893, 'grad_norm': 4.854267120361328, 'learning_rate': 4.9246575342465756e-05, 'epoch': 0.05}


  2%|▏         | 1990/131400 [6:23:32<278:17:13,  7.74s/it]

{'loss': 6.3567, 'grad_norm': 7.518989562988281, 'learning_rate': 4.9242770167427704e-05, 'epoch': 0.05}


  2%|▏         | 2000/131400 [6:24:47<273:33:24,  7.61s/it]

{'loss': 6.212, 'grad_norm': 6.317091941833496, 'learning_rate': 4.923896499238965e-05, 'epoch': 0.05}


  2%|▏         | 2010/131400 [6:26:06<273:56:57,  7.62s/it]

{'loss': 6.2967, 'grad_norm': 7.556146621704102, 'learning_rate': 4.92351598173516e-05, 'epoch': 0.05}


  2%|▏         | 2020/131400 [6:27:21<273:06:57,  7.60s/it]

{'loss': 6.3104, 'grad_norm': 5.014469146728516, 'learning_rate': 4.9231354642313546e-05, 'epoch': 0.05}


  2%|▏         | 2030/131400 [6:28:45<281:24:03,  7.83s/it]

{'loss': 6.2667, 'grad_norm': 6.976631164550781, 'learning_rate': 4.92275494672755e-05, 'epoch': 0.05}


  2%|▏         | 2040/131400 [6:30:14<310:14:27,  8.63s/it]

{'loss': 6.3817, 'grad_norm': 6.311310291290283, 'learning_rate': 4.922374429223745e-05, 'epoch': 0.05}


  2%|▏         | 2050/131400 [6:31:37<300:44:57,  8.37s/it]

{'loss': 6.2801, 'grad_norm': 5.173977851867676, 'learning_rate': 4.9219939117199395e-05, 'epoch': 0.05}


  2%|▏         | 2060/131400 [6:33:11<307:12:12,  8.55s/it]

{'loss': 6.4574, 'grad_norm': 5.715651512145996, 'learning_rate': 4.921613394216134e-05, 'epoch': 0.05}


  2%|▏         | 2070/131400 [6:34:40<330:05:07,  9.19s/it]

{'loss': 6.5249, 'grad_norm': 5.440438747406006, 'learning_rate': 4.921232876712329e-05, 'epoch': 0.05}


  2%|▏         | 2080/131400 [6:36:10<327:37:35,  9.12s/it]

{'loss': 6.2805, 'grad_norm': 4.822451591491699, 'learning_rate': 4.920852359208524e-05, 'epoch': 0.05}


  2%|▏         | 2090/131400 [6:37:30<296:37:03,  8.26s/it]

{'loss': 6.3268, 'grad_norm': 6.1822123527526855, 'learning_rate': 4.9204718417047184e-05, 'epoch': 0.05}


  2%|▏         | 2100/131400 [6:38:51<313:00:45,  8.71s/it]

{'loss': 6.335, 'grad_norm': 5.209956645965576, 'learning_rate': 4.920091324200913e-05, 'epoch': 0.05}


  2%|▏         | 2110/131400 [6:40:22<302:23:11,  8.42s/it]

{'loss': 6.3915, 'grad_norm': 6.939753532409668, 'learning_rate': 4.919710806697108e-05, 'epoch': 0.05}


  2%|▏         | 2120/131400 [6:41:49<330:21:15,  9.20s/it]

{'loss': 6.3758, 'grad_norm': 5.293596267700195, 'learning_rate': 4.9193302891933026e-05, 'epoch': 0.05}


  2%|▏         | 2130/131400 [6:43:24<356:58:01,  9.94s/it]

{'loss': 6.2422, 'grad_norm': 5.1794257164001465, 'learning_rate': 4.918949771689498e-05, 'epoch': 0.05}


  2%|▏         | 2140/131400 [6:44:50<307:46:26,  8.57s/it]

{'loss': 6.2709, 'grad_norm': 3.8321685791015625, 'learning_rate': 4.918569254185693e-05, 'epoch': 0.05}


  2%|▏         | 2150/131400 [6:46:16<301:44:08,  8.40s/it]

{'loss': 6.2193, 'grad_norm': 6.807651996612549, 'learning_rate': 4.9181887366818875e-05, 'epoch': 0.05}


  2%|▏         | 2160/131400 [6:47:43<300:48:28,  8.38s/it]

{'loss': 6.3323, 'grad_norm': 4.986557960510254, 'learning_rate': 4.917808219178082e-05, 'epoch': 0.05}


  2%|▏         | 2170/131400 [6:49:10<289:25:59,  8.06s/it]

{'loss': 6.2676, 'grad_norm': 5.454174518585205, 'learning_rate': 4.917427701674277e-05, 'epoch': 0.05}


  2%|▏         | 2180/131400 [6:50:38<339:32:55,  9.46s/it]

{'loss': 6.23, 'grad_norm': 4.432543754577637, 'learning_rate': 4.917047184170472e-05, 'epoch': 0.05}


  2%|▏         | 2190/131400 [6:52:12<319:54:25,  8.91s/it]

{'loss': 6.41, 'grad_norm': 4.52353572845459, 'learning_rate': 4.9166666666666665e-05, 'epoch': 0.05}


  2%|▏         | 2200/131400 [6:53:28<273:31:37,  7.62s/it]

{'loss': 6.4163, 'grad_norm': 6.311410903930664, 'learning_rate': 4.916286149162861e-05, 'epoch': 0.05}


  2%|▏         | 2210/131400 [6:55:04<381:24:29, 10.63s/it]

{'loss': 6.2326, 'grad_norm': 5.544553279876709, 'learning_rate': 4.9159056316590566e-05, 'epoch': 0.05}


  2%|▏         | 2220/131400 [6:56:39<322:46:49,  9.00s/it]

{'loss': 6.3129, 'grad_norm': 4.730569362640381, 'learning_rate': 4.9155251141552513e-05, 'epoch': 0.05}


  2%|▏         | 2230/131400 [6:58:12<340:33:32,  9.49s/it]

{'loss': 6.2392, 'grad_norm': 5.483435153961182, 'learning_rate': 4.915144596651447e-05, 'epoch': 0.05}


  2%|▏         | 2240/131400 [6:59:40<316:12:00,  8.81s/it]

{'loss': 6.2765, 'grad_norm': 5.1437835693359375, 'learning_rate': 4.9147640791476415e-05, 'epoch': 0.05}


  2%|▏         | 2250/131400 [7:01:15<340:56:25,  9.50s/it]

{'loss': 6.2752, 'grad_norm': 6.055098056793213, 'learning_rate': 4.914383561643836e-05, 'epoch': 0.05}


  2%|▏         | 2260/131400 [7:02:41<304:59:36,  8.50s/it]

{'loss': 6.2565, 'grad_norm': 5.138556003570557, 'learning_rate': 4.914003044140031e-05, 'epoch': 0.05}


  2%|▏         | 2270/131400 [7:04:13<343:27:10,  9.58s/it]

{'loss': 6.3758, 'grad_norm': 4.802087306976318, 'learning_rate': 4.913622526636226e-05, 'epoch': 0.05}


  2%|▏         | 2280/131400 [7:05:53<346:23:20,  9.66s/it]

{'loss': 6.4085, 'grad_norm': 6.24124002456665, 'learning_rate': 4.9132420091324204e-05, 'epoch': 0.05}


  2%|▏         | 2290/131400 [7:07:34<354:50:44,  9.89s/it]

{'loss': 6.1559, 'grad_norm': 4.685981273651123, 'learning_rate': 4.912861491628615e-05, 'epoch': 0.05}


  2%|▏         | 2300/131400 [7:09:04<322:12:55,  8.99s/it]

{'loss': 6.4361, 'grad_norm': 5.764148235321045, 'learning_rate': 4.91248097412481e-05, 'epoch': 0.05}


  2%|▏         | 2310/131400 [7:10:38<331:47:48,  9.25s/it]

{'loss': 6.2181, 'grad_norm': 5.111299991607666, 'learning_rate': 4.9121004566210047e-05, 'epoch': 0.05}


  2%|▏         | 2320/131400 [7:12:07<320:08:37,  8.93s/it]

{'loss': 6.3528, 'grad_norm': 4.807852268218994, 'learning_rate': 4.9117199391172e-05, 'epoch': 0.05}


  2%|▏         | 2330/131400 [7:13:37<326:07:19,  9.10s/it]

{'loss': 6.33, 'grad_norm': 4.213383674621582, 'learning_rate': 4.911339421613395e-05, 'epoch': 0.05}


  2%|▏         | 2340/131400 [7:15:12<332:33:24,  9.28s/it]

{'loss': 6.2069, 'grad_norm': 4.431093692779541, 'learning_rate': 4.9109589041095895e-05, 'epoch': 0.05}


  2%|▏         | 2350/131400 [7:16:37<302:15:18,  8.43s/it]

{'loss': 6.1883, 'grad_norm': 6.753552436828613, 'learning_rate': 4.910578386605784e-05, 'epoch': 0.05}


  2%|▏         | 2360/131400 [7:18:08<342:09:32,  9.55s/it]

{'loss': 6.2218, 'grad_norm': 5.322770595550537, 'learning_rate': 4.910197869101979e-05, 'epoch': 0.05}


  2%|▏         | 2370/131400 [7:19:35<309:19:26,  8.63s/it]

{'loss': 6.2885, 'grad_norm': 4.987555980682373, 'learning_rate': 4.909817351598174e-05, 'epoch': 0.05}


  2%|▏         | 2380/131400 [7:21:08<336:55:48,  9.40s/it]

{'loss': 6.2643, 'grad_norm': 3.965461254119873, 'learning_rate': 4.9094368340943685e-05, 'epoch': 0.05}


  2%|▏         | 2390/131400 [7:22:38<327:41:15,  9.14s/it]

{'loss': 6.3868, 'grad_norm': 6.903608322143555, 'learning_rate': 4.909056316590563e-05, 'epoch': 0.05}


  2%|▏         | 2400/131400 [7:24:18<397:25:16, 11.09s/it]

{'loss': 6.3228, 'grad_norm': 5.025815963745117, 'learning_rate': 4.908675799086758e-05, 'epoch': 0.05}


  2%|▏         | 2410/131400 [7:25:55<335:54:52,  9.38s/it]

{'loss': 6.3802, 'grad_norm': 5.280701160430908, 'learning_rate': 4.9082952815829534e-05, 'epoch': 0.06}


  2%|▏         | 2420/131400 [7:27:25<307:48:09,  8.59s/it]

{'loss': 6.2646, 'grad_norm': 4.93409538269043, 'learning_rate': 4.907914764079148e-05, 'epoch': 0.06}


  2%|▏         | 2430/131400 [7:28:50<310:08:41,  8.66s/it]

{'loss': 6.216, 'grad_norm': 4.5138983726501465, 'learning_rate': 4.907534246575343e-05, 'epoch': 0.06}


  2%|▏         | 2440/131400 [7:30:18<316:53:29,  8.85s/it]

{'loss': 6.2699, 'grad_norm': 5.8581223487854, 'learning_rate': 4.9071537290715376e-05, 'epoch': 0.06}


  2%|▏         | 2450/131400 [7:31:43<304:25:45,  8.50s/it]

{'loss': 6.2506, 'grad_norm': 4.955540180206299, 'learning_rate': 4.906773211567732e-05, 'epoch': 0.06}


  2%|▏         | 2460/131400 [7:33:09<295:58:50,  8.26s/it]

{'loss': 6.2347, 'grad_norm': 4.332690238952637, 'learning_rate': 4.906392694063927e-05, 'epoch': 0.06}


  2%|▏         | 2470/131400 [7:34:35<302:47:58,  8.45s/it]

{'loss': 6.1847, 'grad_norm': 4.988741397857666, 'learning_rate': 4.906012176560122e-05, 'epoch': 0.06}


  2%|▏         | 2480/131400 [7:36:06<319:20:55,  8.92s/it]

{'loss': 6.335, 'grad_norm': 3.6727936267852783, 'learning_rate': 4.9056316590563165e-05, 'epoch': 0.06}


  2%|▏         | 2490/131400 [7:37:34<321:42:38,  8.98s/it]

{'loss': 6.197, 'grad_norm': 4.760286808013916, 'learning_rate': 4.905251141552511e-05, 'epoch': 0.06}


  2%|▏         | 2500/131400 [7:39:05<328:33:28,  9.18s/it]

{'loss': 6.1684, 'grad_norm': 5.530391216278076, 'learning_rate': 4.904870624048707e-05, 'epoch': 0.06}


  2%|▏         | 2510/131400 [7:40:32<300:45:21,  8.40s/it]

{'loss': 6.204, 'grad_norm': 4.956892490386963, 'learning_rate': 4.9044901065449014e-05, 'epoch': 0.06}


  2%|▏         | 2520/131400 [7:42:02<332:31:47,  9.29s/it]

{'loss': 6.1989, 'grad_norm': 5.122632026672363, 'learning_rate': 4.904109589041096e-05, 'epoch': 0.06}


  2%|▏         | 2530/131400 [7:43:30<313:18:28,  8.75s/it]

{'loss': 6.2785, 'grad_norm': 6.4016571044921875, 'learning_rate': 4.903729071537291e-05, 'epoch': 0.06}


  2%|▏         | 2540/131400 [7:44:56<318:09:23,  8.89s/it]

{'loss': 6.3927, 'grad_norm': 5.0757904052734375, 'learning_rate': 4.9033485540334856e-05, 'epoch': 0.06}


  2%|▏         | 2550/131400 [7:46:34<330:17:24,  9.23s/it]

{'loss': 6.3294, 'grad_norm': 4.4092488288879395, 'learning_rate': 4.9029680365296804e-05, 'epoch': 0.06}


  2%|▏         | 2560/131400 [7:48:01<316:29:00,  8.84s/it]

{'loss': 6.2326, 'grad_norm': 4.090578079223633, 'learning_rate': 4.902587519025875e-05, 'epoch': 0.06}


  2%|▏         | 2570/131400 [7:49:27<304:28:19,  8.51s/it]

{'loss': 6.1428, 'grad_norm': 4.735541343688965, 'learning_rate': 4.90220700152207e-05, 'epoch': 0.06}


  2%|▏         | 2580/131400 [7:50:48<297:05:59,  8.30s/it]

{'loss': 6.2863, 'grad_norm': 4.969151496887207, 'learning_rate': 4.9018264840182646e-05, 'epoch': 0.06}


  2%|▏         | 2590/131400 [7:52:17<318:41:35,  8.91s/it]

{'loss': 6.2819, 'grad_norm': 5.556756973266602, 'learning_rate': 4.90144596651446e-05, 'epoch': 0.06}


  2%|▏         | 2600/131400 [7:53:39<290:38:37,  8.12s/it]

{'loss': 6.2363, 'grad_norm': 5.5328569412231445, 'learning_rate': 4.901065449010655e-05, 'epoch': 0.06}


  2%|▏         | 2610/131400 [7:55:03<304:47:08,  8.52s/it]

{'loss': 6.2181, 'grad_norm': 4.7072954177856445, 'learning_rate': 4.90068493150685e-05, 'epoch': 0.06}


  2%|▏         | 2620/131400 [7:56:28<312:47:22,  8.74s/it]

{'loss': 6.1994, 'grad_norm': 5.064584732055664, 'learning_rate': 4.900304414003045e-05, 'epoch': 0.06}


  2%|▏         | 2630/131400 [7:57:59<334:12:18,  9.34s/it]

{'loss': 6.2728, 'grad_norm': 4.304605007171631, 'learning_rate': 4.8999238964992396e-05, 'epoch': 0.06}


  2%|▏         | 2640/131400 [7:59:18<278:37:19,  7.79s/it]

{'loss': 6.1027, 'grad_norm': 4.779184818267822, 'learning_rate': 4.899543378995434e-05, 'epoch': 0.06}


  2%|▏         | 2650/131400 [8:00:42<305:10:12,  8.53s/it]

{'loss': 6.2141, 'grad_norm': 3.8799052238464355, 'learning_rate': 4.899162861491629e-05, 'epoch': 0.06}


  2%|▏         | 2660/131400 [8:02:04<284:08:50,  7.95s/it]

{'loss': 6.1997, 'grad_norm': 4.945228099822998, 'learning_rate': 4.898782343987824e-05, 'epoch': 0.06}


  2%|▏         | 2670/131400 [8:03:24<294:31:01,  8.24s/it]

{'loss': 6.2565, 'grad_norm': 4.160412311553955, 'learning_rate': 4.8984018264840185e-05, 'epoch': 0.06}


  2%|▏         | 2680/131400 [8:04:50<298:07:01,  8.34s/it]

{'loss': 6.1864, 'grad_norm': 5.608858108520508, 'learning_rate': 4.898021308980213e-05, 'epoch': 0.06}


  2%|▏         | 2690/131400 [8:06:08<268:58:48,  7.52s/it]

{'loss': 6.2021, 'grad_norm': 5.013757228851318, 'learning_rate': 4.897640791476408e-05, 'epoch': 0.06}


  2%|▏         | 2700/131400 [8:07:26<279:18:25,  7.81s/it]

{'loss': 6.1197, 'grad_norm': 6.325283050537109, 'learning_rate': 4.8972602739726034e-05, 'epoch': 0.06}


  2%|▏         | 2706/131400 [8:08:15<303:13:01,  8.48s/it]

KeyboardInterrupt: 

In [None]:
trainer.evaluate()

# Example of using the fine-tuned model for question answering
from transformers import pipeline

qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)

context = "Albert Einstein was a German-born theoretical physicist who developed the theory of relativity."
question = "Who was Albert Einstein?"

answer = qa_pipeline(question=question, context=context)

print(f"Question: {question}")
print(f"Answer: '{answer['answer']}' with confidence score {answer['score']:.2f}")
