In [1]:
import os
from datasets import load_dataset
path = os.path.join("..","dataset","tiny_stories","en-zh")
data = load_dataset("json",data_files={"train":os.path.join(path,"train.jsonl"),"test":os.path.join(path,"valid.jsonl")})

In [2]:
from transformers import MT5Tokenizer
from transformers import RobertaForSequenceClassification,RobertaTokenizer,get_linear_schedule_with_warmup
from datasets import load_dataset
from torch.utils.data import Dataset
import os
import random
import numpy as np
import evaluate
import torch
import argparse
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.functional import F
from torch.cuda.amp import autocast as autocast,GradScaler
import pandas as pd
from sklearn.metrics import f1_score
from torch.optim import AdamW
import bleu


In [3]:
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
tokenizer.model_max_length = 512

In [5]:
from transformers import MT5ForConditionalGeneration

model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small",)

In [6]:
def preprocess_function(examples):
    inputs = ["Translate English into Chinese:" + example['text'] for example in examples['src_text']]
    tgts = [ example for example in examples['tgt_text'] ]
    model_inputs = tokenizer(inputs,text_target=tgts,padding="max_length",truncation=True)  
    return model_inputs

In [7]:
tokenzied_data = data.load_from_disk('tokenzied_data')
#tokenzied_data = data.map(preprocess_function,batched=True,remove_columns=['src_text','tgt_text'])

In [8]:
tokenzied_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [9]:
tokenizer.decode(tokenzied_data['train'][0]['input_ids'])

'Translate English into Chinese:Once upon a time, there was a little girl named Lily. She loved to talk to her friends, and she loved to play with her toys. One day, she found a lemon in her kitchen. Lily thought the lemon was a toy, so she took it to her room to play with it. Lily started to feel helpless because the lemon was not fun like her other toys. She tried to talk to the lemon, but it did not talk back. Lily was sad and didn\'t know what to do. Then, something unexpected happened. The lemon started to grow bigger and bigger! Suddenly, the lemon turned into a big, friendly lemon man. He could talk, and he was not helpless at all! He said, "Hi Lily, I am Mr. Lemon. I was hiding in your kitchen, and now I am here to play with you." Lily was so happy and surprised. They played together all day long, and Lily had a new, fun friend.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

In [10]:
tokenizer.decode(tokenzied_data['train'][0]['labels'])

'从前,有一个叫莉莉的小女孩。她喜欢和她的朋友聊天,她也喜欢玩她的玩具。一天,她在厨房里发现了一个柠檬。莉莉以为柠檬是玩具,所以她把它带到房间里玩。莉莉开始感到无助,因为柠檬不像她的其他玩具那样有趣。她试着和柠檬说话,但它不顶嘴。莉莉很难过,不知道该怎么办。然后,意想不到的事情发生了。柠檬开始变得越来越大!突然,柠檬变成了一个又大又友好的柠檬人。他会说话,一点也不无助!他说:“嗨,莉莉,我是柠檬先生。我躲在你的厨房里,现在我来和你一起玩。”莉莉既高兴又惊讶。他们整天都在一起玩,莉莉有了一个有趣的新朋友。</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

In [11]:
def collate_fn(examples):
    return tokenizer.pad(examples, padding="max_length", return_tensors="pt")

In [12]:
train_batch_size = 4
eval_batch_size = 2
train_dataloader = DataLoader(tokenzied_data['train'],collate_fn=collate_fn,batch_size = train_batch_size)
eval_dataloader = DataLoader(tokenzied_data['test'] , collate_fn=collate_fn,batch_size = eval_batch_size)

In [13]:
num_epochs = 1
lr = 1e-5
optimizer = AdamW(params=model.parameters(), lr=lr)

# Instantiate scheduler

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)


In [14]:
device = "cuda"
model.to(device)
max_eval_codebleu = 0
iter_to_accumlate = 4
epochloss = []
for epoch in range(num_epochs):
    model.train()
    allloss = 0
    for step,batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss/iter_to_accumlate
        loss.backward()
        allloss += loss.item()
        epochloss.append(loss.item())
        if (step+1)%iter_to_accumlate==0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        if (step+1)%(4*iter_to_accumlate) == 0:
            print("epoch",epoch,"step",step,"loss",loss,sep=" ")
    print("epoch",epoch,"trainLoss:",allloss/(len(train_dataloader)*train_batch_size))
    
    count = 0
    model.eval()
    validloss = []
    all_me = []
    for step,batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)
        with torch.no_grad():
            output = model.generate(**batch)
        # validloss += [output.loss.item()]
        references = []
        hypothesis = []
        label_strs = tokenizer.batch_decode(batch['labels'].cpu(),skip_special_tokens = True)
        output_strs = tokenizer.batch_decode(output.cpu())
        for i in range(len(label_strs)):
            all_me.append(bleu.sentence_bleu(references= [label_strs[i].split()], hypothesis=output_strs[i].split()))
        print(label_strs,output_strs,all_me)
    eval_bleu = sum(all_me)/len(all_me)
    print("epoch ",epoch,"bleu ",eval_bleu)
    if eval_bleu > max_eval_codebleu:
        min_eval_codebleu = eval_bleu
        model.save_pretrained("Jchew/mT5ForTranslate_English2Chinese")
        model.push_to_hub("Jchew/mT5ForTranslate_English2Chinese")
        tokenizer.push_to_hub("Jchew/mT5ForTranslate_English2Chinese")

  0%|          | 16/125000 [00:04<8:11:28,  4.24it/s]

epoch 0 step 15 loss tensor(11.5027, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 32/125000 [00:07<8:13:09,  4.22it/s]

epoch 0 step 31 loss tensor(12.4297, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 48/125000 [00:11<8:20:47,  4.16it/s]

epoch 0 step 47 loss tensor(12.9952, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 64/125000 [00:15<8:21:38,  4.15it/s]

epoch 0 step 63 loss tensor(12.8109, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 80/125000 [00:19<8:20:29,  4.16it/s]

epoch 0 step 79 loss tensor(12.6205, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 96/125000 [00:22<8:17:44,  4.18it/s]

epoch 0 step 95 loss tensor(12.7459, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 112/125000 [00:26<8:17:40,  4.18it/s]

epoch 0 step 111 loss tensor(11.8303, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 128/125000 [00:30<8:16:25,  4.19it/s]

epoch 0 step 127 loss tensor(13.1457, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 144/125000 [00:33<8:18:01,  4.18it/s]

epoch 0 step 143 loss tensor(13.2817, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 160/125000 [00:37<8:17:02,  4.19it/s]

epoch 0 step 159 loss tensor(12.1787, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 176/125000 [00:41<8:20:03,  4.16it/s]

epoch 0 step 175 loss tensor(12.5181, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 192/125000 [00:45<8:24:26,  4.12it/s]

epoch 0 step 191 loss tensor(13.3451, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 208/125000 [00:48<8:25:29,  4.11it/s]

epoch 0 step 207 loss tensor(12.8580, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 224/125000 [00:52<8:23:02,  4.13it/s]

epoch 0 step 223 loss tensor(11.9465, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 240/125000 [00:56<8:20:40,  4.15it/s]

epoch 0 step 239 loss tensor(13.6040, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 256/125000 [01:00<8:20:30,  4.15it/s]

epoch 0 step 255 loss tensor(12.7260, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 272/125000 [01:03<8:21:18,  4.15it/s]

epoch 0 step 271 loss tensor(12.6769, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 288/125000 [01:07<8:21:56,  4.14it/s]

epoch 0 step 287 loss tensor(12.8504, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 304/125000 [01:11<8:22:00,  4.14it/s]

epoch 0 step 303 loss tensor(12.9200, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 320/125000 [01:14<8:22:20,  4.14it/s]

epoch 0 step 319 loss tensor(13.0444, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 336/125000 [01:18<8:26:16,  4.10it/s]

epoch 0 step 335 loss tensor(12.5533, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 352/125000 [01:22<8:25:51,  4.11it/s]

epoch 0 step 351 loss tensor(12.8176, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 368/125000 [01:26<8:24:17,  4.12it/s]

epoch 0 step 367 loss tensor(12.5836, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 384/125000 [01:30<8:24:56,  4.11it/s]

epoch 0 step 383 loss tensor(12.9389, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 400/125000 [01:33<8:22:23,  4.13it/s]

epoch 0 step 399 loss tensor(13.1447, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 416/125000 [01:37<8:23:25,  4.12it/s]

epoch 0 step 415 loss tensor(12.1788, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 432/125000 [01:41<8:24:24,  4.12it/s]

epoch 0 step 431 loss tensor(13.7213, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 448/125000 [01:45<8:22:27,  4.13it/s]

epoch 0 step 447 loss tensor(12.2419, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 464/125000 [01:48<8:23:27,  4.12it/s]

epoch 0 step 463 loss tensor(11.8498, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 480/125000 [01:52<8:28:54,  4.08it/s]

epoch 0 step 479 loss tensor(12.5210, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 496/125000 [01:56<8:28:25,  4.08it/s]

epoch 0 step 495 loss tensor(11.4576, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 512/125000 [02:00<8:26:11,  4.10it/s]

epoch 0 step 511 loss tensor(12.1299, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 528/125000 [02:04<8:23:55,  4.12it/s]

epoch 0 step 527 loss tensor(12.9011, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 544/125000 [02:07<8:26:41,  4.09it/s]

epoch 0 step 543 loss tensor(13.4753, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 560/125000 [02:11<8:24:40,  4.11it/s]

epoch 0 step 559 loss tensor(13.4340, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 576/125000 [02:15<8:24:03,  4.11it/s]

epoch 0 step 575 loss tensor(13.5512, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 592/125000 [02:19<8:23:48,  4.12it/s]

epoch 0 step 591 loss tensor(12.4486, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 608/125000 [02:22<8:23:35,  4.12it/s]

epoch 0 step 607 loss tensor(11.7728, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 624/125000 [02:26<8:19:25,  4.15it/s]

epoch 0 step 623 loss tensor(12.1400, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 640/125000 [02:30<8:28:03,  4.08it/s]

epoch 0 step 639 loss tensor(12.3527, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 656/125000 [02:34<8:28:21,  4.08it/s]

epoch 0 step 655 loss tensor(12.5913, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 672/125000 [02:38<8:26:36,  4.09it/s]

epoch 0 step 671 loss tensor(12.6791, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 688/125000 [02:41<8:24:36,  4.11it/s]

epoch 0 step 687 loss tensor(12.5738, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 704/125000 [02:45<8:22:59,  4.12it/s]

epoch 0 step 703 loss tensor(13.2602, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 720/125000 [02:49<8:22:57,  4.12it/s]

epoch 0 step 719 loss tensor(12.8335, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 736/125000 [02:53<8:23:50,  4.11it/s]

epoch 0 step 735 loss tensor(12.7655, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 752/125000 [02:56<8:25:06,  4.10it/s]

epoch 0 step 751 loss tensor(12.2426, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 768/125000 [03:00<8:27:55,  4.08it/s]

epoch 0 step 767 loss tensor(13.0616, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 784/125000 [03:04<8:26:06,  4.09it/s]

epoch 0 step 783 loss tensor(13.3657, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 800/125000 [03:08<8:24:38,  4.10it/s]

epoch 0 step 799 loss tensor(12.7331, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 816/125000 [03:12<8:23:25,  4.11it/s]

epoch 0 step 815 loss tensor(12.3104, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 832/125000 [03:15<8:23:29,  4.11it/s]

epoch 0 step 831 loss tensor(13.3942, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 848/125000 [03:19<8:24:47,  4.10it/s]

epoch 0 step 847 loss tensor(13.1844, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 864/125000 [03:23<8:22:37,  4.12it/s]

epoch 0 step 863 loss tensor(11.4514, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 880/125000 [03:27<8:22:31,  4.12it/s]

epoch 0 step 879 loss tensor(12.3390, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 896/125000 [03:30<8:26:58,  4.08it/s]

epoch 0 step 895 loss tensor(13.5072, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 912/125000 [03:34<8:26:31,  4.08it/s]

epoch 0 step 911 loss tensor(13.2961, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 928/125000 [03:38<8:26:47,  4.08it/s]

epoch 0 step 927 loss tensor(12.5026, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 944/125000 [03:42<8:27:47,  4.07it/s]

epoch 0 step 943 loss tensor(12.0858, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 960/125000 [03:46<8:23:12,  4.11it/s]

epoch 0 step 959 loss tensor(13.3813, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 976/125000 [03:49<8:24:09,  4.10it/s]

epoch 0 step 975 loss tensor(13.6859, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 992/125000 [03:53<8:24:49,  4.09it/s]

epoch 0 step 991 loss tensor(12.5946, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1008/125000 [03:57<8:24:12,  4.10it/s]

epoch 0 step 1007 loss tensor(12.6950, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1024/125000 [04:01<8:30:26,  4.05it/s]

epoch 0 step 1023 loss tensor(12.6162, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1040/125000 [04:05<8:26:20,  4.08it/s]

epoch 0 step 1039 loss tensor(12.1344, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1056/125000 [04:08<8:25:45,  4.08it/s]

epoch 0 step 1055 loss tensor(11.9065, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1072/125000 [04:12<8:23:50,  4.10it/s]

epoch 0 step 1071 loss tensor(12.8341, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1088/125000 [04:16<8:23:19,  4.10it/s]

epoch 0 step 1087 loss tensor(12.7030, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1104/125000 [04:20<8:24:07,  4.10it/s]

epoch 0 step 1103 loss tensor(12.3920, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1120/125000 [04:24<8:22:39,  4.11it/s]

epoch 0 step 1119 loss tensor(12.9708, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1136/125000 [04:27<8:23:09,  4.10it/s]

epoch 0 step 1135 loss tensor(12.5585, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1152/125000 [04:31<8:27:21,  4.07it/s]

epoch 0 step 1151 loss tensor(12.8016, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1168/125000 [04:35<8:30:23,  4.04it/s]

epoch 0 step 1167 loss tensor(11.6934, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1184/125000 [04:39<8:28:46,  4.06it/s]

epoch 0 step 1183 loss tensor(12.2508, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1200/125000 [04:43<8:27:06,  4.07it/s]

epoch 0 step 1199 loss tensor(12.9375, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1216/125000 [04:46<8:26:11,  4.08it/s]

epoch 0 step 1215 loss tensor(13.3048, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1232/125000 [04:50<8:22:11,  4.11it/s]

epoch 0 step 1231 loss tensor(11.9325, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1248/125000 [04:54<8:23:00,  4.10it/s]

epoch 0 step 1247 loss tensor(12.2284, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1264/125000 [04:58<8:21:32,  4.11it/s]

epoch 0 step 1263 loss tensor(12.2249, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1280/125000 [05:02<8:22:45,  4.10it/s]

epoch 0 step 1279 loss tensor(12.9202, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1296/125000 [05:05<8:22:25,  4.10it/s]

epoch 0 step 1295 loss tensor(12.8864, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1312/125000 [05:09<8:23:28,  4.09it/s]

epoch 0 step 1311 loss tensor(12.9231, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1328/125000 [05:13<8:32:13,  4.02it/s]

epoch 0 step 1327 loss tensor(11.9628, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1344/125000 [05:17<8:31:08,  4.03it/s]

epoch 0 step 1343 loss tensor(12.0762, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1360/125000 [05:21<8:29:18,  4.05it/s]

epoch 0 step 1359 loss tensor(12.3671, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1376/125000 [05:24<8:25:39,  4.07it/s]

epoch 0 step 1375 loss tensor(12.2712, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1392/125000 [05:28<8:23:25,  4.09it/s]

epoch 0 step 1391 loss tensor(12.5271, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1408/125000 [05:32<8:23:02,  4.09it/s]

epoch 0 step 1407 loss tensor(12.3448, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1424/125000 [05:36<8:21:46,  4.10it/s]

epoch 0 step 1423 loss tensor(12.1666, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1440/125000 [05:40<8:21:07,  4.11it/s]

epoch 0 step 1439 loss tensor(12.8103, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1456/125000 [05:43<8:21:23,  4.11it/s]

epoch 0 step 1455 loss tensor(12.9040, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1472/125000 [05:47<8:24:15,  4.08it/s]

epoch 0 step 1471 loss tensor(13.1942, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1488/125000 [05:51<8:27:03,  4.06it/s]

epoch 0 step 1487 loss tensor(12.8278, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1504/125000 [05:55<8:24:08,  4.08it/s]

epoch 0 step 1503 loss tensor(12.8156, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1520/125000 [05:59<8:23:13,  4.09it/s]

epoch 0 step 1519 loss tensor(11.6824, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1536/125000 [06:02<8:22:54,  4.09it/s]

epoch 0 step 1535 loss tensor(11.9514, device='cuda:0', grad_fn=<DivBackward0>)


  1%|          | 1552/125000 [06:06<8:22:11,  4.10it/s]

epoch 0 step 1551 loss tensor(12.8135, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1568/125000 [06:10<8:22:15,  4.10it/s]

epoch 0 step 1567 loss tensor(12.5012, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1584/125000 [06:14<8:21:49,  4.10it/s]

epoch 0 step 1583 loss tensor(12.5951, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1600/125000 [06:18<8:22:54,  4.09it/s]

epoch 0 step 1599 loss tensor(12.5673, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1616/125000 [06:21<8:25:53,  4.06it/s]

epoch 0 step 1615 loss tensor(13.3708, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1632/125000 [06:25<8:24:11,  4.08it/s]

epoch 0 step 1631 loss tensor(12.6996, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1648/125000 [06:29<8:21:56,  4.10it/s]

epoch 0 step 1647 loss tensor(13.1432, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1664/125000 [06:33<8:22:28,  4.09it/s]

epoch 0 step 1663 loss tensor(12.1089, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1680/125000 [06:37<8:21:56,  4.09it/s]

epoch 0 step 1679 loss tensor(12.7116, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1696/125000 [06:40<8:21:57,  4.09it/s]

epoch 0 step 1695 loss tensor(12.3021, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1712/125000 [06:44<8:21:32,  4.10it/s]

epoch 0 step 1711 loss tensor(12.8485, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1728/125000 [06:48<8:19:56,  4.11it/s]

epoch 0 step 1727 loss tensor(12.9348, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1744/125000 [06:52<8:20:49,  4.10it/s]

epoch 0 step 1743 loss tensor(12.5586, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1760/125000 [06:56<8:30:28,  4.02it/s]

epoch 0 step 1759 loss tensor(12.4278, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1776/125000 [06:59<8:29:06,  4.03it/s]

epoch 0 step 1775 loss tensor(12.6786, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1792/125000 [07:03<8:27:31,  4.05it/s]

epoch 0 step 1791 loss tensor(12.0947, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1808/125000 [07:07<8:23:20,  4.08it/s]

epoch 0 step 1807 loss tensor(12.2646, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1824/125000 [07:11<8:23:37,  4.08it/s]

epoch 0 step 1823 loss tensor(11.8744, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1840/125000 [07:15<8:19:45,  4.11it/s]

epoch 0 step 1839 loss tensor(11.0365, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1856/125000 [07:18<8:19:42,  4.11it/s]

epoch 0 step 1855 loss tensor(12.8202, device='cuda:0', grad_fn=<DivBackward0>)


  1%|▏         | 1872/125000 [07:22<8:21:55,  4.09it/s]

epoch 0 step 1871 loss tensor(11.9889, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 1888/125000 [07:26<8:20:45,  4.10it/s]

epoch 0 step 1887 loss tensor(12.0744, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 1904/125000 [07:30<8:19:43,  4.11it/s]

epoch 0 step 1903 loss tensor(12.1608, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 1920/125000 [07:34<8:21:27,  4.09it/s]

epoch 0 step 1919 loss tensor(11.9141, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 1936/125000 [07:37<8:27:53,  4.04it/s]

epoch 0 step 1935 loss tensor(12.8822, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 1952/125000 [07:41<8:26:49,  4.05it/s]

epoch 0 step 1951 loss tensor(12.1857, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 1968/125000 [07:45<8:23:56,  4.07it/s]

epoch 0 step 1967 loss tensor(11.2027, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 1984/125000 [07:49<8:23:55,  4.07it/s]

epoch 0 step 1983 loss tensor(11.7540, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2000/125000 [07:53<8:21:58,  4.08it/s]

epoch 0 step 1999 loss tensor(12.5960, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2016/125000 [07:56<8:20:31,  4.10it/s]

epoch 0 step 2015 loss tensor(12.7236, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2032/125000 [08:00<8:20:33,  4.09it/s]

epoch 0 step 2031 loss tensor(12.3185, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2048/125000 [08:04<8:19:37,  4.10it/s]

epoch 0 step 2047 loss tensor(13.1886, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2064/125000 [08:08<8:20:24,  4.09it/s]

epoch 0 step 2063 loss tensor(11.8851, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2080/125000 [08:12<8:23:48,  4.07it/s]

epoch 0 step 2079 loss tensor(11.8934, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2096/125000 [08:15<8:26:08,  4.05it/s]

epoch 0 step 2095 loss tensor(12.5737, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2112/125000 [08:19<8:23:23,  4.07it/s]

epoch 0 step 2111 loss tensor(12.3014, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2128/125000 [08:23<8:22:39,  4.07it/s]

epoch 0 step 2127 loss tensor(12.8635, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2144/125000 [08:27<8:23:02,  4.07it/s]

epoch 0 step 2143 loss tensor(12.5295, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2160/125000 [08:31<8:19:37,  4.10it/s]

epoch 0 step 2159 loss tensor(12.2564, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2176/125000 [08:34<8:19:29,  4.10it/s]

epoch 0 step 2175 loss tensor(12.1608, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2192/125000 [08:38<8:18:55,  4.10it/s]

epoch 0 step 2191 loss tensor(12.5283, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2208/125000 [08:42<8:21:39,  4.08it/s]

epoch 0 step 2207 loss tensor(12.9381, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2224/125000 [08:46<8:24:14,  4.06it/s]

epoch 0 step 2223 loss tensor(12.3401, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2240/125000 [08:50<8:22:46,  4.07it/s]

epoch 0 step 2239 loss tensor(11.6120, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2256/125000 [08:53<8:20:53,  4.08it/s]

epoch 0 step 2255 loss tensor(12.0415, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2272/125000 [08:57<8:18:06,  4.11it/s]

epoch 0 step 2271 loss tensor(12.6995, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2288/125000 [09:01<8:18:30,  4.10it/s]

epoch 0 step 2287 loss tensor(11.9858, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2304/125000 [09:05<8:18:48,  4.10it/s]

epoch 0 step 2303 loss tensor(12.6568, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2320/125000 [09:09<8:22:28,  4.07it/s]

epoch 0 step 2319 loss tensor(11.5137, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2336/125000 [09:12<8:21:05,  4.08it/s]

epoch 0 step 2335 loss tensor(12.1766, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2352/125000 [09:16<8:20:56,  4.08it/s]

epoch 0 step 2351 loss tensor(12.5826, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2368/125000 [09:20<8:17:49,  4.11it/s]

epoch 0 step 2367 loss tensor(11.2791, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2384/125000 [09:24<8:18:14,  4.10it/s]

epoch 0 step 2383 loss tensor(12.1534, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2400/125000 [09:28<8:18:11,  4.10it/s]

epoch 0 step 2399 loss tensor(12.4162, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2416/125000 [09:31<8:22:46,  4.06it/s]

epoch 0 step 2415 loss tensor(11.7970, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2432/125000 [09:35<8:21:58,  4.07it/s]

epoch 0 step 2431 loss tensor(12.5949, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2448/125000 [09:39<8:20:08,  4.08it/s]

epoch 0 step 2447 loss tensor(12.6939, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2464/125000 [09:43<8:18:58,  4.09it/s]

epoch 0 step 2463 loss tensor(12.5377, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2480/125000 [09:47<8:19:07,  4.09it/s]

epoch 0 step 2479 loss tensor(12.0750, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2496/125000 [09:50<8:19:59,  4.08it/s]

epoch 0 step 2495 loss tensor(12.2158, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2512/125000 [09:54<8:18:34,  4.09it/s]

epoch 0 step 2511 loss tensor(12.3487, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2528/125000 [09:58<8:23:50,  4.05it/s]

epoch 0 step 2527 loss tensor(12.0805, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2544/125000 [10:02<8:25:39,  4.04it/s]

epoch 0 step 2543 loss tensor(11.9338, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2560/125000 [10:06<8:23:48,  4.05it/s]

epoch 0 step 2559 loss tensor(12.6782, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2576/125000 [10:09<8:20:41,  4.08it/s]

epoch 0 step 2575 loss tensor(11.2725, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2592/125000 [10:13<8:28:26,  4.01it/s]

epoch 0 step 2591 loss tensor(12.1696, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2608/125000 [10:17<8:23:25,  4.05it/s]

epoch 0 step 2607 loss tensor(12.0958, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2624/125000 [10:21<8:17:36,  4.10it/s]

epoch 0 step 2623 loss tensor(12.8649, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2640/125000 [10:25<8:18:54,  4.09it/s]

epoch 0 step 2639 loss tensor(12.3023, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2656/125000 [10:28<8:18:49,  4.09it/s]

epoch 0 step 2655 loss tensor(12.6541, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2672/125000 [10:32<8:11:47,  4.15it/s]

epoch 0 step 2671 loss tensor(11.9326, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2688/125000 [10:36<8:22:47,  4.05it/s]

epoch 0 step 2687 loss tensor(12.4862, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2704/125000 [10:40<8:22:54,  4.05it/s]

epoch 0 step 2703 loss tensor(11.8174, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2720/125000 [10:44<8:20:37,  4.07it/s]

epoch 0 step 2719 loss tensor(12.0414, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2736/125000 [10:47<8:19:02,  4.08it/s]

epoch 0 step 2735 loss tensor(10.8836, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2752/125000 [10:51<8:16:35,  4.10it/s]

epoch 0 step 2751 loss tensor(12.5953, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2768/125000 [10:55<8:16:59,  4.10it/s]

epoch 0 step 2767 loss tensor(12.4740, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2784/125000 [10:59<8:18:01,  4.09it/s]

epoch 0 step 2783 loss tensor(11.9897, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2800/125000 [11:03<8:16:10,  4.10it/s]

epoch 0 step 2799 loss tensor(11.6697, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2816/125000 [11:06<8:17:09,  4.10it/s]

epoch 0 step 2815 loss tensor(12.2829, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2832/125000 [11:10<8:18:49,  4.08it/s]

epoch 0 step 2831 loss tensor(11.6090, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2848/125000 [11:14<8:26:17,  4.02it/s]

epoch 0 step 2847 loss tensor(12.0368, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2864/125000 [11:18<8:23:30,  4.04it/s]

epoch 0 step 2863 loss tensor(11.9263, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2880/125000 [11:22<8:21:35,  4.06it/s]

epoch 0 step 2879 loss tensor(11.5118, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2896/125000 [11:25<8:19:46,  4.07it/s]

epoch 0 step 2895 loss tensor(11.3960, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2912/125000 [11:29<8:18:43,  4.08it/s]

epoch 0 step 2911 loss tensor(12.0723, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2928/125000 [11:33<8:16:32,  4.10it/s]

epoch 0 step 2927 loss tensor(11.2139, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2944/125000 [11:37<8:18:10,  4.08it/s]

epoch 0 step 2943 loss tensor(11.5814, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2960/125000 [11:41<8:15:13,  4.11it/s]

epoch 0 step 2959 loss tensor(13.1135, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2976/125000 [11:44<8:17:19,  4.09it/s]

epoch 0 step 2975 loss tensor(12.7056, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 2992/125000 [11:48<8:21:46,  4.05it/s]

epoch 0 step 2991 loss tensor(11.8664, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 3008/125000 [11:52<8:20:10,  4.06it/s]

epoch 0 step 3007 loss tensor(11.0522, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 3024/125000 [11:56<8:17:10,  4.09it/s]

epoch 0 step 3023 loss tensor(12.9563, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 3040/125000 [12:00<8:17:17,  4.09it/s]

epoch 0 step 3039 loss tensor(12.0155, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 3056/125000 [12:03<8:16:02,  4.10it/s]

epoch 0 step 3055 loss tensor(10.8327, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 3072/125000 [12:07<8:17:14,  4.09it/s]

epoch 0 step 3071 loss tensor(10.8202, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 3088/125000 [12:11<8:16:26,  4.09it/s]

epoch 0 step 3087 loss tensor(12.1514, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 3104/125000 [12:15<8:15:09,  4.10it/s]

epoch 0 step 3103 loss tensor(12.6313, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 3120/125000 [12:19<8:18:16,  4.08it/s]

epoch 0 step 3119 loss tensor(10.9801, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3136/125000 [12:22<8:22:39,  4.04it/s]

epoch 0 step 3135 loss tensor(12.0838, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3152/125000 [12:26<8:21:07,  4.05it/s]

epoch 0 step 3151 loss tensor(12.0798, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3168/125000 [12:30<8:20:39,  4.06it/s]

epoch 0 step 3167 loss tensor(11.6134, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3184/125000 [12:34<8:18:38,  4.07it/s]

epoch 0 step 3183 loss tensor(12.5074, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3200/125000 [12:38<8:18:19,  4.07it/s]

epoch 0 step 3199 loss tensor(12.2519, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3216/125000 [12:42<8:16:38,  4.09it/s]

epoch 0 step 3215 loss tensor(12.0878, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3232/125000 [12:45<8:16:48,  4.08it/s]

epoch 0 step 3231 loss tensor(11.7754, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3248/125000 [12:49<8:14:36,  4.10it/s]

epoch 0 step 3247 loss tensor(13.0637, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3264/125000 [12:53<8:14:46,  4.10it/s]

epoch 0 step 3263 loss tensor(11.7444, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3280/125000 [12:57<8:18:54,  4.07it/s]

epoch 0 step 3279 loss tensor(12.1819, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3296/125000 [13:01<8:17:23,  4.08it/s]

epoch 0 step 3295 loss tensor(12.4741, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3312/125000 [13:04<8:16:31,  4.08it/s]

epoch 0 step 3311 loss tensor(11.6093, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3328/125000 [13:08<8:14:56,  4.10it/s]

epoch 0 step 3327 loss tensor(11.8194, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3344/125000 [13:12<8:15:17,  4.09it/s]

epoch 0 step 3343 loss tensor(11.6839, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3360/125000 [13:16<8:13:56,  4.10it/s]

epoch 0 step 3359 loss tensor(12.0347, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3376/125000 [13:19<8:14:44,  4.10it/s]

epoch 0 step 3375 loss tensor(12.7650, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3392/125000 [13:23<8:14:35,  4.10it/s]

epoch 0 step 3391 loss tensor(11.0819, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3408/125000 [13:27<8:21:52,  4.04it/s]

epoch 0 step 3407 loss tensor(12.0781, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3424/125000 [13:31<8:20:25,  4.05it/s]

epoch 0 step 3423 loss tensor(11.7414, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3440/125000 [13:35<8:20:08,  4.05it/s]

epoch 0 step 3439 loss tensor(12.0199, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3456/125000 [13:39<8:16:34,  4.08it/s]

epoch 0 step 3455 loss tensor(11.2936, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3472/125000 [13:42<8:15:26,  4.09it/s]

epoch 0 step 3471 loss tensor(11.1193, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3488/125000 [13:46<8:13:37,  4.10it/s]

epoch 0 step 3487 loss tensor(11.6273, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3504/125000 [13:50<8:15:04,  4.09it/s]

epoch 0 step 3503 loss tensor(12.4942, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3520/125000 [13:54<8:14:12,  4.10it/s]

epoch 0 step 3519 loss tensor(11.7492, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3536/125000 [13:57<8:15:20,  4.09it/s]

epoch 0 step 3535 loss tensor(11.6477, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3552/125000 [14:01<8:12:52,  4.11it/s]

epoch 0 step 3551 loss tensor(12.2779, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3568/125000 [14:05<8:20:36,  4.04it/s]

epoch 0 step 3567 loss tensor(11.9373, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3584/125000 [14:09<8:19:10,  4.05it/s]

epoch 0 step 3583 loss tensor(11.7752, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3600/125000 [14:13<8:18:53,  4.06it/s]

epoch 0 step 3599 loss tensor(11.2889, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3616/125000 [14:17<8:15:44,  4.08it/s]

epoch 0 step 3615 loss tensor(11.3931, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3632/125000 [14:20<8:14:31,  4.09it/s]

epoch 0 step 3631 loss tensor(12.2817, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3648/125000 [14:24<8:13:17,  4.10it/s]

epoch 0 step 3647 loss tensor(12.1171, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3664/125000 [14:28<8:13:57,  4.09it/s]

epoch 0 step 3663 loss tensor(11.1532, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3680/125000 [14:32<8:12:10,  4.11it/s]

epoch 0 step 3679 loss tensor(11.1796, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3696/125000 [14:36<8:17:23,  4.06it/s]

epoch 0 step 3695 loss tensor(12.5707, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3712/125000 [14:39<8:16:26,  4.07it/s]

epoch 0 step 3711 loss tensor(11.5512, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3728/125000 [14:43<8:12:57,  4.10it/s]

epoch 0 step 3727 loss tensor(11.8228, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3744/125000 [14:47<8:13:00,  4.10it/s]

epoch 0 step 3743 loss tensor(11.6966, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3760/125000 [14:51<8:13:48,  4.09it/s]

epoch 0 step 3759 loss tensor(11.3787, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3776/125000 [14:55<8:12:35,  4.10it/s]

epoch 0 step 3775 loss tensor(11.3021, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3792/125000 [14:58<8:17:16,  4.06it/s]

epoch 0 step 3791 loss tensor(12.4514, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3808/125000 [15:02<8:15:55,  4.07it/s]

epoch 0 step 3807 loss tensor(11.5979, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3824/125000 [15:06<8:17:02,  4.06it/s]

epoch 0 step 3823 loss tensor(11.8795, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3840/125000 [15:10<8:11:51,  4.11it/s]

epoch 0 step 3839 loss tensor(11.3230, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3856/125000 [15:13<8:13:27,  4.09it/s]

epoch 0 step 3855 loss tensor(11.3248, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3872/125000 [15:17<8:14:02,  4.09it/s]

epoch 0 step 3871 loss tensor(11.2963, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3888/125000 [15:21<8:16:53,  4.06it/s]

epoch 0 step 3887 loss tensor(11.3954, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3904/125000 [15:25<8:16:43,  4.06it/s]

epoch 0 step 3903 loss tensor(12.4913, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3920/125000 [15:29<8:16:54,  4.06it/s]

epoch 0 step 3919 loss tensor(12.0767, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3936/125000 [15:33<8:14:19,  4.08it/s]

epoch 0 step 3935 loss tensor(11.5729, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3952/125000 [15:36<8:13:09,  4.09it/s]

epoch 0 step 3951 loss tensor(11.4137, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3968/125000 [15:40<8:12:26,  4.10it/s]

epoch 0 step 3967 loss tensor(12.0083, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 3984/125000 [15:44<8:13:17,  4.09it/s]

epoch 0 step 3983 loss tensor(11.4562, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4000/125000 [15:48<8:15:52,  4.07it/s]

epoch 0 step 3999 loss tensor(10.2476, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4016/125000 [15:52<8:14:14,  4.08it/s]

epoch 0 step 4015 loss tensor(11.8995, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4032/125000 [15:55<8:13:53,  4.08it/s]

epoch 0 step 4031 loss tensor(12.3609, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4048/125000 [15:59<8:13:07,  4.09it/s]

epoch 0 step 4047 loss tensor(11.6274, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4064/125000 [16:03<8:10:53,  4.11it/s]

epoch 0 step 4063 loss tensor(11.8747, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4080/125000 [16:07<8:12:36,  4.09it/s]

epoch 0 step 4079 loss tensor(11.2445, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4096/125000 [16:10<8:13:26,  4.08it/s]

epoch 0 step 4095 loss tensor(11.4456, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4112/125000 [16:14<8:11:23,  4.10it/s]

epoch 0 step 4111 loss tensor(11.7758, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4128/125000 [16:18<8:11:19,  4.10it/s]

epoch 0 step 4127 loss tensor(11.3449, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4144/125000 [16:22<8:18:40,  4.04it/s]

epoch 0 step 4143 loss tensor(11.1504, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4160/125000 [16:26<8:18:38,  4.04it/s]

epoch 0 step 4159 loss tensor(11.7960, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4176/125000 [16:30<8:16:08,  4.06it/s]

epoch 0 step 4175 loss tensor(12.4915, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4192/125000 [16:33<8:13:16,  4.08it/s]

epoch 0 step 4191 loss tensor(11.3011, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4208/125000 [16:37<8:13:44,  4.08it/s]

epoch 0 step 4207 loss tensor(11.4604, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4224/125000 [16:41<8:11:39,  4.09it/s]

epoch 0 step 4223 loss tensor(11.7978, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4240/125000 [16:45<8:11:55,  4.09it/s]

epoch 0 step 4239 loss tensor(10.2227, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4256/125000 [16:49<8:10:58,  4.10it/s]

epoch 0 step 4255 loss tensor(11.1537, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4272/125000 [16:52<8:12:48,  4.08it/s]

epoch 0 step 4271 loss tensor(11.4051, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4288/125000 [16:56<8:12:48,  4.08it/s]

epoch 0 step 4287 loss tensor(10.7635, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4304/125000 [17:00<8:11:07,  4.10it/s]

epoch 0 step 4303 loss tensor(12.7175, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4320/125000 [17:04<8:11:45,  4.09it/s]

epoch 0 step 4319 loss tensor(11.1491, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4336/125000 [17:08<8:11:00,  4.10it/s]

epoch 0 step 4335 loss tensor(11.5896, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4352/125000 [17:11<8:11:19,  4.09it/s]

epoch 0 step 4351 loss tensor(11.7712, device='cuda:0', grad_fn=<DivBackward0>)


  3%|▎         | 4368/125000 [17:15<8:12:41,  4.08it/s]

epoch 0 step 4367 loss tensor(10.8579, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4384/125000 [17:19<8:15:17,  4.06it/s]

epoch 0 step 4383 loss tensor(10.4771, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4400/125000 [17:23<8:15:46,  4.05it/s]

epoch 0 step 4399 loss tensor(11.1751, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4416/125000 [17:27<8:13:49,  4.07it/s]

epoch 0 step 4415 loss tensor(10.9871, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4432/125000 [17:30<8:12:58,  4.08it/s]

epoch 0 step 4431 loss tensor(11.5570, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4448/125000 [17:34<8:12:03,  4.08it/s]

epoch 0 step 4447 loss tensor(11.5688, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4464/125000 [17:38<8:09:36,  4.10it/s]

epoch 0 step 4463 loss tensor(11.7605, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4480/125000 [17:42<8:09:00,  4.11it/s]

epoch 0 step 4479 loss tensor(11.3232, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4496/125000 [17:46<8:10:36,  4.09it/s]

epoch 0 step 4495 loss tensor(10.7675, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4512/125000 [17:49<8:10:50,  4.09it/s]

epoch 0 step 4511 loss tensor(10.2793, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4528/125000 [17:53<8:15:13,  4.05it/s]

epoch 0 step 4527 loss tensor(10.8281, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4544/125000 [17:57<8:14:37,  4.06it/s]

epoch 0 step 4543 loss tensor(11.5521, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4560/125000 [18:01<8:12:20,  4.08it/s]

epoch 0 step 4559 loss tensor(10.9101, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4576/125000 [18:05<8:12:53,  4.07it/s]

epoch 0 step 4575 loss tensor(11.4304, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4592/125000 [18:08<8:09:46,  4.10it/s]

epoch 0 step 4591 loss tensor(12.3586, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4608/125000 [18:12<8:10:10,  4.09it/s]

epoch 0 step 4607 loss tensor(10.2323, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4624/125000 [18:16<8:10:12,  4.09it/s]

epoch 0 step 4623 loss tensor(9.9733, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4640/125000 [18:20<8:15:00,  4.05it/s]

epoch 0 step 4639 loss tensor(10.9348, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4656/125000 [18:24<8:13:40,  4.06it/s]

epoch 0 step 4655 loss tensor(10.9222, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 4672/125000 [18:27<8:12:55,  4.07it/s]

epoch 0 step 4671 loss tensor(11.4297, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4688/125000 [18:31<8:09:59,  4.09it/s]

epoch 0 step 4687 loss tensor(10.8826, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4704/125000 [18:35<8:09:08,  4.10it/s]

epoch 0 step 4703 loss tensor(10.8021, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4720/125000 [18:39<8:10:21,  4.09it/s]

epoch 0 step 4719 loss tensor(11.6560, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4736/125000 [18:43<8:09:45,  4.09it/s]

epoch 0 step 4735 loss tensor(10.4354, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4752/125000 [18:46<8:09:15,  4.10it/s]

epoch 0 step 4751 loss tensor(11.0448, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4768/125000 [18:50<8:13:54,  4.06it/s]

epoch 0 step 4767 loss tensor(11.2785, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4784/125000 [18:54<8:13:57,  4.06it/s]

epoch 0 step 4783 loss tensor(11.6003, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4800/125000 [18:58<8:09:32,  4.09it/s]

epoch 0 step 4799 loss tensor(11.6471, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4816/125000 [19:02<8:16:21,  4.04it/s]

epoch 0 step 4815 loss tensor(10.7194, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4832/125000 [19:05<8:11:06,  4.08it/s]

epoch 0 step 4831 loss tensor(10.6986, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4848/125000 [19:09<8:08:46,  4.10it/s]

epoch 0 step 4847 loss tensor(10.6500, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4864/125000 [19:13<8:08:51,  4.10it/s]

epoch 0 step 4863 loss tensor(11.2825, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4880/125000 [19:17<8:11:38,  4.07it/s]

epoch 0 step 4879 loss tensor(10.4457, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4896/125000 [19:21<8:09:29,  4.09it/s]

epoch 0 step 4895 loss tensor(10.7201, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4912/125000 [19:24<8:09:51,  4.09it/s]

epoch 0 step 4911 loss tensor(10.7812, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4928/125000 [19:28<8:08:47,  4.09it/s]

epoch 0 step 4927 loss tensor(12.1244, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4944/125000 [19:32<8:07:58,  4.10it/s]

epoch 0 step 4943 loss tensor(11.2869, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4960/125000 [19:36<8:10:43,  4.08it/s]

epoch 0 step 4959 loss tensor(11.1282, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4976/125000 [19:40<8:11:59,  4.07it/s]

epoch 0 step 4975 loss tensor(11.1654, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 4992/125000 [19:43<8:12:00,  4.07it/s]

epoch 0 step 4991 loss tensor(10.8057, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5008/125000 [19:47<8:11:07,  4.07it/s]

epoch 0 step 5007 loss tensor(11.3156, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5024/125000 [19:51<8:10:19,  4.08it/s]

epoch 0 step 5023 loss tensor(11.0995, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5040/125000 [19:55<8:09:12,  4.09it/s]

epoch 0 step 5039 loss tensor(10.6547, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5056/125000 [19:59<8:09:44,  4.08it/s]

epoch 0 step 5055 loss tensor(10.9705, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5072/125000 [20:02<8:07:52,  4.10it/s]

epoch 0 step 5071 loss tensor(10.3408, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5088/125000 [20:06<8:09:02,  4.09it/s]

epoch 0 step 5087 loss tensor(11.2416, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5104/125000 [20:10<8:06:33,  4.11it/s]

epoch 0 step 5103 loss tensor(10.2027, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5120/125000 [20:14<8:13:20,  4.05it/s]

epoch 0 step 5119 loss tensor(9.9717, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5136/125000 [20:18<8:12:26,  4.06it/s]

epoch 0 step 5135 loss tensor(10.4925, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5152/125000 [20:21<8:11:42,  4.06it/s]

epoch 0 step 5151 loss tensor(11.6181, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5168/125000 [20:25<8:09:26,  4.08it/s]

epoch 0 step 5167 loss tensor(10.5704, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5184/125000 [20:29<8:11:15,  4.06it/s]

epoch 0 step 5183 loss tensor(10.9337, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5200/125000 [20:33<8:06:39,  4.10it/s]

epoch 0 step 5199 loss tensor(11.2492, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5216/125000 [20:37<8:09:06,  4.08it/s]

epoch 0 step 5215 loss tensor(10.1399, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5232/125000 [20:40<8:08:06,  4.09it/s]

epoch 0 step 5231 loss tensor(9.9903, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5248/125000 [20:44<8:07:14,  4.10it/s]

epoch 0 step 5247 loss tensor(11.7302, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5264/125000 [20:48<8:13:12,  4.05it/s]

epoch 0 step 5263 loss tensor(10.4327, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5280/125000 [20:52<8:11:33,  4.06it/s]

epoch 0 step 5279 loss tensor(10.0941, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5296/125000 [20:56<8:10:49,  4.06it/s]

epoch 0 step 5295 loss tensor(10.9179, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5312/125000 [20:59<8:08:50,  4.08it/s]

epoch 0 step 5311 loss tensor(11.1648, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5328/125000 [21:03<8:10:30,  4.07it/s]

epoch 0 step 5327 loss tensor(10.8696, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5344/125000 [21:07<8:07:44,  4.09it/s]

epoch 0 step 5343 loss tensor(10.2451, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5360/125000 [21:11<8:07:02,  4.09it/s]

epoch 0 step 5359 loss tensor(10.7302, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5376/125000 [21:15<8:07:33,  4.09it/s]

epoch 0 step 5375 loss tensor(11.2102, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5392/125000 [21:18<8:06:22,  4.10it/s]

epoch 0 step 5391 loss tensor(10.5750, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5408/125000 [21:22<8:06:30,  4.10it/s]

epoch 0 step 5407 loss tensor(10.0004, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5424/125000 [21:26<8:12:49,  4.04it/s]

epoch 0 step 5423 loss tensor(11.0412, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5440/125000 [21:30<8:15:16,  4.02it/s]

epoch 0 step 5439 loss tensor(10.6832, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5456/125000 [21:34<8:10:52,  4.06it/s]

epoch 0 step 5455 loss tensor(10.9289, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5472/125000 [21:38<8:09:45,  4.07it/s]

epoch 0 step 5471 loss tensor(10.3824, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5488/125000 [21:41<8:08:26,  4.08it/s]

epoch 0 step 5487 loss tensor(10.2393, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5504/125000 [21:45<8:05:31,  4.10it/s]

epoch 0 step 5503 loss tensor(9.8080, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5520/125000 [21:49<8:06:00,  4.10it/s]

epoch 0 step 5519 loss tensor(11.2582, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5536/125000 [21:53<8:07:33,  4.08it/s]

epoch 0 step 5535 loss tensor(10.3384, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5552/125000 [21:57<8:06:02,  4.10it/s]

epoch 0 step 5551 loss tensor(10.6643, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5568/125000 [22:00<8:05:53,  4.10it/s]

epoch 0 step 5567 loss tensor(10.3444, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5584/125000 [22:04<8:07:24,  4.08it/s]

epoch 0 step 5583 loss tensor(10.7239, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5600/125000 [22:08<8:14:26,  4.02it/s]

epoch 0 step 5599 loss tensor(10.4704, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 5616/125000 [22:12<8:10:46,  4.05it/s]

epoch 0 step 5615 loss tensor(10.1535, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5632/125000 [22:16<8:10:15,  4.06it/s]

epoch 0 step 5631 loss tensor(10.7589, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5648/125000 [22:19<8:08:00,  4.08it/s]

epoch 0 step 5647 loss tensor(10.2085, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5664/125000 [22:23<8:06:25,  4.09it/s]

epoch 0 step 5663 loss tensor(10.2668, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5680/125000 [22:27<8:06:46,  4.09it/s]

epoch 0 step 5679 loss tensor(9.7079, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5696/125000 [22:31<8:05:48,  4.09it/s]

epoch 0 step 5695 loss tensor(10.2349, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5712/125000 [22:35<8:06:43,  4.08it/s]

epoch 0 step 5711 loss tensor(9.9901, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5728/125000 [22:38<8:06:31,  4.09it/s]

epoch 0 step 5727 loss tensor(10.1379, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5744/125000 [22:42<8:04:26,  4.10it/s]

epoch 0 step 5743 loss tensor(10.8534, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5760/125000 [22:46<8:10:12,  4.05it/s]

epoch 0 step 5759 loss tensor(10.5341, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5776/125000 [22:50<8:09:09,  4.06it/s]

epoch 0 step 5775 loss tensor(9.7013, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5792/125000 [22:54<8:08:35,  4.07it/s]

epoch 0 step 5791 loss tensor(10.7379, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5808/125000 [22:57<8:07:20,  4.08it/s]

epoch 0 step 5807 loss tensor(10.4073, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5824/125000 [23:01<8:05:17,  4.09it/s]

epoch 0 step 5823 loss tensor(10.6073, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5840/125000 [23:05<8:04:37,  4.10it/s]

epoch 0 step 5839 loss tensor(10.0815, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5856/125000 [23:09<8:06:52,  4.08it/s]

epoch 0 step 5855 loss tensor(10.5855, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5872/125000 [23:13<8:05:05,  4.09it/s]

epoch 0 step 5871 loss tensor(10.2618, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5888/125000 [23:16<8:04:12,  4.10it/s]

epoch 0 step 5887 loss tensor(10.2536, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5904/125000 [23:20<8:10:38,  4.05it/s]

epoch 0 step 5903 loss tensor(10.0705, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5920/125000 [23:24<8:10:21,  4.05it/s]

epoch 0 step 5919 loss tensor(10.4289, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5936/125000 [23:28<8:10:03,  4.05it/s]

epoch 0 step 5935 loss tensor(9.5450, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5952/125000 [23:32<8:07:23,  4.07it/s]

epoch 0 step 5951 loss tensor(10.2018, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5968/125000 [23:35<8:12:53,  4.02it/s]

epoch 0 step 5967 loss tensor(11.1623, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 5984/125000 [23:39<8:04:17,  4.10it/s]

epoch 0 step 5983 loss tensor(10.7856, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6000/125000 [23:43<8:05:03,  4.09it/s]

epoch 0 step 5999 loss tensor(9.6956, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6016/125000 [23:47<8:06:21,  4.08it/s]

epoch 0 step 6015 loss tensor(10.2910, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6032/125000 [23:51<8:06:46,  4.07it/s]

epoch 0 step 6031 loss tensor(10.3264, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6048/125000 [23:54<8:08:51,  4.06it/s]

epoch 0 step 6047 loss tensor(10.8431, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6064/125000 [23:58<8:06:56,  4.07it/s]

epoch 0 step 6063 loss tensor(10.1572, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6080/125000 [24:02<8:05:10,  4.09it/s]

epoch 0 step 6079 loss tensor(9.3980, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6096/125000 [24:06<8:03:47,  4.10it/s]

epoch 0 step 6095 loss tensor(9.5923, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6112/125000 [24:10<8:03:53,  4.09it/s]

epoch 0 step 6111 loss tensor(9.9971, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6128/125000 [24:13<8:03:40,  4.10it/s]

epoch 0 step 6127 loss tensor(10.5593, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6144/125000 [24:17<8:05:14,  4.08it/s]

epoch 0 step 6143 loss tensor(10.0937, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6160/125000 [24:21<8:10:02,  4.04it/s]

epoch 0 step 6159 loss tensor(9.3717, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6176/125000 [24:25<8:06:40,  4.07it/s]

epoch 0 step 6175 loss tensor(10.0124, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6192/125000 [24:29<8:04:31,  4.09it/s]

epoch 0 step 6191 loss tensor(9.5898, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6208/125000 [24:33<8:04:43,  4.08it/s]

epoch 0 step 6207 loss tensor(11.2191, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6224/125000 [24:36<8:03:35,  4.09it/s]

epoch 0 step 6223 loss tensor(9.5850, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▍         | 6240/125000 [24:40<8:03:36,  4.09it/s]

epoch 0 step 6239 loss tensor(10.4668, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6256/125000 [24:44<8:05:06,  4.08it/s]

epoch 0 step 6255 loss tensor(10.1379, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6272/125000 [24:48<8:06:56,  4.06it/s]

epoch 0 step 6271 loss tensor(9.9538, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6288/125000 [24:51<8:03:44,  4.09it/s]

epoch 0 step 6287 loss tensor(10.5909, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6304/125000 [24:55<8:02:45,  4.10it/s]

epoch 0 step 6303 loss tensor(9.6181, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6320/125000 [24:59<8:02:30,  4.10it/s]

epoch 0 step 6319 loss tensor(9.8686, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6336/125000 [25:03<8:02:13,  4.10it/s]

epoch 0 step 6335 loss tensor(9.5575, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6352/125000 [25:07<8:03:22,  4.09it/s]

epoch 0 step 6351 loss tensor(10.0354, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6368/125000 [25:10<8:02:07,  4.10it/s]

epoch 0 step 6367 loss tensor(9.5102, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6384/125000 [25:14<8:07:31,  4.05it/s]

epoch 0 step 6383 loss tensor(10.4761, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6400/125000 [25:18<8:07:30,  4.05it/s]

epoch 0 step 6399 loss tensor(10.1086, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6416/125000 [25:22<8:06:43,  4.06it/s]

epoch 0 step 6415 loss tensor(9.4763, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6432/125000 [25:26<8:03:49,  4.08it/s]

epoch 0 step 6431 loss tensor(9.9089, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6448/125000 [25:30<8:05:37,  4.07it/s]

epoch 0 step 6447 loss tensor(10.7945, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6464/125000 [25:33<8:02:57,  4.09it/s]

epoch 0 step 6463 loss tensor(9.2352, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6480/125000 [25:37<8:03:44,  4.08it/s]

epoch 0 step 6479 loss tensor(9.2781, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6496/125000 [25:41<8:01:34,  4.10it/s]

epoch 0 step 6495 loss tensor(10.2999, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6512/125000 [25:45<8:03:06,  4.09it/s]

epoch 0 step 6511 loss tensor(9.7547, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6528/125000 [25:48<8:00:58,  4.11it/s]

epoch 0 step 6527 loss tensor(9.6753, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6544/125000 [25:52<8:07:27,  4.05it/s]

epoch 0 step 6543 loss tensor(9.3810, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6560/125000 [25:56<8:05:22,  4.07it/s]

epoch 0 step 6559 loss tensor(9.6258, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6576/125000 [26:00<8:03:36,  4.08it/s]

epoch 0 step 6575 loss tensor(9.5220, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6592/125000 [26:04<8:04:13,  4.08it/s]

epoch 0 step 6591 loss tensor(10.1954, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6608/125000 [26:08<8:01:25,  4.10it/s]

epoch 0 step 6607 loss tensor(9.3632, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6624/125000 [26:11<8:01:46,  4.10it/s]

epoch 0 step 6623 loss tensor(9.4110, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6640/125000 [26:15<7:59:44,  4.11it/s]

epoch 0 step 6639 loss tensor(9.4287, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6656/125000 [26:19<8:02:55,  4.08it/s]

epoch 0 step 6655 loss tensor(10.1833, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6672/125000 [26:23<8:00:36,  4.10it/s]

epoch 0 step 6671 loss tensor(9.5399, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6688/125000 [26:26<8:06:16,  4.06it/s]

epoch 0 step 6687 loss tensor(9.3318, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6704/125000 [26:30<8:07:03,  4.05it/s]

epoch 0 step 6703 loss tensor(9.7131, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6720/125000 [26:34<8:05:38,  4.06it/s]

epoch 0 step 6719 loss tensor(9.6360, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6736/125000 [26:38<8:02:54,  4.08it/s]

epoch 0 step 6735 loss tensor(9.4012, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6752/125000 [26:42<8:08:39,  4.03it/s]

epoch 0 step 6751 loss tensor(10.0194, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6768/125000 [26:46<8:01:11,  4.10it/s]

epoch 0 step 6767 loss tensor(8.7575, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6784/125000 [26:49<8:02:29,  4.08it/s]

epoch 0 step 6783 loss tensor(8.7421, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6800/125000 [26:53<8:01:15,  4.09it/s]

epoch 0 step 6799 loss tensor(9.4893, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6816/125000 [26:57<8:00:59,  4.10it/s]

epoch 0 step 6815 loss tensor(9.4095, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6832/125000 [27:01<7:59:32,  4.11it/s]

epoch 0 step 6831 loss tensor(9.6574, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6848/125000 [27:05<8:07:42,  4.04it/s]

epoch 0 step 6847 loss tensor(9.7037, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 6864/125000 [27:08<8:03:38,  4.07it/s]

epoch 0 step 6863 loss tensor(9.4044, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 6880/125000 [27:12<8:02:49,  4.08it/s]

epoch 0 step 6879 loss tensor(9.9393, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 6896/125000 [27:16<8:00:58,  4.09it/s]

epoch 0 step 6895 loss tensor(10.4264, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 6912/125000 [27:20<8:01:20,  4.09it/s]

epoch 0 step 6911 loss tensor(9.3785, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 6928/125000 [27:24<7:59:43,  4.10it/s]

epoch 0 step 6927 loss tensor(9.0493, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 6944/125000 [27:27<8:00:39,  4.09it/s]

epoch 0 step 6943 loss tensor(9.8968, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 6960/125000 [27:31<7:59:49,  4.10it/s]

epoch 0 step 6959 loss tensor(9.7028, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 6976/125000 [27:35<8:04:21,  4.06it/s]

epoch 0 step 6975 loss tensor(9.1301, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 6992/125000 [27:39<8:04:45,  4.06it/s]

epoch 0 step 6991 loss tensor(9.5387, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7008/125000 [27:43<8:02:09,  4.08it/s]

epoch 0 step 7007 loss tensor(9.8139, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7024/125000 [27:46<8:02:21,  4.08it/s]

epoch 0 step 7023 loss tensor(8.9440, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7040/125000 [27:50<7:58:40,  4.11it/s]

epoch 0 step 7039 loss tensor(9.5343, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7056/125000 [27:54<8:01:29,  4.08it/s]

epoch 0 step 7055 loss tensor(9.8852, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7072/125000 [27:58<7:59:19,  4.10it/s]

epoch 0 step 7071 loss tensor(9.0769, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7088/125000 [28:02<8:00:46,  4.09it/s]

epoch 0 step 7087 loss tensor(9.1034, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7104/125000 [28:05<8:00:15,  4.09it/s]

epoch 0 step 7103 loss tensor(9.8269, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7120/125000 [28:09<7:57:58,  4.11it/s]

epoch 0 step 7119 loss tensor(9.0737, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7136/125000 [28:13<8:07:09,  4.03it/s]

epoch 0 step 7135 loss tensor(9.6814, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7152/125000 [28:17<8:04:56,  4.05it/s]

epoch 0 step 7151 loss tensor(8.8979, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7168/125000 [28:21<8:03:56,  4.06it/s]

epoch 0 step 7167 loss tensor(9.5270, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7184/125000 [28:24<8:01:01,  4.08it/s]

epoch 0 step 7183 loss tensor(9.1024, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7200/125000 [28:28<8:00:40,  4.08it/s]

epoch 0 step 7199 loss tensor(9.8690, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7216/125000 [28:32<7:59:47,  4.09it/s]

epoch 0 step 7215 loss tensor(10.1200, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7232/125000 [28:36<8:00:03,  4.09it/s]

epoch 0 step 7231 loss tensor(9.1388, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7248/125000 [28:40<7:58:20,  4.10it/s]

epoch 0 step 7247 loss tensor(9.3865, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7264/125000 [28:43<8:00:15,  4.09it/s]

epoch 0 step 7263 loss tensor(10.1442, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7280/125000 [28:47<8:03:39,  4.06it/s]

epoch 0 step 7279 loss tensor(9.2995, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7296/125000 [28:51<8:05:09,  4.04it/s]

epoch 0 step 7295 loss tensor(9.2939, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7312/125000 [28:55<8:01:59,  4.07it/s]

epoch 0 step 7311 loss tensor(9.0791, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7328/125000 [28:59<8:00:27,  4.08it/s]

epoch 0 step 7327 loss tensor(8.1955, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7344/125000 [29:02<8:00:31,  4.08it/s]

epoch 0 step 7343 loss tensor(8.8952, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7360/125000 [29:06<7:58:30,  4.10it/s]

epoch 0 step 7359 loss tensor(9.3934, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7376/125000 [29:10<7:59:30,  4.09it/s]

epoch 0 step 7375 loss tensor(9.0644, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7392/125000 [29:14<8:00:32,  4.08it/s]

epoch 0 step 7391 loss tensor(9.0441, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7408/125000 [29:18<7:58:54,  4.09it/s]

epoch 0 step 7407 loss tensor(8.9726, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7424/125000 [29:21<7:56:47,  4.11it/s]

epoch 0 step 7423 loss tensor(8.9718, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7440/125000 [29:25<8:03:35,  4.05it/s]

epoch 0 step 7439 loss tensor(9.4292, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7456/125000 [29:29<8:03:49,  4.05it/s]

epoch 0 step 7455 loss tensor(8.4846, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7472/125000 [29:33<8:00:49,  4.07it/s]

epoch 0 step 7471 loss tensor(9.1158, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7488/125000 [29:37<7:59:56,  4.08it/s]

epoch 0 step 7487 loss tensor(9.0931, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7504/125000 [29:41<7:58:57,  4.09it/s]

epoch 0 step 7503 loss tensor(8.7418, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7520/125000 [29:44<7:58:01,  4.10it/s]

epoch 0 step 7519 loss tensor(8.7193, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7536/125000 [29:48<7:58:34,  4.09it/s]

epoch 0 step 7535 loss tensor(8.3221, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7552/125000 [29:52<7:58:57,  4.09it/s]

epoch 0 step 7551 loss tensor(8.6169, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7568/125000 [29:56<8:00:08,  4.08it/s]

epoch 0 step 7567 loss tensor(7.8484, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7584/125000 [29:59<8:02:14,  4.06it/s]

epoch 0 step 7583 loss tensor(8.7015, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7600/125000 [30:03<8:02:09,  4.06it/s]

epoch 0 step 7599 loss tensor(8.7561, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7616/125000 [30:07<8:01:15,  4.07it/s]

epoch 0 step 7615 loss tensor(8.9629, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7632/125000 [30:11<7:58:34,  4.09it/s]

epoch 0 step 7631 loss tensor(8.1094, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7648/125000 [30:15<7:58:08,  4.09it/s]

epoch 0 step 7647 loss tensor(9.4557, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7664/125000 [30:19<7:56:50,  4.10it/s]

epoch 0 step 7663 loss tensor(8.5419, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7680/125000 [30:22<7:56:30,  4.10it/s]

epoch 0 step 7679 loss tensor(9.2399, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7696/125000 [30:26<7:58:36,  4.08it/s]

epoch 0 step 7695 loss tensor(8.7569, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7712/125000 [30:30<7:58:31,  4.09it/s]

epoch 0 step 7711 loss tensor(9.0169, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7728/125000 [30:34<7:57:00,  4.10it/s]

epoch 0 step 7727 loss tensor(8.7889, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7744/125000 [30:38<8:02:44,  4.05it/s]

epoch 0 step 7743 loss tensor(9.6278, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7760/125000 [30:41<8:06:46,  4.01it/s]

epoch 0 step 7759 loss tensor(8.8356, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7776/125000 [30:45<8:02:19,  4.05it/s]

epoch 0 step 7775 loss tensor(8.3105, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7792/125000 [30:49<7:59:06,  4.08it/s]

epoch 0 step 7791 loss tensor(8.6568, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 7808/125000 [30:53<7:58:23,  4.08it/s]

epoch 0 step 7807 loss tensor(8.4631, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7824/125000 [30:57<8:00:04,  4.07it/s]

epoch 0 step 7823 loss tensor(9.4984, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7840/125000 [31:00<7:57:58,  4.09it/s]

epoch 0 step 7839 loss tensor(9.0411, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7856/125000 [31:04<7:57:45,  4.09it/s]

epoch 0 step 7855 loss tensor(9.3580, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7872/125000 [31:08<7:56:40,  4.10it/s]

epoch 0 step 7871 loss tensor(8.5446, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7888/125000 [31:12<7:55:43,  4.10it/s]

epoch 0 step 7887 loss tensor(8.5131, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7904/125000 [31:16<7:59:16,  4.07it/s]

epoch 0 step 7903 loss tensor(9.3948, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7920/125000 [31:19<8:01:35,  4.05it/s]

epoch 0 step 7919 loss tensor(8.5343, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7936/125000 [31:23<8:01:35,  4.05it/s]

epoch 0 step 7935 loss tensor(8.5122, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7952/125000 [31:27<8:01:44,  4.05it/s]

epoch 0 step 7951 loss tensor(9.1020, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7968/125000 [31:31<7:58:42,  4.07it/s]

epoch 0 step 7967 loss tensor(8.4400, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 7984/125000 [31:35<7:57:32,  4.08it/s]

epoch 0 step 7983 loss tensor(9.3013, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 8000/125000 [31:39<7:56:19,  4.09it/s]

epoch 0 step 7999 loss tensor(8.9734, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 8016/125000 [31:42<7:55:57,  4.10it/s]

epoch 0 step 8015 loss tensor(8.3738, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 8032/125000 [31:46<7:56:02,  4.10it/s]

epoch 0 step 8031 loss tensor(8.4887, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 8048/125000 [31:50<7:57:11,  4.08it/s]

epoch 0 step 8047 loss tensor(8.1954, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 8064/125000 [31:54<7:55:11,  4.10it/s]

epoch 0 step 8063 loss tensor(9.2510, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 8080/125000 [31:57<7:57:36,  4.08it/s]

epoch 0 step 8079 loss tensor(8.8575, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 8096/125000 [32:01<8:05:03,  4.02it/s]

epoch 0 step 8095 loss tensor(8.0124, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▋         | 8112/125000 [32:05<8:00:45,  4.05it/s]

epoch 0 step 8111 loss tensor(7.9100, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8128/125000 [32:09<7:59:34,  4.06it/s]

epoch 0 step 8127 loss tensor(8.6158, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8144/125000 [32:13<7:59:26,  4.06it/s]

epoch 0 step 8143 loss tensor(8.3406, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8160/125000 [32:17<7:57:09,  4.08it/s]

epoch 0 step 8159 loss tensor(8.7565, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8176/125000 [32:20<7:55:00,  4.10it/s]

epoch 0 step 8175 loss tensor(8.7704, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8192/125000 [32:24<7:55:12,  4.10it/s]

epoch 0 step 8191 loss tensor(8.3583, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8208/125000 [32:28<7:56:02,  4.09it/s]

epoch 0 step 8207 loss tensor(8.2640, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8224/125000 [32:32<7:55:20,  4.09it/s]

epoch 0 step 8223 loss tensor(8.4751, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8240/125000 [32:36<7:59:11,  4.06it/s]

epoch 0 step 8239 loss tensor(8.7529, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8256/125000 [32:39<7:59:01,  4.06it/s]

epoch 0 step 8255 loss tensor(8.3085, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8272/125000 [32:43<7:55:54,  4.09it/s]

epoch 0 step 8271 loss tensor(8.1763, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8288/125000 [32:47<7:55:18,  4.09it/s]

epoch 0 step 8287 loss tensor(8.3819, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8304/125000 [32:51<7:54:30,  4.10it/s]

epoch 0 step 8303 loss tensor(8.0876, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8320/125000 [32:55<7:54:57,  4.09it/s]

epoch 0 step 8319 loss tensor(7.3296, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8336/125000 [32:58<7:55:53,  4.09it/s]

epoch 0 step 8335 loss tensor(7.6294, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8352/125000 [33:02<7:53:31,  4.11it/s]

epoch 0 step 8351 loss tensor(7.9697, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8368/125000 [33:06<7:55:56,  4.08it/s]

epoch 0 step 8367 loss tensor(8.1315, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8384/125000 [33:10<8:00:54,  4.04it/s]

epoch 0 step 8383 loss tensor(7.7602, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8400/125000 [33:14<7:58:59,  4.06it/s]

epoch 0 step 8399 loss tensor(8.2749, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8416/125000 [33:17<7:58:32,  4.06it/s]

epoch 0 step 8415 loss tensor(8.2840, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8432/125000 [33:21<7:59:42,  4.05it/s]

epoch 0 step 8431 loss tensor(8.0487, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8448/125000 [33:25<7:55:35,  4.08it/s]

epoch 0 step 8447 loss tensor(7.6910, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8464/125000 [33:29<7:55:00,  4.09it/s]

epoch 0 step 8463 loss tensor(8.7106, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8480/125000 [33:33<7:54:22,  4.09it/s]

epoch 0 step 8479 loss tensor(8.8593, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8496/125000 [33:36<7:54:27,  4.09it/s]

epoch 0 step 8495 loss tensor(8.4271, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8512/125000 [33:40<7:55:37,  4.08it/s]

epoch 0 step 8511 loss tensor(8.1944, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8528/125000 [33:44<7:53:01,  4.10it/s]

epoch 0 step 8527 loss tensor(7.5111, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8544/125000 [33:48<7:57:42,  4.06it/s]

epoch 0 step 8543 loss tensor(8.9432, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8560/125000 [33:52<8:00:07,  4.04it/s]

epoch 0 step 8559 loss tensor(8.2865, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8576/125000 [33:55<7:57:05,  4.07it/s]

epoch 0 step 8575 loss tensor(7.8074, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8592/125000 [33:59<7:56:48,  4.07it/s]

epoch 0 step 8591 loss tensor(8.7706, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8608/125000 [34:03<7:53:50,  4.09it/s]

epoch 0 step 8607 loss tensor(7.7282, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8624/125000 [34:07<7:52:59,  4.10it/s]

epoch 0 step 8623 loss tensor(7.9805, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8640/125000 [34:11<7:53:53,  4.09it/s]

epoch 0 step 8639 loss tensor(7.9338, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8656/125000 [34:14<7:53:38,  4.09it/s]

epoch 0 step 8655 loss tensor(8.3218, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8672/125000 [34:18<7:50:12,  4.12it/s]

epoch 0 step 8671 loss tensor(8.9231, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8688/125000 [34:22<7:58:57,  4.05it/s]

epoch 0 step 8687 loss tensor(7.7510, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8704/125000 [34:26<7:55:37,  4.08it/s]

epoch 0 step 8703 loss tensor(7.8809, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8720/125000 [34:30<7:55:18,  4.08it/s]

epoch 0 step 8719 loss tensor(8.4015, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8736/125000 [34:34<7:53:04,  4.10it/s]

epoch 0 step 8735 loss tensor(9.1674, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8752/125000 [34:37<7:53:52,  4.09it/s]

epoch 0 step 8751 loss tensor(8.2533, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8768/125000 [34:41<7:54:48,  4.08it/s]

epoch 0 step 8767 loss tensor(7.6558, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8784/125000 [34:45<7:53:13,  4.09it/s]

epoch 0 step 8783 loss tensor(8.4589, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8800/125000 [34:49<7:53:46,  4.09it/s]

epoch 0 step 8799 loss tensor(8.8376, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8816/125000 [34:53<7:57:14,  4.06it/s]

epoch 0 step 8815 loss tensor(8.0116, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8832/125000 [34:56<7:56:34,  4.06it/s]

epoch 0 step 8831 loss tensor(7.5440, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8848/125000 [35:00<7:55:23,  4.07it/s]

epoch 0 step 8847 loss tensor(7.6346, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8864/125000 [35:04<7:54:24,  4.08it/s]

epoch 0 step 8863 loss tensor(7.8651, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8880/125000 [35:08<7:56:02,  4.07it/s]

epoch 0 step 8879 loss tensor(7.7716, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8896/125000 [35:12<7:53:07,  4.09it/s]

epoch 0 step 8895 loss tensor(7.8071, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8912/125000 [35:15<7:53:18,  4.09it/s]

epoch 0 step 8911 loss tensor(7.1780, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8928/125000 [35:19<7:53:40,  4.08it/s]

epoch 0 step 8927 loss tensor(7.6510, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8944/125000 [35:23<7:52:18,  4.10it/s]

epoch 0 step 8943 loss tensor(8.0962, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8960/125000 [35:27<7:54:33,  4.08it/s]

epoch 0 step 8959 loss tensor(7.9401, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8976/125000 [35:31<7:56:57,  4.05it/s]

epoch 0 step 8975 loss tensor(8.3353, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 8992/125000 [35:34<7:56:18,  4.06it/s]

epoch 0 step 8991 loss tensor(8.3040, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9008/125000 [35:38<7:54:43,  4.07it/s]

epoch 0 step 9007 loss tensor(8.1515, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9024/125000 [35:42<7:53:03,  4.09it/s]

epoch 0 step 9023 loss tensor(8.1703, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9040/125000 [35:46<7:52:18,  4.09it/s]

epoch 0 step 9039 loss tensor(7.8941, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9056/125000 [35:50<7:52:19,  4.09it/s]

epoch 0 step 9055 loss tensor(7.1189, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9072/125000 [35:53<7:52:44,  4.09it/s]

epoch 0 step 9071 loss tensor(7.9630, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9088/125000 [35:57<7:48:21,  4.12it/s]

epoch 0 step 9087 loss tensor(6.9881, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9104/125000 [36:01<7:57:15,  4.05it/s]

epoch 0 step 9103 loss tensor(7.9513, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9120/125000 [36:05<7:54:13,  4.07it/s]

epoch 0 step 9119 loss tensor(7.4543, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9136/125000 [36:09<7:51:50,  4.09it/s]

epoch 0 step 9135 loss tensor(7.4094, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9152/125000 [36:12<7:51:05,  4.10it/s]

epoch 0 step 9151 loss tensor(7.9907, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9168/125000 [36:16<7:53:05,  4.08it/s]

epoch 0 step 9167 loss tensor(7.7757, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9184/125000 [36:20<7:52:03,  4.09it/s]

epoch 0 step 9183 loss tensor(7.1865, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9200/125000 [36:24<7:50:41,  4.10it/s]

epoch 0 step 9199 loss tensor(7.7227, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9216/125000 [36:28<7:51:01,  4.10it/s]

epoch 0 step 9215 loss tensor(7.7834, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9232/125000 [36:31<7:51:25,  4.09it/s]

epoch 0 step 9231 loss tensor(8.0333, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9248/125000 [36:35<7:59:08,  4.03it/s]

epoch 0 step 9247 loss tensor(7.2670, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9264/125000 [36:39<7:56:40,  4.05it/s]

epoch 0 step 9263 loss tensor(7.8939, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9280/125000 [36:43<7:56:38,  4.05it/s]

epoch 0 step 9279 loss tensor(8.4650, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9296/125000 [36:47<7:54:44,  4.06it/s]

epoch 0 step 9295 loss tensor(7.5609, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9312/125000 [36:50<7:52:45,  4.08it/s]

epoch 0 step 9311 loss tensor(8.0905, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9328/125000 [36:54<7:49:36,  4.11it/s]

epoch 0 step 9327 loss tensor(7.2772, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9344/125000 [36:58<7:50:25,  4.10it/s]

epoch 0 step 9343 loss tensor(7.2012, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 9360/125000 [37:02<7:51:21,  4.09it/s]

epoch 0 step 9359 loss tensor(7.4403, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9376/125000 [37:06<7:49:46,  4.10it/s]

epoch 0 step 9375 loss tensor(7.4438, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9392/125000 [37:09<7:55:41,  4.05it/s]

epoch 0 step 9391 loss tensor(7.7187, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9408/125000 [37:13<7:53:43,  4.07it/s]

epoch 0 step 9407 loss tensor(7.6870, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9424/125000 [37:17<7:53:39,  4.07it/s]

epoch 0 step 9423 loss tensor(6.9172, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9440/125000 [37:21<7:50:09,  4.10it/s]

epoch 0 step 9439 loss tensor(7.5313, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9456/125000 [37:25<7:50:41,  4.09it/s]

epoch 0 step 9455 loss tensor(8.0539, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9472/125000 [37:28<7:50:08,  4.10it/s]

epoch 0 step 9471 loss tensor(6.7813, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9488/125000 [37:32<7:51:04,  4.09it/s]

epoch 0 step 9487 loss tensor(7.6947, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9504/125000 [37:36<7:51:37,  4.08it/s]

epoch 0 step 9503 loss tensor(7.7108, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9520/125000 [37:40<7:55:31,  4.05it/s]

epoch 0 step 9519 loss tensor(7.7622, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9536/125000 [37:44<7:52:36,  4.07it/s]

epoch 0 step 9535 loss tensor(7.5200, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9552/125000 [37:47<7:51:06,  4.08it/s]

epoch 0 step 9551 loss tensor(8.0095, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9568/125000 [37:51<7:50:49,  4.09it/s]

epoch 0 step 9567 loss tensor(7.6342, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9584/125000 [37:55<7:50:17,  4.09it/s]

epoch 0 step 9583 loss tensor(7.4144, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9600/125000 [37:59<7:48:51,  4.10it/s]

epoch 0 step 9599 loss tensor(7.7481, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9616/125000 [38:03<7:48:09,  4.11it/s]

epoch 0 step 9615 loss tensor(7.0491, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9632/125000 [38:06<7:50:58,  4.08it/s]

epoch 0 step 9631 loss tensor(7.7130, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9648/125000 [38:10<7:49:05,  4.10it/s]

epoch 0 step 9647 loss tensor(7.1739, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9664/125000 [38:14<7:57:39,  4.02it/s]

epoch 0 step 9663 loss tensor(7.0289, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9680/125000 [38:18<7:55:12,  4.04it/s]

epoch 0 step 9679 loss tensor(7.6850, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9696/125000 [38:22<7:54:42,  4.05it/s]

epoch 0 step 9695 loss tensor(7.2471, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9712/125000 [38:25<7:50:19,  4.09it/s]

epoch 0 step 9711 loss tensor(7.8109, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9728/125000 [38:29<7:50:39,  4.08it/s]

epoch 0 step 9727 loss tensor(6.6669, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9744/125000 [38:33<7:50:22,  4.08it/s]

epoch 0 step 9743 loss tensor(6.8493, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9760/125000 [38:37<7:49:26,  4.09it/s]

epoch 0 step 9759 loss tensor(7.5236, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9776/125000 [38:41<7:48:04,  4.10it/s]

epoch 0 step 9775 loss tensor(7.0725, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9792/125000 [38:44<7:48:59,  4.09it/s]

epoch 0 step 9791 loss tensor(6.8726, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9808/125000 [38:48<7:48:36,  4.10it/s]

epoch 0 step 9807 loss tensor(7.1927, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9824/125000 [38:52<7:54:19,  4.05it/s]

epoch 0 step 9823 loss tensor(7.3063, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9840/125000 [38:56<7:54:05,  4.05it/s]

epoch 0 step 9839 loss tensor(6.6938, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9856/125000 [39:00<7:51:14,  4.07it/s]

epoch 0 step 9855 loss tensor(7.0721, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9872/125000 [39:04<7:52:05,  4.06it/s]

epoch 0 step 9871 loss tensor(7.4544, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9888/125000 [39:07<7:56:53,  4.02it/s]

epoch 0 step 9887 loss tensor(7.1144, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9904/125000 [39:11<7:49:47,  4.08it/s]

epoch 0 step 9903 loss tensor(6.9870, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9920/125000 [39:15<7:47:50,  4.10it/s]

epoch 0 step 9919 loss tensor(7.2868, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9936/125000 [39:19<7:49:48,  4.08it/s]

epoch 0 step 9935 loss tensor(7.0404, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9952/125000 [39:22<7:48:51,  4.09it/s]

epoch 0 step 9951 loss tensor(7.3782, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9968/125000 [39:26<7:46:53,  4.11it/s]

epoch 0 step 9967 loss tensor(7.0677, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 9984/125000 [39:30<7:54:28,  4.04it/s]

epoch 0 step 9983 loss tensor(7.0774, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10000/125000 [39:34<7:52:22,  4.06it/s]

epoch 0 step 9999 loss tensor(7.2014, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10016/125000 [39:38<7:50:20,  4.07it/s]

epoch 0 step 10015 loss tensor(7.7709, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10032/125000 [39:42<7:49:05,  4.08it/s]

epoch 0 step 10031 loss tensor(7.2795, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10048/125000 [39:45<7:47:31,  4.10it/s]

epoch 0 step 10047 loss tensor(7.2545, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10064/125000 [39:49<7:48:12,  4.09it/s]

epoch 0 step 10063 loss tensor(6.7794, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10080/125000 [39:53<7:47:12,  4.10it/s]

epoch 0 step 10079 loss tensor(6.8395, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10096/125000 [39:57<7:49:01,  4.08it/s]

epoch 0 step 10095 loss tensor(6.6168, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10112/125000 [40:01<7:47:24,  4.10it/s]

epoch 0 step 10111 loss tensor(7.6767, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10128/125000 [40:04<7:49:58,  4.07it/s]

epoch 0 step 10127 loss tensor(6.5640, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10144/125000 [40:08<7:56:22,  4.02it/s]

epoch 0 step 10143 loss tensor(6.8345, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10160/125000 [40:12<7:53:47,  4.04it/s]

epoch 0 step 10159 loss tensor(7.0230, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10176/125000 [40:16<7:50:38,  4.07it/s]

epoch 0 step 10175 loss tensor(6.8454, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10192/125000 [40:20<7:49:38,  4.07it/s]

epoch 0 step 10191 loss tensor(6.5934, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10208/125000 [40:23<7:48:15,  4.09it/s]

epoch 0 step 10207 loss tensor(6.4778, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10224/125000 [40:27<7:46:54,  4.10it/s]

epoch 0 step 10223 loss tensor(6.8883, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10240/125000 [40:31<7:47:39,  4.09it/s]

epoch 0 step 10239 loss tensor(6.5265, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10256/125000 [40:35<7:47:59,  4.09it/s]

epoch 0 step 10255 loss tensor(7.4633, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10272/125000 [40:39<7:49:17,  4.07it/s]

epoch 0 step 10271 loss tensor(5.9351, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10288/125000 [40:42<7:50:33,  4.06it/s]

epoch 0 step 10287 loss tensor(6.5295, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10304/125000 [40:46<7:50:03,  4.07it/s]

epoch 0 step 10303 loss tensor(7.0967, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10320/125000 [40:50<7:47:50,  4.09it/s]

epoch 0 step 10319 loss tensor(6.8493, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10336/125000 [40:54<7:47:31,  4.09it/s]

epoch 0 step 10335 loss tensor(7.3017, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10352/125000 [40:58<7:46:19,  4.10it/s]

epoch 0 step 10351 loss tensor(6.8502, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10368/125000 [41:01<7:47:49,  4.08it/s]

epoch 0 step 10367 loss tensor(6.8576, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10384/125000 [41:05<7:47:11,  4.09it/s]

epoch 0 step 10383 loss tensor(7.0875, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10400/125000 [41:09<7:47:09,  4.09it/s]

epoch 0 step 10399 loss tensor(6.6148, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10416/125000 [41:13<7:50:57,  4.05it/s]

epoch 0 step 10415 loss tensor(6.6080, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10432/125000 [41:17<7:49:34,  4.07it/s]

epoch 0 step 10431 loss tensor(6.7950, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10448/125000 [41:20<7:49:42,  4.06it/s]

epoch 0 step 10447 loss tensor(6.5274, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10464/125000 [41:24<7:49:01,  4.07it/s]

epoch 0 step 10463 loss tensor(7.1575, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10480/125000 [41:28<7:46:31,  4.09it/s]

epoch 0 step 10479 loss tensor(6.9307, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10496/125000 [41:32<7:46:21,  4.09it/s]

epoch 0 step 10495 loss tensor(7.2317, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10512/125000 [41:36<7:45:56,  4.10it/s]

epoch 0 step 10511 loss tensor(6.5928, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10528/125000 [41:39<7:44:49,  4.10it/s]

epoch 0 step 10527 loss tensor(6.8949, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10544/125000 [41:43<7:47:22,  4.08it/s]

epoch 0 step 10543 loss tensor(7.8074, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10560/125000 [41:47<7:44:34,  4.11it/s]

epoch 0 step 10559 loss tensor(7.2294, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10576/125000 [41:51<7:51:22,  4.05it/s]

epoch 0 step 10575 loss tensor(6.3042, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10592/125000 [41:55<7:50:24,  4.05it/s]

epoch 0 step 10591 loss tensor(6.6346, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10608/125000 [41:59<7:47:10,  4.08it/s]

epoch 0 step 10607 loss tensor(6.6413, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 10624/125000 [42:02<7:45:19,  4.10it/s]

epoch 0 step 10623 loss tensor(6.7715, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10640/125000 [42:06<7:45:21,  4.10it/s]

epoch 0 step 10639 loss tensor(6.2709, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10656/125000 [42:10<7:46:40,  4.08it/s]

epoch 0 step 10655 loss tensor(6.0214, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10672/125000 [42:14<7:44:07,  4.11it/s]

epoch 0 step 10671 loss tensor(6.5504, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10688/125000 [42:17<7:45:54,  4.09it/s]

epoch 0 step 10687 loss tensor(6.5061, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10704/125000 [42:21<7:45:53,  4.09it/s]

epoch 0 step 10703 loss tensor(6.3726, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10720/125000 [42:25<7:50:07,  4.05it/s]

epoch 0 step 10719 loss tensor(7.2622, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10736/125000 [42:29<7:50:21,  4.05it/s]

epoch 0 step 10735 loss tensor(6.8734, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10752/125000 [42:33<7:48:07,  4.07it/s]

epoch 0 step 10751 loss tensor(6.2769, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10768/125000 [42:37<7:48:18,  4.07it/s]

epoch 0 step 10767 loss tensor(7.2837, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10784/125000 [42:40<7:45:02,  4.09it/s]

epoch 0 step 10783 loss tensor(6.6559, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10800/125000 [42:44<7:45:05,  4.09it/s]

epoch 0 step 10799 loss tensor(7.1890, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10816/125000 [42:48<7:45:23,  4.09it/s]

epoch 0 step 10815 loss tensor(6.7239, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10832/125000 [42:52<7:45:45,  4.09it/s]

epoch 0 step 10831 loss tensor(6.4047, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10848/125000 [42:56<7:45:36,  4.09it/s]

epoch 0 step 10847 loss tensor(6.3054, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10864/125000 [42:59<7:44:19,  4.10it/s]

epoch 0 step 10863 loss tensor(6.2374, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10880/125000 [43:03<7:46:56,  4.07it/s]

epoch 0 step 10879 loss tensor(7.2751, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10896/125000 [43:07<7:49:52,  4.05it/s]

epoch 0 step 10895 loss tensor(6.4540, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10912/125000 [43:11<7:49:08,  4.05it/s]

epoch 0 step 10911 loss tensor(6.5257, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▊         | 10928/125000 [43:15<7:48:28,  4.06it/s]

epoch 0 step 10927 loss tensor(6.6919, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 10944/125000 [43:18<7:46:59,  4.07it/s]

epoch 0 step 10943 loss tensor(6.6204, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 10960/125000 [43:22<7:48:44,  4.05it/s]

epoch 0 step 10959 loss tensor(6.4609, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 10976/125000 [43:26<7:43:57,  4.10it/s]

epoch 0 step 10975 loss tensor(6.1294, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 10992/125000 [43:30<7:44:25,  4.09it/s]

epoch 0 step 10991 loss tensor(6.2677, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11008/125000 [43:34<7:44:01,  4.09it/s]

epoch 0 step 11007 loss tensor(6.2881, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11024/125000 [43:37<7:42:57,  4.10it/s]

epoch 0 step 11023 loss tensor(6.9400, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11040/125000 [43:41<7:43:59,  4.09it/s]

epoch 0 step 11039 loss tensor(7.2901, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11056/125000 [43:45<7:51:15,  4.03it/s]

epoch 0 step 11055 loss tensor(6.1461, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11072/125000 [43:49<7:46:15,  4.07it/s]

epoch 0 step 11071 loss tensor(6.6502, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11088/125000 [43:53<7:45:09,  4.08it/s]

epoch 0 step 11087 loss tensor(6.5320, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11104/125000 [43:56<7:45:10,  4.08it/s]

epoch 0 step 11103 loss tensor(6.2158, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11120/125000 [44:00<7:45:17,  4.08it/s]

epoch 0 step 11119 loss tensor(5.8809, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11136/125000 [44:04<7:43:20,  4.10it/s]

epoch 0 step 11135 loss tensor(6.6470, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11152/125000 [44:08<7:43:54,  4.09it/s]

epoch 0 step 11151 loss tensor(6.1020, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11168/125000 [44:12<7:42:53,  4.10it/s]

epoch 0 step 11167 loss tensor(6.3068, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11184/125000 [44:15<7:44:29,  4.08it/s]

epoch 0 step 11183 loss tensor(6.7240, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11200/125000 [44:19<7:49:28,  4.04it/s]

epoch 0 step 11199 loss tensor(7.1733, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11216/125000 [44:23<7:47:40,  4.05it/s]

epoch 0 step 11215 loss tensor(6.2270, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11232/125000 [44:27<7:45:24,  4.07it/s]

epoch 0 step 11231 loss tensor(6.4121, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11248/125000 [44:31<7:45:41,  4.07it/s]

epoch 0 step 11247 loss tensor(5.8688, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11264/125000 [44:34<7:52:23,  4.01it/s]

epoch 0 step 11263 loss tensor(6.0793, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11280/125000 [44:38<7:42:44,  4.10it/s]

epoch 0 step 11279 loss tensor(5.9954, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11296/125000 [44:42<7:42:07,  4.10it/s]

epoch 0 step 11295 loss tensor(5.9803, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11312/125000 [44:46<7:43:07,  4.09it/s]

epoch 0 step 11311 loss tensor(6.9491, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11328/125000 [44:50<7:42:12,  4.10it/s]

epoch 0 step 11327 loss tensor(6.1478, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11344/125000 [44:53<7:40:50,  4.11it/s]

epoch 0 step 11343 loss tensor(6.2482, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11360/125000 [44:57<7:47:53,  4.05it/s]

epoch 0 step 11359 loss tensor(6.4926, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11376/125000 [45:01<7:46:15,  4.06it/s]

epoch 0 step 11375 loss tensor(6.1611, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11392/125000 [45:05<7:46:32,  4.06it/s]

epoch 0 step 11391 loss tensor(6.3446, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11408/125000 [45:09<7:44:19,  4.08it/s]

epoch 0 step 11407 loss tensor(5.9607, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11424/125000 [45:12<7:42:52,  4.09it/s]

epoch 0 step 11423 loss tensor(6.0508, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11440/125000 [45:16<7:42:51,  4.09it/s]

epoch 0 step 11439 loss tensor(6.1586, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11456/125000 [45:20<7:43:08,  4.09it/s]

epoch 0 step 11455 loss tensor(5.8297, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11472/125000 [45:24<7:42:48,  4.09it/s]

epoch 0 step 11471 loss tensor(6.2166, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11488/125000 [45:28<7:43:13,  4.08it/s]

epoch 0 step 11487 loss tensor(5.9213, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11504/125000 [45:31<7:45:33,  4.06it/s]

epoch 0 step 11503 loss tensor(6.2128, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11520/125000 [45:35<7:45:39,  4.06it/s]

epoch 0 step 11519 loss tensor(6.2831, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11536/125000 [45:39<7:45:51,  4.06it/s]

epoch 0 step 11535 loss tensor(6.2904, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11552/125000 [45:43<7:43:08,  4.08it/s]

epoch 0 step 11551 loss tensor(5.6409, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11568/125000 [45:47<7:43:23,  4.08it/s]

epoch 0 step 11567 loss tensor(5.8207, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11584/125000 [45:51<7:42:02,  4.09it/s]

epoch 0 step 11583 loss tensor(6.0592, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11600/125000 [45:54<7:42:51,  4.08it/s]

epoch 0 step 11599 loss tensor(5.9238, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11616/125000 [45:58<7:42:46,  4.08it/s]

epoch 0 step 11615 loss tensor(5.5764, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11632/125000 [46:02<7:41:16,  4.10it/s]

epoch 0 step 11631 loss tensor(6.2626, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11648/125000 [46:06<7:39:52,  4.11it/s]

epoch 0 step 11647 loss tensor(5.7902, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11664/125000 [46:09<7:46:52,  4.05it/s]

epoch 0 step 11663 loss tensor(5.8542, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11680/125000 [46:13<7:45:59,  4.05it/s]

epoch 0 step 11679 loss tensor(6.0482, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11696/125000 [46:17<7:44:30,  4.07it/s]

epoch 0 step 11695 loss tensor(5.6688, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11712/125000 [46:21<7:45:05,  4.06it/s]

epoch 0 step 11711 loss tensor(5.7582, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11728/125000 [46:25<7:42:14,  4.08it/s]

epoch 0 step 11727 loss tensor(6.0096, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11744/125000 [46:29<7:41:04,  4.09it/s]

epoch 0 step 11743 loss tensor(5.8366, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11760/125000 [46:32<7:41:52,  4.09it/s]

epoch 0 step 11759 loss tensor(5.9412, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11776/125000 [46:36<7:41:11,  4.09it/s]

epoch 0 step 11775 loss tensor(5.8933, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11792/125000 [46:40<7:40:49,  4.09it/s]

epoch 0 step 11791 loss tensor(6.4232, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11808/125000 [46:44<7:45:32,  4.05it/s]

epoch 0 step 11807 loss tensor(5.6531, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11824/125000 [46:48<7:45:32,  4.05it/s]

epoch 0 step 11823 loss tensor(6.5628, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11840/125000 [46:51<7:42:15,  4.08it/s]

epoch 0 step 11839 loss tensor(5.7803, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11856/125000 [46:55<7:43:26,  4.07it/s]

epoch 0 step 11855 loss tensor(6.1118, device='cuda:0', grad_fn=<DivBackward0>)


  9%|▉         | 11872/125000 [46:59<7:44:05,  4.06it/s]

epoch 0 step 11871 loss tensor(5.7550, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 11888/125000 [47:03<7:41:02,  4.09it/s]

epoch 0 step 11887 loss tensor(5.6010, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 11904/125000 [47:07<7:41:40,  4.08it/s]

epoch 0 step 11903 loss tensor(6.2663, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 11920/125000 [47:10<7:40:35,  4.09it/s]

epoch 0 step 11919 loss tensor(5.6643, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 11936/125000 [47:14<7:43:58,  4.06it/s]

epoch 0 step 11935 loss tensor(5.8821, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 11952/125000 [47:18<7:41:21,  4.08it/s]

epoch 0 step 11951 loss tensor(6.1331, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 11968/125000 [47:22<7:47:43,  4.03it/s]

epoch 0 step 11967 loss tensor(5.7023, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 11984/125000 [47:26<7:40:22,  4.09it/s]

epoch 0 step 11983 loss tensor(5.5456, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12000/125000 [47:29<7:39:09,  4.10it/s]

epoch 0 step 11999 loss tensor(5.9794, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12016/125000 [47:33<7:39:58,  4.09it/s]

epoch 0 step 12015 loss tensor(5.9541, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12032/125000 [47:37<7:40:03,  4.09it/s]

epoch 0 step 12031 loss tensor(5.8177, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12048/125000 [47:41<7:33:57,  4.15it/s]

epoch 0 step 12047 loss tensor(5.9849, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12064/125000 [47:45<7:46:28,  4.04it/s]

epoch 0 step 12063 loss tensor(5.5175, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12080/125000 [47:48<7:44:38,  4.05it/s]

epoch 0 step 12079 loss tensor(5.4439, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12096/125000 [47:52<7:43:18,  4.06it/s]

epoch 0 step 12095 loss tensor(5.5024, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12112/125000 [47:56<7:40:56,  4.08it/s]

epoch 0 step 12111 loss tensor(6.1858, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12128/125000 [48:00<7:39:18,  4.10it/s]

epoch 0 step 12127 loss tensor(5.4453, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12144/125000 [48:04<7:40:51,  4.08it/s]

epoch 0 step 12143 loss tensor(5.4755, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12160/125000 [48:07<7:39:55,  4.09it/s]

epoch 0 step 12159 loss tensor(6.5526, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12176/125000 [48:11<7:39:23,  4.09it/s]

epoch 0 step 12175 loss tensor(5.3025, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12192/125000 [48:15<7:38:51,  4.10it/s]

epoch 0 step 12191 loss tensor(5.6929, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12208/125000 [48:19<7:40:39,  4.08it/s]

epoch 0 step 12207 loss tensor(5.3975, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12224/125000 [48:23<7:45:54,  4.03it/s]

epoch 0 step 12223 loss tensor(5.9074, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12240/125000 [48:26<7:43:40,  4.05it/s]

epoch 0 step 12239 loss tensor(5.4633, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12256/125000 [48:30<7:40:51,  4.08it/s]

epoch 0 step 12255 loss tensor(5.5289, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12272/125000 [48:34<7:41:00,  4.08it/s]

epoch 0 step 12271 loss tensor(5.2484, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12288/125000 [48:38<7:39:25,  4.09it/s]

epoch 0 step 12287 loss tensor(5.3433, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12304/125000 [48:42<7:40:40,  4.08it/s]

epoch 0 step 12303 loss tensor(5.6187, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12320/125000 [48:45<7:39:08,  4.09it/s]

epoch 0 step 12319 loss tensor(5.7635, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12336/125000 [48:49<7:38:40,  4.09it/s]

epoch 0 step 12335 loss tensor(6.0077, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12352/125000 [48:53<7:38:28,  4.10it/s]

epoch 0 step 12351 loss tensor(5.3276, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12368/125000 [48:57<7:38:47,  4.09it/s]

epoch 0 step 12367 loss tensor(5.7887, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12384/125000 [49:01<7:43:18,  4.05it/s]

epoch 0 step 12383 loss tensor(5.4128, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12400/125000 [49:04<7:43:21,  4.05it/s]

epoch 0 step 12399 loss tensor(5.3592, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12416/125000 [49:08<7:39:59,  4.08it/s]

epoch 0 step 12415 loss tensor(5.5047, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12432/125000 [49:12<7:38:56,  4.09it/s]

epoch 0 step 12431 loss tensor(5.2248, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12448/125000 [49:16<7:39:05,  4.09it/s]

epoch 0 step 12447 loss tensor(5.3604, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12464/125000 [49:20<7:37:54,  4.10it/s]

epoch 0 step 12463 loss tensor(5.6645, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12480/125000 [49:23<7:37:38,  4.10it/s]

epoch 0 step 12479 loss tensor(5.5433, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 12496/125000 [49:27<7:37:44,  4.10it/s]

epoch 0 step 12495 loss tensor(5.2275, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12512/125000 [49:31<7:40:01,  4.08it/s]

epoch 0 step 12511 loss tensor(5.4063, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12528/125000 [49:35<7:36:33,  4.11it/s]

epoch 0 step 12527 loss tensor(5.2609, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12544/125000 [49:39<7:45:18,  4.03it/s]

epoch 0 step 12543 loss tensor(5.5006, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12560/125000 [49:42<7:43:18,  4.04it/s]

epoch 0 step 12559 loss tensor(5.5653, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12576/125000 [49:46<7:39:40,  4.08it/s]

epoch 0 step 12575 loss tensor(5.3272, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12592/125000 [49:50<7:39:59,  4.07it/s]

epoch 0 step 12591 loss tensor(5.1679, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12608/125000 [49:54<7:40:36,  4.07it/s]

epoch 0 step 12607 loss tensor(5.1002, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12624/125000 [49:58<7:38:29,  4.08it/s]

epoch 0 step 12623 loss tensor(5.2222, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12640/125000 [50:01<7:37:46,  4.09it/s]

epoch 0 step 12639 loss tensor(5.5444, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12656/125000 [50:05<7:36:16,  4.10it/s]

epoch 0 step 12655 loss tensor(5.3789, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12672/125000 [50:09<7:37:00,  4.10it/s]

epoch 0 step 12671 loss tensor(5.1322, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12688/125000 [50:13<7:37:22,  4.09it/s]

epoch 0 step 12687 loss tensor(5.1295, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12704/125000 [50:17<7:40:17,  4.07it/s]

epoch 0 step 12703 loss tensor(5.1043, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12720/125000 [50:21<7:41:07,  4.06it/s]

epoch 0 step 12719 loss tensor(5.0218, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12736/125000 [50:24<7:40:14,  4.07it/s]

epoch 0 step 12735 loss tensor(5.3219, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12752/125000 [50:28<7:39:23,  4.07it/s]

epoch 0 step 12751 loss tensor(5.6337, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12768/125000 [50:32<7:40:47,  4.06it/s]

epoch 0 step 12767 loss tensor(5.3700, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12784/125000 [50:36<7:36:02,  4.10it/s]

epoch 0 step 12783 loss tensor(5.1179, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12800/125000 [50:40<7:37:13,  4.09it/s]

epoch 0 step 12799 loss tensor(5.0169, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12816/125000 [50:43<7:36:39,  4.09it/s]

epoch 0 step 12815 loss tensor(5.5494, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12832/125000 [50:47<7:36:43,  4.09it/s]

epoch 0 step 12831 loss tensor(5.5802, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12848/125000 [50:51<7:40:07,  4.06it/s]

epoch 0 step 12847 loss tensor(6.1536, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12864/125000 [50:55<7:43:09,  4.04it/s]

epoch 0 step 12863 loss tensor(5.0390, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12880/125000 [50:59<7:45:41,  4.01it/s]

epoch 0 step 12879 loss tensor(5.3647, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12896/125000 [51:02<7:41:59,  4.04it/s]

epoch 0 step 12895 loss tensor(5.1842, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12912/125000 [51:06<7:39:05,  4.07it/s]

epoch 0 step 12911 loss tensor(5.1977, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12928/125000 [51:10<7:38:12,  4.08it/s]

epoch 0 step 12927 loss tensor(5.1839, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12944/125000 [51:14<7:37:48,  4.08it/s]

epoch 0 step 12943 loss tensor(4.7580, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12960/125000 [51:18<7:35:15,  4.10it/s]

epoch 0 step 12959 loss tensor(5.0383, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12976/125000 [51:21<7:35:09,  4.10it/s]

epoch 0 step 12975 loss tensor(5.0264, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 12992/125000 [51:25<7:36:29,  4.09it/s]

epoch 0 step 12991 loss tensor(5.6503, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 13008/125000 [51:29<7:36:03,  4.09it/s]

epoch 0 step 13007 loss tensor(6.1662, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 13024/125000 [51:33<7:40:40,  4.05it/s]

epoch 0 step 13023 loss tensor(5.0875, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 13040/125000 [51:37<7:41:34,  4.04it/s]

epoch 0 step 13039 loss tensor(5.2856, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 13056/125000 [51:40<7:38:09,  4.07it/s]

epoch 0 step 13055 loss tensor(5.0760, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 13072/125000 [51:44<7:38:37,  4.07it/s]

epoch 0 step 13071 loss tensor(5.4610, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 13088/125000 [51:48<7:36:01,  4.09it/s]

epoch 0 step 13087 loss tensor(5.5002, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 13104/125000 [51:52<7:35:27,  4.09it/s]

epoch 0 step 13103 loss tensor(4.7706, device='cuda:0', grad_fn=<DivBackward0>)


 10%|█         | 13120/125000 [51:56<7:36:18,  4.09it/s]

epoch 0 step 13119 loss tensor(5.0613, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13136/125000 [51:59<7:35:35,  4.09it/s]

epoch 0 step 13135 loss tensor(5.0695, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13152/125000 [52:03<7:33:49,  4.11it/s]

epoch 0 step 13151 loss tensor(4.9065, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13168/125000 [52:07<7:34:25,  4.10it/s]

epoch 0 step 13167 loss tensor(5.5249, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13184/125000 [52:11<7:40:17,  4.05it/s]

epoch 0 step 13183 loss tensor(5.0484, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13200/125000 [52:15<7:39:02,  4.06it/s]

epoch 0 step 13199 loss tensor(4.6679, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13216/125000 [52:18<7:38:38,  4.06it/s]

epoch 0 step 13215 loss tensor(5.1273, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13232/125000 [52:22<7:37:06,  4.08it/s]

epoch 0 step 13231 loss tensor(5.4296, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13248/125000 [52:26<7:38:25,  4.06it/s]

epoch 0 step 13247 loss tensor(4.7564, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13264/125000 [52:30<7:35:14,  4.09it/s]

epoch 0 step 13263 loss tensor(5.3699, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13280/125000 [52:34<7:35:09,  4.09it/s]

epoch 0 step 13279 loss tensor(5.1745, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13296/125000 [52:37<7:34:49,  4.09it/s]

epoch 0 step 13295 loss tensor(4.8896, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13312/125000 [52:41<7:35:01,  4.09it/s]

epoch 0 step 13311 loss tensor(4.7197, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13328/125000 [52:45<7:38:43,  4.06it/s]

epoch 0 step 13327 loss tensor(5.0589, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13344/125000 [52:49<7:39:35,  4.05it/s]

epoch 0 step 13343 loss tensor(5.2046, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13360/125000 [52:53<7:36:13,  4.08it/s]

epoch 0 step 13359 loss tensor(4.9557, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13376/125000 [52:56<7:36:39,  4.07it/s]

epoch 0 step 13375 loss tensor(4.9958, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13392/125000 [53:00<7:37:19,  4.07it/s]

epoch 0 step 13391 loss tensor(4.8411, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13408/125000 [53:04<7:34:17,  4.09it/s]

epoch 0 step 13407 loss tensor(5.9283, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13424/125000 [53:08<7:34:48,  4.09it/s]

epoch 0 step 13423 loss tensor(4.6814, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13440/125000 [53:12<7:32:38,  4.11it/s]

epoch 0 step 13439 loss tensor(4.9432, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13456/125000 [53:15<7:33:55,  4.10it/s]

epoch 0 step 13455 loss tensor(5.0700, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13472/125000 [53:19<7:33:45,  4.10it/s]

epoch 0 step 13471 loss tensor(4.9905, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13488/125000 [53:23<7:34:33,  4.09it/s]

epoch 0 step 13487 loss tensor(4.9109, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13504/125000 [53:27<7:37:11,  4.06it/s]

epoch 0 step 13503 loss tensor(5.0871, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13520/125000 [53:31<7:40:16,  4.04it/s]

epoch 0 step 13519 loss tensor(4.9951, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13536/125000 [53:35<7:38:56,  4.05it/s]

epoch 0 step 13535 loss tensor(4.7585, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13552/125000 [53:38<7:38:08,  4.05it/s]

epoch 0 step 13551 loss tensor(4.7253, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13568/125000 [53:42<7:36:34,  4.07it/s]

epoch 0 step 13567 loss tensor(4.8257, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13584/125000 [53:46<7:35:18,  4.08it/s]

epoch 0 step 13583 loss tensor(4.9538, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13600/125000 [53:50<7:35:39,  4.07it/s]

epoch 0 step 13599 loss tensor(4.8448, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13616/125000 [53:54<7:33:50,  4.09it/s]

epoch 0 step 13615 loss tensor(4.6913, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13632/125000 [53:57<7:33:53,  4.09it/s]

epoch 0 step 13631 loss tensor(4.8541, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13648/125000 [54:01<7:32:40,  4.10it/s]

epoch 0 step 13647 loss tensor(5.3461, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13664/125000 [54:05<7:33:33,  4.09it/s]

epoch 0 step 13663 loss tensor(4.9809, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13680/125000 [54:09<7:33:05,  4.09it/s]

epoch 0 step 13679 loss tensor(4.9031, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13696/125000 [54:13<7:38:37,  4.04it/s]

epoch 0 step 13695 loss tensor(4.9892, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13712/125000 [54:16<7:37:16,  4.06it/s]

epoch 0 step 13711 loss tensor(4.5611, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13728/125000 [54:20<7:34:34,  4.08it/s]

epoch 0 step 13727 loss tensor(4.8136, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13744/125000 [54:24<7:35:03,  4.07it/s]

epoch 0 step 13743 loss tensor(4.6853, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13760/125000 [54:28<7:35:32,  4.07it/s]

epoch 0 step 13759 loss tensor(4.7856, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13776/125000 [54:32<7:32:34,  4.10it/s]

epoch 0 step 13775 loss tensor(4.5766, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13792/125000 [54:35<7:33:46,  4.08it/s]

epoch 0 step 13791 loss tensor(4.7610, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13808/125000 [54:39<7:33:35,  4.09it/s]

epoch 0 step 13807 loss tensor(4.6678, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13824/125000 [54:43<7:32:54,  4.09it/s]

epoch 0 step 13823 loss tensor(4.6429, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13840/125000 [54:47<7:30:29,  4.11it/s]

epoch 0 step 13839 loss tensor(4.7442, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13856/125000 [54:51<7:39:42,  4.03it/s]

epoch 0 step 13855 loss tensor(4.5524, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13872/125000 [54:54<7:34:58,  4.07it/s]

epoch 0 step 13871 loss tensor(4.9733, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13888/125000 [54:58<7:34:08,  4.08it/s]

epoch 0 step 13887 loss tensor(4.3456, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13904/125000 [55:02<7:35:33,  4.06it/s]

epoch 0 step 13903 loss tensor(4.6806, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13920/125000 [55:06<7:32:06,  4.09it/s]

epoch 0 step 13919 loss tensor(4.8883, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13936/125000 [55:10<7:31:58,  4.10it/s]

epoch 0 step 13935 loss tensor(4.9593, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13952/125000 [55:13<7:31:17,  4.10it/s]

epoch 0 step 13951 loss tensor(4.7197, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13968/125000 [55:17<7:34:54,  4.07it/s]

epoch 0 step 13967 loss tensor(5.2254, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 13984/125000 [55:21<7:32:55,  4.09it/s]

epoch 0 step 13983 loss tensor(4.5918, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 14000/125000 [55:25<7:31:39,  4.10it/s]

epoch 0 step 13999 loss tensor(5.1481, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 14016/125000 [55:29<7:32:02,  4.09it/s]

epoch 0 step 14015 loss tensor(4.4072, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 14032/125000 [55:32<7:30:54,  4.10it/s]

epoch 0 step 14031 loss tensor(4.8563, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 14048/125000 [55:36<7:32:21,  4.09it/s]

epoch 0 step 14047 loss tensor(4.4852, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14064/125000 [55:40<7:31:15,  4.10it/s]

epoch 0 step 14063 loss tensor(4.5657, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14080/125000 [55:44<7:31:32,  4.09it/s]

epoch 0 step 14079 loss tensor(4.6078, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14096/125000 [55:48<7:38:25,  4.03it/s]

epoch 0 step 14095 loss tensor(4.9267, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14112/125000 [55:51<7:35:33,  4.06it/s]

epoch 0 step 14111 loss tensor(5.0813, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14128/125000 [55:55<7:35:15,  4.06it/s]

epoch 0 step 14127 loss tensor(4.6385, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14144/125000 [55:59<7:33:14,  4.08it/s]

epoch 0 step 14143 loss tensor(4.7064, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14160/125000 [56:03<7:33:00,  4.08it/s]

epoch 0 step 14159 loss tensor(4.5171, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14176/125000 [56:07<7:32:11,  4.08it/s]

epoch 0 step 14175 loss tensor(4.9672, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14192/125000 [56:10<7:32:01,  4.09it/s]

epoch 0 step 14191 loss tensor(4.6618, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14208/125000 [56:14<7:32:03,  4.08it/s]

epoch 0 step 14207 loss tensor(5.0487, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14224/125000 [56:18<7:32:13,  4.08it/s]

epoch 0 step 14223 loss tensor(4.6691, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14240/125000 [56:22<7:32:39,  4.08it/s]

epoch 0 step 14239 loss tensor(4.8626, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14256/125000 [56:26<7:31:01,  4.09it/s]

epoch 0 step 14255 loss tensor(4.4761, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14272/125000 [56:29<7:31:02,  4.09it/s]

epoch 0 step 14271 loss tensor(4.5405, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14288/125000 [56:33<7:34:46,  4.06it/s]

epoch 0 step 14287 loss tensor(4.4429, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14304/125000 [56:37<7:32:55,  4.07it/s]

epoch 0 step 14303 loss tensor(5.1609, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14320/125000 [56:41<7:31:38,  4.08it/s]

epoch 0 step 14319 loss tensor(4.4522, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14336/125000 [56:45<7:30:53,  4.09it/s]

epoch 0 step 14335 loss tensor(4.4450, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14352/125000 [56:49<7:31:43,  4.08it/s]

epoch 0 step 14351 loss tensor(4.5581, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█▏        | 14368/125000 [56:52<7:31:32,  4.08it/s]

epoch 0 step 14367 loss tensor(4.4895, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14384/125000 [56:56<7:28:11,  4.11it/s]

epoch 0 step 14383 loss tensor(4.5074, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14400/125000 [57:00<7:30:44,  4.09it/s]

epoch 0 step 14399 loss tensor(4.5858, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14416/125000 [57:04<7:31:56,  4.08it/s]

epoch 0 step 14415 loss tensor(4.6963, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14432/125000 [57:08<7:40:29,  4.00it/s]

epoch 0 step 14431 loss tensor(4.2582, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14448/125000 [57:11<7:36:20,  4.04it/s]

epoch 0 step 14447 loss tensor(4.6757, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14464/125000 [57:15<7:34:57,  4.05it/s]

epoch 0 step 14463 loss tensor(4.2257, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14480/125000 [57:19<7:32:50,  4.07it/s]

epoch 0 step 14479 loss tensor(4.4805, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14496/125000 [57:23<7:31:21,  4.08it/s]

epoch 0 step 14495 loss tensor(4.4654, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14512/125000 [57:27<7:29:35,  4.10it/s]

epoch 0 step 14511 loss tensor(5.0762, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14528/125000 [57:30<7:29:43,  4.09it/s]

epoch 0 step 14527 loss tensor(4.5511, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14544/125000 [57:34<7:30:18,  4.09it/s]

epoch 0 step 14543 loss tensor(4.5170, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14560/125000 [57:38<7:29:52,  4.09it/s]

epoch 0 step 14559 loss tensor(4.3225, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14576/125000 [57:42<7:28:58,  4.10it/s]

epoch 0 step 14575 loss tensor(4.4316, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14592/125000 [57:46<7:32:26,  4.07it/s]

epoch 0 step 14591 loss tensor(4.4363, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14608/125000 [57:49<7:34:16,  4.05it/s]

epoch 0 step 14607 loss tensor(4.3527, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14624/125000 [57:53<7:30:52,  4.08it/s]

epoch 0 step 14623 loss tensor(4.2354, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14640/125000 [57:57<7:29:19,  4.09it/s]

epoch 0 step 14639 loss tensor(4.9474, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14656/125000 [58:01<7:30:15,  4.08it/s]

epoch 0 step 14655 loss tensor(4.3935, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14672/125000 [58:05<7:30:40,  4.08it/s]

epoch 0 step 14671 loss tensor(4.4272, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14688/125000 [58:08<7:30:35,  4.08it/s]

epoch 0 step 14687 loss tensor(4.2190, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14704/125000 [58:12<7:29:46,  4.09it/s]

epoch 0 step 14703 loss tensor(4.2070, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14720/125000 [58:16<7:30:50,  4.08it/s]

epoch 0 step 14719 loss tensor(4.0916, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14736/125000 [58:20<7:28:56,  4.09it/s]

epoch 0 step 14735 loss tensor(4.5038, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14752/125000 [58:24<7:35:27,  4.03it/s]

epoch 0 step 14751 loss tensor(4.4002, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14768/125000 [58:27<7:35:38,  4.03it/s]

epoch 0 step 14767 loss tensor(4.0794, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14784/125000 [58:31<7:31:50,  4.07it/s]

epoch 0 step 14783 loss tensor(4.2557, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14800/125000 [58:35<7:31:50,  4.06it/s]

epoch 0 step 14799 loss tensor(4.4871, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14816/125000 [58:39<7:29:47,  4.08it/s]

epoch 0 step 14815 loss tensor(4.2919, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14832/125000 [58:43<7:28:52,  4.09it/s]

epoch 0 step 14831 loss tensor(4.0712, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14848/125000 [58:46<7:28:37,  4.09it/s]

epoch 0 step 14847 loss tensor(4.0874, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14864/125000 [58:50<7:28:31,  4.09it/s]

epoch 0 step 14863 loss tensor(3.9712, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14880/125000 [58:54<7:29:58,  4.08it/s]

epoch 0 step 14879 loss tensor(4.3072, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14896/125000 [58:58<7:28:46,  4.09it/s]

epoch 0 step 14895 loss tensor(4.1546, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14912/125000 [59:02<7:31:51,  4.06it/s]

epoch 0 step 14911 loss tensor(4.3197, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14928/125000 [59:06<7:34:47,  4.03it/s]

epoch 0 step 14927 loss tensor(4.1883, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14944/125000 [59:09<7:31:13,  4.07it/s]

epoch 0 step 14943 loss tensor(4.1402, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14960/125000 [59:13<7:30:01,  4.08it/s]

epoch 0 step 14959 loss tensor(4.1997, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14976/125000 [59:17<7:29:22,  4.08it/s]

epoch 0 step 14975 loss tensor(4.2861, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 14992/125000 [59:21<7:36:19,  4.02it/s]

epoch 0 step 14991 loss tensor(4.1502, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15008/125000 [59:25<7:27:55,  4.09it/s]

epoch 0 step 15007 loss tensor(4.4092, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15024/125000 [59:28<7:27:32,  4.10it/s]

epoch 0 step 15023 loss tensor(4.3450, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15040/125000 [59:32<7:27:26,  4.10it/s]

epoch 0 step 15039 loss tensor(4.1289, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15056/125000 [59:36<7:27:39,  4.09it/s]

epoch 0 step 15055 loss tensor(4.1028, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15072/125000 [59:40<7:27:31,  4.09it/s]

epoch 0 step 15071 loss tensor(4.1120, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15088/125000 [59:44<7:33:12,  4.04it/s]

epoch 0 step 15087 loss tensor(3.9915, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15104/125000 [59:47<7:29:35,  4.07it/s]

epoch 0 step 15103 loss tensor(4.0782, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15120/125000 [59:51<7:29:25,  4.07it/s]

epoch 0 step 15119 loss tensor(4.2972, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15136/125000 [59:55<7:29:10,  4.08it/s]

epoch 0 step 15135 loss tensor(4.1209, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15152/125000 [59:59<7:29:20,  4.07it/s]

epoch 0 step 15151 loss tensor(4.1959, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15168/125000 [1:00:03<7:26:25,  4.10it/s]

epoch 0 step 15167 loss tensor(4.0980, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15184/125000 [1:00:06<7:27:44,  4.09it/s]

epoch 0 step 15183 loss tensor(4.2839, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15200/125000 [1:00:10<7:26:49,  4.10it/s]

epoch 0 step 15199 loss tensor(4.3927, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15216/125000 [1:00:14<7:26:07,  4.10it/s]

epoch 0 step 15215 loss tensor(4.4395, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15232/125000 [1:00:18<7:28:55,  4.08it/s]

epoch 0 step 15231 loss tensor(4.3535, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15248/125000 [1:00:22<7:33:01,  4.04it/s]

epoch 0 step 15247 loss tensor(4.2140, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15264/125000 [1:00:25<7:31:54,  4.05it/s]

epoch 0 step 15263 loss tensor(4.0421, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15280/125000 [1:00:29<7:30:26,  4.06it/s]

epoch 0 step 15279 loss tensor(5.1013, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15296/125000 [1:00:33<7:30:10,  4.06it/s]

epoch 0 step 15295 loss tensor(4.0094, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15312/125000 [1:00:37<7:28:45,  4.07it/s]

epoch 0 step 15311 loss tensor(3.9010, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15328/125000 [1:00:41<7:26:58,  4.09it/s]

epoch 0 step 15327 loss tensor(4.1162, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15344/125000 [1:00:44<7:25:48,  4.10it/s]

epoch 0 step 15343 loss tensor(4.3156, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15360/125000 [1:00:48<7:25:37,  4.10it/s]

epoch 0 step 15359 loss tensor(3.9033, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15376/125000 [1:00:52<7:28:16,  4.08it/s]

epoch 0 step 15375 loss tensor(3.9022, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15392/125000 [1:00:56<7:25:23,  4.10it/s]

epoch 0 step 15391 loss tensor(3.9070, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15408/125000 [1:01:00<7:30:53,  4.05it/s]

epoch 0 step 15407 loss tensor(4.4627, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15424/125000 [1:01:03<7:32:29,  4.04it/s]

epoch 0 step 15423 loss tensor(3.9269, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15440/125000 [1:01:07<7:28:33,  4.07it/s]

epoch 0 step 15439 loss tensor(3.9062, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15456/125000 [1:01:11<7:28:23,  4.07it/s]

epoch 0 step 15455 loss tensor(4.2724, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15472/125000 [1:01:15<7:27:46,  4.08it/s]

epoch 0 step 15471 loss tensor(4.2677, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15488/125000 [1:01:19<7:25:42,  4.10it/s]

epoch 0 step 15487 loss tensor(4.0401, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15504/125000 [1:01:23<7:25:32,  4.10it/s]

epoch 0 step 15503 loss tensor(3.8151, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15520/125000 [1:01:26<7:26:20,  4.09it/s]

epoch 0 step 15519 loss tensor(4.1210, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15536/125000 [1:01:30<7:23:01,  4.12it/s]

epoch 0 step 15535 loss tensor(4.2926, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15552/125000 [1:01:34<7:32:28,  4.03it/s]

epoch 0 step 15551 loss tensor(4.1617, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15568/125000 [1:01:38<7:29:39,  4.06it/s]

epoch 0 step 15567 loss tensor(4.0712, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15584/125000 [1:01:42<7:27:34,  4.07it/s]

epoch 0 step 15583 loss tensor(4.6715, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15600/125000 [1:01:45<7:25:22,  4.09it/s]

epoch 0 step 15599 loss tensor(3.9778, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 15616/125000 [1:01:49<7:25:08,  4.10it/s]

epoch 0 step 15615 loss tensor(3.9617, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15632/125000 [1:01:53<7:26:23,  4.08it/s]

epoch 0 step 15631 loss tensor(3.5502, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15648/125000 [1:01:57<7:25:26,  4.09it/s]

epoch 0 step 15647 loss tensor(3.9417, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15664/125000 [1:02:01<7:24:35,  4.10it/s]

epoch 0 step 15663 loss tensor(4.1532, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15680/125000 [1:02:04<7:32:58,  4.02it/s]

epoch 0 step 15679 loss tensor(3.6785, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15696/125000 [1:02:08<7:29:59,  4.05it/s]

epoch 0 step 15695 loss tensor(3.8866, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15712/125000 [1:02:12<7:27:59,  4.07it/s]

epoch 0 step 15711 loss tensor(4.0079, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15728/125000 [1:02:16<7:26:33,  4.08it/s]

epoch 0 step 15727 loss tensor(4.1211, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15744/125000 [1:02:20<7:32:20,  4.03it/s]

epoch 0 step 15743 loss tensor(3.8496, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15760/125000 [1:02:23<7:25:22,  4.09it/s]

epoch 0 step 15759 loss tensor(3.7519, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15776/125000 [1:02:27<7:24:59,  4.09it/s]

epoch 0 step 15775 loss tensor(3.6645, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15792/125000 [1:02:31<7:26:01,  4.08it/s]

epoch 0 step 15791 loss tensor(3.8365, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15808/125000 [1:02:35<7:25:58,  4.08it/s]

epoch 0 step 15807 loss tensor(3.7327, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15824/125000 [1:02:39<7:29:19,  4.05it/s]

epoch 0 step 15823 loss tensor(3.6940, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15840/125000 [1:02:42<7:28:18,  4.06it/s]

epoch 0 step 15839 loss tensor(3.7612, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15856/125000 [1:02:46<7:25:13,  4.09it/s]

epoch 0 step 15855 loss tensor(3.8869, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15872/125000 [1:02:50<7:25:24,  4.08it/s]

epoch 0 step 15871 loss tensor(4.1922, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15888/125000 [1:02:54<7:24:30,  4.09it/s]

epoch 0 step 15887 loss tensor(3.6949, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15904/125000 [1:02:58<7:23:48,  4.10it/s]

epoch 0 step 15903 loss tensor(3.8513, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15920/125000 [1:03:01<7:23:44,  4.10it/s]

epoch 0 step 15919 loss tensor(3.5270, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15936/125000 [1:03:05<7:24:43,  4.09it/s]

epoch 0 step 15935 loss tensor(3.7255, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15952/125000 [1:03:09<7:23:21,  4.10it/s]

epoch 0 step 15951 loss tensor(3.6269, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15968/125000 [1:03:13<7:28:25,  4.05it/s]

epoch 0 step 15967 loss tensor(3.7187, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 15984/125000 [1:03:17<7:26:13,  4.07it/s]

epoch 0 step 15983 loss tensor(3.6817, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16000/125000 [1:03:21<7:27:52,  4.06it/s]

epoch 0 step 15999 loss tensor(3.8391, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16016/125000 [1:03:24<7:24:54,  4.08it/s]

epoch 0 step 16015 loss tensor(4.6880, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16032/125000 [1:03:28<7:25:17,  4.08it/s]

epoch 0 step 16031 loss tensor(3.7104, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16048/125000 [1:03:32<7:23:03,  4.10it/s]

epoch 0 step 16047 loss tensor(3.5991, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16064/125000 [1:03:36<7:24:33,  4.08it/s]

epoch 0 step 16063 loss tensor(3.6742, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16080/125000 [1:03:39<7:24:32,  4.08it/s]

epoch 0 step 16079 loss tensor(3.6404, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16096/125000 [1:03:43<7:23:35,  4.09it/s]

epoch 0 step 16095 loss tensor(3.7882, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16112/125000 [1:03:47<7:28:12,  4.05it/s]

epoch 0 step 16111 loss tensor(3.7579, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16128/125000 [1:03:51<7:28:32,  4.05it/s]

epoch 0 step 16127 loss tensor(3.9334, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16144/125000 [1:03:55<7:26:02,  4.07it/s]

epoch 0 step 16143 loss tensor(3.8449, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16160/125000 [1:03:59<7:25:48,  4.07it/s]

epoch 0 step 16159 loss tensor(3.7770, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16176/125000 [1:04:02<7:24:22,  4.08it/s]

epoch 0 step 16175 loss tensor(3.7144, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16192/125000 [1:04:06<7:23:24,  4.09it/s]

epoch 0 step 16191 loss tensor(3.5406, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16208/125000 [1:04:10<7:22:37,  4.10it/s]

epoch 0 step 16207 loss tensor(3.9872, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16224/125000 [1:04:14<7:24:23,  4.08it/s]

epoch 0 step 16223 loss tensor(3.5659, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16240/125000 [1:04:18<7:23:00,  4.09it/s]

epoch 0 step 16239 loss tensor(3.8334, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16256/125000 [1:04:21<7:21:54,  4.10it/s]

epoch 0 step 16255 loss tensor(3.5473, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16272/125000 [1:04:25<7:26:50,  4.06it/s]

epoch 0 step 16271 loss tensor(3.7637, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16288/125000 [1:04:29<7:27:53,  4.05it/s]

epoch 0 step 16287 loss tensor(3.6594, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16304/125000 [1:04:33<7:24:33,  4.08it/s]

epoch 0 step 16303 loss tensor(3.7644, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16320/125000 [1:04:37<7:24:14,  4.08it/s]

epoch 0 step 16319 loss tensor(3.5319, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16336/125000 [1:04:40<7:24:44,  4.07it/s]

epoch 0 step 16335 loss tensor(3.6290, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16352/125000 [1:04:44<7:22:31,  4.09it/s]

epoch 0 step 16351 loss tensor(3.5965, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16368/125000 [1:04:48<7:23:05,  4.09it/s]

epoch 0 step 16367 loss tensor(3.5291, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16384/125000 [1:04:52<7:22:55,  4.09it/s]

epoch 0 step 16383 loss tensor(3.6458, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16400/125000 [1:04:56<7:22:07,  4.09it/s]

epoch 0 step 16399 loss tensor(3.4477, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16416/125000 [1:04:59<7:27:38,  4.04it/s]

epoch 0 step 16415 loss tensor(3.4976, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16432/125000 [1:05:03<7:27:39,  4.04it/s]

epoch 0 step 16431 loss tensor(3.4045, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16448/125000 [1:05:07<7:24:22,  4.07it/s]

epoch 0 step 16447 loss tensor(3.5282, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16464/125000 [1:05:11<7:23:29,  4.08it/s]

epoch 0 step 16463 loss tensor(3.5443, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16480/125000 [1:05:15<7:22:46,  4.08it/s]

epoch 0 step 16479 loss tensor(3.6488, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16496/125000 [1:05:18<7:22:22,  4.09it/s]

epoch 0 step 16495 loss tensor(3.6133, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16512/125000 [1:05:22<7:21:38,  4.09it/s]

epoch 0 step 16511 loss tensor(3.4099, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16528/125000 [1:05:26<7:22:29,  4.09it/s]

epoch 0 step 16527 loss tensor(3.6126, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16544/125000 [1:05:30<7:22:24,  4.09it/s]

epoch 0 step 16543 loss tensor(3.4766, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16560/125000 [1:05:34<7:22:03,  4.09it/s]

epoch 0 step 16559 loss tensor(3.4786, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16576/125000 [1:05:37<7:24:44,  4.06it/s]

epoch 0 step 16575 loss tensor(3.7585, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16592/125000 [1:05:41<7:24:41,  4.06it/s]

epoch 0 step 16591 loss tensor(3.4179, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16608/125000 [1:05:45<7:24:21,  4.07it/s]

epoch 0 step 16607 loss tensor(3.5803, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16624/125000 [1:05:49<7:23:27,  4.07it/s]

epoch 0 step 16623 loss tensor(4.3001, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16640/125000 [1:05:53<7:24:58,  4.06it/s]

epoch 0 step 16639 loss tensor(3.5486, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16656/125000 [1:05:56<7:20:10,  4.10it/s]

epoch 0 step 16655 loss tensor(3.2799, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16672/125000 [1:06:00<7:21:25,  4.09it/s]

epoch 0 step 16671 loss tensor(4.0312, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16688/125000 [1:06:04<7:20:35,  4.10it/s]

epoch 0 step 16687 loss tensor(3.5497, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16704/125000 [1:06:08<7:21:19,  4.09it/s]

epoch 0 step 16703 loss tensor(3.5637, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16720/125000 [1:06:12<7:21:31,  4.09it/s]

epoch 0 step 16719 loss tensor(3.5125, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16736/125000 [1:06:15<7:26:18,  4.04it/s]

epoch 0 step 16735 loss tensor(3.3270, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16752/125000 [1:06:19<7:24:26,  4.06it/s]

epoch 0 step 16751 loss tensor(3.5576, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16768/125000 [1:06:23<7:23:35,  4.07it/s]

epoch 0 step 16767 loss tensor(3.3011, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16784/125000 [1:06:27<7:22:43,  4.07it/s]

epoch 0 step 16783 loss tensor(3.6530, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16800/125000 [1:06:31<7:21:24,  4.09it/s]

epoch 0 step 16799 loss tensor(3.6255, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16816/125000 [1:06:35<7:21:17,  4.09it/s]

epoch 0 step 16815 loss tensor(3.5653, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16832/125000 [1:06:38<7:20:20,  4.09it/s]

epoch 0 step 16831 loss tensor(3.4879, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16848/125000 [1:06:42<7:21:12,  4.09it/s]

epoch 0 step 16847 loss tensor(3.5474, device='cuda:0', grad_fn=<DivBackward0>)


 13%|█▎        | 16864/125000 [1:06:46<7:20:49,  4.09it/s]

epoch 0 step 16863 loss tensor(3.2676, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 16880/125000 [1:06:50<7:25:07,  4.05it/s]

epoch 0 step 16879 loss tensor(3.5008, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 16896/125000 [1:06:54<7:24:22,  4.05it/s]

epoch 0 step 16895 loss tensor(3.7391, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 16912/125000 [1:06:57<7:22:34,  4.07it/s]

epoch 0 step 16911 loss tensor(3.5262, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 16928/125000 [1:07:01<7:22:02,  4.07it/s]

epoch 0 step 16927 loss tensor(3.5079, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 16944/125000 [1:07:05<7:29:18,  4.01it/s]

epoch 0 step 16943 loss tensor(3.3631, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 16960/125000 [1:07:09<7:20:06,  4.09it/s]

epoch 0 step 16959 loss tensor(3.3019, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 16976/125000 [1:07:13<7:21:08,  4.08it/s]

epoch 0 step 16975 loss tensor(3.4186, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 16992/125000 [1:07:16<7:20:47,  4.08it/s]

epoch 0 step 16991 loss tensor(3.4458, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17008/125000 [1:07:20<7:19:31,  4.10it/s]

epoch 0 step 17007 loss tensor(3.4667, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17024/125000 [1:07:24<7:17:43,  4.11it/s]

epoch 0 step 17023 loss tensor(3.6391, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17040/125000 [1:07:28<7:24:35,  4.05it/s]

epoch 0 step 17039 loss tensor(3.4721, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17056/125000 [1:07:32<7:24:04,  4.05it/s]

epoch 0 step 17055 loss tensor(3.2310, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17072/125000 [1:07:35<7:22:02,  4.07it/s]

epoch 0 step 17071 loss tensor(3.4462, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17088/125000 [1:07:39<7:19:52,  4.09it/s]

epoch 0 step 17087 loss tensor(3.1829, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17104/125000 [1:07:43<7:19:29,  4.09it/s]

epoch 0 step 17103 loss tensor(3.4468, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17120/125000 [1:07:47<7:18:59,  4.10it/s]

epoch 0 step 17119 loss tensor(3.2563, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17136/125000 [1:07:51<7:19:53,  4.09it/s]

epoch 0 step 17135 loss tensor(3.1554, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17152/125000 [1:07:54<7:19:56,  4.09it/s]

epoch 0 step 17151 loss tensor(3.1748, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17168/125000 [1:07:58<7:20:08,  4.08it/s]

epoch 0 step 17167 loss tensor(3.2193, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▎        | 17184/125000 [1:08:02<7:19:03,  4.09it/s]

epoch 0 step 17183 loss tensor(3.2880, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17200/125000 [1:08:06<7:26:05,  4.03it/s]

epoch 0 step 17199 loss tensor(3.2819, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17216/125000 [1:08:10<7:24:30,  4.04it/s]

epoch 0 step 17215 loss tensor(3.3538, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17232/125000 [1:08:13<7:22:43,  4.06it/s]

epoch 0 step 17231 loss tensor(3.0636, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17248/125000 [1:08:17<7:20:42,  4.08it/s]

epoch 0 step 17247 loss tensor(3.3904, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17264/125000 [1:08:21<7:20:40,  4.07it/s]

epoch 0 step 17263 loss tensor(3.5109, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17280/125000 [1:08:25<7:18:15,  4.10it/s]

epoch 0 step 17279 loss tensor(3.2810, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17296/125000 [1:08:29<7:18:42,  4.09it/s]

epoch 0 step 17295 loss tensor(3.1777, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17312/125000 [1:08:32<7:18:49,  4.09it/s]

epoch 0 step 17311 loss tensor(3.3098, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17328/125000 [1:08:36<7:19:39,  4.08it/s]

epoch 0 step 17327 loss tensor(3.4103, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17344/125000 [1:08:40<7:17:45,  4.10it/s]

epoch 0 step 17343 loss tensor(3.5888, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17360/125000 [1:08:44<7:22:14,  4.06it/s]

epoch 0 step 17359 loss tensor(3.4744, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17376/125000 [1:08:48<7:21:07,  4.07it/s]

epoch 0 step 17375 loss tensor(3.1779, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17392/125000 [1:08:52<7:21:37,  4.06it/s]

epoch 0 step 17391 loss tensor(3.2465, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17408/125000 [1:08:55<7:19:20,  4.08it/s]

epoch 0 step 17407 loss tensor(3.2182, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17424/125000 [1:08:59<7:21:15,  4.06it/s]

epoch 0 step 17423 loss tensor(3.0610, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17440/125000 [1:09:03<7:18:12,  4.09it/s]

epoch 0 step 17439 loss tensor(3.1553, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17456/125000 [1:09:07<7:19:23,  4.08it/s]

epoch 0 step 17455 loss tensor(3.5061, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17472/125000 [1:09:11<7:18:10,  4.09it/s]

epoch 0 step 17471 loss tensor(3.2847, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17488/125000 [1:09:14<7:18:32,  4.09it/s]

epoch 0 step 17487 loss tensor(3.5403, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17504/125000 [1:09:18<7:16:42,  4.10it/s]

epoch 0 step 17503 loss tensor(3.4270, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17520/125000 [1:09:22<7:22:20,  4.05it/s]

epoch 0 step 17519 loss tensor(3.2905, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17536/125000 [1:09:26<7:20:34,  4.07it/s]

epoch 0 step 17535 loss tensor(3.2371, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17552/125000 [1:09:30<7:19:38,  4.07it/s]

epoch 0 step 17551 loss tensor(3.2043, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17568/125000 [1:09:33<7:18:28,  4.08it/s]

epoch 0 step 17567 loss tensor(3.0675, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17584/125000 [1:09:37<7:18:08,  4.09it/s]

epoch 0 step 17583 loss tensor(3.1693, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17600/125000 [1:09:41<7:17:18,  4.09it/s]

epoch 0 step 17599 loss tensor(3.0180, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17616/125000 [1:09:45<7:18:47,  4.08it/s]

epoch 0 step 17615 loss tensor(3.3875, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17632/125000 [1:09:49<7:18:06,  4.08it/s]

epoch 0 step 17631 loss tensor(3.1554, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17648/125000 [1:09:52<7:15:45,  4.11it/s]

epoch 0 step 17647 loss tensor(3.0582, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17664/125000 [1:09:56<7:21:13,  4.05it/s]

epoch 0 step 17663 loss tensor(3.0088, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17680/125000 [1:10:00<7:22:27,  4.04it/s]

epoch 0 step 17679 loss tensor(3.2091, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17696/125000 [1:10:04<7:19:59,  4.06it/s]

epoch 0 step 17695 loss tensor(3.1060, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17712/125000 [1:10:08<7:18:49,  4.07it/s]

epoch 0 step 17711 loss tensor(3.0111, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17728/125000 [1:10:11<7:20:18,  4.06it/s]

epoch 0 step 17727 loss tensor(2.8889, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17744/125000 [1:10:15<7:17:14,  4.09it/s]

epoch 0 step 17743 loss tensor(3.0490, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17760/125000 [1:10:19<7:17:25,  4.09it/s]

epoch 0 step 17759 loss tensor(2.8969, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17776/125000 [1:10:23<7:18:01,  4.08it/s]

epoch 0 step 17775 loss tensor(3.3031, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17792/125000 [1:10:27<7:16:28,  4.09it/s]

epoch 0 step 17791 loss tensor(3.2155, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17808/125000 [1:10:30<7:16:32,  4.09it/s]

epoch 0 step 17807 loss tensor(3.1004, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17824/125000 [1:10:34<7:22:00,  4.04it/s]

epoch 0 step 17823 loss tensor(2.8794, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17840/125000 [1:10:38<7:19:57,  4.06it/s]

epoch 0 step 17839 loss tensor(3.1734, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17856/125000 [1:10:42<7:17:33,  4.08it/s]

epoch 0 step 17855 loss tensor(3.0806, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17872/125000 [1:10:46<7:17:10,  4.08it/s]

epoch 0 step 17871 loss tensor(3.3410, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17888/125000 [1:10:49<7:15:48,  4.10it/s]

epoch 0 step 17887 loss tensor(3.0139, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17904/125000 [1:10:53<7:16:00,  4.09it/s]

epoch 0 step 17903 loss tensor(3.0646, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17920/125000 [1:10:57<7:16:22,  4.09it/s]

epoch 0 step 17919 loss tensor(2.8985, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17936/125000 [1:11:01<7:15:45,  4.09it/s]

epoch 0 step 17935 loss tensor(2.9321, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17952/125000 [1:11:05<7:16:01,  4.09it/s]

epoch 0 step 17951 loss tensor(3.1690, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17968/125000 [1:11:08<7:21:07,  4.04it/s]

epoch 0 step 17967 loss tensor(3.1478, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 17984/125000 [1:11:12<7:18:51,  4.06it/s]

epoch 0 step 17983 loss tensor(2.8658, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 18000/125000 [1:11:16<7:18:13,  4.07it/s]

epoch 0 step 17999 loss tensor(3.1178, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 18016/125000 [1:11:20<7:18:34,  4.07it/s]

epoch 0 step 18015 loss tensor(3.1016, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 18032/125000 [1:11:24<7:22:36,  4.03it/s]

epoch 0 step 18031 loss tensor(3.4238, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 18048/125000 [1:11:27<7:15:21,  4.09it/s]

epoch 0 step 18047 loss tensor(3.0135, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 18064/125000 [1:11:31<7:15:07,  4.10it/s]

epoch 0 step 18063 loss tensor(2.9138, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 18080/125000 [1:11:35<7:16:10,  4.09it/s]

epoch 0 step 18079 loss tensor(2.9401, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 18096/125000 [1:11:39<7:15:59,  4.09it/s]

epoch 0 step 18095 loss tensor(2.7967, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 18112/125000 [1:11:43<7:14:26,  4.10it/s]

epoch 0 step 18111 loss tensor(2.9903, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18128/125000 [1:11:46<7:20:15,  4.05it/s]

epoch 0 step 18127 loss tensor(2.7895, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18144/125000 [1:11:50<7:19:42,  4.05it/s]

epoch 0 step 18143 loss tensor(3.0347, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18160/125000 [1:11:54<7:17:10,  4.07it/s]

epoch 0 step 18159 loss tensor(3.0430, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18176/125000 [1:11:58<7:15:51,  4.08it/s]

epoch 0 step 18175 loss tensor(2.7491, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18192/125000 [1:12:02<7:14:11,  4.10it/s]

epoch 0 step 18191 loss tensor(3.1103, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18208/125000 [1:12:06<7:14:43,  4.09it/s]

epoch 0 step 18207 loss tensor(2.8262, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18224/125000 [1:12:09<7:14:55,  4.09it/s]

epoch 0 step 18223 loss tensor(2.9332, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18240/125000 [1:12:13<7:14:06,  4.10it/s]

epoch 0 step 18239 loss tensor(3.1467, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18256/125000 [1:12:17<7:14:50,  4.09it/s]

epoch 0 step 18255 loss tensor(2.8580, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18272/125000 [1:12:21<7:19:18,  4.05it/s]

epoch 0 step 18271 loss tensor(2.8854, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18288/125000 [1:12:25<7:18:36,  4.05it/s]

epoch 0 step 18287 loss tensor(2.7989, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18304/125000 [1:12:28<7:16:15,  4.08it/s]

epoch 0 step 18303 loss tensor(2.8011, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18320/125000 [1:12:32<7:15:49,  4.08it/s]

epoch 0 step 18319 loss tensor(2.7805, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18336/125000 [1:12:36<7:15:25,  4.08it/s]

epoch 0 step 18335 loss tensor(3.1054, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18352/125000 [1:12:40<7:14:34,  4.09it/s]

epoch 0 step 18351 loss tensor(2.9189, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18368/125000 [1:12:44<7:14:17,  4.09it/s]

epoch 0 step 18367 loss tensor(2.7905, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18384/125000 [1:12:47<7:14:00,  4.09it/s]

epoch 0 step 18383 loss tensor(2.8773, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18400/125000 [1:12:51<7:14:26,  4.09it/s]

epoch 0 step 18399 loss tensor(2.7799, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18416/125000 [1:12:55<7:11:48,  4.11it/s]

epoch 0 step 18415 loss tensor(2.6633, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18432/125000 [1:12:59<7:18:18,  4.05it/s]

epoch 0 step 18431 loss tensor(2.9654, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18448/125000 [1:13:03<7:17:31,  4.06it/s]

epoch 0 step 18447 loss tensor(2.7916, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18464/125000 [1:13:06<7:15:12,  4.08it/s]

epoch 0 step 18463 loss tensor(2.8205, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18480/125000 [1:13:10<7:14:47,  4.08it/s]

epoch 0 step 18479 loss tensor(2.8845, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18496/125000 [1:13:14<7:14:21,  4.09it/s]

epoch 0 step 18495 loss tensor(2.6492, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18512/125000 [1:13:18<7:13:53,  4.09it/s]

epoch 0 step 18511 loss tensor(2.5735, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18528/125000 [1:13:22<7:12:50,  4.10it/s]

epoch 0 step 18527 loss tensor(2.8990, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18544/125000 [1:13:25<7:12:41,  4.10it/s]

epoch 0 step 18543 loss tensor(2.7685, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18560/125000 [1:13:29<7:13:50,  4.09it/s]

epoch 0 step 18559 loss tensor(2.5807, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18576/125000 [1:13:33<7:18:01,  4.05it/s]

epoch 0 step 18575 loss tensor(2.6185, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18592/125000 [1:13:37<7:16:36,  4.06it/s]

epoch 0 step 18591 loss tensor(2.6197, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18608/125000 [1:13:41<7:16:38,  4.06it/s]

epoch 0 step 18607 loss tensor(2.7437, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18624/125000 [1:13:44<7:15:34,  4.07it/s]

epoch 0 step 18623 loss tensor(2.7367, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18640/125000 [1:13:48<7:13:28,  4.09it/s]

epoch 0 step 18639 loss tensor(2.6473, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18656/125000 [1:13:52<7:13:20,  4.09it/s]

epoch 0 step 18655 loss tensor(2.5932, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18672/125000 [1:13:56<7:12:47,  4.09it/s]

epoch 0 step 18671 loss tensor(2.9011, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18688/125000 [1:14:00<7:13:42,  4.09it/s]

epoch 0 step 18687 loss tensor(2.6063, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18704/125000 [1:14:03<7:10:59,  4.11it/s]

epoch 0 step 18703 loss tensor(2.6723, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18720/125000 [1:14:07<7:11:36,  4.10it/s]

epoch 0 step 18719 loss tensor(2.9151, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▍        | 18736/125000 [1:14:11<7:14:50,  4.07it/s]

epoch 0 step 18735 loss tensor(2.6237, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18752/125000 [1:14:15<7:10:28,  4.11it/s]

epoch 0 step 18751 loss tensor(2.8982, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18768/125000 [1:14:19<7:18:32,  4.04it/s]

epoch 0 step 18767 loss tensor(2.5494, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18784/125000 [1:14:22<7:16:27,  4.06it/s]

epoch 0 step 18783 loss tensor(2.6497, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18800/125000 [1:14:26<7:17:53,  4.04it/s]

epoch 0 step 18799 loss tensor(2.6889, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18816/125000 [1:14:30<7:16:01,  4.06it/s]

epoch 0 step 18815 loss tensor(2.8613, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18832/125000 [1:14:34<7:13:59,  4.08it/s]

epoch 0 step 18831 loss tensor(2.5882, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18848/125000 [1:14:38<7:19:45,  4.02it/s]

epoch 0 step 18847 loss tensor(2.5424, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18864/125000 [1:14:42<7:12:12,  4.09it/s]

epoch 0 step 18863 loss tensor(2.6173, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18880/125000 [1:14:45<7:12:42,  4.09it/s]

epoch 0 step 18879 loss tensor(2.7641, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18896/125000 [1:14:49<7:14:30,  4.07it/s]

epoch 0 step 18895 loss tensor(2.6769, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18912/125000 [1:14:53<7:12:54,  4.08it/s]

epoch 0 step 18911 loss tensor(2.8946, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18928/125000 [1:14:57<7:10:36,  4.11it/s]

epoch 0 step 18927 loss tensor(2.6049, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18944/125000 [1:15:01<7:16:24,  4.05it/s]

epoch 0 step 18943 loss tensor(2.5716, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18960/125000 [1:15:04<7:14:55,  4.06it/s]

epoch 0 step 18959 loss tensor(2.6296, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18976/125000 [1:15:08<7:14:21,  4.07it/s]

epoch 0 step 18975 loss tensor(2.5116, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 18992/125000 [1:15:12<7:12:55,  4.08it/s]

epoch 0 step 18991 loss tensor(2.7570, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19008/125000 [1:15:16<7:10:52,  4.10it/s]

epoch 0 step 19007 loss tensor(2.6696, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19024/125000 [1:15:20<7:11:32,  4.09it/s]

epoch 0 step 19023 loss tensor(2.6306, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19040/125000 [1:15:23<7:11:45,  4.09it/s]

epoch 0 step 19039 loss tensor(3.0390, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19056/125000 [1:15:27<7:11:30,  4.09it/s]

epoch 0 step 19055 loss tensor(2.4345, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19072/125000 [1:15:31<7:12:35,  4.08it/s]

epoch 0 step 19071 loss tensor(2.5422, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19088/125000 [1:15:35<7:11:46,  4.09it/s]

epoch 0 step 19087 loss tensor(2.5458, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19104/125000 [1:15:39<7:10:19,  4.10it/s]

epoch 0 step 19103 loss tensor(2.6382, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19120/125000 [1:15:42<7:19:26,  4.02it/s]

epoch 0 step 19119 loss tensor(2.5674, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19136/125000 [1:15:46<7:18:13,  4.03it/s]

epoch 0 step 19135 loss tensor(2.6715, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19152/125000 [1:15:50<7:17:27,  4.03it/s]

epoch 0 step 19151 loss tensor(2.3628, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19168/125000 [1:15:54<7:14:07,  4.06it/s]

epoch 0 step 19167 loss tensor(2.5322, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19184/125000 [1:15:58<7:14:32,  4.06it/s]

epoch 0 step 19183 loss tensor(2.4828, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19200/125000 [1:16:02<7:12:02,  4.08it/s]

epoch 0 step 19199 loss tensor(2.6398, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19216/125000 [1:16:05<7:10:13,  4.10it/s]

epoch 0 step 19215 loss tensor(2.5599, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19232/125000 [1:16:09<7:11:44,  4.08it/s]

epoch 0 step 19231 loss tensor(2.6624, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19248/125000 [1:16:13<7:10:35,  4.09it/s]

epoch 0 step 19247 loss tensor(2.6032, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19264/125000 [1:16:17<7:13:51,  4.06it/s]

epoch 0 step 19263 loss tensor(2.4782, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19280/125000 [1:16:21<7:13:00,  4.07it/s]

epoch 0 step 19279 loss tensor(2.4751, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19296/125000 [1:16:24<7:10:51,  4.09it/s]

epoch 0 step 19295 loss tensor(2.4516, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19312/125000 [1:16:28<7:09:50,  4.10it/s]

epoch 0 step 19311 loss tensor(2.6076, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19328/125000 [1:16:32<7:11:15,  4.08it/s]

epoch 0 step 19327 loss tensor(2.5241, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19344/125000 [1:16:36<7:11:12,  4.08it/s]

epoch 0 step 19343 loss tensor(2.6490, device='cuda:0', grad_fn=<DivBackward0>)


 15%|█▌        | 19360/125000 [1:16:40<7:12:24,  4.07it/s]

epoch 0 step 19359 loss tensor(2.5339, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19376/125000 [1:16:43<7:09:44,  4.10it/s]

epoch 0 step 19375 loss tensor(2.4435, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19392/125000 [1:16:47<7:12:55,  4.07it/s]

epoch 0 step 19391 loss tensor(2.4946, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19408/125000 [1:16:51<7:13:32,  4.06it/s]

epoch 0 step 19407 loss tensor(2.6056, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19424/125000 [1:16:55<7:13:20,  4.06it/s]

epoch 0 step 19423 loss tensor(2.6023, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19440/125000 [1:16:59<7:11:44,  4.08it/s]

epoch 0 step 19439 loss tensor(2.4801, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19456/125000 [1:17:02<7:09:58,  4.09it/s]

epoch 0 step 19455 loss tensor(2.8085, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19472/125000 [1:17:06<7:09:04,  4.10it/s]

epoch 0 step 19471 loss tensor(2.5870, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19488/125000 [1:17:10<7:08:09,  4.11it/s]

epoch 0 step 19487 loss tensor(2.4732, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19504/125000 [1:17:14<7:10:10,  4.09it/s]

epoch 0 step 19503 loss tensor(2.5119, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19520/125000 [1:17:18<7:10:35,  4.08it/s]

epoch 0 step 19519 loss tensor(2.4600, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19536/125000 [1:17:21<7:14:12,  4.05it/s]

epoch 0 step 19535 loss tensor(2.4348, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19552/125000 [1:17:25<7:14:21,  4.05it/s]

epoch 0 step 19551 loss tensor(2.3304, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19568/125000 [1:17:29<7:12:06,  4.07it/s]

epoch 0 step 19567 loss tensor(2.4911, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19584/125000 [1:17:33<7:10:23,  4.08it/s]

epoch 0 step 19583 loss tensor(2.5845, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19600/125000 [1:17:37<7:09:54,  4.09it/s]

epoch 0 step 19599 loss tensor(2.8939, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19616/125000 [1:17:40<7:09:31,  4.09it/s]

epoch 0 step 19615 loss tensor(2.3763, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19632/125000 [1:17:44<7:08:50,  4.10it/s]

epoch 0 step 19631 loss tensor(2.3219, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19648/125000 [1:17:48<7:09:16,  4.09it/s]

epoch 0 step 19647 loss tensor(2.2550, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19664/125000 [1:17:52<7:09:11,  4.09it/s]

epoch 0 step 19663 loss tensor(2.1920, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19680/125000 [1:17:56<7:08:15,  4.10it/s]

epoch 0 step 19679 loss tensor(2.2768, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19696/125000 [1:17:59<7:13:16,  4.05it/s]

epoch 0 step 19695 loss tensor(2.3536, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19712/125000 [1:18:03<7:11:17,  4.07it/s]

epoch 0 step 19711 loss tensor(2.4054, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19728/125000 [1:18:07<7:10:41,  4.07it/s]

epoch 0 step 19727 loss tensor(2.3013, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19744/125000 [1:18:11<7:08:40,  4.09it/s]

epoch 0 step 19743 loss tensor(2.5380, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19760/125000 [1:18:15<7:11:11,  4.07it/s]

epoch 0 step 19759 loss tensor(2.7962, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19776/125000 [1:18:18<7:08:42,  4.09it/s]

epoch 0 step 19775 loss tensor(2.3127, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19792/125000 [1:18:22<7:09:07,  4.09it/s]

epoch 0 step 19791 loss tensor(2.4459, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19808/125000 [1:18:26<7:08:45,  4.09it/s]

epoch 0 step 19807 loss tensor(2.2735, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19824/125000 [1:18:30<7:07:15,  4.10it/s]

epoch 0 step 19823 loss tensor(2.2378, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19840/125000 [1:18:34<7:09:35,  4.08it/s]

epoch 0 step 19839 loss tensor(2.3558, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19856/125000 [1:18:37<7:08:36,  4.09it/s]

epoch 0 step 19855 loss tensor(2.3758, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19872/125000 [1:18:41<7:14:05,  4.04it/s]

epoch 0 step 19871 loss tensor(2.6889, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19888/125000 [1:18:45<7:16:31,  4.01it/s]

epoch 0 step 19887 loss tensor(2.5570, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19904/125000 [1:18:49<7:13:48,  4.04it/s]

epoch 0 step 19903 loss tensor(2.2890, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19920/125000 [1:18:53<7:11:47,  4.06it/s]

epoch 0 step 19919 loss tensor(2.5652, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19936/125000 [1:18:56<7:10:43,  4.07it/s]

epoch 0 step 19935 loss tensor(2.2888, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19952/125000 [1:19:00<7:08:37,  4.08it/s]

epoch 0 step 19951 loss tensor(2.1277, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19968/125000 [1:19:04<7:08:51,  4.08it/s]

epoch 0 step 19967 loss tensor(2.2068, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 19984/125000 [1:19:08<7:06:32,  4.10it/s]

epoch 0 step 19983 loss tensor(2.3404, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20000/125000 [1:19:12<7:07:43,  4.09it/s]

epoch 0 step 19999 loss tensor(2.1940, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20016/125000 [1:19:15<7:06:56,  4.10it/s]

epoch 0 step 20015 loss tensor(2.4770, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20032/125000 [1:19:19<7:10:39,  4.06it/s]

epoch 0 step 20031 loss tensor(2.3060, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20048/125000 [1:19:23<7:10:11,  4.07it/s]

epoch 0 step 20047 loss tensor(2.3079, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20064/125000 [1:19:27<7:09:22,  4.07it/s]

epoch 0 step 20063 loss tensor(2.3365, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20080/125000 [1:19:31<7:15:15,  4.02it/s]

epoch 0 step 20079 loss tensor(2.3086, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20096/125000 [1:19:34<7:06:31,  4.10it/s]

epoch 0 step 20095 loss tensor(2.1383, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20112/125000 [1:19:38<7:07:13,  4.09it/s]

epoch 0 step 20111 loss tensor(2.3404, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20128/125000 [1:19:42<7:07:15,  4.09it/s]

epoch 0 step 20127 loss tensor(2.2863, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20144/125000 [1:19:46<7:10:14,  4.06it/s]

epoch 0 step 20143 loss tensor(2.1471, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20160/125000 [1:19:50<7:07:32,  4.09it/s]

epoch 0 step 20159 loss tensor(2.3605, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20176/125000 [1:19:54<7:07:52,  4.08it/s]

epoch 0 step 20175 loss tensor(2.2815, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20192/125000 [1:19:57<7:07:30,  4.09it/s]

epoch 0 step 20191 loss tensor(2.3609, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20208/125000 [1:20:01<7:07:45,  4.08it/s]

epoch 0 step 20207 loss tensor(2.2087, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20224/125000 [1:20:05<7:07:19,  4.09it/s]

epoch 0 step 20223 loss tensor(2.1818, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20240/125000 [1:20:09<7:07:26,  4.08it/s]

epoch 0 step 20239 loss tensor(2.2779, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20256/125000 [1:20:12<7:10:52,  4.05it/s]

epoch 0 step 20255 loss tensor(2.0509, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20272/125000 [1:20:16<7:09:27,  4.06it/s]

epoch 0 step 20271 loss tensor(2.2237, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20288/125000 [1:20:20<7:08:27,  4.07it/s]

epoch 0 step 20287 loss tensor(2.1445, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 20304/125000 [1:20:24<7:08:30,  4.07it/s]

epoch 0 step 20303 loss tensor(2.2604, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20320/125000 [1:20:28<7:07:14,  4.08it/s]

epoch 0 step 20319 loss tensor(2.2110, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20336/125000 [1:20:32<7:06:09,  4.09it/s]

epoch 0 step 20335 loss tensor(2.1110, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20352/125000 [1:20:35<7:06:51,  4.09it/s]

epoch 0 step 20351 loss tensor(2.2160, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20368/125000 [1:20:39<7:07:12,  4.08it/s]

epoch 0 step 20367 loss tensor(2.1083, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20384/125000 [1:20:43<7:06:17,  4.09it/s]

epoch 0 step 20383 loss tensor(2.0850, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20400/125000 [1:20:47<7:05:55,  4.09it/s]

epoch 0 step 20399 loss tensor(2.1106, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20416/125000 [1:20:51<7:10:29,  4.05it/s]

epoch 0 step 20415 loss tensor(2.0159, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20432/125000 [1:20:54<7:11:05,  4.04it/s]

epoch 0 step 20431 loss tensor(2.2597, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20448/125000 [1:20:58<7:07:07,  4.08it/s]

epoch 0 step 20447 loss tensor(2.0950, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20464/125000 [1:21:02<7:05:28,  4.09it/s]

epoch 0 step 20463 loss tensor(2.0755, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20480/125000 [1:21:06<7:07:09,  4.08it/s]

epoch 0 step 20479 loss tensor(2.0268, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20496/125000 [1:21:10<7:06:13,  4.09it/s]

epoch 0 step 20495 loss tensor(2.0530, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20512/125000 [1:21:13<7:05:49,  4.09it/s]

epoch 0 step 20511 loss tensor(2.1276, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20528/125000 [1:21:17<7:05:33,  4.09it/s]

epoch 0 step 20527 loss tensor(2.0765, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20544/125000 [1:21:21<7:05:06,  4.10it/s]

epoch 0 step 20543 loss tensor(2.0927, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20560/125000 [1:21:25<7:09:21,  4.05it/s]

epoch 0 step 20559 loss tensor(2.2245, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20576/125000 [1:21:29<7:09:32,  4.05it/s]

epoch 0 step 20575 loss tensor(2.0238, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20592/125000 [1:21:32<7:07:06,  4.07it/s]

epoch 0 step 20591 loss tensor(2.0143, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20608/125000 [1:21:36<7:07:33,  4.07it/s]

epoch 0 step 20607 loss tensor(2.1118, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▋        | 20624/125000 [1:21:40<7:05:16,  4.09it/s]

epoch 0 step 20623 loss tensor(2.1709, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20640/125000 [1:21:44<7:04:12,  4.10it/s]

epoch 0 step 20639 loss tensor(2.1230, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20656/125000 [1:21:48<7:05:52,  4.08it/s]

epoch 0 step 20655 loss tensor(2.0953, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20672/125000 [1:21:51<7:05:11,  4.09it/s]

epoch 0 step 20671 loss tensor(2.3019, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20688/125000 [1:21:55<7:06:15,  4.08it/s]

epoch 0 step 20687 loss tensor(2.1342, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20704/125000 [1:21:59<7:04:50,  4.09it/s]

epoch 0 step 20703 loss tensor(2.3193, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20720/125000 [1:22:03<7:03:37,  4.10it/s]

epoch 0 step 20719 loss tensor(1.9956, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20736/125000 [1:22:07<7:06:34,  4.07it/s]

epoch 0 step 20735 loss tensor(2.4069, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20752/125000 [1:22:10<7:03:45,  4.10it/s]

epoch 0 step 20751 loss tensor(2.0139, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20768/125000 [1:22:14<7:12:42,  4.01it/s]

epoch 0 step 20767 loss tensor(1.9954, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20784/125000 [1:22:18<7:11:16,  4.03it/s]

epoch 0 step 20783 loss tensor(1.9120, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20800/125000 [1:22:22<7:10:57,  4.03it/s]

epoch 0 step 20799 loss tensor(1.9695, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20816/125000 [1:22:26<7:08:51,  4.05it/s]

epoch 0 step 20815 loss tensor(1.9401, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20832/125000 [1:22:30<7:06:51,  4.07it/s]

epoch 0 step 20831 loss tensor(2.0456, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20848/125000 [1:22:33<7:05:31,  4.08it/s]

epoch 0 step 20847 loss tensor(1.9999, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20864/125000 [1:22:37<7:08:29,  4.05it/s]

epoch 0 step 20863 loss tensor(1.9568, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20880/125000 [1:22:41<7:03:51,  4.09it/s]

epoch 0 step 20879 loss tensor(2.0150, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20896/125000 [1:22:45<7:04:40,  4.09it/s]

epoch 0 step 20895 loss tensor(1.9997, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20912/125000 [1:22:49<7:04:12,  4.09it/s]

epoch 0 step 20911 loss tensor(1.9409, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20928/125000 [1:22:52<7:03:38,  4.09it/s]

epoch 0 step 20927 loss tensor(1.8917, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20944/125000 [1:22:56<7:01:11,  4.12it/s]

epoch 0 step 20943 loss tensor(2.1456, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20960/125000 [1:23:00<7:08:09,  4.05it/s]

epoch 0 step 20959 loss tensor(1.9755, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20976/125000 [1:23:04<7:07:16,  4.06it/s]

epoch 0 step 20975 loss tensor(1.8460, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 20992/125000 [1:23:08<7:06:26,  4.06it/s]

epoch 0 step 20991 loss tensor(1.9939, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21008/125000 [1:23:11<7:04:50,  4.08it/s]

epoch 0 step 21007 loss tensor(2.0183, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21024/125000 [1:23:15<7:02:49,  4.10it/s]

epoch 0 step 21023 loss tensor(1.8789, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21040/125000 [1:23:19<7:03:51,  4.09it/s]

epoch 0 step 21039 loss tensor(1.8888, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21056/125000 [1:23:23<7:04:21,  4.08it/s]

epoch 0 step 21055 loss tensor(2.0058, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21072/125000 [1:23:27<7:03:17,  4.09it/s]

epoch 0 step 21071 loss tensor(2.0040, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21088/125000 [1:23:30<7:03:09,  4.09it/s]

epoch 0 step 21087 loss tensor(1.8994, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21104/125000 [1:23:34<7:02:40,  4.10it/s]

epoch 0 step 21103 loss tensor(2.0233, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21120/125000 [1:23:38<7:07:59,  4.05it/s]

epoch 0 step 21119 loss tensor(1.9567, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21136/125000 [1:23:42<7:07:40,  4.05it/s]

epoch 0 step 21135 loss tensor(1.9474, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21152/125000 [1:23:46<7:06:52,  4.05it/s]

epoch 0 step 21151 loss tensor(1.9053, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21168/125000 [1:23:50<7:05:46,  4.06it/s]

epoch 0 step 21167 loss tensor(1.8184, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21184/125000 [1:23:53<7:05:04,  4.07it/s]

epoch 0 step 21183 loss tensor(1.9129, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21200/125000 [1:23:57<7:03:24,  4.09it/s]

epoch 0 step 21199 loss tensor(1.8008, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21216/125000 [1:24:01<7:02:59,  4.09it/s]

epoch 0 step 21215 loss tensor(1.9203, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21232/125000 [1:24:05<7:01:48,  4.10it/s]

epoch 0 step 21231 loss tensor(2.0291, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21248/125000 [1:24:09<7:04:47,  4.07it/s]

epoch 0 step 21247 loss tensor(2.0912, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21264/125000 [1:24:12<7:02:13,  4.09it/s]

epoch 0 step 21263 loss tensor(2.0244, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21280/125000 [1:24:16<7:03:06,  4.09it/s]

epoch 0 step 21279 loss tensor(1.9314, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21296/125000 [1:24:20<7:02:11,  4.09it/s]

epoch 0 step 21295 loss tensor(1.9099, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21312/125000 [1:24:24<7:02:10,  4.09it/s]

epoch 0 step 21311 loss tensor(2.3020, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21328/125000 [1:24:28<7:03:22,  4.08it/s]

epoch 0 step 21327 loss tensor(2.0750, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21344/125000 [1:24:31<7:03:17,  4.08it/s]

epoch 0 step 21343 loss tensor(1.7954, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21360/125000 [1:24:35<7:01:43,  4.10it/s]

epoch 0 step 21359 loss tensor(1.8103, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21376/125000 [1:24:39<7:05:33,  4.06it/s]

epoch 0 step 21375 loss tensor(1.8105, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21392/125000 [1:24:43<7:06:43,  4.05it/s]

epoch 0 step 21391 loss tensor(2.0307, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21408/125000 [1:24:47<7:04:02,  4.07it/s]

epoch 0 step 21407 loss tensor(1.8442, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21424/125000 [1:24:50<7:02:17,  4.09it/s]

epoch 0 step 21423 loss tensor(1.7927, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21440/125000 [1:24:54<7:01:45,  4.09it/s]

epoch 0 step 21439 loss tensor(1.8256, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21456/125000 [1:24:58<7:02:00,  4.09it/s]

epoch 0 step 21455 loss tensor(2.0742, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21472/125000 [1:25:02<7:03:06,  4.08it/s]

epoch 0 step 21471 loss tensor(1.8677, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21488/125000 [1:25:06<7:02:42,  4.08it/s]

epoch 0 step 21487 loss tensor(1.8047, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21504/125000 [1:25:09<7:01:10,  4.10it/s]

epoch 0 step 21503 loss tensor(1.8920, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21520/125000 [1:25:13<7:01:17,  4.09it/s]

epoch 0 step 21519 loss tensor(1.9145, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21536/125000 [1:25:17<7:00:53,  4.10it/s]

epoch 0 step 21535 loss tensor(1.7415, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21552/125000 [1:25:21<7:07:56,  4.03it/s]

epoch 0 step 21551 loss tensor(1.7711, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21568/125000 [1:25:25<7:08:25,  4.02it/s]

epoch 0 step 21567 loss tensor(1.7716, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21584/125000 [1:25:28<7:07:47,  4.03it/s]

epoch 0 step 21583 loss tensor(1.8050, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21600/125000 [1:25:32<7:04:28,  4.06it/s]

epoch 0 step 21599 loss tensor(1.7860, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21616/125000 [1:25:36<7:03:07,  4.07it/s]

epoch 0 step 21615 loss tensor(1.8254, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21632/125000 [1:25:40<7:03:56,  4.06it/s]

epoch 0 step 21631 loss tensor(1.9158, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21648/125000 [1:25:44<7:01:43,  4.08it/s]

epoch 0 step 21647 loss tensor(1.7828, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21664/125000 [1:25:47<7:01:25,  4.09it/s]

epoch 0 step 21663 loss tensor(1.7896, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21680/125000 [1:25:51<7:01:43,  4.08it/s]

epoch 0 step 21679 loss tensor(2.1280, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21696/125000 [1:25:55<7:01:24,  4.09it/s]

epoch 0 step 21695 loss tensor(1.9050, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21712/125000 [1:25:59<7:01:15,  4.09it/s]

epoch 0 step 21711 loss tensor(1.9075, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21728/125000 [1:26:03<7:07:12,  4.03it/s]

epoch 0 step 21727 loss tensor(1.7416, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21744/125000 [1:26:07<7:05:45,  4.04it/s]

epoch 0 step 21743 loss tensor(1.7923, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21760/125000 [1:26:10<7:01:40,  4.08it/s]

epoch 0 step 21759 loss tensor(1.7629, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21776/125000 [1:26:14<7:03:42,  4.06it/s]

epoch 0 step 21775 loss tensor(1.8698, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21792/125000 [1:26:18<7:02:35,  4.07it/s]

epoch 0 step 21791 loss tensor(1.7166, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21808/125000 [1:26:22<7:01:00,  4.09it/s]

epoch 0 step 21807 loss tensor(1.7594, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21824/125000 [1:26:26<6:59:24,  4.10it/s]

epoch 0 step 21823 loss tensor(1.8690, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21840/125000 [1:26:29<7:00:11,  4.09it/s]

epoch 0 step 21839 loss tensor(1.9357, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21856/125000 [1:26:33<7:00:24,  4.09it/s]

epoch 0 step 21855 loss tensor(1.7938, device='cuda:0', grad_fn=<DivBackward0>)


 17%|█▋        | 21872/125000 [1:26:37<6:57:33,  4.12it/s]

epoch 0 step 21871 loss tensor(1.6962, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 21888/125000 [1:26:41<7:04:31,  4.05it/s]

epoch 0 step 21887 loss tensor(1.7728, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 21904/125000 [1:26:45<7:04:17,  4.05it/s]

epoch 0 step 21903 loss tensor(1.7038, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 21920/125000 [1:26:48<7:02:50,  4.06it/s]

epoch 0 step 21919 loss tensor(1.6626, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 21936/125000 [1:26:52<7:00:26,  4.09it/s]

epoch 0 step 21935 loss tensor(1.8140, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 21952/125000 [1:26:56<6:58:40,  4.10it/s]

epoch 0 step 21951 loss tensor(1.6507, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 21968/125000 [1:27:00<7:00:38,  4.08it/s]

epoch 0 step 21967 loss tensor(1.7855, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 21984/125000 [1:27:04<6:59:42,  4.09it/s]

epoch 0 step 21983 loss tensor(1.7353, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22000/125000 [1:27:07<7:00:12,  4.09it/s]

epoch 0 step 21999 loss tensor(1.6834, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22016/125000 [1:27:11<6:59:46,  4.09it/s]

epoch 0 step 22015 loss tensor(1.6694, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22032/125000 [1:27:15<6:59:34,  4.09it/s]

epoch 0 step 22031 loss tensor(1.6856, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22048/125000 [1:27:19<7:03:34,  4.05it/s]

epoch 0 step 22047 loss tensor(1.6922, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22064/125000 [1:27:23<7:03:10,  4.05it/s]

epoch 0 step 22063 loss tensor(1.7103, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22080/125000 [1:27:26<7:03:54,  4.05it/s]

epoch 0 step 22079 loss tensor(1.7086, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22096/125000 [1:27:30<7:00:22,  4.08it/s]

epoch 0 step 22095 loss tensor(1.6269, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22112/125000 [1:27:34<6:59:55,  4.08it/s]

epoch 0 step 22111 loss tensor(1.8624, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22128/125000 [1:27:38<6:58:26,  4.10it/s]

epoch 0 step 22127 loss tensor(1.8078, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22144/125000 [1:27:42<6:59:16,  4.09it/s]

epoch 0 step 22143 loss tensor(1.7490, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22160/125000 [1:27:45<6:58:37,  4.09it/s]

epoch 0 step 22159 loss tensor(2.3141, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22176/125000 [1:27:49<7:00:09,  4.08it/s]

epoch 0 step 22175 loss tensor(1.7979, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22192/125000 [1:27:53<6:59:00,  4.09it/s]

epoch 0 step 22191 loss tensor(1.6613, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22208/125000 [1:27:57<7:03:38,  4.04it/s]

epoch 0 step 22207 loss tensor(1.8443, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22224/125000 [1:28:01<7:01:43,  4.06it/s]

epoch 0 step 22223 loss tensor(1.7131, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22240/125000 [1:28:05<7:01:43,  4.06it/s]

epoch 0 step 22239 loss tensor(1.5178, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22256/125000 [1:28:08<7:01:48,  4.06it/s]

epoch 0 step 22255 loss tensor(1.5356, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22272/125000 [1:28:12<7:01:42,  4.06it/s]

epoch 0 step 22271 loss tensor(1.6773, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22288/125000 [1:28:16<6:57:54,  4.10it/s]

epoch 0 step 22287 loss tensor(1.5835, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22304/125000 [1:28:20<6:58:03,  4.09it/s]

epoch 0 step 22303 loss tensor(1.7697, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22320/125000 [1:28:24<6:58:07,  4.09it/s]

epoch 0 step 22319 loss tensor(1.6918, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22336/125000 [1:28:27<6:58:33,  4.09it/s]

epoch 0 step 22335 loss tensor(1.7307, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22352/125000 [1:28:31<7:02:31,  4.05it/s]

epoch 0 step 22351 loss tensor(1.6078, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22368/125000 [1:28:35<7:03:29,  4.04it/s]

epoch 0 step 22367 loss tensor(1.5836, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22384/125000 [1:28:39<6:59:59,  4.07it/s]

epoch 0 step 22383 loss tensor(1.6225, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22400/125000 [1:28:43<6:59:37,  4.08it/s]

epoch 0 step 22399 loss tensor(1.7494, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22416/125000 [1:28:46<6:58:48,  4.08it/s]

epoch 0 step 22415 loss tensor(2.1135, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22432/125000 [1:28:50<6:56:43,  4.10it/s]

epoch 0 step 22431 loss tensor(1.6233, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22448/125000 [1:28:54<6:58:51,  4.08it/s]

epoch 0 step 22447 loss tensor(1.5636, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22464/125000 [1:28:58<6:58:27,  4.08it/s]

epoch 0 step 22463 loss tensor(1.4550, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22480/125000 [1:29:02<6:58:34,  4.08it/s]

epoch 0 step 22479 loss tensor(1.5295, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22496/125000 [1:29:05<6:56:25,  4.10it/s]

epoch 0 step 22495 loss tensor(1.6825, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22512/125000 [1:29:09<6:58:46,  4.08it/s]

epoch 0 step 22511 loss tensor(1.5342, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22528/125000 [1:29:13<6:55:58,  4.11it/s]

epoch 0 step 22527 loss tensor(1.6184, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22544/125000 [1:29:17<7:04:18,  4.02it/s]

epoch 0 step 22543 loss tensor(1.5525, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22560/125000 [1:29:21<7:04:06,  4.03it/s]

epoch 0 step 22559 loss tensor(1.5922, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22576/125000 [1:29:24<7:00:12,  4.06it/s]

epoch 0 step 22575 loss tensor(1.6047, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22592/125000 [1:29:28<7:00:24,  4.06it/s]

epoch 0 step 22591 loss tensor(1.4496, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22608/125000 [1:29:32<6:59:01,  4.07it/s]

epoch 0 step 22607 loss tensor(1.6594, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22624/125000 [1:29:36<6:57:49,  4.08it/s]

epoch 0 step 22623 loss tensor(1.6400, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22640/125000 [1:29:40<6:57:18,  4.09it/s]

epoch 0 step 22639 loss tensor(1.5441, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22656/125000 [1:29:43<6:55:51,  4.10it/s]

epoch 0 step 22655 loss tensor(1.5784, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22672/125000 [1:29:47<6:58:00,  4.08it/s]

epoch 0 step 22671 loss tensor(1.5306, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22688/125000 [1:29:51<6:56:47,  4.09it/s]

epoch 0 step 22687 loss tensor(1.7549, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22704/125000 [1:29:55<6:59:34,  4.06it/s]

epoch 0 step 22703 loss tensor(1.7352, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22720/125000 [1:29:59<6:59:17,  4.07it/s]

epoch 0 step 22719 loss tensor(1.5102, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22736/125000 [1:30:03<6:57:23,  4.08it/s]

epoch 0 step 22735 loss tensor(1.6250, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22752/125000 [1:30:06<6:57:18,  4.08it/s]

epoch 0 step 22751 loss tensor(1.4708, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22768/125000 [1:30:10<6:56:55,  4.09it/s]

epoch 0 step 22767 loss tensor(1.6932, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22784/125000 [1:30:14<6:57:18,  4.08it/s]

epoch 0 step 22783 loss tensor(1.5081, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22800/125000 [1:30:18<6:56:47,  4.09it/s]

epoch 0 step 22799 loss tensor(1.6497, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22816/125000 [1:30:21<6:57:15,  4.08it/s]

epoch 0 step 22815 loss tensor(1.4694, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22832/125000 [1:30:25<7:00:10,  4.05it/s]

epoch 0 step 22831 loss tensor(1.4839, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22848/125000 [1:30:29<7:00:42,  4.05it/s]

epoch 0 step 22847 loss tensor(1.5377, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22864/125000 [1:30:33<6:59:42,  4.06it/s]

epoch 0 step 22863 loss tensor(1.4185, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22880/125000 [1:30:37<6:58:20,  4.07it/s]

epoch 0 step 22879 loss tensor(1.6935, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22896/125000 [1:30:41<6:56:43,  4.08it/s]

epoch 0 step 22895 loss tensor(1.5825, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22912/125000 [1:30:44<6:55:16,  4.10it/s]

epoch 0 step 22911 loss tensor(1.5538, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22928/125000 [1:30:48<6:55:29,  4.09it/s]

epoch 0 step 22927 loss tensor(1.5764, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22944/125000 [1:30:52<6:56:05,  4.09it/s]

epoch 0 step 22943 loss tensor(1.4150, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22960/125000 [1:30:56<6:55:37,  4.09it/s]

epoch 0 step 22959 loss tensor(1.3379, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22976/125000 [1:31:00<6:55:56,  4.09it/s]

epoch 0 step 22975 loss tensor(1.4099, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 22992/125000 [1:31:03<6:58:56,  4.06it/s]

epoch 0 step 22991 loss tensor(1.4931, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 23008/125000 [1:31:07<6:59:15,  4.05it/s]

epoch 0 step 23007 loss tensor(1.5786, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 23024/125000 [1:31:11<6:58:34,  4.06it/s]

epoch 0 step 23023 loss tensor(1.4348, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 23040/125000 [1:31:15<6:56:11,  4.08it/s]

epoch 0 step 23039 loss tensor(1.3887, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 23056/125000 [1:31:19<6:59:24,  4.05it/s]

epoch 0 step 23055 loss tensor(1.4519, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 23072/125000 [1:31:22<6:55:09,  4.09it/s]

epoch 0 step 23071 loss tensor(1.6164, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 23088/125000 [1:31:26<6:56:15,  4.08it/s]

epoch 0 step 23087 loss tensor(1.3477, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 23104/125000 [1:31:30<6:54:55,  4.09it/s]

epoch 0 step 23103 loss tensor(1.5512, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 23120/125000 [1:31:34<6:54:59,  4.09it/s]

epoch 0 step 23119 loss tensor(1.4981, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23136/125000 [1:31:38<6:58:55,  4.05it/s]

epoch 0 step 23135 loss tensor(1.3465, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23152/125000 [1:31:41<6:58:32,  4.06it/s]

epoch 0 step 23151 loss tensor(1.4005, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23168/125000 [1:31:45<6:57:14,  4.07it/s]

epoch 0 step 23167 loss tensor(1.6003, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23184/125000 [1:31:49<6:56:17,  4.08it/s]

epoch 0 step 23183 loss tensor(1.4213, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23200/125000 [1:31:53<6:55:15,  4.09it/s]

epoch 0 step 23199 loss tensor(1.3981, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23216/125000 [1:31:57<6:54:41,  4.09it/s]

epoch 0 step 23215 loss tensor(1.4481, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23232/125000 [1:32:00<6:54:13,  4.09it/s]

epoch 0 step 23231 loss tensor(1.3298, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23248/125000 [1:32:04<6:56:03,  4.08it/s]

epoch 0 step 23247 loss tensor(1.5639, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23264/125000 [1:32:08<6:53:35,  4.10it/s]

epoch 0 step 23263 loss tensor(1.4883, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23280/125000 [1:32:12<6:53:21,  4.10it/s]

epoch 0 step 23279 loss tensor(1.4145, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23296/125000 [1:32:16<6:57:52,  4.06it/s]

epoch 0 step 23295 loss tensor(1.4988, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23312/125000 [1:32:19<6:58:07,  4.05it/s]

epoch 0 step 23311 loss tensor(1.4224, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23328/125000 [1:32:23<6:57:11,  4.06it/s]

epoch 0 step 23327 loss tensor(1.4782, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23344/125000 [1:32:27<6:55:50,  4.07it/s]

epoch 0 step 23343 loss tensor(1.4500, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23360/125000 [1:32:31<6:55:27,  4.08it/s]

epoch 0 step 23359 loss tensor(1.4976, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23376/125000 [1:32:35<6:53:15,  4.10it/s]

epoch 0 step 23375 loss tensor(1.4096, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23392/125000 [1:32:39<6:55:29,  4.08it/s]

epoch 0 step 23391 loss tensor(1.3179, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23408/125000 [1:32:42<6:53:34,  4.09it/s]

epoch 0 step 23407 loss tensor(1.4792, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▊        | 23424/125000 [1:32:46<6:53:24,  4.10it/s]

epoch 0 step 23423 loss tensor(1.4321, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23440/125000 [1:32:50<6:58:46,  4.04it/s]

epoch 0 step 23439 loss tensor(1.3836, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23456/125000 [1:32:54<6:57:50,  4.05it/s]

epoch 0 step 23455 loss tensor(1.3666, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23472/125000 [1:32:58<6:55:56,  4.07it/s]

epoch 0 step 23471 loss tensor(1.3714, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23488/125000 [1:33:01<6:55:44,  4.07it/s]

epoch 0 step 23487 loss tensor(1.5313, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23504/125000 [1:33:05<6:53:21,  4.09it/s]

epoch 0 step 23503 loss tensor(1.2216, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23520/125000 [1:33:09<6:53:18,  4.09it/s]

epoch 0 step 23519 loss tensor(1.5283, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23536/125000 [1:33:13<6:52:39,  4.10it/s]

epoch 0 step 23535 loss tensor(1.6012, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23552/125000 [1:33:17<6:52:50,  4.10it/s]

epoch 0 step 23551 loss tensor(1.3751, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23568/125000 [1:33:20<6:53:01,  4.09it/s]

epoch 0 step 23567 loss tensor(1.4682, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23584/125000 [1:33:24<6:52:18,  4.10it/s]

epoch 0 step 23583 loss tensor(1.5877, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23600/125000 [1:33:28<6:56:42,  4.06it/s]

epoch 0 step 23599 loss tensor(1.3136, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23616/125000 [1:33:32<6:56:16,  4.06it/s]

epoch 0 step 23615 loss tensor(1.3260, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23632/125000 [1:33:36<6:55:20,  4.07it/s]

epoch 0 step 23631 loss tensor(1.4591, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23648/125000 [1:33:39<6:53:00,  4.09it/s]

epoch 0 step 23647 loss tensor(1.3957, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23664/125000 [1:33:43<6:54:01,  4.08it/s]

epoch 0 step 23663 loss tensor(1.6490, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23680/125000 [1:33:47<6:52:14,  4.10it/s]

epoch 0 step 23679 loss tensor(1.2922, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23696/125000 [1:33:51<6:52:06,  4.10it/s]

epoch 0 step 23695 loss tensor(1.3723, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23712/125000 [1:33:55<6:53:44,  4.08it/s]

epoch 0 step 23711 loss tensor(1.5110, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23728/125000 [1:33:58<6:53:29,  4.08it/s]

epoch 0 step 23727 loss tensor(1.2702, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23744/125000 [1:34:02<6:57:35,  4.04it/s]

epoch 0 step 23743 loss tensor(1.2492, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23760/125000 [1:34:06<6:56:09,  4.05it/s]

epoch 0 step 23759 loss tensor(1.2308, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23776/125000 [1:34:10<6:54:46,  4.07it/s]

epoch 0 step 23775 loss tensor(1.3579, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23792/125000 [1:34:14<6:54:55,  4.07it/s]

epoch 0 step 23791 loss tensor(1.2775, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23808/125000 [1:34:17<6:53:38,  4.08it/s]

epoch 0 step 23807 loss tensor(1.3709, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23824/125000 [1:34:21<6:51:56,  4.09it/s]

epoch 0 step 23823 loss tensor(1.2811, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23840/125000 [1:34:25<6:52:12,  4.09it/s]

epoch 0 step 23839 loss tensor(1.4080, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23856/125000 [1:34:29<6:52:23,  4.09it/s]

epoch 0 step 23855 loss tensor(1.2592, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23872/125000 [1:34:33<6:52:44,  4.08it/s]

epoch 0 step 23871 loss tensor(1.3906, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23888/125000 [1:34:36<6:51:17,  4.10it/s]

epoch 0 step 23887 loss tensor(1.2383, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23904/125000 [1:34:40<6:55:24,  4.06it/s]

epoch 0 step 23903 loss tensor(1.5757, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23920/125000 [1:34:44<6:56:28,  4.05it/s]

epoch 0 step 23919 loss tensor(1.2857, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23936/125000 [1:34:48<6:54:12,  4.07it/s]

epoch 0 step 23935 loss tensor(1.3844, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23952/125000 [1:34:52<6:51:47,  4.09it/s]

epoch 0 step 23951 loss tensor(1.2095, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23968/125000 [1:34:55<6:51:11,  4.10it/s]

epoch 0 step 23967 loss tensor(1.2566, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 23984/125000 [1:34:59<6:51:04,  4.10it/s]

epoch 0 step 23983 loss tensor(1.2253, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24000/125000 [1:35:03<6:51:52,  4.09it/s]

epoch 0 step 23999 loss tensor(1.2988, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24016/125000 [1:35:07<6:52:47,  4.08it/s]

epoch 0 step 24015 loss tensor(1.3368, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24032/125000 [1:35:11<6:50:16,  4.10it/s]

epoch 0 step 24031 loss tensor(1.2664, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24048/125000 [1:35:14<6:55:56,  4.05it/s]

epoch 0 step 24047 loss tensor(1.3314, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24064/125000 [1:35:18<6:54:28,  4.06it/s]

epoch 0 step 24063 loss tensor(1.2183, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24080/125000 [1:35:22<6:51:55,  4.08it/s]

epoch 0 step 24079 loss tensor(1.4368, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24096/125000 [1:35:26<6:52:29,  4.08it/s]

epoch 0 step 24095 loss tensor(1.5493, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24112/125000 [1:35:30<6:51:56,  4.08it/s]

epoch 0 step 24111 loss tensor(1.4058, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24128/125000 [1:35:33<6:50:19,  4.10it/s]

epoch 0 step 24127 loss tensor(1.4606, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24144/125000 [1:35:37<6:50:48,  4.09it/s]

epoch 0 step 24143 loss tensor(1.2540, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24160/125000 [1:35:41<6:50:37,  4.09it/s]

epoch 0 step 24159 loss tensor(1.3893, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24176/125000 [1:35:45<6:49:56,  4.10it/s]

epoch 0 step 24175 loss tensor(1.1629, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24192/125000 [1:35:49<6:50:07,  4.10it/s]

epoch 0 step 24191 loss tensor(1.3832, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24208/125000 [1:35:52<6:54:35,  4.05it/s]

epoch 0 step 24207 loss tensor(1.3112, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24224/125000 [1:35:56<6:54:37,  4.05it/s]

epoch 0 step 24223 loss tensor(1.2701, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24240/125000 [1:36:00<6:51:35,  4.08it/s]

epoch 0 step 24239 loss tensor(1.2538, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24256/125000 [1:36:04<6:51:12,  4.08it/s]

epoch 0 step 24255 loss tensor(1.2168, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24272/125000 [1:36:08<6:52:08,  4.07it/s]

epoch 0 step 24271 loss tensor(1.1957, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24288/125000 [1:36:11<6:50:24,  4.09it/s]

epoch 0 step 24287 loss tensor(1.3013, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24304/125000 [1:36:15<6:49:46,  4.10it/s]

epoch 0 step 24303 loss tensor(1.3635, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24320/125000 [1:36:19<6:50:56,  4.08it/s]

epoch 0 step 24319 loss tensor(1.2949, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24336/125000 [1:36:23<6:50:48,  4.08it/s]

epoch 0 step 24335 loss tensor(1.2957, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24352/125000 [1:36:27<6:54:58,  4.04it/s]

epoch 0 step 24351 loss tensor(1.2064, device='cuda:0', grad_fn=<DivBackward0>)


 19%|█▉        | 24368/125000 [1:36:30<6:54:11,  4.05it/s]

epoch 0 step 24367 loss tensor(1.2327, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24384/125000 [1:36:34<6:51:44,  4.07it/s]

epoch 0 step 24383 loss tensor(1.3780, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24400/125000 [1:36:38<6:52:17,  4.07it/s]

epoch 0 step 24399 loss tensor(1.4322, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24416/125000 [1:36:42<6:50:30,  4.08it/s]

epoch 0 step 24415 loss tensor(1.2760, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24432/125000 [1:36:46<6:50:42,  4.08it/s]

epoch 0 step 24431 loss tensor(1.1582, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24448/125000 [1:36:50<6:49:43,  4.09it/s]

epoch 0 step 24447 loss tensor(1.1882, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24464/125000 [1:36:53<6:48:42,  4.10it/s]

epoch 0 step 24463 loss tensor(1.0676, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24480/125000 [1:36:57<6:48:22,  4.10it/s]

epoch 0 step 24479 loss tensor(1.0708, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24496/125000 [1:37:01<6:50:22,  4.08it/s]

epoch 0 step 24495 loss tensor(1.1702, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24512/125000 [1:37:05<6:53:05,  4.05it/s]

epoch 0 step 24511 loss tensor(1.1737, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24528/125000 [1:37:09<6:52:13,  4.06it/s]

epoch 0 step 24527 loss tensor(1.2105, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24544/125000 [1:37:12<6:53:13,  4.05it/s]

epoch 0 step 24543 loss tensor(1.2290, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24560/125000 [1:37:16<6:49:13,  4.09it/s]

epoch 0 step 24559 loss tensor(1.4152, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24576/125000 [1:37:20<6:51:18,  4.07it/s]

epoch 0 step 24575 loss tensor(1.0512, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24592/125000 [1:37:24<6:48:25,  4.10it/s]

epoch 0 step 24591 loss tensor(1.1568, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24608/125000 [1:37:28<6:48:24,  4.10it/s]

epoch 0 step 24607 loss tensor(1.3252, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24624/125000 [1:37:31<6:49:09,  4.09it/s]

epoch 0 step 24623 loss tensor(1.1531, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24640/125000 [1:37:35<6:49:16,  4.09it/s]

epoch 0 step 24639 loss tensor(1.3271, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24656/125000 [1:37:39<6:47:22,  4.11it/s]

epoch 0 step 24655 loss tensor(1.3960, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24672/125000 [1:37:43<6:53:10,  4.05it/s]

epoch 0 step 24671 loss tensor(1.2951, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24688/125000 [1:37:47<6:51:44,  4.06it/s]

epoch 0 step 24687 loss tensor(1.1426, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24704/125000 [1:37:50<6:50:26,  4.07it/s]

epoch 0 step 24703 loss tensor(1.1552, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24720/125000 [1:37:54<6:49:46,  4.08it/s]

epoch 0 step 24719 loss tensor(1.1629, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24736/125000 [1:37:58<6:35:54,  4.22it/s]

epoch 0 step 24735 loss tensor(1.1923, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24752/125000 [1:38:02<6:46:15,  4.11it/s]

epoch 0 step 24751 loss tensor(1.2340, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24768/125000 [1:38:05<6:46:26,  4.11it/s]

epoch 0 step 24767 loss tensor(1.2095, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24784/125000 [1:38:09<6:45:19,  4.12it/s]

epoch 0 step 24783 loss tensor(1.2460, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24800/125000 [1:38:13<6:45:13,  4.12it/s]

epoch 0 step 24799 loss tensor(1.2122, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24816/125000 [1:38:17<6:46:07,  4.11it/s]

epoch 0 step 24815 loss tensor(1.1860, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24832/125000 [1:38:20<6:47:28,  4.10it/s]

epoch 0 step 24831 loss tensor(1.4041, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24848/125000 [1:38:24<6:47:25,  4.10it/s]

epoch 0 step 24847 loss tensor(1.0965, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24864/125000 [1:38:28<6:47:51,  4.09it/s]

epoch 0 step 24863 loss tensor(1.1330, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24880/125000 [1:38:32<6:48:09,  4.09it/s]

epoch 0 step 24879 loss tensor(1.3098, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24896/125000 [1:38:36<6:46:43,  4.10it/s]

epoch 0 step 24895 loss tensor(1.8620, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24912/125000 [1:38:39<6:47:28,  4.09it/s]

epoch 0 step 24911 loss tensor(1.3661, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24928/125000 [1:38:43<6:53:41,  4.03it/s]

epoch 0 step 24927 loss tensor(1.1616, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24944/125000 [1:38:47<6:50:47,  4.06it/s]

epoch 0 step 24943 loss tensor(1.2276, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24960/125000 [1:38:51<6:49:26,  4.07it/s]

epoch 0 step 24959 loss tensor(1.1480, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24976/125000 [1:38:55<6:49:22,  4.07it/s]

epoch 0 step 24975 loss tensor(1.1532, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 24992/125000 [1:38:58<6:01:01,  4.62it/s]

epoch 0 step 24991 loss tensor(1.1141, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25008/125000 [1:39:02<6:00:00,  4.63it/s]

epoch 0 step 25007 loss tensor(1.1460, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25024/125000 [1:39:05<5:58:51,  4.64it/s]

epoch 0 step 25023 loss tensor(1.2505, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25040/125000 [1:39:08<5:59:12,  4.64it/s]

epoch 0 step 25039 loss tensor(1.1412, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25056/125000 [1:39:12<5:58:52,  4.64it/s]

epoch 0 step 25055 loss tensor(1.2052, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25072/125000 [1:39:15<5:59:15,  4.64it/s]

epoch 0 step 25071 loss tensor(0.9818, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25088/125000 [1:39:18<5:58:53,  4.64it/s]

epoch 0 step 25087 loss tensor(1.1524, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25104/125000 [1:39:22<5:58:50,  4.64it/s]

epoch 0 step 25103 loss tensor(1.1120, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25120/125000 [1:39:25<5:58:47,  4.64it/s]

epoch 0 step 25119 loss tensor(1.0348, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25136/125000 [1:39:28<5:59:10,  4.63it/s]

epoch 0 step 25135 loss tensor(1.0053, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25152/125000 [1:39:32<5:59:03,  4.63it/s]

epoch 0 step 25151 loss tensor(1.1408, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25168/125000 [1:39:35<5:58:23,  4.64it/s]

epoch 0 step 25167 loss tensor(1.0315, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25184/125000 [1:39:38<5:58:30,  4.64it/s]

epoch 0 step 25183 loss tensor(1.1260, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25200/125000 [1:39:42<5:58:12,  4.64it/s]

epoch 0 step 25199 loss tensor(1.3868, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25216/125000 [1:39:45<5:58:28,  4.64it/s]

epoch 0 step 25215 loss tensor(1.1710, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25232/125000 [1:39:48<5:58:10,  4.64it/s]

epoch 0 step 25231 loss tensor(1.0305, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25248/125000 [1:39:52<5:58:29,  4.64it/s]

epoch 0 step 25247 loss tensor(1.1159, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25264/125000 [1:39:55<5:58:23,  4.64it/s]

epoch 0 step 25263 loss tensor(0.9920, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25280/125000 [1:39:58<5:58:12,  4.64it/s]

epoch 0 step 25279 loss tensor(1.1883, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25296/125000 [1:40:02<5:58:16,  4.64it/s]

epoch 0 step 25295 loss tensor(1.0648, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25312/125000 [1:40:05<5:58:23,  4.64it/s]

epoch 0 step 25311 loss tensor(0.9788, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25328/125000 [1:40:08<5:57:51,  4.64it/s]

epoch 0 step 25327 loss tensor(1.1336, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25344/125000 [1:40:12<5:57:43,  4.64it/s]

epoch 0 step 25343 loss tensor(0.9264, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25360/125000 [1:40:15<5:57:43,  4.64it/s]

epoch 0 step 25359 loss tensor(1.3990, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25376/125000 [1:40:18<5:58:27,  4.63it/s]

epoch 0 step 25375 loss tensor(1.1038, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25392/125000 [1:40:22<5:58:08,  4.64it/s]

epoch 0 step 25391 loss tensor(1.1279, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25408/125000 [1:40:25<5:57:26,  4.64it/s]

epoch 0 step 25407 loss tensor(1.1623, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25424/125000 [1:40:28<5:58:02,  4.64it/s]

epoch 0 step 25423 loss tensor(1.1254, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25440/125000 [1:40:32<5:57:37,  4.64it/s]

epoch 0 step 25439 loss tensor(1.0092, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25456/125000 [1:40:35<5:57:29,  4.64it/s]

epoch 0 step 25455 loss tensor(1.0970, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25472/125000 [1:40:39<5:57:22,  4.64it/s]

epoch 0 step 25471 loss tensor(1.0435, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25488/125000 [1:40:42<5:57:23,  4.64it/s]

epoch 0 step 25487 loss tensor(1.1096, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25504/125000 [1:40:45<5:57:25,  4.64it/s]

epoch 0 step 25503 loss tensor(0.9340, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25520/125000 [1:40:49<5:57:41,  4.64it/s]

epoch 0 step 25519 loss tensor(1.2327, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25536/125000 [1:40:52<5:57:32,  4.64it/s]

epoch 0 step 25535 loss tensor(1.0798, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25552/125000 [1:40:55<5:57:25,  4.64it/s]

epoch 0 step 25551 loss tensor(1.2466, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25568/125000 [1:40:59<5:57:18,  4.64it/s]

epoch 0 step 25567 loss tensor(1.2155, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25584/125000 [1:41:02<5:57:18,  4.64it/s]

epoch 0 step 25583 loss tensor(1.1535, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25600/125000 [1:41:05<5:57:39,  4.63it/s]

epoch 0 step 25599 loss tensor(1.1250, device='cuda:0', grad_fn=<DivBackward0>)


 20%|██        | 25616/125000 [1:41:09<5:57:01,  4.64it/s]

epoch 0 step 25615 loss tensor(0.9409, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25632/125000 [1:41:12<5:57:09,  4.64it/s]

epoch 0 step 25631 loss tensor(1.0908, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25648/125000 [1:41:15<5:57:00,  4.64it/s]

epoch 0 step 25647 loss tensor(1.0258, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25664/125000 [1:41:19<5:57:24,  4.63it/s]

epoch 0 step 25663 loss tensor(1.0809, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25680/125000 [1:41:22<5:56:59,  4.64it/s]

epoch 0 step 25679 loss tensor(1.2443, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25696/125000 [1:41:25<5:56:44,  4.64it/s]

epoch 0 step 25695 loss tensor(1.3194, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25712/125000 [1:41:29<5:56:42,  4.64it/s]

epoch 0 step 25711 loss tensor(1.1338, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25728/125000 [1:41:32<5:56:45,  4.64it/s]

epoch 0 step 25727 loss tensor(1.2680, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25744/125000 [1:41:35<5:56:07,  4.65it/s]

epoch 0 step 25743 loss tensor(1.1135, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25760/125000 [1:41:39<5:56:46,  4.64it/s]

epoch 0 step 25759 loss tensor(1.0875, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25776/125000 [1:41:42<5:56:43,  4.64it/s]

epoch 0 step 25775 loss tensor(1.0549, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25792/125000 [1:41:45<5:56:17,  4.64it/s]

epoch 0 step 25791 loss tensor(0.9831, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25808/125000 [1:41:49<5:56:06,  4.64it/s]

epoch 0 step 25807 loss tensor(1.1023, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25824/125000 [1:41:52<5:56:11,  4.64it/s]

epoch 0 step 25823 loss tensor(1.0473, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25840/125000 [1:41:55<5:56:22,  4.64it/s]

epoch 0 step 25839 loss tensor(0.9509, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25856/125000 [1:41:59<5:56:02,  4.64it/s]

epoch 0 step 25855 loss tensor(1.2543, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25872/125000 [1:42:02<5:56:25,  4.64it/s]

epoch 0 step 25871 loss tensor(0.9910, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25888/125000 [1:42:05<5:55:51,  4.64it/s]

epoch 0 step 25887 loss tensor(1.0805, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25904/125000 [1:42:09<5:56:23,  4.63it/s]

epoch 0 step 25903 loss tensor(1.1380, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25920/125000 [1:42:12<5:55:47,  4.64it/s]

epoch 0 step 25919 loss tensor(1.4371, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25936/125000 [1:42:16<5:56:11,  4.64it/s]

epoch 0 step 25935 loss tensor(1.0517, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25952/125000 [1:42:19<5:55:40,  4.64it/s]

epoch 0 step 25951 loss tensor(1.0684, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25968/125000 [1:42:22<5:55:33,  4.64it/s]

epoch 0 step 25967 loss tensor(1.2628, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 25984/125000 [1:42:26<5:56:13,  4.63it/s]

epoch 0 step 25983 loss tensor(1.0358, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26000/125000 [1:42:29<5:55:21,  4.64it/s]

epoch 0 step 25999 loss tensor(0.8762, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26016/125000 [1:42:32<5:55:31,  4.64it/s]

epoch 0 step 26015 loss tensor(1.0762, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26032/125000 [1:42:36<5:55:15,  4.64it/s]

epoch 0 step 26031 loss tensor(1.1125, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26048/125000 [1:42:39<5:55:22,  4.64it/s]

epoch 0 step 26047 loss tensor(1.0916, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26064/125000 [1:42:42<5:55:37,  4.64it/s]

epoch 0 step 26063 loss tensor(0.9961, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26080/125000 [1:42:46<5:54:57,  4.64it/s]

epoch 0 step 26079 loss tensor(1.0459, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26096/125000 [1:42:49<5:55:32,  4.64it/s]

epoch 0 step 26095 loss tensor(1.0616, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26112/125000 [1:42:52<5:55:09,  4.64it/s]

epoch 0 step 26111 loss tensor(1.0036, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26128/125000 [1:42:56<5:55:52,  4.63it/s]

epoch 0 step 26127 loss tensor(0.9918, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26144/125000 [1:42:59<5:54:37,  4.65it/s]

epoch 0 step 26143 loss tensor(0.9133, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26160/125000 [1:43:02<5:55:33,  4.63it/s]

epoch 0 step 26159 loss tensor(1.1360, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26176/125000 [1:43:06<5:54:45,  4.64it/s]

epoch 0 step 26175 loss tensor(0.8909, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26192/125000 [1:43:09<5:54:52,  4.64it/s]

epoch 0 step 26191 loss tensor(0.9664, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26208/125000 [1:43:12<5:54:43,  4.64it/s]

epoch 0 step 26207 loss tensor(1.1179, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26224/125000 [1:43:16<5:55:16,  4.63it/s]

epoch 0 step 26223 loss tensor(0.9475, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26240/125000 [1:43:19<5:54:43,  4.64it/s]

epoch 0 step 26239 loss tensor(1.0299, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26256/125000 [1:43:22<5:54:52,  4.64it/s]

epoch 0 step 26255 loss tensor(1.1437, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26272/125000 [1:43:26<5:54:42,  4.64it/s]

epoch 0 step 26271 loss tensor(0.9692, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26288/125000 [1:43:29<5:54:15,  4.64it/s]

epoch 0 step 26287 loss tensor(1.0178, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26304/125000 [1:43:32<5:54:03,  4.65it/s]

epoch 0 step 26303 loss tensor(1.1181, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26320/125000 [1:43:36<5:54:31,  4.64it/s]

epoch 0 step 26319 loss tensor(1.0988, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26336/125000 [1:43:39<5:54:37,  4.64it/s]

epoch 0 step 26335 loss tensor(0.9323, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26352/125000 [1:43:42<5:54:44,  4.63it/s]

epoch 0 step 26351 loss tensor(0.9504, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26368/125000 [1:43:46<5:54:14,  4.64it/s]

epoch 0 step 26367 loss tensor(0.9220, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26384/125000 [1:43:49<5:54:06,  4.64it/s]

epoch 0 step 26383 loss tensor(0.9309, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26400/125000 [1:43:53<5:54:23,  4.64it/s]

epoch 0 step 26399 loss tensor(0.9489, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26416/125000 [1:43:56<5:53:46,  4.64it/s]

epoch 0 step 26415 loss tensor(0.8786, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26432/125000 [1:43:59<5:54:08,  4.64it/s]

epoch 0 step 26431 loss tensor(1.0501, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26448/125000 [1:44:03<5:54:04,  4.64it/s]

epoch 0 step 26447 loss tensor(0.9202, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26464/125000 [1:44:06<5:53:43,  4.64it/s]

epoch 0 step 26463 loss tensor(1.1164, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26480/125000 [1:44:09<5:54:26,  4.63it/s]

epoch 0 step 26479 loss tensor(0.9504, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26496/125000 [1:44:13<5:53:25,  4.65it/s]

epoch 0 step 26495 loss tensor(0.9073, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26512/125000 [1:44:16<5:54:10,  4.63it/s]

epoch 0 step 26511 loss tensor(0.9793, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26528/125000 [1:44:19<5:54:35,  4.63it/s]

epoch 0 step 26527 loss tensor(0.8913, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26544/125000 [1:44:23<5:55:31,  4.62it/s]

epoch 0 step 26543 loss tensor(0.9146, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██        | 26560/125000 [1:44:26<5:53:32,  4.64it/s]

epoch 0 step 26559 loss tensor(0.8525, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26576/125000 [1:44:29<5:54:00,  4.63it/s]

epoch 0 step 26575 loss tensor(1.0506, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26592/125000 [1:44:33<5:54:31,  4.63it/s]

epoch 0 step 26591 loss tensor(1.0605, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26608/125000 [1:44:36<5:54:03,  4.63it/s]

epoch 0 step 26607 loss tensor(0.9648, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26624/125000 [1:44:39<5:54:26,  4.63it/s]

epoch 0 step 26623 loss tensor(0.9603, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26640/125000 [1:44:43<5:54:24,  4.63it/s]

epoch 0 step 26639 loss tensor(0.8876, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26656/125000 [1:44:46<5:53:54,  4.63it/s]

epoch 0 step 26655 loss tensor(1.1442, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26672/125000 [1:44:49<5:53:53,  4.63it/s]

epoch 0 step 26671 loss tensor(0.9571, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26688/125000 [1:44:53<5:54:04,  4.63it/s]

epoch 0 step 26687 loss tensor(0.9568, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26704/125000 [1:44:56<5:54:07,  4.63it/s]

epoch 0 step 26703 loss tensor(1.0472, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26720/125000 [1:45:00<5:54:16,  4.62it/s]

epoch 0 step 26719 loss tensor(1.0120, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26736/125000 [1:45:03<5:53:30,  4.63it/s]

epoch 0 step 26735 loss tensor(0.9969, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26752/125000 [1:45:06<5:54:05,  4.62it/s]

epoch 0 step 26751 loss tensor(0.8650, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26768/125000 [1:45:10<5:53:46,  4.63it/s]

epoch 0 step 26767 loss tensor(1.0257, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26784/125000 [1:45:13<5:53:45,  4.63it/s]

epoch 0 step 26783 loss tensor(0.9495, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26800/125000 [1:45:16<5:53:42,  4.63it/s]

epoch 0 step 26799 loss tensor(1.0017, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26816/125000 [1:45:20<5:54:17,  4.62it/s]

epoch 0 step 26815 loss tensor(0.9029, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26832/125000 [1:45:23<5:53:36,  4.63it/s]

epoch 0 step 26831 loss tensor(1.0793, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26848/125000 [1:45:26<5:53:08,  4.63it/s]

epoch 0 step 26847 loss tensor(1.0973, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 26864/125000 [1:45:30<5:53:01,  4.63it/s]

epoch 0 step 26863 loss tensor(1.1287, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 26880/125000 [1:45:33<5:53:11,  4.63it/s]

epoch 0 step 26879 loss tensor(0.9652, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 26896/125000 [1:45:36<5:53:19,  4.63it/s]

epoch 0 step 26895 loss tensor(0.9221, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 26912/125000 [1:45:40<5:53:07,  4.63it/s]

epoch 0 step 26911 loss tensor(0.9104, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 26928/125000 [1:45:43<5:53:22,  4.63it/s]

epoch 0 step 26927 loss tensor(0.9551, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 26944/125000 [1:45:46<5:53:23,  4.62it/s]

epoch 0 step 26943 loss tensor(0.8463, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 26960/125000 [1:45:50<5:53:00,  4.63it/s]

epoch 0 step 26959 loss tensor(0.9681, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 26976/125000 [1:45:53<5:52:46,  4.63it/s]

epoch 0 step 26975 loss tensor(0.8969, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 26992/125000 [1:45:56<5:53:07,  4.63it/s]

epoch 0 step 26991 loss tensor(1.1356, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27008/125000 [1:46:00<5:52:59,  4.63it/s]

epoch 0 step 27007 loss tensor(0.8718, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27024/125000 [1:46:03<5:52:49,  4.63it/s]

epoch 0 step 27023 loss tensor(0.8824, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27040/125000 [1:46:06<5:52:04,  4.64it/s]

epoch 0 step 27039 loss tensor(0.9503, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27056/125000 [1:46:10<5:53:00,  4.62it/s]

epoch 0 step 27055 loss tensor(0.8748, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27072/125000 [1:46:13<5:52:06,  4.64it/s]

epoch 0 step 27071 loss tensor(1.1983, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27088/125000 [1:46:17<5:52:10,  4.63it/s]

epoch 0 step 27087 loss tensor(0.8293, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27104/125000 [1:46:20<5:52:24,  4.63it/s]

epoch 0 step 27103 loss tensor(1.0648, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27120/125000 [1:46:23<5:52:07,  4.63it/s]

epoch 0 step 27119 loss tensor(0.8560, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27136/125000 [1:46:27<5:52:15,  4.63it/s]

epoch 0 step 27135 loss tensor(0.9297, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27152/125000 [1:46:30<5:52:05,  4.63it/s]

epoch 0 step 27151 loss tensor(0.9934, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27168/125000 [1:46:33<5:51:57,  4.63it/s]

epoch 0 step 27167 loss tensor(1.0363, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27184/125000 [1:46:37<5:51:55,  4.63it/s]

epoch 0 step 27183 loss tensor(0.8785, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27200/125000 [1:46:40<5:52:08,  4.63it/s]

epoch 0 step 27199 loss tensor(0.9922, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27216/125000 [1:46:43<5:51:48,  4.63it/s]

epoch 0 step 27215 loss tensor(1.0454, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27232/125000 [1:46:47<5:52:26,  4.62it/s]

epoch 0 step 27231 loss tensor(0.8961, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27248/125000 [1:46:50<5:51:59,  4.63it/s]

epoch 0 step 27247 loss tensor(1.0844, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27264/125000 [1:46:53<5:51:27,  4.63it/s]

epoch 0 step 27263 loss tensor(0.9807, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27280/125000 [1:46:57<5:51:46,  4.63it/s]

epoch 0 step 27279 loss tensor(1.0928, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27296/125000 [1:47:00<5:51:38,  4.63it/s]

epoch 0 step 27295 loss tensor(0.9853, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27312/125000 [1:47:03<5:51:21,  4.63it/s]

epoch 0 step 27311 loss tensor(0.8796, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27328/125000 [1:47:07<5:51:38,  4.63it/s]

epoch 0 step 27327 loss tensor(0.9678, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27344/125000 [1:47:10<5:51:37,  4.63it/s]

epoch 0 step 27343 loss tensor(0.8899, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27360/125000 [1:47:13<5:51:22,  4.63it/s]

epoch 0 step 27359 loss tensor(1.1208, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27376/125000 [1:47:17<5:51:43,  4.63it/s]

epoch 0 step 27375 loss tensor(1.0916, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27392/125000 [1:47:20<5:51:23,  4.63it/s]

epoch 0 step 27391 loss tensor(0.9268, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27408/125000 [1:47:24<5:51:38,  4.63it/s]

epoch 0 step 27407 loss tensor(0.8497, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27424/125000 [1:47:27<5:51:16,  4.63it/s]

epoch 0 step 27423 loss tensor(0.9598, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27440/125000 [1:47:30<5:51:43,  4.62it/s]

epoch 0 step 27439 loss tensor(0.9346, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27456/125000 [1:47:34<5:51:15,  4.63it/s]

epoch 0 step 27455 loss tensor(0.9305, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27472/125000 [1:47:37<5:50:58,  4.63it/s]

epoch 0 step 27471 loss tensor(0.8653, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27488/125000 [1:47:40<5:51:12,  4.63it/s]

epoch 0 step 27487 loss tensor(0.8216, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27504/125000 [1:47:44<5:50:31,  4.64it/s]

epoch 0 step 27503 loss tensor(0.8368, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27520/125000 [1:47:47<5:50:22,  4.64it/s]

epoch 0 step 27519 loss tensor(0.8042, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27536/125000 [1:47:50<5:50:21,  4.64it/s]

epoch 0 step 27535 loss tensor(0.8845, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27552/125000 [1:47:54<5:50:46,  4.63it/s]

epoch 0 step 27551 loss tensor(0.8360, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27568/125000 [1:47:57<5:51:09,  4.62it/s]

epoch 0 step 27567 loss tensor(0.8648, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27584/125000 [1:48:00<5:50:40,  4.63it/s]

epoch 0 step 27583 loss tensor(1.0614, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27600/125000 [1:48:04<5:50:30,  4.63it/s]

epoch 0 step 27599 loss tensor(0.8860, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27616/125000 [1:48:07<5:50:32,  4.63it/s]

epoch 0 step 27615 loss tensor(0.8976, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27632/125000 [1:48:10<5:50:27,  4.63it/s]

epoch 0 step 27631 loss tensor(0.9993, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27648/125000 [1:48:14<5:50:05,  4.63it/s]

epoch 0 step 27647 loss tensor(1.0083, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27664/125000 [1:48:17<5:50:02,  4.63it/s]

epoch 0 step 27663 loss tensor(0.7931, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27680/125000 [1:48:20<5:50:18,  4.63it/s]

epoch 0 step 27679 loss tensor(0.9185, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27696/125000 [1:48:24<5:50:08,  4.63it/s]

epoch 0 step 27695 loss tensor(0.8687, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27712/125000 [1:48:27<5:50:03,  4.63it/s]

epoch 0 step 27711 loss tensor(0.9809, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27728/125000 [1:48:30<5:50:16,  4.63it/s]

epoch 0 step 27727 loss tensor(0.8577, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27744/125000 [1:48:34<5:50:09,  4.63it/s]

epoch 0 step 27743 loss tensor(0.9135, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27760/125000 [1:48:37<5:49:17,  4.64it/s]

epoch 0 step 27759 loss tensor(1.0054, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27776/125000 [1:48:41<5:49:47,  4.63it/s]

epoch 0 step 27775 loss tensor(1.0153, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27792/125000 [1:48:44<5:49:52,  4.63it/s]

epoch 0 step 27791 loss tensor(1.1297, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27808/125000 [1:48:47<5:49:43,  4.63it/s]

epoch 0 step 27807 loss tensor(0.8916, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27824/125000 [1:48:51<5:49:45,  4.63it/s]

epoch 0 step 27823 loss tensor(0.8465, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27840/125000 [1:48:54<5:49:29,  4.63it/s]

epoch 0 step 27839 loss tensor(0.8467, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27856/125000 [1:48:57<5:49:40,  4.63it/s]

epoch 0 step 27855 loss tensor(0.9168, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27872/125000 [1:49:01<5:49:53,  4.63it/s]

epoch 0 step 27871 loss tensor(1.0239, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27888/125000 [1:49:04<5:49:20,  4.63it/s]

epoch 0 step 27887 loss tensor(0.8204, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27904/125000 [1:49:07<5:49:00,  4.64it/s]

epoch 0 step 27903 loss tensor(0.8171, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27920/125000 [1:49:11<5:49:42,  4.63it/s]

epoch 0 step 27919 loss tensor(0.9957, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27936/125000 [1:49:14<5:49:00,  4.64it/s]

epoch 0 step 27935 loss tensor(0.9773, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27952/125000 [1:49:17<5:49:35,  4.63it/s]

epoch 0 step 27951 loss tensor(0.8296, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27968/125000 [1:49:21<5:49:13,  4.63it/s]

epoch 0 step 27967 loss tensor(0.8892, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 27984/125000 [1:49:24<5:49:18,  4.63it/s]

epoch 0 step 27983 loss tensor(0.9346, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 28000/125000 [1:49:27<5:49:18,  4.63it/s]

epoch 0 step 27999 loss tensor(0.9132, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 28016/125000 [1:49:31<5:49:10,  4.63it/s]

epoch 0 step 28015 loss tensor(0.9367, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 28032/125000 [1:49:34<5:48:46,  4.63it/s]

epoch 0 step 28031 loss tensor(0.8542, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 28048/125000 [1:49:37<5:48:28,  4.64it/s]

epoch 0 step 28047 loss tensor(0.9715, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 28064/125000 [1:49:41<5:48:59,  4.63it/s]

epoch 0 step 28063 loss tensor(0.7352, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 28080/125000 [1:49:44<5:48:45,  4.63it/s]

epoch 0 step 28079 loss tensor(0.9104, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 28096/125000 [1:49:48<5:48:35,  4.63it/s]

epoch 0 step 28095 loss tensor(1.0829, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 28112/125000 [1:49:51<5:48:57,  4.63it/s]

epoch 0 step 28111 loss tensor(0.8651, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28128/125000 [1:49:54<5:48:40,  4.63it/s]

epoch 0 step 28127 loss tensor(0.7916, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28144/125000 [1:49:58<5:48:50,  4.63it/s]

epoch 0 step 28143 loss tensor(0.8142, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28160/125000 [1:50:01<5:48:25,  4.63it/s]

epoch 0 step 28159 loss tensor(0.9386, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28176/125000 [1:50:04<5:48:05,  4.64it/s]

epoch 0 step 28175 loss tensor(0.8052, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28192/125000 [1:50:08<5:48:47,  4.63it/s]

epoch 0 step 28191 loss tensor(1.0103, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28208/125000 [1:50:11<5:48:27,  4.63it/s]

epoch 0 step 28207 loss tensor(1.3804, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28224/125000 [1:50:14<5:48:03,  4.63it/s]

epoch 0 step 28223 loss tensor(0.7556, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28240/125000 [1:50:18<5:48:31,  4.63it/s]

epoch 0 step 28239 loss tensor(0.8399, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28256/125000 [1:50:21<5:48:11,  4.63it/s]

epoch 0 step 28255 loss tensor(0.7090, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28272/125000 [1:50:24<5:48:20,  4.63it/s]

epoch 0 step 28271 loss tensor(0.9395, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28288/125000 [1:50:28<5:48:05,  4.63it/s]

epoch 0 step 28287 loss tensor(0.7824, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28304/125000 [1:50:31<5:47:49,  4.63it/s]

epoch 0 step 28303 loss tensor(0.8571, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28320/125000 [1:50:34<5:48:15,  4.63it/s]

epoch 0 step 28319 loss tensor(0.8513, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28336/125000 [1:50:38<5:48:17,  4.63it/s]

epoch 0 step 28335 loss tensor(0.8562, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28352/125000 [1:50:41<5:47:49,  4.63it/s]

epoch 0 step 28351 loss tensor(0.9460, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28368/125000 [1:50:44<5:47:24,  4.64it/s]

epoch 0 step 28367 loss tensor(0.9361, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28384/125000 [1:50:48<5:47:41,  4.63it/s]

epoch 0 step 28383 loss tensor(0.9969, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28400/125000 [1:50:51<5:47:27,  4.63it/s]

epoch 0 step 28399 loss tensor(0.8544, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28416/125000 [1:50:54<5:47:24,  4.63it/s]

epoch 0 step 28415 loss tensor(0.8780, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28432/125000 [1:50:58<5:47:40,  4.63it/s]

epoch 0 step 28431 loss tensor(1.0247, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28448/125000 [1:51:01<5:47:28,  4.63it/s]

epoch 0 step 28447 loss tensor(0.9648, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28464/125000 [1:51:05<5:47:12,  4.63it/s]

epoch 0 step 28463 loss tensor(0.8462, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28480/125000 [1:51:08<5:47:11,  4.63it/s]

epoch 0 step 28479 loss tensor(0.7970, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28496/125000 [1:51:11<5:47:02,  4.63it/s]

epoch 0 step 28495 loss tensor(0.7320, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28512/125000 [1:51:15<5:46:54,  4.64it/s]

epoch 0 step 28511 loss tensor(0.9180, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28528/125000 [1:51:18<5:47:30,  4.63it/s]

epoch 0 step 28527 loss tensor(0.7480, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28544/125000 [1:51:21<5:47:04,  4.63it/s]

epoch 0 step 28543 loss tensor(0.7707, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28560/125000 [1:51:25<5:47:24,  4.63it/s]

epoch 0 step 28559 loss tensor(1.2387, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28576/125000 [1:51:28<5:47:00,  4.63it/s]

epoch 0 step 28575 loss tensor(0.6752, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28592/125000 [1:51:31<5:46:57,  4.63it/s]

epoch 0 step 28591 loss tensor(1.1011, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28608/125000 [1:51:35<5:46:28,  4.64it/s]

epoch 0 step 28607 loss tensor(0.8195, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28624/125000 [1:51:38<5:46:44,  4.63it/s]

epoch 0 step 28623 loss tensor(0.8255, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28640/125000 [1:51:41<5:46:31,  4.63it/s]

epoch 0 step 28639 loss tensor(0.8579, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28656/125000 [1:51:45<5:46:19,  4.64it/s]

epoch 0 step 28655 loss tensor(0.7956, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28672/125000 [1:51:48<5:46:31,  4.63it/s]

epoch 0 step 28671 loss tensor(0.7719, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28688/125000 [1:51:51<5:46:41,  4.63it/s]

epoch 0 step 28687 loss tensor(0.7629, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28704/125000 [1:51:55<5:46:21,  4.63it/s]

epoch 0 step 28703 loss tensor(0.8457, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28720/125000 [1:51:58<5:46:22,  4.63it/s]

epoch 0 step 28719 loss tensor(1.0563, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28736/125000 [1:52:01<5:46:30,  4.63it/s]

epoch 0 step 28735 loss tensor(1.0444, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28752/125000 [1:52:05<5:46:36,  4.63it/s]

epoch 0 step 28751 loss tensor(0.8005, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28768/125000 [1:52:08<5:45:45,  4.64it/s]

epoch 0 step 28767 loss tensor(0.9672, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28784/125000 [1:52:12<5:46:03,  4.63it/s]

epoch 0 step 28783 loss tensor(0.8105, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28800/125000 [1:52:15<5:45:55,  4.63it/s]

epoch 0 step 28799 loss tensor(0.8530, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28816/125000 [1:52:18<5:45:59,  4.63it/s]

epoch 0 step 28815 loss tensor(0.8307, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28832/125000 [1:52:22<5:45:25,  4.64it/s]

epoch 0 step 28831 loss tensor(0.8580, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28848/125000 [1:52:25<5:45:52,  4.63it/s]

epoch 0 step 28847 loss tensor(0.8195, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28864/125000 [1:52:28<5:46:14,  4.63it/s]

epoch 0 step 28863 loss tensor(0.9368, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28880/125000 [1:52:32<5:45:38,  4.63it/s]

epoch 0 step 28879 loss tensor(0.7203, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28896/125000 [1:52:35<5:46:27,  4.62it/s]

epoch 0 step 28895 loss tensor(0.8233, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28912/125000 [1:52:38<5:45:24,  4.64it/s]

epoch 0 step 28911 loss tensor(0.7713, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28928/125000 [1:52:42<5:45:49,  4.63it/s]

epoch 0 step 28927 loss tensor(0.8455, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28944/125000 [1:52:45<5:45:57,  4.63it/s]

epoch 0 step 28943 loss tensor(0.8665, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28960/125000 [1:52:48<5:45:25,  4.63it/s]

epoch 0 step 28959 loss tensor(0.8462, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28976/125000 [1:52:52<5:45:22,  4.63it/s]

epoch 0 step 28975 loss tensor(0.9301, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 28992/125000 [1:52:55<5:45:47,  4.63it/s]

epoch 0 step 28991 loss tensor(0.7469, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29008/125000 [1:52:58<5:45:47,  4.63it/s]

epoch 0 step 29007 loss tensor(0.7761, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29024/125000 [1:53:02<5:45:27,  4.63it/s]

epoch 0 step 29023 loss tensor(0.7384, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29040/125000 [1:53:05<5:45:26,  4.63it/s]

epoch 0 step 29039 loss tensor(0.7679, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29056/125000 [1:53:08<5:45:19,  4.63it/s]

epoch 0 step 29055 loss tensor(0.8011, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29072/125000 [1:53:12<5:45:02,  4.63it/s]

epoch 0 step 29071 loss tensor(0.7971, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29088/125000 [1:53:15<5:44:55,  4.63it/s]

epoch 0 step 29087 loss tensor(0.7445, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29104/125000 [1:53:18<5:45:34,  4.62it/s]

epoch 0 step 29103 loss tensor(0.7310, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29120/125000 [1:53:22<5:45:09,  4.63it/s]

epoch 0 step 29119 loss tensor(0.8905, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29136/125000 [1:53:25<5:45:19,  4.63it/s]

epoch 0 step 29135 loss tensor(0.9268, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29152/125000 [1:53:29<5:44:43,  4.63it/s]

epoch 0 step 29151 loss tensor(0.8114, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29168/125000 [1:53:32<5:45:03,  4.63it/s]

epoch 0 step 29167 loss tensor(0.8480, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29184/125000 [1:53:35<5:44:33,  4.63it/s]

epoch 0 step 29183 loss tensor(0.7397, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29200/125000 [1:53:39<5:45:04,  4.63it/s]

epoch 0 step 29199 loss tensor(0.8341, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29216/125000 [1:53:42<5:45:13,  4.62it/s]

epoch 0 step 29215 loss tensor(0.7279, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29232/125000 [1:53:45<5:44:34,  4.63it/s]

epoch 0 step 29231 loss tensor(0.8536, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29248/125000 [1:53:49<5:44:37,  4.63it/s]

epoch 0 step 29247 loss tensor(0.7437, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29264/125000 [1:53:52<5:44:12,  4.64it/s]

epoch 0 step 29263 loss tensor(0.8001, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29280/125000 [1:53:55<5:44:21,  4.63it/s]

epoch 0 step 29279 loss tensor(0.9026, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29296/125000 [1:53:59<5:44:11,  4.63it/s]

epoch 0 step 29295 loss tensor(0.7327, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29312/125000 [1:54:02<5:43:56,  4.64it/s]

epoch 0 step 29311 loss tensor(0.8367, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29328/125000 [1:54:05<5:44:25,  4.63it/s]

epoch 0 step 29327 loss tensor(0.7541, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29344/125000 [1:54:09<5:43:59,  4.63it/s]

epoch 0 step 29343 loss tensor(0.7928, device='cuda:0', grad_fn=<DivBackward0>)


 23%|██▎       | 29360/125000 [1:54:12<5:44:11,  4.63it/s]

epoch 0 step 29359 loss tensor(0.7888, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29376/125000 [1:54:15<5:44:12,  4.63it/s]

epoch 0 step 29375 loss tensor(0.6656, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29392/125000 [1:54:19<5:44:02,  4.63it/s]

epoch 0 step 29391 loss tensor(0.8457, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29408/125000 [1:54:22<5:44:02,  4.63it/s]

epoch 0 step 29407 loss tensor(0.7593, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29424/125000 [1:54:25<5:43:56,  4.63it/s]

epoch 0 step 29423 loss tensor(0.7248, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29440/125000 [1:54:29<5:43:58,  4.63it/s]

epoch 0 step 29439 loss tensor(1.0575, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29456/125000 [1:54:32<5:43:52,  4.63it/s]

epoch 0 step 29455 loss tensor(0.7617, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29472/125000 [1:54:36<5:43:51,  4.63it/s]

epoch 0 step 29471 loss tensor(0.7632, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29488/125000 [1:54:39<5:43:24,  4.64it/s]

epoch 0 step 29487 loss tensor(0.7355, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29504/125000 [1:54:42<5:43:31,  4.63it/s]

epoch 0 step 29503 loss tensor(0.8034, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29520/125000 [1:54:46<5:43:22,  4.63it/s]

epoch 0 step 29519 loss tensor(0.9561, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29536/125000 [1:54:49<5:43:33,  4.63it/s]

epoch 0 step 29535 loss tensor(0.7814, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29552/125000 [1:54:52<5:43:41,  4.63it/s]

epoch 0 step 29551 loss tensor(0.7689, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29568/125000 [1:54:56<5:43:35,  4.63it/s]

epoch 0 step 29567 loss tensor(0.7955, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29584/125000 [1:54:59<5:43:46,  4.63it/s]

epoch 0 step 29583 loss tensor(0.9163, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29600/125000 [1:55:02<5:42:59,  4.64it/s]

epoch 0 step 29599 loss tensor(0.8947, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29616/125000 [1:55:06<5:43:29,  4.63it/s]

epoch 0 step 29615 loss tensor(0.7970, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29632/125000 [1:55:09<5:42:56,  4.63it/s]

epoch 0 step 29631 loss tensor(0.9066, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29648/125000 [1:55:12<5:42:50,  4.64it/s]

epoch 0 step 29647 loss tensor(0.7814, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29664/125000 [1:55:16<5:43:26,  4.63it/s]

epoch 0 step 29663 loss tensor(0.8711, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 29680/125000 [1:55:19<5:42:50,  4.63it/s]

epoch 0 step 29679 loss tensor(0.9413, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29696/125000 [1:55:22<5:43:00,  4.63it/s]

epoch 0 step 29695 loss tensor(0.8300, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29712/125000 [1:55:26<5:42:55,  4.63it/s]

epoch 0 step 29711 loss tensor(0.7908, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29728/125000 [1:55:29<5:42:46,  4.63it/s]

epoch 0 step 29727 loss tensor(0.9873, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29744/125000 [1:55:32<5:42:58,  4.63it/s]

epoch 0 step 29743 loss tensor(0.7477, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29760/125000 [1:55:36<5:41:49,  4.64it/s]

epoch 0 step 29759 loss tensor(0.7684, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29776/125000 [1:55:39<5:42:42,  4.63it/s]

epoch 0 step 29775 loss tensor(0.7925, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29792/125000 [1:55:42<5:42:46,  4.63it/s]

epoch 0 step 29791 loss tensor(0.7478, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29808/125000 [1:55:46<5:42:38,  4.63it/s]

epoch 0 step 29807 loss tensor(0.8583, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29824/125000 [1:55:49<5:42:53,  4.63it/s]

epoch 0 step 29823 loss tensor(0.7908, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29840/125000 [1:55:53<5:42:44,  4.63it/s]

epoch 0 step 29839 loss tensor(0.6698, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29856/125000 [1:55:56<5:42:03,  4.64it/s]

epoch 0 step 29855 loss tensor(0.7021, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29872/125000 [1:55:59<5:41:59,  4.64it/s]

epoch 0 step 29871 loss tensor(0.8315, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29888/125000 [1:56:03<5:42:17,  4.63it/s]

epoch 0 step 29887 loss tensor(1.0456, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29904/125000 [1:56:06<5:42:30,  4.63it/s]

epoch 0 step 29903 loss tensor(0.8115, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29920/125000 [1:56:09<5:42:04,  4.63it/s]

epoch 0 step 29919 loss tensor(0.7596, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29936/125000 [1:56:13<5:41:48,  4.64it/s]

epoch 0 step 29935 loss tensor(0.9429, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29952/125000 [1:56:16<5:41:40,  4.64it/s]

epoch 0 step 29951 loss tensor(0.6871, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29968/125000 [1:56:19<5:42:42,  4.62it/s]

epoch 0 step 29967 loss tensor(0.7843, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 29984/125000 [1:56:23<5:41:56,  4.63it/s]

epoch 0 step 29983 loss tensor(0.7768, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30000/125000 [1:56:26<5:42:09,  4.63it/s]

epoch 0 step 29999 loss tensor(0.8681, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30016/125000 [1:56:29<5:42:05,  4.63it/s]

epoch 0 step 30015 loss tensor(0.7146, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30032/125000 [1:56:33<5:42:24,  4.62it/s]

epoch 0 step 30031 loss tensor(0.7027, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30048/125000 [1:56:36<5:41:46,  4.63it/s]

epoch 0 step 30047 loss tensor(0.7887, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30064/125000 [1:56:39<5:41:40,  4.63it/s]

epoch 0 step 30063 loss tensor(0.8588, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30080/125000 [1:56:43<5:41:42,  4.63it/s]

epoch 0 step 30079 loss tensor(0.9983, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30096/125000 [1:56:46<5:41:33,  4.63it/s]

epoch 0 step 30095 loss tensor(0.7242, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30112/125000 [1:56:49<5:41:24,  4.63it/s]

epoch 0 step 30111 loss tensor(0.7709, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30128/125000 [1:56:53<5:41:35,  4.63it/s]

epoch 0 step 30127 loss tensor(0.8256, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30144/125000 [1:56:56<5:42:09,  4.62it/s]

epoch 0 step 30143 loss tensor(0.7474, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30160/125000 [1:57:00<5:41:14,  4.63it/s]

epoch 0 step 30159 loss tensor(0.6699, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30176/125000 [1:57:03<5:40:54,  4.64it/s]

epoch 0 step 30175 loss tensor(0.9391, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30192/125000 [1:57:06<5:41:01,  4.63it/s]

epoch 0 step 30191 loss tensor(0.6757, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30208/125000 [1:57:10<5:41:20,  4.63it/s]

epoch 0 step 30207 loss tensor(0.9343, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30224/125000 [1:57:13<5:41:14,  4.63it/s]

epoch 0 step 30223 loss tensor(0.7487, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30240/125000 [1:57:16<5:41:22,  4.63it/s]

epoch 0 step 30239 loss tensor(0.7409, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30256/125000 [1:57:20<5:40:46,  4.63it/s]

epoch 0 step 30255 loss tensor(0.7012, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30272/125000 [1:57:23<5:40:49,  4.63it/s]

epoch 0 step 30271 loss tensor(0.7349, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30288/125000 [1:57:26<5:40:38,  4.63it/s]

epoch 0 step 30287 loss tensor(0.7097, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30304/125000 [1:57:30<5:41:04,  4.63it/s]

epoch 0 step 30303 loss tensor(0.9036, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30320/125000 [1:57:33<5:41:00,  4.63it/s]

epoch 0 step 30319 loss tensor(0.7382, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30336/125000 [1:57:36<5:40:45,  4.63it/s]

epoch 0 step 30335 loss tensor(0.7580, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30352/125000 [1:57:40<5:40:30,  4.63it/s]

epoch 0 step 30351 loss tensor(0.8122, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30368/125000 [1:57:43<5:40:13,  4.64it/s]

epoch 0 step 30367 loss tensor(0.7346, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30384/125000 [1:57:46<5:40:26,  4.63it/s]

epoch 0 step 30383 loss tensor(0.7260, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30400/125000 [1:57:50<5:41:06,  4.62it/s]

epoch 0 step 30399 loss tensor(0.8399, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30416/125000 [1:57:53<5:40:34,  4.63it/s]

epoch 0 step 30415 loss tensor(0.6777, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30432/125000 [1:57:56<5:40:16,  4.63it/s]

epoch 0 step 30431 loss tensor(0.7453, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30448/125000 [1:58:00<5:40:21,  4.63it/s]

epoch 0 step 30447 loss tensor(0.7441, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30464/125000 [1:58:03<5:40:05,  4.63it/s]

epoch 0 step 30463 loss tensor(0.7866, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30480/125000 [1:58:06<5:39:54,  4.63it/s]

epoch 0 step 30479 loss tensor(0.7321, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30496/125000 [1:58:10<5:40:11,  4.63it/s]

epoch 0 step 30495 loss tensor(0.7383, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30512/125000 [1:58:13<5:39:13,  4.64it/s]

epoch 0 step 30511 loss tensor(0.8512, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30528/125000 [1:58:17<5:39:48,  4.63it/s]

epoch 0 step 30527 loss tensor(0.7629, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30544/125000 [1:58:20<5:39:47,  4.63it/s]

epoch 0 step 30543 loss tensor(0.6220, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30560/125000 [1:58:23<5:40:00,  4.63it/s]

epoch 0 step 30559 loss tensor(0.7663, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30576/125000 [1:58:27<5:39:58,  4.63it/s]

epoch 0 step 30575 loss tensor(0.8303, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30592/125000 [1:58:30<5:39:32,  4.63it/s]

epoch 0 step 30591 loss tensor(0.8602, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30608/125000 [1:58:33<5:39:28,  4.63it/s]

epoch 0 step 30607 loss tensor(0.8192, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▍       | 30624/125000 [1:58:37<5:39:29,  4.63it/s]

epoch 0 step 30623 loss tensor(0.7232, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30640/125000 [1:58:40<5:38:57,  4.64it/s]

epoch 0 step 30639 loss tensor(0.8251, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30656/125000 [1:58:43<5:39:35,  4.63it/s]

epoch 0 step 30655 loss tensor(0.7945, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30672/125000 [1:58:47<5:39:28,  4.63it/s]

epoch 0 step 30671 loss tensor(0.6388, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30688/125000 [1:58:50<5:39:31,  4.63it/s]

epoch 0 step 30687 loss tensor(0.7070, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30704/125000 [1:58:53<5:39:22,  4.63it/s]

epoch 0 step 30703 loss tensor(0.8969, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30720/125000 [1:58:57<5:38:51,  4.64it/s]

epoch 0 step 30719 loss tensor(0.8125, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30736/125000 [1:59:00<5:38:45,  4.64it/s]

epoch 0 step 30735 loss tensor(0.7773, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30752/125000 [1:59:03<5:39:08,  4.63it/s]

epoch 0 step 30751 loss tensor(0.8245, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30768/125000 [1:59:07<5:39:17,  4.63it/s]

epoch 0 step 30767 loss tensor(0.8902, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30784/125000 [1:59:10<5:39:07,  4.63it/s]

epoch 0 step 30783 loss tensor(0.6875, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30800/125000 [1:59:13<5:38:59,  4.63it/s]

epoch 0 step 30799 loss tensor(0.7347, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30816/125000 [1:59:17<5:39:04,  4.63it/s]

epoch 0 step 30815 loss tensor(0.6003, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30832/125000 [1:59:20<5:39:36,  4.62it/s]

epoch 0 step 30831 loss tensor(0.7961, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30848/125000 [1:59:24<5:39:00,  4.63it/s]

epoch 0 step 30847 loss tensor(0.7001, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30864/125000 [1:59:27<5:38:54,  4.63it/s]

epoch 0 step 30863 loss tensor(0.6246, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30880/125000 [1:59:30<5:38:28,  4.63it/s]

epoch 0 step 30879 loss tensor(0.7712, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30896/125000 [1:59:34<5:38:40,  4.63it/s]

epoch 0 step 30895 loss tensor(0.7926, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30912/125000 [1:59:37<5:38:11,  4.64it/s]

epoch 0 step 30911 loss tensor(0.8217, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30928/125000 [1:59:40<5:38:03,  4.64it/s]

epoch 0 step 30927 loss tensor(0.7074, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30944/125000 [1:59:44<5:38:29,  4.63it/s]

epoch 0 step 30943 loss tensor(0.8520, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30960/125000 [1:59:47<5:38:22,  4.63it/s]

epoch 0 step 30959 loss tensor(0.7945, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30976/125000 [1:59:50<5:38:06,  4.63it/s]

epoch 0 step 30975 loss tensor(0.7915, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 30992/125000 [1:59:54<5:38:25,  4.63it/s]

epoch 0 step 30991 loss tensor(0.6339, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31008/125000 [1:59:57<5:38:17,  4.63it/s]

epoch 0 step 31007 loss tensor(0.6146, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31024/125000 [2:00:00<5:38:05,  4.63it/s]

epoch 0 step 31023 loss tensor(0.7326, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31040/125000 [2:00:04<5:38:11,  4.63it/s]

epoch 0 step 31039 loss tensor(0.6927, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31056/125000 [2:00:07<5:38:08,  4.63it/s]

epoch 0 step 31055 loss tensor(0.5944, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31072/125000 [2:00:10<5:37:38,  4.64it/s]

epoch 0 step 31071 loss tensor(0.7035, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31088/125000 [2:00:14<5:38:08,  4.63it/s]

epoch 0 step 31087 loss tensor(0.8418, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31104/125000 [2:00:17<5:37:57,  4.63it/s]

epoch 0 step 31103 loss tensor(0.7851, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31120/125000 [2:00:20<5:38:10,  4.63it/s]

epoch 0 step 31119 loss tensor(0.8116, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31136/125000 [2:00:24<5:38:15,  4.62it/s]

epoch 0 step 31135 loss tensor(0.7891, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31152/125000 [2:00:27<5:38:00,  4.63it/s]

epoch 0 step 31151 loss tensor(0.7723, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31168/125000 [2:00:30<5:37:32,  4.63it/s]

epoch 0 step 31167 loss tensor(0.9002, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31184/125000 [2:00:34<5:37:33,  4.63it/s]

epoch 0 step 31183 loss tensor(0.7183, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31200/125000 [2:00:37<5:37:49,  4.63it/s]

epoch 0 step 31199 loss tensor(0.6540, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31216/125000 [2:00:41<5:37:44,  4.63it/s]

epoch 0 step 31215 loss tensor(0.8893, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31232/125000 [2:00:44<5:37:35,  4.63it/s]

epoch 0 step 31231 loss tensor(0.6921, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 31248/125000 [2:00:47<5:37:14,  4.63it/s]

epoch 0 step 31247 loss tensor(0.7020, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31264/125000 [2:00:51<5:37:40,  4.63it/s]

epoch 0 step 31263 loss tensor(0.7038, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31280/125000 [2:00:54<5:36:38,  4.64it/s]

epoch 0 step 31279 loss tensor(0.6957, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31296/125000 [2:00:57<5:37:25,  4.63it/s]

epoch 0 step 31295 loss tensor(0.6121, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31312/125000 [2:01:01<5:37:23,  4.63it/s]

epoch 0 step 31311 loss tensor(0.6626, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31328/125000 [2:01:04<5:37:12,  4.63it/s]

epoch 0 step 31327 loss tensor(0.7620, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31344/125000 [2:01:07<5:37:00,  4.63it/s]

epoch 0 step 31343 loss tensor(0.6597, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31360/125000 [2:01:11<5:37:09,  4.63it/s]

epoch 0 step 31359 loss tensor(0.7679, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31376/125000 [2:01:14<5:36:46,  4.63it/s]

epoch 0 step 31375 loss tensor(0.6865, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31392/125000 [2:01:17<5:37:05,  4.63it/s]

epoch 0 step 31391 loss tensor(0.8421, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31408/125000 [2:01:21<5:36:38,  4.63it/s]

epoch 0 step 31407 loss tensor(0.7539, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31424/125000 [2:01:24<5:36:47,  4.63it/s]

epoch 0 step 31423 loss tensor(0.6577, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31440/125000 [2:01:27<5:36:51,  4.63it/s]

epoch 0 step 31439 loss tensor(0.7863, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31456/125000 [2:01:31<5:36:32,  4.63it/s]

epoch 0 step 31455 loss tensor(0.6980, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31472/125000 [2:01:34<5:36:32,  4.63it/s]

epoch 0 step 31471 loss tensor(0.7641, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31488/125000 [2:01:37<5:36:11,  4.64it/s]

epoch 0 step 31487 loss tensor(0.7664, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31504/125000 [2:01:41<5:36:12,  4.63it/s]

epoch 0 step 31503 loss tensor(0.7946, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31520/125000 [2:01:44<5:36:45,  4.63it/s]

epoch 0 step 31519 loss tensor(0.8135, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31536/125000 [2:01:48<5:36:18,  4.63it/s]

epoch 0 step 31535 loss tensor(0.6942, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31552/125000 [2:01:51<5:36:24,  4.63it/s]

epoch 0 step 31551 loss tensor(0.7339, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31568/125000 [2:01:54<5:36:07,  4.63it/s]

epoch 0 step 31567 loss tensor(0.8406, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31584/125000 [2:01:58<5:35:44,  4.64it/s]

epoch 0 step 31583 loss tensor(0.6277, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31600/125000 [2:02:01<5:35:57,  4.63it/s]

epoch 0 step 31599 loss tensor(0.7144, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31616/125000 [2:02:04<5:36:11,  4.63it/s]

epoch 0 step 31615 loss tensor(0.5981, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31632/125000 [2:02:08<5:36:28,  4.62it/s]

epoch 0 step 31631 loss tensor(0.7826, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31648/125000 [2:02:11<5:36:49,  4.62it/s]

epoch 0 step 31647 loss tensor(0.6523, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31664/125000 [2:02:14<5:35:30,  4.64it/s]

epoch 0 step 31663 loss tensor(0.7100, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31680/125000 [2:02:18<5:35:24,  4.64it/s]

epoch 0 step 31679 loss tensor(0.6981, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31696/125000 [2:02:21<5:35:59,  4.63it/s]

epoch 0 step 31695 loss tensor(0.9063, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31712/125000 [2:02:24<5:35:50,  4.63it/s]

epoch 0 step 31711 loss tensor(0.7589, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31728/125000 [2:02:28<5:35:59,  4.63it/s]

epoch 0 step 31727 loss tensor(0.7172, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31744/125000 [2:02:31<5:35:17,  4.64it/s]

epoch 0 step 31743 loss tensor(0.6672, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31760/125000 [2:02:34<5:35:47,  4.63it/s]

epoch 0 step 31759 loss tensor(0.8231, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31776/125000 [2:02:38<5:35:39,  4.63it/s]

epoch 0 step 31775 loss tensor(0.7217, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31792/125000 [2:02:41<5:35:43,  4.63it/s]

epoch 0 step 31791 loss tensor(0.7860, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31808/125000 [2:02:44<5:35:31,  4.63it/s]

epoch 0 step 31807 loss tensor(0.7803, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31824/125000 [2:02:48<5:35:07,  4.63it/s]

epoch 0 step 31823 loss tensor(0.7611, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31840/125000 [2:02:51<5:36:02,  4.62it/s]

epoch 0 step 31839 loss tensor(0.8174, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31856/125000 [2:02:54<5:35:46,  4.62it/s]

epoch 0 step 31855 loss tensor(0.7129, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▌       | 31872/125000 [2:02:58<5:35:24,  4.63it/s]

epoch 0 step 31871 loss tensor(0.6906, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 31888/125000 [2:03:01<5:35:21,  4.63it/s]

epoch 0 step 31887 loss tensor(0.6781, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 31904/125000 [2:03:05<5:35:27,  4.63it/s]

epoch 0 step 31903 loss tensor(0.6721, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 31920/125000 [2:03:08<5:35:06,  4.63it/s]

epoch 0 step 31919 loss tensor(0.7714, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 31936/125000 [2:03:11<5:35:27,  4.62it/s]

epoch 0 step 31935 loss tensor(0.7818, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 31952/125000 [2:03:15<5:35:00,  4.63it/s]

epoch 0 step 31951 loss tensor(0.5786, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 31968/125000 [2:03:18<5:35:06,  4.63it/s]

epoch 0 step 31967 loss tensor(0.7010, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 31984/125000 [2:03:21<5:34:33,  4.63it/s]

epoch 0 step 31983 loss tensor(1.0203, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32000/125000 [2:03:25<5:34:50,  4.63it/s]

epoch 0 step 31999 loss tensor(0.7137, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32016/125000 [2:03:28<5:35:17,  4.62it/s]

epoch 0 step 32015 loss tensor(0.7840, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32032/125000 [2:03:31<5:35:03,  4.62it/s]

epoch 0 step 32031 loss tensor(0.6495, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32048/125000 [2:03:35<5:33:58,  4.64it/s]

epoch 0 step 32047 loss tensor(0.6439, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32064/125000 [2:03:38<5:34:38,  4.63it/s]

epoch 0 step 32063 loss tensor(0.7182, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32080/125000 [2:03:41<5:35:04,  4.62it/s]

epoch 0 step 32079 loss tensor(0.6819, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32096/125000 [2:03:45<5:34:24,  4.63it/s]

epoch 0 step 32095 loss tensor(0.7936, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32112/125000 [2:03:48<5:34:01,  4.63it/s]

epoch 0 step 32111 loss tensor(0.6495, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32128/125000 [2:03:51<5:34:19,  4.63it/s]

epoch 0 step 32127 loss tensor(0.7268, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32144/125000 [2:03:55<5:34:23,  4.63it/s]

epoch 0 step 32143 loss tensor(0.6927, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32160/125000 [2:03:58<5:33:57,  4.63it/s]

epoch 0 step 32159 loss tensor(0.7559, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32176/125000 [2:04:01<5:33:52,  4.63it/s]

epoch 0 step 32175 loss tensor(0.7030, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32192/125000 [2:04:05<5:34:03,  4.63it/s]

epoch 0 step 32191 loss tensor(0.9778, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32208/125000 [2:04:08<5:34:05,  4.63it/s]

epoch 0 step 32207 loss tensor(0.8701, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32224/125000 [2:04:12<5:34:21,  4.62it/s]

epoch 0 step 32223 loss tensor(0.8145, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32240/125000 [2:04:15<5:34:04,  4.63it/s]

epoch 0 step 32239 loss tensor(0.6719, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32256/125000 [2:04:18<5:33:21,  4.64it/s]

epoch 0 step 32255 loss tensor(0.7030, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32272/125000 [2:04:22<5:33:43,  4.63it/s]

epoch 0 step 32271 loss tensor(0.5570, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32288/125000 [2:04:25<5:33:46,  4.63it/s]

epoch 0 step 32287 loss tensor(0.7317, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32304/125000 [2:04:28<5:34:00,  4.63it/s]

epoch 0 step 32303 loss tensor(0.7596, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32320/125000 [2:04:32<5:33:52,  4.63it/s]

epoch 0 step 32319 loss tensor(0.7740, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32336/125000 [2:04:35<5:33:47,  4.63it/s]

epoch 0 step 32335 loss tensor(0.6284, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32352/125000 [2:04:38<5:33:53,  4.62it/s]

epoch 0 step 32351 loss tensor(0.5817, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32368/125000 [2:04:42<5:33:30,  4.63it/s]

epoch 0 step 32367 loss tensor(0.7334, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32384/125000 [2:04:45<5:33:25,  4.63it/s]

epoch 0 step 32383 loss tensor(0.6524, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32400/125000 [2:04:48<5:32:18,  4.64it/s]

epoch 0 step 32399 loss tensor(0.6298, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32416/125000 [2:04:52<5:32:06,  4.65it/s]

epoch 0 step 32415 loss tensor(0.6395, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32432/125000 [2:04:55<5:31:20,  4.66it/s]

epoch 0 step 32431 loss tensor(0.7031, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32448/125000 [2:04:58<5:31:36,  4.65it/s]

epoch 0 step 32447 loss tensor(0.7389, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32464/125000 [2:05:02<5:32:36,  4.64it/s]

epoch 0 step 32463 loss tensor(0.8047, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32480/125000 [2:05:05<5:32:08,  4.64it/s]

epoch 0 step 32479 loss tensor(0.6563, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32496/125000 [2:05:08<5:31:53,  4.65it/s]

epoch 0 step 32495 loss tensor(0.6910, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32512/125000 [2:05:12<5:32:01,  4.64it/s]

epoch 0 step 32511 loss tensor(0.7269, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32528/125000 [2:05:15<5:32:27,  4.64it/s]

epoch 0 step 32527 loss tensor(0.7078, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32544/125000 [2:05:18<5:32:36,  4.63it/s]

epoch 0 step 32543 loss tensor(0.7686, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32560/125000 [2:05:22<5:32:33,  4.63it/s]

epoch 0 step 32559 loss tensor(0.6959, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32576/125000 [2:05:25<5:32:06,  4.64it/s]

epoch 0 step 32575 loss tensor(0.6003, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32592/125000 [2:05:29<5:32:18,  4.63it/s]

epoch 0 step 32591 loss tensor(0.7454, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32608/125000 [2:05:32<5:31:55,  4.64it/s]

epoch 0 step 32607 loss tensor(0.7214, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32624/125000 [2:05:35<5:32:33,  4.63it/s]

epoch 0 step 32623 loss tensor(0.7493, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32640/125000 [2:05:39<5:32:24,  4.63it/s]

epoch 0 step 32639 loss tensor(0.6262, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32656/125000 [2:05:42<5:32:02,  4.64it/s]

epoch 0 step 32655 loss tensor(0.7805, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32672/125000 [2:05:45<5:31:42,  4.64it/s]

epoch 0 step 32671 loss tensor(0.5877, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32688/125000 [2:05:49<5:31:10,  4.65it/s]

epoch 0 step 32687 loss tensor(0.8946, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32704/125000 [2:05:52<5:31:21,  4.64it/s]

epoch 0 step 32703 loss tensor(0.6742, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32720/125000 [2:05:55<5:31:41,  4.64it/s]

epoch 0 step 32719 loss tensor(0.6223, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32736/125000 [2:05:59<5:31:16,  4.64it/s]

epoch 0 step 32735 loss tensor(0.6340, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32752/125000 [2:06:02<5:31:39,  4.64it/s]

epoch 0 step 32751 loss tensor(0.7114, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32768/125000 [2:06:05<5:31:23,  4.64it/s]

epoch 0 step 32767 loss tensor(0.6087, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32784/125000 [2:06:09<5:31:10,  4.64it/s]

epoch 0 step 32783 loss tensor(0.7093, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 32800/125000 [2:06:12<5:31:44,  4.63it/s]

epoch 0 step 32799 loss tensor(0.6814, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32816/125000 [2:06:15<5:31:33,  4.63it/s]

epoch 0 step 32815 loss tensor(0.7380, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32832/125000 [2:06:19<5:30:16,  4.65it/s]

epoch 0 step 32831 loss tensor(0.6073, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32848/125000 [2:06:22<5:31:34,  4.63it/s]

epoch 0 step 32847 loss tensor(0.7395, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32864/125000 [2:06:25<5:31:09,  4.64it/s]

epoch 0 step 32863 loss tensor(0.6998, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32880/125000 [2:06:29<5:30:59,  4.64it/s]

epoch 0 step 32879 loss tensor(0.5310, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32896/125000 [2:06:32<5:31:25,  4.63it/s]

epoch 0 step 32895 loss tensor(0.8836, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32912/125000 [2:06:35<5:31:36,  4.63it/s]

epoch 0 step 32911 loss tensor(0.7400, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32928/125000 [2:06:39<5:30:51,  4.64it/s]

epoch 0 step 32927 loss tensor(0.7454, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32944/125000 [2:06:42<5:30:31,  4.64it/s]

epoch 0 step 32943 loss tensor(0.7160, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32960/125000 [2:06:45<5:30:26,  4.64it/s]

epoch 0 step 32959 loss tensor(0.5716, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32976/125000 [2:06:49<5:31:03,  4.63it/s]

epoch 0 step 32975 loss tensor(0.7198, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 32992/125000 [2:06:52<5:30:29,  4.64it/s]

epoch 0 step 32991 loss tensor(0.6775, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 33008/125000 [2:06:56<5:30:44,  4.64it/s]

epoch 0 step 33007 loss tensor(0.7170, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 33024/125000 [2:06:59<5:30:05,  4.64it/s]

epoch 0 step 33023 loss tensor(0.6679, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 33040/125000 [2:07:02<5:30:14,  4.64it/s]

epoch 0 step 33039 loss tensor(0.7018, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 33056/125000 [2:07:06<5:30:04,  4.64it/s]

epoch 0 step 33055 loss tensor(0.6673, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 33072/125000 [2:07:09<5:30:22,  4.64it/s]

epoch 0 step 33071 loss tensor(0.7482, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 33088/125000 [2:07:12<5:30:16,  4.64it/s]

epoch 0 step 33087 loss tensor(0.7059, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 33104/125000 [2:07:16<5:30:42,  4.63it/s]

epoch 0 step 33103 loss tensor(0.7608, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▋       | 33120/125000 [2:07:19<5:30:39,  4.63it/s]

epoch 0 step 33119 loss tensor(0.6837, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33136/125000 [2:07:22<5:30:22,  4.63it/s]

epoch 0 step 33135 loss tensor(0.6495, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33152/125000 [2:07:26<5:29:44,  4.64it/s]

epoch 0 step 33151 loss tensor(0.6958, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33168/125000 [2:07:29<5:29:41,  4.64it/s]

epoch 0 step 33167 loss tensor(0.6691, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33184/125000 [2:07:32<5:30:29,  4.63it/s]

epoch 0 step 33183 loss tensor(0.7193, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33200/125000 [2:07:36<5:29:49,  4.64it/s]

epoch 0 step 33199 loss tensor(0.8528, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33216/125000 [2:07:39<5:29:59,  4.64it/s]

epoch 0 step 33215 loss tensor(0.6592, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33232/125000 [2:07:42<5:30:29,  4.63it/s]

epoch 0 step 33231 loss tensor(0.6194, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33248/125000 [2:07:46<5:29:15,  4.64it/s]

epoch 0 step 33247 loss tensor(0.6923, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33264/125000 [2:07:49<5:29:21,  4.64it/s]

epoch 0 step 33263 loss tensor(0.6229, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33280/125000 [2:07:52<5:29:29,  4.64it/s]

epoch 0 step 33279 loss tensor(0.7661, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33296/125000 [2:07:56<5:29:10,  4.64it/s]

epoch 0 step 33295 loss tensor(0.7274, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33312/125000 [2:07:59<5:29:43,  4.63it/s]

epoch 0 step 33311 loss tensor(0.6871, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33328/125000 [2:08:02<5:29:13,  4.64it/s]

epoch 0 step 33327 loss tensor(0.5916, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33344/125000 [2:08:06<5:28:55,  4.64it/s]

epoch 0 step 33343 loss tensor(0.8095, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33360/125000 [2:08:09<5:29:22,  4.64it/s]

epoch 0 step 33359 loss tensor(0.5481, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33376/125000 [2:08:12<5:28:56,  4.64it/s]

epoch 0 step 33375 loss tensor(0.6624, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33392/125000 [2:08:16<5:29:35,  4.63it/s]

epoch 0 step 33391 loss tensor(0.8615, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33408/125000 [2:08:19<5:29:30,  4.63it/s]

epoch 0 step 33407 loss tensor(0.7147, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33424/125000 [2:08:23<5:28:38,  4.64it/s]

epoch 0 step 33423 loss tensor(0.7969, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33440/125000 [2:08:26<5:29:36,  4.63it/s]

epoch 0 step 33439 loss tensor(0.7725, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33456/125000 [2:08:29<5:28:58,  4.64it/s]

epoch 0 step 33455 loss tensor(0.6261, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33472/125000 [2:08:33<5:28:21,  4.65it/s]

epoch 0 step 33471 loss tensor(0.6693, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33488/125000 [2:08:36<5:28:32,  4.64it/s]

epoch 0 step 33487 loss tensor(0.7046, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33504/125000 [2:08:39<5:28:33,  4.64it/s]

epoch 0 step 33503 loss tensor(0.7776, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33520/125000 [2:08:43<5:29:14,  4.63it/s]

epoch 0 step 33519 loss tensor(0.7599, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33536/125000 [2:08:46<5:28:58,  4.63it/s]

epoch 0 step 33535 loss tensor(0.7765, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33552/125000 [2:08:49<5:28:26,  4.64it/s]

epoch 0 step 33551 loss tensor(0.6163, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33568/125000 [2:08:53<5:28:59,  4.63it/s]

epoch 0 step 33567 loss tensor(0.6578, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33584/125000 [2:08:56<5:28:49,  4.63it/s]

epoch 0 step 33583 loss tensor(0.7793, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33600/125000 [2:08:59<5:28:43,  4.63it/s]

epoch 0 step 33599 loss tensor(0.7313, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33616/125000 [2:09:03<5:27:49,  4.65it/s]

epoch 0 step 33615 loss tensor(0.7122, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33632/125000 [2:09:06<5:28:29,  4.64it/s]

epoch 0 step 33631 loss tensor(0.6640, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33648/125000 [2:09:09<5:28:07,  4.64it/s]

epoch 0 step 33647 loss tensor(0.6813, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33664/125000 [2:09:13<5:28:26,  4.63it/s]

epoch 0 step 33663 loss tensor(0.7710, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33680/125000 [2:09:16<5:28:43,  4.63it/s]

epoch 0 step 33679 loss tensor(0.7021, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33696/125000 [2:09:19<5:28:25,  4.63it/s]

epoch 0 step 33695 loss tensor(0.6873, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33712/125000 [2:09:23<5:27:41,  4.64it/s]

epoch 0 step 33711 loss tensor(0.7025, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33728/125000 [2:09:26<5:27:49,  4.64it/s]

epoch 0 step 33727 loss tensor(0.6828, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33744/125000 [2:09:29<5:27:39,  4.64it/s]

epoch 0 step 33743 loss tensor(0.7829, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33760/125000 [2:09:33<5:27:31,  4.64it/s]

epoch 0 step 33759 loss tensor(0.6238, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33776/125000 [2:09:36<5:27:28,  4.64it/s]

epoch 0 step 33775 loss tensor(0.6702, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33792/125000 [2:09:39<5:28:10,  4.63it/s]

epoch 0 step 33791 loss tensor(0.7188, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33808/125000 [2:09:43<5:27:37,  4.64it/s]

epoch 0 step 33807 loss tensor(0.6769, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33824/125000 [2:09:46<5:27:28,  4.64it/s]

epoch 0 step 33823 loss tensor(0.6582, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33840/125000 [2:09:50<5:27:42,  4.64it/s]

epoch 0 step 33839 loss tensor(0.7946, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33856/125000 [2:09:53<5:27:42,  4.64it/s]

epoch 0 step 33855 loss tensor(0.6275, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33872/125000 [2:09:56<5:27:17,  4.64it/s]

epoch 0 step 33871 loss tensor(0.7631, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33888/125000 [2:10:00<5:27:54,  4.63it/s]

epoch 0 step 33887 loss tensor(0.6513, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33904/125000 [2:10:03<5:27:44,  4.63it/s]

epoch 0 step 33903 loss tensor(0.6409, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33920/125000 [2:10:06<5:26:49,  4.64it/s]

epoch 0 step 33919 loss tensor(0.7796, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33936/125000 [2:10:10<5:26:57,  4.64it/s]

epoch 0 step 33935 loss tensor(0.9010, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33952/125000 [2:10:13<5:27:09,  4.64it/s]

epoch 0 step 33951 loss tensor(0.6880, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33968/125000 [2:10:16<5:27:06,  4.64it/s]

epoch 0 step 33967 loss tensor(0.6661, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 33984/125000 [2:10:20<5:27:05,  4.64it/s]

epoch 0 step 33983 loss tensor(0.7858, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34000/125000 [2:10:23<5:26:19,  4.65it/s]

epoch 0 step 33999 loss tensor(0.7325, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34016/125000 [2:10:26<5:26:51,  4.64it/s]

epoch 0 step 34015 loss tensor(0.7109, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34032/125000 [2:10:30<5:26:42,  4.64it/s]

epoch 0 step 34031 loss tensor(0.6436, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34048/125000 [2:10:33<5:26:19,  4.65it/s]

epoch 0 step 34047 loss tensor(0.6128, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34064/125000 [2:10:36<5:27:00,  4.63it/s]

epoch 0 step 34063 loss tensor(0.6015, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34080/125000 [2:10:40<5:26:24,  4.64it/s]

epoch 0 step 34079 loss tensor(0.6330, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34096/125000 [2:10:43<5:25:48,  4.65it/s]

epoch 0 step 34095 loss tensor(0.5862, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34112/125000 [2:10:46<5:26:42,  4.64it/s]

epoch 0 step 34111 loss tensor(0.7132, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34128/125000 [2:10:50<5:26:33,  4.64it/s]

epoch 0 step 34127 loss tensor(0.8389, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34144/125000 [2:10:53<5:26:54,  4.63it/s]

epoch 0 step 34143 loss tensor(0.5598, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34160/125000 [2:10:56<5:26:36,  4.64it/s]

epoch 0 step 34159 loss tensor(0.5680, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34176/125000 [2:11:00<5:26:23,  4.64it/s]

epoch 0 step 34175 loss tensor(0.6741, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34192/125000 [2:11:03<5:26:01,  4.64it/s]

epoch 0 step 34191 loss tensor(0.7217, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34208/125000 [2:11:06<5:25:33,  4.65it/s]

epoch 0 step 34207 loss tensor(0.6149, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34224/125000 [2:11:10<5:26:21,  4.64it/s]

epoch 0 step 34223 loss tensor(0.7417, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34240/125000 [2:11:13<5:26:29,  4.63it/s]

epoch 0 step 34239 loss tensor(0.7829, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34256/125000 [2:11:17<5:25:25,  4.65it/s]

epoch 0 step 34255 loss tensor(0.5740, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34272/125000 [2:11:20<5:25:56,  4.64it/s]

epoch 0 step 34271 loss tensor(0.6917, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34288/125000 [2:11:23<5:25:40,  4.64it/s]

epoch 0 step 34287 loss tensor(0.7169, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34304/125000 [2:11:27<5:26:16,  4.63it/s]

epoch 0 step 34303 loss tensor(0.6848, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34320/125000 [2:11:30<5:26:00,  4.64it/s]

epoch 0 step 34319 loss tensor(0.7346, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34336/125000 [2:11:33<5:25:44,  4.64it/s]

epoch 0 step 34335 loss tensor(0.6393, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34352/125000 [2:11:37<5:25:54,  4.64it/s]

epoch 0 step 34351 loss tensor(0.6441, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 34368/125000 [2:11:40<5:25:24,  4.64it/s]

epoch 0 step 34367 loss tensor(0.7418, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34384/125000 [2:11:43<5:25:17,  4.64it/s]

epoch 0 step 34383 loss tensor(0.6115, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34400/125000 [2:11:47<5:25:10,  4.64it/s]

epoch 0 step 34399 loss tensor(0.7323, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34416/125000 [2:11:50<5:25:05,  4.64it/s]

epoch 0 step 34415 loss tensor(0.5983, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34432/125000 [2:11:53<5:25:25,  4.64it/s]

epoch 0 step 34431 loss tensor(0.5963, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34448/125000 [2:11:57<5:25:28,  4.64it/s]

epoch 0 step 34447 loss tensor(0.6926, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34464/125000 [2:12:00<5:24:41,  4.65it/s]

epoch 0 step 34463 loss tensor(0.6102, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34480/125000 [2:12:03<5:25:09,  4.64it/s]

epoch 0 step 34479 loss tensor(0.5985, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34496/125000 [2:12:07<5:24:56,  4.64it/s]

epoch 0 step 34495 loss tensor(0.6141, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34512/125000 [2:12:10<5:24:58,  4.64it/s]

epoch 0 step 34511 loss tensor(0.6754, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34528/125000 [2:12:13<5:24:51,  4.64it/s]

epoch 0 step 34527 loss tensor(0.6740, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34544/125000 [2:12:17<5:25:05,  4.64it/s]

epoch 0 step 34543 loss tensor(0.6462, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34560/125000 [2:12:20<5:25:05,  4.64it/s]

epoch 0 step 34559 loss tensor(0.6222, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34576/125000 [2:12:23<5:24:57,  4.64it/s]

epoch 0 step 34575 loss tensor(0.7104, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34592/125000 [2:12:27<5:24:37,  4.64it/s]

epoch 0 step 34591 loss tensor(0.5933, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34608/125000 [2:12:30<5:23:58,  4.65it/s]

epoch 0 step 34607 loss tensor(0.6148, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34624/125000 [2:12:33<5:24:20,  4.64it/s]

epoch 0 step 34623 loss tensor(0.6482, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34640/125000 [2:12:37<5:24:27,  4.64it/s]

epoch 0 step 34639 loss tensor(0.7879, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34656/125000 [2:12:40<5:24:10,  4.64it/s]

epoch 0 step 34655 loss tensor(0.6762, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34672/125000 [2:12:44<5:23:58,  4.65it/s]

epoch 0 step 34671 loss tensor(0.6535, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34688/125000 [2:12:47<5:24:46,  4.63it/s]

epoch 0 step 34687 loss tensor(0.7053, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34704/125000 [2:12:50<5:24:07,  4.64it/s]

epoch 0 step 34703 loss tensor(0.8828, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34720/125000 [2:12:54<5:24:44,  4.63it/s]

epoch 0 step 34719 loss tensor(0.7821, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34736/125000 [2:12:57<5:24:14,  4.64it/s]

epoch 0 step 34735 loss tensor(0.6121, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34752/125000 [2:13:00<5:24:24,  4.64it/s]

epoch 0 step 34751 loss tensor(0.5944, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34768/125000 [2:13:04<5:24:23,  4.64it/s]

epoch 0 step 34767 loss tensor(0.6506, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34784/125000 [2:13:07<5:24:43,  4.63it/s]

epoch 0 step 34783 loss tensor(0.6228, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34800/125000 [2:13:10<5:24:01,  4.64it/s]

epoch 0 step 34799 loss tensor(0.6742, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34816/125000 [2:13:14<5:23:42,  4.64it/s]

epoch 0 step 34815 loss tensor(0.6536, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34832/125000 [2:13:17<5:24:23,  4.63it/s]

epoch 0 step 34831 loss tensor(0.6999, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34848/125000 [2:13:20<5:24:33,  4.63it/s]

epoch 0 step 34847 loss tensor(0.6148, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34864/125000 [2:13:24<5:24:05,  4.64it/s]

epoch 0 step 34863 loss tensor(0.6227, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34880/125000 [2:13:27<5:24:14,  4.63it/s]

epoch 0 step 34879 loss tensor(0.6069, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34896/125000 [2:13:30<5:22:59,  4.65it/s]

epoch 0 step 34895 loss tensor(0.6411, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34912/125000 [2:13:34<5:24:12,  4.63it/s]

epoch 0 step 34911 loss tensor(0.6642, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34928/125000 [2:13:37<5:23:36,  4.64it/s]

epoch 0 step 34927 loss tensor(0.6905, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34944/125000 [2:13:40<5:24:10,  4.63it/s]

epoch 0 step 34943 loss tensor(0.7894, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34960/125000 [2:13:44<5:23:39,  4.64it/s]

epoch 0 step 34959 loss tensor(0.7395, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34976/125000 [2:13:47<5:24:03,  4.63it/s]

epoch 0 step 34975 loss tensor(0.7598, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 34992/125000 [2:13:50<5:23:27,  4.64it/s]

epoch 0 step 34991 loss tensor(0.6451, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35008/125000 [2:13:54<5:23:09,  4.64it/s]

epoch 0 step 35007 loss tensor(0.5903, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35024/125000 [2:13:57<5:23:17,  4.64it/s]

epoch 0 step 35023 loss tensor(0.6572, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35040/125000 [2:14:00<5:23:16,  4.64it/s]

epoch 0 step 35039 loss tensor(0.6516, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35056/125000 [2:14:04<5:23:07,  4.64it/s]

epoch 0 step 35055 loss tensor(0.6067, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35072/125000 [2:14:07<5:23:19,  4.64it/s]

epoch 0 step 35071 loss tensor(0.5951, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35088/125000 [2:14:11<5:23:00,  4.64it/s]

epoch 0 step 35087 loss tensor(0.6484, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35104/125000 [2:14:14<5:23:12,  4.64it/s]

epoch 0 step 35103 loss tensor(0.6354, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35120/125000 [2:14:17<5:23:11,  4.63it/s]

epoch 0 step 35119 loss tensor(0.6497, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35136/125000 [2:14:21<5:22:41,  4.64it/s]

epoch 0 step 35135 loss tensor(0.6366, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35152/125000 [2:14:24<5:22:02,  4.65it/s]

epoch 0 step 35151 loss tensor(0.6778, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35168/125000 [2:14:27<5:22:24,  4.64it/s]

epoch 0 step 35167 loss tensor(0.7944, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35184/125000 [2:14:31<5:22:27,  4.64it/s]

epoch 0 step 35183 loss tensor(0.7085, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35200/125000 [2:14:34<5:22:27,  4.64it/s]

epoch 0 step 35199 loss tensor(0.7265, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35216/125000 [2:14:37<5:22:40,  4.64it/s]

epoch 0 step 35215 loss tensor(0.6797, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35232/125000 [2:14:41<5:22:18,  4.64it/s]

epoch 0 step 35231 loss tensor(0.5644, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35248/125000 [2:14:44<5:22:21,  4.64it/s]

epoch 0 step 35247 loss tensor(0.7207, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35264/125000 [2:14:47<5:22:37,  4.64it/s]

epoch 0 step 35263 loss tensor(0.5574, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35280/125000 [2:14:51<5:22:26,  4.64it/s]

epoch 0 step 35279 loss tensor(0.6408, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35296/125000 [2:14:54<5:22:16,  4.64it/s]

epoch 0 step 35295 loss tensor(0.6624, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35312/125000 [2:14:57<5:22:14,  4.64it/s]

epoch 0 step 35311 loss tensor(0.5946, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35328/125000 [2:15:01<5:22:11,  4.64it/s]

epoch 0 step 35327 loss tensor(0.6204, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35344/125000 [2:15:04<5:22:27,  4.63it/s]

epoch 0 step 35343 loss tensor(0.5458, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35360/125000 [2:15:07<5:22:48,  4.63it/s]

epoch 0 step 35359 loss tensor(0.7091, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35376/125000 [2:15:11<5:22:06,  4.64it/s]

epoch 0 step 35375 loss tensor(0.4595, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35392/125000 [2:15:14<5:22:11,  4.64it/s]

epoch 0 step 35391 loss tensor(0.6608, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35408/125000 [2:15:17<5:21:57,  4.64it/s]

epoch 0 step 35407 loss tensor(0.6201, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35424/125000 [2:15:21<5:21:39,  4.64it/s]

epoch 0 step 35423 loss tensor(0.5375, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35440/125000 [2:15:24<5:21:28,  4.64it/s]

epoch 0 step 35439 loss tensor(0.6862, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35456/125000 [2:15:27<5:21:58,  4.64it/s]

epoch 0 step 35455 loss tensor(0.5837, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35472/125000 [2:15:31<5:21:42,  4.64it/s]

epoch 0 step 35471 loss tensor(0.6733, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35488/125000 [2:15:34<5:21:40,  4.64it/s]

epoch 0 step 35487 loss tensor(0.7099, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35504/125000 [2:15:37<5:22:18,  4.63it/s]

epoch 0 step 35503 loss tensor(0.7803, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35520/125000 [2:15:41<5:21:24,  4.64it/s]

epoch 0 step 35519 loss tensor(0.5846, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35536/125000 [2:15:44<5:21:16,  4.64it/s]

epoch 0 step 35535 loss tensor(0.5966, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35552/125000 [2:15:48<5:21:39,  4.63it/s]

epoch 0 step 35551 loss tensor(0.5945, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35568/125000 [2:15:51<5:21:47,  4.63it/s]

epoch 0 step 35567 loss tensor(0.5948, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35584/125000 [2:15:54<5:21:03,  4.64it/s]

epoch 0 step 35583 loss tensor(0.6771, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35600/125000 [2:15:58<5:20:54,  4.64it/s]

epoch 0 step 35599 loss tensor(0.6886, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 35616/125000 [2:16:01<5:20:45,  4.64it/s]

epoch 0 step 35615 loss tensor(0.5828, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35632/125000 [2:16:04<5:20:58,  4.64it/s]

epoch 0 step 35631 loss tensor(0.5309, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35648/125000 [2:16:08<5:21:23,  4.63it/s]

epoch 0 step 35647 loss tensor(0.7145, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35664/125000 [2:16:11<5:21:00,  4.64it/s]

epoch 0 step 35663 loss tensor(0.5858, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35680/125000 [2:16:14<5:21:08,  4.64it/s]

epoch 0 step 35679 loss tensor(0.6170, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35696/125000 [2:16:18<5:20:38,  4.64it/s]

epoch 0 step 35695 loss tensor(0.6399, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35712/125000 [2:16:21<5:20:54,  4.64it/s]

epoch 0 step 35711 loss tensor(0.6210, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35728/125000 [2:16:24<5:20:30,  4.64it/s]

epoch 0 step 35727 loss tensor(0.7736, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35744/125000 [2:16:28<5:21:15,  4.63it/s]

epoch 0 step 35743 loss tensor(0.5526, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35760/125000 [2:16:31<5:20:26,  4.64it/s]

epoch 0 step 35759 loss tensor(0.6952, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35776/125000 [2:16:34<5:20:25,  4.64it/s]

epoch 0 step 35775 loss tensor(0.8191, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35792/125000 [2:16:38<5:20:16,  4.64it/s]

epoch 0 step 35791 loss tensor(0.7493, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35808/125000 [2:16:41<5:20:49,  4.63it/s]

epoch 0 step 35807 loss tensor(0.5859, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35824/125000 [2:16:44<5:20:19,  4.64it/s]

epoch 0 step 35823 loss tensor(0.5854, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35840/125000 [2:16:48<5:20:22,  4.64it/s]

epoch 0 step 35839 loss tensor(0.6230, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35856/125000 [2:16:51<5:20:25,  4.64it/s]

epoch 0 step 35855 loss tensor(0.5887, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35872/125000 [2:16:54<5:20:29,  4.63it/s]

epoch 0 step 35871 loss tensor(0.6249, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35888/125000 [2:16:58<5:19:53,  4.64it/s]

epoch 0 step 35887 loss tensor(0.6540, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35904/125000 [2:17:01<5:20:19,  4.64it/s]

epoch 0 step 35903 loss tensor(0.6102, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35920/125000 [2:17:04<5:19:22,  4.65it/s]

epoch 0 step 35919 loss tensor(0.6134, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▊       | 35936/125000 [2:17:08<5:19:57,  4.64it/s]

epoch 0 step 35935 loss tensor(0.5752, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 35952/125000 [2:17:11<5:20:01,  4.64it/s]

epoch 0 step 35951 loss tensor(0.5969, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 35968/125000 [2:17:15<5:19:31,  4.64it/s]

epoch 0 step 35967 loss tensor(0.6653, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 35984/125000 [2:17:18<5:20:11,  4.63it/s]

epoch 0 step 35983 loss tensor(0.6382, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36000/125000 [2:17:21<5:19:26,  4.64it/s]

epoch 0 step 35999 loss tensor(0.7218, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36016/125000 [2:17:25<5:20:34,  4.63it/s]

epoch 0 step 36015 loss tensor(0.6664, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36032/125000 [2:17:28<5:19:36,  4.64it/s]

epoch 0 step 36031 loss tensor(0.6113, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36048/125000 [2:17:31<5:19:33,  4.64it/s]

epoch 0 step 36047 loss tensor(0.5917, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36064/125000 [2:17:35<5:19:12,  4.64it/s]

epoch 0 step 36063 loss tensor(0.6982, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36080/125000 [2:17:38<5:18:56,  4.65it/s]

epoch 0 step 36079 loss tensor(0.5812, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36096/125000 [2:17:41<5:19:12,  4.64it/s]

epoch 0 step 36095 loss tensor(0.6307, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36112/125000 [2:17:45<5:19:16,  4.64it/s]

epoch 0 step 36111 loss tensor(0.7123, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36128/125000 [2:17:48<5:19:21,  4.64it/s]

epoch 0 step 36127 loss tensor(0.6239, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36144/125000 [2:17:51<5:19:19,  4.64it/s]

epoch 0 step 36143 loss tensor(0.6750, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36160/125000 [2:17:55<5:18:48,  4.64it/s]

epoch 0 step 36159 loss tensor(0.6464, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36176/125000 [2:17:58<5:18:49,  4.64it/s]

epoch 0 step 36175 loss tensor(0.5814, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36192/125000 [2:18:01<5:19:22,  4.63it/s]

epoch 0 step 36191 loss tensor(0.6093, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36208/125000 [2:18:05<5:18:59,  4.64it/s]

epoch 0 step 36207 loss tensor(0.6961, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36224/125000 [2:18:08<5:18:29,  4.65it/s]

epoch 0 step 36223 loss tensor(0.6556, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36240/125000 [2:18:11<5:19:23,  4.63it/s]

epoch 0 step 36239 loss tensor(0.6258, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36256/125000 [2:18:15<5:18:45,  4.64it/s]

epoch 0 step 36255 loss tensor(0.6023, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36272/125000 [2:18:18<5:18:30,  4.64it/s]

epoch 0 step 36271 loss tensor(0.6791, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36288/125000 [2:18:21<5:18:36,  4.64it/s]

epoch 0 step 36287 loss tensor(0.6052, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36304/125000 [2:18:25<5:18:20,  4.64it/s]

epoch 0 step 36303 loss tensor(0.6310, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36320/125000 [2:18:28<5:18:31,  4.64it/s]

epoch 0 step 36319 loss tensor(0.5712, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36336/125000 [2:18:31<5:17:47,  4.65it/s]

epoch 0 step 36335 loss tensor(0.7052, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36352/125000 [2:18:35<5:18:32,  4.64it/s]

epoch 0 step 36351 loss tensor(0.6505, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36368/125000 [2:18:38<5:18:51,  4.63it/s]

epoch 0 step 36367 loss tensor(0.6771, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36384/125000 [2:18:42<5:18:39,  4.63it/s]

epoch 0 step 36383 loss tensor(0.5645, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36400/125000 [2:18:45<5:17:49,  4.65it/s]

epoch 0 step 36399 loss tensor(0.7321, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36416/125000 [2:18:48<5:18:21,  4.64it/s]

epoch 0 step 36415 loss tensor(0.6909, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36432/125000 [2:18:52<5:17:40,  4.65it/s]

epoch 0 step 36431 loss tensor(0.7087, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36448/125000 [2:18:55<5:18:22,  4.64it/s]

epoch 0 step 36447 loss tensor(0.6077, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36464/125000 [2:18:58<5:18:10,  4.64it/s]

epoch 0 step 36463 loss tensor(0.6077, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36480/125000 [2:19:02<5:17:29,  4.65it/s]

epoch 0 step 36479 loss tensor(0.6079, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36496/125000 [2:19:05<5:18:27,  4.63it/s]

epoch 0 step 36495 loss tensor(0.6980, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36512/125000 [2:19:08<5:17:20,  4.65it/s]

epoch 0 step 36511 loss tensor(0.6253, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36528/125000 [2:19:12<5:17:34,  4.64it/s]

epoch 0 step 36527 loss tensor(0.6097, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36544/125000 [2:19:15<5:18:03,  4.64it/s]

epoch 0 step 36543 loss tensor(0.6925, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36560/125000 [2:19:18<5:17:23,  4.64it/s]

epoch 0 step 36559 loss tensor(0.5308, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36576/125000 [2:19:22<5:17:17,  4.64it/s]

epoch 0 step 36575 loss tensor(0.6159, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36592/125000 [2:19:25<5:17:01,  4.65it/s]

epoch 0 step 36591 loss tensor(0.7533, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36608/125000 [2:19:28<5:17:26,  4.64it/s]

epoch 0 step 36607 loss tensor(0.5237, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36624/125000 [2:19:32<5:17:05,  4.65it/s]

epoch 0 step 36623 loss tensor(0.5495, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36640/125000 [2:19:35<5:17:33,  4.64it/s]

epoch 0 step 36639 loss tensor(0.8856, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36656/125000 [2:19:38<5:16:42,  4.65it/s]

epoch 0 step 36655 loss tensor(0.7570, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36672/125000 [2:19:42<5:18:00,  4.63it/s]

epoch 0 step 36671 loss tensor(0.7504, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36688/125000 [2:19:45<5:17:49,  4.63it/s]

epoch 0 step 36687 loss tensor(0.6872, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36704/125000 [2:19:48<5:17:19,  4.64it/s]

epoch 0 step 36703 loss tensor(0.6299, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36720/125000 [2:19:52<5:16:35,  4.65it/s]

epoch 0 step 36719 loss tensor(0.8497, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36736/125000 [2:19:55<5:16:59,  4.64it/s]

epoch 0 step 36735 loss tensor(0.7012, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36752/125000 [2:19:58<5:17:26,  4.63it/s]

epoch 0 step 36751 loss tensor(0.7232, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36768/125000 [2:20:02<5:16:39,  4.64it/s]

epoch 0 step 36767 loss tensor(0.6699, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36784/125000 [2:20:05<5:17:02,  4.64it/s]

epoch 0 step 36783 loss tensor(0.5666, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36800/125000 [2:20:09<5:16:36,  4.64it/s]

epoch 0 step 36799 loss tensor(0.6733, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36816/125000 [2:20:12<5:16:46,  4.64it/s]

epoch 0 step 36815 loss tensor(0.5716, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36832/125000 [2:20:15<5:16:47,  4.64it/s]

epoch 0 step 36831 loss tensor(0.6931, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36848/125000 [2:20:19<5:16:48,  4.64it/s]

epoch 0 step 36847 loss tensor(0.6658, device='cuda:0', grad_fn=<DivBackward0>)


 29%|██▉       | 36864/125000 [2:20:22<5:16:14,  4.64it/s]

epoch 0 step 36863 loss tensor(0.6289, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 36880/125000 [2:20:25<5:16:46,  4.64it/s]

epoch 0 step 36879 loss tensor(0.6338, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 36896/125000 [2:20:29<5:16:16,  4.64it/s]

epoch 0 step 36895 loss tensor(0.6055, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 36912/125000 [2:20:32<5:16:34,  4.64it/s]

epoch 0 step 36911 loss tensor(0.4311, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 36928/125000 [2:20:35<5:16:39,  4.64it/s]

epoch 0 step 36927 loss tensor(0.6101, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 36944/125000 [2:20:39<5:16:05,  4.64it/s]

epoch 0 step 36943 loss tensor(0.5585, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 36960/125000 [2:20:42<5:16:11,  4.64it/s]

epoch 0 step 36959 loss tensor(0.6797, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 36976/125000 [2:20:45<5:16:48,  4.63it/s]

epoch 0 step 36975 loss tensor(0.6637, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 36992/125000 [2:20:49<5:16:09,  4.64it/s]

epoch 0 step 36991 loss tensor(0.6750, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37008/125000 [2:20:52<5:16:45,  4.63it/s]

epoch 0 step 37007 loss tensor(0.5434, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37024/125000 [2:20:55<5:15:50,  4.64it/s]

epoch 0 step 37023 loss tensor(0.6035, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37040/125000 [2:20:59<5:15:54,  4.64it/s]

epoch 0 step 37039 loss tensor(0.7228, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37056/125000 [2:21:02<5:16:16,  4.63it/s]

epoch 0 step 37055 loss tensor(0.5876, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37072/125000 [2:21:05<5:16:20,  4.63it/s]

epoch 0 step 37071 loss tensor(0.6819, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37088/125000 [2:21:09<5:16:15,  4.63it/s]

epoch 0 step 37087 loss tensor(0.6387, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37104/125000 [2:21:12<5:16:00,  4.64it/s]

epoch 0 step 37103 loss tensor(0.7318, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37120/125000 [2:21:15<5:15:58,  4.64it/s]

epoch 0 step 37119 loss tensor(0.5969, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37136/125000 [2:21:19<5:15:54,  4.64it/s]

epoch 0 step 37135 loss tensor(0.5358, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37152/125000 [2:21:22<5:15:15,  4.64it/s]

epoch 0 step 37151 loss tensor(0.6332, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37168/125000 [2:21:25<5:15:38,  4.64it/s]

epoch 0 step 37167 loss tensor(0.5945, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37184/125000 [2:21:29<5:15:35,  4.64it/s]

epoch 0 step 37183 loss tensor(0.8026, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37200/125000 [2:21:32<5:15:40,  4.64it/s]

epoch 0 step 37199 loss tensor(0.6252, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37216/125000 [2:21:36<5:15:47,  4.63it/s]

epoch 0 step 37215 loss tensor(0.6059, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37232/125000 [2:21:39<5:15:42,  4.63it/s]

epoch 0 step 37231 loss tensor(0.5635, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37248/125000 [2:21:42<5:15:57,  4.63it/s]

epoch 0 step 37247 loss tensor(0.6710, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37264/125000 [2:21:46<5:15:13,  4.64it/s]

epoch 0 step 37263 loss tensor(0.6599, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37280/125000 [2:21:49<5:15:10,  4.64it/s]

epoch 0 step 37279 loss tensor(0.5305, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37296/125000 [2:21:52<5:15:06,  4.64it/s]

epoch 0 step 37295 loss tensor(0.5589, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37312/125000 [2:21:56<5:14:54,  4.64it/s]

epoch 0 step 37311 loss tensor(0.5482, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37328/125000 [2:21:59<5:15:42,  4.63it/s]

epoch 0 step 37327 loss tensor(0.6444, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37344/125000 [2:22:02<5:14:48,  4.64it/s]

epoch 0 step 37343 loss tensor(0.7283, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37360/125000 [2:22:06<5:15:15,  4.63it/s]

epoch 0 step 37359 loss tensor(0.6150, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37376/125000 [2:22:09<5:15:10,  4.63it/s]

epoch 0 step 37375 loss tensor(0.7331, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37392/125000 [2:22:12<5:14:33,  4.64it/s]

epoch 0 step 37391 loss tensor(0.5946, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37408/125000 [2:22:16<5:14:56,  4.64it/s]

epoch 0 step 37407 loss tensor(0.7001, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37424/125000 [2:22:19<5:14:06,  4.65it/s]

epoch 0 step 37423 loss tensor(0.6188, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37440/125000 [2:22:22<5:15:08,  4.63it/s]

epoch 0 step 37439 loss tensor(0.5580, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37456/125000 [2:22:26<5:14:35,  4.64it/s]

epoch 0 step 37455 loss tensor(0.6987, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37472/125000 [2:22:29<5:14:40,  4.64it/s]

epoch 0 step 37471 loss tensor(0.7548, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 37488/125000 [2:22:32<5:14:12,  4.64it/s]

epoch 0 step 37487 loss tensor(0.7029, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37504/125000 [2:22:36<5:14:47,  4.63it/s]

epoch 0 step 37503 loss tensor(0.7448, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37520/125000 [2:22:39<5:14:00,  4.64it/s]

epoch 0 step 37519 loss tensor(0.6318, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37536/125000 [2:22:42<5:14:25,  4.64it/s]

epoch 0 step 37535 loss tensor(0.6565, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37552/125000 [2:22:46<5:13:29,  4.65it/s]

epoch 0 step 37551 loss tensor(0.7315, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37568/125000 [2:22:49<5:14:02,  4.64it/s]

epoch 0 step 37567 loss tensor(0.6164, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37584/125000 [2:22:52<5:14:05,  4.64it/s]

epoch 0 step 37583 loss tensor(0.5443, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37600/125000 [2:22:56<5:13:23,  4.65it/s]

epoch 0 step 37599 loss tensor(0.5558, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37616/125000 [2:22:59<5:13:30,  4.65it/s]

epoch 0 step 37615 loss tensor(0.8128, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37632/125000 [2:23:03<5:13:47,  4.64it/s]

epoch 0 step 37631 loss tensor(0.5899, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37648/125000 [2:23:06<5:13:50,  4.64it/s]

epoch 0 step 37647 loss tensor(0.5996, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37664/125000 [2:23:09<5:13:54,  4.64it/s]

epoch 0 step 37663 loss tensor(0.7673, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37680/125000 [2:23:13<5:14:12,  4.63it/s]

epoch 0 step 37679 loss tensor(0.6579, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37696/125000 [2:23:16<5:13:27,  4.64it/s]

epoch 0 step 37695 loss tensor(0.7026, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37712/125000 [2:23:19<5:13:53,  4.63it/s]

epoch 0 step 37711 loss tensor(0.5842, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37728/125000 [2:23:23<5:13:01,  4.65it/s]

epoch 0 step 37727 loss tensor(0.5982, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37744/125000 [2:23:26<5:13:59,  4.63it/s]

epoch 0 step 37743 loss tensor(0.6418, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37760/125000 [2:23:29<5:13:22,  4.64it/s]

epoch 0 step 37759 loss tensor(0.5589, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37776/125000 [2:23:33<5:13:18,  4.64it/s]

epoch 0 step 37775 loss tensor(0.7470, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37792/125000 [2:23:36<5:12:12,  4.66it/s]

epoch 0 step 37791 loss tensor(0.5049, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37808/125000 [2:23:39<5:11:40,  4.66it/s]

epoch 0 step 37807 loss tensor(0.5691, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37824/125000 [2:23:43<5:13:13,  4.64it/s]

epoch 0 step 37823 loss tensor(0.7090, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37840/125000 [2:23:46<5:13:15,  4.64it/s]

epoch 0 step 37839 loss tensor(0.6504, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37856/125000 [2:23:49<5:13:24,  4.63it/s]

epoch 0 step 37855 loss tensor(0.6161, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37872/125000 [2:23:53<5:13:36,  4.63it/s]

epoch 0 step 37871 loss tensor(0.5537, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37888/125000 [2:23:56<5:13:03,  4.64it/s]

epoch 0 step 37887 loss tensor(0.5753, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37904/125000 [2:23:59<5:13:02,  4.64it/s]

epoch 0 step 37903 loss tensor(0.6784, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37920/125000 [2:24:03<5:12:01,  4.65it/s]

epoch 0 step 37919 loss tensor(0.6016, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37936/125000 [2:24:06<5:13:07,  4.63it/s]

epoch 0 step 37935 loss tensor(0.6590, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37952/125000 [2:24:09<5:11:23,  4.66it/s]

epoch 0 step 37951 loss tensor(0.6872, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37968/125000 [2:24:13<5:11:15,  4.66it/s]

epoch 0 step 37967 loss tensor(0.5603, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 37984/125000 [2:24:16<5:12:13,  4.65it/s]

epoch 0 step 37983 loss tensor(0.6725, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 38000/125000 [2:24:19<5:10:59,  4.66it/s]

epoch 0 step 37999 loss tensor(0.7583, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 38016/125000 [2:24:23<5:10:57,  4.66it/s]

epoch 0 step 38015 loss tensor(0.5639, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 38032/125000 [2:24:26<5:13:06,  4.63it/s]

epoch 0 step 38031 loss tensor(0.6617, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 38048/125000 [2:24:29<5:12:46,  4.63it/s]

epoch 0 step 38047 loss tensor(0.6430, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 38064/125000 [2:24:33<5:12:28,  4.64it/s]

epoch 0 step 38063 loss tensor(0.5209, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 38080/125000 [2:24:36<5:12:18,  4.64it/s]

epoch 0 step 38079 loss tensor(0.6560, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 38096/125000 [2:24:39<5:11:08,  4.65it/s]

epoch 0 step 38095 loss tensor(0.7279, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███       | 38112/125000 [2:24:43<5:11:24,  4.65it/s]

epoch 0 step 38111 loss tensor(0.5771, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38128/125000 [2:24:46<5:11:33,  4.65it/s]

epoch 0 step 38127 loss tensor(0.6262, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38144/125000 [2:24:50<5:12:37,  4.63it/s]

epoch 0 step 38143 loss tensor(0.5380, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38160/125000 [2:24:53<5:12:02,  4.64it/s]

epoch 0 step 38159 loss tensor(0.5377, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38176/125000 [2:24:56<5:12:17,  4.63it/s]

epoch 0 step 38175 loss tensor(0.6607, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38192/125000 [2:25:00<5:12:23,  4.63it/s]

epoch 0 step 38191 loss tensor(0.5193, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38208/125000 [2:25:03<5:11:47,  4.64it/s]

epoch 0 step 38207 loss tensor(0.5195, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38224/125000 [2:25:06<5:11:42,  4.64it/s]

epoch 0 step 38223 loss tensor(0.6549, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38240/125000 [2:25:10<5:11:28,  4.64it/s]

epoch 0 step 38239 loss tensor(0.6586, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38256/125000 [2:25:13<5:12:05,  4.63it/s]

epoch 0 step 38255 loss tensor(0.6421, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38272/125000 [2:25:16<5:11:29,  4.64it/s]

epoch 0 step 38271 loss tensor(0.5340, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38288/125000 [2:25:20<5:11:32,  4.64it/s]

epoch 0 step 38287 loss tensor(0.5692, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38304/125000 [2:25:23<5:11:49,  4.63it/s]

epoch 0 step 38303 loss tensor(0.7426, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38320/125000 [2:25:26<5:10:41,  4.65it/s]

epoch 0 step 38319 loss tensor(0.5877, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38336/125000 [2:25:30<5:11:30,  4.64it/s]

epoch 0 step 38335 loss tensor(0.7300, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38352/125000 [2:25:33<5:10:45,  4.65it/s]

epoch 0 step 38351 loss tensor(0.6742, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38368/125000 [2:25:36<5:10:36,  4.65it/s]

epoch 0 step 38367 loss tensor(0.6373, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38384/125000 [2:25:40<5:10:56,  4.64it/s]

epoch 0 step 38383 loss tensor(0.5579, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38400/125000 [2:25:43<5:10:34,  4.65it/s]

epoch 0 step 38399 loss tensor(0.5937, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38416/125000 [2:25:46<5:10:37,  4.65it/s]

epoch 0 step 38415 loss tensor(0.5629, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38432/125000 [2:25:50<5:10:46,  4.64it/s]

epoch 0 step 38431 loss tensor(0.6676, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38448/125000 [2:25:53<5:10:53,  4.64it/s]

epoch 0 step 38447 loss tensor(0.5339, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38464/125000 [2:25:56<5:11:08,  4.64it/s]

epoch 0 step 38463 loss tensor(0.6514, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38480/125000 [2:26:00<5:10:47,  4.64it/s]

epoch 0 step 38479 loss tensor(0.6316, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38496/125000 [2:26:03<5:10:17,  4.65it/s]

epoch 0 step 38495 loss tensor(0.5847, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38512/125000 [2:26:06<5:10:56,  4.64it/s]

epoch 0 step 38511 loss tensor(0.6582, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38528/125000 [2:26:10<5:10:15,  4.65it/s]

epoch 0 step 38527 loss tensor(0.6194, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38544/125000 [2:26:13<5:11:10,  4.63it/s]

epoch 0 step 38543 loss tensor(0.5006, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38560/125000 [2:26:17<5:10:24,  4.64it/s]

epoch 0 step 38559 loss tensor(0.7334, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38576/125000 [2:26:20<5:10:49,  4.63it/s]

epoch 0 step 38575 loss tensor(0.5960, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38592/125000 [2:26:23<5:10:10,  4.64it/s]

epoch 0 step 38591 loss tensor(0.6679, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38608/125000 [2:26:27<5:09:23,  4.65it/s]

epoch 0 step 38607 loss tensor(0.5666, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38624/125000 [2:26:30<5:10:05,  4.64it/s]

epoch 0 step 38623 loss tensor(0.5868, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38640/125000 [2:26:33<5:09:36,  4.65it/s]

epoch 0 step 38639 loss tensor(0.7931, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38656/125000 [2:26:37<5:09:52,  4.64it/s]

epoch 0 step 38655 loss tensor(0.5481, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38672/125000 [2:26:40<5:10:16,  4.64it/s]

epoch 0 step 38671 loss tensor(0.6651, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38688/125000 [2:26:43<5:09:53,  4.64it/s]

epoch 0 step 38687 loss tensor(0.7317, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38704/125000 [2:26:47<5:09:52,  4.64it/s]

epoch 0 step 38703 loss tensor(0.5716, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38720/125000 [2:26:50<5:09:21,  4.65it/s]

epoch 0 step 38719 loss tensor(0.6055, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38736/125000 [2:26:53<5:09:49,  4.64it/s]

epoch 0 step 38735 loss tensor(0.5712, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38752/125000 [2:26:57<5:09:53,  4.64it/s]

epoch 0 step 38751 loss tensor(0.6176, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38768/125000 [2:27:00<5:09:44,  4.64it/s]

epoch 0 step 38767 loss tensor(0.6182, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38784/125000 [2:27:03<5:09:18,  4.65it/s]

epoch 0 step 38783 loss tensor(0.5696, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38800/125000 [2:27:07<5:09:32,  4.64it/s]

epoch 0 step 38799 loss tensor(0.5547, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38816/125000 [2:27:10<5:09:20,  4.64it/s]

epoch 0 step 38815 loss tensor(0.6399, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38832/125000 [2:27:13<5:09:28,  4.64it/s]

epoch 0 step 38831 loss tensor(0.5870, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38848/125000 [2:27:17<5:09:29,  4.64it/s]

epoch 0 step 38847 loss tensor(0.6394, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38864/125000 [2:27:20<5:09:34,  4.64it/s]

epoch 0 step 38863 loss tensor(0.7285, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38880/125000 [2:27:23<5:08:56,  4.65it/s]

epoch 0 step 38879 loss tensor(0.4614, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38896/125000 [2:27:27<5:09:23,  4.64it/s]

epoch 0 step 38895 loss tensor(0.6909, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38912/125000 [2:27:30<5:08:51,  4.65it/s]

epoch 0 step 38911 loss tensor(0.6110, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38928/125000 [2:27:33<5:09:01,  4.64it/s]

epoch 0 step 38927 loss tensor(0.6323, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38944/125000 [2:27:37<5:09:31,  4.63it/s]

epoch 0 step 38943 loss tensor(0.6198, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38960/125000 [2:27:40<5:09:11,  4.64it/s]

epoch 0 step 38959 loss tensor(0.4887, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38976/125000 [2:27:44<5:08:40,  4.64it/s]

epoch 0 step 38975 loss tensor(0.6640, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 38992/125000 [2:27:47<5:09:22,  4.63it/s]

epoch 0 step 38991 loss tensor(0.6099, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 39008/125000 [2:27:50<5:08:26,  4.65it/s]

epoch 0 step 39007 loss tensor(0.5305, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 39024/125000 [2:27:54<5:08:48,  4.64it/s]

epoch 0 step 39023 loss tensor(0.6038, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 39040/125000 [2:27:57<5:08:16,  4.65it/s]

epoch 0 step 39039 loss tensor(0.6580, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███       | 39056/125000 [2:28:00<5:08:52,  4.64it/s]

epoch 0 step 39055 loss tensor(0.6351, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39072/125000 [2:28:04<5:08:31,  4.64it/s]

epoch 0 step 39071 loss tensor(0.7225, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39088/125000 [2:28:07<5:08:29,  4.64it/s]

epoch 0 step 39087 loss tensor(0.6779, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39104/125000 [2:28:10<5:08:50,  4.64it/s]

epoch 0 step 39103 loss tensor(0.6946, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39120/125000 [2:28:14<5:08:59,  4.63it/s]

epoch 0 step 39119 loss tensor(0.6096, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39136/125000 [2:28:17<5:08:42,  4.64it/s]

epoch 0 step 39135 loss tensor(0.4843, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39152/125000 [2:28:20<5:08:35,  4.64it/s]

epoch 0 step 39151 loss tensor(0.6716, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39168/125000 [2:28:24<5:08:49,  4.63it/s]

epoch 0 step 39167 loss tensor(0.6067, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39184/125000 [2:28:27<5:07:59,  4.64it/s]

epoch 0 step 39183 loss tensor(0.5755, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39200/125000 [2:28:30<5:08:42,  4.63it/s]

epoch 0 step 39199 loss tensor(0.6761, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39216/125000 [2:28:34<5:07:52,  4.64it/s]

epoch 0 step 39215 loss tensor(0.6212, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39232/125000 [2:28:37<5:07:40,  4.65it/s]

epoch 0 step 39231 loss tensor(0.6218, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39248/125000 [2:28:40<5:08:09,  4.64it/s]

epoch 0 step 39247 loss tensor(0.6578, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39264/125000 [2:28:44<5:08:24,  4.63it/s]

epoch 0 step 39263 loss tensor(0.6177, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39280/125000 [2:28:47<5:07:56,  4.64it/s]

epoch 0 step 39279 loss tensor(0.5707, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39296/125000 [2:28:50<5:07:49,  4.64it/s]

epoch 0 step 39295 loss tensor(0.5121, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39312/125000 [2:28:54<5:07:49,  4.64it/s]

epoch 0 step 39311 loss tensor(0.7568, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39328/125000 [2:28:57<5:07:47,  4.64it/s]

epoch 0 step 39327 loss tensor(0.5011, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39344/125000 [2:29:00<5:08:04,  4.63it/s]

epoch 0 step 39343 loss tensor(0.7010, device='cuda:0', grad_fn=<DivBackward0>)


 31%|███▏      | 39360/125000 [2:29:04<5:07:31,  4.64it/s]

epoch 0 step 39359 loss tensor(0.5688, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39376/125000 [2:29:07<5:08:12,  4.63it/s]

epoch 0 step 39375 loss tensor(0.7416, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39392/125000 [2:29:11<5:07:39,  4.64it/s]

epoch 0 step 39391 loss tensor(0.5977, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39408/125000 [2:29:14<5:07:29,  4.64it/s]

epoch 0 step 39407 loss tensor(0.6588, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39424/125000 [2:29:17<5:07:34,  4.64it/s]

epoch 0 step 39423 loss tensor(0.6629, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39440/125000 [2:29:21<5:06:44,  4.65it/s]

epoch 0 step 39439 loss tensor(0.5629, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39456/125000 [2:29:24<5:07:20,  4.64it/s]

epoch 0 step 39455 loss tensor(0.7799, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39472/125000 [2:29:27<5:07:16,  4.64it/s]

epoch 0 step 39471 loss tensor(0.6845, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39488/125000 [2:29:31<5:07:26,  4.64it/s]

epoch 0 step 39487 loss tensor(0.5604, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39504/125000 [2:29:34<5:06:53,  4.64it/s]

epoch 0 step 39503 loss tensor(0.5971, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39520/125000 [2:29:37<5:07:01,  4.64it/s]

epoch 0 step 39519 loss tensor(0.6917, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39536/125000 [2:29:41<5:07:02,  4.64it/s]

epoch 0 step 39535 loss tensor(0.6761, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39552/125000 [2:29:44<5:06:52,  4.64it/s]

epoch 0 step 39551 loss tensor(0.6627, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39568/125000 [2:29:47<5:07:20,  4.63it/s]

epoch 0 step 39567 loss tensor(0.6736, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39584/125000 [2:29:51<5:07:19,  4.63it/s]

epoch 0 step 39583 loss tensor(0.6199, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39600/125000 [2:29:54<5:06:33,  4.64it/s]

epoch 0 step 39599 loss tensor(0.7233, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39616/125000 [2:29:57<5:06:55,  4.64it/s]

epoch 0 step 39615 loss tensor(0.6650, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39632/125000 [2:30:01<5:06:19,  4.64it/s]

epoch 0 step 39631 loss tensor(0.6863, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39648/125000 [2:30:04<5:06:59,  4.63it/s]

epoch 0 step 39647 loss tensor(0.5616, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39664/125000 [2:30:07<5:06:28,  4.64it/s]

epoch 0 step 39663 loss tensor(0.5992, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39680/125000 [2:30:11<5:06:12,  4.64it/s]

epoch 0 step 39679 loss tensor(0.6158, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39696/125000 [2:30:14<5:06:47,  4.63it/s]

epoch 0 step 39695 loss tensor(0.6467, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39712/125000 [2:30:17<5:06:01,  4.64it/s]

epoch 0 step 39711 loss tensor(0.5653, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39728/125000 [2:30:21<5:05:45,  4.65it/s]

epoch 0 step 39727 loss tensor(0.5720, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39744/125000 [2:30:24<5:06:00,  4.64it/s]

epoch 0 step 39743 loss tensor(0.5114, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39760/125000 [2:30:27<5:06:27,  4.64it/s]

epoch 0 step 39759 loss tensor(0.6623, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39776/125000 [2:30:31<5:06:22,  4.64it/s]

epoch 0 step 39775 loss tensor(0.6078, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39792/125000 [2:30:34<5:05:31,  4.65it/s]

epoch 0 step 39791 loss tensor(0.6054, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39808/125000 [2:30:38<5:05:39,  4.65it/s]

epoch 0 step 39807 loss tensor(0.7008, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39824/125000 [2:30:41<5:06:08,  4.64it/s]

epoch 0 step 39823 loss tensor(0.6715, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39840/125000 [2:30:44<5:06:00,  4.64it/s]

epoch 0 step 39839 loss tensor(0.6263, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39856/125000 [2:30:48<5:05:52,  4.64it/s]

epoch 0 step 39855 loss tensor(0.6495, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39872/125000 [2:30:51<5:05:53,  4.64it/s]

epoch 0 step 39871 loss tensor(0.6888, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39888/125000 [2:30:54<5:06:17,  4.63it/s]

epoch 0 step 39887 loss tensor(0.6915, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39904/125000 [2:30:58<5:05:38,  4.64it/s]

epoch 0 step 39903 loss tensor(0.8236, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39920/125000 [2:31:01<5:05:38,  4.64it/s]

epoch 0 step 39919 loss tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39936/125000 [2:31:04<5:05:55,  4.63it/s]

epoch 0 step 39935 loss tensor(0.7680, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39952/125000 [2:31:08<5:05:54,  4.63it/s]

epoch 0 step 39951 loss tensor(0.5822, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39968/125000 [2:31:11<5:05:38,  4.64it/s]

epoch 0 step 39967 loss tensor(0.7010, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 39984/125000 [2:31:14<5:05:00,  4.65it/s]

epoch 0 step 39983 loss tensor(0.6064, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40000/125000 [2:31:18<5:05:34,  4.64it/s]

epoch 0 step 39999 loss tensor(0.6397, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40016/125000 [2:31:21<5:05:45,  4.63it/s]

epoch 0 step 40015 loss tensor(0.6584, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40032/125000 [2:31:24<5:05:50,  4.63it/s]

epoch 0 step 40031 loss tensor(0.6223, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40048/125000 [2:31:28<5:05:07,  4.64it/s]

epoch 0 step 40047 loss tensor(0.7574, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40064/125000 [2:31:31<5:05:28,  4.63it/s]

epoch 0 step 40063 loss tensor(0.6140, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40080/125000 [2:31:34<5:04:46,  4.64it/s]

epoch 0 step 40079 loss tensor(0.6511, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40096/125000 [2:31:38<5:05:02,  4.64it/s]

epoch 0 step 40095 loss tensor(0.5405, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40112/125000 [2:31:41<5:05:38,  4.63it/s]

epoch 0 step 40111 loss tensor(0.6105, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40128/125000 [2:31:44<5:05:13,  4.63it/s]

epoch 0 step 40127 loss tensor(0.7372, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40144/125000 [2:31:48<5:05:03,  4.64it/s]

epoch 0 step 40143 loss tensor(0.5829, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40160/125000 [2:31:51<5:05:10,  4.63it/s]

epoch 0 step 40159 loss tensor(0.6976, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40176/125000 [2:31:54<5:05:02,  4.63it/s]

epoch 0 step 40175 loss tensor(0.6682, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40192/125000 [2:31:58<5:03:26,  4.66it/s]

epoch 0 step 40191 loss tensor(0.6769, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40208/125000 [2:32:01<5:03:21,  4.66it/s]

epoch 0 step 40207 loss tensor(0.7772, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40224/125000 [2:32:04<5:03:02,  4.66it/s]

epoch 0 step 40223 loss tensor(0.6815, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40240/125000 [2:32:08<5:03:45,  4.65it/s]

epoch 0 step 40239 loss tensor(0.7067, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40256/125000 [2:32:11<5:03:21,  4.66it/s]

epoch 0 step 40255 loss tensor(0.7124, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40272/125000 [2:32:15<5:03:42,  4.65it/s]

epoch 0 step 40271 loss tensor(0.7296, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40288/125000 [2:32:18<5:03:03,  4.66it/s]

epoch 0 step 40287 loss tensor(0.6718, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40304/125000 [2:32:21<5:04:57,  4.63it/s]

epoch 0 step 40303 loss tensor(0.6851, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40320/125000 [2:32:25<5:04:42,  4.63it/s]

epoch 0 step 40319 loss tensor(0.5823, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40336/125000 [2:32:28<5:04:33,  4.63it/s]

epoch 0 step 40335 loss tensor(0.6374, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40352/125000 [2:32:31<5:05:02,  4.62it/s]

epoch 0 step 40351 loss tensor(0.6254, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40368/125000 [2:32:35<5:03:30,  4.65it/s]

epoch 0 step 40367 loss tensor(0.6289, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40384/125000 [2:32:38<5:02:57,  4.66it/s]

epoch 0 step 40383 loss tensor(0.7181, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40400/125000 [2:32:41<5:02:46,  4.66it/s]

epoch 0 step 40399 loss tensor(0.6246, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40416/125000 [2:32:45<5:03:45,  4.64it/s]

epoch 0 step 40415 loss tensor(0.6775, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40432/125000 [2:32:48<5:04:27,  4.63it/s]

epoch 0 step 40431 loss tensor(0.5925, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40448/125000 [2:32:51<5:03:49,  4.64it/s]

epoch 0 step 40447 loss tensor(0.5708, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40464/125000 [2:32:55<5:03:33,  4.64it/s]

epoch 0 step 40463 loss tensor(0.6822, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40480/125000 [2:32:58<5:04:11,  4.63it/s]

epoch 0 step 40479 loss tensor(0.5837, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40496/125000 [2:33:01<5:04:01,  4.63it/s]

epoch 0 step 40495 loss tensor(0.6711, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40512/125000 [2:33:05<5:04:00,  4.63it/s]

epoch 0 step 40511 loss tensor(0.5680, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40528/125000 [2:33:08<5:03:28,  4.64it/s]

epoch 0 step 40527 loss tensor(0.6567, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40544/125000 [2:33:11<5:03:41,  4.64it/s]

epoch 0 step 40543 loss tensor(0.6176, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40560/125000 [2:33:15<5:03:46,  4.63it/s]

epoch 0 step 40559 loss tensor(0.7378, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40576/125000 [2:33:18<5:02:51,  4.65it/s]

epoch 0 step 40575 loss tensor(0.5190, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40592/125000 [2:33:21<5:03:28,  4.64it/s]

epoch 0 step 40591 loss tensor(0.5955, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40608/125000 [2:33:25<5:03:10,  4.64it/s]

epoch 0 step 40607 loss tensor(0.5904, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 40624/125000 [2:33:28<5:03:38,  4.63it/s]

epoch 0 step 40623 loss tensor(0.7366, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40640/125000 [2:33:31<5:02:46,  4.64it/s]

epoch 0 step 40639 loss tensor(0.6786, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40656/125000 [2:33:35<5:03:44,  4.63it/s]

epoch 0 step 40655 loss tensor(0.5749, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40672/125000 [2:33:38<5:03:19,  4.63it/s]

epoch 0 step 40671 loss tensor(0.7241, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40688/125000 [2:33:42<5:03:16,  4.63it/s]

epoch 0 step 40687 loss tensor(0.6852, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40704/125000 [2:33:45<5:03:23,  4.63it/s]

epoch 0 step 40703 loss tensor(0.5934, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40720/125000 [2:33:48<5:03:15,  4.63it/s]

epoch 0 step 40719 loss tensor(0.6123, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40736/125000 [2:33:52<5:02:54,  4.64it/s]

epoch 0 step 40735 loss tensor(0.4811, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40752/125000 [2:33:55<5:02:50,  4.64it/s]

epoch 0 step 40751 loss tensor(0.7625, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40768/125000 [2:33:58<5:03:18,  4.63it/s]

epoch 0 step 40767 loss tensor(0.6443, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40784/125000 [2:34:02<5:02:30,  4.64it/s]

epoch 0 step 40783 loss tensor(0.5967, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40800/125000 [2:34:05<5:02:14,  4.64it/s]

epoch 0 step 40799 loss tensor(0.6467, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40816/125000 [2:34:08<5:02:31,  4.64it/s]

epoch 0 step 40815 loss tensor(0.5844, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40832/125000 [2:34:12<5:02:47,  4.63it/s]

epoch 0 step 40831 loss tensor(0.4920, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40848/125000 [2:34:15<5:02:10,  4.64it/s]

epoch 0 step 40847 loss tensor(0.5438, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40864/125000 [2:34:18<5:02:50,  4.63it/s]

epoch 0 step 40863 loss tensor(0.5191, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40880/125000 [2:34:22<5:02:41,  4.63it/s]

epoch 0 step 40879 loss tensor(0.5525, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40896/125000 [2:34:25<5:02:19,  4.64it/s]

epoch 0 step 40895 loss tensor(0.5871, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40912/125000 [2:34:28<5:02:09,  4.64it/s]

epoch 0 step 40911 loss tensor(0.5682, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40928/125000 [2:34:32<5:02:50,  4.63it/s]

epoch 0 step 40927 loss tensor(0.6693, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40944/125000 [2:34:35<5:01:52,  4.64it/s]

epoch 0 step 40943 loss tensor(0.5556, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40960/125000 [2:34:38<5:01:45,  4.64it/s]

epoch 0 step 40959 loss tensor(0.6789, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40976/125000 [2:34:42<5:01:52,  4.64it/s]

epoch 0 step 40975 loss tensor(0.6312, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 40992/125000 [2:34:45<5:01:09,  4.65it/s]

epoch 0 step 40991 loss tensor(0.5255, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41008/125000 [2:34:48<5:00:52,  4.65it/s]

epoch 0 step 41007 loss tensor(0.5879, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41024/125000 [2:34:52<5:00:42,  4.65it/s]

epoch 0 step 41023 loss tensor(0.6604, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41040/125000 [2:34:55<5:01:15,  4.64it/s]

epoch 0 step 41039 loss tensor(0.6762, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41056/125000 [2:34:58<5:02:04,  4.63it/s]

epoch 0 step 41055 loss tensor(0.7860, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41072/125000 [2:35:02<5:01:36,  4.64it/s]

epoch 0 step 41071 loss tensor(0.6472, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41088/125000 [2:35:05<5:00:30,  4.65it/s]

epoch 0 step 41087 loss tensor(0.6453, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41104/125000 [2:35:08<4:59:47,  4.66it/s]

epoch 0 step 41103 loss tensor(0.5629, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41120/125000 [2:35:12<5:00:12,  4.66it/s]

epoch 0 step 41119 loss tensor(0.7400, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41136/125000 [2:35:15<4:59:17,  4.67it/s]

epoch 0 step 41135 loss tensor(0.5544, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41152/125000 [2:35:18<4:59:27,  4.67it/s]

epoch 0 step 41151 loss tensor(0.6583, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41168/125000 [2:35:22<5:02:33,  4.62it/s]

epoch 0 step 41167 loss tensor(0.6028, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41184/125000 [2:35:25<5:01:49,  4.63it/s]

epoch 0 step 41183 loss tensor(0.6407, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41200/125000 [2:35:29<5:01:05,  4.64it/s]

epoch 0 step 41199 loss tensor(0.6458, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41216/125000 [2:35:32<5:02:02,  4.62it/s]

epoch 0 step 41215 loss tensor(0.6462, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41232/125000 [2:35:35<5:01:30,  4.63it/s]

epoch 0 step 41231 loss tensor(0.5629, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41248/125000 [2:35:39<5:00:52,  4.64it/s]

epoch 0 step 41247 loss tensor(0.8189, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41264/125000 [2:35:42<5:02:09,  4.62it/s]

epoch 0 step 41263 loss tensor(0.6660, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41280/125000 [2:35:45<5:01:05,  4.63it/s]

epoch 0 step 41279 loss tensor(0.6026, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41296/125000 [2:35:49<5:00:44,  4.64it/s]

epoch 0 step 41295 loss tensor(0.7156, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41312/125000 [2:35:52<5:00:24,  4.64it/s]

epoch 0 step 41311 loss tensor(0.6451, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41328/125000 [2:35:55<4:59:42,  4.65it/s]

epoch 0 step 41327 loss tensor(0.7551, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41344/125000 [2:35:59<4:59:58,  4.65it/s]

epoch 0 step 41343 loss tensor(0.5570, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41360/125000 [2:36:02<5:00:43,  4.64it/s]

epoch 0 step 41359 loss tensor(0.5875, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41376/125000 [2:36:05<5:00:14,  4.64it/s]

epoch 0 step 41375 loss tensor(0.7293, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41392/125000 [2:36:09<5:00:15,  4.64it/s]

epoch 0 step 41391 loss tensor(0.5982, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41408/125000 [2:36:12<5:00:36,  4.63it/s]

epoch 0 step 41407 loss tensor(0.5443, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41424/125000 [2:36:15<5:00:40,  4.63it/s]

epoch 0 step 41423 loss tensor(0.6144, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41440/125000 [2:36:19<4:59:57,  4.64it/s]

epoch 0 step 41439 loss tensor(0.5874, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41456/125000 [2:36:22<5:00:21,  4.64it/s]

epoch 0 step 41455 loss tensor(0.5553, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41472/125000 [2:36:25<5:00:08,  4.64it/s]

epoch 0 step 41471 loss tensor(0.7246, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41488/125000 [2:36:29<4:59:39,  4.64it/s]

epoch 0 step 41487 loss tensor(0.5820, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41504/125000 [2:36:32<5:00:08,  4.64it/s]

epoch 0 step 41503 loss tensor(0.5993, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41520/125000 [2:36:35<5:00:10,  4.64it/s]

epoch 0 step 41519 loss tensor(0.6660, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41536/125000 [2:36:39<4:59:35,  4.64it/s]

epoch 0 step 41535 loss tensor(0.7291, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41552/125000 [2:36:42<4:59:49,  4.64it/s]

epoch 0 step 41551 loss tensor(0.7133, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41568/125000 [2:36:46<4:59:48,  4.64it/s]

epoch 0 step 41567 loss tensor(0.6950, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41584/125000 [2:36:49<5:00:16,  4.63it/s]

epoch 0 step 41583 loss tensor(0.8517, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41600/125000 [2:36:52<5:00:01,  4.63it/s]

epoch 0 step 41599 loss tensor(0.6562, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41616/125000 [2:36:56<4:59:18,  4.64it/s]

epoch 0 step 41615 loss tensor(0.6148, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41632/125000 [2:36:59<4:59:13,  4.64it/s]

epoch 0 step 41631 loss tensor(0.5540, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41648/125000 [2:37:02<4:59:38,  4.64it/s]

epoch 0 step 41647 loss tensor(0.7313, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41664/125000 [2:37:06<4:59:48,  4.63it/s]

epoch 0 step 41663 loss tensor(0.6225, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41680/125000 [2:37:09<4:57:42,  4.66it/s]

epoch 0 step 41679 loss tensor(0.6511, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41696/125000 [2:37:12<4:58:24,  4.65it/s]

epoch 0 step 41695 loss tensor(0.5955, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41712/125000 [2:37:16<4:57:08,  4.67it/s]

epoch 0 step 41711 loss tensor(0.5763, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41728/125000 [2:37:19<4:59:09,  4.64it/s]

epoch 0 step 41727 loss tensor(0.7525, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41744/125000 [2:37:22<4:59:00,  4.64it/s]

epoch 0 step 41743 loss tensor(0.6375, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41760/125000 [2:37:26<4:58:37,  4.65it/s]

epoch 0 step 41759 loss tensor(0.7212, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41776/125000 [2:37:29<4:58:14,  4.65it/s]

epoch 0 step 41775 loss tensor(0.7289, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41792/125000 [2:37:32<4:59:31,  4.63it/s]

epoch 0 step 41791 loss tensor(0.6734, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41808/125000 [2:37:36<4:59:09,  4.63it/s]

epoch 0 step 41807 loss tensor(0.6879, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41824/125000 [2:37:39<4:59:28,  4.63it/s]

epoch 0 step 41823 loss tensor(0.5262, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41840/125000 [2:37:42<4:59:32,  4.63it/s]

epoch 0 step 41839 loss tensor(0.5912, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41856/125000 [2:37:46<4:58:46,  4.64it/s]

epoch 0 step 41855 loss tensor(0.6294, device='cuda:0', grad_fn=<DivBackward0>)


 33%|███▎      | 41872/125000 [2:37:49<4:57:39,  4.65it/s]

epoch 0 step 41871 loss tensor(0.6541, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 41888/125000 [2:37:52<4:57:50,  4.65it/s]

epoch 0 step 41887 loss tensor(0.5860, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 41904/125000 [2:37:56<4:57:50,  4.65it/s]

epoch 0 step 41903 loss tensor(0.6122, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 41920/125000 [2:37:59<4:58:13,  4.64it/s]

epoch 0 step 41919 loss tensor(0.6072, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 41936/125000 [2:38:02<4:58:55,  4.63it/s]

epoch 0 step 41935 loss tensor(0.7253, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 41952/125000 [2:38:06<4:58:53,  4.63it/s]

epoch 0 step 41951 loss tensor(0.7067, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 41968/125000 [2:38:09<4:57:05,  4.66it/s]

epoch 0 step 41967 loss tensor(0.5114, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 41984/125000 [2:38:12<4:57:01,  4.66it/s]

epoch 0 step 41983 loss tensor(0.6872, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42000/125000 [2:38:16<4:58:10,  4.64it/s]

epoch 0 step 41999 loss tensor(0.6728, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42016/125000 [2:38:19<4:56:06,  4.67it/s]

epoch 0 step 42015 loss tensor(0.5743, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42032/125000 [2:38:22<4:58:43,  4.63it/s]

epoch 0 step 42031 loss tensor(0.5494, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42048/125000 [2:38:26<4:56:34,  4.66it/s]

epoch 0 step 42047 loss tensor(0.5458, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42064/125000 [2:38:29<4:55:46,  4.67it/s]

epoch 0 step 42063 loss tensor(0.5842, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42080/125000 [2:38:32<4:57:28,  4.65it/s]

epoch 0 step 42079 loss tensor(0.5736, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42096/125000 [2:38:36<4:58:32,  4.63it/s]

epoch 0 step 42095 loss tensor(0.6592, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42112/125000 [2:38:39<4:57:20,  4.65it/s]

epoch 0 step 42111 loss tensor(0.6851, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42128/125000 [2:38:42<4:57:59,  4.63it/s]

epoch 0 step 42127 loss tensor(0.6270, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42144/125000 [2:38:46<4:57:28,  4.64it/s]

epoch 0 step 42143 loss tensor(0.6116, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42160/125000 [2:38:49<4:56:33,  4.66it/s]

epoch 0 step 42159 loss tensor(0.5679, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 42176/125000 [2:38:53<4:57:20,  4.64it/s]

epoch 0 step 42175 loss tensor(0.6238, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42192/125000 [2:38:56<4:58:07,  4.63it/s]

epoch 0 step 42191 loss tensor(0.4922, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42208/125000 [2:38:59<4:57:48,  4.63it/s]

epoch 0 step 42207 loss tensor(0.7695, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42224/125000 [2:39:03<4:56:54,  4.65it/s]

epoch 0 step 42223 loss tensor(0.5878, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42240/125000 [2:39:06<4:57:17,  4.64it/s]

epoch 0 step 42239 loss tensor(0.7250, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42256/125000 [2:39:09<4:57:47,  4.63it/s]

epoch 0 step 42255 loss tensor(0.6409, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42272/125000 [2:39:13<4:57:13,  4.64it/s]

epoch 0 step 42271 loss tensor(0.6540, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42288/125000 [2:39:16<4:57:23,  4.64it/s]

epoch 0 step 42287 loss tensor(0.5497, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42304/125000 [2:39:19<4:57:19,  4.64it/s]

epoch 0 step 42303 loss tensor(0.5346, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42320/125000 [2:39:23<4:57:07,  4.64it/s]

epoch 0 step 42319 loss tensor(0.5841, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42336/125000 [2:39:26<4:57:20,  4.63it/s]

epoch 0 step 42335 loss tensor(0.5948, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42352/125000 [2:39:29<4:57:33,  4.63it/s]

epoch 0 step 42351 loss tensor(0.6389, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42368/125000 [2:39:33<4:57:02,  4.64it/s]

epoch 0 step 42367 loss tensor(0.5911, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42384/125000 [2:39:36<4:56:43,  4.64it/s]

epoch 0 step 42383 loss tensor(0.5199, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42400/125000 [2:39:39<4:56:46,  4.64it/s]

epoch 0 step 42399 loss tensor(0.7049, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42416/125000 [2:39:43<4:57:01,  4.63it/s]

epoch 0 step 42415 loss tensor(0.5197, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42432/125000 [2:39:46<4:56:48,  4.64it/s]

epoch 0 step 42431 loss tensor(0.7976, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42448/125000 [2:39:49<4:56:05,  4.65it/s]

epoch 0 step 42447 loss tensor(0.5639, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42464/125000 [2:39:53<4:56:31,  4.64it/s]

epoch 0 step 42463 loss tensor(0.5214, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42480/125000 [2:39:56<4:56:43,  4.63it/s]

epoch 0 step 42479 loss tensor(0.5666, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42496/125000 [2:39:59<4:57:18,  4.63it/s]

epoch 0 step 42495 loss tensor(0.6810, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42512/125000 [2:40:03<4:56:03,  4.64it/s]

epoch 0 step 42511 loss tensor(0.5527, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42528/125000 [2:40:06<4:56:18,  4.64it/s]

epoch 0 step 42527 loss tensor(0.5729, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42544/125000 [2:40:09<4:56:13,  4.64it/s]

epoch 0 step 42543 loss tensor(0.6149, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42560/125000 [2:40:13<4:56:31,  4.63it/s]

epoch 0 step 42559 loss tensor(0.5174, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42576/125000 [2:40:16<4:56:33,  4.63it/s]

epoch 0 step 42575 loss tensor(0.6907, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42592/125000 [2:40:20<4:55:47,  4.64it/s]

epoch 0 step 42591 loss tensor(0.5911, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42608/125000 [2:40:23<4:56:22,  4.63it/s]

epoch 0 step 42607 loss tensor(0.5358, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42624/125000 [2:40:26<4:55:59,  4.64it/s]

epoch 0 step 42623 loss tensor(0.5316, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42640/125000 [2:40:30<4:55:55,  4.64it/s]

epoch 0 step 42639 loss tensor(0.6073, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42656/125000 [2:40:33<4:55:32,  4.64it/s]

epoch 0 step 42655 loss tensor(0.5785, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42672/125000 [2:40:36<4:55:48,  4.64it/s]

epoch 0 step 42671 loss tensor(0.5848, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42688/125000 [2:40:40<4:55:36,  4.64it/s]

epoch 0 step 42687 loss tensor(0.5369, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42704/125000 [2:40:43<4:55:50,  4.64it/s]

epoch 0 step 42703 loss tensor(0.6210, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42720/125000 [2:40:46<4:55:12,  4.65it/s]

epoch 0 step 42719 loss tensor(0.6041, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42736/125000 [2:40:50<4:54:56,  4.65it/s]

epoch 0 step 42735 loss tensor(0.5878, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42752/125000 [2:40:53<4:55:12,  4.64it/s]

epoch 0 step 42751 loss tensor(0.6348, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42768/125000 [2:40:56<4:55:04,  4.64it/s]

epoch 0 step 42767 loss tensor(0.6182, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42784/125000 [2:41:00<4:55:18,  4.64it/s]

epoch 0 step 42783 loss tensor(0.5609, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42800/125000 [2:41:03<4:55:53,  4.63it/s]

epoch 0 step 42799 loss tensor(0.7311, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42816/125000 [2:41:06<4:54:50,  4.65it/s]

epoch 0 step 42815 loss tensor(0.5765, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42832/125000 [2:41:10<4:54:45,  4.65it/s]

epoch 0 step 42831 loss tensor(0.5451, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42848/125000 [2:41:13<4:55:11,  4.64it/s]

epoch 0 step 42847 loss tensor(0.5651, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42864/125000 [2:41:16<4:55:30,  4.63it/s]

epoch 0 step 42863 loss tensor(0.6289, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42880/125000 [2:41:20<4:54:51,  4.64it/s]

epoch 0 step 42879 loss tensor(0.6103, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42896/125000 [2:41:23<4:55:34,  4.63it/s]

epoch 0 step 42895 loss tensor(0.7744, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42912/125000 [2:41:26<4:54:25,  4.65it/s]

epoch 0 step 42911 loss tensor(0.6220, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42928/125000 [2:41:30<4:54:59,  4.64it/s]

epoch 0 step 42927 loss tensor(0.5095, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42944/125000 [2:41:33<4:54:59,  4.64it/s]

epoch 0 step 42943 loss tensor(0.6639, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42960/125000 [2:41:36<4:54:26,  4.64it/s]

epoch 0 step 42959 loss tensor(0.6026, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42976/125000 [2:41:40<4:54:18,  4.65it/s]

epoch 0 step 42975 loss tensor(0.7566, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 42992/125000 [2:41:43<4:54:20,  4.64it/s]

epoch 0 step 42991 loss tensor(0.5633, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 43008/125000 [2:41:47<4:54:34,  4.64it/s]

epoch 0 step 43007 loss tensor(0.6237, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 43024/125000 [2:41:50<4:54:51,  4.63it/s]

epoch 0 step 43023 loss tensor(0.5742, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 43040/125000 [2:41:53<4:54:31,  4.64it/s]

epoch 0 step 43039 loss tensor(0.4918, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 43056/125000 [2:41:57<4:53:46,  4.65it/s]

epoch 0 step 43055 loss tensor(0.6788, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 43072/125000 [2:42:00<4:54:07,  4.64it/s]

epoch 0 step 43071 loss tensor(0.8090, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 43088/125000 [2:42:03<4:54:24,  4.64it/s]

epoch 0 step 43087 loss tensor(0.5555, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 43104/125000 [2:42:07<4:54:17,  4.64it/s]

epoch 0 step 43103 loss tensor(0.5491, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▍      | 43120/125000 [2:42:10<4:54:08,  4.64it/s]

epoch 0 step 43119 loss tensor(0.7145, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43136/125000 [2:42:13<4:54:19,  4.64it/s]

epoch 0 step 43135 loss tensor(0.6322, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43152/125000 [2:42:17<4:54:32,  4.63it/s]

epoch 0 step 43151 loss tensor(0.6705, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43168/125000 [2:42:20<4:53:46,  4.64it/s]

epoch 0 step 43167 loss tensor(0.7768, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43184/125000 [2:42:23<4:53:24,  4.65it/s]

epoch 0 step 43183 loss tensor(0.5723, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43200/125000 [2:42:27<4:53:36,  4.64it/s]

epoch 0 step 43199 loss tensor(0.5856, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43216/125000 [2:42:30<4:54:13,  4.63it/s]

epoch 0 step 43215 loss tensor(0.6082, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43232/125000 [2:42:33<4:53:50,  4.64it/s]

epoch 0 step 43231 loss tensor(0.5628, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43248/125000 [2:42:37<4:54:00,  4.63it/s]

epoch 0 step 43247 loss tensor(0.6736, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43264/125000 [2:42:40<4:53:19,  4.64it/s]

epoch 0 step 43263 loss tensor(0.7582, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43280/125000 [2:42:43<4:54:00,  4.63it/s]

epoch 0 step 43279 loss tensor(0.6082, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43296/125000 [2:42:47<4:53:42,  4.64it/s]

epoch 0 step 43295 loss tensor(0.6288, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43312/125000 [2:42:50<4:53:13,  4.64it/s]

epoch 0 step 43311 loss tensor(0.5735, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43328/125000 [2:42:53<4:53:41,  4.63it/s]

epoch 0 step 43327 loss tensor(0.6339, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43344/125000 [2:42:57<4:53:08,  4.64it/s]

epoch 0 step 43343 loss tensor(0.7380, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43360/125000 [2:43:00<4:52:53,  4.65it/s]

epoch 0 step 43359 loss tensor(0.5494, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43376/125000 [2:43:03<4:53:31,  4.63it/s]

epoch 0 step 43375 loss tensor(0.5939, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43392/125000 [2:43:07<4:53:33,  4.63it/s]

epoch 0 step 43391 loss tensor(0.6254, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43408/125000 [2:43:10<4:52:56,  4.64it/s]

epoch 0 step 43407 loss tensor(0.5331, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43424/125000 [2:43:14<4:52:37,  4.65it/s]

epoch 0 step 43423 loss tensor(0.5438, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43440/125000 [2:43:17<4:53:21,  4.63it/s]

epoch 0 step 43439 loss tensor(0.6345, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43456/125000 [2:43:20<4:53:19,  4.63it/s]

epoch 0 step 43455 loss tensor(0.5773, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43472/125000 [2:43:24<4:52:36,  4.64it/s]

epoch 0 step 43471 loss tensor(0.8939, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43488/125000 [2:43:27<4:52:31,  4.64it/s]

epoch 0 step 43487 loss tensor(0.6306, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43504/125000 [2:43:30<4:53:20,  4.63it/s]

epoch 0 step 43503 loss tensor(0.6047, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43520/125000 [2:43:34<4:52:06,  4.65it/s]

epoch 0 step 43519 loss tensor(0.4801, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43536/125000 [2:43:37<4:52:28,  4.64it/s]

epoch 0 step 43535 loss tensor(0.6296, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43552/125000 [2:43:40<4:52:40,  4.64it/s]

epoch 0 step 43551 loss tensor(0.5588, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43568/125000 [2:43:44<4:52:37,  4.64it/s]

epoch 0 step 43567 loss tensor(0.7314, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43584/125000 [2:43:47<4:52:01,  4.65it/s]

epoch 0 step 43583 loss tensor(0.5320, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43600/125000 [2:43:50<4:52:30,  4.64it/s]

epoch 0 step 43599 loss tensor(0.6081, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43616/125000 [2:43:54<4:52:32,  4.64it/s]

epoch 0 step 43615 loss tensor(0.6323, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43632/125000 [2:43:57<4:52:14,  4.64it/s]

epoch 0 step 43631 loss tensor(0.5719, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43648/125000 [2:44:00<4:52:19,  4.64it/s]

epoch 0 step 43647 loss tensor(0.6664, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43664/125000 [2:44:04<4:52:19,  4.64it/s]

epoch 0 step 43663 loss tensor(0.6277, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43680/125000 [2:44:07<4:52:18,  4.64it/s]

epoch 0 step 43679 loss tensor(0.5388, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43696/125000 [2:44:10<4:52:06,  4.64it/s]

epoch 0 step 43695 loss tensor(0.6291, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43712/125000 [2:44:14<4:51:35,  4.65it/s]

epoch 0 step 43711 loss tensor(0.5177, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43728/125000 [2:44:17<4:52:18,  4.63it/s]

epoch 0 step 43727 loss tensor(0.6837, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▍      | 43744/125000 [2:44:20<4:52:12,  4.63it/s]

epoch 0 step 43743 loss tensor(0.7194, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43760/125000 [2:44:24<4:51:40,  4.64it/s]

epoch 0 step 43759 loss tensor(0.6949, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43776/125000 [2:44:27<4:51:55,  4.64it/s]

epoch 0 step 43775 loss tensor(0.5113, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43792/125000 [2:44:30<4:51:09,  4.65it/s]

epoch 0 step 43791 loss tensor(0.6285, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43808/125000 [2:44:34<4:51:47,  4.64it/s]

epoch 0 step 43807 loss tensor(0.5978, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43824/125000 [2:44:37<4:51:36,  4.64it/s]

epoch 0 step 43823 loss tensor(0.5598, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43840/125000 [2:44:41<4:51:27,  4.64it/s]

epoch 0 step 43839 loss tensor(0.6235, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43856/125000 [2:44:44<4:51:28,  4.64it/s]

epoch 0 step 43855 loss tensor(0.5508, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43872/125000 [2:44:47<4:50:49,  4.65it/s]

epoch 0 step 43871 loss tensor(0.5933, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43888/125000 [2:44:51<4:51:08,  4.64it/s]

epoch 0 step 43887 loss tensor(0.6914, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43904/125000 [2:44:54<4:51:59,  4.63it/s]

epoch 0 step 43903 loss tensor(0.4500, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43920/125000 [2:44:57<4:50:58,  4.64it/s]

epoch 0 step 43919 loss tensor(0.5923, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43936/125000 [2:45:01<4:51:22,  4.64it/s]

epoch 0 step 43935 loss tensor(0.5577, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43952/125000 [2:45:04<4:51:10,  4.64it/s]

epoch 0 step 43951 loss tensor(0.6613, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43968/125000 [2:45:07<4:51:05,  4.64it/s]

epoch 0 step 43967 loss tensor(0.6389, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 43984/125000 [2:45:11<4:51:32,  4.63it/s]

epoch 0 step 43983 loss tensor(0.6006, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44000/125000 [2:45:14<4:50:30,  4.65it/s]

epoch 0 step 43999 loss tensor(0.5927, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44016/125000 [2:45:17<4:51:11,  4.64it/s]

epoch 0 step 44015 loss tensor(0.6186, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44032/125000 [2:45:21<4:51:21,  4.63it/s]

epoch 0 step 44031 loss tensor(0.7146, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44048/125000 [2:45:24<4:50:35,  4.64it/s]

epoch 0 step 44047 loss tensor(0.6545, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44064/125000 [2:45:27<4:50:51,  4.64it/s]

epoch 0 step 44063 loss tensor(0.5740, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44080/125000 [2:45:31<4:50:29,  4.64it/s]

epoch 0 step 44079 loss tensor(0.6427, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44096/125000 [2:45:34<4:50:14,  4.65it/s]

epoch 0 step 44095 loss tensor(0.7412, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44112/125000 [2:45:37<4:49:59,  4.65it/s]

epoch 0 step 44111 loss tensor(0.5104, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44128/125000 [2:45:41<4:50:34,  4.64it/s]

epoch 0 step 44127 loss tensor(0.6366, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44144/125000 [2:45:44<4:50:42,  4.64it/s]

epoch 0 step 44143 loss tensor(0.6878, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44160/125000 [2:45:47<4:50:06,  4.64it/s]

epoch 0 step 44159 loss tensor(0.5527, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44176/125000 [2:45:51<4:50:02,  4.64it/s]

epoch 0 step 44175 loss tensor(0.6277, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44192/125000 [2:45:54<4:50:04,  4.64it/s]

epoch 0 step 44191 loss tensor(0.4431, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44208/125000 [2:45:57<4:50:19,  4.64it/s]

epoch 0 step 44207 loss tensor(0.5293, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44224/125000 [2:46:01<4:50:22,  4.64it/s]

epoch 0 step 44223 loss tensor(0.5644, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44240/125000 [2:46:04<4:49:42,  4.65it/s]

epoch 0 step 44239 loss tensor(0.6023, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44256/125000 [2:46:08<4:50:01,  4.64it/s]

epoch 0 step 44255 loss tensor(0.5178, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44272/125000 [2:46:11<4:49:41,  4.64it/s]

epoch 0 step 44271 loss tensor(0.8066, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44288/125000 [2:46:14<4:50:32,  4.63it/s]

epoch 0 step 44287 loss tensor(0.7474, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44304/125000 [2:46:18<4:50:34,  4.63it/s]

epoch 0 step 44303 loss tensor(0.5681, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44320/125000 [2:46:21<4:49:26,  4.65it/s]

epoch 0 step 44319 loss tensor(0.5577, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44336/125000 [2:46:24<4:49:45,  4.64it/s]

epoch 0 step 44335 loss tensor(0.6164, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44352/125000 [2:46:28<4:50:01,  4.63it/s]

epoch 0 step 44351 loss tensor(0.7067, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 44368/125000 [2:46:31<4:49:57,  4.63it/s]

epoch 0 step 44367 loss tensor(0.5722, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44384/125000 [2:46:34<4:49:27,  4.64it/s]

epoch 0 step 44383 loss tensor(0.5639, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44400/125000 [2:46:38<4:49:26,  4.64it/s]

epoch 0 step 44399 loss tensor(0.6089, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44416/125000 [2:46:41<4:49:43,  4.64it/s]

epoch 0 step 44415 loss tensor(0.8146, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44432/125000 [2:46:44<4:49:10,  4.64it/s]

epoch 0 step 44431 loss tensor(0.6548, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44448/125000 [2:46:48<4:49:41,  4.63it/s]

epoch 0 step 44447 loss tensor(0.6007, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44464/125000 [2:46:51<4:49:36,  4.63it/s]

epoch 0 step 44463 loss tensor(0.5200, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44480/125000 [2:46:54<4:49:50,  4.63it/s]

epoch 0 step 44479 loss tensor(0.6773, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44496/125000 [2:46:58<4:48:07,  4.66it/s]

epoch 0 step 44495 loss tensor(0.4844, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44512/125000 [2:47:01<4:47:51,  4.66it/s]

epoch 0 step 44511 loss tensor(0.5955, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44528/125000 [2:47:04<4:48:33,  4.65it/s]

epoch 0 step 44527 loss tensor(0.5849, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44544/125000 [2:47:08<4:47:43,  4.66it/s]

epoch 0 step 44543 loss tensor(0.5617, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44560/125000 [2:47:11<4:48:44,  4.64it/s]

epoch 0 step 44559 loss tensor(0.6036, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44576/125000 [2:47:14<4:49:44,  4.63it/s]

epoch 0 step 44575 loss tensor(0.5960, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44592/125000 [2:47:18<4:49:28,  4.63it/s]

epoch 0 step 44591 loss tensor(0.5695, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44608/125000 [2:47:21<4:50:03,  4.62it/s]

epoch 0 step 44607 loss tensor(0.6255, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44624/125000 [2:47:24<4:48:49,  4.64it/s]

epoch 0 step 44623 loss tensor(0.7078, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44640/125000 [2:47:28<4:48:47,  4.64it/s]

epoch 0 step 44639 loss tensor(0.5395, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44656/125000 [2:47:31<4:48:28,  4.64it/s]

epoch 0 step 44655 loss tensor(0.5808, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44672/125000 [2:47:35<4:48:54,  4.63it/s]

epoch 0 step 44671 loss tensor(0.5772, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44688/125000 [2:47:38<4:48:29,  4.64it/s]

epoch 0 step 44687 loss tensor(0.6225, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44704/125000 [2:47:41<4:48:23,  4.64it/s]

epoch 0 step 44703 loss tensor(0.4894, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44720/125000 [2:47:45<4:48:04,  4.64it/s]

epoch 0 step 44719 loss tensor(0.5183, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44736/125000 [2:47:48<4:48:29,  4.64it/s]

epoch 0 step 44735 loss tensor(0.5272, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44752/125000 [2:47:51<4:48:10,  4.64it/s]

epoch 0 step 44751 loss tensor(0.6307, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44768/125000 [2:47:55<4:48:41,  4.63it/s]

epoch 0 step 44767 loss tensor(0.4881, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44784/125000 [2:47:58<4:47:46,  4.65it/s]

epoch 0 step 44783 loss tensor(0.6424, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44800/125000 [2:48:01<4:48:03,  4.64it/s]

epoch 0 step 44799 loss tensor(0.5490, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44816/125000 [2:48:05<4:48:14,  4.64it/s]

epoch 0 step 44815 loss tensor(0.5709, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44832/125000 [2:48:08<4:48:12,  4.64it/s]

epoch 0 step 44831 loss tensor(0.6138, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44848/125000 [2:48:11<4:48:23,  4.63it/s]

epoch 0 step 44847 loss tensor(0.5782, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44864/125000 [2:48:15<4:47:46,  4.64it/s]

epoch 0 step 44863 loss tensor(0.5437, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44880/125000 [2:48:18<4:47:56,  4.64it/s]

epoch 0 step 44879 loss tensor(0.6820, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44896/125000 [2:48:21<4:48:27,  4.63it/s]

epoch 0 step 44895 loss tensor(0.6071, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44912/125000 [2:48:25<4:47:04,  4.65it/s]

epoch 0 step 44911 loss tensor(0.5559, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44928/125000 [2:48:28<4:47:06,  4.65it/s]

epoch 0 step 44927 loss tensor(0.5468, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44944/125000 [2:48:31<4:47:49,  4.64it/s]

epoch 0 step 44943 loss tensor(0.6499, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44960/125000 [2:48:35<4:47:21,  4.64it/s]

epoch 0 step 44959 loss tensor(0.7843, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44976/125000 [2:48:38<4:47:35,  4.64it/s]

epoch 0 step 44975 loss tensor(0.5892, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 44992/125000 [2:48:41<4:46:44,  4.65it/s]

epoch 0 step 44991 loss tensor(0.5161, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45008/125000 [2:48:45<4:47:07,  4.64it/s]

epoch 0 step 45007 loss tensor(0.6196, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45024/125000 [2:48:48<4:47:02,  4.64it/s]

epoch 0 step 45023 loss tensor(0.5759, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45040/125000 [2:48:51<4:46:46,  4.65it/s]

epoch 0 step 45039 loss tensor(0.5706, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45056/125000 [2:48:55<4:47:08,  4.64it/s]

epoch 0 step 45055 loss tensor(0.6210, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45072/125000 [2:48:58<4:46:51,  4.64it/s]

epoch 0 step 45071 loss tensor(0.6778, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45088/125000 [2:49:01<4:47:24,  4.63it/s]

epoch 0 step 45087 loss tensor(0.5925, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45104/125000 [2:49:05<4:46:37,  4.65it/s]

epoch 0 step 45103 loss tensor(0.6760, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45120/125000 [2:49:08<4:47:25,  4.63it/s]

epoch 0 step 45119 loss tensor(0.5708, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45136/125000 [2:49:12<4:46:30,  4.65it/s]

epoch 0 step 45135 loss tensor(0.5413, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45152/125000 [2:49:15<4:46:49,  4.64it/s]

epoch 0 step 45151 loss tensor(0.5045, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45168/125000 [2:49:18<4:47:05,  4.63it/s]

epoch 0 step 45167 loss tensor(0.6633, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45184/125000 [2:49:22<4:46:22,  4.65it/s]

epoch 0 step 45183 loss tensor(0.5611, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45200/125000 [2:49:25<4:46:30,  4.64it/s]

epoch 0 step 45199 loss tensor(0.5824, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45216/125000 [2:49:28<4:47:04,  4.63it/s]

epoch 0 step 45215 loss tensor(0.5883, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45232/125000 [2:49:32<4:46:13,  4.64it/s]

epoch 0 step 45231 loss tensor(0.5459, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45248/125000 [2:49:35<4:45:58,  4.65it/s]

epoch 0 step 45247 loss tensor(0.6140, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45264/125000 [2:49:38<4:46:24,  4.64it/s]

epoch 0 step 45263 loss tensor(0.5956, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45280/125000 [2:49:42<4:46:28,  4.64it/s]

epoch 0 step 45279 loss tensor(0.6726, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45296/125000 [2:49:45<4:46:25,  4.64it/s]

epoch 0 step 45295 loss tensor(0.6215, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 45312/125000 [2:49:48<4:46:14,  4.64it/s]

epoch 0 step 45311 loss tensor(0.5642, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45328/125000 [2:49:52<4:46:14,  4.64it/s]

epoch 0 step 45327 loss tensor(0.6064, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45344/125000 [2:49:55<4:46:21,  4.64it/s]

epoch 0 step 45343 loss tensor(0.6609, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45360/125000 [2:49:58<4:45:58,  4.64it/s]

epoch 0 step 45359 loss tensor(0.5585, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45376/125000 [2:50:02<4:46:39,  4.63it/s]

epoch 0 step 45375 loss tensor(0.6598, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45392/125000 [2:50:05<4:45:24,  4.65it/s]

epoch 0 step 45391 loss tensor(0.5569, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45408/125000 [2:50:08<4:46:10,  4.64it/s]

epoch 0 step 45407 loss tensor(0.5035, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45424/125000 [2:50:12<4:45:28,  4.65it/s]

epoch 0 step 45423 loss tensor(0.5053, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45440/125000 [2:50:15<4:45:56,  4.64it/s]

epoch 0 step 45439 loss tensor(0.5981, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45456/125000 [2:50:18<4:45:35,  4.64it/s]

epoch 0 step 45455 loss tensor(0.4810, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45472/125000 [2:50:22<4:46:19,  4.63it/s]

epoch 0 step 45471 loss tensor(0.5038, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45488/125000 [2:50:25<4:45:07,  4.65it/s]

epoch 0 step 45487 loss tensor(0.5968, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45504/125000 [2:50:29<4:44:55,  4.65it/s]

epoch 0 step 45503 loss tensor(0.5392, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45520/125000 [2:50:32<4:45:58,  4.63it/s]

epoch 0 step 45519 loss tensor(0.6699, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45536/125000 [2:50:35<4:45:28,  4.64it/s]

epoch 0 step 45535 loss tensor(0.5788, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45552/125000 [2:50:39<4:45:11,  4.64it/s]

epoch 0 step 45551 loss tensor(0.5927, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45568/125000 [2:50:42<4:44:51,  4.65it/s]

epoch 0 step 45567 loss tensor(0.4965, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45584/125000 [2:50:45<4:45:08,  4.64it/s]

epoch 0 step 45583 loss tensor(0.5562, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45600/125000 [2:50:49<4:44:59,  4.64it/s]

epoch 0 step 45599 loss tensor(0.6390, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▋      | 45616/125000 [2:50:52<4:44:52,  4.64it/s]

epoch 0 step 45615 loss tensor(0.6077, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45632/125000 [2:50:55<4:44:43,  4.65it/s]

epoch 0 step 45631 loss tensor(0.5641, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45648/125000 [2:50:59<4:44:54,  4.64it/s]

epoch 0 step 45647 loss tensor(0.6269, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45664/125000 [2:51:02<4:45:29,  4.63it/s]

epoch 0 step 45663 loss tensor(0.5655, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45680/125000 [2:51:05<4:44:50,  4.64it/s]

epoch 0 step 45679 loss tensor(0.6596, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45696/125000 [2:51:09<4:44:56,  4.64it/s]

epoch 0 step 45695 loss tensor(0.6026, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45712/125000 [2:51:12<4:45:08,  4.63it/s]

epoch 0 step 45711 loss tensor(0.4997, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45728/125000 [2:51:15<4:44:42,  4.64it/s]

epoch 0 step 45727 loss tensor(0.5798, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45744/125000 [2:51:19<4:44:42,  4.64it/s]

epoch 0 step 45743 loss tensor(0.4923, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45760/125000 [2:51:22<4:44:47,  4.64it/s]

epoch 0 step 45759 loss tensor(0.7247, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45776/125000 [2:51:25<4:44:06,  4.65it/s]

epoch 0 step 45775 loss tensor(0.5587, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45792/125000 [2:51:29<4:44:02,  4.65it/s]

epoch 0 step 45791 loss tensor(0.7094, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45808/125000 [2:51:32<4:44:57,  4.63it/s]

epoch 0 step 45807 loss tensor(0.5761, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45824/125000 [2:51:35<4:44:04,  4.65it/s]

epoch 0 step 45823 loss tensor(0.4890, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45840/125000 [2:51:39<4:44:20,  4.64it/s]

epoch 0 step 45839 loss tensor(0.5370, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45856/125000 [2:51:42<4:44:01,  4.64it/s]

epoch 0 step 45855 loss tensor(0.5671, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45872/125000 [2:51:45<4:44:34,  4.63it/s]

epoch 0 step 45871 loss tensor(0.5971, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45888/125000 [2:51:49<4:44:34,  4.63it/s]

epoch 0 step 45887 loss tensor(0.7085, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45904/125000 [2:51:52<4:44:32,  4.63it/s]

epoch 0 step 45903 loss tensor(0.5690, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45920/125000 [2:51:56<4:44:15,  4.64it/s]

epoch 0 step 45919 loss tensor(0.6377, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45936/125000 [2:51:59<4:43:49,  4.64it/s]

epoch 0 step 45935 loss tensor(0.5392, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45952/125000 [2:52:02<4:43:44,  4.64it/s]

epoch 0 step 45951 loss tensor(0.6345, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45968/125000 [2:52:06<4:43:42,  4.64it/s]

epoch 0 step 45967 loss tensor(0.7209, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 45984/125000 [2:52:09<4:44:20,  4.63it/s]

epoch 0 step 45983 loss tensor(0.4560, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46000/125000 [2:52:12<4:44:15,  4.63it/s]

epoch 0 step 45999 loss tensor(0.5403, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46016/125000 [2:52:16<4:43:42,  4.64it/s]

epoch 0 step 46015 loss tensor(0.6058, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46032/125000 [2:52:19<4:43:45,  4.64it/s]

epoch 0 step 46031 loss tensor(0.5892, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46048/125000 [2:52:22<4:43:29,  4.64it/s]

epoch 0 step 46047 loss tensor(0.6392, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46064/125000 [2:52:26<4:43:38,  4.64it/s]

epoch 0 step 46063 loss tensor(0.5987, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46080/125000 [2:52:29<4:43:28,  4.64it/s]

epoch 0 step 46079 loss tensor(0.5288, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46096/125000 [2:52:32<4:42:53,  4.65it/s]

epoch 0 step 46095 loss tensor(0.6015, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46112/125000 [2:52:36<4:43:40,  4.63it/s]

epoch 0 step 46111 loss tensor(0.6323, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46128/125000 [2:52:39<4:43:23,  4.64it/s]

epoch 0 step 46127 loss tensor(0.4810, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46144/125000 [2:52:42<4:44:26,  4.62it/s]

epoch 0 step 46143 loss tensor(0.6476, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46160/125000 [2:52:46<4:43:48,  4.63it/s]

epoch 0 step 46159 loss tensor(0.5133, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46176/125000 [2:52:49<4:43:06,  4.64it/s]

epoch 0 step 46175 loss tensor(0.6037, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46192/125000 [2:52:52<4:43:22,  4.64it/s]

epoch 0 step 46191 loss tensor(0.4545, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46208/125000 [2:52:56<4:42:36,  4.65it/s]

epoch 0 step 46207 loss tensor(0.6267, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46224/125000 [2:52:59<4:42:55,  4.64it/s]

epoch 0 step 46223 loss tensor(0.6020, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46240/125000 [2:53:02<4:43:13,  4.63it/s]

epoch 0 step 46239 loss tensor(0.6582, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46256/125000 [2:53:06<4:42:53,  4.64it/s]

epoch 0 step 46255 loss tensor(0.6080, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46272/125000 [2:53:09<4:42:55,  4.64it/s]

epoch 0 step 46271 loss tensor(0.5851, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46288/125000 [2:53:12<4:42:35,  4.64it/s]

epoch 0 step 46287 loss tensor(0.5998, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46304/125000 [2:53:16<4:42:29,  4.64it/s]

epoch 0 step 46303 loss tensor(0.5910, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46320/125000 [2:53:19<4:42:04,  4.65it/s]

epoch 0 step 46319 loss tensor(0.5984, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46336/125000 [2:53:23<4:43:19,  4.63it/s]

epoch 0 step 46335 loss tensor(0.5490, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46352/125000 [2:53:26<4:42:14,  4.64it/s]

epoch 0 step 46351 loss tensor(0.5754, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46368/125000 [2:53:29<4:42:37,  4.64it/s]

epoch 0 step 46367 loss tensor(0.7187, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46384/125000 [2:53:33<4:42:18,  4.64it/s]

epoch 0 step 46383 loss tensor(0.7603, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46400/125000 [2:53:36<4:42:06,  4.64it/s]

epoch 0 step 46399 loss tensor(0.6535, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46416/125000 [2:53:39<4:42:50,  4.63it/s]

epoch 0 step 46415 loss tensor(0.5703, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46432/125000 [2:53:43<4:42:44,  4.63it/s]

epoch 0 step 46431 loss tensor(0.5450, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46448/125000 [2:53:46<4:41:54,  4.64it/s]

epoch 0 step 46447 loss tensor(0.5441, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46464/125000 [2:53:49<4:41:44,  4.65it/s]

epoch 0 step 46463 loss tensor(0.5552, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46480/125000 [2:53:53<4:42:07,  4.64it/s]

epoch 0 step 46479 loss tensor(0.6616, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46496/125000 [2:53:56<4:42:18,  4.63it/s]

epoch 0 step 46495 loss tensor(0.7135, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46512/125000 [2:53:59<4:42:04,  4.64it/s]

epoch 0 step 46511 loss tensor(0.5772, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 46517/125000 [2:54:00<4:37:26,  4.71it/s]