## Library

In [1]:
import timm
import torch
import torchvision
import torch.utils.data as data
import torchvision.transforms as transforms
from torch.optim import AdamW, SGD
from torch import nn
from tqdm import tqdm, tqdm_notebook
from torch.optim.lr_scheduler import CosineAnnealingLR

from vit_pooling import ViTPooling
from patch_aug import NegativePatchShuffle, NegativePatchRotate, PositivePatchShuffle, PositivePatchRotate


gpu_ids = []
device_names = []
if torch.cuda.is_available():
    for gpu_id in range(torch.cuda.device_count()):
        gpu_ids += [gpu_id]
        device_names += [torch.cuda.get_device_name(gpu_id)]
print(gpu_ids)
print(device_names)

if len(gpu_ids) > 1:
    gpu = 'cuda:' + str(gpu_ids[0])  # GPU Number
else:
    gpu = "cuda" if torch.cuda.is_available() else "cpu"

[0, 1, 2, 3]
['NVIDIA GeForce RTX 3090', 'NVIDIA GeForce RTX 3090', 'NVIDIA GeForce RTX 3090', 'NVIDIA GeForce RTX 3090']


## Hyper parameter

In [2]:
device = gpu
BATCH_SIZE = 64
NUM_EPOCHS = 8
NUM_WORKERS = 2
LEARNING_RATE = 1.25e-03
pre_model_path = './save/ViT/timm/ViT_timm_vit_base_patch16_224_in21k.pt'
fine_model_path = f'./save/ViT_timm_vit_base_patch16_224_in21k_augPositive_i2012_ep{NUM_EPOCHS}_lr{LEARNING_RATE}.pt'
dynamic_model_path = f'./save/ViT_timm_vit_base_patch16_224_in21k_augPositive_i2012_ep'

NUM_CLASSES = 1000

## Dataset

In [3]:
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224),
    PositivePatchShuffle(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])
transform_test = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])
# pre_train_set = torchvision.datasets.ImageFolder('./data/ImageNet-21k', transform=transform_train)
# pre_train_loader = data.DataLoader(pre_train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
train_set = torchvision.datasets.ImageFolder('../../YJ/ILSVRC2012/train', transform=transform_train)
train_loader = data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
test_set = torchvision.datasets.ImageFolder('../../YJ/ILSVRC2012/val', transform=transform_test)
test_loader = data.DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

## Fine-tuning Class

In [4]:
class FineTunner(object):
    def __init__(self):
        self.model = None
        self.optimizer = None
        self.epochs = [0]
        self.losses = [0]

    def process(self, load=False):
        self.build_model(load)
        self.finetune_model()
        self.save_model()

    def build_model(self, load):
        self.model = timm.create_model('vit_base_patch16_224_in21k', pretrained=True).to(device)
        self.model.num_classes = NUM_CLASSES
        print(f'Parameter: {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}')
        print(f'Classes: {self.model.num_classes}')
        self.optimizer = SGD(self.model.parameters(), lr=0)
        if load:
            checkpoint = torch.load(pre_model_path)
            self.epochs = checkpoint['epochs']
            self.model.load_state_dict(checkpoint['model'])
            self.losses = checkpoint['losses']
            print(f'Parameter: {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}')
            print(f'Classes: {self.model.num_classes}')
            print(f'Epoch: {self.epochs[-1]}')
            print(f'****** Reset epochs and losses ******')
            self.epochs = []
            self.losses = []

    def finetune_model(self):
        model = self.model
        criterion = nn.CrossEntropyLoss()
        optimizer = SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9)
        scheduler = CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS)

        for epoch in range(NUM_EPOCHS):
            running_loss = 0.0
            saving_loss = 0.0
            for i, data in tqdm_notebook(enumerate(train_loader, 0), total=len(train_loader)):
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                optimizer.zero_grad()

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                saving_loss += loss.item()
                if i % 100 == 99:
                    print(f'[Epoch {epoch}, Batch {i + 1:5d}] loss: {running_loss / 100:.3f}')
                    running_loss = 0.0
                if i % 1000 == 999:
                    self.epochs.append(epoch + 1)
                    self.model = model
                    self.optimizer = optimizer
                    self.losses.append(saving_loss/1000)
                    self.save_model()
                    saving_loss = 0.0
            scheduler.step()
        print('****** Finished Fine-tuning ******')
        self.model = model

    def save_model(self):
        checkpoint = {
            'epochs': self.epochs,
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'losses': self.losses,
        }
        torch.save(checkpoint, fine_model_path)
        # torch.save(checkpoint, dynamic_model_path+str(self.epochs[-1])+f'_lr{LEARNING_RATE}.pt')
        print(f"****** Model checkpoint saved at epochs {self.epochs[-1]} ******")

In [5]:
if __name__ == '__main__':
    trainer = FineTunner()
    trainer.process(load=True)

Parameter: 102595923
Classes: 1000
Parameter: 102595923
Classes: 1000
Epoch: 0
****** Reset epochs and losses ******


  0%|          | 100/20019 [00:37<2:07:43,  2.60it/s]

[Epoch 0, Batch   100] loss: 7.960


  1%|          | 200/20019 [01:13<1:54:47,  2.88it/s]

[Epoch 0, Batch   200] loss: 7.261


  1%|▏         | 300/20019 [01:50<2:01:22,  2.71it/s]

[Epoch 0, Batch   300] loss: 7.038


  2%|▏         | 400/20019 [02:26<1:56:16,  2.81it/s]

[Epoch 0, Batch   400] loss: 6.774


  2%|▏         | 500/20019 [03:01<1:56:11,  2.80it/s]

[Epoch 0, Batch   500] loss: 6.475


  3%|▎         | 600/20019 [03:37<1:53:05,  2.86it/s]

[Epoch 0, Batch   600] loss: 5.795


  3%|▎         | 700/20019 [04:13<1:52:05,  2.87it/s]

[Epoch 0, Batch   700] loss: 4.738


  4%|▍         | 800/20019 [04:50<1:52:41,  2.84it/s]

[Epoch 0, Batch   800] loss: 3.508


  4%|▍         | 900/20019 [05:25<1:54:04,  2.79it/s]

[Epoch 0, Batch   900] loss: 2.864


  5%|▍         | 999/20019 [06:00<1:51:02,  2.85it/s]

[Epoch 0, Batch  1000] loss: 2.505


  5%|▍         | 1000/20019 [06:03<5:03:10,  1.05it/s]

****** Model checkpoint saved at epochs 1 ******


  5%|▌         | 1100/20019 [06:43<2:05:33,  2.51it/s]

[Epoch 0, Batch  1100] loss: 2.262


  6%|▌         | 1200/20019 [07:21<1:58:07,  2.66it/s]

[Epoch 0, Batch  1200] loss: 2.191


  6%|▋         | 1300/20019 [07:59<1:50:39,  2.82it/s]

[Epoch 0, Batch  1300] loss: 2.147


  7%|▋         | 1400/20019 [08:35<1:57:15,  2.65it/s]

[Epoch 0, Batch  1400] loss: 2.095


  7%|▋         | 1500/20019 [09:12<1:51:30,  2.77it/s]

[Epoch 0, Batch  1500] loss: 2.055


  8%|▊         | 1600/20019 [09:49<1:53:33,  2.70it/s]

[Epoch 0, Batch  1600] loss: 1.976


  8%|▊         | 1700/20019 [10:27<1:54:54,  2.66it/s]

[Epoch 0, Batch  1700] loss: 1.956


  9%|▉         | 1800/20019 [11:03<2:01:17,  2.50it/s]

[Epoch 0, Batch  1800] loss: 1.919


  9%|▉         | 1900/20019 [11:40<1:46:01,  2.85it/s]

[Epoch 0, Batch  1900] loss: 1.857


 10%|▉         | 1999/20019 [12:17<1:46:46,  2.81it/s]

[Epoch 0, Batch  2000] loss: 1.920


 10%|▉         | 2000/20019 [12:20<5:38:16,  1.13s/it]

****** Model checkpoint saved at epochs 1 ******


 10%|█         | 2100/20019 [13:01<1:52:33,  2.65it/s]

[Epoch 0, Batch  2100] loss: 1.886


 11%|█         | 2200/20019 [13:38<1:46:34,  2.79it/s]

[Epoch 0, Batch  2200] loss: 1.855


 11%|█▏        | 2300/20019 [14:15<1:52:39,  2.62it/s]

[Epoch 0, Batch  2300] loss: 1.805


 12%|█▏        | 2400/20019 [14:53<1:45:36,  2.78it/s]

[Epoch 0, Batch  2400] loss: 1.789


 12%|█▏        | 2500/20019 [15:31<1:48:10,  2.70it/s]

[Epoch 0, Batch  2500] loss: 1.786


 13%|█▎        | 2600/20019 [16:09<1:43:08,  2.81it/s]

[Epoch 0, Batch  2600] loss: 1.805


 13%|█▎        | 2700/20019 [16:46<1:45:26,  2.74it/s]

[Epoch 0, Batch  2700] loss: 1.730


 14%|█▍        | 2800/20019 [17:24<1:57:30,  2.44it/s]

[Epoch 0, Batch  2800] loss: 1.759


 14%|█▍        | 2900/20019 [18:01<1:55:52,  2.46it/s]

[Epoch 0, Batch  2900] loss: 1.733


 15%|█▍        | 2999/20019 [18:37<1:42:42,  2.76it/s]

[Epoch 0, Batch  3000] loss: 1.710


 15%|█▍        | 3000/20019 [18:40<4:58:46,  1.05s/it]

****** Model checkpoint saved at epochs 1 ******


 15%|█▌        | 3100/20019 [19:22<1:46:47,  2.64it/s]

[Epoch 0, Batch  3100] loss: 1.738


 16%|█▌        | 3200/20019 [20:00<1:40:02,  2.80it/s]

[Epoch 0, Batch  3200] loss: 1.718


 16%|█▋        | 3300/20019 [20:39<1:44:48,  2.66it/s]

[Epoch 0, Batch  3300] loss: 1.696


 17%|█▋        | 3400/20019 [21:17<1:48:50,  2.54it/s]

[Epoch 0, Batch  3400] loss: 1.689


 17%|█▋        | 3500/20019 [21:56<1:48:13,  2.54it/s]

[Epoch 0, Batch  3500] loss: 1.705


 18%|█▊        | 3600/20019 [22:34<1:38:40,  2.77it/s]

[Epoch 0, Batch  3600] loss: 1.698


 18%|█▊        | 3700/20019 [23:12<1:38:33,  2.76it/s]

[Epoch 0, Batch  3700] loss: 1.699


 19%|█▉        | 3800/20019 [23:50<1:51:49,  2.42it/s]

[Epoch 0, Batch  3800] loss: 1.709


 19%|█▉        | 3900/20019 [24:29<1:39:49,  2.69it/s]

[Epoch 0, Batch  3900] loss: 1.660


 20%|█▉        | 3999/20019 [25:08<1:35:07,  2.81it/s]

[Epoch 0, Batch  4000] loss: 1.625


 20%|█▉        | 4000/20019 [25:11<4:38:51,  1.04s/it]

****** Model checkpoint saved at epochs 1 ******


 20%|██        | 4100/20019 [25:54<1:55:53,  2.29it/s]

[Epoch 0, Batch  4100] loss: 1.676


 21%|██        | 4200/20019 [26:32<1:37:23,  2.71it/s]

[Epoch 0, Batch  4200] loss: 1.615


 21%|██▏       | 4300/20019 [27:11<1:41:05,  2.59it/s]

[Epoch 0, Batch  4300] loss: 1.569


 22%|██▏       | 4400/20019 [27:49<1:47:02,  2.43it/s]

[Epoch 0, Batch  4400] loss: 1.595


 22%|██▏       | 4500/20019 [28:29<1:34:05,  2.75it/s]

[Epoch 0, Batch  4500] loss: 1.619


 23%|██▎       | 4600/20019 [29:08<1:32:09,  2.79it/s]

[Epoch 0, Batch  4600] loss: 1.653


 23%|██▎       | 4700/20019 [29:47<1:41:23,  2.52it/s]

[Epoch 0, Batch  4700] loss: 1.610


 24%|██▍       | 4800/20019 [30:26<1:34:16,  2.69it/s]

[Epoch 0, Batch  4800] loss: 1.619


 24%|██▍       | 4900/20019 [31:05<1:33:38,  2.69it/s]

[Epoch 0, Batch  4900] loss: 1.590


 25%|██▍       | 4999/20019 [31:44<1:37:15,  2.57it/s]

[Epoch 0, Batch  5000] loss: 1.616


 25%|██▍       | 5000/20019 [31:47<4:24:58,  1.06s/it]

****** Model checkpoint saved at epochs 1 ******


 25%|██▌       | 5100/20019 [32:31<1:46:54,  2.33it/s]

[Epoch 0, Batch  5100] loss: 1.547


 26%|██▌       | 5200/20019 [33:10<1:39:39,  2.48it/s]

[Epoch 0, Batch  5200] loss: 1.534


 26%|██▋       | 5300/20019 [33:50<1:50:43,  2.22it/s]

[Epoch 0, Batch  5300] loss: 1.549


 27%|██▋       | 5400/20019 [34:29<1:30:37,  2.69it/s]

[Epoch 0, Batch  5400] loss: 1.541


 27%|██▋       | 5500/20019 [35:09<1:27:27,  2.77it/s]

[Epoch 0, Batch  5500] loss: 1.619


 28%|██▊       | 5600/20019 [35:49<1:35:48,  2.51it/s]

[Epoch 0, Batch  5600] loss: 1.531


 28%|██▊       | 5700/20019 [36:30<1:43:55,  2.30it/s]

[Epoch 0, Batch  5700] loss: 1.614


 29%|██▉       | 5800/20019 [37:09<1:31:02,  2.60it/s]

[Epoch 0, Batch  5800] loss: 1.503


 29%|██▉       | 5900/20019 [37:50<1:34:25,  2.49it/s]

[Epoch 0, Batch  5900] loss: 1.526


 30%|██▉       | 5999/20019 [38:30<1:28:48,  2.63it/s]

[Epoch 0, Batch  6000] loss: 1.529


 30%|██▉       | 6000/20019 [38:32<4:06:22,  1.05s/it]

****** Model checkpoint saved at epochs 1 ******


 30%|███       | 6100/20019 [39:18<1:42:25,  2.26it/s]

[Epoch 0, Batch  6100] loss: 1.535


 31%|███       | 6200/20019 [40:00<1:42:13,  2.25it/s]

[Epoch 0, Batch  6200] loss: 1.533


 31%|███▏      | 6300/20019 [40:40<1:27:35,  2.61it/s]

[Epoch 0, Batch  6300] loss: 1.512


 32%|███▏      | 6400/20019 [41:21<1:31:26,  2.48it/s]

[Epoch 0, Batch  6400] loss: 1.540


 32%|███▏      | 6500/20019 [42:03<1:36:19,  2.34it/s]

[Epoch 0, Batch  6500] loss: 1.521


 33%|███▎      | 6600/20019 [42:44<1:24:57,  2.63it/s]

[Epoch 0, Batch  6600] loss: 1.507


 33%|███▎      | 6700/20019 [43:25<1:32:49,  2.39it/s]

[Epoch 0, Batch  6700] loss: 1.511


 34%|███▍      | 6800/20019 [44:07<1:27:12,  2.53it/s]

[Epoch 0, Batch  6800] loss: 1.474


 34%|███▍      | 6900/20019 [44:48<1:22:27,  2.65it/s]

[Epoch 0, Batch  6900] loss: 1.556


 35%|███▍      | 6999/20019 [45:29<1:22:22,  2.63it/s]

[Epoch 0, Batch  7000] loss: 1.447


 35%|███▍      | 7000/20019 [45:31<3:54:25,  1.08s/it]

****** Model checkpoint saved at epochs 1 ******


 35%|███▌      | 7100/20019 [46:18<1:29:53,  2.40it/s]

[Epoch 0, Batch  7100] loss: 1.419


 36%|███▌      | 7200/20019 [47:00<1:33:29,  2.29it/s]

[Epoch 0, Batch  7200] loss: 1.482


 36%|███▋      | 7300/20019 [47:42<1:22:13,  2.58it/s]

[Epoch 0, Batch  7300] loss: 1.470


 37%|███▋      | 7400/20019 [48:24<1:28:33,  2.37it/s]

[Epoch 0, Batch  7400] loss: 1.491


 37%|███▋      | 7500/20019 [49:07<1:24:18,  2.47it/s]

[Epoch 0, Batch  7500] loss: 1.445


 38%|███▊      | 7600/20019 [49:49<1:31:08,  2.27it/s]

[Epoch 0, Batch  7600] loss: 1.480


 38%|███▊      | 7700/20019 [50:32<1:28:10,  2.33it/s]

[Epoch 0, Batch  7700] loss: 1.473


 39%|███▉      | 7800/20019 [51:15<1:23:12,  2.45it/s]

[Epoch 0, Batch  7800] loss: 1.455


 39%|███▉      | 7900/20019 [51:56<1:17:51,  2.59it/s]

[Epoch 0, Batch  7900] loss: 1.433


 40%|███▉      | 7999/20019 [52:39<1:25:18,  2.35it/s]

[Epoch 0, Batch  8000] loss: 1.461


 40%|███▉      | 8000/20019 [52:42<3:36:17,  1.08s/it]

****** Model checkpoint saved at epochs 1 ******


 40%|████      | 8100/20019 [53:29<1:24:27,  2.35it/s]

[Epoch 0, Batch  8100] loss: 1.431


 41%|████      | 8200/20019 [54:13<1:46:57,  1.84it/s]

[Epoch 0, Batch  8200] loss: 1.476


 41%|████▏     | 8300/20019 [54:57<1:23:57,  2.33it/s]

[Epoch 0, Batch  8300] loss: 1.466


 42%|████▏     | 8400/20019 [55:41<1:23:46,  2.31it/s]

[Epoch 0, Batch  8400] loss: 1.475


 42%|████▏     | 8500/20019 [56:25<1:13:41,  2.61it/s]

[Epoch 0, Batch  8500] loss: 1.376


 43%|████▎     | 8600/20019 [57:09<1:21:29,  2.34it/s]

[Epoch 0, Batch  8600] loss: 1.425


 43%|████▎     | 8700/20019 [57:54<1:31:51,  2.05it/s]

[Epoch 0, Batch  8700] loss: 1.470


 44%|████▍     | 8800/20019 [58:39<1:21:36,  2.29it/s]

[Epoch 0, Batch  8800] loss: 1.425


 44%|████▍     | 8900/20019 [59:24<1:18:22,  2.36it/s]

[Epoch 0, Batch  8900] loss: 1.416


 45%|████▍     | 8999/20019 [1:00:09<1:34:12,  1.95it/s]

[Epoch 0, Batch  9000] loss: 1.444


 45%|████▍     | 9000/20019 [1:00:11<3:29:17,  1.14s/it]

****** Model checkpoint saved at epochs 1 ******


 45%|████▌     | 9100/20019 [1:01:00<1:19:51,  2.28it/s]

[Epoch 0, Batch  9100] loss: 1.415


 46%|████▌     | 9200/20019 [1:01:46<1:26:16,  2.09it/s]

[Epoch 0, Batch  9200] loss: 1.409


 46%|████▋     | 9300/20019 [1:02:32<1:22:14,  2.17it/s]

[Epoch 0, Batch  9300] loss: 1.422


 47%|████▋     | 9400/20019 [1:03:17<1:17:33,  2.28it/s]

[Epoch 0, Batch  9400] loss: 1.383


 47%|████▋     | 9500/20019 [1:04:03<1:20:43,  2.17it/s]

[Epoch 0, Batch  9500] loss: 1.439


 48%|████▊     | 9600/20019 [1:04:48<1:17:35,  2.24it/s]

[Epoch 0, Batch  9600] loss: 1.400


 48%|████▊     | 9700/20019 [1:05:33<1:25:45,  2.01it/s]

[Epoch 0, Batch  9700] loss: 1.446


 49%|████▉     | 9800/20019 [1:06:19<1:11:47,  2.37it/s]

[Epoch 0, Batch  9800] loss: 1.395


 49%|████▉     | 9900/20019 [1:07:06<1:19:56,  2.11it/s]

[Epoch 0, Batch  9900] loss: 1.369


 50%|████▉     | 9999/20019 [1:07:52<1:10:10,  2.38it/s]

[Epoch 0, Batch 10000] loss: 1.399


 50%|████▉     | 10000/20019 [1:07:55<3:06:32,  1.12s/it]

****** Model checkpoint saved at epochs 1 ******


 50%|█████     | 10100/20019 [1:08:46<1:16:23,  2.16it/s]

[Epoch 0, Batch 10100] loss: 1.391


 51%|█████     | 10200/20019 [1:09:34<1:09:26,  2.36it/s]

[Epoch 0, Batch 10200] loss: 1.419


 51%|█████▏    | 10300/20019 [1:10:21<1:16:28,  2.12it/s]

[Epoch 0, Batch 10300] loss: 1.412


 52%|█████▏    | 10400/20019 [1:11:11<1:17:24,  2.07it/s]

[Epoch 0, Batch 10400] loss: 1.414


 52%|█████▏    | 10500/20019 [1:11:58<1:11:34,  2.22it/s]

[Epoch 0, Batch 10500] loss: 1.391


 53%|█████▎    | 10600/20019 [1:12:47<1:14:36,  2.10it/s]

[Epoch 0, Batch 10600] loss: 1.389


 53%|█████▎    | 10700/20019 [1:13:34<1:11:29,  2.17it/s]

[Epoch 0, Batch 10700] loss: 1.414


 54%|█████▍    | 10800/20019 [1:14:22<1:11:37,  2.15it/s]

[Epoch 0, Batch 10800] loss: 1.385


 54%|█████▍    | 10900/20019 [1:15:10<1:13:29,  2.07it/s]

[Epoch 0, Batch 10900] loss: 1.415


 55%|█████▍    | 10999/20019 [1:15:57<1:10:05,  2.14it/s]

[Epoch 0, Batch 11000] loss: 1.396


 55%|█████▍    | 11000/20019 [1:16:00<2:57:48,  1.18s/it]

****** Model checkpoint saved at epochs 1 ******


 55%|█████▌    | 11100/20019 [1:16:52<1:06:07,  2.25it/s]

[Epoch 0, Batch 11100] loss: 1.313


 56%|█████▌    | 11200/20019 [1:17:41<1:11:40,  2.05it/s]

[Epoch 0, Batch 11200] loss: 1.396


 56%|█████▋    | 11300/20019 [1:18:30<1:10:53,  2.05it/s]

[Epoch 0, Batch 11300] loss: 1.334


 57%|█████▋    | 11400/20019 [1:19:18<1:07:30,  2.13it/s]

[Epoch 0, Batch 11400] loss: 1.363


 57%|█████▋    | 11500/20019 [1:20:08<1:07:12,  2.11it/s]

[Epoch 0, Batch 11500] loss: 1.404


 58%|█████▊    | 11600/20019 [1:20:58<1:04:07,  2.19it/s]

[Epoch 0, Batch 11600] loss: 1.384


 58%|█████▊    | 11700/20019 [1:21:47<1:11:28,  1.94it/s]

[Epoch 0, Batch 11700] loss: 1.377


 59%|█████▉    | 11800/20019 [1:22:38<1:11:45,  1.91it/s]

[Epoch 0, Batch 11800] loss: 1.353


 59%|█████▉    | 11900/20019 [1:23:27<1:00:03,  2.25it/s]

[Epoch 0, Batch 11900] loss: 1.345


 60%|█████▉    | 11999/20019 [1:24:18<1:10:43,  1.89it/s]

[Epoch 0, Batch 12000] loss: 1.412


 60%|█████▉    | 12000/20019 [1:24:21<2:34:22,  1.16s/it]

****** Model checkpoint saved at epochs 1 ******


 60%|██████    | 12100/20019 [1:25:16<1:06:15,  1.99it/s]

[Epoch 0, Batch 12100] loss: 1.367


 61%|██████    | 12200/20019 [1:26:08<59:31,  2.19it/s]  

[Epoch 0, Batch 12200] loss: 1.372


 61%|██████▏   | 12300/20019 [1:26:59<1:04:32,  1.99it/s]

[Epoch 0, Batch 12300] loss: 1.362


 62%|██████▏   | 12400/20019 [1:27:52<1:04:18,  1.97it/s]

[Epoch 0, Batch 12400] loss: 1.371


 62%|██████▏   | 12500/20019 [1:28:44<1:07:26,  1.86it/s]

[Epoch 0, Batch 12500] loss: 1.367


 63%|██████▎   | 12600/20019 [1:29:35<1:01:55,  2.00it/s]

[Epoch 0, Batch 12600] loss: 1.341


 63%|██████▎   | 12700/20019 [1:30:28<1:14:20,  1.64it/s]

[Epoch 0, Batch 12700] loss: 1.340


 64%|██████▍   | 12800/20019 [1:31:21<1:07:11,  1.79it/s]

[Epoch 0, Batch 12800] loss: 1.374


 64%|██████▍   | 12900/20019 [1:32:14<1:05:06,  1.82it/s]

[Epoch 0, Batch 12900] loss: 1.373


 65%|██████▍   | 12999/20019 [1:33:06<1:01:38,  1.90it/s]

[Epoch 0, Batch 13000] loss: 1.405


 65%|██████▍   | 13000/20019 [1:33:10<2:43:57,  1.40s/it]

****** Model checkpoint saved at epochs 1 ******


 65%|██████▌   | 13100/20019 [1:34:07<55:46,  2.07it/s]  

[Epoch 0, Batch 13100] loss: 1.333


 66%|██████▌   | 13200/20019 [1:35:01<1:00:47,  1.87it/s]

[Epoch 0, Batch 13200] loss: 1.351


 66%|██████▋   | 13300/20019 [1:35:55<55:36,  2.01it/s]  

[Epoch 0, Batch 13300] loss: 1.364


 67%|██████▋   | 13400/20019 [1:36:51<1:01:31,  1.79it/s]

[Epoch 0, Batch 13400] loss: 1.308


 67%|██████▋   | 13500/20019 [1:37:46<57:22,  1.89it/s]  

[Epoch 0, Batch 13500] loss: 1.375


 68%|██████▊   | 13600/20019 [1:38:41<56:07,  1.91it/s]  

[Epoch 0, Batch 13600] loss: 1.351


 68%|██████▊   | 13700/20019 [1:39:37<1:05:38,  1.60it/s]

[Epoch 0, Batch 13700] loss: 1.327


 69%|██████▉   | 13800/20019 [1:40:32<1:02:11,  1.67it/s]

[Epoch 0, Batch 13800] loss: 1.323


 69%|██████▉   | 13900/20019 [1:41:27<52:09,  1.96it/s]  

[Epoch 0, Batch 13900] loss: 1.340


 70%|██████▉   | 13999/20019 [1:42:23<57:36,  1.74it/s]  

[Epoch 0, Batch 14000] loss: 1.335


 70%|██████▉   | 14000/20019 [1:42:26<1:58:09,  1.18s/it]

****** Model checkpoint saved at epochs 1 ******


 70%|███████   | 14100/20019 [1:43:26<55:47,  1.77it/s]  

[Epoch 0, Batch 14100] loss: 1.291


 71%|███████   | 14200/20019 [1:44:22<55:30,  1.75it/s]  

[Epoch 0, Batch 14200] loss: 1.362


 71%|███████▏  | 14300/20019 [1:45:18<56:45,  1.68it/s]  

[Epoch 0, Batch 14300] loss: 1.332


 72%|███████▏  | 14400/20019 [1:46:15<56:26,  1.66it/s]  

[Epoch 0, Batch 14400] loss: 1.333


 72%|███████▏  | 14500/20019 [1:47:12<48:28,  1.90it/s]  

[Epoch 0, Batch 14500] loss: 1.368


 73%|███████▎  | 14600/20019 [1:48:09<52:05,  1.73it/s]  

[Epoch 0, Batch 14600] loss: 1.306


 73%|███████▎  | 14700/20019 [1:49:06<44:00,  2.01it/s]  

[Epoch 0, Batch 14700] loss: 1.359


 74%|███████▍  | 14800/20019 [1:50:03<44:59,  1.93it/s]  

[Epoch 0, Batch 14800] loss: 1.364


 74%|███████▍  | 14900/20019 [1:51:01<47:49,  1.78it/s]  

[Epoch 0, Batch 14900] loss: 1.319


 75%|███████▍  | 14999/20019 [1:51:58<47:15,  1.77it/s]  

[Epoch 0, Batch 15000] loss: 1.296


 75%|███████▍  | 15000/20019 [1:52:01<1:48:31,  1.30s/it]

****** Model checkpoint saved at epochs 1 ******


 75%|███████▌  | 15100/20019 [1:53:02<45:16,  1.81it/s]  

[Epoch 0, Batch 15100] loss: 1.308


 76%|███████▌  | 15200/20019 [1:53:59<50:04,  1.60it/s]

[Epoch 0, Batch 15200] loss: 1.337


 76%|███████▋  | 15300/20019 [1:54:57<41:52,  1.88it/s]

[Epoch 0, Batch 15300] loss: 1.346


 77%|███████▋  | 15400/20019 [1:55:55<46:08,  1.67it/s]

[Epoch 0, Batch 15400] loss: 1.300


 77%|███████▋  | 15500/20019 [1:56:53<45:25,  1.66it/s]

[Epoch 0, Batch 15500] loss: 1.333


 78%|███████▊  | 15600/20019 [1:57:53<39:09,  1.88it/s]  

[Epoch 0, Batch 15600] loss: 1.323


 78%|███████▊  | 15700/20019 [1:58:53<41:58,  1.71it/s]

[Epoch 0, Batch 15700] loss: 1.282


 79%|███████▉  | 15800/20019 [1:59:52<36:46,  1.91it/s]

[Epoch 0, Batch 15800] loss: 1.345


 79%|███████▉  | 15900/20019 [2:00:50<43:00,  1.60it/s]

[Epoch 0, Batch 15900] loss: 1.348


 80%|███████▉  | 15999/20019 [2:01:51<43:45,  1.53it/s]  

[Epoch 0, Batch 16000] loss: 1.287


 80%|███████▉  | 16000/20019 [2:01:54<1:33:31,  1.40s/it]

****** Model checkpoint saved at epochs 1 ******


 80%|████████  | 16100/20019 [2:02:57<35:26,  1.84it/s]  

[Epoch 0, Batch 16100] loss: 1.320


 81%|████████  | 16200/20019 [2:03:57<33:36,  1.89it/s]

[Epoch 0, Batch 16200] loss: 1.336


 81%|████████▏ | 16300/20019 [2:04:59<36:44,  1.69it/s]  

[Epoch 0, Batch 16300] loss: 1.262


 82%|████████▏ | 16400/20019 [2:06:01<40:25,  1.49it/s]

[Epoch 0, Batch 16400] loss: 1.362


 82%|████████▏ | 16500/20019 [2:07:02<40:37,  1.44it/s]

[Epoch 0, Batch 16500] loss: 1.349


 83%|████████▎ | 16600/20019 [2:08:04<40:03,  1.42it/s]

[Epoch 0, Batch 16600] loss: 1.366


 83%|████████▎ | 16700/20019 [2:09:05<39:05,  1.41it/s]

[Epoch 0, Batch 16700] loss: 1.334


 84%|████████▍ | 16800/20019 [2:10:06<31:22,  1.71it/s]

[Epoch 0, Batch 16800] loss: 1.299


 84%|████████▍ | 16900/20019 [2:11:07<29:29,  1.76it/s]

[Epoch 0, Batch 16900] loss: 1.284


 85%|████████▍ | 16999/20019 [2:12:07<27:28,  1.83it/s]

[Epoch 0, Batch 17000] loss: 1.280


 85%|████████▍ | 17000/20019 [2:12:10<1:06:13,  1.32s/it]

****** Model checkpoint saved at epochs 1 ******


 85%|████████▌ | 17100/20019 [2:13:17<27:59,  1.74it/s]  

[Epoch 0, Batch 17100] loss: 1.366


 86%|████████▌ | 17200/20019 [2:14:18<27:02,  1.74it/s]

[Epoch 0, Batch 17200] loss: 1.292


 86%|████████▋ | 17300/20019 [2:15:20<24:25,  1.86it/s]

[Epoch 0, Batch 17300] loss: 1.307


 87%|████████▋ | 17400/20019 [2:16:27<23:49,  1.83it/s]  

[Epoch 0, Batch 17400] loss: 1.232


 87%|████████▋ | 17500/20019 [2:17:30<23:49,  1.76it/s]

[Epoch 0, Batch 17500] loss: 1.277


 88%|████████▊ | 17600/20019 [2:18:36<38:59,  1.03it/s]

[Epoch 0, Batch 17600] loss: 1.334


 88%|████████▊ | 17700/20019 [2:19:40<21:27,  1.80it/s]

[Epoch 0, Batch 17700] loss: 1.251


 89%|████████▉ | 17800/20019 [2:20:44<21:39,  1.71it/s]

[Epoch 0, Batch 17800] loss: 1.276


 89%|████████▉ | 17900/20019 [2:21:47<21:54,  1.61it/s]

[Epoch 0, Batch 17900] loss: 1.283


 90%|████████▉ | 17999/20019 [2:22:50<19:31,  1.72it/s]

[Epoch 0, Batch 18000] loss: 1.306


 90%|████████▉ | 18000/20019 [2:22:54<44:59,  1.34s/it]

****** Model checkpoint saved at epochs 1 ******


 90%|█████████ | 18100/20019 [2:24:06<22:08,  1.44it/s]  

[Epoch 0, Batch 18100] loss: 1.261


 91%|█████████ | 18200/20019 [2:25:15<39:42,  1.31s/it]

[Epoch 0, Batch 18200] loss: 1.283


 91%|█████████▏| 18300/20019 [2:26:20<22:04,  1.30it/s]

[Epoch 0, Batch 18300] loss: 1.356


 92%|█████████▏| 18400/20019 [2:27:27<16:51,  1.60it/s]

[Epoch 0, Batch 18400] loss: 1.271


 92%|█████████▏| 18500/20019 [2:28:34<14:59,  1.69it/s]

[Epoch 0, Batch 18500] loss: 1.269


 93%|█████████▎| 18600/20019 [2:29:43<22:22,  1.06it/s]

[Epoch 0, Batch 18600] loss: 1.307


 93%|█████████▎| 18700/20019 [2:30:52<13:28,  1.63it/s]

[Epoch 0, Batch 18700] loss: 1.316


 94%|█████████▍| 18800/20019 [2:32:01<13:44,  1.48it/s]

[Epoch 0, Batch 18800] loss: 1.291


 94%|█████████▍| 18900/20019 [2:33:10<19:21,  1.04s/it]

[Epoch 0, Batch 18900] loss: 1.305


 95%|█████████▍| 18999/20019 [2:34:18<11:59,  1.42it/s]

[Epoch 0, Batch 19000] loss: 1.273


 95%|█████████▍| 19000/20019 [2:34:21<21:27,  1.26s/it]

****** Model checkpoint saved at epochs 1 ******


 95%|█████████▌| 19100/20019 [2:35:32<10:35,  1.45it/s]

[Epoch 0, Batch 19100] loss: 1.229


 96%|█████████▌| 19200/20019 [2:36:43<09:55,  1.38it/s]

[Epoch 0, Batch 19200] loss: 1.245


 96%|█████████▋| 19300/20019 [2:37:54<07:54,  1.52it/s]

[Epoch 0, Batch 19300] loss: 1.263


 97%|█████████▋| 19400/20019 [2:39:03<06:29,  1.59it/s]

[Epoch 0, Batch 19400] loss: 1.268


 97%|█████████▋| 19500/20019 [2:40:14<05:12,  1.66it/s]

[Epoch 0, Batch 19500] loss: 1.307


 98%|█████████▊| 19600/20019 [2:41:24<04:21,  1.61it/s]

[Epoch 0, Batch 19600] loss: 1.292


 98%|█████████▊| 19700/20019 [2:42:32<03:15,  1.63it/s]

[Epoch 0, Batch 19700] loss: 1.261


 99%|█████████▉| 19800/20019 [2:43:42<02:15,  1.61it/s]

[Epoch 0, Batch 19800] loss: 1.270


 99%|█████████▉| 19900/20019 [2:44:52<01:32,  1.28it/s]

[Epoch 0, Batch 19900] loss: 1.307


100%|█████████▉| 19999/20019 [2:46:00<00:15,  1.33it/s]

[Epoch 0, Batch 20000] loss: 1.268


100%|█████████▉| 20000/20019 [2:46:02<00:25,  1.34s/it]

****** Model checkpoint saved at epochs 1 ******


100%|██████████| 20019/20019 [2:46:13<00:00,  2.01it/s]
  0%|          | 100/20019 [00:35<1:55:59,  2.86it/s]

[Epoch 1, Batch   100] loss: 1.190


  1%|          | 200/20019 [01:11<1:55:44,  2.85it/s]

[Epoch 1, Batch   200] loss: 1.200


  1%|▏         | 300/20019 [01:46<1:54:09,  2.88it/s]

[Epoch 1, Batch   300] loss: 1.196


  2%|▏         | 400/20019 [02:21<1:53:36,  2.88it/s]

[Epoch 1, Batch   400] loss: 1.216


  2%|▏         | 500/20019 [02:56<1:54:18,  2.85it/s]

[Epoch 1, Batch   500] loss: 1.189


  3%|▎         | 600/20019 [03:31<1:53:19,  2.86it/s]

[Epoch 1, Batch   600] loss: 1.159


  3%|▎         | 700/20019 [04:06<1:51:48,  2.88it/s]

[Epoch 1, Batch   700] loss: 1.173


  4%|▍         | 800/20019 [04:41<1:51:49,  2.86it/s]

[Epoch 1, Batch   800] loss: 1.191


  4%|▍         | 900/20019 [05:16<1:51:11,  2.87it/s]

[Epoch 1, Batch   900] loss: 1.189


  5%|▍         | 999/20019 [05:51<1:50:30,  2.87it/s]

[Epoch 1, Batch  1000] loss: 1.212


  5%|▍         | 1000/20019 [05:53<4:41:59,  1.12it/s]

****** Model checkpoint saved at epochs 2 ******


  5%|▌         | 1100/20019 [06:28<1:49:07,  2.89it/s]

[Epoch 1, Batch  1100] loss: 1.174


  6%|▌         | 1200/20019 [07:04<1:49:08,  2.87it/s]

[Epoch 1, Batch  1200] loss: 1.143


  6%|▋         | 1300/20019 [07:39<1:47:55,  2.89it/s]

[Epoch 1, Batch  1300] loss: 1.163


  7%|▋         | 1400/20019 [08:13<1:47:29,  2.89it/s]

[Epoch 1, Batch  1400] loss: 1.185


  7%|▋         | 1500/20019 [08:48<1:47:22,  2.87it/s]

[Epoch 1, Batch  1500] loss: 1.210


  8%|▊         | 1600/20019 [09:33<1:52:10,  2.74it/s] 

[Epoch 1, Batch  1600] loss: 1.174


  8%|▊         | 1700/20019 [10:07<1:45:47,  2.89it/s]

[Epoch 1, Batch  1700] loss: 1.180


  9%|▉         | 1800/20019 [10:42<1:45:05,  2.89it/s]

[Epoch 1, Batch  1800] loss: 1.143


  9%|▉         | 1900/20019 [11:17<1:45:32,  2.86it/s]

[Epoch 1, Batch  1900] loss: 1.148


 10%|▉         | 1999/20019 [11:58<1:55:45,  2.59it/s]

[Epoch 1, Batch  2000] loss: 1.186


 10%|▉         | 2000/20019 [12:00<5:14:39,  1.05s/it]

****** Model checkpoint saved at epochs 2 ******


 10%|█         | 2100/20019 [12:41<1:44:52,  2.85it/s]

[Epoch 1, Batch  2100] loss: 1.151


 11%|█         | 2200/20019 [13:16<1:44:32,  2.84it/s]

[Epoch 1, Batch  2200] loss: 1.139


 11%|█▏        | 2300/20019 [13:51<1:44:53,  2.82it/s]

[Epoch 1, Batch  2300] loss: 1.170


 12%|█▏        | 2400/20019 [14:26<1:42:58,  2.85it/s]

[Epoch 1, Batch  2400] loss: 1.183


 12%|█▏        | 2500/20019 [15:02<1:42:06,  2.86it/s]

[Epoch 1, Batch  2500] loss: 1.204


 13%|█▎        | 2600/20019 [15:46<2:05:56,  2.31it/s] 

[Epoch 1, Batch  2600] loss: 1.169


 13%|█▎        | 2700/20019 [16:24<1:40:32,  2.87it/s]

[Epoch 1, Batch  2700] loss: 1.174


 14%|█▍        | 2800/20019 [17:02<1:39:54,  2.87it/s]

[Epoch 1, Batch  2800] loss: 1.227


 14%|█▍        | 2900/20019 [17:38<1:46:33,  2.68it/s]

[Epoch 1, Batch  2900] loss: 1.188


 15%|█▍        | 2999/20019 [18:13<1:39:00,  2.87it/s]

[Epoch 1, Batch  3000] loss: 1.173


 15%|█▍        | 3000/20019 [18:15<4:30:50,  1.05it/s]

****** Model checkpoint saved at epochs 2 ******


 15%|█▌        | 3100/20019 [18:52<1:38:34,  2.86it/s]

[Epoch 1, Batch  3100] loss: 1.153


 16%|█▌        | 3200/20019 [19:41<1:50:41,  2.53it/s] 

[Epoch 1, Batch  3200] loss: 1.128


 16%|█▋        | 3300/20019 [20:18<1:40:55,  2.76it/s]

[Epoch 1, Batch  3300] loss: 1.180


 17%|█▋        | 3400/20019 [20:58<1:35:48,  2.89it/s]

[Epoch 1, Batch  3400] loss: 1.175


 17%|█▋        | 3500/20019 [21:33<1:36:01,  2.87it/s]

[Epoch 1, Batch  3500] loss: 1.177


 18%|█▊        | 3600/20019 [22:07<1:34:53,  2.88it/s]

[Epoch 1, Batch  3600] loss: 1.175


 18%|█▊        | 3700/20019 [22:50<1:34:06,  2.89it/s] 

[Epoch 1, Batch  3700] loss: 1.169


 19%|█▉        | 3800/20019 [23:27<3:27:31,  1.30it/s]

[Epoch 1, Batch  3800] loss: 1.202


 19%|█▉        | 3900/20019 [24:06<1:36:50,  2.77it/s]

[Epoch 1, Batch  3900] loss: 1.153


 20%|█▉        | 3999/20019 [24:43<1:36:33,  2.77it/s]

[Epoch 1, Batch  4000] loss: 1.157


 20%|█▉        | 4000/20019 [24:46<4:49:48,  1.09s/it]

****** Model checkpoint saved at epochs 2 ******


 20%|██        | 4100/20019 [25:28<1:32:49,  2.86it/s]

[Epoch 1, Batch  4100] loss: 1.175


 21%|██        | 4200/20019 [26:06<1:30:58,  2.90it/s]

[Epoch 1, Batch  4200] loss: 1.167


 21%|██▏       | 4300/20019 [26:45<2:03:27,  2.12it/s]

[Epoch 1, Batch  4300] loss: 1.173


 22%|██▏       | 4400/20019 [27:24<1:33:34,  2.78it/s]

[Epoch 1, Batch  4400] loss: 1.141


 22%|██▏       | 4500/20019 [28:01<1:28:57,  2.91it/s]

[Epoch 1, Batch  4500] loss: 1.157


 23%|██▎       | 4600/20019 [28:37<1:29:14,  2.88it/s]

[Epoch 1, Batch  4600] loss: 1.170


 23%|██▎       | 4700/20019 [29:12<1:29:45,  2.84it/s]

[Epoch 1, Batch  4700] loss: 1.186


 24%|██▍       | 4800/20019 [29:56<1:28:15,  2.87it/s]

[Epoch 1, Batch  4800] loss: 1.165


 24%|██▍       | 4900/20019 [30:34<1:27:23,  2.88it/s]

[Epoch 1, Batch  4900] loss: 1.187


 25%|██▍       | 4999/20019 [31:14<1:34:10,  2.66it/s]

[Epoch 1, Batch  5000] loss: 1.137


 25%|██▍       | 5000/20019 [31:17<4:40:57,  1.12s/it]

****** Model checkpoint saved at epochs 2 ******


 25%|██▌       | 5100/20019 [31:59<1:34:23,  2.63it/s]

[Epoch 1, Batch  5100] loss: 1.171


 26%|██▌       | 5200/20019 [32:38<1:27:26,  2.82it/s]

[Epoch 1, Batch  5200] loss: 1.145


 26%|██▋       | 5300/20019 [33:15<1:26:09,  2.85it/s]

[Epoch 1, Batch  5300] loss: 1.180


 27%|██▋       | 5400/20019 [33:56<1:36:02,  2.54it/s]

[Epoch 1, Batch  5400] loss: 1.168


 27%|██▋       | 5500/20019 [34:36<1:27:40,  2.76it/s]

[Epoch 1, Batch  5500] loss: 1.150


 28%|██▊       | 5600/20019 [35:16<1:24:41,  2.84it/s]

[Epoch 1, Batch  5600] loss: 1.156


 28%|██▊       | 5700/20019 [35:56<2:26:27,  1.63it/s]

[Epoch 1, Batch  5700] loss: 1.175


 29%|██▉       | 5800/20019 [36:35<1:29:18,  2.65it/s]

[Epoch 1, Batch  5800] loss: 1.152


 29%|██▉       | 5900/20019 [37:15<1:22:41,  2.85it/s]

[Epoch 1, Batch  5900] loss: 1.127


 30%|██▉       | 5999/20019 [37:54<1:24:52,  2.75it/s]

[Epoch 1, Batch  6000] loss: 1.166


 30%|██▉       | 6000/20019 [37:56<4:15:28,  1.09s/it]

****** Model checkpoint saved at epochs 2 ******


 30%|███       | 6100/20019 [38:33<1:25:51,  2.70it/s]

[Epoch 1, Batch  6100] loss: 1.172


 31%|███       | 6200/20019 [39:12<1:24:22,  2.73it/s]

[Epoch 1, Batch  6200] loss: 1.195


 31%|███▏      | 6300/20019 [40:01<1:30:20,  2.53it/s] 

[Epoch 1, Batch  6300] loss: 1.171


 32%|███▏      | 6400/20019 [40:41<1:24:11,  2.70it/s]

[Epoch 1, Batch  6400] loss: 1.210


 32%|███▏      | 6500/20019 [41:23<1:18:21,  2.88it/s]

[Epoch 1, Batch  6500] loss: 1.116


 33%|███▎      | 6600/20019 [42:04<1:22:58,  2.70it/s]

[Epoch 1, Batch  6600] loss: 1.156


 33%|███▎      | 6700/20019 [42:45<1:25:54,  2.58it/s]

[Epoch 1, Batch  6700] loss: 1.161


 34%|███▍      | 6800/20019 [43:24<1:21:36,  2.70it/s]

[Epoch 1, Batch  6800] loss: 1.158


 34%|███▍      | 6900/20019 [44:05<1:24:09,  2.60it/s]

[Epoch 1, Batch  6900] loss: 1.128


 35%|███▍      | 6999/20019 [44:45<1:49:22,  1.98it/s]

[Epoch 1, Batch  7000] loss: 1.151


 35%|███▍      | 7000/20019 [44:49<5:18:39,  1.47s/it]

****** Model checkpoint saved at epochs 2 ******


 35%|███▌      | 7100/20019 [45:32<1:26:10,  2.50it/s]

[Epoch 1, Batch  7100] loss: 1.100


 36%|███▌      | 7200/20019 [46:10<1:18:57,  2.71it/s]

[Epoch 1, Batch  7200] loss: 1.154


 36%|███▋      | 7300/20019 [46:46<1:14:40,  2.84it/s]

[Epoch 1, Batch  7300] loss: 1.163


 37%|███▋      | 7400/20019 [47:31<6:19:48,  1.81s/it]

[Epoch 1, Batch  7400] loss: 1.138


 37%|███▋      | 7500/20019 [48:09<1:15:24,  2.77it/s]

[Epoch 1, Batch  7500] loss: 1.161


 38%|███▊      | 7600/20019 [48:48<1:18:01,  2.65it/s]

[Epoch 1, Batch  7600] loss: 1.180


 38%|███▊      | 7700/20019 [49:32<1:25:54,  2.39it/s]

[Epoch 1, Batch  7700] loss: 1.152


 39%|███▉      | 7800/20019 [50:10<1:26:58,  2.34it/s]

[Epoch 1, Batch  7800] loss: 1.132


 39%|███▉      | 7900/20019 [50:58<1:57:42,  1.72it/s]

[Epoch 1, Batch  7900] loss: 1.123


 40%|███▉      | 7999/20019 [51:37<1:13:56,  2.71it/s]

[Epoch 1, Batch  8000] loss: 1.140


 40%|███▉      | 8000/20019 [51:40<3:49:05,  1.14s/it]

****** Model checkpoint saved at epochs 2 ******


 40%|████      | 8100/20019 [52:25<1:11:16,  2.79it/s]

[Epoch 1, Batch  8100] loss: 1.126


 41%|████      | 8200/20019 [53:09<1:09:38,  2.83it/s]

[Epoch 1, Batch  8200] loss: 1.195


 41%|████▏     | 8300/20019 [53:52<1:13:57,  2.64it/s]

[Epoch 1, Batch  8300] loss: 1.124


 42%|████▏     | 8400/20019 [54:35<1:29:46,  2.16it/s]

[Epoch 1, Batch  8400] loss: 1.178


 42%|████▏     | 8500/20019 [55:19<1:12:26,  2.65it/s]

[Epoch 1, Batch  8500] loss: 1.205


 43%|████▎     | 8600/20019 [55:57<1:10:28,  2.70it/s]

[Epoch 1, Batch  8600] loss: 1.138


 43%|████▎     | 8700/20019 [56:44<2:13:21,  1.41it/s]

[Epoch 1, Batch  8700] loss: 1.130


 44%|████▍     | 8800/20019 [57:26<1:05:58,  2.83it/s]

[Epoch 1, Batch  8800] loss: 1.133


 44%|████▍     | 8900/20019 [58:05<1:13:43,  2.51it/s]

[Epoch 1, Batch  8900] loss: 1.162


 45%|████▍     | 8999/20019 [58:52<4:28:46,  1.46s/it]

[Epoch 1, Batch  9000] loss: 1.132


 45%|████▍     | 9000/20019 [58:54<5:30:57,  1.80s/it]

****** Model checkpoint saved at epochs 2 ******


 45%|████▌     | 9100/20019 [59:41<1:12:57,  2.49it/s]

[Epoch 1, Batch  9100] loss: 1.155


 46%|████▌     | 9200/20019 [1:00:20<1:12:23,  2.49it/s]

[Epoch 1, Batch  9200] loss: 1.176


 46%|████▋     | 9300/20019 [1:01:09<1:09:04,  2.59it/s]

[Epoch 1, Batch  9300] loss: 1.140


 47%|████▋     | 9400/20019 [1:01:50<1:07:09,  2.64it/s]

[Epoch 1, Batch  9400] loss: 1.102


 47%|████▋     | 9500/20019 [1:02:34<1:11:57,  2.44it/s]

[Epoch 1, Batch  9500] loss: 1.128


 48%|████▊     | 9600/20019 [1:03:20<1:18:52,  2.20it/s]

[Epoch 1, Batch  9600] loss: 1.156


 48%|████▊     | 9700/20019 [1:04:02<1:06:14,  2.60it/s]

[Epoch 1, Batch  9700] loss: 1.098


 49%|████▉     | 9800/20019 [1:04:41<1:06:50,  2.55it/s]

[Epoch 1, Batch  9800] loss: 1.105


 49%|████▉     | 9900/20019 [1:05:21<1:03:56,  2.64it/s]

[Epoch 1, Batch  9900] loss: 1.113


 50%|████▉     | 9999/20019 [1:06:14<1:25:33,  1.95it/s]

[Epoch 1, Batch 10000] loss: 1.152


 50%|████▉     | 10000/20019 [1:06:17<3:09:43,  1.14s/it]

****** Model checkpoint saved at epochs 2 ******


 50%|█████     | 10100/20019 [1:07:06<1:07:10,  2.46it/s]

[Epoch 1, Batch 10100] loss: 1.138


 51%|█████     | 10200/20019 [1:07:51<1:12:24,  2.26it/s]

[Epoch 1, Batch 10200] loss: 1.167


 51%|█████▏    | 10300/20019 [1:08:34<1:00:13,  2.69it/s]

[Epoch 1, Batch 10300] loss: 1.194


 52%|█████▏    | 10400/20019 [1:09:21<1:04:04,  2.50it/s]

[Epoch 1, Batch 10400] loss: 1.170


 52%|█████▏    | 10500/20019 [1:10:03<1:13:21,  2.16it/s]

[Epoch 1, Batch 10500] loss: 1.128


 53%|█████▎    | 10600/20019 [1:10:43<1:07:06,  2.34it/s]

[Epoch 1, Batch 10600] loss: 1.149


 53%|█████▎    | 10700/20019 [1:11:28<2:51:39,  1.11s/it]

[Epoch 1, Batch 10700] loss: 1.155


 54%|█████▍    | 10800/20019 [1:12:17<58:51,  2.61it/s]  

[Epoch 1, Batch 10800] loss: 1.101


 54%|█████▍    | 10900/20019 [1:13:06<1:28:02,  1.73it/s]

[Epoch 1, Batch 10900] loss: 1.138


 55%|█████▍    | 10999/20019 [1:13:49<1:10:22,  2.14it/s]

[Epoch 1, Batch 11000] loss: 1.104


 55%|█████▍    | 11000/20019 [1:13:52<3:02:09,  1.21s/it]

****** Model checkpoint saved at epochs 2 ******


 55%|█████▌    | 11100/20019 [1:14:43<58:23,  2.55it/s]  

[Epoch 1, Batch 11100] loss: 1.170


 56%|█████▌    | 11200/20019 [1:15:35<1:40:40,  1.46it/s]

[Epoch 1, Batch 11200] loss: 1.143


 56%|█████▋    | 11300/20019 [1:16:24<1:24:55,  1.71it/s]

[Epoch 1, Batch 11300] loss: 1.143


 57%|█████▋    | 11400/20019 [1:17:12<59:35,  2.41it/s]  

[Epoch 1, Batch 11400] loss: 1.158


 57%|█████▋    | 11500/20019 [1:17:59<57:54,  2.45it/s]  

[Epoch 1, Batch 11500] loss: 1.157


 58%|█████▊    | 11600/20019 [1:18:45<58:50,  2.38it/s]  

[Epoch 1, Batch 11600] loss: 1.111


 58%|█████▊    | 11700/20019 [1:19:29<58:49,  2.36it/s]  

[Epoch 1, Batch 11700] loss: 1.148


 59%|█████▉    | 11800/20019 [1:20:21<54:52,  2.50it/s]  

[Epoch 1, Batch 11800] loss: 1.131


 59%|█████▉    | 11900/20019 [1:21:12<1:18:15,  1.73it/s]

[Epoch 1, Batch 11900] loss: 1.147


 60%|█████▉    | 11999/20019 [1:21:58<52:07,  2.56it/s]  

[Epoch 1, Batch 12000] loss: 1.167


 60%|█████▉    | 12000/20019 [1:22:01<2:32:42,  1.14s/it]

****** Model checkpoint saved at epochs 2 ******


 60%|██████    | 12100/20019 [1:22:52<1:03:31,  2.08it/s]

[Epoch 1, Batch 12100] loss: 1.118


 61%|██████    | 12200/20019 [1:23:43<56:41,  2.30it/s]  

[Epoch 1, Batch 12200] loss: 1.155


 61%|██████▏   | 12300/20019 [1:24:34<1:09:34,  1.85it/s]

[Epoch 1, Batch 12300] loss: 1.105


 62%|██████▏   | 12400/20019 [1:25:26<52:25,  2.42it/s]  

[Epoch 1, Batch 12400] loss: 1.128


 62%|██████▏   | 12500/20019 [1:26:11<58:44,  2.13it/s]  

[Epoch 1, Batch 12500] loss: 1.134


 63%|██████▎   | 12600/20019 [1:27:09<1:00:42,  2.04it/s]

[Epoch 1, Batch 12600] loss: 1.141


 63%|██████▎   | 12700/20019 [1:28:03<58:27,  2.09it/s]  

[Epoch 1, Batch 12700] loss: 1.102


 64%|██████▍   | 12800/20019 [1:28:53<56:29,  2.13it/s]  

[Epoch 1, Batch 12800] loss: 1.148


 64%|██████▍   | 12900/20019 [1:29:47<49:12,  2.41it/s]  

[Epoch 1, Batch 12900] loss: 1.147


 65%|██████▍   | 12999/20019 [1:30:38<53:06,  2.20it/s]  

[Epoch 1, Batch 13000] loss: 1.128


 65%|██████▍   | 13000/20019 [1:30:40<2:09:46,  1.11s/it]

****** Model checkpoint saved at epochs 2 ******


 65%|██████▌   | 13100/20019 [1:31:35<54:06,  2.13it/s]  

[Epoch 1, Batch 13100] loss: 1.098


 66%|██████▌   | 13200/20019 [1:32:23<48:56,  2.32it/s]  

[Epoch 1, Batch 13200] loss: 1.164


 66%|██████▋   | 13300/20019 [1:33:19<52:15,  2.14it/s]  

[Epoch 1, Batch 13300] loss: 1.119


 67%|██████▋   | 13400/20019 [1:34:16<1:42:35,  1.08it/s]

[Epoch 1, Batch 13400] loss: 1.161


 67%|██████▋   | 13500/20019 [1:35:11<48:00,  2.26it/s]  

[Epoch 1, Batch 13500] loss: 1.109


 68%|██████▊   | 13600/20019 [1:36:01<53:49,  1.99it/s]  

[Epoch 1, Batch 13600] loss: 1.133


 68%|██████▊   | 13700/20019 [1:36:59<51:46,  2.03it/s]  

[Epoch 1, Batch 13700] loss: 1.170


 69%|██████▉   | 13800/20019 [1:37:57<59:56,  1.73it/s]  

[Epoch 1, Batch 13800] loss: 1.116


 69%|██████▉   | 13900/20019 [1:38:53<1:18:18,  1.30it/s]

[Epoch 1, Batch 13900] loss: 1.115


 70%|██████▉   | 13999/20019 [1:39:47<54:05,  1.85it/s]  

[Epoch 1, Batch 14000] loss: 1.120


 70%|██████▉   | 14000/20019 [1:39:49<1:52:45,  1.12s/it]

****** Model checkpoint saved at epochs 2 ******


 70%|███████   | 14100/20019 [1:40:42<59:40,  1.65it/s]  

[Epoch 1, Batch 14100] loss: 1.178


 71%|███████   | 14200/20019 [1:41:45<56:43,  1.71it/s]  

[Epoch 1, Batch 14200] loss: 1.125


 71%|███████▏  | 14300/20019 [1:42:44<55:23,  1.72it/s]  

[Epoch 1, Batch 14300] loss: 1.073


 72%|███████▏  | 14400/20019 [1:43:38<55:36,  1.68it/s]  

[Epoch 1, Batch 14400] loss: 1.146


 72%|███████▏  | 14500/20019 [1:44:40<47:29,  1.94it/s]  

[Epoch 1, Batch 14500] loss: 1.117


 73%|███████▎  | 14600/20019 [1:45:40<47:37,  1.90it/s]  

[Epoch 1, Batch 14600] loss: 1.152


 73%|███████▎  | 14700/20019 [1:46:34<44:31,  1.99it/s]

[Epoch 1, Batch 14700] loss: 1.084


 74%|███████▍  | 14800/20019 [1:47:37<54:52,  1.59it/s]  

[Epoch 1, Batch 14800] loss: 1.167


 74%|███████▍  | 14900/20019 [1:48:40<51:06,  1.67it/s]  

[Epoch 1, Batch 14900] loss: 1.156


 75%|███████▍  | 14999/20019 [1:49:42<44:22,  1.89it/s]  

[Epoch 1, Batch 15000] loss: 1.117


 75%|███████▍  | 15000/20019 [1:49:45<1:53:41,  1.36s/it]

****** Model checkpoint saved at epochs 2 ******


 75%|███████▌  | 15100/20019 [1:50:43<56:18,  1.46it/s]  

[Epoch 1, Batch 15100] loss: 1.135


 76%|███████▌  | 15200/20019 [1:51:52<49:30,  1.62it/s]  

[Epoch 1, Batch 15200] loss: 1.143


 76%|███████▋  | 15300/20019 [1:52:56<48:23,  1.63it/s]  

[Epoch 1, Batch 15300] loss: 1.102


 77%|███████▋  | 15400/20019 [1:53:55<49:00,  1.57it/s]

[Epoch 1, Batch 15400] loss: 1.115


 77%|███████▋  | 15500/20019 [1:55:05<47:24,  1.59it/s]  

[Epoch 1, Batch 15500] loss: 1.116


 78%|███████▊  | 15600/20019 [1:56:12<46:14,  1.59it/s]  

[Epoch 1, Batch 15600] loss: 1.132


 78%|███████▊  | 15700/20019 [1:57:15<40:35,  1.77it/s]  

[Epoch 1, Batch 15700] loss: 1.142


 79%|███████▉  | 15800/20019 [1:58:25<42:47,  1.64it/s]  

[Epoch 1, Batch 15800] loss: 1.159


 79%|███████▉  | 15900/20019 [1:59:31<38:24,  1.79it/s]  

[Epoch 1, Batch 15900] loss: 1.117


 80%|███████▉  | 15999/20019 [2:00:33<45:09,  1.48it/s]

[Epoch 1, Batch 16000] loss: 1.094


 80%|███████▉  | 16000/20019 [2:00:36<1:31:42,  1.37s/it]

****** Model checkpoint saved at epochs 2 ******


 80%|████████  | 16100/20019 [2:01:42<40:19,  1.62it/s]  

[Epoch 1, Batch 16100] loss: 1.112


 81%|████████  | 16200/20019 [2:02:48<35:47,  1.78it/s]

[Epoch 1, Batch 16200] loss: 1.127


 81%|████████▏ | 16300/20019 [2:04:01<37:11,  1.67it/s]  

[Epoch 1, Batch 16300] loss: 1.069


 82%|████████▏ | 16400/20019 [2:05:06<34:18,  1.76it/s]

[Epoch 1, Batch 16400] loss: 1.145


 82%|████████▏ | 16500/20019 [2:06:19<35:12,  1.67it/s]  

[Epoch 1, Batch 16500] loss: 1.099


 83%|████████▎ | 16600/20019 [2:07:31<34:50,  1.64it/s]  

[Epoch 1, Batch 16600] loss: 1.070


 83%|████████▎ | 16700/20019 [2:08:42<36:34,  1.51it/s]  

[Epoch 1, Batch 16700] loss: 1.126


 84%|████████▍ | 16800/20019 [2:09:52<33:02,  1.62it/s]  

[Epoch 1, Batch 16800] loss: 1.047


 84%|████████▍ | 16900/20019 [2:10:56<32:33,  1.60it/s]

[Epoch 1, Batch 16900] loss: 1.097


 85%|████████▍ | 16999/20019 [2:11:59<29:15,  1.72it/s]

[Epoch 1, Batch 17000] loss: 1.077


 85%|████████▍ | 17000/20019 [2:12:03<1:14:20,  1.48s/it]

****** Model checkpoint saved at epochs 2 ******


 85%|████████▌ | 17100/20019 [2:13:18<29:32,  1.65it/s]  

[Epoch 1, Batch 17100] loss: 1.131


 86%|████████▌ | 17200/20019 [2:14:23<29:56,  1.57it/s]

[Epoch 1, Batch 17200] loss: 1.114


 86%|████████▋ | 17300/20019 [2:15:28<26:40,  1.70it/s]

[Epoch 1, Batch 17300] loss: 1.088


 87%|████████▋ | 17400/20019 [2:16:43<25:44,  1.70it/s]  

[Epoch 1, Batch 17400] loss: 1.104


 87%|████████▋ | 17500/20019 [2:17:49<26:13,  1.60it/s]

[Epoch 1, Batch 17500] loss: 1.113


 88%|████████▊ | 17600/20019 [2:18:55<23:27,  1.72it/s]

[Epoch 1, Batch 17600] loss: 1.102


 88%|████████▊ | 17700/20019 [2:20:02<27:05,  1.43it/s]

[Epoch 1, Batch 17700] loss: 1.146


 89%|████████▉ | 17800/20019 [2:21:08<25:27,  1.45it/s]

[Epoch 1, Batch 17800] loss: 1.089


 89%|████████▉ | 17900/20019 [2:22:15<23:08,  1.53it/s]

[Epoch 1, Batch 17900] loss: 1.109


 90%|████████▉ | 17999/20019 [2:23:22<24:17,  1.39it/s]

[Epoch 1, Batch 18000] loss: 1.147


 90%|████████▉ | 18000/20019 [2:23:24<46:27,  1.38s/it]

****** Model checkpoint saved at epochs 2 ******


 90%|█████████ | 18100/20019 [2:24:37<21:13,  1.51it/s]

[Epoch 1, Batch 18100] loss: 1.140


 91%|█████████ | 18200/20019 [2:25:46<18:28,  1.64it/s]

[Epoch 1, Batch 18200] loss: 1.086


 91%|█████████▏| 18300/20019 [2:26:54<17:25,  1.64it/s]

[Epoch 1, Batch 18300] loss: 1.094


 92%|█████████▏| 18400/20019 [2:28:02<16:24,  1.64it/s]

[Epoch 1, Batch 18400] loss: 1.142


 92%|█████████▏| 18500/20019 [2:29:12<16:42,  1.52it/s]

[Epoch 1, Batch 18500] loss: 1.098


 93%|█████████▎| 18600/20019 [2:30:19<14:09,  1.67it/s]

[Epoch 1, Batch 18600] loss: 1.117


 93%|█████████▎| 18700/20019 [2:31:27<13:38,  1.61it/s]

[Epoch 1, Batch 18700] loss: 1.131


 94%|█████████▍| 18800/20019 [2:32:35<12:43,  1.60it/s]

[Epoch 1, Batch 18800] loss: 1.105


 94%|█████████▍| 18900/20019 [2:33:44<11:37,  1.60it/s]

[Epoch 1, Batch 18900] loss: 1.127


 95%|█████████▍| 18999/20019 [2:34:52<13:44,  1.24it/s]

[Epoch 1, Batch 19000] loss: 1.116


 95%|█████████▍| 19000/20019 [2:34:56<27:14,  1.60s/it]

****** Model checkpoint saved at epochs 2 ******


 95%|█████████▌| 19100/20019 [2:36:10<11:05,  1.38it/s]

[Epoch 1, Batch 19100] loss: 1.101


 96%|█████████▌| 19200/20019 [2:37:18<09:28,  1.44it/s]

[Epoch 1, Batch 19200] loss: 1.103


 96%|█████████▋| 19300/20019 [2:38:26<08:04,  1.48it/s]

[Epoch 1, Batch 19300] loss: 1.105


 97%|█████████▋| 19400/20019 [2:39:42<08:38,  1.19it/s]

[Epoch 1, Batch 19400] loss: 1.123


 97%|█████████▋| 19500/20019 [2:40:57<06:19,  1.37it/s]

[Epoch 1, Batch 19500] loss: 1.096


 98%|█████████▊| 19600/20019 [2:42:07<04:29,  1.56it/s]

[Epoch 1, Batch 19600] loss: 1.110


 98%|█████████▊| 19700/20019 [2:43:24<03:24,  1.56it/s]

[Epoch 1, Batch 19700] loss: 1.113


 99%|█████████▉| 19800/20019 [2:44:35<02:12,  1.65it/s]

[Epoch 1, Batch 19800] loss: 1.100


 99%|█████████▉| 19900/20019 [2:45:52<01:24,  1.41it/s]

[Epoch 1, Batch 19900] loss: 1.080


100%|█████████▉| 19999/20019 [2:47:08<00:14,  1.43it/s]

[Epoch 1, Batch 20000] loss: 1.105


100%|█████████▉| 20000/20019 [2:47:11<00:28,  1.48s/it]

****** Model checkpoint saved at epochs 2 ******


100%|██████████| 20019/20019 [2:47:24<00:00,  1.99it/s]
  0%|          | 100/20019 [00:35<1:54:15,  2.91it/s]

[Epoch 2, Batch   100] loss: 1.027


  1%|          | 200/20019 [01:10<1:54:40,  2.88it/s]

[Epoch 2, Batch   200] loss: 1.012


  1%|▏         | 300/20019 [01:45<1:54:53,  2.86it/s]

[Epoch 2, Batch   300] loss: 1.005


  2%|▏         | 400/20019 [02:21<1:53:14,  2.89it/s]

[Epoch 2, Batch   400] loss: 1.010


  2%|▏         | 500/20019 [02:56<1:52:55,  2.88it/s]

[Epoch 2, Batch   500] loss: 0.995


  3%|▎         | 600/20019 [03:31<1:52:13,  2.88it/s]

[Epoch 2, Batch   600] loss: 0.989


  3%|▎         | 700/20019 [04:08<1:53:36,  2.83it/s]

[Epoch 2, Batch   700] loss: 1.048


  4%|▍         | 800/20019 [04:43<1:51:14,  2.88it/s]

[Epoch 2, Batch   800] loss: 1.001


  4%|▍         | 900/20019 [05:18<1:50:47,  2.88it/s]

[Epoch 2, Batch   900] loss: 1.008


  5%|▍         | 999/20019 [05:52<1:50:02,  2.88it/s]

[Epoch 2, Batch  1000] loss: 0.993


  5%|▍         | 1000/20019 [05:55<5:23:07,  1.02s/it]

****** Model checkpoint saved at epochs 3 ******


  5%|▌         | 1100/20019 [06:29<1:48:51,  2.90it/s]

[Epoch 2, Batch  1100] loss: 1.038


  6%|▌         | 1200/20019 [07:04<1:48:32,  2.89it/s]

[Epoch 2, Batch  1200] loss: 1.013


  6%|▋         | 1300/20019 [07:39<1:48:19,  2.88it/s]

[Epoch 2, Batch  1300] loss: 1.016


  7%|▋         | 1400/20019 [08:21<1:47:28,  2.89it/s]

[Epoch 2, Batch  1400] loss: 0.948


  7%|▋         | 1500/20019 [08:57<1:47:00,  2.88it/s]

[Epoch 2, Batch  1500] loss: 0.993


  8%|▊         | 1600/20019 [09:33<1:47:07,  2.87it/s]

[Epoch 2, Batch  1600] loss: 1.015


  8%|▊         | 1700/20019 [10:08<1:45:46,  2.89it/s]

[Epoch 2, Batch  1700] loss: 0.976


  9%|▉         | 1800/20019 [10:43<1:45:22,  2.88it/s]

[Epoch 2, Batch  1800] loss: 0.995


  9%|▉         | 1900/20019 [11:19<1:44:33,  2.89it/s]

[Epoch 2, Batch  1900] loss: 0.968


 10%|▉         | 1999/20019 [11:55<1:44:35,  2.87it/s]

[Epoch 2, Batch  2000] loss: 1.006


 10%|▉         | 2000/20019 [11:58<5:19:58,  1.07s/it]

****** Model checkpoint saved at epochs 3 ******


 10%|█         | 2100/20019 [12:34<1:43:10,  2.89it/s]

[Epoch 2, Batch  2100] loss: 1.027


 11%|█         | 2200/20019 [13:09<1:42:42,  2.89it/s]

[Epoch 2, Batch  2200] loss: 1.004


 11%|█▏        | 2300/20019 [13:44<1:42:15,  2.89it/s]

[Epoch 2, Batch  2300] loss: 1.008


 12%|█▏        | 2400/20019 [14:18<1:42:01,  2.88it/s]

[Epoch 2, Batch  2400] loss: 0.992


 12%|█▏        | 2500/20019 [14:53<1:41:55,  2.86it/s]

[Epoch 2, Batch  2500] loss: 0.990


 13%|█▎        | 2600/20019 [15:35<1:40:25,  2.89it/s] 

[Epoch 2, Batch  2600] loss: 1.030


 13%|█▎        | 2700/20019 [16:10<1:39:46,  2.89it/s]

[Epoch 2, Batch  2700] loss: 1.000


 14%|█▍        | 2800/20019 [16:45<1:39:01,  2.90it/s]

[Epoch 2, Batch  2800] loss: 0.974


 14%|█▍        | 2900/20019 [17:19<1:38:42,  2.89it/s]

[Epoch 2, Batch  2900] loss: 0.981


 15%|█▍        | 2999/20019 [17:54<1:39:01,  2.86it/s]

[Epoch 2, Batch  3000] loss: 0.995


 15%|█▍        | 3000/20019 [17:57<5:02:35,  1.07s/it]

****** Model checkpoint saved at epochs 3 ******


 15%|█▌        | 3100/20019 [18:39<1:37:36,  2.89it/s]

[Epoch 2, Batch  3100] loss: 0.994


 16%|█▌        | 3200/20019 [19:14<1:37:16,  2.88it/s]

[Epoch 2, Batch  3200] loss: 1.030


 16%|█▋        | 3300/20019 [19:48<1:36:30,  2.89it/s]

[Epoch 2, Batch  3300] loss: 1.019


 17%|█▋        | 3400/20019 [20:28<1:57:52,  2.35it/s]

[Epoch 2, Batch  3400] loss: 1.012


 17%|█▋        | 3500/20019 [21:02<1:35:20,  2.89it/s]

[Epoch 2, Batch  3500] loss: 1.006


 18%|█▊        | 3600/20019 [21:39<1:34:24,  2.90it/s]

[Epoch 2, Batch  3600] loss: 0.990


 18%|█▊        | 3700/20019 [22:14<1:33:45,  2.90it/s]

[Epoch 2, Batch  3700] loss: 1.014


 19%|█▉        | 3800/20019 [22:49<1:34:11,  2.87it/s]

[Epoch 2, Batch  3800] loss: 1.029


 19%|█▉        | 3900/20019 [23:25<1:33:15,  2.88it/s]

[Epoch 2, Batch  3900] loss: 1.013


 20%|█▉        | 3999/20019 [24:00<1:33:43,  2.85it/s]

[Epoch 2, Batch  4000] loss: 0.932


 20%|█▉        | 4000/20019 [24:03<4:55:55,  1.11s/it]

****** Model checkpoint saved at epochs 3 ******


 20%|██        | 4100/20019 [24:41<1:47:34,  2.47it/s]

[Epoch 2, Batch  4100] loss: 1.001


 21%|██        | 4200/20019 [25:16<1:32:08,  2.86it/s]

[Epoch 2, Batch  4200] loss: 0.982


 21%|██▏       | 4300/20019 [25:51<1:31:01,  2.88it/s]

[Epoch 2, Batch  4300] loss: 0.938


 22%|██▏       | 4400/20019 [26:28<1:30:29,  2.88it/s]

[Epoch 2, Batch  4400] loss: 0.963


 22%|██▏       | 4500/20019 [27:03<1:30:11,  2.87it/s]

[Epoch 2, Batch  4500] loss: 1.007


 23%|██▎       | 4600/20019 [27:39<1:29:38,  2.87it/s]

[Epoch 2, Batch  4600] loss: 0.995


 23%|██▎       | 4700/20019 [28:15<1:28:20,  2.89it/s]

[Epoch 2, Batch  4700] loss: 0.935


 24%|██▍       | 4800/20019 [28:49<1:27:37,  2.89it/s]

[Epoch 2, Batch  4800] loss: 0.991


 24%|██▍       | 4900/20019 [29:24<1:28:23,  2.85it/s]

[Epoch 2, Batch  4900] loss: 1.026


 25%|██▍       | 4999/20019 [30:04<1:28:25,  2.83it/s]

[Epoch 2, Batch  5000] loss: 1.001


 25%|██▍       | 5000/20019 [30:07<4:28:38,  1.07s/it]

****** Model checkpoint saved at epochs 3 ******


 25%|██▌       | 5100/20019 [30:45<1:40:33,  2.47it/s]

[Epoch 2, Batch  5100] loss: 1.039


 26%|██▌       | 5200/20019 [31:21<1:26:24,  2.86it/s]

[Epoch 2, Batch  5200] loss: 1.012


 26%|██▋       | 5300/20019 [31:56<1:24:59,  2.89it/s]

[Epoch 2, Batch  5300] loss: 0.989


 27%|██▋       | 5400/20019 [32:33<1:24:28,  2.88it/s]

[Epoch 2, Batch  5400] loss: 0.986


 27%|██▋       | 5500/20019 [33:09<1:24:02,  2.88it/s]

[Epoch 2, Batch  5500] loss: 0.990


 28%|██▊       | 5600/20019 [33:45<1:25:37,  2.81it/s]

[Epoch 2, Batch  5600] loss: 0.965


 28%|██▊       | 5700/20019 [34:21<1:23:06,  2.87it/s]

[Epoch 2, Batch  5700] loss: 0.988


 29%|██▉       | 5800/20019 [34:56<1:22:44,  2.86it/s]

[Epoch 2, Batch  5800] loss: 0.994


 29%|██▉       | 5900/20019 [35:35<1:22:43,  2.84it/s]

[Epoch 2, Batch  5900] loss: 1.002


 30%|██▉       | 5999/20019 [36:10<1:22:07,  2.85it/s]

[Epoch 2, Batch  6000] loss: 1.017


 30%|██▉       | 6000/20019 [36:13<4:05:26,  1.05s/it]

****** Model checkpoint saved at epochs 3 ******


 30%|███       | 6100/20019 [36:52<1:20:53,  2.87it/s]

[Epoch 2, Batch  6100] loss: 0.986


 31%|███       | 6200/20019 [37:27<1:22:42,  2.78it/s]

[Epoch 2, Batch  6200] loss: 1.021


 31%|███▏      | 6300/20019 [38:07<1:30:38,  2.52it/s]

[Epoch 2, Batch  6300] loss: 0.981


 32%|███▏      | 6400/20019 [38:44<1:18:50,  2.88it/s]

[Epoch 2, Batch  6400] loss: 1.019


 32%|███▏      | 6500/20019 [39:22<1:52:43,  2.00it/s]

[Epoch 2, Batch  6500] loss: 1.012


 33%|███▎      | 6600/20019 [39:57<1:17:44,  2.88it/s]

[Epoch 2, Batch  6600] loss: 0.989


 33%|███▎      | 6700/20019 [40:35<1:16:33,  2.90it/s]

[Epoch 2, Batch  6700] loss: 1.048


 34%|███▍      | 6800/20019 [41:10<1:16:04,  2.90it/s]

[Epoch 2, Batch  6800] loss: 0.981


 34%|███▍      | 6900/20019 [41:49<1:16:16,  2.87it/s]

[Epoch 2, Batch  6900] loss: 1.025


 35%|███▍      | 6999/20019 [42:26<1:16:01,  2.85it/s]

[Epoch 2, Batch  7000] loss: 0.982


 35%|███▍      | 7000/20019 [42:28<3:56:01,  1.09s/it]

****** Model checkpoint saved at epochs 3 ******


 35%|███▌      | 7100/20019 [43:03<1:14:52,  2.88it/s]

[Epoch 2, Batch  7100] loss: 0.989


 36%|███▌      | 7200/20019 [43:38<1:14:10,  2.88it/s]

[Epoch 2, Batch  7200] loss: 1.008


 36%|███▋      | 7300/20019 [44:20<1:13:26,  2.89it/s]

[Epoch 2, Batch  7300] loss: 1.010


 37%|███▋      | 7400/20019 [44:57<1:13:37,  2.86it/s]

[Epoch 2, Batch  7400] loss: 0.965


 37%|███▋      | 7500/20019 [45:34<1:23:47,  2.49it/s]

[Epoch 2, Batch  7500] loss: 1.008


 38%|███▊      | 7600/20019 [46:10<1:11:54,  2.88it/s]

[Epoch 2, Batch  7600] loss: 0.984


 38%|███▊      | 7700/20019 [46:47<1:11:37,  2.87it/s]

[Epoch 2, Batch  7700] loss: 0.991


 39%|███▉      | 7800/20019 [47:22<1:11:21,  2.85it/s]

[Epoch 2, Batch  7800] loss: 1.051


 39%|███▉      | 7900/20019 [48:02<1:10:15,  2.88it/s]

[Epoch 2, Batch  7900] loss: 0.981


 40%|███▉      | 7999/20019 [48:39<1:11:49,  2.79it/s]

[Epoch 2, Batch  8000] loss: 0.973


 40%|███▉      | 8000/20019 [48:42<3:31:54,  1.06s/it]

****** Model checkpoint saved at epochs 3 ******


 40%|████      | 8100/20019 [49:20<1:16:59,  2.58it/s]

[Epoch 2, Batch  8100] loss: 1.051


 41%|████      | 8200/20019 [49:57<1:08:18,  2.88it/s]

[Epoch 2, Batch  8200] loss: 0.995


 41%|████▏     | 8300/20019 [50:34<1:07:32,  2.89it/s]

[Epoch 2, Batch  8300] loss: 1.017


 42%|████▏     | 8400/20019 [51:12<1:30:22,  2.14it/s]

[Epoch 2, Batch  8400] loss: 0.963


 42%|████▏     | 8500/20019 [51:49<1:11:46,  2.67it/s]

[Epoch 2, Batch  8500] loss: 0.982


 43%|████▎     | 8600/20019 [52:25<1:06:19,  2.87it/s]

[Epoch 2, Batch  8600] loss: 1.033


 43%|████▎     | 8700/20019 [53:01<1:08:33,  2.75it/s]

[Epoch 2, Batch  8700] loss: 1.010


 44%|████▍     | 8800/20019 [53:36<1:09:51,  2.68it/s]

[Epoch 2, Batch  8800] loss: 0.988


 44%|████▍     | 8900/20019 [54:20<1:08:39,  2.70it/s]

[Epoch 2, Batch  8900] loss: 1.013


 45%|████▍     | 8999/20019 [54:55<1:04:30,  2.85it/s]

[Epoch 2, Batch  9000] loss: 1.022


 45%|████▍     | 9000/20019 [54:57<3:13:47,  1.06s/it]

****** Model checkpoint saved at epochs 3 ******


 45%|████▌     | 9100/20019 [55:35<1:02:58,  2.89it/s]

[Epoch 2, Batch  9100] loss: 0.979


 46%|████▌     | 9200/20019 [56:11<1:08:27,  2.63it/s]

[Epoch 2, Batch  9200] loss: 0.960


 46%|████▋     | 9300/20019 [56:47<1:03:18,  2.82it/s]

[Epoch 2, Batch  9300] loss: 0.986


 47%|████▋     | 9400/20019 [57:24<1:08:54,  2.57it/s]

[Epoch 2, Batch  9400] loss: 0.993


 47%|████▋     | 9500/20019 [58:00<1:00:30,  2.90it/s]

[Epoch 2, Batch  9500] loss: 1.000


 48%|████▊     | 9600/20019 [58:44<1:01:40,  2.82it/s]

[Epoch 2, Batch  9600] loss: 1.010


 48%|████▊     | 9700/20019 [59:20<1:00:05,  2.86it/s]

[Epoch 2, Batch  9700] loss: 1.008


 49%|████▉     | 9800/20019 [59:57<59:16,  2.87it/s]  

[Epoch 2, Batch  9800] loss: 0.997


 49%|████▉     | 9900/20019 [1:00:40<1:03:03,  2.67it/s]

[Epoch 2, Batch  9900] loss: 1.007


 50%|████▉     | 9999/20019 [1:01:18<58:26,  2.86it/s]  

[Epoch 2, Batch 10000] loss: 1.009


 50%|████▉     | 10000/20019 [1:01:21<2:59:36,  1.08s/it]

****** Model checkpoint saved at epochs 3 ******


 50%|█████     | 10100/20019 [1:01:59<58:17,  2.84it/s]  

[Epoch 2, Batch 10100] loss: 0.982


 51%|█████     | 10200/20019 [1:02:40<59:19,  2.76it/s]  

[Epoch 2, Batch 10200] loss: 0.993


 51%|█████▏    | 10300/20019 [1:03:20<1:00:19,  2.68it/s]

[Epoch 2, Batch 10300] loss: 0.956


 52%|█████▏    | 10400/20019 [1:03:58<1:00:54,  2.63it/s]

[Epoch 2, Batch 10400] loss: 0.988


 52%|█████▏    | 10500/20019 [1:04:38<56:08,  2.83it/s]  

[Epoch 2, Batch 10500] loss: 0.987


 53%|█████▎    | 10600/20019 [1:05:18<1:03:56,  2.46it/s]

[Epoch 2, Batch 10600] loss: 0.986


 53%|█████▎    | 10700/20019 [1:05:58<59:58,  2.59it/s]  

[Epoch 2, Batch 10700] loss: 1.012


 54%|█████▍    | 10800/20019 [1:06:39<54:46,  2.80it/s]  

[Epoch 2, Batch 10800] loss: 1.010


 54%|█████▍    | 10900/20019 [1:07:18<1:03:19,  2.40it/s]

[Epoch 2, Batch 10900] loss: 0.982


 55%|█████▍    | 10999/20019 [1:07:55<1:08:08,  2.21it/s]

[Epoch 2, Batch 11000] loss: 1.022


 55%|█████▍    | 11000/20019 [1:07:58<2:53:40,  1.16s/it]

****** Model checkpoint saved at epochs 3 ******


 55%|█████▌    | 11100/20019 [1:08:39<53:29,  2.78it/s]  

[Epoch 2, Batch 11100] loss: 1.025


 56%|█████▌    | 11200/20019 [1:09:16<56:08,  2.62it/s]  

[Epoch 2, Batch 11200] loss: 0.981


 56%|█████▋    | 11300/20019 [1:09:57<57:36,  2.52it/s]  

[Epoch 2, Batch 11300] loss: 0.978


 57%|█████▋    | 11400/20019 [1:10:39<59:03,  2.43it/s]  

[Epoch 2, Batch 11400] loss: 0.948


 57%|█████▋    | 11500/20019 [1:11:19<1:28:01,  1.61it/s]

[Epoch 2, Batch 11500] loss: 0.965


 58%|█████▊    | 11600/20019 [1:12:02<1:49:54,  1.28it/s]

[Epoch 2, Batch 11600] loss: 0.983


 58%|█████▊    | 11700/20019 [1:12:42<52:37,  2.63it/s]  

[Epoch 2, Batch 11700] loss: 0.982


 59%|█████▉    | 11800/20019 [1:13:24<52:16,  2.62it/s]  

[Epoch 2, Batch 11800] loss: 1.018


 59%|█████▉    | 11900/20019 [1:14:04<53:05,  2.55it/s]  

[Epoch 2, Batch 11900] loss: 0.958


 60%|█████▉    | 11999/20019 [1:14:44<53:32,  2.50it/s]  

[Epoch 2, Batch 12000] loss: 1.018


 60%|█████▉    | 12000/20019 [1:14:47<2:28:17,  1.11s/it]

****** Model checkpoint saved at epochs 3 ******


 60%|██████    | 12100/20019 [1:15:26<50:04,  2.64it/s]  

[Epoch 2, Batch 12100] loss: 0.988


 61%|██████    | 12200/20019 [1:16:12<53:11,  2.45it/s]  

[Epoch 2, Batch 12200] loss: 0.969


 61%|██████▏   | 12300/20019 [1:16:54<49:23,  2.61it/s]  

[Epoch 2, Batch 12300] loss: 1.010


 62%|██████▏   | 12400/20019 [1:17:34<46:50,  2.71it/s]  

[Epoch 2, Batch 12400] loss: 0.999


 62%|██████▏   | 12500/20019 [1:18:14<46:47,  2.68it/s]  

[Epoch 2, Batch 12500] loss: 0.932


 63%|██████▎   | 12600/20019 [1:18:54<47:27,  2.61it/s]  

[Epoch 2, Batch 12600] loss: 0.981


 63%|██████▎   | 12700/20019 [1:19:33<46:21,  2.63it/s]

[Epoch 2, Batch 12700] loss: 1.018


 64%|██████▍   | 12800/20019 [1:20:14<49:34,  2.43it/s]  

[Epoch 2, Batch 12800] loss: 0.994


 64%|██████▍   | 12900/20019 [1:21:03<49:46,  2.38it/s]  

[Epoch 2, Batch 12900] loss: 0.993


 65%|██████▍   | 12999/20019 [1:21:47<46:05,  2.54it/s]  

[Epoch 2, Batch 13000] loss: 0.960


 65%|██████▍   | 13000/20019 [1:21:50<2:11:21,  1.12s/it]

****** Model checkpoint saved at epochs 3 ******


 65%|██████▌   | 13100/20019 [1:22:34<45:53,  2.51it/s]  

[Epoch 2, Batch 13100] loss: 0.994


 66%|██████▌   | 13200/20019 [1:23:16<49:27,  2.30it/s]

[Epoch 2, Batch 13200] loss: 0.981


 66%|██████▋   | 13300/20019 [1:24:04<45:08,  2.48it/s]  

[Epoch 2, Batch 13300] loss: 0.981


 67%|██████▋   | 13400/20019 [1:24:47<48:12,  2.29it/s]  

[Epoch 2, Batch 13400] loss: 1.013


 67%|██████▋   | 13500/20019 [1:25:30<47:18,  2.30it/s]  

[Epoch 2, Batch 13500] loss: 0.988


 68%|██████▊   | 13600/20019 [1:26:15<45:21,  2.36it/s]  

[Epoch 2, Batch 13600] loss: 0.973


 68%|██████▊   | 13700/20019 [1:27:01<43:35,  2.42it/s]  

[Epoch 2, Batch 13700] loss: 0.967


 69%|██████▉   | 13800/20019 [1:27:47<44:14,  2.34it/s]  

[Epoch 2, Batch 13800] loss: 1.024


 69%|██████▉   | 13900/20019 [1:28:30<41:55,  2.43it/s]  

[Epoch 2, Batch 13900] loss: 0.956


 70%|██████▉   | 13999/20019 [1:29:13<43:12,  2.32it/s]

[Epoch 2, Batch 14000] loss: 1.038


 70%|██████▉   | 14000/20019 [1:29:16<1:53:31,  1.13s/it]

****** Model checkpoint saved at epochs 3 ******


 70%|███████   | 14100/20019 [1:30:05<41:50,  2.36it/s]  

[Epoch 2, Batch 14100] loss: 1.006


 71%|███████   | 14200/20019 [1:30:53<44:41,  2.17it/s]  

[Epoch 2, Batch 14200] loss: 0.958


 71%|███████▏  | 14300/20019 [1:31:39<44:25,  2.15it/s]

[Epoch 2, Batch 14300] loss: 0.979


 72%|███████▏  | 14400/20019 [1:32:27<43:44,  2.14it/s]  

[Epoch 2, Batch 14400] loss: 1.008


 72%|███████▏  | 14500/20019 [1:33:12<37:18,  2.47it/s]

[Epoch 2, Batch 14500] loss: 0.947


 73%|███████▎  | 14600/20019 [1:34:02<48:53,  1.85it/s]  

[Epoch 2, Batch 14600] loss: 0.964


 73%|███████▎  | 14700/20019 [1:34:50<38:22,  2.31it/s]

[Epoch 2, Batch 14700] loss: 0.962


 74%|███████▍  | 14800/20019 [1:35:38<41:10,  2.11it/s]

[Epoch 2, Batch 14800] loss: 0.981


 74%|███████▍  | 14900/20019 [1:36:28<36:13,  2.35it/s]  

[Epoch 2, Batch 14900] loss: 0.981


 75%|███████▍  | 14999/20019 [1:37:18<40:38,  2.06it/s]  

[Epoch 2, Batch 15000] loss: 0.958


 75%|███████▍  | 15000/20019 [1:37:20<1:37:03,  1.16s/it]

****** Model checkpoint saved at epochs 3 ******


 75%|███████▌  | 15100/20019 [1:38:10<38:04,  2.15it/s]  

[Epoch 2, Batch 15100] loss: 1.010


 76%|███████▌  | 15200/20019 [1:38:58<41:03,  1.96it/s]

[Epoch 2, Batch 15200] loss: 0.965


 76%|███████▋  | 15300/20019 [1:39:47<40:24,  1.95it/s]

[Epoch 2, Batch 15300] loss: 0.995


 77%|███████▋  | 15400/20019 [1:40:40<35:38,  2.16it/s]  

[Epoch 2, Batch 15400] loss: 0.984


 77%|███████▋  | 15500/20019 [1:41:32<40:28,  1.86it/s]

[Epoch 2, Batch 15500] loss: 1.018


 78%|███████▊  | 15600/20019 [1:42:24<38:23,  1.92it/s]  

[Epoch 2, Batch 15600] loss: 0.981


 78%|███████▊  | 15700/20019 [1:43:15<34:26,  2.09it/s]  

[Epoch 2, Batch 15700] loss: 0.989


 79%|███████▉  | 15800/20019 [1:44:09<33:46,  2.08it/s]  

[Epoch 2, Batch 15800] loss: 0.989


 79%|███████▉  | 15900/20019 [1:45:01<34:16,  2.00it/s]

[Epoch 2, Batch 15900] loss: 1.002


 80%|███████▉  | 15999/20019 [1:45:52<33:33,  2.00it/s]

[Epoch 2, Batch 16000] loss: 0.983


 80%|███████▉  | 16000/20019 [1:45:55<1:24:32,  1.26s/it]

****** Model checkpoint saved at epochs 3 ******


 80%|████████  | 16100/20019 [1:46:52<33:46,  1.93it/s]  

[Epoch 2, Batch 16100] loss: 1.025


 81%|████████  | 16200/20019 [1:47:50<31:55,  1.99it/s]  

[Epoch 2, Batch 16200] loss: 0.978


 81%|████████▏ | 16300/20019 [1:48:43<31:59,  1.94it/s]

[Epoch 2, Batch 16300] loss: 1.040


 82%|████████▏ | 16400/20019 [1:49:42<30:43,  1.96it/s]  

[Epoch 2, Batch 16400] loss: 1.007


 82%|████████▏ | 16500/20019 [1:50:41<35:05,  1.67it/s]  

[Epoch 2, Batch 16500] loss: 1.011


 83%|████████▎ | 16600/20019 [1:51:40<30:16,  1.88it/s]  

[Epoch 2, Batch 16600] loss: 0.982


 83%|████████▎ | 16700/20019 [1:52:39<28:39,  1.93it/s]

[Epoch 2, Batch 16700] loss: 0.990


 84%|████████▍ | 16800/20019 [1:53:39<28:47,  1.86it/s]

[Epoch 2, Batch 16800] loss: 0.985


 84%|████████▍ | 16900/20019 [1:54:41<28:36,  1.82it/s]

[Epoch 2, Batch 16900] loss: 1.006


 85%|████████▍ | 16999/20019 [1:55:41<33:07,  1.52it/s]  

[Epoch 2, Batch 17000] loss: 1.005


 85%|████████▍ | 17000/20019 [1:55:44<1:04:30,  1.28s/it]

****** Model checkpoint saved at epochs 3 ******


 85%|████████▌ | 17100/20019 [1:56:47<31:43,  1.53it/s]  

[Epoch 2, Batch 17100] loss: 1.001


 86%|████████▌ | 17200/20019 [1:57:48<27:20,  1.72it/s]

[Epoch 2, Batch 17200] loss: 0.975


 86%|████████▋ | 17300/20019 [1:58:50<26:44,  1.69it/s]

[Epoch 2, Batch 17300] loss: 0.996


 87%|████████▋ | 17400/20019 [1:59:54<25:49,  1.69it/s]

[Epoch 2, Batch 17400] loss: 0.954


 87%|████████▋ | 17500/20019 [2:00:54<24:08,  1.74it/s]

[Epoch 2, Batch 17500] loss: 0.944


 88%|████████▊ | 17600/20019 [2:02:00<25:45,  1.57it/s]  

[Epoch 2, Batch 17600] loss: 1.006


 88%|████████▊ | 17700/20019 [2:03:04<22:23,  1.73it/s]

[Epoch 2, Batch 17700] loss: 0.973


 89%|████████▉ | 17800/20019 [2:04:08<22:52,  1.62it/s]

[Epoch 2, Batch 17800] loss: 0.987


 89%|████████▉ | 17900/20019 [2:05:13<19:52,  1.78it/s]

[Epoch 2, Batch 17900] loss: 0.972


 90%|████████▉ | 17999/20019 [2:06:17<21:37,  1.56it/s]

[Epoch 2, Batch 18000] loss: 1.010


 90%|████████▉ | 18000/20019 [2:06:20<40:00,  1.19s/it]

****** Model checkpoint saved at epochs 3 ******


 90%|█████████ | 18100/20019 [2:07:26<17:30,  1.83it/s]

[Epoch 2, Batch 18100] loss: 0.987


 91%|█████████ | 18200/20019 [2:08:33<18:03,  1.68it/s]

[Epoch 2, Batch 18200] loss: 0.977


 91%|█████████▏| 18300/20019 [2:09:40<16:31,  1.73it/s]

[Epoch 2, Batch 18300] loss: 0.957


 92%|█████████▏| 18400/20019 [2:10:45<16:25,  1.64it/s]

[Epoch 2, Batch 18400] loss: 0.993


 92%|█████████▏| 18500/20019 [2:11:50<14:45,  1.72it/s]

[Epoch 2, Batch 18500] loss: 1.045


 93%|█████████▎| 18600/20019 [2:12:57<14:20,  1.65it/s]

[Epoch 2, Batch 18600] loss: 0.952


 93%|█████████▎| 18700/20019 [2:14:05<14:10,  1.55it/s]

[Epoch 2, Batch 18700] loss: 1.014


 94%|█████████▍| 18800/20019 [2:15:11<12:47,  1.59it/s]

[Epoch 2, Batch 18800] loss: 0.966


 94%|█████████▍| 18900/20019 [2:16:18<10:59,  1.70it/s]

[Epoch 2, Batch 18900] loss: 1.018


 95%|█████████▍| 18999/20019 [2:17:24<12:17,  1.38it/s]

[Epoch 2, Batch 19000] loss: 0.971


 95%|█████████▍| 19000/20019 [2:17:27<24:20,  1.43s/it]

****** Model checkpoint saved at epochs 3 ******


 95%|█████████▌| 19100/20019 [2:18:37<10:02,  1.53it/s]

[Epoch 2, Batch 19100] loss: 0.975


 96%|█████████▌| 19200/20019 [2:19:45<07:55,  1.72it/s]

[Epoch 2, Batch 19200] loss: 0.982


 96%|█████████▋| 19300/20019 [2:20:54<07:17,  1.64it/s]

[Epoch 2, Batch 19300] loss: 0.956


 97%|█████████▋| 19400/20019 [2:22:04<06:42,  1.54it/s]

[Epoch 2, Batch 19400] loss: 0.976


 97%|█████████▋| 19500/20019 [2:23:12<05:27,  1.59it/s]

[Epoch 2, Batch 19500] loss: 1.034


 98%|█████████▊| 19600/20019 [2:24:24<04:31,  1.54it/s]

[Epoch 2, Batch 19600] loss: 0.942


 98%|█████████▊| 19700/20019 [2:25:34<03:16,  1.62it/s]

[Epoch 2, Batch 19700] loss: 1.029


 99%|█████████▉| 19800/20019 [2:26:44<02:14,  1.63it/s]

[Epoch 2, Batch 19800] loss: 1.023


 99%|█████████▉| 19900/20019 [2:27:55<01:13,  1.61it/s]

[Epoch 2, Batch 19900] loss: 0.999


100%|█████████▉| 19999/20019 [2:29:02<00:15,  1.29it/s]

[Epoch 2, Batch 20000] loss: 0.982


100%|█████████▉| 20000/20019 [2:29:04<00:24,  1.30s/it]

****** Model checkpoint saved at epochs 3 ******


100%|██████████| 20019/20019 [2:29:16<00:00,  2.24it/s]
  0%|          | 100/20019 [00:38<1:56:33,  2.85it/s]

[Epoch 3, Batch   100] loss: 0.895


  1%|          | 200/20019 [01:13<1:55:54,  2.85it/s]

[Epoch 3, Batch   200] loss: 0.873


  1%|▏         | 300/20019 [01:49<1:55:33,  2.84it/s]

[Epoch 3, Batch   300] loss: 0.893


  2%|▏         | 400/20019 [02:24<1:54:28,  2.86it/s]

[Epoch 3, Batch   400] loss: 0.913


  2%|▏         | 500/20019 [02:59<1:54:23,  2.84it/s]

[Epoch 3, Batch   500] loss: 0.888


  3%|▎         | 600/20019 [03:34<1:54:01,  2.84it/s]

[Epoch 3, Batch   600] loss: 0.882


  3%|▎         | 700/20019 [04:09<1:53:18,  2.84it/s]

[Epoch 3, Batch   700] loss: 0.870


  4%|▍         | 800/20019 [04:44<1:51:37,  2.87it/s]

[Epoch 3, Batch   800] loss: 0.862


  4%|▍         | 900/20019 [05:19<1:50:49,  2.88it/s]

[Epoch 3, Batch   900] loss: 0.880


  5%|▍         | 999/20019 [05:54<1:50:42,  2.86it/s]

[Epoch 3, Batch  1000] loss: 0.846


  5%|▍         | 1000/20019 [05:56<4:37:16,  1.14it/s]

****** Model checkpoint saved at epochs 4 ******


  5%|▌         | 1100/20019 [06:32<1:49:51,  2.87it/s]

[Epoch 3, Batch  1100] loss: 0.888


  6%|▌         | 1200/20019 [07:07<1:50:45,  2.83it/s]

[Epoch 3, Batch  1200] loss: 0.864


  6%|▋         | 1300/20019 [07:42<1:49:03,  2.86it/s]

[Epoch 3, Batch  1300] loss: 0.855


  7%|▋         | 1400/20019 [08:17<1:48:02,  2.87it/s]

[Epoch 3, Batch  1400] loss: 0.860


  7%|▋         | 1500/20019 [08:53<1:47:55,  2.86it/s]

[Epoch 3, Batch  1500] loss: 0.880


  8%|▊         | 1600/20019 [09:28<1:47:08,  2.87it/s]

[Epoch 3, Batch  1600] loss: 0.861


  8%|▊         | 1700/20019 [10:03<1:46:50,  2.86it/s]

[Epoch 3, Batch  1700] loss: 0.859


  9%|▉         | 1800/20019 [10:38<1:46:35,  2.85it/s]

[Epoch 3, Batch  1800] loss: 0.868


  9%|▉         | 1900/20019 [11:13<1:44:49,  2.88it/s]

[Epoch 3, Batch  1900] loss: 0.870


 10%|▉         | 1999/20019 [11:47<1:44:49,  2.87it/s]

[Epoch 3, Batch  2000] loss: 0.878


 10%|▉         | 2000/20019 [11:50<4:59:41,  1.00it/s]

****** Model checkpoint saved at epochs 4 ******


 10%|█         | 2100/20019 [12:26<1:43:44,  2.88it/s]

[Epoch 3, Batch  2100] loss: 0.872


 11%|█         | 2200/20019 [13:01<1:43:34,  2.87it/s]

[Epoch 3, Batch  2200] loss: 0.852


 11%|█▏        | 2300/20019 [13:36<1:42:41,  2.88it/s]

[Epoch 3, Batch  2300] loss: 0.877


 12%|█▏        | 2400/20019 [14:10<1:42:13,  2.87it/s]

[Epoch 3, Batch  2400] loss: 0.831


 12%|█▏        | 2500/20019 [14:45<1:41:35,  2.87it/s]

[Epoch 3, Batch  2500] loss: 0.833


 13%|█▎        | 2600/20019 [15:20<1:41:12,  2.87it/s]

[Epoch 3, Batch  2600] loss: 0.876


 13%|█▎        | 2700/20019 [15:55<1:40:32,  2.87it/s]

[Epoch 3, Batch  2700] loss: 0.883


 14%|█▍        | 2800/20019 [16:30<1:39:52,  2.87it/s]

[Epoch 3, Batch  2800] loss: 0.872


 14%|█▍        | 2900/20019 [17:05<1:39:22,  2.87it/s]

[Epoch 3, Batch  2900] loss: 0.895


 15%|█▍        | 2999/20019 [17:40<1:39:08,  2.86it/s]

[Epoch 3, Batch  3000] loss: 0.887


 15%|█▍        | 3000/20019 [17:42<4:34:36,  1.03it/s]

****** Model checkpoint saved at epochs 4 ******


 15%|█▌        | 3100/20019 [18:18<1:38:02,  2.88it/s]

[Epoch 3, Batch  3100] loss: 0.853


 16%|█▌        | 3200/20019 [18:53<1:37:26,  2.88it/s]

[Epoch 3, Batch  3200] loss: 0.881


 16%|█▋        | 3300/20019 [19:28<1:36:54,  2.88it/s]

[Epoch 3, Batch  3300] loss: 0.888


 17%|█▋        | 3400/20019 [20:03<1:36:47,  2.86it/s]

[Epoch 3, Batch  3400] loss: 0.886


 17%|█▋        | 3500/20019 [20:38<1:36:15,  2.86it/s]

[Epoch 3, Batch  3500] loss: 0.885


 18%|█▊        | 3600/20019 [21:13<1:35:12,  2.87it/s]

[Epoch 3, Batch  3600] loss: 0.880


 18%|█▊        | 3700/20019 [21:48<1:34:36,  2.87it/s]

[Epoch 3, Batch  3700] loss: 0.823


 19%|█▉        | 3800/20019 [22:23<1:34:02,  2.87it/s]

[Epoch 3, Batch  3800] loss: 0.949


 19%|█▉        | 3900/20019 [22:57<1:33:28,  2.87it/s]

[Epoch 3, Batch  3900] loss: 0.873


 20%|█▉        | 3999/20019 [23:32<1:33:20,  2.86it/s]

[Epoch 3, Batch  4000] loss: 0.881


 20%|█▉        | 4000/20019 [23:34<4:19:54,  1.03it/s]

****** Model checkpoint saved at epochs 4 ******


 20%|██        | 4100/20019 [24:10<1:32:20,  2.87it/s]

[Epoch 3, Batch  4100] loss: 0.865


 21%|██        | 4200/20019 [24:45<1:31:35,  2.88it/s]

[Epoch 3, Batch  4200] loss: 0.915


 21%|██▏       | 4300/20019 [25:20<1:31:09,  2.87it/s]

[Epoch 3, Batch  4300] loss: 0.852


 22%|██▏       | 4400/20019 [25:55<1:30:29,  2.88it/s]

[Epoch 3, Batch  4400] loss: 0.874


 22%|██▏       | 4500/20019 [26:30<1:29:47,  2.88it/s]

[Epoch 3, Batch  4500] loss: 0.917


 23%|██▎       | 4600/20019 [27:05<1:29:02,  2.89it/s]

[Epoch 3, Batch  4600] loss: 0.921


 23%|██▎       | 4700/20019 [27:39<1:28:37,  2.88it/s]

[Epoch 3, Batch  4700] loss: 0.897


 24%|██▍       | 4800/20019 [28:14<1:28:01,  2.88it/s]

[Epoch 3, Batch  4800] loss: 0.886


 24%|██▍       | 4900/20019 [28:49<1:27:18,  2.89it/s]

[Epoch 3, Batch  4900] loss: 0.892


 25%|██▍       | 4999/20019 [29:23<1:27:34,  2.86it/s]

[Epoch 3, Batch  5000] loss: 0.916


 25%|██▍       | 5000/20019 [29:26<4:08:50,  1.01it/s]

****** Model checkpoint saved at epochs 4 ******


 25%|██▌       | 5100/20019 [30:02<1:26:40,  2.87it/s]

[Epoch 3, Batch  5100] loss: 0.865


 26%|██▌       | 5200/20019 [30:37<1:25:46,  2.88it/s]

[Epoch 3, Batch  5200] loss: 0.874


 26%|██▋       | 5300/20019 [31:12<1:25:32,  2.87it/s]

[Epoch 3, Batch  5300] loss: 0.928


 27%|██▋       | 5400/20019 [31:47<1:24:34,  2.88it/s]

[Epoch 3, Batch  5400] loss: 0.879


 27%|██▋       | 5500/20019 [32:21<1:24:04,  2.88it/s]

[Epoch 3, Batch  5500] loss: 0.873


 28%|██▊       | 5600/20019 [32:56<1:23:27,  2.88it/s]

[Epoch 3, Batch  5600] loss: 0.870


 28%|██▊       | 5700/20019 [33:31<1:22:54,  2.88it/s]

[Epoch 3, Batch  5700] loss: 0.885


 29%|██▉       | 5800/20019 [34:06<1:22:22,  2.88it/s]

[Epoch 3, Batch  5800] loss: 0.874


 29%|██▉       | 5900/20019 [34:41<1:21:53,  2.87it/s]

[Epoch 3, Batch  5900] loss: 0.880


 30%|██▉       | 5999/20019 [35:15<1:21:27,  2.87it/s]

[Epoch 3, Batch  6000] loss: 0.878


 30%|██▉       | 6000/20019 [35:18<3:54:48,  1.00s/it]

****** Model checkpoint saved at epochs 4 ******


 30%|███       | 6100/20019 [35:54<1:20:35,  2.88it/s]

[Epoch 3, Batch  6100] loss: 0.900


 31%|███       | 6200/20019 [36:29<1:20:31,  2.86it/s]

[Epoch 3, Batch  6200] loss: 0.845


 31%|███▏      | 6300/20019 [37:04<1:19:33,  2.87it/s]

[Epoch 3, Batch  6300] loss: 0.903


 32%|███▏      | 6400/20019 [37:38<1:18:50,  2.88it/s]

[Epoch 3, Batch  6400] loss: 0.844


 32%|███▏      | 6500/20019 [38:13<1:18:28,  2.87it/s]

[Epoch 3, Batch  6500] loss: 0.854


 33%|███▎      | 6600/20019 [38:48<1:17:43,  2.88it/s]

[Epoch 3, Batch  6600] loss: 0.875


 33%|███▎      | 6700/20019 [39:23<1:17:07,  2.88it/s]

[Epoch 3, Batch  6700] loss: 0.841


 34%|███▍      | 6800/20019 [39:58<1:16:35,  2.88it/s]

[Epoch 3, Batch  6800] loss: 0.904


 34%|███▍      | 6900/20019 [40:33<1:15:58,  2.88it/s]

[Epoch 3, Batch  6900] loss: 0.891


 35%|███▍      | 6999/20019 [41:07<1:15:37,  2.87it/s]

[Epoch 3, Batch  7000] loss: 0.885


 35%|███▍      | 7000/20019 [41:10<3:37:35,  1.00s/it]

****** Model checkpoint saved at epochs 4 ******


 35%|███▌      | 7100/20019 [41:46<1:14:49,  2.88it/s]

[Epoch 3, Batch  7100] loss: 0.856


 36%|███▌      | 7200/20019 [42:21<1:14:11,  2.88it/s]

[Epoch 3, Batch  7200] loss: 0.860


 36%|███▋      | 7300/20019 [42:56<1:13:41,  2.88it/s]

[Epoch 3, Batch  7300] loss: 0.877


 37%|███▋      | 7400/20019 [43:31<1:13:14,  2.87it/s]

[Epoch 3, Batch  7400] loss: 0.895


 37%|███▋      | 7500/20019 [44:06<1:12:37,  2.87it/s]

[Epoch 3, Batch  7500] loss: 0.860


 38%|███▊      | 7600/20019 [44:41<1:13:04,  2.83it/s]

[Epoch 3, Batch  7600] loss: 0.860


 38%|███▊      | 7700/20019 [45:16<1:11:30,  2.87it/s]

[Epoch 3, Batch  7700] loss: 0.891


 39%|███▉      | 7800/20019 [45:51<1:10:56,  2.87it/s]

[Epoch 3, Batch  7800] loss: 0.870


 39%|███▉      | 7900/20019 [46:26<1:10:18,  2.87it/s]

[Epoch 3, Batch  7900] loss: 0.860


 40%|███▉      | 7999/20019 [47:01<1:10:01,  2.86it/s]

[Epoch 3, Batch  8000] loss: 0.889


 40%|███▉      | 8000/20019 [47:03<3:15:47,  1.02it/s]

****** Model checkpoint saved at epochs 4 ******


 40%|████      | 8100/20019 [47:39<1:09:01,  2.88it/s]

[Epoch 3, Batch  8100] loss: 0.859


 41%|████      | 8200/20019 [48:15<1:08:17,  2.88it/s]

[Epoch 3, Batch  8200] loss: 0.907


 41%|████▏     | 8300/20019 [48:50<1:07:55,  2.88it/s]

[Epoch 3, Batch  8300] loss: 0.856


 42%|████▏     | 8400/20019 [49:25<1:14:18,  2.61it/s]

[Epoch 3, Batch  8400] loss: 0.897


 42%|████▏     | 8500/20019 [50:00<1:06:36,  2.88it/s]

[Epoch 3, Batch  8500] loss: 0.863


 43%|████▎     | 8600/20019 [50:35<1:06:05,  2.88it/s]

[Epoch 3, Batch  8600] loss: 0.917


 43%|████▎     | 8700/20019 [51:10<1:05:51,  2.86it/s]

[Epoch 3, Batch  8700] loss: 0.863


 44%|████▍     | 8800/20019 [51:45<1:05:03,  2.87it/s]

[Epoch 3, Batch  8800] loss: 0.863


 44%|████▍     | 8900/20019 [52:20<1:04:28,  2.87it/s]

[Epoch 3, Batch  8900] loss: 0.872


 45%|████▍     | 8999/20019 [52:56<1:03:59,  2.87it/s]

[Epoch 3, Batch  9000] loss: 0.900


 45%|████▍     | 9000/20019 [52:58<2:59:41,  1.02it/s]

****** Model checkpoint saved at epochs 4 ******


 45%|████▌     | 9100/20019 [53:35<1:03:11,  2.88it/s]

[Epoch 3, Batch  9100] loss: 0.862


 46%|████▌     | 9200/20019 [54:10<1:02:52,  2.87it/s]

[Epoch 3, Batch  9200] loss: 0.908


 46%|████▋     | 9300/20019 [54:46<1:03:38,  2.81it/s]

[Epoch 3, Batch  9300] loss: 0.908


 47%|████▋     | 9400/20019 [55:21<1:01:52,  2.86it/s]

[Epoch 3, Batch  9400] loss: 0.853


 47%|████▋     | 9500/20019 [55:57<1:01:05,  2.87it/s]

[Epoch 3, Batch  9500] loss: 0.906


 48%|████▊     | 9600/20019 [56:32<1:00:42,  2.86it/s]

[Epoch 3, Batch  9600] loss: 0.847


 48%|████▊     | 9700/20019 [57:07<1:00:19,  2.85it/s]

[Epoch 3, Batch  9700] loss: 0.851


 49%|████▉     | 9800/20019 [57:43<1:01:18,  2.78it/s]

[Epoch 3, Batch  9800] loss: 0.880


 49%|████▉     | 9900/20019 [58:18<59:41,  2.83it/s]  

[Epoch 3, Batch  9900] loss: 0.926


 50%|████▉     | 9999/20019 [58:55<1:06:19,  2.52it/s]

[Epoch 3, Batch 10000] loss: 0.879


 50%|████▉     | 10000/20019 [58:57<2:52:36,  1.03s/it]

****** Model checkpoint saved at epochs 4 ******


 50%|█████     | 10100/20019 [59:34<58:18,  2.83it/s]  

[Epoch 3, Batch 10100] loss: 0.895


 51%|█████     | 10200/20019 [1:00:10<56:41,  2.89it/s]  

[Epoch 3, Batch 10200] loss: 0.907


 51%|█████▏    | 10300/20019 [1:00:46<56:17,  2.88it/s]  

[Epoch 3, Batch 10300] loss: 0.879


 52%|█████▏    | 10400/20019 [1:01:23<55:58,  2.86it/s]  

[Epoch 3, Batch 10400] loss: 0.868


 52%|█████▏    | 10500/20019 [1:01:59<55:36,  2.85it/s]  

[Epoch 3, Batch 10500] loss: 0.886


 53%|█████▎    | 10600/20019 [1:02:36<1:00:55,  2.58it/s]

[Epoch 3, Batch 10600] loss: 0.870


 53%|█████▎    | 10700/20019 [1:03:13<55:06,  2.82it/s]  

[Epoch 3, Batch 10700] loss: 0.869


 54%|█████▍    | 10800/20019 [1:03:50<53:45,  2.86it/s]  

[Epoch 3, Batch 10800] loss: 0.863


 54%|█████▍    | 10900/20019 [1:04:27<53:32,  2.84it/s]  

[Epoch 3, Batch 10900] loss: 0.914


 55%|█████▍    | 10999/20019 [1:05:03<53:42,  2.80it/s]  

[Epoch 3, Batch 11000] loss: 0.867


 55%|█████▍    | 11000/20019 [1:05:06<2:38:48,  1.06s/it]

****** Model checkpoint saved at epochs 4 ******


 55%|█████▌    | 11100/20019 [1:05:45<58:35,  2.54it/s]  

[Epoch 3, Batch 11100] loss: 0.876


 56%|█████▌    | 11200/20019 [1:06:23<1:02:46,  2.34it/s]

[Epoch 3, Batch 11200] loss: 0.860


 56%|█████▋    | 11300/20019 [1:07:01<57:10,  2.54it/s]  

[Epoch 3, Batch 11300] loss: 0.835


 57%|█████▋    | 11400/20019 [1:07:39<56:33,  2.54it/s]  

[Epoch 3, Batch 11400] loss: 0.858


 57%|█████▋    | 11500/20019 [1:08:17<51:44,  2.74it/s]  

[Epoch 3, Batch 11500] loss: 0.879


 58%|█████▊    | 11600/20019 [1:08:55<52:51,  2.65it/s]  

[Epoch 3, Batch 11600] loss: 0.884


 58%|█████▊    | 11700/20019 [1:09:33<49:17,  2.81it/s]  

[Epoch 3, Batch 11700] loss: 0.918


 59%|█████▉    | 11800/20019 [1:10:12<54:50,  2.50it/s]  

[Epoch 3, Batch 11800] loss: 0.825


 59%|█████▉    | 11900/20019 [1:10:51<53:14,  2.54it/s]  

[Epoch 3, Batch 11900] loss: 0.877


 60%|█████▉    | 11999/20019 [1:11:31<52:04,  2.57it/s]  

[Epoch 3, Batch 12000] loss: 0.880


 60%|█████▉    | 12000/20019 [1:11:33<2:25:14,  1.09s/it]

****** Model checkpoint saved at epochs 4 ******


 60%|██████    | 12100/20019 [1:12:14<54:50,  2.41it/s]  

[Epoch 3, Batch 12100] loss: 0.911


 61%|██████    | 12200/20019 [1:12:53<49:41,  2.62it/s]  

[Epoch 3, Batch 12200] loss: 0.859


 61%|██████▏   | 12300/20019 [1:13:33<58:17,  2.21it/s]  

[Epoch 3, Batch 12300] loss: 0.874


 62%|██████▏   | 12400/20019 [1:14:13<52:31,  2.42it/s]  

[Epoch 3, Batch 12400] loss: 0.889


 62%|██████▏   | 12500/20019 [1:14:53<47:58,  2.61it/s]

[Epoch 3, Batch 12500] loss: 0.885


 63%|██████▎   | 12600/20019 [1:15:34<52:02,  2.38it/s]  

[Epoch 3, Batch 12600] loss: 0.904


 63%|██████▎   | 12700/20019 [1:16:15<46:01,  2.65it/s]  

[Epoch 3, Batch 12700] loss: 0.876


 64%|██████▍   | 12800/20019 [1:16:55<48:14,  2.49it/s]

[Epoch 3, Batch 12800] loss: 0.863


 64%|██████▍   | 12900/20019 [1:17:36<47:23,  2.50it/s]

[Epoch 3, Batch 12900] loss: 0.850


 65%|██████▍   | 12999/20019 [1:18:17<46:16,  2.53it/s]  

[Epoch 3, Batch 13000] loss: 0.851


 65%|██████▍   | 13000/20019 [1:18:20<2:01:56,  1.04s/it]

****** Model checkpoint saved at epochs 4 ******


 65%|██████▌   | 13100/20019 [1:19:02<52:32,  2.19it/s]  

[Epoch 3, Batch 13100] loss: 0.880


 66%|██████▌   | 13200/20019 [1:19:43<44:38,  2.55it/s]

[Epoch 3, Batch 13200] loss: 0.873


 66%|██████▋   | 13300/20019 [1:20:25<41:54,  2.67it/s]

[Epoch 3, Batch 13300] loss: 0.864


 67%|██████▋   | 13400/20019 [1:21:07<46:38,  2.37it/s]

[Epoch 3, Batch 13400] loss: 0.889


 67%|██████▋   | 13500/20019 [1:21:49<42:07,  2.58it/s]  

[Epoch 3, Batch 13500] loss: 0.851


 68%|██████▊   | 13600/20019 [1:22:31<44:20,  2.41it/s]

[Epoch 3, Batch 13600] loss: 0.881


 68%|██████▊   | 13700/20019 [1:23:15<43:24,  2.43it/s]

[Epoch 3, Batch 13700] loss: 0.891


 69%|██████▉   | 13800/20019 [1:23:58<40:04,  2.59it/s]

[Epoch 3, Batch 13800] loss: 0.874


 69%|██████▉   | 13900/20019 [1:24:41<41:50,  2.44it/s]

[Epoch 3, Batch 13900] loss: 0.889


 70%|██████▉   | 13999/20019 [1:25:24<1:02:13,  1.61it/s]

[Epoch 3, Batch 14000] loss: 0.851


 70%|██████▉   | 14000/20019 [1:25:27<2:01:53,  1.22s/it]

****** Model checkpoint saved at epochs 4 ******


 70%|███████   | 14100/20019 [1:26:11<40:19,  2.45it/s]  

[Epoch 3, Batch 14100] loss: 0.894


 71%|███████   | 14200/20019 [1:26:56<39:57,  2.43it/s]

[Epoch 3, Batch 14200] loss: 0.871


 71%|███████▏  | 14300/20019 [1:27:40<42:15,  2.26it/s]

[Epoch 3, Batch 14300] loss: 0.866


 72%|███████▏  | 14400/20019 [1:28:26<56:03,  1.67it/s]

[Epoch 3, Batch 14400] loss: 0.886


 72%|███████▏  | 14500/20019 [1:29:11<38:14,  2.41it/s]

[Epoch 3, Batch 14500] loss: 0.897


 73%|███████▎  | 14600/20019 [1:29:56<39:26,  2.29it/s]

[Epoch 3, Batch 14600] loss: 0.882


 73%|███████▎  | 14700/20019 [1:30:42<37:25,  2.37it/s]

[Epoch 3, Batch 14700] loss: 0.896


 74%|███████▍  | 14800/20019 [1:31:28<39:26,  2.21it/s]

[Epoch 3, Batch 14800] loss: 0.902


 74%|███████▍  | 14900/20019 [1:32:15<37:42,  2.26it/s]

[Epoch 3, Batch 14900] loss: 0.875


 75%|███████▍  | 14999/20019 [1:33:02<37:17,  2.24it/s]

[Epoch 3, Batch 15000] loss: 0.887


 75%|███████▍  | 15000/20019 [1:33:05<1:43:52,  1.24s/it]

****** Model checkpoint saved at epochs 4 ******


 75%|███████▌  | 15100/20019 [1:33:53<36:07,  2.27it/s]  

[Epoch 3, Batch 15100] loss: 0.842


 76%|███████▌  | 15200/20019 [1:34:41<38:04,  2.11it/s]

[Epoch 3, Batch 15200] loss: 0.870


 76%|███████▋  | 15300/20019 [1:35:28<35:51,  2.19it/s]

[Epoch 3, Batch 15300] loss: 0.900


 77%|███████▋  | 15400/20019 [1:36:16<38:18,  2.01it/s]

[Epoch 3, Batch 15400] loss: 0.911


 77%|███████▋  | 15500/20019 [1:37:04<37:05,  2.03it/s]

[Epoch 3, Batch 15500] loss: 0.886


 78%|███████▊  | 15600/20019 [1:37:51<37:17,  1.98it/s]

[Epoch 3, Batch 15600] loss: 0.894


 78%|███████▊  | 15700/20019 [1:38:39<33:44,  2.13it/s]

[Epoch 3, Batch 15700] loss: 0.880


 79%|███████▉  | 15800/20019 [1:39:27<33:08,  2.12it/s]

[Epoch 3, Batch 15800] loss: 0.880


 79%|███████▉  | 15900/20019 [1:40:16<33:58,  2.02it/s]

[Epoch 3, Batch 15900] loss: 0.838


 80%|███████▉  | 15999/20019 [1:41:05<35:05,  1.91it/s]

[Epoch 3, Batch 16000] loss: 0.855


 80%|███████▉  | 16000/20019 [1:41:07<1:15:01,  1.12s/it]

****** Model checkpoint saved at epochs 4 ******


 80%|████████  | 16100/20019 [1:41:58<32:07,  2.03it/s]  

[Epoch 3, Batch 16100] loss: 0.864


 81%|████████  | 16200/20019 [1:42:48<34:11,  1.86it/s]

[Epoch 3, Batch 16200] loss: 0.886


 81%|████████▏ | 16300/20019 [1:43:38<31:35,  1.96it/s]

[Epoch 3, Batch 16300] loss: 0.844


 82%|████████▏ | 16400/20019 [1:44:28<32:52,  1.84it/s]

[Epoch 3, Batch 16400] loss: 0.843


 82%|████████▏ | 16500/20019 [1:45:19<31:07,  1.88it/s]

[Epoch 3, Batch 16500] loss: 0.892


 83%|████████▎ | 16600/20019 [1:46:11<32:40,  1.74it/s]

[Epoch 3, Batch 16600] loss: 0.863


 83%|████████▎ | 16700/20019 [1:47:02<28:15,  1.96it/s]

[Epoch 3, Batch 16700] loss: 0.867


 84%|████████▍ | 16800/20019 [1:47:54<27:35,  1.94it/s]

[Epoch 3, Batch 16800] loss: 0.901


 84%|████████▍ | 16900/20019 [1:48:46<26:45,  1.94it/s]

[Epoch 3, Batch 16900] loss: 0.863


 85%|████████▍ | 16999/20019 [1:49:37<25:36,  1.97it/s]

[Epoch 3, Batch 17000] loss: 0.885


 85%|████████▍ | 17000/20019 [1:49:40<59:20,  1.18s/it]

****** Model checkpoint saved at epochs 4 ******


 85%|████████▌ | 17100/20019 [1:50:34<24:23,  1.99it/s]

[Epoch 3, Batch 17100] loss: 0.881


 86%|████████▌ | 17200/20019 [1:51:26<22:40,  2.07it/s]

[Epoch 3, Batch 17200] loss: 0.863


 86%|████████▋ | 17300/20019 [1:52:19<24:43,  1.83it/s]

[Epoch 3, Batch 17300] loss: 0.889


 87%|████████▋ | 17400/20019 [1:53:12<24:40,  1.77it/s]

[Epoch 3, Batch 17400] loss: 0.892


 87%|████████▋ | 17500/20019 [1:54:05<23:47,  1.76it/s]

[Epoch 3, Batch 17500] loss: 0.883


 88%|████████▊ | 17600/20019 [1:54:58<21:10,  1.90it/s]

[Epoch 3, Batch 17600] loss: 0.831


 88%|████████▊ | 17700/20019 [1:55:53<20:56,  1.85it/s]

[Epoch 3, Batch 17700] loss: 0.873


 89%|████████▉ | 17800/20019 [1:56:47<18:55,  1.95it/s]

[Epoch 3, Batch 17800] loss: 0.902


 89%|████████▉ | 17900/20019 [1:57:43<17:44,  1.99it/s]

[Epoch 3, Batch 17900] loss: 0.916


 90%|████████▉ | 17999/20019 [1:58:38<19:08,  1.76it/s]

[Epoch 3, Batch 18000] loss: 0.907


 90%|████████▉ | 18000/20019 [1:58:41<39:05,  1.16s/it]

****** Model checkpoint saved at epochs 4 ******


 90%|█████████ | 18100/20019 [1:59:38<17:02,  1.88it/s]

[Epoch 3, Batch 18100] loss: 0.835


 91%|█████████ | 18200/20019 [2:00:34<18:20,  1.65it/s]

[Epoch 3, Batch 18200] loss: 0.874


 91%|█████████▏| 18300/20019 [2:01:31<18:06,  1.58it/s]

[Epoch 3, Batch 18300] loss: 0.879


 92%|█████████▏| 18400/20019 [2:02:27<16:41,  1.62it/s]

[Epoch 3, Batch 18400] loss: 0.883


 92%|█████████▏| 18500/20019 [2:03:24<15:02,  1.68it/s]

[Epoch 3, Batch 18500] loss: 0.859


 93%|█████████▎| 18600/20019 [2:04:19<13:03,  1.81it/s]

[Epoch 3, Batch 18600] loss: 0.902


 93%|█████████▎| 18700/20019 [2:05:15<12:04,  1.82it/s]

[Epoch 3, Batch 18700] loss: 0.881


 94%|█████████▍| 18800/20019 [2:06:12<10:12,  1.99it/s]

[Epoch 3, Batch 18800] loss: 0.852


 94%|█████████▍| 18900/20019 [2:07:09<10:51,  1.72it/s]

[Epoch 3, Batch 18900] loss: 0.870


 95%|█████████▍| 18999/20019 [2:08:06<09:07,  1.86it/s]

[Epoch 3, Batch 19000] loss: 0.864


 95%|█████████▍| 19000/20019 [2:08:09<21:30,  1.27s/it]

****** Model checkpoint saved at epochs 4 ******


 95%|█████████▌| 19100/20019 [2:09:06<08:20,  1.84it/s]

[Epoch 3, Batch 19100] loss: 0.865


 96%|█████████▌| 19200/20019 [2:10:03<07:09,  1.91it/s]

[Epoch 3, Batch 19200] loss: 0.867


 96%|█████████▋| 19300/20019 [2:11:01<06:47,  1.76it/s]

[Epoch 3, Batch 19300] loss: 0.892


 97%|█████████▋| 19400/20019 [2:11:59<05:29,  1.88it/s]

[Epoch 3, Batch 19400] loss: 0.869


 97%|█████████▋| 19500/20019 [2:12:56<04:36,  1.88it/s]

[Epoch 3, Batch 19500] loss: 0.901


 98%|█████████▊| 19600/20019 [2:13:56<03:47,  1.84it/s]

[Epoch 3, Batch 19600] loss: 0.835


 98%|█████████▊| 19700/20019 [2:14:55<02:46,  1.91it/s]

[Epoch 3, Batch 19700] loss: 0.901


 99%|█████████▉| 19800/20019 [2:15:54<01:55,  1.89it/s]

[Epoch 3, Batch 19800] loss: 0.810


 99%|█████████▉| 19900/20019 [2:16:52<01:06,  1.78it/s]

[Epoch 3, Batch 19900] loss: 0.839


100%|█████████▉| 19999/20019 [2:17:51<00:12,  1.63it/s]

[Epoch 3, Batch 20000] loss: 0.881


100%|█████████▉| 20000/20019 [2:17:54<00:21,  1.13s/it]

****** Model checkpoint saved at epochs 4 ******


100%|██████████| 20019/20019 [2:18:03<00:00,  2.42it/s]
  0%|          | 100/20019 [00:37<1:55:31,  2.87it/s]

[Epoch 4, Batch   100] loss: 0.817


  1%|          | 200/20019 [01:12<1:54:53,  2.88it/s]

[Epoch 4, Batch   200] loss: 0.757


  1%|▏         | 300/20019 [01:47<1:54:17,  2.88it/s]

[Epoch 4, Batch   300] loss: 0.802


  2%|▏         | 400/20019 [02:21<1:53:46,  2.87it/s]

[Epoch 4, Batch   400] loss: 0.757


  2%|▏         | 500/20019 [02:56<1:53:08,  2.88it/s]

[Epoch 4, Batch   500] loss: 0.783


  3%|▎         | 600/20019 [03:31<1:52:20,  2.88it/s]

[Epoch 4, Batch   600] loss: 0.755


  3%|▎         | 700/20019 [04:06<1:51:54,  2.88it/s]

[Epoch 4, Batch   700] loss: 0.747


  4%|▍         | 800/20019 [04:41<1:51:06,  2.88it/s]

[Epoch 4, Batch   800] loss: 0.766


  4%|▍         | 900/20019 [05:16<1:50:43,  2.88it/s]

[Epoch 4, Batch   900] loss: 0.779


  5%|▍         | 999/20019 [05:50<1:50:55,  2.86it/s]

[Epoch 4, Batch  1000] loss: 0.805


  5%|▍         | 1000/20019 [05:53<5:11:01,  1.02it/s]

****** Model checkpoint saved at epochs 5 ******


  5%|▌         | 1100/20019 [06:29<1:49:49,  2.87it/s]

[Epoch 4, Batch  1100] loss: 0.771


  6%|▌         | 1200/20019 [07:03<1:49:27,  2.87it/s]

[Epoch 4, Batch  1200] loss: 0.758


  6%|▋         | 1300/20019 [07:38<1:48:37,  2.87it/s]

[Epoch 4, Batch  1300] loss: 0.784


  7%|▋         | 1400/20019 [08:13<1:47:59,  2.87it/s]

[Epoch 4, Batch  1400] loss: 0.784


  7%|▋         | 1500/20019 [08:48<1:47:38,  2.87it/s]

[Epoch 4, Batch  1500] loss: 0.769


  8%|▊         | 1600/20019 [09:23<1:46:45,  2.88it/s]

[Epoch 4, Batch  1600] loss: 0.780


  8%|▊         | 1700/20019 [09:58<1:46:06,  2.88it/s]

[Epoch 4, Batch  1700] loss: 0.723


  9%|▉         | 1800/20019 [10:33<1:45:27,  2.88it/s]

[Epoch 4, Batch  1800] loss: 0.743


  9%|▉         | 1900/20019 [11:08<1:44:51,  2.88it/s]

[Epoch 4, Batch  1900] loss: 0.781


 10%|▉         | 1999/20019 [11:42<1:44:56,  2.86it/s]

[Epoch 4, Batch  2000] loss: 0.753


 10%|▉         | 2000/20019 [11:45<5:05:01,  1.02s/it]

****** Model checkpoint saved at epochs 5 ******


 10%|█         | 2100/20019 [12:21<1:43:51,  2.88it/s]

[Epoch 4, Batch  2100] loss: 0.776


 11%|█         | 2200/20019 [12:56<1:43:14,  2.88it/s]

[Epoch 4, Batch  2200] loss: 0.768


 11%|█▏        | 2300/20019 [13:31<1:42:46,  2.87it/s]

[Epoch 4, Batch  2300] loss: 0.793


 12%|█▏        | 2400/20019 [14:05<1:42:09,  2.87it/s]

[Epoch 4, Batch  2400] loss: 0.789


 12%|█▏        | 2500/20019 [14:40<1:41:48,  2.87it/s]

[Epoch 4, Batch  2500] loss: 0.785


 13%|█▎        | 2600/20019 [15:15<1:41:22,  2.86it/s]

[Epoch 4, Batch  2600] loss: 0.781


 13%|█▎        | 2700/20019 [15:50<1:40:22,  2.88it/s]

[Epoch 4, Batch  2700] loss: 0.775


 14%|█▍        | 2800/20019 [16:25<1:39:51,  2.87it/s]

[Epoch 4, Batch  2800] loss: 0.774


 14%|█▍        | 2900/20019 [17:00<1:39:20,  2.87it/s]

[Epoch 4, Batch  2900] loss: 0.779


 15%|█▍        | 2999/20019 [17:35<1:39:12,  2.86it/s]

[Epoch 4, Batch  3000] loss: 0.760


 15%|█▍        | 3000/20019 [17:37<4:38:08,  1.02it/s]

****** Model checkpoint saved at epochs 5 ******


 15%|█▌        | 3100/20019 [18:13<1:38:01,  2.88it/s]

[Epoch 4, Batch  3100] loss: 0.778


 16%|█▌        | 3200/20019 [18:48<1:37:20,  2.88it/s]

[Epoch 4, Batch  3200] loss: 0.770


 16%|█▋        | 3300/20019 [19:23<1:37:14,  2.87it/s]

[Epoch 4, Batch  3300] loss: 0.789


 17%|█▋        | 3400/20019 [19:57<1:36:14,  2.88it/s]

[Epoch 4, Batch  3400] loss: 0.797


 17%|█▋        | 3500/20019 [20:32<1:35:44,  2.88it/s]

[Epoch 4, Batch  3500] loss: 0.765


 18%|█▊        | 3600/20019 [21:07<1:35:04,  2.88it/s]

[Epoch 4, Batch  3600] loss: 0.776


 18%|█▊        | 3700/20019 [21:42<1:34:42,  2.87it/s]

[Epoch 4, Batch  3700] loss: 0.746


 19%|█▉        | 3800/20019 [22:17<1:34:23,  2.86it/s]

[Epoch 4, Batch  3800] loss: 0.773


 19%|█▉        | 3900/20019 [22:52<1:33:24,  2.88it/s]

[Epoch 4, Batch  3900] loss: 0.753


 20%|█▉        | 3999/20019 [23:26<1:33:14,  2.86it/s]

[Epoch 4, Batch  4000] loss: 0.781


 20%|█▉        | 4000/20019 [23:29<4:18:12,  1.03it/s]

****** Model checkpoint saved at epochs 5 ******


 20%|██        | 4100/20019 [24:05<1:32:12,  2.88it/s]

[Epoch 4, Batch  4100] loss: 0.751


 21%|██        | 4200/20019 [24:40<1:31:38,  2.88it/s]

[Epoch 4, Batch  4200] loss: 0.788


 21%|██▏       | 4300/20019 [25:15<1:31:00,  2.88it/s]

[Epoch 4, Batch  4300] loss: 0.769


 22%|██▏       | 4400/20019 [25:50<1:30:25,  2.88it/s]

[Epoch 4, Batch  4400] loss: 0.734


 22%|██▏       | 4500/20019 [26:25<1:29:56,  2.88it/s]

[Epoch 4, Batch  4500] loss: 0.759


 23%|██▎       | 4600/20019 [26:59<1:29:23,  2.87it/s]

[Epoch 4, Batch  4600] loss: 0.786


 23%|██▎       | 4700/20019 [27:34<1:28:54,  2.87it/s]

[Epoch 4, Batch  4700] loss: 0.776


 24%|██▍       | 4800/20019 [28:09<1:28:17,  2.87it/s]

[Epoch 4, Batch  4800] loss: 0.755


 24%|██▍       | 4900/20019 [28:44<1:27:35,  2.88it/s]

[Epoch 4, Batch  4900] loss: 0.727


 25%|██▍       | 4999/20019 [29:19<1:27:39,  2.86it/s]

[Epoch 4, Batch  5000] loss: 0.763


 25%|██▍       | 5000/20019 [29:22<4:48:40,  1.15s/it]

****** Model checkpoint saved at epochs 5 ******


 25%|██▌       | 5100/20019 [29:58<1:26:38,  2.87it/s]

[Epoch 4, Batch  5100] loss: 0.764


 26%|██▌       | 5200/20019 [30:33<1:26:24,  2.86it/s]

[Epoch 4, Batch  5200] loss: 0.779


 26%|██▋       | 5300/20019 [31:08<1:25:18,  2.88it/s]

[Epoch 4, Batch  5300] loss: 0.762


 27%|██▋       | 5400/20019 [31:43<1:24:59,  2.87it/s]

[Epoch 4, Batch  5400] loss: 0.751


 27%|██▋       | 5500/20019 [32:18<1:24:16,  2.87it/s]

[Epoch 4, Batch  5500] loss: 0.791


 28%|██▊       | 5600/20019 [32:53<1:23:36,  2.87it/s]

[Epoch 4, Batch  5600] loss: 0.793


 28%|██▊       | 5700/20019 [33:27<1:23:04,  2.87it/s]

[Epoch 4, Batch  5700] loss: 0.764


 29%|██▉       | 5800/20019 [34:02<1:22:22,  2.88it/s]

[Epoch 4, Batch  5800] loss: 0.785


 29%|██▉       | 5900/20019 [34:37<1:21:34,  2.88it/s]

[Epoch 4, Batch  5900] loss: 0.796


 30%|██▉       | 5999/20019 [35:12<1:21:57,  2.85it/s]

[Epoch 4, Batch  6000] loss: 0.759


 30%|██▉       | 6000/20019 [35:14<4:06:19,  1.05s/it]

****** Model checkpoint saved at epochs 5 ******


 30%|███       | 6100/20019 [35:51<1:20:48,  2.87it/s]

[Epoch 4, Batch  6100] loss: 0.749


 31%|███       | 6200/20019 [36:26<1:20:09,  2.87it/s]

[Epoch 4, Batch  6200] loss: 0.766


 31%|███▏      | 6300/20019 [37:01<1:19:32,  2.87it/s]

[Epoch 4, Batch  6300] loss: 0.758


 32%|███▏      | 6400/20019 [37:36<1:18:57,  2.87it/s]

[Epoch 4, Batch  6400] loss: 0.745


 32%|███▏      | 6500/20019 [38:11<1:18:26,  2.87it/s]

[Epoch 4, Batch  6500] loss: 0.766


 33%|███▎      | 6600/20019 [38:46<1:17:58,  2.87it/s]

[Epoch 4, Batch  6600] loss: 0.755


 33%|███▎      | 6700/20019 [39:21<1:17:11,  2.88it/s]

[Epoch 4, Batch  6700] loss: 0.775


 34%|███▍      | 6800/20019 [39:56<1:17:10,  2.85it/s]

[Epoch 4, Batch  6800] loss: 0.748


 34%|███▍      | 6900/20019 [40:31<1:16:10,  2.87it/s]

[Epoch 4, Batch  6900] loss: 0.768


 35%|███▍      | 6999/20019 [41:05<1:16:20,  2.84it/s]

[Epoch 4, Batch  7000] loss: 0.793


 35%|███▍      | 7000/20019 [41:08<3:51:26,  1.07s/it]

****** Model checkpoint saved at epochs 5 ******


 35%|███▌      | 7100/20019 [41:45<1:15:04,  2.87it/s]

[Epoch 4, Batch  7100] loss: 0.762


 36%|███▌      | 7200/20019 [42:21<1:14:23,  2.87it/s]

[Epoch 4, Batch  7200] loss: 0.779


 36%|███▋      | 7300/20019 [42:56<1:13:56,  2.87it/s]

[Epoch 4, Batch  7300] loss: 0.789


 37%|███▋      | 7400/20019 [43:31<1:13:14,  2.87it/s]

[Epoch 4, Batch  7400] loss: 0.768


 37%|███▋      | 7500/20019 [44:06<1:12:27,  2.88it/s]

[Epoch 4, Batch  7500] loss: 0.771


 38%|███▊      | 7600/20019 [44:41<1:11:58,  2.88it/s]

[Epoch 4, Batch  7600] loss: 0.763


 38%|███▊      | 7700/20019 [45:17<1:11:17,  2.88it/s]

[Epoch 4, Batch  7700] loss: 0.796


 39%|███▉      | 7800/20019 [45:52<1:10:48,  2.88it/s]

[Epoch 4, Batch  7800] loss: 0.762


 39%|███▉      | 7900/20019 [46:27<1:10:05,  2.88it/s]

[Epoch 4, Batch  7900] loss: 0.776


 40%|███▉      | 7999/20019 [47:02<1:13:36,  2.72it/s]

[Epoch 4, Batch  8000] loss: 0.786


 40%|███▉      | 8000/20019 [47:05<3:35:21,  1.08s/it]

****** Model checkpoint saved at epochs 5 ******


 40%|████      | 8100/20019 [47:42<1:14:29,  2.67it/s]

[Epoch 4, Batch  8100] loss: 0.757


 41%|████      | 8200/20019 [48:18<1:09:04,  2.85it/s]

[Epoch 4, Batch  8200] loss: 0.770


 41%|████▏     | 8300/20019 [48:55<1:14:23,  2.63it/s]

[Epoch 4, Batch  8300] loss: 0.771


 42%|████▏     | 8400/20019 [49:31<1:08:12,  2.84it/s]

[Epoch 4, Batch  8400] loss: 0.756


 42%|████▏     | 8500/20019 [50:08<1:06:58,  2.87it/s]

[Epoch 4, Batch  8500] loss: 0.766


 43%|████▎     | 8600/20019 [50:45<1:09:39,  2.73it/s]

[Epoch 4, Batch  8600] loss: 0.796


 43%|████▎     | 8700/20019 [51:21<1:15:29,  2.50it/s]

[Epoch 4, Batch  8700] loss: 0.764


 44%|████▍     | 8800/20019 [51:59<1:08:20,  2.74it/s]

[Epoch 4, Batch  8800] loss: 0.750


 44%|████▍     | 8900/20019 [52:36<1:06:29,  2.79it/s]

[Epoch 4, Batch  8900] loss: 0.771


 45%|████▍     | 8999/20019 [53:12<1:06:48,  2.75it/s]

[Epoch 4, Batch  9000] loss: 0.755


 45%|████▍     | 9000/20019 [53:14<3:11:45,  1.04s/it]

****** Model checkpoint saved at epochs 5 ******


 45%|████▌     | 9100/20019 [53:53<1:07:04,  2.71it/s]

[Epoch 4, Batch  9100] loss: 0.777


 46%|████▌     | 9200/20019 [54:30<1:07:54,  2.66it/s]

[Epoch 4, Batch  9200] loss: 0.740


 46%|████▋     | 9300/20019 [55:07<1:04:06,  2.79it/s]

[Epoch 4, Batch  9300] loss: 0.740


 47%|████▋     | 9400/20019 [55:44<1:04:41,  2.74it/s]

[Epoch 4, Batch  9400] loss: 0.733


 47%|████▋     | 9500/20019 [56:21<1:05:25,  2.68it/s]

[Epoch 4, Batch  9500] loss: 0.765


 48%|████▊     | 9600/20019 [56:58<1:04:14,  2.70it/s]

[Epoch 4, Batch  9600] loss: 0.747


 48%|████▊     | 9700/20019 [57:36<1:05:29,  2.63it/s]

[Epoch 4, Batch  9700] loss: 0.756


 49%|████▉     | 9800/20019 [58:14<1:00:56,  2.80it/s]

[Epoch 4, Batch  9800] loss: 0.759


 49%|████▉     | 9900/20019 [58:52<1:06:18,  2.54it/s]

[Epoch 4, Batch  9900] loss: 0.762


 50%|████▉     | 9999/20019 [59:29<1:03:24,  2.63it/s]

[Epoch 4, Batch 10000] loss: 0.752


 50%|████▉     | 10000/20019 [59:32<2:56:32,  1.06s/it]

****** Model checkpoint saved at epochs 5 ******


 50%|█████     | 10100/20019 [1:00:12<59:16,  2.79it/s]  

[Epoch 4, Batch 10100] loss: 0.773


 51%|█████     | 10200/20019 [1:00:49<1:02:54,  2.60it/s]

[Epoch 4, Batch 10200] loss: 0.764


 51%|█████▏    | 10300/20019 [1:01:28<59:58,  2.70it/s]  

[Epoch 4, Batch 10300] loss: 0.775


 52%|█████▏    | 10400/20019 [1:02:08<1:02:11,  2.58it/s]

[Epoch 4, Batch 10400] loss: 0.773


 52%|█████▏    | 10500/20019 [1:02:47<1:06:52,  2.37it/s]

[Epoch 4, Batch 10500] loss: 0.796


 53%|█████▎    | 10600/20019 [1:03:25<55:20,  2.84it/s]  

[Epoch 4, Batch 10600] loss: 0.764


 53%|█████▎    | 10700/20019 [1:04:04<59:28,  2.61it/s]  

[Epoch 4, Batch 10700] loss: 0.771


 54%|█████▍    | 10800/20019 [1:04:43<1:08:35,  2.24it/s]

[Epoch 4, Batch 10800] loss: 0.769


 54%|█████▍    | 10900/20019 [1:05:22<58:06,  2.62it/s]  

[Epoch 4, Batch 10900] loss: 0.765


 55%|█████▍    | 10999/20019 [1:06:01<52:38,  2.86it/s]  

[Epoch 4, Batch 11000] loss: 0.742


 55%|█████▍    | 11000/20019 [1:06:04<2:33:46,  1.02s/it]

****** Model checkpoint saved at epochs 5 ******


 55%|█████▌    | 11100/20019 [1:06:45<54:55,  2.71it/s]  

[Epoch 4, Batch 11100] loss: 0.800


 56%|█████▌    | 11200/20019 [1:07:26<1:02:26,  2.35it/s]

[Epoch 4, Batch 11200] loss: 0.735


 56%|█████▋    | 11300/20019 [1:08:06<55:15,  2.63it/s]  

[Epoch 4, Batch 11300] loss: 0.780


 57%|█████▋    | 11400/20019 [1:08:47<58:29,  2.46it/s]  

[Epoch 4, Batch 11400] loss: 0.751


 57%|█████▋    | 11500/20019 [1:09:27<54:24,  2.61it/s]  

[Epoch 4, Batch 11500] loss: 0.782


 58%|█████▊    | 11600/20019 [1:10:06<55:48,  2.51it/s]  

[Epoch 4, Batch 11600] loss: 0.749


 58%|█████▊    | 11700/20019 [1:10:47<55:02,  2.52it/s]  

[Epoch 4, Batch 11700] loss: 0.730


 59%|█████▉    | 11800/20019 [1:11:27<54:51,  2.50it/s]  

[Epoch 4, Batch 11800] loss: 0.783


 59%|█████▉    | 11900/20019 [1:12:09<53:50,  2.51it/s]  

[Epoch 4, Batch 11900] loss: 0.767


 60%|█████▉    | 11999/20019 [1:12:50<52:38,  2.54it/s]  

[Epoch 4, Batch 12000] loss: 0.762


 60%|█████▉    | 12000/20019 [1:12:52<2:33:48,  1.15s/it]

****** Model checkpoint saved at epochs 5 ******


 60%|██████    | 12100/20019 [1:13:35<49:58,  2.64it/s]  

[Epoch 4, Batch 12100] loss: 0.760


 61%|██████    | 12200/20019 [1:14:18<1:04:05,  2.03it/s]

[Epoch 4, Batch 12200] loss: 0.761


 61%|██████▏   | 12300/20019 [1:15:00<54:39,  2.35it/s]  

[Epoch 4, Batch 12300] loss: 0.745


 62%|██████▏   | 12400/20019 [1:15:42<52:23,  2.42it/s]  

[Epoch 4, Batch 12400] loss: 0.795


 62%|██████▏   | 12500/20019 [1:16:25<52:11,  2.40it/s]  

[Epoch 4, Batch 12500] loss: 0.767


 63%|██████▎   | 12600/20019 [1:17:07<52:20,  2.36it/s]  

[Epoch 4, Batch 12600] loss: 0.751


 63%|██████▎   | 12700/20019 [1:17:51<55:34,  2.20it/s]  

[Epoch 4, Batch 12700] loss: 0.791


 64%|██████▍   | 12800/20019 [1:18:33<54:53,  2.19it/s]

[Epoch 4, Batch 12800] loss: 0.785


 64%|██████▍   | 12900/20019 [1:19:15<54:06,  2.19it/s]

[Epoch 4, Batch 12900] loss: 0.743


 65%|██████▍   | 12999/20019 [1:19:57<55:24,  2.11it/s]

[Epoch 4, Batch 13000] loss: 0.761


 65%|██████▍   | 13000/20019 [1:19:59<2:09:21,  1.11s/it]

****** Model checkpoint saved at epochs 5 ******


 65%|██████▌   | 13100/20019 [1:20:43<49:19,  2.34it/s]  

[Epoch 4, Batch 13100] loss: 0.756


 66%|██████▌   | 13200/20019 [1:21:27<1:00:59,  1.86it/s]

[Epoch 4, Batch 13200] loss: 0.761


 66%|██████▋   | 13300/20019 [1:22:10<48:24,  2.31it/s]  

[Epoch 4, Batch 13300] loss: 0.754


 67%|██████▋   | 13400/20019 [1:22:52<46:36,  2.37it/s]

[Epoch 4, Batch 13400] loss: 0.737


 67%|██████▋   | 13500/20019 [1:23:36<41:22,  2.63it/s]  

[Epoch 4, Batch 13500] loss: 0.741


 68%|██████▊   | 13600/20019 [1:24:20<44:59,  2.38it/s]

[Epoch 4, Batch 13600] loss: 0.784


 68%|██████▊   | 13700/20019 [1:25:04<44:45,  2.35it/s]

[Epoch 4, Batch 13700] loss: 0.760


 69%|██████▉   | 13800/20019 [1:25:47<46:40,  2.22it/s]

[Epoch 4, Batch 13800] loss: 0.753


 69%|██████▉   | 13900/20019 [1:26:33<49:07,  2.08it/s]  

[Epoch 4, Batch 13900] loss: 0.757


 70%|██████▉   | 13999/20019 [1:27:17<42:55,  2.34it/s]

[Epoch 4, Batch 14000] loss: 0.776


 70%|██████▉   | 14000/20019 [1:27:20<1:49:10,  1.09s/it]

****** Model checkpoint saved at epochs 5 ******


 70%|███████   | 14100/20019 [1:28:07<46:23,  2.13it/s]  

[Epoch 4, Batch 14100] loss: 0.788


 71%|███████   | 14200/20019 [1:28:52<45:34,  2.13it/s]

[Epoch 4, Batch 14200] loss: 0.783


 71%|███████▏  | 14300/20019 [1:29:37<49:39,  1.92it/s]

[Epoch 4, Batch 14300] loss: 0.748


 72%|███████▏  | 14400/20019 [1:30:23<39:30,  2.37it/s]

[Epoch 4, Batch 14400] loss: 0.745


 72%|███████▏  | 14500/20019 [1:31:08<41:46,  2.20it/s]

[Epoch 4, Batch 14500] loss: 0.784


 73%|███████▎  | 14600/20019 [1:31:54<43:54,  2.06it/s]

[Epoch 4, Batch 14600] loss: 0.758


 73%|███████▎  | 14700/20019 [1:32:40<42:22,  2.09it/s]

[Epoch 4, Batch 14700] loss: 0.775


 74%|███████▍  | 14800/20019 [1:33:25<39:54,  2.18it/s]

[Epoch 4, Batch 14800] loss: 0.790


 74%|███████▍  | 14900/20019 [1:34:10<37:20,  2.28it/s]

[Epoch 4, Batch 14900] loss: 0.782


 75%|███████▍  | 14999/20019 [1:34:55<34:27,  2.43it/s]

[Epoch 4, Batch 15000] loss: 0.744


 75%|███████▍  | 15000/20019 [1:34:58<1:40:25,  1.20s/it]

****** Model checkpoint saved at epochs 5 ******


 75%|███████▌  | 15100/20019 [1:35:45<35:01,  2.34it/s]  

[Epoch 4, Batch 15100] loss: 0.766


 76%|███████▌  | 15200/20019 [1:36:31<36:52,  2.18it/s]

[Epoch 4, Batch 15200] loss: 0.781


 76%|███████▋  | 15300/20019 [1:37:16<35:48,  2.20it/s]

[Epoch 4, Batch 15300] loss: 0.787


 77%|███████▋  | 15400/20019 [1:38:02<33:03,  2.33it/s]

[Epoch 4, Batch 15400] loss: 0.752


 77%|███████▋  | 15500/20019 [1:38:49<33:11,  2.27it/s]

[Epoch 4, Batch 15500] loss: 0.774


 78%|███████▊  | 15600/20019 [1:39:35<33:51,  2.18it/s]

[Epoch 4, Batch 15600] loss: 0.768


 78%|███████▊  | 15700/20019 [1:40:22<30:22,  2.37it/s]

[Epoch 4, Batch 15700] loss: 0.776


 79%|███████▉  | 15800/20019 [1:41:08<33:54,  2.07it/s]

[Epoch 4, Batch 15800] loss: 0.770


 79%|███████▉  | 15900/20019 [1:41:54<30:59,  2.21it/s]

[Epoch 4, Batch 15900] loss: 0.780


 80%|███████▉  | 15999/20019 [1:42:40<28:02,  2.39it/s]

[Epoch 4, Batch 16000] loss: 0.801


 80%|███████▉  | 16000/20019 [1:42:43<1:17:50,  1.16s/it]

****** Model checkpoint saved at epochs 5 ******


 80%|████████  | 16100/20019 [1:43:31<27:35,  2.37it/s]  

[Epoch 4, Batch 16100] loss: 0.766


 81%|████████  | 16200/20019 [1:44:19<28:59,  2.20it/s]

[Epoch 4, Batch 16200] loss: 0.772


 81%|████████▏ | 16300/20019 [1:45:06<27:17,  2.27it/s]

[Epoch 4, Batch 16300] loss: 0.793


 82%|████████▏ | 16400/20019 [1:45:54<27:57,  2.16it/s]

[Epoch 4, Batch 16400] loss: 0.742


 82%|████████▏ | 16500/20019 [1:46:42<30:50,  1.90it/s]

[Epoch 4, Batch 16500] loss: 0.801


 83%|████████▎ | 16600/20019 [1:47:30<26:02,  2.19it/s]

[Epoch 4, Batch 16600] loss: 0.767


 83%|████████▎ | 16700/20019 [1:48:18<26:10,  2.11it/s]

[Epoch 4, Batch 16700] loss: 0.749


 84%|████████▍ | 16800/20019 [1:49:05<24:37,  2.18it/s]

[Epoch 4, Batch 16800] loss: 0.774


 84%|████████▍ | 16900/20019 [1:49:54<25:20,  2.05it/s]

[Epoch 4, Batch 16900] loss: 0.774


 85%|████████▍ | 16999/20019 [1:50:43<24:59,  2.01it/s]

[Epoch 4, Batch 17000] loss: 0.790


 85%|████████▍ | 17000/20019 [1:50:46<56:39,  1.13s/it]

****** Model checkpoint saved at epochs 5 ******


 85%|████████▌ | 17100/20019 [1:51:37<23:35,  2.06it/s]

[Epoch 4, Batch 17100] loss: 0.775


 86%|████████▌ | 17200/20019 [1:52:27<22:22,  2.10it/s]

[Epoch 4, Batch 17200] loss: 0.764


 86%|████████▋ | 17300/20019 [1:53:17<19:47,  2.29it/s]

[Epoch 4, Batch 17300] loss: 0.751


 87%|████████▋ | 17400/20019 [1:54:07<22:20,  1.95it/s]

[Epoch 4, Batch 17400] loss: 0.810


 87%|████████▋ | 17500/20019 [1:54:58<23:11,  1.81it/s]

[Epoch 4, Batch 17500] loss: 0.790


 88%|████████▊ | 17600/20019 [1:55:48<21:44,  1.85it/s]

[Epoch 4, Batch 17600] loss: 0.789


 88%|████████▊ | 17700/20019 [1:56:38<21:35,  1.79it/s]

[Epoch 4, Batch 17700] loss: 0.770


 89%|████████▉ | 17800/20019 [1:57:30<17:52,  2.07it/s]

[Epoch 4, Batch 17800] loss: 0.768


 89%|████████▉ | 17900/20019 [1:58:22<19:37,  1.80it/s]

[Epoch 4, Batch 17900] loss: 0.746


 90%|████████▉ | 17999/20019 [1:59:11<15:55,  2.11it/s]

[Epoch 4, Batch 18000] loss: 0.778


 90%|████████▉ | 18000/20019 [1:59:15<41:52,  1.24s/it]

****** Model checkpoint saved at epochs 5 ******


 90%|█████████ | 18100/20019 [2:00:08<17:58,  1.78it/s]

[Epoch 4, Batch 18100] loss: 0.780


 91%|█████████ | 18200/20019 [2:00:59<15:48,  1.92it/s]

[Epoch 4, Batch 18200] loss: 0.755


 91%|█████████▏| 18300/20019 [2:01:51<15:24,  1.86it/s]

[Epoch 4, Batch 18300] loss: 0.769


 92%|█████████▏| 18400/20019 [2:02:42<12:53,  2.09it/s]

[Epoch 4, Batch 18400] loss: 0.756


 92%|█████████▏| 18500/20019 [2:03:34<13:55,  1.82it/s]

[Epoch 4, Batch 18500] loss: 0.750


 93%|█████████▎| 18600/20019 [2:04:26<12:38,  1.87it/s]

[Epoch 4, Batch 18600] loss: 0.765


 93%|█████████▎| 18700/20019 [2:05:19<13:11,  1.67it/s]

[Epoch 4, Batch 18700] loss: 0.801


 94%|█████████▍| 18800/20019 [2:06:11<10:41,  1.90it/s]

[Epoch 4, Batch 18800] loss: 0.773


 94%|█████████▍| 18900/20019 [2:07:03<09:08,  2.04it/s]

[Epoch 4, Batch 18900] loss: 0.801


 95%|█████████▍| 18999/20019 [2:07:56<08:31,  1.99it/s]

[Epoch 4, Batch 19000] loss: 0.761


 95%|█████████▍| 19000/20019 [2:08:00<22:09,  1.30s/it]

****** Model checkpoint saved at epochs 5 ******


 95%|█████████▌| 19100/20019 [2:08:54<07:30,  2.04it/s]

[Epoch 4, Batch 19100] loss: 0.757


 96%|█████████▌| 19200/20019 [2:09:47<08:19,  1.64it/s]

[Epoch 4, Batch 19200] loss: 0.763


 96%|█████████▋| 19300/20019 [2:10:41<05:56,  2.02it/s]

[Epoch 4, Batch 19300] loss: 0.783


 97%|█████████▋| 19400/20019 [2:11:36<04:43,  2.18it/s]

[Epoch 4, Batch 19400] loss: 0.779


 97%|█████████▋| 19500/20019 [2:12:30<04:32,  1.91it/s]

[Epoch 4, Batch 19500] loss: 0.794


 98%|█████████▊| 19600/20019 [2:13:26<03:42,  1.88it/s]

[Epoch 4, Batch 19600] loss: 0.770


 98%|█████████▊| 19700/20019 [2:14:22<02:38,  2.01it/s]

[Epoch 4, Batch 19700] loss: 0.762


 99%|█████████▉| 19800/20019 [2:15:18<01:50,  1.99it/s]

[Epoch 4, Batch 19800] loss: 0.779


 99%|█████████▉| 19900/20019 [2:16:14<01:00,  1.96it/s]

[Epoch 4, Batch 19900] loss: 0.789


100%|█████████▉| 19999/20019 [2:17:11<00:12,  1.66it/s]

[Epoch 4, Batch 20000] loss: 0.743


100%|█████████▉| 20000/20019 [2:17:14<00:24,  1.27s/it]

****** Model checkpoint saved at epochs 5 ******


100%|██████████| 20019/20019 [2:17:23<00:00,  2.43it/s]
  0%|          | 100/20019 [00:36<1:55:37,  2.87it/s]

[Epoch 5, Batch   100] loss: 0.696


  1%|          | 200/20019 [01:11<1:55:20,  2.86it/s]

[Epoch 5, Batch   200] loss: 0.717


  1%|▏         | 300/20019 [01:46<1:54:43,  2.86it/s]

[Epoch 5, Batch   300] loss: 0.705


  2%|▏         | 400/20019 [02:21<1:53:50,  2.87it/s]

[Epoch 5, Batch   400] loss: 0.682


  2%|▏         | 500/20019 [02:55<1:53:23,  2.87it/s]

[Epoch 5, Batch   500] loss: 0.705


  3%|▎         | 600/20019 [03:30<1:52:48,  2.87it/s]

[Epoch 5, Batch   600] loss: 0.670


  3%|▎         | 700/20019 [04:05<1:51:53,  2.88it/s]

[Epoch 5, Batch   700] loss: 0.660


  4%|▍         | 800/20019 [04:40<1:51:38,  2.87it/s]

[Epoch 5, Batch   800] loss: 0.710


  4%|▍         | 900/20019 [05:15<1:51:09,  2.87it/s]

[Epoch 5, Batch   900] loss: 0.696


  5%|▍         | 999/20019 [05:50<1:50:48,  2.86it/s]

[Epoch 5, Batch  1000] loss: 0.701


  5%|▍         | 1000/20019 [05:52<4:53:43,  1.08it/s]

****** Model checkpoint saved at epochs 6 ******


  5%|▌         | 1100/20019 [06:28<1:49:46,  2.87it/s]

[Epoch 5, Batch  1100] loss: 0.690


  6%|▌         | 1200/20019 [07:03<1:49:02,  2.88it/s]

[Epoch 5, Batch  1200] loss: 0.695


  6%|▋         | 1300/20019 [07:38<1:48:45,  2.87it/s]

[Epoch 5, Batch  1300] loss: 0.672


  7%|▋         | 1400/20019 [08:13<1:48:23,  2.86it/s]

[Epoch 5, Batch  1400] loss: 0.663


  7%|▋         | 1500/20019 [08:48<1:47:37,  2.87it/s]

[Epoch 5, Batch  1500] loss: 0.673


  8%|▊         | 1600/20019 [09:23<1:46:44,  2.88it/s]

[Epoch 5, Batch  1600] loss: 0.651


  8%|▊         | 1700/20019 [09:57<1:46:02,  2.88it/s]

[Epoch 5, Batch  1700] loss: 0.687


  9%|▉         | 1800/20019 [10:32<1:45:24,  2.88it/s]

[Epoch 5, Batch  1800] loss: 0.721


  9%|▉         | 1900/20019 [11:07<1:45:38,  2.86it/s]

[Epoch 5, Batch  1900] loss: 0.657


 10%|▉         | 1999/20019 [11:42<1:45:02,  2.86it/s]

[Epoch 5, Batch  2000] loss: 0.688


 10%|▉         | 2000/20019 [11:44<5:05:02,  1.02s/it]

****** Model checkpoint saved at epochs 6 ******


 10%|█         | 2100/20019 [12:20<1:44:23,  2.86it/s]

[Epoch 5, Batch  2100] loss: 0.679


 11%|█         | 2200/20019 [12:55<1:43:44,  2.86it/s]

[Epoch 5, Batch  2200] loss: 0.657


 11%|█▏        | 2300/20019 [13:30<1:43:21,  2.86it/s]

[Epoch 5, Batch  2300] loss: 0.677


 12%|█▏        | 2400/20019 [14:05<1:42:20,  2.87it/s]

[Epoch 5, Batch  2400] loss: 0.687


 12%|█▏        | 2500/20019 [14:40<1:41:46,  2.87it/s]

[Epoch 5, Batch  2500] loss: 0.669


 13%|█▎        | 2600/20019 [15:15<1:41:00,  2.87it/s]

[Epoch 5, Batch  2600] loss: 0.650


 13%|█▎        | 2700/20019 [15:50<1:40:32,  2.87it/s]

[Epoch 5, Batch  2700] loss: 0.644


 14%|█▍        | 2800/20019 [16:25<1:39:51,  2.87it/s]

[Epoch 5, Batch  2800] loss: 0.643


 14%|█▍        | 2900/20019 [17:00<1:39:36,  2.86it/s]

[Epoch 5, Batch  2900] loss: 0.693


 15%|█▍        | 2999/20019 [17:34<1:39:10,  2.86it/s]

[Epoch 5, Batch  3000] loss: 0.664


 15%|█▍        | 3000/20019 [17:37<4:49:56,  1.02s/it]

****** Model checkpoint saved at epochs 6 ******


 15%|█▌        | 3100/20019 [18:13<1:38:20,  2.87it/s]

[Epoch 5, Batch  3100] loss: 0.678


 16%|█▌        | 3200/20019 [18:48<1:38:09,  2.86it/s]

[Epoch 5, Batch  3200] loss: 0.695


 16%|█▋        | 3300/20019 [19:23<1:37:30,  2.86it/s]

[Epoch 5, Batch  3300] loss: 0.669


 17%|█▋        | 3400/20019 [19:58<1:36:22,  2.87it/s]

[Epoch 5, Batch  3400] loss: 0.681


 17%|█▋        | 3500/20019 [20:33<1:35:46,  2.87it/s]

[Epoch 5, Batch  3500] loss: 0.684


 18%|█▊        | 3600/20019 [21:08<1:35:14,  2.87it/s]

[Epoch 5, Batch  3600] loss: 0.694


 18%|█▊        | 3700/20019 [21:43<1:34:31,  2.88it/s]

[Epoch 5, Batch  3700] loss: 0.641


 19%|█▉        | 3800/20019 [22:17<1:34:01,  2.87it/s]

[Epoch 5, Batch  3800] loss: 0.671


 19%|█▉        | 3900/20019 [22:52<1:33:28,  2.87it/s]

[Epoch 5, Batch  3900] loss: 0.678


 20%|█▉        | 3999/20019 [23:27<1:33:36,  2.85it/s]

[Epoch 5, Batch  4000] loss: 0.650


 20%|█▉        | 4000/20019 [23:30<4:37:02,  1.04s/it]

****** Model checkpoint saved at epochs 6 ******


 20%|██        | 4100/20019 [24:06<1:32:26,  2.87it/s]

[Epoch 5, Batch  4100] loss: 0.658


 21%|██        | 4200/20019 [24:41<1:31:52,  2.87it/s]

[Epoch 5, Batch  4200] loss: 0.688


 21%|██▏       | 4300/20019 [25:16<1:31:32,  2.86it/s]

[Epoch 5, Batch  4300] loss: 0.683


 22%|██▏       | 4400/20019 [25:51<1:30:54,  2.86it/s]

[Epoch 5, Batch  4400] loss: 0.688


 22%|██▏       | 4500/20019 [26:26<1:30:03,  2.87it/s]

[Epoch 5, Batch  4500] loss: 0.646


 23%|██▎       | 4600/20019 [27:01<1:30:04,  2.85it/s]

[Epoch 5, Batch  4600] loss: 0.668


 23%|██▎       | 4700/20019 [27:36<1:28:59,  2.87it/s]

[Epoch 5, Batch  4700] loss: 0.687


 24%|██▍       | 4800/20019 [28:11<1:28:42,  2.86it/s]

[Epoch 5, Batch  4800] loss: 0.682


 24%|██▍       | 4900/20019 [28:46<1:27:40,  2.87it/s]

[Epoch 5, Batch  4900] loss: 0.691


 25%|██▍       | 4999/20019 [29:21<1:27:41,  2.85it/s]

[Epoch 5, Batch  5000] loss: 0.700


 25%|██▍       | 5000/20019 [29:23<4:14:32,  1.02s/it]

****** Model checkpoint saved at epochs 6 ******


 25%|██▌       | 5100/20019 [30:00<1:26:30,  2.87it/s]

[Epoch 5, Batch  5100] loss: 0.686


 26%|██▌       | 5200/20019 [30:35<1:25:46,  2.88it/s]

[Epoch 5, Batch  5200] loss: 0.645


 26%|██▋       | 5300/20019 [31:09<1:25:21,  2.87it/s]

[Epoch 5, Batch  5300] loss: 0.697


 27%|██▋       | 5400/20019 [31:44<1:24:49,  2.87it/s]

[Epoch 5, Batch  5400] loss: 0.686


 27%|██▋       | 5500/20019 [32:19<1:24:21,  2.87it/s]

[Epoch 5, Batch  5500] loss: 0.695


 28%|██▊       | 5600/20019 [32:54<1:23:32,  2.88it/s]

[Epoch 5, Batch  5600] loss: 0.666


 28%|██▊       | 5700/20019 [33:29<1:23:14,  2.87it/s]

[Epoch 5, Batch  5700] loss: 0.672


 29%|██▉       | 5800/20019 [34:04<1:22:52,  2.86it/s]

[Epoch 5, Batch  5800] loss: 0.679


 29%|██▉       | 5900/20019 [34:39<1:22:25,  2.86it/s]

[Epoch 5, Batch  5900] loss: 0.659


 30%|██▉       | 5999/20019 [35:14<1:21:39,  2.86it/s]

[Epoch 5, Batch  6000] loss: 0.645


 30%|██▉       | 6000/20019 [35:16<3:47:28,  1.03it/s]

****** Model checkpoint saved at epochs 6 ******


 30%|███       | 6100/20019 [35:53<1:21:05,  2.86it/s]

[Epoch 5, Batch  6100] loss: 0.685


 31%|███       | 6200/20019 [36:28<1:20:05,  2.88it/s]

[Epoch 5, Batch  6200] loss: 0.709


 31%|███▏      | 6300/20019 [37:03<1:19:33,  2.87it/s]

[Epoch 5, Batch  6300] loss: 0.648


 32%|███▏      | 6400/20019 [37:38<1:18:53,  2.88it/s]

[Epoch 5, Batch  6400] loss: 0.688


 32%|███▏      | 6500/20019 [38:13<1:18:25,  2.87it/s]

[Epoch 5, Batch  6500] loss: 0.643


 33%|███▎      | 6600/20019 [38:48<1:17:53,  2.87it/s]

[Epoch 5, Batch  6600] loss: 0.674


 33%|███▎      | 6700/20019 [39:24<1:17:13,  2.87it/s]

[Epoch 5, Batch  6700] loss: 0.637


 34%|███▍      | 6800/20019 [39:59<1:16:49,  2.87it/s]

[Epoch 5, Batch  6800] loss: 0.690


 34%|███▍      | 6900/20019 [40:34<1:21:47,  2.67it/s]

[Epoch 5, Batch  6900] loss: 0.646


 35%|███▍      | 6999/20019 [41:09<1:15:59,  2.86it/s]

[Epoch 5, Batch  7000] loss: 0.677


 35%|███▍      | 7000/20019 [41:11<3:33:52,  1.01it/s]

****** Model checkpoint saved at epochs 6 ******


 35%|███▌      | 7100/20019 [41:49<1:17:37,  2.77it/s]

[Epoch 5, Batch  7100] loss: 0.689


 36%|███▌      | 7200/20019 [42:26<1:14:31,  2.87it/s]

[Epoch 5, Batch  7200] loss: 0.650


 36%|███▋      | 7300/20019 [43:02<1:27:28,  2.42it/s]

[Epoch 5, Batch  7300] loss: 0.628


 37%|███▋      | 7400/20019 [43:37<1:13:49,  2.85it/s]

[Epoch 5, Batch  7400] loss: 0.664


 37%|███▋      | 7500/20019 [44:13<1:12:34,  2.87it/s]

[Epoch 5, Batch  7500] loss: 0.689


 38%|███▊      | 7600/20019 [44:48<1:11:43,  2.89it/s]

[Epoch 5, Batch  7600] loss: 0.634


 38%|███▊      | 7700/20019 [45:24<1:13:27,  2.80it/s]

[Epoch 5, Batch  7700] loss: 0.655


 39%|███▉      | 7800/20019 [46:00<1:11:39,  2.84it/s]

[Epoch 5, Batch  7800] loss: 0.701


 39%|███▉      | 7900/20019 [46:36<1:12:16,  2.79it/s]

[Epoch 5, Batch  7900] loss: 0.680


 40%|███▉      | 7999/20019 [47:12<1:10:31,  2.84it/s]

[Epoch 5, Batch  8000] loss: 0.713


 40%|███▉      | 8000/20019 [47:14<3:16:23,  1.02it/s]

****** Model checkpoint saved at epochs 6 ******


 40%|████      | 8100/20019 [47:52<1:13:50,  2.69it/s]

[Epoch 5, Batch  8100] loss: 0.688


 41%|████      | 8200/20019 [48:28<1:09:06,  2.85it/s]

[Epoch 5, Batch  8200] loss: 0.675


 41%|████▏     | 8300/20019 [49:05<1:12:43,  2.69it/s]

[Epoch 5, Batch  8300] loss: 0.699


 42%|████▏     | 8400/20019 [49:42<1:08:45,  2.82it/s]

[Epoch 5, Batch  8400] loss: 0.683


 42%|████▏     | 8500/20019 [50:19<1:14:02,  2.59it/s]

[Epoch 5, Batch  8500] loss: 0.681


 43%|████▎     | 8600/20019 [50:55<1:06:40,  2.85it/s]

[Epoch 5, Batch  8600] loss: 0.677


 43%|████▎     | 8700/20019 [51:32<1:12:29,  2.60it/s]

[Epoch 5, Batch  8700] loss: 0.687


 44%|████▍     | 8800/20019 [52:11<1:14:49,  2.50it/s]

[Epoch 5, Batch  8800] loss: 0.658


 44%|████▍     | 8900/20019 [52:47<1:07:09,  2.76it/s]

[Epoch 5, Batch  8900] loss: 0.689


 45%|████▍     | 8999/20019 [53:24<1:07:19,  2.73it/s]

[Epoch 5, Batch  9000] loss: 0.675


 45%|████▍     | 9000/20019 [53:27<3:39:52,  1.20s/it]

****** Model checkpoint saved at epochs 6 ******


 45%|████▌     | 9100/20019 [54:07<1:03:21,  2.87it/s]

[Epoch 5, Batch  9100] loss: 0.636


 46%|████▌     | 9200/20019 [54:46<1:06:31,  2.71it/s]

[Epoch 5, Batch  9200] loss: 0.689


 46%|████▋     | 9300/20019 [55:24<1:05:28,  2.73it/s]

[Epoch 5, Batch  9300] loss: 0.668


 47%|████▋     | 9400/20019 [56:02<1:04:23,  2.75it/s]

[Epoch 5, Batch  9400] loss: 0.644


 47%|████▋     | 9500/20019 [56:41<1:07:43,  2.59it/s]

[Epoch 5, Batch  9500] loss: 0.658


 48%|████▊     | 9600/20019 [57:19<1:04:06,  2.71it/s]

[Epoch 5, Batch  9600] loss: 0.714


 48%|████▊     | 9700/20019 [57:58<1:02:17,  2.76it/s]

[Epoch 5, Batch  9700] loss: 0.677


 49%|████▉     | 9800/20019 [58:36<1:05:57,  2.58it/s]

[Epoch 5, Batch  9800] loss: 0.658


 49%|████▉     | 9900/20019 [59:14<1:06:41,  2.53it/s]

[Epoch 5, Batch  9900] loss: 0.674


 50%|████▉     | 9999/20019 [59:54<1:03:26,  2.63it/s]

[Epoch 5, Batch 10000] loss: 0.700


 50%|████▉     | 10000/20019 [59:57<3:20:09,  1.20s/it]

****** Model checkpoint saved at epochs 6 ******


 50%|█████     | 10100/20019 [1:00:38<1:02:09,  2.66it/s]

[Epoch 5, Batch 10100] loss: 0.697


 51%|█████     | 10200/20019 [1:01:17<1:00:29,  2.71it/s]

[Epoch 5, Batch 10200] loss: 0.678


 51%|█████▏    | 10300/20019 [1:01:56<1:05:08,  2.49it/s]

[Epoch 5, Batch 10300] loss: 0.684


 52%|█████▏    | 10400/20019 [1:02:35<59:17,  2.70it/s]  

[Epoch 5, Batch 10400] loss: 0.649


 52%|█████▏    | 10500/20019 [1:03:15<1:00:50,  2.61it/s]

[Epoch 5, Batch 10500] loss: 0.654


 53%|█████▎    | 10600/20019 [1:03:55<1:00:03,  2.61it/s]

[Epoch 5, Batch 10600] loss: 0.634


 53%|█████▎    | 10700/20019 [1:04:36<58:15,  2.67it/s]  

[Epoch 5, Batch 10700] loss: 0.686


 54%|█████▍    | 10800/20019 [1:05:15<1:05:41,  2.34it/s]

[Epoch 5, Batch 10800] loss: 0.666


 54%|█████▍    | 10900/20019 [1:05:55<1:02:41,  2.42it/s]

[Epoch 5, Batch 10900] loss: 0.678


 55%|█████▍    | 10999/20019 [1:06:34<55:47,  2.69it/s]  

[Epoch 5, Batch 11000] loss: 0.645


 55%|█████▍    | 11000/20019 [1:06:37<2:31:41,  1.01s/it]

****** Model checkpoint saved at epochs 6 ******


 55%|█████▌    | 11100/20019 [1:07:19<56:41,  2.62it/s]  

[Epoch 5, Batch 11100] loss: 0.694


 56%|█████▌    | 11200/20019 [1:08:00<1:00:34,  2.43it/s]

[Epoch 5, Batch 11200] loss: 0.666


 56%|█████▋    | 11300/20019 [1:08:41<1:02:19,  2.33it/s]

[Epoch 5, Batch 11300] loss: 0.695


 57%|█████▋    | 11400/20019 [1:09:22<54:06,  2.65it/s]  

[Epoch 5, Batch 11400] loss: 0.693


 57%|█████▋    | 11500/20019 [1:10:03<55:11,  2.57it/s]  

[Epoch 5, Batch 11500] loss: 0.660


 58%|█████▊    | 11600/20019 [1:10:44<57:16,  2.45it/s]  

[Epoch 5, Batch 11600] loss: 0.657


 58%|█████▊    | 11700/20019 [1:11:25<56:15,  2.46it/s]  

[Epoch 5, Batch 11700] loss: 0.659


 59%|█████▉    | 11800/20019 [1:12:06<53:47,  2.55it/s]  

[Epoch 5, Batch 11800] loss: 0.672


 59%|█████▉    | 11900/20019 [1:12:48<59:33,  2.27it/s]  

[Epoch 5, Batch 11900] loss: 0.651


 60%|█████▉    | 11999/20019 [1:13:29<58:16,  2.29it/s]  

[Epoch 5, Batch 12000] loss: 0.685


 60%|█████▉    | 12000/20019 [1:13:32<2:25:13,  1.09s/it]

****** Model checkpoint saved at epochs 6 ******


 60%|██████    | 12100/20019 [1:14:16<51:30,  2.56it/s]  

[Epoch 5, Batch 12100] loss: 0.673


 61%|██████    | 12200/20019 [1:14:59<1:04:07,  2.03it/s]

[Epoch 5, Batch 12200] loss: 0.702


 61%|██████▏   | 12300/20019 [1:15:42<1:03:37,  2.02it/s]

[Epoch 5, Batch 12300] loss: 0.637


 62%|██████▏   | 12400/20019 [1:16:27<51:29,  2.47it/s]  

[Epoch 5, Batch 12400] loss: 0.658


 62%|██████▏   | 12500/20019 [1:17:10<48:52,  2.56it/s]  

[Epoch 5, Batch 12500] loss: 0.676


 63%|██████▎   | 12600/20019 [1:17:55<56:47,  2.18it/s]  

[Epoch 5, Batch 12600] loss: 0.627


 63%|██████▎   | 12700/20019 [1:18:39<52:44,  2.31it/s]  

[Epoch 5, Batch 12700] loss: 0.673


 64%|██████▍   | 12800/20019 [1:19:22<54:28,  2.21it/s]  

[Epoch 5, Batch 12800] loss: 0.705


 64%|██████▍   | 12900/20019 [1:20:07<47:16,  2.51it/s]  

[Epoch 5, Batch 12900] loss: 0.682


 65%|██████▍   | 12999/20019 [1:20:50<49:43,  2.35it/s]  

[Epoch 5, Batch 13000] loss: 0.654


 65%|██████▍   | 13000/20019 [1:20:53<2:25:43,  1.25s/it]

****** Model checkpoint saved at epochs 6 ******


 65%|██████▌   | 13100/20019 [1:21:39<52:19,  2.20it/s]  

[Epoch 5, Batch 13100] loss: 0.674


 66%|██████▌   | 13200/20019 [1:22:24<47:57,  2.37it/s]

[Epoch 5, Batch 13200] loss: 0.675


 66%|██████▋   | 13300/20019 [1:23:09<49:01,  2.28it/s]  

[Epoch 5, Batch 13300] loss: 0.667


 67%|██████▋   | 13400/20019 [1:23:55<53:31,  2.06it/s]  

[Epoch 5, Batch 13400] loss: 0.669


 67%|██████▋   | 13500/20019 [1:24:42<49:28,  2.20it/s]  

[Epoch 5, Batch 13500] loss: 0.692


 68%|██████▊   | 13600/20019 [1:25:28<49:50,  2.15it/s]

[Epoch 5, Batch 13600] loss: 0.670


 68%|██████▊   | 13700/20019 [1:26:15<54:09,  1.94it/s]

[Epoch 5, Batch 13700] loss: 0.679


 69%|██████▉   | 13800/20019 [1:27:03<53:24,  1.94it/s]  

[Epoch 5, Batch 13800] loss: 0.692


 69%|██████▉   | 13900/20019 [1:27:51<47:33,  2.14it/s]

[Epoch 5, Batch 13900] loss: 0.650


 70%|██████▉   | 13999/20019 [1:28:40<53:25,  1.88it/s]  

[Epoch 5, Batch 14000] loss: 0.690


 70%|██████▉   | 14000/20019 [1:28:42<1:51:09,  1.11s/it]

****** Model checkpoint saved at epochs 6 ******


 70%|███████   | 14100/20019 [1:29:32<48:48,  2.02it/s]  

[Epoch 5, Batch 14100] loss: 0.655


 71%|███████   | 14200/20019 [1:30:22<48:52,  1.98it/s]

[Epoch 5, Batch 14200] loss: 0.713


 71%|███████▏  | 14300/20019 [1:31:11<48:02,  1.98it/s]

[Epoch 5, Batch 14300] loss: 0.671


 72%|███████▏  | 14400/20019 [1:32:01<48:50,  1.92it/s]

[Epoch 5, Batch 14400] loss: 0.711


 72%|███████▏  | 14500/20019 [1:32:51<45:41,  2.01it/s]

[Epoch 5, Batch 14500] loss: 0.669


 73%|███████▎  | 14600/20019 [1:33:42<40:50,  2.21it/s]

[Epoch 5, Batch 14600] loss: 0.701


 73%|███████▎  | 14700/20019 [1:34:33<44:05,  2.01it/s]

[Epoch 5, Batch 14700] loss: 0.681


 74%|███████▍  | 14800/20019 [1:35:26<43:30,  2.00it/s]

[Epoch 5, Batch 14800] loss: 0.666


 74%|███████▍  | 14900/20019 [1:36:19<40:56,  2.08it/s]  

[Epoch 5, Batch 14900] loss: 0.670


 75%|███████▍  | 14999/20019 [1:37:12<46:10,  1.81it/s]

[Epoch 5, Batch 15000] loss: 0.652


 75%|███████▍  | 15000/20019 [1:37:14<1:34:26,  1.13s/it]

****** Model checkpoint saved at epochs 6 ******


 75%|███████▌  | 15100/20019 [1:38:09<44:39,  1.84it/s]  

[Epoch 5, Batch 15100] loss: 0.639


 76%|███████▌  | 15200/20019 [1:39:03<40:32,  1.98it/s]

[Epoch 5, Batch 15200] loss: 0.664


 76%|███████▋  | 15300/20019 [1:39:58<38:32,  2.04it/s]

[Epoch 5, Batch 15300] loss: 0.669


 77%|███████▋  | 15400/20019 [1:40:52<46:10,  1.67it/s]

[Epoch 5, Batch 15400] loss: 0.686


 77%|███████▋  | 15500/20019 [1:41:46<40:51,  1.84it/s]

[Epoch 5, Batch 15500] loss: 0.675


 78%|███████▊  | 15600/20019 [1:42:41<43:55,  1.68it/s]

[Epoch 5, Batch 15600] loss: 0.660


 78%|███████▊  | 15700/20019 [1:43:35<39:44,  1.81it/s]

[Epoch 5, Batch 15700] loss: 0.688


 79%|███████▉  | 15800/20019 [1:44:31<36:13,  1.94it/s]

[Epoch 5, Batch 15800] loss: 0.674


 79%|███████▉  | 15900/20019 [1:45:27<42:15,  1.62it/s]

[Epoch 5, Batch 15900] loss: 0.669


 80%|███████▉  | 15999/20019 [1:46:22<41:25,  1.62it/s]

[Epoch 5, Batch 16000] loss: 0.690


 80%|███████▉  | 16000/20019 [1:46:24<1:18:49,  1.18s/it]

****** Model checkpoint saved at epochs 6 ******


 80%|████████  | 16100/20019 [1:47:21<33:23,  1.96it/s]  

[Epoch 5, Batch 16100] loss: 0.691


 81%|████████  | 16200/20019 [1:48:18<33:47,  1.88it/s]

[Epoch 5, Batch 16200] loss: 0.670


 81%|████████▏ | 16300/20019 [1:49:15<34:12,  1.81it/s]

[Epoch 5, Batch 16300] loss: 0.702


 82%|████████▏ | 16400/20019 [1:50:13<32:18,  1.87it/s]

[Epoch 5, Batch 16400] loss: 0.671


 82%|████████▏ | 16500/20019 [1:51:09<30:34,  1.92it/s]

[Epoch 5, Batch 16500] loss: 0.684


 83%|████████▎ | 16600/20019 [1:52:06<33:29,  1.70it/s]

[Epoch 5, Batch 16600] loss: 0.704


 83%|████████▎ | 16700/20019 [1:53:04<33:31,  1.65it/s]

[Epoch 5, Batch 16700] loss: 0.706


 84%|████████▍ | 16800/20019 [1:54:01<29:58,  1.79it/s]

[Epoch 5, Batch 16800] loss: 0.659


 84%|████████▍ | 16900/20019 [1:55:00<30:52,  1.68it/s]

[Epoch 5, Batch 16900] loss: 0.684


 85%|████████▍ | 16999/20019 [1:55:57<33:05,  1.52it/s]

[Epoch 5, Batch 17000] loss: 0.654


 85%|████████▍ | 17000/20019 [1:56:00<1:10:36,  1.40s/it]

****** Model checkpoint saved at epochs 6 ******


 85%|████████▌ | 17100/20019 [1:56:59<28:30,  1.71it/s]  

[Epoch 5, Batch 17100] loss: 0.646


 86%|████████▌ | 17200/20019 [1:57:58<30:26,  1.54it/s]

[Epoch 5, Batch 17200] loss: 0.691


 86%|████████▋ | 17300/20019 [1:58:57<28:45,  1.58it/s]

[Epoch 5, Batch 17300] loss: 0.679


 87%|████████▋ | 17400/20019 [1:59:55<24:21,  1.79it/s]

[Epoch 5, Batch 17400] loss: 0.699


 87%|████████▋ | 17500/20019 [2:00:54<25:25,  1.65it/s]

[Epoch 5, Batch 17500] loss: 0.705


 88%|████████▊ | 17600/20019 [2:01:54<23:34,  1.71it/s]

[Epoch 5, Batch 17600] loss: 0.668


 88%|████████▊ | 17700/20019 [2:02:53<26:05,  1.48it/s]

[Epoch 5, Batch 17700] loss: 0.674


 89%|████████▉ | 17800/20019 [2:03:53<20:13,  1.83it/s]

[Epoch 5, Batch 17800] loss: 0.658


 89%|████████▉ | 17900/20019 [2:04:53<17:55,  1.97it/s]

[Epoch 5, Batch 17900] loss: 0.695


 90%|████████▉ | 17999/20019 [2:05:53<19:51,  1.70it/s]

[Epoch 5, Batch 18000] loss: 0.683


 90%|████████▉ | 18000/20019 [2:05:56<42:09,  1.25s/it]

****** Model checkpoint saved at epochs 6 ******


 90%|█████████ | 18100/20019 [2:06:58<21:21,  1.50it/s]

[Epoch 5, Batch 18100] loss: 0.629


 91%|█████████ | 18200/20019 [2:07:59<17:13,  1.76it/s]

[Epoch 5, Batch 18200] loss: 0.701


 91%|█████████▏| 18300/20019 [2:09:01<15:58,  1.79it/s]

[Epoch 5, Batch 18300] loss: 0.663


 92%|█████████▏| 18400/20019 [2:10:03<15:22,  1.76it/s]

[Epoch 5, Batch 18400] loss: 0.686


 92%|█████████▏| 18500/20019 [2:11:05<13:55,  1.82it/s]

[Epoch 5, Batch 18500] loss: 0.646


 93%|█████████▎| 18600/20019 [2:12:09<14:06,  1.68it/s]

[Epoch 5, Batch 18600] loss: 0.686


 93%|█████████▎| 18700/20019 [2:13:12<12:41,  1.73it/s]

[Epoch 5, Batch 18700] loss: 0.629


 94%|█████████▍| 18800/20019 [2:14:16<12:12,  1.67it/s]

[Epoch 5, Batch 18800] loss: 0.625


 94%|█████████▍| 18900/20019 [2:15:20<12:32,  1.49it/s]

[Epoch 5, Batch 18900] loss: 0.658


 95%|█████████▍| 18999/20019 [2:16:22<10:00,  1.70it/s]

[Epoch 5, Batch 19000] loss: 0.659


 95%|█████████▍| 19000/20019 [2:16:26<25:15,  1.49s/it]

****** Model checkpoint saved at epochs 6 ******


 95%|█████████▌| 19100/20019 [2:17:30<09:17,  1.65it/s]

[Epoch 5, Batch 19100] loss: 0.673


 96%|█████████▌| 19200/20019 [2:18:34<07:59,  1.71it/s]

[Epoch 5, Batch 19200] loss: 0.648


 96%|█████████▋| 19300/20019 [2:19:38<07:08,  1.68it/s]

[Epoch 5, Batch 19300] loss: 0.686


 97%|█████████▋| 19400/20019 [2:20:43<06:10,  1.67it/s]

[Epoch 5, Batch 19400] loss: 0.687


 97%|█████████▋| 19500/20019 [2:21:47<06:05,  1.42it/s]

[Epoch 5, Batch 19500] loss: 0.647


 98%|█████████▊| 19600/20019 [2:22:52<04:30,  1.55it/s]

[Epoch 5, Batch 19600] loss: 0.700


 98%|█████████▊| 19700/20019 [2:23:56<03:25,  1.55it/s]

[Epoch 5, Batch 19700] loss: 0.670


 99%|█████████▉| 19800/20019 [2:25:00<02:16,  1.61it/s]

[Epoch 5, Batch 19800] loss: 0.647


 99%|█████████▉| 19900/20019 [2:26:04<01:10,  1.68it/s]

[Epoch 5, Batch 19900] loss: 0.678


100%|█████████▉| 19999/20019 [2:27:09<00:13,  1.52it/s]

[Epoch 5, Batch 20000] loss: 0.714


100%|█████████▉| 20000/20019 [2:27:11<00:24,  1.28s/it]

****** Model checkpoint saved at epochs 6 ******


100%|██████████| 20019/20019 [2:27:23<00:00,  2.26it/s]
  0%|          | 100/20019 [00:35<1:55:33,  2.87it/s]

[Epoch 6, Batch   100] loss: 0.630


  1%|          | 200/20019 [01:10<1:54:55,  2.87it/s]

[Epoch 6, Batch   200] loss: 0.624


  1%|▏         | 300/20019 [01:45<1:54:24,  2.87it/s]

[Epoch 6, Batch   300] loss: 0.605


  2%|▏         | 400/20019 [02:20<1:53:45,  2.87it/s]

[Epoch 6, Batch   400] loss: 0.626


  2%|▏         | 500/20019 [02:55<1:53:17,  2.87it/s]

[Epoch 6, Batch   500] loss: 0.626


  3%|▎         | 600/20019 [03:30<1:52:24,  2.88it/s]

[Epoch 6, Batch   600] loss: 0.658


  3%|▎         | 700/20019 [04:04<1:52:27,  2.86it/s]

[Epoch 6, Batch   700] loss: 0.612


  4%|▍         | 800/20019 [04:39<1:51:07,  2.88it/s]

[Epoch 6, Batch   800] loss: 0.622


  4%|▍         | 900/20019 [05:14<1:50:27,  2.88it/s]

[Epoch 6, Batch   900] loss: 0.599


  5%|▍         | 999/20019 [05:48<1:51:30,  2.84it/s]

[Epoch 6, Batch  1000] loss: 0.619


  5%|▍         | 1000/20019 [05:51<4:52:34,  1.08it/s]

****** Model checkpoint saved at epochs 7 ******


  5%|▌         | 1100/20019 [06:27<1:49:30,  2.88it/s]

[Epoch 6, Batch  1100] loss: 0.609


  6%|▌         | 1200/20019 [07:02<1:48:51,  2.88it/s]

[Epoch 6, Batch  1200] loss: 0.622


  6%|▋         | 1300/20019 [07:37<1:49:02,  2.86it/s]

[Epoch 6, Batch  1300] loss: 0.613


  7%|▋         | 1400/20019 [08:12<1:47:53,  2.88it/s]

[Epoch 6, Batch  1400] loss: 0.590


  7%|▋         | 1500/20019 [08:47<1:47:57,  2.86it/s]

[Epoch 6, Batch  1500] loss: 0.648


  8%|▊         | 1600/20019 [09:22<1:46:51,  2.87it/s]

[Epoch 6, Batch  1600] loss: 0.625


  8%|▊         | 1700/20019 [09:56<1:46:05,  2.88it/s]

[Epoch 6, Batch  1700] loss: 0.636


  9%|▉         | 1800/20019 [10:31<1:45:43,  2.87it/s]

[Epoch 6, Batch  1800] loss: 0.605


  9%|▉         | 1900/20019 [11:06<1:46:08,  2.84it/s]

[Epoch 6, Batch  1900] loss: 0.629


 10%|▉         | 1999/20019 [11:41<1:45:18,  2.85it/s]

[Epoch 6, Batch  2000] loss: 0.601


 10%|▉         | 2000/20019 [11:43<4:57:16,  1.01it/s]

****** Model checkpoint saved at epochs 7 ******


 10%|█         | 2100/20019 [12:20<1:44:21,  2.86it/s]

[Epoch 6, Batch  2100] loss: 0.573


 11%|█         | 2200/20019 [12:55<1:43:17,  2.88it/s]

[Epoch 6, Batch  2200] loss: 0.589


 11%|█▏        | 2300/20019 [13:30<1:42:37,  2.88it/s]

[Epoch 6, Batch  2300] loss: 0.605


 12%|█▏        | 2400/20019 [14:04<1:42:06,  2.88it/s]

[Epoch 6, Batch  2400] loss: 0.599


 12%|█▏        | 2500/20019 [14:39<1:41:25,  2.88it/s]

[Epoch 6, Batch  2500] loss: 0.611


 13%|█▎        | 2600/20019 [15:14<1:41:04,  2.87it/s]

[Epoch 6, Batch  2600] loss: 0.613


 13%|█▎        | 2700/20019 [15:49<1:39:59,  2.89it/s]

[Epoch 6, Batch  2700] loss: 0.584


 14%|█▍        | 2800/20019 [16:24<1:40:04,  2.87it/s]

[Epoch 6, Batch  2800] loss: 0.639


 14%|█▍        | 2900/20019 [16:59<1:39:19,  2.87it/s]

[Epoch 6, Batch  2900] loss: 0.597


 15%|█▍        | 2999/20019 [17:33<1:39:15,  2.86it/s]

[Epoch 6, Batch  3000] loss: 0.553


 15%|█▍        | 3000/20019 [17:36<4:37:27,  1.02it/s]

****** Model checkpoint saved at epochs 7 ******


 15%|█▌        | 3100/20019 [18:12<1:38:02,  2.88it/s]

[Epoch 6, Batch  3100] loss: 0.612


 16%|█▌        | 3200/20019 [18:47<1:37:38,  2.87it/s]

[Epoch 6, Batch  3200] loss: 0.606


 16%|█▋        | 3300/20019 [19:22<1:37:07,  2.87it/s]

[Epoch 6, Batch  3300] loss: 0.608


 17%|█▋        | 3400/20019 [19:57<1:36:22,  2.87it/s]

[Epoch 6, Batch  3400] loss: 0.579


 17%|█▋        | 3500/20019 [20:32<1:35:47,  2.87it/s]

[Epoch 6, Batch  3500] loss: 0.627


 18%|█▊        | 3600/20019 [21:07<1:37:04,  2.82it/s]

[Epoch 6, Batch  3600] loss: 0.620


 18%|█▊        | 3700/20019 [21:42<1:34:31,  2.88it/s]

[Epoch 6, Batch  3700] loss: 0.585


 19%|█▉        | 3800/20019 [22:17<1:33:56,  2.88it/s]

[Epoch 6, Batch  3800] loss: 0.559


 19%|█▉        | 3900/20019 [22:52<1:33:54,  2.86it/s]

[Epoch 6, Batch  3900] loss: 0.589


 20%|█▉        | 3999/20019 [23:26<1:33:05,  2.87it/s]

[Epoch 6, Batch  4000] loss: 0.603


 20%|█▉        | 4000/20019 [23:29<4:26:08,  1.00it/s]

****** Model checkpoint saved at epochs 7 ******


 20%|██        | 4100/20019 [24:06<1:33:18,  2.84it/s]

[Epoch 6, Batch  4100] loss: 0.597


 21%|██        | 4200/20019 [24:43<1:31:47,  2.87it/s]

[Epoch 6, Batch  4200] loss: 0.601


 21%|██▏       | 4300/20019 [25:18<1:31:04,  2.88it/s]

[Epoch 6, Batch  4300] loss: 0.627


 22%|██▏       | 4400/20019 [25:53<1:30:36,  2.87it/s]

[Epoch 6, Batch  4400] loss: 0.615


 22%|██▏       | 4500/20019 [26:28<1:29:56,  2.88it/s]

[Epoch 6, Batch  4500] loss: 0.572


 23%|██▎       | 4600/20019 [27:03<1:29:34,  2.87it/s]

[Epoch 6, Batch  4600] loss: 0.588


 23%|██▎       | 4700/20019 [27:38<1:28:43,  2.88it/s]

[Epoch 6, Batch  4700] loss: 0.587


 24%|██▍       | 4800/20019 [28:13<1:28:44,  2.86it/s]

[Epoch 6, Batch  4800] loss: 0.617


 24%|██▍       | 4900/20019 [28:49<1:27:37,  2.88it/s]

[Epoch 6, Batch  4900] loss: 0.596


 25%|██▍       | 4999/20019 [29:23<1:27:56,  2.85it/s]

[Epoch 6, Batch  5000] loss: 0.604


 25%|██▍       | 5000/20019 [29:26<4:03:12,  1.03it/s]

****** Model checkpoint saved at epochs 7 ******


 25%|██▌       | 5100/20019 [30:04<1:26:39,  2.87it/s]

[Epoch 6, Batch  5100] loss: 0.564


 26%|██▌       | 5200/20019 [30:39<1:25:59,  2.87it/s]

[Epoch 6, Batch  5200] loss: 0.588


 26%|██▋       | 5300/20019 [31:14<1:31:43,  2.67it/s]

[Epoch 6, Batch  5300] loss: 0.615


 27%|██▋       | 5400/20019 [31:50<1:28:55,  2.74it/s]

[Epoch 6, Batch  5400] loss: 0.603


 27%|██▋       | 5500/20019 [32:26<1:26:27,  2.80it/s]

[Epoch 6, Batch  5500] loss: 0.575


 28%|██▊       | 5600/20019 [33:01<1:30:14,  2.66it/s]

[Epoch 6, Batch  5600] loss: 0.596


 28%|██▊       | 5700/20019 [33:37<1:23:00,  2.88it/s]

[Epoch 6, Batch  5700] loss: 0.606


 29%|██▉       | 5800/20019 [34:13<1:22:35,  2.87it/s]

[Epoch 6, Batch  5800] loss: 0.605


 29%|██▉       | 5900/20019 [34:48<1:21:42,  2.88it/s]

[Epoch 6, Batch  5900] loss: 0.621


 30%|██▉       | 5999/20019 [35:25<1:26:05,  2.71it/s]

[Epoch 6, Batch  6000] loss: 0.612


 30%|██▉       | 6000/20019 [35:27<3:50:49,  1.01it/s]

****** Model checkpoint saved at epochs 7 ******


 30%|███       | 6100/20019 [36:06<1:26:44,  2.67it/s]

[Epoch 6, Batch  6100] loss: 0.602


 31%|███       | 6200/20019 [36:42<1:26:49,  2.65it/s]

[Epoch 6, Batch  6200] loss: 0.628


 31%|███▏      | 6300/20019 [37:18<1:19:49,  2.86it/s]

[Epoch 6, Batch  6300] loss: 0.599


 32%|███▏      | 6400/20019 [37:53<1:19:33,  2.85it/s]

[Epoch 6, Batch  6400] loss: 0.619


 32%|███▏      | 6500/20019 [38:30<1:21:22,  2.77it/s]

[Epoch 6, Batch  6500] loss: 0.577


 33%|███▎      | 6600/20019 [39:08<1:18:46,  2.84it/s]

[Epoch 6, Batch  6600] loss: 0.597


 33%|███▎      | 6700/20019 [39:44<1:25:28,  2.60it/s]

[Epoch 6, Batch  6700] loss: 0.609


 34%|███▍      | 6800/20019 [40:22<1:20:57,  2.72it/s]

[Epoch 6, Batch  6800] loss: 0.623


 34%|███▍      | 6900/20019 [40:58<1:16:28,  2.86it/s]

[Epoch 6, Batch  6900] loss: 0.549


 35%|███▍      | 6999/20019 [41:36<1:19:35,  2.73it/s]

[Epoch 6, Batch  7000] loss: 0.594


 35%|███▍      | 7000/20019 [41:39<4:17:03,  1.18s/it]

****** Model checkpoint saved at epochs 7 ******


 35%|███▌      | 7100/20019 [42:19<1:17:13,  2.79it/s]

[Epoch 6, Batch  7100] loss: 0.594


 36%|███▌      | 7200/20019 [42:57<1:19:29,  2.69it/s]

[Epoch 6, Batch  7200] loss: 0.602


 36%|███▋      | 7300/20019 [43:35<1:21:32,  2.60it/s]

[Epoch 6, Batch  7300] loss: 0.583


 37%|███▋      | 7400/20019 [44:14<1:16:30,  2.75it/s]

[Epoch 6, Batch  7400] loss: 0.588


 37%|███▋      | 7500/20019 [44:52<1:18:25,  2.66it/s]

[Epoch 6, Batch  7500] loss: 0.591


 38%|███▊      | 7600/20019 [45:29<1:14:40,  2.77it/s]

[Epoch 6, Batch  7600] loss: 0.603


 38%|███▊      | 7700/20019 [46:07<1:12:40,  2.83it/s]

[Epoch 6, Batch  7700] loss: 0.598


 39%|███▉      | 7800/20019 [46:45<1:21:04,  2.51it/s]

[Epoch 6, Batch  7800] loss: 0.581


 39%|███▉      | 7900/20019 [47:24<1:13:35,  2.74it/s]

[Epoch 6, Batch  7900] loss: 0.597


 40%|███▉      | 7999/20019 [48:02<1:16:00,  2.64it/s]

[Epoch 6, Batch  8000] loss: 0.585


 40%|███▉      | 8000/20019 [48:05<3:30:04,  1.05s/it]

****** Model checkpoint saved at epochs 7 ******


 40%|████      | 8100/20019 [48:45<1:12:50,  2.73it/s]

[Epoch 6, Batch  8100] loss: 0.603


 41%|████      | 8200/20019 [49:23<1:16:48,  2.56it/s]

[Epoch 6, Batch  8200] loss: 0.638


 41%|████▏     | 8300/20019 [50:02<1:20:14,  2.43it/s]

[Epoch 6, Batch  8300] loss: 0.574


 42%|████▏     | 8400/20019 [50:40<1:29:04,  2.17it/s]

[Epoch 6, Batch  8400] loss: 0.612


 42%|████▏     | 8500/20019 [51:19<1:12:03,  2.66it/s]

[Epoch 6, Batch  8500] loss: 0.591


 43%|████▎     | 8600/20019 [51:57<1:27:57,  2.16it/s]

[Epoch 6, Batch  8600] loss: 0.611


 43%|████▎     | 8700/20019 [52:37<1:22:04,  2.30it/s]

[Epoch 6, Batch  8700] loss: 0.648


 44%|████▍     | 8800/20019 [53:16<1:10:41,  2.64it/s]

[Epoch 6, Batch  8800] loss: 0.605


 44%|████▍     | 8900/20019 [53:56<1:19:19,  2.34it/s]

[Epoch 6, Batch  8900] loss: 0.593


 45%|████▍     | 8999/20019 [54:35<1:09:47,  2.63it/s]

[Epoch 6, Batch  9000] loss: 0.600


 45%|████▍     | 9000/20019 [54:37<3:10:34,  1.04s/it]

****** Model checkpoint saved at epochs 7 ******


 45%|████▌     | 9100/20019 [55:18<1:13:09,  2.49it/s]

[Epoch 6, Batch  9100] loss: 0.609


 46%|████▌     | 9200/20019 [55:58<1:06:08,  2.73it/s]

[Epoch 6, Batch  9200] loss: 0.617


 46%|████▋     | 9300/20019 [56:38<1:13:07,  2.44it/s]

[Epoch 6, Batch  9300] loss: 0.600


 47%|████▋     | 9400/20019 [57:18<1:07:39,  2.62it/s]

[Epoch 6, Batch  9400] loss: 0.587


 47%|████▋     | 9500/20019 [57:57<1:09:16,  2.53it/s]

[Epoch 6, Batch  9500] loss: 0.592


 48%|████▊     | 9600/20019 [58:37<1:04:47,  2.68it/s]

[Epoch 6, Batch  9600] loss: 0.592


 48%|████▊     | 9700/20019 [59:16<1:06:21,  2.59it/s]

[Epoch 6, Batch  9700] loss: 0.579


 49%|████▉     | 9800/20019 [59:56<1:02:03,  2.74it/s]

[Epoch 6, Batch  9800] loss: 0.588


 49%|████▉     | 9900/20019 [1:00:38<1:06:21,  2.54it/s]

[Epoch 6, Batch  9900] loss: 0.612


 50%|████▉     | 9999/20019 [1:01:18<1:14:23,  2.25it/s]

[Epoch 6, Batch 10000] loss: 0.596


 50%|████▉     | 10000/20019 [1:01:21<3:29:13,  1.25s/it]

****** Model checkpoint saved at epochs 7 ******


 50%|█████     | 10100/20019 [1:02:02<1:04:37,  2.56it/s]

[Epoch 6, Batch 10100] loss: 0.613


 51%|█████     | 10200/20019 [1:02:44<1:06:43,  2.45it/s]

[Epoch 6, Batch 10200] loss: 0.596


 51%|█████▏    | 10300/20019 [1:03:25<59:07,  2.74it/s]  

[Epoch 6, Batch 10300] loss: 0.608


 52%|█████▏    | 10400/20019 [1:04:08<1:07:44,  2.37it/s]

[Epoch 6, Batch 10400] loss: 0.598


 52%|█████▏    | 10500/20019 [1:04:51<1:02:51,  2.52it/s]

[Epoch 6, Batch 10500] loss: 0.622


 53%|█████▎    | 10600/20019 [1:05:33<1:06:53,  2.35it/s]

[Epoch 6, Batch 10600] loss: 0.595


 53%|█████▎    | 10700/20019 [1:06:14<1:04:59,  2.39it/s]

[Epoch 6, Batch 10700] loss: 0.620


 54%|█████▍    | 10800/20019 [1:06:55<1:03:48,  2.41it/s]

[Epoch 6, Batch 10800] loss: 0.574


 54%|█████▍    | 10900/20019 [1:07:37<1:08:48,  2.21it/s]

[Epoch 6, Batch 10900] loss: 0.628


 55%|█████▍    | 10999/20019 [1:08:19<1:03:38,  2.36it/s]

[Epoch 6, Batch 11000] loss: 0.615


 55%|█████▍    | 11000/20019 [1:08:22<2:38:51,  1.06s/it]

****** Model checkpoint saved at epochs 7 ******


 55%|█████▌    | 11100/20019 [1:09:06<1:07:22,  2.21it/s]

[Epoch 6, Batch 11100] loss: 0.595


 56%|█████▌    | 11200/20019 [1:09:49<1:01:13,  2.40it/s]

[Epoch 6, Batch 11200] loss: 0.562


 56%|█████▋    | 11300/20019 [1:10:33<1:01:31,  2.36it/s]

[Epoch 6, Batch 11300] loss: 0.599


 57%|█████▋    | 11400/20019 [1:11:15<57:20,  2.51it/s]  

[Epoch 6, Batch 11400] loss: 0.597


 57%|█████▋    | 11500/20019 [1:11:57<1:00:47,  2.34it/s]

[Epoch 6, Batch 11500] loss: 0.622


 58%|█████▊    | 11600/20019 [1:12:41<1:10:11,  2.00it/s]

[Epoch 6, Batch 11600] loss: 0.589


 58%|█████▊    | 11700/20019 [1:13:25<1:01:03,  2.27it/s]

[Epoch 6, Batch 11700] loss: 0.610


 59%|█████▉    | 11800/20019 [1:14:08<59:13,  2.31it/s]  

[Epoch 6, Batch 11800] loss: 0.598


 59%|█████▉    | 11900/20019 [1:14:51<59:16,  2.28it/s]  

[Epoch 6, Batch 11900] loss: 0.587


 60%|█████▉    | 11999/20019 [1:15:35<1:11:36,  1.87it/s]

[Epoch 6, Batch 12000] loss: 0.597


 60%|█████▉    | 12000/20019 [1:15:38<2:54:46,  1.31s/it]

****** Model checkpoint saved at epochs 7 ******


 60%|██████    | 12100/20019 [1:16:24<53:00,  2.49it/s]  

[Epoch 6, Batch 12100] loss: 0.566


 61%|██████    | 12200/20019 [1:17:08<56:40,  2.30it/s]  

[Epoch 6, Batch 12200] loss: 0.583


 61%|██████▏   | 12300/20019 [1:17:53<53:27,  2.41it/s]  

[Epoch 6, Batch 12300] loss: 0.601


 62%|██████▏   | 12400/20019 [1:18:38<53:08,  2.39it/s]  

[Epoch 6, Batch 12400] loss: 0.611


 62%|██████▏   | 12500/20019 [1:19:24<49:52,  2.51it/s]  

[Epoch 6, Batch 12500] loss: 0.583


 63%|██████▎   | 12600/20019 [1:20:09<49:20,  2.51it/s]  

[Epoch 6, Batch 12600] loss: 0.609


 63%|██████▎   | 12700/20019 [1:20:55<50:34,  2.41it/s]  

[Epoch 6, Batch 12700] loss: 0.630


 64%|██████▍   | 12800/20019 [1:21:40<54:43,  2.20it/s]  

[Epoch 6, Batch 12800] loss: 0.603


 64%|██████▍   | 12900/20019 [1:22:27<55:20,  2.14it/s]  

[Epoch 6, Batch 12900] loss: 0.585


 65%|██████▍   | 12999/20019 [1:23:13<54:19,  2.15it/s]  

[Epoch 6, Batch 13000] loss: 0.570


 65%|██████▍   | 13000/20019 [1:23:16<2:16:06,  1.16s/it]

****** Model checkpoint saved at epochs 7 ******


 65%|██████▌   | 13100/20019 [1:24:03<53:39,  2.15it/s]  

[Epoch 6, Batch 13100] loss: 0.610


 66%|██████▌   | 13200/20019 [1:24:51<53:18,  2.13it/s]  

[Epoch 6, Batch 13200] loss: 0.605


 66%|██████▋   | 13300/20019 [1:25:39<54:26,  2.06it/s]  

[Epoch 6, Batch 13300] loss: 0.603


 67%|██████▋   | 13400/20019 [1:26:27<58:18,  1.89it/s]  

[Epoch 6, Batch 13400] loss: 0.590


 67%|██████▋   | 13500/20019 [1:27:15<54:52,  1.98it/s]  

[Epoch 6, Batch 13500] loss: 0.613


 68%|██████▊   | 13600/20019 [1:28:04<54:02,  1.98it/s]  

[Epoch 6, Batch 13600] loss: 0.572


 68%|██████▊   | 13700/20019 [1:28:52<50:23,  2.09it/s]  

[Epoch 6, Batch 13700] loss: 0.630


 69%|██████▉   | 13800/20019 [1:29:42<55:05,  1.88it/s]  

[Epoch 6, Batch 13800] loss: 0.607


 69%|██████▉   | 13900/20019 [1:30:30<45:57,  2.22it/s]

[Epoch 6, Batch 13900] loss: 0.635


 70%|██████▉   | 13999/20019 [1:31:20<53:31,  1.87it/s]  

[Epoch 6, Batch 14000] loss: 0.627


 70%|██████▉   | 14000/20019 [1:31:23<2:01:04,  1.21s/it]

****** Model checkpoint saved at epochs 7 ******


 70%|███████   | 14100/20019 [1:32:14<46:56,  2.10it/s]  

[Epoch 6, Batch 14100] loss: 0.617


 71%|███████   | 14200/20019 [1:33:05<44:54,  2.16it/s]

[Epoch 6, Batch 14200] loss: 0.627


 71%|███████▏  | 14300/20019 [1:33:56<43:13,  2.20it/s]  

[Epoch 6, Batch 14300] loss: 0.624


 72%|███████▏  | 14400/20019 [1:34:47<48:12,  1.94it/s]

[Epoch 6, Batch 14400] loss: 0.595


 72%|███████▏  | 14500/20019 [1:35:39<43:39,  2.11it/s]

[Epoch 6, Batch 14500] loss: 0.592


 73%|███████▎  | 14600/20019 [1:36:32<45:15,  2.00it/s]

[Epoch 6, Batch 14600] loss: 0.571


 73%|███████▎  | 14700/20019 [1:37:25<42:47,  2.07it/s]

[Epoch 6, Batch 14700] loss: 0.581


 74%|███████▍  | 14800/20019 [1:38:19<44:23,  1.96it/s]

[Epoch 6, Batch 14800] loss: 0.592


 74%|███████▍  | 14900/20019 [1:39:12<42:24,  2.01it/s]

[Epoch 6, Batch 14900] loss: 0.589


 75%|███████▍  | 14999/20019 [1:40:06<52:10,  1.60it/s]

[Epoch 6, Batch 15000] loss: 0.614


 75%|███████▍  | 15000/20019 [1:40:09<1:46:17,  1.27s/it]

****** Model checkpoint saved at epochs 7 ******


 75%|███████▌  | 15100/20019 [1:41:04<42:51,  1.91it/s]  

[Epoch 6, Batch 15100] loss: 0.603


 76%|███████▌  | 15200/20019 [1:42:00<46:34,  1.72it/s]

[Epoch 6, Batch 15200] loss: 0.611


 76%|███████▋  | 15300/20019 [1:42:56<41:40,  1.89it/s]  

[Epoch 6, Batch 15300] loss: 0.622


 77%|███████▋  | 15400/20019 [1:43:53<45:16,  1.70it/s]

[Epoch 6, Batch 15400] loss: 0.601


 77%|███████▋  | 15500/20019 [1:44:49<45:45,  1.65it/s]

[Epoch 6, Batch 15500] loss: 0.619


 78%|███████▊  | 15600/20019 [1:45:48<44:32,  1.65it/s]

[Epoch 6, Batch 15600] loss: 0.547


 78%|███████▊  | 15700/20019 [1:46:47<46:41,  1.54it/s]

[Epoch 6, Batch 15700] loss: 0.614


 79%|███████▉  | 15800/20019 [1:47:45<37:00,  1.90it/s]

[Epoch 6, Batch 15800] loss: 0.599


 79%|███████▉  | 15900/20019 [1:48:43<36:27,  1.88it/s]

[Epoch 6, Batch 15900] loss: 0.571


 80%|███████▉  | 15999/20019 [1:49:42<38:27,  1.74it/s]

[Epoch 6, Batch 16000] loss: 0.585


 80%|███████▉  | 16000/20019 [1:49:45<1:26:44,  1.29s/it]

****** Model checkpoint saved at epochs 7 ******


 80%|████████  | 16100/20019 [1:50:46<36:38,  1.78it/s]  

[Epoch 6, Batch 16100] loss: 0.579


 81%|████████  | 16200/20019 [1:51:45<42:19,  1.50it/s]

[Epoch 6, Batch 16200] loss: 0.615


 81%|████████▏ | 16300/20019 [1:52:45<42:10,  1.47it/s]

[Epoch 6, Batch 16300] loss: 0.627


 82%|████████▏ | 16400/20019 [1:53:44<37:38,  1.60it/s]

[Epoch 6, Batch 16400] loss: 0.589


 82%|████████▏ | 16500/20019 [1:54:44<35:56,  1.63it/s]

[Epoch 6, Batch 16500] loss: 0.602


 83%|████████▎ | 16600/20019 [1:55:45<33:11,  1.72it/s]

[Epoch 6, Batch 16600] loss: 0.600


 83%|████████▎ | 16700/20019 [1:56:46<31:33,  1.75it/s]

[Epoch 6, Batch 16700] loss: 0.611


 84%|████████▍ | 16800/20019 [1:57:47<30:04,  1.78it/s]

[Epoch 6, Batch 16800] loss: 0.598


 84%|████████▍ | 16900/20019 [1:58:49<34:02,  1.53it/s]

[Epoch 6, Batch 16900] loss: 0.587


 85%|████████▍ | 16999/20019 [1:59:50<29:54,  1.68it/s]

[Epoch 6, Batch 17000] loss: 0.629


 85%|████████▍ | 17000/20019 [1:59:53<1:03:10,  1.26s/it]

****** Model checkpoint saved at epochs 7 ******


 85%|████████▌ | 17100/20019 [2:00:56<30:43,  1.58it/s]  

[Epoch 6, Batch 17100] loss: 0.600


 86%|████████▌ | 17200/20019 [2:01:59<31:48,  1.48it/s]

[Epoch 6, Batch 17200] loss: 0.609


 86%|████████▋ | 17300/20019 [2:03:02<31:44,  1.43it/s]

[Epoch 6, Batch 17300] loss: 0.627


 87%|████████▋ | 17400/20019 [2:04:06<32:40,  1.34it/s]

[Epoch 6, Batch 17400] loss: 0.589


 87%|████████▋ | 17500/20019 [2:05:08<28:01,  1.50it/s]

[Epoch 6, Batch 17500] loss: 0.568


 88%|████████▊ | 17600/20019 [2:06:11<23:00,  1.75it/s]

[Epoch 6, Batch 17600] loss: 0.600


 88%|████████▊ | 17700/20019 [2:07:15<25:51,  1.49it/s]

[Epoch 6, Batch 17700] loss: 0.594


 89%|████████▉ | 17800/20019 [2:08:19<26:43,  1.38it/s]

[Epoch 6, Batch 17800] loss: 0.568


 89%|████████▉ | 17900/20019 [2:09:23<24:36,  1.44it/s]

[Epoch 6, Batch 17900] loss: 0.548


 90%|████████▉ | 17999/20019 [2:10:27<19:26,  1.73it/s]

[Epoch 6, Batch 18000] loss: 0.592


 90%|████████▉ | 18000/20019 [2:10:30<43:34,  1.29s/it]

****** Model checkpoint saved at epochs 7 ******


 90%|█████████ | 18100/20019 [2:11:35<21:27,  1.49it/s]

[Epoch 6, Batch 18100] loss: 0.609


 91%|█████████ | 18200/20019 [2:12:40<17:47,  1.70it/s]

[Epoch 6, Batch 18200] loss: 0.617


 91%|█████████▏| 18300/20019 [2:13:45<16:56,  1.69it/s]

[Epoch 6, Batch 18300] loss: 0.608


 92%|█████████▏| 18400/20019 [2:14:49<15:34,  1.73it/s]

[Epoch 6, Batch 18400] loss: 0.602


 92%|█████████▏| 18500/20019 [2:15:54<15:09,  1.67it/s]

[Epoch 6, Batch 18500] loss: 0.597


 93%|█████████▎| 18600/20019 [2:17:00<14:39,  1.61it/s]

[Epoch 6, Batch 18600] loss: 0.566


 93%|█████████▎| 18700/20019 [2:18:06<13:39,  1.61it/s]

[Epoch 6, Batch 18700] loss: 0.611


 94%|█████████▍| 18800/20019 [2:19:12<11:29,  1.77it/s]

[Epoch 6, Batch 18800] loss: 0.626


 94%|█████████▍| 18900/20019 [2:20:18<11:00,  1.69it/s]

[Epoch 6, Batch 18900] loss: 0.596


 95%|█████████▍| 18999/20019 [2:21:22<11:53,  1.43it/s]

[Epoch 6, Batch 19000] loss: 0.601


 95%|█████████▍| 19000/20019 [2:21:25<20:46,  1.22s/it]

****** Model checkpoint saved at epochs 7 ******


 95%|█████████▌| 19100/20019 [2:22:31<09:35,  1.60it/s]

[Epoch 6, Batch 19100] loss: 0.583


 96%|█████████▌| 19200/20019 [2:23:37<08:46,  1.55it/s]

[Epoch 6, Batch 19200] loss: 0.593


 96%|█████████▋| 19300/20019 [2:24:43<07:13,  1.66it/s]

[Epoch 6, Batch 19300] loss: 0.607


 97%|█████████▋| 19400/20019 [2:25:49<06:06,  1.69it/s]

[Epoch 6, Batch 19400] loss: 0.587


 97%|█████████▋| 19500/20019 [2:26:56<05:07,  1.69it/s]

[Epoch 6, Batch 19500] loss: 0.583


 98%|█████████▊| 19600/20019 [2:28:02<04:50,  1.44it/s]

[Epoch 6, Batch 19600] loss: 0.586


 98%|█████████▊| 19700/20019 [2:29:08<03:53,  1.37it/s]

[Epoch 6, Batch 19700] loss: 0.592


 99%|█████████▉| 19800/20019 [2:30:16<02:37,  1.39it/s]

[Epoch 6, Batch 19800] loss: 0.588


 99%|█████████▉| 19900/20019 [2:31:24<01:24,  1.41it/s]

[Epoch 6, Batch 19900] loss: 0.605


100%|█████████▉| 19999/20019 [2:32:30<00:12,  1.62it/s]

[Epoch 6, Batch 20000] loss: 0.578


100%|█████████▉| 20000/20019 [2:32:33<00:25,  1.36s/it]

****** Model checkpoint saved at epochs 7 ******


100%|██████████| 20019/20019 [2:32:45<00:00,  2.18it/s]
  0%|          | 100/20019 [00:35<1:55:49,  2.87it/s]

[Epoch 7, Batch   100] loss: 0.578


  1%|          | 200/20019 [01:10<1:55:01,  2.87it/s]

[Epoch 7, Batch   200] loss: 0.597


  1%|▏         | 300/20019 [01:45<1:54:29,  2.87it/s]

[Epoch 7, Batch   300] loss: 0.543


  2%|▏         | 400/20019 [02:20<1:53:41,  2.88it/s]

[Epoch 7, Batch   400] loss: 0.572


  2%|▏         | 500/20019 [02:55<1:53:14,  2.87it/s]

[Epoch 7, Batch   500] loss: 0.537


  3%|▎         | 600/20019 [03:30<1:52:35,  2.87it/s]

[Epoch 7, Batch   600] loss: 0.591


  3%|▎         | 700/20019 [04:05<1:51:53,  2.88it/s]

[Epoch 7, Batch   700] loss: 0.582


  4%|▍         | 800/20019 [04:39<1:51:16,  2.88it/s]

[Epoch 7, Batch   800] loss: 0.541


  4%|▍         | 900/20019 [05:14<1:50:45,  2.88it/s]

[Epoch 7, Batch   900] loss: 0.590


  5%|▍         | 999/20019 [05:49<1:50:35,  2.87it/s]

[Epoch 7, Batch  1000] loss: 0.560


  5%|▍         | 1000/20019 [05:51<4:53:52,  1.08it/s]

****** Model checkpoint saved at epochs 8 ******


  5%|▌         | 1100/20019 [06:27<1:49:35,  2.88it/s]

[Epoch 7, Batch  1100] loss: 0.566


  6%|▌         | 1200/20019 [07:02<1:48:49,  2.88it/s]

[Epoch 7, Batch  1200] loss: 0.540


  6%|▋         | 1300/20019 [07:37<1:48:23,  2.88it/s]

[Epoch 7, Batch  1300] loss: 0.571


  7%|▋         | 1400/20019 [08:12<1:48:04,  2.87it/s]

[Epoch 7, Batch  1400] loss: 0.586


  7%|▋         | 1500/20019 [08:47<1:47:27,  2.87it/s]

[Epoch 7, Batch  1500] loss: 0.584


  8%|▊         | 1600/20019 [09:22<1:47:02,  2.87it/s]

[Epoch 7, Batch  1600] loss: 0.569


  8%|▊         | 1700/20019 [09:57<1:46:19,  2.87it/s]

[Epoch 7, Batch  1700] loss: 0.554


  9%|▉         | 1800/20019 [10:32<1:45:38,  2.87it/s]

[Epoch 7, Batch  1800] loss: 0.578


  9%|▉         | 1900/20019 [11:07<1:45:08,  2.87it/s]

[Epoch 7, Batch  1900] loss: 0.573


 10%|▉         | 1999/20019 [11:41<1:45:11,  2.86it/s]

[Epoch 7, Batch  2000] loss: 0.562


 10%|▉         | 2000/20019 [11:44<4:52:11,  1.03it/s]

****** Model checkpoint saved at epochs 8 ******


 10%|█         | 2100/20019 [12:20<1:43:51,  2.88it/s]

[Epoch 7, Batch  2100] loss: 0.588


 11%|█         | 2200/20019 [12:55<1:43:25,  2.87it/s]

[Epoch 7, Batch  2200] loss: 0.566


 11%|█▏        | 2300/20019 [13:30<1:43:01,  2.87it/s]

[Epoch 7, Batch  2300] loss: 0.552


 12%|█▏        | 2400/20019 [14:05<1:42:29,  2.86it/s]

[Epoch 7, Batch  2400] loss: 0.590


 12%|█▏        | 2500/20019 [14:40<1:41:27,  2.88it/s]

[Epoch 7, Batch  2500] loss: 0.537


 13%|█▎        | 2600/20019 [15:14<1:40:57,  2.88it/s]

[Epoch 7, Batch  2600] loss: 0.575


 13%|█▎        | 2700/20019 [15:49<1:40:19,  2.88it/s]

[Epoch 7, Batch  2700] loss: 0.544


 14%|█▍        | 2800/20019 [16:24<1:39:49,  2.87it/s]

[Epoch 7, Batch  2800] loss: 0.543


 14%|█▍        | 2900/20019 [16:59<1:39:12,  2.88it/s]

[Epoch 7, Batch  2900] loss: 0.601


 15%|█▍        | 2999/20019 [17:34<1:39:08,  2.86it/s]

[Epoch 7, Batch  3000] loss: 0.551


 15%|█▍        | 3000/20019 [17:36<4:39:35,  1.01it/s]

****** Model checkpoint saved at epochs 8 ******


 15%|█▌        | 3100/20019 [18:12<1:37:58,  2.88it/s]

[Epoch 7, Batch  3100] loss: 0.534


 16%|█▌        | 3200/20019 [18:47<1:37:26,  2.88it/s]

[Epoch 7, Batch  3200] loss: 0.589


 16%|█▋        | 3300/20019 [19:22<1:37:03,  2.87it/s]

[Epoch 7, Batch  3300] loss: 0.581


 17%|█▋        | 3400/20019 [19:57<1:36:25,  2.87it/s]

[Epoch 7, Batch  3400] loss: 0.553


 17%|█▋        | 3500/20019 [20:32<1:35:39,  2.88it/s]

[Epoch 7, Batch  3500] loss: 0.570


 18%|█▊        | 3600/20019 [21:07<1:35:11,  2.87it/s]

[Epoch 7, Batch  3600] loss: 0.528


 18%|█▊        | 3700/20019 [21:41<1:34:43,  2.87it/s]

[Epoch 7, Batch  3700] loss: 0.517


 19%|█▉        | 3800/20019 [22:16<1:34:14,  2.87it/s]

[Epoch 7, Batch  3800] loss: 0.583


 19%|█▉        | 3900/20019 [22:51<1:33:19,  2.88it/s]

[Epoch 7, Batch  3900] loss: 0.571


 20%|█▉        | 3999/20019 [23:26<1:33:46,  2.85it/s]

[Epoch 7, Batch  4000] loss: 0.531


 20%|█▉        | 4000/20019 [23:29<4:42:42,  1.06s/it]

****** Model checkpoint saved at epochs 8 ******


 20%|██        | 4100/20019 [24:05<1:34:40,  2.80it/s]

[Epoch 7, Batch  4100] loss: 0.549


 21%|██        | 4200/20019 [24:41<1:31:53,  2.87it/s]

[Epoch 7, Batch  4200] loss: 0.546


 21%|██▏       | 4300/20019 [25:16<1:31:23,  2.87it/s]

[Epoch 7, Batch  4300] loss: 0.556


 22%|██▏       | 4400/20019 [25:52<1:32:35,  2.81it/s]

[Epoch 7, Batch  4400] loss: 0.580


 22%|██▏       | 4500/20019 [26:28<1:30:05,  2.87it/s]

[Epoch 7, Batch  4500] loss: 0.537


 23%|██▎       | 4600/20019 [27:03<1:29:23,  2.87it/s]

[Epoch 7, Batch  4600] loss: 0.539


 23%|██▎       | 4700/20019 [27:38<1:28:52,  2.87it/s]

[Epoch 7, Batch  4700] loss: 0.576


 24%|██▍       | 4800/20019 [28:13<1:30:26,  2.80it/s]

[Epoch 7, Batch  4800] loss: 0.559


 24%|██▍       | 4900/20019 [28:49<1:27:40,  2.87it/s]

[Epoch 7, Batch  4900] loss: 0.565


 25%|██▍       | 4999/20019 [29:24<1:28:22,  2.83it/s]

[Epoch 7, Batch  5000] loss: 0.580


 25%|██▍       | 5000/20019 [29:26<4:16:14,  1.02s/it]

****** Model checkpoint saved at epochs 8 ******


 25%|██▌       | 5100/20019 [30:03<1:27:50,  2.83it/s]

[Epoch 7, Batch  5100] loss: 0.589


 26%|██▌       | 5200/20019 [30:38<1:26:21,  2.86it/s]

[Epoch 7, Batch  5200] loss: 0.582


 26%|██▋       | 5300/20019 [31:14<1:25:17,  2.88it/s]

[Epoch 7, Batch  5300] loss: 0.540


 27%|██▋       | 5400/20019 [31:49<1:26:32,  2.82it/s]

[Epoch 7, Batch  5400] loss: 0.558


 27%|██▋       | 5500/20019 [32:25<1:24:06,  2.88it/s]

[Epoch 7, Batch  5500] loss: 0.575


 28%|██▊       | 5600/20019 [33:01<1:28:50,  2.70it/s]

[Epoch 7, Batch  5600] loss: 0.548


 28%|██▊       | 5700/20019 [33:37<1:23:01,  2.87it/s]

[Epoch 7, Batch  5700] loss: 0.543


 29%|██▉       | 5800/20019 [34:12<1:23:44,  2.83it/s]

[Epoch 7, Batch  5800] loss: 0.588


 29%|██▉       | 5900/20019 [34:49<1:22:14,  2.86it/s]

[Epoch 7, Batch  5900] loss: 0.569


 30%|██▉       | 5999/20019 [35:24<1:24:19,  2.77it/s]

[Epoch 7, Batch  6000] loss: 0.548


 30%|██▉       | 6000/20019 [35:27<3:52:38,  1.00it/s]

****** Model checkpoint saved at epochs 8 ******


 30%|███       | 6100/20019 [36:05<1:23:57,  2.76it/s]

[Epoch 7, Batch  6100] loss: 0.562


 31%|███       | 6200/20019 [36:41<1:20:15,  2.87it/s]

[Epoch 7, Batch  6200] loss: 0.583


 31%|███▏      | 6300/20019 [37:18<1:24:32,  2.70it/s]

[Epoch 7, Batch  6300] loss: 0.551


 32%|███▏      | 6400/20019 [37:54<1:19:06,  2.87it/s]

[Epoch 7, Batch  6400] loss: 0.577


 32%|███▏      | 6500/20019 [38:30<1:22:30,  2.73it/s]

[Epoch 7, Batch  6500] loss: 0.560


 33%|███▎      | 6600/20019 [39:07<1:23:43,  2.67it/s]

[Epoch 7, Batch  6600] loss: 0.546


 33%|███▎      | 6700/20019 [39:43<1:17:19,  2.87it/s]

[Epoch 7, Batch  6700] loss: 0.542


 34%|███▍      | 6800/20019 [40:20<1:20:54,  2.72it/s]

[Epoch 7, Batch  6800] loss: 0.522


 34%|███▍      | 6900/20019 [40:56<1:16:05,  2.87it/s]

[Epoch 7, Batch  6900] loss: 0.564


 35%|███▍      | 6999/20019 [41:33<1:16:26,  2.84it/s]

[Epoch 7, Batch  7000] loss: 0.554


 35%|███▍      | 7000/20019 [41:35<3:35:17,  1.01it/s]

****** Model checkpoint saved at epochs 8 ******


 35%|███▌      | 7100/20019 [42:15<1:23:00,  2.59it/s]

[Epoch 7, Batch  7100] loss: 0.588


 36%|███▌      | 7200/20019 [42:52<1:15:18,  2.84it/s]

[Epoch 7, Batch  7200] loss: 0.575


 36%|███▋      | 7300/20019 [43:29<1:17:11,  2.75it/s]

[Epoch 7, Batch  7300] loss: 0.581


 37%|███▋      | 7400/20019 [44:07<1:15:29,  2.79it/s]

[Epoch 7, Batch  7400] loss: 0.542


 37%|███▋      | 7500/20019 [44:46<1:13:47,  2.83it/s]

[Epoch 7, Batch  7500] loss: 0.564


 38%|███▊      | 7600/20019 [45:23<1:25:13,  2.43it/s]

[Epoch 7, Batch  7600] loss: 0.564


 38%|███▊      | 7700/20019 [46:01<1:36:02,  2.14it/s]

[Epoch 7, Batch  7700] loss: 0.556


 39%|███▉      | 7800/20019 [46:39<1:19:34,  2.56it/s]

[Epoch 7, Batch  7800] loss: 0.578


 39%|███▉      | 7900/20019 [47:17<1:16:45,  2.63it/s]

[Epoch 7, Batch  7900] loss: 0.548


 40%|███▉      | 7999/20019 [47:55<1:12:23,  2.77it/s]

[Epoch 7, Batch  8000] loss: 0.558


 40%|███▉      | 8000/20019 [47:57<3:15:51,  1.02it/s]

****** Model checkpoint saved at epochs 8 ******


 40%|████      | 8100/20019 [48:37<1:13:17,  2.71it/s]

[Epoch 7, Batch  8100] loss: 0.538


 41%|████      | 8200/20019 [49:16<1:15:19,  2.61it/s]

[Epoch 7, Batch  8200] loss: 0.564


 41%|████▏     | 8300/20019 [49:55<1:18:14,  2.50it/s]

[Epoch 7, Batch  8300] loss: 0.494


 42%|████▏     | 8400/20019 [50:34<1:18:18,  2.47it/s]

[Epoch 7, Batch  8400] loss: 0.576


 42%|████▏     | 8500/20019 [51:13<1:13:52,  2.60it/s]

[Epoch 7, Batch  8500] loss: 0.541


 43%|████▎     | 8600/20019 [51:53<1:09:44,  2.73it/s]

[Epoch 7, Batch  8600] loss: 0.559


 43%|████▎     | 8700/20019 [52:31<1:12:05,  2.62it/s]

[Epoch 7, Batch  8700] loss: 0.555


 44%|████▍     | 8800/20019 [53:10<1:10:04,  2.67it/s]

[Epoch 7, Batch  8800] loss: 0.590


 44%|████▍     | 8900/20019 [53:50<1:11:48,  2.58it/s]

[Epoch 7, Batch  8900] loss: 0.561


 45%|████▍     | 8999/20019 [54:29<1:09:04,  2.66it/s]

[Epoch 7, Batch  9000] loss: 0.558


 45%|████▍     | 9000/20019 [54:32<3:37:54,  1.19s/it]

****** Model checkpoint saved at epochs 8 ******


 45%|████▌     | 9100/20019 [55:13<1:08:11,  2.67it/s]

[Epoch 7, Batch  9100] loss: 0.570


 46%|████▌     | 9200/20019 [55:54<1:07:21,  2.68it/s]

[Epoch 7, Batch  9200] loss: 0.580


 46%|████▋     | 9300/20019 [56:33<1:08:16,  2.62it/s]

[Epoch 7, Batch  9300] loss: 0.570


 47%|████▋     | 9400/20019 [57:13<1:13:34,  2.41it/s]

[Epoch 7, Batch  9400] loss: 0.555


 47%|████▋     | 9500/20019 [57:53<1:07:57,  2.58it/s]

[Epoch 7, Batch  9500] loss: 0.545


 48%|████▊     | 9600/20019 [58:33<1:08:14,  2.54it/s]

[Epoch 7, Batch  9600] loss: 0.582


 48%|████▊     | 9700/20019 [59:15<1:09:16,  2.48it/s]

[Epoch 7, Batch  9700] loss: 0.557


 49%|████▉     | 9800/20019 [59:56<1:04:33,  2.64it/s]

[Epoch 7, Batch  9800] loss: 0.578


 49%|████▉     | 9900/20019 [1:00:37<1:11:05,  2.37it/s]

[Epoch 7, Batch  9900] loss: 0.568


 50%|████▉     | 9999/20019 [1:01:17<1:03:26,  2.63it/s]

[Epoch 7, Batch 10000] loss: 0.581


 50%|████▉     | 10000/20019 [1:01:20<3:18:00,  1.19s/it]

****** Model checkpoint saved at epochs 8 ******


 50%|█████     | 10100/20019 [1:02:03<1:05:13,  2.53it/s]

[Epoch 7, Batch 10100] loss: 0.586


 51%|█████     | 10200/20019 [1:02:44<1:15:22,  2.17it/s]

[Epoch 7, Batch 10200] loss: 0.542


 51%|█████▏    | 10300/20019 [1:03:26<1:09:26,  2.33it/s]

[Epoch 7, Batch 10300] loss: 0.566


 52%|█████▏    | 10400/20019 [1:04:07<1:04:14,  2.50it/s]

[Epoch 7, Batch 10400] loss: 0.567


 52%|█████▏    | 10500/20019 [1:04:48<1:04:32,  2.46it/s]

[Epoch 7, Batch 10500] loss: 0.547


 53%|█████▎    | 10600/20019 [1:05:31<1:08:50,  2.28it/s]

[Epoch 7, Batch 10600] loss: 0.568


 53%|█████▎    | 10700/20019 [1:06:12<1:07:00,  2.32it/s]

[Epoch 7, Batch 10700] loss: 0.545


 54%|█████▍    | 10800/20019 [1:06:54<58:33,  2.62it/s]  

[Epoch 7, Batch 10800] loss: 0.525


 54%|█████▍    | 10900/20019 [1:07:36<1:00:51,  2.50it/s]

[Epoch 7, Batch 10900] loss: 0.551


 55%|█████▍    | 10999/20019 [1:08:19<1:04:50,  2.32it/s]

[Epoch 7, Batch 11000] loss: 0.552


 55%|█████▍    | 11000/20019 [1:08:22<3:05:57,  1.24s/it]

****** Model checkpoint saved at epochs 8 ******


 55%|█████▌    | 11100/20019 [1:09:07<58:02,  2.56it/s]  

[Epoch 7, Batch 11100] loss: 0.564


 56%|█████▌    | 11200/20019 [1:09:51<1:00:09,  2.44it/s]

[Epoch 7, Batch 11200] loss: 0.533


 56%|█████▋    | 11300/20019 [1:10:34<59:05,  2.46it/s]  

[Epoch 7, Batch 11300] loss: 0.544


 57%|█████▋    | 11400/20019 [1:11:16<1:02:45,  2.29it/s]

[Epoch 7, Batch 11400] loss: 0.541


 57%|█████▋    | 11500/20019 [1:12:01<1:03:07,  2.25it/s]

[Epoch 7, Batch 11500] loss: 0.536


 58%|█████▊    | 11600/20019 [1:12:45<53:36,  2.62it/s]  

[Epoch 7, Batch 11600] loss: 0.555


 58%|█████▊    | 11700/20019 [1:13:28<1:09:49,  1.99it/s]

[Epoch 7, Batch 11700] loss: 0.558


 59%|█████▉    | 11800/20019 [1:14:12<1:06:23,  2.06it/s]

[Epoch 7, Batch 11800] loss: 0.550


 59%|█████▉    | 11900/20019 [1:14:56<56:36,  2.39it/s]  

[Epoch 7, Batch 11900] loss: 0.554


 60%|█████▉    | 11999/20019 [1:15:40<1:01:38,  2.17it/s]

[Epoch 7, Batch 12000] loss: 0.577


 60%|█████▉    | 12000/20019 [1:15:43<2:44:31,  1.23s/it]

****** Model checkpoint saved at epochs 8 ******


 60%|██████    | 12100/20019 [1:16:30<59:44,  2.21it/s]  

[Epoch 7, Batch 12100] loss: 0.564


 61%|██████    | 12200/20019 [1:17:14<52:07,  2.50it/s]  

[Epoch 7, Batch 12200] loss: 0.590


 61%|██████▏   | 12300/20019 [1:17:59<1:01:34,  2.09it/s]

[Epoch 7, Batch 12300] loss: 0.580


 62%|██████▏   | 12400/20019 [1:18:46<57:51,  2.19it/s]  

[Epoch 7, Batch 12400] loss: 0.576


 62%|██████▏   | 12500/20019 [1:19:33<56:22,  2.22it/s]  

[Epoch 7, Batch 12500] loss: 0.560


 63%|██████▎   | 12600/20019 [1:20:19<1:00:35,  2.04it/s]

[Epoch 7, Batch 12600] loss: 0.568


 63%|██████▎   | 12700/20019 [1:21:07<59:09,  2.06it/s]  

[Epoch 7, Batch 12700] loss: 0.545


 64%|██████▍   | 12800/20019 [1:21:54<56:14,  2.14it/s]  

[Epoch 7, Batch 12800] loss: 0.564


 64%|██████▍   | 12900/20019 [1:22:40<53:07,  2.23it/s]  

[Epoch 7, Batch 12900] loss: 0.566


 65%|██████▍   | 12999/20019 [1:23:28<1:00:22,  1.94it/s]

[Epoch 7, Batch 13000] loss: 0.549


 65%|██████▍   | 13000/20019 [1:23:30<2:20:05,  1.20s/it]

****** Model checkpoint saved at epochs 8 ******


 65%|██████▌   | 13100/20019 [1:24:19<58:44,  1.96it/s]  

[Epoch 7, Batch 13100] loss: 0.560


 66%|██████▌   | 13200/20019 [1:25:08<54:53,  2.07it/s]  

[Epoch 7, Batch 13200] loss: 0.544


 66%|██████▋   | 13300/20019 [1:25:56<52:34,  2.13it/s]  

[Epoch 7, Batch 13300] loss: 0.538


 67%|██████▋   | 13400/20019 [1:26:46<49:04,  2.25it/s]  

[Epoch 7, Batch 13400] loss: 0.563


 67%|██████▋   | 13500/20019 [1:27:35<58:22,  1.86it/s]  

[Epoch 7, Batch 13500] loss: 0.584


 68%|██████▊   | 13600/20019 [1:28:25<54:23,  1.97it/s]  

[Epoch 7, Batch 13600] loss: 0.558


 68%|██████▊   | 13700/20019 [1:29:15<58:43,  1.79it/s]  

[Epoch 7, Batch 13700] loss: 0.550


 69%|██████▉   | 13800/20019 [1:30:06<55:41,  1.86it/s]  

[Epoch 7, Batch 13800] loss: 0.568


 69%|██████▉   | 13900/20019 [1:30:57<54:43,  1.86it/s]  

[Epoch 7, Batch 13900] loss: 0.553


 70%|██████▉   | 13999/20019 [1:31:47<49:38,  2.02it/s]  

[Epoch 7, Batch 14000] loss: 0.546


 70%|██████▉   | 14000/20019 [1:31:50<2:10:53,  1.30s/it]

****** Model checkpoint saved at epochs 8 ******


 70%|███████   | 14100/20019 [1:32:43<53:04,  1.86it/s]  

[Epoch 7, Batch 14100] loss: 0.554


 71%|███████   | 14200/20019 [1:33:35<46:59,  2.06it/s]

[Epoch 7, Batch 14200] loss: 0.557


 71%|███████▏  | 14300/20019 [1:34:29<53:01,  1.80it/s]  

[Epoch 7, Batch 14300] loss: 0.544


 72%|███████▏  | 14400/20019 [1:35:23<1:01:55,  1.51it/s]

[Epoch 7, Batch 14400] loss: 0.544


 72%|███████▏  | 14500/20019 [1:36:16<51:21,  1.79it/s]  

[Epoch 7, Batch 14500] loss: 0.568


 73%|███████▎  | 14600/20019 [1:37:09<44:29,  2.03it/s]  

[Epoch 7, Batch 14600] loss: 0.546


 73%|███████▎  | 14700/20019 [1:38:04<43:30,  2.04it/s]  

[Epoch 7, Batch 14700] loss: 0.576


 74%|███████▍  | 14800/20019 [1:39:00<47:21,  1.84it/s]

[Epoch 7, Batch 14800] loss: 0.537


 74%|███████▍  | 14900/20019 [1:39:55<50:47,  1.68it/s]

[Epoch 7, Batch 14900] loss: 0.556


 75%|███████▍  | 14999/20019 [1:40:52<43:45,  1.91it/s]  

[Epoch 7, Batch 15000] loss: 0.554


 75%|███████▍  | 15000/20019 [1:40:55<1:47:36,  1.29s/it]

****** Model checkpoint saved at epochs 8 ******


 75%|███████▌  | 15100/20019 [1:41:51<47:45,  1.72it/s]  

[Epoch 7, Batch 15100] loss: 0.568


 76%|███████▌  | 15200/20019 [1:42:48<46:44,  1.72it/s]

[Epoch 7, Batch 15200] loss: 0.584


 76%|███████▋  | 15300/20019 [1:43:45<47:16,  1.66it/s]

[Epoch 7, Batch 15300] loss: 0.567


 77%|███████▋  | 15400/20019 [1:44:42<45:11,  1.70it/s]

[Epoch 7, Batch 15400] loss: 0.579


 77%|███████▋  | 15500/20019 [1:45:40<45:39,  1.65it/s]  

[Epoch 7, Batch 15500] loss: 0.547


 78%|███████▊  | 15600/20019 [1:46:38<45:38,  1.61it/s]

[Epoch 7, Batch 15600] loss: 0.555


 78%|███████▊  | 15700/20019 [1:47:37<44:37,  1.61it/s]

[Epoch 7, Batch 15700] loss: 0.555


 79%|███████▉  | 15800/20019 [1:48:36<40:07,  1.75it/s]

[Epoch 7, Batch 15800] loss: 0.568


 79%|███████▉  | 15900/20019 [1:49:37<44:10,  1.55it/s]

[Epoch 7, Batch 15900] loss: 0.585


 80%|███████▉  | 15999/20019 [1:50:36<35:36,  1.88it/s]

[Epoch 7, Batch 16000] loss: 0.545


 80%|███████▉  | 16000/20019 [1:50:39<1:24:13,  1.26s/it]

****** Model checkpoint saved at epochs 8 ******


 80%|████████  | 16100/20019 [1:51:40<35:51,  1.82it/s]  

[Epoch 7, Batch 16100] loss: 0.573


 81%|████████  | 16200/20019 [1:52:41<44:16,  1.44it/s]

[Epoch 7, Batch 16200] loss: 0.586


 81%|████████▏ | 16300/20019 [1:53:43<40:20,  1.54it/s]

[Epoch 7, Batch 16300] loss: 0.549


 82%|████████▏ | 16400/20019 [1:54:45<43:24,  1.39it/s]

[Epoch 7, Batch 16400] loss: 0.530


 82%|████████▏ | 16500/20019 [1:55:47<37:41,  1.56it/s]

[Epoch 7, Batch 16500] loss: 0.563


 83%|████████▎ | 16600/20019 [1:56:48<39:45,  1.43it/s]

[Epoch 7, Batch 16600] loss: 0.560


 83%|████████▎ | 16700/20019 [1:57:50<35:50,  1.54it/s]

[Epoch 7, Batch 16700] loss: 0.575


 84%|████████▍ | 16800/20019 [1:58:53<37:27,  1.43it/s]

[Epoch 7, Batch 16800] loss: 0.554


 84%|████████▍ | 16900/20019 [1:59:56<32:57,  1.58it/s]

[Epoch 7, Batch 16900] loss: 0.551


 85%|████████▍ | 16999/20019 [2:00:58<28:20,  1.78it/s]

[Epoch 7, Batch 17000] loss: 0.537


 85%|████████▍ | 17000/20019 [2:01:01<1:03:40,  1.27s/it]

****** Model checkpoint saved at epochs 8 ******


 85%|████████▌ | 17100/20019 [2:02:04<29:04,  1.67it/s]  

[Epoch 7, Batch 17100] loss: 0.534


 86%|████████▌ | 17200/20019 [2:03:09<30:52,  1.52it/s]

[Epoch 7, Batch 17200] loss: 0.555


 86%|████████▋ | 17300/20019 [2:04:11<31:07,  1.46it/s]

[Epoch 7, Batch 17300] loss: 0.558


 87%|████████▋ | 17400/20019 [2:05:15<33:33,  1.30it/s]

[Epoch 7, Batch 17400] loss: 0.553


 87%|████████▋ | 17500/20019 [2:06:19<29:21,  1.43it/s]

[Epoch 7, Batch 17500] loss: 0.562


 88%|████████▊ | 17600/20019 [2:07:23<26:01,  1.55it/s]

[Epoch 7, Batch 17600] loss: 0.558


 88%|████████▊ | 17700/20019 [2:08:28<28:17,  1.37it/s]

[Epoch 7, Batch 17700] loss: 0.554


 89%|████████▉ | 17800/20019 [2:09:33<25:37,  1.44it/s]

[Epoch 7, Batch 17800] loss: 0.542


 89%|████████▉ | 17900/20019 [2:10:39<24:20,  1.45it/s]

[Epoch 7, Batch 17900] loss: 0.543


 90%|████████▉ | 17999/20019 [2:11:44<20:20,  1.65it/s]

[Epoch 7, Batch 18000] loss: 0.533


 90%|████████▉ | 18000/20019 [2:11:47<44:45,  1.33s/it]

****** Model checkpoint saved at epochs 8 ******


 90%|█████████ | 18100/20019 [2:12:52<18:51,  1.70it/s]

[Epoch 7, Batch 18100] loss: 0.567


 91%|█████████ | 18200/20019 [2:13:59<20:58,  1.45it/s]

[Epoch 7, Batch 18200] loss: 0.568


 91%|█████████▏| 18300/20019 [2:15:06<18:14,  1.57it/s]

[Epoch 7, Batch 18300] loss: 0.570


 92%|█████████▏| 18400/20019 [2:16:12<17:40,  1.53it/s]

[Epoch 7, Batch 18400] loss: 0.588


 92%|█████████▏| 18500/20019 [2:17:20<15:52,  1.59it/s]

[Epoch 7, Batch 18500] loss: 0.577


 93%|█████████▎| 18600/20019 [2:18:27<15:41,  1.51it/s]

[Epoch 7, Batch 18600] loss: 0.554


 93%|█████████▎| 18700/20019 [2:19:34<16:48,  1.31it/s]

[Epoch 7, Batch 18700] loss: 0.555


 94%|█████████▍| 18800/20019 [2:20:41<14:01,  1.45it/s]

[Epoch 7, Batch 18800] loss: 0.559


 94%|█████████▍| 18900/20019 [2:21:48<13:13,  1.41it/s]

[Epoch 7, Batch 18900] loss: 0.562


 95%|█████████▍| 18999/20019 [2:22:54<12:31,  1.36it/s]

[Epoch 7, Batch 19000] loss: 0.555


 95%|█████████▍| 19000/20019 [2:22:57<22:10,  1.31s/it]

****** Model checkpoint saved at epochs 8 ******


 95%|█████████▌| 19100/20019 [2:24:06<11:16,  1.36it/s]

[Epoch 7, Batch 19100] loss: 0.578


 96%|█████████▌| 19200/20019 [2:25:14<09:46,  1.40it/s]

[Epoch 7, Batch 19200] loss: 0.568


 96%|█████████▋| 19300/20019 [2:26:21<08:06,  1.48it/s]

[Epoch 7, Batch 19300] loss: 0.571


 97%|█████████▋| 19400/20019 [2:27:29<07:43,  1.34it/s]

[Epoch 7, Batch 19400] loss: 0.558


 97%|█████████▋| 19500/20019 [2:28:36<07:00,  1.24it/s]

[Epoch 7, Batch 19500] loss: 0.566


 98%|█████████▊| 19600/20019 [2:29:43<04:31,  1.55it/s]

[Epoch 7, Batch 19600] loss: 0.557


 98%|█████████▊| 19700/20019 [2:30:51<03:26,  1.55it/s]

[Epoch 7, Batch 19700] loss: 0.572


 99%|█████████▉| 19800/20019 [2:31:57<02:19,  1.57it/s]

[Epoch 7, Batch 19800] loss: 0.574


 99%|█████████▉| 19900/20019 [2:33:05<01:14,  1.59it/s]

[Epoch 7, Batch 19900] loss: 0.530


100%|█████████▉| 19999/20019 [2:34:12<00:13,  1.49it/s]

[Epoch 7, Batch 20000] loss: 0.554


100%|█████████▉| 20000/20019 [2:34:15<00:26,  1.39s/it]

****** Model checkpoint saved at epochs 8 ******


100%|██████████| 20019/20019 [2:34:28<00:00,  2.16it/s]


****** Finished Fine-tuning ******
****** Model checkpoint saved at epochs 8 ******
