## Train a character-level GPT on some text data

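The inputs here are simple text files, which we chop up into individual characters and then train a GPT on. So you could say this is a char-transformer instead of a char-rnn. Doesn't quite roll off the tongue as well. In this example we will feed it some Shakespeare and train it to predict the text one character at a time. (Note: the cells below adapt this recipe: instead of single characters they split the text into tokens with a pretrained transformer tokenizer, and they train one model per text file found in a data directory rather than on Shakespeare.)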

In [1]:
from allennlp.data.token_indexers import TokenIndexer, PretrainedTransformerIndexer
from allennlp.data.tokenizers import Token, Tokenizer, PretrainedTransformerTokenizer

import nltk
#nltk.download('punkt')
import numpy as np
from os import listdir
from os.path import join as pathjoin
import torch
import torch.nn as nn
from torch.nn import functional as F
import tqdm

from minGPT.mingpt.model import GPT, GPTConfig
from minGPT.mingpt.trainer import Trainer, TrainerConfig
from minGPT.mingpt.utils import sample, set_seed
# make deterministic: fix the seed once, before any dataset or model is built
# (presumably seeds the random number generators used in training -- see minGPT utils)
set_seed(42)

In [2]:
# Data directory on the training machine (not referenced by the cells shown here).
DATA_DIR = '/home/mlepekhin/data/big'
#MODELS_DIR = '/home/mlepekhin/models/big'
# Name of the pretrained transformer model; passed to PretrainedTransformerTokenizer below.
transformer_model = 'DeepPavlov/rubert-base-cased'

In [3]:
import math
from torch.utils.data import Dataset


def detokenize(tokens):
    """Join word-piece tokens back into a plain string.

    The first and last elements of ``tokens`` are dropped, the remaining
    items are converted with ``str`` and joined by single spaces, and every
    ``' ##'`` sequence is then removed so that a piece starting with ``##``
    is glued onto the piece before it.
    """
    inner = tokens[1:-1]
    spaced = ' '.join(map(str, inner))
    return spaced.replace(' ##', '')

class BPEDataset(Dataset):
    """Sliding-window next-symbol prediction dataset over a sequence.

    ``data`` is any indexable sequence of hashable symbols (a string of
    characters, or a list of token strings). Every distinct symbol gets an
    integer id, assigned in sorted order. Item ``idx`` is the window of
    ``block_size + 1`` consecutive symbols starting at ``idx``, returned as
    an ``(x, y)`` pair where ``y`` is ``x`` shifted by one position.

    Attributes set in ``__init__``:
        stoi: dict mapping symbol -> integer id.
        itos: dict mapping integer id -> symbol.
        block_size: number of symbols in each ``x`` / ``y``.
        vocab_size: number of distinct symbols in ``data``.
        data: the sequence passed in, kept as-is.
    """

    def __init__(self, data, block_size):
        """Build the vocabulary from ``data`` and remember the window size.

        Prints the total and distinct symbol counts as a side effect.
        """
        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        # The message says "characters" for historical reasons; the items may be tokens.
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        """Return the number of full windows of ``block_size + 1`` symbols.

        Clamped at zero: when ``data`` is not longer than ``block_size`` there
        is no complete window, and a negative value would make ``len()`` raise.
        """
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` training pair for the window starting at ``idx``.

        Both are 1-D ``torch.long`` tensors of length ``block_size`` (for any
        ``idx`` below ``len(self)``).

        Arrange data and targets so that the first i elements of x
        will be asked to predict the i-th element of y. Notice that
        the eventual language model will actually make block_size
        individual predictions at the same time based on this data,
        so we are being clever and amortizing the cost of the forward
        pass of the network. So for example if block_size is 4, then
        we could e.g. sample a chunk of text "hello", the integers in
        x will correspond to "hell" and in y will be "ello". This will
        then actually "multitask" 4 separate examples at the same time
        in the language model:
        - given just "h", please predict "e" as next
        - given "he" please predict "l" next
        - given "hel" predict "l" next
        - given "hell" predict "o" next

        In addition, because the DataLoader will create batches of examples,
        every forward/backward pass during training will simultaneously train
        a LOT of predictions, amortizing a lot of computation. In particular,
        for a batched input of integers X (B, T) where B is batch size and
        T is block_size and Y (B, T), the network will during training be
        simultaneously training to make B*T predictions, all at once! Of course,
        at test time we can parallelize across batch B, but unlike during training
        we cannot parallelize across the time dimension T - we have to run
        a forward pass of the network to recover the next single symbol of the
        sequence along each batch dimension, and repeatedly always feed in a next
        symbol to get the next one.

        So yes there is a big asymmetry between train/test time of autoregressive
        models. During training we can go B*T at a time with every forward pass,
        but during test time we can only go B at a time, T times, with T forward
        passes.
        """
        # grab a chunk of (block_size + 1) symbols from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every symbol to an integer
        dix = [self.stoi[s] for s in chunk]
        # inputs are all but the last symbol, targets all but the first
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y


In [4]:
# Window length handed to BPEDataset by train_gpt_generator (symbols per training example).
block_size = 128
# Tokenizer built from the model name above; train_gpt_generator uses it to tokenize each sentence.
tokenizer = PretrainedTransformerTokenizer(transformer_model)
#indexer = PretrainedTransformerIndexer(transformer_model)

In [8]:
def train_gpt_generator(train_text_file, state_dict_file, n_layer=8, n_head=8, n_embd=512,
                        max_epochs=1, batch_size=256):
    """Train a minGPT model on the tokens of one text file and save its weights.

    The file is split into sentences with NLTK, every sentence is tokenized
    with the module-level ``tokenizer`` (dropping the first and last token of
    each sentence), and the resulting token strings are wrapped in a
    ``BPEDataset`` whose window length is the module-level ``block_size``.

    Args:
        train_text_file: path of the text file to train on.
        state_dict_file: path the trained model's ``state_dict`` is written to.
        n_layer: forwarded to ``GPTConfig``.
        n_head: forwarded to ``GPTConfig``.
        n_embd: forwarded to ``GPTConfig``.
        max_epochs: forwarded to ``TrainerConfig``.
        batch_size: forwarded to ``TrainerConfig``; also scales ``warmup_tokens``.

    Raises:
        ValueError: if the file yields no tokens at all.
    """
    # Read through a context manager so the file handle is closed right away.
    # NOTE(review): relies on the platform default encoding, as before -- confirm
    # it matches the encoding of the corpus files.
    with open(train_text_file, 'r') as text_file:
        text = text_file.read()
    text_sentences = nltk.tokenize.sent_tokenize(text)

    # [1:-1] drops the first and last token of each sentence -- presumably the
    # special start/end markers added by the tokenizer; confirm.
    # str(token) is assumed to give the token's text.
    tokens = [
        str(token)
        for sent in text_sentences
        for token in tokenizer.tokenize(sent)[1:-1]
    ]
    if not tokens:
        raise ValueError('no tokens found in %s' % train_text_file)
    train_dataset = BPEDataset(tokens, block_size)

    mconf = GPTConfig(
        train_dataset.vocab_size, train_dataset.block_size,
        n_layer=n_layer, n_head=n_head, n_embd=n_embd
    )
    model = GPT(mconf)
    # NOTE(review): final_tokens uses a fixed factor of 2, independent of
    # max_epochs -- confirm this is the intended learning-rate decay horizon.
    tconf = TrainerConfig(
        max_epochs=max_epochs, batch_size=batch_size, learning_rate=6e-4,
        lr_decay=True, warmup_tokens=batch_size*20, final_tokens=2*len(train_dataset)*block_size,
        num_workers=3
    )
    # No test dataset is passed (third argument is None).
    trainer = Trainer(model, train_dataset, None, tconf)
    trainer.train()
    torch.save(model.state_dict(), state_dict_file)

In [9]:
# Directory with one sub-directory per language; the training loop below lists the files in the LANG one.
GENRE_DATA_DIR = '/home/mlepekhin/data/big/genre'
# Directory the trained weights are saved under, also grouped by language.
GPT_MODELS_DIR = '/home/mlepekhin/models/mini_gpt_big_bpe/'
# Language sub-directory to process.
LANG = 'ru'

In [10]:
#train_gpt_generator(
#        pathjoin(GENRE_DATA_DIR, LANG, 'A1.txt'),
#        pathjoin(GPT_MODELS_DIR, LANG, 'A1')
#)

In [11]:
# Train one model per file found in the selected language's data directory.
for train_text_file in tqdm.tqdm(listdir(pathjoin(GENRE_DATA_DIR, LANG))):
    # Drop the last four characters of the file name (presumably a ".txt"
    # extension) to get the label the weights are saved under.
    label = train_text_file[:-4]
    train_gpt_generator(
        pathjoin(GENRE_DATA_DIR, LANG, train_text_file),
        pathjoin(GPT_MODELS_DIR, LANG, label)
    )

  0%|          | 0/10 [00:00<?, ?it/s]

data has 732852 characters, 38863 unique.




epoch 1 iter 0: train loss 10.66786. lr 6.000000e-04:   0%|          | 0/2863 [00:07<?, ?it/s][A
epoch 1 iter 0: train loss 10.66786. lr 6.000000e-04:   0%|          | 1/2863 [00:07<6:16:40,  7.90s/it][A
epoch 1 iter 1: train loss 9.92521. lr 5.999998e-04:   0%|          | 1/2863 [00:08<6:16:40,  7.90s/it] [A
epoch 1 iter 1: train loss 9.92521. lr 5.999998e-04:   0%|          | 2/2863 [00:08<4:33:06,  5.73s/it][A
epoch 1 iter 2: train loss 9.49199. lr 5.999996e-04:   0%|          | 2/2863 [00:09<4:33:06,  5.73s/it][A
epoch 1 iter 2: train loss 9.49199. lr 5.999996e-04:   0%|          | 3/2863 [00:09<3:20:35,  4.21s/it][A
epoch 1 iter 3: train loss 9.22060. lr 5.999993e-04:   0%|          | 3/2863 [00:09<3:20:35,  4.21s/it][A
epoch 1 iter 3: train loss 9.22060. lr 5.999993e-04:   0%|          | 4/2863 [00:09<2:29:50,  3.14s/it][A
epoch 1 iter 4: train loss 8.97282. lr 5.999989e-04:   0%|          | 4/2863 [00:10<2:29:50,  3.14s/it][A
epoch 1 iter 4: train loss 8.97282. lr 5.9

epoch 1 iter 36: train loss 7.53150. lr 5.999387e-04:   1%|▏         | 37/2863 [00:32<34:18,  1.37it/s][A
epoch 1 iter 37: train loss 7.51890. lr 5.999353e-04:   1%|▏         | 37/2863 [00:32<34:18,  1.37it/s][A
epoch 1 iter 37: train loss 7.51890. lr 5.999353e-04:   1%|▏         | 38/2863 [00:32<34:38,  1.36it/s][A
epoch 1 iter 38: train loss 7.47893. lr 5.999318e-04:   1%|▏         | 38/2863 [00:33<34:38,  1.36it/s][A
epoch 1 iter 38: train loss 7.47893. lr 5.999318e-04:   1%|▏         | 39/2863 [00:33<34:44,  1.35it/s][A
epoch 1 iter 39: train loss 7.53272. lr 5.999283e-04:   1%|▏         | 39/2863 [00:34<34:44,  1.35it/s][A
epoch 1 iter 39: train loss 7.53272. lr 5.999283e-04:   1%|▏         | 40/2863 [00:34<34:53,  1.35it/s][A
epoch 1 iter 40: train loss 7.46947. lr 5.999246e-04:   1%|▏         | 40/2863 [00:35<34:53,  1.35it/s][A
epoch 1 iter 40: train loss 7.46947. lr 5.999246e-04:   1%|▏         | 41/2863 [00:35<35:04,  1.34it/s][A
epoch 1 iter 41: train loss 7.43743. 

epoch 1 iter 74: train loss 7.12664. lr 5.997470e-04:   3%|▎         | 75/2863 [01:00<35:15,  1.32it/s][A
epoch 1 iter 75: train loss 7.16155. lr 5.997401e-04:   3%|▎         | 75/2863 [01:01<35:15,  1.32it/s][A
epoch 1 iter 75: train loss 7.16155. lr 5.997401e-04:   3%|▎         | 76/2863 [01:01<35:05,  1.32it/s][A
epoch 1 iter 76: train loss 7.12394. lr 5.997332e-04:   3%|▎         | 76/2863 [01:02<35:05,  1.32it/s][A
epoch 1 iter 76: train loss 7.12394. lr 5.997332e-04:   3%|▎         | 77/2863 [01:02<35:09,  1.32it/s][A
epoch 1 iter 77: train loss 7.14290. lr 5.997263e-04:   3%|▎         | 77/2863 [01:03<35:09,  1.32it/s][A
epoch 1 iter 77: train loss 7.14290. lr 5.997263e-04:   3%|▎         | 78/2863 [01:03<35:08,  1.32it/s][A
epoch 1 iter 78: train loss 7.13642. lr 5.997192e-04:   3%|▎         | 78/2863 [01:03<35:08,  1.32it/s][A
epoch 1 iter 78: train loss 7.13642. lr 5.997192e-04:   3%|▎         | 79/2863 [01:03<35:03,  1.32it/s][A
epoch 1 iter 79: train loss 7.07333. 

epoch 1 iter 112: train loss 6.77600. lr 5.994249e-04:   4%|▍         | 113/2863 [01:29<35:10,  1.30it/s][A
epoch 1 iter 113: train loss 6.76185. lr 5.994146e-04:   4%|▍         | 113/2863 [01:30<35:10,  1.30it/s][A
epoch 1 iter 113: train loss 6.76185. lr 5.994146e-04:   4%|▍         | 114/2863 [01:30<35:01,  1.31it/s][A
epoch 1 iter 114: train loss 6.79693. lr 5.994043e-04:   4%|▍         | 114/2863 [01:31<35:01,  1.31it/s][A
epoch 1 iter 114: train loss 6.79693. lr 5.994043e-04:   4%|▍         | 115/2863 [01:31<34:59,  1.31it/s][A
epoch 1 iter 115: train loss 6.73730. lr 5.993939e-04:   4%|▍         | 115/2863 [01:32<34:59,  1.31it/s][A
epoch 1 iter 115: train loss 6.73730. lr 5.993939e-04:   4%|▍         | 116/2863 [01:32<34:54,  1.31it/s][A
epoch 1 iter 116: train loss 6.77495. lr 5.993834e-04:   4%|▍         | 116/2863 [01:32<34:54,  1.31it/s][A
epoch 1 iter 116: train loss 6.77495. lr 5.993834e-04:   4%|▍         | 117/2863 [01:32<34:41,  1.32it/s][A
epoch 1 iter 117: t

epoch 1 iter 150: train loss 6.49547. lr 5.989726e-04:   5%|▌         | 150/2863 [01:57<31:33,  1.43it/s][A
epoch 1 iter 150: train loss 6.49547. lr 5.989726e-04:   5%|▌         | 151/2863 [01:57<31:06,  1.45it/s][A
epoch 1 iter 151: train loss 6.46092. lr 5.989589e-04:   5%|▌         | 151/2863 [01:58<31:06,  1.45it/s][A
epoch 1 iter 151: train loss 6.46092. lr 5.989589e-04:   5%|▌         | 152/2863 [01:58<30:47,  1.47it/s][A
epoch 1 iter 152: train loss 6.48244. lr 5.989451e-04:   5%|▌         | 152/2863 [01:58<30:47,  1.47it/s][A
epoch 1 iter 152: train loss 6.48244. lr 5.989451e-04:   5%|▌         | 153/2863 [01:58<30:34,  1.48it/s][A
epoch 1 iter 153: train loss 6.52008. lr 5.989313e-04:   5%|▌         | 153/2863 [01:59<30:34,  1.48it/s][A
epoch 1 iter 153: train loss 6.52008. lr 5.989313e-04:   5%|▌         | 154/2863 [01:59<30:25,  1.48it/s][A
epoch 1 iter 154: train loss 6.47863. lr 5.989174e-04:   5%|▌         | 154/2863 [02:00<30:25,  1.48it/s][A
epoch 1 iter 154: t

epoch 1 iter 187: train loss 6.24769. lr 5.984072e-04:   7%|▋         | 188/2863 [02:22<29:56,  1.49it/s][A
epoch 1 iter 188: train loss 6.25062. lr 5.983902e-04:   7%|▋         | 188/2863 [02:23<29:56,  1.49it/s][A
epoch 1 iter 188: train loss 6.25062. lr 5.983902e-04:   7%|▋         | 189/2863 [02:23<29:52,  1.49it/s][A
epoch 1 iter 189: train loss 6.25623. lr 5.983731e-04:   7%|▋         | 189/2863 [02:23<29:52,  1.49it/s][A
epoch 1 iter 189: train loss 6.25623. lr 5.983731e-04:   7%|▋         | 190/2863 [02:23<29:49,  1.49it/s][A
epoch 1 iter 190: train loss 6.30489. lr 5.983560e-04:   7%|▋         | 190/2863 [02:24<29:49,  1.49it/s][A
epoch 1 iter 190: train loss 6.30489. lr 5.983560e-04:   7%|▋         | 191/2863 [02:24<29:47,  1.49it/s][A
epoch 1 iter 191: train loss 6.27813. lr 5.983387e-04:   7%|▋         | 191/2863 [02:25<29:47,  1.49it/s][A
epoch 1 iter 191: train loss 6.27813. lr 5.983387e-04:   7%|▋         | 192/2863 [02:25<29:45,  1.50it/s][A
epoch 1 iter 192: t

epoch 1 iter 225: train loss 6.10261. lr 5.976985e-04:   8%|▊         | 225/2863 [02:47<29:20,  1.50it/s][A
epoch 1 iter 225: train loss 6.10261. lr 5.976985e-04:   8%|▊         | 226/2863 [02:47<29:19,  1.50it/s][A
epoch 1 iter 226: train loss 6.05043. lr 5.976781e-04:   8%|▊         | 226/2863 [02:48<29:19,  1.50it/s][A
epoch 1 iter 226: train loss 6.05043. lr 5.976781e-04:   8%|▊         | 227/2863 [02:48<29:18,  1.50it/s][A
epoch 1 iter 227: train loss 6.08620. lr 5.976576e-04:   8%|▊         | 227/2863 [02:49<29:18,  1.50it/s][A
epoch 1 iter 227: train loss 6.08620. lr 5.976576e-04:   8%|▊         | 228/2863 [02:49<29:17,  1.50it/s][A
epoch 1 iter 228: train loss 6.11020. lr 5.976370e-04:   8%|▊         | 228/2863 [02:49<29:17,  1.50it/s][A
epoch 1 iter 228: train loss 6.11020. lr 5.976370e-04:   8%|▊         | 229/2863 [02:49<29:16,  1.50it/s][A
epoch 1 iter 229: train loss 6.04832. lr 5.976163e-04:   8%|▊         | 229/2863 [02:50<29:16,  1.50it/s][A
epoch 1 iter 229: t

epoch 1 iter 262: train loss 5.89503. lr 5.968840e-04:   9%|▉         | 263/2863 [03:12<28:53,  1.50it/s][A
epoch 1 iter 263: train loss 5.90340. lr 5.968603e-04:   9%|▉         | 263/2863 [03:13<28:53,  1.50it/s][A
epoch 1 iter 263: train loss 5.90340. lr 5.968603e-04:   9%|▉         | 264/2863 [03:13<28:53,  1.50it/s][A
epoch 1 iter 264: train loss 5.87131. lr 5.968365e-04:   9%|▉         | 264/2863 [03:14<28:53,  1.50it/s][A
epoch 1 iter 264: train loss 5.87131. lr 5.968365e-04:   9%|▉         | 265/2863 [03:14<28:53,  1.50it/s][A
epoch 1 iter 265: train loss 5.92271. lr 5.968126e-04:   9%|▉         | 265/2863 [03:14<28:53,  1.50it/s][A
epoch 1 iter 265: train loss 5.92271. lr 5.968126e-04:   9%|▉         | 266/2863 [03:14<30:22,  1.42it/s][A
epoch 1 iter 266: train loss 5.86969. lr 5.967886e-04:   9%|▉         | 266/2863 [03:15<30:22,  1.42it/s][A
epoch 1 iter 266: train loss 5.86969. lr 5.967886e-04:   9%|▉         | 267/2863 [03:15<29:55,  1.45it/s][A
epoch 1 iter 267: t

epoch 1 iter 300: train loss 5.71467. lr 5.959201e-04:  10%|█         | 300/2863 [03:38<28:42,  1.49it/s][A
epoch 1 iter 300: train loss 5.71467. lr 5.959201e-04:  11%|█         | 301/2863 [03:38<28:38,  1.49it/s][A
epoch 1 iter 301: train loss 5.67439. lr 5.958930e-04:  11%|█         | 301/2863 [03:38<28:38,  1.49it/s][A
epoch 1 iter 301: train loss 5.67439. lr 5.958930e-04:  11%|█         | 302/2863 [03:38<28:34,  1.49it/s][A
epoch 1 iter 302: train loss 5.72518. lr 5.958658e-04:  11%|█         | 302/2863 [03:39<28:34,  1.49it/s][A
epoch 1 iter 302: train loss 5.72518. lr 5.958658e-04:  11%|█         | 303/2863 [03:39<28:31,  1.50it/s][A
epoch 1 iter 303: train loss 5.70361. lr 5.958385e-04:  11%|█         | 303/2863 [03:40<28:31,  1.50it/s][A
epoch 1 iter 303: train loss 5.70361. lr 5.958385e-04:  11%|█         | 304/2863 [03:40<28:30,  1.50it/s][A
epoch 1 iter 304: train loss 5.70855. lr 5.958111e-04:  11%|█         | 304/2863 [03:40<28:30,  1.50it/s][A
epoch 1 iter 304: t

epoch 1 iter 337: train loss 5.47733. lr 5.948579e-04:  12%|█▏        | 338/2863 [04:03<28:08,  1.50it/s][A
epoch 1 iter 338: train loss 5.57617. lr 5.948275e-04:  12%|█▏        | 338/2863 [04:03<28:08,  1.50it/s][A
epoch 1 iter 338: train loss 5.57617. lr 5.948275e-04:  12%|█▏        | 339/2863 [04:03<28:07,  1.50it/s][A
epoch 1 iter 339: train loss 5.51888. lr 5.947970e-04:  12%|█▏        | 339/2863 [04:04<28:07,  1.50it/s][A
epoch 1 iter 339: train loss 5.51888. lr 5.947970e-04:  12%|█▏        | 340/2863 [04:04<28:07,  1.50it/s][A
epoch 1 iter 340: train loss 5.58917. lr 5.947664e-04:  12%|█▏        | 340/2863 [04:05<28:07,  1.50it/s][A
epoch 1 iter 340: train loss 5.58917. lr 5.947664e-04:  12%|█▏        | 341/2863 [04:05<28:07,  1.49it/s][A
epoch 1 iter 341: train loss 5.43871. lr 5.947358e-04:  12%|█▏        | 341/2863 [04:05<28:07,  1.49it/s][A
epoch 1 iter 341: train loss 5.43871. lr 5.947358e-04:  12%|█▏        | 342/2863 [04:05<28:07,  1.49it/s][A
epoch 1 iter 342: t

epoch 1 iter 375: train loss 5.34986. lr 5.936404e-04:  13%|█▎        | 375/2863 [04:29<30:11,  1.37it/s][A
epoch 1 iter 375: train loss 5.34986. lr 5.936404e-04:  13%|█▎        | 376/2863 [04:29<30:36,  1.35it/s][A
epoch 1 iter 376: train loss 5.39611. lr 5.936067e-04:  13%|█▎        | 376/2863 [04:29<30:36,  1.35it/s][A
epoch 1 iter 376: train loss 5.39611. lr 5.936067e-04:  13%|█▎        | 377/2863 [04:29<30:55,  1.34it/s][A
epoch 1 iter 377: train loss 5.31446. lr 5.935728e-04:  13%|█▎        | 377/2863 [04:30<30:55,  1.34it/s][A
epoch 1 iter 377: train loss 5.31446. lr 5.935728e-04:  13%|█▎        | 378/2863 [04:30<32:30,  1.27it/s][A
epoch 1 iter 378: train loss 5.35244. lr 5.935389e-04:  13%|█▎        | 378/2863 [04:31<32:30,  1.27it/s][A
epoch 1 iter 378: train loss 5.35244. lr 5.935389e-04:  13%|█▎        | 379/2863 [04:31<32:08,  1.29it/s][A
epoch 1 iter 379: train loss 5.30113. lr 5.935048e-04:  13%|█▎        | 379/2863 [04:32<32:08,  1.29it/s][A
epoch 1 iter 379: t

epoch 1 iter 412: train loss 5.14287. lr 5.923323e-04:  14%|█▍        | 413/2863 [04:57<31:16,  1.31it/s][A
epoch 1 iter 413: train loss 5.10982. lr 5.922952e-04:  14%|█▍        | 413/2863 [04:58<31:16,  1.31it/s][A
epoch 1 iter 413: train loss 5.10982. lr 5.922952e-04:  14%|█▍        | 414/2863 [04:58<31:14,  1.31it/s][A
epoch 1 iter 414: train loss 5.12314. lr 5.922581e-04:  14%|█▍        | 414/2863 [04:59<31:14,  1.31it/s][A
epoch 1 iter 414: train loss 5.12314. lr 5.922581e-04:  14%|█▍        | 415/2863 [04:59<31:12,  1.31it/s][A
epoch 1 iter 415: train loss 5.09799. lr 5.922209e-04:  14%|█▍        | 415/2863 [04:59<31:12,  1.31it/s][A
epoch 1 iter 415: train loss 5.09799. lr 5.922209e-04:  15%|█▍        | 416/2863 [04:59<31:04,  1.31it/s][A
epoch 1 iter 416: train loss 5.14946. lr 5.921836e-04:  15%|█▍        | 416/2863 [05:00<31:04,  1.31it/s][A
epoch 1 iter 416: train loss 5.14946. lr 5.921836e-04:  15%|█▍        | 417/2863 [05:00<31:02,  1.31it/s][A
epoch 1 iter 417: t

epoch 1 iter 450: train loss 4.95840. lr 5.908633e-04:  16%|█▌        | 450/2863 [05:26<30:35,  1.31it/s][A
epoch 1 iter 450: train loss 4.95840. lr 5.908633e-04:  16%|█▌        | 451/2863 [05:26<30:34,  1.31it/s][A
epoch 1 iter 451: train loss 4.88680. lr 5.908229e-04:  16%|█▌        | 451/2863 [05:27<30:34,  1.31it/s][A
epoch 1 iter 451: train loss 4.88680. lr 5.908229e-04:  16%|█▌        | 452/2863 [05:27<30:37,  1.31it/s][A
epoch 1 iter 452: train loss 4.93602. lr 5.907825e-04:  16%|█▌        | 452/2863 [05:28<30:37,  1.31it/s][A
epoch 1 iter 452: train loss 4.93602. lr 5.907825e-04:  16%|█▌        | 453/2863 [05:28<30:41,  1.31it/s][A
epoch 1 iter 453: train loss 4.97150. lr 5.907419e-04:  16%|█▌        | 453/2863 [05:28<30:41,  1.31it/s][A
epoch 1 iter 453: train loss 4.97150. lr 5.907419e-04:  16%|█▌        | 454/2863 [05:28<30:40,  1.31it/s][A
epoch 1 iter 454: train loss 4.90617. lr 5.907013e-04:  16%|█▌        | 454/2863 [05:29<30:40,  1.31it/s][A
epoch 1 iter 454: t

epoch 1 iter 487: train loss 4.72469. lr 5.893114e-04:  17%|█▋        | 488/2863 [05:54<26:57,  1.47it/s][A
epoch 1 iter 488: train loss 4.69134. lr 5.892678e-04:  17%|█▋        | 488/2863 [05:55<26:57,  1.47it/s][A
epoch 1 iter 488: train loss 4.69134. lr 5.892678e-04:  17%|█▋        | 489/2863 [05:55<26:50,  1.47it/s][A
epoch 1 iter 489: train loss 4.66252. lr 5.892241e-04:  17%|█▋        | 489/2863 [05:55<26:50,  1.47it/s][A
epoch 1 iter 489: train loss 4.66252. lr 5.892241e-04:  17%|█▋        | 490/2863 [05:55<28:10,  1.40it/s][A
epoch 1 iter 490: train loss 4.69366. lr 5.891804e-04:  17%|█▋        | 490/2863 [05:56<28:10,  1.40it/s][A
epoch 1 iter 490: train loss 4.69366. lr 5.891804e-04:  17%|█▋        | 491/2863 [05:56<27:42,  1.43it/s][A
epoch 1 iter 491: train loss 4.71170. lr 5.891365e-04:  17%|█▋        | 491/2863 [05:57<27:42,  1.43it/s][A
epoch 1 iter 491: train loss 4.71170. lr 5.891365e-04:  17%|█▋        | 492/2863 [05:57<27:22,  1.44it/s][A
epoch 1 iter 492: t

epoch 1 iter 525: train loss 4.56385. lr 5.875935e-04:  18%|█▊        | 525/2863 [06:20<26:22,  1.48it/s][A
epoch 1 iter 525: train loss 4.56385. lr 5.875935e-04:  18%|█▊        | 526/2863 [06:20<26:21,  1.48it/s][A
epoch 1 iter 526: train loss 4.50236. lr 5.875466e-04:  18%|█▊        | 526/2863 [06:20<26:21,  1.48it/s][A
epoch 1 iter 526: train loss 4.50236. lr 5.875466e-04:  18%|█▊        | 527/2863 [06:20<26:17,  1.48it/s][A
epoch 1 iter 527: train loss 4.49952. lr 5.874996e-04:  18%|█▊        | 527/2863 [06:21<26:17,  1.48it/s][A
epoch 1 iter 527: train loss 4.49952. lr 5.874996e-04:  18%|█▊        | 528/2863 [06:21<26:15,  1.48it/s][A
epoch 1 iter 528: train loss 4.56316. lr 5.874525e-04:  18%|█▊        | 528/2863 [06:22<26:15,  1.48it/s][A
epoch 1 iter 528: train loss 4.56316. lr 5.874525e-04:  18%|█▊        | 529/2863 [06:22<26:14,  1.48it/s][A
epoch 1 iter 529: train loss 4.51968. lr 5.874053e-04:  18%|█▊        | 529/2863 [06:22<26:14,  1.48it/s][A
epoch 1 iter 529: t

epoch 1 iter 562: train loss 4.31588. lr 5.858005e-04:  20%|█▉        | 563/2863 [06:45<26:13,  1.46it/s][A
epoch 1 iter 563: train loss 4.31544. lr 5.857504e-04:  20%|█▉        | 563/2863 [06:46<26:13,  1.46it/s][A
epoch 1 iter 563: train loss 4.31544. lr 5.857504e-04:  20%|█▉        | 564/2863 [06:46<26:13,  1.46it/s][A
epoch 1 iter 564: train loss 4.30887. lr 5.857002e-04:  20%|█▉        | 564/2863 [06:46<26:13,  1.46it/s][A
epoch 1 iter 564: train loss 4.30887. lr 5.857002e-04:  20%|█▉        | 565/2863 [06:46<26:10,  1.46it/s][A
epoch 1 iter 565: train loss 4.32631. lr 5.856499e-04:  20%|█▉        | 565/2863 [06:47<26:10,  1.46it/s][A
epoch 1 iter 565: train loss 4.32631. lr 5.856499e-04:  20%|█▉        | 566/2863 [06:47<26:10,  1.46it/s][A
epoch 1 iter 566: train loss 4.26401. lr 5.855996e-04:  20%|█▉        | 566/2863 [06:48<26:10,  1.46it/s][A
epoch 1 iter 566: train loss 4.26401. lr 5.855996e-04:  20%|█▉        | 567/2863 [06:48<26:10,  1.46it/s][A
epoch 1 iter 567: t

epoch 1 iter 600: train loss 4.16497. lr 5.838364e-04:  21%|██        | 600/2863 [07:11<25:45,  1.46it/s][A
epoch 1 iter 600: train loss 4.16497. lr 5.838364e-04:  21%|██        | 601/2863 [07:11<25:43,  1.47it/s][A
epoch 1 iter 601: train loss 4.09710. lr 5.837830e-04:  21%|██        | 601/2863 [07:12<25:43,  1.47it/s][A
epoch 1 iter 601: train loss 4.09710. lr 5.837830e-04:  21%|██        | 602/2863 [07:12<27:03,  1.39it/s][A
epoch 1 iter 602: train loss 4.03142. lr 5.837296e-04:  21%|██        | 602/2863 [07:13<27:03,  1.39it/s][A
epoch 1 iter 602: train loss 4.03142. lr 5.837296e-04:  21%|██        | 603/2863 [07:13<26:42,  1.41it/s][A
epoch 1 iter 603: train loss 4.10220. lr 5.836761e-04:  21%|██        | 603/2863 [07:13<26:42,  1.41it/s][A
epoch 1 iter 603: train loss 4.10220. lr 5.836761e-04:  21%|██        | 604/2863 [07:13<26:28,  1.42it/s][A
epoch 1 iter 604: train loss 4.08735. lr 5.836225e-04:  21%|██        | 604/2863 [07:14<26:28,  1.42it/s][A
epoch 1 iter 604: t

epoch 1 iter 637: train loss 3.88535. lr 5.818054e-04:  22%|██▏       | 638/2863 [07:37<26:10,  1.42it/s][A
epoch 1 iter 638: train loss 3.88620. lr 5.817489e-04:  22%|██▏       | 638/2863 [07:38<26:10,  1.42it/s][A
epoch 1 iter 638: train loss 3.88620. lr 5.817489e-04:  22%|██▏       | 639/2863 [07:38<26:08,  1.42it/s][A
epoch 1 iter 639: train loss 3.85816. lr 5.816923e-04:  22%|██▏       | 639/2863 [07:38<26:08,  1.42it/s][A
epoch 1 iter 639: train loss 3.85816. lr 5.816923e-04:  22%|██▏       | 640/2863 [07:38<26:02,  1.42it/s][A
epoch 1 iter 640: train loss 3.86717. lr 5.816356e-04:  22%|██▏       | 640/2863 [07:39<26:02,  1.42it/s][A
epoch 1 iter 640: train loss 3.86717. lr 5.816356e-04:  22%|██▏       | 641/2863 [07:39<25:53,  1.43it/s][A
epoch 1 iter 641: train loss 3.88137. lr 5.815788e-04:  22%|██▏       | 641/2863 [07:40<25:53,  1.43it/s][A
epoch 1 iter 641: train loss 3.88137. lr 5.815788e-04:  22%|██▏       | 642/2863 [07:40<25:43,  1.44it/s][A
epoch 1 iter 642: t

epoch 1 iter 675: train loss 3.66609. lr 5.795985e-04:  24%|██▎       | 675/2863 [08:03<25:34,  1.43it/s][A
epoch 1 iter 675: train loss 3.66609. lr 5.795985e-04:  24%|██▎       | 676/2863 [08:03<26:53,  1.36it/s][A
epoch 1 iter 676: train loss 3.68601. lr 5.795388e-04:  24%|██▎       | 676/2863 [08:04<26:53,  1.36it/s][A
epoch 1 iter 676: train loss 3.68601. lr 5.795388e-04:  24%|██▎       | 677/2863 [08:04<27:11,  1.34it/s][A
epoch 1 iter 677: train loss 3.68198. lr 5.794790e-04:  24%|██▎       | 677/2863 [08:05<27:11,  1.34it/s][A
epoch 1 iter 677: train loss 3.68198. lr 5.794790e-04:  24%|██▎       | 678/2863 [08:05<26:45,  1.36it/s][A
epoch 1 iter 678: train loss 3.65761. lr 5.794191e-04:  24%|██▎       | 678/2863 [08:06<26:45,  1.36it/s][A
epoch 1 iter 678: train loss 3.65761. lr 5.794191e-04:  24%|██▎       | 679/2863 [08:06<26:12,  1.39it/s][A
epoch 1 iter 679: train loss 3.66877. lr 5.793591e-04:  24%|██▎       | 679/2863 [08:06<26:12,  1.39it/s][A
epoch 1 iter 679: t

epoch 1 iter 712: train loss 3.46509. lr 5.773329e-04:  25%|██▍       | 713/2863 [08:30<24:43,  1.45it/s][A
epoch 1 iter 713: train loss 3.44614. lr 5.772701e-04:  25%|██▍       | 713/2863 [08:31<24:43,  1.45it/s][A
epoch 1 iter 713: train loss 3.44614. lr 5.772701e-04:  25%|██▍       | 714/2863 [08:31<26:06,  1.37it/s][A
epoch 1 iter 714: train loss 3.46853. lr 5.772072e-04:  25%|██▍       | 714/2863 [08:31<26:06,  1.37it/s][A
epoch 1 iter 714: train loss 3.46853. lr 5.772072e-04:  25%|██▍       | 715/2863 [08:31<25:42,  1.39it/s][A
epoch 1 iter 715: train loss 3.46720. lr 5.771442e-04:  25%|██▍       | 715/2863 [08:32<25:42,  1.39it/s][A
epoch 1 iter 715: train loss 3.46720. lr 5.771442e-04:  25%|██▌       | 716/2863 [08:32<25:27,  1.41it/s][A
epoch 1 iter 716: train loss 3.47367. lr 5.770811e-04:  25%|██▌       | 716/2863 [08:33<25:27,  1.41it/s][A
epoch 1 iter 716: train loss 3.47367. lr 5.770811e-04:  25%|██▌       | 717/2863 [08:33<25:15,  1.42it/s][A
epoch 1 iter 717: t

epoch 1 iter 750: train loss 3.33260. lr 5.748870e-04:  26%|██▌       | 750/2863 [08:57<24:35,  1.43it/s][A
epoch 1 iter 750: train loss 3.33260. lr 5.748870e-04:  26%|██▌       | 751/2863 [08:57<24:27,  1.44it/s][A
epoch 1 iter 751: train loss 3.21545. lr 5.748210e-04:  26%|██▌       | 751/2863 [08:57<24:27,  1.44it/s][A
epoch 1 iter 751: train loss 3.21545. lr 5.748210e-04:  26%|██▋       | 752/2863 [08:57<24:21,  1.44it/s][A
epoch 1 iter 752: train loss 3.28474. lr 5.747549e-04:  26%|██▋       | 752/2863 [08:58<24:21,  1.44it/s][A
epoch 1 iter 752: train loss 3.28474. lr 5.747549e-04:  26%|██▋       | 753/2863 [08:58<24:22,  1.44it/s][A
epoch 1 iter 753: train loss 3.21723. lr 5.746888e-04:  26%|██▋       | 753/2863 [08:59<24:22,  1.44it/s][A
epoch 1 iter 753: train loss 3.21723. lr 5.746888e-04:  26%|██▋       | 754/2863 [08:59<24:48,  1.42it/s][A
epoch 1 iter 754: train loss 3.23615. lr 5.746226e-04:  26%|██▋       | 754/2863 [08:59<24:48,  1.42it/s][A
epoch 1 iter 754: t

epoch 1 iter 787: train loss 3.05766. lr 5.723906e-04:  28%|██▊       | 788/2863 [09:23<24:13,  1.43it/s][A
epoch 1 iter 788: train loss 3.07522. lr 5.723215e-04:  28%|██▊       | 788/2863 [09:24<24:13,  1.43it/s][A
epoch 1 iter 788: train loss 3.07522. lr 5.723215e-04:  28%|██▊       | 789/2863 [09:24<24:10,  1.43it/s][A
epoch 1 iter 789: train loss 3.07450. lr 5.722524e-04:  28%|██▊       | 789/2863 [09:24<24:10,  1.43it/s][A
epoch 1 iter 789: train loss 3.07450. lr 5.722524e-04:  28%|██▊       | 790/2863 [09:24<24:16,  1.42it/s][A
epoch 1 iter 790: train loss 3.02818. lr 5.721832e-04:  28%|██▊       | 790/2863 [09:25<24:16,  1.42it/s][A
epoch 1 iter 790: train loss 3.02818. lr 5.721832e-04:  28%|██▊       | 791/2863 [09:25<24:17,  1.42it/s][A
epoch 1 iter 791: train loss 2.96947. lr 5.721140e-04:  28%|██▊       | 791/2863 [09:26<24:17,  1.42it/s][A
epoch 1 iter 791: train loss 2.96947. lr 5.721140e-04:  28%|██▊       | 792/2863 [09:26<25:24,  1.36it/s][A
epoch 1 iter 792: t

epoch 1 iter 825: train loss 2.80579. lr 5.697098e-04:  29%|██▉       | 825/2863 [09:50<24:04,  1.41it/s][A
epoch 1 iter 825: train loss 2.80579. lr 5.697098e-04:  29%|██▉       | 826/2863 [09:50<25:12,  1.35it/s][A
epoch 1 iter 826: train loss 2.84977. lr 5.696377e-04:  29%|██▉       | 826/2863 [09:51<25:12,  1.35it/s][A
epoch 1 iter 826: train loss 2.84977. lr 5.696377e-04:  29%|██▉       | 827/2863 [09:51<25:09,  1.35it/s][A
epoch 1 iter 827: train loss 2.82144. lr 5.695654e-04:  29%|██▉       | 827/2863 [09:51<25:09,  1.35it/s][A
epoch 1 iter 827: train loss 2.82144. lr 5.695654e-04:  29%|██▉       | 828/2863 [09:51<25:17,  1.34it/s][A
epoch 1 iter 828: train loss 2.74593. lr 5.694931e-04:  29%|██▉       | 828/2863 [09:52<25:17,  1.34it/s][A
epoch 1 iter 828: train loss 2.74593. lr 5.694931e-04:  29%|██▉       | 829/2863 [09:52<25:09,  1.35it/s][A
epoch 1 iter 829: train loss 2.75625. lr 5.694208e-04:  29%|██▉       | 829/2863 [09:53<25:09,  1.35it/s][A
epoch 1 iter 829: t

epoch 1 iter 862: train loss 2.69867. lr 5.669868e-04:  30%|███       | 863/2863 [10:16<23:45,  1.40it/s][A
epoch 1 iter 863: train loss 2.59821. lr 5.669117e-04:  30%|███       | 863/2863 [10:17<23:45,  1.40it/s][A
epoch 1 iter 863: train loss 2.59821. lr 5.669117e-04:  30%|███       | 864/2863 [10:17<24:32,  1.36it/s][A
epoch 1 iter 864: train loss 2.65920. lr 5.668365e-04:  30%|███       | 864/2863 [10:18<24:32,  1.36it/s][A
epoch 1 iter 864: train loss 2.65920. lr 5.668365e-04:  30%|███       | 865/2863 [10:18<24:50,  1.34it/s][A
epoch 1 iter 865: train loss 2.62110. lr 5.667612e-04:  30%|███       | 865/2863 [10:19<24:50,  1.34it/s][A
epoch 1 iter 865: train loss 2.62110. lr 5.667612e-04:  30%|███       | 866/2863 [10:19<24:35,  1.35it/s][A
epoch 1 iter 866: train loss 2.56855. lr 5.666858e-04:  30%|███       | 866/2863 [10:19<24:35,  1.35it/s][A
epoch 1 iter 866: train loss 2.56855. lr 5.666858e-04:  30%|███       | 867/2863 [10:19<24:04,  1.38it/s][A
epoch 1 iter 867: t

epoch 1 iter 900: train loss 2.35993. lr 5.640757e-04:  31%|███▏      | 900/2863 [10:46<25:26,  1.29it/s][A
epoch 1 iter 900: train loss 2.35993. lr 5.640757e-04:  31%|███▏      | 901/2863 [10:46<25:17,  1.29it/s][A
epoch 1 iter 901: train loss 2.41635. lr 5.639975e-04:  31%|███▏      | 901/2863 [10:47<25:17,  1.29it/s][A
epoch 1 iter 901: train loss 2.41635. lr 5.639975e-04:  32%|███▏      | 902/2863 [10:47<25:17,  1.29it/s][A
epoch 1 iter 902: train loss 2.41363. lr 5.639193e-04:  32%|███▏      | 902/2863 [10:48<25:17,  1.29it/s][A
epoch 1 iter 902: train loss 2.41363. lr 5.639193e-04:  32%|███▏      | 903/2863 [10:48<25:26,  1.28it/s][A
epoch 1 iter 903: train loss 2.42945. lr 5.638410e-04:  32%|███▏      | 903/2863 [10:48<25:26,  1.28it/s][A
epoch 1 iter 903: train loss 2.42945. lr 5.638410e-04:  32%|███▏      | 904/2863 [10:48<25:31,  1.28it/s][A
epoch 1 iter 904: train loss 2.46283. lr 5.637626e-04:  32%|███▏      | 904/2863 [10:49<25:31,  1.28it/s][A
epoch 1 iter 904: t

epoch 1 iter 937: train loss 2.26741. lr 5.611308e-04:  33%|███▎      | 938/2863 [11:15<26:18,  1.22it/s][A
epoch 1 iter 938: train loss 2.26830. lr 5.610497e-04:  33%|███▎      | 938/2863 [11:16<26:18,  1.22it/s][A
epoch 1 iter 938: train loss 2.26830. lr 5.610497e-04:  33%|███▎      | 939/2863 [11:16<25:57,  1.24it/s][A
epoch 1 iter 939: train loss 2.17876. lr 5.609685e-04:  33%|███▎      | 939/2863 [11:17<25:57,  1.24it/s][A
epoch 1 iter 939: train loss 2.17876. lr 5.609685e-04:  33%|███▎      | 940/2863 [11:17<25:47,  1.24it/s][A
epoch 1 iter 940: train loss 2.19383. lr 5.608873e-04:  33%|███▎      | 940/2863 [11:18<25:47,  1.24it/s][A
epoch 1 iter 940: train loss 2.19383. lr 5.608873e-04:  33%|███▎      | 941/2863 [11:18<25:41,  1.25it/s][A
epoch 1 iter 941: train loss 2.17068. lr 5.608060e-04:  33%|███▎      | 941/2863 [11:19<25:41,  1.25it/s][A
epoch 1 iter 941: train loss 2.17068. lr 5.608060e-04:  33%|███▎      | 942/2863 [11:19<25:34,  1.25it/s][A
epoch 1 iter 942: t

epoch 1 iter 975: train loss 2.01166. lr 5.579942e-04:  34%|███▍      | 975/2863 [11:46<24:59,  1.26it/s][A
epoch 1 iter 975: train loss 2.01166. lr 5.579942e-04:  34%|███▍      | 976/2863 [11:46<24:48,  1.27it/s][A
epoch 1 iter 976: train loss 2.04251. lr 5.579102e-04:  34%|███▍      | 976/2863 [11:46<24:48,  1.27it/s][A
epoch 1 iter 976: train loss 2.04251. lr 5.579102e-04:  34%|███▍      | 977/2863 [11:46<24:08,  1.30it/s][A
epoch 1 iter 977: train loss 2.02280. lr 5.578260e-04:  34%|███▍      | 977/2863 [11:47<24:08,  1.30it/s][A
epoch 1 iter 977: train loss 2.02280. lr 5.578260e-04:  34%|███▍      | 978/2863 [11:47<23:36,  1.33it/s][A
epoch 1 iter 978: train loss 2.06348. lr 5.577418e-04:  34%|███▍      | 978/2863 [11:48<23:36,  1.33it/s][A
epoch 1 iter 978: train loss 2.06348. lr 5.577418e-04:  34%|███▍      | 979/2863 [11:48<23:09,  1.36it/s][A
epoch 1 iter 979: train loss 1.99259. lr 5.576575e-04:  34%|███▍      | 979/2863 [11:48<23:09,  1.36it/s][A
epoch 1 iter 979: t

epoch 1 iter 1012: train loss 1.86569. lr 5.548324e-04:  35%|███▌      | 1012/2863 [12:12<22:42,  1.36it/s][A
epoch 1 iter 1012: train loss 1.86569. lr 5.548324e-04:  35%|███▌      | 1013/2863 [12:12<22:21,  1.38it/s][A
epoch 1 iter 1013: train loss 1.84363. lr 5.547455e-04:  35%|███▌      | 1013/2863 [12:13<22:21,  1.38it/s][A
epoch 1 iter 1013: train loss 1.84363. lr 5.547455e-04:  35%|███▌      | 1014/2863 [12:13<22:03,  1.40it/s][A
epoch 1 iter 1014: train loss 1.87550. lr 5.546585e-04:  35%|███▌      | 1014/2863 [12:14<22:03,  1.40it/s][A
epoch 1 iter 1014: train loss 1.87550. lr 5.546585e-04:  35%|███▌      | 1015/2863 [12:14<21:50,  1.41it/s][A
epoch 1 iter 1015: train loss 1.81581. lr 5.545714e-04:  35%|███▌      | 1015/2863 [12:15<21:50,  1.41it/s][A
epoch 1 iter 1015: train loss 1.81581. lr 5.545714e-04:  35%|███▌      | 1016/2863 [12:15<21:50,  1.41it/s][A
epoch 1 iter 1016: train loss 1.82141. lr 5.544843e-04:  35%|███▌      | 1016/2863 [12:15<21:50,  1.41it/s][A
e

epoch 1 iter 1048: train loss 1.66248. lr 5.516551e-04:  37%|███▋      | 1049/2863 [12:39<23:13,  1.30it/s][A
epoch 1 iter 1049: train loss 1.72743. lr 5.515655e-04:  37%|███▋      | 1049/2863 [12:40<23:13,  1.30it/s][A
epoch 1 iter 1049: train loss 1.72743. lr 5.515655e-04:  37%|███▋      | 1050/2863 [12:40<23:47,  1.27it/s][A
epoch 1 iter 1050: train loss 1.76295. lr 5.514757e-04:  37%|███▋      | 1050/2863 [12:41<23:47,  1.27it/s][A
epoch 1 iter 1050: train loss 1.76295. lr 5.514757e-04:  37%|███▋      | 1051/2863 [12:41<23:03,  1.31it/s][A
epoch 1 iter 1051: train loss 1.73266. lr 5.513859e-04:  37%|███▋      | 1051/2863 [12:42<23:03,  1.31it/s][A
epoch 1 iter 1051: train loss 1.73266. lr 5.513859e-04:  37%|███▋      | 1052/2863 [12:42<22:30,  1.34it/s][A
epoch 1 iter 1052: train loss 1.65312. lr 5.512960e-04:  37%|███▋      | 1052/2863 [12:42<22:30,  1.34it/s][A
epoch 1 iter 1052: train loss 1.65312. lr 5.512960e-04:  37%|███▋      | 1053/2863 [12:42<22:08,  1.36it/s][A
e

epoch 1 iter 1085: train loss 1.58479. lr 5.482873e-04:  38%|███▊      | 1085/2863 [13:06<21:43,  1.36it/s][A
epoch 1 iter 1085: train loss 1.58479. lr 5.482873e-04:  38%|███▊      | 1086/2863 [13:06<21:41,  1.37it/s][A
epoch 1 iter 1086: train loss 1.51994. lr 5.481948e-04:  38%|███▊      | 1086/2863 [13:07<21:41,  1.37it/s][A
epoch 1 iter 1086: train loss 1.51994. lr 5.481948e-04:  38%|███▊      | 1087/2863 [13:07<21:31,  1.38it/s][A
epoch 1 iter 1087: train loss 1.55061. lr 5.481023e-04:  38%|███▊      | 1087/2863 [13:08<21:31,  1.38it/s][A
epoch 1 iter 1087: train loss 1.55061. lr 5.481023e-04:  38%|███▊      | 1088/2863 [13:08<21:23,  1.38it/s][A
epoch 1 iter 1088: train loss 1.56923. lr 5.480097e-04:  38%|███▊      | 1088/2863 [13:09<21:23,  1.38it/s][A
epoch 1 iter 1088: train loss 1.56923. lr 5.480097e-04:  38%|███▊      | 1089/2863 [13:09<22:29,  1.31it/s][A
epoch 1 iter 1089: train loss 1.55612. lr 5.479170e-04:  38%|███▊      | 1089/2863 [13:09<22:29,  1.31it/s][A
e

epoch 1 iter 1121: train loss 1.44033. lr 5.449122e-04:  39%|███▉      | 1122/2863 [13:33<21:45,  1.33it/s][A
epoch 1 iter 1122: train loss 1.42607. lr 5.448170e-04:  39%|███▉      | 1122/2863 [13:34<21:45,  1.33it/s][A
epoch 1 iter 1122: train loss 1.42607. lr 5.448170e-04:  39%|███▉      | 1123/2863 [13:34<21:54,  1.32it/s][A
epoch 1 iter 1123: train loss 1.44140. lr 5.447218e-04:  39%|███▉      | 1123/2863 [13:34<21:54,  1.32it/s][A
epoch 1 iter 1123: train loss 1.44140. lr 5.447218e-04:  39%|███▉      | 1124/2863 [13:34<21:46,  1.33it/s][A
epoch 1 iter 1124: train loss 1.46040. lr 5.446266e-04:  39%|███▉      | 1124/2863 [13:35<21:46,  1.33it/s][A
epoch 1 iter 1124: train loss 1.46040. lr 5.446266e-04:  39%|███▉      | 1125/2863 [13:35<21:31,  1.35it/s][A
epoch 1 iter 1125: train loss 1.46338. lr 5.445312e-04:  39%|███▉      | 1125/2863 [13:36<21:31,  1.35it/s][A
epoch 1 iter 1125: train loss 1.46338. lr 5.445312e-04:  39%|███▉      | 1126/2863 [13:36<21:11,  1.37it/s][A
e

epoch 1 iter 1158: train loss 1.26173. lr 5.413437e-04:  40%|████      | 1158/2863 [14:00<21:56,  1.30it/s][A
epoch 1 iter 1158: train loss 1.26173. lr 5.413437e-04:  40%|████      | 1159/2863 [14:00<22:32,  1.26it/s][A
epoch 1 iter 1159: train loss 1.32333. lr 5.412459e-04:  40%|████      | 1159/2863 [14:01<22:32,  1.26it/s][A
epoch 1 iter 1159: train loss 1.32333. lr 5.412459e-04:  41%|████      | 1160/2863 [14:01<22:19,  1.27it/s][A
epoch 1 iter 1160: train loss 1.33273. lr 5.411479e-04:  41%|████      | 1160/2863 [14:02<22:19,  1.27it/s][A
epoch 1 iter 1160: train loss 1.33273. lr 5.411479e-04:  41%|████      | 1161/2863 [14:02<22:02,  1.29it/s][A
epoch 1 iter 1161: train loss 1.31350. lr 5.410500e-04:  41%|████      | 1161/2863 [14:03<22:02,  1.29it/s][A
epoch 1 iter 1161: train loss 1.31350. lr 5.410500e-04:  41%|████      | 1162/2863 [14:03<22:28,  1.26it/s][A
epoch 1 iter 1162: train loss 1.28878. lr 5.409519e-04:  41%|████      | 1162/2863 [14:03<22:28,  1.26it/s][A
e

epoch 1 iter 1194: train loss 1.17730. lr 5.377761e-04:  42%|████▏     | 1195/2863 [14:27<21:26,  1.30it/s][A
epoch 1 iter 1195: train loss 1.16320. lr 5.376757e-04:  42%|████▏     | 1195/2863 [14:28<21:26,  1.30it/s][A
epoch 1 iter 1195: train loss 1.16320. lr 5.376757e-04:  42%|████▏     | 1196/2863 [14:28<20:46,  1.34it/s][A
epoch 1 iter 1196: train loss 1.18416. lr 5.375752e-04:  42%|████▏     | 1196/2863 [14:28<20:46,  1.34it/s][A
epoch 1 iter 1196: train loss 1.18416. lr 5.375752e-04:  42%|████▏     | 1197/2863 [14:28<20:24,  1.36it/s][A
epoch 1 iter 1197: train loss 1.19029. lr 5.374746e-04:  42%|████▏     | 1197/2863 [14:29<20:24,  1.36it/s][A
epoch 1 iter 1197: train loss 1.19029. lr 5.374746e-04:  42%|████▏     | 1198/2863 [14:29<20:02,  1.38it/s][A
epoch 1 iter 1198: train loss 1.18354. lr 5.373740e-04:  42%|████▏     | 1198/2863 [14:30<20:02,  1.38it/s][A
epoch 1 iter 1198: train loss 1.18354. lr 5.373740e-04:  42%|████▏     | 1199/2863 [14:30<20:46,  1.33it/s][A
e

epoch 1 iter 1231: train loss 1.11789. lr 5.340128e-04:  43%|████▎     | 1231/2863 [14:55<19:40,  1.38it/s][A
epoch 1 iter 1231: train loss 1.11789. lr 5.340128e-04:  43%|████▎     | 1232/2863 [14:55<19:59,  1.36it/s][A
epoch 1 iter 1232: train loss 1.04680. lr 5.339097e-04:  43%|████▎     | 1232/2863 [14:56<19:59,  1.36it/s][A
epoch 1 iter 1232: train loss 1.04680. lr 5.339097e-04:  43%|████▎     | 1233/2863 [14:56<19:39,  1.38it/s][A
epoch 1 iter 1233: train loss 1.07316. lr 5.338066e-04:  43%|████▎     | 1233/2863 [14:56<19:39,  1.38it/s][A
epoch 1 iter 1233: train loss 1.07316. lr 5.338066e-04:  43%|████▎     | 1234/2863 [14:56<19:53,  1.36it/s][A
epoch 1 iter 1234: train loss 1.09763. lr 5.337034e-04:  43%|████▎     | 1234/2863 [14:57<19:53,  1.36it/s][A
epoch 1 iter 1234: train loss 1.09763. lr 5.337034e-04:  43%|████▎     | 1235/2863 [14:57<19:59,  1.36it/s][A
epoch 1 iter 1235: train loss 1.09609. lr 5.336001e-04:  43%|████▎     | 1235/2863 [14:58<19:59,  1.36it/s][A
e

epoch 1 iter 1267: train loss 1.01185. lr 5.302585e-04:  44%|████▍     | 1268/2863 [15:22<19:51,  1.34it/s][A
epoch 1 iter 1268: train loss 1.03269. lr 5.301529e-04:  44%|████▍     | 1268/2863 [15:23<19:51,  1.34it/s][A
epoch 1 iter 1268: train loss 1.03269. lr 5.301529e-04:  44%|████▍     | 1269/2863 [15:23<20:11,  1.32it/s][A
epoch 1 iter 1269: train loss 1.01017. lr 5.300472e-04:  44%|████▍     | 1269/2863 [15:23<20:11,  1.32it/s][A
epoch 1 iter 1269: train loss 1.01017. lr 5.300472e-04:  44%|████▍     | 1270/2863 [15:23<20:14,  1.31it/s][A
epoch 1 iter 1270: train loss 1.00083. lr 5.299415e-04:  44%|████▍     | 1270/2863 [15:24<20:14,  1.31it/s][A
epoch 1 iter 1270: train loss 1.00083. lr 5.299415e-04:  44%|████▍     | 1271/2863 [15:24<20:05,  1.32it/s][A
epoch 1 iter 1271: train loss 1.00082. lr 5.298357e-04:  44%|████▍     | 1271/2863 [15:25<20:05,  1.32it/s][A
epoch 1 iter 1271: train loss 1.00082. lr 5.298357e-04:  44%|████▍     | 1272/2863 [15:25<19:56,  1.33it/s][A
e

epoch 1 iter 1304: train loss 0.93123. lr 5.263062e-04:  46%|████▌     | 1304/2863 [15:49<18:46,  1.38it/s][A
epoch 1 iter 1304: train loss 0.93123. lr 5.263062e-04:  46%|████▌     | 1305/2863 [15:49<18:34,  1.40it/s][A
epoch 1 iter 1305: train loss 0.95064. lr 5.261981e-04:  46%|████▌     | 1305/2863 [15:49<18:34,  1.40it/s][A
epoch 1 iter 1305: train loss 0.95064. lr 5.261981e-04:  46%|████▌     | 1306/2863 [15:49<18:48,  1.38it/s][A
epoch 1 iter 1306: train loss 0.90155. lr 5.260899e-04:  46%|████▌     | 1306/2863 [15:50<18:48,  1.38it/s][A
epoch 1 iter 1306: train loss 0.90155. lr 5.260899e-04:  46%|████▌     | 1307/2863 [15:50<18:41,  1.39it/s][A
epoch 1 iter 1307: train loss 0.95684. lr 5.259817e-04:  46%|████▌     | 1307/2863 [15:51<18:41,  1.39it/s][A
epoch 1 iter 1307: train loss 0.95684. lr 5.259817e-04:  46%|████▌     | 1308/2863 [15:51<18:25,  1.41it/s][A
epoch 1 iter 1308: train loss 0.92326. lr 5.258734e-04:  46%|████▌     | 1308/2863 [15:51<18:25,  1.41it/s][A
e

epoch 1 iter 1340: train loss 0.87432. lr 5.223713e-04:  47%|████▋     | 1341/2863 [16:15<17:42,  1.43it/s][A
epoch 1 iter 1341: train loss 0.87347. lr 5.222607e-04:  47%|████▋     | 1341/2863 [16:15<17:42,  1.43it/s][A
epoch 1 iter 1341: train loss 0.87347. lr 5.222607e-04:  47%|████▋     | 1342/2863 [16:15<17:39,  1.44it/s][A
epoch 1 iter 1342: train loss 0.86836. lr 5.221501e-04:  47%|████▋     | 1342/2863 [16:16<17:39,  1.44it/s][A
epoch 1 iter 1342: train loss 0.86836. lr 5.221501e-04:  47%|████▋     | 1343/2863 [16:16<17:41,  1.43it/s][A
epoch 1 iter 1343: train loss 0.87592. lr 5.220394e-04:  47%|████▋     | 1343/2863 [16:17<17:41,  1.43it/s][A
epoch 1 iter 1343: train loss 0.87592. lr 5.220394e-04:  47%|████▋     | 1344/2863 [16:17<17:37,  1.44it/s][A
epoch 1 iter 1344: train loss 0.85379. lr 5.219287e-04:  47%|████▋     | 1344/2863 [16:17<17:37,  1.44it/s][A
epoch 1 iter 1344: train loss 0.85379. lr 5.219287e-04:  47%|████▋     | 1345/2863 [16:17<17:34,  1.44it/s][A
e

epoch 1 iter 1377: train loss 0.83286. lr 5.182365e-04:  48%|████▊     | 1377/2863 [16:41<17:21,  1.43it/s][A
epoch 1 iter 1377: train loss 0.83286. lr 5.182365e-04:  48%|████▊     | 1378/2863 [16:41<17:20,  1.43it/s][A
epoch 1 iter 1378: train loss 0.82072. lr 5.181235e-04:  48%|████▊     | 1378/2863 [16:41<17:20,  1.43it/s][A
epoch 1 iter 1378: train loss 0.82072. lr 5.181235e-04:  48%|████▊     | 1379/2863 [16:41<17:15,  1.43it/s][A
epoch 1 iter 1379: train loss 0.79767. lr 5.180105e-04:  48%|████▊     | 1379/2863 [16:42<17:15,  1.43it/s][A
epoch 1 iter 1379: train loss 0.79767. lr 5.180105e-04:  48%|████▊     | 1380/2863 [16:42<17:10,  1.44it/s][A
epoch 1 iter 1380: train loss 0.80585. lr 5.178973e-04:  48%|████▊     | 1380/2863 [16:43<17:10,  1.44it/s][A
epoch 1 iter 1380: train loss 0.80585. lr 5.178973e-04:  48%|████▊     | 1381/2863 [16:43<17:15,  1.43it/s][A
epoch 1 iter 1381: train loss 0.82164. lr 5.177841e-04:  48%|████▊     | 1381/2863 [16:43<17:15,  1.43it/s][A
e

epoch 1 iter 1413: train loss 0.76628. lr 5.141272e-04:  49%|████▉     | 1414/2863 [17:06<18:00,  1.34it/s][A
epoch 1 iter 1414: train loss 0.73963. lr 5.140118e-04:  49%|████▉     | 1414/2863 [17:07<18:00,  1.34it/s][A
epoch 1 iter 1414: train loss 0.73963. lr 5.140118e-04:  49%|████▉     | 1415/2863 [17:07<17:50,  1.35it/s][A
epoch 1 iter 1415: train loss 0.75841. lr 5.138964e-04:  49%|████▉     | 1415/2863 [17:07<17:50,  1.35it/s][A
epoch 1 iter 1415: train loss 0.75841. lr 5.138964e-04:  49%|████▉     | 1416/2863 [17:07<17:44,  1.36it/s][A
epoch 1 iter 1416: train loss 0.77876. lr 5.137809e-04:  49%|████▉     | 1416/2863 [17:08<17:44,  1.36it/s][A
epoch 1 iter 1416: train loss 0.77876. lr 5.137809e-04:  49%|████▉     | 1417/2863 [17:08<17:34,  1.37it/s][A
epoch 1 iter 1417: train loss 0.76134. lr 5.136654e-04:  49%|████▉     | 1417/2863 [17:09<17:34,  1.37it/s][A
epoch 1 iter 1417: train loss 0.76134. lr 5.136654e-04:  50%|████▉     | 1418/2863 [17:09<17:26,  1.38it/s][A
e

epoch 1 iter 1450: train loss 0.72015. lr 5.098166e-04:  51%|█████     | 1450/2863 [17:32<16:22,  1.44it/s][A
epoch 1 iter 1450: train loss 0.72015. lr 5.098166e-04:  51%|█████     | 1451/2863 [17:32<16:19,  1.44it/s][A
epoch 1 iter 1451: train loss 0.72938. lr 5.096989e-04:  51%|█████     | 1451/2863 [17:33<16:19,  1.44it/s][A
epoch 1 iter 1451: train loss 0.72938. lr 5.096989e-04:  51%|█████     | 1452/2863 [17:33<16:21,  1.44it/s][A
epoch 1 iter 1452: train loss 0.71430. lr 5.095811e-04:  51%|█████     | 1452/2863 [17:34<16:21,  1.44it/s][A
epoch 1 iter 1452: train loss 0.71430. lr 5.095811e-04:  51%|█████     | 1453/2863 [17:34<16:19,  1.44it/s][A
epoch 1 iter 1453: train loss 0.71744. lr 5.094633e-04:  51%|█████     | 1453/2863 [17:34<16:19,  1.44it/s][A
epoch 1 iter 1453: train loss 0.71744. lr 5.094633e-04:  51%|█████     | 1454/2863 [17:34<16:18,  1.44it/s][A
epoch 1 iter 1454: train loss 0.70759. lr 5.093454e-04:  51%|█████     | 1454/2863 [17:35<16:18,  1.44it/s][A
e

epoch 1 iter 1486: train loss 0.69264. lr 5.055394e-04:  52%|█████▏    | 1487/2863 [17:58<16:02,  1.43it/s][A
epoch 1 iter 1487: train loss 0.68479. lr 5.054195e-04:  52%|█████▏    | 1487/2863 [17:58<16:02,  1.43it/s][A
epoch 1 iter 1487: train loss 0.68479. lr 5.054195e-04:  52%|█████▏    | 1488/2863 [17:58<15:56,  1.44it/s][A
epoch 1 iter 1488: train loss 0.67216. lr 5.052995e-04:  52%|█████▏    | 1488/2863 [17:59<15:56,  1.44it/s][A
epoch 1 iter 1488: train loss 0.67216. lr 5.052995e-04:  52%|█████▏    | 1489/2863 [17:59<15:56,  1.44it/s][A
epoch 1 iter 1489: train loss 0.68720. lr 5.051794e-04:  52%|█████▏    | 1489/2863 [18:00<15:56,  1.44it/s][A
epoch 1 iter 1489: train loss 0.68720. lr 5.051794e-04:  52%|█████▏    | 1490/2863 [18:00<15:51,  1.44it/s][A
epoch 1 iter 1490: train loss 0.66206. lr 5.050592e-04:  52%|█████▏    | 1490/2863 [18:01<15:51,  1.44it/s][A
epoch 1 iter 1490: train loss 0.66206. lr 5.050592e-04:  52%|█████▏    | 1491/2863 [18:01<15:47,  1.45it/s][A
e

epoch 1 iter 1523: train loss 0.65089. lr 5.010599e-04:  53%|█████▎    | 1523/2863 [18:24<15:41,  1.42it/s][A
epoch 1 iter 1523: train loss 0.65089. lr 5.010599e-04:  53%|█████▎    | 1524/2863 [18:24<15:37,  1.43it/s][A
epoch 1 iter 1524: train loss 0.65256. lr 5.009377e-04:  53%|█████▎    | 1524/2863 [18:24<15:37,  1.43it/s][A
epoch 1 iter 1524: train loss 0.65256. lr 5.009377e-04:  53%|█████▎    | 1525/2863 [18:24<15:32,  1.43it/s][A
epoch 1 iter 1525: train loss 0.64227. lr 5.008154e-04:  53%|█████▎    | 1525/2863 [18:25<15:32,  1.43it/s][A
epoch 1 iter 1525: train loss 0.64227. lr 5.008154e-04:  53%|█████▎    | 1526/2863 [18:25<16:16,  1.37it/s][A
epoch 1 iter 1526: train loss 0.64358. lr 5.006930e-04:  53%|█████▎    | 1526/2863 [18:26<16:16,  1.37it/s][A
epoch 1 iter 1526: train loss 0.64358. lr 5.006930e-04:  53%|█████▎    | 1527/2863 [18:26<16:01,  1.39it/s][A
epoch 1 iter 1527: train loss 0.64927. lr 5.005706e-04:  53%|█████▎    | 1527/2863 [18:27<16:01,  1.39it/s][A
e

epoch 1 iter 1559: train loss 0.61642. lr 4.966218e-04:  54%|█████▍    | 1560/2863 [18:49<15:32,  1.40it/s][A
epoch 1 iter 1560: train loss 0.62418. lr 4.964975e-04:  54%|█████▍    | 1560/2863 [18:50<15:32,  1.40it/s][A
epoch 1 iter 1560: train loss 0.62418. lr 4.964975e-04:  55%|█████▍    | 1561/2863 [18:50<15:25,  1.41it/s][A
epoch 1 iter 1561: train loss 0.61073. lr 4.963730e-04:  55%|█████▍    | 1561/2863 [18:51<15:25,  1.41it/s][A
epoch 1 iter 1561: train loss 0.61073. lr 4.963730e-04:  55%|█████▍    | 1562/2863 [18:51<15:17,  1.42it/s][A
epoch 1 iter 1562: train loss 0.63385. lr 4.962485e-04:  55%|█████▍    | 1562/2863 [18:52<15:17,  1.42it/s][A
epoch 1 iter 1562: train loss 0.63385. lr 4.962485e-04:  55%|█████▍    | 1563/2863 [18:52<15:17,  1.42it/s][A
epoch 1 iter 1563: train loss 0.60719. lr 4.961240e-04:  55%|█████▍    | 1563/2863 [18:52<15:17,  1.42it/s][A
epoch 1 iter 1563: train loss 0.60719. lr 4.961240e-04:  55%|█████▍    | 1564/2863 [18:52<15:14,  1.42it/s][A
e

epoch 1 iter 1596: train loss 0.59668. lr 4.919805e-04:  56%|█████▌    | 1596/2863 [19:16<14:40,  1.44it/s][A
epoch 1 iter 1596: train loss 0.59668. lr 4.919805e-04:  56%|█████▌    | 1597/2863 [19:16<14:36,  1.44it/s][A
epoch 1 iter 1597: train loss 0.60467. lr 4.918540e-04:  56%|█████▌    | 1597/2863 [19:16<14:36,  1.44it/s][A
epoch 1 iter 1597: train loss 0.60467. lr 4.918540e-04:  56%|█████▌    | 1598/2863 [19:16<14:33,  1.45it/s][A
epoch 1 iter 1598: train loss 0.57772. lr 4.917274e-04:  56%|█████▌    | 1598/2863 [19:17<14:33,  1.45it/s][A
epoch 1 iter 1598: train loss 0.57772. lr 4.917274e-04:  56%|█████▌    | 1599/2863 [19:17<14:31,  1.45it/s][A
epoch 1 iter 1599: train loss 0.58758. lr 4.916007e-04:  56%|█████▌    | 1599/2863 [19:18<14:31,  1.45it/s][A
epoch 1 iter 1599: train loss 0.58758. lr 4.916007e-04:  56%|█████▌    | 1600/2863 [19:18<14:33,  1.45it/s][A
epoch 1 iter 1600: train loss 0.58278. lr 4.914740e-04:  56%|█████▌    | 1600/2863 [19:18<14:33,  1.45it/s][A
e

epoch 1 iter 1632: train loss 0.57190. lr 4.873887e-04:  57%|█████▋    | 1633/2863 [19:41<14:18,  1.43it/s][A
epoch 1 iter 1633: train loss 0.57714. lr 4.872601e-04:  57%|█████▋    | 1633/2863 [19:41<14:18,  1.43it/s][A
epoch 1 iter 1633: train loss 0.57714. lr 4.872601e-04:  57%|█████▋    | 1634/2863 [19:41<14:14,  1.44it/s][A
epoch 1 iter 1634: train loss 0.57406. lr 4.871314e-04:  57%|█████▋    | 1634/2863 [19:42<14:14,  1.44it/s][A
epoch 1 iter 1634: train loss 0.57406. lr 4.871314e-04:  57%|█████▋    | 1635/2863 [19:42<14:10,  1.44it/s][A
epoch 1 iter 1635: train loss 0.56861. lr 4.870027e-04:  57%|█████▋    | 1635/2863 [19:43<14:10,  1.44it/s][A
epoch 1 iter 1635: train loss 0.56861. lr 4.870027e-04:  57%|█████▋    | 1636/2863 [19:43<14:10,  1.44it/s][A
epoch 1 iter 1636: train loss 0.56950. lr 4.868739e-04:  57%|█████▋    | 1636/2863 [19:44<14:10,  1.44it/s][A
epoch 1 iter 1636: train loss 0.56950. lr 4.868739e-04:  57%|█████▋    | 1637/2863 [19:44<14:13,  1.44it/s][A
e

epoch 1 iter 1669: train loss 0.55103. lr 4.825930e-04:  58%|█████▊    | 1669/2863 [20:07<13:55,  1.43it/s][A
epoch 1 iter 1669: train loss 0.55103. lr 4.825930e-04:  58%|█████▊    | 1670/2863 [20:07<13:49,  1.44it/s][A
epoch 1 iter 1670: train loss 0.55199. lr 4.824624e-04:  58%|█████▊    | 1670/2863 [20:07<13:49,  1.44it/s][A
epoch 1 iter 1670: train loss 0.55199. lr 4.824624e-04:  58%|█████▊    | 1671/2863 [20:07<13:45,  1.44it/s][A
epoch 1 iter 1671: train loss 0.56549. lr 4.823317e-04:  58%|█████▊    | 1671/2863 [20:08<13:45,  1.44it/s][A
epoch 1 iter 1671: train loss 0.56549. lr 4.823317e-04:  58%|█████▊    | 1672/2863 [20:08<13:41,  1.45it/s][A
epoch 1 iter 1672: train loss 0.54386. lr 4.822009e-04:  58%|█████▊    | 1672/2863 [20:09<13:41,  1.45it/s][A
epoch 1 iter 1672: train loss 0.54386. lr 4.822009e-04:  58%|█████▊    | 1673/2863 [20:09<13:39,  1.45it/s][A
epoch 1 iter 1673: train loss 0.55251. lr 4.820701e-04:  58%|█████▊    | 1673/2863 [20:09<13:39,  1.45it/s][A
e

epoch 1 iter 1705: train loss 0.54428. lr 4.778548e-04:  60%|█████▉    | 1706/2863 [20:32<13:26,  1.43it/s][A
epoch 1 iter 1706: train loss 0.54051. lr 4.777221e-04:  60%|█████▉    | 1706/2863 [20:33<13:26,  1.43it/s][A
epoch 1 iter 1706: train loss 0.54051. lr 4.777221e-04:  60%|█████▉    | 1707/2863 [20:33<13:23,  1.44it/s][A
epoch 1 iter 1707: train loss 0.52660. lr 4.775895e-04:  60%|█████▉    | 1707/2863 [20:34<13:23,  1.44it/s][A
epoch 1 iter 1707: train loss 0.52660. lr 4.775895e-04:  60%|█████▉    | 1708/2863 [20:34<13:18,  1.45it/s][A
epoch 1 iter 1708: train loss 0.54242. lr 4.774567e-04:  60%|█████▉    | 1708/2863 [20:34<13:18,  1.45it/s][A
epoch 1 iter 1708: train loss 0.54242. lr 4.774567e-04:  60%|█████▉    | 1709/2863 [20:34<13:15,  1.45it/s][A
epoch 1 iter 1709: train loss 0.53963. lr 4.773240e-04:  60%|█████▉    | 1709/2863 [20:35<13:15,  1.45it/s][A
epoch 1 iter 1709: train loss 0.53963. lr 4.773240e-04:  60%|█████▉    | 1710/2863 [20:35<13:15,  1.45it/s][A
e

epoch 1 iter 1742: train loss 0.50173. lr 4.729125e-04:  61%|██████    | 1742/2863 [20:58<13:02,  1.43it/s][A
epoch 1 iter 1742: train loss 0.50173. lr 4.729125e-04:  61%|██████    | 1743/2863 [20:58<12:58,  1.44it/s][A
epoch 1 iter 1743: train loss 0.52347. lr 4.727779e-04:  61%|██████    | 1743/2863 [20:59<12:58,  1.44it/s][A
epoch 1 iter 1743: train loss 0.52347. lr 4.727779e-04:  61%|██████    | 1744/2863 [20:59<12:55,  1.44it/s][A
epoch 1 iter 1744: train loss 0.52997. lr 4.726433e-04:  61%|██████    | 1744/2863 [21:00<12:55,  1.44it/s][A
epoch 1 iter 1744: train loss 0.52997. lr 4.726433e-04:  61%|██████    | 1745/2863 [21:00<12:52,  1.45it/s][A
epoch 1 iter 1745: train loss 0.51850. lr 4.725086e-04:  61%|██████    | 1745/2863 [21:00<12:52,  1.45it/s][A
epoch 1 iter 1745: train loss 0.51850. lr 4.725086e-04:  61%|██████    | 1746/2863 [21:00<12:51,  1.45it/s][A
epoch 1 iter 1746: train loss 0.51208. lr 4.723739e-04:  61%|██████    | 1746/2863 [21:01<12:51,  1.45it/s][A
e

epoch 1 iter 1778: train loss 0.49332. lr 4.680354e-04:  62%|██████▏   | 1779/2863 [21:23<12:56,  1.40it/s][A
epoch 1 iter 1779: train loss 0.49942. lr 4.678990e-04:  62%|██████▏   | 1779/2863 [21:24<12:56,  1.40it/s][A
epoch 1 iter 1779: train loss 0.49942. lr 4.678990e-04:  62%|██████▏   | 1780/2863 [21:24<13:09,  1.37it/s][A
epoch 1 iter 1780: train loss 0.49950. lr 4.677625e-04:  62%|██████▏   | 1780/2863 [21:25<13:09,  1.37it/s][A
epoch 1 iter 1780: train loss 0.49950. lr 4.677625e-04:  62%|██████▏   | 1781/2863 [21:25<13:00,  1.39it/s][A
epoch 1 iter 1781: train loss 0.48413. lr 4.676260e-04:  62%|██████▏   | 1781/2863 [21:26<13:00,  1.39it/s][A
epoch 1 iter 1781: train loss 0.48413. lr 4.676260e-04:  62%|██████▏   | 1782/2863 [21:26<12:47,  1.41it/s][A
epoch 1 iter 1782: train loss 0.49576. lr 4.674894e-04:  62%|██████▏   | 1782/2863 [21:26<12:47,  1.41it/s][A
epoch 1 iter 1782: train loss 0.49576. lr 4.674894e-04:  62%|██████▏   | 1783/2863 [21:26<12:43,  1.41it/s][A
e

epoch 1 iter 1815: train loss 0.49795. lr 4.629545e-04:  63%|██████▎   | 1815/2863 [21:51<13:31,  1.29it/s][A
epoch 1 iter 1815: train loss 0.49795. lr 4.629545e-04:  63%|██████▎   | 1816/2863 [21:51<13:33,  1.29it/s][A
epoch 1 iter 1816: train loss 0.48109. lr 4.628162e-04:  63%|██████▎   | 1816/2863 [21:52<13:33,  1.29it/s][A
epoch 1 iter 1816: train loss 0.48109. lr 4.628162e-04:  63%|██████▎   | 1817/2863 [21:52<13:37,  1.28it/s][A
epoch 1 iter 1817: train loss 0.49574. lr 4.626779e-04:  63%|██████▎   | 1817/2863 [21:53<13:37,  1.28it/s][A
epoch 1 iter 1817: train loss 0.49574. lr 4.626779e-04:  63%|██████▎   | 1818/2863 [21:53<13:34,  1.28it/s][A
epoch 1 iter 1818: train loss 0.47440. lr 4.625395e-04:  63%|██████▎   | 1818/2863 [21:53<13:34,  1.28it/s][A
epoch 1 iter 1818: train loss 0.47440. lr 4.625395e-04:  64%|██████▎   | 1819/2863 [21:53<13:32,  1.29it/s][A
epoch 1 iter 1819: train loss 0.47864. lr 4.624011e-04:  64%|██████▎   | 1819/2863 [21:54<13:32,  1.29it/s][A
e

epoch 1 iter 1851: train loss 0.48521. lr 4.579464e-04:  65%|██████▍   | 1852/2863 [22:19<13:13,  1.27it/s][A
epoch 1 iter 1852: train loss 0.48168. lr 4.578064e-04:  65%|██████▍   | 1852/2863 [22:20<13:13,  1.27it/s][A
epoch 1 iter 1852: train loss 0.48168. lr 4.578064e-04:  65%|██████▍   | 1853/2863 [22:20<13:15,  1.27it/s][A
epoch 1 iter 1853: train loss 0.46758. lr 4.576663e-04:  65%|██████▍   | 1853/2863 [22:21<13:15,  1.27it/s][A
epoch 1 iter 1853: train loss 0.46758. lr 4.576663e-04:  65%|██████▍   | 1854/2863 [22:21<13:14,  1.27it/s][A
epoch 1 iter 1854: train loss 0.47686. lr 4.575262e-04:  65%|██████▍   | 1854/2863 [22:22<13:14,  1.27it/s][A
epoch 1 iter 1854: train loss 0.47686. lr 4.575262e-04:  65%|██████▍   | 1855/2863 [22:22<13:16,  1.26it/s][A
epoch 1 iter 1855: train loss 0.45808. lr 4.573861e-04:  65%|██████▍   | 1855/2863 [22:23<13:16,  1.26it/s][A
epoch 1 iter 1855: train loss 0.45808. lr 4.573861e-04:  65%|██████▍   | 1856/2863 [22:23<13:14,  1.27it/s][A
e

epoch 1 iter 1888: train loss 0.45490. lr 4.527349e-04:  66%|██████▌   | 1888/2863 [22:49<12:46,  1.27it/s][A
epoch 1 iter 1888: train loss 0.45490. lr 4.527349e-04:  66%|██████▌   | 1889/2863 [22:49<12:44,  1.27it/s][A
epoch 1 iter 1889: train loss 0.47021. lr 4.525932e-04:  66%|██████▌   | 1889/2863 [22:50<12:44,  1.27it/s][A
epoch 1 iter 1889: train loss 0.47021. lr 4.525932e-04:  66%|██████▌   | 1890/2863 [22:50<13:14,  1.23it/s][A
epoch 1 iter 1890: train loss 0.45787. lr 4.524514e-04:  66%|██████▌   | 1890/2863 [22:50<13:14,  1.23it/s][A
epoch 1 iter 1890: train loss 0.45787. lr 4.524514e-04:  66%|██████▌   | 1891/2863 [22:50<13:08,  1.23it/s][A
epoch 1 iter 1891: train loss 0.46595. lr 4.523096e-04:  66%|██████▌   | 1891/2863 [22:51<13:08,  1.23it/s][A
epoch 1 iter 1891: train loss 0.46595. lr 4.523096e-04:  66%|██████▌   | 1892/2863 [22:51<13:03,  1.24it/s][A
epoch 1 iter 1892: train loss 0.45737. lr 4.521677e-04:  66%|██████▌   | 1892/2863 [22:52<13:03,  1.24it/s][A
e

epoch 1 iter 1924: train loss 0.43362. lr 4.476039e-04:  67%|██████▋   | 1925/2863 [23:16<10:55,  1.43it/s][A
epoch 1 iter 1925: train loss 0.44585. lr 4.474605e-04:  67%|██████▋   | 1925/2863 [23:16<10:55,  1.43it/s][A
epoch 1 iter 1925: train loss 0.44585. lr 4.474605e-04:  67%|██████▋   | 1926/2863 [23:16<10:51,  1.44it/s][A
epoch 1 iter 1926: train loss 0.44498. lr 4.473171e-04:  67%|██████▋   | 1926/2863 [23:17<10:51,  1.44it/s][A
epoch 1 iter 1926: train loss 0.44498. lr 4.473171e-04:  67%|██████▋   | 1927/2863 [23:17<11:02,  1.41it/s][A
epoch 1 iter 1927: train loss 0.45401. lr 4.471736e-04:  67%|██████▋   | 1927/2863 [23:18<11:02,  1.41it/s][A
epoch 1 iter 1927: train loss 0.45401. lr 4.471736e-04:  67%|██████▋   | 1928/2863 [23:18<11:06,  1.40it/s][A
epoch 1 iter 1928: train loss 0.45280. lr 4.470301e-04:  67%|██████▋   | 1928/2863 [23:18<11:06,  1.40it/s][A
epoch 1 iter 1928: train loss 0.45280. lr 4.470301e-04:  67%|██████▋   | 1929/2863 [23:18<11:01,  1.41it/s][A
e

epoch 1 iter 1961: train loss 0.43564. lr 4.422702e-04:  68%|██████▊   | 1961/2863 [23:41<10:35,  1.42it/s][A
epoch 1 iter 1961: train loss 0.43564. lr 4.422702e-04:  69%|██████▊   | 1962/2863 [23:41<10:41,  1.41it/s][A
epoch 1 iter 1962: train loss 0.44022. lr 4.421253e-04:  69%|██████▊   | 1962/2863 [23:42<10:41,  1.41it/s][A
epoch 1 iter 1962: train loss 0.44022. lr 4.421253e-04:  69%|██████▊   | 1963/2863 [23:42<10:38,  1.41it/s][A
epoch 1 iter 1963: train loss 0.43492. lr 4.419802e-04:  69%|██████▊   | 1963/2863 [23:43<10:38,  1.41it/s][A
epoch 1 iter 1963: train loss 0.43492. lr 4.419802e-04:  69%|██████▊   | 1964/2863 [23:43<10:37,  1.41it/s][A
epoch 1 iter 1964: train loss 0.43876. lr 4.418352e-04:  69%|██████▊   | 1964/2863 [23:44<10:37,  1.41it/s][A
epoch 1 iter 1964: train loss 0.43876. lr 4.418352e-04:  69%|██████▊   | 1965/2863 [23:44<10:31,  1.42it/s][A
epoch 1 iter 1965: train loss 0.44453. lr 4.416901e-04:  69%|██████▊   | 1965/2863 [23:44<10:31,  1.42it/s][A
e

epoch 1 iter 1997: train loss 0.42586. lr 4.370244e-04:  70%|██████▉   | 1998/2863 [24:07<09:56,  1.45it/s][A
epoch 1 iter 1998: train loss 0.42793. lr 4.368780e-04:  70%|██████▉   | 1998/2863 [24:07<09:56,  1.45it/s][A
epoch 1 iter 1998: train loss 0.42793. lr 4.368780e-04:  70%|██████▉   | 1999/2863 [24:07<09:56,  1.45it/s][A
epoch 1 iter 1999: train loss 0.43811. lr 4.367314e-04:  70%|██████▉   | 1999/2863 [24:08<09:56,  1.45it/s][A
epoch 1 iter 1999: train loss 0.43811. lr 4.367314e-04:  70%|██████▉   | 2000/2863 [24:08<09:56,  1.45it/s][A
epoch 1 iter 2000: train loss 0.42378. lr 4.365848e-04:  70%|██████▉   | 2000/2863 [24:09<09:56,  1.45it/s][A
epoch 1 iter 2000: train loss 0.42378. lr 4.365848e-04:  70%|██████▉   | 2001/2863 [24:09<09:56,  1.45it/s][A
epoch 1 iter 2001: train loss 0.42159. lr 4.364382e-04:  70%|██████▉   | 2001/2863 [24:10<09:56,  1.45it/s][A
epoch 1 iter 2001: train loss 0.42159. lr 4.364382e-04:  70%|██████▉   | 2002/2863 [24:10<10:28,  1.37it/s][A
e

epoch 1 iter 2034: train loss 0.41809. lr 4.315772e-04:  71%|███████   | 2034/2863 [24:33<10:00,  1.38it/s][A
epoch 1 iter 2034: train loss 0.41809. lr 4.315772e-04:  71%|███████   | 2035/2863 [24:33<09:57,  1.39it/s][A
epoch 1 iter 2035: train loss 0.42296. lr 4.314292e-04:  71%|███████   | 2035/2863 [24:34<09:57,  1.39it/s][A
epoch 1 iter 2035: train loss 0.42296. lr 4.314292e-04:  71%|███████   | 2036/2863 [24:34<09:51,  1.40it/s][A
epoch 1 iter 2036: train loss 0.42949. lr 4.312812e-04:  71%|███████   | 2036/2863 [24:34<09:51,  1.40it/s][A
epoch 1 iter 2036: train loss 0.42949. lr 4.312812e-04:  71%|███████   | 2037/2863 [24:34<09:42,  1.42it/s][A
epoch 1 iter 2037: train loss 0.40985. lr 4.311331e-04:  71%|███████   | 2037/2863 [24:35<09:42,  1.42it/s][A
epoch 1 iter 2037: train loss 0.40985. lr 4.311331e-04:  71%|███████   | 2038/2863 [24:35<09:35,  1.43it/s][A
epoch 1 iter 2038: train loss 0.41672. lr 4.309850e-04:  71%|███████   | 2038/2863 [24:36<09:35,  1.43it/s][A
e

epoch 1 iter 2070: train loss 0.39842. lr 4.262251e-04:  72%|███████▏  | 2071/2863 [24:58<09:10,  1.44it/s][A
epoch 1 iter 2071: train loss 0.40927. lr 4.260757e-04:  72%|███████▏  | 2071/2863 [24:59<09:10,  1.44it/s][A
epoch 1 iter 2071: train loss 0.40927. lr 4.260757e-04:  72%|███████▏  | 2072/2863 [24:59<09:11,  1.43it/s][A
epoch 1 iter 2072: train loss 0.41595. lr 4.259263e-04:  72%|███████▏  | 2072/2863 [24:59<09:11,  1.43it/s][A
epoch 1 iter 2072: train loss 0.41595. lr 4.259263e-04:  72%|███████▏  | 2073/2863 [24:59<09:12,  1.43it/s][A
epoch 1 iter 2073: train loss 0.41237. lr 4.257769e-04:  72%|███████▏  | 2073/2863 [25:00<09:12,  1.43it/s][A
epoch 1 iter 2073: train loss 0.41237. lr 4.257769e-04:  72%|███████▏  | 2074/2863 [25:00<09:10,  1.43it/s][A
epoch 1 iter 2074: train loss 0.40860. lr 4.256274e-04:  72%|███████▏  | 2074/2863 [25:01<09:10,  1.43it/s][A
epoch 1 iter 2074: train loss 0.40860. lr 4.256274e-04:  72%|███████▏  | 2075/2863 [25:01<09:07,  1.44it/s][A
e

epoch 1 iter 2107: train loss 0.41453. lr 4.206730e-04:  74%|███████▎  | 2107/2863 [25:24<08:45,  1.44it/s][A
epoch 1 iter 2107: train loss 0.41453. lr 4.206730e-04:  74%|███████▎  | 2108/2863 [25:24<08:44,  1.44it/s][A
epoch 1 iter 2108: train loss 0.39743. lr 4.205223e-04:  74%|███████▎  | 2108/2863 [25:25<08:44,  1.44it/s][A
epoch 1 iter 2108: train loss 0.39743. lr 4.205223e-04:  74%|███████▎  | 2109/2863 [25:25<08:41,  1.44it/s][A
epoch 1 iter 2109: train loss 0.39567. lr 4.203715e-04:  74%|███████▎  | 2109/2863 [25:26<08:41,  1.44it/s][A
epoch 1 iter 2109: train loss 0.39567. lr 4.203715e-04:  74%|███████▎  | 2110/2863 [25:26<08:39,  1.45it/s][A
epoch 1 iter 2110: train loss 0.39856. lr 4.202206e-04:  74%|███████▎  | 2110/2863 [25:26<08:39,  1.45it/s][A
epoch 1 iter 2110: train loss 0.39856. lr 4.202206e-04:  74%|███████▎  | 2111/2863 [25:26<08:40,  1.45it/s][A
epoch 1 iter 2111: train loss 0.40337. lr 4.200698e-04:  74%|███████▎  | 2111/2863 [25:27<08:40,  1.45it/s][A
e

epoch 1 iter 2143: train loss 0.38391. lr 4.152232e-04:  75%|███████▍  | 2144/2863 [25:50<08:32,  1.40it/s][A
epoch 1 iter 2144: train loss 0.39581. lr 4.150712e-04:  75%|███████▍  | 2144/2863 [25:50<08:32,  1.40it/s][A
epoch 1 iter 2144: train loss 0.39581. lr 4.150712e-04:  75%|███████▍  | 2145/2863 [25:50<08:27,  1.42it/s][A
epoch 1 iter 2145: train loss 0.36894. lr 4.149191e-04:  75%|███████▍  | 2145/2863 [25:51<08:27,  1.42it/s][A
epoch 1 iter 2145: train loss 0.36894. lr 4.149191e-04:  75%|███████▍  | 2146/2863 [25:51<08:50,  1.35it/s][A
epoch 1 iter 2146: train loss 0.39173. lr 4.147670e-04:  75%|███████▍  | 2146/2863 [25:52<08:50,  1.35it/s][A
epoch 1 iter 2146: train loss 0.39173. lr 4.147670e-04:  75%|███████▍  | 2147/2863 [25:52<09:04,  1.32it/s][A
epoch 1 iter 2147: train loss 0.38199. lr 4.146149e-04:  75%|███████▍  | 2147/2863 [25:53<09:04,  1.32it/s][A
epoch 1 iter 2147: train loss 0.38199. lr 4.146149e-04:  75%|███████▌  | 2148/2863 [25:53<09:03,  1.31it/s][A
e

epoch 1 iter 2180: train loss 0.38893. lr 4.095752e-04:  76%|███████▌  | 2180/2863 [26:16<08:08,  1.40it/s][A
epoch 1 iter 2180: train loss 0.38893. lr 4.095752e-04:  76%|███████▌  | 2181/2863 [26:16<08:06,  1.40it/s][A
epoch 1 iter 2181: train loss 0.37667. lr 4.094219e-04:  76%|███████▌  | 2181/2863 [26:17<08:06,  1.40it/s][A
epoch 1 iter 2181: train loss 0.37667. lr 4.094219e-04:  76%|███████▌  | 2182/2863 [26:17<08:03,  1.41it/s][A
epoch 1 iter 2182: train loss 0.38661. lr 4.092686e-04:  76%|███████▌  | 2182/2863 [26:17<08:03,  1.41it/s][A
epoch 1 iter 2182: train loss 0.38661. lr 4.092686e-04:  76%|███████▌  | 2183/2863 [26:17<08:01,  1.41it/s][A
epoch 1 iter 2183: train loss 0.37772. lr 4.091152e-04:  76%|███████▌  | 2183/2863 [26:18<08:01,  1.41it/s][A
epoch 1 iter 2183: train loss 0.37772. lr 4.091152e-04:  76%|███████▋  | 2184/2863 [26:18<07:55,  1.43it/s][A
epoch 1 iter 2184: train loss 0.38828. lr 4.089618e-04:  76%|███████▋  | 2184/2863 [26:19<07:55,  1.43it/s][A
e

epoch 1 iter 2216: train loss 0.36500. lr 4.040364e-04:  77%|███████▋  | 2217/2863 [26:41<07:27,  1.44it/s][A
epoch 1 iter 2217: train loss 0.37717. lr 4.038819e-04:  77%|███████▋  | 2217/2863 [26:42<07:27,  1.44it/s][A
epoch 1 iter 2217: train loss 0.37717. lr 4.038819e-04:  77%|███████▋  | 2218/2863 [26:42<07:27,  1.44it/s][A
epoch 1 iter 2218: train loss 0.37468. lr 4.037275e-04:  77%|███████▋  | 2218/2863 [26:42<07:27,  1.44it/s][A
epoch 1 iter 2218: train loss 0.37468. lr 4.037275e-04:  78%|███████▊  | 2219/2863 [26:42<07:26,  1.44it/s][A
epoch 1 iter 2219: train loss 0.36163. lr 4.035730e-04:  78%|███████▊  | 2219/2863 [26:43<07:26,  1.44it/s][A
epoch 1 iter 2219: train loss 0.36163. lr 4.035730e-04:  78%|███████▊  | 2220/2863 [26:43<07:28,  1.43it/s][A
epoch 1 iter 2220: train loss 0.37663. lr 4.034184e-04:  78%|███████▊  | 2220/2863 [26:44<07:28,  1.43it/s][A
epoch 1 iter 2220: train loss 0.37663. lr 4.034184e-04:  78%|███████▊  | 2221/2863 [26:44<07:29,  1.43it/s][A
e

epoch 1 iter 2253: train loss 0.36601. lr 3.983014e-04:  79%|███████▊  | 2253/2863 [27:07<07:02,  1.44it/s][A
epoch 1 iter 2253: train loss 0.36601. lr 3.983014e-04:  79%|███████▊  | 2254/2863 [27:07<07:23,  1.37it/s][A
epoch 1 iter 2254: train loss 0.37905. lr 3.981459e-04:  79%|███████▊  | 2254/2863 [27:08<07:23,  1.37it/s][A
epoch 1 iter 2254: train loss 0.37905. lr 3.981459e-04:  79%|███████▉  | 2255/2863 [27:08<07:16,  1.39it/s][A
epoch 1 iter 2255: train loss 0.36726. lr 3.979903e-04:  79%|███████▉  | 2255/2863 [27:08<07:16,  1.39it/s][A
epoch 1 iter 2255: train loss 0.36726. lr 3.979903e-04:  79%|███████▉  | 2256/2863 [27:08<07:11,  1.41it/s][A
epoch 1 iter 2256: train loss 0.36537. lr 3.978346e-04:  79%|███████▉  | 2256/2863 [27:09<07:11,  1.41it/s][A
epoch 1 iter 2256: train loss 0.36537. lr 3.978346e-04:  79%|███████▉  | 2257/2863 [27:09<07:06,  1.42it/s][A
epoch 1 iter 2257: train loss 0.37930. lr 3.976790e-04:  79%|███████▉  | 2257/2863 [27:10<07:06,  1.42it/s][A
e

epoch 1 iter 2289: train loss 0.36642. lr 3.926826e-04:  80%|███████▉  | 2290/2863 [27:32<06:41,  1.43it/s][A
epoch 1 iter 2290: train loss 0.36288. lr 3.925260e-04:  80%|███████▉  | 2290/2863 [27:33<06:41,  1.43it/s][A
epoch 1 iter 2290: train loss 0.36288. lr 3.925260e-04:  80%|████████  | 2291/2863 [27:33<06:42,  1.42it/s][A
epoch 1 iter 2291: train loss 0.35989. lr 3.923693e-04:  80%|████████  | 2291/2863 [27:34<06:42,  1.42it/s][A
epoch 1 iter 2291: train loss 0.35989. lr 3.923693e-04:  80%|████████  | 2292/2863 [27:34<06:43,  1.42it/s][A
epoch 1 iter 2292: train loss 0.35417. lr 3.922127e-04:  80%|████████  | 2292/2863 [27:34<06:43,  1.42it/s][A
epoch 1 iter 2292: train loss 0.35417. lr 3.922127e-04:  80%|████████  | 2293/2863 [27:34<06:39,  1.43it/s][A
epoch 1 iter 2293: train loss 0.35751. lr 3.920560e-04:  80%|████████  | 2293/2863 [27:35<06:39,  1.43it/s][A
epoch 1 iter 2293: train loss 0.35751. lr 3.920560e-04:  80%|████████  | 2294/2863 [27:35<06:35,  1.44it/s][A
e

epoch 1 iter 2326: train loss 0.34986. lr 3.868700e-04:  81%|████████  | 2326/2863 [27:58<06:26,  1.39it/s][A
epoch 1 iter 2326: train loss 0.34986. lr 3.868700e-04:  81%|████████▏ | 2327/2863 [27:58<06:37,  1.35it/s][A
epoch 1 iter 2327: train loss 0.34530. lr 3.867124e-04:  81%|████████▏ | 2327/2863 [27:59<06:37,  1.35it/s][A
epoch 1 iter 2327: train loss 0.34530. lr 3.867124e-04:  81%|████████▏ | 2328/2863 [27:59<06:38,  1.34it/s][A
epoch 1 iter 2328: train loss 0.35869. lr 3.865547e-04:  81%|████████▏ | 2328/2863 [28:00<06:38,  1.34it/s][A
epoch 1 iter 2328: train loss 0.35869. lr 3.865547e-04:  81%|████████▏ | 2329/2863 [28:00<06:36,  1.35it/s][A
epoch 1 iter 2329: train loss 0.35635. lr 3.863971e-04:  81%|████████▏ | 2329/2863 [28:01<06:36,  1.35it/s][A
epoch 1 iter 2329: train loss 0.35635. lr 3.863971e-04:  81%|████████▏ | 2330/2863 [28:01<06:27,  1.38it/s][A
epoch 1 iter 2330: train loss 0.35551. lr 3.862394e-04:  81%|████████▏ | 2330/2863 [28:01<06:27,  1.38it/s][A
e

epoch 1 iter 2362: train loss 0.34282. lr 3.811800e-04:  83%|████████▎ | 2363/2863 [28:23<05:47,  1.44it/s][A
epoch 1 iter 2363: train loss 0.34630. lr 3.810215e-04:  83%|████████▎ | 2363/2863 [28:24<05:47,  1.44it/s][A
epoch 1 iter 2363: train loss 0.34630. lr 3.810215e-04:  83%|████████▎ | 2364/2863 [28:24<05:45,  1.44it/s][A
epoch 1 iter 2364: train loss 0.35561. lr 3.808630e-04:  83%|████████▎ | 2364/2863 [28:25<05:45,  1.44it/s][A
epoch 1 iter 2364: train loss 0.35561. lr 3.808630e-04:  83%|████████▎ | 2365/2863 [28:25<05:45,  1.44it/s][A
epoch 1 iter 2365: train loss 0.35422. lr 3.807044e-04:  83%|████████▎ | 2365/2863 [28:26<05:45,  1.44it/s][A
epoch 1 iter 2365: train loss 0.35422. lr 3.807044e-04:  83%|████████▎ | 2366/2863 [28:26<06:02,  1.37it/s][A
epoch 1 iter 2366: train loss 0.35452. lr 3.805458e-04:  83%|████████▎ | 2366/2863 [28:26<06:02,  1.37it/s][A
epoch 1 iter 2366: train loss 0.35452. lr 3.805458e-04:  83%|████████▎ | 2367/2863 [28:26<05:55,  1.39it/s][A
e

epoch 1 iter 2399: train loss 0.33613. lr 3.752991e-04:  84%|████████▍ | 2399/2863 [28:50<05:25,  1.43it/s][A
epoch 1 iter 2399: train loss 0.33613. lr 3.752991e-04:  84%|████████▍ | 2400/2863 [28:50<05:23,  1.43it/s][A
epoch 1 iter 2400: train loss 0.33362. lr 3.751397e-04:  84%|████████▍ | 2400/2863 [28:50<05:23,  1.43it/s][A
epoch 1 iter 2400: train loss 0.33362. lr 3.751397e-04:  84%|████████▍ | 2401/2863 [28:50<05:21,  1.44it/s][A
epoch 1 iter 2401: train loss 0.33626. lr 3.749803e-04:  84%|████████▍ | 2401/2863 [28:51<05:21,  1.44it/s][A
epoch 1 iter 2401: train loss 0.33626. lr 3.749803e-04:  84%|████████▍ | 2402/2863 [28:51<05:19,  1.44it/s][A
epoch 1 iter 2402: train loss 0.34453. lr 3.748208e-04:  84%|████████▍ | 2402/2863 [28:52<05:19,  1.44it/s][A
epoch 1 iter 2402: train loss 0.34453. lr 3.748208e-04:  84%|████████▍ | 2403/2863 [28:52<05:21,  1.43it/s][A
epoch 1 iter 2403: train loss 0.33634. lr 3.746614e-04:  84%|████████▍ | 2403/2863 [28:52<05:21,  1.43it/s][A
e

epoch 1 iter 2435: train loss 0.33769. lr 3.695472e-04:  85%|████████▌ | 2436/2863 [29:15<04:54,  1.45it/s][A
epoch 1 iter 2436: train loss 0.33202. lr 3.693870e-04:  85%|████████▌ | 2436/2863 [29:15<04:54,  1.45it/s][A
epoch 1 iter 2436: train loss 0.33202. lr 3.693870e-04:  85%|████████▌ | 2437/2863 [29:15<04:54,  1.45it/s][A
epoch 1 iter 2437: train loss 0.33064. lr 3.692269e-04:  85%|████████▌ | 2437/2863 [29:16<04:54,  1.45it/s][A
epoch 1 iter 2437: train loss 0.33064. lr 3.692269e-04:  85%|████████▌ | 2438/2863 [29:16<04:53,  1.45it/s][A
epoch 1 iter 2438: train loss 0.32413. lr 3.690666e-04:  85%|████████▌ | 2438/2863 [29:17<04:53,  1.45it/s][A
epoch 1 iter 2438: train loss 0.32413. lr 3.690666e-04:  85%|████████▌ | 2439/2863 [29:17<04:52,  1.45it/s][A
epoch 1 iter 2439: train loss 0.34005. lr 3.689064e-04:  85%|████████▌ | 2439/2863 [29:18<04:52,  1.45it/s][A
epoch 1 iter 2439: train loss 0.34005. lr 3.689064e-04:  85%|████████▌ | 2440/2863 [29:18<04:52,  1.45it/s][A
e

epoch 1 iter 2472: train loss 0.32308. lr 3.636073e-04:  86%|████████▋ | 2472/2863 [29:41<04:32,  1.43it/s][A
epoch 1 iter 2472: train loss 0.32308. lr 3.636073e-04:  86%|████████▋ | 2473/2863 [29:41<04:30,  1.44it/s][A
epoch 1 iter 2473: train loss 0.32588. lr 3.634464e-04:  86%|████████▋ | 2473/2863 [29:41<04:30,  1.44it/s][A
epoch 1 iter 2473: train loss 0.32588. lr 3.634464e-04:  86%|████████▋ | 2474/2863 [29:41<04:29,  1.44it/s][A
epoch 1 iter 2474: train loss 0.32634. lr 3.632855e-04:  86%|████████▋ | 2474/2863 [29:42<04:29,  1.44it/s][A
epoch 1 iter 2474: train loss 0.32634. lr 3.632855e-04:  86%|████████▋ | 2475/2863 [29:42<04:28,  1.44it/s][A
epoch 1 iter 2475: train loss 0.33144. lr 3.631245e-04:  86%|████████▋ | 2475/2863 [29:43<04:28,  1.44it/s][A
epoch 1 iter 2475: train loss 0.33144. lr 3.631245e-04:  86%|████████▋ | 2476/2863 [29:43<04:28,  1.44it/s][A
epoch 1 iter 2476: train loss 0.33241. lr 3.629636e-04:  86%|████████▋ | 2476/2863 [29:44<04:28,  1.44it/s][A
e

epoch 1 iter 2508: train loss 0.33082. lr 3.578028e-04:  88%|████████▊ | 2509/2863 [30:08<04:39,  1.27it/s][A
epoch 1 iter 2509: train loss 0.32306. lr 3.576412e-04:  88%|████████▊ | 2509/2863 [30:09<04:39,  1.27it/s][A
epoch 1 iter 2509: train loss 0.32306. lr 3.576412e-04:  88%|████████▊ | 2510/2863 [30:09<04:38,  1.27it/s][A
epoch 1 iter 2510: train loss 0.31217. lr 3.574796e-04:  88%|████████▊ | 2510/2863 [30:09<04:38,  1.27it/s][A
epoch 1 iter 2510: train loss 0.31217. lr 3.574796e-04:  88%|████████▊ | 2511/2863 [30:09<04:35,  1.28it/s][A
epoch 1 iter 2511: train loss 0.32809. lr 3.573180e-04:  88%|████████▊ | 2511/2863 [30:10<04:35,  1.28it/s][A
epoch 1 iter 2511: train loss 0.32809. lr 3.573180e-04:  88%|████████▊ | 2512/2863 [30:10<04:34,  1.28it/s][A
epoch 1 iter 2512: train loss 0.31155. lr 3.571564e-04:  88%|████████▊ | 2512/2863 [30:11<04:34,  1.28it/s][A
epoch 1 iter 2512: train loss 0.31155. lr 3.571564e-04:  88%|████████▊ | 2513/2863 [30:11<04:32,  1.28it/s][A
e

epoch 1 iter 2545: train loss 0.31083. lr 3.518135e-04:  89%|████████▉ | 2545/2863 [30:37<04:12,  1.26it/s][A
epoch 1 iter 2545: train loss 0.31083. lr 3.518135e-04:  89%|████████▉ | 2546/2863 [30:37<04:11,  1.26it/s][A
epoch 1 iter 2546: train loss 0.32598. lr 3.516513e-04:  89%|████████▉ | 2546/2863 [30:38<04:11,  1.26it/s][A
epoch 1 iter 2546: train loss 0.32598. lr 3.516513e-04:  89%|████████▉ | 2547/2863 [30:38<04:09,  1.27it/s][A
epoch 1 iter 2547: train loss 0.30646. lr 3.514891e-04:  89%|████████▉ | 2547/2863 [30:39<04:09,  1.27it/s][A
epoch 1 iter 2547: train loss 0.30646. lr 3.514891e-04:  89%|████████▉ | 2548/2863 [30:39<04:08,  1.27it/s][A
epoch 1 iter 2548: train loss 0.30446. lr 3.513269e-04:  89%|████████▉ | 2548/2863 [30:39<04:08,  1.27it/s][A
epoch 1 iter 2548: train loss 0.30446. lr 3.513269e-04:  89%|████████▉ | 2549/2863 [30:39<04:05,  1.28it/s][A
epoch 1 iter 2549: train loss 0.31932. lr 3.511647e-04:  89%|████████▉ | 2549/2863 [30:40<04:05,  1.28it/s][A
e

epoch 1 iter 2581: train loss 0.31857. lr 3.459656e-04:  90%|█████████ | 2582/2863 [31:06<03:42,  1.26it/s][A
epoch 1 iter 2582: train loss 0.31953. lr 3.458028e-04:  90%|█████████ | 2582/2863 [31:06<03:42,  1.26it/s][A
epoch 1 iter 2582: train loss 0.31953. lr 3.458028e-04:  90%|█████████ | 2583/2863 [31:06<03:41,  1.26it/s][A
epoch 1 iter 2583: train loss 0.31177. lr 3.456401e-04:  90%|█████████ | 2583/2863 [31:07<03:41,  1.26it/s][A
epoch 1 iter 2583: train loss 0.31177. lr 3.456401e-04:  90%|█████████ | 2584/2863 [31:07<03:40,  1.26it/s][A
epoch 1 iter 2584: train loss 0.32451. lr 3.454774e-04:  90%|█████████ | 2584/2863 [31:08<03:40,  1.26it/s][A
epoch 1 iter 2584: train loss 0.32451. lr 3.454774e-04:  90%|█████████ | 2585/2863 [31:08<03:42,  1.25it/s][A
epoch 1 iter 2585: train loss 0.30785. lr 3.453146e-04:  90%|█████████ | 2585/2863 [31:09<03:42,  1.25it/s][A
epoch 1 iter 2585: train loss 0.30785. lr 3.453146e-04:  90%|█████████ | 2586/2863 [31:09<03:40,  1.25it/s][A
e

epoch 1 iter 2618: train loss 0.31605. lr 3.399365e-04:  91%|█████████▏| 2618/2863 [31:33<03:07,  1.31it/s][A
epoch 1 iter 2618: train loss 0.31605. lr 3.399365e-04:  91%|█████████▏| 2619/2863 [31:33<03:03,  1.33it/s][A
epoch 1 iter 2619: train loss 0.31052. lr 3.397733e-04:  91%|█████████▏| 2619/2863 [31:34<03:03,  1.33it/s][A
epoch 1 iter 2619: train loss 0.31052. lr 3.397733e-04:  92%|█████████▏| 2620/2863 [31:34<02:59,  1.36it/s][A
epoch 1 iter 2620: train loss 0.30297. lr 3.396101e-04:  92%|█████████▏| 2620/2863 [31:35<02:59,  1.36it/s][A
epoch 1 iter 2620: train loss 0.30297. lr 3.396101e-04:  92%|█████████▏| 2621/2863 [31:35<02:55,  1.38it/s][A
epoch 1 iter 2621: train loss 0.30442. lr 3.394469e-04:  92%|█████████▏| 2621/2863 [31:35<02:55,  1.38it/s][A
epoch 1 iter 2621: train loss 0.30442. lr 3.394469e-04:  92%|█████████▏| 2622/2863 [31:35<02:51,  1.40it/s][A
epoch 1 iter 2622: train loss 0.31159. lr 3.392837e-04:  92%|█████████▏| 2622/2863 [31:36<02:51,  1.40it/s][A
e

epoch 1 iter 2654: train loss 0.31169. lr 3.340546e-04:  93%|█████████▎| 2655/2863 [31:59<02:31,  1.37it/s][A
epoch 1 iter 2655: train loss 0.30183. lr 3.338910e-04:  93%|█████████▎| 2655/2863 [31:59<02:31,  1.37it/s][A
epoch 1 iter 2655: train loss 0.30183. lr 3.338910e-04:  93%|█████████▎| 2656/2863 [31:59<02:30,  1.38it/s][A
epoch 1 iter 2656: train loss 0.30690. lr 3.337274e-04:  93%|█████████▎| 2656/2863 [32:00<02:30,  1.38it/s][A
epoch 1 iter 2656: train loss 0.30690. lr 3.337274e-04:  93%|█████████▎| 2657/2863 [32:00<02:28,  1.39it/s][A
epoch 1 iter 2657: train loss 0.30475. lr 3.335638e-04:  93%|█████████▎| 2657/2863 [32:01<02:28,  1.39it/s][A
epoch 1 iter 2657: train loss 0.30475. lr 3.335638e-04:  93%|█████████▎| 2658/2863 [32:01<02:26,  1.40it/s][A
epoch 1 iter 2658: train loss 0.30689. lr 3.334002e-04:  93%|█████████▎| 2658/2863 [32:02<02:26,  1.40it/s][A
epoch 1 iter 2658: train loss 0.30689. lr 3.334002e-04:  93%|█████████▎| 2659/2863 [32:02<02:25,  1.41it/s][A
e

epoch 1 iter 2691: train loss 0.29213. lr 3.279954e-04:  94%|█████████▍| 2691/2863 [32:25<02:01,  1.41it/s][A
epoch 1 iter 2691: train loss 0.29213. lr 3.279954e-04:  94%|█████████▍| 2692/2863 [32:25<02:00,  1.42it/s][A
epoch 1 iter 2692: train loss 0.30893. lr 3.278315e-04:  94%|█████████▍| 2692/2863 [32:26<02:00,  1.42it/s][A
epoch 1 iter 2692: train loss 0.30893. lr 3.278315e-04:  94%|█████████▍| 2693/2863 [32:26<01:59,  1.43it/s][A
epoch 1 iter 2693: train loss 0.30142. lr 3.276675e-04:  94%|█████████▍| 2693/2863 [32:26<01:59,  1.43it/s][A
epoch 1 iter 2693: train loss 0.30142. lr 3.276675e-04:  94%|█████████▍| 2694/2863 [32:26<01:57,  1.43it/s][A
epoch 1 iter 2694: train loss 0.30302. lr 3.275036e-04:  94%|█████████▍| 2694/2863 [32:27<01:57,  1.43it/s][A
epoch 1 iter 2694: train loss 0.30302. lr 3.275036e-04:  94%|█████████▍| 2695/2863 [32:27<01:57,  1.43it/s][A
epoch 1 iter 2695: train loss 0.29676. lr 3.273396e-04:  94%|█████████▍| 2695/2863 [32:28<01:57,  1.43it/s][A
e

epoch 1 iter 2727: train loss 0.29667. lr 3.220889e-04:  95%|█████████▌| 2728/2863 [32:52<01:45,  1.29it/s][A
epoch 1 iter 2728: train loss 0.29614. lr 3.219247e-04:  95%|█████████▌| 2728/2863 [32:53<01:45,  1.29it/s][A
epoch 1 iter 2728: train loss 0.29614. lr 3.219247e-04:  95%|█████████▌| 2729/2863 [32:53<01:44,  1.28it/s][A
epoch 1 iter 2729: train loss 0.30021. lr 3.217605e-04:  95%|█████████▌| 2729/2863 [32:54<01:44,  1.28it/s][A
epoch 1 iter 2729: train loss 0.30021. lr 3.217605e-04:  95%|█████████▌| 2730/2863 [32:54<01:48,  1.23it/s][A
epoch 1 iter 2730: train loss 0.30089. lr 3.215963e-04:  95%|█████████▌| 2730/2863 [32:54<01:48,  1.23it/s][A
epoch 1 iter 2730: train loss 0.30089. lr 3.215963e-04:  95%|█████████▌| 2731/2863 [32:54<01:45,  1.25it/s][A
epoch 1 iter 2731: train loss 0.30665. lr 3.214321e-04:  95%|█████████▌| 2731/2863 [32:55<01:45,  1.25it/s][A
epoch 1 iter 2731: train loss 0.30665. lr 3.214321e-04:  95%|█████████▌| 2732/2863 [32:55<01:44,  1.26it/s][A
e

epoch 1 iter 2764: train loss 0.28461. lr 3.160094e-04:  97%|█████████▋| 2764/2863 [33:21<01:18,  1.26it/s][A
epoch 1 iter 2764: train loss 0.28461. lr 3.160094e-04:  97%|█████████▋| 2765/2863 [33:21<01:17,  1.27it/s][A
epoch 1 iter 2765: train loss 0.29497. lr 3.158450e-04:  97%|█████████▋| 2765/2863 [33:22<01:17,  1.27it/s][A
epoch 1 iter 2765: train loss 0.29497. lr 3.158450e-04:  97%|█████████▋| 2766/2863 [33:22<01:16,  1.27it/s][A
epoch 1 iter 2766: train loss 0.28862. lr 3.156806e-04:  97%|█████████▋| 2766/2863 [33:23<01:16,  1.27it/s][A
epoch 1 iter 2766: train loss 0.28862. lr 3.156806e-04:  97%|█████████▋| 2767/2863 [33:23<01:15,  1.27it/s][A
epoch 1 iter 2767: train loss 0.28171. lr 3.155162e-04:  97%|█████████▋| 2767/2863 [33:24<01:15,  1.27it/s][A
epoch 1 iter 2767: train loss 0.28171. lr 3.155162e-04:  97%|█████████▋| 2768/2863 [33:24<01:15,  1.26it/s][A
epoch 1 iter 2768: train loss 0.27717. lr 3.153517e-04:  97%|█████████▋| 2768/2863 [33:24<01:15,  1.26it/s][A
e

epoch 1 iter 2800: train loss 0.27686. lr 3.100878e-04:  98%|█████████▊| 2801/2863 [33:50<00:48,  1.27it/s][A
epoch 1 iter 2801: train loss 0.28917. lr 3.099233e-04:  98%|█████████▊| 2801/2863 [33:50<00:48,  1.27it/s][A
epoch 1 iter 2801: train loss 0.28917. lr 3.099233e-04:  98%|█████████▊| 2802/2863 [33:50<00:47,  1.28it/s][A
epoch 1 iter 2802: train loss 0.28973. lr 3.097587e-04:  98%|█████████▊| 2802/2863 [33:51<00:47,  1.28it/s][A
epoch 1 iter 2802: train loss 0.28973. lr 3.097587e-04:  98%|█████████▊| 2803/2863 [33:51<00:47,  1.28it/s][A
epoch 1 iter 2803: train loss 0.29727. lr 3.095942e-04:  98%|█████████▊| 2803/2863 [33:52<00:47,  1.28it/s][A
epoch 1 iter 2803: train loss 0.29727. lr 3.095942e-04:  98%|█████████▊| 2804/2863 [33:52<00:46,  1.27it/s][A
epoch 1 iter 2804: train loss 0.27966. lr 3.094296e-04:  98%|█████████▊| 2804/2863 [33:53<00:46,  1.27it/s][A
epoch 1 iter 2804: train loss 0.27966. lr 3.094296e-04:  98%|█████████▊| 2805/2863 [33:53<00:45,  1.27it/s][A
e

epoch 1 iter 2837: train loss 0.28168. lr 3.039977e-04:  99%|█████████▉| 2837/2863 [34:17<00:18,  1.42it/s][A
epoch 1 iter 2837: train loss 0.28168. lr 3.039977e-04:  99%|█████████▉| 2838/2863 [34:17<00:17,  1.43it/s][A
epoch 1 iter 2838: train loss 0.28607. lr 3.038331e-04:  99%|█████████▉| 2838/2863 [34:18<00:17,  1.43it/s][A
epoch 1 iter 2838: train loss 0.28607. lr 3.038331e-04:  99%|█████████▉| 2839/2863 [34:18<00:16,  1.44it/s][A
epoch 1 iter 2839: train loss 0.26548. lr 3.036684e-04:  99%|█████████▉| 2839/2863 [34:19<00:16,  1.44it/s][A
epoch 1 iter 2839: train loss 0.26548. lr 3.036684e-04:  99%|█████████▉| 2840/2863 [34:19<00:15,  1.44it/s][A
epoch 1 iter 2840: train loss 0.26813. lr 3.035038e-04:  99%|█████████▉| 2840/2863 [34:19<00:15,  1.44it/s][A
epoch 1 iter 2840: train loss 0.26813. lr 3.035038e-04:  99%|█████████▉| 2841/2863 [34:19<00:15,  1.44it/s][A
epoch 1 iter 2841: train loss 0.28392. lr 3.033392e-04:  99%|█████████▉| 2841/2863 [34:20<00:15,  1.44it/s][A
e

data has 1895152 characters, 44695 unique.



  0%|          | 0/7403 [00:01<?, ?it/s][A
 10%|█         | 1/10 [36:24<5:27:43, 2184.87s/it]


RuntimeError: CUDA out of memory. Tried to allocate 1.37 GiB (GPU 3; 23.65 GiB total capacity; 7.18 GiB already allocated; 180.12 MiB free; 8.32 GiB reserved in total by PyTorch) (malloc at /pytorch/c10/cuda/CUDACachingAllocator.cpp:289)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x46 (0x7fbfe9ccd536 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1cf1e (0x7fbfe9f16f1e in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x1df9e (0x7fbfe9f17f9e in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libc10_cuda.so)
frame #3: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x135 (0x7fbfecaa59e5 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xf688bb (0x7fbfeb0918bb in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0xfb21a7 (0x7fbfeb0db1a7 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0x1073c49 (0x7fc0279aec49 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0x1073f87 (0x7fc0279aef87 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0xe1ff1e (0x7fc02775af1e in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #9: at::native::empty_like(at::Tensor const&, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x9e0 (0x7fc027761810 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x1132be1 (0x7fc027a6dbe1 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #11: <unknown function> + 0x1185ee3 (0x7fc027ac0ee3 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #12: <unknown function> + 0x28aa4c2 (0x7fbfec9d34c2 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #13: at::Tensor at::native::(anonymous namespace)::host_softmax_backward<at::native::(anonymous namespace)::LogSoftMaxBackwardEpilogue, true>(at::Tensor const&, at::Tensor const&, long, bool) + 0xb9 (0x7fbfec9e7ed9 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #14: at::native::log_softmax_backward_cuda(at::Tensor const&, at::Tensor const&, long, at::Tensor const&) + 0x99 (0x7fbfec9d3c59 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #15: <unknown function> + 0xf73540 (0x7fbfeb09c540 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #16: <unknown function> + 0x10c4396 (0x7fc0279ff396 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #17: <unknown function> + 0x2ca977c (0x7fc0295e477c in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #18: <unknown function> + 0x10c4396 (0x7fc0279ff396 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #19: torch::autograd::generated::LogSoftmaxBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x1c9 (0x7fc0291e0859 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #20: <unknown function> + 0x2d89705 (0x7fc0296c4705 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #21: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x16f3 (0x7fc0296c1a03 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #22: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x7fc0296c27e2 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #23: torch::autograd::Engine::thread_init(int) + 0x39 (0x7fc0296bae59 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #24: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7fc035ffe5f8 in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #25: <unknown function> + 0xc819d (0x7fc0686bc19d in /home/mlepekhin/anaconda3/envs/mlepekhin_research/lib/python3.8/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6)
frame #26: <unknown function> + 0x9669 (0x7fc06ad99669 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #27: clone + 0x43 (0x7fc06acc1323 in /lib/x86_64-linux-gnu/libc.so.6)


1
