<a href="https://colab.research.google.com/github/JuCarv-bit/exercises-deep-learning-curriculum/blob/main/Notebook_running_shakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Theoretical Discussion - Week 1

**- What is different architecturally from the Transformer, vs a normal RNN, like an LSTM? (Specifically, how are recurrence and time managed?)**

 Transformers work with parallel inputs (multi-head attention), while RNNs and LSTMs have a time component and a hidden state. Recurrent layers are present in RNN, while transform uses MHA layers.





- Attention is defined as, Attention(Q,K,V) = softmax(QK^T/sqrt(d_k))V. What are the dimensions for Q, K, and V? Why do we use this setup? What other combinations could we do with (Q,K) that also output weights?

The dimension of the matrix Query $Q$ and $K$ is $(dk, d_{model})$, and Value is $(dv, d_{model})$.

Computation of multiheaded attention (mostly) constant.

The dimensions of (Q,K) must be equal



- Are the dense layers different at each multi-head attention block? Why or why not?

The architecture is the same (self-attention + feed forward layer)


- Why do we have so many skip connections, especially connecting the input of an attention function to the output? Intuitively, what if we didn't?

The skipping connections prevent the vanishing and exploding gradient. The model can have a worst performance if not using it.

In [1]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

In [2]:
vocab_size = 10
d_model = 512 #128

num_heads = 8 #4
num_layers = 6 #5
dropout = 0.1 #0.3
batch_size = 32 #
len_seq = 6

# 5000 permutacoes de 8 digitos

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
seq = [2, 8, 8, 0, 7, 1, 1]

# X = torch.tensor(seq).unsqueeze(0).to(torch.int64).to(device)
# y = torch.flip(X, (-1,)).to(torch.int64).to(device)

class GenerateDataset():
    def __init__(self, numdigits, vocabsize):
        self.numdigits = numdigits
        self.vocabsize = vocabsize
        # create a dataset of vocabsize**numdigits rows and vocabsize columns
        self.data = torch.randint(0, vocabsize, (vocabsize**4, numdigits)).to(device)
        self.data_reversed = torch.flip(self.data, (-1,)).to(device)


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def get_data(self):
        return self.data.to(device)

    def get_data_reversed(self):
        return self.data_reversed.to(device)

    def get_num_digits():
        return self.numdigits


dataset = GenerateDataset(numdigits = len_seq, vocabsize = vocab_size)
X = dataset.get_data()
y = dataset.get_data_reversed()


cuda


In [3]:
global ALREADY_PRINT
ALREADY_PRINT = 0

class Embedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super(Embedding, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        print(self.vocab_size, self.d_model)
        return self.embedding(x)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        # implements the positional encoding function as the size of x
        self.dropout = nn.Dropout(p=dropout)
        max_len = 512
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        return self.dropout(x)


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.dk = d_model // num_heads

        self.WO = nn.Linear(d_model, d_model)
        self.WQ = nn.Linear(d_model, d_model)
        self.WK = nn.Linear(d_model, d_model)
        self.WV = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def attention(self, Q, K, V, dk, mask=False):
        QKT = torch.matmul(Q, K.transpose(-2, -1))  # output dim: (batch_size, num_heads, n_tokens, n_tokens)
        scaled_dot_product = torch.div(QKT, math.sqrt(dk))

        if mask:
            mask = torch.triu(torch.ones(QKT.size(-2), QKT.size(-1)), diagonal=1).bool().to(device)
            scaled_dot_product = scaled_dot_product.masked_fill(mask, -float('inf'))
            # print("Masked")
            # print(scaled_dot_product)
            # ALREADY_PRINT = 1

        sm = torch.nn.Softmax(dim=-1)
        attention = sm(scaled_dot_product)
        attention = self.dropout(attention)
        return torch.matmul(attention, V)

    def forward(self, Q, K, V, mask=False):
        Q = self.WQ(Q)
        K = self.WK(K)
        V = self.WV(V)

        Q = Q.view(Q.size(0), -1, self.num_heads, self.dk).transpose(1, 2)  # Q: Why -1? A: It's a batch size
        K = K.view(K.size(0), -1, self.num_heads, self.dk).transpose(1, 2)
        V = V.view(V.size(0), -1, self.num_heads, self.dk).transpose(1, 2)

        attention = self.attention(Q, K, V, self.dk, mask)
        attention = attention.transpose(1, 2).contiguous().view(Q.size(0), -1, self.d_model)
        return self.WO(attention)

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model)
        )

    def forward(self, x, mask):
        x = x + self.dropout1(self.multi_head_attention(x, x, x, mask))
        x = self.norm1(x)

        x = x + self.dropout2(self.ff(x))
        x = self.norm2(x)
        return x

class Decoder(nn.Module):
    def __init__(self, d_model, num_heads, num_layers, dropout=0.1):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, dropout) for _ in range(num_layers)])

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return x

class MyTransformerDecoderOnly(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers,  dropout=0.1):
        super(MyTransformerDecoderOnly, self).__init__()
        self.embedding = Embedding(vocab_size, d_model)
        self.transpose_embedding = nn.Linear(d_model, vocab_size)
        self.transpose_embedding.weight = self.embedding.embedding.weight
        self.pos_enc = PositionalEncoding(d_model)
        self.decoder = Decoder(d_model, num_heads, num_layers, dropout)
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask):
        x = self.embedding(x)
        x = self.pos_enc(x)
        x = self.decoder(x, mask)
        x = self.linear(x)
        # Transpos of emebedding
        # x = self.transpose_embedding(x)

        return x


## Trainning

In [4]:
X[0]
X.size()

torch.Size([10000, 6])

In [5]:
print(y[0])
print(X[0])

tensor([0, 8, 1, 5, 7, 2], device='cuda:0')
tensor([2, 7, 5, 1, 8, 0], device='cuda:0')


In [6]:


train_dataset = TensorDataset(X, y)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model = MyTransformerDecoderOnly(vocab_size, d_model, num_heads, num_layers, dropout)
cross_entropy = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.0001)

print(device)

model.to(device)
model.train()

for epoch in range(1):
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    for i, (X, y) in pbar:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = model(X, mask=True)
        loss = cross_entropy(y_pred.view(-1, vocab_size), y.view(-1))

        loss.backward()
        optimizer.step()
        pbar.set_description(f'Epoch {epoch+1} - loss: {loss.item():.4f}')




cuda


  0%|          | 0/313 [00:00<?, ?it/s]

10 512


Epoch 1 - loss: 2.2423:   2%|▏         | 6/313 [00:01<00:54,  5.66it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 2.2795:   5%|▌         | 16/313 [00:01<00:18, 16.35it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 2.1693:   8%|▊         | 26/313 [00:01<00:11, 26.07it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 2.1845:  12%|█▏        | 36/313 [00:02<00:08, 33.30it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 2.1507:  13%|█▎        | 41/313 [00:02<00:07, 34.42it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 2.0564:  16%|█▋        | 51/313 [00:02<00:06, 38.60it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 2.0021:  19%|█▉        | 61/313 [00:02<00:06, 41.49it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10

Epoch 1 - loss: 1.9532:  23%|██▎       | 71/313 [00:02<00:05, 42.70it/s]

 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.9247:  26%|██▌       | 81/313 [00:03<00:05, 41.80it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.9619:  29%|██▉       | 91/313 [00:03<00:05, 40.84it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.8415:  31%|███       | 96/313 [00:03<00:05, 41.27it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.7527:  34%|███▍      | 106/313 [00:03<00:04, 41.74it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.7404:  37%|███▋      | 116/313 [00:04<00:04, 42.93it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.6649:  40%|████      | 126/313 [00:04<00:04, 43.24it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.6339:  43%|████▎     | 136/313 [00:04<00:04, 42.22it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.6129:  45%|████▌     | 141/313 [00:04<00:04, 42.00it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.5204:  48%|████▊     | 151/313 [00:04<00:03, 41.78it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.4480:  51%|█████▏    | 161/313 [00:05<00:03, 42.34it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.4097:  55%|█████▍    | 171/313 [00:05<00:03, 41.18it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.3557:  58%|█████▊    | 181/313 [00:05<00:03, 41.80it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.3121:  59%|█████▉    | 186/313 [00:05<00:03, 41.64it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.2733:  63%|██████▎   | 196/313 [00:05<00:02, 42.37it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.3570:  66%|██████▌   | 206/313 [00:06<00:02, 41.88it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.3138:  67%|██████▋   | 211/313 [00:06<00:02, 36.23it/s]

10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.2730:  70%|███████   | 220/313 [00:06<00:02, 37.36it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.2501:  73%|███████▎  | 230/313 [00:06<00:02, 40.38it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.1948:  77%|███████▋  | 240/313 [00:07<00:01, 42.12it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.1850:  80%|███████▉  | 250/313 [00:07<00:01, 42.88it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.1811:  81%|████████▏ | 255/313 [00:07<00:01, 41.59it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.2026:  85%|████████▍ | 265/313 [00:07<00:01, 42.63it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.1879:  88%|████████▊ | 275/313 [00:07<00:00, 42.51it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.1860:  91%|█████████ | 285/313 [00:08<00:00, 43.55it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.2297:  93%|█████████▎| 290/313 [00:08<00:00, 42.65it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.1930:  96%|█████████▌| 299/313 [00:08<00:00, 37.08it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.1796:  98%|█████████▊| 307/313 [00:08<00:00, 35.02it/s]

10 512
10 512
10 512
10 512
10 512
10 512
10 512


Epoch 1 - loss: 1.1726: 100%|██████████| 313/313 [00:08<00:00, 35.24it/s]

10 512
10 512
10 512
10 512
10 512





In [7]:

model.eval()
X_test = torch.tensor([[9,8,4,6,5,2]]).to(device)
output = model(X_test, mask=True)
predicted = torch.argmax(output, dim=-1)
print(predicted)



10 512
tensor([[5, 5, 4, 4, 8, 9]], device='cuda:0')


In [8]:

model.eval()
X_test = torch.tensor([[1,2,3,4,5,6]]).to(device)
output = model(X_test, mask=True)
predicted = torch.argmax(output, dim=-1)
print(predicted)


10 512
tensor([[5, 5, 3, 3, 2, 1]], device='cuda:0')


# Shakespeare

In [11]:
import re

In [92]:
from torch.utils.data import Dataset

class ShakespeareData(Dataset):
    def __init__(self, words, block_size):
        self.words = words
        self.vocab = sorted(list(set(words)))
        self.idx2word = {i: word for i, word in enumerate(self.vocab)}
        self.word2idx = {word: i for i, word in enumerate(self.vocab)}
        self.block_size = block_size
        self.vocab_size = len(self.vocab)

    def __len__(self):
        return len(self.words) - self.block_size

    def __getitem__(self, idx):
        sequence = self.words[idx:idx + self.block_size + 1]
        encoded_sequence = [self.word2idx[s] for s in sequence]
        x = torch.tensor(encoded_sequence[:-1], dtype=torch.long)
        y = torch.tensor(encoded_sequence[1:], dtype=torch.long)
        return x, y

    def get_vocab(self):
        return self.vocab

    def get_idx2word(self):
        return self.idx2word

    def get_word2idx(self):
        return self.word2idx

    def get_block_size(self):
        return self.block_size

    def get_words(self):
        return self.words

    def get_vocab_size(self):
        return self.vocab_size

    def get_block_size(self):
        return self.block_size


In [93]:
file = "shakespeare.txt"
# file = "/kaggle/input/shakespeare/shakespeare.txt"
with open(file, "r") as f:
    text = f.read()


# text = open(file, 'r').read()
words = re.split(r"\b", text)
block_size = 32
train_dataset = ShakespeareData(words, block_size)
vocab_size = train_dataset.get_vocab_size()

batch_size = 128
train_loader = DataLoader(
    train_dataset, shuffle=True, pin_memory=True, batch_size=batch_size
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)


device: cuda


In [76]:
num_layers = 2
num_heads = 2
d_model = 128
vocab_size = train_dataset.get_vocab_size()

model = MyTransformerDecoderOnly(vocab_size,
                                 d_model,
                                 num_heads,
                                 num_layers,
                                 dropout).to(device).train()

loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=6e-4)

max_epochs = 1
for epoch in range(max_epochs):
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    for it, (x, y) in pbar:
        x = x.to(device)
        y = y.to(device)
#         print(x.shape)
        optimizer.zero_grad()

        logits = model(x, True)
        loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))
        loss.backward()

        optimizer.step()

        pbar.set_description(f"epoch {epoch} iter {it}: train loss {loss.item():.5f}")

epoch 0 iter 1: train loss 10.27095:   0%|          | 1/15515 [00:00<56:54,  4.54it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5: train loss 9.06332:   0%|          | 5/15515 [00:00<22:23, 11.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9: train loss 8.15950:   0%|          | 9/15515 [00:00<18:25, 14.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13: train loss 7.52889:   0%|          | 13/15515 [00:01<17:04, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 17: train loss 7.06375:   0%|          | 17/15515 [00:01<16:43, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 21: train loss 6.69737:   0%|          | 21/15515 [00:01<16:22, 15.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 25: train loss 6.34521:   0%|          | 25/15515 [00:01<16:15, 15.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 29: train loss 6.12798:   0%|          | 29/15515 [00:02<16:19, 15.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 33: train loss 5.78199:   0%|          | 33/15515 [00:02<16:13, 15.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 37: train loss 5.57625:   0%|          | 37/15515 [00:02<16:17, 15.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 41: train loss 5.42398:   0%|          | 41/15515 [00:02<16:09, 15.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 45: train loss 5.28528:   0%|          | 45/15515 [00:03<16:22, 15.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 49: train loss 5.14667:   0%|          | 49/15515 [00:03<16:41, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 53: train loss 5.09801:   0%|          | 53/15515 [00:03<16:53, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 57: train loss 5.00127:   0%|          | 57/15515 [00:03<17:00, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 61: train loss 4.86261:   0%|          | 61/15515 [00:04<16:52, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 65: train loss 4.78094:   0%|          | 65/15515 [00:04<16:50, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 69: train loss 4.79721:   0%|          | 69/15515 [00:04<16:59, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 73: train loss 4.75165:   0%|          | 73/15515 [00:04<17:06, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 77: train loss 4.63127:   0%|          | 77/15515 [00:05<17:02, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 81: train loss 4.68757:   1%|          | 81/15515 [00:05<16:58, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 85: train loss 4.65476:   1%|          | 85/15515 [00:05<16:56, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 89: train loss 4.52042:   1%|          | 89/15515 [00:05<16:44, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 93: train loss 4.55103:   1%|          | 93/15515 [00:06<16:46, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 97: train loss 4.57979:   1%|          | 97/15515 [00:06<16:38, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 101: train loss 4.52410:   1%|          | 101/15515 [00:06<16:45, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 105: train loss 4.48585:   1%|          | 105/15515 [00:07<16:37, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 109: train loss 4.43194:   1%|          | 109/15515 [00:07<16:37, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 113: train loss 4.46797:   1%|          | 113/15515 [00:07<16:42, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 117: train loss 4.44571:   1%|          | 117/15515 [00:07<16:41, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 121: train loss 4.46231:   1%|          | 121/15515 [00:08<16:37, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 125: train loss 4.45435:   1%|          | 125/15515 [00:08<16:39, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 129: train loss 4.36312:   1%|          | 129/15515 [00:08<16:34, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 133: train loss 4.31987:   1%|          | 133/15515 [00:08<16:42, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 137: train loss 4.44753:   1%|          | 137/15515 [00:09<16:22, 15.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 141: train loss 4.31052:   1%|          | 141/15515 [00:09<16:28, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 145: train loss 4.32178:   1%|          | 145/15515 [00:09<16:32, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 149: train loss 4.26431:   1%|          | 149/15515 [00:09<16:24, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 153: train loss 4.32781:   1%|          | 153/15515 [00:10<16:31, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 157: train loss 4.26087:   1%|          | 157/15515 [00:10<16:37, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 161: train loss 4.27286:   1%|          | 161/15515 [00:10<16:39, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 165: train loss 4.31721:   1%|          | 165/15515 [00:10<16:30, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 169: train loss 4.39097:   1%|          | 169/15515 [00:11<16:34, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 173: train loss 4.15859:   1%|          | 173/15515 [00:11<16:35, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 177: train loss 4.23062:   1%|          | 177/15515 [00:11<16:47, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 181: train loss 4.22801:   1%|          | 181/15515 [00:11<16:44, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 185: train loss 4.23474:   1%|          | 185/15515 [00:12<16:48, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 189: train loss 4.27231:   1%|          | 189/15515 [00:12<16:49, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 193: train loss 4.14097:   1%|          | 193/15515 [00:12<16:49, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 197: train loss 4.19355:   1%|▏         | 197/15515 [00:12<16:41, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 201: train loss 4.21404:   1%|▏         | 201/15515 [00:13<16:44, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 205: train loss 4.12297:   1%|▏         | 205/15515 [00:13<16:48, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 209: train loss 4.12288:   1%|▏         | 209/15515 [00:13<16:45, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 213: train loss 4.15650:   1%|▏         | 213/15515 [00:14<16:44, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 217: train loss 4.10398:   1%|▏         | 217/15515 [00:14<16:42, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 221: train loss 4.13636:   1%|▏         | 221/15515 [00:14<16:41, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 225: train loss 4.18281:   1%|▏         | 225/15515 [00:14<16:43, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 229: train loss 4.17018:   1%|▏         | 229/15515 [00:15<16:39, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 233: train loss 4.12485:   2%|▏         | 233/15515 [00:15<16:48, 15.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 236: train loss 4.09457:   2%|▏         | 237/15515 [00:15<16:59, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 240: train loss 4.13006:   2%|▏         | 241/15515 [00:15<17:01, 14.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 243: train loss 4.03531:   2%|▏         | 243/15515 [00:16<17:08, 14.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 247: train loss 3.99483:   2%|▏         | 247/15515 [00:16<17:16, 14.74it/s]

32459 128
32459 128
32459 128


epoch 0 iter 249: train loss 4.08003:   2%|▏         | 249/15515 [00:16<18:04, 14.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 253: train loss 4.07259:   2%|▏         | 253/15515 [00:16<18:04, 14.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 256: train loss 4.06550:   2%|▏         | 255/15515 [00:16<18:36, 13.67it/s]

32459 128
32459 128
32459 128


epoch 0 iter 258: train loss 4.11067:   2%|▏         | 259/15515 [00:17<18:44, 13.57it/s]

32459 128
32459 128
32459 128


epoch 0 iter 262: train loss 4.01369:   2%|▏         | 263/15515 [00:17<18:17, 13.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 265: train loss 4.05273:   2%|▏         | 265/15515 [00:17<18:01, 14.10it/s]

32459 128
32459 128
32459 128


epoch 0 iter 268: train loss 4.04519:   2%|▏         | 269/15515 [00:17<17:51, 14.22it/s]

32459 128
32459 128
32459 128


epoch 0 iter 271: train loss 4.00662:   2%|▏         | 271/15515 [00:18<17:46, 14.30it/s]

32459 128
32459 128
32459 128


epoch 0 iter 274: train loss 4.05154:   2%|▏         | 275/15515 [00:18<17:32, 14.48it/s]

32459 128
32459 128
32459 128


epoch 0 iter 277: train loss 3.98290:   2%|▏         | 277/15515 [00:18<17:37, 14.41it/s]

32459 128
32459 128
32459 128


epoch 0 iter 280: train loss 4.04556:   2%|▏         | 281/15515 [00:18<17:15, 14.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 284: train loss 4.03948:   2%|▏         | 285/15515 [00:18<17:01, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 288: train loss 4.05952:   2%|▏         | 289/15515 [00:19<17:00, 14.92it/s]

32459 128
32459 128
32459 128


epoch 0 iter 291: train loss 3.98510:   2%|▏         | 291/15515 [00:19<16:58, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 295: train loss 4.01097:   2%|▏         | 295/15515 [00:19<16:57, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 299: train loss 3.95150:   2%|▏         | 299/15515 [00:19<16:47, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 303: train loss 3.95387:   2%|▏         | 303/15515 [00:20<16:43, 15.16it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 306: train loss 4.05516:   2%|▏         | 307/15515 [00:20<16:51, 15.04it/s]


32459 128
32459 128
32459 128


epoch 0 iter 310: train loss 3.99886:   2%|▏         | 311/15515 [00:20<16:47, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 314: train loss 3.98578:   2%|▏         | 315/15515 [00:20<16:51, 15.02it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 317: train loss 4.00872:   2%|▏         | 317/15515 [00:21<16:57, 14.94it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 321: train loss 3.96135:   2%|▏         | 321/15515 [00:21<16:47, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 325: train loss 4.02954:   2%|▏         | 325/15515 [00:21<16:33, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 329: train loss 4.01298:   2%|▏         | 329/15515 [00:21<16:39, 15.20it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 332: train loss 3.94105:   2%|▏         | 333/15515 [00:22<17:05, 14.80it/s]

128
32459 128
32459 128


epoch 0 iter 334: train loss 3.94063:   2%|▏         | 335/15515 [00:22<17:29, 14.47it/s]

32459 128
32459 128
32459 128


epoch 0 iter 338: train loss 4.00130:   2%|▏         | 339/15515 [00:22<17:05, 14.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 342: train loss 3.90594:   2%|▏         | 343/15515 [00:22<16:42, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 346: train loss 3.91278:   2%|▏         | 347/15515 [00:23<16:53, 14.97it/s]

32459 128
32459 128
32459 128


epoch 0 iter 349: train loss 3.89665:   2%|▏         | 349/15515 [00:23<16:48, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 353: train loss 3.93144:   2%|▏         | 353/15515 [00:23<16:39, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 357: train loss 3.95457:   2%|▏         | 357/15515 [00:23<16:29, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 361: train loss 3.93593:   2%|▏         | 361/15515 [00:24<16:34, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 365: train loss 3.91468:   2%|▏         | 365/15515 [00:24<16:34, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 369: train loss 3.95364:   2%|▏         | 369/15515 [00:24<16:35, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 373: train loss 4.01251:   2%|▏         | 373/15515 [00:24<16:42, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 377: train loss 3.96771:   2%|▏         | 377/15515 [00:25<16:41, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 381: train loss 3.91992:   2%|▏         | 381/15515 [00:25<16:35, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 385: train loss 3.92970:   2%|▏         | 385/15515 [00:25<16:27, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 389: train loss 3.98452:   3%|▎         | 389/15515 [00:25<16:23, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 393: train loss 3.90508:   3%|▎         | 393/15515 [00:26<16:25, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 397: train loss 3.98158:   3%|▎         | 397/15515 [00:26<16:25, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 401: train loss 3.99755:   3%|▎         | 401/15515 [00:26<16:24, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 405: train loss 3.92946:   3%|▎         | 405/15515 [00:26<16:18, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 409: train loss 3.88645:   3%|▎         | 409/15515 [00:27<16:18, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 413: train loss 3.88370:   3%|▎         | 413/15515 [00:27<16:19, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 417: train loss 3.93892:   3%|▎         | 417/15515 [00:27<16:19, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 421: train loss 3.90929:   3%|▎         | 421/15515 [00:27<16:17, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 425: train loss 3.90872:   3%|▎         | 425/15515 [00:28<16:11, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 429: train loss 3.94387:   3%|▎         | 429/15515 [00:28<16:10, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 433: train loss 3.89434:   3%|▎         | 433/15515 [00:28<16:37, 15.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 436: train loss 3.96850:   3%|▎         | 437/15515 [00:28<16:47, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 439: train loss 3.84136:   3%|▎         | 439/15515 [00:29<16:40, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 443: train loss 3.90264:   3%|▎         | 443/15515 [00:29<16:47, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 447: train loss 3.97994:   3%|▎         | 447/15515 [00:29<16:31, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 451: train loss 3.84387:   3%|▎         | 451/15515 [00:29<16:54, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 454: train loss 3.85301:   3%|▎         | 455/15515 [00:30<17:00, 14.75it/s]

32459 128
32459 128
32459 128


epoch 0 iter 457: train loss 3.93805:   3%|▎         | 457/15515 [00:30<17:05, 14.69it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 461: train loss 3.78127:   3%|▎         | 461/15515 [00:30<16:46, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 465: train loss 3.89461:   3%|▎         | 465/15515 [00:30<16:45, 14.97it/s]

32459 128
32459 128
32459 128


epoch 0 iter 468: train loss 3.85181:   3%|▎         | 469/15515 [00:31<16:41, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 472: train loss 3.97659:   3%|▎         | 473/15515 [00:31<16:20, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 476: train loss 3.91811:   3%|▎         | 477/15515 [00:31<16:13, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 480: train loss 3.92467:   3%|▎         | 481/15515 [00:31<16:08, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 484: train loss 3.83731:   3%|▎         | 485/15515 [00:32<16:03, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 488: train loss 3.86489:   3%|▎         | 489/15515 [00:32<16:13, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 492: train loss 3.95836:   3%|▎         | 493/15515 [00:32<16:10, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 496: train loss 3.81346:   3%|▎         | 497/15515 [00:32<16:07, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 500: train loss 3.85021:   3%|▎         | 501/15515 [00:33<16:07, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 504: train loss 3.83236:   3%|▎         | 505/15515 [00:33<16:09, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 508: train loss 3.89355:   3%|▎         | 509/15515 [00:33<16:06, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 512: train loss 3.84591:   3%|▎         | 513/15515 [00:33<15:57, 15.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 516: train loss 3.83029:   3%|▎         | 517/15515 [00:34<16:01, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 520: train loss 3.85627:   3%|▎         | 521/15515 [00:34<16:01, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 524: train loss 3.85310:   3%|▎         | 525/15515 [00:34<16:01, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 528: train loss 3.78692:   3%|▎         | 529/15515 [00:34<15:57, 15.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 532: train loss 3.84681:   3%|▎         | 533/15515 [00:35<15:51, 15.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 536: train loss 3.88908:   3%|▎         | 537/15515 [00:35<15:57, 15.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 540: train loss 3.78802:   3%|▎         | 541/15515 [00:35<16:01, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 544: train loss 3.82083:   4%|▎         | 545/15515 [00:35<15:54, 15.68it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 548: train loss 3.87827:   4%|▎         | 549/15515 [00:36<16:05, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 552: train loss 3.72469:   4%|▎         | 553/15515 [00:36<15:54, 15.68it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 556: train loss 3.83248:   4%|▎         | 557/15515 [00:36<15:52, 15.71it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 560: train loss 3.76434:   4%|▎         | 561/15515 [00:36<15:45, 15.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 564: train loss 3.76603:   4%|▎         | 565/15515 [00:37<15:57, 15.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 568: train loss 3.82122:   4%|▎         | 569/15515 [00:37<16:00, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 572: train loss 3.91091:   4%|▎         | 573/15515 [00:37<15:46, 15.79it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 576: train loss 3.93531:   4%|▎         | 577/15515 [00:37<15:49, 15.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 580: train loss 3.88629:   4%|▎         | 581/15515 [00:38<15:43, 15.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 584: train loss 3.83206:   4%|▍         | 585/15515 [00:38<16:00, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 588: train loss 3.74253:   4%|▍         | 589/15515 [00:38<16:04, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 592: train loss 3.74741:   4%|▍         | 593/15515 [00:39<15:47, 15.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 596: train loss 3.80364:   4%|▍         | 597/15515 [00:39<15:50, 15.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 600: train loss 3.74464:   4%|▍         | 601/15515 [00:39<15:50, 15.69it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 604: train loss 3.81752:   4%|▍         | 605/15515 [00:39<15:54, 15.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 608: train loss 3.72655:   4%|▍         | 609/15515 [00:40<15:53, 15.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 612: train loss 3.77574:   4%|▍         | 613/15515 [00:40<15:48, 15.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 616: train loss 3.78524:   4%|▍         | 617/15515 [00:40<15:48, 15.71it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 620: train loss 3.73980:   4%|▍         | 621/15515 [00:40<15:47, 15.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 624: train loss 3.82188:   4%|▍         | 625/15515 [00:41<15:59, 15.52it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 627: train loss 3.71028:   4%|▍         | 627/15515 [00:41<16:15, 15.26it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 631: train loss 3.75928:   4%|▍         | 631/15515 [00:41<16:15, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 635: train loss 3.76201:   4%|▍         | 635/15515 [00:41<16:30, 15.03it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 638: train loss 3.76715:   4%|▍         | 639/15515 [00:41<16:16, 15.23it/s]


32459 128
32459 128
32459 128


epoch 0 iter 642: train loss 3.74176:   4%|▍         | 643/15515 [00:42<16:06, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 646: train loss 3.82304:   4%|▍         | 647/15515 [00:42<16:12, 15.29it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 649: train loss 3.84540:   4%|▍         | 649/15515 [00:42<16:14, 15.25it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 653: train loss 3.74211:   4%|▍         | 653/15515 [00:42<16:11, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 657: train loss 3.76327:   4%|▍         | 657/15515 [00:43<16:13, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 661: train loss 3.81081:   4%|▍         | 661/15515 [00:43<16:08, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 665: train loss 3.78681:   4%|▍         | 665/15515 [00:43<16:16, 15.21it/s]

32459 128
32459 128
32459 128


epoch 0 iter 668: train loss 3.72143:   4%|▍         | 669/15515 [00:43<16:07, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 672: train loss 3.68451:   4%|▍         | 673/15515 [00:44<15:53, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 676: train loss 3.72963:   4%|▍         | 677/15515 [00:44<15:46, 15.67it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 680: train loss 3.74725:   4%|▍         | 681/15515 [00:44<15:49, 15.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 684: train loss 3.75262:   4%|▍         | 685/15515 [00:44<15:36, 15.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 688: train loss 3.70819:   4%|▍         | 689/15515 [00:45<15:46, 15.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 692: train loss 3.86522:   4%|▍         | 693/15515 [00:45<15:52, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 696: train loss 3.70970:   4%|▍         | 697/15515 [00:45<15:47, 15.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 700: train loss 3.69363:   5%|▍         | 701/15515 [00:45<15:44, 15.68it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 704: train loss 3.74302:   5%|▍         | 705/15515 [00:46<15:39, 15.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 708: train loss 3.86250:   5%|▍         | 709/15515 [00:46<15:45, 15.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 712: train loss 3.73337:   5%|▍         | 713/15515 [00:46<15:37, 15.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 716: train loss 3.82968:   5%|▍         | 717/15515 [00:47<15:39, 15.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 720: train loss 3.71894:   5%|▍         | 721/15515 [00:47<15:43, 15.67it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 724: train loss 3.75707:   5%|▍         | 725/15515 [00:47<15:42, 15.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 728: train loss 3.70016:   5%|▍         | 729/15515 [00:47<15:40, 15.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 732: train loss 3.71095:   5%|▍         | 733/15515 [00:48<15:33, 15.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 736: train loss 3.72313:   5%|▍         | 737/15515 [00:48<15:34, 15.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 740: train loss 3.70440:   5%|▍         | 741/15515 [00:48<15:41, 15.69it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 744: train loss 3.74090:   5%|▍         | 745/15515 [00:48<15:43, 15.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 748: train loss 3.62210:   5%|▍         | 749/15515 [00:49<15:33, 15.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 752: train loss 3.74375:   5%|▍         | 753/15515 [00:49<15:35, 15.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 756: train loss 3.67184:   5%|▍         | 757/15515 [00:49<15:34, 15.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 760: train loss 3.66863:   5%|▍         | 761/15515 [00:49<15:45, 15.61it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 764: train loss 3.66598:   5%|▍         | 765/15515 [00:50<15:34, 15.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 768: train loss 3.76416:   5%|▍         | 769/15515 [00:50<15:34, 15.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 772: train loss 3.64683:   5%|▍         | 773/15515 [00:50<15:30, 15.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 776: train loss 3.65801:   5%|▌         | 777/15515 [00:50<15:39, 15.69it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 780: train loss 3.73575:   5%|▌         | 781/15515 [00:51<15:37, 15.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 784: train loss 3.72680:   5%|▌         | 785/15515 [00:51<15:37, 15.71it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 788: train loss 3.77049:   5%|▌         | 789/15515 [00:51<15:35, 15.74it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 792: train loss 3.77516:   5%|▌         | 793/15515 [00:51<15:33, 15.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 796: train loss 3.64690:   5%|▌         | 797/15515 [00:52<15:34, 15.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 800: train loss 3.68206:   5%|▌         | 801/15515 [00:52<15:34, 15.74it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 804: train loss 3.72366:   5%|▌         | 805/15515 [00:52<15:39, 15.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 808: train loss 3.73034:   5%|▌         | 809/15515 [00:52<15:38, 15.67it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 812: train loss 3.67385:   5%|▌         | 813/15515 [00:53<15:41, 15.61it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 816: train loss 3.61710:   5%|▌         | 817/15515 [00:53<15:44, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 820: train loss 3.69126:   5%|▌         | 821/15515 [00:53<15:44, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 824: train loss 3.64680:   5%|▌         | 825/15515 [00:53<15:45, 15.54it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 827: train loss 3.65252:   5%|▌         | 827/15515 [00:54<16:00, 15.29it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 831: train loss 3.70355:   5%|▌         | 831/15515 [00:54<15:54, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 835: train loss 3.68841:   5%|▌         | 835/15515 [00:54<15:40, 15.61it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 839: train loss 3.68307:   5%|▌         | 839/15515 [00:54<15:43, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 843: train loss 3.76731:   5%|▌         | 843/15515 [00:55<15:47, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 847: train loss 3.71054:   5%|▌         | 847/15515 [00:55<15:40, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 851: train loss 3.60538:   5%|▌         | 851/15515 [00:55<16:11, 15.10it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 854: train loss 3.63107:   6%|▌         | 855/15515 [00:55<16:21, 14.93it/s]

 128
32459 128
32459 128


epoch 0 iter 857: train loss 3.76146:   6%|▌         | 857/15515 [00:56<16:24, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 861: train loss 3.64300:   6%|▌         | 861/15515 [00:56<16:02, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 865: train loss 3.71992:   6%|▌         | 865/15515 [00:56<15:59, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 869: train loss 3.66247:   6%|▌         | 869/15515 [00:56<15:50, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 873: train loss 3.65853:   6%|▌         | 873/15515 [00:57<15:34, 15.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 877: train loss 3.70542:   6%|▌         | 877/15515 [00:57<15:30, 15.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 881: train loss 3.69621:   6%|▌         | 881/15515 [00:57<15:26, 15.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 885: train loss 3.64052:   6%|▌         | 885/15515 [00:57<15:26, 15.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 889: train loss 3.63050:   6%|▌         | 889/15515 [00:58<15:22, 15.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 893: train loss 3.69611:   6%|▌         | 893/15515 [00:58<15:27, 15.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 897: train loss 3.66167:   6%|▌         | 897/15515 [00:58<15:33, 15.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 901: train loss 3.62443:   6%|▌         | 901/15515 [00:58<15:33, 15.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 905: train loss 3.66170:   6%|▌         | 905/15515 [00:59<15:29, 15.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 909: train loss 3.69999:   6%|▌         | 909/15515 [00:59<15:22, 15.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 913: train loss 3.61249:   6%|▌         | 913/15515 [00:59<15:28, 15.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 917: train loss 3.65357:   6%|▌         | 917/15515 [00:59<15:18, 15.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 921: train loss 3.66581:   6%|▌         | 921/15515 [01:00<15:24, 15.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 925: train loss 3.65980:   6%|▌         | 925/15515 [01:00<15:21, 15.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 929: train loss 3.65460:   6%|▌         | 929/15515 [01:00<15:33, 15.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 933: train loss 3.56246:   6%|▌         | 933/15515 [01:00<15:27, 15.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 937: train loss 3.68075:   6%|▌         | 937/15515 [01:01<15:24, 15.77it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 941: train loss 3.64879:   6%|▌         | 941/15515 [01:01<15:27, 15.71it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 945: train loss 3.60133:   6%|▌         | 945/15515 [01:01<15:23, 15.77it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 949: train loss 3.63374:   6%|▌         | 949/15515 [01:01<15:25, 15.74it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 953: train loss 3.56644:   6%|▌         | 953/15515 [01:02<15:21, 15.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 957: train loss 3.67360:   6%|▌         | 957/15515 [01:02<15:15, 15.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 961: train loss 3.65674:   6%|▌         | 961/15515 [01:02<15:23, 15.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 965: train loss 3.69471:   6%|▌         | 965/15515 [01:02<15:24, 15.74it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 969: train loss 3.60254:   6%|▌         | 969/15515 [01:03<15:21, 15.79it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 973: train loss 3.65474:   6%|▋         | 973/15515 [01:03<15:20, 15.79it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 977: train loss 3.73723:   6%|▋         | 977/15515 [01:03<15:19, 15.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 981: train loss 3.65488:   6%|▋         | 981/15515 [01:03<15:36, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 985: train loss 3.62872:   6%|▋         | 985/15515 [01:04<15:30, 15.61it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 989: train loss 3.57075:   6%|▋         | 989/15515 [01:04<15:30, 15.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 993: train loss 3.61742:   6%|▋         | 993/15515 [01:04<15:25, 15.68it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 997: train loss 3.67721:   6%|▋         | 997/15515 [01:04<15:26, 15.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1001: train loss 3.62577:   6%|▋         | 1001/15515 [01:05<15:30, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1005: train loss 3.65312:   6%|▋         | 1005/15515 [01:05<15:28, 15.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1009: train loss 3.54443:   7%|▋         | 1009/15515 [01:05<15:20, 15.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1013: train loss 3.60513:   7%|▋         | 1013/15515 [01:05<15:21, 15.74it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1017: train loss 3.57489:   7%|▋         | 1017/15515 [01:06<15:15, 15.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1021: train loss 3.56235:   7%|▋         | 1021/15515 [01:06<15:20, 15.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1025: train loss 3.59732:   7%|▋         | 1025/15515 [01:06<15:16, 15.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1029: train loss 3.60607:   7%|▋         | 1029/15515 [01:06<15:39, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1033: train loss 3.64702:   7%|▋         | 1033/15515 [01:07<15:55, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1037: train loss 3.55212:   7%|▋         | 1037/15515 [01:07<15:49, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1041: train loss 3.60656:   7%|▋         | 1041/15515 [01:07<15:38, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1045: train loss 3.66435:   7%|▋         | 1045/15515 [01:08<15:41, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1049: train loss 3.61962:   7%|▋         | 1049/15515 [01:08<15:52, 15.18it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1052: train loss 3.57177:   7%|▋         | 1053/15515 [01:08<15:49, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1056: train loss 3.59070:   7%|▋         | 1057/15515 [01:08<15:52, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1060: train loss 3.61136:   7%|▋         | 1061/15515 [01:09<15:39, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1064: train loss 3.54417:   7%|▋         | 1065/15515 [01:09<15:31, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1068: train loss 3.59402:   7%|▋         | 1069/15515 [01:09<15:44, 15.30it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 1071: train loss 3.57473:   7%|▋         | 1071/15515 [01:09<15:48, 15.23it/s]


32459 128
32459 128
32459 128


epoch 0 iter 1075: train loss 3.61395:   7%|▋         | 1075/15515 [01:09<15:29, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1079: train loss 3.65501:   7%|▋         | 1079/15515 [01:10<15:27, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1083: train loss 3.63744:   7%|▋         | 1083/15515 [01:10<15:27, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1087: train loss 3.63380:   7%|▋         | 1087/15515 [01:10<15:18, 15.71it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1091: train loss 3.61384:   7%|▋         | 1091/15515 [01:11<15:18, 15.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1095: train loss 3.62566:   7%|▋         | 1095/15515 [01:11<15:17, 15.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1099: train loss 3.50119:   7%|▋         | 1099/15515 [01:11<15:16, 15.74it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1103: train loss 3.54333:   7%|▋         | 1103/15515 [01:11<15:11, 15.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1107: train loss 3.58255:   7%|▋         | 1107/15515 [01:12<15:11, 15.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1111: train loss 3.52251:   7%|▋         | 1111/15515 [01:12<15:11, 15.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1115: train loss 3.63790:   7%|▋         | 1115/15515 [01:12<15:15, 15.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1119: train loss 3.58166:   7%|▋         | 1119/15515 [01:12<15:16, 15.71it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1123: train loss 3.61655:   7%|▋         | 1123/15515 [01:13<15:19, 15.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1127: train loss 3.55297:   7%|▋         | 1127/15515 [01:13<15:09, 15.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1131: train loss 3.59582:   7%|▋         | 1131/15515 [01:13<15:08, 15.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1135: train loss 3.51217:   7%|▋         | 1135/15515 [01:13<15:05, 15.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1139: train loss 3.69418:   7%|▋         | 1139/15515 [01:14<15:12, 15.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1143: train loss 3.60706:   7%|▋         | 1143/15515 [01:14<15:10, 15.79it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1147: train loss 3.57460:   7%|▋         | 1147/15515 [01:14<15:15, 15.69it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1151: train loss 3.45880:   7%|▋         | 1151/15515 [01:14<15:10, 15.77it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1155: train loss 3.56549:   7%|▋         | 1155/15515 [01:15<15:14, 15.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1159: train loss 3.50411:   7%|▋         | 1159/15515 [01:15<15:21, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1163: train loss 3.53580:   7%|▋         | 1163/15515 [01:15<15:09, 15.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1167: train loss 3.53980:   8%|▊         | 1167/15515 [01:15<15:13, 15.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1171: train loss 3.51141:   8%|▊         | 1171/15515 [01:16<15:23, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1175: train loss 3.67748:   8%|▊         | 1175/15515 [01:16<15:25, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1179: train loss 3.54942:   8%|▊         | 1179/15515 [01:16<15:17, 15.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1183: train loss 3.61253:   8%|▊         | 1183/15515 [01:16<15:20, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1187: train loss 3.51951:   8%|▊         | 1187/15515 [01:17<15:17, 15.61it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1191: train loss 3.55131:   8%|▊         | 1191/15515 [01:17<15:19, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1195: train loss 3.52365:   8%|▊         | 1195/15515 [01:17<15:14, 15.67it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1199: train loss 3.55441:   8%|▊         | 1199/15515 [01:17<15:15, 15.64it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1203: train loss 3.54713:   8%|▊         | 1203/15515 [01:18<15:21, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1207: train loss 3.55343:   8%|▊         | 1207/15515 [01:18<15:15, 15.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1211: train loss 3.62187:   8%|▊         | 1211/15515 [01:18<15:22, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1215: train loss 3.54483:   8%|▊         | 1215/15515 [01:18<15:22, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1219: train loss 3.57121:   8%|▊         | 1219/15515 [01:19<15:17, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1223: train loss 3.58550:   8%|▊         | 1223/15515 [01:19<15:16, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1227: train loss 3.53045:   8%|▊         | 1227/15515 [01:19<15:16, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1231: train loss 3.52406:   8%|▊         | 1231/15515 [01:19<15:28, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1235: train loss 3.46623:   8%|▊         | 1235/15515 [01:20<15:29, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1239: train loss 3.59555:   8%|▊         | 1239/15515 [01:20<15:19, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1243: train loss 3.61427:   8%|▊         | 1243/15515 [01:20<15:28, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1247: train loss 3.58101:   8%|▊         | 1247/15515 [01:20<15:26, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1251: train loss 3.61420:   8%|▊         | 1251/15515 [01:21<15:14, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1255: train loss 3.51030:   8%|▊         | 1255/15515 [01:21<15:31, 15.31it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1258: train loss 3.56240:   8%|▊         | 1259/15515 [01:21<15:42, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1262: train loss 3.55626:   8%|▊         | 1263/15515 [01:21<15:50, 15.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1265: train loss 3.56408:   8%|▊         | 1265/15515 [01:22<15:53, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1268: train loss 3.50562:   8%|▊         | 1269/15515 [01:22<15:56, 14.90it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 1271: train loss 3.55092:   8%|▊         | 1271/15515 [01:22<15:51, 14.97it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 1275: train loss 3.57456:   8%|▊         | 1275/15515 [01:22<15:46, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1279: train loss 3.56853:   8%|▊         | 1279/15515 [01:23<15:24, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1283: train loss 3.56961:   8%|▊         | 1283/15515 [01:23<15:22, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1287: train loss 3.51613:   8%|▊         | 1287/15515 [01:23<15:17, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1291: train loss 3.46860:   8%|▊         | 1291/15515 [01:23<15:14, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1295: train loss 3.58454:   8%|▊         | 1295/15515 [01:24<15:15, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1299: train loss 3.58621:   8%|▊         | 1299/15515 [01:24<15:16, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1303: train loss 3.54140:   8%|▊         | 1303/15515 [01:24<15:13, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1307: train loss 3.62790:   8%|▊         | 1307/15515 [01:24<15:15, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1311: train loss 3.57563:   8%|▊         | 1311/15515 [01:25<15:21, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1315: train loss 3.50088:   8%|▊         | 1315/15515 [01:25<15:28, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1319: train loss 3.60284:   9%|▊         | 1319/15515 [01:25<15:22, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1323: train loss 3.45305:   9%|▊         | 1323/15515 [01:25<15:38, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1327: train loss 3.40042:   9%|▊         | 1327/15515 [01:26<15:26, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1331: train loss 3.57605:   9%|▊         | 1331/15515 [01:26<15:22, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1335: train loss 3.60543:   9%|▊         | 1335/15515 [01:26<15:18, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1339: train loss 3.46768:   9%|▊         | 1339/15515 [01:26<15:20, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1343: train loss 3.49176:   9%|▊         | 1343/15515 [01:27<15:21, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1347: train loss 3.52738:   9%|▊         | 1347/15515 [01:27<15:15, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1351: train loss 3.51785:   9%|▊         | 1351/15515 [01:27<15:13, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1355: train loss 3.63260:   9%|▊         | 1355/15515 [01:28<15:17, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1359: train loss 3.61377:   9%|▉         | 1359/15515 [01:28<15:21, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1363: train loss 3.56227:   9%|▉         | 1363/15515 [01:28<15:24, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1367: train loss 3.46383:   9%|▉         | 1367/15515 [01:28<15:15, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1371: train loss 3.51720:   9%|▉         | 1371/15515 [01:29<15:14, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1375: train loss 3.43617:   9%|▉         | 1375/15515 [01:29<15:15, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1379: train loss 3.54256:   9%|▉         | 1379/15515 [01:29<15:17, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1383: train loss 3.58452:   9%|▉         | 1383/15515 [01:29<15:11, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1387: train loss 3.53043:   9%|▉         | 1387/15515 [01:30<15:17, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1391: train loss 3.53871:   9%|▉         | 1391/15515 [01:30<15:18, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1395: train loss 3.58615:   9%|▉         | 1395/15515 [01:30<15:19, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1399: train loss 3.50843:   9%|▉         | 1399/15515 [01:30<15:19, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1403: train loss 3.46411:   9%|▉         | 1403/15515 [01:31<15:21, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1407: train loss 3.50693:   9%|▉         | 1407/15515 [01:31<15:23, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1411: train loss 3.52127:   9%|▉         | 1411/15515 [01:31<15:24, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1415: train loss 3.48262:   9%|▉         | 1415/15515 [01:31<15:14, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1419: train loss 3.51292:   9%|▉         | 1419/15515 [01:32<15:16, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1423: train loss 3.44745:   9%|▉         | 1423/15515 [01:32<15:16, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1427: train loss 3.56307:   9%|▉         | 1427/15515 [01:32<15:12, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1431: train loss 3.52093:   9%|▉         | 1431/15515 [01:32<15:25, 15.22it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 1434: train loss 3.48365:   9%|▉         | 1435/15515 [01:33<15:36, 15.03it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 1438: train loss 3.57941:   9%|▉         | 1439/15515 [01:33<15:23, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1442: train loss 3.45733:   9%|▉         | 1443/15515 [01:33<15:23, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1446: train loss 3.47737:   9%|▉         | 1447/15515 [01:33<15:22, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1450: train loss 3.44088:   9%|▉         | 1451/15515 [01:34<15:39, 14.97it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1453: train loss 3.42627:   9%|▉         | 1453/15515 [01:34<15:50, 14.80it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 1456: train loss 3.46262:   9%|▉         | 1457/15515 [01:34<15:51, 14.77it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 1459: train loss 3.41440:   9%|▉         | 1459/15515 [01:34<15:44, 14.88it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1463: train loss 3.53069:   9%|▉         | 1463/15515 [01:35<15:52, 14.76it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1466: train loss 3.43879:   9%|▉         | 1467/15515 [01:35<15:56, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1469: train loss 3.53542:   9%|▉         | 1469/15515 [01:35<15:57, 14.67it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1472: train loss 3.53995:   9%|▉         | 1473/15515 [01:35<15:51, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1476: train loss 3.49030:  10%|▉         | 1477/15515 [01:35<15:27, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1480: train loss 3.49500:  10%|▉         | 1481/15515 [01:36<15:19, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1484: train loss 3.50301:  10%|▉         | 1485/15515 [01:36<15:11, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1488: train loss 3.40118:  10%|▉         | 1489/15515 [01:36<15:07, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1492: train loss 3.56107:  10%|▉         | 1493/15515 [01:37<15:03, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1496: train loss 3.48581:  10%|▉         | 1497/15515 [01:37<15:07, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1500: train loss 3.47283:  10%|▉         | 1501/15515 [01:37<15:19, 15.23it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1503: train loss 3.53203:  10%|▉         | 1503/15515 [01:37<15:18, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1507: train loss 3.56013:  10%|▉         | 1507/15515 [01:38<15:19, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1511: train loss 3.51274:  10%|▉         | 1511/15515 [01:38<15:17, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1515: train loss 3.44862:  10%|▉         | 1515/15515 [01:38<15:18, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1519: train loss 3.51666:  10%|▉         | 1519/15515 [01:38<15:14, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1523: train loss 3.40948:  10%|▉         | 1523/15515 [01:39<15:11, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1527: train loss 3.49067:  10%|▉         | 1527/15515 [01:39<15:10, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1531: train loss 3.41570:  10%|▉         | 1531/15515 [01:39<15:06, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1535: train loss 3.49256:  10%|▉         | 1535/15515 [01:39<15:07, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1539: train loss 3.52105:  10%|▉         | 1539/15515 [01:40<15:12, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1543: train loss 3.43806:  10%|▉         | 1543/15515 [01:40<15:14, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1547: train loss 3.46485:  10%|▉         | 1547/15515 [01:40<15:10, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1551: train loss 3.47174:  10%|▉         | 1551/15515 [01:40<15:09, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1555: train loss 3.44020:  10%|█         | 1555/15515 [01:41<15:08, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1559: train loss 3.52307:  10%|█         | 1559/15515 [01:41<15:12, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1563: train loss 3.43501:  10%|█         | 1563/15515 [01:41<15:17, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1567: train loss 3.49066:  10%|█         | 1567/15515 [01:41<15:15, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1571: train loss 3.41971:  10%|█         | 1571/15515 [01:42<15:10, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1575: train loss 3.46381:  10%|█         | 1575/15515 [01:42<15:14, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1579: train loss 3.52717:  10%|█         | 1579/15515 [01:42<15:01, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1583: train loss 3.57165:  10%|█         | 1583/15515 [01:42<15:00, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1587: train loss 3.53467:  10%|█         | 1587/15515 [01:43<15:00, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1591: train loss 3.39566:  10%|█         | 1591/15515 [01:43<15:00, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1595: train loss 3.53502:  10%|█         | 1595/15515 [01:43<14:57, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1599: train loss 3.49491:  10%|█         | 1599/15515 [01:43<15:11, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1603: train loss 3.49290:  10%|█         | 1603/15515 [01:44<15:06, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1607: train loss 3.54300:  10%|█         | 1607/15515 [01:44<15:04, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1611: train loss 3.46355:  10%|█         | 1611/15515 [01:44<15:03, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1615: train loss 3.44834:  10%|█         | 1615/15515 [01:45<15:03, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1619: train loss 3.46993:  10%|█         | 1619/15515 [01:45<14:59, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1623: train loss 3.52443:  10%|█         | 1623/15515 [01:45<14:59, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1627: train loss 3.44910:  10%|█         | 1627/15515 [01:45<14:56, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1631: train loss 3.43789:  11%|█         | 1631/15515 [01:46<15:08, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1635: train loss 3.48567:  11%|█         | 1635/15515 [01:46<15:24, 15.02it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1638: train loss 3.50849:  11%|█         | 1639/15515 [01:46<15:20, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1642: train loss 3.50834:  11%|█         | 1643/15515 [01:46<15:16, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1646: train loss 3.42520:  11%|█         | 1647/15515 [01:47<15:14, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1650: train loss 3.40532:  11%|█         | 1651/15515 [01:47<15:20, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1653: train loss 3.47095:  11%|█         | 1653/15515 [01:47<15:37, 14.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1656: train loss 3.43148:  11%|█         | 1657/15515 [01:47<15:22, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1660: train loss 3.54372:  11%|█         | 1661/15515 [01:48<15:27, 14.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1663: train loss 3.47244:  11%|█         | 1663/15515 [01:48<15:23, 15.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1666: train loss 3.44246:  11%|█         | 1667/15515 [01:48<15:31, 14.86it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1669: train loss 3.51322:  11%|█         | 1669/15515 [01:48<15:37, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1673: train loss 3.51361:  11%|█         | 1673/15515 [01:48<15:25, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1677: train loss 3.51937:  11%|█         | 1677/15515 [01:49<15:08, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1681: train loss 3.48782:  11%|█         | 1681/15515 [01:49<15:17, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1685: train loss 3.46331:  11%|█         | 1685/15515 [01:49<15:05, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1689: train loss 3.52311:  11%|█         | 1689/15515 [01:49<14:57, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1693: train loss 3.49830:  11%|█         | 1693/15515 [01:50<14:56, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1697: train loss 3.43855:  11%|█         | 1697/15515 [01:50<14:53, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1701: train loss 3.46399:  11%|█         | 1701/15515 [01:50<15:00, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1705: train loss 3.46356:  11%|█         | 1705/15515 [01:50<14:50, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1709: train loss 3.42431:  11%|█         | 1709/15515 [01:51<14:51, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1713: train loss 3.41247:  11%|█         | 1713/15515 [01:51<14:55, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1717: train loss 3.41663:  11%|█         | 1717/15515 [01:51<14:57, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1721: train loss 3.49737:  11%|█         | 1721/15515 [01:52<14:53, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1725: train loss 3.47912:  11%|█         | 1725/15515 [01:52<14:55, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1729: train loss 3.44633:  11%|█         | 1729/15515 [01:52<14:57, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1733: train loss 3.47634:  11%|█         | 1733/15515 [01:52<15:00, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1737: train loss 3.40535:  11%|█         | 1737/15515 [01:53<15:02, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1741: train loss 3.44914:  11%|█         | 1741/15515 [01:53<15:01, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1745: train loss 3.48750:  11%|█         | 1745/15515 [01:53<14:58, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1749: train loss 3.39557:  11%|█▏        | 1749/15515 [01:53<14:59, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1753: train loss 3.41527:  11%|█▏        | 1753/15515 [01:54<15:00, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1757: train loss 3.45859:  11%|█▏        | 1757/15515 [01:54<14:54, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1761: train loss 3.49126:  11%|█▏        | 1761/15515 [01:54<14:49, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1765: train loss 3.49288:  11%|█▏        | 1765/15515 [01:54<14:55, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1769: train loss 3.46187:  11%|█▏        | 1769/15515 [01:55<14:53, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1773: train loss 3.45945:  11%|█▏        | 1773/15515 [01:55<15:03, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1777: train loss 3.40410:  11%|█▏        | 1777/15515 [01:55<15:11, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1781: train loss 3.40698:  11%|█▏        | 1781/15515 [01:55<14:58, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1785: train loss 3.50488:  12%|█▏        | 1785/15515 [01:56<14:59, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1789: train loss 3.48834:  12%|█▏        | 1789/15515 [01:56<14:54, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1793: train loss 3.43186:  12%|█▏        | 1793/15515 [01:56<14:53, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1797: train loss 3.48312:  12%|█▏        | 1797/15515 [01:56<14:52, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1801: train loss 3.40301:  12%|█▏        | 1801/15515 [01:57<14:50, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1805: train loss 3.40150:  12%|█▏        | 1805/15515 [01:57<14:42, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1809: train loss 3.41777:  12%|█▏        | 1809/15515 [01:57<14:45, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1813: train loss 3.41324:  12%|█▏        | 1813/15515 [01:57<14:44, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1817: train loss 3.39648:  12%|█▏        | 1817/15515 [01:58<14:38, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1821: train loss 3.38357:  12%|█▏        | 1821/15515 [01:58<14:42, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1825: train loss 3.39460:  12%|█▏        | 1825/15515 [01:58<14:44, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1829: train loss 3.38768:  12%|█▏        | 1829/15515 [01:59<14:57, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1833: train loss 3.38598:  12%|█▏        | 1833/15515 [01:59<15:10, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1837: train loss 3.41313:  12%|█▏        | 1837/15515 [01:59<15:12, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1841: train loss 3.46296:  12%|█▏        | 1841/15515 [01:59<15:12, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1845: train loss 3.39809:  12%|█▏        | 1845/15515 [02:00<15:19, 14.86it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1848: train loss 3.43177:  12%|█▏        | 1849/15515 [02:00<15:13, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1852: train loss 3.40763:  12%|█▏        | 1853/15515 [02:00<15:05, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1856: train loss 3.40755:  12%|█▏        | 1857/15515 [02:00<15:04, 15.11it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 1859: train loss 3.42687:  12%|█▏        | 1859/15515 [02:01<15:06, 15.07it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 1862: train loss 3.52858:  12%|█▏        | 1863/15515 [02:01<15:00, 15.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1866: train loss 3.42288:  12%|█▏        | 1867/15515 [02:01<14:50, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1870: train loss 3.45105:  12%|█▏        | 1871/15515 [02:01<15:07, 15.04it/s]

32459 128
32459 128
32459 128


epoch 0 iter 1873: train loss 3.41980:  12%|█▏        | 1873/15515 [02:01<15:04, 15.08it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 1876: train loss 3.44759:  12%|█▏        | 1877/15515 [02:02<15:01, 15.13it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 1880: train loss 3.45686:  12%|█▏        | 1881/15515 [02:02<14:54, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1884: train loss 3.36892:  12%|█▏        | 1885/15515 [02:02<14:51, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1888: train loss 3.48487:  12%|█▏        | 1889/15515 [02:02<14:40, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1892: train loss 3.44222:  12%|█▏        | 1893/15515 [02:03<14:43, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1896: train loss 3.43222:  12%|█▏        | 1897/15515 [02:03<14:43, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1900: train loss 3.42822:  12%|█▏        | 1901/15515 [02:03<14:38, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1904: train loss 3.40824:  12%|█▏        | 1905/15515 [02:03<14:37, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1908: train loss 3.34638:  12%|█▏        | 1909/15515 [02:04<14:43, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1912: train loss 3.43919:  12%|█▏        | 1913/15515 [02:04<14:41, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1916: train loss 3.46726:  12%|█▏        | 1917/15515 [02:04<14:37, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1920: train loss 3.45737:  12%|█▏        | 1921/15515 [02:05<14:36, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1924: train loss 3.41307:  12%|█▏        | 1925/15515 [02:05<14:39, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1928: train loss 3.45876:  12%|█▏        | 1929/15515 [02:05<14:39, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1932: train loss 3.43906:  12%|█▏        | 1933/15515 [02:05<14:32, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1936: train loss 3.46687:  12%|█▏        | 1937/15515 [02:06<14:36, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1940: train loss 3.39563:  13%|█▎        | 1941/15515 [02:06<14:38, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1944: train loss 3.43542:  13%|█▎        | 1945/15515 [02:06<14:41, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1948: train loss 3.40206:  13%|█▎        | 1949/15515 [02:06<14:38, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1952: train loss 3.41700:  13%|█▎        | 1953/15515 [02:07<14:33, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1956: train loss 3.39601:  13%|█▎        | 1957/15515 [02:07<14:34, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1960: train loss 3.39962:  13%|█▎        | 1961/15515 [02:07<14:35, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1964: train loss 3.35241:  13%|█▎        | 1965/15515 [02:07<14:33, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1968: train loss 3.44700:  13%|█▎        | 1969/15515 [02:08<14:27, 15.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1972: train loss 3.41204:  13%|█▎        | 1973/15515 [02:08<14:30, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1976: train loss 3.44406:  13%|█▎        | 1977/15515 [02:08<14:34, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1980: train loss 3.40006:  13%|█▎        | 1981/15515 [02:08<14:33, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1984: train loss 3.41028:  13%|█▎        | 1985/15515 [02:09<14:27, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1988: train loss 3.37575:  13%|█▎        | 1989/15515 [02:09<14:29, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1992: train loss 3.46422:  13%|█▎        | 1993/15515 [02:09<14:32, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 1996: train loss 3.43153:  13%|█▎        | 1997/15515 [02:09<14:38, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2000: train loss 3.40071:  13%|█▎        | 2001/15515 [02:10<14:37, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2004: train loss 3.30538:  13%|█▎        | 2005/15515 [02:10<14:32, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2008: train loss 3.40805:  13%|█▎        | 2009/15515 [02:10<14:32, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2012: train loss 3.40450:  13%|█▎        | 2013/15515 [02:10<14:29, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2016: train loss 3.38932:  13%|█▎        | 2017/15515 [02:11<14:23, 15.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2020: train loss 3.37115:  13%|█▎        | 2021/15515 [02:11<14:25, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2024: train loss 3.53507:  13%|█▎        | 2025/15515 [02:11<14:34, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2028: train loss 3.34913:  13%|█▎        | 2029/15515 [02:11<14:31, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2032: train loss 3.33664:  13%|█▎        | 2033/15515 [02:12<14:54, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2035: train loss 3.50463:  13%|█▎        | 2035/15515 [02:12<14:58, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2039: train loss 3.38251:  13%|█▎        | 2039/15515 [02:12<14:52, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2043: train loss 3.45746:  13%|█▎        | 2043/15515 [02:12<14:39, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2047: train loss 3.41355:  13%|█▎        | 2047/15515 [02:13<14:47, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2051: train loss 3.47004:  13%|█▎        | 2051/15515 [02:13<14:39, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2055: train loss 3.51745:  13%|█▎        | 2055/15515 [02:13<14:43, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2059: train loss 3.37167:  13%|█▎        | 2059/15515 [02:14<14:42, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2063: train loss 3.39954:  13%|█▎        | 2063/15515 [02:14<14:48, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2067: train loss 3.41963:  13%|█▎        | 2067/15515 [02:14<14:43, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2071: train loss 3.41891:  13%|█▎        | 2071/15515 [02:14<14:33, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2075: train loss 3.43100:  13%|█▎        | 2075/15515 [02:15<14:28, 15.48it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2078: train loss 3.41301:  13%|█▎        | 2079/15515 [02:15<14:48, 15.12it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 2081: train loss 3.40005:  13%|█▎        | 2081/15515 [02:15<14:35, 15.34it/s]


32459 128
32459 128
32459 128


epoch 0 iter 2085: train loss 3.52484:  13%|█▎        | 2085/15515 [02:15<14:29, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2089: train loss 3.37775:  13%|█▎        | 2089/15515 [02:15<14:15, 15.69it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2093: train loss 3.39840:  13%|█▎        | 2093/15515 [02:16<14:26, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2097: train loss 3.41166:  14%|█▎        | 2097/15515 [02:16<14:24, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2101: train loss 3.36210:  14%|█▎        | 2101/15515 [02:16<14:30, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2105: train loss 3.46353:  14%|█▎        | 2105/15515 [02:17<14:21, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2109: train loss 3.38340:  14%|█▎        | 2109/15515 [02:17<14:21, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2113: train loss 3.44387:  14%|█▎        | 2113/15515 [02:17<14:17, 15.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2117: train loss 3.42072:  14%|█▎        | 2117/15515 [02:17<14:24, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2121: train loss 3.40041:  14%|█▎        | 2121/15515 [02:18<14:24, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2125: train loss 3.34567:  14%|█▎        | 2125/15515 [02:18<14:27, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2129: train loss 3.38247:  14%|█▎        | 2129/15515 [02:18<14:18, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2133: train loss 3.33341:  14%|█▎        | 2133/15515 [02:18<14:17, 15.61it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2137: train loss 3.33540:  14%|█▍        | 2137/15515 [02:19<14:18, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2141: train loss 3.40415:  14%|█▍        | 2141/15515 [02:19<14:19, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2145: train loss 3.45211:  14%|█▍        | 2145/15515 [02:19<14:22, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2149: train loss 3.35842:  14%|█▍        | 2149/15515 [02:19<14:27, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2153: train loss 3.29936:  14%|█▍        | 2153/15515 [02:20<14:24, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2157: train loss 3.43815:  14%|█▍        | 2157/15515 [02:20<14:27, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2161: train loss 3.42050:  14%|█▍        | 2161/15515 [02:20<14:16, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2165: train loss 3.39481:  14%|█▍        | 2165/15515 [02:20<14:18, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2169: train loss 3.42991:  14%|█▍        | 2169/15515 [02:21<14:17, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2173: train loss 3.33747:  14%|█▍        | 2173/15515 [02:21<14:11, 15.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2177: train loss 3.30891:  14%|█▍        | 2177/15515 [02:21<14:18, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2181: train loss 3.46250:  14%|█▍        | 2181/15515 [02:21<14:20, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2185: train loss 3.37409:  14%|█▍        | 2185/15515 [02:22<14:11, 15.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2189: train loss 3.41096:  14%|█▍        | 2189/15515 [02:22<14:20, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2193: train loss 3.37873:  14%|█▍        | 2193/15515 [02:22<14:19, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2197: train loss 3.37387:  14%|█▍        | 2197/15515 [02:22<14:18, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2201: train loss 3.45027:  14%|█▍        | 2201/15515 [02:23<14:16, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2205: train loss 3.39881:  14%|█▍        | 2205/15515 [02:23<14:15, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2209: train loss 3.38578:  14%|█▍        | 2209/15515 [02:23<14:14, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2213: train loss 3.33645:  14%|█▍        | 2213/15515 [02:23<14:17, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2217: train loss 3.42349:  14%|█▍        | 2217/15515 [02:24<14:18, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2221: train loss 3.41155:  14%|█▍        | 2221/15515 [02:24<14:21, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2225: train loss 3.35901:  14%|█▍        | 2225/15515 [02:24<14:19, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2229: train loss 3.36056:  14%|█▍        | 2229/15515 [02:25<14:34, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2233: train loss 3.33473:  14%|█▍        | 2233/15515 [02:25<14:23, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2237: train loss 3.41506:  14%|█▍        | 2237/15515 [02:25<14:33, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2241: train loss 3.41835:  14%|█▍        | 2241/15515 [02:25<14:46, 14.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2244: train loss 3.41815:  14%|█▍        | 2245/15515 [02:26<14:44, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2248: train loss 3.32997:  14%|█▍        | 2249/15515 [02:26<14:56, 14.80it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 2251: train loss 3.33764:  15%|█▍        | 2251/15515 [02:26<14:53, 14.84it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 2255: train loss 3.39698:  15%|█▍        | 2255/15515 [02:26<14:33, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2259: train loss 3.42132:  15%|█▍        | 2259/15515 [02:26<14:24, 15.33it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2262: train loss 3.37068:  15%|█▍        | 2263/15515 [02:27<14:34, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2266: train loss 3.44021:  15%|█▍        | 2267/15515 [02:27<14:43, 15.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2269: train loss 3.41012:  15%|█▍        | 2269/15515 [02:27<14:52, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2273: train loss 3.37393:  15%|█▍        | 2273/15515 [02:27<14:48, 14.90it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 2276: train loss 3.33025:  15%|█▍        | 2277/15515 [02:28<14:48, 14.89it/s]

 128
32459 128
32459 128


epoch 0 iter 2279: train loss 3.40571:  15%|█▍        | 2279/15515 [02:28<14:46, 14.92it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2283: train loss 3.35250:  15%|█▍        | 2283/15515 [02:28<14:38, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2287: train loss 3.38624:  15%|█▍        | 2287/15515 [02:28<14:24, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2291: train loss 3.39229:  15%|█▍        | 2291/15515 [02:29<14:15, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2295: train loss 3.38438:  15%|█▍        | 2295/15515 [02:29<14:16, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2299: train loss 3.46988:  15%|█▍        | 2299/15515 [02:29<14:13, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2303: train loss 3.48409:  15%|█▍        | 2303/15515 [02:29<14:09, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2307: train loss 3.40447:  15%|█▍        | 2307/15515 [02:30<14:11, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2311: train loss 3.36080:  15%|█▍        | 2311/15515 [02:30<14:12, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2315: train loss 3.39718:  15%|█▍        | 2315/15515 [02:30<14:12, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2319: train loss 3.41163:  15%|█▍        | 2319/15515 [02:30<14:04, 15.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2323: train loss 3.43308:  15%|█▍        | 2323/15515 [02:31<14:09, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2327: train loss 3.33840:  15%|█▍        | 2327/15515 [02:31<14:05, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2331: train loss 3.38222:  15%|█▌        | 2331/15515 [02:31<14:02, 15.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2335: train loss 3.33198:  15%|█▌        | 2335/15515 [02:31<14:06, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2339: train loss 3.34249:  15%|█▌        | 2339/15515 [02:32<14:09, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2343: train loss 3.37563:  15%|█▌        | 2343/15515 [02:32<14:08, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2347: train loss 3.40714:  15%|█▌        | 2347/15515 [02:32<14:09, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2351: train loss 3.40809:  15%|█▌        | 2351/15515 [02:32<14:05, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2355: train loss 3.32519:  15%|█▌        | 2355/15515 [02:33<14:06, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2359: train loss 3.40454:  15%|█▌        | 2359/15515 [02:33<14:07, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2363: train loss 3.47423:  15%|█▌        | 2363/15515 [02:33<13:59, 15.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2367: train loss 3.43026:  15%|█▌        | 2367/15515 [02:33<14:04, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2371: train loss 3.41953:  15%|█▌        | 2371/15515 [02:34<14:10, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2375: train loss 3.37807:  15%|█▌        | 2375/15515 [02:34<14:13, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2379: train loss 3.36542:  15%|█▌        | 2379/15515 [02:34<14:17, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2383: train loss 3.39733:  15%|█▌        | 2383/15515 [02:35<14:09, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2387: train loss 3.42654:  15%|█▌        | 2387/15515 [02:35<14:07, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2391: train loss 3.32035:  15%|█▌        | 2391/15515 [02:35<14:06, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2395: train loss 3.36302:  15%|█▌        | 2395/15515 [02:35<14:06, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2399: train loss 3.34716:  15%|█▌        | 2399/15515 [02:36<14:04, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2403: train loss 3.32885:  15%|█▌        | 2403/15515 [02:36<14:09, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2407: train loss 3.32221:  16%|█▌        | 2407/15515 [02:36<14:12, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2411: train loss 3.43398:  16%|█▌        | 2411/15515 [02:36<14:08, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2415: train loss 3.42806:  16%|█▌        | 2415/15515 [02:37<14:13, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2419: train loss 3.41381:  16%|█▌        | 2419/15515 [02:37<14:08, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2423: train loss 3.38533:  16%|█▌        | 2423/15515 [02:37<14:04, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2427: train loss 3.39560:  16%|█▌        | 2427/15515 [02:37<14:10, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2431: train loss 3.30444:  16%|█▌        | 2431/15515 [02:38<14:02, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2435: train loss 3.34401:  16%|█▌        | 2435/15515 [02:38<14:12, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2439: train loss 3.39768:  16%|█▌        | 2439/15515 [02:38<14:20, 15.20it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2442: train loss 3.35194:  16%|█▌        | 2443/15515 [02:38<14:23, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2446: train loss 3.38077:  16%|█▌        | 2447/15515 [02:39<14:11, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2450: train loss 3.40867:  16%|█▌        | 2451/15515 [02:39<14:09, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2454: train loss 3.37605:  16%|█▌        | 2455/15515 [02:39<14:08, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2458: train loss 3.31522:  16%|█▌        | 2459/15515 [02:39<14:06, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2462: train loss 3.35020:  16%|█▌        | 2463/15515 [02:40<13:59, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2466: train loss 3.37813:  16%|█▌        | 2467/15515 [02:40<14:16, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2470: train loss 3.39727:  16%|█▌        | 2469/15515 [02:40<14:26, 15.06it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2473: train loss 3.33558:  16%|█▌        | 2473/15515 [02:40<14:29, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2477: train loss 3.28492:  16%|█▌        | 2477/15515 [02:41<14:39, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2480: train loss 3.46229:  16%|█▌        | 2481/15515 [02:41<14:36, 14.87it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 2483: train loss 3.34329:  16%|█▌        | 2483/15515 [02:41<14:43, 14.76it/s]

 128
32459 128
32459 128


epoch 0 iter 2486: train loss 3.34509:  16%|█▌        | 2487/15515 [02:41<14:34, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2490: train loss 3.38152:  16%|█▌        | 2491/15515 [02:42<14:20, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2494: train loss 3.30634:  16%|█▌        | 2495/15515 [02:42<14:15, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2498: train loss 3.36465:  16%|█▌        | 2499/15515 [02:42<14:09, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2502: train loss 3.39108:  16%|█▌        | 2503/15515 [02:42<13:56, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2506: train loss 3.39613:  16%|█▌        | 2507/15515 [02:43<13:57, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2510: train loss 3.41077:  16%|█▌        | 2511/15515 [02:43<13:57, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2514: train loss 3.25730:  16%|█▌        | 2515/15515 [02:43<13:50, 15.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2518: train loss 3.34304:  16%|█▌        | 2519/15515 [02:43<13:57, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2522: train loss 3.36054:  16%|█▋        | 2523/15515 [02:44<14:00, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2526: train loss 3.35387:  16%|█▋        | 2527/15515 [02:44<13:56, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2530: train loss 3.37892:  16%|█▋        | 2531/15515 [02:44<14:00, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2534: train loss 3.42412:  16%|█▋        | 2535/15515 [02:44<13:57, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2538: train loss 3.29748:  16%|█▋        | 2539/15515 [02:45<13:56, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2542: train loss 3.37304:  16%|█▋        | 2543/15515 [02:45<13:59, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2546: train loss 3.34216:  16%|█▋        | 2547/15515 [02:45<14:02, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2550: train loss 3.28162:  16%|█▋        | 2551/15515 [02:45<14:02, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2554: train loss 3.44869:  16%|█▋        | 2555/15515 [02:46<13:57, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2558: train loss 3.33563:  16%|█▋        | 2559/15515 [02:46<14:02, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2562: train loss 3.40207:  17%|█▋        | 2563/15515 [02:46<13:59, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2566: train loss 3.32192:  17%|█▋        | 2567/15515 [02:46<13:57, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2570: train loss 3.34848:  17%|█▋        | 2571/15515 [02:47<13:58, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2574: train loss 3.28863:  17%|█▋        | 2575/15515 [02:47<13:56, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2578: train loss 3.41989:  17%|█▋        | 2579/15515 [02:47<13:53, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2582: train loss 3.32667:  17%|█▋        | 2583/15515 [02:47<13:54, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2586: train loss 3.39780:  17%|█▋        | 2587/15515 [02:48<13:53, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2590: train loss 3.32406:  17%|█▋        | 2591/15515 [02:48<13:53, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2594: train loss 3.29216:  17%|█▋        | 2595/15515 [02:48<13:51, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2598: train loss 3.34201:  17%|█▋        | 2599/15515 [02:49<13:52, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2602: train loss 3.41735:  17%|█▋        | 2603/15515 [02:49<13:41, 15.71it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2606: train loss 3.36182:  17%|█▋        | 2607/15515 [02:49<13:48, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2610: train loss 3.37314:  17%|█▋        | 2611/15515 [02:49<13:45, 15.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2614: train loss 3.38871:  17%|█▋        | 2615/15515 [02:50<13:50, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2618: train loss 3.38905:  17%|█▋        | 2619/15515 [02:50<13:56, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2622: train loss 3.33744:  17%|█▋        | 2623/15515 [02:50<13:56, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2626: train loss 3.26494:  17%|█▋        | 2627/15515 [02:50<13:55, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2630: train loss 3.31687:  17%|█▋        | 2631/15515 [02:51<13:51, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2634: train loss 3.32142:  17%|█▋        | 2635/15515 [02:51<13:51, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2638: train loss 3.37407:  17%|█▋        | 2639/15515 [02:51<13:51, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2642: train loss 3.30994:  17%|█▋        | 2643/15515 [02:51<14:05, 15.22it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2645: train loss 3.26751:  17%|█▋        | 2645/15515 [02:52<14:11, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2649: train loss 3.36426:  17%|█▋        | 2649/15515 [02:52<14:17, 15.01it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2652: train loss 3.33488:  17%|█▋        | 2653/15515 [02:52<14:24, 14.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2655: train loss 3.30548:  17%|█▋        | 2655/15515 [02:52<14:34, 14.70it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2658: train loss 3.30243:  17%|█▋        | 2659/15515 [02:52<14:24, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2662: train loss 3.32071:  17%|█▋        | 2663/15515 [02:53<14:08, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2666: train loss 3.36524:  17%|█▋        | 2667/15515 [02:53<14:00, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2670: train loss 3.35360:  17%|█▋        | 2671/15515 [02:53<14:13, 15.06it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2673: train loss 3.26559:  17%|█▋        | 2673/15515 [02:53<14:23, 14.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2676: train loss 3.42278:  17%|█▋        | 2677/15515 [02:54<14:23, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2680: train loss 3.33188:  17%|█▋        | 2681/15515 [02:54<14:22, 14.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2683: train loss 3.31824:  17%|█▋        | 2683/15515 [02:54<14:16, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2687: train loss 3.30797:  17%|█▋        | 2687/15515 [02:54<14:06, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2691: train loss 3.35147:  17%|█▋        | 2691/15515 [02:55<14:16, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2695: train loss 3.40830:  17%|█▋        | 2695/15515 [02:55<14:05, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2699: train loss 3.31520:  17%|█▋        | 2699/15515 [02:55<13:58, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2703: train loss 3.28977:  17%|█▋        | 2703/15515 [02:55<13:48, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2707: train loss 3.30244:  17%|█▋        | 2707/15515 [02:56<13:51, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2711: train loss 3.32428:  17%|█▋        | 2711/15515 [02:56<14:00, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2715: train loss 3.32260:  17%|█▋        | 2715/15515 [02:56<13:48, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2719: train loss 3.32597:  18%|█▊        | 2719/15515 [02:56<13:53, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2723: train loss 3.33912:  18%|█▊        | 2723/15515 [02:57<14:00, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2727: train loss 3.27997:  18%|█▊        | 2727/15515 [02:57<13:57, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2731: train loss 3.34405:  18%|█▊        | 2731/15515 [02:57<13:54, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2735: train loss 3.26497:  18%|█▊        | 2735/15515 [02:58<13:45, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2739: train loss 3.35865:  18%|█▊        | 2739/15515 [02:58<13:47, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2743: train loss 3.37328:  18%|█▊        | 2743/15515 [02:58<13:44, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2747: train loss 3.36572:  18%|█▊        | 2747/15515 [02:58<13:50, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2751: train loss 3.36488:  18%|█▊        | 2751/15515 [02:59<13:51, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2755: train loss 3.35597:  18%|█▊        | 2755/15515 [02:59<13:48, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2759: train loss 3.41971:  18%|█▊        | 2759/15515 [02:59<13:45, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2763: train loss 3.32151:  18%|█▊        | 2763/15515 [02:59<13:49, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2767: train loss 3.37986:  18%|█▊        | 2767/15515 [03:00<13:46, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2771: train loss 3.29665:  18%|█▊        | 2771/15515 [03:00<13:44, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2775: train loss 3.42353:  18%|█▊        | 2775/15515 [03:00<13:47, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2779: train loss 3.37368:  18%|█▊        | 2779/15515 [03:00<13:49, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2783: train loss 3.27835:  18%|█▊        | 2783/15515 [03:01<13:45, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2787: train loss 3.33896:  18%|█▊        | 2787/15515 [03:01<13:40, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2791: train loss 3.34565:  18%|█▊        | 2791/15515 [03:01<13:41, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2795: train loss 3.35709:  18%|█▊        | 2795/15515 [03:01<13:37, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2799: train loss 3.31238:  18%|█▊        | 2799/15515 [03:02<13:51, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2803: train loss 3.33475:  18%|█▊        | 2803/15515 [03:02<13:44, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2807: train loss 3.33126:  18%|█▊        | 2807/15515 [03:02<13:39, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2811: train loss 3.33594:  18%|█▊        | 2811/15515 [03:02<13:37, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2815: train loss 3.27434:  18%|█▊        | 2815/15515 [03:03<13:31, 15.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2819: train loss 3.30384:  18%|█▊        | 2819/15515 [03:03<13:34, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2823: train loss 3.28356:  18%|█▊        | 2823/15515 [03:03<13:37, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2827: train loss 3.35069:  18%|█▊        | 2827/15515 [03:03<13:37, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2831: train loss 3.34778:  18%|█▊        | 2831/15515 [03:04<13:42, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2835: train loss 3.28329:  18%|█▊        | 2835/15515 [03:04<13:44, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2839: train loss 3.35871:  18%|█▊        | 2839/15515 [03:04<13:50, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2843: train loss 3.25313:  18%|█▊        | 2843/15515 [03:04<13:49, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2847: train loss 3.33229:  18%|█▊        | 2847/15515 [03:05<13:55, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2851: train loss 3.25534:  18%|█▊        | 2851/15515 [03:05<14:05, 14.99it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 2854: train loss 3.35262:  18%|█▊        | 2855/15515 [03:05<13:52, 15.21it/s]


32459 128
32459 128
32459 128


epoch 0 iter 2858: train loss 3.30216:  18%|█▊        | 2859/15515 [03:05<13:47, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2862: train loss 3.39317:  18%|█▊        | 2863/15515 [03:06<13:55, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2865: train loss 3.34802:  18%|█▊        | 2865/15515 [03:06<13:52, 15.19it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2869: train loss 3.25680:  18%|█▊        | 2869/15515 [03:06<13:43, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2873: train loss 3.33241:  19%|█▊        | 2873/15515 [03:06<13:40, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2877: train loss 3.28652:  19%|█▊        | 2877/15515 [03:07<14:02, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2881: train loss 3.28602:  19%|█▊        | 2881/15515 [03:07<14:16, 14.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2885: train loss 3.39875:  19%|█▊        | 2885/15515 [03:07<14:19, 14.70it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2888: train loss 3.28639:  19%|█▊        | 2889/15515 [03:07<14:05, 14.93it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2892: train loss 3.38729:  19%|█▊        | 2893/15515 [03:08<14:11, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 2895: train loss 3.26945:  19%|█▊        | 2895/15515 [03:08<14:06, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2899: train loss 3.27222:  19%|█▊        | 2899/15515 [03:08<13:56, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2903: train loss 3.31804:  19%|█▊        | 2903/15515 [03:08<13:51, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2907: train loss 3.30740:  19%|█▊        | 2907/15515 [03:09<13:41, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2911: train loss 3.33324:  19%|█▉        | 2911/15515 [03:09<13:43, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2915: train loss 3.37557:  19%|█▉        | 2915/15515 [03:09<13:41, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2919: train loss 3.31885:  19%|█▉        | 2919/15515 [03:10<13:37, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2923: train loss 3.36665:  19%|█▉        | 2923/15515 [03:10<13:42, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2927: train loss 3.36804:  19%|█▉        | 2927/15515 [03:10<13:43, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2931: train loss 3.24482:  19%|█▉        | 2931/15515 [03:10<13:38, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2935: train loss 3.36135:  19%|█▉        | 2935/15515 [03:11<13:35, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2939: train loss 3.34494:  19%|█▉        | 2939/15515 [03:11<13:34, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2943: train loss 3.33009:  19%|█▉        | 2943/15515 [03:11<13:46, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2947: train loss 3.25977:  19%|█▉        | 2947/15515 [03:11<13:45, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2951: train loss 3.31633:  19%|█▉        | 2951/15515 [03:12<13:38, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2955: train loss 3.21900:  19%|█▉        | 2955/15515 [03:12<13:38, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2959: train loss 3.28601:  19%|█▉        | 2959/15515 [03:12<13:34, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2963: train loss 3.37290:  19%|█▉        | 2963/15515 [03:12<13:36, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2967: train loss 3.31477:  19%|█▉        | 2967/15515 [03:13<13:39, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2971: train loss 3.33158:  19%|█▉        | 2971/15515 [03:13<13:38, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2975: train loss 3.35917:  19%|█▉        | 2975/15515 [03:13<13:37, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2979: train loss 3.30641:  19%|█▉        | 2979/15515 [03:13<13:40, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2983: train loss 3.35637:  19%|█▉        | 2983/15515 [03:14<13:37, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2987: train loss 3.29090:  19%|█▉        | 2987/15515 [03:14<13:35, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2991: train loss 3.30024:  19%|█▉        | 2991/15515 [03:14<13:30, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2995: train loss 3.34329:  19%|█▉        | 2995/15515 [03:14<13:29, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 2999: train loss 3.29465:  19%|█▉        | 2999/15515 [03:15<13:24, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3003: train loss 3.29475:  19%|█▉        | 3003/15515 [03:15<13:26, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3007: train loss 3.28469:  19%|█▉        | 3007/15515 [03:15<13:33, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3011: train loss 3.39219:  19%|█▉        | 3011/15515 [03:16<13:30, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3015: train loss 3.28819:  19%|█▉        | 3015/15515 [03:16<13:31, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3019: train loss 3.36243:  19%|█▉        | 3019/15515 [03:16<13:31, 15.41it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 3022: train loss 3.38494:  19%|█▉        | 3023/15515 [03:16<13:43, 15.17it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 3026: train loss 3.36174:  20%|█▉        | 3027/15515 [03:16<13:42, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3030: train loss 3.35920:  20%|█▉        | 3031/15515 [03:17<13:40, 15.22it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 3033: train loss 3.31541:  20%|█▉        | 3033/15515 [03:17<13:42, 15.18it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 3037: train loss 3.34303:  20%|█▉        | 3037/15515 [03:17<13:38, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3041: train loss 3.36942:  20%|█▉        | 3041/15515 [03:17<13:34, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3045: train loss 3.32111:  20%|█▉        | 3045/15515 [03:18<13:32, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3049: train loss 3.32898:  20%|█▉        | 3049/15515 [03:18<13:37, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3052: train loss 3.37586:  20%|█▉        | 3053/15515 [03:18<13:43, 15.14it/s]

32459 128
32459 128


epoch 0 iter 3055: train loss 3.35951:  20%|█▉        | 3055/15515 [03:18<13:40, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3059: train loss 3.33751:  20%|█▉        | 3059/15515 [03:19<13:31, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3063: train loss 3.24000:  20%|█▉        | 3063/15515 [03:19<13:27, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3067: train loss 3.29324:  20%|█▉        | 3067/15515 [03:19<13:37, 15.22it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 3070: train loss 3.34403:  20%|█▉        | 3071/15515 [03:19<13:44, 15.09it/s]

128
32459 128
32459 128
32459

epoch 0 iter 3073: train loss 3.26879:  20%|█▉        | 3073/15515 [03:20<13:43, 15.11it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 3077: train loss 3.33345:  20%|█▉        | 3077/15515 [03:20<13:41, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3081: train loss 3.29633:  20%|█▉        | 3081/15515 [03:20<13:34, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3085: train loss 3.27842:  20%|█▉        | 3085/15515 [03:20<13:43, 15.09it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3088: train loss 3.30830:  20%|█▉        | 3089/15515 [03:21<14:01, 14.76it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3091: train loss 3.34137:  20%|█▉        | 3091/15515 [03:21<14:07, 14.66it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3094: train loss 3.31709:  20%|█▉        | 3095/15515 [03:21<14:05, 14.70it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3097: train loss 3.37264:  20%|█▉        | 3097/15515 [03:21<14:02, 14.74it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 3100: train loss 3.24607:  20%|█▉        | 3101/15515 [03:21<13:49, 14.96it/s]


32459 128
32459 128
32459 128


epoch 0 iter 3104: train loss 3.28307:  20%|██        | 3105/15515 [03:22<13:34, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3108: train loss 3.31051:  20%|██        | 3109/15515 [03:22<13:29, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3112: train loss 3.29903:  20%|██        | 3113/15515 [03:22<13:28, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3116: train loss 3.25380:  20%|██        | 3117/15515 [03:22<13:36, 15.19it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 3119: train loss 3.34751:  20%|██        | 3119/15515 [03:23<13:36, 15.18it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 3123: train loss 3.18115:  20%|██        | 3123/15515 [03:23<13:37, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3127: train loss 3.32467:  20%|██        | 3127/15515 [03:23<13:31, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3131: train loss 3.29600:  20%|██        | 3131/15515 [03:23<13:25, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3135: train loss 3.22040:  20%|██        | 3135/15515 [03:24<13:20, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3139: train loss 3.32504:  20%|██        | 3139/15515 [03:24<13:18, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3143: train loss 3.36525:  20%|██        | 3143/15515 [03:24<13:15, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3147: train loss 3.22970:  20%|██        | 3147/15515 [03:24<13:20, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3151: train loss 3.30297:  20%|██        | 3151/15515 [03:25<13:20, 15.44it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3154: train loss 3.29121:  20%|██        | 3155/15515 [03:25<13:26, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3158: train loss 3.31731:  20%|██        | 3159/15515 [03:25<13:23, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3162: train loss 3.22284:  20%|██        | 3163/15515 [03:25<13:31, 15.23it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 3165: train loss 3.29858:  20%|██        | 3165/15515 [03:26<13:30, 15.24it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 3169: train loss 3.28401:  20%|██        | 3169/15515 [03:26<13:26, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3173: train loss 3.32613:  20%|██        | 3173/15515 [03:26<13:33, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3177: train loss 3.31197:  20%|██        | 3177/15515 [03:26<13:33, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3181: train loss 3.33436:  21%|██        | 3181/15515 [03:27<13:25, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3185: train loss 3.28164:  21%|██        | 3185/15515 [03:27<13:18, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3189: train loss 3.29706:  21%|██        | 3189/15515 [03:27<13:16, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3193: train loss 3.32588:  21%|██        | 3193/15515 [03:27<13:17, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3197: train loss 3.40743:  21%|██        | 3197/15515 [03:28<13:16, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3201: train loss 3.27086:  21%|██        | 3201/15515 [03:28<13:10, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3205: train loss 3.31766:  21%|██        | 3205/15515 [03:28<13:10, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3209: train loss 3.27201:  21%|██        | 3209/15515 [03:28<13:11, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3213: train loss 3.26403:  21%|██        | 3213/15515 [03:29<13:14, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3217: train loss 3.28072:  21%|██        | 3217/15515 [03:29<13:14, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3221: train loss 3.33416:  21%|██        | 3221/15515 [03:29<13:20, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3225: train loss 3.21427:  21%|██        | 3225/15515 [03:30<13:20, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3229: train loss 3.30493:  21%|██        | 3229/15515 [03:30<13:20, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3233: train loss 3.25253:  21%|██        | 3233/15515 [03:30<13:18, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3237: train loss 3.30667:  21%|██        | 3237/15515 [03:30<13:28, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3241: train loss 3.33427:  21%|██        | 3241/15515 [03:31<13:36, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3245: train loss 3.36870:  21%|██        | 3245/15515 [03:31<13:27, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3249: train loss 3.32561:  21%|██        | 3249/15515 [03:31<13:23, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3253: train loss 3.30753:  21%|██        | 3253/15515 [03:31<13:13, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3257: train loss 3.31068:  21%|██        | 3257/15515 [03:32<13:33, 15.07it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 3260: train loss 3.33599:  21%|██        | 3261/15515 [03:32<13:52, 14.72it/s]

 128
32459 128
32459 128


epoch 0 iter 3263: train loss 3.25041:  21%|██        | 3263/15515 [03:32<13:47, 14.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3267: train loss 3.32702:  21%|██        | 3267/15515 [03:32<13:29, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3271: train loss 3.36614:  21%|██        | 3271/15515 [03:33<13:22, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3275: train loss 3.24732:  21%|██        | 3275/15515 [03:33<13:17, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3279: train loss 3.31834:  21%|██        | 3279/15515 [03:33<13:10, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3283: train loss 3.28927:  21%|██        | 3283/15515 [03:33<13:19, 15.30it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3286: train loss 3.31064:  21%|██        | 3287/15515 [03:34<13:29, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3290: train loss 3.31818:  21%|██        | 3291/15515 [03:34<13:28, 15.11it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3293: train loss 3.30906:  21%|██        | 3293/15515 [03:34<13:23, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3297: train loss 3.37671:  21%|██▏       | 3297/15515 [03:34<13:31, 15.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3300: train loss 3.21330:  21%|██▏       | 3301/15515 [03:34<13:44, 14.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3303: train loss 3.23868:  21%|██▏       | 3303/15515 [03:35<13:47, 14.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3305: train loss 3.36405:  21%|██▏       | 3305/15515 [03:35<14:15, 14.28it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3308: train loss 3.28131:  21%|██▏       | 3309/15515 [03:35<15:21, 13.25it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3311: train loss 3.28760:  21%|██▏       | 3311/15515 [03:35<15:48, 12.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3315: train loss 3.27381:  21%|██▏       | 3315/15515 [03:36<14:43, 13.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3319: train loss 3.25149:  21%|██▏       | 3319/15515 [03:36<14:00, 14.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3323: train loss 3.26624:  21%|██▏       | 3323/15515 [03:36<13:30, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3327: train loss 3.28399:  21%|██▏       | 3327/15515 [03:36<13:20, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3331: train loss 3.27734:  21%|██▏       | 3331/15515 [03:37<13:12, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3335: train loss 3.36065:  21%|██▏       | 3335/15515 [03:37<13:12, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3339: train loss 3.30621:  22%|██▏       | 3339/15515 [03:37<13:10, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3343: train loss 3.25517:  22%|██▏       | 3343/15515 [03:37<13:07, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3347: train loss 3.26138:  22%|██▏       | 3347/15515 [03:38<13:01, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3351: train loss 3.29200:  22%|██▏       | 3351/15515 [03:38<13:07, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3355: train loss 3.21286:  22%|██▏       | 3355/15515 [03:38<13:03, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3359: train loss 3.26187:  22%|██▏       | 3359/15515 [03:38<13:03, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3363: train loss 3.26566:  22%|██▏       | 3363/15515 [03:39<13:04, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3367: train loss 3.33362:  22%|██▏       | 3367/15515 [03:39<13:05, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3371: train loss 3.28388:  22%|██▏       | 3371/15515 [03:39<13:10, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3375: train loss 3.45539:  22%|██▏       | 3375/15515 [03:39<13:09, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3379: train loss 3.24396:  22%|██▏       | 3379/15515 [03:40<13:00, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3383: train loss 3.30673:  22%|██▏       | 3383/15515 [03:40<13:04, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3387: train loss 3.21592:  22%|██▏       | 3387/15515 [03:40<13:06, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3391: train loss 3.24488:  22%|██▏       | 3391/15515 [03:40<13:05, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3395: train loss 3.31572:  22%|██▏       | 3395/15515 [03:41<13:07, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3399: train loss 3.24475:  22%|██▏       | 3399/15515 [03:41<13:20, 15.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3402: train loss 3.26042:  22%|██▏       | 3403/15515 [03:41<13:18, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3406: train loss 3.27165:  22%|██▏       | 3407/15515 [03:41<13:13, 15.25it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 3409: train loss 3.28608:  22%|██▏       | 3409/15515 [03:42<13:14, 15.24it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 3413: train loss 3.29721:  22%|██▏       | 3413/15515 [03:42<13:14, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3417: train loss 3.27448:  22%|██▏       | 3417/15515 [03:42<13:15, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3421: train loss 3.28688:  22%|██▏       | 3421/15515 [03:42<13:06, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3425: train loss 3.25336:  22%|██▏       | 3425/15515 [03:43<13:01, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3429: train loss 3.32357:  22%|██▏       | 3429/15515 [03:43<13:03, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3433: train loss 3.25737:  22%|██▏       | 3433/15515 [03:43<13:04, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3437: train loss 3.23810:  22%|██▏       | 3437/15515 [03:43<13:02, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3441: train loss 3.27944:  22%|██▏       | 3441/15515 [03:44<13:02, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3445: train loss 3.33914:  22%|██▏       | 3445/15515 [03:44<13:01, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3449: train loss 3.32399:  22%|██▏       | 3449/15515 [03:44<13:01, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3453: train loss 3.24614:  22%|██▏       | 3453/15515 [03:45<12:57, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3457: train loss 3.27207:  22%|██▏       | 3457/15515 [03:45<13:01, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3461: train loss 3.29487:  22%|██▏       | 3461/15515 [03:45<12:59, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3465: train loss 3.21876:  22%|██▏       | 3465/15515 [03:45<12:59, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3469: train loss 3.25622:  22%|██▏       | 3469/15515 [03:46<13:04, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3473: train loss 3.34961:  22%|██▏       | 3473/15515 [03:46<13:22, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3477: train loss 3.27486:  22%|██▏       | 3477/15515 [03:46<13:11, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3481: train loss 3.28831:  22%|██▏       | 3481/15515 [03:46<13:05, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3485: train loss 3.32290:  22%|██▏       | 3485/15515 [03:47<12:59, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3489: train loss 3.33773:  22%|██▏       | 3489/15515 [03:47<13:01, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3493: train loss 3.31447:  23%|██▎       | 3493/15515 [03:47<13:03, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3497: train loss 3.17107:  23%|██▎       | 3497/15515 [03:47<13:15, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3501: train loss 3.30795:  23%|██▎       | 3501/15515 [03:48<13:23, 14.96it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3504: train loss 3.25983:  23%|██▎       | 3505/15515 [03:48<13:29, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3507: train loss 3.25066:  23%|██▎       | 3507/15515 [03:48<13:35, 14.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3511: train loss 3.30460:  23%|██▎       | 3511/15515 [03:48<13:29, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3514: train loss 3.28678:  23%|██▎       | 3515/15515 [03:49<13:16, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3517: train loss 3.23698:  23%|██▎       | 3517/15515 [03:49<14:30, 13.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3520: train loss 3.25473:  23%|██▎       | 3521/15515 [03:49<17:16, 11.57it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3524: train loss 3.20367:  23%|██▎       | 3525/15515 [03:49<15:28, 12.91it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3527: train loss 3.25432:  23%|██▎       | 3527/15515 [03:50<14:47, 13.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3531: train loss 3.31386:  23%|██▎       | 3531/15515 [03:50<13:50, 14.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3535: train loss 3.32373:  23%|██▎       | 3535/15515 [03:50<13:25, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3539: train loss 3.32240:  23%|██▎       | 3539/15515 [03:50<13:15, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3543: train loss 3.36533:  23%|██▎       | 3543/15515 [03:51<13:07, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3547: train loss 3.31212:  23%|██▎       | 3547/15515 [03:51<13:02, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3551: train loss 3.30238:  23%|██▎       | 3551/15515 [03:51<12:58, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3555: train loss 3.24972:  23%|██▎       | 3555/15515 [03:51<12:55, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3559: train loss 3.27654:  23%|██▎       | 3559/15515 [03:52<12:56, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3563: train loss 3.24594:  23%|██▎       | 3563/15515 [03:52<12:56, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3567: train loss 3.27473:  23%|██▎       | 3567/15515 [03:52<12:58, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3571: train loss 3.26642:  23%|██▎       | 3571/15515 [03:52<12:54, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3575: train loss 3.21739:  23%|██▎       | 3575/15515 [03:53<12:54, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3579: train loss 3.26315:  23%|██▎       | 3579/15515 [03:53<13:02, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3583: train loss 3.28128:  23%|██▎       | 3583/15515 [03:53<13:07, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3587: train loss 3.28158:  23%|██▎       | 3587/15515 [03:54<13:08, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3591: train loss 3.24296:  23%|██▎       | 3591/15515 [03:54<13:06, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3595: train loss 3.29658:  23%|██▎       | 3595/15515 [03:54<13:02, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3599: train loss 3.25378:  23%|██▎       | 3599/15515 [03:54<12:57, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3603: train loss 3.16044:  23%|██▎       | 3603/15515 [03:55<13:04, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3607: train loss 3.22463:  23%|██▎       | 3607/15515 [03:55<13:10, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3611: train loss 3.28206:  23%|██▎       | 3611/15515 [03:55<13:01, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3615: train loss 3.26916:  23%|██▎       | 3615/15515 [03:55<12:59, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3619: train loss 3.29060:  23%|██▎       | 3619/15515 [03:56<12:48, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3623: train loss 3.29797:  23%|██▎       | 3623/15515 [03:56<12:48, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3627: train loss 3.19011:  23%|██▎       | 3627/15515 [03:56<12:46, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3631: train loss 3.30661:  23%|██▎       | 3631/15515 [03:56<12:41, 15.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3635: train loss 3.31368:  23%|██▎       | 3635/15515 [03:57<12:50, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3639: train loss 3.29154:  23%|██▎       | 3639/15515 [03:57<12:53, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3643: train loss 3.22094:  23%|██▎       | 3643/15515 [03:57<12:52, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3647: train loss 3.28295:  24%|██▎       | 3647/15515 [03:57<12:49, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3651: train loss 3.20325:  24%|██▎       | 3651/15515 [03:58<12:42, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3655: train loss 3.27088:  24%|██▎       | 3655/15515 [03:58<12:58, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3659: train loss 3.21601:  24%|██▎       | 3659/15515 [03:58<13:02, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3663: train loss 3.23653:  24%|██▎       | 3663/15515 [03:58<12:52, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3667: train loss 3.26600:  24%|██▎       | 3667/15515 [03:59<12:55, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3671: train loss 3.25021:  24%|██▎       | 3671/15515 [03:59<13:20, 14.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3675: train loss 3.20901:  24%|██▎       | 3675/15515 [03:59<13:03, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3679: train loss 3.23118:  24%|██▎       | 3679/15515 [04:00<12:55, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3683: train loss 3.20404:  24%|██▎       | 3683/15515 [04:00<13:03, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3687: train loss 3.28859:  24%|██▍       | 3687/15515 [04:00<13:08, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3691: train loss 3.29836:  24%|██▍       | 3691/15515 [04:00<13:02, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3695: train loss 3.28860:  24%|██▍       | 3695/15515 [04:01<13:02, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3699: train loss 3.21756:  24%|██▍       | 3699/15515 [04:01<13:05, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3703: train loss 3.31371:  24%|██▍       | 3703/15515 [04:01<13:02, 15.09it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 3706: train loss 3.29556:  24%|██▍       | 3707/15515 [04:01<13:18, 14.79it/s]

128
32459 128
32459 128


epoch 0 iter 3709: train loss 3.22848:  24%|██▍       | 3709/15515 [04:02<13:05, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3713: train loss 3.25302:  24%|██▍       | 3713/15515 [04:02<13:05, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3717: train loss 3.23121:  24%|██▍       | 3717/15515 [04:02<13:12, 14.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3720: train loss 3.17970:  24%|██▍       | 3721/15515 [04:02<13:19, 14.74it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 3723: train loss 3.23195:  24%|██▍       | 3723/15515 [04:02<13:14, 14.85it/s]

 128
32459 128
32459 128
32459 128

epoch 0 iter 3726: train loss 3.24625:  24%|██▍       | 3727/15515 [04:03<13:22, 14.70it/s]


32459 128
32459 128


epoch 0 iter 3729: train loss 3.25272:  24%|██▍       | 3729/15515 [04:03<13:18, 14.77it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3733: train loss 3.29218:  24%|██▍       | 3733/15515 [04:03<12:59, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3737: train loss 3.20633:  24%|██▍       | 3737/15515 [04:03<12:50, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3741: train loss 3.20314:  24%|██▍       | 3741/15515 [04:04<12:40, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3745: train loss 3.24926:  24%|██▍       | 3745/15515 [04:04<12:41, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3749: train loss 3.33137:  24%|██▍       | 3749/15515 [04:04<12:37, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3753: train loss 3.27461:  24%|██▍       | 3753/15515 [04:04<12:37, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3757: train loss 3.34008:  24%|██▍       | 3757/15515 [04:05<12:48, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3761: train loss 3.30577:  24%|██▍       | 3761/15515 [04:05<12:45, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3765: train loss 3.29158:  24%|██▍       | 3765/15515 [04:05<12:47, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3769: train loss 3.27817:  24%|██▍       | 3769/15515 [04:05<12:47, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3773: train loss 3.34492:  24%|██▍       | 3773/15515 [04:06<12:44, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3777: train loss 3.26328:  24%|██▍       | 3777/15515 [04:06<12:42, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3781: train loss 3.22598:  24%|██▍       | 3781/15515 [04:06<12:40, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3785: train loss 3.23297:  24%|██▍       | 3785/15515 [04:06<12:46, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3789: train loss 3.28070:  24%|██▍       | 3789/15515 [04:07<12:49, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3793: train loss 3.25838:  24%|██▍       | 3793/15515 [04:07<12:49, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3797: train loss 3.24320:  24%|██▍       | 3797/15515 [04:07<12:48, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3801: train loss 3.31326:  24%|██▍       | 3801/15515 [04:08<12:40, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3805: train loss 3.22515:  25%|██▍       | 3805/15515 [04:08<12:40, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3809: train loss 3.29931:  25%|██▍       | 3809/15515 [04:08<12:35, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3813: train loss 3.29869:  25%|██▍       | 3813/15515 [04:08<12:32, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3817: train loss 3.23751:  25%|██▍       | 3817/15515 [04:09<12:35, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3821: train loss 3.19522:  25%|██▍       | 3821/15515 [04:09<12:29, 15.61it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3825: train loss 3.23643:  25%|██▍       | 3825/15515 [04:09<12:32, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3829: train loss 3.33305:  25%|██▍       | 3829/15515 [04:09<12:36, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3833: train loss 3.26319:  25%|██▍       | 3833/15515 [04:10<12:29, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3837: train loss 3.28900:  25%|██▍       | 3837/15515 [04:10<12:26, 15.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3841: train loss 3.21903:  25%|██▍       | 3841/15515 [04:10<12:30, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3845: train loss 3.23721:  25%|██▍       | 3845/15515 [04:10<12:30, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3849: train loss 3.31433:  25%|██▍       | 3849/15515 [04:11<12:30, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3853: train loss 3.30746:  25%|██▍       | 3853/15515 [04:11<12:33, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3857: train loss 3.30544:  25%|██▍       | 3857/15515 [04:11<12:35, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3861: train loss 3.27010:  25%|██▍       | 3861/15515 [04:11<12:30, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3865: train loss 3.20776:  25%|██▍       | 3865/15515 [04:12<12:28, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3869: train loss 3.26775:  25%|██▍       | 3869/15515 [04:12<12:31, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3873: train loss 3.20943:  25%|██▍       | 3873/15515 [04:12<12:29, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3877: train loss 3.26258:  25%|██▍       | 3877/15515 [04:12<12:33, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3881: train loss 3.24112:  25%|██▌       | 3881/15515 [04:13<12:32, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3885: train loss 3.20169:  25%|██▌       | 3885/15515 [04:13<12:36, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3888: train loss 3.24647:  25%|██▌       | 3889/15515 [04:13<12:53, 15.04it/s]

32459 128
32459 128


epoch 0 iter 3891: train loss 3.20107:  25%|██▌       | 3891/15515 [04:13<12:53, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3895: train loss 3.26212:  25%|██▌       | 3895/15515 [04:14<12:41, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3899: train loss 3.20548:  25%|██▌       | 3899/15515 [04:14<12:37, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3903: train loss 3.28494:  25%|██▌       | 3903/15515 [04:14<12:30, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3907: train loss 3.22793:  25%|██▌       | 3907/15515 [04:14<12:34, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3911: train loss 3.26554:  25%|██▌       | 3911/15515 [04:15<12:41, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3915: train loss 3.25213:  25%|██▌       | 3915/15515 [04:15<12:56, 14.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3918: train loss 3.27080:  25%|██▌       | 3919/15515 [04:15<12:58, 14.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3921: train loss 3.25495:  25%|██▌       | 3921/15515 [04:15<13:02, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3924: train loss 3.22910:  25%|██▌       | 3925/15515 [04:16<13:04, 14.77it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3928: train loss 3.31080:  25%|██▌       | 3929/15515 [04:16<13:06, 14.74it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3931: train loss 3.22549:  25%|██▌       | 3931/15515 [04:16<13:02, 14.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3935: train loss 3.17891:  25%|██▌       | 3935/15515 [04:16<13:05, 14.75it/s]

32459 128
32459 128
32459 128


epoch 0 iter 3938: train loss 3.23379:  25%|██▌       | 3939/15515 [04:16<12:52, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3942: train loss 3.24881:  25%|██▌       | 3943/15515 [04:17<12:38, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3946: train loss 3.25215:  25%|██▌       | 3947/15515 [04:17<12:36, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3950: train loss 3.20652:  25%|██▌       | 3951/15515 [04:17<12:34, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3954: train loss 3.22179:  25%|██▌       | 3955/15515 [04:18<12:33, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3958: train loss 3.30013:  26%|██▌       | 3959/15515 [04:18<12:30, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3962: train loss 3.20404:  26%|██▌       | 3963/15515 [04:18<12:31, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3966: train loss 3.24726:  26%|██▌       | 3967/15515 [04:18<12:31, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3970: train loss 3.12933:  26%|██▌       | 3971/15515 [04:19<12:27, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3974: train loss 3.16225:  26%|██▌       | 3975/15515 [04:19<12:29, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3978: train loss 3.25399:  26%|██▌       | 3979/15515 [04:19<12:32, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3982: train loss 3.22238:  26%|██▌       | 3983/15515 [04:19<12:29, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3986: train loss 3.21687:  26%|██▌       | 3987/15515 [04:20<12:20, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3990: train loss 3.21441:  26%|██▌       | 3991/15515 [04:20<12:20, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3994: train loss 3.24116:  26%|██▌       | 3995/15515 [04:20<12:19, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 3998: train loss 3.24636:  26%|██▌       | 3999/15515 [04:20<12:20, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4002: train loss 3.24092:  26%|██▌       | 4003/15515 [04:21<12:19, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4006: train loss 3.20228:  26%|██▌       | 4007/15515 [04:21<12:23, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4010: train loss 3.24766:  26%|██▌       | 4011/15515 [04:21<12:20, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4014: train loss 3.24624:  26%|██▌       | 4015/15515 [04:21<12:20, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4018: train loss 3.20080:  26%|██▌       | 4019/15515 [04:22<12:23, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4022: train loss 3.15939:  26%|██▌       | 4023/15515 [04:22<12:21, 15.51it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4025: train loss 3.18929:  26%|██▌       | 4025/15515 [04:22<12:31, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4029: train loss 3.23810:  26%|██▌       | 4029/15515 [04:22<12:32, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4033: train loss 3.27033:  26%|██▌       | 4033/15515 [04:23<12:25, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4037: train loss 3.32541:  26%|██▌       | 4037/15515 [04:23<12:21, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4041: train loss 3.29455:  26%|██▌       | 4041/15515 [04:23<12:21, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4045: train loss 3.15872:  26%|██▌       | 4045/15515 [04:23<12:20, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4049: train loss 3.21615:  26%|██▌       | 4049/15515 [04:24<12:18, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4053: train loss 3.29908:  26%|██▌       | 4053/15515 [04:24<12:18, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4057: train loss 3.31011:  26%|██▌       | 4057/15515 [04:24<12:16, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4061: train loss 3.21578:  26%|██▌       | 4061/15515 [04:24<12:13, 15.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4065: train loss 3.24213:  26%|██▌       | 4065/15515 [04:25<12:14, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4069: train loss 3.16311:  26%|██▌       | 4069/15515 [04:25<12:16, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4073: train loss 3.22330:  26%|██▋       | 4073/15515 [04:25<12:17, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4077: train loss 3.19038:  26%|██▋       | 4077/15515 [04:25<12:19, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4081: train loss 3.20613:  26%|██▋       | 4081/15515 [04:26<12:25, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4085: train loss 3.16692:  26%|██▋       | 4085/15515 [04:26<12:25, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4089: train loss 3.23646:  26%|██▋       | 4089/15515 [04:26<12:28, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4093: train loss 3.18603:  26%|██▋       | 4093/15515 [04:27<12:35, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4097: train loss 3.29774:  26%|██▋       | 4097/15515 [04:27<12:40, 15.01it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 4100: train loss 3.27509:  26%|██▋       | 4101/15515 [04:27<12:33, 15.15it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 4104: train loss 3.26453:  26%|██▋       | 4105/15515 [04:27<12:26, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4108: train loss 3.22348:  26%|██▋       | 4109/15515 [04:28<12:20, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4112: train loss 3.23308:  27%|██▋       | 4113/15515 [04:28<12:28, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4116: train loss 3.16954:  27%|██▋       | 4117/15515 [04:28<12:17, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4120: train loss 3.18477:  27%|██▋       | 4121/15515 [04:28<12:14, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4124: train loss 3.19726:  27%|██▋       | 4125/15515 [04:29<12:13, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4128: train loss 3.25430:  27%|██▋       | 4129/15515 [04:29<12:24, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4132: train loss 3.23154:  27%|██▋       | 4133/15515 [04:29<12:28, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4136: train loss 3.18414:  27%|██▋       | 4137/15515 [04:29<12:32, 15.12it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4139: train loss 3.24603:  27%|██▋       | 4139/15515 [04:30<12:36, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4143: train loss 3.21192:  27%|██▋       | 4143/15515 [04:30<12:39, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4147: train loss 3.24746:  27%|██▋       | 4147/15515 [04:30<12:22, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4151: train loss 3.24940:  27%|██▋       | 4151/15515 [04:30<12:14, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4155: train loss 3.23292:  27%|██▋       | 4155/15515 [04:31<12:29, 15.16it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4158: train loss 3.27768:  27%|██▋       | 4159/15515 [04:31<12:43, 14.88it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4161: train loss 3.24595:  27%|██▋       | 4161/15515 [04:31<12:34, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4165: train loss 3.25408:  27%|██▋       | 4165/15515 [04:31<12:23, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4169: train loss 3.21497:  27%|██▋       | 4169/15515 [04:32<12:19, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4173: train loss 3.26384:  27%|██▋       | 4173/15515 [04:32<12:13, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4177: train loss 3.19554:  27%|██▋       | 4177/15515 [04:32<12:23, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4181: train loss 3.23959:  27%|██▋       | 4181/15515 [04:32<12:18, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4185: train loss 3.17469:  27%|██▋       | 4185/15515 [04:33<12:14, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4189: train loss 3.25132:  27%|██▋       | 4189/15515 [04:33<12:11, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4193: train loss 3.28638:  27%|██▋       | 4193/15515 [04:33<12:12, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4197: train loss 3.15036:  27%|██▋       | 4197/15515 [04:33<12:09, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4201: train loss 3.15110:  27%|██▋       | 4201/15515 [04:34<12:10, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4205: train loss 3.18763:  27%|██▋       | 4205/15515 [04:34<12:09, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4209: train loss 3.19912:  27%|██▋       | 4209/15515 [04:34<12:07, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4213: train loss 3.21964:  27%|██▋       | 4213/15515 [04:34<12:09, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4217: train loss 3.17684:  27%|██▋       | 4217/15515 [04:35<12:12, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4221: train loss 3.23266:  27%|██▋       | 4221/15515 [04:35<12:16, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4225: train loss 3.23462:  27%|██▋       | 4225/15515 [04:35<12:12, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4229: train loss 3.22071:  27%|██▋       | 4229/15515 [04:35<12:05, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4233: train loss 3.27262:  27%|██▋       | 4233/15515 [04:36<12:06, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4237: train loss 3.25690:  27%|██▋       | 4237/15515 [04:36<12:11, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4241: train loss 3.24120:  27%|██▋       | 4241/15515 [04:36<12:12, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4245: train loss 3.19194:  27%|██▋       | 4245/15515 [04:36<12:05, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4249: train loss 3.25681:  27%|██▋       | 4249/15515 [04:37<12:12, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4253: train loss 3.25574:  27%|██▋       | 4253/15515 [04:37<12:13, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4257: train loss 3.24846:  27%|██▋       | 4257/15515 [04:37<12:13, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4261: train loss 3.22134:  27%|██▋       | 4261/15515 [04:37<12:14, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4265: train loss 3.26969:  27%|██▋       | 4265/15515 [04:38<12:16, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4269: train loss 3.18828:  28%|██▊       | 4269/15515 [04:38<12:14, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4273: train loss 3.19341:  28%|██▊       | 4273/15515 [04:38<12:10, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4277: train loss 3.23425:  28%|██▊       | 4277/15515 [04:39<12:07, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4281: train loss 3.25361:  28%|██▊       | 4281/15515 [04:39<12:10, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4285: train loss 3.17497:  28%|██▊       | 4285/15515 [04:39<12:10, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4289: train loss 3.26328:  28%|██▊       | 4289/15515 [04:39<12:14, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4293: train loss 3.17471:  28%|██▊       | 4293/15515 [04:40<12:18, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4297: train loss 3.24860:  28%|██▊       | 4297/15515 [04:40<12:26, 15.04it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 4300: train loss 3.24141:  28%|██▊       | 4301/15515 [04:40<12:30, 14.95it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 4304: train loss 3.22585:  28%|██▊       | 4305/15515 [04:40<12:20, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4308: train loss 3.22751:  28%|██▊       | 4309/15515 [04:41<12:15, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4312: train loss 3.26720:  28%|██▊       | 4313/15515 [04:41<12:12, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4316: train loss 3.15449:  28%|██▊       | 4317/15515 [04:41<12:20, 15.12it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4319: train loss 3.14889:  28%|██▊       | 4319/15515 [04:41<12:28, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4323: train loss 3.27917:  28%|██▊       | 4323/15515 [04:42<12:24, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4327: train loss 3.26647:  28%|██▊       | 4327/15515 [04:42<12:19, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4331: train loss 3.27967:  28%|██▊       | 4331/15515 [04:42<12:13, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4335: train loss 3.23765:  28%|██▊       | 4335/15515 [04:42<12:23, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4339: train loss 3.27968:  28%|██▊       | 4339/15515 [04:43<12:07, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4343: train loss 3.31122:  28%|██▊       | 4343/15515 [04:43<11:59, 15.54it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4346: train loss 3.25610:  28%|██▊       | 4347/15515 [04:43<12:27, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4349: train loss 3.21104:  28%|██▊       | 4349/15515 [04:43<12:33, 14.82it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 4352: train loss 3.21673:  28%|██▊       | 4353/15515 [04:43<12:22, 15.03it/s]


32459 128
32459 128
32459 128


epoch 0 iter 4356: train loss 3.19336:  28%|██▊       | 4357/15515 [04:44<12:16, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4360: train loss 3.23925:  28%|██▊       | 4361/15515 [04:44<12:18, 15.11it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4363: train loss 3.21467:  28%|██▊       | 4363/15515 [04:44<12:17, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4367: train loss 3.26191:  28%|██▊       | 4367/15515 [04:44<12:16, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4371: train loss 3.25284:  28%|██▊       | 4371/15515 [04:45<12:17, 15.11it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 4374: train loss 3.30094:  28%|██▊       | 4375/15515 [04:45<12:16, 15.13it/s]


32459 128
32459 128
32459 128


epoch 0 iter 4378: train loss 3.19085:  28%|██▊       | 4379/15515 [04:45<12:05, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4382: train loss 3.19705:  28%|██▊       | 4383/15515 [04:45<12:08, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4386: train loss 3.15810:  28%|██▊       | 4387/15515 [04:46<12:06, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4390: train loss 3.18294:  28%|██▊       | 4391/15515 [04:46<12:06, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4394: train loss 3.16633:  28%|██▊       | 4395/15515 [04:46<12:05, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4398: train loss 3.25846:  28%|██▊       | 4399/15515 [04:46<12:08, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4402: train loss 3.19635:  28%|██▊       | 4403/15515 [04:47<12:00, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4406: train loss 3.22087:  28%|██▊       | 4407/15515 [04:47<11:59, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4410: train loss 3.19595:  28%|██▊       | 4411/15515 [04:47<12:05, 15.30it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4413: train loss 3.15915:  28%|██▊       | 4413/15515 [04:47<12:05, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4417: train loss 3.26517:  28%|██▊       | 4417/15515 [04:48<12:02, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4421: train loss 3.23265:  28%|██▊       | 4421/15515 [04:48<11:59, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4425: train loss 3.23422:  29%|██▊       | 4425/15515 [04:48<11:59, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4429: train loss 3.17618:  29%|██▊       | 4429/15515 [04:48<11:55, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4433: train loss 3.23540:  29%|██▊       | 4433/15515 [04:49<11:55, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4437: train loss 3.18624:  29%|██▊       | 4437/15515 [04:49<11:56, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4441: train loss 3.18734:  29%|██▊       | 4441/15515 [04:49<11:52, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4445: train loss 3.21258:  29%|██▊       | 4445/15515 [04:50<11:52, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4449: train loss 3.22481:  29%|██▊       | 4449/15515 [04:50<11:52, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4453: train loss 3.23908:  29%|██▊       | 4453/15515 [04:50<11:52, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4457: train loss 3.21183:  29%|██▊       | 4457/15515 [04:50<11:49, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4461: train loss 3.22684:  29%|██▉       | 4461/15515 [04:51<11:51, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4465: train loss 3.23361:  29%|██▉       | 4465/15515 [04:51<11:55, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4469: train loss 3.21221:  29%|██▉       | 4469/15515 [04:51<11:52, 15.50it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4472: train loss 3.15214:  29%|██▉       | 4473/15515 [04:51<12:04, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4476: train loss 3.22284:  29%|██▉       | 4477/15515 [04:52<11:56, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4480: train loss 3.18792:  29%|██▉       | 4481/15515 [04:52<12:00, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4484: train loss 3.18863:  29%|██▉       | 4485/15515 [04:52<12:07, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4488: train loss 3.21011:  29%|██▉       | 4489/15515 [04:52<11:57, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4492: train loss 3.22878:  29%|██▉       | 4493/15515 [04:53<11:57, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4496: train loss 3.17959:  29%|██▉       | 4497/15515 [04:53<12:00, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4500: train loss 3.22714:  29%|██▉       | 4501/15515 [04:53<11:57, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4504: train loss 3.18997:  29%|██▉       | 4505/15515 [04:53<11:57, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4508: train loss 3.17538:  29%|██▉       | 4509/15515 [04:54<11:54, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4512: train loss 3.21185:  29%|██▉       | 4513/15515 [04:54<11:53, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4516: train loss 3.23885:  29%|██▉       | 4517/15515 [04:54<11:52, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4520: train loss 3.17075:  29%|██▉       | 4521/15515 [04:54<11:54, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4524: train loss 3.18305:  29%|██▉       | 4525/15515 [04:55<11:50, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4527: train loss 3.22794:  29%|██▉       | 4527/15515 [04:55<11:55, 15.36it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4531: train loss 3.19879:  29%|██▉       | 4531/15515 [04:55<12:23, 14.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4533: train loss 3.17074:  29%|██▉       | 4533/15515 [04:55<13:26, 13.61it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4536: train loss 3.16883:  29%|██▉       | 4537/15515 [04:56<13:11, 13.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4540: train loss 3.21947:  29%|██▉       | 4541/15515 [04:56<12:48, 14.27it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4542: train loss 3.23120:  29%|██▉       | 4543/15515 [04:56<12:44, 14.35it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4545: train loss 3.22581:  29%|██▉       | 4545/15515 [04:56<13:30, 13.54it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4548: train loss 3.19077:  29%|██▉       | 4549/15515 [04:56<14:07, 12.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4551: train loss 3.18567:  29%|██▉       | 4551/15515 [04:57<14:25, 12.67it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4555: train loss 3.22464:  29%|██▉       | 4555/15515 [04:57<13:44, 13.30it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4558: train loss 3.20913:  29%|██▉       | 4559/15515 [04:57<12:59, 14.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4561: train loss 3.19082:  29%|██▉       | 4561/15515 [04:57<12:54, 14.15it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 4564: train loss 3.22187:  29%|██▉       | 4565/15515 [04:58<12:21, 14.78it/s]


32459 128
32459 128
32459 128


epoch 0 iter 4568: train loss 3.20536:  29%|██▉       | 4569/15515 [04:58<12:14, 14.90it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 4571: train loss 3.20183:  29%|██▉       | 4571/15515 [04:58<12:15, 14.87it/s]

 128
32459 128
32459 128


epoch 0 iter 4573: train loss 3.21301:  29%|██▉       | 4573/15515 [04:58<12:58, 14.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4577: train loss 3.18614:  30%|██▉       | 4577/15515 [04:58<13:08, 13.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4579: train loss 3.20894:  30%|██▉       | 4579/15515 [04:59<13:18, 13.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4582: train loss 3.19032:  30%|██▉       | 4583/15515 [04:59<15:58, 11.41it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4585: train loss 3.15954:  30%|██▉       | 4585/15515 [04:59<16:20, 11.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4589: train loss 3.23484:  30%|██▉       | 4589/15515 [05:00<14:34, 12.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4593: train loss 3.17078:  30%|██▉       | 4593/15515 [05:00<13:08, 13.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4597: train loss 3.21882:  30%|██▉       | 4597/15515 [05:00<12:31, 14.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4601: train loss 3.18567:  30%|██▉       | 4601/15515 [05:00<12:08, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4605: train loss 3.14600:  30%|██▉       | 4605/15515 [05:01<11:54, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4609: train loss 3.30072:  30%|██▉       | 4609/15515 [05:01<11:45, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4613: train loss 3.12124:  30%|██▉       | 4613/15515 [05:01<11:42, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4617: train loss 3.27505:  30%|██▉       | 4617/15515 [05:01<11:55, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4621: train loss 3.18827:  30%|██▉       | 4621/15515 [05:02<11:47, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4625: train loss 3.28343:  30%|██▉       | 4625/15515 [05:02<11:45, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4629: train loss 3.20970:  30%|██▉       | 4629/15515 [05:02<11:42, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4633: train loss 3.29112:  30%|██▉       | 4633/15515 [05:02<11:47, 15.39it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4635: train loss 3.24323:  30%|██▉       | 4635/15515 [05:03<12:40, 14.31it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4639: train loss 3.17456:  30%|██▉       | 4639/15515 [05:03<13:23, 13.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4642: train loss 3.22402:  30%|██▉       | 4643/15515 [05:03<13:23, 13.52it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4645: train loss 3.19156:  30%|██▉       | 4645/15515 [05:03<14:31, 12.48it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4649: train loss 3.21832:  30%|██▉       | 4649/15515 [05:04<13:21, 13.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4653: train loss 3.19438:  30%|██▉       | 4653/15515 [05:04<12:28, 14.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4657: train loss 3.20681:  30%|███       | 4657/15515 [05:04<11:59, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4661: train loss 3.22659:  30%|███       | 4661/15515 [05:04<11:56, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4665: train loss 3.20747:  30%|███       | 4665/15515 [05:05<11:57, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4669: train loss 3.21992:  30%|███       | 4669/15515 [05:05<11:49, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4673: train loss 3.23634:  30%|███       | 4673/15515 [05:05<11:48, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4677: train loss 3.24046:  30%|███       | 4677/15515 [05:05<11:47, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4681: train loss 3.22853:  30%|███       | 4681/15515 [05:06<11:41, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4685: train loss 3.20270:  30%|███       | 4685/15515 [05:06<11:43, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4689: train loss 3.22265:  30%|███       | 4689/15515 [05:06<11:38, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4693: train loss 3.20695:  30%|███       | 4693/15515 [05:06<11:39, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4697: train loss 3.28771:  30%|███       | 4697/15515 [05:07<11:38, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4701: train loss 3.23683:  30%|███       | 4701/15515 [05:07<11:39, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4705: train loss 3.18479:  30%|███       | 4705/15515 [05:07<11:47, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4709: train loss 3.22004:  30%|███       | 4709/15515 [05:07<11:40, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4713: train loss 3.17537:  30%|███       | 4713/15515 [05:08<11:47, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4717: train loss 3.26131:  30%|███       | 4717/15515 [05:08<12:01, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4721: train loss 3.16374:  30%|███       | 4721/15515 [05:08<11:47, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4725: train loss 3.16178:  30%|███       | 4725/15515 [05:09<11:40, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4729: train loss 3.26721:  30%|███       | 4729/15515 [05:09<11:42, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4733: train loss 3.20883:  31%|███       | 4733/15515 [05:09<11:37, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4737: train loss 3.16562:  31%|███       | 4737/15515 [05:09<11:35, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4741: train loss 3.24552:  31%|███       | 4741/15515 [05:10<11:45, 15.28it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4744: train loss 3.26976:  31%|███       | 4745/15515 [05:10<12:01, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4747: train loss 3.21426:  31%|███       | 4747/15515 [05:10<12:06, 14.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4751: train loss 3.13988:  31%|███       | 4751/15515 [05:10<11:49, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4755: train loss 3.22511:  31%|███       | 4755/15515 [05:11<11:44, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4759: train loss 3.19178:  31%|███       | 4759/15515 [05:11<11:47, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4763: train loss 3.24310:  31%|███       | 4763/15515 [05:11<11:42, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4767: train loss 3.28068:  31%|███       | 4767/15515 [05:11<11:43, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4771: train loss 3.25325:  31%|███       | 4771/15515 [05:12<11:46, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4775: train loss 3.27284:  31%|███       | 4775/15515 [05:12<11:40, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4779: train loss 3.18130:  31%|███       | 4779/15515 [05:12<11:30, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4783: train loss 3.15854:  31%|███       | 4783/15515 [05:12<11:46, 15.19it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4786: train loss 3.22827:  31%|███       | 4787/15515 [05:13<11:52, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4789: train loss 3.24273:  31%|███       | 4789/15515 [05:13<11:55, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4793: train loss 3.18795:  31%|███       | 4793/15515 [05:13<12:07, 14.75it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 4796: train loss 3.27382:  31%|███       | 4797/15515 [05:13<12:02, 14.83it/s]

128
32459 128
32459 128


epoch 0 iter 4799: train loss 3.12656:  31%|███       | 4799/15515 [05:13<11:56, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4803: train loss 3.15113:  31%|███       | 4803/15515 [05:14<11:47, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4807: train loss 3.14989:  31%|███       | 4807/15515 [05:14<11:39, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4811: train loss 3.17615:  31%|███       | 4811/15515 [05:14<11:33, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4815: train loss 3.24402:  31%|███       | 4815/15515 [05:14<11:33, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4819: train loss 3.22261:  31%|███       | 4819/15515 [05:15<11:32, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4823: train loss 3.10506:  31%|███       | 4823/15515 [05:15<11:30, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4827: train loss 3.23000:  31%|███       | 4827/15515 [05:15<11:37, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4831: train loss 3.12447:  31%|███       | 4831/15515 [05:15<11:35, 15.37it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 4834: train loss 3.22529:  31%|███       | 4835/15515 [05:16<11:40, 15.25it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 4838: train loss 3.16221:  31%|███       | 4839/15515 [05:16<11:33, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4842: train loss 3.18030:  31%|███       | 4843/15515 [05:16<11:34, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4846: train loss 3.16756:  31%|███       | 4847/15515 [05:16<11:27, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4850: train loss 3.15969:  31%|███▏      | 4851/15515 [05:17<11:26, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4854: train loss 3.23502:  31%|███▏      | 4855/15515 [05:17<11:27, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4858: train loss 3.09715:  31%|███▏      | 4859/15515 [05:17<11:30, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4862: train loss 3.22093:  31%|███▏      | 4863/15515 [05:17<11:27, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4866: train loss 3.21831:  31%|███▏      | 4867/15515 [05:18<11:30, 15.41it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 4869: train loss 3.20234:  31%|███▏      | 4869/15515 [05:18<11:36, 15.28it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 4873: train loss 3.13118:  31%|███▏      | 4873/15515 [05:18<11:33, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4877: train loss 3.26557:  31%|███▏      | 4877/15515 [05:18<11:31, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4881: train loss 3.19729:  31%|███▏      | 4881/15515 [05:19<11:28, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4885: train loss 3.16651:  31%|███▏      | 4885/15515 [05:19<11:28, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4889: train loss 3.12075:  32%|███▏      | 4889/15515 [05:19<11:29, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4893: train loss 3.20136:  32%|███▏      | 4893/15515 [05:20<11:37, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4897: train loss 3.14739:  32%|███▏      | 4897/15515 [05:20<11:30, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4900: train loss 3.26571:  32%|███▏      | 4901/15515 [05:20<11:34, 15.29it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4904: train loss 3.21054:  32%|███▏      | 4905/15515 [05:20<11:26, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4908: train loss 3.23332:  32%|███▏      | 4909/15515 [05:20<11:32, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4912: train loss 3.18015:  32%|███▏      | 4913/15515 [05:21<11:36, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4916: train loss 3.23173:  32%|███▏      | 4917/15515 [05:21<11:29, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4920: train loss 3.27515:  32%|███▏      | 4921/15515 [05:21<11:26, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4924: train loss 3.25416:  32%|███▏      | 4925/15515 [05:22<11:24, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4928: train loss 3.25562:  32%|███▏      | 4929/15515 [05:22<11:24, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4932: train loss 3.15717:  32%|███▏      | 4933/15515 [05:22<11:27, 15.40it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4936: train loss 3.17912:  32%|███▏      | 4937/15515 [05:22<11:30, 15.32it/s]

32459 128
32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4940: train loss 3.17485:  32%|███▏      | 4941/15515 [05:23<11:35, 15.21it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4943: train loss 3.16733:  32%|███▏      | 4943/15515 [05:23<11:34, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4947: train loss 3.14346:  32%|███▏      | 4947/15515 [05:23<11:27, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4951: train loss 3.19530:  32%|███▏      | 4951/15515 [05:23<11:30, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4955: train loss 3.16206:  32%|███▏      | 4955/15515 [05:24<11:37, 15.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4958: train loss 3.17562:  32%|███▏      | 4959/15515 [05:24<11:37, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4962: train loss 3.22702:  32%|███▏      | 4961/15515 [05:24<11:42, 15.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4965: train loss 3.27000:  32%|███▏      | 4965/15515 [05:24<11:54, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4969: train loss 3.17549:  32%|███▏      | 4969/15515 [05:24<11:52, 14.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4973: train loss 3.15550:  32%|███▏      | 4973/15515 [05:25<11:46, 14.92it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4977: train loss 3.14932:  32%|███▏      | 4977/15515 [05:25<11:44, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4980: train loss 3.13721:  32%|███▏      | 4981/15515 [05:25<11:39, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4984: train loss 3.18027:  32%|███▏      | 4985/15515 [05:25<11:34, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4988: train loss 3.12334:  32%|███▏      | 4989/15515 [05:26<11:41, 15.01it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4991: train loss 3.16116:  32%|███▏      | 4991/15515 [05:26<11:50, 14.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 4995: train loss 3.12625:  32%|███▏      | 4995/15515 [05:26<11:54, 14.73it/s]

32459 128
32459 128
32459 128


epoch 0 iter 4998: train loss 3.15086:  32%|███▏      | 4999/15515 [05:26<11:49, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5001: train loss 3.12511:  32%|███▏      | 5001/15515 [05:27<11:50, 14.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5005: train loss 3.22470:  32%|███▏      | 5005/15515 [05:27<11:31, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5009: train loss 3.20445:  32%|███▏      | 5009/15515 [05:27<11:38, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5013: train loss 3.22278:  32%|███▏      | 5013/15515 [05:27<11:40, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5017: train loss 3.20512:  32%|███▏      | 5017/15515 [05:28<11:44, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5021: train loss 3.18222:  32%|███▏      | 5021/15515 [05:28<11:31, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5025: train loss 3.17383:  32%|███▏      | 5025/15515 [05:28<11:23, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5029: train loss 3.15151:  32%|███▏      | 5029/15515 [05:28<11:21, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5033: train loss 3.23219:  32%|███▏      | 5033/15515 [05:29<11:18, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5037: train loss 3.22739:  32%|███▏      | 5037/15515 [05:29<11:14, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5041: train loss 3.24367:  32%|███▏      | 5041/15515 [05:29<11:13, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5045: train loss 3.12535:  33%|███▎      | 5045/15515 [05:30<11:18, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5049: train loss 3.16152:  33%|███▎      | 5049/15515 [05:30<11:15, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5053: train loss 3.08783:  33%|███▎      | 5053/15515 [05:30<11:18, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5057: train loss 3.17388:  33%|███▎      | 5057/15515 [05:30<11:16, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5061: train loss 3.27341:  33%|███▎      | 5061/15515 [05:31<11:16, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5065: train loss 3.19844:  33%|███▎      | 5065/15515 [05:31<11:15, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5069: train loss 3.18852:  33%|███▎      | 5069/15515 [05:31<11:13, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5073: train loss 3.15297:  33%|███▎      | 5073/15515 [05:31<11:17, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5077: train loss 3.20950:  33%|███▎      | 5077/15515 [05:32<11:17, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5081: train loss 3.21096:  33%|███▎      | 5081/15515 [05:32<11:15, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5085: train loss 3.23508:  33%|███▎      | 5085/15515 [05:32<11:14, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5089: train loss 3.18743:  33%|███▎      | 5089/15515 [05:32<11:11, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5093: train loss 3.15843:  33%|███▎      | 5093/15515 [05:33<11:11, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5097: train loss 3.15777:  33%|███▎      | 5097/15515 [05:33<11:10, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5101: train loss 3.18323:  33%|███▎      | 5101/15515 [05:33<11:06, 15.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5105: train loss 3.17375:  33%|███▎      | 5105/15515 [05:33<11:13, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5109: train loss 3.18319:  33%|███▎      | 5109/15515 [05:34<11:12, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5113: train loss 3.12263:  33%|███▎      | 5113/15515 [05:34<11:09, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5117: train loss 3.24445:  33%|███▎      | 5117/15515 [05:34<11:06, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5121: train loss 3.20011:  33%|███▎      | 5121/15515 [05:34<11:15, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5125: train loss 3.12935:  33%|███▎      | 5125/15515 [05:35<11:15, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5129: train loss 3.16282:  33%|███▎      | 5129/15515 [05:35<11:18, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5133: train loss 3.18320:  33%|███▎      | 5133/15515 [05:35<11:10, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5137: train loss 3.17444:  33%|███▎      | 5137/15515 [05:35<11:12, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5141: train loss 3.19009:  33%|███▎      | 5141/15515 [05:36<11:15, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5145: train loss 3.10908:  33%|███▎      | 5145/15515 [05:36<11:16, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5149: train loss 3.19837:  33%|███▎      | 5149/15515 [05:36<11:17, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5153: train loss 3.18481:  33%|███▎      | 5153/15515 [05:37<11:21, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5157: train loss 3.18142:  33%|███▎      | 5157/15515 [05:37<11:24, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5161: train loss 3.19127:  33%|███▎      | 5161/15515 [05:37<11:20, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5165: train loss 3.17256:  33%|███▎      | 5165/15515 [05:37<11:14, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5169: train loss 3.22729:  33%|███▎      | 5169/15515 [05:38<11:11, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5173: train loss 3.12892:  33%|███▎      | 5173/15515 [05:38<11:36, 14.85it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5176: train loss 3.12780:  33%|███▎      | 5177/15515 [05:38<11:28, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5180: train loss 3.14006:  33%|███▎      | 5181/15515 [05:38<11:23, 15.11it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 5183: train loss 3.23424:  33%|███▎      | 5183/15515 [05:38<11:29, 14.99it/s]

 128
32459 128
32459 128


epoch 0 iter 5186: train loss 3.09677:  33%|███▎      | 5187/15515 [05:39<11:21, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5190: train loss 3.23284:  33%|███▎      | 5191/15515 [05:39<11:17, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5194: train loss 3.24440:  33%|███▎      | 5195/15515 [05:39<11:06, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5198: train loss 3.13427:  34%|███▎      | 5199/15515 [05:39<11:08, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5202: train loss 3.21360:  34%|███▎      | 5203/15515 [05:40<11:18, 15.19it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5204: train loss 3.20135:  34%|███▎      | 5205/15515 [05:40<11:17, 15.23it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5208: train loss 3.22730:  34%|███▎      | 5209/15515 [05:40<11:52, 14.46it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5211: train loss 3.19356:  34%|███▎      | 5211/15515 [05:40<11:41, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5214: train loss 3.21251:  34%|███▎      | 5215/15515 [05:41<11:41, 14.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5217: train loss 3.14526:  34%|███▎      | 5217/15515 [05:41<11:43, 14.64it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5220: train loss 3.18529:  34%|███▎      | 5221/15515 [05:41<11:36, 14.79it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5224: train loss 3.15739:  34%|███▎      | 5225/15515 [05:41<11:31, 14.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5227: train loss 3.12926:  34%|███▎      | 5227/15515 [05:41<11:30, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5231: train loss 3.13139:  34%|███▎      | 5231/15515 [05:42<11:23, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5235: train loss 3.19424:  34%|███▎      | 5235/15515 [05:42<11:32, 14.85it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 5238: train loss 3.10265:  34%|███▍      | 5239/15515 [05:42<11:30, 14.88it/s]


32459 128
32459 128


epoch 0 iter 5240: train loss 3.16831:  34%|███▍      | 5241/15515 [05:42<12:14, 13.99it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5243: train loss 3.11247:  34%|███▍      | 5243/15515 [05:43<12:24, 13.80it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5247: train loss 3.14022:  34%|███▍      | 5247/15515 [05:43<12:20, 13.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5251: train loss 3.16588:  34%|███▍      | 5251/15515 [05:43<11:40, 14.64it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5255: train loss 3.18994:  34%|███▍      | 5255/15515 [05:43<11:20, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5259: train loss 3.18061:  34%|███▍      | 5259/15515 [05:44<11:12, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5263: train loss 3.14374:  34%|███▍      | 5263/15515 [05:44<11:09, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5267: train loss 3.19695:  34%|███▍      | 5267/15515 [05:44<11:07, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5271: train loss 3.21654:  34%|███▍      | 5271/15515 [05:44<11:09, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5275: train loss 3.12819:  34%|███▍      | 5275/15515 [05:45<11:08, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5279: train loss 3.19748:  34%|███▍      | 5279/15515 [05:45<10:59, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5283: train loss 3.15341:  34%|███▍      | 5283/15515 [05:45<11:01, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5287: train loss 3.14220:  34%|███▍      | 5287/15515 [05:45<11:03, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5291: train loss 3.15315:  34%|███▍      | 5291/15515 [05:46<11:03, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5295: train loss 3.13154:  34%|███▍      | 5295/15515 [05:46<11:03, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5299: train loss 3.11496:  34%|███▍      | 5299/15515 [05:46<11:02, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5303: train loss 3.15340:  34%|███▍      | 5303/15515 [05:46<11:01, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5307: train loss 3.19435:  34%|███▍      | 5307/15515 [05:47<11:02, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5311: train loss 3.16689:  34%|███▍      | 5311/15515 [05:47<10:59, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5315: train loss 3.18107:  34%|███▍      | 5315/15515 [05:47<11:02, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5319: train loss 3.14895:  34%|███▍      | 5319/15515 [05:48<11:04, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5323: train loss 3.15876:  34%|███▍      | 5323/15515 [05:48<11:03, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5327: train loss 3.14126:  34%|███▍      | 5327/15515 [05:48<11:05, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5331: train loss 3.13823:  34%|███▍      | 5331/15515 [05:48<11:09, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5335: train loss 3.13197:  34%|███▍      | 5335/15515 [05:49<11:05, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5339: train loss 3.08348:  34%|███▍      | 5339/15515 [05:49<11:00, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5343: train loss 3.21309:  34%|███▍      | 5343/15515 [05:49<10:58, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5347: train loss 3.19463:  34%|███▍      | 5347/15515 [05:49<10:56, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5351: train loss 3.18486:  34%|███▍      | 5351/15515 [05:50<10:57, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5355: train loss 3.20204:  35%|███▍      | 5355/15515 [05:50<10:57, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5359: train loss 3.21862:  35%|███▍      | 5359/15515 [05:50<10:55, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5363: train loss 3.23139:  35%|███▍      | 5363/15515 [05:50<10:55, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5367: train loss 3.19233:  35%|███▍      | 5367/15515 [05:51<10:58, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5371: train loss 3.15538:  35%|███▍      | 5371/15515 [05:51<11:01, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5375: train loss 3.18516:  35%|███▍      | 5375/15515 [05:51<11:00, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5379: train loss 3.24752:  35%|███▍      | 5379/15515 [05:51<10:52, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5383: train loss 3.16477:  35%|███▍      | 5383/15515 [05:52<10:53, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5387: train loss 3.13695:  35%|███▍      | 5387/15515 [05:52<10:57, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5391: train loss 3.15764:  35%|███▍      | 5391/15515 [05:52<10:57, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5395: train loss 3.16675:  35%|███▍      | 5395/15515 [05:52<11:01, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5399: train loss 3.13535:  35%|███▍      | 5399/15515 [05:53<11:11, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5403: train loss 3.16744:  35%|███▍      | 5403/15515 [05:53<11:06, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5407: train loss 3.18849:  35%|███▍      | 5407/15515 [05:53<11:06, 15.17it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 5410: train loss 3.22494:  35%|███▍      | 5411/15515 [05:53<11:13, 15.00it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 5414: train loss 3.10857:  35%|███▍      | 5415/15515 [05:54<10:58, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5418: train loss 3.22328:  35%|███▍      | 5419/15515 [05:54<10:55, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5422: train loss 3.24574:  35%|███▍      | 5423/15515 [05:54<10:59, 15.30it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5425: train loss 3.17905:  35%|███▍      | 5425/15515 [05:54<11:03, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5429: train loss 3.15140:  35%|███▍      | 5429/15515 [05:55<11:17, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5433: train loss 3.15556:  35%|███▌      | 5433/15515 [05:55<11:12, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5437: train loss 3.13008:  35%|███▌      | 5437/15515 [05:55<11:15, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5440: train loss 3.21731:  35%|███▌      | 5441/15515 [05:55<11:16, 14.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5443: train loss 3.23288:  35%|███▌      | 5443/15515 [05:56<11:12, 14.99it/s]

32459 128
32459 128


epoch 0 iter 5446: train loss 3.14849:  35%|███▌      | 5447/15515 [05:56<11:16, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5450: train loss 3.11864:  35%|███▌      | 5451/15515 [05:56<11:06, 15.09it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 5453: train loss 3.07988:  35%|███▌      | 5453/15515 [05:56<11:11, 14.98it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 5457: train loss 3.11901:  35%|███▌      | 5457/15515 [05:57<11:11, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5461: train loss 3.23702:  35%|███▌      | 5461/15515 [05:57<10:59, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5465: train loss 3.08617:  35%|███▌      | 5465/15515 [05:57<10:56, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5469: train loss 3.11962:  35%|███▌      | 5469/15515 [05:57<10:54, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5473: train loss 3.12374:  35%|███▌      | 5473/15515 [05:58<10:47, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5477: train loss 3.16525:  35%|███▌      | 5477/15515 [05:58<10:51, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5481: train loss 3.20147:  35%|███▌      | 5481/15515 [05:58<10:57, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5485: train loss 3.10281:  35%|███▌      | 5485/15515 [05:58<10:54, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5489: train loss 3.24811:  35%|███▌      | 5489/15515 [05:59<10:54, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5493: train loss 3.16511:  35%|███▌      | 5493/15515 [05:59<10:52, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5497: train loss 3.15640:  35%|███▌      | 5497/15515 [05:59<10:50, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5501: train loss 3.15366:  35%|███▌      | 5501/15515 [05:59<10:49, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5505: train loss 3.26238:  35%|███▌      | 5505/15515 [06:00<10:49, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5509: train loss 3.15847:  36%|███▌      | 5509/15515 [06:00<10:49, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5513: train loss 3.08841:  36%|███▌      | 5513/15515 [06:00<10:47, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5517: train loss 3.10322:  36%|███▌      | 5517/15515 [06:00<10:42, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5521: train loss 3.20866:  36%|███▌      | 5521/15515 [06:01<10:43, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5525: train loss 3.20693:  36%|███▌      | 5525/15515 [06:01<10:48, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5529: train loss 3.14005:  36%|███▌      | 5529/15515 [06:01<10:42, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5533: train loss 3.22295:  36%|███▌      | 5533/15515 [06:01<10:39, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5537: train loss 3.09750:  36%|███▌      | 5537/15515 [06:02<10:42, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5541: train loss 3.19849:  36%|███▌      | 5541/15515 [06:02<10:42, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5545: train loss 3.14288:  36%|███▌      | 5545/15515 [06:02<10:40, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5549: train loss 3.15206:  36%|███▌      | 5549/15515 [06:03<10:42, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5553: train loss 3.13177:  36%|███▌      | 5553/15515 [06:03<10:42, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5557: train loss 3.13239:  36%|███▌      | 5557/15515 [06:03<10:42, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5561: train loss 3.07503:  36%|███▌      | 5561/15515 [06:03<10:40, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5565: train loss 3.16778:  36%|███▌      | 5565/15515 [06:04<10:43, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5569: train loss 3.14406:  36%|███▌      | 5569/15515 [06:04<10:41, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5573: train loss 3.16863:  36%|███▌      | 5573/15515 [06:04<10:44, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5577: train loss 3.13162:  36%|███▌      | 5577/15515 [06:04<10:41, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5581: train loss 3.15236:  36%|███▌      | 5581/15515 [06:05<10:42, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5585: train loss 3.18042:  36%|███▌      | 5585/15515 [06:05<10:40, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5589: train loss 3.17913:  36%|███▌      | 5589/15515 [06:05<10:40, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5593: train loss 3.10502:  36%|███▌      | 5593/15515 [06:05<10:41, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5597: train loss 3.12583:  36%|███▌      | 5597/15515 [06:06<10:40, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5601: train loss 3.11721:  36%|███▌      | 5601/15515 [06:06<10:42, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5605: train loss 3.19176:  36%|███▌      | 5605/15515 [06:06<10:39, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5609: train loss 3.17218:  36%|███▌      | 5609/15515 [06:06<10:39, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5613: train loss 3.13057:  36%|███▌      | 5613/15515 [06:07<10:48, 15.26it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5616: train loss 3.22123:  36%|███▌      | 5617/15515 [06:07<10:53, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5620: train loss 3.14974:  36%|███▌      | 5621/15515 [06:07<10:56, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5623: train loss 3.17602:  36%|███▌      | 5623/15515 [06:07<11:01, 14.95it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 5626: train loss 3.13281:  36%|███▋      | 5627/15515 [06:08<11:02, 14.93it/s]

 128
32459 128
32459 128


epoch 0 iter 5629: train loss 3.15606:  36%|███▋      | 5629/15515 [06:08<11:07, 14.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5633: train loss 3.10878:  36%|███▋      | 5633/15515 [06:08<10:58, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5637: train loss 3.10840:  36%|███▋      | 5637/15515 [06:08<10:56, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5641: train loss 3.24728:  36%|███▋      | 5641/15515 [06:09<10:42, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5645: train loss 3.19004:  36%|███▋      | 5645/15515 [06:09<10:45, 15.29it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5648: train loss 3.10795:  36%|███▋      | 5649/15515 [06:09<10:50, 15.18it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5651: train loss 3.20967:  36%|███▋      | 5651/15515 [06:09<10:56, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5655: train loss 3.15676:  36%|███▋      | 5655/15515 [06:09<10:47, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5659: train loss 3.12286:  36%|███▋      | 5659/15515 [06:10<10:55, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5662: train loss 3.15419:  37%|███▋      | 5663/15515 [06:10<10:57, 14.97it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5666: train loss 3.17497:  37%|███▋      | 5667/15515 [06:10<11:18, 14.51it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5669: train loss 3.15072:  37%|███▋      | 5669/15515 [06:10<11:16, 14.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5673: train loss 3.18809:  37%|███▋      | 5673/15515 [06:11<11:02, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5677: train loss 3.15817:  37%|███▋      | 5677/15515 [06:11<11:03, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5680: train loss 3.17062:  37%|███▋      | 5681/15515 [06:11<11:03, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5683: train loss 3.10854:  37%|███▋      | 5683/15515 [06:11<11:05, 14.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5686: train loss 3.10615:  37%|███▋      | 5687/15515 [06:12<11:00, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5690: train loss 3.14101:  37%|███▋      | 5691/15515 [06:12<10:44, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5694: train loss 3.14952:  37%|███▋      | 5695/15515 [06:12<10:48, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5698: train loss 3.18876:  37%|███▋      | 5699/15515 [06:12<10:42, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5702: train loss 3.13268:  37%|███▋      | 5703/15515 [06:13<10:35, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5706: train loss 3.15192:  37%|███▋      | 5707/15515 [06:13<10:36, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5710: train loss 3.16490:  37%|███▋      | 5711/15515 [06:13<10:39, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5714: train loss 3.16778:  37%|███▋      | 5715/15515 [06:13<10:40, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5718: train loss 3.13773:  37%|███▋      | 5719/15515 [06:14<10:36, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5722: train loss 3.18773:  37%|███▋      | 5723/15515 [06:14<10:37, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5726: train loss 3.18789:  37%|███▋      | 5727/15515 [06:14<10:33, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5730: train loss 3.14502:  37%|███▋      | 5731/15515 [06:14<10:32, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5734: train loss 3.19012:  37%|███▋      | 5735/15515 [06:15<10:34, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5738: train loss 3.13662:  37%|███▋      | 5739/15515 [06:15<10:34, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5742: train loss 3.16587:  37%|███▋      | 5743/15515 [06:15<10:35, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5746: train loss 3.10673:  37%|███▋      | 5747/15515 [06:15<10:36, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5750: train loss 3.09656:  37%|███▋      | 5751/15515 [06:16<10:34, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5754: train loss 3.18987:  37%|███▋      | 5755/15515 [06:16<10:33, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5758: train loss 3.10003:  37%|███▋      | 5759/15515 [06:16<10:32, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5762: train loss 3.08158:  37%|███▋      | 5763/15515 [06:16<10:38, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5766: train loss 3.17915:  37%|███▋      | 5767/15515 [06:17<10:38, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5770: train loss 3.15687:  37%|███▋      | 5771/15515 [06:17<10:32, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5774: train loss 3.11343:  37%|███▋      | 5775/15515 [06:17<10:38, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5778: train loss 3.17366:  37%|███▋      | 5779/15515 [06:18<10:34, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5782: train loss 3.14518:  37%|███▋      | 5783/15515 [06:18<10:30, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5786: train loss 3.16013:  37%|███▋      | 5787/15515 [06:18<10:32, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5790: train loss 3.12391:  37%|███▋      | 5791/15515 [06:18<10:34, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5794: train loss 3.11310:  37%|███▋      | 5795/15515 [06:19<10:33, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5798: train loss 3.19008:  37%|███▋      | 5799/15515 [06:19<10:36, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5802: train loss 3.13534:  37%|███▋      | 5803/15515 [06:19<10:31, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5806: train loss 3.06152:  37%|███▋      | 5807/15515 [06:19<10:35, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5810: train loss 3.10047:  37%|███▋      | 5811/15515 [06:20<10:26, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5814: train loss 3.20495:  37%|███▋      | 5815/15515 [06:20<10:30, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5818: train loss 3.22642:  38%|███▊      | 5819/15515 [06:20<10:30, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5822: train loss 3.05025:  38%|███▊      | 5823/15515 [06:20<10:33, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5826: train loss 3.19485:  38%|███▊      | 5827/15515 [06:21<10:30, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5830: train loss 3.12717:  38%|███▊      | 5831/15515 [06:21<10:30, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5834: train loss 3.14121:  38%|███▊      | 5835/15515 [06:21<10:24, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5838: train loss 3.18533:  38%|███▊      | 5839/15515 [06:21<10:28, 15.38it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 5841: train loss 3.15539:  38%|███▊      | 5841/15515 [06:22<10:39, 15.14it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 5845: train loss 3.16564:  38%|███▊      | 5845/15515 [06:22<10:43, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5849: train loss 3.15226:  38%|███▊      | 5849/15515 [06:22<10:44, 14.99it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 5852: train loss 3.20869:  38%|███▊      | 5853/15515 [06:22<10:46, 14.95it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 5856: train loss 3.14158:  38%|███▊      | 5857/15515 [06:23<10:46, 14.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5859: train loss 3.17854:  38%|███▊      | 5859/15515 [06:23<10:41, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5863: train loss 3.15994:  38%|███▊      | 5863/15515 [06:23<10:48, 14.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5866: train loss 3.12573:  38%|███▊      | 5867/15515 [06:23<10:48, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5870: train loss 3.21054:  38%|███▊      | 5871/15515 [06:24<10:44, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5874: train loss 3.10013:  38%|███▊      | 5875/15515 [06:24<10:40, 15.04it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5877: train loss 3.13459:  38%|███▊      | 5877/15515 [06:24<10:45, 14.93it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5881: train loss 3.15026:  38%|███▊      | 5881/15515 [06:24<10:36, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5885: train loss 3.11202:  38%|███▊      | 5885/15515 [06:25<10:47, 14.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5888: train loss 3.13843:  38%|███▊      | 5889/15515 [06:25<10:49, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5891: train loss 3.11834:  38%|███▊      | 5891/15515 [06:25<10:48, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5895: train loss 3.08823:  38%|███▊      | 5895/15515 [06:25<10:32, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5899: train loss 3.19235:  38%|███▊      | 5899/15515 [06:26<10:32, 15.21it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5902: train loss 3.17225:  38%|███▊      | 5903/15515 [06:26<10:43, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 5905: train loss 3.15726:  38%|███▊      | 5905/15515 [06:26<10:37, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5909: train loss 3.14817:  38%|███▊      | 5909/15515 [06:26<10:37, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5913: train loss 3.24303:  38%|███▊      | 5913/15515 [06:26<10:33, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5917: train loss 3.13934:  38%|███▊      | 5917/15515 [06:27<10:26, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5921: train loss 3.05941:  38%|███▊      | 5921/15515 [06:27<10:21, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5925: train loss 3.11304:  38%|███▊      | 5925/15515 [06:27<10:22, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5929: train loss 3.15847:  38%|███▊      | 5929/15515 [06:27<10:22, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5933: train loss 3.10832:  38%|███▊      | 5933/15515 [06:28<10:22, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5937: train loss 3.12161:  38%|███▊      | 5937/15515 [06:28<10:15, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5941: train loss 3.16519:  38%|███▊      | 5941/15515 [06:28<10:21, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5945: train loss 3.11665:  38%|███▊      | 5945/15515 [06:29<10:22, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5949: train loss 3.13985:  38%|███▊      | 5949/15515 [06:29<10:23, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5953: train loss 3.16519:  38%|███▊      | 5953/15515 [06:29<10:22, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5957: train loss 3.05274:  38%|███▊      | 5957/15515 [06:29<10:25, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5961: train loss 3.15653:  38%|███▊      | 5961/15515 [06:30<10:22, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5965: train loss 3.16819:  38%|███▊      | 5965/15515 [06:30<10:14, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5969: train loss 3.15409:  38%|███▊      | 5969/15515 [06:30<10:23, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5973: train loss 3.11933:  38%|███▊      | 5973/15515 [06:30<10:17, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5977: train loss 3.11885:  39%|███▊      | 5977/15515 [06:31<10:19, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5981: train loss 3.14751:  39%|███▊      | 5981/15515 [06:31<10:17, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5985: train loss 3.14242:  39%|███▊      | 5985/15515 [06:31<10:23, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5989: train loss 3.06383:  39%|███▊      | 5989/15515 [06:31<10:19, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5993: train loss 3.20901:  39%|███▊      | 5993/15515 [06:32<10:22, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 5997: train loss 3.17129:  39%|███▊      | 5997/15515 [06:32<10:18, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6001: train loss 3.17079:  39%|███▊      | 6001/15515 [06:32<10:19, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6005: train loss 3.18066:  39%|███▊      | 6005/15515 [06:32<10:23, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6009: train loss 3.17650:  39%|███▊      | 6009/15515 [06:33<10:23, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6013: train loss 3.23278:  39%|███▉      | 6013/15515 [06:33<10:22, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6017: train loss 3.06208:  39%|███▉      | 6017/15515 [06:33<10:23, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6021: train loss 3.11936:  39%|███▉      | 6021/15515 [06:33<10:26, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6025: train loss 3.13696:  39%|███▉      | 6025/15515 [06:34<10:21, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6029: train loss 3.13592:  39%|███▉      | 6029/15515 [06:34<10:25, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6033: train loss 3.08006:  39%|███▉      | 6033/15515 [06:34<10:17, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6037: train loss 3.11876:  39%|███▉      | 6037/15515 [06:35<10:18, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6041: train loss 3.16643:  39%|███▉      | 6041/15515 [06:35<10:15, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6045: train loss 3.14425:  39%|███▉      | 6045/15515 [06:35<10:12, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6049: train loss 3.10377:  39%|███▉      | 6049/15515 [06:35<10:12, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6053: train loss 3.14004:  39%|███▉      | 6053/15515 [06:36<10:13, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6057: train loss 3.16481:  39%|███▉      | 6057/15515 [06:36<10:18, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6061: train loss 3.18987:  39%|███▉      | 6061/15515 [06:36<10:25, 15.11it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 6064: train loss 3.17376:  39%|███▉      | 6065/15515 [06:36<10:31, 14.97it/s]

 128
32459 128
32459 128


epoch 0 iter 6067: train loss 3.06052:  39%|███▉      | 6067/15515 [06:36<10:36, 14.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6071: train loss 3.15815:  39%|███▉      | 6071/15515 [06:37<10:35, 14.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6074: train loss 3.13551:  39%|███▉      | 6075/15515 [06:37<10:28, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6077: train loss 3.17657:  39%|███▉      | 6077/15515 [06:37<10:27, 15.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6081: train loss 3.08258:  39%|███▉      | 6081/15515 [06:37<10:23, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6085: train loss 3.13266:  39%|███▉      | 6085/15515 [06:38<10:17, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6089: train loss 3.11344:  39%|███▉      | 6089/15515 [06:38<10:19, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6093: train loss 3.13121:  39%|███▉      | 6093/15515 [06:38<10:24, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6097: train loss 3.10113:  39%|███▉      | 6097/15515 [06:38<10:21, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6101: train loss 3.13388:  39%|███▉      | 6101/15515 [06:39<10:17, 15.25it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 6104: train loss 3.10264:  39%|███▉      | 6105/15515 [06:39<10:21, 15.13it/s]


32459 128
32459 128


epoch 0 iter 6107: train loss 3.12700:  39%|███▉      | 6107/15515 [06:39<10:22, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6111: train loss 3.11841:  39%|███▉      | 6111/15515 [06:39<10:24, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6115: train loss 3.13787:  39%|███▉      | 6115/15515 [06:40<10:32, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6119: train loss 3.13745:  39%|███▉      | 6119/15515 [06:40<10:33, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6122: train loss 3.10996:  39%|███▉      | 6123/15515 [06:40<10:30, 14.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6125: train loss 3.12964:  39%|███▉      | 6125/15515 [06:40<10:34, 14.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6129: train loss 3.14934:  40%|███▉      | 6129/15515 [06:41<10:27, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6133: train loss 3.07168:  40%|███▉      | 6133/15515 [06:41<10:15, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6137: train loss 3.19183:  40%|███▉      | 6137/15515 [06:41<10:07, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6141: train loss 3.14396:  40%|███▉      | 6141/15515 [06:41<10:11, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6145: train loss 3.15731:  40%|███▉      | 6145/15515 [06:42<10:06, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6148: train loss 3.10265:  40%|███▉      | 6149/15515 [06:42<10:25, 14.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6152: train loss 3.08215:  40%|███▉      | 6153/15515 [06:42<10:42, 14.56it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6154: train loss 3.18499:  40%|███▉      | 6155/15515 [06:42<10:48, 14.44it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6157: train loss 3.11482:  40%|███▉      | 6157/15515 [06:42<10:53, 14.31it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6161: train loss 3.10236:  40%|███▉      | 6161/15515 [06:43<11:03, 14.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6165: train loss 3.21197:  40%|███▉      | 6165/15515 [06:43<10:37, 14.66it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6168: train loss 3.10119:  40%|███▉      | 6169/15515 [06:43<11:22, 13.70it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6171: train loss 3.13612:  40%|███▉      | 6171/15515 [06:43<11:10, 13.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6174: train loss 3.09178:  40%|███▉      | 6175/15515 [06:44<12:45, 12.21it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6178: train loss 3.11536:  40%|███▉      | 6179/15515 [06:44<11:38, 13.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6182: train loss 3.13828:  40%|███▉      | 6183/15515 [06:44<10:53, 14.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6186: train loss 3.12218:  40%|███▉      | 6187/15515 [06:45<10:31, 14.77it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6190: train loss 3.12441:  40%|███▉      | 6191/15515 [06:45<10:16, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6194: train loss 3.12661:  40%|███▉      | 6195/15515 [06:45<10:11, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6198: train loss 3.13929:  40%|███▉      | 6199/15515 [06:45<10:07, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6202: train loss 3.15770:  40%|███▉      | 6203/15515 [06:46<10:08, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6206: train loss 3.21986:  40%|████      | 6207/15515 [06:46<10:10, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6210: train loss 3.11947:  40%|████      | 6211/15515 [06:46<10:07, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6214: train loss 3.13473:  40%|████      | 6215/15515 [06:46<10:04, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6218: train loss 3.08431:  40%|████      | 6219/15515 [06:47<10:01, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6222: train loss 3.12548:  40%|████      | 6223/15515 [06:47<10:01, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6226: train loss 3.13064:  40%|████      | 6227/15515 [06:47<10:00, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6230: train loss 3.12390:  40%|████      | 6231/15515 [06:47<09:58, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6234: train loss 3.18219:  40%|████      | 6235/15515 [06:48<10:01, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6238: train loss 3.15706:  40%|████      | 6239/15515 [06:48<10:12, 15.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6241: train loss 3.14238:  40%|████      | 6241/15515 [06:48<10:09, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6245: train loss 3.13448:  40%|████      | 6245/15515 [06:48<10:03, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6249: train loss 3.16140:  40%|████      | 6249/15515 [06:49<10:07, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6253: train loss 3.16732:  40%|████      | 6253/15515 [06:49<10:04, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6257: train loss 3.13028:  40%|████      | 6257/15515 [06:49<10:00, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6261: train loss 3.04113:  40%|████      | 6261/15515 [06:49<09:59, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6265: train loss 3.08072:  40%|████      | 6265/15515 [06:50<10:05, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6269: train loss 3.15374:  40%|████      | 6269/15515 [06:50<10:03, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6272: train loss 3.12315:  40%|████      | 6273/15515 [06:50<10:58, 14.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6275: train loss 3.15123:  40%|████      | 6275/15515 [06:50<11:11, 13.76it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6278: train loss 3.14752:  40%|████      | 6279/15515 [06:51<11:11, 13.74it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6282: train loss 3.13694:  40%|████      | 6283/15515 [06:51<10:43, 14.35it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6285: train loss 3.15902:  41%|████      | 6285/15515 [06:51<10:39, 14.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6289: train loss 3.09968:  41%|████      | 6289/15515 [06:51<10:22, 14.81it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 6292: train loss 3.07151:  41%|████      | 6293/15515 [06:52<10:22, 14.82it/s]

 128
32459 128
32459 128
32459

epoch 0 iter 6295: train loss 3.08199:  41%|████      | 6295/15515 [06:52<10:22, 14.81it/s]

 128
32459 128
32459 128


epoch 0 iter 6298: train loss 3.16404:  41%|████      | 6299/15515 [06:52<10:27, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6301: train loss 3.12491:  41%|████      | 6301/15515 [06:52<10:23, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6305: train loss 3.13259:  41%|████      | 6305/15515 [06:52<10:21, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6308: train loss 3.11869:  41%|████      | 6309/15515 [06:53<10:23, 14.76it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6311: train loss 3.07605:  41%|████      | 6311/15515 [06:53<10:29, 14.61it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6315: train loss 3.10552:  41%|████      | 6315/15515 [06:53<10:19, 14.85it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 6318: train loss 3.11073:  41%|████      | 6319/15515 [06:53<10:13, 14.98it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 6322: train loss 3.12229:  41%|████      | 6323/15515 [06:54<10:13, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6325: train loss 3.14577:  41%|████      | 6325/15515 [06:54<10:11, 15.02it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6329: train loss 3.11567:  41%|████      | 6329/15515 [06:54<10:16, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6333: train loss 3.09694:  41%|████      | 6333/15515 [06:54<10:08, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6337: train loss 3.13693:  41%|████      | 6337/15515 [06:55<10:05, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6341: train loss 3.14719:  41%|████      | 6341/15515 [06:55<10:11, 15.01it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6344: train loss 3.13100:  41%|████      | 6345/15515 [06:55<10:06, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6348: train loss 3.10231:  41%|████      | 6349/15515 [06:55<09:56, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6352: train loss 3.14066:  41%|████      | 6353/15515 [06:56<09:54, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6356: train loss 3.14358:  41%|████      | 6357/15515 [06:56<09:56, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6360: train loss 3.15250:  41%|████      | 6361/15515 [06:56<09:57, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6364: train loss 3.10793:  41%|████      | 6365/15515 [06:56<09:54, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6368: train loss 3.13199:  41%|████      | 6369/15515 [06:57<09:53, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6372: train loss 3.09312:  41%|████      | 6373/15515 [06:57<09:56, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6376: train loss 3.13500:  41%|████      | 6377/15515 [06:57<09:55, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6380: train loss 3.10329:  41%|████      | 6381/15515 [06:57<09:54, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6384: train loss 3.15104:  41%|████      | 6385/15515 [06:58<09:54, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6388: train loss 3.12664:  41%|████      | 6389/15515 [06:58<09:52, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6392: train loss 3.10578:  41%|████      | 6393/15515 [06:58<09:54, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6396: train loss 3.09075:  41%|████      | 6397/15515 [06:58<09:54, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6400: train loss 3.13805:  41%|████▏     | 6401/15515 [06:59<09:54, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6404: train loss 3.15372:  41%|████▏     | 6405/15515 [06:59<09:50, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6408: train loss 3.01681:  41%|████▏     | 6409/15515 [06:59<09:51, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6412: train loss 3.20849:  41%|████▏     | 6413/15515 [07:00<09:51, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6416: train loss 3.08940:  41%|████▏     | 6417/15515 [07:00<09:53, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6420: train loss 3.01391:  41%|████▏     | 6421/15515 [07:00<09:49, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6424: train loss 3.17195:  41%|████▏     | 6425/15515 [07:00<09:54, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6428: train loss 3.15626:  41%|████▏     | 6429/15515 [07:01<09:52, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6432: train loss 3.19202:  41%|████▏     | 6433/15515 [07:01<09:53, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6436: train loss 3.18263:  41%|████▏     | 6437/15515 [07:01<09:47, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6440: train loss 3.08751:  42%|████▏     | 6441/15515 [07:01<09:48, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6444: train loss 3.13742:  42%|████▏     | 6445/15515 [07:02<09:46, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6448: train loss 3.09530:  42%|████▏     | 6449/15515 [07:02<09:49, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6452: train loss 3.05775:  42%|████▏     | 6453/15515 [07:02<09:45, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6456: train loss 3.11533:  42%|████▏     | 6457/15515 [07:02<09:46, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6460: train loss 3.16191:  42%|████▏     | 6461/15515 [07:03<09:42, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6464: train loss 3.14537:  42%|████▏     | 6465/15515 [07:03<09:41, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6468: train loss 3.09272:  42%|████▏     | 6469/15515 [07:03<09:44, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6472: train loss 3.16527:  42%|████▏     | 6473/15515 [07:03<09:42, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6476: train loss 3.11453:  42%|████▏     | 6477/15515 [07:04<09:42, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6480: train loss 3.08029:  42%|████▏     | 6481/15515 [07:04<09:44, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6484: train loss 3.13712:  42%|████▏     | 6485/15515 [07:04<09:42, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6488: train loss 3.16331:  42%|████▏     | 6489/15515 [07:04<09:43, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6492: train loss 3.17691:  42%|████▏     | 6493/15515 [07:05<09:48, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6496: train loss 3.12055:  42%|████▏     | 6497/15515 [07:05<09:52, 15.22it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6499: train loss 3.12586:  42%|████▏     | 6499/15515 [07:05<10:03, 14.93it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6503: train loss 3.06060:  42%|████▏     | 6503/15515 [07:05<09:56, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6507: train loss 3.09001:  42%|████▏     | 6507/15515 [07:06<09:53, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6511: train loss 3.09578:  42%|████▏     | 6511/15515 [07:06<09:49, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6515: train loss 3.09634:  42%|████▏     | 6515/15515 [07:06<09:41, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6519: train loss 3.06868:  42%|████▏     | 6519/15515 [07:06<09:41, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6523: train loss 3.12855:  42%|████▏     | 6523/15515 [07:07<09:48, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6527: train loss 3.00742:  42%|████▏     | 6527/15515 [07:07<09:43, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6531: train loss 3.13544:  42%|████▏     | 6531/15515 [07:07<09:37, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6535: train loss 3.09459:  42%|████▏     | 6535/15515 [07:08<09:42, 15.43it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6538: train loss 3.07330:  42%|████▏     | 6539/15515 [07:08<09:57, 15.01it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6541: train loss 3.12582:  42%|████▏     | 6541/15515 [07:08<09:59, 14.96it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 6544: train loss 3.11988:  42%|████▏     | 6545/15515 [07:08<10:00, 14.94it/s]

 128
32459 128
32459 128
32459 

epoch 0 iter 6547: train loss 3.12642:  42%|████▏     | 6547/15515 [07:08<09:58, 15.00it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 6551: train loss 3.10908:  42%|████▏     | 6551/15515 [07:09<09:59, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6555: train loss 3.15896:  42%|████▏     | 6555/15515 [07:09<10:04, 14.83it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 6558: train loss 3.05861:  42%|████▏     | 6559/15515 [07:09<09:59, 14.94it/s]


32459 128
32459 128
32459 128


epoch 0 iter 6562: train loss 3.13398:  42%|████▏     | 6563/15515 [07:09<09:51, 15.14it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 6565: train loss 3.06979:  42%|████▏     | 6565/15515 [07:10<09:50, 15.15it/s]


32459 128
32459 128
32459 128


epoch 0 iter 6569: train loss 3.04923:  42%|████▏     | 6569/15515 [07:10<09:58, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6572: train loss 3.10633:  42%|████▏     | 6573/15515 [07:10<10:08, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6575: train loss 3.14120:  42%|████▏     | 6575/15515 [07:10<10:10, 14.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6579: train loss 3.11111:  42%|████▏     | 6579/15515 [07:10<09:54, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6583: train loss 3.14803:  42%|████▏     | 6583/15515 [07:11<09:52, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6587: train loss 3.10988:  42%|████▏     | 6587/15515 [07:11<09:52, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6591: train loss 3.16265:  42%|████▏     | 6591/15515 [07:11<09:52, 15.06it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 6594: train loss 3.10804:  43%|████▎     | 6595/15515 [07:11<09:52, 15.05it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 6598: train loss 3.14362:  43%|████▎     | 6599/15515 [07:12<09:52, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6602: train loss 3.09371:  43%|████▎     | 6603/15515 [07:12<09:47, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6606: train loss 3.07265:  43%|████▎     | 6607/15515 [07:12<09:53, 15.02it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 6609: train loss 3.12805:  43%|████▎     | 6609/15515 [07:12<09:52, 15.02it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 6613: train loss 3.06349:  43%|████▎     | 6613/15515 [07:13<09:45, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6617: train loss 3.12977:  43%|████▎     | 6617/15515 [07:13<09:41, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6621: train loss 3.11683:  43%|████▎     | 6621/15515 [07:13<09:39, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6625: train loss 3.10116:  43%|████▎     | 6625/15515 [07:13<09:44, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6629: train loss 3.11450:  43%|████▎     | 6629/15515 [07:14<09:38, 15.36it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 6632: train loss 3.14279:  43%|████▎     | 6633/15515 [07:14<09:40, 15.30it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 6636: train loss 3.18023:  43%|████▎     | 6637/15515 [07:14<09:34, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6640: train loss 3.02772:  43%|████▎     | 6641/15515 [07:14<09:33, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6644: train loss 3.09517:  43%|████▎     | 6645/15515 [07:15<09:36, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6648: train loss 3.15746:  43%|████▎     | 6649/15515 [07:15<09:35, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6652: train loss 3.13054:  43%|████▎     | 6653/15515 [07:15<09:32, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6656: train loss 3.15509:  43%|████▎     | 6657/15515 [07:15<09:31, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6660: train loss 3.14769:  43%|████▎     | 6661/15515 [07:16<09:36, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6664: train loss 3.18707:  43%|████▎     | 6665/15515 [07:16<09:34, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6668: train loss 3.12569:  43%|████▎     | 6669/15515 [07:16<09:33, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6672: train loss 3.09704:  43%|████▎     | 6673/15515 [07:17<09:34, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6675: train loss 3.14625:  43%|████▎     | 6675/15515 [07:17<10:10, 14.48it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6678: train loss 3.16653:  43%|████▎     | 6679/15515 [07:17<10:44, 13.72it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6682: train loss 3.11581:  43%|████▎     | 6683/15515 [07:17<10:17, 14.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6686: train loss 3.13751:  43%|████▎     | 6687/15515 [07:18<10:07, 14.53it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6689: train loss 3.11897:  43%|████▎     | 6689/15515 [07:18<10:24, 14.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6691: train loss 3.13265:  43%|████▎     | 6691/15515 [07:18<10:11, 14.44it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6695: train loss 3.05693:  43%|████▎     | 6695/15515 [07:18<10:10, 14.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6699: train loss 3.08904:  43%|████▎     | 6699/15515 [07:18<09:49, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6703: train loss 3.05425:  43%|████▎     | 6703/15515 [07:19<09:40, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6707: train loss 3.11124:  43%|████▎     | 6707/15515 [07:19<09:34, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6711: train loss 3.11220:  43%|████▎     | 6711/15515 [07:19<09:36, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6715: train loss 3.11209:  43%|████▎     | 6715/15515 [07:19<09:47, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6718: train loss 3.07457:  43%|████▎     | 6719/15515 [07:20<10:40, 13.72it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6721: train loss 3.02420:  43%|████▎     | 6721/15515 [07:20<10:59, 13.34it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6725: train loss 3.08329:  43%|████▎     | 6725/15515 [07:20<10:36, 13.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6729: train loss 3.06988:  43%|████▎     | 6729/15515 [07:20<10:10, 14.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6733: train loss 3.10325:  43%|████▎     | 6733/15515 [07:21<10:02, 14.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6737: train loss 3.12623:  43%|████▎     | 6737/15515 [07:21<09:56, 14.72it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 6740: train loss 3.10277:  43%|████▎     | 6741/15515 [07:21<09:47, 14.94it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 6744: train loss 3.08651:  43%|████▎     | 6745/15515 [07:22<09:38, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6748: train loss 3.10645:  43%|████▎     | 6749/15515 [07:22<09:37, 15.18it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6750: train loss 3.15655:  44%|████▎     | 6751/15515 [07:22<10:19, 14.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6753: train loss 3.11338:  44%|████▎     | 6753/15515 [07:22<10:44, 13.59it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6756: train loss 3.04064:  44%|████▎     | 6757/15515 [07:22<12:16, 11.89it/s]

32459 128
32459 128


epoch 0 iter 6760: train loss 3.11447:  44%|████▎     | 6761/15515 [07:23<11:25, 12.77it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6763: train loss 3.11107:  44%|████▎     | 6763/15515 [07:23<10:53, 13.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6767: train loss 3.10676:  44%|████▎     | 6767/15515 [07:23<10:16, 14.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6771: train loss 3.07306:  44%|████▎     | 6771/15515 [07:23<10:00, 14.56it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6774: train loss 3.15157:  44%|████▎     | 6775/15515 [07:24<09:52, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6778: train loss 3.15920:  44%|████▎     | 6779/15515 [07:24<09:45, 14.92it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 6781: train loss 3.11582:  44%|████▎     | 6781/15515 [07:24<09:41, 15.03it/s]

 128
32459 128
32459 128


epoch 0 iter 6784: train loss 3.09364:  44%|████▎     | 6783/15515 [07:24<10:20, 14.08it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6786: train loss 3.08884:  44%|████▎     | 6787/15515 [07:25<10:37, 13.70it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6790: train loss 3.12121:  44%|████▍     | 6791/15515 [07:25<10:28, 13.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6793: train loss 3.07984:  44%|████▍     | 6793/15515 [07:25<10:12, 14.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6797: train loss 3.04316:  44%|████▍     | 6797/15515 [07:25<09:54, 14.67it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6801: train loss 3.08874:  44%|████▍     | 6801/15515 [07:26<09:47, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6805: train loss 3.14983:  44%|████▍     | 6805/15515 [07:26<09:39, 15.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6808: train loss 3.08340:  44%|████▍     | 6807/15515 [07:26<10:07, 14.33it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6811: train loss 3.06404:  44%|████▍     | 6811/15515 [07:26<10:18, 14.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6815: train loss 3.09359:  44%|████▍     | 6815/15515 [07:27<10:05, 14.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6819: train loss 3.13019:  44%|████▍     | 6819/15515 [07:27<09:43, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6822: train loss 3.10181:  44%|████▍     | 6823/15515 [07:27<10:08, 14.29it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6826: train loss 3.11169:  44%|████▍     | 6827/15515 [07:27<09:56, 14.55it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6829: train loss 3.05705:  44%|████▍     | 6829/15515 [07:27<09:48, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6833: train loss 3.07449:  44%|████▍     | 6833/15515 [07:28<09:33, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6837: train loss 3.14366:  44%|████▍     | 6837/15515 [07:28<09:26, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6841: train loss 3.07214:  44%|████▍     | 6841/15515 [07:28<09:24, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6845: train loss 3.13131:  44%|████▍     | 6845/15515 [07:29<09:25, 15.32it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6847: train loss 3.02175:  44%|████▍     | 6847/15515 [07:29<10:22, 13.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6851: train loss 3.14323:  44%|████▍     | 6851/15515 [07:29<10:09, 14.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6855: train loss 3.13929:  44%|████▍     | 6855/15515 [07:29<09:46, 14.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6858: train loss 3.04452:  44%|████▍     | 6859/15515 [07:29<09:37, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6862: train loss 3.12675:  44%|████▍     | 6863/15515 [07:30<09:30, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6866: train loss 3.07689:  44%|████▍     | 6867/15515 [07:30<09:31, 15.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6869: train loss 3.10193:  44%|████▍     | 6869/15515 [07:30<09:25, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6873: train loss 3.06599:  44%|████▍     | 6873/15515 [07:30<09:38, 14.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6876: train loss 3.11504:  44%|████▍     | 6877/15515 [07:31<09:43, 14.79it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 6879: train loss 3.01140:  44%|████▍     | 6879/15515 [07:31<09:41, 14.85it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 6883: train loss 3.16973:  44%|████▍     | 6883/15515 [07:31<09:25, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6887: train loss 3.11031:  44%|████▍     | 6887/15515 [07:31<09:23, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6891: train loss 3.13412:  44%|████▍     | 6891/15515 [07:32<09:23, 15.31it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6893: train loss 3.20288:  44%|████▍     | 6893/15515 [07:32<10:07, 14.20it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6897: train loss 3.07354:  44%|████▍     | 6897/15515 [07:32<09:53, 14.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6901: train loss 3.09169:  44%|████▍     | 6901/15515 [07:32<09:36, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6905: train loss 3.09278:  45%|████▍     | 6905/15515 [07:33<09:45, 14.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6909: train loss 3.10521:  45%|████▍     | 6909/15515 [07:33<09:37, 14.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6911: train loss 3.04620:  45%|████▍     | 6911/15515 [07:33<10:03, 14.27it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6915: train loss 3.05664:  45%|████▍     | 6915/15515 [07:33<10:18, 13.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6919: train loss 3.12197:  45%|████▍     | 6919/15515 [07:34<09:43, 14.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6923: train loss 3.10236:  45%|████▍     | 6923/15515 [07:34<09:35, 14.93it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6927: train loss 3.09772:  45%|████▍     | 6927/15515 [07:34<09:25, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6931: train loss 3.13761:  45%|████▍     | 6931/15515 [07:34<09:19, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6935: train loss 3.06165:  45%|████▍     | 6935/15515 [07:35<09:17, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6939: train loss 3.05278:  45%|████▍     | 6939/15515 [07:35<09:16, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6943: train loss 3.10589:  45%|████▍     | 6943/15515 [07:35<09:26, 15.14it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 6946: train loss 3.10529:  45%|████▍     | 6947/15515 [07:35<09:28, 15.07it/s]

 128
32459 128
32459 128
32459

epoch 0 iter 6949: train loss 3.09440:  45%|████▍     | 6949/15515 [07:36<09:22, 15.22it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 6952: train loss 3.10000:  45%|████▍     | 6953/15515 [07:36<09:22, 15.23it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6956: train loss 3.17496:  45%|████▍     | 6957/15515 [07:36<09:46, 14.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6959: train loss 3.04453:  45%|████▍     | 6959/15515 [07:36<09:45, 14.61it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6962: train loss 3.04602:  45%|████▍     | 6963/15515 [07:36<10:25, 13.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6965: train loss 3.11174:  45%|████▍     | 6965/15515 [07:37<10:40, 13.34it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6969: train loss 3.05075:  45%|████▍     | 6969/15515 [07:37<10:37, 13.41it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6972: train loss 3.08352:  45%|████▍     | 6973/15515 [07:37<09:58, 14.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6976: train loss 3.06635:  45%|████▍     | 6977/15515 [07:37<09:40, 14.70it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6979: train loss 3.02708:  45%|████▍     | 6979/15515 [07:38<09:36, 14.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6983: train loss 3.08871:  45%|████▌     | 6983/15515 [07:38<09:31, 14.92it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 6987: train loss 3.05358:  45%|████▌     | 6987/15515 [07:38<09:31, 14.92it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6989: train loss 3.08344:  45%|████▌     | 6989/15515 [07:38<09:40, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6993: train loss 3.11024:  45%|████▌     | 6993/15515 [07:39<09:53, 14.36it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6995: train loss 3.09970:  45%|████▌     | 6995/15515 [07:39<10:18, 13.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 6998: train loss 3.04234:  45%|████▌     | 6999/15515 [07:39<10:40, 13.30it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7002: train loss 3.02513:  45%|████▌     | 7003/15515 [07:39<10:11, 13.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7005: train loss 3.14000:  45%|████▌     | 7005/15515 [07:39<09:59, 14.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7009: train loss 3.08328:  45%|████▌     | 7009/15515 [07:40<09:44, 14.55it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7011: train loss 3.15210:  45%|████▌     | 7011/15515 [07:40<10:01, 14.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7015: train loss 3.10250:  45%|████▌     | 7015/15515 [07:40<10:09, 13.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7017: train loss 3.07847:  45%|████▌     | 7017/15515 [07:40<10:26, 13.57it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7021: train loss 3.12426:  45%|████▌     | 7021/15515 [07:41<10:06, 14.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7023: train loss 3.13271:  45%|████▌     | 7023/15515 [07:41<10:11, 13.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7027: train loss 3.08799:  45%|████▌     | 7027/15515 [07:41<10:03, 14.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7031: train loss 3.14181:  45%|████▌     | 7031/15515 [07:41<09:41, 14.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7035: train loss 3.06781:  45%|████▌     | 7035/15515 [07:42<09:38, 14.66it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7038: train loss 3.09331:  45%|████▌     | 7039/15515 [07:42<09:40, 14.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7041: train loss 3.14325:  45%|████▌     | 7041/15515 [07:42<09:30, 14.85it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7045: train loss 3.04815:  45%|████▌     | 7045/15515 [07:42<09:41, 14.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7048: train loss 3.12904:  45%|████▌     | 7049/15515 [07:43<10:02, 14.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7052: train loss 3.06594:  45%|████▌     | 7053/15515 [07:43<09:37, 14.64it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7056: train loss 3.08694:  45%|████▌     | 7057/15515 [07:43<09:20, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7060: train loss 3.06949:  46%|████▌     | 7061/15515 [07:43<09:12, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7064: train loss 3.09114:  46%|████▌     | 7065/15515 [07:44<09:21, 15.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7067: train loss 3.05020:  46%|████▌     | 7067/15515 [07:44<09:19, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7071: train loss 3.08143:  46%|████▌     | 7071/15515 [07:44<09:13, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7075: train loss 3.05786:  46%|████▌     | 7075/15515 [07:44<09:08, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7079: train loss 3.14446:  46%|████▌     | 7079/15515 [07:45<09:05, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7083: train loss 3.05727:  46%|████▌     | 7083/15515 [07:45<09:10, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7087: train loss 3.06938:  46%|████▌     | 7087/15515 [07:45<09:08, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7091: train loss 3.12223:  46%|████▌     | 7091/15515 [07:45<09:09, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7095: train loss 3.08622:  46%|████▌     | 7095/15515 [07:46<09:11, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7098: train loss 3.12150:  46%|████▌     | 7099/15515 [07:46<09:08, 15.35it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7102: train loss 3.07373:  46%|████▌     | 7103/15515 [07:46<09:29, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7106: train loss 3.13045:  46%|████▌     | 7105/15515 [07:46<09:29, 14.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7109: train loss 3.11321:  46%|████▌     | 7109/15515 [07:47<10:00, 14.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7113: train loss 3.15467:  46%|████▌     | 7113/15515 [07:47<09:26, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7117: train loss 3.10359:  46%|████▌     | 7117/15515 [07:47<09:10, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7121: train loss 3.11882:  46%|████▌     | 7121/15515 [07:47<09:07, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7124: train loss 3.07018:  46%|████▌     | 7125/15515 [07:48<09:30, 14.72it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7128: train loss 3.06072:  46%|████▌     | 7129/15515 [07:48<09:37, 14.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7132: train loss 3.17123:  46%|████▌     | 7133/15515 [07:48<09:18, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7136: train loss 3.05631:  46%|████▌     | 7137/15515 [07:48<09:07, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7140: train loss 3.07448:  46%|████▌     | 7141/15515 [07:49<09:04, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7144: train loss 3.01283:  46%|████▌     | 7145/15515 [07:49<09:01, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7148: train loss 3.06447:  46%|████▌     | 7149/15515 [07:49<09:04, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7152: train loss 3.06209:  46%|████▌     | 7153/15515 [07:49<09:00, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7156: train loss 3.02712:  46%|████▌     | 7157/15515 [07:50<09:02, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7160: train loss 3.11428:  46%|████▌     | 7161/15515 [07:50<09:04, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7164: train loss 3.06935:  46%|████▌     | 7165/15515 [07:50<09:10, 15.16it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7167: train loss 3.08233:  46%|████▌     | 7167/15515 [07:50<09:14, 15.06it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7170: train loss 3.10481:  46%|████▌     | 7171/15515 [07:51<09:22, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7173: train loss 3.11959:  46%|████▌     | 7173/15515 [07:51<09:19, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7177: train loss 3.11317:  46%|████▋     | 7177/15515 [07:51<09:15, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7181: train loss 3.05587:  46%|████▋     | 7181/15515 [07:51<09:09, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7185: train loss 3.10987:  46%|████▋     | 7185/15515 [07:52<09:02, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7189: train loss 3.07488:  46%|████▋     | 7189/15515 [07:52<09:13, 15.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7192: train loss 3.16648:  46%|████▋     | 7193/15515 [07:52<09:19, 14.88it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 7195: train loss 3.05434:  46%|████▋     | 7195/15515 [07:52<09:12, 15.07it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 7199: train loss 3.07094:  46%|████▋     | 7199/15515 [07:52<09:18, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7203: train loss 3.14578:  46%|████▋     | 7203/15515 [07:53<09:20, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7206: train loss 3.03036:  46%|████▋     | 7207/15515 [07:53<09:25, 14.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7209: train loss 3.03620:  46%|████▋     | 7209/15515 [07:53<09:25, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7212: train loss 3.03426:  46%|████▋     | 7213/15515 [07:53<09:18, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7216: train loss 3.07274:  47%|████▋     | 7217/15515 [07:54<09:15, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7219: train loss 3.10846:  47%|████▋     | 7219/15515 [07:54<09:23, 14.73it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7222: train loss 3.02900:  47%|████▋     | 7223/15515 [07:54<09:28, 14.59it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7225: train loss 3.12513:  47%|████▋     | 7225/15515 [07:54<09:26, 14.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7229: train loss 3.14186:  47%|████▋     | 7229/15515 [07:55<09:21, 14.76it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7232: train loss 3.04363:  47%|████▋     | 7233/15515 [07:55<09:16, 14.88it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7235: train loss 3.01343:  47%|████▋     | 7235/15515 [07:55<09:19, 14.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7239: train loss 3.07569:  47%|████▋     | 7239/15515 [07:55<09:08, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7243: train loss 3.10863:  47%|████▋     | 7243/15515 [07:55<09:07, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7247: train loss 3.08339:  47%|████▋     | 7247/15515 [07:56<09:04, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7251: train loss 3.13439:  47%|████▋     | 7251/15515 [07:56<09:12, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7255: train loss 3.07228:  47%|████▋     | 7255/15515 [07:56<09:06, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7259: train loss 3.09659:  47%|████▋     | 7259/15515 [07:57<09:07, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7263: train loss 3.14590:  47%|████▋     | 7263/15515 [07:57<09:02, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7267: train loss 3.09928:  47%|████▋     | 7267/15515 [07:57<08:57, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7271: train loss 3.12646:  47%|████▋     | 7271/15515 [07:57<08:55, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7275: train loss 3.04590:  47%|████▋     | 7275/15515 [07:58<08:54, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7279: train loss 3.09769:  47%|████▋     | 7279/15515 [07:58<08:52, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7283: train loss 3.09316:  47%|████▋     | 7283/15515 [07:58<08:52, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7287: train loss 3.02483:  47%|████▋     | 7287/15515 [07:58<08:50, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7291: train loss 3.07490:  47%|████▋     | 7291/15515 [07:59<08:53, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7295: train loss 3.09732:  47%|████▋     | 7295/15515 [07:59<08:49, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7299: train loss 3.05952:  47%|████▋     | 7299/15515 [07:59<08:51, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7303: train loss 3.02454:  47%|████▋     | 7303/15515 [07:59<08:53, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7307: train loss 3.03921:  47%|████▋     | 7307/15515 [08:00<08:56, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7311: train loss 3.04891:  47%|████▋     | 7311/15515 [08:00<08:54, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7315: train loss 3.05513:  47%|████▋     | 7315/15515 [08:00<08:54, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7319: train loss 3.08151:  47%|████▋     | 7319/15515 [08:00<08:48, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7323: train loss 3.05437:  47%|████▋     | 7323/15515 [08:01<08:49, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7327: train loss 3.16115:  47%|████▋     | 7327/15515 [08:01<08:48, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7331: train loss 3.04040:  47%|████▋     | 7331/15515 [08:01<08:47, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7335: train loss 3.11620:  47%|████▋     | 7335/15515 [08:01<08:53, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7339: train loss 3.11666:  47%|████▋     | 7339/15515 [08:02<08:49, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7343: train loss 3.09339:  47%|████▋     | 7343/15515 [08:02<08:47, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7347: train loss 3.00698:  47%|████▋     | 7347/15515 [08:02<08:49, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7351: train loss 3.09159:  47%|████▋     | 7351/15515 [08:02<08:46, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7355: train loss 3.07868:  47%|████▋     | 7355/15515 [08:03<08:44, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7359: train loss 3.01567:  47%|████▋     | 7359/15515 [08:03<08:46, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7363: train loss 3.16456:  47%|████▋     | 7363/15515 [08:03<08:45, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7367: train loss 3.04665:  47%|████▋     | 7367/15515 [08:04<08:50, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7371: train loss 3.13993:  48%|████▊     | 7371/15515 [08:04<08:44, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7375: train loss 3.07185:  48%|████▊     | 7375/15515 [08:04<08:48, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7379: train loss 3.06840:  48%|████▊     | 7379/15515 [08:04<08:52, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7383: train loss 3.16178:  48%|████▊     | 7383/15515 [08:05<08:58, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7386: train loss 3.10960:  48%|████▊     | 7387/15515 [08:05<08:58, 15.08it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7390: train loss 3.15557:  48%|████▊     | 7391/15515 [08:05<09:01, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7394: train loss 3.07890:  48%|████▊     | 7395/15515 [08:05<09:05, 14.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7397: train loss 3.01164:  48%|████▊     | 7397/15515 [08:06<09:06, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7401: train loss 3.09071:  48%|████▊     | 7401/15515 [08:06<09:07, 14.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7405: train loss 3.09912:  48%|████▊     | 7405/15515 [08:06<09:01, 14.97it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 7408: train loss 3.07923:  48%|████▊     | 7409/15515 [08:06<09:01, 14.98it/s]

 128
32459 128
32459 128


epoch 0 iter 7411: train loss 3.09731:  48%|████▊     | 7411/15515 [08:06<08:55, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7415: train loss 3.07416:  48%|████▊     | 7415/15515 [08:07<09:07, 14.80it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 7418: train loss 3.11209:  48%|████▊     | 7419/15515 [08:07<09:05, 14.85it/s]

 128
32459 128
32459 128


epoch 0 iter 7421: train loss 3.04588:  48%|████▊     | 7421/15515 [08:07<09:03, 14.89it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 7424: train loss 3.06926:  48%|████▊     | 7425/15515 [08:07<09:05, 14.83it/s]

128
32459 128
32459 128


epoch 0 iter 7427: train loss 3.15111:  48%|████▊     | 7427/15515 [08:08<09:03, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7431: train loss 3.05754:  48%|████▊     | 7431/15515 [08:08<08:59, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7435: train loss 3.09934:  48%|████▊     | 7435/15515 [08:08<08:50, 15.24it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 7438: train loss 3.10081:  48%|████▊     | 7439/15515 [08:08<09:04, 14.82it/s]

 128
32459 128
32459 128


epoch 0 iter 7441: train loss 3.07237:  48%|████▊     | 7441/15515 [08:08<09:05, 14.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7445: train loss 3.05012:  48%|████▊     | 7445/15515 [08:09<08:55, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7449: train loss 3.03752:  48%|████▊     | 7449/15515 [08:09<08:54, 15.10it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 7452: train loss 3.09530:  48%|████▊     | 7453/15515 [08:09<08:56, 15.04it/s]

 128
32459 128
32459 128


epoch 0 iter 7455: train loss 3.02662:  48%|████▊     | 7455/15515 [08:09<08:56, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7459: train loss 3.13769:  48%|████▊     | 7459/15515 [08:10<08:55, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7463: train loss 3.05662:  48%|████▊     | 7463/15515 [08:10<09:00, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7467: train loss 3.04791:  48%|████▊     | 7467/15515 [08:10<08:56, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7471: train loss 3.06042:  48%|████▊     | 7471/15515 [08:10<08:51, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7475: train loss 3.06060:  48%|████▊     | 7475/15515 [08:11<08:53, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7479: train loss 3.13251:  48%|████▊     | 7479/15515 [08:11<08:51, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7483: train loss 3.05915:  48%|████▊     | 7483/15515 [08:11<08:49, 15.16it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7486: train loss 3.01347:  48%|████▊     | 7487/15515 [08:11<08:54, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7490: train loss 3.13594:  48%|████▊     | 7489/15515 [08:12<09:04, 14.74it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7493: train loss 3.06318:  48%|████▊     | 7493/15515 [08:12<09:26, 14.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7497: train loss 3.04033:  48%|████▊     | 7497/15515 [08:12<08:57, 14.92it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7499: train loss 3.02338:  48%|████▊     | 7499/15515 [08:12<09:09, 14.59it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7503: train loss 3.06691:  48%|████▊     | 7503/15515 [08:13<09:22, 14.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7507: train loss 3.04647:  48%|████▊     | 7507/15515 [08:13<08:54, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7511: train loss 3.03606:  48%|████▊     | 7511/15515 [08:13<08:46, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7515: train loss 3.13559:  48%|████▊     | 7515/15515 [08:13<08:47, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7519: train loss 3.07553:  48%|████▊     | 7519/15515 [08:14<08:43, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7523: train loss 3.03919:  48%|████▊     | 7523/15515 [08:14<08:39, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7527: train loss 3.08525:  49%|████▊     | 7527/15515 [08:14<08:33, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7531: train loss 3.08436:  49%|████▊     | 7531/15515 [08:14<08:34, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7534: train loss 3.02394:  49%|████▊     | 7535/15515 [08:15<08:53, 14.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7538: train loss 3.07087:  49%|████▊     | 7539/15515 [08:15<08:54, 14.93it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7542: train loss 3.06707:  49%|████▊     | 7543/15515 [08:15<08:48, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7545: train loss 3.12315:  49%|████▊     | 7545/15515 [08:15<08:45, 15.16it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7549: train loss 3.04401:  49%|████▊     | 7549/15515 [08:16<08:43, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7553: train loss 3.04532:  49%|████▊     | 7553/15515 [08:16<08:39, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7557: train loss 3.03200:  49%|████▊     | 7557/15515 [08:16<08:30, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7561: train loss 3.09719:  49%|████▊     | 7561/15515 [08:16<08:32, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7565: train loss 3.08576:  49%|████▉     | 7565/15515 [08:17<08:34, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7569: train loss 3.08797:  49%|████▉     | 7569/15515 [08:17<08:39, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7573: train loss 3.11349:  49%|████▉     | 7573/15515 [08:17<08:34, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7577: train loss 3.12076:  49%|████▉     | 7577/15515 [08:17<08:32, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7581: train loss 3.06771:  49%|████▉     | 7581/15515 [08:18<08:35, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7585: train loss 3.07979:  49%|████▉     | 7585/15515 [08:18<08:36, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7589: train loss 3.10746:  49%|████▉     | 7589/15515 [08:18<08:37, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7593: train loss 3.08505:  49%|████▉     | 7593/15515 [08:18<08:34, 15.41it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7596: train loss 3.07703:  49%|████▉     | 7597/15515 [08:19<08:47, 15.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7599: train loss 3.03319:  49%|████▉     | 7599/15515 [08:19<08:49, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7603: train loss 3.07720:  49%|████▉     | 7603/15515 [08:19<08:53, 14.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7607: train loss 3.13721:  49%|████▉     | 7607/15515 [08:19<08:54, 14.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7610: train loss 3.05771:  49%|████▉     | 7611/15515 [08:20<08:45, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7614: train loss 3.10463:  49%|████▉     | 7615/15515 [08:20<08:42, 15.11it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7617: train loss 3.06269:  49%|████▉     | 7617/15515 [08:20<08:42, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7621: train loss 3.13778:  49%|████▉     | 7621/15515 [08:20<08:44, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7625: train loss 3.09171:  49%|████▉     | 7625/15515 [08:21<08:52, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7628: train loss 3.04519:  49%|████▉     | 7629/15515 [08:21<08:52, 14.80it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 7631: train loss 3.09083:  49%|████▉     | 7631/15515 [08:21<08:51, 14.84it/s]

 128
32459 128
32459 128


epoch 0 iter 7634: train loss 3.02534:  49%|████▉     | 7635/15515 [08:21<08:55, 14.71it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7637: train loss 2.99962:  49%|████▉     | 7637/15515 [08:21<08:55, 14.71it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7641: train loss 3.08765:  49%|████▉     | 7641/15515 [08:22<08:52, 14.79it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7644: train loss 2.96569:  49%|████▉     | 7645/15515 [08:22<08:50, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7647: train loss 3.04040:  49%|████▉     | 7647/15515 [08:22<08:53, 14.74it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7651: train loss 3.06763:  49%|████▉     | 7651/15515 [08:22<08:57, 14.62it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7654: train loss 3.11697:  49%|████▉     | 7655/15515 [08:23<08:55, 14.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7657: train loss 3.03208:  49%|████▉     | 7657/15515 [08:23<08:57, 14.62it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 7660: train loss 3.04758:  49%|████▉     | 7661/15515 [08:23<08:42, 15.05it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 7664: train loss 3.02128:  49%|████▉     | 7665/15515 [08:23<08:38, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7668: train loss 3.12757:  49%|████▉     | 7669/15515 [08:24<08:39, 15.10it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7671: train loss 3.04064:  49%|████▉     | 7671/15515 [08:24<08:39, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7675: train loss 3.01793:  49%|████▉     | 7675/15515 [08:24<08:37, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7679: train loss 3.08118:  49%|████▉     | 7679/15515 [08:24<08:34, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7683: train loss 3.08406:  50%|████▉     | 7683/15515 [08:25<08:29, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7687: train loss 3.13821:  50%|████▉     | 7687/15515 [08:25<08:26, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7691: train loss 3.09040:  50%|████▉     | 7691/15515 [08:25<08:25, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7695: train loss 2.98342:  50%|████▉     | 7695/15515 [08:25<08:28, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7699: train loss 3.01902:  50%|████▉     | 7699/15515 [08:26<08:31, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7703: train loss 3.13457:  50%|████▉     | 7703/15515 [08:26<08:29, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7707: train loss 3.02070:  50%|████▉     | 7707/15515 [08:26<08:23, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7711: train loss 3.10282:  50%|████▉     | 7711/15515 [08:26<08:23, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7715: train loss 3.09671:  50%|████▉     | 7715/15515 [08:27<08:21, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7719: train loss 3.07743:  50%|████▉     | 7719/15515 [08:27<08:20, 15.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7723: train loss 3.07980:  50%|████▉     | 7723/15515 [08:27<08:27, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7727: train loss 3.02276:  50%|████▉     | 7727/15515 [08:27<08:25, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7731: train loss 3.01628:  50%|████▉     | 7731/15515 [08:28<08:32, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7735: train loss 3.04832:  50%|████▉     | 7735/15515 [08:28<08:26, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7739: train loss 3.13971:  50%|████▉     | 7739/15515 [08:28<08:26, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7743: train loss 3.08922:  50%|████▉     | 7743/15515 [08:28<08:27, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7747: train loss 3.09264:  50%|████▉     | 7747/15515 [08:29<08:25, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7751: train loss 3.03481:  50%|████▉     | 7751/15515 [08:29<08:28, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7755: train loss 3.06624:  50%|████▉     | 7755/15515 [08:29<08:25, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7759: train loss 3.10608:  50%|█████     | 7759/15515 [08:29<08:23, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7763: train loss 3.09924:  50%|█████     | 7763/15515 [08:30<08:24, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7767: train loss 3.05798:  50%|█████     | 7767/15515 [08:30<08:28, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7771: train loss 3.05099:  50%|█████     | 7771/15515 [08:30<08:23, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7774: train loss 3.07325:  50%|█████     | 7775/15515 [08:31<09:57, 12.96it/s]

32459 128
32459 128


epoch 0 iter 7776: train loss 3.04900:  50%|█████     | 7777/15515 [08:31<10:23, 12.41it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7780: train loss 3.05286:  50%|█████     | 7781/15515 [08:31<09:30, 13.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7784: train loss 3.07521:  50%|█████     | 7785/15515 [08:31<08:54, 14.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7788: train loss 3.12252:  50%|█████     | 7787/15515 [08:31<08:51, 14.55it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7791: train loss 3.07124:  50%|█████     | 7791/15515 [08:32<08:42, 14.79it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7795: train loss 3.11426:  50%|█████     | 7795/15515 [08:32<08:58, 14.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7799: train loss 3.11990:  50%|█████     | 7799/15515 [08:32<08:56, 14.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7803: train loss 3.09683:  50%|█████     | 7803/15515 [08:33<08:46, 14.64it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7807: train loss 3.12558:  50%|█████     | 7807/15515 [08:33<08:41, 14.79it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7811: train loss 3.11057:  50%|█████     | 7811/15515 [08:33<08:42, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7814: train loss 3.03974:  50%|█████     | 7815/15515 [08:33<08:48, 14.58it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7818: train loss 3.11654:  50%|█████     | 7817/15515 [08:34<08:44, 14.66it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7820: train loss 3.02017:  50%|█████     | 7821/15515 [08:34<09:19, 13.74it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7823: train loss 3.06738:  50%|█████     | 7823/15515 [08:34<09:47, 13.09it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7826: train loss 3.07805:  50%|█████     | 7827/15515 [08:34<09:54, 12.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7830: train loss 2.97941:  50%|█████     | 7831/15515 [08:34<09:12, 13.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7833: train loss 3.03634:  50%|█████     | 7833/15515 [08:35<09:17, 13.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7836: train loss 3.01509:  50%|█████     | 7835/15515 [08:35<09:02, 14.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7839: train loss 3.07982:  51%|█████     | 7839/15515 [08:35<09:00, 14.20it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7842: train loss 3.05116:  51%|█████     | 7843/15515 [08:35<08:51, 14.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7846: train loss 3.13445:  51%|█████     | 7847/15515 [08:36<08:51, 14.42it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 7848: train loss 3.04525:  51%|█████     | 7849/15515 [08:36<08:59, 14.20it/s]

128
32459 128
32459 128


epoch 0 iter 7851: train loss 3.07522:  51%|█████     | 7851/15515 [08:36<08:55, 14.32it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7855: train loss 3.07048:  51%|█████     | 7855/15515 [08:36<08:45, 14.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7858: train loss 3.08164:  51%|█████     | 7859/15515 [08:36<09:40, 13.20it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7860: train loss 3.09862:  51%|█████     | 7861/15515 [08:37<10:35, 12.05it/s]

32459 128
32459 128


epoch 0 iter 7864: train loss 3.06707:  51%|█████     | 7863/15515 [08:37<12:02, 10.59it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7867: train loss 3.00239:  51%|█████     | 7867/15515 [08:37<10:14, 12.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7871: train loss 3.04054:  51%|█████     | 7871/15515 [08:37<09:12, 13.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7875: train loss 3.04144:  51%|█████     | 7875/15515 [08:38<08:47, 14.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7879: train loss 3.03007:  51%|█████     | 7879/15515 [08:38<08:37, 14.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7883: train loss 3.05008:  51%|█████     | 7883/15515 [08:38<08:45, 14.53it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7886: train loss 3.09925:  51%|█████     | 7887/15515 [08:39<08:42, 14.61it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7889: train loss 3.02715:  51%|█████     | 7889/15515 [08:39<08:39, 14.67it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7893: train loss 3.02938:  51%|█████     | 7893/15515 [08:39<09:00, 14.09it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7896: train loss 3.12412:  51%|█████     | 7897/15515 [08:39<08:55, 14.23it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7899: train loss 3.06378:  51%|█████     | 7899/15515 [08:39<08:57, 14.17it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7902: train loss 3.08932:  51%|█████     | 7903/15515 [08:40<08:57, 14.16it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7904: train loss 3.10645:  51%|█████     | 7905/15515 [08:40<08:58, 14.14it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7908: train loss 3.00834:  51%|█████     | 7909/15515 [08:40<09:02, 14.03it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 7910: train loss 3.06370:  51%|█████     | 7911/15515 [08:40<09:13, 13.73it/s]

 128
32459 128
32459 128


epoch 0 iter 7914: train loss 3.03306:  51%|█████     | 7915/15515 [08:41<09:03, 13.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7917: train loss 3.11187:  51%|█████     | 7917/15515 [08:41<09:07, 13.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7921: train loss 3.09195:  51%|█████     | 7921/15515 [08:41<09:15, 13.67it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7923: train loss 3.07548:  51%|█████     | 7923/15515 [08:41<09:00, 14.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7926: train loss 3.06159:  51%|█████     | 7927/15515 [08:41<09:10, 13.79it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7929: train loss 3.08215:  51%|█████     | 7929/15515 [08:42<09:15, 13.66it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7933: train loss 2.98160:  51%|█████     | 7933/15515 [08:42<10:00, 12.62it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7936: train loss 2.98463:  51%|█████     | 7935/15515 [08:42<09:33, 13.22it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7939: train loss 3.06102:  51%|█████     | 7939/15515 [08:42<09:06, 13.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7943: train loss 3.06072:  51%|█████     | 7943/15515 [08:43<08:52, 14.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7947: train loss 3.03661:  51%|█████     | 7947/15515 [08:43<08:39, 14.58it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7950: train loss 3.03159:  51%|█████     | 7951/15515 [08:43<08:30, 14.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7954: train loss 3.07505:  51%|█████▏    | 7955/15515 [08:43<08:23, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7958: train loss 3.09984:  51%|█████▏    | 7959/15515 [08:44<08:29, 14.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7962: train loss 3.04376:  51%|█████▏    | 7963/15515 [08:44<08:47, 14.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7965: train loss 3.08367:  51%|█████▏    | 7965/15515 [08:44<09:06, 13.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7968: train loss 3.07545:  51%|█████▏    | 7969/15515 [08:44<09:01, 13.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7972: train loss 3.01794:  51%|█████▏    | 7973/15515 [08:45<08:51, 14.18it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7975: train loss 3.07771:  51%|█████▏    | 7975/15515 [08:45<08:42, 14.42it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 7977: train loss 2.99493:  51%|█████▏    | 7977/15515 [08:45<08:57, 14.02it/s]

 128
32459 128
32459 128


epoch 0 iter 7980: train loss 3.06274:  51%|█████▏    | 7981/15515 [08:45<09:10, 13.67it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7984: train loss 3.14132:  51%|█████▏    | 7985/15515 [08:46<08:52, 14.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7988: train loss 3.00278:  51%|█████▏    | 7989/15515 [08:46<08:32, 14.68it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7992: train loss 3.03819:  52%|█████▏    | 7993/15515 [08:46<08:24, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 7996: train loss 3.11551:  52%|█████▏    | 7995/15515 [08:46<08:32, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 7998: train loss 3.07792:  52%|█████▏    | 7999/15515 [08:46<08:52, 14.11it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8001: train loss 3.04338:  52%|█████▏    | 8001/15515 [08:47<09:16, 13.51it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8004: train loss 3.07435:  52%|█████▏    | 8005/15515 [08:47<09:19, 13.43it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8008: train loss 3.08532:  52%|█████▏    | 8009/15515 [08:47<08:49, 14.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8012: train loss 3.02525:  52%|█████▏    | 8013/15515 [08:47<08:25, 14.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8016: train loss 3.02752:  52%|█████▏    | 8017/15515 [08:48<08:14, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8020: train loss 3.02864:  52%|█████▏    | 8021/15515 [08:48<08:09, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8024: train loss 3.01910:  52%|█████▏    | 8025/15515 [08:48<08:08, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8028: train loss 3.10675:  52%|█████▏    | 8029/15515 [08:49<08:21, 14.93it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 8031: train loss 3.07226:  52%|█████▏    | 8031/15515 [08:49<08:20, 14.95it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 8035: train loss 3.05811:  52%|█████▏    | 8035/15515 [08:49<08:22, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8039: train loss 3.11646:  52%|█████▏    | 8039/15515 [08:49<08:15, 15.09it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 8042: train loss 3.08123:  52%|█████▏    | 8043/15515 [08:49<08:15, 15.07it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 8046: train loss 3.02660:  52%|█████▏    | 8047/15515 [08:50<08:10, 15.24it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 8049: train loss 3.08106:  52%|█████▏    | 8049/15515 [08:50<08:10, 15.23it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 8053: train loss 3.00485:  52%|█████▏    | 8053/15515 [08:50<08:06, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8057: train loss 2.99432:  52%|█████▏    | 8057/15515 [08:50<08:06, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8061: train loss 3.00743:  52%|█████▏    | 8061/15515 [08:51<08:06, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8065: train loss 3.08272:  52%|█████▏    | 8065/15515 [08:51<08:15, 15.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8068: train loss 3.07683:  52%|█████▏    | 8069/15515 [08:51<08:22, 14.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8071: train loss 3.03655:  52%|█████▏    | 8071/15515 [08:51<08:21, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8075: train loss 3.06713:  52%|█████▏    | 8075/15515 [08:52<08:19, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8079: train loss 3.06188:  52%|█████▏    | 8079/15515 [08:52<08:20, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8083: train loss 3.09873:  52%|█████▏    | 8083/15515 [08:52<08:20, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8087: train loss 3.09434:  52%|█████▏    | 8087/15515 [08:52<08:20, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8091: train loss 3.05181:  52%|█████▏    | 8091/15515 [08:53<08:18, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8095: train loss 3.12459:  52%|█████▏    | 8095/15515 [08:53<08:12, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8099: train loss 3.09322:  52%|█████▏    | 8099/15515 [08:53<08:10, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8103: train loss 3.12504:  52%|█████▏    | 8103/15515 [08:53<08:08, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8107: train loss 3.00401:  52%|█████▏    | 8107/15515 [08:54<08:14, 14.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8110: train loss 3.04580:  52%|█████▏    | 8111/15515 [08:54<08:14, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8114: train loss 3.05265:  52%|█████▏    | 8115/15515 [08:54<08:09, 15.11it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 8117: train loss 3.07058:  52%|█████▏    | 8117/15515 [08:54<08:07, 15.18it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 8121: train loss 3.02148:  52%|█████▏    | 8121/15515 [08:55<08:09, 15.12it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8124: train loss 3.10453:  52%|█████▏    | 8125/15515 [08:55<08:05, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8128: train loss 3.04066:  52%|█████▏    | 8129/15515 [08:55<08:05, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8132: train loss 3.04336:  52%|█████▏    | 8133/15515 [08:55<08:09, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8135: train loss 3.08927:  52%|█████▏    | 8135/15515 [08:56<08:07, 15.14it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8139: train loss 3.08929:  52%|█████▏    | 8139/15515 [08:56<08:01, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8143: train loss 3.06735:  52%|█████▏    | 8143/15515 [08:56<07:57, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8147: train loss 3.02645:  53%|█████▎    | 8147/15515 [08:56<07:56, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8151: train loss 2.99537:  53%|█████▎    | 8151/15515 [08:57<07:57, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8155: train loss 3.07899:  53%|█████▎    | 8155/15515 [08:57<08:04, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8159: train loss 3.04634:  53%|█████▎    | 8159/15515 [08:57<08:04, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8163: train loss 3.05270:  53%|█████▎    | 8163/15515 [08:57<08:03, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8167: train loss 3.07698:  53%|█████▎    | 8167/15515 [08:58<08:01, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8171: train loss 3.02793:  53%|█████▎    | 8171/15515 [08:58<07:59, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8175: train loss 3.01031:  53%|█████▎    | 8175/15515 [08:58<07:59, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8179: train loss 3.07558:  53%|█████▎    | 8179/15515 [08:58<07:59, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8183: train loss 3.07562:  53%|█████▎    | 8183/15515 [08:59<08:03, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8187: train loss 3.02850:  53%|█████▎    | 8187/15515 [08:59<07:56, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8191: train loss 3.11159:  53%|█████▎    | 8191/15515 [08:59<07:58, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8195: train loss 3.04332:  53%|█████▎    | 8195/15515 [09:00<07:54, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8199: train loss 3.11504:  53%|█████▎    | 8199/15515 [09:00<07:56, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8203: train loss 3.07676:  53%|█████▎    | 8203/15515 [09:00<07:59, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8207: train loss 3.09661:  53%|█████▎    | 8207/15515 [09:00<07:59, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8211: train loss 2.98086:  53%|█████▎    | 8211/15515 [09:01<07:57, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8215: train loss 3.02828:  53%|█████▎    | 8215/15515 [09:01<07:56, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8219: train loss 3.09110:  53%|█████▎    | 8219/15515 [09:01<07:54, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8223: train loss 2.99333:  53%|█████▎    | 8223/15515 [09:01<07:53, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8227: train loss 3.04240:  53%|█████▎    | 8227/15515 [09:02<07:54, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8231: train loss 3.03630:  53%|█████▎    | 8231/15515 [09:02<07:55, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8235: train loss 3.02454:  53%|█████▎    | 8235/15515 [09:02<07:51, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8239: train loss 3.05773:  53%|█████▎    | 8239/15515 [09:02<07:50, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8243: train loss 3.03392:  53%|█████▎    | 8243/15515 [09:03<07:54, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8247: train loss 3.05055:  53%|█████▎    | 8247/15515 [09:03<08:01, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8250: train loss 3.01145:  53%|█████▎    | 8251/15515 [09:03<08:04, 14.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8253: train loss 3.01788:  53%|█████▎    | 8253/15515 [09:03<08:50, 13.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8256: train loss 3.06267:  53%|█████▎    | 8257/15515 [09:04<09:05, 13.29it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8259: train loss 3.05482:  53%|█████▎    | 8259/15515 [09:04<09:33, 12.65it/s]

32459 128
32459 128


epoch 0 iter 8261: train loss 3.04010:  53%|█████▎    | 8261/15515 [09:04<10:30, 11.50it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8265: train loss 3.03710:  53%|█████▎    | 8265/15515 [09:04<09:43, 12.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8269: train loss 2.99829:  53%|█████▎    | 8269/15515 [09:05<08:48, 13.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8273: train loss 3.05264:  53%|█████▎    | 8273/15515 [09:05<08:21, 14.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8277: train loss 3.06375:  53%|█████▎    | 8277/15515 [09:05<08:07, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8281: train loss 2.99248:  53%|█████▎    | 8281/15515 [09:05<07:59, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8285: train loss 2.99571:  53%|█████▎    | 8285/15515 [09:06<07:57, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8289: train loss 2.99579:  53%|█████▎    | 8289/15515 [09:06<07:55, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8293: train loss 3.08282:  53%|█████▎    | 8293/15515 [09:06<07:59, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8297: train loss 3.05400:  53%|█████▎    | 8297/15515 [09:06<07:53, 15.25it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 8299: train loss 3.02184:  53%|█████▎    | 8299/15515 [09:07<08:28, 14.19it/s]

128
32459 128
32459 128


epoch 0 iter 8302: train loss 3.09630:  54%|█████▎    | 8303/15515 [09:07<09:06, 13.19it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8305: train loss 3.05286:  54%|█████▎    | 8305/15515 [09:07<09:29, 12.66it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8309: train loss 3.01249:  54%|█████▎    | 8309/15515 [09:07<09:20, 12.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8313: train loss 3.03481:  54%|█████▎    | 8313/15515 [09:08<08:41, 13.80it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8317: train loss 3.06030:  54%|█████▎    | 8317/15515 [09:08<08:18, 14.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8321: train loss 3.06206:  54%|█████▎    | 8321/15515 [09:08<08:02, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8325: train loss 3.03401:  54%|█████▎    | 8325/15515 [09:09<07:53, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8329: train loss 2.98371:  54%|█████▎    | 8329/15515 [09:09<07:50, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8332: train loss 3.01415:  54%|█████▎    | 8333/15515 [09:09<07:48, 15.33it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8335: train loss 3.08924:  54%|█████▎    | 8335/15515 [09:09<08:31, 14.04it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8338: train loss 3.08518:  54%|█████▎    | 8339/15515 [09:09<09:03, 13.20it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8342: train loss 3.04114:  54%|█████▍    | 8343/15515 [09:10<08:49, 13.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8346: train loss 3.06508:  54%|█████▍    | 8347/15515 [09:10<08:21, 14.28it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8349: train loss 3.01346:  54%|█████▍    | 8349/15515 [09:10<08:14, 14.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8353: train loss 3.04435:  54%|█████▍    | 8353/15515 [09:10<08:07, 14.68it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8357: train loss 3.05665:  54%|█████▍    | 8357/15515 [09:11<08:01, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8361: train loss 3.00407:  54%|█████▍    | 8361/15515 [09:11<07:58, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8365: train loss 3.09293:  54%|█████▍    | 8365/15515 [09:11<07:55, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8369: train loss 3.08285:  54%|█████▍    | 8369/15515 [09:12<07:57, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8373: train loss 3.09553:  54%|█████▍    | 8373/15515 [09:12<07:52, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8377: train loss 3.05803:  54%|█████▍    | 8377/15515 [09:12<07:51, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8381: train loss 3.03208:  54%|█████▍    | 8381/15515 [09:12<07:52, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8385: train loss 3.09523:  54%|█████▍    | 8385/15515 [09:13<07:51, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8389: train loss 2.98246:  54%|█████▍    | 8389/15515 [09:13<07:46, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8393: train loss 3.05556:  54%|█████▍    | 8393/15515 [09:13<07:47, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8397: train loss 3.02404:  54%|█████▍    | 8397/15515 [09:13<07:50, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8401: train loss 2.97412:  54%|█████▍    | 8401/15515 [09:14<07:53, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8405: train loss 3.02695:  54%|█████▍    | 8405/15515 [09:14<07:47, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8409: train loss 3.06744:  54%|█████▍    | 8409/15515 [09:14<07:52, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8413: train loss 3.05582:  54%|█████▍    | 8413/15515 [09:14<07:52, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8417: train loss 3.09182:  54%|█████▍    | 8417/15515 [09:15<07:51, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8421: train loss 3.01176:  54%|█████▍    | 8421/15515 [09:15<07:46, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8425: train loss 3.02245:  54%|█████▍    | 8425/15515 [09:15<07:42, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8429: train loss 3.01444:  54%|█████▍    | 8429/15515 [09:15<07:44, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8433: train loss 3.08286:  54%|█████▍    | 8433/15515 [09:16<07:41, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8437: train loss 3.03411:  54%|█████▍    | 8437/15515 [09:16<07:39, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8441: train loss 3.03682:  54%|█████▍    | 8441/15515 [09:16<07:39, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8445: train loss 3.02980:  54%|█████▍    | 8445/15515 [09:17<07:42, 15.29it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 8448: train loss 3.02783:  54%|█████▍    | 8449/15515 [09:17<07:44, 15.23it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 8452: train loss 3.08326:  54%|█████▍    | 8453/15515 [09:17<07:40, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8456: train loss 3.02022:  55%|█████▍    | 8457/15515 [09:17<07:38, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8460: train loss 3.05425:  55%|█████▍    | 8461/15515 [09:18<07:38, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8464: train loss 3.04459:  55%|█████▍    | 8465/15515 [09:18<07:36, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8468: train loss 2.97919:  55%|█████▍    | 8469/15515 [09:18<07:45, 15.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8471: train loss 3.02607:  55%|█████▍    | 8471/15515 [09:18<07:48, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8474: train loss 3.04931:  55%|█████▍    | 8475/15515 [09:18<07:47, 15.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8478: train loss 3.06112:  55%|█████▍    | 8479/15515 [09:19<07:49, 14.99it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8481: train loss 3.03137:  55%|█████▍    | 8481/15515 [09:19<07:52, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8485: train loss 2.98967:  55%|█████▍    | 8485/15515 [09:19<07:48, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8489: train loss 3.01046:  55%|█████▍    | 8489/15515 [09:19<07:56, 14.73it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 8492: train loss 3.10502:  55%|█████▍    | 8493/15515 [09:20<07:56, 14.75it/s]

128
32459 128
32459 128


epoch 0 iter 8495: train loss 3.07227:  55%|█████▍    | 8495/15515 [09:20<07:56, 14.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8499: train loss 2.98336:  55%|█████▍    | 8499/15515 [09:20<07:53, 14.80it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8502: train loss 3.00981:  55%|█████▍    | 8503/15515 [09:20<07:50, 14.92it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8505: train loss 3.00087:  55%|█████▍    | 8505/15515 [09:21<07:54, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8509: train loss 3.05390:  55%|█████▍    | 8509/15515 [09:21<07:49, 14.92it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8513: train loss 3.02540:  55%|█████▍    | 8513/15515 [09:21<07:47, 14.99it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8516: train loss 3.05201:  55%|█████▍    | 8517/15515 [09:21<07:47, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8520: train loss 3.00953:  55%|█████▍    | 8521/15515 [09:22<07:49, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8524: train loss 3.07861:  55%|█████▍    | 8525/15515 [09:22<07:51, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8527: train loss 2.99690:  55%|█████▍    | 8527/15515 [09:22<07:51, 14.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8531: train loss 3.04325:  55%|█████▍    | 8531/15515 [09:22<07:57, 14.63it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8534: train loss 2.99285:  55%|█████▌    | 8535/15515 [09:22<07:52, 14.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8537: train loss 3.02045:  55%|█████▌    | 8537/15515 [09:23<07:44, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8541: train loss 3.02649:  55%|█████▌    | 8541/15515 [09:23<07:36, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8545: train loss 3.10539:  55%|█████▌    | 8545/15515 [09:23<07:30, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8549: train loss 3.06708:  55%|█████▌    | 8549/15515 [09:23<07:36, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8553: train loss 2.97345:  55%|█████▌    | 8553/15515 [09:24<07:37, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8557: train loss 3.06794:  55%|█████▌    | 8557/15515 [09:24<07:36, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8561: train loss 3.03236:  55%|█████▌    | 8561/15515 [09:24<07:38, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8565: train loss 3.06264:  55%|█████▌    | 8565/15515 [09:25<07:37, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8569: train loss 3.12081:  55%|█████▌    | 8569/15515 [09:25<07:36, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8573: train loss 3.02211:  55%|█████▌    | 8573/15515 [09:25<07:37, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8577: train loss 2.98344:  55%|█████▌    | 8577/15515 [09:25<07:34, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8581: train loss 3.08559:  55%|█████▌    | 8581/15515 [09:26<07:32, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8585: train loss 3.05474:  55%|█████▌    | 8585/15515 [09:26<07:32, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8589: train loss 3.06464:  55%|█████▌    | 8589/15515 [09:26<07:34, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8593: train loss 3.00121:  55%|█████▌    | 8593/15515 [09:26<07:33, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8597: train loss 3.07949:  55%|█████▌    | 8597/15515 [09:27<07:26, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8601: train loss 3.02635:  55%|█████▌    | 8601/15515 [09:27<07:28, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8605: train loss 3.01783:  55%|█████▌    | 8605/15515 [09:27<07:35, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8609: train loss 3.02855:  55%|█████▌    | 8609/15515 [09:27<07:39, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8613: train loss 3.07468:  56%|█████▌    | 8613/15515 [09:28<07:39, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8617: train loss 3.08267:  56%|█████▌    | 8617/15515 [09:28<07:36, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8621: train loss 3.04588:  56%|█████▌    | 8621/15515 [09:28<07:31, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8625: train loss 3.10215:  56%|█████▌    | 8625/15515 [09:28<07:28, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8629: train loss 3.03228:  56%|█████▌    | 8629/15515 [09:29<07:27, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8633: train loss 3.07391:  56%|█████▌    | 8633/15515 [09:29<07:26, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8637: train loss 3.06159:  56%|█████▌    | 8637/15515 [09:29<07:25, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8641: train loss 3.02063:  56%|█████▌    | 8641/15515 [09:29<07:26, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8645: train loss 2.94361:  56%|█████▌    | 8645/15515 [09:30<07:22, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8649: train loss 2.92570:  56%|█████▌    | 8649/15515 [09:30<07:20, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8653: train loss 3.00979:  56%|█████▌    | 8653/15515 [09:30<07:28, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8657: train loss 3.08656:  56%|█████▌    | 8657/15515 [09:31<07:32, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8661: train loss 3.06102:  56%|█████▌    | 8661/15515 [09:31<07:28, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8665: train loss 3.06723:  56%|█████▌    | 8665/15515 [09:31<07:26, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8669: train loss 3.04549:  56%|█████▌    | 8669/15515 [09:31<07:26, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8673: train loss 3.04006:  56%|█████▌    | 8673/15515 [09:32<07:22, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8677: train loss 2.99784:  56%|█████▌    | 8677/15515 [09:32<07:21, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8681: train loss 3.01427:  56%|█████▌    | 8681/15515 [09:32<07:20, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8685: train loss 3.04652:  56%|█████▌    | 8685/15515 [09:32<07:20, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8689: train loss 3.04267:  56%|█████▌    | 8689/15515 [09:33<07:25, 15.32it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 8692: train loss 3.01685:  56%|█████▌    | 8693/15515 [09:33<07:33, 15.05it/s]

 128
32459 128
32459 128


epoch 0 iter 8695: train loss 3.04650:  56%|█████▌    | 8695/15515 [09:33<07:31, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8699: train loss 2.99470:  56%|█████▌    | 8699/15515 [09:33<07:38, 14.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8702: train loss 3.01447:  56%|█████▌    | 8703/15515 [09:33<07:41, 14.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8705: train loss 3.03206:  56%|█████▌    | 8705/15515 [09:34<07:42, 14.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8709: train loss 2.99219:  56%|█████▌    | 8709/15515 [09:34<07:36, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8713: train loss 2.98446:  56%|█████▌    | 8713/15515 [09:34<07:26, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8716: train loss 3.01128:  56%|█████▌    | 8717/15515 [09:34<07:29, 15.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8720: train loss 2.98539:  56%|█████▌    | 8721/15515 [09:35<07:39, 14.79it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8723: train loss 2.97419:  56%|█████▌    | 8723/15515 [09:35<07:40, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8727: train loss 3.00270:  56%|█████▌    | 8727/15515 [09:35<07:36, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8731: train loss 3.00892:  56%|█████▋    | 8731/15515 [09:35<07:31, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8735: train loss 3.02548:  56%|█████▋    | 8735/15515 [09:36<07:30, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8739: train loss 3.09619:  56%|█████▋    | 8739/15515 [09:36<07:27, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8743: train loss 3.04276:  56%|█████▋    | 8743/15515 [09:36<07:22, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8747: train loss 3.03928:  56%|█████▋    | 8747/15515 [09:36<07:20, 15.36it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 8750: train loss 2.99029:  56%|█████▋    | 8751/15515 [09:37<07:25, 15.17it/s]


32459 128
32459 128
32459 128


epoch 0 iter 8754: train loss 3.03846:  56%|█████▋    | 8755/15515 [09:37<07:27, 15.09it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 8757: train loss 3.06895:  56%|█████▋    | 8757/15515 [09:37<07:28, 15.08it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 8761: train loss 3.05358:  56%|█████▋    | 8761/15515 [09:37<07:30, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8765: train loss 3.05058:  56%|█████▋    | 8765/15515 [09:38<07:32, 14.91it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 8768: train loss 3.00929:  57%|█████▋    | 8769/15515 [09:38<07:31, 14.94it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 8772: train loss 3.01196:  57%|█████▋    | 8773/15515 [09:38<07:26, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8776: train loss 3.02085:  57%|█████▋    | 8777/15515 [09:38<07:22, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8780: train loss 3.08356:  57%|█████▋    | 8781/15515 [09:39<07:25, 15.11it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 8783: train loss 3.01158:  57%|█████▋    | 8783/15515 [09:39<07:25, 15.12it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 8787: train loss 3.05430:  57%|█████▋    | 8787/15515 [09:39<07:22, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8791: train loss 3.01130:  57%|█████▋    | 8791/15515 [09:39<07:20, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8795: train loss 2.93711:  57%|█████▋    | 8795/15515 [09:40<07:23, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8799: train loss 2.99365:  57%|█████▋    | 8799/15515 [09:40<07:24, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8803: train loss 3.03032:  57%|█████▋    | 8803/15515 [09:40<07:23, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8807: train loss 3.01396:  57%|█████▋    | 8807/15515 [09:40<07:25, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8811: train loss 3.07904:  57%|█████▋    | 8811/15515 [09:41<07:28, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8815: train loss 3.08733:  57%|█████▋    | 8815/15515 [09:41<07:28, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8819: train loss 3.03707:  57%|█████▋    | 8819/15515 [09:41<07:26, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8823: train loss 3.02851:  57%|█████▋    | 8823/15515 [09:42<07:25, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8827: train loss 2.97187:  57%|█████▋    | 8827/15515 [09:42<07:24, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8831: train loss 2.99413:  57%|█████▋    | 8831/15515 [09:42<07:23, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8835: train loss 2.99737:  57%|█████▋    | 8835/15515 [09:42<07:17, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8839: train loss 2.96372:  57%|█████▋    | 8839/15515 [09:43<07:14, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8843: train loss 3.01533:  57%|█████▋    | 8843/15515 [09:43<07:10, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8847: train loss 3.07431:  57%|█████▋    | 8847/15515 [09:43<07:11, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8851: train loss 3.00216:  57%|█████▋    | 8851/15515 [09:43<07:12, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8855: train loss 3.01860:  57%|█████▋    | 8855/15515 [09:44<07:10, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8859: train loss 3.07856:  57%|█████▋    | 8859/15515 [09:44<07:09, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8863: train loss 3.02793:  57%|█████▋    | 8863/15515 [09:44<07:14, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8867: train loss 2.98726:  57%|█████▋    | 8867/15515 [09:44<07:15, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8871: train loss 3.07106:  57%|█████▋    | 8871/15515 [09:45<07:15, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8875: train loss 3.07048:  57%|█████▋    | 8875/15515 [09:45<07:17, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8879: train loss 3.00399:  57%|█████▋    | 8879/15515 [09:45<07:15, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8883: train loss 3.01484:  57%|█████▋    | 8883/15515 [09:45<07:17, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8887: train loss 2.99525:  57%|█████▋    | 8887/15515 [09:46<07:17, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8891: train loss 3.05207:  57%|█████▋    | 8891/15515 [09:46<07:17, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8895: train loss 3.05998:  57%|█████▋    | 8895/15515 [09:46<07:12, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8899: train loss 3.00519:  57%|█████▋    | 8899/15515 [09:46<07:14, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8903: train loss 3.03696:  57%|█████▋    | 8903/15515 [09:47<07:13, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8907: train loss 3.08302:  57%|█████▋    | 8907/15515 [09:47<07:11, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8911: train loss 2.99552:  57%|█████▋    | 8911/15515 [09:47<07:09, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8915: train loss 3.02993:  57%|█████▋    | 8915/15515 [09:48<07:14, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8919: train loss 3.04491:  57%|█████▋    | 8919/15515 [09:48<07:21, 14.93it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8923: train loss 3.08532:  58%|█████▊    | 8923/15515 [09:48<07:22, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8927: train loss 3.02164:  58%|█████▊    | 8927/15515 [09:48<07:16, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8931: train loss 3.07656:  58%|█████▊    | 8931/15515 [09:49<07:16, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8935: train loss 3.04350:  58%|█████▊    | 8935/15515 [09:49<07:19, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8939: train loss 3.05443:  58%|█████▊    | 8939/15515 [09:49<07:21, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8943: train loss 2.93407:  58%|█████▊    | 8943/15515 [09:49<07:20, 14.91it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8946: train loss 3.01177:  58%|█████▊    | 8947/15515 [09:50<07:27, 14.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8949: train loss 2.99568:  58%|█████▊    | 8949/15515 [09:50<07:22, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8953: train loss 3.02871:  58%|█████▊    | 8953/15515 [09:50<07:16, 15.04it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8956: train loss 3.06058:  58%|█████▊    | 8957/15515 [09:50<07:17, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8960: train loss 3.06751:  58%|█████▊    | 8961/15515 [09:51<07:16, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8964: train loss 3.04294:  58%|█████▊    | 8965/15515 [09:51<07:08, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8968: train loss 3.05369:  58%|█████▊    | 8969/15515 [09:51<07:07, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8972: train loss 3.02024:  58%|█████▊    | 8973/15515 [09:51<07:07, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8976: train loss 3.00675:  58%|█████▊    | 8977/15515 [09:52<07:09, 15.23it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8979: train loss 3.05746:  58%|█████▊    | 8979/15515 [09:52<07:12, 15.10it/s]

32459 128
32459 128
32459 128


epoch 0 iter 8982: train loss 3.05644:  58%|█████▊    | 8983/15515 [09:52<07:16, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8986: train loss 3.02718:  58%|█████▊    | 8987/15515 [09:52<07:09, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8990: train loss 3.07603:  58%|█████▊    | 8991/15515 [09:53<07:07, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8994: train loss 3.02462:  58%|█████▊    | 8995/15515 [09:53<07:03, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 8998: train loss 3.05877:  58%|█████▊    | 8999/15515 [09:53<07:01, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9002: train loss 3.06507:  58%|█████▊    | 9003/15515 [09:53<07:04, 15.33it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 9005: train loss 2.98641:  58%|█████▊    | 9005/15515 [09:54<07:13, 15.01it/s]


32459 128
32459 128
32459

epoch 0 iter 9008: train loss 2.98838:  58%|█████▊    | 9009/15515 [09:54<07:11, 15.09it/s]

 128
32459 128
32459 128
32459 128

epoch 0 iter 9011: train loss 3.06769:  58%|█████▊    | 9011/15515 [09:54<07:11, 15.06it/s]


32459 128
32459 128
32459 128


epoch 0 iter 9015: train loss 2.93088:  58%|█████▊    | 9015/15515 [09:54<07:09, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9019: train loss 2.96025:  58%|█████▊    | 9019/15515 [09:54<07:11, 15.05it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9022: train loss 3.01099:  58%|█████▊    | 9023/15515 [09:55<07:12, 15.01it/s]

 128
32459 128
32459 128


epoch 0 iter 9025: train loss 3.04413:  58%|█████▊    | 9025/15515 [09:55<07:10, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9029: train loss 3.04406:  58%|█████▊    | 9029/15515 [09:55<07:06, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9033: train loss 3.08882:  58%|█████▊    | 9033/15515 [09:55<07:08, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9037: train loss 2.93693:  58%|█████▊    | 9037/15515 [09:56<07:10, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9041: train loss 2.95330:  58%|█████▊    | 9041/15515 [09:56<07:11, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9045: train loss 3.00991:  58%|█████▊    | 9045/15515 [09:56<07:07, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9049: train loss 3.03411:  58%|█████▊    | 9049/15515 [09:56<07:09, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9053: train loss 3.00643:  58%|█████▊    | 9053/15515 [09:57<07:08, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9057: train loss 3.04636:  58%|█████▊    | 9057/15515 [09:57<07:08, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9061: train loss 3.03266:  58%|█████▊    | 9061/15515 [09:57<07:07, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9065: train loss 3.01305:  58%|█████▊    | 9065/15515 [09:58<07:12, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9069: train loss 3.06557:  58%|█████▊    | 9069/15515 [09:58<07:07, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9073: train loss 3.08837:  58%|█████▊    | 9073/15515 [09:58<07:08, 15.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9076: train loss 3.00677:  59%|█████▊    | 9077/15515 [09:58<07:12, 14.89it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9079: train loss 3.01479:  59%|█████▊    | 9079/15515 [09:58<07:14, 14.82it/s]

 128
32459 128
32459 128


epoch 0 iter 9082: train loss 3.02842:  59%|█████▊    | 9083/15515 [09:59<07:11, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9086: train loss 3.05514:  59%|█████▊    | 9087/15515 [09:59<07:06, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9089: train loss 2.98096:  59%|█████▊    | 9089/15515 [09:59<07:11, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9093: train loss 3.03695:  59%|█████▊    | 9093/15515 [09:59<07:05, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9097: train loss 3.03423:  59%|█████▊    | 9097/15515 [10:00<07:05, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9101: train loss 3.03118:  59%|█████▊    | 9101/15515 [10:00<07:04, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9105: train loss 3.03108:  59%|█████▊    | 9105/15515 [10:00<07:01, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9109: train loss 3.06524:  59%|█████▊    | 9109/15515 [10:00<07:05, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9113: train loss 3.02287:  59%|█████▊    | 9113/15515 [10:01<07:00, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9117: train loss 2.99595:  59%|█████▉    | 9117/15515 [10:01<06:59, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9121: train loss 3.05960:  59%|█████▉    | 9121/15515 [10:01<07:02, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9125: train loss 2.99985:  59%|█████▉    | 9125/15515 [10:01<06:57, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9129: train loss 3.04073:  59%|█████▉    | 9129/15515 [10:02<06:54, 15.42it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 9132: train loss 3.00444:  59%|█████▉    | 9133/15515 [10:02<06:58, 15.23it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 9136: train loss 3.07414:  59%|█████▉    | 9137/15515 [10:02<07:03, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9139: train loss 3.01919:  59%|█████▉    | 9139/15515 [10:02<07:01, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9143: train loss 3.04063:  59%|█████▉    | 9143/15515 [10:03<07:02, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9147: train loss 3.04831:  59%|█████▉    | 9147/15515 [10:03<06:58, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9151: train loss 3.02842:  59%|█████▉    | 9151/15515 [10:03<06:50, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9155: train loss 3.06963:  59%|█████▉    | 9155/15515 [10:03<06:53, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9159: train loss 3.02069:  59%|█████▉    | 9159/15515 [10:04<06:55, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9163: train loss 3.00925:  59%|█████▉    | 9163/15515 [10:04<06:53, 15.36it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9166: train loss 3.00550:  59%|█████▉    | 9167/15515 [10:04<06:58, 15.16it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 9170: train loss 3.05209:  59%|█████▉    | 9171/15515 [10:04<07:00, 15.10it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 9173: train loss 3.00770:  59%|█████▉    | 9173/15515 [10:05<07:01, 15.03it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 9177: train loss 2.97207:  59%|█████▉    | 9177/15515 [10:05<07:02, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9181: train loss 2.98480:  59%|█████▉    | 9181/15515 [10:05<07:00, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9185: train loss 3.01533:  59%|█████▉    | 9185/15515 [10:05<07:03, 14.96it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9188: train loss 2.99290:  59%|█████▉    | 9189/15515 [10:06<07:02, 14.96it/s]

 128
32459 128
32459 128


epoch 0 iter 9191: train loss 3.03195:  59%|█████▉    | 9191/15515 [10:06<07:06, 14.81it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9194: train loss 3.04948:  59%|█████▉    | 9195/15515 [10:06<06:59, 15.05it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 9198: train loss 3.06514:  59%|█████▉    | 9199/15515 [10:06<07:07, 14.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9201: train loss 2.97642:  59%|█████▉    | 9201/15515 [10:07<07:06, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9204: train loss 2.99444:  59%|█████▉    | 9205/15515 [10:07<07:06, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9208: train loss 3.07106:  59%|█████▉    | 9209/15515 [10:07<07:05, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9211: train loss 2.98432:  59%|█████▉    | 9211/15515 [10:07<07:03, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9215: train loss 2.98689:  59%|█████▉    | 9215/15515 [10:07<07:00, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9219: train loss 2.99288:  59%|█████▉    | 9219/15515 [10:08<06:58, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9223: train loss 2.98752:  59%|█████▉    | 9223/15515 [10:08<06:55, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9227: train loss 3.01574:  59%|█████▉    | 9227/15515 [10:08<06:55, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9231: train loss 3.07915:  59%|█████▉    | 9231/15515 [10:09<06:53, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9235: train loss 3.05493:  60%|█████▉    | 9235/15515 [10:09<06:51, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9239: train loss 3.02020:  60%|█████▉    | 9239/15515 [10:09<06:55, 15.10it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9242: train loss 3.00643:  60%|█████▉    | 9243/15515 [10:09<06:58, 14.98it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 9245: train loss 3.03639:  60%|█████▉    | 9245/15515 [10:09<06:57, 15.01it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 9249: train loss 2.96701:  60%|█████▉    | 9249/15515 [10:10<06:59, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9253: train loss 2.99325:  60%|█████▉    | 9253/15515 [10:10<07:00, 14.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9256: train loss 2.99132:  60%|█████▉    | 9257/15515 [10:10<07:00, 14.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9259: train loss 2.97096:  60%|█████▉    | 9259/15515 [10:10<07:01, 14.86it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9262: train loss 3.04082:  60%|█████▉    | 9263/15515 [10:11<07:05, 14.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9265: train loss 3.04019:  60%|█████▉    | 9265/15515 [10:11<07:02, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9269: train loss 2.97140:  60%|█████▉    | 9269/15515 [10:11<06:57, 14.97it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9272: train loss 3.02552:  60%|█████▉    | 9273/15515 [10:11<06:59, 14.87it/s]

 128
32459 128
32459 128
32459

epoch 0 iter 9275: train loss 3.03249:  60%|█████▉    | 9275/15515 [10:11<06:56, 14.99it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 9279: train loss 3.00665:  60%|█████▉    | 9279/15515 [10:12<06:50, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9283: train loss 3.01817:  60%|█████▉    | 9283/15515 [10:12<06:46, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9287: train loss 3.04225:  60%|█████▉    | 9287/15515 [10:12<06:48, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9291: train loss 3.01333:  60%|█████▉    | 9291/15515 [10:12<06:50, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9295: train loss 3.05551:  60%|█████▉    | 9295/15515 [10:13<06:47, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9299: train loss 2.98223:  60%|█████▉    | 9299/15515 [10:13<06:43, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9303: train loss 3.08663:  60%|█████▉    | 9303/15515 [10:13<06:46, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9307: train loss 2.99890:  60%|█████▉    | 9307/15515 [10:14<06:45, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9311: train loss 3.02350:  60%|██████    | 9311/15515 [10:14<06:42, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9315: train loss 2.99368:  60%|██████    | 9315/15515 [10:14<06:47, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9319: train loss 2.91325:  60%|██████    | 9319/15515 [10:14<06:46, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9323: train loss 3.00429:  60%|██████    | 9323/15515 [10:15<06:49, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9327: train loss 3.03234:  60%|██████    | 9327/15515 [10:15<06:43, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9331: train loss 2.99113:  60%|██████    | 9331/15515 [10:15<06:41, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9335: train loss 3.01199:  60%|██████    | 9335/15515 [10:15<06:38, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9339: train loss 3.09061:  60%|██████    | 9339/15515 [10:16<06:43, 15.31it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9342: train loss 3.08605:  60%|██████    | 9343/15515 [10:16<06:49, 15.06it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 9345: train loss 2.99280:  60%|██████    | 9345/15515 [10:16<06:50, 15.02it/s]


32459 128
32459 128
32459 128


epoch 0 iter 9349: train loss 2.95991:  60%|██████    | 9349/15515 [10:16<06:54, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9353: train loss 3.04194:  60%|██████    | 9353/15515 [10:17<06:49, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9357: train loss 3.03845:  60%|██████    | 9357/15515 [10:17<06:48, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9360: train loss 2.99116:  60%|██████    | 9361/15515 [10:17<07:01, 14.60it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9363: train loss 2.95530:  60%|██████    | 9363/15515 [10:17<07:27, 13.74it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9366: train loss 2.95894:  60%|██████    | 9367/15515 [10:18<08:07, 12.61it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9369: train loss 2.98443:  60%|██████    | 9369/15515 [10:18<08:32, 12.00it/s]

32459 128
32459 128
32459

epoch 0 iter 9371: train loss 3.08185:  60%|██████    | 9371/15515 [10:18<08:58, 11.41it/s]

 128
32459 128
32459 128


epoch 0 iter 9375: train loss 3.07439:  60%|██████    | 9375/15515 [10:18<08:02, 12.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9379: train loss 2.93142:  60%|██████    | 9379/15515 [10:19<07:27, 13.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9383: train loss 2.99975:  60%|██████    | 9383/15515 [10:19<07:03, 14.47it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9385: train loss 2.99982:  60%|██████    | 9385/15515 [10:19<07:02, 14.51it/s]

 128
32459 128
32459 128


epoch 0 iter 9388: train loss 2.96188:  61%|██████    | 9389/15515 [10:19<07:59, 12.79it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9391: train loss 3.01246:  61%|██████    | 9391/15515 [10:19<07:45, 13.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9394: train loss 3.00001:  61%|██████    | 9395/15515 [10:20<08:12, 12.42it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9398: train loss 3.02980:  61%|██████    | 9399/15515 [10:20<07:54, 12.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9400: train loss 3.00282:  61%|██████    | 9401/15515 [10:20<08:02, 12.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9403: train loss 2.96344:  61%|██████    | 9403/15515 [10:20<08:19, 12.24it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9406: train loss 3.01322:  61%|██████    | 9407/15515 [10:21<08:34, 11.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9410: train loss 3.00686:  61%|██████    | 9411/15515 [10:21<08:14, 12.34it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9412: train loss 3.00744:  61%|██████    | 9413/15515 [10:21<08:18, 12.25it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9415: train loss 3.00219:  61%|██████    | 9415/15515 [10:21<08:31, 11.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9418: train loss 2.98269:  61%|██████    | 9419/15515 [10:22<08:37, 11.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9421: train loss 3.02276:  61%|██████    | 9421/15515 [10:22<08:21, 12.14it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9424: train loss 2.93974:  61%|██████    | 9425/15515 [10:22<09:57, 10.19it/s]

32459 128
32459 128


epoch 0 iter 9426: train loss 3.03951:  61%|██████    | 9427/15515 [10:23<10:26,  9.71it/s]

32459 128
32459 128


epoch 0 iter 9428: train loss 2.96362:  61%|██████    | 9428/15515 [10:23<10:35,  9.58it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9431: train loss 2.98698:  61%|██████    | 9432/15515 [10:23<09:52, 10.26it/s]

32459 128
32459 128


epoch 0 iter 9433: train loss 2.97683:  61%|██████    | 9434/15515 [10:23<10:23,  9.76it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9437: train loss 3.00003:  61%|██████    | 9438/15515 [10:24<08:30, 11.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9439: train loss 2.96072:  61%|██████    | 9440/15515 [10:24<08:34, 11.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9442: train loss 2.89457:  61%|██████    | 9442/15515 [10:24<08:26, 11.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9445: train loss 3.02780:  61%|██████    | 9446/15515 [10:24<08:33, 11.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9449: train loss 3.03050:  61%|██████    | 9450/15515 [10:24<07:39, 13.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9453: train loss 3.04948:  61%|██████    | 9454/15515 [10:25<07:03, 14.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9457: train loss 3.04994:  61%|██████    | 9458/15515 [10:25<06:47, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9460: train loss 3.07042:  61%|██████    | 9460/15515 [10:25<06:48, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9463: train loss 2.98434:  61%|██████    | 9464/15515 [10:25<07:15, 13.88it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9466: train loss 3.05703:  61%|██████    | 9466/15515 [10:26<07:45, 12.99it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9469: train loss 2.97060:  61%|██████    | 9470/15515 [10:26<07:45, 12.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9472: train loss 2.96045:  61%|██████    | 9472/15515 [10:26<07:52, 12.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9476: train loss 3.05711:  61%|██████    | 9476/15515 [10:26<07:39, 13.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9480: train loss 2.97956:  61%|██████    | 9480/15515 [10:27<07:01, 14.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9484: train loss 2.98105:  61%|██████    | 9484/15515 [10:27<06:44, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9488: train loss 3.02167:  61%|██████    | 9488/15515 [10:27<06:41, 15.02it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9490: train loss 3.03109:  61%|██████    | 9490/15515 [10:27<06:41, 14.99it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9493: train loss 2.99855:  61%|██████    | 9494/15515 [10:28<07:30, 13.35it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9496: train loss 2.95476:  61%|██████    | 9496/15515 [10:28<07:44, 12.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9500: train loss 2.95522:  61%|██████    | 9500/15515 [10:28<07:44, 12.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9504: train loss 3.02729:  61%|██████▏   | 9504/15515 [10:28<07:07, 14.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9507: train loss 3.00971:  61%|██████▏   | 9508/15515 [10:29<07:09, 13.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9510: train loss 3.00136:  61%|██████▏   | 9510/15515 [10:29<07:19, 13.65it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9514: train loss 3.01985:  61%|██████▏   | 9514/15515 [10:29<07:30, 13.32it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9517: train loss 3.00005:  61%|██████▏   | 9518/15515 [10:29<07:17, 13.72it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9520: train loss 2.98029:  61%|██████▏   | 9520/15515 [10:30<07:04, 14.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9524: train loss 3.02359:  61%|██████▏   | 9524/15515 [10:30<06:49, 14.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9528: train loss 2.97578:  61%|██████▏   | 9528/15515 [10:30<06:39, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9532: train loss 3.03781:  61%|██████▏   | 9532/15515 [10:30<06:37, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9536: train loss 2.98348:  61%|██████▏   | 9536/15515 [10:31<06:36, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9540: train loss 2.99274:  61%|██████▏   | 9540/15515 [10:31<06:34, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9544: train loss 2.98706:  62%|██████▏   | 9544/15515 [10:31<06:33, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9548: train loss 3.02652:  62%|██████▏   | 9548/15515 [10:31<06:38, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9552: train loss 3.09422:  62%|██████▏   | 9552/15515 [10:32<06:36, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9556: train loss 3.00352:  62%|██████▏   | 9556/15515 [10:32<06:32, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9560: train loss 2.97628:  62%|██████▏   | 9560/15515 [10:32<06:35, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9564: train loss 3.00152:  62%|██████▏   | 9564/15515 [10:32<06:32, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9568: train loss 2.99241:  62%|██████▏   | 9568/15515 [10:33<06:36, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9572: train loss 2.98888:  62%|██████▏   | 9572/15515 [10:33<06:35, 15.01it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 9575: train loss 3.03612:  62%|██████▏   | 9576/15515 [10:33<06:36, 14.99it/s]


32459 128
32459 128
32459 128


epoch 0 iter 9579: train loss 3.04160:  62%|██████▏   | 9580/15515 [10:33<06:40, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9582: train loss 2.99816:  62%|██████▏   | 9582/15515 [10:34<06:43, 14.71it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9585: train loss 3.04972:  62%|██████▏   | 9586/15515 [10:34<06:48, 14.52it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9588: train loss 2.99919:  62%|██████▏   | 9588/15515 [10:34<06:45, 14.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9592: train loss 3.04333:  62%|██████▏   | 9592/15515 [10:34<06:42, 14.73it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9595: train loss 2.94147:  62%|██████▏   | 9596/15515 [10:35<06:39, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9598: train loss 3.02467:  62%|██████▏   | 9598/15515 [10:35<06:42, 14.71it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9601: train loss 2.97939:  62%|██████▏   | 9602/15515 [10:35<06:42, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9603: train loss 3.03847:  62%|██████▏   | 9604/15515 [10:35<06:41, 14.72it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9607: train loss 3.03395:  62%|██████▏   | 9608/15515 [10:35<06:48, 14.45it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9609: train loss 3.04803:  62%|██████▏   | 9610/15515 [10:36<06:57, 14.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9612: train loss 3.05447:  62%|██████▏   | 9612/15515 [10:36<06:52, 14.31it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9616: train loss 3.04990:  62%|██████▏   | 9616/15515 [10:36<06:48, 14.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9620: train loss 3.07047:  62%|██████▏   | 9620/15515 [10:36<06:50, 14.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9624: train loss 2.98807:  62%|██████▏   | 9624/15515 [10:37<06:52, 14.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9628: train loss 2.98393:  62%|██████▏   | 9628/15515 [10:37<06:42, 14.64it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 9630: train loss 3.04233:  62%|██████▏   | 9630/15515 [10:37<06:33, 14.97it/s]


32459 128
32459 128


epoch 0 iter 9634: train loss 3.02260:  62%|██████▏   | 9634/15515 [10:37<06:44, 14.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9638: train loss 2.99266:  62%|██████▏   | 9638/15515 [10:38<06:38, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9641: train loss 2.99610:  62%|██████▏   | 9642/15515 [10:38<06:49, 14.34it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9645: train loss 2.92305:  62%|██████▏   | 9646/15515 [10:38<06:58, 14.01it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9648: train loss 3.04203:  62%|██████▏   | 9648/15515 [10:38<06:45, 14.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9652: train loss 3.00529:  62%|██████▏   | 9652/15515 [10:39<06:32, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9656: train loss 2.98212:  62%|██████▏   | 9656/15515 [10:39<06:28, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9660: train loss 2.98421:  62%|██████▏   | 9660/15515 [10:39<06:29, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9664: train loss 2.94040:  62%|██████▏   | 9664/15515 [10:39<06:32, 14.92it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9667: train loss 2.94710:  62%|██████▏   | 9668/15515 [10:40<06:29, 15.01it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 9671: train loss 2.97029:  62%|██████▏   | 9672/15515 [10:40<06:19, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9675: train loss 3.02104:  62%|██████▏   | 9676/15515 [10:40<06:22, 15.25it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9678: train loss 3.05205:  62%|██████▏   | 9678/15515 [10:40<06:24, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9682: train loss 3.02373:  62%|██████▏   | 9682/15515 [10:40<06:24, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9686: train loss 2.98597:  62%|██████▏   | 9686/15515 [10:41<06:25, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9690: train loss 2.96929:  62%|██████▏   | 9690/15515 [10:41<06:23, 15.17it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9693: train loss 3.02221:  62%|██████▏   | 9694/15515 [10:41<06:29, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9697: train loss 2.97669:  63%|██████▎   | 9698/15515 [10:41<06:30, 14.91it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 9700: train loss 3.01067:  63%|██████▎   | 9700/15515 [10:42<06:26, 15.04it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 9704: train loss 3.00759:  63%|██████▎   | 9704/15515 [10:42<06:23, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9708: train loss 3.07873:  63%|██████▎   | 9708/15515 [10:42<06:25, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9712: train loss 3.08538:  63%|██████▎   | 9712/15515 [10:42<06:24, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9716: train loss 2.99051:  63%|██████▎   | 9716/15515 [10:43<06:23, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9720: train loss 2.95644:  63%|██████▎   | 9720/15515 [10:43<06:24, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9724: train loss 2.99173:  63%|██████▎   | 9724/15515 [10:43<06:22, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9728: train loss 3.02865:  63%|██████▎   | 9728/15515 [10:44<06:20, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9732: train loss 3.03244:  63%|██████▎   | 9732/15515 [10:44<06:17, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9736: train loss 2.97753:  63%|██████▎   | 9736/15515 [10:44<06:21, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9740: train loss 3.04842:  63%|██████▎   | 9740/15515 [10:44<06:23, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9744: train loss 3.07363:  63%|██████▎   | 9744/15515 [10:45<06:20, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9748: train loss 2.94682:  63%|██████▎   | 9748/15515 [10:45<06:21, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9752: train loss 2.98127:  63%|██████▎   | 9752/15515 [10:45<06:17, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9756: train loss 2.95831:  63%|██████▎   | 9756/15515 [10:45<06:18, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9760: train loss 3.02792:  63%|██████▎   | 9760/15515 [10:46<06:22, 15.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9763: train loss 2.98187:  63%|██████▎   | 9764/15515 [10:46<06:24, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9766: train loss 3.02692:  63%|██████▎   | 9766/15515 [10:46<06:23, 15.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9770: train loss 2.98094:  63%|██████▎   | 9770/15515 [10:46<06:21, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9774: train loss 3.00244:  63%|██████▎   | 9774/15515 [10:47<06:21, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9778: train loss 2.98781:  63%|██████▎   | 9778/15515 [10:47<06:18, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9782: train loss 2.97234:  63%|██████▎   | 9782/15515 [10:47<06:18, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9786: train loss 2.99351:  63%|██████▎   | 9786/15515 [10:47<06:16, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9790: train loss 2.99916:  63%|██████▎   | 9790/15515 [10:48<06:18, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9794: train loss 2.99799:  63%|██████▎   | 9794/15515 [10:48<06:12, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9798: train loss 2.94923:  63%|██████▎   | 9798/15515 [10:48<06:25, 14.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9802: train loss 2.92978:  63%|██████▎   | 9802/15515 [10:48<06:22, 14.93it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9806: train loss 3.02188:  63%|██████▎   | 9806/15515 [10:49<06:24, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9808: train loss 3.04810:  63%|██████▎   | 9808/15515 [10:49<06:30, 14.62it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9812: train loss 2.95040:  63%|██████▎   | 9812/15515 [10:49<06:50, 13.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9815: train loss 2.99890:  63%|██████▎   | 9814/15515 [10:49<06:45, 14.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9817: train loss 2.97043:  63%|██████▎   | 9818/15515 [10:50<07:01, 13.51it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9821: train loss 2.95737:  63%|██████▎   | 9820/15515 [10:50<07:21, 12.91it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9824: train loss 3.01958:  63%|██████▎   | 9824/15515 [10:50<06:55, 13.71it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9827: train loss 2.97869:  63%|██████▎   | 9828/15515 [10:50<06:37, 14.31it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9830: train loss 3.01840:  63%|██████▎   | 9830/15515 [10:50<06:36, 14.35it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9832: train loss 3.01916:  63%|██████▎   | 9832/15515 [10:51<06:38, 14.28it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9836: train loss 3.05607:  63%|██████▎   | 9836/15515 [10:51<06:46, 13.97it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9839: train loss 3.04610:  63%|██████▎   | 9838/15515 [10:51<06:34, 14.38it/s]

 128
32459 128
32459 128


epoch 0 iter 9842: train loss 3.00036:  63%|██████▎   | 9842/15515 [10:51<06:43, 14.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9845: train loss 3.03976:  63%|██████▎   | 9846/15515 [10:52<06:41, 14.12it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9849: train loss 2.98455:  63%|██████▎   | 9850/15515 [10:52<06:25, 14.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9852: train loss 2.96445:  63%|██████▎   | 9852/15515 [10:52<06:42, 14.06it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9854: train loss 3.03378:  64%|██████▎   | 9854/15515 [10:52<06:34, 14.34it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9858: train loss 3.02873:  64%|██████▎   | 9858/15515 [10:52<06:35, 14.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9862: train loss 2.98937:  64%|██████▎   | 9862/15515 [10:53<06:29, 14.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9866: train loss 3.04128:  64%|██████▎   | 9866/15515 [10:53<06:25, 14.67it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9868: train loss 2.98690:  64%|██████▎   | 9868/15515 [10:53<06:44, 13.96it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9872: train loss 2.97918:  64%|██████▎   | 9872/15515 [10:53<06:37, 14.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9876: train loss 3.00401:  64%|██████▎   | 9876/15515 [10:54<06:19, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9880: train loss 2.91125:  64%|██████▎   | 9880/15515 [10:54<06:10, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9884: train loss 3.01789:  64%|██████▎   | 9884/15515 [10:54<06:08, 15.30it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9887: train loss 3.00344:  64%|██████▎   | 9888/15515 [10:54<06:12, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9891: train loss 3.03948:  64%|██████▍   | 9892/15515 [10:55<06:12, 15.10it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9894: train loss 2.99023:  64%|██████▍   | 9894/15515 [10:55<06:14, 15.00it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 9897: train loss 3.01003:  64%|██████▍   | 9898/15515 [10:55<06:15, 14.96it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9900: train loss 3.02977:  64%|██████▍   | 9900/15515 [10:55<06:31, 14.33it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9904: train loss 2.95773:  64%|██████▍   | 9904/15515 [10:56<06:41, 13.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9908: train loss 2.98425:  64%|██████▍   | 9908/15515 [10:56<06:28, 14.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9911: train loss 3.00361:  64%|██████▍   | 9912/15515 [10:56<06:18, 14.79it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9915: train loss 3.02773:  64%|██████▍   | 9916/15515 [10:56<06:11, 15.07it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9917: train loss 3.02255:  64%|██████▍   | 9918/15515 [10:56<06:27, 14.44it/s]

 128
32459 128
32459 128


epoch 0 iter 9921: train loss 2.95104:  64%|██████▍   | 9922/15515 [10:57<06:23, 14.57it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9924: train loss 2.96095:  64%|██████▍   | 9924/15515 [10:57<06:24, 14.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9928: train loss 2.95583:  64%|██████▍   | 9928/15515 [10:57<06:20, 14.68it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9932: train loss 3.00336:  64%|██████▍   | 9932/15515 [10:57<06:14, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9935: train loss 3.02349:  64%|██████▍   | 9936/15515 [10:58<06:25, 14.47it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9938: train loss 3.02912:  64%|██████▍   | 9938/15515 [10:58<06:38, 14.01it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9942: train loss 2.99193:  64%|██████▍   | 9942/15515 [10:58<06:37, 14.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9946: train loss 2.98350:  64%|██████▍   | 9946/15515 [10:58<06:19, 14.67it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9950: train loss 2.99729:  64%|██████▍   | 9950/15515 [10:59<06:12, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9954: train loss 2.94284:  64%|██████▍   | 9954/15515 [10:59<06:11, 14.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9956: train loss 3.04750:  64%|██████▍   | 9956/15515 [10:59<06:26, 14.38it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9960: train loss 2.97933:  64%|██████▍   | 9960/15515 [10:59<06:34, 14.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9964: train loss 3.02158:  64%|██████▍   | 9964/15515 [11:00<06:16, 14.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9968: train loss 2.95919:  64%|██████▍   | 9968/15515 [11:00<06:10, 14.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9971: train loss 2.99665:  64%|██████▍   | 9972/15515 [11:00<06:13, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9974: train loss 2.97491:  64%|██████▍   | 9974/15515 [11:00<06:17, 14.68it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9978: train loss 2.98608:  64%|██████▍   | 9978/15515 [11:01<06:14, 14.80it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9981: train loss 2.99485:  64%|██████▍   | 9982/15515 [11:01<06:11, 14.90it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 9983: train loss 2.95007:  64%|██████▍   | 9984/15515 [11:01<06:25, 14.33it/s]

 128
32459 128
32459 128


epoch 0 iter 9986: train loss 2.94521:  64%|██████▍   | 9986/15515 [11:01<06:46, 13.59it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9990: train loss 2.96143:  64%|██████▍   | 9990/15515 [11:01<06:43, 13.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9993: train loss 2.95601:  64%|██████▍   | 9994/15515 [11:02<06:27, 14.24it/s]

32459 128
32459 128
32459 128


epoch 0 iter 9996: train loss 2.94829:  64%|██████▍   | 9996/15515 [11:02<06:19, 14.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 9999: train loss 2.95658:  64%|██████▍   | 10000/15515 [11:02<06:23, 14.38it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10003: train loss 3.01584:  64%|██████▍   | 10004/15515 [11:02<06:18, 14.55it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10006: train loss 3.04639:  64%|██████▍   | 10006/15515 [11:03<06:25, 14.28it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10008: train loss 2.99024:  65%|██████▍   | 10008/15515 [11:03<06:28, 14.19it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10011: train loss 2.98436:  65%|██████▍   | 10012/15515 [11:03<06:31, 14.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10015: train loss 2.96442:  65%|██████▍   | 10016/15515 [11:03<06:26, 14.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10019: train loss 2.95103:  65%|██████▍   | 10018/15515 [11:04<06:29, 14.12it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10022: train loss 3.00049:  65%|██████▍   | 10022/15515 [11:04<06:23, 14.34it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10024: train loss 3.01478:  65%|██████▍   | 10024/15515 [11:04<06:36, 13.85it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10027: train loss 2.99058:  65%|██████▍   | 10028/15515 [11:04<06:32, 13.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10031: train loss 2.91734:  65%|██████▍   | 10032/15515 [11:04<06:29, 14.06it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10033: train loss 2.97789:  65%|██████▍   | 10034/15515 [11:05<06:25, 14.22it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10036: train loss 2.99005:  65%|██████▍   | 10036/15515 [11:05<06:31, 14.01it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10040: train loss 3.01221:  65%|██████▍   | 10040/15515 [11:05<06:29, 14.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10043: train loss 2.99563:  65%|██████▍   | 10044/15515 [11:05<06:34, 13.88it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10046: train loss 2.98452:  65%|██████▍   | 10046/15515 [11:05<06:35, 13.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10050: train loss 2.98680:  65%|██████▍   | 10050/15515 [11:06<06:43, 13.53it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10053: train loss 3.02149:  65%|██████▍   | 10054/15515 [11:06<06:25, 14.17it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10056: train loss 3.02597:  65%|██████▍   | 10056/15515 [11:06<06:18, 14.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10060: train loss 3.05153:  65%|██████▍   | 10060/15515 [11:06<06:07, 14.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10064: train loss 3.00313:  65%|██████▍   | 10064/15515 [11:07<06:02, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10068: train loss 3.01537:  65%|██████▍   | 10068/15515 [11:07<05:56, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10072: train loss 3.03335:  65%|██████▍   | 10072/15515 [11:07<05:57, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10076: train loss 3.01005:  65%|██████▍   | 10076/15515 [11:07<06:03, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10080: train loss 3.01134:  65%|██████▍   | 10080/15515 [11:08<05:56, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10084: train loss 3.05157:  65%|██████▍   | 10084/15515 [11:08<05:58, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10088: train loss 2.98210:  65%|██████▌   | 10088/15515 [11:08<05:59, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10092: train loss 2.97750:  65%|██████▌   | 10092/15515 [11:09<06:02, 14.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10095: train loss 3.01757:  65%|██████▌   | 10096/15515 [11:09<05:59, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10099: train loss 2.95617:  65%|██████▌   | 10100/15515 [11:09<05:58, 15.09it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10102: train loss 3.02832:  65%|██████▌   | 10102/15515 [11:09<06:01, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10106: train loss 2.96041:  65%|██████▌   | 10106/15515 [11:09<05:58, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10110: train loss 2.94919:  65%|██████▌   | 10110/15515 [11:10<05:53, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10114: train loss 2.98806:  65%|██████▌   | 10114/15515 [11:10<05:51, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10118: train loss 2.99158:  65%|██████▌   | 10118/15515 [11:10<05:54, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10122: train loss 3.02021:  65%|██████▌   | 10122/15515 [11:10<05:56, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10126: train loss 3.00462:  65%|██████▌   | 10126/15515 [11:11<05:56, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10130: train loss 3.00469:  65%|██████▌   | 10130/15515 [11:11<05:58, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10134: train loss 2.99243:  65%|██████▌   | 10134/15515 [11:11<06:03, 14.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10138: train loss 3.00590:  65%|██████▌   | 10138/15515 [11:12<06:03, 14.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10141: train loss 2.97158:  65%|██████▌   | 10142/15515 [11:12<06:01, 14.88it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10144: train loss 2.98699:  65%|██████▌   | 10144/15515 [11:12<05:58, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10148: train loss 3.00496:  65%|██████▌   | 10148/15515 [11:12<05:58, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10152: train loss 3.01182:  65%|██████▌   | 10152/15515 [11:12<05:52, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10156: train loss 2.97069:  65%|██████▌   | 10156/15515 [11:13<05:47, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10160: train loss 3.00063:  65%|██████▌   | 10160/15515 [11:13<05:52, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10164: train loss 3.00250:  66%|██████▌   | 10164/15515 [11:13<05:49, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10168: train loss 2.94526:  66%|██████▌   | 10168/15515 [11:14<05:50, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10172: train loss 3.00686:  66%|██████▌   | 10172/15515 [11:14<05:50, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10176: train loss 2.99762:  66%|██████▌   | 10176/15515 [11:14<05:50, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10180: train loss 2.95791:  66%|██████▌   | 10180/15515 [11:14<05:50, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10184: train loss 2.97562:  66%|██████▌   | 10184/15515 [11:15<05:52, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10188: train loss 2.99836:  66%|██████▌   | 10188/15515 [11:15<05:54, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10192: train loss 2.96239:  66%|██████▌   | 10192/15515 [11:15<05:49, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10196: train loss 3.01215:  66%|██████▌   | 10196/15515 [11:15<05:49, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10200: train loss 3.00046:  66%|██████▌   | 10200/15515 [11:16<05:48, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10204: train loss 2.88580:  66%|██████▌   | 10204/15515 [11:16<05:53, 15.03it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 10207: train loss 2.95875:  66%|██████▌   | 10208/15515 [11:16<05:58, 14.79it/s]

 128
32459 128
32459 128


epoch 0 iter 10210: train loss 2.89104:  66%|██████▌   | 10210/15515 [11:16<05:54, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10214: train loss 3.03216:  66%|██████▌   | 10214/15515 [11:17<05:56, 14.86it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10217: train loss 3.03890:  66%|██████▌   | 10218/15515 [11:17<06:00, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10220: train loss 3.05856:  66%|██████▌   | 10220/15515 [11:17<06:01, 14.65it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10224: train loss 3.03250:  66%|██████▌   | 10224/15515 [11:17<05:56, 14.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10228: train loss 2.96511:  66%|██████▌   | 10228/15515 [11:18<05:56, 14.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10232: train loss 3.01845:  66%|██████▌   | 10232/15515 [11:18<05:56, 14.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10235: train loss 2.94355:  66%|██████▌   | 10236/15515 [11:18<05:58, 14.73it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10238: train loss 3.08153:  66%|██████▌   | 10238/15515 [11:18<05:55, 14.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10242: train loss 2.96520:  66%|██████▌   | 10242/15515 [11:18<05:55, 14.82it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 10245: train loss 2.92541:  66%|██████▌   | 10246/15515 [11:19<05:57, 14.72it/s]

128
32459 128
32459 128


epoch 0 iter 10248: train loss 2.96303:  66%|██████▌   | 10248/15515 [11:19<05:56, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10252: train loss 2.95536:  66%|██████▌   | 10252/15515 [11:19<05:59, 14.64it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10255: train loss 3.02137:  66%|██████▌   | 10256/15515 [11:19<05:59, 14.63it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10258: train loss 2.98603:  66%|██████▌   | 10258/15515 [11:20<05:59, 14.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10262: train loss 2.98596:  66%|██████▌   | 10262/15515 [11:20<05:55, 14.80it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10265: train loss 2.94588:  66%|██████▌   | 10266/15515 [11:20<05:54, 14.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10268: train loss 3.01053:  66%|██████▌   | 10268/15515 [11:20<05:55, 14.77it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 10271: train loss 2.99607:  66%|██████▌   | 10272/15515 [11:20<05:50, 14.95it/s]

 128
32459 128
32459 128


epoch 0 iter 10274: train loss 2.95604:  66%|██████▌   | 10274/15515 [11:21<05:51, 14.92it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10278: train loss 3.03449:  66%|██████▌   | 10278/15515 [11:21<05:48, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10282: train loss 2.93515:  66%|██████▋   | 10282/15515 [11:21<05:45, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10286: train loss 3.03467:  66%|██████▋   | 10286/15515 [11:21<05:44, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10290: train loss 3.05494:  66%|██████▋   | 10290/15515 [11:22<05:42, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10294: train loss 2.95432:  66%|██████▋   | 10294/15515 [11:22<05:40, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10298: train loss 2.98780:  66%|██████▋   | 10298/15515 [11:22<05:46, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10301: train loss 3.07925:  66%|██████▋   | 10302/15515 [11:22<05:47, 15.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10304: train loss 3.01747:  66%|██████▋   | 10304/15515 [11:23<05:43, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10308: train loss 3.00931:  66%|██████▋   | 10308/15515 [11:23<05:44, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10312: train loss 2.98242:  66%|██████▋   | 10312/15515 [11:23<05:47, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10316: train loss 2.93473:  66%|██████▋   | 10316/15515 [11:23<05:45, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10320: train loss 2.96450:  67%|██████▋   | 10320/15515 [11:24<05:46, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10324: train loss 2.97394:  67%|██████▋   | 10324/15515 [11:24<05:43, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10328: train loss 3.01994:  67%|██████▋   | 10328/15515 [11:24<05:42, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10332: train loss 2.93619:  67%|██████▋   | 10332/15515 [11:24<05:42, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10336: train loss 2.97013:  67%|██████▋   | 10336/15515 [11:25<05:41, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10340: train loss 3.04053:  67%|██████▋   | 10340/15515 [11:25<05:40, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10344: train loss 2.96698:  67%|██████▋   | 10344/15515 [11:25<05:36, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10348: train loss 3.01316:  67%|██████▋   | 10348/15515 [11:26<05:33, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10352: train loss 2.97346:  67%|██████▋   | 10352/15515 [11:26<05:34, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10356: train loss 2.92920:  67%|██████▋   | 10356/15515 [11:26<05:33, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10360: train loss 2.96555:  67%|██████▋   | 10360/15515 [11:26<05:34, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10364: train loss 3.01218:  67%|██████▋   | 10364/15515 [11:27<05:36, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10368: train loss 3.03221:  67%|██████▋   | 10368/15515 [11:27<05:39, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10372: train loss 2.95972:  67%|██████▋   | 10372/15515 [11:27<05:36, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10376: train loss 2.97092:  67%|██████▋   | 10376/15515 [11:27<05:32, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10380: train loss 3.01326:  67%|██████▋   | 10380/15515 [11:28<05:36, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10384: train loss 2.95741:  67%|██████▋   | 10384/15515 [11:28<05:38, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10388: train loss 2.99160:  67%|██████▋   | 10388/15515 [11:28<05:41, 14.99it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10391: train loss 2.96955:  67%|██████▋   | 10392/15515 [11:28<05:42, 14.96it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 10394: train loss 3.00713:  67%|██████▋   | 10394/15515 [11:29<05:42, 14.93it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 10398: train loss 2.99804:  67%|██████▋   | 10398/15515 [11:29<05:38, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10406: train loss 2.99452:  67%|██████▋   | 10406/15515 [11:29<05:38, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10410: train loss 3.02672:  67%|██████▋   | 10410/15515 [11:30<05:39, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10414: train loss 2.95346:  67%|██████▋   | 10414/15515 [11:30<05:38, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10418: train loss 2.92287:  67%|██████▋   | 10418/15515 [11:30<05:37, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10422: train loss 3.00404:  67%|██████▋   | 10422/15515 [11:30<05:38, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10426: train loss 3.00294:  67%|██████▋   | 10426/15515 [11:31<05:36, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10430: train loss 3.03203:  67%|██████▋   | 10430/15515 [11:31<05:46, 14.69it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10434: train loss 3.02018:  67%|██████▋   | 10434/15515 [11:31<05:41, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10438: train loss 3.00254:  67%|██████▋   | 10438/15515 [11:31<05:39, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10441: train loss 2.98328:  67%|██████▋   | 10442/15515 [11:32<05:42, 14.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10444: train loss 2.99231:  67%|██████▋   | 10444/15515 [11:32<05:39, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10448: train loss 3.05076:  67%|██████▋   | 10448/15515 [11:32<05:37, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10452: train loss 2.97823:  67%|██████▋   | 10452/15515 [11:32<05:40, 14.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10456: train loss 3.03842:  67%|██████▋   | 10456/15515 [11:33<05:38, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10459: train loss 3.05383:  67%|██████▋   | 10460/15515 [11:33<05:39, 14.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10462: train loss 3.00164:  67%|██████▋   | 10462/15515 [11:33<05:39, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10466: train loss 3.02033:  67%|██████▋   | 10466/15515 [11:33<05:40, 14.85it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10469: train loss 2.94171:  67%|██████▋   | 10470/15515 [11:34<05:40, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10472: train loss 2.91556:  67%|██████▋   | 10472/15515 [11:34<05:41, 14.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10475: train loss 2.90255:  68%|██████▊   | 10476/15515 [11:34<05:38, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10479: train loss 2.90255:  68%|██████▊   | 10480/15515 [11:34<05:36, 14.96it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10482: train loss 2.96910:  68%|██████▊   | 10482/15515 [11:34<05:39, 14.81it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 10485: train loss 3.01844:  68%|██████▊   | 10486/15515 [11:35<05:40, 14.78it/s]


32459 128
32459 128


epoch 0 iter 10488: train loss 2.97849:  68%|██████▊   | 10488/15515 [11:35<05:36, 14.92it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10491: train loss 3.01628:  68%|██████▊   | 10492/15515 [11:35<05:38, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10495: train loss 2.99470:  68%|██████▊   | 10496/15515 [11:35<05:38, 14.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10498: train loss 3.01481:  68%|██████▊   | 10498/15515 [11:36<05:41, 14.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10502: train loss 2.93864:  68%|██████▊   | 10502/15515 [11:36<05:34, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10506: train loss 3.02973:  68%|██████▊   | 10506/15515 [11:36<05:36, 14.89it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 10509: train loss 2.92139:  68%|██████▊   | 10510/15515 [11:36<05:34, 14.98it/s]

 128
32459 128
32459 128
32459

epoch 0 iter 10512: train loss 2.96137:  68%|██████▊   | 10512/15515 [11:36<05:34, 14.96it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 10516: train loss 2.97485:  68%|██████▊   | 10516/15515 [11:37<05:30, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10520: train loss 2.96558:  68%|██████▊   | 10520/15515 [11:37<05:32, 15.03it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 10523: train loss 2.95584:  68%|██████▊   | 10524/15515 [11:37<05:30, 15.10it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 10527: train loss 2.92567:  68%|██████▊   | 10528/15515 [11:37<05:27, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10531: train loss 2.93689:  68%|██████▊   | 10532/15515 [11:38<05:24, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10535: train loss 2.94588:  68%|██████▊   | 10536/15515 [11:38<05:23, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10539: train loss 2.99630:  68%|██████▊   | 10540/15515 [11:38<05:22, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10543: train loss 2.92218:  68%|██████▊   | 10544/15515 [11:38<05:22, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10547: train loss 2.93688:  68%|██████▊   | 10548/15515 [11:39<05:20, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10551: train loss 2.90583:  68%|██████▊   | 10552/15515 [11:39<05:20, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10555: train loss 3.00982:  68%|██████▊   | 10556/15515 [11:39<05:22, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10559: train loss 3.05068:  68%|██████▊   | 10560/15515 [11:40<05:22, 15.36it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 10562: train loss 3.01744:  68%|██████▊   | 10562/15515 [11:40<05:23, 15.32it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 10566: train loss 3.02173:  68%|██████▊   | 10566/15515 [11:40<05:24, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10570: train loss 2.96159:  68%|██████▊   | 10570/15515 [11:40<05:24, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10574: train loss 2.93194:  68%|██████▊   | 10574/15515 [11:41<05:23, 15.30it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 10577: train loss 2.97062:  68%|██████▊   | 10578/15515 [11:41<05:23, 15.25it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 10581: train loss 2.97883:  68%|██████▊   | 10582/15515 [11:41<05:24, 15.22it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 10584: train loss 3.01070:  68%|██████▊   | 10584/15515 [11:41<05:25, 15.14it/s]


32459 128
32459 128
32459

epoch 0 iter 10587: train loss 2.93686:  68%|██████▊   | 10588/15515 [11:41<05:28, 14.99it/s]

 128
32459 128
32459 128


epoch 0 iter 10590: train loss 2.90190:  68%|██████▊   | 10590/15515 [11:42<05:26, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10594: train loss 2.99031:  68%|██████▊   | 10594/15515 [11:42<05:24, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10598: train loss 2.97740:  68%|██████▊   | 10598/15515 [11:42<05:25, 15.08it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 10601: train loss 3.03676:  68%|██████▊   | 10602/15515 [11:42<05:25, 15.11it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 10605: train loss 2.97284:  68%|██████▊   | 10606/15515 [11:43<05:20, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10609: train loss 2.89531:  68%|██████▊   | 10610/15515 [11:43<05:17, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10613: train loss 2.94730:  68%|██████▊   | 10614/15515 [11:43<05:18, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10617: train loss 3.00575:  68%|██████▊   | 10618/15515 [11:43<05:18, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10621: train loss 2.95535:  68%|██████▊   | 10622/15515 [11:44<05:21, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10625: train loss 2.96600:  68%|██████▊   | 10626/15515 [11:44<05:23, 15.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10628: train loss 2.94529:  69%|██████▊   | 10628/15515 [11:44<05:24, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10632: train loss 2.94532:  69%|██████▊   | 10632/15515 [11:44<05:24, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10636: train loss 2.97031:  69%|██████▊   | 10636/15515 [11:45<05:21, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10640: train loss 2.99710:  69%|██████▊   | 10640/15515 [11:45<05:23, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10644: train loss 2.95101:  69%|██████▊   | 10644/15515 [11:45<05:23, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10648: train loss 2.92060:  69%|██████▊   | 10648/15515 [11:45<05:24, 15.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10651: train loss 2.91038:  69%|██████▊   | 10652/15515 [11:46<05:28, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10654: train loss 3.01447:  69%|██████▊   | 10654/15515 [11:46<05:28, 14.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10658: train loss 2.94487:  69%|██████▊   | 10658/15515 [11:46<05:24, 14.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10661: train loss 2.95976:  69%|██████▊   | 10662/15515 [11:46<05:20, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10665: train loss 3.00574:  69%|██████▊   | 10666/15515 [11:47<05:22, 15.03it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 10668: train loss 2.95718:  69%|██████▉   | 10668/15515 [11:47<05:22, 15.01it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 10672: train loss 2.99242:  69%|██████▉   | 10672/15515 [11:47<05:25, 14.88it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 10675: train loss 3.01874:  69%|██████▉   | 10676/15515 [11:47<05:27, 14.78it/s]


32459 128
32459 128


epoch 0 iter 10678: train loss 2.97532:  69%|██████▉   | 10678/15515 [11:47<05:28, 14.73it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10681: train loss 2.97925:  69%|██████▉   | 10682/15515 [11:48<05:28, 14.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10685: train loss 2.96062:  69%|██████▉   | 10686/15515 [11:48<05:24, 14.88it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10688: train loss 3.00503:  69%|██████▉   | 10688/15515 [11:48<05:28, 14.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10692: train loss 2.97289:  69%|██████▉   | 10692/15515 [11:48<05:19, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10696: train loss 3.03969:  69%|██████▉   | 10696/15515 [11:49<05:16, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10700: train loss 2.93765:  69%|██████▉   | 10700/15515 [11:49<05:17, 15.19it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 10703: train loss 2.95703:  69%|██████▉   | 10704/15515 [11:49<05:20, 14.99it/s]

128
32459 128
32459 128


epoch 0 iter 10706: train loss 2.94168:  69%|██████▉   | 10706/15515 [11:49<05:21, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10710: train loss 2.98311:  69%|██████▉   | 10710/15515 [11:50<05:14, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10714: train loss 2.98490:  69%|██████▉   | 10714/15515 [11:50<05:12, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10718: train loss 2.98294:  69%|██████▉   | 10718/15515 [11:50<05:11, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10722: train loss 2.99737:  69%|██████▉   | 10722/15515 [11:50<05:13, 15.28it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 10725: train loss 2.95922:  69%|██████▉   | 10726/15515 [11:51<05:15, 15.16it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 10729: train loss 3.02896:  69%|██████▉   | 10730/15515 [11:51<05:17, 15.08it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10732: train loss 3.04306:  69%|██████▉   | 10732/15515 [11:51<05:16, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10736: train loss 3.00489:  69%|██████▉   | 10736/15515 [11:51<05:16, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10740: train loss 2.99401:  69%|██████▉   | 10740/15515 [11:52<05:16, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10744: train loss 3.00252:  69%|██████▉   | 10744/15515 [11:52<05:17, 15.02it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 10747: train loss 2.97585:  69%|██████▉   | 10748/15515 [11:52<05:18, 14.99it/s]

 128
32459 128
32459 128


epoch 0 iter 10750: train loss 2.97626:  69%|██████▉   | 10750/15515 [11:52<05:17, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10754: train loss 2.95028:  69%|██████▉   | 10754/15515 [11:52<05:16, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10758: train loss 2.93512:  69%|██████▉   | 10758/15515 [11:53<05:18, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10762: train loss 3.00322:  69%|██████▉   | 10762/15515 [11:53<05:12, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10766: train loss 2.91968:  69%|██████▉   | 10766/15515 [11:53<05:14, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10770: train loss 3.01400:  69%|██████▉   | 10770/15515 [11:54<05:14, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10774: train loss 2.97083:  69%|██████▉   | 10774/15515 [11:54<05:12, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10778: train loss 2.98542:  69%|██████▉   | 10778/15515 [11:54<05:14, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10782: train loss 2.94957:  69%|██████▉   | 10782/15515 [11:54<05:11, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10786: train loss 2.96788:  70%|██████▉   | 10786/15515 [11:55<05:10, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10790: train loss 2.92312:  70%|██████▉   | 10790/15515 [11:55<05:08, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10794: train loss 2.95514:  70%|██████▉   | 10794/15515 [11:55<05:09, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10798: train loss 2.95837:  70%|██████▉   | 10798/15515 [11:55<05:07, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10802: train loss 3.02385:  70%|██████▉   | 10802/15515 [11:56<05:08, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10806: train loss 3.02831:  70%|██████▉   | 10806/15515 [11:56<05:09, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10810: train loss 3.00067:  70%|██████▉   | 10810/15515 [11:56<05:09, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10814: train loss 2.94777:  70%|██████▉   | 10814/15515 [11:56<05:09, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10818: train loss 2.97243:  70%|██████▉   | 10818/15515 [11:57<05:10, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10822: train loss 3.00120:  70%|██████▉   | 10822/15515 [11:57<05:09, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10826: train loss 2.96618:  70%|██████▉   | 10826/15515 [11:57<05:06, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10830: train loss 2.98036:  70%|██████▉   | 10830/15515 [11:57<05:02, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10834: train loss 2.98708:  70%|██████▉   | 10834/15515 [11:58<05:01, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10838: train loss 2.95249:  70%|██████▉   | 10838/15515 [11:58<05:02, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10842: train loss 2.96981:  70%|██████▉   | 10842/15515 [11:58<05:08, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10846: train loss 2.91448:  70%|██████▉   | 10846/15515 [11:58<05:08, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10850: train loss 3.03334:  70%|██████▉   | 10850/15515 [11:59<05:09, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10854: train loss 2.96650:  70%|██████▉   | 10854/15515 [11:59<05:09, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10858: train loss 2.96122:  70%|██████▉   | 10858/15515 [11:59<05:08, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10862: train loss 2.88816:  70%|███████   | 10862/15515 [12:00<05:14, 14.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10865: train loss 3.03897:  70%|███████   | 10866/15515 [12:00<05:16, 14.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10868: train loss 2.95295:  70%|███████   | 10868/15515 [12:00<05:12, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10872: train loss 2.98500:  70%|███████   | 10872/15515 [12:00<05:14, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10876: train loss 2.96655:  70%|███████   | 10876/15515 [12:01<05:17, 14.63it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 10879: train loss 2.94313:  70%|███████   | 10880/15515 [12:01<05:14, 14.73it/s]

128
32459 128
32459 128


epoch 0 iter 10882: train loss 2.98269:  70%|███████   | 10882/15515 [12:01<05:10, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10886: train loss 2.95811:  70%|███████   | 10886/15515 [12:01<05:07, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10890: train loss 3.00229:  70%|███████   | 10890/15515 [12:01<05:06, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10894: train loss 2.94186:  70%|███████   | 10894/15515 [12:02<05:08, 14.99it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10897: train loss 2.95973:  70%|███████   | 10898/15515 [12:02<05:10, 14.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10900: train loss 2.90588:  70%|███████   | 10900/15515 [12:02<05:11, 14.79it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10904: train loss 2.94607:  70%|███████   | 10904/15515 [12:02<05:08, 14.97it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10907: train loss 2.96045:  70%|███████   | 10908/15515 [12:03<05:07, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10911: train loss 2.90535:  70%|███████   | 10912/15515 [12:03<05:05, 15.08it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 10914: train loss 2.93524:  70%|███████   | 10914/15515 [12:03<05:04, 15.10it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 10918: train loss 2.98781:  70%|███████   | 10918/15515 [12:03<05:02, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10922: train loss 2.96200:  70%|███████   | 10922/15515 [12:04<05:07, 14.96it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 10925: train loss 3.00590:  70%|███████   | 10926/15515 [12:04<05:09, 14.81it/s]

128
32459 128
32459 128


epoch 0 iter 10928: train loss 2.96801:  70%|███████   | 10928/15515 [12:04<05:10, 14.77it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10931: train loss 2.93770:  70%|███████   | 10932/15515 [12:04<05:38, 13.54it/s]

32459 128
32459 128


epoch 0 iter 10933: train loss 3.01568:  70%|███████   | 10934/15515 [12:04<06:29, 11.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10937: train loss 2.97120:  70%|███████   | 10938/15515 [12:05<06:01, 12.66it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10940: train loss 2.94802:  71%|███████   | 10940/15515 [12:05<05:44, 13.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10944: train loss 2.99473:  71%|███████   | 10944/15515 [12:05<05:22, 14.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10948: train loss 2.99393:  71%|███████   | 10948/15515 [12:05<05:15, 14.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10952: train loss 2.98848:  71%|███████   | 10952/15515 [12:06<05:08, 14.79it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10956: train loss 2.98479:  71%|███████   | 10956/15515 [12:06<05:10, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10959: train loss 2.96096:  71%|███████   | 10960/15515 [12:06<05:05, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10963: train loss 2.96232:  71%|███████   | 10964/15515 [12:06<05:02, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10967: train loss 2.96626:  71%|███████   | 10968/15515 [12:07<05:00, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10971: train loss 2.95405:  71%|███████   | 10972/15515 [12:07<05:01, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10975: train loss 2.96974:  71%|███████   | 10976/15515 [12:07<05:00, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10979: train loss 3.01085:  71%|███████   | 10980/15515 [12:08<05:02, 15.01it/s]

32459 128
32459 128
32459 128


epoch 0 iter 10982: train loss 2.97708:  71%|███████   | 10982/15515 [12:08<05:03, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10986: train loss 2.94613:  71%|███████   | 10986/15515 [12:08<04:56, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10990: train loss 2.89885:  71%|███████   | 10990/15515 [12:08<04:52, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10994: train loss 2.95590:  71%|███████   | 10994/15515 [12:09<04:57, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 10998: train loss 2.93996:  71%|███████   | 10998/15515 [12:09<04:59, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11002: train loss 2.93719:  71%|███████   | 11002/15515 [12:09<04:55, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11006: train loss 2.90046:  71%|███████   | 11006/15515 [12:09<04:52, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11010: train loss 2.95143:  71%|███████   | 11010/15515 [12:10<04:52, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11014: train loss 2.99632:  71%|███████   | 11014/15515 [12:10<04:48, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11018: train loss 3.00087:  71%|███████   | 11018/15515 [12:10<04:49, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11022: train loss 2.99849:  71%|███████   | 11022/15515 [12:10<04:49, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11026: train loss 2.98599:  71%|███████   | 11026/15515 [12:11<04:49, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11030: train loss 2.96915:  71%|███████   | 11030/15515 [12:11<04:52, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11034: train loss 2.96759:  71%|███████   | 11034/15515 [12:11<04:55, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11038: train loss 2.97369:  71%|███████   | 11038/15515 [12:11<04:56, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11042: train loss 2.98110:  71%|███████   | 11042/15515 [12:12<04:56, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11046: train loss 2.92672:  71%|███████   | 11046/15515 [12:12<04:54, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11050: train loss 2.94192:  71%|███████   | 11050/15515 [12:12<04:54, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11054: train loss 2.96380:  71%|███████   | 11054/15515 [12:12<04:53, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11058: train loss 2.99330:  71%|███████▏  | 11058/15515 [12:13<04:54, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11062: train loss 2.95248:  71%|███████▏  | 11062/15515 [12:13<04:56, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11066: train loss 2.96033:  71%|███████▏  | 11066/15515 [12:13<04:53, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11070: train loss 2.96265:  71%|███████▏  | 11070/15515 [12:14<04:53, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11074: train loss 2.90486:  71%|███████▏  | 11074/15515 [12:14<04:56, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11078: train loss 3.05064:  71%|███████▏  | 11078/15515 [12:14<04:54, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11082: train loss 3.02245:  71%|███████▏  | 11082/15515 [12:14<04:52, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11086: train loss 2.92502:  71%|███████▏  | 11086/15515 [12:15<04:55, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11090: train loss 3.03844:  71%|███████▏  | 11090/15515 [12:15<04:52, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11094: train loss 3.00000:  72%|███████▏  | 11094/15515 [12:15<04:58, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11097: train loss 2.95347:  72%|███████▏  | 11098/15515 [12:15<04:55, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11101: train loss 2.98777:  72%|███████▏  | 11102/15515 [12:16<04:55, 14.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11104: train loss 2.95191:  72%|███████▏  | 11104/15515 [12:16<05:00, 14.70it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11107: train loss 2.98232:  72%|███████▏  | 11108/15515 [12:16<04:58, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11111: train loss 2.97097:  72%|███████▏  | 11112/15515 [12:16<04:59, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11114: train loss 2.96325:  72%|███████▏  | 11114/15515 [12:16<05:00, 14.67it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11117: train loss 2.97197:  72%|███████▏  | 11118/15515 [12:17<04:56, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11121: train loss 2.99822:  72%|███████▏  | 11122/15515 [12:17<04:54, 14.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11124: train loss 2.94051:  72%|███████▏  | 11124/15515 [12:17<04:55, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11128: train loss 2.96157:  72%|███████▏  | 11128/15515 [12:17<04:51, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11132: train loss 2.87902:  72%|███████▏  | 11132/15515 [12:18<04:46, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11136: train loss 2.96227:  72%|███████▏  | 11136/15515 [12:18<04:42, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11140: train loss 2.97477:  72%|███████▏  | 11140/15515 [12:18<04:42, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11144: train loss 2.93120:  72%|███████▏  | 11144/15515 [12:18<04:44, 15.36it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 11147: train loss 2.99409:  72%|███████▏  | 11148/15515 [12:19<04:51, 15.00it/s]

 128
32459 128
32459 128


epoch 0 iter 11150: train loss 3.02030:  72%|███████▏  | 11150/15515 [12:19<04:50, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11154: train loss 3.00152:  72%|███████▏  | 11154/15515 [12:19<04:48, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11158: train loss 2.98176:  72%|███████▏  | 11158/15515 [12:19<04:41, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11162: train loss 2.87104:  72%|███████▏  | 11162/15515 [12:20<04:40, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11166: train loss 2.94799:  72%|███████▏  | 11166/15515 [12:20<04:41, 15.46it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11169: train loss 3.01551:  72%|███████▏  | 11170/15515 [12:20<04:43, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11173: train loss 2.95717:  72%|███████▏  | 11174/15515 [12:20<04:40, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11177: train loss 2.94468:  72%|███████▏  | 11178/15515 [12:21<04:43, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11181: train loss 2.97074:  72%|███████▏  | 11182/15515 [12:21<04:44, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11185: train loss 2.94782:  72%|███████▏  | 11186/15515 [12:21<04:45, 15.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11188: train loss 2.93940:  72%|███████▏  | 11188/15515 [12:21<04:44, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11192: train loss 2.92293:  72%|███████▏  | 11192/15515 [12:22<04:43, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11196: train loss 2.92338:  72%|███████▏  | 11196/15515 [12:22<04:41, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11200: train loss 2.97445:  72%|███████▏  | 11200/15515 [12:22<04:42, 15.26it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 11203: train loss 2.96607:  72%|███████▏  | 11204/15515 [12:22<04:42, 15.26it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 11207: train loss 2.95126:  72%|███████▏  | 11208/15515 [12:23<04:45, 15.06it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 11210: train loss 3.01117:  72%|███████▏  | 11210/15515 [12:23<04:46, 15.04it/s]


32459 128
32459 128
32459 128


epoch 0 iter 11214: train loss 2.93798:  72%|███████▏  | 11214/15515 [12:23<04:44, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11218: train loss 3.02530:  72%|███████▏  | 11218/15515 [12:23<04:47, 14.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11221: train loss 3.09217:  72%|███████▏  | 11222/15515 [12:23<04:46, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11225: train loss 2.88629:  72%|███████▏  | 11226/15515 [12:24<04:48, 14.85it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11228: train loss 2.97765:  72%|███████▏  | 11228/15515 [12:24<04:47, 14.94it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 11231: train loss 3.03113:  72%|███████▏  | 11232/15515 [12:24<04:49, 14.80it/s]

128
32459 128
32459 128


epoch 0 iter 11234: train loss 2.99758:  72%|███████▏  | 11234/15515 [12:24<04:47, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11238: train loss 2.96150:  72%|███████▏  | 11238/15515 [12:25<04:42, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11242: train loss 3.01454:  72%|███████▏  | 11242/15515 [12:25<04:39, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11246: train loss 2.98822:  72%|███████▏  | 11246/15515 [12:25<04:40, 15.22it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 11249: train loss 2.97659:  73%|███████▎  | 11250/15515 [12:25<04:39, 15.25it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 11253: train loss 2.92744:  73%|███████▎  | 11254/15515 [12:26<04:43, 15.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11256: train loss 2.95389:  73%|███████▎  | 11256/15515 [12:26<04:43, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11260: train loss 2.95814:  73%|███████▎  | 11260/15515 [12:26<04:42, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11264: train loss 2.90386:  73%|███████▎  | 11264/15515 [12:26<04:41, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11268: train loss 2.95633:  73%|███████▎  | 11268/15515 [12:27<04:35, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11272: train loss 2.91253:  73%|███████▎  | 11272/15515 [12:27<04:38, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11276: train loss 3.04278:  73%|███████▎  | 11276/15515 [12:27<04:38, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11280: train loss 2.90965:  73%|███████▎  | 11280/15515 [12:27<04:37, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11284: train loss 2.95756:  73%|███████▎  | 11284/15515 [12:28<04:38, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11288: train loss 2.90055:  73%|███████▎  | 11288/15515 [12:28<04:37, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11292: train loss 2.95486:  73%|███████▎  | 11292/15515 [12:28<04:39, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11296: train loss 2.90893:  73%|███████▎  | 11296/15515 [12:28<04:39, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11299: train loss 2.99471:  73%|███████▎  | 11300/15515 [12:29<04:42, 14.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11303: train loss 2.94397:  73%|███████▎  | 11304/15515 [12:29<04:43, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11306: train loss 2.95281:  73%|███████▎  | 11306/15515 [12:29<04:40, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11310: train loss 2.90314:  73%|███████▎  | 11310/15515 [12:29<04:41, 14.92it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11313: train loss 2.97728:  73%|███████▎  | 11314/15515 [12:30<04:41, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11316: train loss 2.98569:  73%|███████▎  | 11316/15515 [12:30<04:39, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11320: train loss 2.96724:  73%|███████▎  | 11320/15515 [12:30<04:36, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11324: train loss 2.93208:  73%|███████▎  | 11324/15515 [12:30<04:33, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11328: train loss 3.00068:  73%|███████▎  | 11328/15515 [12:31<04:33, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11332: train loss 2.96634:  73%|███████▎  | 11332/15515 [12:31<04:31, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11336: train loss 2.98275:  73%|███████▎  | 11336/15515 [12:31<04:34, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11340: train loss 2.93792:  73%|███████▎  | 11340/15515 [12:31<04:37, 15.02it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 11343: train loss 3.01831:  73%|███████▎  | 11344/15515 [12:32<04:37, 15.01it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 11347: train loss 2.96297:  73%|███████▎  | 11348/15515 [12:32<04:38, 14.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11350: train loss 2.96546:  73%|███████▎  | 11350/15515 [12:32<04:40, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11353: train loss 2.89191:  73%|███████▎  | 11354/15515 [12:32<04:40, 14.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11357: train loss 2.93118:  73%|███████▎  | 11358/15515 [12:33<04:36, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11361: train loss 2.92323:  73%|███████▎  | 11362/15515 [12:33<04:37, 14.97it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11364: train loss 2.97537:  73%|███████▎  | 11364/15515 [12:33<04:37, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11368: train loss 2.99123:  73%|███████▎  | 11368/15515 [12:33<04:38, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11372: train loss 2.96729:  73%|███████▎  | 11372/15515 [12:33<04:32, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11376: train loss 2.93321:  73%|███████▎  | 11376/15515 [12:34<04:32, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11380: train loss 2.93021:  73%|███████▎  | 11380/15515 [12:34<04:36, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11384: train loss 2.94175:  73%|███████▎  | 11384/15515 [12:34<04:33, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11388: train loss 2.96757:  73%|███████▎  | 11388/15515 [12:35<04:33, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11392: train loss 2.95353:  73%|███████▎  | 11392/15515 [12:35<04:32, 15.12it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 11395: train loss 2.94742:  73%|███████▎  | 11396/15515 [12:35<04:35, 14.94it/s]

 128
32459 128
32459 128


epoch 0 iter 11398: train loss 2.98023:  73%|███████▎  | 11398/15515 [12:35<04:33, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11402: train loss 2.97742:  73%|███████▎  | 11402/15515 [12:35<04:28, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11406: train loss 2.90701:  74%|███████▎  | 11406/15515 [12:36<04:25, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11410: train loss 2.97654:  74%|███████▎  | 11410/15515 [12:36<04:27, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11414: train loss 2.96720:  74%|███████▎  | 11414/15515 [12:36<04:28, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11418: train loss 3.01236:  74%|███████▎  | 11418/15515 [12:37<04:30, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11422: train loss 2.94968:  74%|███████▎  | 11422/15515 [12:37<04:30, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11426: train loss 2.92010:  74%|███████▎  | 11426/15515 [12:37<04:31, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11430: train loss 2.93697:  74%|███████▎  | 11430/15515 [12:37<04:29, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11434: train loss 2.97196:  74%|███████▎  | 11434/15515 [12:38<04:28, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11438: train loss 2.92719:  74%|███████▎  | 11438/15515 [12:38<04:28, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11442: train loss 2.97087:  74%|███████▎  | 11442/15515 [12:38<04:26, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11446: train loss 2.91455:  74%|███████▍  | 11446/15515 [12:38<04:25, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11450: train loss 2.94323:  74%|███████▍  | 11450/15515 [12:39<04:21, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11454: train loss 2.96620:  74%|███████▍  | 11454/15515 [12:39<04:21, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11458: train loss 2.91843:  74%|███████▍  | 11458/15515 [12:39<04:23, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11462: train loss 2.91680:  74%|███████▍  | 11462/15515 [12:39<04:23, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11466: train loss 2.93606:  74%|███████▍  | 11466/15515 [12:40<04:23, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11470: train loss 3.00254:  74%|███████▍  | 11470/15515 [12:40<04:27, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11474: train loss 2.93832:  74%|███████▍  | 11474/15515 [12:40<04:28, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11478: train loss 2.98564:  74%|███████▍  | 11478/15515 [12:40<04:27, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11482: train loss 2.96760:  74%|███████▍  | 11482/15515 [12:41<04:28, 15.02it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11485: train loss 2.95361:  74%|███████▍  | 11486/15515 [12:41<04:25, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11489: train loss 2.90795:  74%|███████▍  | 11490/15515 [12:41<04:26, 15.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11492: train loss 2.88874:  74%|███████▍  | 11492/15515 [12:41<04:27, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11496: train loss 3.00155:  74%|███████▍  | 11496/15515 [12:42<04:23, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11500: train loss 2.97137:  74%|███████▍  | 11500/15515 [12:42<04:25, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11504: train loss 2.92700:  74%|███████▍  | 11504/15515 [12:42<04:26, 15.06it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 11507: train loss 3.00322:  74%|███████▍  | 11508/15515 [12:42<04:27, 14.96it/s]

 128
32459 128
32459 128
32459

epoch 0 iter 11510: train loss 2.99978:  74%|███████▍  | 11510/15515 [12:43<04:25, 15.06it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 11514: train loss 2.89678:  74%|███████▍  | 11514/15515 [12:43<04:22, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11518: train loss 2.94301:  74%|███████▍  | 11518/15515 [12:43<04:22, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11522: train loss 2.92516:  74%|███████▍  | 11522/15515 [12:43<04:22, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11526: train loss 2.94651:  74%|███████▍  | 11526/15515 [12:44<04:23, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11530: train loss 2.95333:  74%|███████▍  | 11530/15515 [12:44<04:24, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11534: train loss 2.99247:  74%|███████▍  | 11534/15515 [12:44<04:23, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11538: train loss 2.90961:  74%|███████▍  | 11538/15515 [12:44<04:19, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11542: train loss 2.98699:  74%|███████▍  | 11542/15515 [12:45<04:18, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11546: train loss 2.93142:  74%|███████▍  | 11546/15515 [12:45<04:18, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11550: train loss 2.99513:  74%|███████▍  | 11550/15515 [12:45<04:16, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11554: train loss 2.97714:  74%|███████▍  | 11554/15515 [12:45<04:14, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11558: train loss 2.97258:  74%|███████▍  | 11558/15515 [12:46<04:16, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11562: train loss 2.91349:  75%|███████▍  | 11562/15515 [12:46<04:20, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11566: train loss 2.96450:  75%|███████▍  | 11566/15515 [12:46<04:21, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11570: train loss 2.94715:  75%|███████▍  | 11570/15515 [12:47<04:20, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11574: train loss 2.90162:  75%|███████▍  | 11574/15515 [12:47<04:22, 15.01it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11577: train loss 2.98840:  75%|███████▍  | 11578/15515 [12:47<04:22, 14.97it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 11580: train loss 2.96779:  75%|███████▍  | 11580/15515 [12:47<04:20, 15.13it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 11584: train loss 2.88746:  75%|███████▍  | 11584/15515 [12:47<04:17, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11588: train loss 2.97883:  75%|███████▍  | 11588/15515 [12:48<04:21, 15.04it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11591: train loss 2.93471:  75%|███████▍  | 11592/15515 [12:48<04:23, 14.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11594: train loss 3.02143:  75%|███████▍  | 11594/15515 [12:48<04:22, 14.92it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11598: train loss 2.94037:  75%|███████▍  | 11598/15515 [12:48<04:19, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11602: train loss 2.95106:  75%|███████▍  | 11602/15515 [12:49<04:18, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11606: train loss 3.01845:  75%|███████▍  | 11606/15515 [12:49<04:13, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11610: train loss 2.94926:  75%|███████▍  | 11610/15515 [12:49<04:13, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11614: train loss 2.85579:  75%|███████▍  | 11614/15515 [12:49<04:11, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11618: train loss 2.92765:  75%|███████▍  | 11618/15515 [12:50<04:14, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11622: train loss 2.90782:  75%|███████▍  | 11622/15515 [12:50<04:14, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11626: train loss 2.97750:  75%|███████▍  | 11626/15515 [12:50<04:11, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11630: train loss 3.00436:  75%|███████▍  | 11630/15515 [12:50<04:11, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11634: train loss 2.91047:  75%|███████▍  | 11634/15515 [12:51<04:15, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11638: train loss 2.92796:  75%|███████▌  | 11638/15515 [12:51<04:12, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11642: train loss 2.87536:  75%|███████▌  | 11642/15515 [12:51<04:08, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11646: train loss 2.91164:  75%|███████▌  | 11646/15515 [12:51<04:09, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11650: train loss 2.94887:  75%|███████▌  | 11650/15515 [12:52<04:10, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11654: train loss 2.94335:  75%|███████▌  | 11654/15515 [12:52<04:12, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11658: train loss 2.96182:  75%|███████▌  | 11658/15515 [12:52<04:12, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11662: train loss 2.99305:  75%|███████▌  | 11662/15515 [12:53<04:15, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11666: train loss 2.92574:  75%|███████▌  | 11666/15515 [12:53<04:14, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11670: train loss 2.89183:  75%|███████▌  | 11670/15515 [12:53<04:14, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11674: train loss 2.90480:  75%|███████▌  | 11674/15515 [12:53<04:13, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11678: train loss 2.93149:  75%|███████▌  | 11678/15515 [12:54<04:12, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11682: train loss 2.94674:  75%|███████▌  | 11682/15515 [12:54<04:11, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11686: train loss 2.90966:  75%|███████▌  | 11686/15515 [12:54<04:11, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11690: train loss 2.98303:  75%|███████▌  | 11690/15515 [12:54<04:11, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11694: train loss 2.95898:  75%|███████▌  | 11694/15515 [12:55<04:09, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11698: train loss 2.95915:  75%|███████▌  | 11698/15515 [12:55<04:10, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11702: train loss 2.91939:  75%|███████▌  | 11702/15515 [12:55<04:10, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11706: train loss 2.99505:  75%|███████▌  | 11706/15515 [12:55<04:10, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11710: train loss 2.97730:  75%|███████▌  | 11710/15515 [12:56<04:10, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11714: train loss 2.91494:  76%|███████▌  | 11714/15515 [12:56<04:11, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11718: train loss 2.94044:  76%|███████▌  | 11718/15515 [12:56<04:10, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11722: train loss 2.95317:  76%|███████▌  | 11722/15515 [12:56<04:09, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11726: train loss 2.97800:  76%|███████▌  | 11726/15515 [12:57<04:08, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11730: train loss 2.96023:  76%|███████▌  | 11730/15515 [12:57<04:08, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11734: train loss 2.96933:  76%|███████▌  | 11734/15515 [12:57<04:08, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11738: train loss 2.87587:  76%|███████▌  | 11738/15515 [12:58<04:07, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11742: train loss 3.05740:  76%|███████▌  | 11742/15515 [12:58<04:07, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11746: train loss 2.93194:  76%|███████▌  | 11746/15515 [12:58<04:07, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11750: train loss 2.96244:  76%|███████▌  | 11750/15515 [12:58<04:08, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11754: train loss 2.92703:  76%|███████▌  | 11754/15515 [12:59<04:04, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11758: train loss 2.90582:  76%|███████▌  | 11758/15515 [12:59<04:02, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11762: train loss 2.93611:  76%|███████▌  | 11762/15515 [12:59<04:03, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11766: train loss 2.88261:  76%|███████▌  | 11766/15515 [12:59<04:00, 15.61it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11770: train loss 2.93740:  76%|███████▌  | 11770/15515 [13:00<03:58, 15.67it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11774: train loss 2.98021:  76%|███████▌  | 11774/15515 [13:00<04:01, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11778: train loss 2.94912:  76%|███████▌  | 11778/15515 [13:00<04:02, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11782: train loss 2.92569:  76%|███████▌  | 11782/15515 [13:00<04:01, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11786: train loss 2.92636:  76%|███████▌  | 11786/15515 [13:01<04:00, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11790: train loss 2.93689:  76%|███████▌  | 11790/15515 [13:01<04:04, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11793: train loss 2.88551:  76%|███████▌  | 11794/15515 [13:01<04:06, 15.11it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11797: train loss 2.96214:  76%|███████▌  | 11798/15515 [13:01<04:09, 14.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11800: train loss 2.89621:  76%|███████▌  | 11800/15515 [13:02<04:10, 14.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11803: train loss 2.99478:  76%|███████▌  | 11804/15515 [13:02<04:11, 14.76it/s]

32459 128
32459 128


epoch 0 iter 11806: train loss 2.97282:  76%|███████▌  | 11806/15515 [13:02<04:09, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11810: train loss 2.99529:  76%|███████▌  | 11810/15515 [13:02<04:07, 14.96it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 11814: train loss 2.94589:  76%|███████▌  | 11814/15515 [13:03<04:12, 14.67it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 11817: train loss 2.91285:  76%|███████▌  | 11818/15515 [13:03<04:11, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11820: train loss 2.97139:  76%|███████▌  | 11820/15515 [13:03<04:06, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11824: train loss 2.89184:  76%|███████▌  | 11824/15515 [13:03<04:03, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11828: train loss 2.97343:  76%|███████▌  | 11828/15515 [13:03<04:01, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11832: train loss 2.96956:  76%|███████▋  | 11832/15515 [13:04<04:01, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11836: train loss 2.89805:  76%|███████▋  | 11836/15515 [13:04<04:00, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11840: train loss 2.93733:  76%|███████▋  | 11840/15515 [13:04<04:01, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11844: train loss 2.90675:  76%|███████▋  | 11844/15515 [13:05<04:00, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11848: train loss 2.93540:  76%|███████▋  | 11848/15515 [13:05<04:00, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11852: train loss 2.87993:  76%|███████▋  | 11852/15515 [13:05<04:02, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11856: train loss 2.95920:  76%|███████▋  | 11856/15515 [13:05<04:00, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11860: train loss 2.99968:  76%|███████▋  | 11860/15515 [13:06<04:04, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11864: train loss 2.90820:  76%|███████▋  | 11864/15515 [13:06<04:01, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11868: train loss 2.89051:  76%|███████▋  | 11868/15515 [13:06<04:02, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11872: train loss 2.95714:  77%|███████▋  | 11872/15515 [13:06<04:00, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11876: train loss 2.95542:  77%|███████▋  | 11876/15515 [13:07<03:59, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11880: train loss 2.93756:  77%|███████▋  | 11880/15515 [13:07<03:58, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11884: train loss 2.95607:  77%|███████▋  | 11884/15515 [13:07<03:59, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11888: train loss 2.92942:  77%|███████▋  | 11888/15515 [13:07<03:59, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11892: train loss 2.96349:  77%|███████▋  | 11892/15515 [13:08<04:01, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11896: train loss 2.97010:  77%|███████▋  | 11896/15515 [13:08<03:59, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11900: train loss 2.94250:  77%|███████▋  | 11900/15515 [13:08<03:59, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11904: train loss 2.88732:  77%|███████▋  | 11904/15515 [13:08<03:59, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11908: train loss 2.98232:  77%|███████▋  | 11908/15515 [13:09<03:58, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11912: train loss 2.88050:  77%|███████▋  | 11912/15515 [13:09<03:55, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11916: train loss 2.89857:  77%|███████▋  | 11916/15515 [13:09<03:54, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11920: train loss 2.92069:  77%|███████▋  | 11920/15515 [13:10<03:54, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11924: train loss 2.93028:  77%|███████▋  | 11924/15515 [13:10<03:53, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11928: train loss 2.83808:  77%|███████▋  | 11928/15515 [13:10<03:50, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11932: train loss 3.00569:  77%|███████▋  | 11932/15515 [13:10<03:51, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11936: train loss 2.94637:  77%|███████▋  | 11936/15515 [13:11<03:53, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11940: train loss 2.89542:  77%|███████▋  | 11940/15515 [13:11<03:52, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11944: train loss 2.99237:  77%|███████▋  | 11944/15515 [13:11<03:51, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11948: train loss 2.93725:  77%|███████▋  | 11948/15515 [13:11<03:49, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11952: train loss 2.95962:  77%|███████▋  | 11952/15515 [13:12<03:49, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11956: train loss 2.98471:  77%|███████▋  | 11956/15515 [13:12<03:52, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11960: train loss 2.91983:  77%|███████▋  | 11960/15515 [13:12<03:52, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11964: train loss 2.89935:  77%|███████▋  | 11964/15515 [13:12<03:54, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11968: train loss 2.92921:  77%|███████▋  | 11968/15515 [13:13<03:54, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11972: train loss 2.85897:  77%|███████▋  | 11972/15515 [13:13<03:55, 15.02it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11975: train loss 3.03029:  77%|███████▋  | 11976/15515 [13:13<03:56, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11979: train loss 2.99081:  77%|███████▋  | 11980/15515 [13:13<03:55, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11982: train loss 2.95692:  77%|███████▋  | 11982/15515 [13:14<03:55, 14.97it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11986: train loss 2.97489:  77%|███████▋  | 11986/15515 [13:14<03:53, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11990: train loss 2.93667:  77%|███████▋  | 11990/15515 [13:14<03:53, 15.12it/s]

32459 128
32459 128
32459 128


epoch 0 iter 11993: train loss 2.97840:  77%|███████▋  | 11994/15515 [13:14<03:51, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 11997: train loss 2.94411:  77%|███████▋  | 11998/15515 [13:15<03:50, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12001: train loss 2.84781:  77%|███████▋  | 12002/15515 [13:15<03:50, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12005: train loss 2.95000:  77%|███████▋  | 12006/15515 [13:15<03:48, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12009: train loss 2.89019:  77%|███████▋  | 12010/15515 [13:15<03:48, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12013: train loss 2.96584:  77%|███████▋  | 12014/15515 [13:16<03:50, 15.21it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 12016: train loss 2.96414:  77%|███████▋  | 12016/15515 [13:16<03:52, 15.08it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 12020: train loss 2.93200:  77%|███████▋  | 12020/15515 [13:16<03:51, 15.10it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12023: train loss 2.91768:  77%|███████▋  | 12024/15515 [13:16<03:52, 14.98it/s]

 128
32459 128
32459 128
32459 128

epoch 0 iter 12026: train loss 2.95904:  78%|███████▊  | 12026/15515 [13:16<03:52, 14.99it/s]


32459 128
32459 128


epoch 0 iter 12029: train loss 2.91068:  78%|███████▊  | 12030/15515 [13:17<03:50, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12033: train loss 2.93151:  78%|███████▊  | 12034/15515 [13:17<03:48, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12037: train loss 2.99090:  78%|███████▊  | 12038/15515 [13:17<03:48, 15.20it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12040: train loss 2.95663:  78%|███████▊  | 12040/15515 [13:17<03:46, 15.31it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 12044: train loss 2.97843:  78%|███████▊  | 12044/15515 [13:18<03:48, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12048: train loss 2.97007:  78%|███████▊  | 12048/15515 [13:18<03:51, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12052: train loss 2.92280:  78%|███████▊  | 12052/15515 [13:18<03:48, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12056: train loss 2.90088:  78%|███████▊  | 12056/15515 [13:18<03:45, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12060: train loss 2.96564:  78%|███████▊  | 12060/15515 [13:19<03:42, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12064: train loss 2.95289:  78%|███████▊  | 12064/15515 [13:19<03:41, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12068: train loss 2.91081:  78%|███████▊  | 12068/15515 [13:19<03:43, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12072: train loss 2.95336:  78%|███████▊  | 12072/15515 [13:19<03:42, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12076: train loss 2.97812:  78%|███████▊  | 12076/15515 [13:20<03:40, 15.62it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12080: train loss 2.96042:  78%|███████▊  | 12080/15515 [13:20<03:40, 15.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12084: train loss 2.94941:  78%|███████▊  | 12084/15515 [13:20<03:39, 15.64it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12088: train loss 2.97157:  78%|███████▊  | 12088/15515 [13:20<03:39, 15.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12092: train loss 2.95221:  78%|███████▊  | 12092/15515 [13:21<03:40, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12096: train loss 2.96236:  78%|███████▊  | 12096/15515 [13:21<03:42, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12100: train loss 2.90376:  78%|███████▊  | 12100/15515 [13:21<03:41, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12104: train loss 2.91270:  78%|███████▊  | 12104/15515 [13:22<03:42, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12108: train loss 2.93156:  78%|███████▊  | 12108/15515 [13:22<03:43, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12112: train loss 2.96059:  78%|███████▊  | 12112/15515 [13:22<03:47, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12116: train loss 2.94404:  78%|███████▊  | 12116/15515 [13:22<03:46, 15.02it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12119: train loss 2.93886:  78%|███████▊  | 12120/15515 [13:23<03:48, 14.85it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12122: train loss 2.97356:  78%|███████▊  | 12122/15515 [13:23<03:47, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12126: train loss 2.99513:  78%|███████▊  | 12126/15515 [13:23<03:44, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12129: train loss 2.95854:  78%|███████▊  | 12130/15515 [13:23<03:46, 14.97it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12132: train loss 2.96371:  78%|███████▊  | 12132/15515 [13:23<03:47, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12136: train loss 2.92346:  78%|███████▊  | 12136/15515 [13:24<03:44, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12140: train loss 2.97033:  78%|███████▊  | 12140/15515 [13:24<03:44, 15.06it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 12143: train loss 2.89469:  78%|███████▊  | 12144/15515 [13:24<03:47, 14.85it/s]

128
32459 128
32459 128


epoch 0 iter 12146: train loss 3.00110:  78%|███████▊  | 12146/15515 [13:24<03:47, 14.83it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12149: train loss 2.92615:  78%|███████▊  | 12150/15515 [13:25<03:47, 14.76it/s]

 128
32459 128
32459 128


epoch 0 iter 12152: train loss 2.92589:  78%|███████▊  | 12152/15515 [13:25<03:47, 14.79it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12156: train loss 2.95466:  78%|███████▊  | 12156/15515 [13:25<03:43, 15.04it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 12159: train loss 2.95113:  78%|███████▊  | 12160/15515 [13:25<03:43, 14.99it/s]

128
32459 128
32459 128


epoch 0 iter 12162: train loss 2.90120:  78%|███████▊  | 12162/15515 [13:25<03:43, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12166: train loss 2.93871:  78%|███████▊  | 12166/15515 [13:26<03:41, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12170: train loss 2.85846:  78%|███████▊  | 12170/15515 [13:26<03:42, 15.04it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12173: train loss 2.86984:  78%|███████▊  | 12174/15515 [13:26<03:43, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12176: train loss 2.93633:  78%|███████▊  | 12176/15515 [13:26<03:42, 15.02it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12180: train loss 2.94197:  79%|███████▊  | 12180/15515 [13:27<03:40, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12184: train loss 2.94242:  79%|███████▊  | 12184/15515 [13:27<03:39, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12188: train loss 2.88513:  79%|███████▊  | 12188/15515 [13:27<03:41, 14.99it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12191: train loss 2.89046:  79%|███████▊  | 12192/15515 [13:27<03:39, 15.13it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 12195: train loss 2.92368:  79%|███████▊  | 12196/15515 [13:28<03:39, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12198: train loss 2.93835:  79%|███████▊  | 12198/15515 [13:28<03:39, 15.14it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12202: train loss 2.98075:  79%|███████▊  | 12202/15515 [13:28<03:37, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12206: train loss 2.89191:  79%|███████▊  | 12206/15515 [13:28<03:37, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12210: train loss 2.95927:  79%|███████▊  | 12210/15515 [13:29<03:39, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12213: train loss 2.86436:  79%|███████▊  | 12214/15515 [13:29<03:40, 14.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12216: train loss 2.87968:  79%|███████▊  | 12216/15515 [13:29<03:40, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12220: train loss 2.88601:  79%|███████▉  | 12220/15515 [13:29<03:38, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12224: train loss 2.99640:  79%|███████▉  | 12224/15515 [13:30<03:34, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12228: train loss 2.94412:  79%|███████▉  | 12228/15515 [13:30<03:34, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12232: train loss 2.93304:  79%|███████▉  | 12232/15515 [13:30<03:32, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12236: train loss 2.93457:  79%|███████▉  | 12236/15515 [13:30<03:30, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12240: train loss 2.97639:  79%|███████▉  | 12240/15515 [13:31<03:30, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12244: train loss 2.88660:  79%|███████▉  | 12244/15515 [13:31<03:35, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12248: train loss 2.91714:  79%|███████▉  | 12248/15515 [13:31<03:34, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12252: train loss 2.91087:  79%|███████▉  | 12252/15515 [13:31<03:35, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12256: train loss 2.94816:  79%|███████▉  | 12256/15515 [13:32<03:36, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12260: train loss 2.94318:  79%|███████▉  | 12260/15515 [13:32<03:36, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12264: train loss 2.89764:  79%|███████▉  | 12264/15515 [13:32<03:36, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12268: train loss 2.94068:  79%|███████▉  | 12268/15515 [13:32<03:38, 14.88it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 12271: train loss 2.96810:  79%|███████▉  | 12272/15515 [13:33<03:35, 15.06it/s]


32459 128
32459 128
32459 128


epoch 0 iter 12275: train loss 2.93102:  79%|███████▉  | 12276/15515 [13:33<03:39, 14.76it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12278: train loss 2.94563:  79%|███████▉  | 12278/15515 [13:33<03:37, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12282: train loss 2.94502:  79%|███████▉  | 12282/15515 [13:33<03:33, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12286: train loss 2.93888:  79%|███████▉  | 12286/15515 [13:34<03:33, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12290: train loss 2.94807:  79%|███████▉  | 12290/15515 [13:34<03:32, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12294: train loss 2.94648:  79%|███████▉  | 12294/15515 [13:34<03:31, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12298: train loss 2.91007:  79%|███████▉  | 12298/15515 [13:34<03:29, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12302: train loss 2.91938:  79%|███████▉  | 12302/15515 [13:35<03:27, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12306: train loss 2.87670:  79%|███████▉  | 12306/15515 [13:35<03:26, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12310: train loss 2.89973:  79%|███████▉  | 12310/15515 [13:35<03:28, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12314: train loss 2.88384:  79%|███████▉  | 12314/15515 [13:35<03:29, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12318: train loss 2.93194:  79%|███████▉  | 12318/15515 [13:36<03:29, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12322: train loss 2.96906:  79%|███████▉  | 12322/15515 [13:36<03:31, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12326: train loss 2.97766:  79%|███████▉  | 12326/15515 [13:36<03:31, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12330: train loss 2.87133:  79%|███████▉  | 12330/15515 [13:37<03:30, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12334: train loss 2.92619:  79%|███████▉  | 12334/15515 [13:37<03:27, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12338: train loss 2.97308:  80%|███████▉  | 12338/15515 [13:37<03:26, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12342: train loss 2.92468:  80%|███████▉  | 12342/15515 [13:37<03:28, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12346: train loss 2.96544:  80%|███████▉  | 12346/15515 [13:38<03:28, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12350: train loss 2.95536:  80%|███████▉  | 12350/15515 [13:38<03:27, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12354: train loss 2.91842:  80%|███████▉  | 12354/15515 [13:38<03:27, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12358: train loss 2.92703:  80%|███████▉  | 12358/15515 [13:38<03:26, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12362: train loss 2.93910:  80%|███████▉  | 12362/15515 [13:39<03:25, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12366: train loss 2.91009:  80%|███████▉  | 12366/15515 [13:39<03:26, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12370: train loss 2.85462:  80%|███████▉  | 12370/15515 [13:39<03:27, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12374: train loss 2.90514:  80%|███████▉  | 12374/15515 [13:39<03:27, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12378: train loss 2.96572:  80%|███████▉  | 12378/15515 [13:40<03:28, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12382: train loss 2.91406:  80%|███████▉  | 12382/15515 [13:40<03:29, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12386: train loss 2.92718:  80%|███████▉  | 12386/15515 [13:40<03:28, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12390: train loss 2.90298:  80%|███████▉  | 12390/15515 [13:40<03:28, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12394: train loss 2.99648:  80%|███████▉  | 12394/15515 [13:41<03:29, 14.92it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12397: train loss 2.94666:  80%|███████▉  | 12398/15515 [13:41<03:28, 14.95it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12400: train loss 2.93425:  80%|███████▉  | 12400/15515 [13:41<03:27, 15.04it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 12404: train loss 2.91323:  80%|███████▉  | 12404/15515 [13:41<03:29, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12408: train loss 2.93592:  80%|███████▉  | 12408/15515 [13:42<03:25, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12412: train loss 2.93392:  80%|████████  | 12412/15515 [13:42<03:26, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12416: train loss 2.88698:  80%|████████  | 12416/15515 [13:42<03:24, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12420: train loss 2.89720:  80%|████████  | 12420/15515 [13:42<03:24, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12424: train loss 2.90225:  80%|████████  | 12424/15515 [13:43<03:24, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12428: train loss 2.86629:  80%|████████  | 12428/15515 [13:43<03:23, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12432: train loss 2.95122:  80%|████████  | 12432/15515 [13:43<03:26, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12436: train loss 2.95597:  80%|████████  | 12436/15515 [13:44<03:26, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12440: train loss 2.95368:  80%|████████  | 12440/15515 [13:44<03:24, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12444: train loss 2.91864:  80%|████████  | 12444/15515 [13:44<03:21, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12448: train loss 2.91838:  80%|████████  | 12448/15515 [13:44<03:19, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12452: train loss 2.92912:  80%|████████  | 12452/15515 [13:45<03:19, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12456: train loss 2.98411:  80%|████████  | 12456/15515 [13:45<03:18, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12460: train loss 2.89385:  80%|████████  | 12460/15515 [13:45<03:18, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12464: train loss 2.89308:  80%|████████  | 12464/15515 [13:45<03:19, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12468: train loss 2.90105:  80%|████████  | 12468/15515 [13:46<03:22, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12472: train loss 2.91159:  80%|████████  | 12472/15515 [13:46<03:22, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12476: train loss 2.92476:  80%|████████  | 12476/15515 [13:46<03:21, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12480: train loss 2.87830:  80%|████████  | 12480/15515 [13:46<03:23, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12484: train loss 2.96681:  80%|████████  | 12484/15515 [13:47<03:20, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12488: train loss 2.89091:  80%|████████  | 12488/15515 [13:47<03:23, 14.91it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12491: train loss 2.85985:  81%|████████  | 12492/15515 [13:47<03:22, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12494: train loss 2.94330:  81%|████████  | 12494/15515 [13:47<03:22, 14.88it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12497: train loss 2.97590:  81%|████████  | 12498/15515 [13:48<03:24, 14.77it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12500: train loss 2.91603:  81%|████████  | 12500/15515 [13:48<03:23, 14.83it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 12504: train loss 2.93801:  81%|████████  | 12504/15515 [13:48<03:20, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12508: train loss 2.89022:  81%|████████  | 12508/15515 [13:48<03:20, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12512: train loss 2.93691:  81%|████████  | 12512/15515 [13:49<03:20, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12516: train loss 2.95971:  81%|████████  | 12516/15515 [13:49<03:19, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12520: train loss 2.90951:  81%|████████  | 12520/15515 [13:49<03:19, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12524: train loss 2.88556:  81%|████████  | 12524/15515 [13:49<03:18, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12528: train loss 2.92057:  81%|████████  | 12528/15515 [13:50<03:17, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12532: train loss 2.92484:  81%|████████  | 12532/15515 [13:50<03:15, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12536: train loss 2.91610:  81%|████████  | 12536/15515 [13:50<03:15, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12540: train loss 2.90511:  81%|████████  | 12540/15515 [13:50<03:16, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12544: train loss 2.98795:  81%|████████  | 12544/15515 [13:51<03:14, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12548: train loss 2.92659:  81%|████████  | 12548/15515 [13:51<03:15, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12552: train loss 2.97728:  81%|████████  | 12552/15515 [13:51<03:15, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12556: train loss 2.91352:  81%|████████  | 12556/15515 [13:51<03:15, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12560: train loss 2.86213:  81%|████████  | 12560/15515 [13:52<03:15, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12564: train loss 2.85816:  81%|████████  | 12564/15515 [13:52<03:16, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12568: train loss 2.95618:  81%|████████  | 12568/15515 [13:52<03:14, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12572: train loss 2.90633:  81%|████████  | 12572/15515 [13:53<03:13, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12576: train loss 2.90119:  81%|████████  | 12576/15515 [13:53<03:12, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12580: train loss 2.91786:  81%|████████  | 12580/15515 [13:53<03:12, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12584: train loss 2.90347:  81%|████████  | 12584/15515 [13:53<03:11, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12588: train loss 2.92620:  81%|████████  | 12588/15515 [13:54<03:11, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12592: train loss 2.98004:  81%|████████  | 12592/15515 [13:54<03:09, 15.41it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12595: train loss 2.88807:  81%|████████  | 12596/15515 [13:54<03:12, 15.20it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 12599: train loss 2.91438:  81%|████████  | 12600/15515 [13:54<03:11, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12603: train loss 2.92099:  81%|████████  | 12604/15515 [13:55<03:08, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12607: train loss 2.98259:  81%|████████▏ | 12608/15515 [13:55<03:07, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12611: train loss 2.92920:  81%|████████▏ | 12612/15515 [13:55<03:08, 15.38it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12614: train loss 2.93195:  81%|████████▏ | 12614/15515 [13:55<03:12, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12618: train loss 2.89641:  81%|████████▏ | 12618/15515 [13:56<03:12, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12622: train loss 2.90524:  81%|████████▏ | 12622/15515 [13:56<03:10, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12626: train loss 2.89216:  81%|████████▏ | 12626/15515 [13:56<03:11, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12630: train loss 2.87013:  81%|████████▏ | 12630/15515 [13:56<03:09, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12634: train loss 2.90460:  81%|████████▏ | 12634/15515 [13:57<03:09, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12638: train loss 2.89074:  81%|████████▏ | 12638/15515 [13:57<03:08, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12642: train loss 2.87014:  81%|████████▏ | 12642/15515 [13:57<03:06, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12646: train loss 2.95651:  82%|████████▏ | 12646/15515 [13:57<03:05, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12650: train loss 2.89814:  82%|████████▏ | 12650/15515 [13:58<03:07, 15.27it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12653: train loss 2.91766:  82%|████████▏ | 12654/15515 [13:58<03:08, 15.17it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 12656: train loss 2.96753:  82%|████████▏ | 12656/15515 [13:58<03:08, 15.13it/s]

32459 128
32459 128


epoch 0 iter 12659: train loss 2.92633:  82%|████████▏ | 12660/15515 [13:58<03:12, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12662: train loss 2.97807:  82%|████████▏ | 12662/15515 [13:58<03:14, 14.64it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12665: train loss 2.92042:  82%|████████▏ | 12666/15515 [13:59<03:14, 14.66it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12668: train loss 2.87414:  82%|████████▏ | 12668/15515 [13:59<03:12, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12672: train loss 2.98455:  82%|████████▏ | 12672/15515 [13:59<03:12, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12676: train loss 2.85954:  82%|████████▏ | 12676/15515 [13:59<03:11, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12680: train loss 2.95932:  82%|████████▏ | 12680/15515 [14:00<03:09, 14.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12683: train loss 2.95286:  82%|████████▏ | 12684/15515 [14:00<03:09, 14.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12686: train loss 2.94917:  82%|████████▏ | 12686/15515 [14:00<03:09, 14.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12689: train loss 2.89214:  82%|████████▏ | 12690/15515 [14:00<03:08, 14.95it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 12692: train loss 2.90453:  82%|████████▏ | 12692/15515 [14:00<03:11, 14.78it/s]


32459 128
32459 128


epoch 0 iter 12695: train loss 2.95113:  82%|████████▏ | 12696/15515 [14:01<03:09, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12699: train loss 2.91023:  82%|████████▏ | 12700/15515 [14:01<03:07, 15.03it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12702: train loss 2.90047:  82%|████████▏ | 12702/15515 [14:01<03:05, 15.13it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 12706: train loss 2.98133:  82%|████████▏ | 12706/15515 [14:01<03:04, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12710: train loss 2.90242:  82%|████████▏ | 12710/15515 [14:02<03:03, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12714: train loss 2.86483:  82%|████████▏ | 12714/15515 [14:02<03:07, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12718: train loss 2.90580:  82%|████████▏ | 12718/15515 [14:02<03:04, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12722: train loss 2.93466:  82%|████████▏ | 12722/15515 [14:02<03:04, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12726: train loss 2.96751:  82%|████████▏ | 12726/15515 [14:03<03:05, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12730: train loss 2.86629:  82%|████████▏ | 12730/15515 [14:03<03:02, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12734: train loss 2.85044:  82%|████████▏ | 12734/15515 [14:03<02:58, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12738: train loss 2.88089:  82%|████████▏ | 12738/15515 [14:03<02:59, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12742: train loss 2.96211:  82%|████████▏ | 12742/15515 [14:04<03:00, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12746: train loss 2.90375:  82%|████████▏ | 12746/15515 [14:04<03:01, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12750: train loss 2.96156:  82%|████████▏ | 12750/15515 [14:04<03:02, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12754: train loss 2.90999:  82%|████████▏ | 12754/15515 [14:05<03:01, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12758: train loss 2.93279:  82%|████████▏ | 12758/15515 [14:05<03:00, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12762: train loss 2.96335:  82%|████████▏ | 12762/15515 [14:05<02:59, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12766: train loss 2.90891:  82%|████████▏ | 12766/15515 [14:05<02:59, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12770: train loss 2.93340:  82%|████████▏ | 12770/15515 [14:06<02:57, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12774: train loss 2.90935:  82%|████████▏ | 12774/15515 [14:06<02:59, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12778: train loss 2.88981:  82%|████████▏ | 12778/15515 [14:06<03:00, 15.16it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12781: train loss 2.91877:  82%|████████▏ | 12782/15515 [14:06<03:00, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12785: train loss 2.87515:  82%|████████▏ | 12786/15515 [14:07<03:01, 15.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12788: train loss 2.91564:  82%|████████▏ | 12788/15515 [14:07<03:01, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12792: train loss 2.91573:  82%|████████▏ | 12792/15515 [14:07<02:59, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12796: train loss 2.89164:  82%|████████▏ | 12796/15515 [14:07<02:59, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12800: train loss 2.88612:  83%|████████▎ | 12800/15515 [14:08<03:00, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12804: train loss 2.94498:  83%|████████▎ | 12804/15515 [14:08<02:56, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12808: train loss 2.94461:  83%|████████▎ | 12808/15515 [14:08<02:57, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12812: train loss 2.83979:  83%|████████▎ | 12812/15515 [14:08<02:55, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12816: train loss 2.93309:  83%|████████▎ | 12816/15515 [14:09<02:53, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12820: train loss 2.89847:  83%|████████▎ | 12820/15515 [14:09<02:54, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12824: train loss 2.89753:  83%|████████▎ | 12824/15515 [14:09<02:53, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12828: train loss 2.91025:  83%|████████▎ | 12828/15515 [14:09<02:52, 15.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12832: train loss 2.90839:  83%|████████▎ | 12832/15515 [14:10<02:52, 15.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12836: train loss 2.94309:  83%|████████▎ | 12836/15515 [14:10<02:54, 15.35it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12839: train loss 2.89945:  83%|████████▎ | 12840/15515 [14:10<02:57, 15.11it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 12843: train loss 2.93393:  83%|████████▎ | 12844/15515 [14:10<02:57, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12847: train loss 2.93130:  83%|████████▎ | 12848/15515 [14:11<02:57, 15.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12850: train loss 2.97106:  83%|████████▎ | 12850/15515 [14:11<02:56, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12854: train loss 2.87765:  83%|████████▎ | 12854/15515 [14:11<02:56, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12858: train loss 2.90420:  83%|████████▎ | 12858/15515 [14:11<02:54, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12862: train loss 2.93932:  83%|████████▎ | 12862/15515 [14:12<02:52, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12866: train loss 2.93564:  83%|████████▎ | 12866/15515 [14:12<02:52, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12870: train loss 2.92364:  83%|████████▎ | 12870/15515 [14:12<02:57, 14.88it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12873: train loss 2.89085:  83%|████████▎ | 12874/15515 [14:12<03:00, 14.61it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 12875: train loss 2.92047:  83%|████████▎ | 12876/15515 [14:13<03:05, 14.22it/s]


32459 128
32459 128


epoch 0 iter 12878: train loss 2.89276:  83%|████████▎ | 12878/15515 [14:13<03:07, 14.09it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12882: train loss 2.92130:  83%|████████▎ | 12882/15515 [14:13<03:10, 13.79it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12884: train loss 2.96936:  83%|████████▎ | 12884/15515 [14:13<03:13, 13.63it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12888: train loss 2.92848:  83%|████████▎ | 12888/15515 [14:13<03:08, 13.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12891: train loss 2.89189:  83%|████████▎ | 12892/15515 [14:14<03:05, 14.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12893: train loss 2.98359:  83%|████████▎ | 12894/15515 [14:14<03:05, 14.11it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12897: train loss 2.93149:  83%|████████▎ | 12898/15515 [14:14<03:06, 14.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12901: train loss 2.95876:  83%|████████▎ | 12902/15515 [14:14<03:03, 14.24it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12903: train loss 2.91806:  83%|████████▎ | 12904/15515 [14:15<03:07, 13.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12907: train loss 2.92311:  83%|████████▎ | 12908/15515 [14:15<03:11, 13.60it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12909: train loss 2.84974:  83%|████████▎ | 12910/15515 [14:15<03:11, 13.62it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12913: train loss 2.95057:  83%|████████▎ | 12914/15515 [14:15<03:05, 13.99it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12914: train loss 2.92260:  83%|████████▎ | 12914/15515 [14:15<03:05, 13.99it/s]

32459 128
32459 128


epoch 0 iter 12916: train loss 2.90182:  83%|████████▎ | 12916/15515 [14:16<03:46, 11.46it/s]

32459 128
32459 128


epoch 0 iter 12919: train loss 2.86212:  83%|████████▎ | 12920/15515 [14:16<04:08, 10.43it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12922: train loss 2.91919:  83%|████████▎ | 12922/15515 [14:16<03:52, 11.16it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12925: train loss 2.90291:  83%|████████▎ | 12926/15515 [14:16<03:33, 12.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12929: train loss 2.86828:  83%|████████▎ | 12930/15515 [14:17<03:17, 13.09it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12931: train loss 2.90763:  83%|████████▎ | 12932/15515 [14:17<03:15, 13.22it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12935: train loss 2.88414:  83%|████████▎ | 12936/15515 [14:17<03:08, 13.71it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12938: train loss 2.92559:  83%|████████▎ | 12938/15515 [14:17<03:04, 13.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12942: train loss 2.87010:  83%|████████▎ | 12942/15515 [14:18<02:57, 14.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12946: train loss 2.94198:  83%|████████▎ | 12946/15515 [14:18<02:53, 14.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12948: train loss 2.87221:  83%|████████▎ | 12948/15515 [14:18<03:02, 14.06it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12952: train loss 2.95061:  83%|████████▎ | 12952/15515 [14:18<03:00, 14.22it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 12955: train loss 2.95946:  83%|████████▎ | 12954/15515 [14:19<03:04, 13.91it/s]

 128
32459 128
32459 128


epoch 0 iter 12958: train loss 2.92012:  84%|████████▎ | 12958/15515 [14:19<02:58, 14.33it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12960: train loss 2.92523:  84%|████████▎ | 12960/15515 [14:19<03:00, 14.14it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12964: train loss 2.89942:  84%|████████▎ | 12964/15515 [14:19<02:59, 14.19it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 12967: train loss 2.97194:  84%|████████▎ | 12966/15515 [14:19<03:00, 14.11it/s]

128
32459 128
32459 128


epoch 0 iter 12969: train loss 2.90323:  84%|████████▎ | 12970/15515 [14:20<03:21, 12.63it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12973: train loss 2.91354:  84%|████████▎ | 12974/15515 [14:20<03:07, 13.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12977: train loss 2.92482:  84%|████████▎ | 12978/15515 [14:20<03:02, 13.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12980: train loss 2.90940:  84%|████████▎ | 12980/15515 [14:20<02:58, 14.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12983: train loss 3.01041:  84%|████████▎ | 12984/15515 [14:21<03:03, 13.80it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12987: train loss 2.90440:  84%|████████▎ | 12988/15515 [14:21<03:05, 13.64it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12989: train loss 2.98452:  84%|████████▎ | 12990/15515 [14:21<03:03, 13.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12993: train loss 2.90429:  84%|████████▍ | 12994/15515 [14:21<03:00, 14.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 12996: train loss 2.93942:  84%|████████▍ | 12996/15515 [14:21<02:58, 14.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 12999: train loss 2.90667:  84%|████████▍ | 13000/15515 [14:22<02:56, 14.28it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13003: train loss 2.90044:  84%|████████▍ | 13004/15515 [14:22<02:53, 14.50it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13006: train loss 2.90756:  84%|████████▍ | 13006/15515 [14:22<02:59, 13.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13009: train loss 2.89510:  84%|████████▍ | 13010/15515 [14:22<03:08, 13.26it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13013: train loss 2.89677:  84%|████████▍ | 13014/15515 [14:23<03:00, 13.86it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13015: train loss 2.95876:  84%|████████▍ | 13016/15515 [14:23<03:07, 13.36it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13019: train loss 2.89798:  84%|████████▍ | 13018/15515 [14:23<03:07, 13.31it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13022: train loss 2.89171:  84%|████████▍ | 13022/15515 [14:23<02:59, 13.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13024: train loss 2.89763:  84%|████████▍ | 13024/15515 [14:24<03:01, 13.73it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13027: train loss 2.85869:  84%|████████▍ | 13028/15515 [14:24<03:16, 12.65it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13030: train loss 2.91085:  84%|████████▍ | 13030/15515 [14:24<03:27, 12.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13033: train loss 2.91545:  84%|████████▍ | 13034/15515 [14:24<03:51, 10.70it/s]

32459 128
32459 128


epoch 0 iter 13035: train loss 2.95027:  84%|████████▍ | 13036/15515 [14:25<04:02, 10.21it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13039: train loss 2.90917:  84%|████████▍ | 13040/15515 [14:25<03:20, 12.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13043: train loss 2.86174:  84%|████████▍ | 13044/15515 [14:25<03:02, 13.56it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13046: train loss 2.89435:  84%|████████▍ | 13046/15515 [14:25<02:55, 14.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13050: train loss 2.86231:  84%|████████▍ | 13050/15515 [14:26<02:45, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13054: train loss 2.92749:  84%|████████▍ | 13054/15515 [14:26<02:43, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13058: train loss 2.97482:  84%|████████▍ | 13058/15515 [14:26<02:41, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13062: train loss 2.93540:  84%|████████▍ | 13062/15515 [14:26<02:40, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13066: train loss 2.89655:  84%|████████▍ | 13066/15515 [14:27<02:38, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13070: train loss 2.90122:  84%|████████▍ | 13070/15515 [14:27<02:37, 15.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13074: train loss 2.94052:  84%|████████▍ | 13074/15515 [14:27<02:39, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13078: train loss 2.83604:  84%|████████▍ | 13078/15515 [14:27<02:41, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13082: train loss 2.92774:  84%|████████▍ | 13082/15515 [14:28<02:41, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13085: train loss 2.88718:  84%|████████▍ | 13086/15515 [14:28<02:41, 15.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13089: train loss 2.92667:  84%|████████▍ | 13090/15515 [14:28<02:39, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13093: train loss 2.92919:  84%|████████▍ | 13094/15515 [14:28<02:37, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13097: train loss 2.90822:  84%|████████▍ | 13098/15515 [14:29<02:40, 15.08it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 13100: train loss 2.90481:  84%|████████▍ | 13100/15515 [14:29<02:38, 15.26it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 13104: train loss 2.91867:  84%|████████▍ | 13104/15515 [14:29<02:37, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13108: train loss 2.91759:  84%|████████▍ | 13108/15515 [14:29<02:37, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13112: train loss 2.96282:  85%|████████▍ | 13112/15515 [14:30<02:37, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13116: train loss 2.96582:  85%|████████▍ | 13116/15515 [14:30<02:39, 15.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13119: train loss 2.83719:  85%|████████▍ | 13120/15515 [14:30<02:39, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13122: train loss 2.93064:  85%|████████▍ | 13122/15515 [14:30<02:45, 14.42it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13125: train loss 2.95975:  85%|████████▍ | 13126/15515 [14:31<03:02, 13.10it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13128: train loss 2.86526:  85%|████████▍ | 13128/15515 [14:31<03:17, 12.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13131: train loss 2.89432:  85%|████████▍ | 13132/15515 [14:31<03:06, 12.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13135: train loss 2.85775:  85%|████████▍ | 13136/15515 [14:31<02:59, 13.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13139: train loss 2.91578:  85%|████████▍ | 13140/15515 [14:32<02:47, 14.16it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13142: train loss 2.88585:  85%|████████▍ | 13142/15515 [14:32<02:45, 14.37it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 13146: train loss 2.84746:  85%|████████▍ | 13146/15515 [14:32<02:42, 14.61it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13149: train loss 2.83740:  85%|████████▍ | 13150/15515 [14:32<02:41, 14.67it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13152: train loss 2.86137:  85%|████████▍ | 13152/15515 [14:33<02:39, 14.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13156: train loss 2.92019:  85%|████████▍ | 13156/15515 [14:33<02:36, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13160: train loss 2.89127:  85%|████████▍ | 13160/15515 [14:33<02:33, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13164: train loss 2.90447:  85%|████████▍ | 13164/15515 [14:33<02:32, 15.37it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13166: train loss 2.91375:  85%|████████▍ | 13166/15515 [14:33<02:42, 14.49it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13169: train loss 2.88413:  85%|████████▍ | 13170/15515 [14:34<02:59, 13.05it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13172: train loss 2.92684:  85%|████████▍ | 13172/15515 [14:34<02:57, 13.21it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13175: train loss 2.95343:  85%|████████▍ | 13176/15515 [14:34<03:13, 12.10it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13179: train loss 2.85733:  85%|████████▍ | 13180/15515 [14:34<02:53, 13.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13183: train loss 2.91996:  85%|████████▍ | 13184/15515 [14:35<02:42, 14.33it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 13186: train loss 2.90991:  85%|████████▍ | 13186/15515 [14:35<02:42, 14.34it/s]


32459 128
32459 128
32459 128


epoch 0 iter 13190: train loss 2.91649:  85%|████████▌ | 13190/15515 [14:35<02:37, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13193: train loss 2.91004:  85%|████████▌ | 13194/15515 [14:35<02:39, 14.52it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13196: train loss 2.87941:  85%|████████▌ | 13196/15515 [14:36<02:44, 14.10it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13200: train loss 2.93685:  85%|████████▌ | 13200/15515 [14:36<02:46, 13.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13204: train loss 2.89966:  85%|████████▌ | 13204/15515 [14:36<02:37, 14.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13208: train loss 2.90797:  85%|████████▌ | 13208/15515 [14:36<02:33, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13212: train loss 2.87081:  85%|████████▌ | 13212/15515 [14:37<02:31, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13216: train loss 2.98015:  85%|████████▌ | 13216/15515 [14:37<02:29, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13220: train loss 2.90292:  85%|████████▌ | 13220/15515 [14:37<02:30, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13224: train loss 2.90311:  85%|████████▌ | 13224/15515 [14:37<02:30, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13228: train loss 2.85612:  85%|████████▌ | 13228/15515 [14:38<02:31, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13232: train loss 2.84253:  85%|████████▌ | 13232/15515 [14:38<02:30, 15.18it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 13234: train loss 2.88252:  85%|████████▌ | 13234/15515 [14:38<02:35, 14.66it/s]


32459 128
32459 128


epoch 0 iter 13237: train loss 2.93763:  85%|████████▌ | 13238/15515 [14:38<02:46, 13.64it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13240: train loss 2.87513:  85%|████████▌ | 13240/15515 [14:39<02:53, 13.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13244: train loss 2.90749:  85%|████████▌ | 13244/15515 [14:39<02:50, 13.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13248: train loss 2.84741:  85%|████████▌ | 13248/15515 [14:39<02:39, 14.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13252: train loss 2.92120:  85%|████████▌ | 13252/15515 [14:39<02:33, 14.74it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13255: train loss 2.85824:  85%|████████▌ | 13256/15515 [14:40<02:40, 14.12it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13258: train loss 2.92834:  85%|████████▌ | 13258/15515 [14:40<02:46, 13.55it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13262: train loss 2.82885:  85%|████████▌ | 13262/15515 [14:40<02:43, 13.80it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13264: train loss 2.83889:  85%|████████▌ | 13264/15515 [14:40<02:39, 14.13it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13268: train loss 2.91946:  86%|████████▌ | 13268/15515 [14:41<02:38, 14.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13271: train loss 2.92311:  86%|████████▌ | 13272/15515 [14:41<02:47, 13.39it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13275: train loss 2.95893:  86%|████████▌ | 13276/15515 [14:41<02:45, 13.57it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13278: train loss 2.89182:  86%|████████▌ | 13278/15515 [14:41<02:45, 13.53it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13281: train loss 2.90123:  86%|████████▌ | 13282/15515 [14:42<02:49, 13.16it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13285: train loss 2.94788:  86%|████████▌ | 13286/15515 [14:42<02:38, 14.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13289: train loss 2.98778:  86%|████████▌ | 13290/15515 [14:42<02:30, 14.77it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13293: train loss 2.91832:  86%|████████▌ | 13294/15515 [14:42<02:27, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13297: train loss 2.83030:  86%|████████▌ | 13298/15515 [14:43<02:28, 14.92it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13300: train loss 2.90522:  86%|████████▌ | 13300/15515 [14:43<02:28, 14.92it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13303: train loss 2.82886:  86%|████████▌ | 13304/15515 [14:43<02:29, 14.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13306: train loss 2.80040:  86%|████████▌ | 13306/15515 [14:43<02:28, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13310: train loss 2.85920:  86%|████████▌ | 13310/15515 [14:44<02:29, 14.77it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13314: train loss 2.84943:  86%|████████▌ | 13314/15515 [14:44<02:30, 14.62it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13317: train loss 2.87907:  86%|████████▌ | 13318/15515 [14:44<02:29, 14.69it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13320: train loss 2.87942:  86%|████████▌ | 13320/15515 [14:44<02:27, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13324: train loss 2.94330:  86%|████████▌ | 13324/15515 [14:44<02:25, 15.08it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13326: train loss 2.89867:  86%|████████▌ | 13326/15515 [14:45<02:28, 14.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13329: train loss 2.94174:  86%|████████▌ | 13330/15515 [14:45<02:41, 13.56it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13333: train loss 2.87295:  86%|████████▌ | 13334/15515 [14:45<02:36, 13.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13336: train loss 2.91781:  86%|████████▌ | 13336/15515 [14:45<02:35, 14.06it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13339: train loss 2.88755:  86%|████████▌ | 13340/15515 [14:46<02:29, 14.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13343: train loss 2.92841:  86%|████████▌ | 13344/15515 [14:46<02:27, 14.68it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13346: train loss 2.93838:  86%|████████▌ | 13346/15515 [14:46<02:28, 14.59it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 13350: train loss 2.90168:  86%|████████▌ | 13350/15515 [14:46<02:24, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13354: train loss 2.96629:  86%|████████▌ | 13354/15515 [14:47<02:22, 15.15it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13357: train loss 2.90715:  86%|████████▌ | 13358/15515 [14:47<02:24, 14.92it/s]

 128
32459 128
32459 128


epoch 0 iter 13360: train loss 2.89310:  86%|████████▌ | 13360/15515 [14:47<02:24, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13364: train loss 2.94451:  86%|████████▌ | 13364/15515 [14:47<02:23, 14.96it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13367: train loss 2.96827:  86%|████████▌ | 13368/15515 [14:47<02:21, 15.15it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 13371: train loss 2.92375:  86%|████████▌ | 13372/15515 [14:48<02:22, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13375: train loss 2.88868:  86%|████████▌ | 13376/15515 [14:48<02:24, 14.80it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13378: train loss 2.92817:  86%|████████▌ | 13378/15515 [14:48<02:28, 14.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13381: train loss 2.85661:  86%|████████▋ | 13382/15515 [14:48<02:36, 13.63it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13384: train loss 2.90477:  86%|████████▋ | 13384/15515 [14:49<02:31, 14.02it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13387: train loss 2.95422:  86%|████████▋ | 13388/15515 [14:49<02:36, 13.56it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13390: train loss 2.85754:  86%|████████▋ | 13390/15515 [14:49<02:39, 13.34it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13394: train loss 2.91324:  86%|████████▋ | 13394/15515 [14:49<02:31, 14.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13398: train loss 2.93791:  86%|████████▋ | 13398/15515 [14:50<02:23, 14.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13402: train loss 2.93835:  86%|████████▋ | 13402/15515 [14:50<02:21, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13406: train loss 2.91395:  86%|████████▋ | 13406/15515 [14:50<02:19, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13410: train loss 2.91637:  86%|████████▋ | 13410/15515 [14:50<02:19, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13414: train loss 2.94009:  86%|████████▋ | 13414/15515 [14:51<02:18, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13418: train loss 2.93871:  86%|████████▋ | 13418/15515 [14:51<02:18, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13422: train loss 2.90306:  87%|████████▋ | 13422/15515 [14:51<02:17, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13426: train loss 2.90263:  87%|████████▋ | 13426/15515 [14:51<02:17, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13430: train loss 2.96703:  87%|████████▋ | 13430/15515 [14:52<02:18, 15.02it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13433: train loss 2.88858:  87%|████████▋ | 13434/15515 [14:52<02:19, 14.96it/s]

 128
32459 128
32459 128
32459

epoch 0 iter 13436: train loss 2.88953:  87%|████████▋ | 13436/15515 [14:52<02:18, 15.02it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 13440: train loss 2.89655:  87%|████████▋ | 13440/15515 [14:52<02:18, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13444: train loss 2.93302:  87%|████████▋ | 13444/15515 [14:53<02:18, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13448: train loss 2.84247:  87%|████████▋ | 13448/15515 [14:53<02:16, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13452: train loss 2.86951:  87%|████████▋ | 13452/15515 [14:53<02:15, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13456: train loss 2.88415:  87%|████████▋ | 13456/15515 [14:53<02:14, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13460: train loss 2.91345:  87%|████████▋ | 13460/15515 [14:54<02:12, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13464: train loss 2.84299:  87%|████████▋ | 13464/15515 [14:54<02:13, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13468: train loss 2.94741:  87%|████████▋ | 13468/15515 [14:54<02:13, 15.30it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13471: train loss 2.91543:  87%|████████▋ | 13472/15515 [14:54<02:15, 15.04it/s]

 128
32459 128
32459 128


epoch 0 iter 13474: train loss 2.91378:  87%|████████▋ | 13474/15515 [14:55<02:16, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13478: train loss 2.85976:  87%|████████▋ | 13478/15515 [14:55<02:15, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13482: train loss 2.93727:  87%|████████▋ | 13482/15515 [14:55<02:15, 15.04it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13485: train loss 2.87138:  87%|████████▋ | 13486/15515 [14:55<02:15, 14.97it/s]

 128
32459 128
32459 128


epoch 0 iter 13488: train loss 2.84730:  87%|████████▋ | 13488/15515 [14:56<02:15, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13492: train loss 2.85168:  87%|████████▋ | 13492/15515 [14:56<02:14, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13496: train loss 2.91328:  87%|████████▋ | 13496/15515 [14:56<02:13, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13500: train loss 2.88875:  87%|████████▋ | 13500/15515 [14:56<02:12, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13504: train loss 2.87657:  87%|████████▋ | 13504/15515 [14:57<02:13, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13508: train loss 2.88461:  87%|████████▋ | 13508/15515 [14:57<02:14, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13512: train loss 2.90329:  87%|████████▋ | 13512/15515 [14:57<02:15, 14.80it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13515: train loss 2.91281:  87%|████████▋ | 13516/15515 [14:57<02:13, 15.03it/s]

 128
32459 128
32459 128


epoch 0 iter 13518: train loss 2.93366:  87%|████████▋ | 13518/15515 [14:58<02:14, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13521: train loss 2.85179:  87%|████████▋ | 13522/15515 [14:58<02:13, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13525: train loss 2.89224:  87%|████████▋ | 13526/15515 [14:58<02:11, 15.10it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 13528: train loss 2.88107:  87%|████████▋ | 13528/15515 [14:58<02:12, 15.05it/s]


32459 128
32459 128
32459 128


epoch 0 iter 13532: train loss 2.91279:  87%|████████▋ | 13532/15515 [14:59<02:12, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13536: train loss 2.88635:  87%|████████▋ | 13536/15515 [14:59<02:12, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13539: train loss 2.81655:  87%|████████▋ | 13540/15515 [14:59<02:12, 14.86it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13542: train loss 2.93528:  87%|████████▋ | 13542/15515 [14:59<02:12, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13546: train loss 2.90985:  87%|████████▋ | 13546/15515 [14:59<02:11, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13550: train loss 2.91072:  87%|████████▋ | 13550/15515 [15:00<02:11, 14.98it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13554: train loss 2.90532:  87%|████████▋ | 13554/15515 [15:00<02:12, 14.80it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13557: train loss 2.88819:  87%|████████▋ | 13558/15515 [15:00<02:12, 14.78it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13560: train loss 2.89026:  87%|████████▋ | 13560/15515 [15:00<02:11, 14.83it/s]

 128
32459 128
32459 128


epoch 0 iter 13563: train loss 2.90004:  87%|████████▋ | 13564/15515 [15:01<02:11, 14.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13567: train loss 2.85750:  87%|████████▋ | 13568/15515 [15:01<02:12, 14.72it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13570: train loss 2.91274:  87%|████████▋ | 13570/15515 [15:01<02:11, 14.78it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13574: train loss 2.88966:  87%|████████▋ | 13574/15515 [15:01<02:10, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13578: train loss 2.93838:  88%|████████▊ | 13578/15515 [15:02<02:10, 14.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13581: train loss 2.86232:  88%|████████▊ | 13582/15515 [15:02<02:08, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13585: train loss 2.96630:  88%|████████▊ | 13586/15515 [15:02<02:07, 15.08it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13588: train loss 2.87131:  88%|████████▊ | 13588/15515 [15:02<02:07, 15.16it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 13592: train loss 2.86991:  88%|████████▊ | 13592/15515 [15:03<02:06, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13596: train loss 2.84956:  88%|████████▊ | 13596/15515 [15:03<02:06, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13600: train loss 2.91230:  88%|████████▊ | 13600/15515 [15:03<02:06, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13604: train loss 2.88271:  88%|████████▊ | 13604/15515 [15:03<02:05, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13608: train loss 2.87653:  88%|████████▊ | 13608/15515 [15:04<02:06, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13612: train loss 2.90643:  88%|████████▊ | 13612/15515 [15:04<02:07, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13616: train loss 2.89417:  88%|████████▊ | 13616/15515 [15:04<02:06, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13620: train loss 2.93145:  88%|████████▊ | 13620/15515 [15:04<02:05, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13624: train loss 2.85391:  88%|████████▊ | 13624/15515 [15:05<02:04, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13628: train loss 3.01568:  88%|████████▊ | 13628/15515 [15:05<02:04, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13632: train loss 2.77287:  88%|████████▊ | 13632/15515 [15:05<02:03, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13636: train loss 2.96748:  88%|████████▊ | 13636/15515 [15:05<02:03, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13640: train loss 2.89202:  88%|████████▊ | 13640/15515 [15:06<02:04, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13644: train loss 2.86104:  88%|████████▊ | 13644/15515 [15:06<02:04, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13648: train loss 2.95173:  88%|████████▊ | 13648/15515 [15:06<02:03, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13652: train loss 2.90493:  88%|████████▊ | 13652/15515 [15:06<02:03, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13656: train loss 2.90723:  88%|████████▊ | 13656/15515 [15:07<02:02, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13660: train loss 2.86043:  88%|████████▊ | 13660/15515 [15:07<02:02, 15.14it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13663: train loss 2.84116:  88%|████████▊ | 13664/15515 [15:07<02:01, 15.25it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 13667: train loss 2.92661:  88%|████████▊ | 13668/15515 [15:07<01:59, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13671: train loss 2.84878:  88%|████████▊ | 13672/15515 [15:08<01:59, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13675: train loss 2.87148:  88%|████████▊ | 13676/15515 [15:08<01:58, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13679: train loss 2.88971:  88%|████████▊ | 13680/15515 [15:08<01:58, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13683: train loss 2.85999:  88%|████████▊ | 13684/15515 [15:09<01:58, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13687: train loss 2.95308:  88%|████████▊ | 13688/15515 [15:09<01:59, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13691: train loss 2.92551:  88%|████████▊ | 13692/15515 [15:09<01:59, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13695: train loss 2.89057:  88%|████████▊ | 13696/15515 [15:09<01:58, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13699: train loss 2.87577:  88%|████████▊ | 13700/15515 [15:10<01:57, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13703: train loss 2.87298:  88%|████████▊ | 13704/15515 [15:10<01:56, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13707: train loss 2.92734:  88%|████████▊ | 13708/15515 [15:10<01:56, 15.46it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13711: train loss 2.92178:  88%|████████▊ | 13712/15515 [15:10<01:57, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13715: train loss 2.91329:  88%|████████▊ | 13716/15515 [15:11<01:57, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13719: train loss 2.86319:  88%|████████▊ | 13720/15515 [15:11<01:57, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13723: train loss 2.84794:  88%|████████▊ | 13724/15515 [15:11<01:57, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13727: train loss 2.89458:  88%|████████▊ | 13728/15515 [15:11<01:58, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13731: train loss 2.83940:  89%|████████▊ | 13732/15515 [15:12<01:58, 15.03it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13734: train loss 2.91229:  89%|████████▊ | 13734/15515 [15:12<01:58, 15.05it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 13738: train loss 2.91170:  89%|████████▊ | 13738/15515 [15:12<01:57, 15.07it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13742: train loss 2.89370:  89%|████████▊ | 13742/15515 [15:12<01:58, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13746: train loss 2.82635:  89%|████████▊ | 13746/15515 [15:13<01:55, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13750: train loss 2.88182:  89%|████████▊ | 13750/15515 [15:13<01:55, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13754: train loss 2.82260:  89%|████████▊ | 13754/15515 [15:13<01:57, 14.93it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13757: train loss 2.85489:  89%|████████▊ | 13758/15515 [15:13<01:57, 14.89it/s]

32459 128
32459 128


epoch 0 iter 13760: train loss 2.95570:  89%|████████▊ | 13760/15515 [15:14<01:58, 14.84it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 13763: train loss 2.94407:  89%|████████▊ | 13764/15515 [15:14<01:57, 14.88it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 13767: train loss 2.89675:  89%|████████▊ | 13768/15515 [15:14<01:57, 14.85it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13770: train loss 2.86471:  89%|████████▉ | 13770/15515 [15:14<01:57, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13774: train loss 2.85253:  89%|████████▉ | 13774/15515 [15:15<01:56, 14.93it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13778: train loss 2.84650:  89%|████████▉ | 13778/15515 [15:15<01:57, 14.82it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13782: train loss 2.89586:  89%|████████▉ | 13782/15515 [15:15<01:56, 14.84it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 13785: train loss 2.91946:  89%|████████▉ | 13786/15515 [15:15<01:56, 14.86it/s]


32459 128
32459 128
32459 128


epoch 0 iter 13789: train loss 2.90506:  89%|████████▉ | 13790/15515 [15:16<01:55, 14.91it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13792: train loss 2.88610:  89%|████████▉ | 13792/15515 [15:16<01:55, 14.91it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13795: train loss 2.88169:  89%|████████▉ | 13796/15515 [15:16<01:53, 15.09it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 13799: train loss 2.91568:  89%|████████▉ | 13800/15515 [15:16<01:53, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13803: train loss 2.92901:  89%|████████▉ | 13804/15515 [15:16<01:54, 14.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13806: train loss 2.89558:  89%|████████▉ | 13806/15515 [15:17<01:54, 14.92it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13810: train loss 2.93231:  89%|████████▉ | 13810/15515 [15:17<01:53, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13814: train loss 2.91518:  89%|████████▉ | 13814/15515 [15:17<01:52, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13818: train loss 2.90521:  89%|████████▉ | 13818/15515 [15:17<01:52, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13822: train loss 2.87161:  89%|████████▉ | 13822/15515 [15:18<01:51, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13826: train loss 2.93032:  89%|████████▉ | 13826/15515 [15:18<01:52, 15.07it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13829: train loss 2.93716:  89%|████████▉ | 13828/15515 [15:18<01:52, 15.01it/s]

 128
32459 128
32459 128


epoch 0 iter 13832: train loss 2.91251:  89%|████████▉ | 13832/15515 [15:18<01:54, 14.69it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13836: train loss 2.92306:  89%|████████▉ | 13836/15515 [15:19<01:52, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13840: train loss 2.90757:  89%|████████▉ | 13840/15515 [15:19<01:51, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13844: train loss 2.85957:  89%|████████▉ | 13844/15515 [15:19<01:50, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13848: train loss 2.84418:  89%|████████▉ | 13848/15515 [15:19<01:50, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13852: train loss 2.86964:  89%|████████▉ | 13852/15515 [15:20<01:50, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13856: train loss 2.88427:  89%|████████▉ | 13856/15515 [15:20<01:48, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13860: train loss 2.85530:  89%|████████▉ | 13860/15515 [15:20<01:47, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13864: train loss 2.88100:  89%|████████▉ | 13864/15515 [15:21<01:47, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13868: train loss 2.86876:  89%|████████▉ | 13868/15515 [15:21<01:48, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13872: train loss 2.90699:  89%|████████▉ | 13872/15515 [15:21<01:48, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13876: train loss 2.88583:  89%|████████▉ | 13876/15515 [15:21<01:48, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13880: train loss 2.85744:  89%|████████▉ | 13880/15515 [15:22<01:47, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13884: train loss 2.90775:  89%|████████▉ | 13884/15515 [15:22<01:47, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13888: train loss 2.80469:  90%|████████▉ | 13888/15515 [15:22<01:48, 14.99it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 13891: train loss 2.88239:  90%|████████▉ | 13892/15515 [15:22<01:47, 15.14it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 13895: train loss 2.94378:  90%|████████▉ | 13896/15515 [15:23<01:46, 15.21it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13898: train loss 2.90468:  90%|████████▉ | 13898/15515 [15:23<01:46, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13902: train loss 2.86642:  90%|████████▉ | 13902/15515 [15:23<01:47, 14.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13905: train loss 2.85548:  90%|████████▉ | 13906/15515 [15:23<01:47, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13909: train loss 2.90476:  90%|████████▉ | 13910/15515 [15:23<01:47, 14.92it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13912: train loss 2.86824:  90%|████████▉ | 13912/15515 [15:24<01:46, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13916: train loss 2.86857:  90%|████████▉ | 13916/15515 [15:24<01:46, 15.03it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 13919: train loss 2.88373:  90%|████████▉ | 13920/15515 [15:24<01:46, 14.97it/s]

 128
32459 128
32459 128


epoch 0 iter 13922: train loss 2.84977:  90%|████████▉ | 13922/15515 [15:24<01:46, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13926: train loss 2.89263:  90%|████████▉ | 13926/15515 [15:25<01:45, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13930: train loss 2.90577:  90%|████████▉ | 13930/15515 [15:25<01:45, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13934: train loss 2.88650:  90%|████████▉ | 13934/15515 [15:25<01:43, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13938: train loss 2.90719:  90%|████████▉ | 13938/15515 [15:25<01:41, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13942: train loss 2.89912:  90%|████████▉ | 13942/15515 [15:26<01:43, 15.27it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13945: train loss 2.82954:  90%|████████▉ | 13946/15515 [15:26<01:44, 15.06it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13948: train loss 2.86358:  90%|████████▉ | 13948/15515 [15:26<01:44, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13952: train loss 2.89010:  90%|████████▉ | 13952/15515 [15:26<01:43, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13956: train loss 2.89229:  90%|████████▉ | 13956/15515 [15:27<01:45, 14.83it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13959: train loss 2.86801:  90%|████████▉ | 13960/15515 [15:27<01:44, 14.93it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13963: train loss 2.87814:  90%|█████████ | 13964/15515 [15:27<01:44, 14.79it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13966: train loss 2.90741:  90%|█████████ | 13966/15515 [15:27<01:44, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13970: train loss 2.91382:  90%|█████████ | 13970/15515 [15:28<01:42, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13974: train loss 2.96179:  90%|█████████ | 13974/15515 [15:28<01:41, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13978: train loss 2.88907:  90%|█████████ | 13978/15515 [15:28<01:41, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13982: train loss 2.90087:  90%|█████████ | 13982/15515 [15:28<01:41, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13986: train loss 2.86560:  90%|█████████ | 13986/15515 [15:29<01:41, 15.05it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13990: train loss 2.87702:  90%|█████████ | 13990/15515 [15:29<01:40, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 13994: train loss 2.88429:  90%|█████████ | 13994/15515 [15:29<01:42, 14.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 13997: train loss 2.85383:  90%|█████████ | 13998/15515 [15:29<01:43, 14.68it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14000: train loss 2.88361:  90%|█████████ | 14000/15515 [15:30<01:42, 14.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14004: train loss 2.87713:  90%|█████████ | 14004/15515 [15:30<01:41, 14.84it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14007: train loss 2.86374:  90%|█████████ | 14008/15515 [15:30<01:39, 15.08it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14011: train loss 2.93340:  90%|█████████ | 14012/15515 [15:30<01:39, 15.06it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14015: train loss 2.82461:  90%|█████████ | 14014/15515 [15:31<01:39, 15.11it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14017: train loss 2.95322:  90%|█████████ | 14018/15515 [15:31<01:47, 13.88it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14020: train loss 2.83690:  90%|█████████ | 14020/15515 [15:31<01:48, 13.76it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14024: train loss 2.87247:  90%|█████████ | 14024/15515 [15:31<01:49, 13.67it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14028: train loss 2.88464:  90%|█████████ | 14028/15515 [15:31<01:42, 14.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14031: train loss 2.88265:  90%|█████████ | 14032/15515 [15:32<01:40, 14.79it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14034: train loss 2.89227:  90%|█████████ | 14034/15515 [15:32<01:46, 13.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14037: train loss 2.85204:  90%|█████████ | 14038/15515 [15:32<01:49, 13.46it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14041: train loss 2.88397:  91%|█████████ | 14042/15515 [15:32<01:42, 14.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14045: train loss 2.88788:  91%|█████████ | 14046/15515 [15:33<01:38, 14.84it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14048: train loss 2.91799:  91%|█████████ | 14048/15515 [15:33<01:38, 14.89it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14052: train loss 2.98027:  91%|█████████ | 14052/15515 [15:33<01:38, 14.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14056: train loss 2.88042:  91%|█████████ | 14056/15515 [15:33<01:37, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14060: train loss 2.86672:  91%|█████████ | 14060/15515 [15:34<01:34, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14064: train loss 2.89526:  91%|█████████ | 14064/15515 [15:34<01:34, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14068: train loss 2.94652:  91%|█████████ | 14068/15515 [15:34<01:35, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14072: train loss 2.86977:  91%|█████████ | 14072/15515 [15:34<01:34, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14076: train loss 2.92040:  91%|█████████ | 14076/15515 [15:35<01:33, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14080: train loss 2.87967:  91%|█████████ | 14080/15515 [15:35<01:33, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14084: train loss 2.82035:  91%|█████████ | 14084/15515 [15:35<01:33, 15.35it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14088: train loss 2.89646:  91%|█████████ | 14088/15515 [15:35<01:33, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14092: train loss 2.87004:  91%|█████████ | 14092/15515 [15:36<01:33, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14096: train loss 2.90853:  91%|█████████ | 14096/15515 [15:36<01:33, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14100: train loss 2.85123:  91%|█████████ | 14100/15515 [15:36<01:32, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14104: train loss 2.78268:  91%|█████████ | 14104/15515 [15:37<01:33, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14108: train loss 2.93084:  91%|█████████ | 14108/15515 [15:37<01:32, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14112: train loss 2.88075:  91%|█████████ | 14112/15515 [15:37<01:32, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14116: train loss 2.86532:  91%|█████████ | 14116/15515 [15:37<01:32, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14119: train loss 2.91617:  91%|█████████ | 14120/15515 [15:38<01:39, 14.00it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14123: train loss 2.89847:  91%|█████████ | 14124/15515 [15:38<01:35, 14.59it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14125: train loss 2.81598:  91%|█████████ | 14126/15515 [15:38<01:40, 13.85it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14129: train loss 2.85137:  91%|█████████ | 14130/15515 [15:38<01:41, 13.63it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14132: train loss 2.92504:  91%|█████████ | 14132/15515 [15:39<01:39, 13.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14135: train loss 2.87897:  91%|█████████ | 14136/15515 [15:39<01:34, 14.54it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14139: train loss 2.84603:  91%|█████████ | 14140/15515 [15:39<01:31, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14142: train loss 2.86287:  91%|█████████ | 14142/15515 [15:39<01:32, 14.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14145: train loss 2.86502:  91%|█████████ | 14146/15515 [15:39<01:36, 14.14it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14148: train loss 2.87751:  91%|█████████ | 14148/15515 [15:40<01:37, 13.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14151: train loss 2.89051:  91%|█████████ | 14152/15515 [15:40<02:00, 11.34it/s]

32459 128
32459 128


epoch 0 iter 14153: train loss 2.97434:  91%|█████████ | 14154/15515 [15:40<02:05, 10.87it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14157: train loss 2.89070:  91%|█████████▏| 14158/15515 [15:40<01:46, 12.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14161: train loss 2.88686:  91%|█████████▏| 14162/15515 [15:41<01:38, 13.75it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14164: train loss 2.90776:  91%|█████████▏| 14164/15515 [15:41<01:35, 14.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14168: train loss 2.86573:  91%|█████████▏| 14168/15515 [15:41<01:32, 14.52it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14171: train loss 2.88043:  91%|█████████▏| 14172/15515 [15:41<01:30, 14.81it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14174: train loss 2.80388:  91%|█████████▏| 14174/15515 [15:42<01:30, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14178: train loss 2.86285:  91%|█████████▏| 14178/15515 [15:42<01:32, 14.50it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14181: train loss 2.88137:  91%|█████████▏| 14182/15515 [15:42<01:29, 14.83it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14185: train loss 2.86535:  91%|█████████▏| 14184/15515 [15:42<01:32, 14.43it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14187: train loss 2.92006:  91%|█████████▏| 14188/15515 [15:42<01:39, 13.38it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14191: train loss 2.90426:  91%|█████████▏| 14190/15515 [15:43<01:36, 13.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14193: train loss 2.79606:  91%|█████████▏| 14194/15515 [15:43<01:34, 13.91it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14197: train loss 2.88454:  92%|█████████▏| 14198/15515 [15:43<01:32, 14.21it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14199: train loss 2.88611:  92%|█████████▏| 14200/15515 [15:43<01:36, 13.66it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14202: train loss 2.88653:  92%|█████████▏| 14202/15515 [15:44<01:46, 12.37it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14206: train loss 2.83753:  92%|█████████▏| 14206/15515 [15:44<01:41, 12.91it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14208: train loss 2.93109:  92%|█████████▏| 14208/15515 [15:44<01:37, 13.34it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14211: train loss 2.90726:  92%|█████████▏| 14212/15515 [15:44<01:38, 13.21it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14214: train loss 2.84996:  92%|█████████▏| 14214/15515 [15:45<01:44, 12.45it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14218: train loss 2.87696:  92%|█████████▏| 14218/15515 [15:45<01:39, 13.08it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14221: train loss 2.85566:  92%|█████████▏| 14222/15515 [15:45<01:32, 13.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14224: train loss 2.84928:  92%|█████████▏| 14224/15515 [15:45<01:30, 14.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14228: train loss 2.89785:  92%|█████████▏| 14228/15515 [15:46<01:27, 14.65it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14230: train loss 2.89192:  92%|█████████▏| 14230/15515 [15:46<01:30, 14.15it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14233: train loss 2.90806:  92%|█████████▏| 14234/15515 [15:46<01:32, 13.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14237: train loss 2.85386:  92%|█████████▏| 14238/15515 [15:46<01:31, 13.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14241: train loss 2.85472:  92%|█████████▏| 14242/15515 [15:46<01:27, 14.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14245: train loss 2.97778:  92%|█████████▏| 14246/15515 [15:47<01:26, 14.74it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14248: train loss 2.89149:  92%|█████████▏| 14248/15515 [15:47<01:25, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14252: train loss 2.90227:  92%|█████████▏| 14252/15515 [15:47<01:23, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14256: train loss 2.89286:  92%|█████████▏| 14256/15515 [15:47<01:22, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14260: train loss 2.90151:  92%|█████████▏| 14260/15515 [15:48<01:22, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14264: train loss 2.86527:  92%|█████████▏| 14264/15515 [15:48<01:20, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14268: train loss 2.83221:  92%|█████████▏| 14268/15515 [15:48<01:22, 15.09it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14270: train loss 2.89702:  92%|█████████▏| 14270/15515 [15:48<01:25, 14.51it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14274: train loss 2.85029:  92%|█████████▏| 14274/15515 [15:49<01:27, 14.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14278: train loss 2.84724:  92%|█████████▏| 14278/15515 [15:49<01:24, 14.63it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14282: train loss 2.86123:  92%|█████████▏| 14282/15515 [15:49<01:27, 14.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14285: train loss 2.89194:  92%|█████████▏| 14286/15515 [15:49<01:26, 14.17it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14289: train loss 2.89882:  92%|█████████▏| 14288/15515 [15:50<01:27, 14.04it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14291: train loss 2.90450:  92%|█████████▏| 14292/15515 [15:50<01:28, 13.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14295: train loss 2.96116:  92%|█████████▏| 14296/15515 [15:50<01:22, 14.70it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14299: train loss 2.95506:  92%|█████████▏| 14300/15515 [15:50<01:20, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14303: train loss 2.89745:  92%|█████████▏| 14304/15515 [15:51<01:19, 15.28it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14307: train loss 2.88955:  92%|█████████▏| 14308/15515 [15:51<01:18, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14310: train loss 2.92949:  92%|█████████▏| 14310/15515 [15:51<01:18, 15.44it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14314: train loss 2.94704:  92%|█████████▏| 14314/15515 [15:51<01:21, 14.74it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14318: train loss 2.93289:  92%|█████████▏| 14318/15515 [15:52<01:20, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14322: train loss 2.89034:  92%|█████████▏| 14322/15515 [15:52<01:20, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14325: train loss 2.87107:  92%|█████████▏| 14326/15515 [15:52<01:22, 14.41it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14329: train loss 2.91821:  92%|█████████▏| 14330/15515 [15:52<01:19, 14.83it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14332: train loss 2.87689:  92%|█████████▏| 14332/15515 [15:53<01:19, 14.80it/s]

 128
32459 128
32459 128
32459 128

epoch 0 iter 14334: train loss 2.93554:  92%|█████████▏| 14334/15515 [15:53<01:23, 14.22it/s]


32459 128
32459 128


epoch 0 iter 14338: train loss 2.82533:  92%|█████████▏| 14338/15515 [15:53<01:22, 14.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14341: train loss 2.90016:  92%|█████████▏| 14342/15515 [15:53<01:24, 13.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14344: train loss 2.89936:  92%|█████████▏| 14344/15515 [15:54<01:26, 13.53it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14347: train loss 2.83671:  92%|█████████▏| 14348/15515 [15:54<01:39, 11.74it/s]

32459 128
32459 128


epoch 0 iter 14349: train loss 2.90753:  92%|█████████▏| 14350/15515 [15:54<01:47, 10.87it/s]

32459 128
32459 128


epoch 0 iter 14352: train loss 2.87886:  93%|█████████▎| 14352/15515 [15:54<01:43, 11.25it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14355: train loss 2.87061:  93%|█████████▎| 14356/15515 [15:54<01:29, 12.93it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14359: train loss 2.94204:  93%|█████████▎| 14360/15515 [15:55<01:21, 14.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14363: train loss 2.88730:  93%|█████████▎| 14364/15515 [15:55<01:19, 14.50it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14366: train loss 2.88391:  93%|█████████▎| 14366/15515 [15:55<01:18, 14.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14370: train loss 2.88207:  93%|█████████▎| 14370/15515 [15:55<01:17, 14.71it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14374: train loss 2.83781:  93%|█████████▎| 14374/15515 [15:56<01:16, 14.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14378: train loss 2.88382:  93%|█████████▎| 14378/15515 [15:56<01:16, 14.95it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14382: train loss 2.83142:  93%|█████████▎| 14382/15515 [15:56<01:14, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14386: train loss 2.89333:  93%|█████████▎| 14386/15515 [15:56<01:14, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14390: train loss 2.88791:  93%|█████████▎| 14390/15515 [15:57<01:20, 14.04it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14392: train loss 2.85762:  93%|█████████▎| 14392/15515 [15:57<01:17, 14.41it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14396: train loss 2.87079:  93%|█████████▎| 14396/15515 [15:57<01:20, 13.93it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14399: train loss 2.88255:  93%|█████████▎| 14400/15515 [15:57<01:17, 14.35it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14402: train loss 2.83854:  93%|█████████▎| 14402/15515 [15:58<01:18, 14.18it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14404: train loss 2.93486:  93%|█████████▎| 14404/15515 [15:58<01:22, 13.49it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14408: train loss 2.81807:  93%|█████████▎| 14408/15515 [15:58<01:20, 13.75it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14410: train loss 2.91673:  93%|█████████▎| 14410/15515 [15:58<01:21, 13.64it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14413: train loss 2.95053:  93%|█████████▎| 14414/15515 [15:59<01:27, 12.60it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14416: train loss 2.89819:  93%|█████████▎| 14416/15515 [15:59<01:31, 11.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14420: train loss 2.85344:  93%|█████████▎| 14420/15515 [15:59<01:29, 12.28it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14423: train loss 2.90309:  93%|█████████▎| 14424/15515 [15:59<01:21, 13.31it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14426: train loss 2.89503:  93%|█████████▎| 14426/15515 [16:00<01:19, 13.77it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14429: train loss 2.86214:  93%|█████████▎| 14430/15515 [16:00<01:15, 14.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14433: train loss 2.85461:  93%|█████████▎| 14434/15515 [16:00<01:13, 14.70it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 14435: train loss 2.92003:  93%|█████████▎| 14436/15515 [16:00<01:15, 14.32it/s]

128
32459 128
32459 128


epoch 0 iter 14438: train loss 2.85572:  93%|█████████▎| 14438/15515 [16:00<01:16, 13.99it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14442: train loss 2.86295:  93%|█████████▎| 14442/15515 [16:01<01:15, 14.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14446: train loss 2.84401:  93%|█████████▎| 14446/15515 [16:01<01:13, 14.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14450: train loss 2.82020:  93%|█████████▎| 14450/15515 [16:01<01:10, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14454: train loss 2.96322:  93%|█████████▎| 14454/15515 [16:01<01:10, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14458: train loss 2.79826:  93%|█████████▎| 14458/15515 [16:02<01:09, 15.21it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14460: train loss 2.85876:  93%|█████████▎| 14460/15515 [16:02<01:14, 14.25it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14463: train loss 2.84422:  93%|█████████▎| 14464/15515 [16:02<01:16, 13.66it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14467: train loss 2.93406:  93%|█████████▎| 14468/15515 [16:02<01:14, 13.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14471: train loss 2.85525:  93%|█████████▎| 14472/15515 [16:03<01:11, 14.59it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14475: train loss 2.93446:  93%|█████████▎| 14476/15515 [16:03<01:10, 14.84it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14479: train loss 2.87567:  93%|█████████▎| 14480/15515 [16:03<01:08, 15.00it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14483: train loss 2.87844:  93%|█████████▎| 14484/15515 [16:03<01:08, 15.04it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14486: train loss 2.88515:  93%|█████████▎| 14486/15515 [16:04<01:08, 15.05it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14490: train loss 2.89994:  93%|█████████▎| 14490/15515 [16:04<01:07, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14494: train loss 2.88160:  93%|█████████▎| 14494/15515 [16:04<01:08, 14.84it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 14496: train loss 2.90191:  93%|█████████▎| 14496/15515 [16:04<01:08, 14.90it/s]


32459 128
32459 128


epoch 0 iter 14500: train loss 2.86901:  93%|█████████▎| 14500/15515 [16:05<01:09, 14.58it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14504: train loss 2.88686:  93%|█████████▎| 14504/15515 [16:05<01:08, 14.81it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14506: train loss 2.90368:  93%|█████████▎| 14506/15515 [16:05<01:09, 14.49it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14509: train loss 2.87212:  94%|█████████▎| 14510/15515 [16:05<01:13, 13.62it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14513: train loss 2.83006:  94%|█████████▎| 14514/15515 [16:06<01:12, 13.73it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14516: train loss 2.92932:  94%|█████████▎| 14516/15515 [16:06<01:10, 14.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14520: train loss 2.95314:  94%|█████████▎| 14520/15515 [16:06<01:06, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14524: train loss 2.91521:  94%|█████████▎| 14524/15515 [16:06<01:05, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14528: train loss 2.93742:  94%|█████████▎| 14528/15515 [16:06<01:05, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14532: train loss 2.92718:  94%|█████████▎| 14532/15515 [16:07<01:04, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14536: train loss 2.90962:  94%|█████████▎| 14536/15515 [16:07<01:04, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14540: train loss 2.94141:  94%|█████████▎| 14540/15515 [16:07<01:04, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14544: train loss 2.89693:  94%|█████████▎| 14544/15515 [16:08<01:04, 15.03it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14547: train loss 2.90505:  94%|█████████▍| 14548/15515 [16:08<01:04, 14.96it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14551: train loss 2.85055:  94%|█████████▍| 14552/15515 [16:08<01:03, 15.11it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14554: train loss 2.90957:  94%|█████████▍| 14554/15515 [16:08<01:03, 15.10it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14558: train loss 2.82242:  94%|█████████▍| 14558/15515 [16:08<01:03, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14562: train loss 2.83612:  94%|█████████▍| 14562/15515 [16:09<01:02, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14566: train loss 2.90061:  94%|█████████▍| 14566/15515 [16:09<01:03, 14.97it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14569: train loss 2.86813:  94%|█████████▍| 14570/15515 [16:09<01:03, 14.96it/s]

 128
32459 128
32459 128
32459

epoch 0 iter 14572: train loss 2.89635:  94%|█████████▍| 14572/15515 [16:09<01:02, 15.10it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14576: train loss 2.90696:  94%|█████████▍| 14576/15515 [16:10<01:02, 15.02it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14579: train loss 2.88930:  94%|█████████▍| 14580/15515 [16:10<01:01, 15.10it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14583: train loss 2.86517:  94%|█████████▍| 14584/15515 [16:10<01:00, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14587: train loss 2.89123:  94%|█████████▍| 14588/15515 [16:10<01:00, 15.33it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14590: train loss 2.89712:  94%|█████████▍| 14590/15515 [16:11<01:00, 15.23it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14593: train loss 2.85611:  94%|█████████▍| 14594/15515 [16:11<01:04, 14.24it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14596: train loss 2.85395:  94%|█████████▍| 14596/15515 [16:11<01:05, 14.06it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14600: train loss 2.85060:  94%|█████████▍| 14600/15515 [16:11<01:03, 14.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14604: train loss 2.81268:  94%|█████████▍| 14604/15515 [16:12<01:01, 14.88it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14608: train loss 2.91982:  94%|█████████▍| 14608/15515 [16:12<00:59, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14612: train loss 2.77668:  94%|█████████▍| 14612/15515 [16:12<00:59, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14616: train loss 2.93053:  94%|█████████▍| 14616/15515 [16:12<01:00, 14.98it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14619: train loss 2.85307:  94%|█████████▍| 14620/15515 [16:13<00:59, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14623: train loss 2.79322:  94%|█████████▍| 14624/15515 [16:13<00:59, 15.05it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14626: train loss 2.82290:  94%|█████████▍| 14626/15515 [16:13<00:59, 14.99it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14630: train loss 2.89694:  94%|█████████▍| 14630/15515 [16:13<00:58, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14634: train loss 2.86672:  94%|█████████▍| 14634/15515 [16:14<00:58, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14638: train loss 2.84548:  94%|█████████▍| 14638/15515 [16:14<01:00, 14.59it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14641: train loss 2.88468:  94%|█████████▍| 14642/15515 [16:14<00:59, 14.62it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14643: train loss 2.89848:  94%|█████████▍| 14644/15515 [16:14<01:00, 14.51it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14646: train loss 2.87369:  94%|█████████▍| 14646/15515 [16:14<01:01, 14.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14650: train loss 2.88384:  94%|█████████▍| 14650/15515 [16:15<01:02, 13.91it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14653: train loss 2.91034:  94%|█████████▍| 14654/15515 [16:15<01:02, 13.82it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14656: train loss 2.87377:  94%|█████████▍| 14656/15515 [16:15<01:01, 14.03it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14659: train loss 2.88968:  94%|█████████▍| 14660/15515 [16:15<00:59, 14.29it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14662: train loss 2.88661:  95%|█████████▍| 14662/15515 [16:16<00:59, 14.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14666: train loss 2.90882:  95%|█████████▍| 14666/15515 [16:16<01:00, 14.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14670: train loss 2.87752:  95%|█████████▍| 14670/15515 [16:16<00:58, 14.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14673: train loss 2.87769:  95%|█████████▍| 14674/15515 [16:16<00:56, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14677: train loss 2.81287:  95%|█████████▍| 14676/15515 [16:17<00:59, 14.01it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14680: train loss 2.89386:  95%|█████████▍| 14680/15515 [16:17<00:59, 14.09it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14683: train loss 2.88084:  95%|█████████▍| 14684/15515 [16:17<00:57, 14.57it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14686: train loss 2.91993:  95%|█████████▍| 14686/15515 [16:17<00:56, 14.73it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14690: train loss 2.87038:  95%|█████████▍| 14690/15515 [16:17<00:55, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14693: train loss 2.89750:  95%|█████████▍| 14694/15515 [16:18<00:54, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14697: train loss 2.88466:  95%|█████████▍| 14698/15515 [16:18<00:54, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14701: train loss 2.88811:  95%|█████████▍| 14702/15515 [16:18<00:53, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14705: train loss 2.90370:  95%|█████████▍| 14706/15515 [16:18<00:53, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14709: train loss 2.91283:  95%|█████████▍| 14710/15515 [16:19<00:53, 15.09it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14712: train loss 2.90686:  95%|█████████▍| 14712/15515 [16:19<00:53, 15.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14715: train loss 2.81471:  95%|█████████▍| 14716/15515 [16:19<00:52, 15.12it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14717: train loss 2.85054:  95%|█████████▍| 14718/15515 [16:19<00:59, 13.30it/s]

32459 128
32459 128


epoch 0 iter 14720: train loss 2.84159:  95%|█████████▍| 14720/15515 [16:20<01:08, 11.55it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14723: train loss 2.85987:  95%|█████████▍| 14724/15515 [16:20<01:06, 11.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14726: train loss 2.87392:  95%|█████████▍| 14726/15515 [16:20<01:06, 11.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14729: train loss 2.87880:  95%|█████████▍| 14730/15515 [16:20<01:05, 12.07it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14733: train loss 2.82960:  95%|█████████▍| 14734/15515 [16:21<00:58, 13.35it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14736: train loss 2.86426:  95%|█████████▍| 14736/15515 [16:21<00:56, 13.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14739: train loss 2.85245:  95%|█████████▌| 14740/15515 [16:21<00:56, 13.72it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14742: train loss 2.90737:  95%|█████████▌| 14742/15515 [16:21<00:56, 13.70it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14746: train loss 2.94342:  95%|█████████▌| 14746/15515 [16:22<00:59, 13.01it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14750: train loss 2.92697:  95%|█████████▌| 14750/15515 [16:22<00:54, 14.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14754: train loss 2.86289:  95%|█████████▌| 14754/15515 [16:22<00:51, 14.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14758: train loss 2.83489:  95%|█████████▌| 14758/15515 [16:22<00:50, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14761: train loss 2.91072:  95%|█████████▌| 14762/15515 [16:23<00:53, 14.16it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14764: train loss 2.85614:  95%|█████████▌| 14764/15515 [16:23<00:54, 13.74it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14767: train loss 2.87540:  95%|█████████▌| 14768/15515 [16:23<01:00, 12.25it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14771: train loss 2.82167:  95%|█████████▌| 14772/15515 [16:23<00:55, 13.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14775: train loss 2.90439:  95%|█████████▌| 14776/15515 [16:24<00:51, 14.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14779: train loss 2.87607:  95%|█████████▌| 14778/15515 [16:24<00:50, 14.50it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14782: train loss 2.84750:  95%|█████████▌| 14782/15515 [16:24<00:50, 14.48it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14785: train loss 2.92340:  95%|█████████▌| 14786/15515 [16:24<00:52, 13.80it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14788: train loss 2.88158:  95%|█████████▌| 14788/15515 [16:25<00:53, 13.60it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14792: train loss 2.88850:  95%|█████████▌| 14792/15515 [16:25<00:56, 12.90it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14795: train loss 2.88078:  95%|█████████▌| 14796/15515 [16:25<00:51, 13.95it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14798: train loss 2.89769:  95%|█████████▌| 14798/15515 [16:25<00:50, 14.33it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 14802: train loss 2.86715:  95%|█████████▌| 14802/15515 [16:26<00:48, 14.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14806: train loss 2.84356:  95%|█████████▌| 14806/15515 [16:26<00:47, 14.84it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14809: train loss 2.83606:  95%|█████████▌| 14810/15515 [16:26<00:47, 14.86it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14813: train loss 2.84228:  95%|█████████▌| 14814/15515 [16:26<00:47, 14.79it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14816: train loss 2.85689:  95%|█████████▌| 14816/15515 [16:27<00:48, 14.53it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14819: train loss 2.90675:  96%|█████████▌| 14820/15515 [16:27<00:47, 14.75it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14823: train loss 2.85624:  96%|█████████▌| 14824/15515 [16:27<00:46, 14.85it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14826: train loss 2.88621:  96%|█████████▌| 14826/15515 [16:27<00:46, 14.91it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14830: train loss 2.88704:  96%|█████████▌| 14830/15515 [16:27<00:45, 14.99it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14834: train loss 2.85224:  96%|█████████▌| 14834/15515 [16:28<00:45, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14838: train loss 2.85740:  96%|█████████▌| 14838/15515 [16:28<00:45, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14842: train loss 2.91622:  96%|█████████▌| 14842/15515 [16:28<00:45, 14.83it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14846: train loss 2.86195:  96%|█████████▌| 14846/15515 [16:29<00:44, 14.92it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14850: train loss 2.88244:  96%|█████████▌| 14850/15515 [16:29<00:44, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14854: train loss 2.88458:  96%|█████████▌| 14854/15515 [16:29<00:43, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14858: train loss 2.88544:  96%|█████████▌| 14858/15515 [16:29<00:42, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14862: train loss 2.85497:  96%|█████████▌| 14862/15515 [16:30<00:42, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14866: train loss 2.86311:  96%|█████████▌| 14866/15515 [16:30<00:41, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14870: train loss 2.85686:  96%|█████████▌| 14870/15515 [16:30<00:41, 15.52it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14874: train loss 2.83578:  96%|█████████▌| 14874/15515 [16:30<00:41, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14878: train loss 2.86279:  96%|█████████▌| 14878/15515 [16:31<00:40, 15.56it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14882: train loss 2.94716:  96%|█████████▌| 14882/15515 [16:31<00:40, 15.67it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14886: train loss 2.90598:  96%|█████████▌| 14886/15515 [16:31<00:40, 15.55it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14890: train loss 2.87616:  96%|█████████▌| 14890/15515 [16:31<00:40, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14894: train loss 2.84536:  96%|█████████▌| 14894/15515 [16:32<00:40, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14898: train loss 2.92448:  96%|█████████▌| 14898/15515 [16:32<00:40, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14902: train loss 2.82414:  96%|█████████▌| 14902/15515 [16:32<00:40, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14906: train loss 2.88236:  96%|█████████▌| 14906/15515 [16:32<00:40, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14910: train loss 2.82572:  96%|█████████▌| 14910/15515 [16:33<00:40, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14914: train loss 2.86924:  96%|█████████▌| 14914/15515 [16:33<00:39, 15.10it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 14917: train loss 2.90459:  96%|█████████▌| 14918/15515 [16:33<00:40, 14.81it/s]

 128
32459 128
32459 128


epoch 0 iter 14920: train loss 2.82932:  96%|█████████▌| 14920/15515 [16:33<00:39, 14.90it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14924: train loss 2.83979:  96%|█████████▌| 14924/15515 [16:34<00:39, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14928: train loss 2.83040:  96%|█████████▌| 14928/15515 [16:34<00:38, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14932: train loss 2.88466:  96%|█████████▌| 14932/15515 [16:34<00:38, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14936: train loss 2.90440:  96%|█████████▋| 14936/15515 [16:34<00:37, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14939: train loss 2.89963:  96%|█████████▋| 14940/15515 [16:35<00:37, 15.18it/s]

32459 128
32459 128
32459 128


epoch 0 iter 14943: train loss 2.82007:  96%|█████████▋| 14944/15515 [16:35<00:36, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14947: train loss 2.88197:  96%|█████████▋| 14948/15515 [16:35<00:36, 15.44it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14951: train loss 2.88546:  96%|█████████▋| 14952/15515 [16:35<00:36, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14955: train loss 2.91695:  96%|█████████▋| 14956/15515 [16:36<00:36, 15.42it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14959: train loss 2.83677:  96%|█████████▋| 14960/15515 [16:36<00:35, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14963: train loss 2.84258:  96%|█████████▋| 14964/15515 [16:36<00:35, 15.46it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 14966: train loss 2.84528:  96%|█████████▋| 14966/15515 [16:36<00:35, 15.33it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 14970: train loss 2.90658:  96%|█████████▋| 14970/15515 [16:37<00:35, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14974: train loss 2.90348:  97%|█████████▋| 14974/15515 [16:37<00:35, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14978: train loss 2.85594:  97%|█████████▋| 14978/15515 [16:37<00:35, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14982: train loss 2.81710:  97%|█████████▋| 14982/15515 [16:37<00:35, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14986: train loss 2.83950:  97%|█████████▋| 14986/15515 [16:38<00:35, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14990: train loss 2.90309:  97%|█████████▋| 14990/15515 [16:38<00:34, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14994: train loss 2.88728:  97%|█████████▋| 14994/15515 [16:38<00:34, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 14998: train loss 2.90174:  97%|█████████▋| 14998/15515 [16:38<00:34, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15002: train loss 2.86928:  97%|█████████▋| 15002/15515 [16:39<00:33, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15006: train loss 2.90282:  97%|█████████▋| 15006/15515 [16:39<00:34, 14.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15009: train loss 2.86060:  97%|█████████▋| 15010/15515 [16:39<00:33, 14.97it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15013: train loss 2.79599:  97%|█████████▋| 15014/15515 [16:39<00:32, 15.20it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15017: train loss 2.90448:  97%|█████████▋| 15018/15515 [16:40<00:32, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15021: train loss 2.84556:  97%|█████████▋| 15022/15515 [16:40<00:31, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15025: train loss 2.84176:  97%|█████████▋| 15026/15515 [16:40<00:31, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15029: train loss 2.91272:  97%|█████████▋| 15030/15515 [16:41<00:31, 15.26it/s]

32459 128
32459 128
32459 128
32459 

epoch 0 iter 15032: train loss 2.86413:  97%|█████████▋| 15032/15515 [16:41<00:31, 15.16it/s]

128
32459 128
32459 128
32459 128


epoch 0 iter 15036: train loss 2.89731:  97%|█████████▋| 15036/15515 [16:41<00:32, 14.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15039: train loss 2.81850:  97%|█████████▋| 15040/15515 [16:41<00:32, 14.74it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15042: train loss 2.86972:  97%|█████████▋| 15042/15515 [16:41<00:32, 14.73it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15045: train loss 2.83792:  97%|█████████▋| 15046/15515 [16:42<00:31, 14.72it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15049: train loss 2.88532:  97%|█████████▋| 15050/15515 [16:42<00:31, 14.80it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15052: train loss 2.83795:  97%|█████████▋| 15052/15515 [16:42<00:31, 14.68it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15056: train loss 2.88741:  97%|█████████▋| 15056/15515 [16:42<00:31, 14.76it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15060: train loss 2.85531:  97%|█████████▋| 15060/15515 [16:43<00:30, 14.74it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15064: train loss 2.88394:  97%|█████████▋| 15064/15515 [16:43<00:30, 14.94it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15068: train loss 2.89549:  97%|█████████▋| 15068/15515 [16:43<00:29, 14.94it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15071: train loss 2.87667:  97%|█████████▋| 15072/15515 [16:43<00:29, 14.92it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 15074: train loss 2.89509:  97%|█████████▋| 15074/15515 [16:44<00:29, 14.93it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 15078: train loss 2.84433:  97%|█████████▋| 15078/15515 [16:44<00:29, 15.03it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15082: train loss 2.89795:  97%|█████████▋| 15082/15515 [16:44<00:28, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15086: train loss 2.85726:  97%|█████████▋| 15086/15515 [16:44<00:27, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15090: train loss 2.85363:  97%|█████████▋| 15090/15515 [16:45<00:27, 15.50it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15094: train loss 2.87106:  97%|█████████▋| 15094/15515 [16:45<00:27, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15098: train loss 2.80789:  97%|█████████▋| 15098/15515 [16:45<00:26, 15.60it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15102: train loss 2.85534:  97%|█████████▋| 15102/15515 [16:45<00:27, 15.28it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15105: train loss 2.88766:  97%|█████████▋| 15106/15515 [16:46<00:27, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15109: train loss 2.81960:  97%|█████████▋| 15110/15515 [16:46<00:26, 15.12it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 15112: train loss 2.90375:  97%|█████████▋| 15112/15515 [16:46<00:26, 15.06it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 15116: train loss 2.85352:  97%|█████████▋| 15116/15515 [16:46<00:26, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15120: train loss 2.82972:  97%|█████████▋| 15120/15515 [16:47<00:25, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15124: train loss 2.85059:  97%|█████████▋| 15124/15515 [16:47<00:25, 15.22it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15128: train loss 2.89189:  98%|█████████▊| 15128/15515 [16:47<00:25, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15132: train loss 2.86416:  98%|█████████▊| 15132/15515 [16:47<00:25, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15136: train loss 2.90392:  98%|█████████▊| 15136/15515 [16:48<00:25, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15140: train loss 2.87677:  98%|█████████▊| 15140/15515 [16:48<00:24, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15144: train loss 2.86359:  98%|█████████▊| 15144/15515 [16:48<00:24, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15148: train loss 2.87352:  98%|█████████▊| 15148/15515 [16:48<00:24, 15.08it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15152: train loss 2.83259:  98%|█████████▊| 15152/15515 [16:49<00:24, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15156: train loss 2.81150:  98%|█████████▊| 15156/15515 [16:49<00:23, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15160: train loss 2.88808:  98%|█████████▊| 15160/15515 [16:49<00:23, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15164: train loss 2.84988:  98%|█████████▊| 15164/15515 [16:49<00:22, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15168: train loss 2.88596:  98%|█████████▊| 15168/15515 [16:50<00:22, 15.38it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15172: train loss 2.77997:  98%|█████████▊| 15172/15515 [16:50<00:22, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15176: train loss 2.86330:  98%|█████████▊| 15176/15515 [16:50<00:22, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15180: train loss 2.87128:  98%|█████████▊| 15180/15515 [16:51<00:22, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15184: train loss 2.84868:  98%|█████████▊| 15184/15515 [16:51<00:21, 15.26it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15188: train loss 2.85315:  98%|█████████▊| 15188/15515 [16:51<00:21, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15192: train loss 2.83142:  98%|█████████▊| 15192/15515 [16:51<00:21, 15.34it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15196: train loss 2.83776:  98%|█████████▊| 15196/15515 [16:52<00:20, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15200: train loss 2.83273:  98%|█████████▊| 15200/15515 [16:52<00:20, 15.17it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 15203: train loss 2.90670:  98%|█████████▊| 15204/15515 [16:52<00:20, 15.16it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 15207: train loss 2.85116:  98%|█████████▊| 15208/15515 [16:52<00:20, 15.10it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 15210: train loss 2.87498:  98%|█████████▊| 15210/15515 [16:53<00:20, 15.07it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 15214: train loss 2.81871:  98%|█████████▊| 15214/15515 [16:53<00:19, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15218: train loss 2.86336:  98%|█████████▊| 15218/15515 [16:53<00:19, 15.02it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15222: train loss 2.86742:  98%|█████████▊| 15222/15515 [16:53<00:19, 15.10it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15226: train loss 2.87122:  98%|█████████▊| 15226/15515 [16:54<00:18, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15230: train loss 2.92160:  98%|█████████▊| 15230/15515 [16:54<00:18, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15234: train loss 2.84705:  98%|█████████▊| 15234/15515 [16:54<00:18, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15238: train loss 2.88020:  98%|█████████▊| 15238/15515 [16:54<00:18, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15242: train loss 2.83416:  98%|█████████▊| 15242/15515 [16:55<00:18, 15.16it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15246: train loss 2.88385:  98%|█████████▊| 15246/15515 [16:55<00:17, 15.12it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15250: train loss 2.88054:  98%|█████████▊| 15250/15515 [16:55<00:17, 15.17it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15254: train loss 2.91749:  98%|█████████▊| 15254/15515 [16:55<00:17, 15.11it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15258: train loss 2.87050:  98%|█████████▊| 15258/15515 [16:56<00:16, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15262: train loss 2.89263:  98%|█████████▊| 15262/15515 [16:56<00:16, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15266: train loss 2.90799:  98%|█████████▊| 15266/15515 [16:56<00:16, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15270: train loss 2.88027:  98%|█████████▊| 15270/15515 [16:56<00:16, 14.96it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15274: train loss 2.88683:  98%|█████████▊| 15274/15515 [16:57<00:15, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15278: train loss 2.85001:  98%|█████████▊| 15278/15515 [16:57<00:15, 15.09it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 15281: train loss 2.89307:  98%|█████████▊| 15282/15515 [16:57<00:15, 15.09it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 15285: train loss 2.88766:  99%|█████████▊| 15286/15515 [16:57<00:14, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15289: train loss 2.85411:  99%|█████████▊| 15290/15515 [16:58<00:14, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15293: train loss 2.83694:  99%|█████████▊| 15294/15515 [16:58<00:14, 15.24it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15295: train loss 2.86406:  99%|█████████▊| 15296/15515 [16:58<00:14, 14.89it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15299: train loss 2.90741:  99%|█████████▊| 15300/15515 [16:58<00:14, 14.93it/s]

32459 128
32459 128
32459 128
32459

epoch 0 iter 15302: train loss 2.82128:  99%|█████████▊| 15302/15515 [16:59<00:14, 14.95it/s]

 128
32459 128
32459 128
32459 128


epoch 0 iter 15306: train loss 2.83119:  99%|█████████▊| 15306/15515 [16:59<00:13, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15310: train loss 2.85294:  99%|█████████▊| 15310/15515 [16:59<00:13, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15314: train loss 2.90410:  99%|█████████▊| 15314/15515 [16:59<00:13, 15.37it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15318: train loss 2.84368:  99%|█████████▊| 15318/15515 [17:00<00:12, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15322: train loss 2.84572:  99%|█████████▉| 15322/15515 [17:00<00:12, 15.53it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15326: train loss 2.93735:  99%|█████████▉| 15326/15515 [17:00<00:12, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15330: train loss 2.85546:  99%|█████████▉| 15330/15515 [17:00<00:12, 15.36it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15334: train loss 2.85239:  99%|█████████▉| 15334/15515 [17:01<00:11, 15.45it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15338: train loss 2.89839:  99%|█████████▉| 15338/15515 [17:01<00:11, 15.61it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15342: train loss 2.92948:  99%|█████████▉| 15342/15515 [17:01<00:11, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15346: train loss 2.90703:  99%|█████████▉| 15346/15515 [17:01<00:11, 15.32it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15350: train loss 2.86401:  99%|█████████▉| 15350/15515 [17:02<00:10, 15.39it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15354: train loss 2.88352:  99%|█████████▉| 15354/15515 [17:02<00:10, 15.47it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15358: train loss 2.90447:  99%|█████████▉| 15358/15515 [17:02<00:10, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15362: train loss 2.92150:  99%|█████████▉| 15362/15515 [17:02<00:10, 15.21it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15366: train loss 2.95725:  99%|█████████▉| 15366/15515 [17:03<00:09, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15370: train loss 2.86093:  99%|█████████▉| 15370/15515 [17:03<00:09, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15374: train loss 2.90174:  99%|█████████▉| 15374/15515 [17:03<00:09, 15.04it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15378: train loss 2.85117:  99%|█████████▉| 15378/15515 [17:04<00:09, 14.89it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15382: train loss 2.88279:  99%|█████████▉| 15382/15515 [17:04<00:08, 15.23it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15386: train loss 2.79104:  99%|█████████▉| 15386/15515 [17:04<00:08, 15.41it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15390: train loss 2.88214:  99%|█████████▉| 15390/15515 [17:04<00:08, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15394: train loss 2.89946:  99%|█████████▉| 15394/15515 [17:05<00:07, 15.27it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15398: train loss 2.84864:  99%|█████████▉| 15398/15515 [17:05<00:07, 15.31it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15402: train loss 2.86503:  99%|█████████▉| 15402/15515 [17:05<00:07, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15406: train loss 2.89662:  99%|█████████▉| 15406/15515 [17:05<00:07, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15410: train loss 2.84976:  99%|█████████▉| 15410/15515 [17:06<00:06, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15414: train loss 2.84759:  99%|█████████▉| 15414/15515 [17:06<00:06, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15418: train loss 2.80761:  99%|█████████▉| 15418/15515 [17:06<00:06, 15.14it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15422: train loss 2.85566:  99%|█████████▉| 15422/15515 [17:06<00:06, 15.18it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15426: train loss 2.84791:  99%|█████████▉| 15426/15515 [17:07<00:05, 15.40it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15430: train loss 2.84390:  99%|█████████▉| 15430/15515 [17:07<00:05, 15.51it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15434: train loss 2.85337:  99%|█████████▉| 15434/15515 [17:07<00:05, 15.49it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15438: train loss 2.89012: 100%|█████████▉| 15438/15515 [17:07<00:04, 15.44it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15441: train loss 2.80059: 100%|█████████▉| 15442/15515 [17:08<00:04, 15.25it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15445: train loss 2.88709: 100%|█████████▉| 15446/15515 [17:08<00:04, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15449: train loss 2.88958: 100%|█████████▉| 15450/15515 [17:08<00:04, 15.43it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15453: train loss 2.90595: 100%|█████████▉| 15454/15515 [17:08<00:04, 15.02it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15456: train loss 2.84337: 100%|█████████▉| 15456/15515 [17:09<00:03, 15.15it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15460: train loss 2.76989: 100%|█████████▉| 15460/15515 [17:09<00:03, 15.09it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15464: train loss 2.80910: 100%|█████████▉| 15464/15515 [17:09<00:03, 15.13it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15468: train loss 2.87029: 100%|█████████▉| 15468/15515 [17:09<00:03, 15.24it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15472: train loss 2.88750: 100%|█████████▉| 15472/15515 [17:10<00:02, 15.33it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15476: train loss 2.87934: 100%|█████████▉| 15476/15515 [17:10<00:02, 15.19it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15480: train loss 2.81360: 100%|█████████▉| 15480/15515 [17:10<00:02, 15.29it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15484: train loss 2.89854: 100%|█████████▉| 15484/15515 [17:11<00:02, 15.30it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15488: train loss 2.82670: 100%|█████████▉| 15488/15515 [17:11<00:01, 15.34it/s]

32459 128
32459 128
32459 128
32459 128

epoch 0 iter 15491: train loss 2.84711: 100%|█████████▉| 15492/15515 [17:11<00:01, 15.04it/s]


32459 128
32459 128
32459

epoch 0 iter 15494: train loss 2.94370: 100%|█████████▉| 15494/15515 [17:11<00:01, 14.92it/s]

 128
32459 128
32459 128
32459

epoch 0 iter 15497: train loss 2.86359: 100%|█████████▉| 15498/15515 [17:11<00:01, 14.92it/s]

 128
32459 128
32459 128


epoch 0 iter 15500: train loss 2.93495: 100%|█████████▉| 15500/15515 [17:12<00:01, 14.95it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15503: train loss 2.89763: 100%|█████████▉| 15504/15515 [17:12<00:00, 14.85it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15507: train loss 2.86149: 100%|█████████▉| 15508/15515 [17:12<00:00, 14.78it/s]

32459 128
32459 128
32459 128


epoch 0 iter 15510: train loss 2.87943: 100%|█████████▉| 15510/15515 [17:12<00:00, 14.87it/s]

32459 128
32459 128
32459 128
32459 128


epoch 0 iter 15513: train loss 2.88508: 100%|█████████▉| 15514/15515 [17:12<00:00, 14.87it/s]

32459 128
32459 128


epoch 0 iter 15514: train loss 2.80284: 100%|██████████| 15515/15515 [17:13<00:00, 15.01it/s]

32459 128





In [86]:
block_size

32

In [95]:
model.eval()

# def generate_text(model, dataset, device, seed_text, n_words
# ):
#     block_size = 32
#     word2idx = dataset.get_word2idx()
#     idx2word = dataset.get_idx2word()
#     vocab_size = dataset.get_vocab_size()
#     words = dataset.get_words()
#     model.eval()
#     with torch.no_grad():
#         x = torch.tensor([word2idx[s] for s in seed_text], dtype=torch.long).to(device)
#         for _ in range(n_words):
#             x = x.unsqueeze(0)
#             logits = model(x, mask=True)
#             logits = logits[:, -1, :]
#             probas = F.softmax(logits, dim=-1)
#             next_token = torch.multinomial(probas, num_samples=1)
#             next_token = next_token.item()
#             x = torch.cat((x, torch.tensor([[next_token]], dtype=torch.long).to(device)), dim=1)

#     return [idx2word[i] for i in x[0]]

# generate_text(model, train_dataset, device, "the", 100)



MyTransformerDecoderOnly(
  (embedding): Embedding(
    (embedding): Embedding(32459, 128)
  )
  (transpose_embedding): Linear(in_features=128, out_features=32459, bias=True)
  (pos_enc): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): Decoder(
    (layers): ModuleList(
      (0-1): 2 x DecoderLayer(
        (multi_head_attention): MultiHeadAttention(
          (WO): Linear(in_features=128, out_features=128, bias=True)
          (WQ): Linear(in_features=128, out_features=128, bias=True)
          (WK): Linear(in_features=128, out_features=128, bias=True)
          (WV): Linear(in_features=128, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (ff): Sequential(
       