# This notebook trains a CNN for text classification (AG_NEWS dataset) with GloVe embeddings

In [None]:
!pip uninstall --yes torch
!pip install torch==2.2.0
!pip install portalocker
!pip install torchmetrics
!pip install torchtext==0.17.0

In [2]:
import argparse
import logging
import time

import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.datasets import DATASETS
from torchtext.prototype.transforms import load_sp_model, PRETRAINED_SP_MODEL, SentencePieceTokenizer
from torchtext.utils import download_from_url
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torchtext.vocab import GloVe
from tqdm import tqdm

torch.autograd.set_detect_anomaly(True)


### Constants

In [3]:
DATASET = "AG_NEWS"
DATA_DIR = ".data"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EMBED_DIM = 300
LR = 1.0
BATCH_SIZE = 128
NUM_EPOCHS = 5
PADDING_VALUE = 0
PADDING_IDX = PADDING_VALUE

In [4]:
print(DEVICE)

cuda


### Get the tokenizer
- Use the WordLevel tokenizer.


In [5]:
# Get basic tokenizer
basic_english_tokenizer = get_tokenizer("basic_english")

In [6]:
basic_english_tokenizer("This is some text ...")

['this', 'is', 'some', 'text', '.', '.', '.']

In [7]:
TOKENIZER = basic_english_tokenizer

### Get the data and get the vocabulary

In [8]:
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield TOKENIZER(text)

In [9]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
VOCAB = build_vocab_from_iterator(yield_tokens(train_iter), specials=('<pad>', '<unk>'))

# Make the default index the same as that of the unk_token.
VOCAB.set_default_index(VOCAB['<unk>'])

In [10]:
print(next(iter(train_iter)))

(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")


In [11]:
print("Vocabulary size:", len(VOCAB))

Vocabulary size: 95812


### Get GLOVE embeddings

In [12]:
# Get the GloVe embeddings
GLOVE = GloVe(name="840B", dim=EMBED_DIM)

In [13]:
len(GLOVE), GLOVE.vectors.shape

(2196017, torch.Size([2196017, 300]))

### Helper functions

In [14]:
def text_pipeline(text):
    return VOCAB(TOKENIZER(text))

def label_pipeline(label):
    return int(label) - 1

Nice link on collate_fn and DataLoader in PyTorch: https://python.plainenglish.io/understanding-collate-fn-in-pytorch-f9d1742647d3

In [15]:
# As before, loop through the batch and transform into tensors
def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        # Get the label from {1, 2, 3, 4} to {0, 1, 2, 3}
        label_list.append(label_pipeline(_label))

        # Return a list of ints
        processed_text = text_pipeline(_text)
        # Append to text_list
        text_list.append(torch.tensor(processed_text, dtype=torch.int64))

    # Pad and make into tensors as needed
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=0)

    return label_list.to(DEVICE), text_list.to(DEVICE)

### Get the data

In [16]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
num_class = len(set([label for (label, _) in train_iter]))

print(f"The number of classes is {num_class} ...")

The number of classes is 4 ...


### Set up the model

Good reference on this type of model
- CNN for Sentence Classification: https://arxiv.org/pdf/1408.5882.pdf

In [17]:
class CNN1dTextClassificationModel(nn.Module):
    def __init__(
        self,
        vocab_size,
        num_class,
        embed_dim = 300,
        use_pretrained = True,
        fine_tune_embeddings = True,
        debug = False
    ):

        super(CNN1dTextClassificationModel, self).__init__()

        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim,
            padding_idx=PADDING_IDX
        )

        if use_pretrained:
            # Set the embeddings to not requiring gradients since we'll try and modify
            self.embedding.weight.requires_grad = False
            for i in range(vocab_size):
                # Get the token for the index i
                token = VOCAB.lookup_token(i)
                # Modify the embedding for index i by the embedding for that token
                # Do this only if token is in the stoi dictionary for GLOVE
                if token in GLOVE.stoi:
                    self.embedding.weight.data[i] = GLOVE.vectors[GLOVE.stoi[token]].clone().detach()
            self.embedding.weight.requires_grad = fine_tune_embeddings
        else:
            # Otherwise, initialize the weights as specified below
            self.init_weights()

        # If weights do not get changed, turn off gradients for the GloVe embeddings
        if not fine_tune_embeddings:
            self.embedding.weight.requires_grad = False

        # Define a Conv1d layer that collapses all the channels and does not collapse the time dimension
        self.cnn1 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=1)

        # Define 3 Conv1d layers each having 1 filter and kernel sizes 2, 3 and 4
        self.cnn2 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=2)
        self.cnn3 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=3)
        self.cnn4 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=4)

        # A linear map from some dimensions to num_class
        self.fc = nn.Linear(3, num_class)

        self.debug = debug

    def init_weights(self):
        initrange = 0.5
        # Initialize the embedding weight matrix to uniform between the [-0.5, 0.5]
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # Initialize the weight matrix of fc to uniform between the [-0.5, 0.5]
        self.fc.weight.data.uniform_(-initrange, initrange)
        # Initialize the bias for fc to zero
        self.fc.bias.data.zero_()

    # B = batch_size, L = sequence length, D = vector dimension
    def forward(self, text):

        # Get the embeddings for the text passed in
        embedded = self.embedding(text)

        if self.debug:
            print('embedding', embedded.shape)

        # Transpose the embedding above as needed
        embedded = embedded.permute(0, 2, 1)

        # Pass through cnn1
        cnn1 = self.cnn1(embedded)
        if self.debug:
            print('cnn1', cnn1.shape)

        # Take Max pooling over time
        cnn1 = F.max_pool1d(cnn1, cnn1.shape[2]).squeeze(2)
        if self.debug:
          print('cnn1 after max pool', cnn1.shape)

        # Pass through cnn2 and add a RELU
        cnn2 = F.relu(self.cnn2(embedded))
        if self.debug:
            print('cnn2', cnn2.shape)

        # Pass through cnn3 and add a RELU
        cnn3 = F.relu(self.cnn3(embedded))
        if self.debug:
            print('cnn3', cnn3.shape)

        # Pass through cnn4 and add a RELU
        cnn4 = F.relu(self.cnn4(embedded))
        if self.debug:
            print('cnn4', cnn4.shape)

        # Apply max pooling to each of cnn2, cnn3 and cnn4
        cnn2 = F.max_pool1d(cnn2, cnn2.shape[2]).squeeze(2)
        cnn3 = F.max_pool1d(cnn3, cnn3.shape[2]).squeeze(2)
        cnn4 = F.max_pool1d(cnn4, cnn4.shape[2]).squeeze(2)

        # Apply max pooling over time
        if self.debug:
            print('cnn2 after max', cnn2.shape)

        # Add to each cnn2, 3, 4 a skip connection to cnn1 and average the results
        cnn2 = (cnn2 + cnn1) / 2
        cnn3 = (cnn3 + cnn1) / 2
        cnn4 = (cnn4 + cnn1) / 2
        if self.debug:
            print('cnn2 after skip connection', cnn2.shape)

        # Concatenate the above
        cnn_concat = torch.cat((cnn2, cnn3, cnn4), dim=1)
        if self.debug:
            print('cnn concat', cnn_concat.shape)
            # Set the debug to False after the first forward pass
            self.debug = False

        # Pass through an appropriate Linear layer to get the right dimensions needed
        out = self.fc(cnn_concat)

        return out

### Set up the model

In [18]:
# If this is True, we will initialize the Embedding layer with GLOVE
USE_PRETRANED = True,

# If this is True, we will allow for gradient updates on the nn.Embedding layer
FINE_TUNE_EMBEDDINGS = True

# Set the loss
criterion = nn.CrossEntropyLoss()

model = CNN1dTextClassificationModel(
    vocab_size=len(VOCAB),
    num_class=num_class,
    embed_dim=EMBED_DIM,
    use_pretrained=USE_PRETRANED,
    fine_tune_embeddings=FINE_TUNE_EMBEDDINGS
).to(DEVICE)

# Set the optimizer to SGD
# Add an L2 regularizer of 0.00001
optimizer = torch.optim.SGD(model.parameters(), lr=LR, weight_decay=0.00001)

# Set the scheduler to StepLR with gamma=0.1 and step_size = 1.0
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

### Set up the data

In [19]:
train_iter, test_iter = DATASETS[DATASET]()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

### Train the model

In [20]:
def train(dataloader, model, optimizer, criterion, epoch):
    model.train()
    total_acc, total_count = 0, 0
    total_loss, total_batches = 0.0, 0
    total_zero_gradients_percentage = []
    log_interval = 10

    for idx, (label, text) in tqdm(enumerate(dataloader)):
        # Set gradients to zero
        optimizer.zero_grad()

        # Get the predictions
        predicted_label = model(text)

        # Get the loss
        loss = criterion(input=predicted_label, target=label)

        # Do back propagation and get the gradients
        loss.backward()

        # Get the loss per batch and the number of batches
        with torch.no_grad():
            total_loss += loss.item()
            total_batches += 1

        # Loop through all the parameters
        # Append this to the list above which will print out the total every 10 batches
        total_nonzero_gradients = 0.0
        total_param_count = 0.0
        for param in model.parameters():
            if param.grad is not None:
                total_nonzero_gradients += (param.grad != 0).sum().item()
                total_param_count += param.grad.numel()

        # Append to total_zero_gradients_percentage
        if total_param_count > 0:
            total_zero_gradients_percentage.append(1 - (total_nonzero_gradients / total_param_count))


        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        # Do an optimization step
        optimizer.step()

        # Get the accuracy
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)

        # Log results
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f} "
                "| loss {:8.3f} "
                "| zero gradients percentage {:8.3f}".format(
                    epoch, idx,
                    len(dataloader),
                    total_acc / total_count,
                    total_loss / total_batches,
                    torch.tensor(total_zero_gradients_percentage).mean().item()
                    )
            )
            # Reset variables as needed
            total_acc, total_count = 0, 0
            total_loss, total_batches = 0.0, 0
            total_zero_gradients_percentage = []

In [21]:
def evaluate(dataloader, model, criterion):
    model.eval()
    total_acc, total_count = 0, 0
    total_loss = 0.0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(input=predicted_label, target=label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()
    return total_acc / total_count, total_loss / total_count

In [22]:
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    accu_val, loss_val = evaluate(valid_dataloader, model, criterion)
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch,
            time.time() - epoch_start_time,
            accu_val,
            loss_val
            )
    )
    print("-" * 59)

print("Checking the results of test dataset.")
accu_test, loss_test = evaluate(test_dataloader, model, criterion)
print("test accuracy {:8.3f}".format(accu_test))
print("test loss {:8.3f}".format(loss_test))

16it [00:01, 21.43it/s]

| epoch   1 |    10/  891 batches | accuracy    0.354 | loss    1.357 | zero gradients percentage    0.993


25it [00:01, 25.43it/s]

| epoch   1 |    20/  891 batches | accuracy    0.587 | loss    1.222 | zero gradients percentage    0.992


34it [00:01, 27.17it/s]

| epoch   1 |    30/  891 batches | accuracy    0.653 | loss    1.074 | zero gradients percentage    0.993


44it [00:02, 28.10it/s]

| epoch   1 |    40/  891 batches | accuracy    0.640 | loss    0.965 | zero gradients percentage    0.993


57it [00:02, 29.06it/s]

| epoch   1 |    50/  891 batches | accuracy    0.683 | loss    0.866 | zero gradients percentage    0.993


66it [00:02, 28.76it/s]

| epoch   1 |    60/  891 batches | accuracy    0.711 | loss    0.806 | zero gradients percentage    0.993


76it [00:03, 29.56it/s]

| epoch   1 |    70/  891 batches | accuracy    0.718 | loss    0.789 | zero gradients percentage    0.993


85it [00:03, 28.68it/s]

| epoch   1 |    80/  891 batches | accuracy    0.741 | loss    0.718 | zero gradients percentage    0.993


97it [00:03, 30.73it/s]

| epoch   1 |    90/  891 batches | accuracy    0.768 | loss    0.681 | zero gradients percentage    0.993


105it [00:04, 30.52it/s]

| epoch   1 |   100/  891 batches | accuracy    0.795 | loss    0.630 | zero gradients percentage    0.992


117it [00:04, 31.39it/s]

| epoch   1 |   110/  891 batches | accuracy    0.788 | loss    0.614 | zero gradients percentage    0.993


125it [00:04, 31.64it/s]

| epoch   1 |   120/  891 batches | accuracy    0.817 | loss    0.571 | zero gradients percentage    0.992


137it [00:05, 30.79it/s]

| epoch   1 |   130/  891 batches | accuracy    0.811 | loss    0.552 | zero gradients percentage    0.992


145it [00:05, 30.46it/s]

| epoch   1 |   140/  891 batches | accuracy    0.808 | loss    0.557 | zero gradients percentage    0.992


155it [00:05, 29.40it/s]

| epoch   1 |   150/  891 batches | accuracy    0.808 | loss    0.566 | zero gradients percentage    0.992


166it [00:06, 30.03it/s]

| epoch   1 |   160/  891 batches | accuracy    0.816 | loss    0.538 | zero gradients percentage    0.992


174it [00:06, 30.83it/s]

| epoch   1 |   170/  891 batches | accuracy    0.826 | loss    0.527 | zero gradients percentage    0.992


186it [00:06, 30.16it/s]

| epoch   1 |   180/  891 batches | accuracy    0.827 | loss    0.519 | zero gradients percentage    0.992


194it [00:07, 30.92it/s]

| epoch   1 |   190/  891 batches | accuracy    0.836 | loss    0.495 | zero gradients percentage    0.992


206it [00:07, 31.28it/s]

| epoch   1 |   200/  891 batches | accuracy    0.832 | loss    0.498 | zero gradients percentage    0.992


214it [00:07, 31.50it/s]

| epoch   1 |   210/  891 batches | accuracy    0.823 | loss    0.526 | zero gradients percentage    0.992


226it [00:08, 31.16it/s]

| epoch   1 |   220/  891 batches | accuracy    0.831 | loss    0.504 | zero gradients percentage    0.992


234it [00:08, 31.42it/s]

| epoch   1 |   230/  891 batches | accuracy    0.815 | loss    0.495 | zero gradients percentage    0.992


246it [00:08, 30.73it/s]

| epoch   1 |   240/  891 batches | accuracy    0.825 | loss    0.483 | zero gradients percentage    0.992


257it [00:09, 30.16it/s]

| epoch   1 |   250/  891 batches | accuracy    0.837 | loss    0.474 | zero gradients percentage    0.992


265it [00:09, 30.82it/s]

| epoch   1 |   260/  891 batches | accuracy    0.852 | loss    0.445 | zero gradients percentage    0.992


277it [00:09, 31.24it/s]

| epoch   1 |   270/  891 batches | accuracy    0.845 | loss    0.459 | zero gradients percentage    0.992


285it [00:09, 31.19it/s]

| epoch   1 |   280/  891 batches | accuracy    0.827 | loss    0.461 | zero gradients percentage    0.992


297it [00:10, 31.27it/s]

| epoch   1 |   290/  891 batches | accuracy    0.849 | loss    0.430 | zero gradients percentage    0.992


305it [00:10, 32.11it/s]

| epoch   1 |   300/  891 batches | accuracy    0.843 | loss    0.481 | zero gradients percentage    0.992


317it [00:10, 31.38it/s]

| epoch   1 |   310/  891 batches | accuracy    0.838 | loss    0.464 | zero gradients percentage    0.992


325it [00:11, 31.76it/s]

| epoch   1 |   320/  891 batches | accuracy    0.844 | loss    0.471 | zero gradients percentage    0.992


333it [00:11, 31.16it/s]

| epoch   1 |   330/  891 batches | accuracy    0.851 | loss    0.439 | zero gradients percentage    0.992


344it [00:11, 30.62it/s]

| epoch   1 |   340/  891 batches | accuracy    0.841 | loss    0.455 | zero gradients percentage    0.992


356it [00:12, 29.88it/s]

| epoch   1 |   350/  891 batches | accuracy    0.858 | loss    0.416 | zero gradients percentage    0.992


367it [00:12, 30.52it/s]

| epoch   1 |   360/  891 batches | accuracy    0.841 | loss    0.442 | zero gradients percentage    0.992


375it [00:12, 30.82it/s]

| epoch   1 |   370/  891 batches | accuracy    0.834 | loss    0.436 | zero gradients percentage    0.992


387it [00:13, 31.38it/s]

| epoch   1 |   380/  891 batches | accuracy    0.855 | loss    0.437 | zero gradients percentage    0.992


395it [00:13, 31.66it/s]

| epoch   1 |   390/  891 batches | accuracy    0.845 | loss    0.457 | zero gradients percentage    0.992


407it [00:13, 32.08it/s]

| epoch   1 |   400/  891 batches | accuracy    0.848 | loss    0.422 | zero gradients percentage    0.992


415it [00:14, 31.85it/s]

| epoch   1 |   410/  891 batches | accuracy    0.838 | loss    0.443 | zero gradients percentage    0.992


427it [00:14, 32.54it/s]

| epoch   1 |   420/  891 batches | accuracy    0.845 | loss    0.451 | zero gradients percentage    0.992


435it [00:14, 32.51it/s]

| epoch   1 |   430/  891 batches | accuracy    0.865 | loss    0.426 | zero gradients percentage    0.992


447it [00:15, 32.19it/s]

| epoch   1 |   440/  891 batches | accuracy    0.862 | loss    0.407 | zero gradients percentage    0.992


455it [00:15, 32.24it/s]

| epoch   1 |   450/  891 batches | accuracy    0.855 | loss    0.445 | zero gradients percentage    0.992


467it [00:15, 31.51it/s]

| epoch   1 |   460/  891 batches | accuracy    0.851 | loss    0.453 | zero gradients percentage    0.992


475it [00:16, 31.69it/s]

| epoch   1 |   470/  891 batches | accuracy    0.857 | loss    0.405 | zero gradients percentage    0.992


487it [00:16, 31.18it/s]

| epoch   1 |   480/  891 batches | accuracy    0.838 | loss    0.496 | zero gradients percentage    0.992


495it [00:16, 31.28it/s]

| epoch   1 |   490/  891 batches | accuracy    0.846 | loss    0.433 | zero gradients percentage    0.992


507it [00:17, 31.41it/s]

| epoch   1 |   500/  891 batches | accuracy    0.859 | loss    0.432 | zero gradients percentage    0.992


515it [00:17, 31.57it/s]

| epoch   1 |   510/  891 batches | accuracy    0.868 | loss    0.390 | zero gradients percentage    0.992


527it [00:17, 32.26it/s]

| epoch   1 |   520/  891 batches | accuracy    0.843 | loss    0.467 | zero gradients percentage    0.992


535it [00:17, 31.35it/s]

| epoch   1 |   530/  891 batches | accuracy    0.827 | loss    0.475 | zero gradients percentage    0.992


547it [00:18, 31.74it/s]

| epoch   1 |   540/  891 batches | accuracy    0.845 | loss    0.434 | zero gradients percentage    0.992


555it [00:18, 32.07it/s]

| epoch   1 |   550/  891 batches | accuracy    0.851 | loss    0.433 | zero gradients percentage    0.992


567it [00:18, 31.94it/s]

| epoch   1 |   560/  891 batches | accuracy    0.847 | loss    0.453 | zero gradients percentage    0.992


575it [00:19, 31.04it/s]

| epoch   1 |   570/  891 batches | accuracy    0.856 | loss    0.434 | zero gradients percentage    0.992


587it [00:19, 30.49it/s]

| epoch   1 |   580/  891 batches | accuracy    0.834 | loss    0.450 | zero gradients percentage    0.992


595it [00:19, 31.35it/s]

| epoch   1 |   590/  891 batches | accuracy    0.852 | loss    0.426 | zero gradients percentage    0.992


607it [00:20, 30.40it/s]

| epoch   1 |   600/  891 batches | accuracy    0.870 | loss    0.404 | zero gradients percentage    0.992


615it [00:20, 31.13it/s]

| epoch   1 |   610/  891 batches | accuracy    0.862 | loss    0.421 | zero gradients percentage    0.992


627it [00:20, 30.91it/s]

| epoch   1 |   620/  891 batches | accuracy    0.845 | loss    0.443 | zero gradients percentage    0.992


635it [00:21, 30.64it/s]

| epoch   1 |   630/  891 batches | accuracy    0.845 | loss    0.411 | zero gradients percentage    0.992


647it [00:21, 30.72it/s]

| epoch   1 |   640/  891 batches | accuracy    0.858 | loss    0.395 | zero gradients percentage    0.992


655it [00:21, 31.21it/s]

| epoch   1 |   650/  891 batches | accuracy    0.839 | loss    0.458 | zero gradients percentage    0.992


667it [00:22, 31.52it/s]

| epoch   1 |   660/  891 batches | accuracy    0.856 | loss    0.418 | zero gradients percentage    0.992


675it [00:22, 32.17it/s]

| epoch   1 |   670/  891 batches | accuracy    0.837 | loss    0.458 | zero gradients percentage    0.992


687it [00:22, 31.08it/s]

| epoch   1 |   680/  891 batches | accuracy    0.873 | loss    0.389 | zero gradients percentage    0.992


695it [00:23, 31.63it/s]

| epoch   1 |   690/  891 batches | accuracy    0.850 | loss    0.429 | zero gradients percentage    0.992


707it [00:23, 32.16it/s]

| epoch   1 |   700/  891 batches | accuracy    0.853 | loss    0.416 | zero gradients percentage    0.992


715it [00:23, 32.06it/s]

| epoch   1 |   710/  891 batches | accuracy    0.849 | loss    0.449 | zero gradients percentage    0.992


727it [00:24, 31.85it/s]

| epoch   1 |   720/  891 batches | accuracy    0.866 | loss    0.394 | zero gradients percentage    0.992


735it [00:24, 31.90it/s]

| epoch   1 |   730/  891 batches | accuracy    0.859 | loss    0.422 | zero gradients percentage    0.992


747it [00:24, 32.06it/s]

| epoch   1 |   740/  891 batches | accuracy    0.863 | loss    0.419 | zero gradients percentage    0.992


755it [00:24, 32.03it/s]

| epoch   1 |   750/  891 batches | accuracy    0.859 | loss    0.392 | zero gradients percentage    0.992


767it [00:25, 32.26it/s]

| epoch   1 |   760/  891 batches | accuracy    0.864 | loss    0.399 | zero gradients percentage    0.992


775it [00:25, 32.51it/s]

| epoch   1 |   770/  891 batches | accuracy    0.862 | loss    0.399 | zero gradients percentage    0.992


787it [00:25, 31.87it/s]

| epoch   1 |   780/  891 batches | accuracy    0.850 | loss    0.415 | zero gradients percentage    0.992


795it [00:26, 31.78it/s]

| epoch   1 |   790/  891 batches | accuracy    0.850 | loss    0.439 | zero gradients percentage    0.992


807it [00:26, 31.75it/s]

| epoch   1 |   800/  891 batches | accuracy    0.863 | loss    0.401 | zero gradients percentage    0.992


815it [00:26, 31.97it/s]

| epoch   1 |   810/  891 batches | accuracy    0.841 | loss    0.455 | zero gradients percentage    0.992


827it [00:27, 31.96it/s]

| epoch   1 |   820/  891 batches | accuracy    0.854 | loss    0.428 | zero gradients percentage    0.992


835it [00:27, 32.21it/s]

| epoch   1 |   830/  891 batches | accuracy    0.852 | loss    0.413 | zero gradients percentage    0.992


847it [00:27, 31.71it/s]

| epoch   1 |   840/  891 batches | accuracy    0.866 | loss    0.398 | zero gradients percentage    0.992


855it [00:28, 31.11it/s]

| epoch   1 |   850/  891 batches | accuracy    0.852 | loss    0.452 | zero gradients percentage    0.992


863it [00:28, 31.16it/s]

| epoch   1 |   860/  891 batches | accuracy    0.849 | loss    0.410 | zero gradients percentage    0.992


875it [00:28, 30.81it/s]

| epoch   1 |   870/  891 batches | accuracy    0.867 | loss    0.410 | zero gradients percentage    0.992


887it [00:29, 31.47it/s]

| epoch   1 |   880/  891 batches | accuracy    0.860 | loss    0.412 | zero gradients percentage    0.992


891it [00:29, 30.49it/s]


| epoch   1 |   890/  891 batches | accuracy    0.870 | loss    0.381 | zero gradients percentage    0.992
-----------------------------------------------------------
| end of epoch   1 | time: 29.60s | valid accuracy    0.853 
-----------------------------------------------------------


15it [00:00, 31.16it/s]

| epoch   2 |    10/  891 batches | accuracy    0.864 | loss    0.404 | zero gradients percentage    0.992


27it [00:00, 31.76it/s]

| epoch   2 |    20/  891 batches | accuracy    0.864 | loss    0.387 | zero gradients percentage    0.992


35it [00:01, 31.97it/s]

| epoch   2 |    30/  891 batches | accuracy    0.867 | loss    0.382 | zero gradients percentage    0.992


47it [00:01, 31.50it/s]

| epoch   2 |    40/  891 batches | accuracy    0.865 | loss    0.409 | zero gradients percentage    0.992


55it [00:01, 31.02it/s]

| epoch   2 |    50/  891 batches | accuracy    0.870 | loss    0.406 | zero gradients percentage    0.992


67it [00:02, 31.52it/s]

| epoch   2 |    60/  891 batches | accuracy    0.873 | loss    0.368 | zero gradients percentage    0.992


75it [00:02, 31.51it/s]

| epoch   2 |    70/  891 batches | accuracy    0.870 | loss    0.391 | zero gradients percentage    0.992


87it [00:02, 31.10it/s]

| epoch   2 |    80/  891 batches | accuracy    0.871 | loss    0.368 | zero gradients percentage    0.992


95it [00:03, 30.69it/s]

| epoch   2 |    90/  891 batches | accuracy    0.878 | loss    0.357 | zero gradients percentage    0.992


107it [00:03, 31.59it/s]

| epoch   2 |   100/  891 batches | accuracy    0.877 | loss    0.379 | zero gradients percentage    0.992


115it [00:03, 31.35it/s]

| epoch   2 |   110/  891 batches | accuracy    0.861 | loss    0.402 | zero gradients percentage    0.992


127it [00:04, 31.89it/s]

| epoch   2 |   120/  891 batches | accuracy    0.859 | loss    0.413 | zero gradients percentage    0.992


135it [00:04, 32.11it/s]

| epoch   2 |   130/  891 batches | accuracy    0.883 | loss    0.360 | zero gradients percentage    0.992


147it [00:04, 32.34it/s]

| epoch   2 |   140/  891 batches | accuracy    0.873 | loss    0.378 | zero gradients percentage    0.992


155it [00:04, 32.06it/s]

| epoch   2 |   150/  891 batches | accuracy    0.872 | loss    0.361 | zero gradients percentage    0.992


167it [00:05, 32.24it/s]

| epoch   2 |   160/  891 batches | accuracy    0.858 | loss    0.389 | zero gradients percentage    0.992


175it [00:05, 32.42it/s]

| epoch   2 |   170/  891 batches | accuracy    0.866 | loss    0.392 | zero gradients percentage    0.992


187it [00:05, 32.25it/s]

| epoch   2 |   180/  891 batches | accuracy    0.858 | loss    0.408 | zero gradients percentage    0.992


195it [00:06, 32.34it/s]

| epoch   2 |   190/  891 batches | accuracy    0.862 | loss    0.407 | zero gradients percentage    0.992


207it [00:06, 32.44it/s]

| epoch   2 |   200/  891 batches | accuracy    0.869 | loss    0.389 | zero gradients percentage    0.992


215it [00:06, 32.46it/s]

| epoch   2 |   210/  891 batches | accuracy    0.863 | loss    0.397 | zero gradients percentage    0.992


227it [00:07, 31.77it/s]

| epoch   2 |   220/  891 batches | accuracy    0.862 | loss    0.420 | zero gradients percentage    0.992


235it [00:07, 31.64it/s]

| epoch   2 |   230/  891 batches | accuracy    0.863 | loss    0.397 | zero gradients percentage    0.992


247it [00:07, 31.85it/s]

| epoch   2 |   240/  891 batches | accuracy    0.864 | loss    0.422 | zero gradients percentage    0.992


255it [00:08, 32.05it/s]

| epoch   2 |   250/  891 batches | accuracy    0.851 | loss    0.420 | zero gradients percentage    0.992


267it [00:08, 32.34it/s]

| epoch   2 |   260/  891 batches | accuracy    0.866 | loss    0.380 | zero gradients percentage    0.992


275it [00:08, 32.58it/s]

| epoch   2 |   270/  891 batches | accuracy    0.879 | loss    0.351 | zero gradients percentage    0.992


287it [00:09, 32.34it/s]

| epoch   2 |   280/  891 batches | accuracy    0.870 | loss    0.410 | zero gradients percentage    0.992


295it [00:09, 31.68it/s]

| epoch   2 |   290/  891 batches | accuracy    0.873 | loss    0.369 | zero gradients percentage    0.992


307it [00:09, 31.67it/s]

| epoch   2 |   300/  891 batches | accuracy    0.879 | loss    0.351 | zero gradients percentage    0.992


315it [00:09, 31.21it/s]

| epoch   2 |   310/  891 batches | accuracy    0.873 | loss    0.375 | zero gradients percentage    0.992


327it [00:10, 30.20it/s]

| epoch   2 |   320/  891 batches | accuracy    0.854 | loss    0.405 | zero gradients percentage    0.992


335it [00:10, 30.57it/s]

| epoch   2 |   330/  891 batches | accuracy    0.852 | loss    0.423 | zero gradients percentage    0.992


347it [00:10, 31.23it/s]

| epoch   2 |   340/  891 batches | accuracy    0.860 | loss    0.404 | zero gradients percentage    0.992


355it [00:11, 31.46it/s]

| epoch   2 |   350/  891 batches | accuracy    0.862 | loss    0.414 | zero gradients percentage    0.992


367it [00:11, 31.90it/s]

| epoch   2 |   360/  891 batches | accuracy    0.877 | loss    0.385 | zero gradients percentage    0.992


375it [00:11, 31.73it/s]

| epoch   2 |   370/  891 batches | accuracy    0.882 | loss    0.347 | zero gradients percentage    0.992


387it [00:12, 31.78it/s]

| epoch   2 |   380/  891 batches | accuracy    0.852 | loss    0.423 | zero gradients percentage    0.992


395it [00:12, 31.61it/s]

| epoch   2 |   390/  891 batches | accuracy    0.877 | loss    0.366 | zero gradients percentage    0.992


407it [00:12, 31.11it/s]

| epoch   2 |   400/  891 batches | accuracy    0.864 | loss    0.392 | zero gradients percentage    0.992


415it [00:13, 30.88it/s]

| epoch   2 |   410/  891 batches | accuracy    0.873 | loss    0.389 | zero gradients percentage    0.992


427it [00:13, 30.92it/s]

| epoch   2 |   420/  891 batches | accuracy    0.872 | loss    0.381 | zero gradients percentage    0.992


435it [00:13, 31.07it/s]

| epoch   2 |   430/  891 batches | accuracy    0.865 | loss    0.404 | zero gradients percentage    0.992


447it [00:14, 30.22it/s]

| epoch   2 |   440/  891 batches | accuracy    0.866 | loss    0.391 | zero gradients percentage    0.992


455it [00:14, 30.37it/s]

| epoch   2 |   450/  891 batches | accuracy    0.856 | loss    0.417 | zero gradients percentage    0.992


467it [00:14, 30.92it/s]

| epoch   2 |   460/  891 batches | accuracy    0.872 | loss    0.398 | zero gradients percentage    0.992


475it [00:15, 30.95it/s]

| epoch   2 |   470/  891 batches | accuracy    0.861 | loss    0.415 | zero gradients percentage    0.992


487it [00:15, 31.16it/s]

| epoch   2 |   480/  891 batches | accuracy    0.859 | loss    0.409 | zero gradients percentage    0.992


495it [00:15, 31.40it/s]

| epoch   2 |   490/  891 batches | accuracy    0.869 | loss    0.412 | zero gradients percentage    0.992


507it [00:16, 31.33it/s]

| epoch   2 |   500/  891 batches | accuracy    0.880 | loss    0.366 | zero gradients percentage    0.992


515it [00:16, 31.23it/s]

| epoch   2 |   510/  891 batches | accuracy    0.849 | loss    0.419 | zero gradients percentage    0.992


527it [00:16, 31.39it/s]

| epoch   2 |   520/  891 batches | accuracy    0.858 | loss    0.427 | zero gradients percentage    0.992


535it [00:16, 31.59it/s]

| epoch   2 |   530/  891 batches | accuracy    0.868 | loss    0.396 | zero gradients percentage    0.992


547it [00:17, 31.58it/s]

| epoch   2 |   540/  891 batches | accuracy    0.873 | loss    0.384 | zero gradients percentage    0.992


555it [00:17, 31.81it/s]

| epoch   2 |   550/  891 batches | accuracy    0.859 | loss    0.408 | zero gradients percentage    0.992


567it [00:17, 31.59it/s]

| epoch   2 |   560/  891 batches | accuracy    0.870 | loss    0.392 | zero gradients percentage    0.992


575it [00:18, 31.83it/s]

| epoch   2 |   570/  891 batches | accuracy    0.851 | loss    0.389 | zero gradients percentage    0.992


587it [00:18, 31.84it/s]

| epoch   2 |   580/  891 batches | accuracy    0.870 | loss    0.378 | zero gradients percentage    0.992


595it [00:18, 32.00it/s]

| epoch   2 |   590/  891 batches | accuracy    0.862 | loss    0.388 | zero gradients percentage    0.992


607it [00:19, 31.50it/s]

| epoch   2 |   600/  891 batches | accuracy    0.855 | loss    0.390 | zero gradients percentage    0.992


615it [00:19, 31.43it/s]

| epoch   2 |   610/  891 batches | accuracy    0.866 | loss    0.377 | zero gradients percentage    0.992


627it [00:19, 31.35it/s]

| epoch   2 |   620/  891 batches | accuracy    0.860 | loss    0.399 | zero gradients percentage    0.992


635it [00:20, 31.52it/s]

| epoch   2 |   630/  891 batches | accuracy    0.851 | loss    0.425 | zero gradients percentage    0.992


647it [00:20, 30.67it/s]

| epoch   2 |   640/  891 batches | accuracy    0.862 | loss    0.414 | zero gradients percentage    0.992


655it [00:20, 30.27it/s]

| epoch   2 |   650/  891 batches | accuracy    0.868 | loss    0.374 | zero gradients percentage    0.992


667it [00:21, 31.00it/s]

| epoch   2 |   660/  891 batches | accuracy    0.878 | loss    0.400 | zero gradients percentage    0.992


675it [00:21, 30.74it/s]

| epoch   2 |   670/  891 batches | accuracy    0.879 | loss    0.378 | zero gradients percentage    0.992


687it [00:21, 30.86it/s]

| epoch   2 |   680/  891 batches | accuracy    0.871 | loss    0.375 | zero gradients percentage    0.992


695it [00:22, 30.60it/s]

| epoch   2 |   690/  891 batches | accuracy    0.877 | loss    0.375 | zero gradients percentage    0.992


707it [00:22, 31.14it/s]

| epoch   2 |   700/  891 batches | accuracy    0.857 | loss    0.424 | zero gradients percentage    0.992


715it [00:22, 31.02it/s]

| epoch   2 |   710/  891 batches | accuracy    0.873 | loss    0.392 | zero gradients percentage    0.992


727it [00:23, 31.20it/s]

| epoch   2 |   720/  891 batches | accuracy    0.853 | loss    0.430 | zero gradients percentage    0.992


735it [00:23, 31.16it/s]

| epoch   2 |   730/  891 batches | accuracy    0.868 | loss    0.392 | zero gradients percentage    0.992


747it [00:23, 31.14it/s]

| epoch   2 |   740/  891 batches | accuracy    0.872 | loss    0.378 | zero gradients percentage    0.992


755it [00:24, 31.15it/s]

| epoch   2 |   750/  891 batches | accuracy    0.867 | loss    0.387 | zero gradients percentage    0.992


767it [00:24, 30.50it/s]

| epoch   2 |   760/  891 batches | accuracy    0.863 | loss    0.415 | zero gradients percentage    0.992


775it [00:24, 29.80it/s]

| epoch   2 |   770/  891 batches | accuracy    0.846 | loss    0.439 | zero gradients percentage    0.992


787it [00:25, 30.69it/s]

| epoch   2 |   780/  891 batches | accuracy    0.858 | loss    0.388 | zero gradients percentage    0.992


795it [00:25, 30.19it/s]

| epoch   2 |   790/  891 batches | accuracy    0.870 | loss    0.406 | zero gradients percentage    0.992


807it [00:25, 30.52it/s]

| epoch   2 |   800/  891 batches | accuracy    0.871 | loss    0.378 | zero gradients percentage    0.992


815it [00:25, 31.01it/s]

| epoch   2 |   810/  891 batches | accuracy    0.854 | loss    0.411 | zero gradients percentage    0.992


827it [00:26, 31.36it/s]

| epoch   2 |   820/  891 batches | accuracy    0.866 | loss    0.377 | zero gradients percentage    0.992


835it [00:26, 31.37it/s]

| epoch   2 |   830/  891 batches | accuracy    0.880 | loss    0.368 | zero gradients percentage    0.992


847it [00:27, 30.99it/s]

| epoch   2 |   840/  891 batches | accuracy    0.863 | loss    0.384 | zero gradients percentage    0.992


855it [00:27, 31.19it/s]

| epoch   2 |   850/  891 batches | accuracy    0.866 | loss    0.374 | zero gradients percentage    0.992


867it [00:27, 30.91it/s]

| epoch   2 |   860/  891 batches | accuracy    0.874 | loss    0.378 | zero gradients percentage    0.992


875it [00:27, 31.31it/s]

| epoch   2 |   870/  891 batches | accuracy    0.863 | loss    0.379 | zero gradients percentage    0.992


887it [00:28, 31.72it/s]

| epoch   2 |   880/  891 batches | accuracy    0.877 | loss    0.366 | zero gradients percentage    0.992


891it [00:28, 31.33it/s]


| epoch   2 |   890/  891 batches | accuracy    0.856 | loss    0.410 | zero gradients percentage    0.992
-----------------------------------------------------------
| end of epoch   2 | time: 28.79s | valid accuracy    0.865 
-----------------------------------------------------------


16it [00:00, 31.86it/s]

| epoch   3 |    10/  891 batches | accuracy    0.871 | loss    0.372 | zero gradients percentage    0.992


24it [00:00, 31.47it/s]

| epoch   3 |    20/  891 batches | accuracy    0.867 | loss    0.378 | zero gradients percentage    0.992


36it [00:01, 32.07it/s]

| epoch   3 |    30/  891 batches | accuracy    0.864 | loss    0.390 | zero gradients percentage    0.992


44it [00:01, 31.94it/s]

| epoch   3 |    40/  891 batches | accuracy    0.872 | loss    0.359 | zero gradients percentage    0.992


56it [00:01, 32.05it/s]

| epoch   3 |    50/  891 batches | accuracy    0.864 | loss    0.412 | zero gradients percentage    0.992


64it [00:02, 32.11it/s]

| epoch   3 |    60/  891 batches | accuracy    0.880 | loss    0.361 | zero gradients percentage    0.992


76it [00:02, 32.46it/s]

| epoch   3 |    70/  891 batches | accuracy    0.877 | loss    0.393 | zero gradients percentage    0.992


84it [00:02, 32.26it/s]

| epoch   3 |    80/  891 batches | accuracy    0.868 | loss    0.393 | zero gradients percentage    0.992


96it [00:03, 31.62it/s]

| epoch   3 |    90/  891 batches | accuracy    0.861 | loss    0.382 | zero gradients percentage    0.992


104it [00:03, 31.78it/s]

| epoch   3 |   100/  891 batches | accuracy    0.882 | loss    0.349 | zero gradients percentage    0.992


116it [00:03, 31.84it/s]

| epoch   3 |   110/  891 batches | accuracy    0.887 | loss    0.338 | zero gradients percentage    0.992


124it [00:03, 30.85it/s]

| epoch   3 |   120/  891 batches | accuracy    0.868 | loss    0.408 | zero gradients percentage    0.992


136it [00:04, 30.62it/s]

| epoch   3 |   130/  891 batches | accuracy    0.854 | loss    0.401 | zero gradients percentage    0.992


144it [00:04, 31.28it/s]

| epoch   3 |   140/  891 batches | accuracy    0.875 | loss    0.376 | zero gradients percentage    0.992


156it [00:04, 31.98it/s]

| epoch   3 |   150/  891 batches | accuracy    0.861 | loss    0.389 | zero gradients percentage    0.992


164it [00:05, 31.94it/s]

| epoch   3 |   160/  891 batches | accuracy    0.877 | loss    0.370 | zero gradients percentage    0.992


176it [00:05, 32.11it/s]

| epoch   3 |   170/  891 batches | accuracy    0.869 | loss    0.396 | zero gradients percentage    0.992


184it [00:05, 32.28it/s]

| epoch   3 |   180/  891 batches | accuracy    0.869 | loss    0.395 | zero gradients percentage    0.992


196it [00:06, 32.18it/s]

| epoch   3 |   190/  891 batches | accuracy    0.878 | loss    0.390 | zero gradients percentage    0.992


204it [00:06, 32.05it/s]

| epoch   3 |   200/  891 batches | accuracy    0.856 | loss    0.398 | zero gradients percentage    0.992


216it [00:06, 31.91it/s]

| epoch   3 |   210/  891 batches | accuracy    0.866 | loss    0.415 | zero gradients percentage    0.992


224it [00:07, 31.77it/s]

| epoch   3 |   220/  891 batches | accuracy    0.866 | loss    0.387 | zero gradients percentage    0.992


236it [00:07, 32.48it/s]

| epoch   3 |   230/  891 batches | accuracy    0.863 | loss    0.381 | zero gradients percentage    0.992


244it [00:07, 31.61it/s]

| epoch   3 |   240/  891 batches | accuracy    0.874 | loss    0.391 | zero gradients percentage    0.992


256it [00:08, 30.87it/s]

| epoch   3 |   250/  891 batches | accuracy    0.854 | loss    0.415 | zero gradients percentage    0.992


264it [00:08, 31.20it/s]

| epoch   3 |   260/  891 batches | accuracy    0.857 | loss    0.420 | zero gradients percentage    0.992


276it [00:08, 31.00it/s]

| epoch   3 |   270/  891 batches | accuracy    0.875 | loss    0.379 | zero gradients percentage    0.992


284it [00:08, 31.83it/s]

| epoch   3 |   280/  891 batches | accuracy    0.853 | loss    0.410 | zero gradients percentage    0.992


296it [00:09, 31.50it/s]

| epoch   3 |   290/  891 batches | accuracy    0.853 | loss    0.431 | zero gradients percentage    0.992


304it [00:09, 31.78it/s]

| epoch   3 |   300/  891 batches | accuracy    0.883 | loss    0.357 | zero gradients percentage    0.992


316it [00:09, 32.16it/s]

| epoch   3 |   310/  891 batches | accuracy    0.874 | loss    0.362 | zero gradients percentage    0.992


324it [00:10, 32.28it/s]

| epoch   3 |   320/  891 batches | accuracy    0.855 | loss    0.407 | zero gradients percentage    0.992


336it [00:10, 32.40it/s]

| epoch   3 |   330/  891 batches | accuracy    0.893 | loss    0.335 | zero gradients percentage    0.992


344it [00:10, 31.97it/s]

| epoch   3 |   340/  891 batches | accuracy    0.870 | loss    0.378 | zero gradients percentage    0.992


356it [00:11, 31.80it/s]

| epoch   3 |   350/  891 batches | accuracy    0.863 | loss    0.424 | zero gradients percentage    0.992


364it [00:11, 32.12it/s]

| epoch   3 |   360/  891 batches | accuracy    0.859 | loss    0.424 | zero gradients percentage    0.992


376it [00:11, 32.26it/s]

| epoch   3 |   370/  891 batches | accuracy    0.864 | loss    0.385 | zero gradients percentage    0.992


384it [00:12, 32.22it/s]

| epoch   3 |   380/  891 batches | accuracy    0.856 | loss    0.409 | zero gradients percentage    0.992


396it [00:12, 32.34it/s]

| epoch   3 |   390/  891 batches | accuracy    0.882 | loss    0.363 | zero gradients percentage    0.992


404it [00:12, 32.18it/s]

| epoch   3 |   400/  891 batches | accuracy    0.875 | loss    0.377 | zero gradients percentage    0.992


416it [00:13, 32.24it/s]

| epoch   3 |   410/  891 batches | accuracy    0.889 | loss    0.330 | zero gradients percentage    0.992


424it [00:13, 32.17it/s]

| epoch   3 |   420/  891 batches | accuracy    0.863 | loss    0.387 | zero gradients percentage    0.992


436it [00:13, 31.87it/s]

| epoch   3 |   430/  891 batches | accuracy    0.879 | loss    0.394 | zero gradients percentage    0.992


444it [00:13, 32.04it/s]

| epoch   3 |   440/  891 batches | accuracy    0.865 | loss    0.392 | zero gradients percentage    0.992


456it [00:14, 31.67it/s]

| epoch   3 |   450/  891 batches | accuracy    0.874 | loss    0.369 | zero gradients percentage    0.992


464it [00:14, 31.70it/s]

| epoch   3 |   460/  891 batches | accuracy    0.869 | loss    0.399 | zero gradients percentage    0.992


476it [00:14, 31.97it/s]

| epoch   3 |   470/  891 batches | accuracy    0.848 | loss    0.428 | zero gradients percentage    0.992


484it [00:15, 31.86it/s]

| epoch   3 |   480/  891 batches | accuracy    0.877 | loss    0.369 | zero gradients percentage    0.992


496it [00:15, 31.86it/s]

| epoch   3 |   490/  891 batches | accuracy    0.863 | loss    0.385 | zero gradients percentage    0.992


504it [00:15, 32.09it/s]

| epoch   3 |   500/  891 batches | accuracy    0.873 | loss    0.384 | zero gradients percentage    0.992


516it [00:16, 31.87it/s]

| epoch   3 |   510/  891 batches | accuracy    0.870 | loss    0.390 | zero gradients percentage    0.992


524it [00:16, 31.85it/s]

| epoch   3 |   520/  891 batches | accuracy    0.863 | loss    0.373 | zero gradients percentage    0.992


536it [00:16, 31.59it/s]

| epoch   3 |   530/  891 batches | accuracy    0.871 | loss    0.380 | zero gradients percentage    0.992


544it [00:17, 31.54it/s]

| epoch   3 |   540/  891 batches | accuracy    0.849 | loss    0.414 | zero gradients percentage    0.992


556it [00:17, 31.34it/s]

| epoch   3 |   550/  891 batches | accuracy    0.854 | loss    0.415 | zero gradients percentage    0.992


564it [00:17, 31.04it/s]

| epoch   3 |   560/  891 batches | accuracy    0.870 | loss    0.396 | zero gradients percentage    0.992


576it [00:18, 31.78it/s]

| epoch   3 |   570/  891 batches | accuracy    0.875 | loss    0.355 | zero gradients percentage    0.992


584it [00:18, 32.08it/s]

| epoch   3 |   580/  891 batches | accuracy    0.859 | loss    0.394 | zero gradients percentage    0.992


596it [00:18, 31.56it/s]

| epoch   3 |   590/  891 batches | accuracy    0.876 | loss    0.389 | zero gradients percentage    0.992


604it [00:18, 31.19it/s]

| epoch   3 |   600/  891 batches | accuracy    0.873 | loss    0.371 | zero gradients percentage    0.992


616it [00:19, 31.55it/s]

| epoch   3 |   610/  891 batches | accuracy    0.874 | loss    0.374 | zero gradients percentage    0.992


624it [00:19, 30.72it/s]

| epoch   3 |   620/  891 batches | accuracy    0.866 | loss    0.385 | zero gradients percentage    0.992


636it [00:20, 31.27it/s]

| epoch   3 |   630/  891 batches | accuracy    0.855 | loss    0.418 | zero gradients percentage    0.992


644it [00:20, 31.23it/s]

| epoch   3 |   640/  891 batches | accuracy    0.866 | loss    0.398 | zero gradients percentage    0.992


656it [00:20, 31.96it/s]

| epoch   3 |   650/  891 batches | accuracy    0.865 | loss    0.395 | zero gradients percentage    0.992


664it [00:20, 31.89it/s]

| epoch   3 |   660/  891 batches | accuracy    0.865 | loss    0.397 | zero gradients percentage    0.992


676it [00:21, 31.81it/s]

| epoch   3 |   670/  891 batches | accuracy    0.870 | loss    0.393 | zero gradients percentage    0.992


684it [00:21, 32.23it/s]

| epoch   3 |   680/  891 batches | accuracy    0.880 | loss    0.361 | zero gradients percentage    0.992


696it [00:21, 31.97it/s]

| epoch   3 |   690/  891 batches | accuracy    0.865 | loss    0.404 | zero gradients percentage    0.992


704it [00:22, 31.73it/s]

| epoch   3 |   700/  891 batches | accuracy    0.875 | loss    0.363 | zero gradients percentage    0.992


716it [00:22, 31.60it/s]

| epoch   3 |   710/  891 batches | accuracy    0.855 | loss    0.423 | zero gradients percentage    0.992


724it [00:22, 31.87it/s]

| epoch   3 |   720/  891 batches | accuracy    0.880 | loss    0.369 | zero gradients percentage    0.992


736it [00:23, 32.03it/s]

| epoch   3 |   730/  891 batches | accuracy    0.870 | loss    0.402 | zero gradients percentage    0.992


744it [00:23, 32.34it/s]

| epoch   3 |   740/  891 batches | accuracy    0.856 | loss    0.415 | zero gradients percentage    0.992


756it [00:23, 32.53it/s]

| epoch   3 |   750/  891 batches | accuracy    0.863 | loss    0.403 | zero gradients percentage    0.992


764it [00:24, 32.36it/s]

| epoch   3 |   760/  891 batches | accuracy    0.862 | loss    0.376 | zero gradients percentage    0.992


776it [00:24, 32.51it/s]

| epoch   3 |   770/  891 batches | accuracy    0.873 | loss    0.396 | zero gradients percentage    0.992


784it [00:24, 32.48it/s]

| epoch   3 |   780/  891 batches | accuracy    0.870 | loss    0.355 | zero gradients percentage    0.992


796it [00:24, 32.69it/s]

| epoch   3 |   790/  891 batches | accuracy    0.874 | loss    0.360 | zero gradients percentage    0.992


804it [00:25, 32.33it/s]

| epoch   3 |   800/  891 batches | accuracy    0.868 | loss    0.400 | zero gradients percentage    0.992


816it [00:25, 32.05it/s]

| epoch   3 |   810/  891 batches | accuracy    0.866 | loss    0.383 | zero gradients percentage    0.992


824it [00:25, 31.48it/s]

| epoch   3 |   820/  891 batches | accuracy    0.873 | loss    0.392 | zero gradients percentage    0.992


836it [00:26, 31.73it/s]

| epoch   3 |   830/  891 batches | accuracy    0.870 | loss    0.382 | zero gradients percentage    0.992


844it [00:26, 31.72it/s]

| epoch   3 |   840/  891 batches | accuracy    0.870 | loss    0.380 | zero gradients percentage    0.992


856it [00:26, 31.80it/s]

| epoch   3 |   850/  891 batches | accuracy    0.860 | loss    0.393 | zero gradients percentage    0.992


864it [00:27, 31.66it/s]

| epoch   3 |   860/  891 batches | accuracy    0.869 | loss    0.372 | zero gradients percentage    0.992


876it [00:27, 31.91it/s]

| epoch   3 |   870/  891 batches | accuracy    0.875 | loss    0.395 | zero gradients percentage    0.992


884it [00:27, 31.62it/s]

| epoch   3 |   880/  891 batches | accuracy    0.868 | loss    0.390 | zero gradients percentage    0.992


891it [00:28, 31.82it/s]


| epoch   3 |   890/  891 batches | accuracy    0.864 | loss    0.381 | zero gradients percentage    0.992
-----------------------------------------------------------
| end of epoch   3 | time: 28.36s | valid accuracy    0.865 
-----------------------------------------------------------


15it [00:00, 31.44it/s]

| epoch   4 |    10/  891 batches | accuracy    0.873 | loss    0.384 | zero gradients percentage    0.992


27it [00:00, 32.35it/s]

| epoch   4 |    20/  891 batches | accuracy    0.852 | loss    0.425 | zero gradients percentage    0.992


35it [00:01, 32.05it/s]

| epoch   4 |    30/  891 batches | accuracy    0.854 | loss    0.431 | zero gradients percentage    0.992


47it [00:01, 31.82it/s]

| epoch   4 |    40/  891 batches | accuracy    0.866 | loss    0.403 | zero gradients percentage    0.992


55it [00:01, 31.79it/s]

| epoch   4 |    50/  891 batches | accuracy    0.862 | loss    0.416 | zero gradients percentage    0.992


67it [00:02, 31.24it/s]

| epoch   4 |    60/  891 batches | accuracy    0.861 | loss    0.390 | zero gradients percentage    0.992


75it [00:02, 31.39it/s]

| epoch   4 |    70/  891 batches | accuracy    0.852 | loss    0.425 | zero gradients percentage    0.992


87it [00:02, 30.96it/s]

| epoch   4 |    80/  891 batches | accuracy    0.862 | loss    0.400 | zero gradients percentage    0.992


95it [00:03, 31.35it/s]

| epoch   4 |    90/  891 batches | accuracy    0.887 | loss    0.345 | zero gradients percentage    0.992


107it [00:03, 30.67it/s]

| epoch   4 |   100/  891 batches | accuracy    0.860 | loss    0.400 | zero gradients percentage    0.992


115it [00:03, 31.50it/s]

| epoch   4 |   110/  891 batches | accuracy    0.869 | loss    0.385 | zero gradients percentage    0.992


127it [00:04, 32.28it/s]

| epoch   4 |   120/  891 batches | accuracy    0.876 | loss    0.378 | zero gradients percentage    0.992


135it [00:04, 32.77it/s]

| epoch   4 |   130/  891 batches | accuracy    0.873 | loss    0.333 | zero gradients percentage    0.992


147it [00:04, 32.68it/s]

| epoch   4 |   140/  891 batches | accuracy    0.886 | loss    0.348 | zero gradients percentage    0.992


155it [00:04, 32.09it/s]

| epoch   4 |   150/  891 batches | accuracy    0.877 | loss    0.399 | zero gradients percentage    0.992


167it [00:05, 32.06it/s]

| epoch   4 |   160/  891 batches | accuracy    0.880 | loss    0.354 | zero gradients percentage    0.992


175it [00:05, 32.27it/s]

| epoch   4 |   170/  891 batches | accuracy    0.877 | loss    0.340 | zero gradients percentage    0.992


187it [00:05, 32.34it/s]

| epoch   4 |   180/  891 batches | accuracy    0.861 | loss    0.419 | zero gradients percentage    0.992


195it [00:06, 31.82it/s]

| epoch   4 |   190/  891 batches | accuracy    0.875 | loss    0.387 | zero gradients percentage    0.992


207it [00:06, 31.91it/s]

| epoch   4 |   200/  891 batches | accuracy    0.877 | loss    0.376 | zero gradients percentage    0.992


215it [00:06, 32.03it/s]

| epoch   4 |   210/  891 batches | accuracy    0.860 | loss    0.409 | zero gradients percentage    0.992


227it [00:07, 32.08it/s]

| epoch   4 |   220/  891 batches | accuracy    0.884 | loss    0.359 | zero gradients percentage    0.992


235it [00:07, 31.97it/s]

| epoch   4 |   230/  891 batches | accuracy    0.856 | loss    0.401 | zero gradients percentage    0.992


247it [00:07, 31.75it/s]

| epoch   4 |   240/  891 batches | accuracy    0.873 | loss    0.383 | zero gradients percentage    0.992


255it [00:08, 31.87it/s]

| epoch   4 |   250/  891 batches | accuracy    0.880 | loss    0.381 | zero gradients percentage    0.992


267it [00:08, 31.75it/s]

| epoch   4 |   260/  891 batches | accuracy    0.895 | loss    0.330 | zero gradients percentage    0.992


275it [00:08, 31.66it/s]

| epoch   4 |   270/  891 batches | accuracy    0.861 | loss    0.421 | zero gradients percentage    0.992


287it [00:09, 32.01it/s]

| epoch   4 |   280/  891 batches | accuracy    0.859 | loss    0.393 | zero gradients percentage    0.992


295it [00:09, 32.14it/s]

| epoch   4 |   290/  891 batches | accuracy    0.859 | loss    0.392 | zero gradients percentage    0.992


307it [00:09, 31.96it/s]

| epoch   4 |   300/  891 batches | accuracy    0.870 | loss    0.371 | zero gradients percentage    0.992


315it [00:09, 31.82it/s]

| epoch   4 |   310/  891 batches | accuracy    0.881 | loss    0.370 | zero gradients percentage    0.992


327it [00:10, 31.95it/s]

| epoch   4 |   320/  891 batches | accuracy    0.877 | loss    0.341 | zero gradients percentage    0.992


335it [00:10, 31.36it/s]

| epoch   4 |   330/  891 batches | accuracy    0.865 | loss    0.384 | zero gradients percentage    0.992


347it [00:10, 31.69it/s]

| epoch   4 |   340/  891 batches | accuracy    0.859 | loss    0.390 | zero gradients percentage    0.992


355it [00:11, 31.89it/s]

| epoch   4 |   350/  891 batches | accuracy    0.875 | loss    0.384 | zero gradients percentage    0.992


367it [00:11, 31.94it/s]

| epoch   4 |   360/  891 batches | accuracy    0.879 | loss    0.366 | zero gradients percentage    0.992


375it [00:11, 32.16it/s]

| epoch   4 |   370/  891 batches | accuracy    0.881 | loss    0.367 | zero gradients percentage    0.992


387it [00:12, 31.55it/s]

| epoch   4 |   380/  891 batches | accuracy    0.870 | loss    0.382 | zero gradients percentage    0.992


395it [00:12, 30.80it/s]

| epoch   4 |   390/  891 batches | accuracy    0.872 | loss    0.391 | zero gradients percentage    0.992


407it [00:12, 31.14it/s]

| epoch   4 |   400/  891 batches | accuracy    0.862 | loss    0.391 | zero gradients percentage    0.992


415it [00:13, 31.69it/s]

| epoch   4 |   410/  891 batches | accuracy    0.854 | loss    0.411 | zero gradients percentage    0.992


427it [00:13, 30.98it/s]

| epoch   4 |   420/  891 batches | accuracy    0.873 | loss    0.379 | zero gradients percentage    0.992


435it [00:13, 31.63it/s]

| epoch   4 |   430/  891 batches | accuracy    0.887 | loss    0.371 | zero gradients percentage    0.992


447it [00:14, 31.90it/s]

| epoch   4 |   440/  891 batches | accuracy    0.859 | loss    0.392 | zero gradients percentage    0.992


455it [00:14, 31.82it/s]

| epoch   4 |   450/  891 batches | accuracy    0.857 | loss    0.425 | zero gradients percentage    0.992


467it [00:14, 32.29it/s]

| epoch   4 |   460/  891 batches | accuracy    0.877 | loss    0.397 | zero gradients percentage    0.992


475it [00:14, 32.19it/s]

| epoch   4 |   470/  891 batches | accuracy    0.856 | loss    0.413 | zero gradients percentage    0.992


487it [00:15, 32.04it/s]

| epoch   4 |   480/  891 batches | accuracy    0.855 | loss    0.385 | zero gradients percentage    0.992


495it [00:15, 31.89it/s]

| epoch   4 |   490/  891 batches | accuracy    0.866 | loss    0.398 | zero gradients percentage    0.992


507it [00:15, 32.19it/s]

| epoch   4 |   500/  891 batches | accuracy    0.851 | loss    0.445 | zero gradients percentage    0.992


515it [00:16, 32.52it/s]

| epoch   4 |   510/  891 batches | accuracy    0.873 | loss    0.351 | zero gradients percentage    0.992


527it [00:16, 32.26it/s]

| epoch   4 |   520/  891 batches | accuracy    0.855 | loss    0.394 | zero gradients percentage    0.992


535it [00:16, 32.33it/s]

| epoch   4 |   530/  891 batches | accuracy    0.884 | loss    0.364 | zero gradients percentage    0.992


547it [00:17, 32.09it/s]

| epoch   4 |   540/  891 batches | accuracy    0.854 | loss    0.405 | zero gradients percentage    0.992


555it [00:17, 31.82it/s]

| epoch   4 |   550/  891 batches | accuracy    0.871 | loss    0.380 | zero gradients percentage    0.992


567it [00:17, 31.57it/s]

| epoch   4 |   560/  891 batches | accuracy    0.865 | loss    0.390 | zero gradients percentage    0.992


575it [00:18, 31.50it/s]

| epoch   4 |   570/  891 batches | accuracy    0.880 | loss    0.373 | zero gradients percentage    0.992


587it [00:18, 31.12it/s]

| epoch   4 |   580/  891 batches | accuracy    0.872 | loss    0.386 | zero gradients percentage    0.992


595it [00:18, 31.37it/s]

| epoch   4 |   590/  891 batches | accuracy    0.861 | loss    0.397 | zero gradients percentage    0.992


607it [00:19, 31.99it/s]

| epoch   4 |   600/  891 batches | accuracy    0.877 | loss    0.360 | zero gradients percentage    0.992


615it [00:19, 31.95it/s]

| epoch   4 |   610/  891 batches | accuracy    0.866 | loss    0.402 | zero gradients percentage    0.992


627it [00:19, 31.59it/s]

| epoch   4 |   620/  891 batches | accuracy    0.879 | loss    0.370 | zero gradients percentage    0.992


635it [00:19, 31.44it/s]

| epoch   4 |   630/  891 batches | accuracy    0.874 | loss    0.386 | zero gradients percentage    0.992


647it [00:20, 31.52it/s]

| epoch   4 |   640/  891 batches | accuracy    0.868 | loss    0.373 | zero gradients percentage    0.992


655it [00:20, 31.46it/s]

| epoch   4 |   650/  891 batches | accuracy    0.893 | loss    0.340 | zero gradients percentage    0.992


667it [00:21, 31.14it/s]

| epoch   4 |   660/  891 batches | accuracy    0.875 | loss    0.362 | zero gradients percentage    0.992


675it [00:21, 31.51it/s]

| epoch   4 |   670/  891 batches | accuracy    0.859 | loss    0.404 | zero gradients percentage    0.992


687it [00:21, 31.33it/s]

| epoch   4 |   680/  891 batches | accuracy    0.866 | loss    0.400 | zero gradients percentage    0.992


695it [00:21, 30.74it/s]

| epoch   4 |   690/  891 batches | accuracy    0.848 | loss    0.399 | zero gradients percentage    0.992


707it [00:22, 30.45it/s]

| epoch   4 |   700/  891 batches | accuracy    0.871 | loss    0.391 | zero gradients percentage    0.992


715it [00:22, 31.04it/s]

| epoch   4 |   710/  891 batches | accuracy    0.870 | loss    0.378 | zero gradients percentage    0.992


727it [00:22, 30.83it/s]

| epoch   4 |   720/  891 batches | accuracy    0.865 | loss    0.384 | zero gradients percentage    0.992


735it [00:23, 31.60it/s]

| epoch   4 |   730/  891 batches | accuracy    0.870 | loss    0.375 | zero gradients percentage    0.992


747it [00:23, 31.02it/s]

| epoch   4 |   740/  891 batches | accuracy    0.884 | loss    0.365 | zero gradients percentage    0.992


755it [00:23, 31.12it/s]

| epoch   4 |   750/  891 batches | accuracy    0.860 | loss    0.406 | zero gradients percentage    0.992


767it [00:24, 31.42it/s]

| epoch   4 |   760/  891 batches | accuracy    0.878 | loss    0.365 | zero gradients percentage    0.992


775it [00:24, 31.00it/s]

| epoch   4 |   770/  891 batches | accuracy    0.848 | loss    0.421 | zero gradients percentage    0.992


786it [00:24, 29.32it/s]

| epoch   4 |   780/  891 batches | accuracy    0.851 | loss    0.419 | zero gradients percentage    0.992


796it [00:25, 29.66it/s]

| epoch   4 |   790/  891 batches | accuracy    0.839 | loss    0.474 | zero gradients percentage    0.992


804it [00:25, 30.81it/s]

| epoch   4 |   800/  891 batches | accuracy    0.874 | loss    0.384 | zero gradients percentage    0.992


816it [00:25, 31.36it/s]

| epoch   4 |   810/  891 batches | accuracy    0.870 | loss    0.388 | zero gradients percentage    0.992


824it [00:26, 31.62it/s]

| epoch   4 |   820/  891 batches | accuracy    0.857 | loss    0.407 | zero gradients percentage    0.992


836it [00:26, 31.53it/s]

| epoch   4 |   830/  891 batches | accuracy    0.869 | loss    0.384 | zero gradients percentage    0.992


844it [00:26, 31.58it/s]

| epoch   4 |   840/  891 batches | accuracy    0.865 | loss    0.391 | zero gradients percentage    0.992


856it [00:27, 31.44it/s]

| epoch   4 |   850/  891 batches | accuracy    0.884 | loss    0.349 | zero gradients percentage    0.992


864it [00:27, 30.66it/s]

| epoch   4 |   860/  891 batches | accuracy    0.873 | loss    0.371 | zero gradients percentage    0.992


876it [00:27, 30.25it/s]

| epoch   4 |   870/  891 batches | accuracy    0.867 | loss    0.382 | zero gradients percentage    0.992


884it [00:28, 30.24it/s]

| epoch   4 |   880/  891 batches | accuracy    0.864 | loss    0.415 | zero gradients percentage    0.992


891it [00:28, 31.51it/s]


| epoch   4 |   890/  891 batches | accuracy    0.873 | loss    0.370 | zero gradients percentage    0.992
-----------------------------------------------------------
| end of epoch   4 | time: 28.65s | valid accuracy    0.865 
-----------------------------------------------------------


15it [00:00, 31.96it/s]

| epoch   5 |    10/  891 batches | accuracy    0.857 | loss    0.406 | zero gradients percentage    0.992


27it [00:00, 31.61it/s]

| epoch   5 |    20/  891 batches | accuracy    0.881 | loss    0.359 | zero gradients percentage    0.992


35it [00:01, 31.95it/s]

| epoch   5 |    30/  891 batches | accuracy    0.874 | loss    0.382 | zero gradients percentage    0.992


47it [00:01, 31.68it/s]

| epoch   5 |    40/  891 batches | accuracy    0.876 | loss    0.385 | zero gradients percentage    0.992


55it [00:01, 31.44it/s]

| epoch   5 |    50/  891 batches | accuracy    0.877 | loss    0.350 | zero gradients percentage    0.992


67it [00:02, 31.35it/s]

| epoch   5 |    60/  891 batches | accuracy    0.866 | loss    0.397 | zero gradients percentage    0.992


75it [00:02, 31.55it/s]

| epoch   5 |    70/  891 batches | accuracy    0.873 | loss    0.371 | zero gradients percentage    0.992


87it [00:02, 30.73it/s]

| epoch   5 |    80/  891 batches | accuracy    0.863 | loss    0.414 | zero gradients percentage    0.992


95it [00:03, 31.24it/s]

| epoch   5 |    90/  891 batches | accuracy    0.861 | loss    0.409 | zero gradients percentage    0.992


107it [00:03, 31.62it/s]

| epoch   5 |   100/  891 batches | accuracy    0.856 | loss    0.408 | zero gradients percentage    0.992


115it [00:03, 31.47it/s]

| epoch   5 |   110/  891 batches | accuracy    0.874 | loss    0.372 | zero gradients percentage    0.992


127it [00:04, 31.77it/s]

| epoch   5 |   120/  891 batches | accuracy    0.873 | loss    0.382 | zero gradients percentage    0.992


135it [00:04, 32.16it/s]

| epoch   5 |   130/  891 batches | accuracy    0.853 | loss    0.422 | zero gradients percentage    0.992


147it [00:04, 32.05it/s]

| epoch   5 |   140/  891 batches | accuracy    0.890 | loss    0.335 | zero gradients percentage    0.992


155it [00:04, 31.81it/s]

| epoch   5 |   150/  891 batches | accuracy    0.859 | loss    0.408 | zero gradients percentage    0.992


167it [00:05, 32.18it/s]

| epoch   5 |   160/  891 batches | accuracy    0.866 | loss    0.369 | zero gradients percentage    0.992


175it [00:05, 32.18it/s]

| epoch   5 |   170/  891 batches | accuracy    0.879 | loss    0.361 | zero gradients percentage    0.992


187it [00:05, 31.87it/s]

| epoch   5 |   180/  891 batches | accuracy    0.873 | loss    0.370 | zero gradients percentage    0.992


195it [00:06, 31.73it/s]

| epoch   5 |   190/  891 batches | accuracy    0.868 | loss    0.378 | zero gradients percentage    0.992


207it [00:06, 31.34it/s]

| epoch   5 |   200/  891 batches | accuracy    0.867 | loss    0.391 | zero gradients percentage    0.992


215it [00:06, 31.53it/s]

| epoch   5 |   210/  891 batches | accuracy    0.853 | loss    0.421 | zero gradients percentage    0.992


227it [00:07, 30.90it/s]

| epoch   5 |   220/  891 batches | accuracy    0.858 | loss    0.428 | zero gradients percentage    0.992


235it [00:07, 30.25it/s]

| epoch   5 |   230/  891 batches | accuracy    0.881 | loss    0.361 | zero gradients percentage    0.992


247it [00:07, 31.35it/s]

| epoch   5 |   240/  891 batches | accuracy    0.878 | loss    0.372 | zero gradients percentage    0.992


255it [00:08, 31.51it/s]

| epoch   5 |   250/  891 batches | accuracy    0.859 | loss    0.389 | zero gradients percentage    0.992


267it [00:08, 31.90it/s]

| epoch   5 |   260/  891 batches | accuracy    0.869 | loss    0.397 | zero gradients percentage    0.992


275it [00:08, 31.74it/s]

| epoch   5 |   270/  891 batches | accuracy    0.870 | loss    0.378 | zero gradients percentage    0.992


287it [00:09, 31.84it/s]

| epoch   5 |   280/  891 batches | accuracy    0.874 | loss    0.395 | zero gradients percentage    0.992


295it [00:09, 32.02it/s]

| epoch   5 |   290/  891 batches | accuracy    0.870 | loss    0.404 | zero gradients percentage    0.992


307it [00:09, 31.61it/s]

| epoch   5 |   300/  891 batches | accuracy    0.882 | loss    0.341 | zero gradients percentage    0.992


315it [00:09, 31.85it/s]

| epoch   5 |   310/  891 batches | accuracy    0.858 | loss    0.388 | zero gradients percentage    0.992


327it [00:10, 32.20it/s]

| epoch   5 |   320/  891 batches | accuracy    0.856 | loss    0.407 | zero gradients percentage    0.992


335it [00:10, 32.15it/s]

| epoch   5 |   330/  891 batches | accuracy    0.866 | loss    0.390 | zero gradients percentage    0.992


347it [00:10, 31.73it/s]

| epoch   5 |   340/  891 batches | accuracy    0.856 | loss    0.413 | zero gradients percentage    0.992


355it [00:11, 31.91it/s]

| epoch   5 |   350/  891 batches | accuracy    0.856 | loss    0.406 | zero gradients percentage    0.992


367it [00:11, 31.66it/s]

| epoch   5 |   360/  891 batches | accuracy    0.866 | loss    0.399 | zero gradients percentage    0.992


375it [00:11, 32.13it/s]

| epoch   5 |   370/  891 batches | accuracy    0.871 | loss    0.377 | zero gradients percentage    0.992


387it [00:12, 31.70it/s]

| epoch   5 |   380/  891 batches | accuracy    0.863 | loss    0.405 | zero gradients percentage    0.992


395it [00:12, 30.74it/s]

| epoch   5 |   390/  891 batches | accuracy    0.866 | loss    0.383 | zero gradients percentage    0.992


407it [00:12, 30.12it/s]

| epoch   5 |   400/  891 batches | accuracy    0.858 | loss    0.399 | zero gradients percentage    0.992


415it [00:13, 30.82it/s]

| epoch   5 |   410/  891 batches | accuracy    0.866 | loss    0.417 | zero gradients percentage    0.992


427it [00:13, 31.78it/s]

| epoch   5 |   420/  891 batches | accuracy    0.863 | loss    0.389 | zero gradients percentage    0.992


435it [00:13, 31.04it/s]

| epoch   5 |   430/  891 batches | accuracy    0.868 | loss    0.394 | zero gradients percentage    0.992


447it [00:14, 31.40it/s]

| epoch   5 |   440/  891 batches | accuracy    0.866 | loss    0.403 | zero gradients percentage    0.992


455it [00:14, 31.29it/s]

| epoch   5 |   450/  891 batches | accuracy    0.859 | loss    0.408 | zero gradients percentage    0.992


467it [00:14, 30.87it/s]

| epoch   5 |   460/  891 batches | accuracy    0.891 | loss    0.352 | zero gradients percentage    0.992


475it [00:15, 30.67it/s]

| epoch   5 |   470/  891 batches | accuracy    0.870 | loss    0.379 | zero gradients percentage    0.992


487it [00:15, 31.52it/s]

| epoch   5 |   480/  891 batches | accuracy    0.868 | loss    0.391 | zero gradients percentage    0.992


495it [00:15, 31.11it/s]

| epoch   5 |   490/  891 batches | accuracy    0.877 | loss    0.365 | zero gradients percentage    0.992


507it [00:16, 30.85it/s]

| epoch   5 |   500/  891 batches | accuracy    0.874 | loss    0.361 | zero gradients percentage    0.992


515it [00:16, 30.99it/s]

| epoch   5 |   510/  891 batches | accuracy    0.881 | loss    0.368 | zero gradients percentage    0.992


527it [00:16, 31.41it/s]

| epoch   5 |   520/  891 batches | accuracy    0.860 | loss    0.388 | zero gradients percentage    0.992


535it [00:16, 31.83it/s]

| epoch   5 |   530/  891 batches | accuracy    0.866 | loss    0.397 | zero gradients percentage    0.992


547it [00:17, 31.79it/s]

| epoch   5 |   540/  891 batches | accuracy    0.876 | loss    0.344 | zero gradients percentage    0.992


555it [00:17, 31.20it/s]

| epoch   5 |   550/  891 batches | accuracy    0.879 | loss    0.359 | zero gradients percentage    0.992


567it [00:18, 30.98it/s]

| epoch   5 |   560/  891 batches | accuracy    0.866 | loss    0.383 | zero gradients percentage    0.992


575it [00:18, 31.04it/s]

| epoch   5 |   570/  891 batches | accuracy    0.863 | loss    0.388 | zero gradients percentage    0.992


587it [00:18, 30.49it/s]

| epoch   5 |   580/  891 batches | accuracy    0.870 | loss    0.400 | zero gradients percentage    0.992


595it [00:18, 31.20it/s]

| epoch   5 |   590/  891 batches | accuracy    0.855 | loss    0.414 | zero gradients percentage    0.992


607it [00:19, 31.66it/s]

| epoch   5 |   600/  891 batches | accuracy    0.859 | loss    0.392 | zero gradients percentage    0.992


615it [00:19, 32.02it/s]

| epoch   5 |   610/  891 batches | accuracy    0.872 | loss    0.374 | zero gradients percentage    0.992


627it [00:19, 31.95it/s]

| epoch   5 |   620/  891 batches | accuracy    0.868 | loss    0.390 | zero gradients percentage    0.992


635it [00:20, 32.13it/s]

| epoch   5 |   630/  891 batches | accuracy    0.864 | loss    0.402 | zero gradients percentage    0.992


647it [00:20, 32.29it/s]

| epoch   5 |   640/  891 batches | accuracy    0.880 | loss    0.380 | zero gradients percentage    0.992


655it [00:20, 31.95it/s]

| epoch   5 |   650/  891 batches | accuracy    0.873 | loss    0.360 | zero gradients percentage    0.992


667it [00:21, 32.18it/s]

| epoch   5 |   660/  891 batches | accuracy    0.853 | loss    0.415 | zero gradients percentage    0.992


675it [00:21, 32.25it/s]

| epoch   5 |   670/  891 batches | accuracy    0.881 | loss    0.361 | zero gradients percentage    0.992


687it [00:21, 31.97it/s]

| epoch   5 |   680/  891 batches | accuracy    0.863 | loss    0.380 | zero gradients percentage    0.992


695it [00:22, 32.08it/s]

| epoch   5 |   690/  891 batches | accuracy    0.885 | loss    0.369 | zero gradients percentage    0.992


707it [00:22, 31.71it/s]

| epoch   5 |   700/  891 batches | accuracy    0.873 | loss    0.357 | zero gradients percentage    0.992


715it [00:22, 31.53it/s]

| epoch   5 |   710/  891 batches | accuracy    0.866 | loss    0.387 | zero gradients percentage    0.992


727it [00:23, 31.51it/s]

| epoch   5 |   720/  891 batches | accuracy    0.848 | loss    0.418 | zero gradients percentage    0.992


735it [00:23, 31.84it/s]

| epoch   5 |   730/  891 batches | accuracy    0.862 | loss    0.393 | zero gradients percentage    0.992


747it [00:23, 31.85it/s]

| epoch   5 |   740/  891 batches | accuracy    0.872 | loss    0.372 | zero gradients percentage    0.992


755it [00:23, 31.79it/s]

| epoch   5 |   750/  891 batches | accuracy    0.866 | loss    0.380 | zero gradients percentage    0.992


767it [00:24, 31.78it/s]

| epoch   5 |   760/  891 batches | accuracy    0.872 | loss    0.381 | zero gradients percentage    0.992


775it [00:24, 31.88it/s]

| epoch   5 |   770/  891 batches | accuracy    0.877 | loss    0.360 | zero gradients percentage    0.992


787it [00:24, 31.62it/s]

| epoch   5 |   780/  891 batches | accuracy    0.853 | loss    0.418 | zero gradients percentage    0.992


795it [00:25, 31.48it/s]

| epoch   5 |   790/  891 batches | accuracy    0.852 | loss    0.444 | zero gradients percentage    0.992


807it [00:25, 31.56it/s]

| epoch   5 |   800/  891 batches | accuracy    0.863 | loss    0.393 | zero gradients percentage    0.992


815it [00:25, 31.23it/s]

| epoch   5 |   810/  891 batches | accuracy    0.851 | loss    0.420 | zero gradients percentage    0.992


827it [00:26, 31.64it/s]

| epoch   5 |   820/  891 batches | accuracy    0.862 | loss    0.414 | zero gradients percentage    0.992


835it [00:26, 31.29it/s]

| epoch   5 |   830/  891 batches | accuracy    0.892 | loss    0.333 | zero gradients percentage    0.992


847it [00:26, 31.67it/s]

| epoch   5 |   840/  891 batches | accuracy    0.873 | loss    0.390 | zero gradients percentage    0.992


855it [00:27, 31.87it/s]

| epoch   5 |   850/  891 batches | accuracy    0.885 | loss    0.354 | zero gradients percentage    0.992


867it [00:27, 32.08it/s]

| epoch   5 |   860/  891 batches | accuracy    0.890 | loss    0.372 | zero gradients percentage    0.992


875it [00:27, 31.94it/s]

| epoch   5 |   870/  891 batches | accuracy    0.867 | loss    0.401 | zero gradients percentage    0.992


887it [00:28, 32.18it/s]

| epoch   5 |   880/  891 batches | accuracy    0.859 | loss    0.410 | zero gradients percentage    0.992


891it [00:28, 31.56it/s]


| epoch   5 |   890/  891 batches | accuracy    0.877 | loss    0.368 | zero gradients percentage    0.992
-----------------------------------------------------------
| end of epoch   5 | time: 28.60s | valid accuracy    0.865 
-----------------------------------------------------------
Checking the results of test dataset.
test accuracy    0.865
test loss    0.003
