In [1]:
# -*- coding: utf-8 -*-
"""
Created on Fri Nov  2 10:16:26 2018

@author: PISME_Public01

"""
"""
Variance scaling initialization? (方差缩放初始化？)
https://zhuanlan.zhihu.com/p/38315135
"""

import numpy as np
import tensorflow as tf
from tqdm import trange, tqdm
from PIL import Image
import matplotlib.pyplot as plt
import os
import shutil

from tensorflow.examples.tutorials.mnist import input_data

class DenseNet(object):
    def __init__(self, datasets, growth_rate, depth, 
                 total_blocks, keep_prob, weight_decay, nesterov_momentum, model_type, dataset_name,
                 should_save_logs, should_save_model, 
                 renew_logs=False,
                 reduction=1.0,
                 bc_mode=False,
                 **kwargs):
        
        self.datasets = datasets
        if dataset_name == 'fashion-mnist':
            self.data_shape = (28, 28, 1)
            self.n_classes = 10
        elif dataset_name == 'mnist':
            self.data_shape = (28, 28, 1)
            self.n_classes = 10
        
        self.depth = depth
        self.growth_rate = growth_rate
        self.first_output_features = growth_rate * 2
        self.total_blocks = total_blocks
        self.layers_per_block = (depth - (total_blocks + 1)) // total_blocks
        self.bc_mode = bc_mode
        self.reduction = reduction
        
        if not bc_mode:
            print(f"Build {model_type} model with {self.total_blocks} blocks, "
                  f"{self.layers_per_block} composite layers each.")
        else:
            self.layers_per_block = self.layers_per_block // 2
            print(f"Build {model_type} model with {self.total_blocks} blocks, "
                  f"{self.layers_per_block} bottleneck layers and {self.layers_per_block} composite layers each.")
        
        print(f"Reduction at transition layers: {self.reduction}")
        
        self.keep_prob = keep_prob
        self.weight_decay = weight_decay
        self.nesterov_momentum = nesterov_momentum
        self.model_type = model_type
        self.dataset_name = dataset_name
        self.should_save_logs = should_save_logs
        self.should_save_model = should_save_model
        self.renew_logs = renew_logs
        self.batches_step = 0
        
        self._define_inputs()
        self._build_graph()
        self._initialize_session()
        self._count_trainable_params()
    
    def _initialize_session(self):
        """Open a session, initialise all variables, create saver and log writer."""
        session_config = tf.ConfigProto()
        # allow_growth: let TensorFlow extend its GPU memory use as needed.
        session_config.gpu_options.allow_growth = True

        self.sess = tf.Session(config=session_config)
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()
        # The attribute name (including its spelling) is read by
        # log_loss_accuracy, so it is kept as is.
        self.summary_weiter = tf.summary.FileWriter(self.logs_path)
        
    def _count_trainable_params(self):
        """Print the total number of trainable parameters, in millions."""
        param_count = 0
        for var in tf.trainable_variables():
            # Number of scalars in this variable = product of its dimensions.
            n_elements = 1
            for axis in var.get_shape():
                n_elements = n_elements * axis.value
            param_count = param_count + n_elements
        print(f"Total training params: {param_count / 1e6}M")
    
    @property
    def save_path(self):
        
        try:
            save_path = self._save_path
        except AttributeError:
            save_path = f"saves/{self.model_identifier}"
            os.makedirs(save_path, exist_ok=True)
            save_path = os.path.join(save_path, "model.ckpt")
            self._save_path = save_path
        return save_path
    
    @property
    def logs_path(self):
        
        try:
            logs_path = self._logs_path
        except AttributeError:
            logs_path = f"logs/{self.model_identifier}"
            if self.renew_logs:
                shutil.rmtree(logs_path, ignore_errors=True)
            os.makedirs(logs_path, exist_ok=True)
            self._logs_path = logs_path
        return logs_path
    
    @property
    def model_identifier(self):
        
        return f"{self.model_type}_growth_rate={self.growth_rate}_depth={self.depth}_dataset_{self.dataset_name}"
    
    def save_model(self, global_step=None):
        """Save the session's variables to ``self.save_path``.

        Args:
            global_step: forwarded unchanged to the saver's ``save`` call.
        """
        saver, session = self.saver, self.sess
        saver.save(session, self.save_path, global_step=global_step)
    
    def load_model(self):
        
        try:
            self.saver.restore(self.sess, self.save_path)
        except Exception as e:
            raise IOError("Failed to load model "
                          f"from save path: {self.save_path}")
        self.saver.restore(self.sess, self.save_path)
        print(f"Successfully load model from save path: {self.save_path}")
    
    def log_loss_accuracy(self, loss, accuracy, epoch, prefix, should_print=True):
        """Record a loss/accuracy pair as scalar summaries.

        Args:
            loss: value written under the tag ``loss_<prefix>``.
            accuracy: value written under the tag ``accuracy_<prefix>``.
            epoch: step recorded with the summary (callers pass an epoch
                number or a running batch counter).
            prefix: suffix that distinguishes the tags, e.g. ``'train'``.
            should_print: also print both values to stdout.
        """
        if should_print:
            print(f"mean cross_entropy: {loss}, mean accuracy: {accuracy}")
        summary = tf.Summary(value=[
                tf.Summary.Value(
                        tag=f'loss_{prefix}', simple_value=float(loss)),
                tf.Summary.Value(
                        tag=f'accuracy_{prefix}', simple_value=float(accuracy))
                ])
        # Pass the step explicitly; without it every point is written
        # without a global step and the curves cannot be plotted over time.
        self.summary_weiter.add_summary(summary, epoch)
        
    def _define_inputs(self):
        """Create the placeholders for images, labels, learning rate and mode."""
        # Leading None leaves the batch dimension unspecified.
        batch_shape = [None, *self.data_shape]

        self.images = tf.placeholder(
            tf.float32, shape=batch_shape, name='input_images')
        self.labels = tf.placeholder(
            tf.float32, shape=[None, self.n_classes], name='labels')
        self.learning_rate = tf.placeholder(
            tf.float32, shape=[], name='learning_rate')
        # Scalar boolean fed as True while training and False in test().
        self.is_training = tf.placeholder(tf.bool, shape=[])
        
    def composite_function(self, _input, out_features, kernel_size=3):
        """Apply batch norm -> ReLU -> convolution -> dropout.

        Args:
            _input: tensor to transform.
            out_features: number of output channels of the convolution.
            kernel_size: side length of the square convolution kernel.

        Returns:
            The transformed tensor.
        """
        with tf.variable_scope("composite_function"):
            activated = tf.nn.relu(self.batch_norm(_input))
            convolved = self.conv2d(
                activated, out_features=out_features, kernel_size=kernel_size)
            return self.dropout(convolved)
            
    def bottleneck(self, _input, out_features):
        """Apply batch norm -> ReLU -> 1x1 convolution -> dropout.

        The 1x1 convolution produces ``4 * out_features`` channels.
        """
        with tf.variable_scope("bottleneck"):
            activated = tf.nn.relu(self.batch_norm(_input))
            widened = self.conv2d(
                activated,
                out_features=out_features * 4,
                kernel_size=1,
                padding='VALID')
            return self.dropout(widened)
    
    def add_internal_layer(self, _input, growth_rate):
        """Compute ``growth_rate`` new feature maps and append them to the input.

        In BC mode the input first passes through the bottleneck; the new
        maps are concatenated with ``_input`` along axis 3.
        """
        if self.bc_mode:
            features = self.bottleneck(_input, out_features=growth_rate)
        else:
            features = _input
        new_maps = self.composite_function(
            features, out_features=growth_rate, kernel_size=3)
        return tf.concat(axis=3, values=(_input, new_maps))
        
    def add_block(self, _input, growth_rate, layers_per_block):
        """Stack ``layers_per_block`` internal layers, each in its own scope."""
        features = _input
        for index in range(layers_per_block):
            scope_name = f"layer_{index}"
            with tf.variable_scope(scope_name):
                features = self.add_internal_layer(features, growth_rate)
        return features
    
    def trainsition_layer(self, _input):
        
        out_features = int(int(_input.get_shape()[-1]) * self.reduction)
        output = self.composite_function(
                _input, out_features, kernel_size=1)
        output = self.avg_pool(_input, 2)
        return output
    
    def transition_layer_to_classes(self, _input):
        """Map the final feature maps to class logits.

        Batch norm and ReLU are followed by an average pool whose window
        equals the size of the second-to-last axis, then by a fully
        connected layer with ``self.n_classes`` outputs.
        """
        activated = tf.nn.relu(self.batch_norm(_input))

        # Window as large as the feature map (assumes square maps - TODO confirm).
        window = int(activated.get_shape()[-2])
        pooled = self.avg_pool(activated, k=window)

        n_features = int(pooled.get_shape()[-1])
        flat = tf.reshape(pooled, [-1, n_features])
        weights = self.weight_variable_xavier(
            [n_features, self.n_classes], name='W')
        offsets = self.bias_variable([self.n_classes])
        return tf.matmul(flat, weights) + offsets
        
    def conv2d(self, _input, out_features, kernel_size,
               strides=[1, 1, 1, 1], padding='SAME'):
        """Convolve ``_input`` with a new square kernel variable.

        Args:
            _input: tensor whose last dimension is the input channel count.
            out_features: number of output channels.
            kernel_size: side length of the kernel.
            strides: passed through to ``tf.nn.conv2d``.
            padding: passed through to ``tf.nn.conv2d``.
        """
        channels_in = int(_input.get_shape()[-1])
        filter_shape = [kernel_size, kernel_size, channels_in, out_features]
        filters = self.weight_variable_msra(filter_shape, name='kernel1')
        return tf.nn.conv2d(_input, filters, strides, padding)
    
    def avg_pool(self, _input, k):
        """Average-pool with a ``k`` x ``k`` window, stride ``k`` and VALID padding."""
        window = [1, k, k, 1]
        step = [1, k, k, 1]
        return tf.nn.avg_pool(_input, window, step, 'VALID')
    
    def batch_norm(self, _input):
        """Batch-normalise ``_input``, switching mode on ``self.is_training``."""
        return tf.contrib.layers.batch_norm(
            _input,
            scale=True,
            is_training=self.is_training,
            # NOTE(review): presumably makes the moving-average updates run
            # in place, since the training step adds no explicit
            # dependency on update ops - confirm against the contrib docs.
            updates_collections=None,
        )
    
    def dropout(self, _input):
        
        if self.keep_prob < 1:
            output = tf.cond(
                    self.is_training,
                    lambda: tf.nn.dropout(_input, self.keep_prob),
                    lambda: _input
            )
        else:
            output = _input
        return output
    
    
    def weight_variable_msra(self, shape, name):
        """Get a variable initialised with the variance-scaling initializer."""
        msra_init = tf.contrib.layers.variance_scaling_initializer()
        return tf.get_variable(name=name, shape=shape, initializer=msra_init)
    
    def weight_variable_xavier(self, shape, name):
        """Get a variable initialised with the Xavier initializer."""
        xavier_init = tf.contrib.layers.xavier_initializer()
        return tf.get_variable(name=name, shape=shape, initializer=xavier_init)
    
    def bias_variable(self, shape, name='bias'):
        """Get a variable of the given shape initialised to zeros."""
        zeros = tf.constant(0.0, shape=shape)
        return tf.get_variable(name, initializer=zeros)
    
    
    def _build_graph(self):
        """Assemble the network, the loss, the training op and the accuracy.

        Sets ``self.cross_entropy``, ``self.train_step`` and
        ``self.accuracy``.
        """
        # Initial convolution in front of the first dense block.
        with tf.variable_scope("Initial_convolution"):
            features = self.conv2d(
                self.images,
                out_features=self.first_output_features,
                kernel_size=3)

        # Dense blocks, with a transition layer after all but the last one.
        last_block = self.total_blocks - 1
        for block_index in range(self.total_blocks):
            with tf.variable_scope(f"Block_{block_index}"):
                features = self.add_block(
                    features, self.growth_rate, self.layers_per_block)
            if block_index == last_block:
                continue
            with tf.variable_scope(f"Transition_after_block_{block_index}"):
                features = self.trainsition_layer(features)

        with tf.variable_scope("Transition_to_classes"):
            logits = self.transition_layer_to_classes(features)
        probabilities = tf.nn.softmax(logits)

        # Loss: mean cross entropy plus an L2 penalty over every trainable
        # variable, scaled by weight_decay.
        per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=self.labels)
        self.cross_entropy = tf.reduce_mean(per_example_loss)
        weight_penalty = tf.add_n(
            [tf.nn.l2_loss(weight) for weight in tf.trainable_variables()])

        # Adam is used here; self.nesterov_momentum is not consulted.
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        objective = self.cross_entropy + weight_penalty * self.weight_decay
        self.train_step = optimizer.minimize(objective)

        # Accuracy: share of examples whose arg-max prediction matches the
        # arg-max of the label row.
        hits = tf.equal(
            tf.argmax(probabilities, 1),
            tf.argmax(self.labels, 1))
        self.accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))
        
    def train_all_epochs(self, train_params):
        """Run the full training schedule.

        Args:
            train_params: dict with the keys ``n_epochs``,
                ``initial_learning_rate``, ``batch_size``,
                ``reduce_lr_epoch_1`` and ``reduce_lr_epoch_2``, plus an
                optional ``validation_set`` flag.

        The learning rate is divided by 10 at each of the two reduction
        epochs. When ``validation_set`` is truthy, validation runs every
        20th epoch. A checkpoint is saved after every epoch if
        ``self.should_save_model`` is set.
        """
        n_epochs = train_params['n_epochs']
        learning_rate = train_params['initial_learning_rate']
        batch_size = train_params['batch_size']
        lr_drop_epochs = (train_params['reduce_lr_epoch_1'],
                          train_params['reduce_lr_epoch_2'])

        for epoch in trange(1, n_epochs + 1):
            if epoch in lr_drop_epochs:
                learning_rate = learning_rate / 10
                print(f"Decrease learning rate, new lr = {learning_rate}")

            print("Training...")
            train_loss, train_acc = self.train_one_epoch(
                self.datasets.train, batch_size, learning_rate)
            if self.should_save_logs:
                self.log_loss_accuracy(
                    train_loss, train_acc, epoch, prefix='train')

            validate_now = (train_params.get('validation_set', False)
                            and epoch % 20 == 0)
            if validate_now:
                print("Validation...")
                valid_loss, valid_acc = self.test(
                    self.datasets.validation, batch_size)
                if self.should_save_logs:
                    self.log_loss_accuracy(
                        valid_loss, valid_acc, epoch, prefix='valid')

            if self.should_save_model:
                self.save_model()
    
    
    def train_one_epoch(self, data, batch_size, learning_rate):
        
        num_examples = data.num_examples
        total_loss = []
        total_accuracy = []
        for i in range(num_examples // batch_size):
            batch = data.next_batch(batch_size)
            images, labels = batch
            # images, labels = data.images[0:100], data.labels[0:100]
            shape = [-1]
            shape.extend(self.data_shape)
            images = images.reshape(shape)
            feed_dict = {
                    self.images: images, 
                    self.labels: labels,
                    self.learning_rate: learning_rate,
                    self.is_training: True,
            }
            fetches = [self.train_step, self.cross_entropy, self.accuracy]
            result = self.sess.run(fetches, feed_dict=feed_dict)
            _, loss, accuracy = result
            total_loss.append(loss)
            total_accuracy.append(accuracy)
            if self.should_save_logs:
                self.batches_step += 1
                self.log_loss_accuracy(
                        loss, accuracy, self.batches_step, prefix='per_batch',
                        should_print=False)
        mean_loss = np.mean(total_loss)
        mean_accuracy = np.mean(total_accuracy)
        return mean_loss, mean_accuracy
            
    def test(self, data, batch_size):
        
        num_examples = data.num_examples
        total_loss = []
        total_accuracy = []
        for i in range(num_examples // batch_size):
            batch = data.next_batch(batch_size)
            images, labels = batch
            # images, labels = data.images[0:100], data.labels[0:100]
            shape = [-1]
            shape.extend(self.data_shape)
            images = images.reshape(shape)
            feed_dict = {
                    self.images: images,
                    self.labels: labels,
                    self.is_training: False
            }
            fetches = [self.cross_entropy, self.accuracy]
            loss, accuracy = self.sess.run(
                    fetches, feed_dict=feed_dict)
            total_loss.append(loss)
            total_accuracy.append(accuracy)
        mean_loss = np.mean(total_loss)
        mean_accuracy = np.mean(total_accuracy)
        return mean_loss, mean_accuracy
            
if __name__ == '__main__':
    # Read the dataset files under data/fashion with one-hot labels,
    # using the Fashion-MNIST bucket as the source URL.
    data = input_data.read_data_sets(
        'data/fashion',
        source_url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/',
        one_hot=True)

    # Required constructor arguments; the keys match DenseNet.__init__'s
    # parameter names, so the dict can be unpacked directly.
    params = dict(
        datasets=data,
        growth_rate=12,
        depth=40,
        total_blocks=3,
        keep_prob=0.8,
        weight_decay=0.001,
        nesterov_momentum=1,
        model_type='DenseNet_BC',
        dataset_name='fashion-mnist',
        should_save_logs=True,
        should_save_model=True,
    )
    model = DenseNet(**params, reduction=0.5, renew_logs=True, bc_mode=True)

    # Learning rate is divided by 10 at epochs 100 and 200.
    train_params = dict(
        n_epochs=300,
        initial_learning_rate=0.001,
        batch_size=8,
        reduce_lr_epoch_1=100,
        reduce_lr_epoch_2=200,
        validation_set=True,
    )

    model.train_all_epochs(train_params)
        

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


Extracting data/fashion/train-images-idx3-ubyte.gz
Extracting data/fashion/train-labels-idx1-ubyte.gz
Extracting data/fashion/t10k-images-idx3-ubyte.gz
Extracting data/fashion/t10k-labels-idx1-ubyte.gz
Build DenseNet_BC model with 3 blocks, 6 bottleneck layers and 6 composite layers each.
Reduction at transition layers: 0.5


  0%|          | 0/300 [00:00<?, ?it/s]

Total training params: 0.230794M
Training...
mean cross_entropy: 0.6310693025588989, mean accuracy: 0.7753818035125732


  0%|          | 1/300 [03:03<15:14:01, 183.42s/it]

Training...
mean cross_entropy: 0.46602770686149597, mean accuracy: 0.8369091153144836


  1%|          | 2/300 [06:05<15:09:29, 183.12s/it]

Training...
mean cross_entropy: 0.4229069650173187, mean accuracy: 0.8508363366127014


  1%|          | 3/300 [08:58<14:50:27, 179.89s/it]

Training...
mean cross_entropy: 0.404252827167511, mean accuracy: 0.8592363595962524


  1%|▏         | 4/300 [11:47<14:31:49, 176.72s/it]

Training...
mean cross_entropy: 0.3930783271789551, mean accuracy: 0.8633090853691101


  2%|▏         | 5/300 [14:37<14:18:19, 174.57s/it]

Training...
mean cross_entropy: 0.38384437561035156, mean accuracy: 0.8667636513710022


  2%|▏         | 6/300 [17:26<14:07:51, 173.03s/it]

Training...
mean cross_entropy: 0.3828975558280945, mean accuracy: 0.8669818043708801


  2%|▏         | 7/300 [20:15<13:59:18, 171.87s/it]

Training...
mean cross_entropy: 0.3777834475040436, mean accuracy: 0.8690181970596313


  3%|▎         | 8/300 [23:04<13:52:02, 170.97s/it]

Training...
mean cross_entropy: 0.3765944540500641, mean accuracy: 0.869381844997406


  3%|▎         | 9/300 [25:53<13:46:59, 170.51s/it]

Training...
mean cross_entropy: 0.3745744228363037, mean accuracy: 0.8697090744972229


  3%|▎         | 10/300 [28:46<13:47:19, 171.17s/it]

Training...
mean cross_entropy: 0.370576411485672, mean accuracy: 0.8711272478103638


  4%|▎         | 11/300 [31:40<13:47:53, 171.88s/it]

Training...
mean cross_entropy: 0.3681941032409668, mean accuracy: 0.8725273013114929


  4%|▍         | 12/300 [34:34<13:48:39, 172.64s/it]

Training...
mean cross_entropy: 0.3715224862098694, mean accuracy: 0.8724363446235657


  4%|▍         | 13/300 [37:28<13:47:03, 172.91s/it]

Training...
mean cross_entropy: 0.36698225140571594, mean accuracy: 0.8717272877693176


  5%|▍         | 14/300 [40:18<13:40:33, 172.15s/it]

Training...
mean cross_entropy: 0.36435312032699585, mean accuracy: 0.8736909031867981


  5%|▌         | 15/300 [43:08<13:34:09, 171.40s/it]

Training...
mean cross_entropy: 0.36363905668258667, mean accuracy: 0.874963641166687


  5%|▌         | 16/300 [45:58<13:30:10, 171.16s/it]

Training...
mean cross_entropy: 0.36328864097595215, mean accuracy: 0.8741818070411682


  6%|▌         | 17/300 [48:54<13:34:11, 172.62s/it]

Training...
mean cross_entropy: 0.3659854531288147, mean accuracy: 0.8736909031867981


  6%|▌         | 18/300 [51:54<13:41:06, 174.70s/it]

Training...
mean cross_entropy: 0.360918253660202, mean accuracy: 0.8763090968132019


  6%|▋         | 19/300 [54:54<13:45:41, 176.30s/it]

Training...
mean cross_entropy: 0.36108916997909546, mean accuracy: 0.8735091090202332
Validation...
mean cross_entropy: 0.34900495409965515, mean accuracy: 0.8791999816894531


  7%|▋         | 20/300 [57:55<13:49:55, 177.84s/it]

Training...
mean cross_entropy: 0.35962042212486267, mean accuracy: 0.8745454549789429


  7%|▋         | 21/300 [1:00:49<13:41:26, 176.65s/it]

Training...
mean cross_entropy: 0.3605555295944214, mean accuracy: 0.8743818402290344


  7%|▋         | 22/300 [1:03:43<13:34:49, 175.86s/it]

Training...
mean cross_entropy: 0.36097413301467896, mean accuracy: 0.8757091164588928


  8%|▊         | 23/300 [1:06:37<13:28:51, 175.20s/it]

Training...
mean cross_entropy: 0.3590625822544098, mean accuracy: 0.8755272626876831


  8%|▊         | 24/300 [1:09:39<13:36:03, 177.40s/it]

Training...
mean cross_entropy: 0.35915815830230713, mean accuracy: 0.8745272755622864


  8%|▊         | 25/300 [1:12:43<13:41:33, 179.25s/it]

Training...
mean cross_entropy: 0.35935503244400024, mean accuracy: 0.8766727447509766


  9%|▊         | 26/300 [1:15:46<13:43:37, 180.36s/it]

Training...
mean cross_entropy: 0.3576614558696747, mean accuracy: 0.8762909173965454


  9%|▉         | 27/300 [1:18:42<13:34:46, 179.07s/it]

Training...
mean cross_entropy: 0.36046600341796875, mean accuracy: 0.8752181529998779


  9%|▉         | 28/300 [1:21:34<13:21:45, 176.86s/it]

Training...
mean cross_entropy: 0.3592335879802704, mean accuracy: 0.8764363527297974


 10%|▉         | 29/300 [1:24:25<13:11:45, 175.30s/it]

Training...
mean cross_entropy: 0.3573177456855774, mean accuracy: 0.8779090642929077


 10%|█         | 30/300 [1:27:17<13:04:10, 174.26s/it]

Training...
mean cross_entropy: 0.35550767183303833, mean accuracy: 0.8773090839385986


 10%|█         | 31/300 [1:30:08<12:56:05, 173.11s/it]

Training...
mean cross_entropy: 0.35392600297927856, mean accuracy: 0.8763999938964844


 11%|█         | 32/300 [1:32:58<12:49:11, 172.21s/it]

Training...
mean cross_entropy: 0.3571510314941406, mean accuracy: 0.8756726980209351


 11%|█         | 33/300 [1:35:48<12:44:12, 171.73s/it]

Training...
mean cross_entropy: 0.3577081263065338, mean accuracy: 0.8748363852500916


 11%|█▏        | 34/300 [1:38:52<12:56:52, 175.24s/it]

Training...
mean cross_entropy: 0.35611939430236816, mean accuracy: 0.8743273019790649


 12%|█▏        | 35/300 [1:42:04<13:16:27, 180.33s/it]

Training...
mean cross_entropy: 0.35524505376815796, mean accuracy: 0.8766182065010071


 12%|█▏        | 36/300 [1:45:17<13:29:42, 184.03s/it]

Training...
mean cross_entropy: 0.35472172498703003, mean accuracy: 0.8764545321464539


 12%|█▏        | 37/300 [1:48:19<13:23:57, 183.41s/it]

Training...
mean cross_entropy: 0.35324183106422424, mean accuracy: 0.8765818476676941


 13%|█▎        | 38/300 [1:51:08<13:02:50, 179.28s/it]

Training...
mean cross_entropy: 0.35268905758857727, mean accuracy: 0.8778727054595947


 13%|█▎        | 39/300 [1:53:58<12:47:58, 176.55s/it]

Training...
mean cross_entropy: 0.35218968987464905, mean accuracy: 0.8785818219184875
Validation...
mean cross_entropy: 0.4840070307254791, mean accuracy: 0.8438000082969666


 13%|█▎        | 40/300 [1:56:52<12:41:26, 175.72s/it]

Training...
mean cross_entropy: 0.35500767827033997, mean accuracy: 0.8755272626876831


 14%|█▎        | 41/300 [1:59:44<12:33:01, 174.45s/it]

Training...
mean cross_entropy: 0.3527330160140991, mean accuracy: 0.8771818280220032


 14%|█▍        | 42/300 [2:02:35<12:25:50, 173.45s/it]

Training...
mean cross_entropy: 0.35443949699401855, mean accuracy: 0.8777454495429993


 14%|█▍        | 43/300 [2:05:26<12:20:23, 172.85s/it]

Training...
mean cross_entropy: 0.3545302450656891, mean accuracy: 0.8771091103553772


 15%|█▍        | 44/300 [2:08:19<12:17:39, 172.89s/it]

Training...
mean cross_entropy: 0.3533320724964142, mean accuracy: 0.8769636154174805


 15%|█▌        | 45/300 [2:11:15<12:17:48, 173.60s/it]

Training...
mean cross_entropy: 0.35219648480415344, mean accuracy: 0.8782727122306824


 15%|█▌        | 46/300 [2:14:10<12:16:44, 174.03s/it]

Training...
mean cross_entropy: 0.35067543387413025, mean accuracy: 0.8780182003974915


 16%|█▌        | 47/300 [2:17:05<12:15:49, 174.50s/it]

Training...
mean cross_entropy: 0.3543684780597687, mean accuracy: 0.8775272965431213


 16%|█▌        | 48/300 [2:19:54<12:06:05, 172.88s/it]

Training...
mean cross_entropy: 0.3500736653804779, mean accuracy: 0.878818154335022


 16%|█▋        | 49/300 [2:22:44<11:59:22, 171.96s/it]

Training...
mean cross_entropy: 0.3508884906768799, mean accuracy: 0.8774545192718506


 17%|█▋        | 50/300 [2:25:33<11:53:19, 171.20s/it]

Training...
mean cross_entropy: 0.35248664021492004, mean accuracy: 0.8789272904396057


 17%|█▋        | 51/300 [2:28:30<11:56:30, 172.65s/it]

Training...
mean cross_entropy: 0.3552696406841278, mean accuracy: 0.876909077167511


 17%|█▋        | 52/300 [2:31:32<12:06:23, 175.74s/it]

Training...
mean cross_entropy: 0.3490094542503357, mean accuracy: 0.8800727128982544


 18%|█▊        | 53/300 [2:34:36<12:12:27, 177.92s/it]

Training...
mean cross_entropy: 0.3535579442977905, mean accuracy: 0.8781636357307434


 18%|█▊        | 54/300 [2:37:37<12:13:23, 178.88s/it]

Training...
mean cross_entropy: 0.35163578391075134, mean accuracy: 0.8778908848762512


 18%|█▊        | 55/300 [2:40:31<12:04:26, 177.41s/it]

Training...
mean cross_entropy: 0.3522878587245941, mean accuracy: 0.8777818083763123


 19%|█▊        | 56/300 [2:43:24<11:57:03, 176.33s/it]

Training...
mean cross_entropy: 0.3514598309993744, mean accuracy: 0.8791454434394836


 19%|█▉        | 57/300 [2:46:18<11:50:21, 175.40s/it]

Training...
mean cross_entropy: 0.349497526884079, mean accuracy: 0.8794727325439453


 19%|█▉        | 58/300 [2:49:12<11:46:09, 175.08s/it]

Training...
mean cross_entropy: 0.34990185499191284, mean accuracy: 0.8799999952316284


 20%|█▉        | 59/300 [2:52:06<11:41:45, 174.71s/it]

Training...
mean cross_entropy: 0.35152480006217957, mean accuracy: 0.8780545592308044
Validation...
mean cross_entropy: 0.3525792062282562, mean accuracy: 0.8840000033378601


 20%|██        | 60/300 [2:55:03<11:41:46, 175.45s/it]

Training...
mean cross_entropy: 0.35382217168807983, mean accuracy: 0.8773454427719116


 20%|██        | 61/300 [2:57:56<11:35:25, 174.58s/it]

Training...
mean cross_entropy: 0.34912607073783875, mean accuracy: 0.8794363737106323


 21%|██        | 62/300 [3:00:47<11:29:19, 173.78s/it]

Training...
mean cross_entropy: 0.3494609296321869, mean accuracy: 0.8778727054595947


 21%|██        | 63/300 [3:03:39<11:23:36, 173.07s/it]

Training...
mean cross_entropy: 0.3522260785102844, mean accuracy: 0.8767091035842896


 21%|██▏       | 64/300 [3:06:30<11:18:22, 172.47s/it]

Training...
mean cross_entropy: 0.35130879282951355, mean accuracy: 0.878745436668396


 22%|██▏       | 65/300 [3:09:32<11:26:43, 175.33s/it]

Training...
mean cross_entropy: 0.3496171832084656, mean accuracy: 0.8790000081062317


 22%|██▏       | 66/300 [3:12:36<11:34:09, 177.99s/it]

Training...
mean cross_entropy: 0.34941670298576355, mean accuracy: 0.8779090642929077


 22%|██▏       | 67/300 [3:15:40<11:38:20, 179.83s/it]

Training...
mean cross_entropy: 0.3482595384120941, mean accuracy: 0.8790545463562012


 23%|██▎       | 68/300 [3:18:38<11:32:58, 179.22s/it]

Training...
mean cross_entropy: 0.35219481587409973, mean accuracy: 0.8785091042518616


 23%|██▎       | 69/300 [3:21:31<11:22:55, 177.38s/it]

Training...
mean cross_entropy: 0.34809601306915283, mean accuracy: 0.8803636431694031


 23%|██▎       | 70/300 [3:24:24<11:14:54, 176.06s/it]

Training...
mean cross_entropy: 0.3481891453266144, mean accuracy: 0.8784727454185486


 24%|██▎       | 71/300 [3:27:17<11:08:15, 175.09s/it]

Training...
mean cross_entropy: 0.3470747470855713, mean accuracy: 0.8808000087738037


 24%|██▍       | 72/300 [3:30:06<10:58:25, 173.27s/it]

Training...
mean cross_entropy: 0.348682701587677, mean accuracy: 0.8785272836685181


 24%|██▍       | 73/300 [3:32:55<10:50:24, 171.91s/it]

Training...
mean cross_entropy: 0.3481087386608124, mean accuracy: 0.8801272511482239


 25%|██▍       | 74/300 [3:35:43<10:43:04, 170.73s/it]

Training...
mean cross_entropy: 0.35082077980041504, mean accuracy: 0.8775454759597778


 25%|██▌       | 75/300 [3:38:35<10:41:45, 171.14s/it]

Training...
mean cross_entropy: 0.35102516412734985, mean accuracy: 0.8792909383773804


 25%|██▌       | 76/300 [3:41:29<10:41:54, 171.94s/it]

Training...
mean cross_entropy: 0.34953075647354126, mean accuracy: 0.8794909119606018


 26%|██▌       | 77/300 [3:44:22<10:40:57, 172.45s/it]

Training...
mean cross_entropy: 0.34948793053627014, mean accuracy: 0.8794545531272888


 26%|██▌       | 78/300 [3:47:17<10:40:53, 173.21s/it]

Training...
mean cross_entropy: 0.34899473190307617, mean accuracy: 0.8801272511482239


 26%|██▋       | 79/300 [3:50:20<10:49:06, 176.23s/it]

Training...
mean cross_entropy: 0.34898093342781067, mean accuracy: 0.8790545463562012
Validation...
mean cross_entropy: 0.4412505328655243, mean accuracy: 0.8482000231742859


 27%|██▋       | 80/300 [3:53:27<10:57:37, 179.35s/it]

Training...
mean cross_entropy: 0.3482694923877716, mean accuracy: 0.8789272904396057


 27%|██▋       | 81/300 [3:56:30<10:58:49, 180.50s/it]

Training...
mean cross_entropy: 0.3483436107635498, mean accuracy: 0.878600001335144


 27%|██▋       | 82/300 [3:59:22<10:46:35, 177.96s/it]

Training...
mean cross_entropy: 0.3505926728248596, mean accuracy: 0.8791999816894531


 28%|██▊       | 83/300 [4:02:12<10:35:08, 175.61s/it]

Training...
mean cross_entropy: 0.3505707085132599, mean accuracy: 0.8782181739807129


 28%|██▊       | 84/300 [4:05:03<10:26:19, 173.98s/it]

Training...
mean cross_entropy: 0.34946298599243164, mean accuracy: 0.8796545267105103


 28%|██▊       | 85/300 [4:07:54<10:20:59, 173.30s/it]

Training...
mean cross_entropy: 0.34697091579437256, mean accuracy: 0.8808545470237732


 29%|██▊       | 86/300 [4:10:50<10:20:44, 174.04s/it]

Training...
mean cross_entropy: 0.3482618033885956, mean accuracy: 0.878563642501831


 29%|██▉       | 87/300 [4:13:45<10:18:58, 174.36s/it]

Training...
mean cross_entropy: 0.35118067264556885, mean accuracy: 0.8790363669395447


 29%|██▉       | 88/300 [4:16:41<10:18:03, 174.92s/it]

Training...
mean cross_entropy: 0.3508528470993042, mean accuracy: 0.8779818415641785


 30%|██▉       | 89/300 [4:19:41<10:19:49, 176.25s/it]

Training...
mean cross_entropy: 0.35248249769210815, mean accuracy: 0.8792726993560791


 30%|███       | 90/300 [4:22:41<10:20:30, 177.29s/it]

Training...
mean cross_entropy: 0.3481561839580536, mean accuracy: 0.879800021648407


 30%|███       | 91/300 [4:25:41<10:20:24, 178.11s/it]

Training...
mean cross_entropy: 0.34965944290161133, mean accuracy: 0.877963662147522


 31%|███       | 92/300 [4:28:41<10:20:20, 178.94s/it]

Training...
mean cross_entropy: 0.3481830656528473, mean accuracy: 0.8775272965431213


 31%|███       | 93/300 [4:31:45<10:22:03, 180.31s/it]

Training...
mean cross_entropy: 0.3469133973121643, mean accuracy: 0.880509078502655


 31%|███▏      | 94/300 [4:34:48<10:21:59, 181.16s/it]

Training...
mean cross_entropy: 0.34740081429481506, mean accuracy: 0.8796545267105103


 32%|███▏      | 95/300 [4:37:51<10:21:11, 181.81s/it]

Training...
mean cross_entropy: 0.34659522771835327, mean accuracy: 0.8810545206069946


 32%|███▏      | 96/300 [4:40:55<10:19:38, 182.25s/it]

Training...
mean cross_entropy: 0.3486523926258087, mean accuracy: 0.8795999884605408


 32%|███▏      | 97/300 [4:43:58<10:17:51, 182.62s/it]

Training...
mean cross_entropy: 0.34833651781082153, mean accuracy: 0.8785272836685181


 33%|███▎      | 98/300 [4:47:01<10:15:19, 182.77s/it]

Training...
mean cross_entropy: 0.34969407320022583, mean accuracy: 0.8769636154174805


 33%|███▎      | 99/300 [4:49:51<9:59:04, 178.83s/it] 

Decrease learning rate, new lr = 0.0001
Training...
mean cross_entropy: 0.29334354400634766, mean accuracy: 0.8980181813240051
Validation...
mean cross_entropy: 0.3139762878417969, mean accuracy: 0.897599995136261


 33%|███▎      | 100/300 [4:52:44<9:49:52, 176.96s/it]

Training...
mean cross_entropy: 0.28363072872161865, mean accuracy: 0.9028909206390381


 34%|███▎      | 101/300 [4:55:33<9:39:14, 174.65s/it]

Training...
mean cross_entropy: 0.2765654921531677, mean accuracy: 0.904872715473175


 34%|███▍      | 102/300 [4:58:23<9:31:31, 173.19s/it]

Training...
mean cross_entropy: 0.27372464537620544, mean accuracy: 0.9065999984741211


 34%|███▍      | 103/300 [5:01:12<9:25:22, 172.20s/it]

Training...
mean cross_entropy: 0.27187052369117737, mean accuracy: 0.9063454270362854


 35%|███▍      | 104/300 [5:04:01<9:18:35, 171.00s/it]

Training...
mean cross_entropy: 0.26967862248420715, mean accuracy: 0.9072363376617432


 35%|███▌      | 105/300 [5:06:50<9:14:18, 170.56s/it]

Training...
mean cross_entropy: 0.266796350479126, mean accuracy: 0.9092000126838684


 35%|███▌      | 106/300 [5:09:42<9:12:24, 170.85s/it]

Training...
mean cross_entropy: 0.2634674608707428, mean accuracy: 0.9100727438926697


 36%|███▌      | 107/300 [5:12:33<9:10:24, 171.11s/it]

Training...
mean cross_entropy: 0.26316037774086, mean accuracy: 0.9103272557258606


 36%|███▌      | 108/300 [5:15:25<9:07:51, 171.21s/it]

Training...
mean cross_entropy: 0.26343509554862976, mean accuracy: 0.9088727235794067


 36%|███▋      | 109/300 [5:18:16<9:04:29, 171.05s/it]

Training...
mean cross_entropy: 0.26255500316619873, mean accuracy: 0.90989089012146


 37%|███▋      | 110/300 [5:21:05<9:00:09, 170.58s/it]

Training...
mean cross_entropy: 0.2594439685344696, mean accuracy: 0.9105636477470398


 37%|███▋      | 111/300 [5:23:54<8:56:17, 170.25s/it]

Training...
mean cross_entropy: 0.26292353868484497, mean accuracy: 0.9087818264961243


 37%|███▋      | 112/300 [5:26:44<8:52:20, 169.90s/it]

Training...
mean cross_entropy: 0.2593279778957367, mean accuracy: 0.9114181995391846


 38%|███▊      | 113/300 [5:29:33<8:48:57, 169.72s/it]

Training...
mean cross_entropy: 0.2581340968608856, mean accuracy: 0.9114181995391846


 38%|███▊      | 114/300 [5:32:22<8:45:57, 169.66s/it]

Training...
mean cross_entropy: 0.256681352853775, mean accuracy: 0.9120727181434631


 38%|███▊      | 115/300 [5:35:13<8:43:35, 169.81s/it]

Training...
mean cross_entropy: 0.2558209300041199, mean accuracy: 0.911763608455658


 39%|███▊      | 116/300 [5:38:04<8:42:07, 170.26s/it]

Training...
mean cross_entropy: 0.2558088004589081, mean accuracy: 0.9132909178733826


 39%|███▉      | 117/300 [5:40:57<8:42:10, 171.20s/it]

Training...
mean cross_entropy: 0.25690513849258423, mean accuracy: 0.9121090769767761


 39%|███▉      | 118/300 [5:43:52<8:42:20, 172.20s/it]

Training...
mean cross_entropy: 0.25581106543540955, mean accuracy: 0.9126909375190735


 40%|███▉      | 119/300 [5:46:46<8:41:18, 172.81s/it]

Training...
mean cross_entropy: 0.2551417350769043, mean accuracy: 0.9124181866645813
Validation...
mean cross_entropy: 0.28215718269348145, mean accuracy: 0.9088000059127808


 40%|████      | 120/300 [5:49:42<8:40:55, 173.64s/it]

Training...
mean cross_entropy: 0.25291907787323, mean accuracy: 0.9134363532066345


 40%|████      | 121/300 [5:52:33<8:36:26, 173.11s/it]

Training...
mean cross_entropy: 0.2529096007347107, mean accuracy: 0.9138545393943787


 41%|████      | 122/300 [5:55:26<8:32:44, 172.84s/it]

Training...
mean cross_entropy: 0.25267985463142395, mean accuracy: 0.9136727452278137


 41%|████      | 123/300 [5:58:19<8:30:08, 172.93s/it]

Training...
mean cross_entropy: 0.2522973120212555, mean accuracy: 0.9138727188110352


 41%|████▏     | 124/300 [6:01:13<8:27:58, 173.17s/it]

Training...
mean cross_entropy: 0.25307461619377136, mean accuracy: 0.9120727181434631


 42%|████▏     | 125/300 [6:04:07<8:26:07, 173.53s/it]

Training...
mean cross_entropy: 0.25340163707733154, mean accuracy: 0.913454532623291


 42%|████▏     | 126/300 [6:07:01<8:23:43, 173.70s/it]

Training...
mean cross_entropy: 0.25146088004112244, mean accuracy: 0.9146727323532104


 42%|████▏     | 127/300 [6:09:48<8:15:12, 171.75s/it]

Training...
mean cross_entropy: 0.2527235448360443, mean accuracy: 0.9136363863945007


 43%|████▎     | 128/300 [6:12:37<8:09:42, 170.83s/it]

Training...
mean cross_entropy: 0.25414422154426575, mean accuracy: 0.9130181670188904


 43%|████▎     | 129/300 [6:15:27<8:06:16, 170.62s/it]

Training...
mean cross_entropy: 0.2510465681552887, mean accuracy: 0.9140727519989014


 43%|████▎     | 130/300 [6:18:19<8:04:17, 170.93s/it]

Training...
mean cross_entropy: 0.2503567337989807, mean accuracy: 0.9135817885398865


 44%|████▎     | 131/300 [6:21:13<8:03:56, 171.81s/it]

Training...
mean cross_entropy: 0.2523191273212433, mean accuracy: 0.9126909375190735


 44%|████▍     | 132/300 [6:24:07<8:03:05, 172.53s/it]

Training...
mean cross_entropy: 0.24936358630657196, mean accuracy: 0.9150909185409546


 44%|████▍     | 133/300 [6:27:01<8:01:57, 173.16s/it]

Training...
mean cross_entropy: 0.24999554455280304, mean accuracy: 0.914545476436615


 45%|████▍     | 134/300 [6:30:04<8:07:15, 176.12s/it]

Training...
mean cross_entropy: 0.25151771306991577, mean accuracy: 0.9141636490821838


 45%|████▌     | 135/300 [6:33:08<8:10:08, 178.23s/it]

Training...
mean cross_entropy: 0.24986210465431213, mean accuracy: 0.913490891456604


 45%|████▌     | 136/300 [6:36:11<8:11:03, 179.65s/it]

Training...
mean cross_entropy: 0.2482973039150238, mean accuracy: 0.9148908853530884


 46%|████▌     | 137/300 [6:39:06<8:04:48, 178.46s/it]

Training...
mean cross_entropy: 0.24762384593486786, mean accuracy: 0.9150363802909851


 46%|████▌     | 138/300 [6:42:00<7:58:22, 177.18s/it]

Training...
mean cross_entropy: 0.2491748332977295, mean accuracy: 0.9154363870620728


 46%|████▋     | 139/300 [6:44:54<7:52:35, 176.12s/it]

Training...
mean cross_entropy: 0.2467050403356552, mean accuracy: 0.916454553604126
Validation...
mean cross_entropy: 0.23442815244197845, mean accuracy: 0.9179999828338623


 47%|████▋     | 140/300 [6:47:52<7:50:44, 176.53s/it]

Training...
mean cross_entropy: 0.2489224374294281, mean accuracy: 0.9141272902488708


 47%|████▋     | 141/300 [6:50:48<7:47:57, 176.59s/it]

Training...
mean cross_entropy: 0.24946381151676178, mean accuracy: 0.9151999950408936


 47%|████▋     | 142/300 [6:53:43<7:43:27, 176.00s/it]

Training...
mean cross_entropy: 0.24816499650478363, mean accuracy: 0.9160909056663513


 48%|████▊     | 143/300 [6:56:39<7:40:44, 176.08s/it]

Training...
mean cross_entropy: 0.24464264512062073, mean accuracy: 0.916454553604126


 48%|████▊     | 144/300 [6:59:30<7:33:38, 174.48s/it]

Training...
mean cross_entropy: 0.24657081067562103, mean accuracy: 0.9154000282287598


 48%|████▊     | 145/300 [7:02:20<7:27:07, 173.08s/it]

Training...
mean cross_entropy: 0.24536015093326569, mean accuracy: 0.9160181879997253


 49%|████▊     | 146/300 [7:05:10<7:21:56, 172.18s/it]

Training...
mean cross_entropy: 0.24730998277664185, mean accuracy: 0.9147818088531494


 49%|████▉     | 147/300 [7:08:05<7:21:07, 172.99s/it]

Training...
mean cross_entropy: 0.24500766396522522, mean accuracy: 0.9160545468330383


 49%|████▉     | 148/300 [7:11:09<7:26:38, 176.31s/it]

Training...
mean cross_entropy: 0.24661482870578766, mean accuracy: 0.915363609790802


 50%|████▉     | 149/300 [7:14:13<7:29:46, 178.72s/it]

Training...
mean cross_entropy: 0.24435138702392578, mean accuracy: 0.9169272780418396


 50%|█████     | 150/300 [7:17:16<7:29:57, 179.99s/it]

Training...
mean cross_entropy: 0.24622462689876556, mean accuracy: 0.9159818291664124


 50%|█████     | 151/300 [7:20:10<7:22:22, 178.14s/it]

Training...
mean cross_entropy: 0.244206041097641, mean accuracy: 0.9158909320831299


 51%|█████     | 152/300 [7:23:04<7:16:31, 176.97s/it]

Training...
mean cross_entropy: 0.24696975946426392, mean accuracy: 0.9147272706031799


 51%|█████     | 153/300 [7:25:58<7:11:29, 176.12s/it]

Training...
mean cross_entropy: 0.2457752823829651, mean accuracy: 0.916490912437439


 51%|█████▏    | 154/300 [7:28:49<7:04:53, 174.61s/it]

Training...
mean cross_entropy: 0.24741274118423462, mean accuracy: 0.915290892124176


 52%|█████▏    | 155/300 [7:31:39<6:58:15, 173.07s/it]

Training...
mean cross_entropy: 0.24553197622299194, mean accuracy: 0.9164363741874695


 52%|█████▏    | 156/300 [7:34:28<6:52:30, 171.88s/it]

Training...
mean cross_entropy: 0.2454584836959839, mean accuracy: 0.9173091053962708


 52%|█████▏    | 157/300 [7:37:17<6:47:44, 171.08s/it]

Training...
mean cross_entropy: 0.2432222068309784, mean accuracy: 0.9159818291664124


 53%|█████▎    | 158/300 [7:40:07<6:43:44, 170.60s/it]

Training...
mean cross_entropy: 0.24326644837856293, mean accuracy: 0.9162726998329163


 53%|█████▎    | 159/300 [7:42:56<6:39:56, 170.19s/it]

Training...
mean cross_entropy: 0.24373836815357208, mean accuracy: 0.9167636632919312
Validation...
mean cross_entropy: 0.2593333125114441, mean accuracy: 0.9111999869346619


 53%|█████▎    | 160/300 [7:45:48<6:38:42, 170.87s/it]

Training...
mean cross_entropy: 0.24415984749794006, mean accuracy: 0.916527271270752


 54%|█████▎    | 161/300 [7:48:46<6:40:49, 173.02s/it]

Training...
mean cross_entropy: 0.24243561923503876, mean accuracy: 0.9177636504173279


 54%|█████▍    | 162/300 [7:51:50<6:45:34, 176.34s/it]

Training...
mean cross_entropy: 0.2449599802494049, mean accuracy: 0.9161090850830078


 54%|█████▍    | 163/300 [7:54:55<6:48:00, 178.69s/it]

Training...
mean cross_entropy: 0.24353507161140442, mean accuracy: 0.9172727465629578


 55%|█████▍    | 164/300 [7:57:58<6:48:12, 180.09s/it]

Training...
mean cross_entropy: 0.24613860249519348, mean accuracy: 0.9160909056663513


 55%|█████▌    | 165/300 [8:00:57<6:44:38, 179.84s/it]

Training...
mean cross_entropy: 0.2437857985496521, mean accuracy: 0.9157817959785461


 55%|█████▌    | 166/300 [8:03:57<6:41:34, 179.81s/it]

Training...
mean cross_entropy: 0.2433411180973053, mean accuracy: 0.9172363877296448


 56%|█████▌    | 167/300 [8:06:56<6:38:24, 179.73s/it]

Training...
mean cross_entropy: 0.24094147980213165, mean accuracy: 0.9175817966461182


 56%|█████▌    | 168/300 [8:09:45<6:28:19, 176.51s/it]

Training...
mean cross_entropy: 0.24383245408535004, mean accuracy: 0.9161999821662903


 56%|█████▋    | 169/300 [8:12:35<6:20:41, 174.37s/it]

Training...
mean cross_entropy: 0.24221867322921753, mean accuracy: 0.9176909327507019


 57%|█████▋    | 170/300 [8:15:24<6:14:26, 172.82s/it]

Training...
mean cross_entropy: 0.24244454503059387, mean accuracy: 0.9178000092506409


 57%|█████▋    | 171/300 [8:18:16<6:10:48, 172.47s/it]

Training...
mean cross_entropy: 0.24513858556747437, mean accuracy: 0.9166908860206604


 57%|█████▋    | 172/300 [8:21:09<6:08:39, 172.81s/it]

Training...
mean cross_entropy: 0.24381908774375916, mean accuracy: 0.9174908995628357


 58%|█████▊    | 173/300 [8:24:03<6:06:34, 173.19s/it]

Training...
mean cross_entropy: 0.24304190278053284, mean accuracy: 0.9174181818962097


 58%|█████▊    | 174/300 [8:26:58<6:04:20, 173.50s/it]

Training...
mean cross_entropy: 0.24453729391098022, mean accuracy: 0.9176181554794312


 58%|█████▊    | 175/300 [8:29:48<5:59:30, 172.56s/it]

Training...
mean cross_entropy: 0.24354512989521027, mean accuracy: 0.9160000085830688


 59%|█████▊    | 176/300 [8:32:38<5:55:09, 171.85s/it]

Training...
mean cross_entropy: 0.2453823834657669, mean accuracy: 0.9165636301040649


 59%|█████▉    | 177/300 [8:35:28<5:51:09, 171.30s/it]

Training...
mean cross_entropy: 0.2420538365840912, mean accuracy: 0.9185818433761597


 59%|█████▉    | 178/300 [8:38:18<5:47:40, 170.99s/it]

Training...
mean cross_entropy: 0.2415822446346283, mean accuracy: 0.9176363348960876


 60%|█████▉    | 179/300 [8:41:08<5:44:07, 170.64s/it]

Training...
mean cross_entropy: 0.24368132650852203, mean accuracy: 0.9166727066040039
Validation...
mean cross_entropy: 0.252875953912735, mean accuracy: 0.9142000079154968


 60%|██████    | 180/300 [8:44:02<5:43:11, 171.60s/it]

Training...
mean cross_entropy: 0.24059194326400757, mean accuracy: 0.9172727465629578


 60%|██████    | 181/300 [8:46:52<5:39:31, 171.19s/it]

Training...
mean cross_entropy: 0.24217909574508667, mean accuracy: 0.9167454838752747


 61%|██████    | 182/300 [8:49:47<5:38:44, 172.25s/it]

Training...
mean cross_entropy: 0.23962019383907318, mean accuracy: 0.9179454445838928


 61%|██████    | 183/300 [8:52:41<5:36:46, 172.71s/it]

Training...
mean cross_entropy: 0.24220937490463257, mean accuracy: 0.9173636436462402


 61%|██████▏   | 184/300 [8:55:35<5:34:32, 173.03s/it]

Training...
mean cross_entropy: 0.24381762742996216, mean accuracy: 0.9166181683540344


 62%|██████▏   | 185/300 [8:58:28<5:31:58, 173.20s/it]

Training...
mean cross_entropy: 0.24265208840370178, mean accuracy: 0.9174908995628357


 62%|██████▏   | 186/300 [9:01:22<5:29:16, 173.31s/it]

Training...
mean cross_entropy: 0.24102360010147095, mean accuracy: 0.9178363680839539


 62%|██████▏   | 187/300 [9:04:15<5:26:31, 173.37s/it]

Training...
mean cross_entropy: 0.23933248221874237, mean accuracy: 0.9188908934593201


 63%|██████▎   | 188/300 [9:07:10<5:24:11, 173.67s/it]

Training...
mean cross_entropy: 0.24275411665439606, mean accuracy: 0.9166727066040039


 63%|██████▎   | 189/300 [9:10:13<5:26:40, 176.58s/it]

Training...
mean cross_entropy: 0.24205248057842255, mean accuracy: 0.9175636172294617


 63%|██████▎   | 190/300 [9:13:17<5:27:54, 178.86s/it]

Training...
mean cross_entropy: 0.24110183119773865, mean accuracy: 0.917163610458374


 64%|██████▎   | 191/300 [9:16:21<5:27:50, 180.47s/it]

Training...
mean cross_entropy: 0.24309390783309937, mean accuracy: 0.9175817966461182


 64%|██████▍   | 192/300 [9:19:14<5:20:44, 178.19s/it]

Training...
mean cross_entropy: 0.2390342354774475, mean accuracy: 0.9180908799171448


 64%|██████▍   | 193/300 [9:22:05<5:13:37, 175.87s/it]

Training...
mean cross_entropy: 0.24191942811012268, mean accuracy: 0.9172727465629578


 65%|██████▍   | 194/300 [9:24:55<5:07:37, 174.13s/it]

Training...
mean cross_entropy: 0.24046668410301208, mean accuracy: 0.9176909327507019


 65%|██████▌   | 195/300 [9:27:51<5:05:39, 174.67s/it]

Training...
mean cross_entropy: 0.24053049087524414, mean accuracy: 0.918218195438385


 65%|██████▌   | 196/300 [9:31:03<5:11:54, 179.95s/it]

Training...
mean cross_entropy: 0.2417289912700653, mean accuracy: 0.9177091121673584


 66%|██████▌   | 197/300 [9:34:15<5:15:08, 183.58s/it]

Training...
mean cross_entropy: 0.24150429666042328, mean accuracy: 0.9179999828338623


 66%|██████▌   | 198/300 [9:37:23<5:14:32, 185.02s/it]

Training...
mean cross_entropy: 0.24218019843101501, mean accuracy: 0.9188727140426636


 66%|██████▋   | 199/300 [9:40:13<5:03:47, 180.47s/it]

Decrease learning rate, new lr = 1e-05
Training...
mean cross_entropy: 0.22915326058864594, mean accuracy: 0.9215818047523499
Validation...
mean cross_entropy: 0.20914697647094727, mean accuracy: 0.9265999794006348


 67%|██████▋   | 200/300 [9:43:07<4:57:11, 178.31s/it]

Training...
mean cross_entropy: 0.22743824124336243, mean accuracy: 0.9229090809822083


 67%|██████▋   | 201/300 [9:45:57<4:50:11, 175.88s/it]

Training...
mean cross_entropy: 0.22352179884910583, mean accuracy: 0.9252363443374634


 67%|██████▋   | 202/300 [9:48:49<4:45:28, 174.78s/it]

Training...
mean cross_entropy: 0.22344528138637543, mean accuracy: 0.9246000051498413


 68%|██████▊   | 203/300 [9:51:43<4:42:21, 174.66s/it]

Training...
mean cross_entropy: 0.2238573133945465, mean accuracy: 0.9232909083366394


 68%|██████▊   | 204/300 [9:54:38<4:39:21, 174.60s/it]

Training...
mean cross_entropy: 0.22388148307800293, mean accuracy: 0.9243454337120056


 68%|██████▊   | 205/300 [9:57:32<4:36:02, 174.34s/it]

Training...
mean cross_entropy: 0.22203530371189117, mean accuracy: 0.9244909286499023


 69%|██████▊   | 206/300 [10:00:23<4:31:59, 173.61s/it]

Training...
mean cross_entropy: 0.2210531234741211, mean accuracy: 0.9246545433998108


 69%|██████▉   | 207/300 [10:03:15<4:28:20, 173.12s/it]

Training...
mean cross_entropy: 0.22335663437843323, mean accuracy: 0.9246727228164673


 69%|██████▉   | 208/300 [10:06:07<4:24:42, 172.63s/it]

Training...
mean cross_entropy: 0.22089695930480957, mean accuracy: 0.9249454736709595


 70%|██████▉   | 209/300 [10:08:57<4:20:37, 171.84s/it]

Training...
mean cross_entropy: 0.21996469795703888, mean accuracy: 0.925636351108551


 70%|███████   | 210/300 [10:11:46<4:16:36, 171.07s/it]

Training...
mean cross_entropy: 0.21947859227657318, mean accuracy: 0.9253090620040894


 70%|███████   | 211/300 [10:14:36<4:13:01, 170.57s/it]

Training...
mean cross_entropy: 0.21926218271255493, mean accuracy: 0.9266545176506042


 71%|███████   | 212/300 [10:17:25<4:09:41, 170.24s/it]

Training...
mean cross_entropy: 0.2200506031513214, mean accuracy: 0.9251636266708374


 71%|███████   | 213/300 [10:20:17<4:07:40, 170.81s/it]

Training...
mean cross_entropy: 0.21864062547683716, mean accuracy: 0.9265454411506653


 71%|███████▏  | 214/300 [10:23:09<4:05:24, 171.22s/it]

Training...
mean cross_entropy: 0.2211998552083969, mean accuracy: 0.9247817993164062


 72%|███████▏  | 215/300 [10:26:02<4:03:10, 171.65s/it]

Training...
mean cross_entropy: 0.21815408766269684, mean accuracy: 0.9264181852340698


 72%|███████▏  | 216/300 [10:29:03<4:04:03, 174.33s/it]

Training...
mean cross_entropy: 0.21879942715168, mean accuracy: 0.9252908825874329


 72%|███████▏  | 217/300 [10:32:07<4:05:15, 177.30s/it]

Training...
mean cross_entropy: 0.2161252796649933, mean accuracy: 0.9271636605262756


 73%|███████▎  | 218/300 [10:35:11<4:05:03, 179.31s/it]

Training...
mean cross_entropy: 0.21493735909461975, mean accuracy: 0.927545428276062


 73%|███████▎  | 219/300 [10:38:09<4:01:28, 178.87s/it]

Training...
mean cross_entropy: 0.2176458239555359, mean accuracy: 0.9282545447349548
Validation...
mean cross_entropy: 0.20817790925502777, mean accuracy: 0.9264000058174133


 73%|███████▎  | 220/300 [10:41:02<3:56:17, 177.22s/it]

Training...
mean cross_entropy: 0.2182786762714386, mean accuracy: 0.9250181913375854


 74%|███████▎  | 221/300 [10:43:51<3:50:13, 174.86s/it]

Training...
mean cross_entropy: 0.21852152049541473, mean accuracy: 0.9253454804420471


 74%|███████▍  | 222/300 [10:46:42<3:45:30, 173.47s/it]

Training...
mean cross_entropy: 0.21974815428256989, mean accuracy: 0.9250727295875549


 74%|███████▍  | 223/300 [10:49:40<3:44:31, 174.95s/it]

Training...
mean cross_entropy: 0.21640092134475708, mean accuracy: 0.9271818399429321


 75%|███████▍  | 224/300 [10:52:40<3:43:28, 176.42s/it]

Training...
mean cross_entropy: 0.2177891582250595, mean accuracy: 0.926727294921875


 75%|███████▌  | 225/300 [10:55:39<3:41:37, 177.30s/it]

Training...
mean cross_entropy: 0.2177267074584961, mean accuracy: 0.9264545440673828


 75%|███████▌  | 226/300 [10:58:33<3:37:19, 176.21s/it]

Training...
mean cross_entropy: 0.21803390979766846, mean accuracy: 0.9267818331718445


 76%|███████▌  | 227/300 [11:01:23<3:31:57, 174.21s/it]

Training...
mean cross_entropy: 0.21628765761852264, mean accuracy: 0.9254909157752991


 76%|███████▌  | 228/300 [11:04:12<3:27:23, 172.83s/it]

Training...
mean cross_entropy: 0.21956823766231537, mean accuracy: 0.9246181845664978


 76%|███████▋  | 229/300 [11:07:02<3:23:20, 171.84s/it]

Training...
mean cross_entropy: 0.219085231423378, mean accuracy: 0.9273999929428101


 77%|███████▋  | 230/300 [11:09:51<3:19:26, 170.95s/it]

Training...
mean cross_entropy: 0.21677589416503906, mean accuracy: 0.9271273016929626


 77%|███████▋  | 231/300 [11:12:40<3:15:55, 170.37s/it]

Training...
mean cross_entropy: 0.21620333194732666, mean accuracy: 0.9265272617340088


 77%|███████▋  | 232/300 [11:15:29<3:12:56, 170.25s/it]

Training...
mean cross_entropy: 0.21446429193019867, mean accuracy: 0.9282909035682678


 78%|███████▊  | 233/300 [11:18:21<3:10:35, 170.68s/it]

Training...
mean cross_entropy: 0.21655763685703278, mean accuracy: 0.9267091155052185


 78%|███████▊  | 234/300 [11:21:16<3:09:02, 171.86s/it]

Training...
mean cross_entropy: 0.21598239243030548, mean accuracy: 0.927436351776123


 78%|███████▊  | 235/300 [11:24:10<3:06:58, 172.59s/it]

Training...
mean cross_entropy: 0.21582381427288055, mean accuracy: 0.9263636469841003


 79%|███████▊  | 236/300 [11:27:04<3:04:38, 173.10s/it]

Training...
mean cross_entropy: 0.2162892371416092, mean accuracy: 0.926800012588501


 79%|███████▉  | 237/300 [11:29:59<3:02:14, 173.57s/it]

Training...
mean cross_entropy: 0.2151535302400589, mean accuracy: 0.9270181655883789


 79%|███████▉  | 238/300 [11:32:56<3:00:19, 174.51s/it]

Training...
mean cross_entropy: 0.21543334424495697, mean accuracy: 0.9268545508384705


 80%|███████▉  | 239/300 [11:35:50<2:57:25, 174.52s/it]

Training...
mean cross_entropy: 0.21514996886253357, mean accuracy: 0.9270727038383484
Validation...
mean cross_entropy: 0.21092793345451355, mean accuracy: 0.925599992275238


 80%|████████  | 240/300 [11:38:50<2:56:03, 176.07s/it]

Training...
mean cross_entropy: 0.21527911722660065, mean accuracy: 0.9282727241516113


 80%|████████  | 241/300 [11:41:44<2:52:39, 175.58s/it]

Training...
mean cross_entropy: 0.2187637835741043, mean accuracy: 0.9258363842964172


 81%|████████  | 242/300 [11:44:41<2:49:59, 175.85s/it]

Training...
mean cross_entropy: 0.21718890964984894, mean accuracy: 0.926763653755188


 81%|████████  | 243/300 [11:47:36<2:46:54, 175.69s/it]

Training...
mean cross_entropy: 0.21777842938899994, mean accuracy: 0.925636351108551


 81%|████████▏ | 244/300 [11:50:28<2:42:59, 174.63s/it]

Training...
mean cross_entropy: 0.21753515303134918, mean accuracy: 0.9267091155052185


 82%|████████▏ | 245/300 [11:53:21<2:39:25, 173.91s/it]

Training...
mean cross_entropy: 0.2146589308977127, mean accuracy: 0.9277454614639282


 82%|████████▏ | 246/300 [11:56:12<2:35:56, 173.28s/it]

Training...
mean cross_entropy: 0.2145032435655594, mean accuracy: 0.9285091161727905


 82%|████████▏ | 247/300 [11:59:06<2:33:05, 173.30s/it]

Training...
mean cross_entropy: 0.21620716154575348, mean accuracy: 0.9270908832550049


 83%|████████▎ | 248/300 [12:02:00<2:30:21, 173.48s/it]

Training...
mean cross_entropy: 0.2148701697587967, mean accuracy: 0.9274908900260925


 83%|████████▎ | 249/300 [12:04:53<2:27:32, 173.57s/it]

Training...
mean cross_entropy: 0.21451203525066376, mean accuracy: 0.9279636144638062


 83%|████████▎ | 250/300 [12:07:50<2:25:21, 174.43s/it]

Training...
mean cross_entropy: 0.2168542593717575, mean accuracy: 0.9260727167129517


 84%|████████▎ | 251/300 [12:10:53<2:24:33, 177.01s/it]

Training...
mean cross_entropy: 0.2156459391117096, mean accuracy: 0.926690936088562


 84%|████████▍ | 252/300 [12:13:56<2:23:06, 178.89s/it]

Training...
mean cross_entropy: 0.2160773128271103, mean accuracy: 0.9265817999839783


 84%|████████▍ | 253/300 [12:17:00<2:21:11, 180.25s/it]

Training...
mean cross_entropy: 0.21350941061973572, mean accuracy: 0.927545428276062


 85%|████████▍ | 254/300 [12:20:04<2:19:03, 181.37s/it]

Training...
mean cross_entropy: 0.2148553431034088, mean accuracy: 0.926727294921875


 85%|████████▌ | 255/300 [12:23:08<2:16:38, 182.19s/it]

Training...
mean cross_entropy: 0.21686016023159027, mean accuracy: 0.9262909293174744


 85%|████████▌ | 256/300 [12:26:12<2:14:02, 182.79s/it]

Training...
mean cross_entropy: 0.21268782019615173, mean accuracy: 0.928672730922699


 86%|████████▌ | 257/300 [12:29:05<2:08:55, 179.88s/it]

Training...
mean cross_entropy: 0.21400557458400726, mean accuracy: 0.9259636402130127


 86%|████████▌ | 258/300 [12:31:54<2:03:39, 176.66s/it]

Training...
mean cross_entropy: 0.21615907549858093, mean accuracy: 0.9265636205673218


 86%|████████▋ | 259/300 [12:34:43<1:59:09, 174.38s/it]

Training...
mean cross_entropy: 0.21524576842784882, mean accuracy: 0.9267818331718445
Validation...
mean cross_entropy: 0.20813317596912384, mean accuracy: 0.9236000180244446


 87%|████████▋ | 260/300 [12:37:36<1:55:58, 173.95s/it]

Training...
mean cross_entropy: 0.21352946758270264, mean accuracy: 0.928600013256073


 87%|████████▋ | 261/300 [12:40:26<1:52:18, 172.78s/it]

Training...
mean cross_entropy: 0.21620124578475952, mean accuracy: 0.9272000193595886


 87%|████████▋ | 262/300 [12:43:16<1:48:56, 172.01s/it]

Training...
mean cross_entropy: 0.21590064465999603, mean accuracy: 0.926800012588501


 88%|████████▊ | 263/300 [12:46:07<1:45:43, 171.45s/it]

Training...
mean cross_entropy: 0.2156447023153305, mean accuracy: 0.9271273016929626


 88%|████████▊ | 264/300 [12:48:56<1:42:32, 170.89s/it]

Training...
mean cross_entropy: 0.2161027193069458, mean accuracy: 0.9254000186920166


 88%|████████▊ | 265/300 [12:51:46<1:39:28, 170.54s/it]

Training...
mean cross_entropy: 0.21682177484035492, mean accuracy: 0.9261999726295471


 89%|████████▊ | 266/300 [12:54:36<1:36:31, 170.33s/it]

Training...
mean cross_entropy: 0.21548406779766083, mean accuracy: 0.9272727370262146


 89%|████████▉ | 267/300 [12:57:26<1:33:41, 170.36s/it]

Training...
mean cross_entropy: 0.2120896726846695, mean accuracy: 0.9289454817771912


 89%|████████▉ | 268/300 [13:00:18<1:31:07, 170.87s/it]

Training...
mean cross_entropy: 0.21592450141906738, mean accuracy: 0.9269454479217529


 90%|████████▉ | 269/300 [13:03:10<1:28:22, 171.06s/it]

Training...
mean cross_entropy: 0.212208092212677, mean accuracy: 0.9284545183181763


 90%|█████████ | 270/300 [13:06:02<1:25:46, 171.56s/it]

Training...
mean cross_entropy: 0.2147158533334732, mean accuracy: 0.9284363389015198


 90%|█████████ | 271/300 [13:08:53<1:22:47, 171.29s/it]

Training...
mean cross_entropy: 0.2137208878993988, mean accuracy: 0.9287818074226379


 91%|█████████ | 272/300 [13:11:43<1:19:46, 170.93s/it]

Training...
mean cross_entropy: 0.21506543457508087, mean accuracy: 0.9269818067550659


 91%|█████████ | 273/300 [13:14:33<1:16:46, 170.62s/it]

Training...
mean cross_entropy: 0.21493913233280182, mean accuracy: 0.9273454546928406


 91%|█████████▏| 274/300 [13:17:26<1:14:15, 171.37s/it]

Training...
mean cross_entropy: 0.21546736359596252, mean accuracy: 0.927472710609436


 92%|█████████▏| 275/300 [13:20:38<1:13:58, 177.54s/it]

Training...
mean cross_entropy: 0.21315351128578186, mean accuracy: 0.9272909164428711


 92%|█████████▏| 276/300 [13:23:50<1:12:47, 181.97s/it]

Training...
mean cross_entropy: 0.2141139954328537, mean accuracy: 0.9279090762138367


 92%|█████████▏| 277/300 [13:27:02<1:10:51, 184.83s/it]

Training...
mean cross_entropy: 0.21446789801120758, mean accuracy: 0.9274545311927795


 93%|█████████▎| 278/300 [13:29:56<1:06:35, 181.61s/it]

Training...
mean cross_entropy: 0.21313419938087463, mean accuracy: 0.9282363653182983


 93%|█████████▎| 279/300 [13:32:50<1:02:45, 179.31s/it]

Training...
mean cross_entropy: 0.21426984667778015, mean accuracy: 0.9272363781929016
Validation...
mean cross_entropy: 0.20643523335456848, mean accuracy: 0.9258000254631042


 93%|█████████▎| 280/300 [13:35:48<59:37, 178.87s/it]  

Training...
mean cross_entropy: 0.2147994339466095, mean accuracy: 0.9266545176506042


 94%|█████████▎| 281/300 [13:38:43<56:14, 177.62s/it]

Training...
mean cross_entropy: 0.21456418931484222, mean accuracy: 0.9270363450050354


 94%|█████████▍| 282/300 [13:41:37<53:00, 176.70s/it]

Training...
mean cross_entropy: 0.21153156459331512, mean accuracy: 0.9281272888183594


 94%|█████████▍| 283/300 [13:44:34<50:02, 176.65s/it]

Training...
mean cross_entropy: 0.21482056379318237, mean accuracy: 0.9270545244216919


 95%|█████████▍| 284/300 [13:47:28<46:54, 175.93s/it]

Training...
mean cross_entropy: 0.21469974517822266, mean accuracy: 0.9271090626716614


 95%|█████████▌| 285/300 [13:50:18<43:32, 174.16s/it]

Training...
mean cross_entropy: 0.2123405784368515, mean accuracy: 0.9287636280059814


 95%|█████████▌| 286/300 [13:53:08<40:20, 172.92s/it]

Training...
mean cross_entropy: 0.21382875740528107, mean accuracy: 0.9278363585472107


 96%|█████████▌| 287/300 [13:55:58<37:16, 172.07s/it]

Training...
mean cross_entropy: 0.21510247886180878, mean accuracy: 0.9260545372962952


 96%|█████████▌| 288/300 [13:58:48<34:17, 171.42s/it]

Training...
mean cross_entropy: 0.2133244425058365, mean accuracy: 0.928745448589325


 96%|█████████▋| 289/300 [14:01:38<31:22, 171.12s/it]

Training...
mean cross_entropy: 0.2137003242969513, mean accuracy: 0.928672730922699


 97%|█████████▋| 290/300 [14:04:28<28:27, 170.77s/it]

Training...
mean cross_entropy: 0.21404314041137695, mean accuracy: 0.9272000193595886


 97%|█████████▋| 291/300 [14:07:19<25:36, 170.72s/it]

Training...
mean cross_entropy: 0.212672621011734, mean accuracy: 0.928745448589325


 97%|█████████▋| 292/300 [14:10:11<22:48, 171.12s/it]

Training...
mean cross_entropy: 0.21169544756412506, mean accuracy: 0.9289818406105042


 98%|█████████▊| 293/300 [14:13:03<20:00, 171.45s/it]

Training...
mean cross_entropy: 0.2145676612854004, mean accuracy: 0.9274545311927795


 98%|█████████▊| 294/300 [14:15:55<17:09, 171.61s/it]

Training...
mean cross_entropy: 0.2153862863779068, mean accuracy: 0.9272181987762451


 98%|█████████▊| 295/300 [14:18:49<14:21, 172.24s/it]

Training...
mean cross_entropy: 0.21512769162654877, mean accuracy: 0.9275272488594055


 99%|█████████▊| 296/300 [14:21:43<11:31, 172.80s/it]

Training...
mean cross_entropy: 0.21168223023414612, mean accuracy: 0.9278545379638672


 99%|█████████▉| 297/300 [14:24:37<08:39, 173.23s/it]

Training...
mean cross_entropy: 0.21291331946849823, mean accuracy: 0.9280363917350769


 99%|█████████▉| 298/300 [14:27:31<05:46, 173.32s/it]

Training...
mean cross_entropy: 0.2141805738210678, mean accuracy: 0.9280909299850464


100%|█████████▉| 299/300 [14:30:22<02:52, 172.79s/it]

Training...
mean cross_entropy: 0.21071234345436096, mean accuracy: 0.9279090762138367
Validation...
mean cross_entropy: 0.20586206018924713, mean accuracy: 0.9282000064849854


100%|██████████| 300/300 [14:33:17<00:00, 173.29s/it]
