In [1]:
# -*- coding: utf-8 -*-
"""
Created on Fri Nov  2 10:16:26 2018

@author: PISME_Public01

"""
"""
方差缩放初始化？
https://zhuanlan.zhihu.com/p/38315135
"""

import numpy as np
import tensorflow as tf
from tqdm import trange, tqdm
from PIL import Image
import matplotlib.pyplot as plt
import os
import shutil

from tensorflow.examples.tutorials.mnist import input_data

class DenseNet(object):
    def __init__(self, datasets, growth_rate, depth, 
                 total_blocks, keep_prob, weight_decay, nesterov_momentum, model_type, dataset_name,
                 should_save_logs, should_save_model, 
                 renew_logs=False,
                 reduction=1.0,
                 bc_mode=False,
                 **kwargs):
        
        self.datasets = datasets
        if dataset_name == 'fashion-mnist':
            self.data_shape = (28, 28, 1)
            self.n_classes = 10
        elif dataset_name == 'mnist':
            self.data_shape = (28, 28, 1)
            self.n_classes = 10
        
        self.depth = depth
        self.growth_rate = growth_rate
        self.first_output_features = growth_rate * 2
        self.total_blocks = total_blocks
        self.layers_per_block = (depth - (total_blocks + 1)) // total_blocks
        self.bc_mode = bc_mode
        self.reduction = reduction
        
        if not bc_mode:
            print(f"Build {model_type} model with {self.total_blocks} blocks, "
                  f"{self.layers_per_block} composite layers each.")
        else:
            self.layers_per_block = self.layers_per_block // 2
            print(f"Build {model_type} model with {self.total_blocks} blocks, "
                  f"{self.layers_per_block} bottleneck layers and {self.layers_per_block} composite layers each.")
        
        print(f"Reduction at transition layers: {self.reduction}")
        
        self.keep_prob = keep_prob
        self.weight_decay = weight_decay
        self.nesterov_momentum = nesterov_momentum
        self.model_type = model_type
        self.dataset_name = dataset_name
        self.should_save_logs = should_save_logs
        self.should_save_model = should_save_model
        self.renew_logs = renew_logs
        self.batches_step = 0
        
        self._define_inputs()
        self._build_graph()
        self._initialize_session()
        self._count_trainable_params()
    
    def _initialize_session(self):
        
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())
        logswriter = tf.summary.FileWriter
        
        self.saver = tf.train.Saver()
        self.summary_weiter = logswriter(self.logs_path)
        
    def _count_trainable_params(self):
        
        total_parameters = 0
        for variable in tf.trainable_variables():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        print(f"Total training params: {total_parameters / 1e6}M")
    
    @property
    def save_path(self):
        
        try:
            save_path = self._save_path
        except AttributeError:
            save_path = f"saves/{self.model_identifier}"
            os.makedirs(save_path, exist_ok=True)
            save_path = os.path.join(save_path, "model.ckpt")
            self._save_path = save_path
        return save_path
    
    @property
    def logs_path(self):
        
        try:
            logs_path = self._logs_path
        except AttributeError:
            logs_path = f"logs/{self.model_identifier}"
            if self.renew_logs:
                shutil.rmtree(logs_path, ignore_errors=True)
            os.makedirs(logs_path, exist_ok=True)
            self._logs_path = logs_path
        return logs_path
    
    @property
    def model_identifier(self):
        
        return f"{self.model_type}_growth_rate={self.growth_rate}_depth={self.depth}_dataset_{self.dataset_name}"
    
    def save_model(self, global_step=None):
        
        self.saver.save(self.sess, self.save_path, global_step=global_step)
    
    def load_model(self):
        
        try:
            self.saver.restore(self.sess, self.save_path)
        except Exception as e:
            raise IOError("Failed to load model "
                          f"from save path: {self.save_path}")
        self.saver.restore(self.sess, self.save_path)
        print(f"Successfully load model from save path: {self.save_path}")
    
    def log_loss_accuracy(self, loss, accuracy, epoch, prefix, should_print=True):
        
        if should_print:
            print(f"mean cross_entropy: {loss}, mean accuracy: {accuracy}")
        summary = tf.Summary(value=[
                tf.Summary.Value(
                        tag=f'loss_{prefix}', simple_value=float(loss)),
                tf.Summary.Value(
                        tag=f'accuracy_{prefix}', simple_value=float(accuracy))
                ])
        self.summary_weiter.add_summary(summary)
        
    def _define_inputs(self):
        
        shape = [None]
        shape.extend(self.data_shape)
        self.images = tf.placeholder(
                tf.float32, 
                shape=shape,
                name='input_images')
        
        self.labels = tf.placeholder(
                tf.float32,
                shape=[None, self.n_classes],
                name='labels')
        
        self.learning_rate = tf.placeholder(
                tf.float32,
                shape=[],
                name='learning_rate')
        
        self.is_training = tf.placeholder(
                tf.bool,
                shape=[])
        
    def composite_function(self, _input, out_features, kernel_size=3):
        
        with tf.variable_scope("composite_function"):
            # BN
            output = self.batch_norm(_input)
            # ReLu
            output = tf.nn.relu(output)
            # convolution
            output = self.conv2d(
                    output, out_features=out_features, kernel_size=kernel_size)
            # dropout
            output = self.dropout(output)
        return output
            
    def bottleneck(self, _input, out_features):
        
        with tf.variable_scope("bottleneck"):
            output = self.batch_norm(_input)
            output = tf.nn.relu(output)
            inter_features = out_features * 4
            output = self.conv2d(
                    output, out_features=inter_features, kernel_size=1,
                    padding='VALID')
            output = self.dropout(output)
        return output
    
    def add_internal_layer(self, _input, growth_rate):
        
        if not self.bc_mode:
            comp_out = self.composite_function(
                    _input, out_features=growth_rate, kernel_size=3)
        elif self.bc_mode:
            bottleneck_out = self.bottleneck(
                    _input, out_features=growth_rate)
            comp_out = self.composite_function(
                    bottleneck_out, out_features=growth_rate, kernel_size=3)
            
        output = tf.concat(axis=3, values=(_input, comp_out))
        return output
        
    def add_block(self, _input, growth_rate, layers_per_block):
        
        output = _input
        for layer in range(layers_per_block):
            with tf.variable_scope(f"layer_{layer}"):
                output = self.add_internal_layer(
                        output, growth_rate)
        return output
    
    def trainsition_layer(self, _input):
        
        out_features = int(int(_input.get_shape()[-1]) * self.reduction)
        output = self.composite_function(
                _input, out_features, kernel_size=1)
        output = self.avg_pool(_input, 2)
        return output
    
    def transition_layer_to_classes(self, _input):
        
        # BN
        output = self.batch_norm(_input)
        # ReLu
        output = tf.nn.relu(output)
        # average pooling by channel
        last_pool_kernel = int(output.get_shape()[-2])
        output = self.avg_pool(output, k=last_pool_kernel)
        # FC
        features_total = int(output.get_shape()[-1])
        output = tf.reshape(output, [-1, features_total])
        W  = self.weight_variable_xavier(
                [features_total, self.n_classes], name='W')
        bias = self.bias_variable([self.n_classes])
        logits = tf.matmul(output, W) + bias
        return logits
        
    def conv2d(self, _input, out_features, kernel_size, 
               strides=[1, 1, 1, 1], padding='SAME'):
        
        in_features = int(_input.get_shape()[-1])
        kernel = self.weight_variable_msra(
                [kernel_size, kernel_size, in_features, out_features], name='kernel1')
        output = tf.nn.conv2d(_input, kernel, strides, padding)
        return output
    
    def avg_pool(self, _input, k):
        
        ksize = [1, k, k, 1]
        strides = [1, k, k, 1]
        padding = 'VALID'
        output = tf.nn.avg_pool(_input, ksize, strides, padding)
        return output
    
    def batch_norm(self, _input):
        
        output = tf.contrib.layers.batch_norm(
                _input, scale=True, is_training=self.is_training,
                updates_collections=None)
        return output
    
    def dropout(self, _input):
        
        if self.keep_prob < 1:
            output = tf.cond(
                    self.is_training,
                    lambda: tf.nn.dropout(_input, self.keep_prob),
                    lambda: _input
            )
        else:
            output = _input
        return output
    
    
    def weight_variable_msra(self, shape, name):
        
        return tf.get_variable(
                name=name, 
                shape=shape,
                initializer=tf.contrib.layers.variance_scaling_initializer())
    
    def weight_variable_xavier(self, shape, name):
        
        return tf.get_variable(
                name=name,
                shape=shape,
                initializer=tf.contrib.layers.xavier_initializer())
    
    def bias_variable(self, shape, name='bias'):
        
        inital = tf.constant(0.0, shape=shape)
        return tf.get_variable(name, initializer=inital)
    
    
    def _build_graph(self):
        
        growth_rate = self.growth_rate
        layers_per_block = self.layers_per_block
        # first 
        with tf.variable_scope("Initial_convolution"):
            output = self.conv2d(
                    self.images,
                    out_features=self.first_output_features,
                    kernel_size=3)
        
        for block in range(self.total_blocks):
            with tf.variable_scope(f"Block_{block}"):
                output = self.add_block(
                        output, 
                        growth_rate,
                        layers_per_block)
            if block != self.total_blocks - 1:
                with tf.variable_scope(f"Transition_after_block_{block}"):
                    output = self.trainsition_layer(output)
                
        with tf.variable_scope("Transition_to_classes"):
            logits = self.transition_layer_to_classes(output)
        prediction = tf.nn.softmax(logits)
        
        # Losses
        cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=self.labels))
        self.cross_entropy = cross_entropy
        l2_loss = tf.add_n([tf.nn.l2_loss(var) for var in tf.trainable_variables()])
        
        #optimizer = tf.train.MomentumOptimizer(
        #        self.learning_rate, self.nesterov_momentum, use_nesterov=True)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_step = optimizer.minimize(
                cross_entropy + l2_loss * self.weight_decay)
        
        correct_prediction = tf.equal(
                tf.arg_max(prediction, 1), 
                tf.arg_max(self.labels, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        
    def train_all_epochs(self, train_params):
        
        n_epochs = train_params['n_epochs']
        learning_rate = train_params['initial_learning_rate']
        batch_size = train_params['batch_size']
        reduce_lr_epoch_1 = train_params['reduce_lr_epoch_1']
        reduce_lr_epoch_2 = train_params['reduce_lr_epoch_2']
        for epoch in trange(1, n_epochs + 1):
            if epoch == reduce_lr_epoch_1 or epoch == reduce_lr_epoch_2:
                learning_rate = learning_rate / 10
                print(f"Decrease learning rate, new lr = {learning_rate}")
            
            print("Training...")
            loss, acc = self.train_one_epoch(
                    self.datasets.train, batch_size, learning_rate)
            if self.should_save_logs:
                self.log_loss_accuracy(loss, acc, epoch, prefix='train')
            
            if train_params.get('validation_set', False) and epoch % 20==0:
                print("Validation...")
                loss, acc =self.test(self.datasets.validation, batch_size)
                if self.should_save_logs:
                    self.log_loss_accuracy(loss, acc, epoch, prefix='valid')
            
            if self.should_save_model:
                self.save_model()
    
    
    def train_one_epoch(self, data, batch_size, learning_rate):
        
        num_examples = data.num_examples
        total_loss = []
        total_accuracy = []
        for i in range(num_examples // batch_size):
            batch = data.next_batch(batch_size)
            images, labels = batch
            shape = [-1]
            shape.extend(self.data_shape)
            images = images.reshape(shape)
            feed_dict = {
                    self.images: images, 
                    self.labels: labels,
                    self.learning_rate: learning_rate,
                    self.is_training: True,
            }
            fetches = [self.train_step, self.cross_entropy, self.accuracy]
            result = self.sess.run(fetches, feed_dict=feed_dict)
            _, loss, accuracy = result
            total_loss.append(loss)
            total_accuracy.append(accuracy)
            if self.should_save_logs:
                self.batches_step += 1
                self.log_loss_accuracy(
                        loss, accuracy, self.batches_step, prefix='per_batch',
                        should_print=False)
        mean_loss = np.mean(total_loss)
        mean_accuracy = np.mean(total_accuracy)
        return mean_loss, mean_accuracy
            
    def test(self, data, batch_size):
        
        num_examples = data.num_examples
        total_loss = []
        total_accuracy = []
        for i in range(num_examples // batch_size):
            batch = data.next_batch(batch_size)
            images, labels = batch
            shape = [-1]
            shape.extend(self.data_shape)
            images = images.reshape(shape)
            feed_dict = {
                    self.images: images,
                    self.labels: labels,
                    self.is_training: False
            }
            fetches = [self.cross_entropy, self.accuracy]
            loss, accuracy = self.sess.run(
                    fetches, feed_dict=feed_dict)
            total_loss.append(loss)
            total_accuracy.append(accuracy)
        mean_loss = np.mean(total_loss)
        mean_accuracy = np.mean(total_accuracy)
        return mean_loss, mean_accuracy
            
if __name__ == '__main__':
    
    # data = input_data.read_data_sets('data/fashion')
    data = input_data.read_data_sets('data/fashion', source_url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/', one_hot=True)
    
    params = {
            'datasets': data,
            'growth_rate': 12,
            'depth': 40,
            'total_blocks': 3,
            'keep_prob': 0.8,
            'weight_decay': 0.0005,
            'nesterov_momentum': 1 ,
            'model_type': 'DenseNet_BC',
            'dataset_name': 'fashion-mnist',
            'should_save_logs': True,
            'should_save_model': True}
    model = DenseNet(
            params['datasets'], params['growth_rate'], params['depth'],
            params['total_blocks'], params['keep_prob'], params['weight_decay'],
            params['nesterov_momentum'], params['model_type'], params['dataset_name'],
            params['should_save_logs'], params['should_save_model'], reduction=0.5, renew_logs=True, bc_mode=True)
    
    train_params = {
            'n_epochs': 300,
            'initial_learning_rate': 0.1,
            'batch_size': 32,
            'reduce_lr_epoch_1': 100,
            'reduce_lr_epoch_2': 200,
            'validation_set': True}
    
    model.train_all_epochs(train_params)       
        

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


Extracting data/fashion/train-images-idx3-ubyte.gz
Extracting data/fashion/train-labels-idx1-ubyte.gz
Extracting data/fashion/t10k-images-idx3-ubyte.gz
Extracting data/fashion/t10k-labels-idx1-ubyte.gz
Build DenseNet_BC model with 3 blocks, 6 bottleneck layers and 6 composite layers each.
Reduction at transition layers: 0.5
Instructions for updating:
Use `argmax` instead


  0%|          | 0/300 [00:00<?, ?it/s]

Total training params: 0.230794M
Training...
mean cross_entropy: 1.097834825515747, mean accuracy: 0.5897118449211121


  0%|          | 1/300 [00:58<4:50:28, 58.29s/it]

Training...
mean cross_entropy: 1.0126304626464844, mean accuracy: 0.6209618449211121


  1%|          | 2/300 [01:55<4:47:45, 57.94s/it]

Training...
mean cross_entropy: 1.0228843688964844, mean accuracy: 0.6168327927589417


  1%|          | 3/300 [02:52<4:45:27, 57.67s/it]

Training...
mean cross_entropy: 1.0229977369308472, mean accuracy: 0.6143772006034851


  1%|▏         | 4/300 [03:49<4:43:35, 57.49s/it]

Training...
mean cross_entropy: 1.0103710889816284, mean accuracy: 0.6214348077774048


  2%|▏         | 5/300 [04:46<4:41:55, 57.34s/it]

Training...
mean cross_entropy: 1.0196638107299805, mean accuracy: 0.6167418360710144


  2%|▏         | 6/300 [05:43<4:40:26, 57.23s/it]

Training...
mean cross_entropy: 1.0167113542556763, mean accuracy: 0.6214711666107178


  2%|▏         | 7/300 [06:40<4:39:07, 57.16s/it]

Training...
mean cross_entropy: 1.0060333013534546, mean accuracy: 0.6249454021453857


  3%|▎         | 8/300 [07:37<4:37:48, 57.08s/it]

Training...
mean cross_entropy: 0.9987244606018066, mean accuracy: 0.6258367300033569


  3%|▎         | 9/300 [08:34<4:36:58, 57.11s/it]

Training...
mean cross_entropy: 1.0149109363555908, mean accuracy: 0.621762216091156


  3%|▎         | 10/300 [09:31<4:35:41, 57.04s/it]

Training...
mean cross_entropy: 1.00667405128479, mean accuracy: 0.624399721622467


  4%|▎         | 11/300 [10:28<4:34:38, 57.02s/it]

Training...
mean cross_entropy: 1.0116785764694214, mean accuracy: 0.6211801767349243


  4%|▍         | 12/300 [11:25<4:33:46, 57.04s/it]

Training...
mean cross_entropy: 1.0398575067520142, mean accuracy: 0.6067193150520325


  4%|▍         | 13/300 [12:22<4:32:51, 57.05s/it]

Training...
mean cross_entropy: 1.022675633430481, mean accuracy: 0.6108665466308594


  5%|▍         | 14/300 [13:19<4:31:41, 57.00s/it]

Training...
mean cross_entropy: 1.030190110206604, mean accuracy: 0.6111757755279541


  5%|▌         | 15/300 [14:16<4:30:44, 57.00s/it]

Training...
mean cross_entropy: 1.0235024690628052, mean accuracy: 0.6133403778076172


  5%|▌         | 16/300 [15:13<4:29:59, 57.04s/it]

Training...
mean cross_entropy: 1.0389697551727295, mean accuracy: 0.6065555810928345


  6%|▌         | 17/300 [16:10<4:29:15, 57.08s/it]

Training...
mean cross_entropy: 1.0381320714950562, mean accuracy: 0.6071194410324097


  6%|▌         | 18/300 [17:07<4:28:15, 57.08s/it]

Training...
mean cross_entropy: 1.0317747592926025, mean accuracy: 0.6119033694267273


  6%|▋         | 19/300 [18:04<4:27:14, 57.06s/it]

Training...
mean cross_entropy: 1.0824521780014038, mean accuracy: 0.5915853977203369
Validation...
mean cross_entropy: 25.79124641418457, mean accuracy: 0.0989583358168602


  7%|▋         | 20/300 [19:03<4:28:04, 57.45s/it]

Training...
mean cross_entropy: 1.0870777368545532, mean accuracy: 0.5923857688903809


  7%|▋         | 21/300 [20:00<4:26:29, 57.31s/it]

Training...
mean cross_entropy: 1.0334954261779785, mean accuracy: 0.6156322956085205


  7%|▋         | 22/300 [20:57<4:25:09, 57.23s/it]

Training...
mean cross_entropy: 1.0205355882644653, mean accuracy: 0.6225079894065857


  8%|▊         | 23/300 [21:54<4:23:58, 57.18s/it]

Training...
mean cross_entropy: 1.0409926176071167, mean accuracy: 0.6144863367080688


  8%|▊         | 24/300 [22:51<4:22:51, 57.14s/it]

Training...
mean cross_entropy: 1.0238794088363647, mean accuracy: 0.619361162185669


  8%|▊         | 25/300 [23:48<4:21:50, 57.13s/it]

Training...
mean cross_entropy: 1.0167208909988403, mean accuracy: 0.6252546310424805


  9%|▊         | 26/300 [24:45<4:20:51, 57.12s/it]

Training...
mean cross_entropy: 1.027093529701233, mean accuracy: 0.615886926651001


  9%|▉         | 27/300 [25:45<4:24:05, 58.04s/it]

Training...
mean cross_entropy: 1.0224610567092896, mean accuracy: 0.6212528944015503


  9%|▉         | 28/300 [26:45<4:26:08, 58.71s/it]

Training...
mean cross_entropy: 1.0317790508270264, mean accuracy: 0.6172693371772766


 10%|▉         | 29/300 [27:46<4:27:01, 59.12s/it]

Training...
mean cross_entropy: 1.0112366676330566, mean accuracy: 0.6256912350654602


 10%|█         | 30/300 [28:46<4:27:21, 59.41s/it]

Training...
mean cross_entropy: 1.017600178718567, mean accuracy: 0.6244361400604248


 10%|█         | 31/300 [29:46<4:27:38, 59.70s/it]

Training...
mean cross_entropy: 1.0130975246429443, mean accuracy: 0.6255275011062622


 11%|█         | 32/300 [30:46<4:27:40, 59.93s/it]

Training...
mean cross_entropy: 1.0062617063522339, mean accuracy: 0.6251454949378967


 11%|█         | 33/300 [31:47<4:27:01, 60.00s/it]

Training...
mean cross_entropy: 1.0122792720794678, mean accuracy: 0.6271281838417053


 11%|█▏        | 34/300 [32:47<4:26:25, 60.10s/it]

Training...
mean cross_entropy: 1.0084949731826782, mean accuracy: 0.6268917322158813


 12%|█▏        | 35/300 [33:47<4:25:30, 60.11s/it]

Training...
mean cross_entropy: 1.0211879014968872, mean accuracy: 0.6215075850486755


 12%|█▏        | 36/300 [34:47<4:24:38, 60.14s/it]

Training...
mean cross_entropy: 1.0077441930770874, mean accuracy: 0.62800133228302


 12%|█▏        | 37/300 [35:44<4:19:38, 59.23s/it]

Training...
mean cross_entropy: 1.008834958076477, mean accuracy: 0.627873957157135


 13%|█▎        | 38/300 [36:41<4:15:37, 58.54s/it]

Training...
mean cross_entropy: 1.0141724348068237, mean accuracy: 0.6262369155883789


 13%|█▎        | 39/300 [37:38<4:12:37, 58.08s/it]

Training...
mean cross_entropy: 1.019834280014038, mean accuracy: 0.6258185505867004
Validation...
mean cross_entropy: 86.2596435546875, mean accuracy: 0.10036057978868484


 13%|█▎        | 40/300 [38:37<4:11:50, 58.12s/it]

Training...
mean cross_entropy: 1.0339313745498657, mean accuracy: 0.61568683385849


 14%|█▎        | 41/300 [39:34<4:09:30, 57.80s/it]

Training...
mean cross_entropy: 1.0230101346969604, mean accuracy: 0.6193975806236267


 14%|█▍        | 42/300 [40:31<4:07:28, 57.55s/it]

Training...
mean cross_entropy: 1.0129934549331665, mean accuracy: 0.625200092792511


 14%|█▍        | 43/300 [41:28<4:06:02, 57.44s/it]

Training...
mean cross_entropy: 1.0580480098724365, mean accuracy: 0.6018990278244019


 15%|█▍        | 44/300 [42:25<4:04:37, 57.33s/it]

Training...
mean cross_entropy: 1.0362770557403564, mean accuracy: 0.6135586500167847


 15%|█▌        | 45/300 [43:22<4:03:21, 57.26s/it]

Training...
mean cross_entropy: 1.0059970617294312, mean accuracy: 0.6288926005363464


 15%|█▌        | 46/300 [44:19<4:02:07, 57.19s/it]

Training...
mean cross_entropy: 1.0013421773910522, mean accuracy: 0.6297293305397034


 16%|█▌        | 47/300 [45:16<4:00:59, 57.15s/it]

Training...
mean cross_entropy: 1.0120644569396973, mean accuracy: 0.6234356760978699


 16%|█▌        | 48/300 [46:13<4:00:10, 57.18s/it]

Training...
mean cross_entropy: 1.0078728199005127, mean accuracy: 0.6287834644317627


 16%|█▋        | 49/300 [47:10<3:59:04, 57.15s/it]

Training...
mean cross_entropy: 1.016607642173767, mean accuracy: 0.6250181794166565


 17%|█▋        | 50/300 [48:08<3:58:05, 57.14s/it]

Training...
mean cross_entropy: 0.9995772242546082, mean accuracy: 0.6309298872947693


 17%|█▋        | 51/300 [49:05<3:57:11, 57.16s/it]

Training...
mean cross_entropy: 1.0075269937515259, mean accuracy: 0.6280195116996765


 17%|█▋        | 52/300 [50:02<3:56:20, 57.18s/it]

Training...
mean cross_entropy: 1.0063472986221313, mean accuracy: 0.6279103755950928


 18%|█▊        | 53/300 [50:59<3:55:07, 57.11s/it]

Training...
mean cross_entropy: 1.005046010017395, mean accuracy: 0.6293655633926392


 18%|█▊        | 54/300 [51:56<3:54:08, 57.11s/it]

Training...
mean cross_entropy: 0.9966884255409241, mean accuracy: 0.6326397061347961


 18%|█▊        | 55/300 [52:53<3:53:18, 57.14s/it]

Training...
mean cross_entropy: 0.9880956411361694, mean accuracy: 0.6350407600402832


 19%|█▊        | 56/300 [53:50<3:52:16, 57.12s/it]

Training...
mean cross_entropy: 0.9906715154647827, mean accuracy: 0.6345677971839905


 19%|█▉        | 57/300 [54:47<3:51:17, 57.11s/it]

Training...
mean cross_entropy: 1.0065053701400757, mean accuracy: 0.6298384666442871


 19%|█▉        | 58/300 [55:45<3:50:23, 57.12s/it]

Training...
mean cross_entropy: 1.0015920400619507, mean accuracy: 0.630911648273468


 20%|█▉        | 59/300 [56:42<3:49:27, 57.13s/it]

Training...
mean cross_entropy: 1.0078346729278564, mean accuracy: 0.6261823177337646
Validation...
mean cross_entropy: 12.05134105682373, mean accuracy: 0.18429486453533173


 20%|██        | 60/300 [57:40<3:49:36, 57.40s/it]

Training...
mean cross_entropy: 1.0031174421310425, mean accuracy: 0.6283105611801147


 20%|██        | 61/300 [58:37<3:48:12, 57.29s/it]

Training...
mean cross_entropy: 1.0215779542922974, mean accuracy: 0.6205071210861206


 21%|██        | 62/300 [59:34<3:46:55, 57.21s/it]

Training...
mean cross_entropy: 1.003490924835205, mean accuracy: 0.6284560561180115


 21%|██        | 63/300 [1:00:31<3:45:45, 57.15s/it]

Training...
mean cross_entropy: 0.9959203600883484, mean accuracy: 0.6312572956085205


 21%|██▏       | 64/300 [1:01:28<3:44:42, 57.13s/it]

Training...
mean cross_entropy: 1.0233222246170044, mean accuracy: 0.6218895316123962


 22%|██▏       | 65/300 [1:02:25<3:43:39, 57.11s/it]

Training...
mean cross_entropy: 1.0122830867767334, mean accuracy: 0.622126042842865


 22%|██▏       | 66/300 [1:03:22<3:42:42, 57.10s/it]

Training...
mean cross_entropy: 1.0312045812606812, mean accuracy: 0.6190701127052307


 22%|██▏       | 67/300 [1:04:19<3:41:44, 57.10s/it]

Training...
mean cross_entropy: 1.0083192586898804, mean accuracy: 0.6297839283943176


 23%|██▎       | 68/300 [1:05:17<3:41:21, 57.25s/it]

Training...
mean cross_entropy: 1.0045086145401, mean accuracy: 0.6269826889038086


 23%|██▎       | 69/300 [1:06:15<3:41:32, 57.54s/it]

Training...
mean cross_entropy: 1.0051995515823364, mean accuracy: 0.6259458661079407


 23%|██▎       | 70/300 [1:07:13<3:41:16, 57.72s/it]

Training...
mean cross_entropy: 1.0359017848968506, mean accuracy: 0.6154685616493225


 24%|██▎       | 71/300 [1:08:11<3:40:44, 57.84s/it]

Training...
mean cross_entropy: 1.0603418350219727, mean accuracy: 0.6064646244049072


 24%|██▍       | 72/300 [1:09:10<3:40:30, 58.03s/it]

Training...
mean cross_entropy: 1.061813473701477, mean accuracy: 0.6046456694602966


 24%|██▍       | 73/300 [1:10:08<3:39:35, 58.04s/it]

Training...
mean cross_entropy: 1.0543630123138428, mean accuracy: 0.6075924038887024


 25%|██▍       | 74/300 [1:11:06<3:38:48, 58.09s/it]

Training...
mean cross_entropy: 1.057349443435669, mean accuracy: 0.6041545271873474


 25%|██▌       | 75/300 [1:12:04<3:37:42, 58.06s/it]

Training...
mean cross_entropy: 1.0507646799087524, mean accuracy: 0.6011532545089722


 25%|██▌       | 76/300 [1:13:02<3:36:59, 58.12s/it]

Training...
mean cross_entropy: 1.0346198081970215, mean accuracy: 0.6103754639625549


 26%|██▌       | 77/300 [1:14:00<3:35:59, 58.12s/it]

Training...
mean cross_entropy: 1.0504658222198486, mean accuracy: 0.6052641272544861


 26%|██▌       | 78/300 [1:14:58<3:34:49, 58.06s/it]

Training...
mean cross_entropy: 1.0510696172714233, mean accuracy: 0.6038452982902527


 26%|██▋       | 79/300 [1:15:55<3:32:50, 57.78s/it]

Training...
mean cross_entropy: 1.0179742574691772, mean accuracy: 0.6149228811264038
Validation...
mean cross_entropy: 4.177855968475342, mean accuracy: 0.09134615212678909


 27%|██▋       | 80/300 [1:16:54<3:32:21, 57.91s/it]

Training...
mean cross_entropy: 1.0504555702209473, mean accuracy: 0.6053550839424133


 27%|██▋       | 81/300 [1:17:51<3:30:32, 57.68s/it]

Training...
mean cross_entropy: 1.035586953163147, mean accuracy: 0.6110484600067139


 27%|██▋       | 82/300 [1:18:48<3:29:06, 57.55s/it]

Training...
mean cross_entropy: 1.0969759225845337, mean accuracy: 0.5789799094200134


 28%|██▊       | 83/300 [1:19:45<3:27:47, 57.45s/it]

Training...
mean cross_entropy: 1.069018006324768, mean accuracy: 0.5976426005363464


 28%|██▊       | 84/300 [1:20:42<3:26:34, 57.38s/it]

Training...
mean cross_entropy: 1.0469800233840942, mean accuracy: 0.6021536588668823


 28%|██▊       | 85/300 [1:21:40<3:25:32, 57.36s/it]

Training...
mean cross_entropy: 1.0212208032608032, mean accuracy: 0.6137405633926392


 29%|██▊       | 86/300 [1:22:37<3:24:24, 57.31s/it]

Training...
mean cross_entropy: 1.013736367225647, mean accuracy: 0.6153957843780518


 29%|██▉       | 87/300 [1:23:34<3:23:16, 57.26s/it]

Training...
mean cross_entropy: 1.0348576307296753, mean accuracy: 0.603263258934021


 29%|██▉       | 88/300 [1:24:31<3:22:08, 57.21s/it]

Training...
mean cross_entropy: 1.091996192932129, mean accuracy: 0.5766516327857971


 30%|██▉       | 89/300 [1:25:28<3:20:59, 57.15s/it]

Training...
mean cross_entropy: 1.1249042749404907, mean accuracy: 0.5548784732818604


 30%|███       | 90/300 [1:26:25<3:19:43, 57.06s/it]

Training...
mean cross_entropy: 1.1093878746032715, mean accuracy: 0.559826135635376


 30%|███       | 91/300 [1:27:22<3:18:30, 56.99s/it]

Training...
mean cross_entropy: 1.1128041744232178, mean accuracy: 0.5614268183708191


 31%|███       | 92/300 [1:28:19<3:17:34, 56.99s/it]

Training...
mean cross_entropy: 1.1005305051803589, mean accuracy: 0.5715402960777283


 31%|███       | 93/300 [1:29:16<3:16:27, 56.95s/it]

Training...
mean cross_entropy: 1.081190824508667, mean accuracy: 0.5827451944351196


 31%|███▏      | 94/300 [1:30:13<3:15:25, 56.92s/it]

Training...
mean cross_entropy: 1.0770559310913086, mean accuracy: 0.5910033583641052


 32%|███▏      | 95/300 [1:31:09<3:14:29, 56.92s/it]

Training...
mean cross_entropy: 1.0668755769729614, mean accuracy: 0.6010076999664307


 32%|███▏      | 96/300 [1:32:06<3:13:25, 56.89s/it]

Training...
mean cross_entropy: 1.052807331085205, mean accuracy: 0.6023173928260803


 32%|███▏      | 97/300 [1:33:03<3:12:27, 56.88s/it]

Training...
mean cross_entropy: 1.0690913200378418, mean accuracy: 0.5976789593696594


 33%|███▎      | 98/300 [1:34:00<3:11:32, 56.89s/it]

Training...
mean cross_entropy: 1.0483367443084717, mean accuracy: 0.6022627949714661


 33%|███▎      | 99/300 [1:34:57<3:10:34, 56.89s/it]

Decrease learning rate, new lr = 0.01
Training...
mean cross_entropy: 0.9020873308181763, mean accuracy: 0.6669637560844421
Validation...
mean cross_entropy: 4.223336219787598, mean accuracy: 0.09855769574642181


 33%|███▎      | 100/300 [1:35:55<3:10:28, 57.14s/it]

Training...
mean cross_entropy: 0.8829614520072937, mean accuracy: 0.6720023155212402


 34%|███▎      | 101/300 [1:36:51<3:09:09, 57.03s/it]

Training...
mean cross_entropy: 0.875270426273346, mean accuracy: 0.6739486455917358


 34%|███▍      | 102/300 [1:37:48<3:07:58, 56.96s/it]

Training...
mean cross_entropy: 0.8629959225654602, mean accuracy: 0.6787143349647522


 34%|███▍      | 103/300 [1:38:45<3:06:53, 56.92s/it]

Training...
mean cross_entropy: 0.8618829846382141, mean accuracy: 0.6779322028160095


 35%|███▍      | 104/300 [1:39:42<3:05:49, 56.88s/it]

Training...
mean cross_entropy: 0.8537923693656921, mean accuracy: 0.6778594255447388


 35%|███▌      | 105/300 [1:40:39<3:04:44, 56.85s/it]

Training...
mean cross_entropy: 0.8514806628227234, mean accuracy: 0.6822795271873474


 35%|███▌      | 106/300 [1:41:35<3:03:48, 56.85s/it]

Training...
mean cross_entropy: 0.8501515984535217, mean accuracy: 0.6820430755615234


 36%|███▌      | 107/300 [1:42:32<3:02:48, 56.83s/it]

Training...
mean cross_entropy: 0.8465796709060669, mean accuracy: 0.6859902739524841


 36%|███▌      | 108/300 [1:43:29<3:01:43, 56.79s/it]

Training...
mean cross_entropy: 0.820061981678009, mean accuracy: 0.705726146697998


 36%|███▋      | 109/300 [1:44:26<3:00:44, 56.78s/it]

Training...
mean cross_entropy: 0.8119019269943237, mean accuracy: 0.7079089283943176


 37%|███▋      | 110/300 [1:45:23<3:00:23, 56.97s/it]

Training...
mean cross_entropy: 0.8058400750160217, mean accuracy: 0.7094186544418335


 37%|███▋      | 111/300 [1:46:21<3:00:19, 57.24s/it]

Training...
mean cross_entropy: 0.8091527819633484, mean accuracy: 0.7095096111297607


 37%|███▋      | 112/300 [1:47:19<2:59:38, 57.33s/it]

Training...
mean cross_entropy: 0.797767162322998, mean accuracy: 0.7146391272544861


 38%|███▊      | 113/300 [1:48:16<2:59:01, 57.44s/it]

Training...
mean cross_entropy: 0.7951884269714355, mean accuracy: 0.7147846221923828


 38%|███▊      | 114/300 [1:49:14<2:58:08, 57.47s/it]

Training...
mean cross_entropy: 0.7918463349342346, mean accuracy: 0.7168036699295044


 38%|███▊      | 115/300 [1:50:11<2:57:24, 57.54s/it]

Training...
mean cross_entropy: 0.7848306894302368, mean accuracy: 0.7168946266174316


 39%|███▊      | 116/300 [1:51:09<2:56:37, 57.59s/it]

Training...
mean cross_entropy: 0.7939144968986511, mean accuracy: 0.7169128060340881


 39%|███▉      | 117/300 [1:52:07<2:55:42, 57.61s/it]

Training...
mean cross_entropy: 0.7873008847236633, mean accuracy: 0.7169674038887024


 39%|███▉      | 118/300 [1:53:04<2:54:41, 57.59s/it]

Training...
mean cross_entropy: 0.7873428463935852, mean accuracy: 0.7165672183036804


 40%|███▉      | 119/300 [1:54:02<2:53:56, 57.66s/it]

Training...
mean cross_entropy: 0.7927761673927307, mean accuracy: 0.7161670327186584
Validation...
mean cross_entropy: 1.1883008480072021, mean accuracy: 0.5598958134651184


 40%|████      | 120/300 [1:55:01<2:53:54, 57.97s/it]

Training...
mean cross_entropy: 0.7827472686767578, mean accuracy: 0.7190955877304077


 40%|████      | 121/300 [1:55:59<2:52:43, 57.90s/it]

Training...
mean cross_entropy: 0.7867497205734253, mean accuracy: 0.715948760509491


 41%|████      | 122/300 [1:56:56<2:51:42, 57.88s/it]

Training...
mean cross_entropy: 0.7779492735862732, mean accuracy: 0.7201324105262756


 41%|████      | 123/300 [1:57:54<2:50:29, 57.80s/it]

Training...
mean cross_entropy: 0.7918642163276672, mean accuracy: 0.7150210738182068


 41%|████▏     | 124/300 [1:58:52<2:49:26, 57.76s/it]

Training...
mean cross_entropy: 0.7853161096572876, mean accuracy: 0.7184771299362183


 42%|████▏     | 125/300 [1:59:49<2:48:18, 57.71s/it]

Training...
mean cross_entropy: 0.783591091632843, mean accuracy: 0.7194048166275024


 42%|████▏     | 126/300 [2:00:47<2:47:22, 57.72s/it]

Training...
mean cross_entropy: 0.7814297676086426, mean accuracy: 0.7182224988937378


 42%|████▏     | 127/300 [2:01:45<2:46:19, 57.68s/it]

Training...
mean cross_entropy: 0.7826079726219177, mean accuracy: 0.7194594144821167


 43%|████▎     | 128/300 [2:02:42<2:45:22, 57.69s/it]

Training...
mean cross_entropy: 0.7848449349403381, mean accuracy: 0.7204052805900574


 43%|████▎     | 129/300 [2:03:40<2:44:21, 57.67s/it]

Training...
mean cross_entropy: 0.7848160266876221, mean accuracy: 0.7158578038215637


 43%|████▎     | 130/300 [2:04:38<2:43:31, 57.71s/it]

Training...
mean cross_entropy: 0.7817813158035278, mean accuracy: 0.7196049094200134


 44%|████▎     | 131/300 [2:05:37<2:43:34, 58.07s/it]

Training...
mean cross_entropy: 0.7812918424606323, mean accuracy: 0.7192411422729492


 44%|████▍     | 132/300 [2:06:36<2:43:22, 58.35s/it]

Training...
mean cross_entropy: 0.7826464772224426, mean accuracy: 0.7182406783103943


 44%|████▍     | 133/300 [2:07:34<2:42:45, 58.47s/it]

Training...
mean cross_entropy: 0.7805707454681396, mean accuracy: 0.7188954949378967


 45%|████▍     | 134/300 [2:08:33<2:42:06, 58.59s/it]

Training...
mean cross_entropy: 0.7812789678573608, mean accuracy: 0.7215148210525513


 45%|████▌     | 135/300 [2:09:32<2:41:25, 58.70s/it]

Training...
mean cross_entropy: 0.7844687104225159, mean accuracy: 0.7181861400604248


 45%|████▌     | 136/300 [2:10:31<2:40:42, 58.79s/it]

Training...
mean cross_entropy: 0.7810177803039551, mean accuracy: 0.7171493172645569


 46%|████▌     | 137/300 [2:11:30<2:39:48, 58.82s/it]

Training...
mean cross_entropy: 0.7817679643630981, mean accuracy: 0.718950092792511


 46%|████▌     | 138/300 [2:12:29<2:38:56, 58.87s/it]

Training...
mean cross_entropy: 0.7820683121681213, mean accuracy: 0.7188227772712708


 46%|████▋     | 139/300 [2:13:28<2:37:53, 58.84s/it]

Training...
mean cross_entropy: 0.785114049911499, mean accuracy: 0.7163307666778564
Validation...
mean cross_entropy: 0.8183243870735168, mean accuracy: 0.6975160241127014


 47%|████▋     | 140/300 [2:14:28<2:37:41, 59.13s/it]

Training...
mean cross_entropy: 0.7867596745491028, mean accuracy: 0.7167673110961914


 47%|████▋     | 141/300 [2:15:26<2:35:48, 58.79s/it]

Training...
mean cross_entropy: 0.7828940153121948, mean accuracy: 0.7173311710357666


 47%|████▋     | 142/300 [2:16:23<2:33:56, 58.46s/it]

Training...
mean cross_entropy: 0.78830885887146, mean accuracy: 0.7149665355682373


 48%|████▊     | 143/300 [2:17:21<2:32:19, 58.22s/it]

Training...
mean cross_entropy: 0.783481776714325, mean accuracy: 0.7182952761650085


 48%|████▊     | 144/300 [2:18:19<2:30:59, 58.08s/it]

Training...
mean cross_entropy: 0.7823006510734558, mean accuracy: 0.7177495360374451


 48%|████▊     | 145/300 [2:19:16<2:29:39, 57.93s/it]

Training...
mean cross_entropy: 0.7816281318664551, mean accuracy: 0.7194957733154297


 49%|████▊     | 146/300 [2:20:14<2:28:30, 57.86s/it]

Training...
mean cross_entropy: 0.7796071767807007, mean accuracy: 0.7194775938987732


 49%|████▉     | 147/300 [2:21:12<2:27:25, 57.81s/it]

Training...
mean cross_entropy: 0.7789371013641357, mean accuracy: 0.7196958661079407


 49%|████▉     | 148/300 [2:22:10<2:26:23, 57.79s/it]

Training...
mean cross_entropy: 0.7785714864730835, mean accuracy: 0.7199687361717224


 50%|████▉     | 149/300 [2:23:07<2:25:20, 57.75s/it]

Training...
mean cross_entropy: 0.7819622159004211, mean accuracy: 0.7194775938987732


 50%|█████     | 150/300 [2:24:05<2:24:22, 57.75s/it]

Training...
mean cross_entropy: 0.7842400670051575, mean accuracy: 0.7182224988937378


 50%|█████     | 151/300 [2:25:02<2:23:09, 57.65s/it]

Training...
mean cross_entropy: 0.7879701256752014, mean accuracy: 0.7152211666107178


 51%|█████     | 152/300 [2:26:00<2:22:02, 57.59s/it]

Training...
mean cross_entropy: 0.7824748754501343, mean accuracy: 0.71574866771698


 51%|█████     | 153/300 [2:26:58<2:21:15, 57.66s/it]

Training...
mean cross_entropy: 0.7817471027374268, mean accuracy: 0.7194229960441589


 51%|█████▏    | 154/300 [2:27:55<2:20:07, 57.59s/it]

Training...
mean cross_entropy: 0.7826552987098694, mean accuracy: 0.7177131772041321


 52%|█████▏    | 155/300 [2:28:53<2:19:27, 57.71s/it]

Training...
mean cross_entropy: 0.787237286567688, mean accuracy: 0.7182588577270508


 52%|█████▏    | 156/300 [2:29:50<2:18:02, 57.52s/it]

Training...
mean cross_entropy: 0.7765951752662659, mean accuracy: 0.7198231816291809


 52%|█████▏    | 157/300 [2:30:48<2:17:38, 57.75s/it]

Training...
mean cross_entropy: 0.783352792263031, mean accuracy: 0.7187681794166565


 53%|█████▎    | 158/300 [2:31:45<2:16:03, 57.49s/it]

Training...
mean cross_entropy: 0.7833041548728943, mean accuracy: 0.7165490388870239


 53%|█████▎    | 159/300 [2:32:44<2:15:43, 57.75s/it]

Training...
mean cross_entropy: 0.7808438539505005, mean accuracy: 0.7197322249412537
Validation...
mean cross_entropy: 1.7089691162109375, mean accuracy: 0.4641426205635071


 53%|█████▎    | 160/300 [2:33:42<2:14:50, 57.79s/it]

Training...
mean cross_entropy: 0.779811680316925, mean accuracy: 0.7194957733154297


 54%|█████▎    | 161/300 [2:34:40<2:14:11, 57.92s/it]

Training...
mean cross_entropy: 0.7858384847640991, mean accuracy: 0.7159669399261475


 54%|█████▍    | 162/300 [2:35:37<2:12:31, 57.62s/it]

Training...
mean cross_entropy: 0.7826817035675049, mean accuracy: 0.7172220349311829


 54%|█████▍    | 163/300 [2:36:35<2:11:59, 57.80s/it]

Training...
mean cross_entropy: 0.7810590863227844, mean accuracy: 0.7190228700637817


 55%|█████▍    | 164/300 [2:37:32<2:10:23, 57.52s/it]

Training...
mean cross_entropy: 0.7831488251686096, mean accuracy: 0.7194957733154297


 55%|█████▌    | 165/300 [2:38:30<2:09:56, 57.75s/it]

Training...
mean cross_entropy: 0.7790931463241577, mean accuracy: 0.7199141383171082


 55%|█████▌    | 166/300 [2:39:27<2:08:23, 57.49s/it]

Training...
mean cross_entropy: 0.7857949733734131, mean accuracy: 0.7170583605766296


 56%|█████▌    | 167/300 [2:40:25<2:07:54, 57.71s/it]

Training...
mean cross_entropy: 0.7806799411773682, mean accuracy: 0.7194775938987732


 56%|█████▌    | 168/300 [2:41:22<2:06:33, 57.52s/it]

Training...
mean cross_entropy: 0.7822707295417786, mean accuracy: 0.7196958661079407


 56%|█████▋    | 169/300 [2:42:20<2:05:58, 57.70s/it]

Training...
mean cross_entropy: 0.7801306843757629, mean accuracy: 0.7190955877304077


 57%|█████▋    | 170/300 [2:43:18<2:04:41, 57.55s/it]

Training...
mean cross_entropy: 0.7813405394554138, mean accuracy: 0.718786358833313


 57%|█████▋    | 171/300 [2:44:16<2:04:02, 57.70s/it]

Training...
mean cross_entropy: 0.7873878479003906, mean accuracy: 0.7153485417366028


 57%|█████▋    | 172/300 [2:45:14<2:03:18, 57.80s/it]

Training...
mean cross_entropy: 0.7835496664047241, mean accuracy: 0.718713641166687


 58%|█████▊    | 173/300 [2:46:11<2:02:17, 57.78s/it]

Training...
mean cross_entropy: 0.777291476726532, mean accuracy: 0.7210964560508728


 58%|█████▊    | 174/300 [2:47:09<2:01:19, 57.77s/it]

Training...
mean cross_entropy: 0.78329998254776, mean accuracy: 0.7188773155212402


 58%|█████▊    | 175/300 [2:48:07<2:00:16, 57.73s/it]

Training...
mean cross_entropy: 0.7803444862365723, mean accuracy: 0.7202051877975464


 59%|█████▊    | 176/300 [2:49:04<1:59:14, 57.70s/it]

Training...
mean cross_entropy: 0.7853822708129883, mean accuracy: 0.7174585461616516


 59%|█████▉    | 177/300 [2:50:02<1:58:16, 57.70s/it]

Training...
mean cross_entropy: 0.7805825471878052, mean accuracy: 0.7178223133087158


 59%|█████▉    | 178/300 [2:51:00<1:57:14, 57.66s/it]

Training...
mean cross_entropy: 0.7855525612831116, mean accuracy: 0.7171674966812134


 60%|█████▉    | 179/300 [2:51:57<1:56:19, 57.68s/it]

Training...
mean cross_entropy: 0.7831721305847168, mean accuracy: 0.7173311710357666
Validation...
mean cross_entropy: 2.8416361808776855, mean accuracy: 0.36017629504203796


 60%|██████    | 180/300 [2:52:56<1:55:59, 58.00s/it]

Training...
mean cross_entropy: 0.7708075046539307, mean accuracy: 0.7216967344284058


 60%|██████    | 181/300 [2:53:54<1:54:52, 57.92s/it]

Training...
mean cross_entropy: 0.7856318950653076, mean accuracy: 0.7169492244720459


 61%|██████    | 182/300 [2:54:52<1:53:43, 57.83s/it]

Training...
mean cross_entropy: 0.7828724384307861, mean accuracy: 0.7181679010391235


 61%|██████    | 183/300 [2:55:49<1:52:18, 57.59s/it]

Training...
mean cross_entropy: 0.7828251123428345, mean accuracy: 0.7169674038887024


 61%|██████▏   | 184/300 [2:56:46<1:50:58, 57.40s/it]

Training...
mean cross_entropy: 0.7815866470336914, mean accuracy: 0.7198959589004517


 62%|██████▏   | 185/300 [2:57:43<1:49:49, 57.30s/it]

Training...
mean cross_entropy: 0.7900845408439636, mean accuracy: 0.715475857257843


 62%|██████▏   | 186/300 [2:58:40<1:48:45, 57.24s/it]

Training...
mean cross_entropy: 0.7837485074996948, mean accuracy: 0.7198050022125244


 62%|██████▏   | 187/300 [2:59:37<1:47:41, 57.19s/it]

Training...
mean cross_entropy: 0.7825070023536682, mean accuracy: 0.7173675894737244


 63%|██████▎   | 188/300 [3:00:34<1:46:42, 57.17s/it]

Training...
mean cross_entropy: 0.7813695073127747, mean accuracy: 0.71875


 63%|██████▎   | 189/300 [3:01:31<1:45:43, 57.15s/it]

Training...
mean cross_entropy: 0.7811574935913086, mean accuracy: 0.7169128060340881


 63%|██████▎   | 190/300 [3:02:28<1:44:46, 57.15s/it]

Training...
mean cross_entropy: 0.7863426804542542, mean accuracy: 0.7170401811599731


 64%|██████▎   | 191/300 [3:03:25<1:43:50, 57.16s/it]

Training...
mean cross_entropy: 0.7763262391090393, mean accuracy: 0.7211692333221436


 64%|██████▍   | 192/300 [3:04:22<1:42:52, 57.16s/it]

Training...
mean cross_entropy: 0.7770987153053284, mean accuracy: 0.7173857688903809


 64%|██████▍   | 193/300 [3:05:19<1:41:50, 57.11s/it]

Training...
mean cross_entropy: 0.7788107991218567, mean accuracy: 0.7188045978546143


 65%|██████▍   | 194/300 [3:06:16<1:40:48, 57.06s/it]

Training...
mean cross_entropy: 0.7868019938468933, mean accuracy: 0.7182588577270508


 65%|██████▌   | 195/300 [3:07:13<1:39:38, 56.94s/it]

Training...
mean cross_entropy: 0.7899401783943176, mean accuracy: 0.7143480777740479


 65%|██████▌   | 196/300 [3:08:10<1:38:35, 56.88s/it]

Training...
mean cross_entropy: 0.7825214266777039, mean accuracy: 0.718313455581665


 66%|██████▌   | 197/300 [3:09:07<1:37:35, 56.85s/it]

Training...
mean cross_entropy: 0.7855778932571411, mean accuracy: 0.7172766327857971


 66%|██████▌   | 198/300 [3:10:03<1:36:38, 56.85s/it]

Training...
mean cross_entropy: 0.7809267640113831, mean accuracy: 0.7194048166275024


 66%|██████▋   | 199/300 [3:11:00<1:35:37, 56.81s/it]

Decrease learning rate, new lr = 0.001
Training...
mean cross_entropy: 0.7460287809371948, mean accuracy: 0.7330835461616516
Validation...
mean cross_entropy: 0.9355143904685974, mean accuracy: 0.6694711446762085


 67%|██████▋   | 200/300 [3:11:58<1:35:15, 57.15s/it]

Training...
mean cross_entropy: 0.7426385283470154, mean accuracy: 0.734375


 67%|██████▋   | 201/300 [3:12:55<1:34:04, 57.02s/it]

Training...
mean cross_entropy: 0.7389270067214966, mean accuracy: 0.7353390455245972


 67%|██████▋   | 202/300 [3:13:52<1:33:00, 56.95s/it]

Training...
mean cross_entropy: 0.735403835773468, mean accuracy: 0.73737633228302


 68%|██████▊   | 203/300 [3:14:48<1:32:00, 56.91s/it]

Training...
mean cross_entropy: 0.7348904013633728, mean accuracy: 0.7381402850151062


 68%|██████▊   | 204/300 [3:15:45<1:31:06, 56.94s/it]

Training...
mean cross_entropy: 0.7347442507743835, mean accuracy: 0.7365396022796631


 68%|██████▊   | 205/300 [3:16:42<1:30:11, 56.96s/it]

Training...
mean cross_entropy: 0.7309635877609253, mean accuracy: 0.7367760539054871


 69%|██████▊   | 206/300 [3:17:39<1:29:13, 56.96s/it]

Training...
mean cross_entropy: 0.7371196150779724, mean accuracy: 0.737449049949646


 69%|██████▉   | 207/300 [3:18:36<1:28:19, 56.98s/it]

Training...
mean cross_entropy: 0.7325724363327026, mean accuracy: 0.7391225099563599


 69%|██████▉   | 208/300 [3:19:33<1:27:25, 57.01s/it]

Training...
mean cross_entropy: 0.73431396484375, mean accuracy: 0.7359211444854736


 70%|██████▉   | 209/300 [3:20:30<1:26:23, 56.96s/it]

Training...
mean cross_entropy: 0.7321207523345947, mean accuracy: 0.7389951944351196


 70%|███████   | 210/300 [3:21:27<1:25:29, 56.99s/it]

Training...
mean cross_entropy: 0.7320166826248169, mean accuracy: 0.7361212372779846


 70%|███████   | 211/300 [3:22:24<1:24:30, 56.97s/it]

Training...
mean cross_entropy: 0.7340640425682068, mean accuracy: 0.7369579672813416


 71%|███████   | 212/300 [3:23:21<1:23:33, 56.97s/it]

Training...
mean cross_entropy: 0.733562171459198, mean accuracy: 0.7371034622192383


 71%|███████   | 213/300 [3:24:18<1:22:36, 56.97s/it]

Training...
mean cross_entropy: 0.7377848625183105, mean accuracy: 0.7351207733154297


 71%|███████▏  | 214/300 [3:25:15<1:21:35, 56.93s/it]

Training...
mean cross_entropy: 0.7329214811325073, mean accuracy: 0.7351389527320862


 72%|███████▏  | 215/300 [3:26:12<1:20:34, 56.88s/it]

Training...
mean cross_entropy: 0.7309198975563049, mean accuracy: 0.7389770150184631


 72%|███████▏  | 216/300 [3:27:09<1:19:37, 56.88s/it]

Training...
mean cross_entropy: 0.7336282134056091, mean accuracy: 0.7371398210525513


 72%|███████▏  | 217/300 [3:28:06<1:18:41, 56.88s/it]

Training...
mean cross_entropy: 0.7358258962631226, mean accuracy: 0.7344841361045837


 73%|███████▎  | 218/300 [3:29:02<1:17:39, 56.83s/it]

Training...
mean cross_entropy: 0.7305013537406921, mean accuracy: 0.7377219200134277


 73%|███████▎  | 219/300 [3:29:59<1:16:43, 56.83s/it]

Training...
mean cross_entropy: 0.7313838601112366, mean accuracy: 0.7377219200134277
Validation...
mean cross_entropy: 0.8248556852340698, mean accuracy: 0.6951121687889099


 73%|███████▎  | 220/300 [3:30:57<1:16:12, 57.16s/it]

Training...
mean cross_entropy: 0.7311431169509888, mean accuracy: 0.738012969493866


 74%|███████▎  | 221/300 [3:31:54<1:15:06, 57.05s/it]

Training...
mean cross_entropy: 0.7333400249481201, mean accuracy: 0.737576425075531


 74%|███████▍  | 222/300 [3:32:51<1:14:02, 56.95s/it]

Training...
mean cross_entropy: 0.7302034497261047, mean accuracy: 0.7386131882667542


 74%|███████▍  | 223/300 [3:33:47<1:13:00, 56.89s/it]

Training...
mean cross_entropy: 0.7304155230522156, mean accuracy: 0.737176239490509


 75%|███████▍  | 224/300 [3:34:44<1:12:00, 56.85s/it]

Training...
mean cross_entropy: 0.7354788780212402, mean accuracy: 0.7371398210525513


 75%|███████▌  | 225/300 [3:35:43<1:11:48, 57.44s/it]

Training...
mean cross_entropy: 0.7278323769569397, mean accuracy: 0.7390315532684326


 75%|███████▌  | 226/300 [3:36:42<1:11:24, 57.90s/it]

Training...
mean cross_entropy: 0.7308216094970703, mean accuracy: 0.7388496994972229


 76%|███████▌  | 227/300 [3:37:41<1:10:51, 58.23s/it]

Training...
mean cross_entropy: 0.7336230874061584, mean accuracy: 0.7386859655380249


 76%|███████▌  | 228/300 [3:38:40<1:10:07, 58.44s/it]

Training...
mean cross_entropy: 0.7329907417297363, mean accuracy: 0.7357210516929626


 76%|███████▋  | 229/300 [3:39:39<1:09:22, 58.62s/it]

Training...
mean cross_entropy: 0.7325382828712463, mean accuracy: 0.7371944189071655


 77%|███████▋  | 230/300 [3:40:38<1:08:29, 58.71s/it]

Training...
mean cross_entropy: 0.7326725721359253, mean accuracy: 0.7356846332550049


 77%|███████▋  | 231/300 [3:41:37<1:07:34, 58.75s/it]

Training...
mean cross_entropy: 0.7332788109779358, mean accuracy: 0.7389406561851501


 77%|███████▋  | 232/300 [3:42:36<1:06:38, 58.80s/it]

Training...
mean cross_entropy: 0.7340590953826904, mean accuracy: 0.7377219200134277


 78%|███████▊  | 233/300 [3:43:34<1:05:43, 58.85s/it]

Training...
mean cross_entropy: 0.7364771962165833, mean accuracy: 0.7363576889038086


 78%|███████▊  | 234/300 [3:44:33<1:04:44, 58.86s/it]

Training...
mean cross_entropy: 0.7283861041069031, mean accuracy: 0.7386496067047119


 78%|███████▊  | 235/300 [3:45:31<1:03:16, 58.41s/it]

Training...
mean cross_entropy: 0.7286657094955444, mean accuracy: 0.7405958771705627


 79%|███████▊  | 236/300 [3:46:28<1:01:50, 57.97s/it]

Training...
mean cross_entropy: 0.731759786605835, mean accuracy: 0.7378674149513245


 79%|███████▉  | 237/300 [3:47:25<1:00:33, 57.68s/it]

Training...
mean cross_entropy: 0.7316244840621948, mean accuracy: 0.7365759611129761


 79%|███████▉  | 238/300 [3:48:22<59:20, 57.43s/it]  

Training...
mean cross_entropy: 0.7346295714378357, mean accuracy: 0.7356482744216919


 80%|███████▉  | 239/300 [3:49:18<58:12, 57.26s/it]

Training...
mean cross_entropy: 0.7322520613670349, mean accuracy: 0.7382494211196899
Validation...
mean cross_entropy: 0.7857972383499146, mean accuracy: 0.7183493375778198


 80%|████████  | 240/300 [3:50:16<57:27, 57.45s/it]

Training...
mean cross_entropy: 0.7303167581558228, mean accuracy: 0.7398137450218201


 80%|████████  | 241/300 [3:51:13<56:20, 57.30s/it]

Training...
mean cross_entropy: 0.7343061566352844, mean accuracy: 0.7372853755950928


 81%|████████  | 242/300 [3:52:10<55:18, 57.21s/it]

Training...
mean cross_entropy: 0.7296916842460632, mean accuracy: 0.7381221055984497


 81%|████████  | 243/300 [3:53:07<54:15, 57.12s/it]

Training...
mean cross_entropy: 0.7342777252197266, mean accuracy: 0.737812876701355


 81%|████████▏ | 244/300 [3:54:04<53:15, 57.07s/it]

Training...
mean cross_entropy: 0.7334643602371216, mean accuracy: 0.7386677861213684


 82%|████████▏ | 245/300 [3:55:01<52:21, 57.12s/it]

Training...
mean cross_entropy: 0.7277432680130005, mean accuracy: 0.7384131550788879


 82%|████████▏ | 246/300 [3:55:59<51:36, 57.34s/it]

Training...
mean cross_entropy: 0.7330462336540222, mean accuracy: 0.7367396950721741


 82%|████████▏ | 247/300 [3:56:57<50:43, 57.43s/it]

Training...
mean cross_entropy: 0.7333105206489563, mean accuracy: 0.737248957157135


 83%|████████▎ | 248/300 [3:57:55<49:50, 57.51s/it]

Training...
mean cross_entropy: 0.73604816198349, mean accuracy: 0.734538733959198


 83%|████████▎ | 249/300 [3:58:52<48:55, 57.55s/it]

Training...
mean cross_entropy: 0.7296040058135986, mean accuracy: 0.737212598323822


 83%|████████▎ | 250/300 [3:59:50<48:04, 57.68s/it]

Training...
mean cross_entropy: 0.7307218313217163, mean accuracy: 0.7381584644317627


 84%|████████▎ | 251/300 [4:00:48<47:07, 57.70s/it]

Training...
mean cross_entropy: 0.7309259176254272, mean accuracy: 0.737649142742157


 84%|████████▍ | 252/300 [4:01:46<46:09, 57.71s/it]

Training...
mean cross_entropy: 0.7299723625183105, mean accuracy: 0.7393407821655273


 84%|████████▍ | 253/300 [4:02:43<45:10, 57.68s/it]

Training...
mean cross_entropy: 0.7348595857620239, mean accuracy: 0.7362121939659119


 85%|████████▍ | 254/300 [4:03:41<44:13, 57.67s/it]

Training...
mean cross_entropy: 0.7270094752311707, mean accuracy: 0.7392316460609436


 85%|████████▌ | 255/300 [4:04:39<43:14, 57.66s/it]

Training...
mean cross_entropy: 0.7325206995010376, mean accuracy: 0.7360302805900574


 85%|████████▌ | 256/300 [4:05:37<42:29, 57.95s/it]

Training...
mean cross_entropy: 0.7328242063522339, mean accuracy: 0.7364304661750793


 86%|████████▌ | 257/300 [4:06:36<41:44, 58.25s/it]

Training...
mean cross_entropy: 0.7339839339256287, mean accuracy: 0.7362121939659119


 86%|████████▌ | 258/300 [4:07:35<40:56, 58.50s/it]

Training...
mean cross_entropy: 0.7303458452224731, mean accuracy: 0.7374672293663025


 86%|████████▋ | 259/300 [4:08:34<40:04, 58.66s/it]

Training...
mean cross_entropy: 0.729832649230957, mean accuracy: 0.7374672293663025
Validation...
mean cross_entropy: 0.7676705718040466, mean accuracy: 0.7349759340286255


 87%|████████▋ | 260/300 [4:09:34<39:22, 59.07s/it]

Training...
mean cross_entropy: 0.7302057147026062, mean accuracy: 0.740886926651001


 87%|████████▋ | 261/300 [4:10:33<38:20, 58.98s/it]

Training...
mean cross_entropy: 0.7344357371330261, mean accuracy: 0.7363030910491943


 87%|████████▋ | 262/300 [4:11:32<37:23, 59.03s/it]

Training...
mean cross_entropy: 0.7341935038566589, mean accuracy: 0.7366123199462891


 88%|████████▊ | 263/300 [4:12:31<36:24, 59.04s/it]

Training...
mean cross_entropy: 0.7296180725097656, mean accuracy: 0.7395045161247253


 88%|████████▊ | 264/300 [4:13:30<35:25, 59.04s/it]

Training...
mean cross_entropy: 0.7300114631652832, mean accuracy: 0.7384858727455139


 88%|████████▊ | 265/300 [4:14:29<34:24, 58.99s/it]

Training...
mean cross_entropy: 0.7335383892059326, mean accuracy: 0.7384676933288574


 89%|████████▊ | 266/300 [4:15:27<33:17, 58.74s/it]

Training...
mean cross_entropy: 0.7339809536933899, mean accuracy: 0.7368670105934143


 89%|████████▉ | 267/300 [4:16:25<32:07, 58.41s/it]

Training...
mean cross_entropy: 0.7333447337150574, mean accuracy: 0.7374308705329895


 89%|████████▉ | 268/300 [4:17:23<31:03, 58.23s/it]

Training...
mean cross_entropy: 0.7270709872245789, mean accuracy: 0.7397773861885071


 90%|████████▉ | 269/300 [4:18:21<30:01, 58.11s/it]

Training...
mean cross_entropy: 0.7314392924308777, mean accuracy: 0.7383949160575867


 90%|█████████ | 270/300 [4:19:18<29:00, 58.02s/it]

Training...
mean cross_entropy: 0.7367812991142273, mean accuracy: 0.7373580932617188


 90%|█████████ | 271/300 [4:20:16<27:58, 57.89s/it]

Training...
mean cross_entropy: 0.7363172769546509, mean accuracy: 0.7367396950721741


 91%|█████████ | 272/300 [4:21:14<26:59, 57.82s/it]

Training...
mean cross_entropy: 0.7332392930984497, mean accuracy: 0.735011637210846


 91%|█████████ | 273/300 [4:22:11<25:58, 57.72s/it]

Training...
mean cross_entropy: 0.7308622598648071, mean accuracy: 0.7398319244384766


 91%|█████████▏| 274/300 [4:23:09<25:01, 57.76s/it]

Training...
mean cross_entropy: 0.7313395738601685, mean accuracy: 0.7379947900772095


 92%|█████████▏| 275/300 [4:24:07<24:03, 57.73s/it]

Training...
mean cross_entropy: 0.7299139499664307, mean accuracy: 0.7377219200134277


 92%|█████████▏| 276/300 [4:25:04<23:04, 57.71s/it]

Training...
mean cross_entropy: 0.7342053055763245, mean accuracy: 0.73737633228302


 92%|█████████▏| 277/300 [4:26:02<22:04, 57.58s/it]

Training...
mean cross_entropy: 0.7333202958106995, mean accuracy: 0.7398682832717896


 93%|█████████▎| 278/300 [4:26:59<21:04, 57.48s/it]

Training...
mean cross_entropy: 0.7323241233825684, mean accuracy: 0.737412691116333


 93%|█████████▎| 279/300 [4:27:56<20:06, 57.45s/it]

Training...
mean cross_entropy: 0.7293021082878113, mean accuracy: 0.7393953800201416
Validation...
mean cross_entropy: 0.8536226153373718, mean accuracy: 0.7033253312110901


 93%|█████████▎| 280/300 [4:28:55<19:14, 57.71s/it]

Training...
mean cross_entropy: 0.7329943776130676, mean accuracy: 0.7381402850151062


 94%|█████████▎| 281/300 [4:29:52<18:13, 57.54s/it]

Training...
mean cross_entropy: 0.7320435047149658, mean accuracy: 0.7374308705329895


 94%|█████████▍| 282/300 [4:30:49<17:15, 57.50s/it]

Training...
mean cross_entropy: 0.7340679168701172, mean accuracy: 0.7370306849479675


 94%|█████████▍| 283/300 [4:31:46<16:15, 57.40s/it]

Training...
mean cross_entropy: 0.7317564487457275, mean accuracy: 0.7390861511230469


 95%|█████████▍| 284/300 [4:32:43<15:17, 57.34s/it]

Training...
mean cross_entropy: 0.7322118282318115, mean accuracy: 0.737449049949646


 95%|█████████▌| 285/300 [4:33:41<14:19, 57.31s/it]

Training...
mean cross_entropy: 0.7317089438438416, mean accuracy: 0.7399410605430603


 95%|█████████▌| 286/300 [4:34:38<13:22, 57.35s/it]

Training...
mean cross_entropy: 0.7313361167907715, mean accuracy: 0.7369397282600403


 96%|█████████▌| 287/300 [4:35:36<12:27, 57.47s/it]

Training...
mean cross_entropy: 0.7326070666313171, mean accuracy: 0.7363213300704956


 96%|█████████▌| 288/300 [4:36:33<11:29, 57.46s/it]

Training...
mean cross_entropy: 0.7321988940238953, mean accuracy: 0.7377219200134277


 96%|█████████▋| 289/300 [4:37:31<10:32, 57.50s/it]

Training...
mean cross_entropy: 0.737375020980835, mean accuracy: 0.7366305589675903


 97%|█████████▋| 290/300 [4:38:29<09:36, 57.60s/it]

Training...
mean cross_entropy: 0.736225962638855, mean accuracy: 0.7366669178009033


 97%|█████████▋| 291/300 [4:39:26<08:38, 57.63s/it]

Training...
mean cross_entropy: 0.7303743958473206, mean accuracy: 0.7386677861213684


 97%|█████████▋| 292/300 [4:40:24<07:41, 57.64s/it]

Training...
mean cross_entropy: 0.7299852967262268, mean accuracy: 0.7379220128059387


 98%|█████████▊| 293/300 [4:41:22<06:43, 57.64s/it]

Training...
mean cross_entropy: 0.7314216494560242, mean accuracy: 0.7387223243713379


 98%|█████████▊| 294/300 [4:42:19<05:45, 57.64s/it]

Training...
mean cross_entropy: 0.7316970825195312, mean accuracy: 0.736976146697998


 98%|█████████▊| 295/300 [4:43:17<04:48, 57.61s/it]

Training...
mean cross_entropy: 0.7293456792831421, mean accuracy: 0.7388860583305359


 99%|█████████▊| 296/300 [4:44:15<03:50, 57.65s/it]

Training...
mean cross_entropy: 0.7317208051681519, mean accuracy: 0.7379401922225952


 99%|█████████▉| 297/300 [4:45:12<02:52, 57.61s/it]

Training...
mean cross_entropy: 0.7303033471107483, mean accuracy: 0.7365396022796631


 99%|█████████▉| 298/300 [4:46:09<01:54, 57.47s/it]

Training...
mean cross_entropy: 0.7351428270339966, mean accuracy: 0.7375400066375732


100%|█████████▉| 299/300 [4:47:07<00:57, 57.42s/it]

Training...
mean cross_entropy: 0.7272161841392517, mean accuracy: 0.7396136522293091
Validation...
mean cross_entropy: 0.7763261795043945, mean accuracy: 0.7279647588729858


100%|██████████| 300/300 [4:48:05<00:00, 57.64s/it]
