In [None]:
!pip install wandb
!wandb login

In [1]:
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim
import torch.cuda.amp as amp

import torch.utils.data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

#importing distributed and weights and biases
import torch.distributed as dist
import wandb

In [2]:
wandb.init(project='Homework9', entity='malachyiii')

In [3]:
print(torch.cuda.is_available())
print(torch.cuda.current_device())
torch.cuda.device(0)
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
torch.cuda.empty_cache()

True
0
1
Tesla T4


In [4]:
SEED=1
random.seed(SEED)
torch.manual_seed(SEED)
cudnn.deterministic = True

In [5]:
#Trying resnet34 because it is fast
ARCH = torchvision.models.resnet34(pretrained=False)
START_EPOCH = 0
EPOCHS = 3

#The following parameters based on the paper at https://arxiv.org/pdf/1512.03385.pdf
TRAIN_BATCH=128
VAL_BATCH=128
MOMENTUM = 0.9
WEIGHT_DECAY = 1e-4
LR = 0.01
IMG_SIZE = 224

#Logging wandb config

wandb.config = {
  "learning_rate": LR,
  "epochs": EPOCHS,
  "batch_size": TRAIN_BATCH,
  "momentum": MOMENTUM,
  "weight_decay": WEIGHT_DECAY
  "image_size": IMG_SIZE
}

PRINT_FREQ = 50
WORKERS=2

TRAINDIR="/data/train"
VALDIR="/data/val"

In [6]:
GPU = torch.cuda.current_device()
torch.cuda.device(GPU)
cudnn.benchmark = True

In [None]:
if not torch.cuda.is_available():
    print('GPU not detected.. did you pass through your GPU?')

In [None]:
#Setting up the cluster and the world
WORLD_SIZE = 2
BACKEND = 'nccl'
URL = 'tcp://35.163.183.149:8888'
RANK = 0

dist.init_process_group(backend = BACKEND, init_method= URL,
                                world_size= WORLD_SIZE, rank=RANK)

In [7]:
imagenet_mean_RGB = [0.47889522, 0.47227842, 0.43047404]
imagenet_std_RGB = [0.229, 0.224, 0.225]

In [8]:
transform_train = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    # transformations based on https://arxiv.org/pdf/1512.03385.pdf
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(imagenet_mean_RGB, imagenet_std_RGB),
])

transform_val = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(imagenet_mean_RGB, imagenet_std_RGB),
])

train_dataset = torchvision.datasets.ImageFolder('/data/train', transform=transform_train)
val_dataset = torchvision.datasets.ImageFolder('/data/val', transform=transform_val)

In [9]:
#Setting up the train loader with a distributed Sampler
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH, 
                              shuffle=False, num_workers=WORKERS, pin_memory=True, 
                                           sampler=torch.utils.data.distributed.DistributedSampler(train_dataset))
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=VAL_BATCH, 
                              shuffle=False, num_workers=WORKERS, sampler=None)

In [10]:
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))
    
    ######################
    # switch model to train mode here
    model.train()
    ################

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        #####################
        # send the images to cuda device
        if GPU is not None:
            images, target = images.cuda(GPU, non_blocking = True), target.cuda(GPU, non_blocking = True)
        # send the target to cuda device

        
        ####Utilizing PyTorch native AMP####
        with amp.autocast():
            # compute output
            output = model(images)
            # compute loss 
            loss = criterion(output, target)
        
        
        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))
        
        #Logging the metrics
        wandb.log({"loss": loss,
                   "acc1": acc1,
                   "acc5": acc5})
        
        wandb.watch(model)

        # compute gradient and do SGD step
        
        #### zero out gradients in the optimier
        optimizer.zero_grad()
        
        ## backprop!
        scaler.scale(loss).backward()
        
        # update the weights!
        scaler.step(optimizer)
        scaler.update()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % PRINT_FREQ == 0:
            progress.display(i)

In [11]:
def validate(val_loader, model, criterion):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            
            
            ### send the images and target to cuda
            images, target = images.to(device), target.to(device)

            # compute output
            output = model(images)

            # compute loss
            loss = criterion(output, target)


            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % PRINT_FREQ == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg

In [12]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    # save the model state!
    torch.save(state, filename) 
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')

In [13]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)

In [14]:
class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'

In [15]:
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

In [16]:
def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 1/3 of the epochs"""
    lr = LR * (0.1 ** (epoch // int(round(EPOCHS/3, 0))))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [17]:
normalize = transforms.Normalize(mean=imagenet_mean_RGB, std=imagenet_std_RGB)

In [18]:
model = ARCH
model.cuda(GPU)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[GPU])

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [19]:
criterion = nn.CrossEntropyLoss().cuda(GPU)
optimizer = torch.optim.SGD(params = model.parameters(), lr=LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
#Based on the same paper, https://arxiv.org/pdf/1512.03385.pdf
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2])
scaler = amp.GradScaler()

In [20]:
best_acc1 = 0

In [21]:
#Initiating weights and biases
wandb.watch(model)

for epoch in range(START_EPOCH, EPOCHS):
    #adjust_learning_rate(optimizer, epoch)
    
    # train for one epoch
    train(train_loader, model, criterion, optimizer, epoch)

    # evaluate on validation set
    acc1 = validate(val_loader, model, criterion)

    # remember best acc@1 and save checkpoint
    is_best = acc1 > best_acc1
    best_acc1 = max(acc1, best_acc1)


    save_checkpoint({
        'epoch': epoch + 1,
        'arch': ARCH,
        'state_dict': model.state_dict(),
        'best_acc1': best_acc1,
        'optimizer' : optimizer.state_dict(),
    }, is_best)
    
    scheduler.step(acc1)
    
    print('lr: ' + str(scheduler.get_last_lr()))

Epoch: [0][    0/10010]	Time  5.010 ( 5.010)	Data  0.925 ( 0.925)	Loss 8.2048e-01 (8.2048e-01)	Acc@1  75.78 ( 75.78)	Acc@5  92.19 ( 92.19)
Epoch: [0][   50/10010]	Time  0.573 ( 0.648)	Data  0.012 ( 0.030)	Loss 1.1897e+00 (1.0064e+00)	Acc@1  67.19 ( 73.13)	Acc@5  89.84 ( 92.19)
Epoch: [0][  100/10010]	Time  0.592 ( 0.614)	Data  0.011 ( 0.024)	Loss 1.3954e+00 (1.1121e+00)	Acc@1  66.41 ( 70.66)	Acc@5  83.59 ( 91.05)
Epoch: [0][  150/10010]	Time  0.602 ( 0.607)	Data  0.011 ( 0.023)	Loss 1.3031e+00 (1.1480e+00)	Acc@1  67.97 ( 69.61)	Acc@5  85.16 ( 90.71)
Epoch: [0][  200/10010]	Time  0.611 ( 0.606)	Data  0.011 ( 0.022)	Loss 1.5232e+00 (1.1847e+00)	Acc@1  63.28 ( 68.81)	Acc@5  82.81 ( 90.24)
Epoch: [0][  250/10010]	Time  0.619 ( 0.607)	Data  0.010 ( 0.021)	Loss 1.4456e+00 (1.1968e+00)	Acc@1  64.84 ( 68.58)	Acc@5  87.50 ( 89.97)
Epoch: [0][  300/10010]	Time  0.632 ( 0.610)	Data  0.011 ( 0.021)	Loss 1.3302e+00 (1.2153e+00)	Acc@1  64.84 ( 68.15)	Acc@5  88.28 ( 89.69)
Epoch: [0][  350/10010]	Tim

Epoch: [0][ 2950/10010]	Time  0.668 ( 0.663)	Data  0.012 ( 0.021)	Loss 1.4625e+00 (1.2583e+00)	Acc@1  62.50 ( 67.43)	Acc@5  89.06 ( 89.15)
Epoch: [0][ 3000/10010]	Time  0.668 ( 0.663)	Data  0.012 ( 0.021)	Loss 1.3596e+00 (1.2582e+00)	Acc@1  70.31 ( 67.44)	Acc@5  87.50 ( 89.15)
Epoch: [0][ 3050/10010]	Time  0.671 ( 0.664)	Data  0.012 ( 0.021)	Loss 1.2929e+00 (1.2577e+00)	Acc@1  63.28 ( 67.45)	Acc@5  90.62 ( 89.15)
Epoch: [0][ 3100/10010]	Time  0.672 ( 0.664)	Data  0.011 ( 0.021)	Loss 1.2014e+00 (1.2565e+00)	Acc@1  67.19 ( 67.48)	Acc@5  89.06 ( 89.16)
Epoch: [0][ 3150/10010]	Time  0.670 ( 0.664)	Data  0.011 ( 0.021)	Loss 1.2095e+00 (1.2564e+00)	Acc@1  71.09 ( 67.49)	Acc@5  89.84 ( 89.16)
Epoch: [0][ 3200/10010]	Time  0.667 ( 0.664)	Data  0.012 ( 0.021)	Loss 1.4457e+00 (1.2564e+00)	Acc@1  62.50 ( 67.49)	Acc@5  85.16 ( 89.17)
Epoch: [0][ 3250/10010]	Time  0.669 ( 0.664)	Data  0.012 ( 0.021)	Loss 1.1085e+00 (1.2567e+00)	Acc@1  71.09 ( 67.48)	Acc@5  89.84 ( 89.16)
Epoch: [0][ 3300/10010]	Tim

Epoch: [0][ 5900/10010]	Time  0.671 ( 0.667)	Data  0.012 ( 0.021)	Loss 1.3379e+00 (1.2460e+00)	Acc@1  69.53 ( 67.77)	Acc@5  87.50 ( 89.29)
Epoch: [0][ 5950/10010]	Time  0.666 ( 0.667)	Data  0.012 ( 0.021)	Loss 1.2933e+00 (1.2456e+00)	Acc@1  60.16 ( 67.77)	Acc@5  88.28 ( 89.30)
Epoch: [0][ 6000/10010]	Time  0.667 ( 0.667)	Data  0.012 ( 0.021)	Loss 1.3702e+00 (1.2454e+00)	Acc@1  67.19 ( 67.78)	Acc@5  89.06 ( 89.30)
Epoch: [0][ 6050/10010]	Time  0.670 ( 0.667)	Data  0.012 ( 0.021)	Loss 1.2918e+00 (1.2450e+00)	Acc@1  66.41 ( 67.79)	Acc@5  88.28 ( 89.31)
Epoch: [0][ 6100/10010]	Time  0.673 ( 0.667)	Data  0.012 ( 0.021)	Loss 1.3573e+00 (1.2445e+00)	Acc@1  65.62 ( 67.80)	Acc@5  87.50 ( 89.31)
Epoch: [0][ 6150/10010]	Time  0.668 ( 0.667)	Data  0.012 ( 0.021)	Loss 1.1862e+00 (1.2442e+00)	Acc@1  70.31 ( 67.81)	Acc@5  89.06 ( 89.31)
Epoch: [0][ 6200/10010]	Time  0.669 ( 0.667)	Data  0.012 ( 0.021)	Loss 1.1913e+00 (1.2434e+00)	Acc@1  72.66 ( 67.83)	Acc@5  89.06 ( 89.32)
Epoch: [0][ 6250/10010]	Tim

Epoch: [0][ 8850/10010]	Time  0.671 ( 0.668)	Data  0.012 ( 0.021)	Loss 1.2083e+00 (1.2319e+00)	Acc@1  73.44 ( 68.15)	Acc@5  87.50 ( 89.44)
Epoch: [0][ 8900/10010]	Time  0.669 ( 0.668)	Data  0.012 ( 0.021)	Loss 1.3743e+00 (1.2319e+00)	Acc@1  67.19 ( 68.15)	Acc@5  88.28 ( 89.44)
Epoch: [0][ 8950/10010]	Time  0.673 ( 0.668)	Data  0.012 ( 0.021)	Loss 1.2429e+00 (1.2315e+00)	Acc@1  67.19 ( 68.16)	Acc@5  88.28 ( 89.44)
Epoch: [0][ 9000/10010]	Time  0.666 ( 0.668)	Data  0.012 ( 0.021)	Loss 1.0899e+00 (1.2313e+00)	Acc@1  73.44 ( 68.17)	Acc@5  92.97 ( 89.44)
Epoch: [0][ 9050/10010]	Time  0.671 ( 0.668)	Data  0.012 ( 0.021)	Loss 1.1266e+00 (1.2311e+00)	Acc@1  72.66 ( 68.17)	Acc@5  91.41 ( 89.45)
Epoch: [0][ 9100/10010]	Time  0.667 ( 0.668)	Data  0.012 ( 0.021)	Loss 1.0526e+00 (1.2311e+00)	Acc@1  75.00 ( 68.18)	Acc@5  92.97 ( 89.45)
Epoch: [0][ 9150/10010]	Time  0.671 ( 0.668)	Data  0.013 ( 0.021)	Loss 1.3977e+00 (1.2308e+00)	Acc@1  64.84 ( 68.18)	Acc@5  88.28 ( 89.45)
Epoch: [0][ 9200/10010]	Tim



Epoch: [1][    0/10010]	Time  1.401 ( 1.401)	Data  1.133 ( 1.133)	Loss 9.3917e-01 (9.3917e-01)	Acc@1  71.09 ( 71.09)	Acc@5  93.75 ( 93.75)
Epoch: [1][   50/10010]	Time  0.669 ( 0.668)	Data  0.011 ( 0.043)	Loss 1.0001e+00 (1.0070e+00)	Acc@1  73.44 ( 72.89)	Acc@5  91.41 ( 92.52)
Epoch: [1][  100/10010]	Time  0.671 ( 0.668)	Data  0.013 ( 0.032)	Loss 9.5221e-01 (9.8391e-01)	Acc@1  77.34 ( 73.55)	Acc@5  92.97 ( 92.79)
Epoch: [1][  150/10010]	Time  0.678 ( 0.670)	Data  0.013 ( 0.028)	Loss 9.6058e-01 (9.7236e-01)	Acc@1  75.78 ( 74.18)	Acc@5  89.84 ( 92.81)
Epoch: [1][  200/10010]	Time  0.671 ( 0.671)	Data  0.012 ( 0.026)	Loss 7.5036e-01 (9.5121e-01)	Acc@1  81.25 ( 74.77)	Acc@5  94.53 ( 93.02)
Epoch: [1][  250/10010]	Time  0.657 ( 0.670)	Data  0.012 ( 0.025)	Loss 8.1889e-01 (9.4425e-01)	Acc@1  76.56 ( 75.01)	Acc@5  96.09 ( 93.07)
Epoch: [1][  300/10010]	Time  0.664 ( 0.669)	Data  0.012 ( 0.025)	Loss 9.9348e-01 (9.2778e-01)	Acc@1  70.31 ( 75.42)	Acc@5  92.97 ( 93.22)
Epoch: [1][  350/10010]	Tim

Epoch: [1][ 2950/10010]	Time  0.655 ( 0.657)	Data  0.012 ( 0.021)	Loss 7.5706e-01 (8.0787e-01)	Acc@1  82.03 ( 78.57)	Acc@5  96.88 ( 94.41)
Epoch: [1][ 3000/10010]	Time  0.651 ( 0.657)	Data  0.012 ( 0.021)	Loss 8.8418e-01 (8.0676e-01)	Acc@1  79.69 ( 78.60)	Acc@5  92.97 ( 94.43)
Epoch: [1][ 3050/10010]	Time  0.650 ( 0.657)	Data  0.012 ( 0.021)	Loss 9.5597e-01 (8.0589e-01)	Acc@1  73.44 ( 78.61)	Acc@5  92.97 ( 94.44)
Epoch: [1][ 3100/10010]	Time  0.654 ( 0.657)	Data  0.012 ( 0.021)	Loss 6.1787e-01 (8.0538e-01)	Acc@1  80.47 ( 78.63)	Acc@5  96.09 ( 94.44)
Epoch: [1][ 3150/10010]	Time  0.657 ( 0.657)	Data  0.011 ( 0.021)	Loss 9.2571e-01 (8.0461e-01)	Acc@1  77.34 ( 78.66)	Acc@5  92.19 ( 94.45)
Epoch: [1][ 3200/10010]	Time  0.654 ( 0.657)	Data  0.012 ( 0.021)	Loss 7.0723e-01 (8.0372e-01)	Acc@1  81.25 ( 78.69)	Acc@5  96.09 ( 94.46)
Epoch: [1][ 3250/10010]	Time  0.654 ( 0.657)	Data  0.012 ( 0.021)	Loss 9.5346e-01 (8.0342e-01)	Acc@1  71.09 ( 78.70)	Acc@5  95.31 ( 94.46)
Epoch: [1][ 3300/10010]	Tim

Epoch: [1][ 5900/10010]	Time  0.649 ( 0.655)	Data  0.012 ( 0.021)	Loss 6.4074e-01 (7.7527e-01)	Acc@1  79.69 ( 79.37)	Acc@5  93.75 ( 94.74)
Epoch: [1][ 5950/10010]	Time  0.648 ( 0.655)	Data  0.012 ( 0.021)	Loss 7.4781e-01 (7.7460e-01)	Acc@1  78.91 ( 79.39)	Acc@5  94.53 ( 94.75)
Epoch: [1][ 6000/10010]	Time  0.646 ( 0.655)	Data  0.012 ( 0.021)	Loss 8.7794e-01 (7.7419e-01)	Acc@1  78.91 ( 79.40)	Acc@5  92.97 ( 94.75)
Epoch: [1][ 6050/10010]	Time  0.651 ( 0.655)	Data  0.012 ( 0.021)	Loss 6.1814e-01 (7.7381e-01)	Acc@1  81.25 ( 79.41)	Acc@5  97.66 ( 94.76)
Epoch: [1][ 6100/10010]	Time  0.648 ( 0.655)	Data  0.012 ( 0.021)	Loss 8.8249e-01 (7.7356e-01)	Acc@1  74.22 ( 79.41)	Acc@5  98.44 ( 94.76)
Epoch: [1][ 6150/10010]	Time  0.659 ( 0.655)	Data  0.012 ( 0.021)	Loss 6.1128e-01 (7.7311e-01)	Acc@1  83.59 ( 79.42)	Acc@5  96.88 ( 94.76)
Epoch: [1][ 6200/10010]	Time  0.648 ( 0.655)	Data  0.012 ( 0.021)	Loss 8.0495e-01 (7.7251e-01)	Acc@1  79.69 ( 79.43)	Acc@5  92.19 ( 94.77)
Epoch: [1][ 6250/10010]	Tim

Epoch: [1][ 8850/10010]	Time  0.654 ( 0.654)	Data  0.013 ( 0.021)	Loss 6.6842e-01 (7.5553e-01)	Acc@1  82.81 ( 79.82)	Acc@5  95.31 ( 94.95)
Epoch: [1][ 8900/10010]	Time  0.656 ( 0.654)	Data  0.012 ( 0.021)	Loss 7.8268e-01 (7.5536e-01)	Acc@1  78.91 ( 79.82)	Acc@5  96.09 ( 94.95)
Epoch: [1][ 8950/10010]	Time  0.655 ( 0.654)	Data  0.012 ( 0.021)	Loss 7.2384e-01 (7.5512e-01)	Acc@1  82.03 ( 79.83)	Acc@5  92.97 ( 94.95)
Epoch: [1][ 9000/10010]	Time  0.654 ( 0.654)	Data  0.012 ( 0.021)	Loss 5.3152e-01 (7.5479e-01)	Acc@1  84.38 ( 79.84)	Acc@5  97.66 ( 94.95)
Epoch: [1][ 9050/10010]	Time  0.657 ( 0.654)	Data  0.012 ( 0.021)	Loss 8.0330e-01 (7.5447e-01)	Acc@1  81.25 ( 79.85)	Acc@5  91.41 ( 94.96)
Epoch: [1][ 9100/10010]	Time  0.652 ( 0.654)	Data  0.012 ( 0.021)	Loss 8.6668e-01 (7.5435e-01)	Acc@1  78.12 ( 79.85)	Acc@5  92.19 ( 94.96)
Epoch: [1][ 9150/10010]	Time  0.654 ( 0.654)	Data  0.012 ( 0.021)	Loss 8.8994e-01 (7.5414e-01)	Acc@1  75.78 ( 79.86)	Acc@5  93.75 ( 94.96)
Epoch: [1][ 9200/10010]	Tim

Epoch: [2][ 1450/10010]	Time  0.668 ( 0.669)	Data  0.012 ( 0.022)	Loss 5.3365e-01 (6.4864e-01)	Acc@1  84.38 ( 82.58)	Acc@5  96.88 ( 96.14)
Epoch: [2][ 1500/10010]	Time  0.668 ( 0.669)	Data  0.012 ( 0.022)	Loss 6.3738e-01 (6.4812e-01)	Acc@1  80.47 ( 82.59)	Acc@5  96.88 ( 96.16)
Epoch: [2][ 1550/10010]	Time  0.667 ( 0.669)	Data  0.012 ( 0.021)	Loss 6.5923e-01 (6.4795e-01)	Acc@1  84.38 ( 82.61)	Acc@5  97.66 ( 96.15)
Epoch: [2][ 1600/10010]	Time  0.668 ( 0.669)	Data  0.012 ( 0.021)	Loss 7.8712e-01 (6.4809e-01)	Acc@1  84.38 ( 82.60)	Acc@5  92.97 ( 96.15)
Epoch: [2][ 1650/10010]	Time  0.669 ( 0.669)	Data  0.012 ( 0.021)	Loss 7.9525e-01 (6.4781e-01)	Acc@1  85.16 ( 82.62)	Acc@5  93.75 ( 96.15)
Epoch: [2][ 1700/10010]	Time  0.669 ( 0.669)	Data  0.012 ( 0.021)	Loss 5.2072e-01 (6.4783e-01)	Acc@1  85.94 ( 82.61)	Acc@5  97.66 ( 96.15)
Epoch: [2][ 1750/10010]	Time  0.671 ( 0.669)	Data  0.012 ( 0.021)	Loss 5.7405e-01 (6.4776e-01)	Acc@1  87.50 ( 82.60)	Acc@5  96.09 ( 96.16)
Epoch: [2][ 1800/10010]	Tim

Epoch: [2][ 4400/10010]	Time  0.670 ( 0.669)	Data  0.012 ( 0.021)	Loss 6.6312e-01 (6.4753e-01)	Acc@1  82.81 ( 82.57)	Acc@5  96.09 ( 96.10)
Epoch: [2][ 4450/10010]	Time  0.673 ( 0.669)	Data  0.013 ( 0.021)	Loss 6.9763e-01 (6.4768e-01)	Acc@1  82.03 ( 82.57)	Acc@5  96.09 ( 96.10)
Epoch: [2][ 4500/10010]	Time  0.667 ( 0.669)	Data  0.012 ( 0.021)	Loss 6.4347e-01 (6.4771e-01)	Acc@1  83.59 ( 82.57)	Acc@5  95.31 ( 96.10)
Epoch: [2][ 4550/10010]	Time  0.669 ( 0.669)	Data  0.012 ( 0.021)	Loss 7.5447e-01 (6.4767e-01)	Acc@1  79.69 ( 82.57)	Acc@5  92.97 ( 96.10)
Epoch: [2][ 4600/10010]	Time  0.672 ( 0.669)	Data  0.012 ( 0.021)	Loss 5.8040e-01 (6.4753e-01)	Acc@1  83.59 ( 82.58)	Acc@5  99.22 ( 96.10)
Epoch: [2][ 4650/10010]	Time  0.671 ( 0.669)	Data  0.012 ( 0.021)	Loss 7.4410e-01 (6.4730e-01)	Acc@1  82.03 ( 82.59)	Acc@5  93.75 ( 96.10)
Epoch: [2][ 4700/10010]	Time  0.668 ( 0.669)	Data  0.012 ( 0.021)	Loss 6.0467e-01 (6.4712e-01)	Acc@1  85.16 ( 82.60)	Acc@5  96.09 ( 96.10)
Epoch: [2][ 4750/10010]	Tim

Epoch: [2][ 7350/10010]	Time  0.670 ( 0.669)	Data  0.012 ( 0.021)	Loss 7.4669e-01 (6.4690e-01)	Acc@1  82.03 ( 82.55)	Acc@5  95.31 ( 96.08)
Epoch: [2][ 7400/10010]	Time  0.672 ( 0.669)	Data  0.012 ( 0.021)	Loss 4.5446e-01 (6.4690e-01)	Acc@1  85.94 ( 82.55)	Acc@5  99.22 ( 96.08)
Epoch: [2][ 7450/10010]	Time  0.668 ( 0.669)	Data  0.012 ( 0.021)	Loss 6.7798e-01 (6.4689e-01)	Acc@1  82.03 ( 82.55)	Acc@5  94.53 ( 96.08)
Epoch: [2][ 7500/10010]	Time  0.670 ( 0.669)	Data  0.012 ( 0.021)	Loss 5.7805e-01 (6.4671e-01)	Acc@1  81.25 ( 82.55)	Acc@5  97.66 ( 96.08)
Epoch: [2][ 7550/10010]	Time  0.668 ( 0.669)	Data  0.012 ( 0.021)	Loss 6.9078e-01 (6.4665e-01)	Acc@1  84.38 ( 82.55)	Acc@5  95.31 ( 96.08)
Epoch: [2][ 7600/10010]	Time  0.665 ( 0.669)	Data  0.012 ( 0.021)	Loss 5.9993e-01 (6.4672e-01)	Acc@1  84.38 ( 82.55)	Acc@5  97.66 ( 96.08)
Epoch: [2][ 7650/10010]	Time  0.674 ( 0.669)	Data  0.012 ( 0.021)	Loss 7.7159e-01 (6.4667e-01)	Acc@1  80.47 ( 82.55)	Acc@5  94.53 ( 96.08)
Epoch: [2][ 7700/10010]	Tim

Test: [350/391]	Time  0.214 ( 0.434)	Loss 1.2942e+00 (1.1092e+00)	Acc@1  72.66 ( 71.95)	Acc@5  89.84 ( 90.76)
 * Acc@1 71.968 Acc@5 90.792
lr: [0.001]
