In [1]:
import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import sys

# Detect whether the code runs inside IPython/Jupyter: the interactive shell
# injects the name __IPYTHON__, a plain `python script.py` run does not.
IN_IPYTHON = True
try:
    __IPYTHON__
except NameError:
    IN_IPYTHON = False

# Defaults used in the notebook and whenever a command-line argument is missing.
DEFAULT_WORKSPACE_DIR, DEFAULT_OUTPUT_FPATH = 'food-11', 'predict.csv'

if IN_IPYTHON:
    # sys.argv belongs to the kernel launcher here, so it is ignored.
    workspace_dir, output_fpath = DEFAULT_WORKSPACE_DIR, DEFAULT_OUTPUT_FPATH
else:
    # Script usage: <script> [workspace_dir] [output_fpath]
    # Explicit length checks replace the former bare `except:` clauses, which
    # would also have swallowed unrelated errors such as KeyboardInterrupt.
    workspace_dir = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_WORKSPACE_DIR
    output_fpath = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_OUTPUT_FPATH

In [3]:
import os
import cv2
import numpy as np

# Side length (pixels) of the square images fed to the networks.
IMAGE_SIZE = 192


def _letterbox(img, size):
    """Resize `img` so its longer side equals `size`, then pad it with black to size x size."""
    height, width = img.shape[:2]
    rate = size / max(height, width)
    # Clamp to at least one pixel: for extremely elongated images int() would
    # truncate the short side to 0, which is not a valid resize target.
    new_height = max(1, int(height * rate))
    new_width = max(1, int(width * rate))
    img = cv2.resize(img, (new_width, new_height))
    # Split the padding evenly; any odd leftover pixel goes to the bottom / right edge.
    # Padding approach from https://blog.csdn.net/qq_20622615/article/details/80929746
    pad_h, pad_w = size - new_height, size - new_width
    top, left = pad_h // 2, pad_w // 2
    bottom, right = pad_h - top, pad_w - left
    return cv2.copyMakeBorder(img, top, bottom, left, right,
                              cv2.BORDER_CONSTANT, value=(0, 0, 0))


def readfile(path, label):
    """Load every file in `path` as a letterboxed IMAGE_SIZE x IMAGE_SIZE image.

    Files are processed in sorted name order. Each image is read with
    cv2.imread, scaled so that its longer side is IMAGE_SIZE, and padded with
    black to a square.

    Args:
        path: directory containing the image files.
        label: boolean; when true, the class id is parsed from the part of the
            file name before the first "_" and returned as well.

    Returns:
        x of shape (N, IMAGE_SIZE, IMAGE_SIZE, 3), dtype uint8, and, when
        `label` is true, y of shape (N,), dtype uint8, as the tuple (x, y).

    Raises:
        ValueError: if a file in `path` cannot be decoded as an image.
    """
    file_names = sorted(os.listdir(path))
    x = np.zeros((len(file_names), IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.uint8)
    y = np.zeros((len(file_names)), dtype=np.uint8)
    for i, file_name in enumerate(file_names):
        file_path = os.path.join(path, file_name)
        img = cv2.imread(file_path)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise ValueError("Cannot read image file: {}".format(file_path))
        x[i, :, :] = _letterbox(img, IMAGE_SIZE)
        if label:
            y[i] = int(file_name.split("_")[0])
    if label:
        return x, y
    return x

In [4]:
import torchvision.transforms as transforms

# Per-channel mean / std, given on the 0-255 scale and rescaled to [0, 1] so
# they match the value range produced by ToTensor.
# NOTE(review): presumably produced by the CALCULATE_STD_MEAN cell - confirm.
transform_mean = np.array([ 69.58238342,  92.66689336, 115.24940137]) / 255
transform_std = np.array([71.8342021 , 76.83536755, 83.40123168]) / 255

# Shared final step of every pipeline.
_normalize = transforms.Normalize(mean=transform_mean, std=transform_std)

# --- Augmentation pipeline 1: one flip/warp, one rotation-like op, then colour jitter.
_flip_or_warp = transforms.RandomChoice([
    transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomPerspective(),
])
_affine_or_rotate = transforms.RandomChoice([
    transforms.RandomAffine(degrees=10),    # random affine transform
    transforms.RandomRotation(degrees=40),
])
train_transform1 = transforms.Compose([
    transforms.ToPILImage(),
    _flip_or_warp,
    _affine_or_rotate,
    # NOTE(review): ColorJitter() without arguments leaves every jitter range at
    # its default of 0, which appears to make this step a no-op - confirm intent.
    transforms.ColorJitter(),
    transforms.ToTensor(),  # PIL image -> tensor with values scaled to [0, 1]
    _normalize,
])

# --- Augmentation pipeline 2: geometric ops in random order, one colour op, random erasing.
_geometric_ops = transforms.RandomOrder([
    transforms.RandomChoice([
        transforms.RandomHorizontalFlip(),
        transforms.RandomPerspective(),
    ]),
    transforms.RandomAffine(degrees=30),    # random affine transform
    transforms.RandomResizedCrop(size=(IMAGE_SIZE, IMAGE_SIZE), scale=(0.5, 1.0)),  # random sub-image
])
_colour_op = transforms.RandomChoice([
    transforms.ColorJitter(),   # see the note on ColorJitter defaults above
    transforms.RandomGrayscale(),
])
train_transform2 = transforms.Compose([
    transforms.ToPILImage(),
    _geometric_ops,
    _colour_op,
    transforms.ToTensor(),  # PIL image -> tensor with values scaled to [0, 1]
    transforms.RandomErasing(p=0.2),
    _normalize,
])

# --- Evaluation pipeline: no augmentation, only tensor conversion and normalization.
test_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    _normalize,
])

In [5]:
import torch
from torch.utils.data import DataLoader, Dataset, ConcatDataset

class ImgDataset(Dataset):
    """Dataset over an indexable collection of images with optional labels.

    Args:
        x: indexable collection of samples; `len(x)` defines the dataset size.
        y: optional labels, converted to a LongTensor when given.
        transform: optional callable applied to each sample on access.

    Indexing returns `(sample, label)` when labels were supplied, otherwise
    just `sample`.
    """

    def __init__(self, x, y=None, transform=None):
        self.x = x
        # Labels are required to be a LongTensor.
        self.y = None if y is None else torch.LongTensor(y)
        self.transform = transform

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        sample = self.x[index]
        if self.transform is not None:
            sample = self.transform(sample)
        if self.y is None:
            return sample
        return sample, self.y[index]

In [6]:
# Load the labelled training and validation images into memory.
print("Reading data")
training_dir = os.path.join(workspace_dir, "training")
train_x, train_y = readfile(training_dir, True)
print("Size of training data = {}".format(len(train_x)))
validation_dir = os.path.join(workspace_dir, "validation")
val_x, val_y = readfile(validation_dir, True)
print("Size of validation data = {}".format(len(val_x)))

batch_size = 128

# The training set holds every image twice: once augmented with
# train_transform2 and once with the plain test_transform.
# train_transform1 is currently not part of the mix.
augmented_train = ImgDataset(train_x, train_y, train_transform2)
plain_train = ImgDataset(train_x, train_y, test_transform)
train_set = ConcatDataset([augmented_train, plain_train])
val_set = ImgDataset(val_x, val_y, test_transform)

# Worker processes are only used on POSIX systems; elsewhere loading happens
# in the main process.
loader_workers = 16 if os.name == 'posix' else 0
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                          num_workers=loader_workers)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False,
                        num_workers=loader_workers)

Reading data
Size of training data = 9866
Size of validation data = 3430


In [7]:
# Set to True to recompute the per-channel statistics used for normalization.
CALCULATE_STD_MEAN = False


def channel_mean_std(image_arrays):
    """Compute per-channel mean and standard deviation over collections of images.

    Args:
        image_arrays: iterable of image collections; every image must be laid
            out as (height, width, 3).

    Returns:
        (mean, std): two float64 arrays of shape (3,), on the same value scale
        as the input pixels.

    Raises:
        ValueError: if the collections contain no pixels at all.
    """
    pixel_sum, pixel_sq_sum = np.zeros(3), np.zeros(3)
    pixel_count = 0
    # Accumulate image by image so the whole dataset is never copied to float64 at once.
    for images in image_arrays:
        for img in images:
            img = np.asarray(img, dtype=np.float64)
            pixel_sum += img.sum(axis=(0, 1))
            pixel_sq_sum += (img * img).sum(axis=(0, 1))
            pixel_count += img.shape[0] * img.shape[1]
    if pixel_count == 0:
        raise ValueError("no pixels to compute statistics from")
    mean = pixel_sum / pixel_count
    # Var[x] = E[x^2] - E[x]^2; clamp at 0 to absorb floating-point round-off.
    variance = np.maximum(pixel_sq_sum / pixel_count - mean * mean, 0.0)
    return mean, np.sqrt(variance)


if CALCULATE_STD_MEAN:
    # Use the raw (H, W, 3) pixel arrays rather than train_set / val_set: those
    # datasets yield normalized, randomly augmented (3, H, W) tensors, which do
    # not fit the per-channel accumulation and would count the training data twice.
    dataset_mean, dataset_std = channel_mean_std([train_x, val_x])
    # An expression inside an `if` block is not echoed by the notebook, so print explicitly.
    print(dataset_mean, dataset_std)

In [8]:
import torch.nn as nn

class TeacherNet_oToToT(nn.Module):
    """Convolutional classifier producing 11 logits per image.

    The feature extractor is four stages of 3x3 Conv -> BatchNorm -> ReLU
    groups, each stage closed by a 2x2 max-pool, with Dropout(0.4) between
    stages. The classifier head is three Linear layers.

    The first Linear layer expects 12*12*512 features, i.e. a
    [3, IMAGE_SIZE, IMAGE_SIZE] input with IMAGE_SIZE = 192 (192 / 2**4 = 12).
    """

    def __init__(self):
        super(TeacherNet_oToToT, self).__init__()
        # Output channels of the 3x3 convolutions inside each stage.
        stage_widths = [(64,), (128, 128), (256, 256), (512, 512, 512)]

        # Layers are collected into ONE flat Sequential so that the module
        # indices (and therefore the state_dict keys) stay cnn.0 ... cnn.30.
        feature_layers = []
        in_channels = 3
        for stage_index, widths in enumerate(stage_widths):
            if stage_index > 0:
                feature_layers.append(nn.Dropout(0.4))
            for out_channels in widths:
                # Conv2d(in_channels, out_channels, kernel_size, stride, padding)
                feature_layers.append(nn.Conv2d(in_channels, out_channels, 3, 1, 1))
                feature_layers.append(nn.BatchNorm2d(out_channels))
                feature_layers.append(nn.ReLU())
                in_channels = out_channels
            # MaxPool2d(kernel_size, stride, padding): halves height and width.
            feature_layers.append(nn.MaxPool2d(2, 2, 0))
        self.cnn = nn.Sequential(*feature_layers)

        hidden_units = 1024
        self.fc = nn.Sequential(
            nn.Linear(12*12*512, hidden_units),
            nn.BatchNorm1d(hidden_units),
            nn.ReLU(),

            nn.Dropout(0.4),

            nn.Linear(hidden_units, hidden_units),
            nn.BatchNorm1d(hidden_units),
            nn.ReLU(),

            nn.Linear(hidden_units, 11)
        )

    def forward(self, x):
        """Return class logits of shape (batch, 11) for a batch of images."""
        features = self.cnn(x)
        # Flatten everything except the batch dimension before the linear head.
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

In [9]:
# Fresh teacher network on the GPU.
teacher_net = TeacherNet_oToToT().cuda()
# To resume from a checkpoint instead:
# teacher_net.load_state_dict(torch.load('teacher_model.bin'))

# Classification task, so the criterion is cross-entropy.
loss = nn.CrossEntropyLoss()

# Training schedule: (optimizer class, learning rate, number of epochs) per phase.
_teacher_phases = [
    (torch.optim.Adam, 0.002, 80),
    (torch.optim.SGD, 0.001, 250),
]
optimizers = [(optimizer_cls, lr) for optimizer_cls, lr, _ in _teacher_phases]
num_epochs = [epochs for _, _, epochs in _teacher_phases]

In [10]:
import time

# Set to True to (re)train the teacher; otherwise the saved checkpoint is used.
TRAIN_TEACHER_NET = False

if TRAIN_TEACHER_NET:
    # Largest number of correctly classified validation samples seen so far.
    best_acc = 0

    # Train in consecutive phases, each with its own optimizer class,
    # learning rate and epoch count.
    for (optimizer_cls, lr), num_epoch in zip(optimizers, num_epochs):
        optimizer = optimizer_cls(teacher_net.parameters(), lr)
        for epoch in range(num_epoch):
            epoch_start_time = time.time()
            train_acc = 0.0
            train_loss = 0.0
            val_acc = 0.0
            val_loss = 0.0

            teacher_net.train()  # training mode (enables Dropout etc.)
            for inputs, labels in train_loader:
                optimizer.zero_grad()  # reset the parameter gradients
                train_pred = teacher_net(inputs.cuda())  # calls the model's forward()
                # Prediction and label must live on the same device.
                batch_loss = loss(train_pred, labels.cuda())
                batch_loss.backward()
                optimizer.step()  # update the parameters from the gradients

                train_acc += (train_pred.argmax(dim=1).cpu() == labels).sum().item()
                # `loss` averages over the batch; weight by batch size so that the
                # division by the dataset size below yields a true per-sample mean.
                train_loss += batch_loss.item() * len(labels)

            teacher_net.eval()
            with torch.no_grad():
                for inputs, labels in val_loader:
                    val_pred = teacher_net(inputs.cuda())
                    batch_loss = loss(val_pred, labels.cuda())
                    val_acc += (val_pred.argmax(dim=1).cpu() == labels).sum().item()
                    val_loss += batch_loss.item() * len(labels)

            # Keep only the best checkpoint: remember the new record, otherwise
            # every later epoch would overwrite the file regardless of quality.
            if val_acc > best_acc:
                best_acc = val_acc
                torch.save(teacher_net.state_dict(), 'teacher_model.bin')

            # Report the epoch statistics.
            print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' %
                  (epoch + 1, num_epoch, time.time()-epoch_start_time, train_acc/len(train_set),
                   train_loss/len(train_set), val_acc/len(val_set), val_loss/len(val_set)))

In [11]:
# Rebuild the teacher on the GPU and restore its weights from the checkpoint file.
teacher_net = TeacherNet_oToToT().cuda()
teacher_net.load_state_dict(torch.load('teacher_model.bin'))

<All keys matched successfully>

In [12]:
# Set to True to write the teacher's predictions for the test set to output_fpath.
CHECK_TEACHER_NET = False
if CHECK_TEACHER_NET:
    testing_dir = os.path.join(workspace_dir, "testing")
    test_x = readfile(testing_dir, False)
    print("Size of Testing data = {}".format(len(test_x)))
    test_set = ImgDataset(test_x, transform=test_transform)
    test_workers = 16 if os.name == 'posix' else 0
    test_loader = DataLoader(test_set, batch_size=64, shuffle=False, num_workers=test_workers)

    teacher_net.eval()

    # Predicted class index for every test image, in loader order.
    prediction = []
    with torch.no_grad():
        for batch in test_loader:
            batch_logits = teacher_net(batch.cuda())
            prediction.extend(np.argmax(batch_logits.cpu().data.numpy(), axis=1))

    # CSV with a header row followed by one "<index>,<class>" row per image.
    with open(output_fpath, 'w') as f:
        rows = ['Id,Category\n']
        rows.extend('{},{}\n'.format(idx, category) for idx, category in enumerate(prediction))
        f.writelines(rows)

In [13]:
import torch.nn.functional as F

class swish(nn.Module):
    """Swish activation: f(x) = x * sigmoid(x), applied element-wise."""

    def __init__(self):
        super(swish, self).__init__()

    def forward(self, x):
        # torch.sigmoid replaces the deprecated functional form F.sigmoid;
        # the computed values are the same.
        return x * torch.sigmoid(x)


class StudentNet(nn.Module):
    '''
      Compact network stacked from depthwise + pointwise convolution blocks.
      Replacing ordinary convolution layers by depthwise/pointwise pairs
      usually costs little accuracy.

      It is called StudentNet because the model is meant to be trained with
      knowledge distillation.
    '''

    def __init__(self, base=16, width_mult=1):
        '''
          Args:
            base: base channel count; per-layer channel counts are multiples
                of it, growing up to base*16.
            width_mult: intended for network pruning; the channel counts of
                layers 5..12 are multiplied by it (and truncated to int).
        '''
        super(StudentNet, self).__init__()
        multiplier = [2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16]

        # bandwidth[i]: number of channels used by layer i.
        bandwidth = [int(base * m) for m in multiplier]

        # Only the middle layers are pruned.
        bandwidth[5:13] = [int(channels * width_mult) for channels in bandwidth[5:13]]

        # The first layer is kept as an ordinary convolution.
        stem = nn.Sequential(
            nn.Conv2d(3, bandwidth[0], 3, 1, 1),
            nn.BatchNorm2d(bandwidth[0]),
            nn.ReLU6(),
            nn.MaxPool2d(2, 2, 0),
        )
        stages = [stem]

        # Three ReLU6 blocks, each followed by down-sampling.
        for layer in range(3):
            stages.append(self._dw_pw_block(bandwidth[layer], bandwidth[layer + 1], nn.ReLU6, pool=True))

        # From here on the image has been down-sampled often enough, so no more MaxPool.
        stages.append(self._dw_pw_block(bandwidth[3], bandwidth[4], nn.ReLU6, pool=False))

        # Remaining blocks use the swish activation.
        for layer in range(4, 13):
            stages.append(self._dw_pw_block(bandwidth[layer], bandwidth[layer + 1], swish, pool=False))

        # Global average pooling: inputs of different sizes are all reduced to
        # the same shape, so the fully connected head always fits.
        stages.append(nn.AdaptiveAvgPool2d((1, 1)))
        self.cnn = nn.Sequential(*stages)

        # Classifier head ending in the 11 class logits.
        head_units = 160
        self.fc = nn.Sequential(
            nn.Linear(bandwidth[13], head_units),
            nn.BatchNorm1d(head_units),
            nn.ReLU6(),

            nn.Dropout(0.4),

            nn.Linear(head_units, head_units),
            nn.BatchNorm1d(head_units),
            swish(),

            nn.Linear(head_units, 11)
        )

    @staticmethod
    def _dw_pw_block(in_channels, out_channels, activation, pool):
        '''Build one depthwise -> BatchNorm -> activation -> pointwise block, optionally max-pooled.'''
        layers = [
            # Depthwise convolution: one 3x3 filter per input channel.
            nn.Conv2d(in_channels, in_channels, 3, 1, 1, groups=in_channels),
            nn.BatchNorm2d(in_channels),
            # ReLU6 clamps activations to [0, 6] (the MobileNet family uses it):
            # very large values are hard to squeeze into float16 or further
            # quantization, hence the upper bound.
            activation(),
            # Pointwise (1x1) convolution. No ReLU afterwards: empirically,
            # pointwise + ReLU performs worse.
            nn.Conv2d(in_channels, out_channels, 1),
        ]
        if pool:
            # Down-sample at the end of the block.
            layers.append(nn.MaxPool2d(2, 2, 0))
        return nn.Sequential(*layers)

    def forward(self, x):
        '''Return class logits of shape (batch, 11).'''
        features = self.cnn(x)
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

In [14]:
def loss_fn_kd(outputs, labels, teacher_outputs, T=20, alpha=0.5):
    # 一般的Cross Entropy
    hard_loss = F.cross_entropy(outputs, labels) * (1. - alpha)
    # 讓logits的log_softmax對目標機率(teacher的logits/T後softmax)做KL Divergence。
    soft_loss = nn.KLDivLoss(reduction='batchmean')(F.log_softmax(outputs/T, dim=1),
                             F.softmax(teacher_outputs/T, dim=1)) * (alpha * T * T)
    return hard_loss + soft_loss

In [15]:
from torchsummary import summary

# Base channel count of the student network.
student_net_base = 8
student_net = StudentNet(base=student_net_base).cuda()

# Print a per-layer overview of output shapes and parameter counts.
summary(student_net, input_size=(3, IMAGE_SIZE, IMAGE_SIZE))

# Optimizer used by run_epoch to update the student.
optimizer = torch.optim.Adam(params=student_net.parameters(), lr=1e-3)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 192, 192]             448
       BatchNorm2d-2         [-1, 16, 192, 192]              32
             ReLU6-3         [-1, 16, 192, 192]               0
         MaxPool2d-4           [-1, 16, 96, 96]               0
            Conv2d-5           [-1, 16, 96, 96]             160
       BatchNorm2d-6           [-1, 16, 96, 96]              32
             ReLU6-7           [-1, 16, 96, 96]               0
            Conv2d-8           [-1, 32, 96, 96]             544
         MaxPool2d-9           [-1, 32, 48, 48]               0
           Conv2d-10           [-1, 32, 48, 48]             320
      BatchNorm2d-11           [-1, 32, 48, 48]              64
            ReLU6-12           [-1, 32, 48, 48]               0
           Conv2d-13           [-1, 32, 48, 48]           1,056
        MaxPool2d-14           [-1, 32,

In [16]:
def run_epoch(dataloader, update=True, alpha=0.5):
    """Run the student over `dataloader` once using the distillation loss.

    Relies on the module-level `teacher_net`, `student_net` and `optimizer`.

    Args:
        dataloader: iterable yielding (inputs, hard_labels) batches.
        update: when true, back-propagate and step the optimizer; when false,
            only evaluate.
        alpha: weight of the soft-label term, passed on to loss_fn_kd.

    Returns:
        (mean loss per sample, accuracy) over all batches.
    """
    seen, correct, loss_sum = 0, 0, 0
    for batch in dataloader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Move the batch to the GPU.
        images, targets = batch
        images = images.cuda()
        targets = torch.LongTensor(targets).cuda()

        # The teacher is never back-propagated through, so skip storing the
        # intermediate values autograd would need and save memory.
        with torch.no_grad():
            teacher_logits = teacher_net(images)

        if not update:
            # Validation only: no gradients needed for the student either.
            with torch.no_grad():
                student_logits = student_net(images)
                loss = loss_fn_kd(student_logits, targets, teacher_logits, 20, alpha)
        else:
            student_logits = student_net(images)
            # Loss blending soft and hard labels; T=20 follows the original paper's setting.
            loss = loss_fn_kd(student_logits, targets, teacher_logits, 20, alpha)
            loss.backward()
            optimizer.step()

        batch_count = len(images)
        correct += torch.sum(torch.argmax(student_logits, dim=1) == targets).item()
        seen += batch_count
        # The loss is a batch mean; weight it by the batch size before summing.
        loss_sum += loss.item() * batch_count
    return loss_sum / seen, correct / seen

In [17]:
# Number of distillation epochs to run (0 skips the loop entirely).
num_epoch = 0

# The teacher always stays in eval mode.
teacher_net.eval()

# Validation accuracy a model has to beat before it is saved.
# NOTE(review): 0.84 presumably is the best accuracy of an earlier run - confirm.
now_best_acc = 0.84

_epoch_log = '[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f'
for epoch in range(num_epoch):
    epoch_start_time = time.time()

    # One pass over the training data with updates ...
    student_net.train()
    train_loss, train_acc = run_epoch(train_loader, update=True)
    # ... followed by one evaluation pass over the validation data.
    student_net.eval()
    valid_loss, valid_acc = run_epoch(val_loader, update=False)

    # Keep the best model.
    improved = valid_acc > now_best_acc
    if improved:
        now_best_acc = valid_acc
        torch.save(student_net.state_dict(), 'student_model.bin')

    elapsed = time.time()-epoch_start_time
    print(_epoch_log % (epoch + 1, num_epoch, elapsed, train_acc,
                        train_loss, valid_acc, valid_loss))

In [18]:
# Long training run with alpha=0, i.e. only the hard-label term of the loss is weighted.
num_epoch = 200

_long_epoch_log = '[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f'
for epoch in range(num_epoch):
    epoch_start_time = time.time()

    student_net.train()
    train_loss, train_acc = run_epoch(train_loader, update=True, alpha=0)
    student_net.eval()
    valid_loss, valid_acc = run_epoch(val_loader, update=False, alpha=0)

    # Keep the best model. now_best_acc carries over from the previous cell, so
    # a checkpoint is written only when that earlier threshold is exceeded.
    beats_best = valid_acc > now_best_acc
    if beats_best:
        now_best_acc = valid_acc
        torch.save(student_net.state_dict(), 'student_model_long.bin')

    elapsed = time.time()-epoch_start_time
    print(_long_epoch_log % (epoch + 1, num_epoch, elapsed, train_acc,
                             train_loss, valid_acc, valid_loss))

[001/200] 39.56 sec(s) Train Acc: 0.217667 Loss: 2.203395 | Val Acc: 0.221866 loss: 2.138959
[002/200] 40.02 sec(s) Train Acc: 0.235506 Loss: 2.145249 | Val Acc: 0.238776 loss: 2.118534
[003/200] 40.31 sec(s) Train Acc: 0.222937 Loss: 2.168748 | Val Acc: 0.228571 loss: 2.135883
[004/200] 40.80 sec(s) Train Acc: 0.226079 Loss: 2.166416 | Val Acc: 0.234402 loss: 2.124787
[005/200] 41.30 sec(s) Train Acc: 0.217515 Loss: 2.195265 | Val Acc: 0.230321 loss: 2.160595
[006/200] 41.79 sec(s) Train Acc: 0.227448 Loss: 2.172449 | Val Acc: 0.202041 loss: 2.200933
[007/200] 41.93 sec(s) Train Acc: 0.213967 Loss: 2.209075 | Val Acc: 0.205831 loss: 2.233175
[008/200] 41.45 sec(s) Train Acc: 0.200740 Loss: 2.219565 | Val Acc: 0.186297 loss: 2.239271
[009/200] 41.53 sec(s) Train Acc: 0.195368 Loss: 2.234588 | Val Acc: 0.186006 loss: 2.218773
[010/200] 41.45 sec(s) Train Acc: 0.193594 Loss: 2.224470 | Val Acc: 0.206997 loss: 2.206872
[011/200] 41.43 sec(s) Train Acc: 0.204693 Loss: 2.200051 | Val Acc: 0

[090/200] 41.73 sec(s) Train Acc: 0.201145 Loss: 2.226241 | Val Acc: 0.183673 loss: 2.243764
[091/200] 41.73 sec(s) Train Acc: 0.192631 Loss: 2.236214 | Val Acc: 0.165598 loss: 2.266110
[092/200] 41.60 sec(s) Train Acc: 0.193341 Loss: 2.243590 | Val Acc: 0.159767 loss: 2.291364
[093/200] 41.86 sec(s) Train Acc: 0.190959 Loss: 2.243233 | Val Acc: 0.166472 loss: 2.268293
[094/200] 41.86 sec(s) Train Acc: 0.197142 Loss: 2.237805 | Val Acc: 0.184548 loss: 2.270989
[095/200] 41.72 sec(s) Train Acc: 0.199169 Loss: 2.230646 | Val Acc: 0.162974 loss: 2.305685
[096/200] 41.52 sec(s) Train Acc: 0.197648 Loss: 2.228375 | Val Acc: 0.167930 loss: 2.251294
[097/200] 41.54 sec(s) Train Acc: 0.202159 Loss: 2.223994 | Val Acc: 0.148397 loss: 2.313702
[098/200] 41.71 sec(s) Train Acc: 0.204642 Loss: 2.215136 | Val Acc: 0.177259 loss: 2.277352
[099/200] 41.65 sec(s) Train Acc: 0.201855 Loss: 2.219134 | Val Acc: 0.175802 loss: 2.352398
[100/200] 41.95 sec(s) Train Acc: 0.204135 Loss: 2.217177 | Val Acc: 0

[179/200] 41.09 sec(s) Train Acc: 0.197192 Loss: 2.218909 | Val Acc: 0.203207 loss: 2.226622
[180/200] 41.05 sec(s) Train Acc: 0.193442 Loss: 2.220714 | Val Acc: 0.156851 loss: 2.381394
[181/200] 40.98 sec(s) Train Acc: 0.192783 Loss: 2.216777 | Val Acc: 0.158017 loss: 2.258715
[182/200] 40.99 sec(s) Train Acc: 0.197496 Loss: 2.214475 | Val Acc: 0.133236 loss: 2.683775
[183/200] 41.04 sec(s) Train Acc: 0.197142 Loss: 2.213842 | Val Acc: 0.160350 loss: 2.247798
[184/200] 41.08 sec(s) Train Acc: 0.198105 Loss: 2.213617 | Val Acc: 0.164140 loss: 2.310762
[185/200] 40.94 sec(s) Train Acc: 0.196990 Loss: 2.211659 | Val Acc: 0.156268 loss: 2.388533
[186/200] 40.97 sec(s) Train Acc: 0.198713 Loss: 2.210979 | Val Acc: 0.165015 loss: 2.287943
[187/200] 41.23 sec(s) Train Acc: 0.198611 Loss: 2.211056 | Val Acc: 0.154227 loss: 2.389191
[188/200] 41.01 sec(s) Train Acc: 0.195419 Loss: 2.214052 | Val Acc: 0.157143 loss: 2.366946
[189/200] 41.06 sec(s) Train Acc: 0.194557 Loss: 2.212299 | Val Acc: 0