In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys

IN_IPYTHON = True
try:
    __IPYTHON__
except NameError:
    IN_IPYTHON = False

if IN_IPYTHON:
    workspace_dir, output_fpath = 'food-11', 'predict.csv'
else:
    try:
        workspace_dir = sys.argv[1]
    except:
        workspace_dir = 'food-11'

    try:
        output_fpath = sys.argv[2]
    except:
        output_fpath = "predict.csv"

In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch

class StudentNet(nn.Module):
    '''
      在這個Net裡面，我們會使用Depthwise & Pointwise Convolution Layer來疊model。
      你會發現，將原本的Convolution Layer換成Dw & Pw後，Accuracy通常不會降很多。

      另外，取名為StudentNet是因為這個Model等會要做Knowledge Distillation。
    '''

    def __init__(self, base=16, width_mult=1):
        '''
          Args:
            base: 這個model一開始的ch數量，每過一層都會*2，直到base*16為止。
            width_mult: 為了之後的Network Pruning使用，在base*8 chs的Layer上會 * width_mult代表剪枝後的ch數量。        
        '''
        super(StudentNet, self).__init__()
        multiplier = [1, 2, 4, 8, 16, 16, 16, 16]

        # bandwidth: 每一層Layer所使用的ch數量
        bandwidth = [ base * m for m in multiplier]

        # 我們只Pruning第三層以後的Layer
        for i in range(3, 7):
            bandwidth[i] = int(bandwidth[i] * width_mult)

        self.cnn = nn.Sequential(
            # 第一層我們通常不會拆解Convolution Layer。
            nn.Sequential(
                nn.Conv2d(3, bandwidth[0], 3, 1, 1),
                nn.BatchNorm2d(bandwidth[0]),
                nn.ReLU6(),
                nn.MaxPool2d(2, 2, 0),
            ),
            # 接下來每一個Sequential Block都一樣，所以我們只講一個Block
            nn.Sequential(
                # Depthwise Convolution
                nn.Conv2d(bandwidth[0], bandwidth[0], 3, 1, 1, groups=bandwidth[0]),
                # Batch Normalization
                nn.BatchNorm2d(bandwidth[0]),
                # ReLU6 是限制Neuron最小只會到0，最大只會到6。 MobileNet系列都是使用ReLU6。
                # 使用ReLU6的原因是因為如果數字太大，會不好壓到float16 / or further qunatization，因此才給個限制。
                nn.ReLU6(),
                # Pointwise Convolution
                nn.Conv2d(bandwidth[0], bandwidth[1], 1),
                # 過完Pointwise Convolution不需要再做ReLU，經驗上Pointwise + ReLU效果都會變差。
                nn.MaxPool2d(2, 2, 0),
                # 每過完一個Block就Down Sampling
            ),

            nn.Sequential(
                nn.Conv2d(bandwidth[1], bandwidth[1], 3, 1, 1, groups=bandwidth[1]),
                nn.BatchNorm2d(bandwidth[1]),
                nn.ReLU6(),
                nn.Conv2d(bandwidth[1], bandwidth[2], 1),
                nn.MaxPool2d(2, 2, 0),
            ),

            nn.Sequential(
                nn.Conv2d(bandwidth[2], bandwidth[2], 3, 1, 1, groups=bandwidth[2]),
                nn.BatchNorm2d(bandwidth[2]),
                nn.ReLU6(),
                nn.Conv2d(bandwidth[2], bandwidth[3], 1),
                nn.MaxPool2d(2, 2, 0),
            ),

            # 到這邊為止因為圖片已經被Down Sample很多次了，所以就不做MaxPool
            nn.Sequential(
                nn.Conv2d(bandwidth[3], bandwidth[3], 3, 1, 1, groups=bandwidth[3]),
                nn.BatchNorm2d(bandwidth[3]),
                nn.ReLU6(),
                nn.Conv2d(bandwidth[3], bandwidth[4], 1),
            ),

            nn.Sequential(
                nn.Conv2d(bandwidth[4], bandwidth[4], 3, 1, 1, groups=bandwidth[4]),
                nn.BatchNorm2d(bandwidth[4]),
                nn.ReLU6(),
                nn.Conv2d(bandwidth[5], bandwidth[5], 1),
            ),

            nn.Sequential(
                nn.Conv2d(bandwidth[5], bandwidth[5], 3, 1, 1, groups=bandwidth[5]),
                nn.BatchNorm2d(bandwidth[5]),
                nn.ReLU6(),
                nn.Conv2d(bandwidth[6], bandwidth[6], 1),
            ),

            nn.Sequential(
                nn.Conv2d(bandwidth[6], bandwidth[6], 3, 1, 1, groups=bandwidth[6]),
                nn.BatchNorm2d(bandwidth[6]),
                nn.ReLU6(),
                nn.Conv2d(bandwidth[6], bandwidth[7], 1),
            ),

            # 這邊我們採用Global Average Pooling。
            # 如果輸入圖片大小不一樣的話，就會因為Global Average Pooling壓成一樣的形狀，這樣子接下來做FC就不會對不起來。
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.fc = nn.Sequential(
            # 這邊我們直接Project到11維輸出答案。
            nn.Linear(bandwidth[7], 11),
        )

    def forward(self, x):
        out = self.cnn(x)
        out = out.view(out.size()[0], -1)
        return self.fc(out)

In [4]:
import os
import cv2
import numpy as np

IMAGE_SIZE = 192
def readfile(path, label):
    # label 是一個 boolean variable，代表需不需要回傳 y 值
    image_dir = sorted(os.listdir(path))
    x = np.zeros((len(image_dir), IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.uint8)
    y = np.zeros((len(image_dir)), dtype=np.uint8)
    for i, file in enumerate(image_dir):
        img = cv2.imread(os.path.join(path, file))
        # resize to IMAGE_SIZE x ? or ? x IMAGE_SIZE
        height = img.shape[0]
        width = img.shape[1]
        rate = IMAGE_SIZE / max(height, width)
        height = int(height * rate)
        width = int(width * rate)
        img = cv2.resize(img, (width, height))
        # pad black
        # from https://blog.csdn.net/qq_20622615/article/details/80929746
        W, H = IMAGE_SIZE, IMAGE_SIZE
        top = (H - height) // 2
        bottom = (H - height) // 2
        if top + bottom + height < H:
            bottom += 1
        left = (W - width) // 2
        right = (W - width) // 2
        if left + right + width < W:
            right += 1
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
        # to np array
        x[i, :, :] = img
        if label:
            y[i] = int(file.split("_")[0])
    if label:
      return x, y
    else:
      return x

In [5]:
import torchvision.transforms as transforms

transform_mean = np.array([ 69.58238342,  92.66689336, 115.24940137]) / 255
transform_std = np.array([71.8342021 , 76.83536755, 83.40123168]) / 255

train_transform1 = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomChoice([
        transforms.RandomVerticalFlip(),
        transforms.RandomHorizontalFlip(),
        transforms.RandomPerspective()
    ]),
    transforms.RandomChoice([
        transforms.RandomAffine(10), # 隨機線性轉換
        transforms.RandomRotation(40)
    ]),
    transforms.ColorJitter(), # 隨機色溫等
    transforms.ToTensor(), # 將圖片轉成 Tensor，並把數值 normalize 到 [0,1] (data normalization)
    transforms.Normalize(
        transform_mean,
        transform_std
    )
])
train_transform2 = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomOrder([
        transforms.RandomChoice([
            transforms.RandomHorizontalFlip(),
            transforms.RandomPerspective()
        ]),
        transforms.RandomAffine(30), # 隨機線性轉換
        transforms.RandomResizedCrop((IMAGE_SIZE, IMAGE_SIZE), scale=(0.5, 1.0)), # 隨機子圖
    ]),
    transforms.RandomChoice([
        transforms.ColorJitter(), # 隨機色溫等
        transforms.RandomGrayscale(),
    ]),
    transforms.ToTensor(), # 將圖片轉成 Tensor，並把數值 normalize 到 [0,1] (data normalization)
    transforms.RandomErasing(0.2),
    transforms.Normalize(
        transform_mean,
        transform_std
    )
])
test_transform = transforms.Compose([
    transforms.ToPILImage(),                                    
    transforms.ToTensor(),
    transforms.Normalize(
        transform_mean,
        transform_std
    )
])

In [6]:
from torch.utils.data import DataLoader, Dataset, ConcatDataset

class ImgDataset(Dataset):
    def __init__(self, x, y=None, transform=None):
        self.x = x
        # label is required to be a LongTensor
        self.y = y
        if y is not None:
            self.y = torch.LongTensor(y)
        self.transform = transform
    def __len__(self):
        return len(self.x)
    def __getitem__(self, index):
        X = self.x[index]
        if self.transform is not None:
            X = self.transform(X)
        if self.y is not None:
            Y = self.y[index]
            return X, Y
        else:
            return X

In [7]:
print("Reading data")
train_x, train_y = readfile(os.path.join(workspace_dir, "training"), True)
print("Size of training data = {}".format(len(train_x)))
val_x, val_y = readfile(os.path.join(workspace_dir, "validation"), True)
print("Size of validation data = {}".format(len(val_x)))

batch_size = 32
train_set = ConcatDataset([
    ImgDataset(train_x, train_y, train_transform1),
    ImgDataset(train_x, train_y, train_transform2),
    ImgDataset(train_x, train_y, test_transform),
#     ImgDataset(val_x, val_y, train_transform1),
#     ImgDataset(val_x, val_y, train_transform2),
#     ImgDataset(val_x, val_y, test_transform)
])
val_set = ImgDataset(val_x, val_y, test_transform)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=(16 if os.name=='posix' else 0))
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=(16 if os.name=='posix' else 0))

Reading data
Size of training data = 9866
Size of validation data = 3430


In [8]:
CALCULATE_STD_MEAN = False
if CALCULATE_STD_MEAN:
    tmp = ConcatDataset([train_set, val_set])
    tot, tot2 = np.zeros(3), np.zeros(3)
    tot_n = len(tmp) * IMAGE_SIZE ** 2
    for x, y in tmp:
        x = np.array(x, dtype=np.float64)
        tot += x.sum(axis=(0,1))
        tot2 += (x*x).sum(axis=(0,1))
    tot /= tot_n
    tot2 /= tot_n
    tot, np.sqrt(tot2 - tot*tot)

In [9]:
class TeacherNet_oToToT(nn.Module):
    def __init__(self):
        super(TeacherNet_oToToT, self).__init__()
        # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        # torch.nn.MaxPool2d(kernel_size, stride, padding)
        # input 維度 [3, IMAGE_SIZE, IMAGE_SIZE]
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
            
            nn.Dropout(0.4),
            
            nn.Conv2d(64, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
            
            nn.Dropout(0.4),
            
            nn.Conv2d(128, 256, 3, 1, 1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
            
            nn.Dropout(0.4),
            
            nn.Conv2d(256, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(2, 2, 0),
        )
        self.fc = nn.Sequential(
            nn.Linear(12*12*512, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            
            nn.Dropout(0.4),
            
            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
                        
            nn.Linear(1024, 11)
        )

    def forward(self, x):
        out = self.cnn(x)
        out = out.view(out.size()[0], -1)
        return self.fc(out)

In [10]:
teacher_net = TeacherNet_oToToT().cuda()
loss = nn.CrossEntropyLoss() # 因為是 classification task，所以 loss 使用 CrossEntropyLoss
optimizers = [
    (torch.optim.Adam, 0.002),
    (torch.optim.SGD, 0.001)
]
num_epochs = [
    80,
    250
]

In [11]:
import time

TRAIN_TEACHER_NET = False

if TRAIN_TEACHER_NET:
    best_acc = 0

    for (optimizer, lr), num_epoch in zip(optimizers, num_epochs):
        optimizer = optimizer(teacher_net.parameters(), lr)
        for epoch in range(num_epoch):
            epoch_start_time = time.time()
            train_acc = 0.0
            train_loss = 0.0
            val_acc = 0.0
            val_loss = 0.0

            teacher_net.train() # 確保 model 是在 train model (開啟 Dropout 等...)
            for i, data in enumerate(train_loader):
                optimizer.zero_grad() # 用 optimizer 將 model 參數的 gradient 歸零
                train_pred = teacher_net(data[0].cuda()) # 利用 model 得到預測的機率分佈 這邊實際上就是去呼叫 model 的 forward 函數
                batch_loss = loss(train_pred, data[1].cuda()) # 計算 loss （注意 prediction 跟 label 必須同時在 CPU 或是 GPU 上）
                batch_loss.backward() 
                optimizer.step() # 以 optimizer 用 gradient 更新參數值

                train_acc += np.sum(np.argmax(train_pred.cpu().data.numpy(), axis=1) == data[1].numpy())
                train_loss += batch_loss.item()

#             print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f' % 
#                 (epoch + 1, num_epoch, time.time()-epoch_start_time, train_acc/len(train_set), train_loss/len(train_set)))
                
            teacher_net.eval()
            with torch.no_grad():
                for i, data in enumerate(val_loader):
                    val_pred = teacher_net(data[0].cuda())
                    batch_loss = loss(val_pred, data[1].cuda())
                    val_acc += np.sum(np.argmax(val_pred.cpu().data.numpy(), axis=1) == data[1].numpy())
                    val_loss += batch_loss.item()

                if val_acc > best_acc:
                    torch.save(teacher_net.state_dict(), 'teacher_model.bin')

                #將結果 print 出來
                print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % 
                      (epoch + 1, num_epoch, time.time()-epoch_start_time, train_acc/len(train_set),
                       train_loss/len(train_set), val_acc/len(val_set), val_loss/len(val_set)))
#     torch.save(teacher_net.state_dict(), 'teacher_model.bin')

In [12]:
teacher_net = TeacherNet_oToToT().cuda()
teacher_net.load_state_dict(torch.load('teacher_model.bin'))

<All keys matched successfully>

In [13]:
CHECK_TEACHER_NET = False
if CHECK_TEACHER_NET:
    test_x = readfile(os.path.join(workspace_dir, "testing"), False)
    print("Size of Testing data = {}".format(len(test_x)))
    test_set = ImgDataset(test_x, transform=test_transform)
    test_loader = DataLoader(test_set, batch_size=64, shuffle=False, num_workers=(16 if os.name=='posix' else 0))

    teacher_net.eval()

    prediction = []
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            test_pred = teacher_net(data.cuda())
            test_label = np.argmax(test_pred.cpu().data.numpy(), axis=1)
            for y in test_label:
                prediction.append(y)

    with open(output_fpath, 'w') as f:
        f.write('Id,Category\n')
        for i, y in enumerate(prediction):
            f.write('{},{}\n'.format(i, y))

In [14]:
class swish(nn.Module):
    def __init__(self):
        super(swish, self).__init__()
 
    def forward(self, x):
        x = x * F.sigmoid(x)
        return x


class StudentNet(nn.Module):
    '''
      在這個Net裡面，我們會使用Depthwise & Pointwise Convolution Layer來疊model。
      你會發現，將原本的Convolution Layer換成Dw & Pw後，Accuracy通常不會降很多。

      另外，取名為StudentNet是因為這個Model等會要做Knowledge Distillation。
    '''

    def __init__(self, base=16, width_mult=1):
        '''
          Args:
            base: 這個model一開始的ch數量，每過一層都會*2，直到base*16為止。
            width_mult: 為了之後的Network Pruning使用，在base*8 chs的Layer上會 * width_mult代表剪枝後的ch數量。        
        '''
        super(StudentNet, self).__init__()
        multiplier = [2, 4, 8, 8, 16, 16, 16, 16]

        # bandwidth: 每一層Layer所使用的ch數量
        bandwidth = [ base * m for m in multiplier]

        # 我們只Pruning第三層以後的Layer
        for i in range(3, 7):
            bandwidth[i] = int(bandwidth[i] * width_mult)

        self.cnn = nn.Sequential(
            # 第一層我們通常不會拆解Convolution Layer。
            nn.Sequential(
                nn.Conv2d(3, bandwidth[0], 3, 1, 1),
                nn.BatchNorm2d(bandwidth[0]),
                nn.ReLU6(),
                nn.MaxPool2d(2, 2, 0),
            ),
            # 接下來每一個Sequential Block都一樣，所以我們只講一個Block
            nn.Sequential(
                # Depthwise Convolution
                nn.Conv2d(bandwidth[0], bandwidth[0], 3, 1, 1, groups=bandwidth[0]),
                # Batch Normalization
                nn.BatchNorm2d(bandwidth[0]),
                # ReLU6 是限制Neuron最小只會到0，最大只會到6。 MobileNet系列都是使用ReLU6。
                # 使用ReLU6的原因是因為如果數字太大，會不好壓到float16 / or further qunatization，因此才給個限制。
                nn.ReLU6(),
                # Pointwise Convolution
                nn.Conv2d(bandwidth[0], bandwidth[1], 1),
                # 過完Pointwise Convolution不需要再做ReLU，經驗上Pointwise + ReLU效果都會變差。
                nn.MaxPool2d(2, 2, 0),
                # 每過完一個Block就Down Sampling
            ),

            nn.Sequential(
                nn.Conv2d(bandwidth[1], bandwidth[1], 3, 1, 1, groups=bandwidth[1]),
                nn.BatchNorm2d(bandwidth[1]),
                nn.ReLU6(),
                nn.Conv2d(bandwidth[1], bandwidth[2], 1),
                nn.MaxPool2d(2, 2, 0),
            ),

            nn.Sequential(
                nn.Conv2d(bandwidth[2], bandwidth[2], 3, 1, 1, groups=bandwidth[2]),
                nn.BatchNorm2d(bandwidth[2]),
                nn.ReLU6(),
                nn.Conv2d(bandwidth[2], bandwidth[3], 1),
                nn.MaxPool2d(2, 2, 0),
            ),

            # 到這邊為止因為圖片已經被Down Sample很多次了，所以就不做MaxPool
            nn.Sequential(
                nn.Conv2d(bandwidth[3], bandwidth[3], 3, 1, 1, groups=bandwidth[3]),
                nn.BatchNorm2d(bandwidth[3]),
                nn.ReLU6(),
                nn.Conv2d(bandwidth[3], bandwidth[4], 1),
            ),

            nn.Sequential(
                nn.Conv2d(bandwidth[4], bandwidth[4], 3, 1, 1, groups=bandwidth[4]),
                nn.BatchNorm2d(bandwidth[4]),
                swish(),
                nn.Conv2d(bandwidth[4], bandwidth[5], 1),
            ),

            nn.Sequential(
                nn.Conv2d(bandwidth[5], bandwidth[5], 3, 1, 1, groups=bandwidth[5]),
                nn.BatchNorm2d(bandwidth[5]),
                swish(),
                nn.Conv2d(bandwidth[5], bandwidth[6], 1),
            ),

            nn.Sequential(
                nn.Conv2d(bandwidth[6], bandwidth[6], 3, 1, 1, groups=bandwidth[6]),
                nn.BatchNorm2d(bandwidth[6]),
                swish(),
                nn.Conv2d(bandwidth[6], bandwidth[7], 1),
            ),

            # 這邊我們採用Global Average Pooling。
            # 如果輸入圖片大小不一樣的話，就會因為Global Average Pooling壓成一樣的形狀，這樣子接下來做FC就不會對不起來。
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.fc = nn.Sequential(
            # 這邊我們直接Project到11維輸出答案。
            nn.Linear(bandwidth[7], 128),
            nn.BatchNorm1d(128),
            nn.ReLU6(),
            
            nn.Dropout(0.4),
            
            nn.Linear(128, 128),
            nn.BatchNorm1d(128),
            swish(),
                        
            nn.Linear(128, 11)
        )

    def forward(self, x):
        out = self.cnn(x)
        out = out.view(out.size()[0], -1)
        return self.fc(out)

In [15]:
def loss_fn_kd(outputs, labels, teacher_outputs, T=20, alpha=0.5):
    # 一般的Cross Entropy
    hard_loss = F.cross_entropy(outputs, labels) * (1. - alpha)
    # 讓logits的log_softmax對目標機率(teacher的logits/T後softmax)做KL Divergence。
    soft_loss = nn.KLDivLoss(reduction='batchmean')(F.log_softmax(outputs/T, dim=1),
                             F.softmax(teacher_outputs/T, dim=1)) * (alpha * T * T)
    return hard_loss + soft_loss

In [16]:
from torchvision.models import mobilenet_v2

student_net = StudentNet(12).cuda()
student_net.load_state_dict(torch.load('student_model.bin'))
# student_net = mobilenet_v2(
#     num_classes=11,
#     width_mult=0.6,
#     round_nearest=4,
#     inverted_residual_setting = [
#         # t, c, n, s
#         [1, 16, 1, 1],
#         [6, 24, 2, 2],
# #         [6, 32, 3, 2],
#         [6, 64, 4, 2],
#         [6, 96, 3, 1],
# #         [6, 160, 3, 2],
#         [6, 320, 1, 1],
#     ]
# ).cuda()
optimizer = torch.optim.Adam(student_net.parameters(), lr=1e-3)

In [17]:
def run_epoch(dataloader, update=True, alpha=0.5):
    total_num, total_hit, total_loss = 0, 0, 0
    for now_step, batch_data in enumerate(dataloader):
        # 清空 optimizer
        optimizer.zero_grad()
        # 處理 input
        inputs, hard_labels = batch_data
        inputs = inputs.cuda()
        hard_labels = torch.LongTensor(hard_labels).cuda()
        # 因為Teacher沒有要backprop，所以我們使用torch.no_grad
        # 告訴torch不要暫存中間值(去做backprop)以浪費記憶體空間。
        with torch.no_grad():
            soft_labels = teacher_net(inputs)

        if update:
            logits = student_net(inputs)
            # 使用我們之前所寫的融合soft label&hard label的loss。
            # T=20是原始論文的參數設定。
            loss = loss_fn_kd(logits, hard_labels, soft_labels, 20, alpha)
            loss.backward()
            optimizer.step()    
        else:
            # 只是算validation acc的話，就開no_grad節省空間。
            with torch.no_grad():
                logits = student_net(inputs)
                loss = loss_fn_kd(logits, hard_labels, soft_labels, 20, alpha)
            
        total_hit += torch.sum(torch.argmax(logits, dim=1) == hard_labels).item()
        total_num += len(inputs)

        total_loss += loss.item() * len(inputs)
    return total_loss / total_num, total_hit / total_num

In [18]:
num_epoch = 0

# TeacherNet永遠都是Eval mode.
teacher_net.eval()
now_best_acc = 0.846064
for epoch in range(num_epoch):
    epoch_start_time = time.time()
    
    student_net.train()
    train_loss, train_acc = run_epoch(train_loader, update=True)
    student_net.eval()
    valid_loss, valid_acc = run_epoch(val_loader, update=False)

    # 存下最好的model。
    if valid_acc > now_best_acc:
        now_best_acc = valid_acc
        torch.save(student_net.state_dict(), 'student_model.bin')
    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % 
            (epoch + 1, num_epoch, time.time()-epoch_start_time, train_acc,
            train_loss, valid_acc, valid_loss))

In [19]:
num_epoch = 0

for epoch in range(num_epoch):
    epoch_start_time = time.time()
    
    student_net.train()
    train_loss, train_acc = run_epoch(train_loader, update=True, alpha=0)
    student_net.eval()
    valid_loss, valid_acc = run_epoch(val_loader, update=False, alpha=0)

    # 存下最好的model。
    if valid_acc > now_best_acc:
        now_best_acc = valid_acc
        torch.save(student_net.state_dict(), 'student_model.bin')
    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % 
            (epoch + 1, num_epoch, time.time()-epoch_start_time, train_acc,
            train_loss, valid_acc, valid_loss))

In [20]:
def network_slimming(old_model, new_model):
    params = old_model.state_dict()
    new_params = new_model.state_dict()
    
    # selected_idx: 每一層所選擇的neuron index
    selected_idx = []
    # 我們總共有7層CNN，因此逐一抓取選擇的neuron index們。
    for i in range(8):
        # 根據上表，我們要抓的gamma係數在cnn.{i}.1.weight內。
        importance = params[f'cnn.{i}.1.weight']
        # 抓取總共要篩選幾個neuron。
        old_dim = len(importance)
        new_dim = len(new_params[f'cnn.{i}.1.weight'])
        # 以Ranking做Index排序，較大的會在前面(descending=True)。
        ranking = torch.argsort(importance, descending=True)
        # 把篩選結果放入selected_idx中。
        selected_idx.append(ranking[:new_dim])

    now_processed = 1
    for (name, p1), (name2, p2) in zip(params.items(), new_params.items()):
        # 如果是cnn層，則移植參數。
        # 如果是FC層，或是該參數只有一個數字(例如batchnorm的tracenum等等資訊)，那麼就直接複製。
        if name.startswith('cnn') and p1.size() != torch.Size([]) and now_processed != len(selected_idx):
            # 當處理到Pointwise的weight時，讓now_processed+1，表示該層的移植已經完成。
            if name.startswith(f'cnn.{now_processed}.3'):
                now_processed += 1

            # 如果是pointwise，weight會被上一層的pruning和下一層的pruning所影響，因此需要特判。
            if name.endswith('3.weight'):
                # 如果是最後一層cnn，則輸出的neuron不需要prune掉。
                if len(selected_idx) == now_processed:
                    new_params[name] = p1[:,selected_idx[now_processed-1]]
                # 反之，就依照上層和下層所選擇的index進行移植。
                # 這裡需要注意的是Conv2d(x,y,1)的weight shape是(y,x,1,1)，順序是反的。
                else:
                    new_params[name] = p1[selected_idx[now_processed]][:,selected_idx[now_processed-1]]
            else:
                new_params[name] = p1[selected_idx[now_processed]]
        else:
            new_params[name] = p1

    # 讓新model load進被我們篩選過的parameters，並回傳new_model。        
    new_model.load_state_dict(new_params)
    return new_model

In [None]:
from torchsummary import summary

student_net = StudentNet(12).cuda()
student_net.load_state_dict(torch.load('student_model.bin'))

now_width_mult = 1
for i in range(30):
    now_width_mult *= 0.85
    print(now_width_mult)
    new_net = StudentNet(12, width_mult=now_width_mult).cuda()
    student_net = network_slimming(student_net, new_net)
    summary(student_net, (3, IMAGE_SIZE, IMAGE_SIZE))
    now_best_acc = 0
    for epoch in range(200):
        epoch_start_time = time.time()

        student_net.train()
        train_loss, train_acc = run_epoch(train_loader, update=True)
        student_net.eval()
        valid_loss, valid_acc = run_epoch(val_loader, update=False)

        # 存下最好的model。
        if valid_acc > now_best_acc:
            now_best_acc = valid_acc
            torch.save(student_net.state_dict(), f'student_model-pruned_{i}.bin')
        print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % 
                (epoch + 1, 200, time.time()-epoch_start_time, train_acc,
                train_loss, valid_acc, valid_loss))
    for epoch in range(0):
        epoch_start_time = time.time()

        student_net.train()
        train_loss, train_acc = run_epoch(train_loader, update=True, alpha=0)
        student_net.eval()
        valid_loss, valid_acc = run_epoch(val_loader, update=False, alpha=0)

        # 存下最好的model。
        if valid_acc > now_best_acc:
            now_best_acc = valid_acc
            torch.save(student_net.state_dict(), f'student_model-pruned_{i}.bin')
        print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % 
                (epoch + 1, 0, time.time()-epoch_start_time, train_acc,
                train_loss, valid_acc, valid_loss))

0.85
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 24, 192, 192]             672
       BatchNorm2d-2         [-1, 24, 192, 192]              48
             ReLU6-3         [-1, 24, 192, 192]               0
         MaxPool2d-4           [-1, 24, 96, 96]               0
            Conv2d-5           [-1, 24, 96, 96]             240
       BatchNorm2d-6           [-1, 24, 96, 96]              48
             ReLU6-7           [-1, 24, 96, 96]               0
            Conv2d-8           [-1, 48, 96, 96]           1,200
         MaxPool2d-9           [-1, 48, 48, 48]               0
           Conv2d-10           [-1, 48, 48, 48]             480
      BatchNorm2d-11           [-1, 48, 48, 48]              96
            ReLU6-12           [-1, 48, 48, 48]               0
           Conv2d-13           [-1, 96, 48, 48]           4,704
        MaxPool2d-14           [-1

[052/200] 124.43 sec(s) Train Acc: 0.825090 Loss: 5.600588 | Val Acc: 0.779300 loss: 10.254207
[053/200] 124.56 sec(s) Train Acc: 0.825867 Loss: 5.585746 | Val Acc: 0.780466 loss: 10.253874
[054/200] 124.54 sec(s) Train Acc: 0.822893 Loss: 5.594600 | Val Acc: 0.777843 loss: 10.146941
[055/200] 124.46 sec(s) Train Acc: 0.826745 Loss: 5.583877 | Val Acc: 0.777843 loss: 10.384850
[056/200] 124.44 sec(s) Train Acc: 0.826542 Loss: 5.546320 | Val Acc: 0.779883 loss: 10.196867
[057/200] 124.46 sec(s) Train Acc: 0.824245 Loss: 5.509893 | Val Acc: 0.777551 loss: 10.215672
[058/200] 124.40 sec(s) Train Acc: 0.825833 Loss: 5.540670 | Val Acc: 0.775802 loss: 10.157976
[059/200] 124.40 sec(s) Train Acc: 0.826846 Loss: 5.571777 | Val Acc: 0.778426 loss: 10.237158
[060/200] 124.56 sec(s) Train Acc: 0.826069 Loss: 5.571442 | Val Acc: 0.781050 loss: 10.341229
[061/200] 124.44 sec(s) Train Acc: 0.827590 Loss: 5.537953 | Val Acc: 0.778717 loss: 10.266468
[062/200] 124.47 sec(s) Train Acc: 0.826002 Loss: 

[139/200] 124.41 sec(s) Train Acc: 0.824008 Loss: 5.586171 | Val Acc: 0.774927 loss: 10.358473
[140/200] 124.47 sec(s) Train Acc: 0.826542 Loss: 5.550780 | Val Acc: 0.774344 loss: 10.200647
[141/200] 124.56 sec(s) Train Acc: 0.828603 Loss: 5.560835 | Val Acc: 0.781633 loss: 10.214131
[142/200] 124.60 sec(s) Train Acc: 0.825495 Loss: 5.577298 | Val Acc: 0.773469 loss: 10.294959
[143/200] 124.39 sec(s) Train Acc: 0.827117 Loss: 5.557418 | Val Acc: 0.779300 loss: 10.164828
[144/200] 124.42 sec(s) Train Acc: 0.826306 Loss: 5.547466 | Val Acc: 0.776093 loss: 10.112123
[145/200] 124.58 sec(s) Train Acc: 0.825090 Loss: 5.550752 | Val Acc: 0.781633 loss: 10.285808
[146/200] 124.56 sec(s) Train Acc: 0.823299 Loss: 5.577780 | Val Acc: 0.781924 loss: 10.160929
[147/200] 124.49 sec(s) Train Acc: 0.825056 Loss: 5.549945 | Val Acc: 0.775802 loss: 10.194465
[148/200] 124.59 sec(s) Train Acc: 0.826846 Loss: 5.544383 | Val Acc: 0.778134 loss: 10.256600
[149/200] 124.47 sec(s) Train Acc: 0.826171 Loss: 

[001/200] 124.31 sec(s) Train Acc: 0.699709 Loss: 9.039336 | Val Acc: 0.684548 loss: 13.916706
[002/200] 124.37 sec(s) Train Acc: 0.692952 Loss: 9.050467 | Val Acc: 0.681050 loss: 14.134139
[003/200] 124.45 sec(s) Train Acc: 0.696838 Loss: 9.172328 | Val Acc: 0.677551 loss: 13.997515
[004/200] 124.51 sec(s) Train Acc: 0.699203 Loss: 9.070137 | Val Acc: 0.681050 loss: 13.851232
[005/200] 124.42 sec(s) Train Acc: 0.694844 Loss: 9.155139 | Val Acc: 0.678134 loss: 14.031890
[006/200] 124.40 sec(s) Train Acc: 0.697648 Loss: 9.108827 | Val Acc: 0.677551 loss: 13.992419
[007/200] 124.46 sec(s) Train Acc: 0.697885 Loss: 9.116390 | Val Acc: 0.679300 loss: 14.005812
[008/200] 124.33 sec(s) Train Acc: 0.698594 Loss: 9.098923 | Val Acc: 0.679009 loss: 14.038006
[009/200] 124.56 sec(s) Train Acc: 0.697513 Loss: 9.112505 | Val Acc: 0.684548 loss: 13.955056
[010/200] 124.35 sec(s) Train Acc: 0.699642 Loss: 9.095919 | Val Acc: 0.677843 loss: 14.128754
[011/200] 124.43 sec(s) Train Acc: 0.695520 Loss: 

[088/200] 124.54 sec(s) Train Acc: 0.697378 Loss: 9.111963 | Val Acc: 0.680758 loss: 14.123755
[089/200] 124.61 sec(s) Train Acc: 0.694202 Loss: 9.125511 | Val Acc: 0.687172 loss: 14.002967
[090/200] 124.63 sec(s) Train Acc: 0.696128 Loss: 9.149549 | Val Acc: 0.684548 loss: 13.959394
[091/200] 124.63 sec(s) Train Acc: 0.700723 Loss: 9.090030 | Val Acc: 0.673761 loss: 14.121826
[092/200] 124.43 sec(s) Train Acc: 0.695148 Loss: 9.125735 | Val Acc: 0.677843 loss: 14.074315
[093/200] 124.49 sec(s) Train Acc: 0.695419 Loss: 9.138289 | Val Acc: 0.674636 loss: 13.975737
[094/200] 124.52 sec(s) Train Acc: 0.692277 Loss: 9.150199 | Val Acc: 0.679009 loss: 14.078492
[095/200] 124.30 sec(s) Train Acc: 0.695554 Loss: 9.113907 | Val Acc: 0.676968 loss: 14.019463
[096/200] 124.50 sec(s) Train Acc: 0.693425 Loss: 9.128665 | Val Acc: 0.679300 loss: 13.974465
[097/200] 124.41 sec(s) Train Acc: 0.697919 Loss: 9.091504 | Val Acc: 0.676093 loss: 14.089254
[098/200] 124.55 sec(s) Train Acc: 0.698459 Loss: 

[175/200] 124.67 sec(s) Train Acc: 0.693527 Loss: 9.139698 | Val Acc: 0.681924 loss: 13.939891
[176/200] 124.56 sec(s) Train Acc: 0.696702 Loss: 9.133788 | Val Acc: 0.680758 loss: 14.024369
[177/200] 124.69 sec(s) Train Acc: 0.693391 Loss: 9.138213 | Val Acc: 0.679592 loss: 14.149450
[178/200] 124.51 sec(s) Train Acc: 0.697277 Loss: 9.053356 | Val Acc: 0.684840 loss: 14.084900
[179/200] 124.62 sec(s) Train Acc: 0.699372 Loss: 9.093963 | Val Acc: 0.683090 loss: 14.011254
[180/200] 124.75 sec(s) Train Acc: 0.696635 Loss: 9.125257 | Val Acc: 0.678426 loss: 14.177246
[181/200] 124.50 sec(s) Train Acc: 0.698426 Loss: 9.085432 | Val Acc: 0.683673 loss: 13.960062
[182/200] 124.61 sec(s) Train Acc: 0.694608 Loss: 9.162323 | Val Acc: 0.679009 loss: 13.911092
[183/200] 124.68 sec(s) Train Acc: 0.695655 Loss: 9.091036 | Val Acc: 0.680175 loss: 14.067623
[184/200] 124.26 sec(s) Train Acc: 0.692648 Loss: 9.107795 | Val Acc: 0.686589 loss: 14.018195
[185/200] 124.50 sec(s) Train Acc: 0.699372 Loss: 

[026/200] 123.34 sec(s) Train Acc: 0.552368 Loss: 14.301383 | Val Acc: 0.550146 loss: 19.361426
[027/200] 123.39 sec(s) Train Acc: 0.553889 Loss: 14.214046 | Val Acc: 0.554227 loss: 19.560861
[028/200] 123.44 sec(s) Train Acc: 0.549463 Loss: 14.264414 | Val Acc: 0.554810 loss: 19.270667
[029/200] 123.39 sec(s) Train Acc: 0.551726 Loss: 14.258906 | Val Acc: 0.557434 loss: 19.348658
[030/200] 123.45 sec(s) Train Acc: 0.547469 Loss: 14.295571 | Val Acc: 0.551020 loss: 19.347019
[031/200] 123.42 sec(s) Train Acc: 0.553145 Loss: 14.246154 | Val Acc: 0.556851 loss: 19.343092
[032/200] 123.46 sec(s) Train Acc: 0.549970 Loss: 14.329275 | Val Acc: 0.551312 loss: 19.386261
[033/200] 123.23 sec(s) Train Acc: 0.549801 Loss: 14.211557 | Val Acc: 0.553061 loss: 19.346589
[034/200] 123.37 sec(s) Train Acc: 0.551963 Loss: 14.238501 | Val Acc: 0.557434 loss: 19.349897
[035/200] 123.39 sec(s) Train Acc: 0.546118 Loss: 14.331856 | Val Acc: 0.551603 loss: 19.407149
[036/200] 123.25 sec(s) Train Acc: 0.546

[112/200] 123.37 sec(s) Train Acc: 0.549497 Loss: 14.232686 | Val Acc: 0.555685 loss: 19.276933
[113/200] 123.44 sec(s) Train Acc: 0.552199 Loss: 14.187120 | Val Acc: 0.552187 loss: 19.395181
[114/200] 123.32 sec(s) Train Acc: 0.550780 Loss: 14.229474 | Val Acc: 0.553061 loss: 19.382726
[115/200] 123.40 sec(s) Train Acc: 0.548686 Loss: 14.172643 | Val Acc: 0.553353 loss: 19.424629
[116/200] 123.38 sec(s) Train Acc: 0.546929 Loss: 14.374086 | Val Acc: 0.553936 loss: 19.333439
[117/200] 123.26 sec(s) Train Acc: 0.548044 Loss: 14.299564 | Val Acc: 0.551603 loss: 19.372116
[118/200] 123.37 sec(s) Train Acc: 0.550071 Loss: 14.255390 | Val Acc: 0.553936 loss: 19.375189
[119/200] 123.32 sec(s) Train Acc: 0.555544 Loss: 14.256298 | Val Acc: 0.551895 loss: 19.423818
[120/200] 123.43 sec(s) Train Acc: 0.548686 Loss: 14.299644 | Val Acc: 0.556268 loss: 19.292301
[121/200] 123.42 sec(s) Train Acc: 0.546422 Loss: 14.285439 | Val Acc: 0.553353 loss: 19.387517
[122/200] 123.54 sec(s) Train Acc: 0.547

[198/200] 123.80 sec(s) Train Acc: 0.553956 Loss: 14.275463 | Val Acc: 0.554810 loss: 19.244741
[199/200] 123.93 sec(s) Train Acc: 0.546186 Loss: 14.323346 | Val Acc: 0.555394 loss: 19.360142
[200/200] 124.00 sec(s) Train Acc: 0.550679 Loss: 14.336715 | Val Acc: 0.554810 loss: 19.400276
0.5220062499999999
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 24, 192, 192]             672
       BatchNorm2d-2         [-1, 24, 192, 192]              48
             ReLU6-3         [-1, 24, 192, 192]               0
         MaxPool2d-4           [-1, 24, 96, 96]               0
            Conv2d-5           [-1, 24, 96, 96]             240
       BatchNorm2d-6           [-1, 24, 96, 96]              48
             ReLU6-7           [-1, 24, 96, 96]               0
            Conv2d-8           [-1, 48, 96, 96]           1,200
         MaxPool2d-9           [-1, 48, 48, 48]     

[048/200] 123.16 sec(s) Train Acc: 0.403102 Loss: 19.957625 | Val Acc: 0.425656 loss: 25.208448
[049/200] 122.90 sec(s) Train Acc: 0.399453 Loss: 19.935100 | Val Acc: 0.422157 loss: 25.238876
[050/200] 122.97 sec(s) Train Acc: 0.401514 Loss: 19.914066 | Val Acc: 0.423324 loss: 25.140511
[051/200] 123.06 sec(s) Train Acc: 0.401784 Loss: 19.877688 | Val Acc: 0.416910 loss: 25.548550
[052/200] 123.09 sec(s) Train Acc: 0.407629 Loss: 19.962534 | Val Acc: 0.419534 loss: 25.439621
[053/200] 122.95 sec(s) Train Acc: 0.403338 Loss: 19.935317 | Val Acc: 0.422449 loss: 25.178616
[054/200] 122.90 sec(s) Train Acc: 0.398946 Loss: 19.891629 | Val Acc: 0.427697 loss: 25.366241
[055/200] 122.85 sec(s) Train Acc: 0.403237 Loss: 19.884634 | Val Acc: 0.425656 loss: 25.262605
[056/200] 122.98 sec(s) Train Acc: 0.397797 Loss: 20.002473 | Val Acc: 0.427114 loss: 25.195261
[057/200] 123.11 sec(s) Train Acc: 0.403000 Loss: 19.999661 | Val Acc: 0.418076 loss: 25.181239
[058/200] 123.01 sec(s) Train Acc: 0.402

[134/200] 123.17 sec(s) Train Acc: 0.400939 Loss: 19.981654 | Val Acc: 0.426822 loss: 25.366798
[135/200] 123.35 sec(s) Train Acc: 0.401851 Loss: 19.828795 | Val Acc: 0.423615 loss: 25.326493
[136/200] 123.62 sec(s) Train Acc: 0.405196 Loss: 19.920959 | Val Acc: 0.423324 loss: 25.217592
[137/200] 123.37 sec(s) Train Acc: 0.401851 Loss: 19.936860 | Val Acc: 0.424781 loss: 25.350724
[138/200] 123.31 sec(s) Train Acc: 0.404115 Loss: 19.857697 | Val Acc: 0.429155 loss: 25.029343
[139/200] 123.36 sec(s) Train Acc: 0.403811 Loss: 19.996238 | Val Acc: 0.423032 loss: 25.567714
[140/200] 123.28 sec(s) Train Acc: 0.402257 Loss: 19.827450 | Val Acc: 0.415452 loss: 25.321554
[141/200] 123.57 sec(s) Train Acc: 0.403102 Loss: 19.918025 | Val Acc: 0.427114 loss: 25.218776
[142/200] 123.51 sec(s) Train Acc: 0.401851 Loss: 19.889248 | Val Acc: 0.421574 loss: 25.404546
[143/200] 123.51 sec(s) Train Acc: 0.402189 Loss: 19.902712 | Val Acc: 0.426822 loss: 25.411162
[144/200] 123.31 sec(s) Train Acc: 0.401

[001/200] 122.85 sec(s) Train Acc: 0.301676 Loss: 25.289472 | Val Acc: 0.322449 loss: 30.901257
[002/200] 122.62 sec(s) Train Acc: 0.298635 Loss: 25.409341 | Val Acc: 0.328863 loss: 30.862350
[003/200] 122.75 sec(s) Train Acc: 0.300324 Loss: 25.291922 | Val Acc: 0.315452 loss: 31.084402
[004/200] 122.75 sec(s) Train Acc: 0.301068 Loss: 25.298294 | Val Acc: 0.323324 loss: 31.125551
[005/200] 122.74 sec(s) Train Acc: 0.297284 Loss: 25.346527 | Val Acc: 0.323032 loss: 30.939706
[006/200] 122.84 sec(s) Train Acc: 0.301608 Loss: 25.208625 | Val Acc: 0.321866 loss: 31.419141
[007/200] 122.82 sec(s) Train Acc: 0.299649 Loss: 25.447447 | Val Acc: 0.327114 loss: 30.982403
[008/200] 122.69 sec(s) Train Acc: 0.301135 Loss: 25.221098 | Val Acc: 0.327405 loss: 30.952584
[009/200] 122.83 sec(s) Train Acc: 0.297892 Loss: 25.317028 | Val Acc: 0.325364 loss: 30.748925
[010/200] 122.85 sec(s) Train Acc: 0.300628 Loss: 25.319928 | Val Acc: 0.330612 loss: 31.002499
[011/200] 122.89 sec(s) Train Acc: 0.298

[087/200] 122.66 sec(s) Train Acc: 0.304345 Loss: 25.231199 | Val Acc: 0.322157 loss: 30.943238
[088/200] 122.71 sec(s) Train Acc: 0.302723 Loss: 25.378054 | Val Acc: 0.325073 loss: 31.246472
[089/200] 122.53 sec(s) Train Acc: 0.298804 Loss: 25.304120 | Val Acc: 0.327988 loss: 31.138057
[090/200] 122.66 sec(s) Train Acc: 0.303162 Loss: 25.310757 | Val Acc: 0.326531 loss: 30.931092
[091/200] 122.54 sec(s) Train Acc: 0.300628 Loss: 25.387471 | Val Acc: 0.326822 loss: 30.661047
[092/200] 122.71 sec(s) Train Acc: 0.299074 Loss: 25.458390 | Val Acc: 0.326822 loss: 30.881221
[093/200] 122.75 sec(s) Train Acc: 0.301676 Loss: 25.308233 | Val Acc: 0.321574 loss: 31.232723
[094/200] 122.70 sec(s) Train Acc: 0.298736 Loss: 25.314837 | Val Acc: 0.327114 loss: 30.885039
[095/200] 122.67 sec(s) Train Acc: 0.300122 Loss: 25.213920 | Val Acc: 0.325364 loss: 30.796250
[096/200] 122.64 sec(s) Train Acc: 0.301507 Loss: 25.347030 | Val Acc: 0.320700 loss: 31.229533
[097/200] 122.81 sec(s) Train Acc: 0.303

[173/200] 122.81 sec(s) Train Acc: 0.297081 Loss: 25.331767 | Val Acc: 0.320408 loss: 31.132232
[174/200] 122.86 sec(s) Train Acc: 0.305156 Loss: 25.210857 | Val Acc: 0.323907 loss: 31.111645
[175/200] 122.84 sec(s) Train Acc: 0.297486 Loss: 25.277644 | Val Acc: 0.325364 loss: 30.961052
[176/200] 122.66 sec(s) Train Acc: 0.303635 Loss: 25.401650 | Val Acc: 0.323907 loss: 31.033859
[177/200] 122.70 sec(s) Train Acc: 0.300020 Loss: 25.313240 | Val Acc: 0.325656 loss: 30.884797
[178/200] 122.67 sec(s) Train Acc: 0.301101 Loss: 25.350589 | Val Acc: 0.324198 loss: 30.683787
[179/200] 122.73 sec(s) Train Acc: 0.301777 Loss: 25.356903 | Val Acc: 0.327114 loss: 30.982846
[180/200] 122.77 sec(s) Train Acc: 0.297723 Loss: 25.383539 | Val Acc: 0.327697 loss: 31.084763
[181/200] 122.78 sec(s) Train Acc: 0.299750 Loss: 25.222876 | Val Acc: 0.326822 loss: 30.997823
[182/200] 122.60 sec(s) Train Acc: 0.302453 Loss: 25.231652 | Val Acc: 0.319534 loss: 30.998754
[183/200] 122.78 sec(s) Train Acc: 0.300

[023/200] 122.33 sec(s) Train Acc: 0.262687 Loss: 27.156283 | Val Acc: 0.290962 loss: 32.783238
[024/200] 122.13 sec(s) Train Acc: 0.261470 Loss: 27.165758 | Val Acc: 0.289504 loss: 33.127955
[025/200] 122.23 sec(s) Train Acc: 0.260051 Loss: 27.052835 | Val Acc: 0.290379 loss: 32.977489
[026/200] 122.20 sec(s) Train Acc: 0.261774 Loss: 27.186592 | Val Acc: 0.289796 loss: 33.168908
[027/200] 122.42 sec(s) Train Acc: 0.260491 Loss: 27.151374 | Val Acc: 0.290379 loss: 32.738578
[028/200] 122.59 sec(s) Train Acc: 0.260930 Loss: 27.121934 | Val Acc: 0.290087 loss: 33.294384
[029/200] 122.70 sec(s) Train Acc: 0.263058 Loss: 27.135244 | Val Acc: 0.283965 loss: 33.270213
[030/200] 122.62 sec(s) Train Acc: 0.259072 Loss: 27.204864 | Val Acc: 0.286880 loss: 33.176695
[031/200] 122.72 sec(s) Train Acc: 0.261234 Loss: 27.103166 | Val Acc: 0.288921 loss: 33.204528
[032/200] 122.83 sec(s) Train Acc: 0.261977 Loss: 27.144194 | Val Acc: 0.289213 loss: 33.184178
[033/200] 122.67 sec(s) Train Acc: 0.266

[109/200] 123.27 sec(s) Train Acc: 0.260018 Loss: 27.211116 | Val Acc: 0.281924 loss: 33.351565
[110/200] 123.70 sec(s) Train Acc: 0.263193 Loss: 27.192136 | Val Acc: 0.289213 loss: 33.046593
[111/200] 123.47 sec(s) Train Acc: 0.266302 Loss: 27.132873 | Val Acc: 0.290962 loss: 33.305128
[112/200] 123.45 sec(s) Train Acc: 0.261301 Loss: 27.175397 | Val Acc: 0.286297 loss: 33.093728
[113/200] 123.15 sec(s) Train Acc: 0.262822 Loss: 27.203343 | Val Acc: 0.287172 loss: 33.207381
[114/200] 123.18 sec(s) Train Acc: 0.264139 Loss: 27.114107 | Val Acc: 0.287172 loss: 33.057381
[115/200] 123.26 sec(s) Train Acc: 0.260051 Loss: 27.206408 | Val Acc: 0.285714 loss: 32.680042
[116/200] 123.15 sec(s) Train Acc: 0.260930 Loss: 27.236691 | Val Acc: 0.285131 loss: 33.691051
[117/200] 123.13 sec(s) Train Acc: 0.259139 Loss: 27.167976 | Val Acc: 0.288630 loss: 33.145390
[118/200] 122.88 sec(s) Train Acc: 0.258666 Loss: 27.209316 | Val Acc: 0.287755 loss: 32.493868
[119/200] 122.93 sec(s) Train Acc: 0.258

[196/200] 123.13 sec(s) Train Acc: 0.258632 Loss: 27.220524 | Val Acc: 0.283090 loss: 33.077299
[197/200] 122.73 sec(s) Train Acc: 0.261538 Loss: 27.135774 | Val Acc: 0.285714 loss: 33.043336
[198/200] 122.94 sec(s) Train Acc: 0.263768 Loss: 27.157401 | Val Acc: 0.281341 loss: 32.979847
[199/200] 123.05 sec(s) Train Acc: 0.262552 Loss: 27.095434 | Val Acc: 0.287755 loss: 32.921552
[200/200] 122.95 sec(s) Train Acc: 0.264748 Loss: 27.188727 | Val Acc: 0.281924 loss: 33.788072
0.32057708828124987
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 24, 192, 192]             672
       BatchNorm2d-2         [-1, 24, 192, 192]              48
             ReLU6-3         [-1, 24, 192, 192]               0
         MaxPool2d-4           [-1, 24, 96, 96]               0
            Conv2d-5           [-1, 24, 96, 96]             240
       BatchNorm2d-6           [-1, 24, 96, 96]    