In [0]:
import warnings
# Install an "ignore" filter with no category/message restriction, i.e. every
# warning is silenced from here on. The filter is process-wide, so it also
# hides warnings raised by libraries imported in later cells.
warnings.filterwarnings('ignore')

In [0]:
import sys

# Detect whether the code runs under IPython/Jupyter: the bare name
# __IPYTHON__ only resolves there, so a NameError means a plain script run.
IN_IPYTHON = True
try:
    __IPYTHON__
except NameError:
    IN_IPYTHON = False

# Defaults: used as-is inside a notebook, and as the fallback for every
# command-line argument that was not supplied.
workspace_dir, output_fpath = 'food-11', 'predict.csv'
if not IN_IPYTHON:
    # Script usage: <script> [workspace_dir] [output_fpath]
    # Only the "argument missing" case is handled; the previous bare `except:`
    # would also have swallowed KeyboardInterrupt / SystemExit.
    if len(sys.argv) > 1:
        workspace_dir = sys.argv[1]
    if len(sys.argv) > 2:
        output_fpath = sys.argv[2]

In [0]:
import os
import cv2
import numpy as np

IMAGE_SIZE = 192  # side length (pixels) of the square images produced by readfile


def readfile(path, label):
    """Load every file of a directory as a letterboxed IMAGE_SIZE x IMAGE_SIZE image.

    Each file is read with ``cv2.imread``, scaled so that its longer side equals
    ``IMAGE_SIZE`` (aspect ratio preserved) and then centred on a black square.

    Args:
        path: Directory to read; every entry is treated as an image and the
            entries are processed in sorted filename order.
        label: Boolean. When true, the class id of each image is parsed from the
            integer that precedes the first ``_`` in its filename.

    Returns:
        ``x`` when ``label`` is false, otherwise ``(x, y)``. ``x`` is a ``uint8``
        array of shape ``(N, IMAGE_SIZE, IMAGE_SIZE, 3)``; ``y`` is a ``uint8``
        array of shape ``(N,)``.

    Raises:
        ValueError: If ``cv2.imread`` cannot read a file (it reports that by
            returning ``None``), or if ``label`` is true and a filename does
            not start with an integer.
    """
    image_dir = sorted(os.listdir(path))
    x = np.zeros((len(image_dir), IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.uint8)
    y = np.zeros((len(image_dir)), dtype=np.uint8)
    for i, file in enumerate(image_dir):
        file_path = os.path.join(path, file)
        img = cv2.imread(file_path)
        if img is None:
            # Fail with the offending path instead of an AttributeError on None.
            raise ValueError("cv2.imread could not read image: %s" % file_path)

        # Resize so the longer side becomes IMAGE_SIZE. The max(1, ...) clamp
        # keeps the shorter side of a very elongated image from becoming 0 px,
        # which cv2.resize rejects.
        src_height, src_width = img.shape[0], img.shape[1]
        rate = IMAGE_SIZE / max(src_height, src_width)
        height = max(1, int(src_height * rate))
        width = max(1, int(src_width * rate))
        img = cv2.resize(img, (width, height))

        # Pad with black up to IMAGE_SIZE x IMAGE_SIZE; when the missing amount
        # is odd the extra pixel goes to the bottom / right edge.
        # Padding approach from https://blog.csdn.net/qq_20622615/article/details/80929746
        pad_h = IMAGE_SIZE - height
        pad_w = IMAGE_SIZE - width
        top, left = pad_h // 2, pad_w // 2
        bottom, right = pad_h - top, pad_w - left
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))

        x[i, :, :] = img
        if label:
            y[i] = int(file.split("_")[0])
    if label:
        return x, y
    return x

In [0]:
import torchvision.transforms as transforms

# Per-channel mean / standard deviation written on the 0-255 scale and divided
# by 255 so they match the [0, 1] tensors that ToTensor produces.
# NOTE(review): the channel order of these statistics is not visible here —
# confirm it matches the arrays that are fed through these pipelines.
transform_mean = np.array([ 69.58238342,  92.66689336, 115.24940137]) / 255
transform_std = np.array([71.8342021 , 76.83536755, 83.40123168]) / 255


def _normalize():
    """Return a new Normalize step built from transform_mean / transform_std."""
    return transforms.Normalize(transform_mean, transform_std)


# Augmentation pipeline 1: one flip-or-warp, then one small affine-or-rotation,
# then colour jitter, tensor conversion and normalisation.
train_transform1 = transforms.Compose([
    transforms.ToPILImage(),
    # RandomChoice applies exactly one of the listed transforms per image.
    transforms.RandomChoice([
        transforms.RandomVerticalFlip(),
        transforms.RandomHorizontalFlip(),
        transforms.RandomPerspective(),
    ]),
    transforms.RandomChoice([
        transforms.RandomAffine(degrees=10),
        transforms.RandomRotation(degrees=40),
    ]),
    # NOTE(review): ColorJitter is built with its default arguments — confirm
    # that this actually perturbs brightness/contrast/saturation/hue.
    transforms.ColorJitter(),
    transforms.ToTensor(),  # image -> tensor with values scaled to [0, 1]
    _normalize(),
])

# Augmentation pipeline 2: stronger geometry (applied in random order), one
# colour operation, and random erasing on the tensor before normalisation.
train_transform2 = transforms.Compose([
    transforms.ToPILImage(),
    # RandomOrder applies all listed transforms, in a shuffled order.
    transforms.RandomOrder([
        transforms.RandomChoice([
            transforms.RandomHorizontalFlip(),
            transforms.RandomPerspective(),
        ]),
        transforms.RandomAffine(degrees=30),
        # random sub-image, resized back to IMAGE_SIZE x IMAGE_SIZE
        transforms.RandomResizedCrop(size=(IMAGE_SIZE, IMAGE_SIZE), scale=(0.5, 1.0)),
    ]),
    transforms.RandomChoice([
        transforms.ColorJitter(),
        transforms.RandomGrayscale(),
    ]),
    transforms.ToTensor(),  # image -> tensor with values scaled to [0, 1]
    transforms.RandomErasing(p=0.2),
    _normalize(),
])

# Evaluation pipeline: no augmentation, only tensor conversion + normalisation.
test_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    _normalize(),
])

In [0]:
import re
import torch
from glob import glob
from PIL import Image
import torchvision.transforms as transforms

class MyDataset(torch.utils.data.Dataset):
    """Dataset over the ``*.jpg`` files of one folder, yielding ``(image, class_idx)``.

    The class index is the integer that precedes the first ``_`` in the file
    name (``3_100.jpg`` -> ``3``). Files without such a prefix (inference data
    with no answer) get class index ``0``.

    Args:
        folderName: Directory that directly contains the ``.jpg`` files.
        transform: Optional callable applied to the loaded PIL image.
    """

    # Matches the leading "<digits>_" of a file name; group 1 is the class id.
    _LABEL_PATTERN = re.compile(r'(\d+)_')

    def __init__(self, folderName, transform=None):
        self.transform = transform
        # Sorted so that the sample order is deterministic across runs.
        self.img_paths = sorted(glob(folderName + '/*.jpg'))

    @classmethod
    def _class_idx_from_path(cls, img_path):
        """Return the class id encoded in the file name, or 0 when there is none.

        Only the base name is inspected, so digits that appear in directory
        names (for example the ``11`` of ``food-11``) cannot shift the result.
        """
        match = cls._LABEL_PATTERN.match(os.path.basename(img_path))
        return int(match.group(1)) if match else 0

    def __getitem__(self, idx):
        """Return ``(image, class_idx)`` for sample ``idx`` (int or 0-dim tensor)."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_path = self.img_paths[idx]
        class_idx = self._class_idx_from_path(img_path)

        # Read the pixel data inside the context manager: leaving the block
        # closes the file handle (otherwise descriptors pile up until OPEN_MAX
        # is reached) while the decoded image stays usable.
        with Image.open(img_path) as image:
            image.load()

        if self.transform:
            image = self.transform(image)

        return (image, class_idx)

    def __len__(self):
        """Number of ``.jpg`` files found in the folder."""
        return len(self.img_paths)


# Evaluation pipeline: deterministic 256 x 256 centre crop, then tensor conversion.
testTransform = transforms.Compose([
    transforms.CenterCrop(size=256),
    transforms.ToTensor(),
])

# Training pipeline: random 256 x 256 crop (smaller images are padded by
# mirroring), random horizontal flip, random rotation, then tensor conversion.
trainTransform = transforms.Compose([
    transforms.RandomCrop(size=256, pad_if_needed=True, padding_mode='symmetric'),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=15),
    transforms.ToTensor(),
])

def get_dataloader(mode='training', batch_size=32, root='./food-11'):
    """Build a DataLoader over one split of the dataset.

    Args:
        mode: Split name and sub-directory of ``root``; one of ``'training'``,
            ``'testing'`` or ``'validation'``.
        batch_size: Number of samples per batch.
        root: Dataset root directory. Defaults to the previously hard-coded
            ``'./food-11'`` so existing calls behave exactly as before.

    Returns:
        A ``torch.utils.data.DataLoader`` over ``MyDataset(f'{root}/{mode}')``.
        The training split uses ``trainTransform`` and is shuffled; the other
        splits use ``testTransform`` and keep their order.

    Raises:
        AssertionError: If ``mode`` is not one of the three split names.
    """
    assert mode in ['training', 'testing', 'validation']

    is_training = (mode == 'training')
    dataset = MyDataset(
        f'{root}/{mode}',
        transform=trainTransform if is_training else testTransform)

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=is_training)

In [0]:
# Loaders for the two splits used during distillation; both use batches of 32.
train_loader = get_dataloader(mode='training', batch_size=32)
val_loader = get_dataloader(mode='validation', batch_size=32)

In [0]:
import torchvision.models as models

!gdown --id '1B8ljdrxYXJsZv2vmTequdPOofp3VF3NN' --output 'teacher_resnet18.bin'
teacher_net = models.resnet18(pretrained=False, num_classes=11).cuda()
teacher_net.load_state_dict(torch.load('teacher_resnet18.bin'))

Downloading...
From: https://drive.google.com/uc?id=1B8ljdrxYXJsZv2vmTequdPOofp3VF3NN
To: /content/teacher_resnet18.bin
44.8MB [00:00, 67.4MB/s]


<All keys matched successfully>

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class swish(nn.Module):
    """Swish activation: scales every element of the input by its own sigmoid."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x * sigmoid(x)``, computed element-wise."""
        gate = F.sigmoid(x)
        return x * gate


class StudentNet(nn.Module):
    '''
      Compact CNN stacked from depthwise + pointwise convolution blocks.

      Replacing ordinary convolutions by a depthwise/pointwise pair usually
      costs little accuracy. The network is called StudentNet because it is
      the model that gets trained by knowledge distillation.
    '''

    def __init__(self, base=16, width_mult=1):
        '''
          Args:
            base: Channel unit of the model. Layer i uses int(base * m) channels
                with m taken from [2, 4, 8, 8, 8, 16, 16, 16, 16].
            width_mult: Pruning factor. The channel counts at indices 4..7 of
                that list are additionally multiplied by width_mult (and
                truncated to int); the other layers are left untouched.
        '''
        super(StudentNet, self).__init__()
        multiplier = [2, 4, 8, 8, 8, 16, 16, 16, 16]

        # bandwidth[i]: number of channels used by layer i
        bandwidth = [int(base * m) for m in multiplier]

        # Only the later layers (indices 4..7) are pruned.
        for i in range(4, 8):
            bandwidth[i] = int(bandwidth[i] * width_mult)

        self.cnn = nn.Sequential(
            # The first layer is kept as an ordinary (non-separable) convolution.
            nn.Sequential(
                nn.Conv2d(3, bandwidth[0], 3, 1, 1),
                nn.BatchNorm2d(bandwidth[0]),
                nn.ReLU6(),
                nn.MaxPool2d(2, 2, 0),
            ),

            # Three separable blocks, each followed by 2x down-sampling.
            self._dw_pw_block(bandwidth[0], bandwidth[1], nn.ReLU6, pool=True),
            self._dw_pw_block(bandwidth[1], bandwidth[2], nn.ReLU6, pool=True),
            self._dw_pw_block(bandwidth[2], bandwidth[3], nn.ReLU6, pool=True),

            # The feature map has been down-sampled four times by now, so the
            # remaining blocks do not pool any more.
            self._dw_pw_block(bandwidth[3], bandwidth[4], nn.ReLU6, pool=False),
            self._dw_pw_block(bandwidth[4], bandwidth[5], swish, pool=False),
            self._dw_pw_block(bandwidth[5], bandwidth[6], swish, pool=False),
            self._dw_pw_block(bandwidth[6], bandwidth[7], swish, pool=False),
            self._dw_pw_block(bandwidth[7], bandwidth[8], swish, pool=False),

            # Global average pooling: whatever the input resolution, the output
            # is 1 x 1 per channel, so the fully connected head always fits.
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.fc = nn.Sequential(
            # Two hidden layers of 128 units, then the 11-way output.
            nn.Linear(bandwidth[8], 128),
            nn.BatchNorm1d(128),
            nn.ReLU6(),

            nn.Dropout(0.4),

            nn.Linear(128, 128),
            nn.BatchNorm1d(128),
            swish(),

            nn.Linear(128, 11)
        )

    @staticmethod
    def _dw_pw_block(in_ch, out_ch, activation, pool):
        '''
          Build one depthwise-separable block as an nn.Sequential.

          Layer order (and therefore the state_dict keys 0..4) is:
          depthwise 3x3 conv, batch norm, activation, pointwise 1x1 conv and,
          when pool is true, a 2x2 max-pool.

          Args:
            in_ch: Channels entering the block (also the depthwise group count).
            out_ch: Channels produced by the pointwise convolution.
            activation: Zero-argument callable returning the activation module.
                ReLU6 clamps activations to [0, 6], as used by the MobileNet
                family, which keeps values easy to squeeze into float16 or a
                further quantisation.
            pool: Whether to append the down-sampling max-pool.
        '''
        layers = [
            # Depthwise convolution: one 3x3 filter per input channel.
            nn.Conv2d(in_ch, in_ch, 3, 1, 1, groups=in_ch),
            nn.BatchNorm2d(in_ch),
            activation(),
            # Pointwise convolution. No activation follows it: empirically
            # pointwise + ReLU performs worse.
            nn.Conv2d(in_ch, out_ch, 1),
        ]
        if pool:
            layers.append(nn.MaxPool2d(2, 2, 0))
        return nn.Sequential(*layers)

    def forward(self, x):
        '''Run the conv stack, flatten per sample and return the 11 class logits.'''
        out = self.cnn(x)
        out = out.view(out.size()[0], -1)
        return self.fc(out)

In [0]:
def loss_fn_kd(outputs, labels, teacher_outputs, T=20, alpha=0.5):
    """Knowledge-distillation loss: weighted sum of a hard and a soft term.

    Args:
        outputs: Student logits, classes along dim 1.
        labels: Ground-truth class indices for the cross-entropy term.
        teacher_outputs: Teacher logits, same layout as ``outputs``.
        T: Temperature that divides both sets of logits before the softmax.
        alpha: Weight of the soft term; the hard term is weighted ``1 - alpha``.

    Returns:
        ``(1 - alpha) * CE(outputs, labels)
        + alpha * T * T * KL(softmax(teacher / T) || softmax(outputs / T))``,
        with the KL term averaged over the batch.
    """
    # Soft term: KL divergence between the temperature-softened distributions
    # (student given as log-probabilities, teacher as probabilities).
    student_log_probs = F.log_softmax(outputs / T, dim=1)
    teacher_probs = F.softmax(teacher_outputs / T, dim=1)
    soft_loss = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean') * (alpha * T * T)

    # Hard term: ordinary cross entropy against the true labels.
    hard_loss = F.cross_entropy(outputs, labels) * (1. - alpha)

    return hard_loss + soft_loss

In [0]:
#from torchvision.models import mobilenet_v2
# student_net = mobilenet_v2(
#     num_classes=11,
#     width_mult=0.6,
#     round_nearest=4,
#     inverted_residual_setting = [
#         # t, c, n, s
#         [1, 16, 1, 1],
#         [6, 24, 2, 2],
# #         [6, 32, 3, 2],
#         [6, 64, 4, 2],
#         [6, 96, 3, 1],
# #         [6, 160, 3, 2],
#         [6, 320, 1, 1],
#     ]
# ).cuda()

# Channel unit handed to StudentNet as its `base` argument.
student_net_base = 9.5
student_net = StudentNet(base=student_net_base)
student_net = student_net.cuda()
# student_net.load_state_dict(torch.load('student_model.bin'))

# Adam over all student parameters, learning rate 1e-3.
optimizer = torch.optim.Adam(params=student_net.parameters(), lr=1e-3)

In [0]:
def run_epoch(dataloader, update=True, alpha=0.5):
    """Run one pass over ``dataloader`` and report mean loss and accuracy.

    Uses the module-level ``student_net``, ``teacher_net``, ``optimizer`` and
    ``loss_fn_kd``.

    Args:
        dataloader: Iterable of ``(inputs, hard_labels)`` batches.
        update: When true, back-propagate and step the optimizer on every
            batch; when false, only evaluate (no gradients are built).
        alpha: Soft-label weight forwarded to ``loss_fn_kd``.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen, both Python floats.
        The loss of each batch is weighted by its size.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    # Run on whatever device the student lives on instead of assuming CUDA.
    device = next(student_net.parameters()).device

    total_num, total_hit, total_loss = 0, 0, 0
    for inputs, hard_labels in dataloader:
        inputs = inputs.to(device)
        hard_labels = torch.as_tensor(hard_labels, dtype=torch.long, device=device)

        # The teacher is never back-propagated through, so no_grad avoids
        # storing its intermediate activations.
        with torch.no_grad():
            soft_labels = teacher_net(inputs)

        # One forward path for both modes; gradients are only recorded when
        # the student is going to be updated.
        with torch.set_grad_enabled(update):
            logits = student_net(inputs)
            # Loss mixing soft and hard labels. T=20 is the setting of the
            # original paper.
            loss = loss_fn_kd(logits, hard_labels, soft_labels, 20, alpha)

        if update:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        batch_size = len(inputs)
        total_hit += torch.sum(torch.argmax(logits, dim=1) == hard_labels).item()
        total_num += batch_size
        total_loss += loss.item() * batch_size

    if total_num == 0:
        raise ValueError("run_epoch received a dataloader that yielded no samples")
    return total_loss / total_num, total_hit / total_num

In [12]:
import time

num_epoch = 200

# The teacher only supplies soft labels, so it stays in eval mode throughout.
teacher_net.eval()
now_best_acc = 0
for epoch in range(num_epoch):
    epoch_start_time = time.time()

    # One distillation pass over the training split ...
    student_net.train()
    train_loss, train_acc = run_epoch(train_loader, update=True)

    # ... followed by an evaluation pass over the validation split.
    student_net.eval()
    valid_loss, valid_acc = run_epoch(val_loader, update=False)

    # Checkpoint whenever the validation accuracy beats the best seen so far.
    if valid_acc > now_best_acc:
        now_best_acc = valid_acc
        torch.save(student_net.state_dict(), 'student_model.bin')

    epoch_report = (
        epoch + 1, num_epoch, time.time() - epoch_start_time,
        train_acc, train_loss, valid_acc, valid_loss,
    )
    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % epoch_report)

[001/200] 99.96 sec(s) Train Acc: 0.294851 Loss: 15.648100 | Val Acc: 0.340233 loss: 16.597417
[002/200] 101.41 sec(s) Train Acc: 0.360835 Loss: 14.300623 | Val Acc: 0.404373 loss: 15.144613
[003/200] 101.53 sec(s) Train Acc: 0.393473 Loss: 13.543069 | Val Acc: 0.446939 loss: 14.066009
[004/200] 101.65 sec(s) Train Acc: 0.429759 Loss: 12.924635 | Val Acc: 0.472012 loss: 12.889131
[005/200] 101.99 sec(s) Train Acc: 0.446686 Loss: 12.487596 | Val Acc: 0.510496 loss: 12.026516
[006/200] 101.98 sec(s) Train Acc: 0.463916 Loss: 12.023128 | Val Acc: 0.513411 loss: 11.962598
[007/200] 101.14 sec(s) Train Acc: 0.488749 Loss: 11.754721 | Val Acc: 0.539359 loss: 11.475014
[008/200] 101.37 sec(s) Train Acc: 0.490371 Loss: 11.537664 | Val Acc: 0.535860 loss: 11.426353
[009/200] 101.60 sec(s) Train Acc: 0.516319 Loss: 11.087658 | Val Acc: 0.523324 loss: 11.598299
[010/200] 101.13 sec(s) Train Acc: 0.524833 Loss: 10.771684 | Val Acc: 0.583965 loss: 10.472408
[011/200] 101.28 sec(s) Train Acc: 0.5328

In [0]:
# Optional second stage: same loop with alpha=0, i.e. only the hard-label
# cross-entropy term of the loss is used. With num_epoch = 0 the loop body
# never runs. It continues from the now_best_acc left by the previous stage.
num_epoch = 0

for epoch in range(num_epoch):
    epoch_start_time = time.time()

    student_net.train()
    train_loss, train_acc = run_epoch(train_loader, update=True, alpha=0)

    student_net.eval()
    valid_loss, valid_acc = run_epoch(val_loader, update=False, alpha=0)

    # Checkpoint whenever the validation accuracy beats the best seen so far.
    if valid_acc > now_best_acc:
        now_best_acc = valid_acc
        torch.save(student_net.state_dict(), 'student_model.bin')

    epoch_report = (
        epoch + 1, num_epoch, time.time() - epoch_start_time,
        train_acc, train_loss, valid_acc, valid_loss,
    )
    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % epoch_report)