# Advance 자료 3: Label Smoothing

`Label Smoothing (LS)`은 이미지 분류뿐만 아니라 언어 번역 등 다른 분야에서도 많이 사용되는 기술입니다.

`LS`는 앞선 기술과 달리 image를 augmentation하는 기술이 아닌, label을 변환시키는 기술입니다.

`LS`는 말 그대로 label을 스무딩하여 모델 일반화 성능을 향상시킵니다.

Hard target(one-hot-representation)을 soft target으로 바꾸는 것이 핵심입니다. 여기서의 hard target은 one-hot vector로 $[0,1,0,0]$의 형태를 말합니다.

$K$개 범주(class)에 관한 레이블 스무딩 벡터의 $k$번째 스칼라(scalar) 값은 다음 수식과 같습니다($y_k$는 $k$번째 범주가 정답이면 1, 그렇지 않으면 0, $\alpha$는 hyperparameter).

$$ y^{LS}_k = (1-\alpha) y_k + \alpha / K$$

In [22]:
import torch
import copy
from torch import cuda, nn, optim
import torchvision
import torchvision.transforms as transforms
from torch.autograd import Variable
import torchvision.models as models
from torch.optim import lr_scheduler
from tqdm import tqdm

import matplotlib.pyplot as plt
import numpy as np

----

In [2]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against label-smoothed (soft) targets.

    Label smoothing method: https://arxiv.org/abs/1512.00567

    For K classes and smoothing factor alpha, the one-hot target y is replaced
    by ``(1 - alpha) * y + alpha / K``: uniform noise of total mass alpha is
    mixed into the hard target.

    Args:
        device: Stored on the instance for the caller's convenience; the loss
            itself is computed on whatever device ``pred`` lives on.
        classes: Number of classes K.
        smoothing: Smoothing factor alpha. ``0.0`` reduces to ordinary
            cross-entropy.
        dim: Dimension of ``pred`` that indexes the classes.
    """

    def __init__(self, device, classes, smoothing=0.0, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim
        self.device = device

    def forward(self, pred, target):
        """Return the mean smoothed cross-entropy over the batch.

        Args:
            pred: Raw, unnormalised scores with the class axis at ``self.dim``.
            target: Integer (int64) class indices, one per sample.
        """
        pred = pred.log_softmax(dim=self.dim)
        # The soft target is a constant; keep it out of the autograd graph.
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            # (1 - alpha) at the true class ...
            true_dist.scatter_(self.dim, target.unsqueeze(self.dim), self.confidence)
            # ... plus alpha / K spread uniformly over every class.
            true_dist += self.smoothing / self.cls
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [3]:
def mixup(images, labels, device, alpha=1.0):
    """Blend every image in the batch with a randomly chosen partner.

    mixup function from 'mixup: BEYOND EMPIRICAL RISK MINIMIZATION',
    https://arxiv.org/pdf/1710.09412.pdf

    Args:
        images: Batch of images with the batch axis first.
        labels: Labels aligned with ``images`` along the batch axis.
        device: Device the permutation index and the mixed batch are moved to.
        alpha: Both shape parameters of the Beta distribution that the mixing
            coefficient is drawn from.

    Returns:
        ``(lam, mixed_images, labels1, labels2)`` where ``mixed_images`` is
        ``lam * images + (1 - lam) * images[perm]``, ``labels1`` is the
        original labels and ``labels2`` the labels of the partners. The input
        batch is not modified.
    """
    lam = np.random.beta(alpha, alpha)
    rand_index = torch.randperm(images.size(0)).to(device)
    labels1 = labels
    labels2 = labels[rand_index]

    # The blend already allocates a new tensor, so no defensive copy of the
    # batch is needed beforehand.
    images = (images * lam + images[rand_index] * (1 - lam)).to(device)

    return lam, images, labels1, labels2

In [4]:
def rand_bbox(size, lam):
    """Sample a random box covering roughly a ``1 - lam`` fraction of the area.

    Args:
        size: Shape of the image batch; ``size[2]`` and ``size[3]`` are used
            as the two spatial extents.
        lam: Mixing coefficient; the box side lengths are the spatial extents
            scaled by ``sqrt(1 - lam)``.

    Returns:
        ``(bbx1, bby1, bbx2, bby2)``: start/end indices along ``size[2]``
        (the ``x`` pair) and ``size[3]`` (the ``y`` pair), clipped to the
        image bounds. The box may be smaller than requested when its centre
        lies near an edge.
    """
    W = size[2]
    H = size[3]
    cut_rat = np.sqrt(1. - lam)
    # Builtin int: the `np.int` alias no longer exists in current NumPy.
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # Box centre, drawn uniformly over the image.
    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2

def cutmix(images, labels, device, alpha=1.0):
    """Paste a random rectangular patch from a partner image into each image.

    cutmix function from 'CutMix: Regularization Strategy to Train Strong
    Classifiers with Localizable Features', https://arxiv.org/abs/1905.04899

    Args:
        images: 4-D image batch with the batch axis first; the box is cut
            along the last two axes.
        labels: Labels aligned with ``images`` along the batch axis.
        device: Device the permutation index and the mixed batch are moved to.
        alpha: Both shape parameters of the Beta distribution that the initial
            mixing coefficient is drawn from.

    Returns:
        ``(lam, mixed_images, labels_a, labels_b)`` where ``lam`` is the
        fraction of each image that was left untouched, ``labels_a`` the
        original labels and ``labels_b`` the labels of the patch donors.
        ``mixed_images`` has ``requires_grad=True``. The input batch is not
        modified.
    """
    # Generate the mixed sample.
    lam = np.random.beta(alpha, alpha)
    rand_index = torch.randperm(images.size(0)).to(device)
    labels_a = labels
    labels_b = labels[rand_index]
    bbx1, bby1, bbx2, bby2 = rand_bbox(images.size(), lam)

    # Paste into a copy so the caller's batch is left untouched.
    source = images.detach()
    mixed = source.clone()
    mixed[:, :, bbx1:bbx2, bby1:bby2] = source[rand_index, :, bbx1:bbx2, bby1:bby2]

    # Adjust lambda to exactly match the pixel ratio (the box may have been
    # clipped at the image border).
    lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (images.size()[-1] * images.size()[-2]))

    # Same flag the previous Variable(..., requires_grad=True) wrapper set.
    mixed = mixed.requires_grad_(True).to(device)

    return lam, mixed, labels_a, labels_b

-----

In [18]:
# Global training configuration shared by the cells below.
batch_size = 64
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
num_epochs = 20

In [5]:
# Preprocessing applied to both splits: tensor conversion followed by
# per-channel normalisation with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# CIFAR-10 training split, downloaded into ./data and reshuffled every epoch.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2)

# CIFAR-10 test split, iterated in a fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2)

# Class names; len(classes) sets the size of the classifier output.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [10]:
# Randomly initialised MobileNetV2 (no pretrained weights). The default call
# is used because the `pretrained=` keyword is deprecated in newer torchvision.
model = models.mobilenet_v2()

In [13]:
# Replace the layer at classifier[1] with a fresh linear layer that has one
# output per entry in `classes`, then move the whole model to `device`.
num_ftrs = model.classifier[1].in_features
model.classifier[1] = nn.Linear(in_features=num_ftrs, out_features=len(classes))
model = model.to(device)

In [41]:
# SGD with momentum and a small weight decay; the learning rate is multiplied
# by 0.1 at scheduler milestones 10 and 15.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.1,
    momentum=0.9,
    weight_decay=1e-5,
)
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[10, 15], gamma=0.1)

In [42]:
def accuracy(output, target, topk=(1,)):
    """Compute the precision@k (in percent) for the specified values of k.

    Args:
        output: Scores of shape ``(batch, num_classes)``.
        target: True class indices, one per sample.
        topk: Values of k to evaluate.

    Returns:
        A list with one 0-dim tensor per k that does not exceed the number of
        classes (in the order given), followed by one ``[0.0]`` placeholder
        for every k that does.
    """
    batch_size = target.size(0)
    num = output.size(1)

    # A k larger than the number of classes cannot be evaluated; it is
    # reported as a [0.0] placeholder at the end of the result.
    target_topk = [k for k in topk if k <= num]
    appendices = [[0.0] for k in topk if k > num]
    if not target_topk:
        # Nothing to evaluate; avoids calling max() on an empty list.
        return appendices

    maxk = max(target_topk)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))

    res = []
    for k in target_topk:
        # reshape, not view: `correct` comes from a transposed tensor and its
        # first k rows are not guaranteed to be contiguous.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res + appendices

In [61]:
def _cpu_state_dict(model):
    """Return a deep copy of the model's state dict with every tensor on the CPU."""
    snapshot = copy.deepcopy(model.state_dict())
    for key, value in snapshot.items():
        snapshot[key] = value.cpu()
    return snapshot


def train(model, optimizer, scheduler, trainloader, testloader, device, num_epochs, reg,
          smoothing=0.1):
    """Train ``model`` and save its initial and final weights to 'practice.t1'.

    After every epoch the mean training loss is printed and the model is
    evaluated on ``testloader`` with plain cross-entropy.

    Args:
        model: Network to train (updated in place).
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler, stepped once at the end of every
            epoch.
        trainloader: Iterable of ``(images, labels)`` training batches.
        testloader: Iterable of ``(images, labels)`` evaluation batches.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``trainloader``.
        reg: Regulariser: ``'cutmix'``, ``'mixup'``, ``'ls'`` (label
            smoothing); any other value means plain cross-entropy.
        smoothing: Smoothing factor used when ``reg == 'ls'``.
    """
    state = {'init': _cpu_state_dict(model)}
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()

        losses = []
        for images, labels in tqdm(trainloader):
            images = images.to(device, dtype=torch.float)
            labels = labels.to(device, dtype=torch.long)

            if reg == 'cutmix':
                lam, images, labels_a, labels_b = cutmix(images, labels, device)
            elif reg == 'mixup':
                lam, images, labels_a, labels_b = mixup(images, labels, device)

            optimizer.zero_grad()
            outputs = model(images)

            if reg in ('cutmix', 'mixup'):
                # Each mixed sample carries two labels; weight their losses by
                # the share of the image each one contributed.
                loss = (criterion(outputs, labels_a) * lam
                        + criterion(outputs, labels_b) * (1 - lam))
            elif reg == 'ls':
                ls_criterion = LabelSmoothingLoss(device, outputs.size(1), smoothing=smoothing)
                loss = ls_criterion(outputs, labels)
            else:
                loss = criterion(outputs, labels)

            losses.append(loss.item())
            loss.backward()
            optimizer.step()

        # Step the scheduler only after this epoch's optimizer updates, so the
        # first epoch runs at the initial learning rate.
        scheduler.step()

        print('[epoch: %d] train loss: %.3f' %
                  (epoch + 1, np.mean(losses)))
        tl, ta = eval_model(model, testloader, criterion, device)
        print('[epoch: %d] test loss: %.3f, test accuracy: %.4f' %
                  (epoch + 1, tl, ta))

    if num_epochs > 0:
        # Snapshot after the last epoch, keyed by its zero-based index.
        state[str(num_epochs - 1)] = _cpu_state_dict(model)

    torch.save(state, 'practice.t1')

In [62]:
def eval_model(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` and return ``(mean_loss, accuracy)``.

    The model is switched to eval mode and left in it.

    Args:
        model: Network mapping an image batch to ``(batch, num_classes)`` scores.
        loader: Iterable of batches whose first item is the images and second
            item the labels.
        criterion: Callable ``criterion(scores, labels)`` returning a scalar
            loss tensor.
        device: Device the batches are moved to.

    Returns:
        The mean of the per-batch losses and the fraction of samples whose
        highest-scoring class equals the label.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    losses = []
    num_correct = 0
    num_samples = 0

    # No gradients are needed for evaluation; this also keeps activations
    # from being retained between batches.
    with torch.no_grad():
        for data in loader:
            image = data[0].to(device, dtype=torch.float)
            label = data[1].to(device, dtype=torch.long)

            pred_label = model(image)
            losses.append(criterion(pred_label, label).item())

            # Running counts instead of concatenating every batch's scores.
            num_correct += (pred_label.argmax(dim=1) == label).sum().item()
            num_samples += label.numel()

    if num_samples == 0:
        raise ValueError('eval_model received an empty loader')

    return np.mean(losses), num_correct / float(num_samples)

In [60]:
# Baseline run: reg='none' matches none of the special cases in train(), so
# the model is trained with plain cross-entropy (no mixup, cutmix or label
# smoothing).
train(model, optimizer, scheduler, trainloader, testloader, device, num_epochs, 'none')

100%|██████████| 782/782 [00:29<00:00, 26.18it/s]

[epoch: 1] train loss: 1.700



  0%|          | 0/782 [00:00<?, ?it/s]

[epoch: 1] test loss: 1.978, test accuracy: 0.312


 35%|███▍      | 272/782 [00:10<00:19, 26.49it/s]


KeyboardInterrupt: 

---------
### <프로젝트>

- Image Classification의 성능을 보았습니다.

- learning rate, optimizer, scheduler, mixup, cutmix, label smoothing 등을 다양하게 조합하여 신경망의 성능을 높여 보세요.

- `num_epochs`는 20 이내로 사용하실 수 있습니다.

------------