![image.png](attachment:image.png)

### augmentation은 동일
### 현재 학습한거 CE + 현재 학습한거와 이전까지EMA MSE 한건데...
### 그냥 momentum 계열의 optimizer 쓴거와 뭐가 다른건지..
### 앞에서 augmentation이라도 따로 한것도 아니고


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms

import numpy as np
from timeit import default_timer as timer

In [2]:
def mnist_dataset(root, transform):
    # load train data
    train_dataset = datasets.MNIST(
        root=root,
        train=True,
        transform=transform,
        download=True)

    # load test data
    test_dataset = datasets.MNIST(
        root=root,
        train=False,
        transform=transform, download=True)

    return train_dataset, test_dataset

In [3]:
def sample_train(train_dataset, test_dataset, batch_size, k, n_classes, seed, shuffle_train=False, return_idx=True):
    '''Randomly form unlabeled data in training dataset'''

    n = len(train_dataset)  # dataset size
    rrng = np.random.RandomState(seed) # seed 
    indices = torch.zeros(k)  # indices of keep labeled data
    others = torch.zeros(n - k)  # indices of unlabeled data
    card = k // n_classes
    cpt = 0

    for i in range(n_classes):
        class_items = (train_dataset.train_labels == i).nonzero()  # indices of samples with label i
        n_class = len(class_items)  # number of samples with label i
        rd = rrng.permutation(np.arange(n_class))  # shuffle them
        indices[i * card: (i+1) * card] = torch.squeeze(class_items[rd[:card]])
        others[cpt: cpt+n_class-card] = torch.squeeze(class_items[rd[card:]])
        cpt += (n_class-card)

    # tensor as indices must be long, byte or bool
    others = others.long()
    train_dataset.train_labels[others] = -1

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               num_workers=2,
                                               shuffle=shuffle_train)

    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              num_workers=2,
                                              shuffle=False)

    if return_idx:
        return train_loader, test_loader, indices
    return train_loader, test_loader

In [14]:
class GaussianNoise(nn.Module):
    # """데이터에 noise 추가"""
    def __init__(self, batch_size, input_shape, std):
        super(GaussianNoise, self).__init__()
        self.shape = (batch_size, ) + input_shape
        self.std = std
        self.noise = torch.zeros(self.shape).cuda()

    def forward(self, x):
        self.noise.normal_(mean=0, std=self.std)
        # print(self.noise.shape)

        return x + self.noise

def temporal_losses(out1, out2, w, labels):
    # output1: current output
    # output2: temporal output
    # w: weight for summation loss

    # "ensemble output과 current output을 통해 supervised, unsupervised loss 및 total loss를 계산함"

    sup_loss, nbsup = masked_crossentropy(out1, labels)
    unsup_loss = mse_loss(out1, out2)
    total_loss = sup_loss + w * unsup_loss

    return total_loss, sup_loss, unsup_loss, nbsup

def mse_loss(out1, out2):
    # "current output, ensemble output 간의 mean difference: unsupervised loss"
    quad_diff = torch.sum((F.softmax(out1, dim=1) - F.softmax(out2, dim=1)) ** 2)

    return quad_diff / out1.data.nelement()

def masked_crossentropy(out, labels):
    # "labeld된 data에 한해서 crossentropy loss를 계산함"
    cond = (labels >= 0)
    nnz = torch.nonzero(cond)  # array of labeled sample index
    nbsup = len(nnz)  # number of supervised samples
    # check if labeled samples in batch, return 0 if none
    if nbsup > 0:
        # select lines in out with label
        masked_outputs = torch.index_select(out, 0, nnz.view(nbsup))
        masked_labels = labels[cond]
        loss = F.cross_entropy(masked_outputs, masked_labels)
        return loss, nbsup
    loss = torch.tensor([0.], requires_grad=False).cuda()
    return loss, 0

def weight_scheduler(epoch, max_epochs, max_val, mult, n_labeled, n_samples):
    "epoch이 지남에 따라 weight를 조정함"
    max_val = max_val * (float(n_labeled) / n_samples)
    return ramp_up(epoch, max_epochs, max_val, mult)

def ramp_up(epoch, max_epochs, max_val, mult):
    # "weight를 조정하며 첫 epoch에는 0을 사용함"
    if epoch == 0:
        return 0.
    elif epoch >= max_epochs:
        return max_val
    return max_val * np.exp(-mult * (1. - float(epoch) / max_epochs) ** 2)

def calc_metrics(model, loader):
    correct = 0
    total = 0
    for i, (samples, labels) in enumerate(loader):
        samples = samples.cuda()
        labels = labels.requires_grad_(False).cuda()
        outputs = model(samples)
        _, predicted = torch.max(outputs.detach(), 1)
        total += labels.size(0)
        correct += (predicted == labels.detach().view_as(predicted)).sum()
    acc = 100 * float(correct) / total
    return acc

In [15]:
class CNN(nn.Module):
    def __init__(self, batch_size, std, input_shape=(1, 28, 28), p=0.5, fm1=16, fm2=32):
        super(CNN, self).__init__()
        self.std = std
        self.p = p
        self.fm1 = fm1
        self.fm2 = fm2
        self.input_shape = input_shape
        self.conv_block1 = nn.Sequential(nn.Conv2d(1, self.fm1, 3, stride=1, padding=1),
                                        nn.BatchNorm2d(self.fm1), 
                                        nn.ReLU(),
                                        nn.MaxPool2d(3, stride=2, padding=1)
                                      )
        
        self.conv_block2 = nn.Sequential(nn.Conv2d(self.fm1, self.fm2, 3, stride=1, padding=1),
                                        nn.BatchNorm2d(self.fm2), 
                                        nn.ReLU(),
                                        nn.MaxPool2d(3, stride=2, padding=1)
                                      )
        self.drop = nn.Dropout(self.p)
        self.fc = nn.Linear(self.fm2 * 7 * 7, 10)


    def forward(self, x):
        if self.training:
            b = x.size(0)
            gn = GaussianNoise(b, self.input_shape, self.std)
            x = gn(x)

        # first block
        x = self.conv_block1(x)
        
        # second block
        x = self.conv_block2(x)

        # classifier
        x = x.view(-1, self.fm2 * 7 * 7)
        x = self.fc(self.drop(x))

        return x

In [16]:
def train(model, train_loader, val_loader ,seed, k, alpha, lr, num_epochs, batch_size, ntrain,n_classes=10, max_epochs=80, max_val=1.):

    # build model and feed to GPU
    model.cuda()

    # setup param optimization
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.99))

    # model.train()
    
    # 첫 ensemble ouput은 모두 0
    # 한 에폭 내에서 각 minibatch iter마다 outputs에 차곡차곡 쌓음
    # 각 minibatch iter마다 z에 저장된 값 이용해서 loss에 반영함
    # z는 한 에폭이 끝난 후 현재 에폭의 정보인 outputs와 이전 정보(z) EMA해서 갱신
    Z = torch.zeros(ntrain, n_classes).float().cuda()  # intermediate values
    z = torch.zeros(ntrain, n_classes).float().cuda()  # temporal outputs
    outputs = torch.zeros(ntrain, n_classes).float().cuda()  # current outputs

    losses = []
    suplosses = []
    unsuplosses = []
    best_loss = 30.0
    for epoch in range(num_epochs):
        t = timer()
        print('\nEpoch: {}'.format(epoch+1))
        model.train()
        # evaluate unsupervised cost weight
        w = weight_scheduler(epoch, max_epochs, max_val, 5, k, 60000)

        w = torch.tensor(w, requires_grad=False).cuda()
        print('---------------------')

        # targets change only once per epoch
        for i, (images, labels) in enumerate(train_loader):
            #print(i)
            batch_size = images.size(0)  # retrieve batch size again cause drop last is false
            images = images.cuda()
            labels = labels.requires_grad_(False).cuda()

            optimizer.zero_grad()
            out = model(images)
            # 현재 batch에 맞는 ensemble 결과들을 가져옴
            zcomp = z[i * batch_size: (i+1) * batch_size]
            zcomp.requires_grad_(False)
            loss, suploss, unsuploss, nbsup = temporal_losses(out, zcomp, w, labels)

            # save outputs
            outputs[i * batch_size: (i+1) * batch_size] = out.clone().detach()
            losses.append(loss.item())
            suplosses.append(nbsup * suploss.item())
            unsuplosses.append(unsuploss.item())

            # backprop
            loss.backward()
            optimizer.step()

        loss_mean = np.mean(losses)
        supl_mean = np.mean(suplosses)
        unsupl_mean = np.mean(unsuplosses)

        print('Epoch [%d/%d], Loss: %.6f, Supervised Loss: %.6f, Unsupervised Loss: %.6f, Time: %.2f' %
              (epoch + 1, num_epochs, float(loss_mean), float(supl_mean), float(unsupl_mean), timer()-t))
        # model의 outputs을 가중평균을 이용해 ensemble outputs으로 update 함
        Z = alpha * Z + (1. - alpha) * outputs
        z = Z * (1. / (1. - alpha ** (epoch + 1)))

        if loss_mean < best_loss:
            best_loss = loss_mean
            torch.save({'state_dict': model.state_dict()}, 'model_best.pth')

        model.eval()
        acc = calc_metrics(model, val_loader)
        print('Acc : %.2f' % acc)

def evaluation(model, loader):

    # test best model
    checkpoint = torch.load('model_best.pth')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    correct = 0
    total = 0
    for i, (samples, labels) in enumerate(loader):
        samples = samples.cuda()
        labels = labels.requires_grad_(False).cuda()
        outputs = model(samples)
        _, predicted = torch.max(outputs.detach(), 1)
        total += labels.size(0)
        correct += (predicted == labels.detach().view_as(predicted)).sum()
    acc = 100 * float(correct) / total
    print('Acc (best model): %.2f' % acc)

In [17]:
class config():
    def __init__(self):
        # global vars
        self.n_exp = 5 # number of experiments, try 5 different seed
        self.k = 100 # keep k labeled data in whole training set, other without label

        # dataset vars
        self.m = 0.1307
        self.s = 0.3081

        # model vars
        self.drop = 0.5 # dropout probability
        self.std = 0.15 # std of gaussian noise
        self.fm1 = 32 # channels of the first conv
        self.fm2 = 64 # channels of the second conv
        self.w_norm = True

        # optim vars
        self.learning_rate = 0.002
        self.beta2 = 0.99 # second momentum for Adam
        self.num_epochs = 50
        self.batch_size = 64

        # temporal ensembling vars
        self.alpha = 0.6 # ensembling momentum
        self.data_norm = 'channelwise' # image normalization
        self.divide_by_bs = False # whether we divide supervised cost by batch_size

        # RNG
        self.rng = np.random.RandomState(42)
        self.seeds = [self.rng.randint(200) for _ in range(self.n_exp)]

In [18]:
cfg = vars(config())

# prepare data
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(cfg['m'], cfg['s'])])
train_dataset, val_dataset = mnist_dataset(root='~/datasets/MNIST', transform=transform)
ntrain = len(train_dataset)


for i in range(cfg['n_exp']):
    model = CNN(cfg['batch_size'], cfg['std'], fm1=cfg['fm1'], fm2=cfg['fm2']).cuda()
    seed = cfg['seeds'][i]
    train_loader, val_loader, indices = sample_train(train_dataset, val_dataset, batch_size=cfg['batch_size'],
                                                 k=cfg['k'], n_classes=10, seed=seed, shuffle_train=False)
    train(model, train_loader, val_loader,seed, cfg['k'],cfg['alpha'],cfg['learning_rate'],
         cfg['num_epochs'], cfg['batch_size'], ntrain)
    evaluation(model, val_loader)


Epoch: 1
---------------------
Epoch [1/50], Loss: 0.765479, Supervised Loss: 0.809218, Unsupervised Loss: 0.065700, Time: 6.61
Acc : 31.85

Epoch: 2
---------------------
Epoch [2/50], Loss: 0.552805, Supervised Loss: 0.579083, Unsupervised Loss: 0.061820, Time: 6.38
Acc : 42.54

Epoch: 3
---------------------
Epoch [3/50], Loss: 0.421480, Supervised Loss: 0.441230, Unsupervised Loss: 0.061802, Time: 6.73
Acc : 70.33

Epoch: 4
---------------------
Epoch [4/50], Loss: 0.347495, Supervised Loss: 0.363312, Unsupervised Loss: 0.059535, Time: 6.45
Acc : 60.76

Epoch: 5
---------------------
Epoch [5/50], Loss: 0.291636, Supervised Loss: 0.304673, Unsupervised Loss: 0.055390, Time: 6.62
Acc : 71.49

Epoch: 6
---------------------
Epoch [6/50], Loss: 0.253021, Supervised Loss: 0.263902, Unsupervised Loss: 0.051658, Time: 6.88
Acc : 79.96

Epoch: 7
---------------------
Epoch [7/50], Loss: 0.222358, Supervised Loss: 0.231762, Unsupervised Loss: 0.048294, Time: 7.03
Acc : 80.06

Epoch: 8
---

Acc : 80.15

Epoch: 9
---------------------
Epoch [9/50], Loss: 0.173898, Supervised Loss: 0.179889, Unsupervised Loss: 0.039826, Time: 7.36
Acc : 77.87

Epoch: 10
---------------------
Epoch [10/50], Loss: 0.159780, Supervised Loss: 0.165246, Unsupervised Loss: 0.038018, Time: 7.18
Acc : 81.50

Epoch: 11
---------------------
Epoch [11/50], Loss: 0.146967, Supervised Loss: 0.151937, Unsupervised Loss: 0.036467, Time: 7.21
Acc : 78.86

Epoch: 12
---------------------
Epoch [12/50], Loss: 0.136103, Supervised Loss: 0.140689, Unsupervised Loss: 0.035116, Time: 7.10
Acc : 83.58

Epoch: 13
---------------------
Epoch [13/50], Loss: 0.125796, Supervised Loss: 0.130030, Unsupervised Loss: 0.033915, Time: 7.08
Acc : 86.66

Epoch: 14
---------------------
Epoch [14/50], Loss: 0.118158, Supervised Loss: 0.122089, Unsupervised Loss: 0.032863, Time: 7.09
Acc : 89.46

Epoch: 15
---------------------
Epoch [15/50], Loss: 0.110984, Supervised Loss: 0.114687, Unsupervised Loss: 0.031945, Time: 7.01
A

KeyboardInterrupt: 