# <Semi-supervised learning tutorial 2 - pseudo labeling>

In [1]:
import os
import math
import random
import time
import numpy as np

import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

import easydict# dictionary의 속성을 dot(.)을 사용하여 표기가능
from tqdm.auto import tqdm #ipython파일에서 출력을 깔끔하게하기위해 tqdm.tqdm 대신 tqdm.auto.tqdm 또는 tqdm.notebook.tqdm 사용
from PIL import Image # PIL(Python Imaging Library)

from augmentation import RandAugmentCIFAR # 데이터 증강에 필요한 함수 작성해 모아놓은 augmentation.py 파일
from models import WideResNet # 모델 관련 함수 작성해 모아놓은 models.py 파일

In [2]:
# Single place for every tunable value used by the notebook.
args = easydict.EasyDict(dict(
    seed=0,
    gpu=0,
    start_step=0,
    total_steps=2000,  # original note: 300000
    eval_step=20,  # original note: 100
    lambda_u=1,

    # for supervised learning
    total_epoch=100,

    # for data
    data_path="./data",
    num_data=10000,  # original note: 50000
    num_labeled=1000,  # original note: 5000
    num_classes=10,  # number of classes
    resize=32,  # resize image
    batch_size=64,
    mu=1,  # coefficient of unlabeled batch size

    # for WideResNet model
    depth=10,  # default 28; (depth - 4) % 6 must be 0; reduced to shorten training time
    widen_factor=1,  # default 2; reduced to shorten training time
    teacher_dropout=0,  # dropout on last dense layer of teacher model
    student_dropout=0,  # dropout on last dense layer of student model

    # for optimizing
    teacher_lr=0.01,  # train learning rate of teacher model
    student_lr=0.01,  # train learning rate of student model
    momentum=0.9,  # SGD momentum
    nesterov=True,  # use nesterov
    weight_decay=0.01,  # train weight decay
))

In [3]:
# Use the configured GPU when CUDA is available; otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU.
args.device = torch.device('cuda', args.gpu) if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed every random number generator the notebook relies on with the same value.
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

In [5]:
# Raw CIFAR-10 splits (no transforms); only the training split triggers a download.
base_dataset = datasets.CIFAR10(root=args.data_path, train=True, download=True)
test_dataset = datasets.CIFAR10(root=args.data_path, train=False, download=False)

Files already downloaded and verified


In [6]:
def l_u_split(args, labels):
    """Split sample indices into a class-balanced labeled set and an unlabeled set.

    Args:
        args: object exposing ``num_labeled``, ``num_classes`` and ``num_data``.
        labels: sequence of integer class ids, one per sample.

    Returns:
        ``(labeled_idx, unlabeled_idx)`` as integer NumPy arrays, both shuffled.
        ``labeled_idx`` holds ``num_labeled // num_classes`` indices per class;
        ``unlabeled_idx`` holds indices not in ``labeled_idx``, truncated to
        ``(num_data // num_classes - num_labeled // num_classes) * num_classes``.

    Raises:
        ValueError: propagated from ``np.random.choice`` when a class has fewer
            samples than the requested per-class labeled count.
    """
    label_per_class = args.num_labeled // args.num_classes
    # The unlabeled pool is capped so the total amount of data (and training time) stays small.
    num_unlabel_data = ((args.num_data // args.num_classes) - label_per_class) * args.num_classes

    print(f'클래스별 labeled data 개수 : {label_per_class}')
    print(f'Labeled data 개수 : {label_per_class * args.num_classes}')
    print(f'Unlabeled data 개수 : {num_unlabel_data}')

    labels = np.array(labels)

    # Draw the same number of labeled samples from every class, without replacement.
    labeled_idx = []
    for i in range(args.num_classes):
        idx = np.where(labels == i)[0]
        idx = np.random.choice(idx, label_per_class, False)
        labeled_idx.extend(idx)
    # Force an integer dtype so an empty selection can still be used for indexing.
    labeled_idx = np.array(labeled_idx, dtype=np.intp)
    np.random.shuffle(labeled_idx)

    # Everything not chosen as labeled, in ascending index order. A boolean mask
    # replaces the per-element `in` scan over `labeled_idx`, which was quadratic.
    is_unlabeled = np.ones(len(labels), dtype=bool)
    is_unlabeled[labeled_idx] = False
    unlabeled_idx = np.flatnonzero(is_unlabeled)
    np.random.shuffle(unlabeled_idx)
    unlabeled_idx = unlabeled_idx[:num_unlabel_data]

    return labeled_idx, unlabeled_idx

In [7]:
# Split the CIFAR-10 training indices into labeled and unlabeled subsets.
labeled_idxs, unlabeled_idxs = l_u_split(args, base_dataset.targets)

클래스별 labeled data 개수 : 100
Labeled data 개수 : 1000
Unlabeled data 개수 : 9000


In [8]:
# Sanity check: number of labeled indices.
len(labeled_idxs)

1000

In [9]:
# Sanity check: number of unlabeled indices.
len(unlabeled_idxs)

9000

In [10]:
# Mean / std triples passed to transforms.Normalize below.
cifar10_mean = (0.4914, 0.482158, 0.4465231)
cifar10_std = (0.247032, 0.243485, 0.2615877)

# Labeled training images: random flip and padded random crop, then tensor conversion
# and normalisation.
transform_labeled = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(
        args.resize,
        padding=int(args.resize * 0.125),
        fill=128,
        padding_mode='constant',
    ),
    transforms.ToTensor(),
    transforms.Normalize(cifar10_mean, cifar10_std),
])

# Test images: tensor conversion and normalisation only, no augmentation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(cifar10_mean, cifar10_std),
])

class CustomTransform(object):
    """Callable that turns one image into two normalised views.

    The first view goes through random flip + padded random crop only; the second
    additionally goes through ``RandAugmentCIFAR(n=2, m=10)``. Each view is
    produced by its own independent pass over the input.
    """

    def __init__(self, args, mean, std):
        def flip_and_crop():
            # Build fresh transform instances so each pipeline owns its own objects.
            return [
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(size=args.resize,
                                      padding=int(args.resize * 0.125),
                                      fill=128,
                                      padding_mode='constant'),
            ]

        self.ori = transforms.Compose(flip_and_crop())
        self.aug = transforms.Compose(flip_and_crop() + [RandAugmentCIFAR(n=2, m=10)])
        self.normalize = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std),
        ])

    def __call__(self, x):
        """Return ``(normalised self.ori view, normalised self.aug view)`` of ``x``."""
        light_view = self.ori(x)
        heavy_view = self.aug(x)
        return self.normalize(light_view), self.normalize(heavy_view)

In [11]:
class CustomCIFAR10SSL(datasets.CIFAR10):
    """CIFAR-10 dataset restricted to a chosen subset of sample indices.

    Args:
        root: dataset root directory, forwarded to ``datasets.CIFAR10``.
        indexs: indices of the samples to keep, or ``None`` to keep all of them.
        train: forwarded to ``datasets.CIFAR10``.
        transform: optional callable applied to the PIL image.
        target_transform: optional callable applied to the target.
        download: forwarded to ``datasets.CIFAR10``.
    """

    def __init__(self, root, indexs, train=True,
                 transform=None, target_transform=None, download=False):
        super().__init__(root, train=train,
                         transform=transform,
                         target_transform=target_transform,
                         download=download)
        if indexs is not None:
            # Keep only the selected samples; targets become an array so they
            # can be indexed with the same index collection as the images.
            self.data = self.data[indexs]
            self.targets = np.array(self.targets)[indexs]

    def __getitem__(self, index):
        """Return ``(image, target)`` for ``index`` with both transforms applied."""
        img, target = self.data[index], self.targets[index]
        img = Image.fromarray(img)

        if self.transform is not None:
            img = self.transform(img)

        # The constructor accepts target_transform, so it must be honoured here;
        # previously it was silently ignored.
        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

In [12]:
# Labeled subset: standard training augmentation.
labeled_dataset = CustomCIFAR10SSL(
    root=args.data_path,
    indexs=labeled_idxs,
    train=True,
    transform=transform_labeled,
)
# Unlabeled subset: every image is returned as a pair of views by CustomTransform.
unlabeled_dataset = CustomCIFAR10SSL(
    root=args.data_path,
    indexs=unlabeled_idxs,
    train=True,
    transform=CustomTransform(args, mean=cifar10_mean, std=cifar10_std),
)
# Test split, this time with the evaluation transform attached.
test_dataset = datasets.CIFAR10(
    root=args.data_path,
    train=False,
    download=False,
    transform=transform_test,
)

In [13]:
# Training loaders draw samples in random order and drop the final incomplete batch.
labeled_loader = DataLoader(
    dataset=labeled_dataset,
    batch_size=args.batch_size,
    sampler=RandomSampler(labeled_dataset),
    drop_last=True,
)
# The unlabeled batch is `mu` times the labeled batch size.
unlabeled_loader = DataLoader(
    dataset=unlabeled_dataset,
    batch_size=args.batch_size * args.mu,
    sampler=RandomSampler(unlabeled_dataset),
    drop_last=True,
)
# The test loader walks the dataset in order and keeps every sample.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=args.batch_size,
    sampler=SequentialSampler(test_dataset),
)

# Supervised learning

In [14]:
# Build the teacher network and move it to the configured device in one step.
teacher_model = WideResNet(
    depth=args.depth,
    widen_factor=args.widen_factor,
    num_classes=args.num_classes,
    dropout=0,
    dense_dropout=args.teacher_dropout,
).to(args.device)

# Parameter count in millions (K = 1e3, M = 1e6, G = 1e9).
print(f"Params: {sum(p.numel() for p in teacher_model.parameters())/1e6:.2f}M")

Params: 0.08M


In [15]:
# SGD over the teacher's parameters for the supervised baseline.
# NOTE(review): args.weight_decay is defined in the config but not passed here - confirm intent.
optimizer = optim.SGD(
    teacher_model.parameters(),
    lr=args.teacher_lr,
    momentum=args.momentum,
    nesterov=args.nesterov,
)
# Cross-entropy with its default settings.
criterion = nn.CrossEntropyLoss()

In [16]:
# Supervised baseline: train the teacher on the labeled loader only and time the whole run.
since = time.time()

for epoch in range(args.total_epoch):
    # Put the model in training mode.
    teacher_model.train()
    
    # Batch-size-weighted loss sum and sample count for this epoch.
    running_loss = 0
    running_total = 0
    
    for inputs, targets in labeled_loader:
        inputs = inputs.to(args.device)
        targets = targets.to(args.device, dtype=torch.long)
        
        # Reset the parameter gradients.
        optimizer.zero_grad()
        
        # forward
        outputs = teacher_model(inputs)
        loss = criterion(outputs, targets)
        
        # backward
        loss.backward()
        optimizer.step()
        
        # Accumulate the batch loss, weighted by the batch size.
        running_loss += loss.item() * inputs.size(0)
        running_total += inputs.size(0)

    # Average loss over all samples seen in this epoch.
    epoch_loss = running_loss / running_total
    print(f'{epoch+1} Loss : {epoch_loss:.4f}')
    
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

1 Loss : 2.3356
2 Loss : 2.1040
3 Loss : 2.0085
4 Loss : 1.9521
5 Loss : 1.9039
6 Loss : 1.8594
7 Loss : 1.8283
8 Loss : 1.8181
9 Loss : 1.7788
10 Loss : 1.7856
11 Loss : 1.7563
12 Loss : 1.7323
13 Loss : 1.7137
14 Loss : 1.7024
15 Loss : 1.6590
16 Loss : 1.6627
17 Loss : 1.6569
18 Loss : 1.6140
19 Loss : 1.6137
20 Loss : 1.6192
21 Loss : 1.5772
22 Loss : 1.5979
23 Loss : 1.5628
24 Loss : 1.5603
25 Loss : 1.5550
26 Loss : 1.5562
27 Loss : 1.5239
28 Loss : 1.5268
29 Loss : 1.5118
30 Loss : 1.4932
31 Loss : 1.4881
32 Loss : 1.5004
33 Loss : 1.4744
34 Loss : 1.4648
35 Loss : 1.4302
36 Loss : 1.4433
37 Loss : 1.4216
38 Loss : 1.4106
39 Loss : 1.4329
40 Loss : 1.4056
41 Loss : 1.3836
42 Loss : 1.3731
43 Loss : 1.3744
44 Loss : 1.3725
45 Loss : 1.3454
46 Loss : 1.3518
47 Loss : 1.3360
48 Loss : 1.3538
49 Loss : 1.2863
50 Loss : 1.2770
51 Loss : 1.3090
52 Loss : 1.2970
53 Loss : 1.2912
54 Loss : 1.2287
55 Loss : 1.2644
56 Loss : 1.2571
57 Loss : 1.2468
58 Loss : 1.2252
59 Loss : 1.1935
60 Los

In [17]:
# Evaluate the supervised teacher on the test loader (evaluation mode, no gradients).
teacher_model.eval()
with torch.no_grad():
    corrects = 0
    total = 0
    for inputs, targets in test_loader:
        inputs = inputs.to(args.device)
        targets = targets.to(args.device, dtype=torch.long)
        
        # forward
        outputs = teacher_model(inputs)
        
        # Predict the class whose output value is largest.
        _, preds = torch.max(outputs, 1)
        
        # Accumulate the number of correct predictions per batch.
        corrects += torch.sum(preds == targets.data)
        total += targets.size(0)

# Fraction of test samples predicted correctly.
test_acc = corrects.double() / total
print('Testing Acc: {:.4f}'.format(test_acc))

Testing Acc: 0.3158


In [18]:
# Keep the supervised teacher's weights so a freshly built teacher can be initialised from them.
# NOTE(review): state_dict() is not a deep copy - confirm this model is not trained further
# before the snapshot is loaded.
teacher_model_parameter = teacher_model.state_dict()

# Semi-supervised learning using pseudo labeling

In [19]:
# Fresh teacher network; its weights are loaded from the supervised run in the next cell.
teacher_model = WideResNet(num_classes=args.num_classes,
                           depth=args.depth,
                           widen_factor=args.widen_factor,
                           dropout=0,
                           dense_dropout=args.teacher_dropout)
teacher_model.to(args.device)

# Student network with the same architecture, using its own dropout setting
# (this previously reused args.teacher_dropout by mistake).
student_model = WideResNet(num_classes=args.num_classes,
                           depth=args.depth,
                           widen_factor=args.widen_factor,
                           dropout=0,
                           dense_dropout=args.student_dropout)
student_model.to(args.device)

WideResNet(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (block1): NetworkBlock(
    (layer): Sequential(
      (0): BasicBlock(
        (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
        (relu1): LeakyReLU(negative_slope=0.1, inplace=True)
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
        (relu2): LeakyReLU(negative_slope=0.1, inplace=True)
        (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
    )
  )
  (block2): NetworkBlock(
    (layer): Sequential(
      (0): BasicBlock(
        (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.001, affine=True, track_running_stats=True)
        (relu1): LeakyReLU(negative_slope=0.1, inplace=True)
        (conv1): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), pad

In [20]:
# Initialise the new teacher with the weights saved from the supervised run.
teacher_model.load_state_dict(teacher_model_parameter)

<All keys matched successfully>

In [21]:
# Loss shared by teacher and student.
criterion = nn.CrossEntropyLoss()

# Separate SGD optimizers so teacher and student can use their own learning rates.
# NOTE(review): args.weight_decay is defined in the config but not passed to either - confirm intent.
t_optimizer = optim.SGD(
    teacher_model.parameters(),
    lr=args.teacher_lr,
    momentum=args.momentum,
    nesterov=args.nesterov,
)
s_optimizer = optim.SGD(
    student_model.parameters(),
    lr=args.student_lr,
    momentum=args.momentum,
    nesterov=args.nesterov,
)

In [22]:
def _next_batch(loader, iterator):
    """Return ``(batch, iterator)``, creating or restarting the iterator when needed.

    Raises:
        ValueError: if a freshly created iterator over ``loader`` yields nothing.
    """
    if iterator is not None:
        try:
            return next(iterator), iterator
        except StopIteration:
            pass  # one pass finished: fall through and start another
    iterator = iter(loader)
    try:
        return next(iterator), iterator
    except StopIteration:
        raise ValueError('data loader yielded no batches') from None


def train_pseudo_labeling(args, teacher_model, student_model, t_optimizer, s_optimizer, criterion):
    """Jointly train a teacher on labeled data and a student on labeled + pseudo-labeled data.

    Each step the teacher is updated with the supervised loss on the labeled
    batch; its argmax predictions on the unlabeled batch become hard pseudo
    labels. The student is updated with its supervised loss plus
    ``args.lambda_u`` times its loss against those pseudo labels.

    Batches come from the module-level ``labeled_loader`` and
    ``unlabeled_loader``; both are restarted whenever they run out.

    Args:
        args: object exposing ``start_step``, ``total_steps``, ``eval_step``,
            ``lambda_u`` and ``device``.
        teacher_model: network producing the pseudo labels.
        student_model: network trained on labeled and pseudo-labeled batches.
        t_optimizer: optimizer stepping the teacher.
        s_optimizer: optimizer stepping the student.
        criterion: loss taking ``(logits, integer targets)``.

    Returns:
        None. Mean losses are printed at every ``eval_step`` boundary and the
        elapsed time is printed at the end.

    Raises:
        ValueError: if either loader yields no batches at all.
    """
    since = time.time()

    # Iterators are created lazily on first use and recreated when exhausted.
    labeled_iter = None
    unlabeled_iter = None

    # Created up front so a start_step that is not a multiple of eval_step works too.
    s_losses, t_losses, l_losses, u_losses = [], [], [], []

    for step in range(args.start_step, args.total_steps):
        if step % args.eval_step == 0:
            # Report only when there is something to average (nothing yet on the first step).
            if s_losses:
                print('{} Step - Teacher loss: {:.4f} Student loss: {:.4f}\nl_loss: {:.4f} u_loss: {:.4f}'.format(
                    step, np.mean(t_losses), np.mean(s_losses),
                    np.mean(l_losses), np.mean(u_losses)))
            s_losses, t_losses, l_losses, u_losses = [], [], [], []

        teacher_model.train()
        student_model.train()

        # Use the built-in next(): iterators have no .next() method in Python 3.
        (images_l, targets), labeled_iter = _next_batch(labeled_loader, labeled_iter)
        ((images_uw, images_us), _), unlabeled_iter = _next_batch(unlabeled_loader, unlabeled_iter)

        images_l = images_l.to(args.device)
        images_uw = images_uw.to(args.device)
        images_us = images_us.to(args.device)
        targets = targets.to(args.device, dtype=torch.long)

        # Reset the parameter gradients.
        t_optimizer.zero_grad()
        s_optimizer.zero_grad()

        # Teacher forward pass over labeled and unlabeled images in one batch.
        batch_size = images_l.shape[0]
        t_images = torch.cat((images_l, images_uw))
        t_logits = teacher_model(t_images)
        t_logits_l = t_logits[:batch_size]
        t_logits_uw = t_logits[batch_size:]
        del t_logits

        t_loss_l = criterion(t_logits_l, targets)

        # Hard pseudo labels: the teacher's most probable class per unlabeled image.
        soft_pseudo_label = torch.softmax(t_logits_uw, dim=-1)
        _, hard_pseudo_label = torch.max(soft_pseudo_label, dim=-1)

        # Student forward pass.
        # NOTE(review): the student sees images_uw, the same view as the teacher;
        # images_us is moved to the device but never used - confirm this is intended.
        s_images = torch.cat((images_l, images_uw))
        s_logits = student_model(s_images)
        s_logits_l = s_logits[:batch_size]
        s_logits_us = s_logits[batch_size:]
        del s_logits

        s_loss_l = criterion(s_logits_l, targets)
        # Pseudo labels are detached so the student loss does not reach the teacher.
        s_loss_u = criterion(s_logits_us, hard_pseudo_label.detach())
        s_loss = s_loss_l + (args.lambda_u * s_loss_u)

        # backward
        t_loss_l.backward()
        t_optimizer.step()

        s_loss.backward()
        s_optimizer.step()

        s_losses.append(s_loss.item())
        t_losses.append(t_loss_l.item())
        l_losses.append(s_loss_l.item())
        u_losses.append(s_loss_u.item())

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

In [23]:
# Run pseudo-label training; progress is printed every args.eval_step steps.
train_pseudo_labeling(args, teacher_model, student_model, t_optimizer, s_optimizer, criterion)

20 Step - Teacher loss: 0.8793 Student loss: 4.3520
l_loss: 2.1902 u_loss: 2.1618
40 Step - Teacher loss: 0.8881 Student loss: 3.8720
l_loss: 1.9775 u_loss: 1.8945
60 Step - Teacher loss: 0.8400 Student loss: 3.6188
l_loss: 1.8774 u_loss: 1.7413
80 Step - Teacher loss: 0.8156 Student loss: 3.4440
l_loss: 1.7887 u_loss: 1.6553
100 Step - Teacher loss: 0.8475 Student loss: 3.3086
l_loss: 1.7160 u_loss: 1.5926
120 Step - Teacher loss: 0.8269 Student loss: 3.1813
l_loss: 1.6792 u_loss: 1.5021
140 Step - Teacher loss: 0.7983 Student loss: 3.0981
l_loss: 1.6346 u_loss: 1.4635
160 Step - Teacher loss: 0.8461 Student loss: 3.0499
l_loss: 1.6366 u_loss: 1.4134
180 Step - Teacher loss: 0.8615 Student loss: 3.0389
l_loss: 1.6353 u_loss: 1.4035
200 Step - Teacher loss: 0.8049 Student loss: 2.9594
l_loss: 1.5775 u_loss: 1.3819
220 Step - Teacher loss: 0.7798 Student loss: 2.8510
l_loss: 1.5319 u_loss: 1.3191
240 Step - Teacher loss: 0.8027 Student loss: 2.8583
l_loss: 1.5342 u_loss: 1.3241
260 Step

Training complete in 2m 50s


In [24]:
def test(args, model, loader):
    model.eval()
    with torch.no_grad():
        corrects = 0
        total = 0
        for inputs, targets in loader:
            inputs = inputs.to(args.device)
            targets = targets.to(args.device, dtype=torch.long)

            # forward
            outputs = model(inputs)

            # output 중 최대값의 위치에 해당하는 class로 예측 수행
            _, preds = torch.max(outputs, 1)

            # batch별 정답 개수를 축적함
            corrects += torch.sum(preds == targets.data)
            total += targets.size(0)

    test_acc = corrects.double() / total
    print('Testing Acc: {:.4f}'.format(test_acc))

In [25]:
# Accuracy of the pseudo-label-trained student on the test loader.
test(args, student_model, test_loader)

Testing Acc: 0.4509
