# Supervised training of a WideResNet-28 on CIFAR-10

## Initialisation

In [1]:
import os.path as osp
import torch

from argparse import Namespace

from mlu.datasets.utils import generate_indexes
from mlu.metrics.categorical import CategoricalAccuracy
from mlu.metrics.incremental import IncrementalMean
from mlu.nn import CrossEntropyWithVectors, OneHot
from mlu.utils.misc import reset_seed, get_datetime, get_lr, get_nb_parameters
from mlu.utils.printers import LinePrinter

from torch import nn
from torch.nn import Module
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Subset
from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets import CIFAR10
from torchvision.transforms import ToTensor, RandomAffine, RandomHorizontalFlip, Compose

In [2]:
# Central run configuration. Everything a reader may want to tune lives here.
args = Namespace()
args.seed = 1234

# Hyperparameters
args.nb_epochs = 200
args.bsize = 128
# Number of labeled training samples to use; the training set is only
# subsampled when this is smaller than the dataset size.
args.nb_labels = 50000
args.lr = 0.1

# SGD parameters
args.weight_decay = 0.0005
args.momentum = 0.9  # called "beta" in paper
args.nesterov = False

# Scheduler parameters (passed to MultiStepLR as `gamma` and `milestones`)
lr_decay_gamma = 0.2
lr_decay_milestones = [60, 120, 160]

# "SGD" selects SGD (case-insensitive); any other value selects Adam.
optimizer_name = "SGD"

# Seed before any model or data object is created so runs are repeatable.
reset_seed(args.seed)

dataset_root = osp.join("..", "datasets")
tensorboard_root = osp.join("..", "results", "tensorboard")
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing at `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Model

In [3]:
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
	"""Build a bias-free 3x3 convolution whose padding equals its dilation."""
	conv_options = dict(
		kernel_size=3,
		stride=stride,
		padding=dilation,  # padding == dilation keeps the size when stride is 1
		dilation=dilation,
		groups=groups,
		bias=False,
	)
	return nn.Conv2d(in_planes, out_planes, **conv_options)


def conv1x1(in_planes, out_planes, stride=1):
	"""Build a bias-free 1x1 (pointwise) convolution."""
	pointwise = nn.Conv2d(
		in_planes,
		out_planes,
		kernel_size=1,
		stride=stride,
		bias=False,
	)
	return pointwise


class BasicBlock(nn.Module):
	"""Residual block made of two 3x3 convolutions, each followed by a norm layer.

	The block computes ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``
	where ``shortcut`` is ``downsample`` when given and the identity otherwise.

	Args:
		inplanes: number of input channels.
		planes: number of output channels of both convolutions.
		stride: stride of the first convolution.
		downsample: optional module applied to the input to build the shortcut;
			needed when the main branch changes the shape of the input.
		groups: accepted for signature compatibility; not used by this block.
		base_width: accepted for signature compatibility; not used by this block.
		dilation: accepted for signature compatibility; not used by this block.
		norm_layer: callable ``num_channels -> module`` used after each
			convolution. Defaults to ``nn.BatchNorm2d`` when ``None``.
	"""

	# Ratio between the block's output channels and `planes`. Both convolutions
	# output `planes` channels, so the ratio is 1. It is read on the class
	# (`block.expansion`) by the network that stacks the blocks.
	expansion = 1

	def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
				 base_width=64, dilation=1, norm_layer=None):
		super(BasicBlock, self).__init__()

		# Without this default, the documented `norm_layer=None` crashed with
		# "'NoneType' object is not callable" on the first norm layer.
		if norm_layer is None:
			norm_layer = nn.BatchNorm2d

		# Both self.conv1 and self.downsample layers downsample the input when stride != 1
		self.conv1 = conv3x3(inplanes, planes, stride)
		self.bn1 = norm_layer(planes)
		self.relu = nn.ReLU(inplace=True)
		self.conv2 = conv3x3(planes, planes)
		self.bn2 = norm_layer(planes)
		self.downsample = downsample
		self.stride = stride

	def forward(self, x):
		"""Apply the residual block to ``x`` and return the activated sum."""
		identity = x

		out = self.conv1(x)
		out = self.bn1(out)
		out = self.relu(out)

		out = self.conv2(out)
		out = self.bn2(out)

		# Project the shortcut when the main branch changed the shape.
		if self.downsample is not None:
			identity = self.downsample(x)

		out += identity
		out = self.relu(out)

		return out


class ResNet(Module):
	"""Three-stage residual network built from ``BasicBlock``.

	Pipeline: 3x3 stem convolution (3 input channels, stride 1) -> norm -> ReLU
	-> 3x3 max-pool (stride 2) -> three stages of residual blocks with
	``16*width``, ``32*width`` and ``64*width`` planes -> global average pooling
	-> linear classifier.

	Args:
		layers: indexable of three ints, the number of blocks in each stage.
		width: multiplier applied to the base channel counts (16, 32, 64).
		num_classes: size of the output of the final linear layer.
		zero_init_residual: if True, set the weight of the last norm layer
			(``bn2``) of every ``BasicBlock`` to zero.
		groups: stored on the instance and forwarded to every block.
		width_per_group: stored as ``self.base_width`` and forwarded to every block.
		replace_stride_with_dilation: ``None`` or a 3-element sequence of bools.
			Only elements 0 and 1 are read (for stages 2 and 3).
		norm_layer: callable ``num_channels -> module``; defaults to
			``nn.BatchNorm2d``.

	Raises:
		ValueError: if ``replace_stride_with_dilation`` is given and does not
			have exactly three elements.
	"""

	def __init__(self, layers, width: int = 2, num_classes=10, zero_init_residual=False,
				 groups=1, width_per_group=16, replace_stride_with_dilation=None,
				 norm_layer=None):
		Module.__init__(self)

		if norm_layer is None:
			norm_layer = nn.BatchNorm2d
		self._norm_layer = norm_layer

		block = BasicBlock
		# Running channel count: input width of the next block to be created.
		# It is updated by `_make_layer` as stages are built.
		self.inplanes = 16*width
		self.dilation = 1
		if replace_stride_with_dilation is None:
			# each element in the tuple indicates if we should replace
			# the 2x2 stride with a dilated convolution instead
			replace_stride_with_dilation = [False, False, False]
		if len(replace_stride_with_dilation) != 3:
			raise ValueError("replace_stride_with_dilation should be None "
							 "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
		self.groups = groups
		self.base_width = width_per_group
		# Stem: stride-1 3x3 convolution followed by a stride-2 max-pool.
		self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
		self.bn1 = norm_layer(self.inplanes)
		self.relu = nn.ReLU(inplace=True)
		self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
		# Three stages; stages 2 and 3 request stride 2 (or dilation instead,
		# when the matching `replace_stride_with_dilation` flag is set).
		self.layer1 = self._make_layer(block, 16*width, layers[0])
		self.layer2 = self._make_layer(block, 32*width, layers[1], stride=2,
									   dilate=replace_stride_with_dilation[0])
		self.layer3 = self._make_layer(block, 64*width, layers[2], stride=2,
									   dilate=replace_stride_with_dilation[1])

		# Classification head: pool every feature map to 1x1, then a linear layer.
		self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
		self.fc = nn.Linear(64 * width * block.expansion, num_classes)

		# Weight initialisation: Kaiming-normal for convolutions, (1, 0) for the
		# affine parameters of normalisation layers.
		for m in self.modules():
			if isinstance(m, nn.Conv2d):
				nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')

			elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
				nn.init.constant_(m.weight, 1)
				nn.init.constant_(m.bias, 0)

		# Zero-initialize the last BN in each residual branch,
		# so that the residual branch starts with zeros, and each residual block behaves like an identity.
		# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
		if zero_init_residual:
			for m in self.modules():
				if isinstance(m, BasicBlock):
					nn.init.constant_(m.bn2.weight, 0)

	def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
		"""Build one stage made of ``blocks`` consecutive ``block`` modules.

		The first block receives ``stride`` and, when the shape changes, a
		1x1-convolution + norm shortcut; the remaining blocks use the block's
		default stride. Updates ``self.inplanes`` (and ``self.dilation`` when
		``dilate`` is True) as a side effect.

		Args:
			block: block class; must expose an ``expansion`` attribute.
			planes: base number of channels of the stage.
			blocks: number of blocks in the stage.
			stride: stride requested for the first block.
			dilate: if True, multiply ``self.dilation`` by ``stride`` and use
				stride 1 instead.

		Returns:
			An ``nn.Sequential`` holding the blocks of the stage.
		"""
		norm_layer = self._norm_layer
		downsample = None
		# The first block is given the dilation in effect before this stage.
		previous_dilation = self.dilation
		if dilate:
			self.dilation *= stride
			stride = 1
		# A projection shortcut is needed whenever the first block changes the
		# spatial size (stride != 1) or the number of channels.
		if stride != 1 or self.inplanes != planes * block.expansion:
			downsample = nn.Sequential(
				conv1x1(self.inplanes, planes * block.expansion, stride),
				norm_layer(planes * block.expansion),
			)

		layers = []
		layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
							self.base_width, previous_dilation, norm_layer))
		# Every following block takes the output width of the first one as input.
		self.inplanes = planes * block.expansion
		for _ in range(1, blocks):
			layers.append(block(self.inplanes, planes, groups=self.groups,
								base_width=self.base_width, dilation=self.dilation,
								norm_layer=norm_layer))

		return nn.Sequential(*layers)

	def _forward_impl(self, x):
		"""Run the stem, the three stages and the classifier head on ``x``."""
		# See note [TorchScript super()]
		# NOTE(review): the note referenced above is not in the visible part of this file.
		x = self.conv1(x)
		x = self.bn1(x)
		x = self.relu(x)
		x = self.maxpool(x)

		x = self.layer1(x)
		x = self.layer2(x)
		x = self.layer3(x)

		x = self.avgpool(x)
		# Flatten everything except the first dimension before the linear layer.
		x = torch.flatten(x, 1)
		x = self.fc(x)

		return x

	def forward(self, x):
		"""Return the output of the final linear layer for input ``x``."""
		return self._forward_impl(x)


class WideResNet28(ResNet):
	"""``ResNet`` preset with four blocks in each of its three stages."""

	def __init__(self, num_classes: int, width: int = 2):
		# Same number of residual blocks in every stage.
		blocks_per_stage = [4, 4, 4]
		super().__init__(
			layers=blocks_per_stage,
			width=width,
			num_classes=num_classes,
		)

### Build models, optimizer, metrics, and utilities

In [4]:
# Network: WideResNet-28 with width factor 2, moved to the target device.
model = WideResNet28(num_classes=10, width=2)
model = model.to(device)


def activation(x, dim):
	"""Softmax over `dim`, floored at 2e-20.

	NOTE(review): the floor presumably protects a log in the criterion
	against exact zeros - confirm against CrossEntropyWithVectors.
	"""
	return x.softmax(dim).clamp(2e-20)


# Loss computed between the predicted distribution and the target vectors.
criterion = CrossEntropyWithVectors()

# Optimizer: SGD when requested (case-insensitive), Adam for any other name.
if optimizer_name.upper() == "SGD":
	optim = SGD(
		model.parameters(),
		lr=args.lr,
		weight_decay=args.weight_decay,
		momentum=args.momentum,
		nesterov=args.nesterov,
	)
else:
	optim = Adam(model.parameters(), lr=args.lr)

# Multiply the learning rate by `lr_decay_gamma` at each milestone epoch.
scheduler = MultiStepLR(optim, milestones=lr_decay_milestones, gamma=lr_decay_gamma)

# Metrics computed on the training and on the validation predictions.
metrics_train = {"train/acc": CategoricalAccuracy(vector_target=True)}
metrics_val = {"val/acc": CategoricalAccuracy(vector_target=True)}

# TensorBoard writer; each run logs into its own timestamped directory.
run_name = "CIFAR10_%s_WideResNet28_Supervised_Notebook" % get_datetime()
writer = SummaryWriter(osp.join(tensorboard_root, run_name))

# Helper that formats the running values printed during an epoch.
printer = LinePrinter()

print("Nb model parameters : ", get_nb_parameters(model))

Nb model parameters :  1472554


## Data preparation

### Build datasets

In [5]:
# Training-time augmentation: random translation (no rotation, since the
# rotation range is 0), random horizontal flip, then conversion to a tensor.
train_steps = [
	RandomAffine(degrees=0, translate=(1 / 16, 1 / 16)),
	RandomHorizontalFlip(),
	ToTensor(),
]
transform_train = Compose(train_steps)

# Validation images are only converted to tensors, without augmentation.
val_steps = [ToTensor()]
transform_val = Compose(val_steps)

# Targets are turned into vectors over the 10 classes.
target_transform = OneHot(nb_classes=10)

In [6]:
# Training split of CIFAR-10 (downloaded on first use).
dataset_train = CIFAR10(
	dataset_root,
	train=True,
	download=True,
	transform=transform_train,
	target_transform=target_transform,
)

# Fraction of the training split that is used for supervised training.
supervised_ratio = args.nb_labels / len(dataset_train)

# Subsample only when fewer labels than available samples were requested.
if args.nb_labels < len(dataset_train):
	indexes_per_ratio = generate_indexes(
		dataset_train,
		nb_classes=10,
		ratios=[supervised_ratio],
		target_one_hot=True,
	)
	# A single ratio was requested, so keep the first index list.
	indexes_s = indexes_per_ratio[0]
	dataset_train = Subset(dataset_train, indexes_s)

print("Use {} labels. (proportion of the dataset = {}/1.0)".format(args.nb_labels, supervised_ratio))

# Validation data: the CIFAR-10 test split, without augmentation.
dataset_val = CIFAR10(
	dataset_root,
	train=False,
	download=True,
	transform=transform_val,
	target_transform=target_transform,
)

Files already downloaded and verified
Use 50000 labels. (proportion of the dataset = 1.0/1.0)
Files already downloaded and verified


### Build loaders

In [7]:
# Training batches are reshuffled every epoch and loaded by 4 worker processes.
loader_train = DataLoader(
	dataset_train,
	batch_size=args.bsize,
	shuffle=True,
	num_workers=4,
	drop_last=False,
)

# Validation batches keep the dataset order; the last partial batch is kept too.
loader_val = DataLoader(
	dataset_val,
	batch_size=args.bsize,
	shuffle=False,
	drop_last=False,
)

## Training

In [8]:
def train(epoch: int):
	"""Train the model for one pass over `loader_train`.

	Uses the notebook-level `model`, `optim`, `criterion`, `activation`,
	`metrics_train`, `printer` and `writer`. Running means of the loss and of
	every training metric are printed after each batch and written to
	TensorBoard, together with the learning rate, at the end of the epoch.

	Args:
		epoch: index of the current epoch, used as the TensorBoard step.
	"""
	model.train()

	# One running mean for the loss and one per training metric.
	running = {"train/loss": IncrementalMean()}
	for key in metrics_train:
		running[key] = IncrementalMean()

	nb_batches = len(loader_train)

	for step, (inputs, targets) in enumerate(loader_train):
		inputs = inputs.to(device).float()
		targets = targets.to(device).long()

		optim.zero_grad()

		# Forward pass: logits -> activated predictions.
		probs = activation(model(inputs), dim=1)

		# Backward pass and parameter update.
		loss = criterion(probs, targets)
		loss.backward()
		optim.step()

		# Bookkeeping only; no gradients needed.
		with torch.no_grad():
			running["train/loss"].add(loss.item())

			for key, metric in metrics_train.items():
				running[key].add(metric(probs, targets).item())

			snapshot = {key: mean.get_current() for key, mean in running.items()}
			snapshot["train/lr"] = get_lr(optim)
			printer.print_current_values(snapshot, step, nb_batches, epoch)

	# Epoch summary for TensorBoard.
	for key, mean in running.items():
		writer.add_scalar(key, mean.get_current(), epoch)
	writer.add_scalar("train/lr", get_lr(optim), epoch)

In [9]:
def val(epoch: int):
	"""Evaluate the model on `loader_val` and log the metrics.

	Uses the notebook-level `model`, `metrics_val`, `printer` and `writer`.
	The running mean of every validation metric is printed after each batch
	and written to TensorBoard at the end of the pass.

	Autograd is disabled inside the function, so it is safe to call on its
	own; calling it from within an outer `torch.no_grad()` block still works.

	Args:
		epoch: index of the current epoch, used as the TensorBoard step.
	"""
	model.eval()

	metric_names = list(metrics_val.keys())
	continue_metrics = {name: IncrementalMean() for name in metric_names}

	# Inference only: without this, a direct call would build an autograd
	# graph for every validation batch.
	with torch.no_grad():
		for i, (x, y) in enumerate(loader_val):
			x = x.to(device).float()
			y = y.to(device).long()

			# Compute prediction
			logits = model(x)
			pred = torch.softmax(logits, dim=1)

			for name, metric in metrics_val.items():
				score = metric(pred, y)
				continue_metrics[name].add(score.item())

			current_values = {name: continue_metric.get_current() for name, continue_metric in continue_metrics.items()}
			printer.print_current_values(current_values, i, len(loader_val), epoch)

	# Save metrics in tensorboard
	for name, continue_metric in continue_metrics.items():
		writer.add_scalar(name, continue_metric.get_current(), epoch)

## Start learning

In [10]:
# Main loop: one training pass, one validation pass, then one scheduler step
# per epoch.
try:
	for e in range(args.nb_epochs):
		train(e)
		with torch.no_grad():
			val(e)
		if scheduler is not None:
			scheduler.step()
		print("")
finally:
	# Close the TensorBoard writer even when training is interrupted
	# (e.g. kernel interrupt) or raises, so the logged events are not lost.
	writer.close()

train, epoch   1, 100%, acc: 3.6314e-01, loss: 1.7386e+00, lr: 1.0000e-01, took (s): 6.95
val  , epoch   1, 100%, acc: 5.0307e-01, took (s): 2.00

train, epoch   2, 100%, acc: 5.5263e-01, loss: 1.2334e+00, lr: 1.0000e-01, took (s): 7.04
val  , epoch   2, 100%, acc: 5.2146e-01, took (s): 2.00

train, epoch   3, 100%, acc: 6.3858e-01, loss: 1.0188e+00, lr: 1.0000e-01, took (s): 7.10
val  , epoch   3, 100%, acc: 6.6525e-01, took (s): 1.99

train, epoch   4, 100%, acc: 6.8548e-01, loss: 8.9032e-01, lr: 1.0000e-01, took (s): 6.92
val  , epoch   4, 100%, acc: 6.5803e-01, took (s): 2.08

train, epoch   5, 100%, acc: 7.1624e-01, loss: 8.0996e-01, lr: 1.0000e-01, took (s): 7.76
val  , epoch   5, 100%, acc: 6.0117e-01, took (s): 2.05

train, epoch   6, 100%, acc: 7.3955e-01, loss: 7.4932e-01, lr: 1.0000e-01, took (s): 6.94
val  , epoch   6, 100%, acc: 7.1737e-01, took (s): 2.03

train, epoch   7, 100%, acc: 7.5805e-01, loss: 6.9850e-01, lr: 1.0000e-01, took (s): 6.92
val  , epoch   