In [8]:
import torch, math, copy
import numpy as np
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F


from keras.datasets import cifar10
from keras.utils.np_utils import to_categorical   


transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])

#(X_train, y_train), (X_test, y_test) = cifar10.load_data()

#train_dataset = datasets.MNIST("data", train=True, download=True, transform=transform)
train_dataset = datasets.CIFAR10("data", train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True)

#test_dataset = datasets.MNIST("data", train=False, download=True, transform=transform)
test_dataset = datasets.CIFAR10("data", train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=256, shuffle=False)


def train(epochs, model, criterion, optimizer, train_loader, test_loader):
    for epoch in range(epochs):
        train_err = train_epoch(model, criterion, optimizer, train_loader)
        test_err = test(model, test_loader)
        print('Epoch {:03d}/{:03d}, Train Error {:.2f}% || Test Error {:.2f}%'.format(epoch, epochs, train_err*100, test_err*100))
    return train_err, test_err
    
def train_epoch(model, criterion, optimizer, loader):
    total_correct = 0.
    total_samples = 0.
    
    for batch_idx, (data, target) in enumerate(loader):
        if torch.cuda.is_available():
            data, target = data.cuda(), target.cuda()

        # insert code to feed the data to the model and collect its output
        output = model(data)

        # insert code to compute the loss from output and the true target
        loss = criterion(output,target)

        # insert code to update total_correct and total_samples
        # total_correct: total number of correctly classified samples
        # total_samples: total number of samples seen so far
        total_samples += len(target)
        pred = output.argmax(dim=1, keepdim=True)  
        total_correct += pred.eq(target.view_as(pred)).sum().item()
            
        # insert code to update the parameters using optimizer
        # be careful in this part as an incorrect implementation will affect
        # all your experiments and have a significant impact on your grade!
        # in particular, note that pytorch does --not-- automatically
        # clear the parameter's gradients: check tutorials to see
        # how this can be done with a single method call.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    return 1 - total_correct/total_samples
    
def test(model, loader):
    total_correct = 0.
    total_samples = 0.
    model.eval()
    
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(loader):
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()

            # insert code to feed the data to the model and collect its output
            output = model(data)

            # insert code to update total_correct and total_samples
            # total_correct: total number of correctly classified samples
            # total_samples: total number of samples seen so far
            total_samples += len(target)
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            total_correct += pred.eq(target.view_as(pred)).sum().item()
            


    return 1 - total_correct/total_samples


criterion = torch.nn.CrossEntropyLoss()


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting data/cifar-10-python.tar.gz to data
Files already downloaded and verified


In [6]:
!pip install d2l==0.17.0

import torch
from torch import nn
from d2l import torch as d2l


def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # Use `is_grad_enabled` to determine whether the current mode is training
    # mode or prediction mode
    if not torch.is_grad_enabled():
        # If it is prediction mode, directly use the mean and variance
        # obtained by moving average
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
        #if torch.mean(moving_var)>torch.mean(gamma):
        Y = (torch.sqrt(moving_var)+9.*gamma)/10. * X_hat + (9.*beta+moving_mean)/10.
        #else:
        #  Y=X_hat * torch.sqrt(moving_var + eps) + beta
    
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # When using a fully-connected layer, calculate the mean and
            # variance on the feature dimension
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
            X_hat = (X - mean) / torch.sqrt(var)
            
        else:
            # When using a two-dimensional convolutional layer, calculate the
            # mean and variance on the channel dimension (axis=1). Here we
            # need to maintain the shape of `X`, so that the broadcasting
            # operation can be carried out later
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # In training mode, the current mean and variance are used for the
        # standardization
            X_hat = (X - mean) / torch.sqrt(var + eps)
            
            
        #if torch.mean(var)>torch.mean(gamma):
        Y = (9.*gamma+torch.sqrt(var))/10. * X_hat + (9.*beta+mean)/10.  # Scale and shift
        #else:
        #  Y=X_hat *torch.sqrt(var) + beta      
    
        # Update the mean and variance using moving average
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
        
    return Y, moving_mean.data, moving_var.data


class BatchNorm(nn.Module):
    # `num_features`: the number of outputs for a fully-connected layer
    # or the number of output channels for a convolutional layer. `num_dims`:
    # 2 for a fully-connected layer and 4 for a convolutional layer
    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # The scale parameter and the shift parameter (model parameters) are
        # initialized to 1 and 0, respectively
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # The variables that are not model parameters are initialized to 0 and 1
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.ones(shape)

    def forward(self, X):
        # If `X` is not on the main memory, copy `moving_mean` and
        # `moving_var` to the device where `X` is located
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Save the updated `moving_mean` and `moving_var`
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean,
            self.moving_var, eps=1e-5, momentum=0.9)
        return Y



In [3]:
#Define a new class to connect the residual with ELU, BatchNorm, and Conv.
class ResBlock(nn.Module):
    def __init__(self,in_channels):
        super(ResBlock, self).__init__()
        self.conv=nn.Sequential(nn.Conv2d(in_channels, in_channels, 3, 1, 1),
                                 BatchNorm(in_channels, num_dims=4),
                                 nn.ELU())
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # insert code to compute sigma
                sigma = np.sqrt(1/(m.out_channels*m.kernel_size[0]*m.kernel_size[0]))
                m.weight.data.normal_(0, sigma)
                m.bias.data.zero_()
        
    def forward(self, input):        
        u=self.conv(input)+input
        return u
        


class ResNet(nn.Module):
    def __init__(self, k):
        super(ResNet, self).__init__()

        # write code here to instantiate layers
        # for example, self.conv = nn.Conv2d(1, 4, 3, 1, 1)
        # creates a conv layer with 1 input channel, 4 output
        # channels, a 3x3 kernel, and stride=padding=1
        self.conv1=[]
        self.conv1.append(nn.Conv2d(1, 4, 3, 1, 1))
        self.conv1.append(BatchNorm(4, num_dims=4))
        self.conv1.append(nn.ELU())
        for i in range((k//3)-1):
         self.conv1.append(ResBlock(4))         
                
        self.conv2=[]
        self.conv2.append(nn.Conv2d(4, 8, 3, 1, 1))
        self.conv2.append(BatchNorm(8, num_dims=4))
        self.conv2.append(nn.ELU())
        for i in range((k//3)-1):
         self.conv2.append(ResBlock(8))
                 
        self.conv3=[]
        self.conv3.append(nn.Conv2d(8, 16, 3, 1, 1))
        self.conv3.append(BatchNorm(16, num_dims=4))
        self.conv3.append(nn.ELU())
        for i in range((k//3)-1):
         self.conv3.append(ResBlock(16))         
        
        self.conv1=nn.Sequential(*self.conv1)
        self.conv2=nn.Sequential(*self.conv2)
        self.conv3=nn.Sequential(*self.conv3)       
       
        self.ln = nn.Linear(144, 10)
        self.flat = nn.Flatten()     

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # insert code to compute sigma
                sigma = np.sqrt(1/(m.out_channels*m.kernel_size[0]*m.kernel_size[0]))
                m.weight.data.normal_(0, sigma)
                m.bias.data.zero_()
        
    def forward(self, input):
        
        # write code here to define how the output u is computed
        # from the input and the model's layers
        # for example, u = self.conv(input) defines u
        # to be simply the output of self.conv given 'input'
        x=self.conv1(input) 
        x=F.avg_pool2d(x,kernel_size=2,stride=2)
        
        x=self.conv2(x) 
        x=F.avg_pool2d(x,kernel_size=2,stride=2)
        
        x=self.conv3(x) 
        x=F.avg_pool2d(x,kernel_size=2,stride=2)
              
        u=self.flat(x)
        u=self.ln(u) 
        return u

In [15]:
#Convex combination improvement:
lr = 0.01
for k in [24, 36]:
  print("\nTraining ResNet with {} layers".format(k))
  model = ResNet(k).cuda()
  optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
  train_errs, test_errs = train(3, model, criterion, optimizer, train_loader, test_loader)


Training ResNet with 24 layers
Epoch 000/003, Train Error 59.95% || Test Error 52.69%
Epoch 001/003, Train Error 58.23% || Test Error 50.05%
Epoch 002/003, Train Error 47.53% || Test Error 45.27%

Training ResNet with 36 layers
Epoch 000/003, Train Error 62.41% || Test Error 53.92%
Epoch 001/003, Train Error 54.01% || Test Error 49.53%
Epoch 002/003, Train Error 46.99% || Test Error 45.50%


In [16]:

#Original BN paper
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # Use `is_grad_enabled` to determine whether the current mode is training
    # mode or prediction mode
    if not torch.is_grad_enabled():
        # If it is prediction mode, directly use the mean and variance
        # obtained by moving average
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # When using a fully-connected layer, calculate the mean and
            # variance on the feature dimension
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # When using a two-dimensional convolutional layer, calculate the
            # mean and variance on the channel dimension (axis=1). Here we
            # need to maintain the shape of `X`, so that the broadcasting
            # operation can be carried out later
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # In training mode, the current mean and variance are used for the
        # standardization
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the mean and variance using moving average
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # Scale and shift
    return Y, moving_mean.data, moving_var.data

class BatchNorm(nn.Module):
    # `num_features`: the number of outputs for a fully-connected layer
    # or the number of output channels for a convolutional layer. `num_dims`:
    # 2 for a fully-connected layer and 4 for a convolutional layer
    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # The scale parameter and the shift parameter (model parameters) are
        # initialized to 1 and 0, respectively
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # The variables that are not model parameters are initialized to 0 and 1
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.ones(shape)

    def forward(self, X):
        # If `X` is not on the main memory, copy `moving_mean` and
        # `moving_var` to the device where `X` is located
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Save the updated `moving_mean` and `moving_var`
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean,
            self.moving_var, eps=1e-5, momentum=0.9)
        return Y


lr = 0.01

In [17]:
k=60
print("\nTraining ResNet with {} layers".format(k))
model = ResNet(k).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
train_errs, test_errs = train(3, model, criterion, optimizer, train_loader, test_loader)


Training ResNet with 60 layers
Epoch 000/003, Train Error 71.73% || Test Error 61.80%
Epoch 001/003, Train Error 59.02% || Test Error 56.25%
Epoch 002/003, Train Error 53.16% || Test Error 51.02%


In [18]:
#Change intitilaization
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # Use `is_grad_enabled` to determine whether the current mode is training
    # mode or prediction mode
    if not torch.is_grad_enabled():
        # If it is prediction mode, directly use the mean and variance
        # obtained by moving average
        X_hat = (X - moving_mean) #/ torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # When using a fully-connected layer, calculate the mean and
            # variance on the feature dimension
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # When using a two-dimensional convolutional layer, calculate the
            # mean and variance on the channel dimension (axis=1). Here we
            # need to maintain the shape of `X`, so that the broadcasting
            # operation can be carried out later
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # In training mode, the current mean and variance are used for the
        # standardization
        X_hat = (X - mean) #/ torch.sqrt(var + eps)
        # Update the mean and variance using moving average
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # Scale and shift
    return Y, moving_mean.data, moving_var.data


class BatchNorm(nn.Module):
    # `num_features`: the number of outputs for a fully-connected layer
    # or the number of output channels for a convolutional layer. `num_dims`:
    # 2 for a fully-connected layer and 4 for a convolutional layer
    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # The scale parameter and the shift parameter (model parameters) are
        # initialized to 1 and 0, respectively
        self.init=True
        self.gamma = nn.Parameter(torch.ones(shape)) 
        self.beta = nn.Parameter(torch.zeros(shape))
        # The variables that are not model parameters are initialized to 0 and 1
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.ones(shape)

    def forward(self, X):
        # If `X` is not on the main memory, copy `moving_mean` and
        # `moving_var` to the device where `X` is located
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Save the updated `moving_mean` and `moving_var`
        if self.init:
          self.init=False
          mean = X.mean(dim=0)
          var = ((X - mean) ** 2).mean(dim=0)            
          for i in self.gamma:
            i = i/torch.sqrt(var)


        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean,
            self.moving_var, eps=1e-5, momentum=0.9)
        return Y


In [19]:
lr = 0.01
for k in[3, 6]:
  print("\nTraining ResNet with {} layers".format(k))
  model = ResNet(k).cuda()
  optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
  train_errs, test_errs = train(3, model, criterion, optimizer, train_loader, test_loader)


Training ResNet with 3 layers
Epoch 000/003, Train Error 63.13% || Test Error 56.59%
Epoch 001/003, Train Error 53.28% || Test Error 50.80%
Epoch 002/003, Train Error 48.83% || Test Error 48.20%

Training ResNet with 6 layers
Epoch 000/003, Train Error 60.92% || Test Error 53.04%
Epoch 001/003, Train Error 53.17% || Test Error 48.54%
Epoch 002/003, Train Error 45.94% || Test Error 45.63%


In [14]:
#Not using
class ResBlock(nn.Module):
    def __init__(self,in_channels):
        super(ResBlock, self).__init__()
        self.conv=nn.Sequential(nn.Conv2d(in_channels, in_channels, 3, 1, 1),
                                 nn.BatchNorm2d(in_channels),
                                 nn.ELU())
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # insert code to compute sigma
                sigma = np.sqrt(1/(m.out_channels*m.kernel_size[0]*m.kernel_size[0]))
                m.weight.data.normal_(0, sigma)
                m.bias.data.zero_()
        
    def forward(self, input):        
        u=self.conv(input)+input
        return u
        


class ResNet(nn.Module):
    def __init__(self, k):
        super(ResNet, self).__init__()

        # write code here to instantiate layers
        # for example, self.conv = nn.Conv2d(1, 4, 3, 1, 1)
        # creates a conv layer with 1 input channel, 4 output
        # channels, a 3x3 kernel, and stride=padding=1
        self.conv1=[]
        self.conv1.append(nn.Conv2d(3, 3, 3, 1, 1))
        self.conv1.append(nn.BatchNorm2d(3))
        self.conv1.append(nn.ELU())
        for i in range((k//3)-1):
         self.conv1.append(ResBlock(3))         
                
        self.conv2=[]
        self.conv2.append(nn.Conv2d(3, 6, 3, 1, 1))
        self.conv2.append(nn.BatchNorm2d(6))
        self.conv2.append(nn.ELU())
        for i in range((k//3)-1):
         self.conv2.append(ResBlock(6))
                 
        self.conv3=[]
        self.conv3.append(nn.Conv2d(6, 12, 3, 1, 1))
        self.conv3.append(nn.BatchNorm2d(12))
        self.conv3.append(nn.ELU())
        for i in range((k//3)-1):
         self.conv3.append(ResBlock(12))         
        
        self.conv1=nn.Sequential(*self.conv1)
        self.conv2=nn.Sequential(*self.conv2)
        self.conv3=nn.Sequential(*self.conv3)       
       
        self.ln = nn.Linear(192, 10)
        self.flat = nn.Flatten()     

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # insert code to compute sigma
                sigma = np.sqrt(1/(m.out_channels*m.kernel_size[0]*m.kernel_size[0]))
                m.weight.data.normal_(0, sigma)
                m.bias.data.zero_()
        
    def forward(self, input):
        
        # write code here to define how the output u is computed
        # from the input and the model's layers
        # for example, u = self.conv(input) defines u
        # to be simply the output of self.conv given 'input'
        x=self.conv1(input) 
        x=F.avg_pool2d(x,kernel_size=2,stride=2)
        
        x=self.conv2(x) 
        x=F.avg_pool2d(x,kernel_size=2,stride=2)
        
        x=self.conv3(x) 
        x=F.avg_pool2d(x,kernel_size=2,stride=2)
              
        u=self.flat(x)
        u=self.ln(u) 
        return u

   
k = 12
lr = 0.01

print("\nTraining ResNet with {} layers".format(k))
model = ResNet(k).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
train_errs, test_errs = train(3, model, criterion, optimizer, train_loader, test_loader)


Training ResNet with 12 layers
Epoch 000/003, Train Error 60.86% || Test Error 53.57%
Epoch 001/003, Train Error 54.22% || Test Error 50.24%
Epoch 002/003, Train Error 47.39% || Test Error 45.22%
