# Meta-Learning for parameter initialization

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
from IPython.display import clear_output
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets
import matplotlib.pyplot as plt
from itertools import product
from sklearn.metrics import confusion_matrix

### Download the dataset

In [2]:
# Use standard FashionMNIST dataset
train_set = torchvision.datasets.FashionMNIST(
    root = './data/FashionMNIST',
    train = True,
    download = True,
    transform = transforms.Compose([
        transforms.ToTensor()    # turn images into Tensor so we can directly use it with our network                             
    ])
)

In [3]:
'''Checking the data set classes and amount of data in each class.'''
idx2class = {v: k for k, v in train_set.class_to_idx.items()}
def get_class_distribution(dataset_obj):
    count_dict = {k:0 for k,v in dataset_obj.class_to_idx.items()}
    
    for element in dataset_obj:
        y_lbl = element[1]
        y_lbl = idx2class[y_lbl]
        count_dict[y_lbl] += 1
            
    return count_dict
print("Distribution of classes: \n", get_class_distribution(train_set))

Distribution of classes: 
 {'T-shirt/top': 6000, 'Trouser': 6000, 'Pullover': 6000, 'Dress': 6000, 'Coat': 6000, 'Sandal': 6000, 'Shirt': 6000, 'Sneaker': 6000, 'Bag': 6000, 'Ankle boot': 6000}


### Five different initialization approaches.

### 1. init with kaiming_normal_
torch.nn.init.kaiming_normal_(tensor)


In [4]:
input_size = 784
hidden_sizes = [128, 64]
output_size = 10

model_kaiming = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.LogSoftmax(dim=1))
def init_ones(m):
    if type(m) == nn.Linear:
        nn.init.kaiming_normal_(m.weight)
        
model_kaiming.apply(init_ones)
print(model_kaiming)

Sequential(
  (0): Linear(in_features=784, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
  (5): LogSoftmax(dim=1)
)


### 2. init with  uniform_: 
torch.nn.init.uniform_(tensor, a=0.0, b=1.0)


In [5]:
input_size = 784
hidden_sizes = [128, 64]
output_size = 10

model_uniform = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.LogSoftmax(dim=1))
def init_uniform(m):
    if type(m) == nn.Linear:
        nn.init.uniform_(m.weight)
        
model_uniform.apply(init_uniform)
print(model_uniform)

Sequential(
  (0): Linear(in_features=784, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
  (5): LogSoftmax(dim=1)
)


### 3. init with eye_: 
torch.nn.init.eye_(tensor)

In [6]:
input_size = 784
hidden_sizes = [128, 64]
output_size = 10

model_eye = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.LogSoftmax(dim=1))
def init_zeros(m):
    if type(m) == nn.Linear:
        nn.init.eye_(m.weight)
        
model_eye.apply(init_zeros)
print(model_eye)

Sequential(
  (0): Linear(in_features=784, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
  (5): LogSoftmax(dim=1)
)


### 4. init with  normal_: 
torch.nn.init.normal_(tensor, mean=0.0, std=1.0)


In [7]:
input_size = 784
hidden_sizes = [128, 64]
output_size = 10

model_normal = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.LogSoftmax(dim=1))
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight)
        
model_normal.apply(init_normal)
print(model_normal)

Sequential(
  (0): Linear(in_features=784, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
  (5): LogSoftmax(dim=1)
)


### 5. init with constant_: 
torch.nn.init.constant_(tensor, val)


In [8]:
input_size = 784
hidden_sizes = [128, 64]
output_size = 10

model_constant = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.LogSoftmax(dim=1))
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 0.3)
        
model_constant.apply(init_constant)
print(model_constant)

Sequential(
  (0): Linear(in_features=784, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
  (5): LogSoftmax(dim=1)
)


### preparing dataloaders

In [9]:

train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=34, num_workers=0)
dataiterator = iter(train_dataloader)
images, labels = dataiterator.next()
# Loss
criterion = nn.CrossEntropyLoss()
images, labels = next(iter(train_dataloader))
images = images.view(images.shape[0], -1)

logps = model_normal(images) #log probabilities
loss = criterion(logps, labels) #calculate the Cross Entropy Loss

### Training

In [10]:
learning_rate = 1e-4
optimizer = torch.optim.Adam(model_normal.parameters(), lr=learning_rate)

epochs = 12
m_loss_uniform = []
m_loss_normal = []
m_loss_constant = []
m_loss_kaiming = []
m_loss_eye = []
for epoch in range(epochs):
    run_loss_uniform = 0
    run_loss_normal = 0
    run_loss_constant = 0
    run_loss_kaiming = 0
    run_loss_eye = 0
    
    for images, labels in train_dataloader:
        # Flatten FashioMNIST images into a 784 long vector
        images = images.view(images.shape[0], -1)
    
        # Training pass
        optimizer.zero_grad() # set the gradients to zero before starting to do backpropragation
        
        out_uniform = model_uniform(images)
        out_normal = model_normal(images)
        out_constant = model_constant(images)
        out_kaiming = model_kaiming(images)
        out_eye = model_eye(images)
        
        
        loss_uniform = criterion(out_uniform, labels)
        loss_normal = criterion(out_normal, labels)
        loss_constant = criterion(out_constant, labels)
        loss_kaiming = criterion(out_kaiming, labels)
        loss_eye = criterion(out_eye, labels)
        
        # Training model by backprop
        loss_uniform.backward()
        loss_normal.backward()
        loss_constant.backward()
        loss_kaiming.backward()
        loss_eye.backward()
        
        # optimize the weights
        optimizer.step()
        
        run_loss_uniform += loss_uniform.item()
        run_loss_normal += loss_normal.item()
        run_loss_constant += loss_constant.item()
        run_loss_kaiming += loss_kaiming.item()
        run_loss_eye += loss_eye.item()
        
        m_loss_uniform.append(loss_uniform.item())
        m_loss_normal.append(loss_normal.item())
        m_loss_constant.append(loss_constant.item())
        m_loss_kaiming.append(loss_kaiming.item())
        m_loss_eye.append(loss_eye.item())
        
    print("Epoch {} - Training loss with uniform weight: {}".format(epoch, run_loss_uniform/len(train_dataloader)))
    print("Epoch {} - Training loss with normal weight: {}".format(epoch, run_loss_normal/len(train_dataloader)))
    print("Epoch {} - Training loss with constant weights: {}".format(epoch, run_loss_constant/len(train_dataloader)))
    print("Epoch {} - Training loss with kaiming : {}".format(epoch, run_loss_kaiming/len(train_dataloader)))
    print("Epoch {} - Training loss with eye: {}".format(epoch, run_loss_eye/len(train_dataloader)))

Epoch 0 - Training loss with uniform weight: 42522.54282932011
Epoch 0 - Training loss with normal weight: 149.35448463915426
Epoch 0 - Training loss with constant weights: 2.3040513098071047
Epoch 0 - Training loss with kaiming : 2.585275750254774
Epoch 0 - Training loss with eye: 2.315396511250785
Epoch 1 - Training loss with uniform weight: 42522.54282932011
Epoch 1 - Training loss with normal weight: 44.55685523846332
Epoch 1 - Training loss with constant weights: 2.3040513098071047
Epoch 1 - Training loss with kaiming : 2.585275750254774
Epoch 1 - Training loss with eye: 2.315396511250785
Epoch 2 - Training loss with uniform weight: 42522.54282932011
Epoch 2 - Training loss with normal weight: 30.550186090496375
Epoch 2 - Training loss with constant weights: 2.3040513098071047
Epoch 2 - Training loss with kaiming : 2.585275750254774
Epoch 2 - Training loss with eye: 2.315396511250785
Epoch 3 - Training loss with uniform weight: 42522.54282932011
Epoch 3 - Training loss with normal

In [11]:
@torch.no_grad()
def get_predictions(model, loader):
    predictions = torch.tensor([])
    for batch in loader:
        images, labels = batch
        images = images.view(images.shape[0], -1)
        pred = model(images)
        predictions = torch.cat(
            (predictions, pred)
            ,dim=0
        )
    return predictions

with torch.no_grad():
    train_model_uniform = get_predictions(model_uniform, train_dataloader)
    train_model_normal = get_predictions(model_normal, train_dataloader)
    train_model_constant = get_predictions(model_constant, train_dataloader)
    train_model_kaiming = get_predictions(model_kaiming, train_dataloader)
    train_model_eye = get_predictions(model_eye, train_dataloader)

### Compare and report your results using:
classification_report and confusion_matrix

In [12]:
def get_accuracy(predictions, labels):
    return predictions.argmax(dim=1).eq(labels).sum().item()

print('No. of correction predictions: ', get_accuracy(train_model_uniform, train_set.targets))
print('accuracy - uniform weights:', get_accuracy(train_model_uniform, train_set.targets) / len(train_set))

print('No. of correction predictions: ',get_accuracy(train_model_normal, train_set.targets))
print('accuracy - normal weights:', get_accuracy(train_model_normal, train_set.targets) / len(train_set))

print('No. of correction predictions: ',get_accuracy(train_model_constant, train_set.targets))
print('accuracy - constant weights:', get_accuracy(train_model_constant, train_set.targets) / len(train_set))

print('No. of correction predictions: ',get_accuracy(train_model_kaiming, train_set.targets))
print('accuracy kaiming:', get_accuracy(train_model_kaiming, train_set.targets) / len(train_set))

print('No. of correction predictions: ',get_accuracy(train_model_eye, train_set.targets))
print('accuracy eye:', get_accuracy(train_model_eye, train_set.targets) / len(train_set))

No. of correction predictions:  6000
accuracy - uniform weights: 0.1
No. of correction predictions:  40676
accuracy - normal weights: 0.6779333333333334
No. of correction predictions:  6985
accuracy - constant weights: 0.11641666666666667
No. of correction predictions:  4916
accuracy kaiming: 0.08193333333333333
No. of correction predictions:  6012
accuracy eye: 0.1002


In [13]:
print('\n Confusion Matrix with kaiming_normal_ \n')
print(confusion_matrix(train_set.targets, train_model_kaiming.argmax(dim=1)))

print('\n Confusion Matrix for Uniform Weights \n')
print(confusion_matrix(train_set.targets, train_model_uniform.argmax(dim=1)))

print('\n Confusion Matrix with eye_ \n')
print(confusion_matrix(train_set.targets, train_model_eye.argmax(dim=1)))

print('\n Confusion Matrix for Normal Weights \n')
print(confusion_matrix(train_set.targets, train_model_normal.argmax(dim=1)))

print('\n Confusion Matrix for Constant Weights \n')
print(confusion_matrix(train_set.targets, train_model_constant.argmax(dim=1)))


 Confusion Matrix with kaiming_normal_ 

[[ 112    0  340 3011   71    0  424 2042    0    0]
 [ 102    0  294 2573   34    0 1926 1071    0    0]
 [ 202    1  404  122   75    0   35 5161    0    0]
 [  75    0  418 1315   83    0 1925 2184    0    0]
 [ 104    0  558   78  211    0   88 4961    0    0]
 [ 195   48 1539 1710   31    0  230 2247    0    0]
 [ 141    0  339  805  113    0  225 4377    0    0]
 [ 373    0 1188 1672   15    0  103 2649    0    0]
 [ 505    9 1136  169   52    0  362 3767    0    0]
 [  77    2  519 1411   10    0  872 3109    0    0]]

 Confusion Matrix for Uniform Weights 

[[   0    0    0    0    0    0    0    0    0 6000]
 [   0    0    0    0    0    0    0    0    0 6000]
 [   0    0    0    0    0    0    0    0    0 6000]
 [   0    0    0    0    0    0    0    0    0 6000]
 [   0    0    0    0    0    0    0    0    0 6000]
 [   0    0    0    0    0    0    0    0    0 6000]
 [   0    0    0    0    0    0    0    0    0 6000]
 [   0    0    

### Observation

- Model with "Normal distribution" has a better prediction result than all the other 4 models with different init weights.

- The model with normal weights has 46553 correction predictions and an accuracy of 0.77 i.e. ~ 77%