# ==== Import Library ====

In [1]:
# import library
import argparse,copy,h5py, os,sys,time,socket
import numpy as np
import tensorflow as tf
import torch,torchvision,torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torch.nn.functional as F
from   torchsummary import summary
from   torch.autograd.variable import Variable

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from mpl_toolkits.mplot3d import Axes3D  
import matplotlib.pyplot as plt
from matplotlib import ticker,colors

# seed value and plotly
# Switch plotly to notebook mode, then pin every random generator used in this file to seed 7.
init_notebook_mode(connected=True)
torch.manual_seed(7)
torch.cuda.manual_seed_all(7)
np.set_printoptions(suppress=True, precision=3)
tf.set_random_seed(7)

# ==== Set up the Network ====

In [2]:
# need the module 
class AllConvNet(nn.Module):
    """All-convolutional classifier made of nine bias-free convolutions.

    ``input_size`` is passed to the first convolution as its number of input
    channels; ``n_classes`` is the number of output channels of the final
    1x1 convolution. Extra keyword arguments are accepted and ignored.
    """

    def __init__(self, input_size, n_classes=10, **kwargs):
        super().__init__()
        # Stage 1: 64 channels; conv3 halves the spatial size (stride 2).
        self.conv1 = nn.Conv2d(input_size, 64, kernel_size=3, padding=1, bias=False)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
        # Stage 2: 128 channels; conv6 halves the spatial size again.
        self.conv4 = nn.Conv2d(64, 128, kernel_size=3, padding=1, bias=False)
        self.conv5 = nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False)
        self.conv6 = nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=1, bias=False)
        # Head: one 3x3 convolution, then two 1x1 convolutions ending in the class maps.
        self.conv7 = nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False)
        self.conv8 = nn.Conv2d(128, 128, kernel_size=1, bias=False)
        self.class_conv = nn.Conv2d(128, n_classes, kernel_size=1, bias=False)

    def forward(self, x):
        """Return one score per class for each sample, shape ``(batch, n_classes)``."""
        # ELU is applied after conv1, conv3, conv5, conv7 and class_conv only;
        # conv2, conv4, conv6 and conv8 feed the next layer without an activation.
        out = F.elu(self.conv1(x))
        out = F.elu(self.conv3(self.conv2(out)))
        out = F.elu(self.conv5(self.conv4(out)))
        out = F.elu(self.conv7(self.conv6(out)))
        out = F.elu(self.class_conv(self.conv8(out)))

        # Global average pool to 1x1, then drop the two trailing singleton axes.
        pooled = F.adaptive_avg_pool2d(out, 1)
        pooled.squeeze_(-1)
        pooled.squeeze_(-1)
        return pooled
    
    
class AllConvNet_BN(nn.Module):
    """Wider all-convolutional classifier with batch norm after each hidden activation.

    ``input_size`` is the number of input channels of the first convolution and
    ``n_classes`` the number of output channels of the final 1x1 convolution.
    Extra keyword arguments are accepted and ignored.
    """

    def __init__(self, input_size, n_classes=10, **kwargs):
        super().__init__()

        # Stage 1: 96 channels; conv3 uses stride 2.
        self.conv1 = nn.Conv2d(input_size, 96, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(96)
        self.conv2 = nn.Conv2d(96, 96, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(96)
        self.conv3 = nn.Conv2d(96, 96, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(96)

        # Stage 2: 192 channels; conv6 uses stride 2.
        self.conv4 = nn.Conv2d(96, 192, kernel_size=3, padding=1, bias=False)
        self.bn4 = nn.BatchNorm2d(192)
        self.conv5 = nn.Conv2d(192, 192, kernel_size=3, padding=1, bias=False)
        self.bn5 = nn.BatchNorm2d(192)
        self.conv6 = nn.Conv2d(192, 192, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn6 = nn.BatchNorm2d(192)

        # Head: 3x3 convolution, 1x1 convolution, then the 1x1 class convolution.
        self.conv7 = nn.Conv2d(192, 192, kernel_size=3, padding=1, bias=False)
        self.bn7 = nn.BatchNorm2d(192)
        self.conv8 = nn.Conv2d(192, 192, kernel_size=1, bias=False)
        self.bn8 = nn.BatchNorm2d(192)

        self.class_conv = nn.Conv2d(192, n_classes, kernel_size=1, bias=False)

    def forward(self, x):
        """Return one score per class for each sample, shape ``(batch, n_classes)``."""
        hidden_stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
            (self.conv5, self.bn5),
            (self.conv6, self.bn6),
            (self.conv7, self.bn7),
            (self.conv8, self.bn8),
        )
        out = x
        for conv, bn in hidden_stages:
            # Order within a stage is convolution -> ReLU -> batch norm.
            out = bn(F.relu(conv(out)))

        out = F.relu(self.class_conv(out))

        # Global average pool to 1x1, then drop the two trailing singleton axes.
        pooled = F.adaptive_avg_pool2d(out, 1)
        pooled.squeeze_(-1)
        pooled.squeeze_(-1)
        return pooled
class AllConvNet_LocalRN(nn.Module):
    """All-convolutional classifier with local response normalisation after each hidden activation.

    ``input_size`` is the number of input channels of the first convolution and
    ``n_classes`` the number of output channels of the final 1x1 convolution.
    Extra keyword arguments are accepted and ignored.

    The normalisation layers keep the ``bn1`` .. ``bn8`` attribute names even
    though they are ``nn.LocalResponseNorm`` modules.
    """

    def __init__(self, input_size, n_classes=10, **kwargs):
        # Fixed: this used to call super(AllConvNet_BN, self).__init__(), which
        # raises TypeError because self is not an AllConvNet_BN instance.
        super(AllConvNet_LocalRN, self).__init__()

        # NOTE(review): LocalResponseNorm's first argument is the number of
        # neighbouring channels used for normalisation; here it is set to the
        # full channel count -- confirm this is intended.
        self.conv1 = nn.Conv2d(input_size,96, 3, padding=1,bias=False)
        self.bn1   = nn.LocalResponseNorm(96)

        self.conv2 = nn.Conv2d(96, 96, 3, padding=1,bias=False)
        self.bn2   = nn.LocalResponseNorm(96)

        self.conv3 = nn.Conv2d(96, 96, 3, padding=1, stride=2,bias=False)
        self.bn3   = nn.LocalResponseNorm(96)

        self.conv4 = nn.Conv2d(96, 192, 3, padding=1,bias=False)
        self.bn4   = nn.LocalResponseNorm(192)

        self.conv5 = nn.Conv2d(192, 192, 3, padding=1,bias=False)
        self.bn5   = nn.LocalResponseNorm(192)

        self.conv6 = nn.Conv2d(192, 192, 3, padding=1, stride=2,bias=False)
        self.bn6   = nn.LocalResponseNorm(192)

        self.conv7 = nn.Conv2d(192, 192, 3, padding=1,bias=False)
        self.bn7   = nn.LocalResponseNorm(192)

        self.conv8 = nn.Conv2d(192, 192, 1,bias=False)
        self.bn8   = nn.LocalResponseNorm(192)

        self.class_conv = nn.Conv2d(192, n_classes, 1,bias=False)

    def forward(self, x):
        """Return one score per class for each sample, shape ``(batch, n_classes)``."""
        # Every hidden stage is convolution -> ReLU -> local response norm.
        conv1_out = self.bn1(F.relu(self.conv1(x)))
        conv2_out = self.bn2(F.relu(self.conv2(conv1_out)))
        conv3_out = self.bn3(F.relu(self.conv3(conv2_out)))

        conv4_out = self.bn4(F.relu(self.conv4(conv3_out)))
        conv5_out = self.bn5(F.relu(self.conv5(conv4_out)))
        conv6_out = self.bn6(F.relu(self.conv6(conv5_out)))

        conv7_out = self.bn7(F.relu(self.conv7(conv6_out)))
        conv8_out = self.bn8(F.relu(self.conv8(conv7_out)))

        class_out = F.relu(self.class_conv(conv8_out))

        # Global average pool to 1x1, then drop the two trailing singleton axes.
        pool_out = F.adaptive_avg_pool2d(class_out, 1)
        pool_out.squeeze_(-1)
        pool_out.squeeze_(-1)
        return pool_out

In [3]:
# load model with some batch size
# Hyper-parameters.
batch_size = 128
learning_rate = 0.0008

# Three input channels, moved to the GPU.
net = AllConvNet(3).cuda()

# load the loss
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

# The network is put in eval mode before the summary is printed; AllConvNet
# contains only convolutions, so its forward pass has no mode-dependent layers.
net.eval()
summary(net, (3, 32, 32))
print("Cuda: ", next(net.parameters()).is_cuda)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]           1,728
            Conv2d-2           [-1, 64, 32, 32]          36,864
            Conv2d-3           [-1, 64, 16, 16]          36,864
            Conv2d-4          [-1, 128, 16, 16]          73,728
            Conv2d-5          [-1, 128, 16, 16]         147,456
            Conv2d-6            [-1, 128, 8, 8]         147,456
            Conv2d-7            [-1, 128, 8, 8]         147,456
            Conv2d-8            [-1, 128, 8, 8]          16,384
            Conv2d-9             [-1, 10, 8, 8]           1,280
Total params: 609,216
Trainable params: 609,216
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 1.82
Params size (MB): 2.32
Estimated Total Size (MB): 4.15
-------------------------------------------

In [4]:
#  Get the weights - after training
def get_weights(net):
    """Return the ``.data`` tensor of every parameter of ``net``, in ``net.parameters()`` order."""
    tensors = []
    for param in net.parameters():
        tensors.append(param.data)
    return tensors
def get_random_weights(weights):
    """Return one standard-normal tensor per entry of ``weights``, each with that entry's size.

    The tensors are created with ``torch.randn`` default placement, independent
    of where the entries of ``weights`` live.
    """
    directions = []
    for w in weights:
        directions.append(torch.randn(w.size()))
    return directions
def normalize_direction(direction, weights,norm='filter'):
    """Rescale ``direction`` in place according to ``norm``.

    Args:
        direction: tensor to rescale (modified in place). For 'filter' and
            'dfilter' it is iterated along its first dimension.
        weights: reference tensor for the 'filter', 'layer' and 'weight'
            modes; unused by 'dfilter' and 'dlayer'.
        norm: one of
            'filter'  - each slice of ``direction`` gets the norm of the matching slice of ``weights``;
            'layer'   - ``direction`` as a whole gets the norm of ``weights``;
            'weight'  - ``direction`` is multiplied element-wise by ``weights``;
            'dfilter' - each slice of ``direction`` is scaled to unit norm;
            'dlayer'  - ``direction`` as a whole is scaled to unit norm.

    Raises:
        ValueError: if ``norm`` is not one of the modes above (previously the
            call silently did nothing).
    """
    if norm == 'filter':
        for d, w in zip(direction, weights):
            # Move the reference norm to the direction's device so a CPU
            # direction can be scaled against weights held on the GPU.
            d.mul_(w.norm().to(d.device)/(d.norm() + 1e-10))

    elif norm == 'layer':
        direction.mul_(weights.norm().to(direction.device)/direction.norm())

    elif norm == 'weight':
        # Same result as the former weights.cpu() for a CPU direction, and
        # also works when the direction lives on another device.
        direction.mul_(weights.to(direction.device))

    elif norm == 'dfilter':
        for d in direction:
            d.div_(d.norm() + 1e-10)

    elif norm == 'dlayer':
        direction.div_(direction.norm())

    else:
        raise ValueError("unknown norm mode: " + repr(norm))
# Keep references to the network's parameter tensors and list their shapes,
# numbering the tensors from 1.
weight = get_weights(net)
temp_layer = 0
for temp_layer, x in enumerate(weight, start=1):
    print(f"{temp_layer!s} layer {x.shape!s}")

1 layer torch.Size([64, 3, 3, 3])
2 layer torch.Size([64, 64, 3, 3])
3 layer torch.Size([64, 64, 3, 3])
4 layer torch.Size([128, 64, 3, 3])
5 layer torch.Size([128, 128, 3, 3])
6 layer torch.Size([128, 128, 3, 3])
7 layer torch.Size([128, 128, 3, 3])
8 layer torch.Size([128, 128, 1, 1])
9 layer torch.Size([10, 128, 1, 1])


In [5]:
# prepare the data set - here CIFAR
# Per-channel mean/std are given on the 0-255 scale and divided down to 0-1.
normalize = transforms.Normalize(
    mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
    std=[x / 255.0 for x in [63.0, 62.1, 66.7]],
)
transform = transforms.Compose([transforms.ToTensor(), normalize])

# Both loaders iterate in a fixed order (shuffle=False) with three worker processes.
trainset = torchvision.datasets.CIFAR10(
    root='.', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=False, num_workers=3)
testset = torchvision.datasets.CIFAR10(
    root='.', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=3)

classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)
print(trainset, testset)

Files already downloaded and verified
Files already downloaded and verified
Dataset CIFAR10
    Number of datapoints: 50000
    Split: train
    Root Location: .
    Transforms (if any): Compose(
                             ToTensor()
                             Normalize(mean=[0.4913725490196078, 0.4823529411764706, 0.4466666666666667], std=[0.24705882352941178, 0.24352941176470588, 0.2615686274509804])
                         )
    Target Transforms (if any): None Dataset CIFAR10
    Number of datapoints: 10000
    Split: test
    Root Location: .
    Transforms (if any): Compose(
                             ToTensor()
                             Normalize(mean=[0.4913725490196078, 0.4823529411764706, 0.4466666666666667], std=[0.24705882352941178, 0.24352941176470588, 0.2615686274509804])
                         )
    Target Transforms (if any): None


# ==== Train the Network & Get Converged Weights ====

In [6]:
# train the network
num_epoch = 100
acurracy_list_train = []
loss_list_train = []
acurracy_list_test = []
loss_list_test = []

for epoch in range(num_epoch):
    # ---- optimisation pass over trainloader ----
    correct_train = 0
    total_loss_train = 0
    total_train = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):

        # Note: this rebinds the module-level batch_size to the current batch's size.
        batch_size = inputs.size(0)
        total_train += batch_size
        inputs = Variable(inputs)
        targets = Variable(targets)
        inputs, targets = inputs.cuda(), targets.cuda()

        optimizer.zero_grad()

        # Forward pass; the loss is weighted by batch size so the epoch figure is a per-sample mean.
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        total_loss_train += loss.item() * batch_size
        _, predicted = torch.max(outputs.data, 1)
        correct_train += predicted.eq(targets).sum().item()

        loss.backward()
        optimizer.step()

        # Progress line, overwritten in place by the trailing carriage return.
        sys.stdout.write(
            f"Epoch: {epoch!s} i : {batch_idx + 1!s}"
            f" and loss: {np.around(loss.item(), 3)!s}"
            f" acc: {predicted.eq(targets).sum().item() / batch_size!s}\r"
        )
        sys.stdout.flush()
    sys.stdout.write(
        f"\tEpoch: {epoch!s}"
        f"\tTrain loss: {np.around(total_loss_train / total_train, 3)!s}"
        f"\tTrain Acc: {100. * correct_train / total_train!s}\n"
    )
    loss_list_train.append(total_loss_train / total_train)
    acurracy_list_train.append(100. * correct_train / total_train)

    # ---- evaluation pass over testloader, without gradient tracking ----
    correct = 0
    total_loss = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            batch_size = inputs.size(0)
            total += batch_size
            inputs = Variable(inputs)
            targets = Variable(targets)
            inputs, targets = inputs.cuda(), targets.cuda()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item() * batch_size
            _, predicted = torch.max(outputs.data, 1)
            correct += predicted.eq(targets).sum().item()
            sys.stdout.write(
                f"Epoch: {epoch!s} i : {batch_idx + 1!s}"
                f" and loss: {np.around(loss.item(), 3)!s}"
                f" acc: {predicted.eq(targets).sum().item() / batch_size!s}\r"
            )
            sys.stdout.flush()
    sys.stdout.write(
        f"\tEpoch: {epoch!s}"
        f"\tTest loss: {np.around(total_loss / total, 3)!s}"
        f"\tTest Acc: {100. * correct / total!s}\n"
    )
    loss_list_test.append(total_loss / total)
    acurracy_list_test.append(100. * correct / total)
    sys.stdout.write("==========================\n")

	Epoch: 0	Train loss: 1.857	Train Acc: 30.87375
	Epoch: 0	Test loss: 1.61	Test Acc: 40.6893755
	Epoch: 1	Train loss: 1.423	Train Acc: 48.36425
	Epoch: 1	Test loss: 1.188	Test Acc: 57.485625
	Epoch: 2	Train loss: 1.125	Train Acc: 60.03625
	Epoch: 2	Test loss: 1.04	Test Acc: 63.2993755
	Epoch: 3	Train loss: 0.966	Train Acc: 66.20855
	Epoch: 3	Test loss: 0.933	Test Acc: 67.453755
	Epoch: 4	Train loss: 0.851	Train Acc: 70.43255
	Epoch: 4	Test loss: 0.885	Test Acc: 69.340625
	Epoch: 5	Train loss: 0.77	Train Acc: 73.324625
	Epoch: 5	Test loss: 0.828	Test Acc: 71.513125
Epoch: 6 i : 82 and loss: 0.967 acc: 0.6953125

KeyboardInterrupt: 

In [None]:
# get the converged weights and plot them save the accuracy
# Snapshot references to the trained parameter tensors.
converged_weights = get_weights(net)

# Left panel: accuracy curves. Right panel: loss curves.
# Fixed legend labels: the test-accuracy curve was labelled 'Test Loss' and
# the train-loss curve was labelled 'Train Acc'.
plt.figure(figsize=(15,5))
plt.subplot(121)
plt.plot(acurracy_list_train,label='Train Acc')
plt.plot(acurracy_list_test,label='Test Acc')
plt.legend()

plt.subplot(122)
plt.plot(loss_list_train,label='Train Loss')
plt.plot(loss_list_test,label='Test Loss')

plt.legend()
plt.show()

# Persist the per-epoch histories next to the notebook.
np.save('a_acurracy_list_train.npy', np.asarray(acurracy_list_train))
np.save('a_acurracy_list_test.npy', np.asarray(acurracy_list_test))
np.save('a_loss_list_train.npy', np.asarray(loss_list_train))
np.save('a_loss_list_test.npy', np.asarray(loss_list_test))

In [None]:
# create help functions
def viz_histogram_weights(converged_weights, direction1,direction2,title="None"):
    """Plot, for every tensor, overlaid histograms of the weights and of both directions.

    Args:
        converged_weights: sequence of weight tensors, one panel per entry.
        direction1, direction2: sequences indexed like ``converged_weights``.
        title: figure super-title.

    The number of subplot columns now follows ``len(converged_weights)``
    instead of being fixed at 9, so networks with a different number of
    parameter tensors no longer fail in ``plt.subplot``.
    """
    n_layers = len(converged_weights)
    plt.figure(figsize=(55,55//9))
    plt.suptitle(title, fontsize=20, y=1.15)
    for layer_index in range(n_layers):
        plt.subplot(1,n_layers,layer_index+1)
        plt.title("Layer : " + str(layer_index))
        # Tensors are copied to the CPU and flattened before binning into 50 bins.
        plt.hist(converged_weights[layer_index].cpu().numpy().ravel(),50,alpha=0.6,label='Weight')
        plt.hist(direction1[layer_index].cpu().numpy().ravel(),50,alpha=0.2,label='Direction 1')
        plt.hist(direction2[layer_index].cpu().numpy().ravel(),50,alpha=0.2,label='Direction 2')
        plt.yticks([])
        plt.legend()
    plt.show()
def create_viz(loss_list,acc_list,title="none"):
    """Draw contour plots and two interactive 3-D surfaces of a loss/accuracy grid.

    Args:
        loss_list: 2-D array of loss values on the module-level
            ``xcoord_mesh`` / ``ycoord_mesh`` grid.
        acc_list: 2-D array of accuracies on the same grid.
        title: path handed to ``plt.savefig`` for the contour figure.

    The log-scale plots all use ``np.log(loss_list + 1e-8)``. Previously only
    the contour had the epsilon, so a zero loss produced ``-inf`` in the
    log-scale surfaces.
    """
    log_loss = np.log(loss_list + 1e-8)

    # ---- static contour figure: plain, filled, and log-scale ----
    plt.figure(figsize=(18,6))
    plt.subplot(131)
    plt.title("Original Contour")
    CS = plt.contour(xcoord_mesh, ycoord_mesh, loss_list,  10, zorder=1, cmap='terrain', linestyles='--')
    plt.clabel(CS, inline=1, fontsize=8)

    plt.subplot(132)
    plt.title("Original Contour with Color")
    plt.contour(xcoord_mesh, ycoord_mesh, loss_list,  10, zorder=1, cmap='terrain', linestyles='--')
    # linestyles is dropped here: it only applies to line contours, not filled ones.
    CS = plt.contourf(xcoord_mesh, ycoord_mesh, loss_list, 10, zorder=1, cmap='terrain')
    plt.clabel(CS, fontsize=12,inline=0,fmt = '%2.1f')
    plt.colorbar(CS)

    plt.subplot(133)
    plt.title("Log Scale")
    CS = plt.contour(xcoord_mesh, ycoord_mesh, log_loss,10,zorder=1, cmap='terrain', linestyles='--')
    plt.clabel(CS, fontsize=8,inline=1)

    plt.savefig(title)
    plt.show()

    def show_surfaces(z_loss, figure_title):
        """Show the loss surface together with the accuracy rescaled into the loss range."""
        acc_span = acc_list.max()-acc_list.min()+1e-8
        acc_in_loss_range = (z_loss.max()-z_loss.min())*(acc_list-acc_list.min())/acc_span+z_loss.min()
        data = [
            go.Surface(
                x=xcoord_mesh,y=ycoord_mesh,z=acc_in_loss_range,
                showscale=False, opacity=0.6,colorscale='Cividis',
            ),
            go.Surface(
                x=xcoord_mesh,y=ycoord_mesh,z=z_loss,colorscale='Jet',opacity=0.9,
                contours=go.surface.Contours(z=go.surface.contours.Z(show=True,usecolormap=True,project=dict(z=True),),
                )
            )
        ]
        layout = go.Layout(title=figure_title,autosize=True,scene=dict(camera=dict(eye=dict(x=1.87, y=0.88, z=-0.64))),margin=dict(l=65,r=50,b=65,t=90))
        fig = go.Figure(data=data,layout=layout)
        iplot(fig)
        plt.show()

    # ---- interactive surfaces: linear loss, then log loss ----
    show_surfaces(loss_list, 'Loss / Accuracy')
    show_surfaces(log_loss, 'Log Scale Loss / Accuracy')

In [None]:
# create the coordinates
# Grid resolution and extent, shared by both axes.
numebr_of_points = 21
small_range = -1.0
large_range = 1.0

xcoordinates = np.linspace(small_range, large_range, num=numebr_of_points)
ycoordinates = np.linspace(small_range, large_range, num=numebr_of_points)
xcoord_mesh, ycoord_mesh = np.meshgrid(xcoordinates, ycoordinates)

# Flatten the mesh row by row into one (x, y) pair per grid point.
inds = np.arange(numebr_of_points ** 2)
s1 = xcoord_mesh.ravel()[inds]
s2 = ycoord_mesh.ravel()[inds]
coordinate = np.column_stack((s1, s2))
print('From ',small_range,' to ',large_range,' with ',numebr_of_points,' total number of coordinate: ', numebr_of_points**2)

In [None]:
# Serialise the whole module object (not only its state_dict) to disk.
# NOTE(review): the file name says "relu", but AllConvNet.forward applies F.elu -- confirm the name.
torch.save(net, 'relu.pth.tar')

# ==== Play Around with the Weights ====

In [None]:
# Independent copies of the trained tensors, used as the reference for scaling directions.
copy_of_the_weights = [tensor.clone() for tensor in converged_weights]
# Interactive TensorFlow session; later cells call .eval() on TF tensors.
sess = tf.InteractiveSession()

In [None]:
# Draw one random direction per weight tensor, then duplicate it so the two
# normalisation routes below start from identical values.
random_direction1 = get_random_weights(copy_of_the_weights)
random_direction2 = [direction.clone() for direction in random_direction1]

In [None]:
# Route 1: filter-normalise each direction tensor in place with the helper,
# one (direction, weight) tensor pair at a time.
for d,w in zip(random_direction1,copy_of_the_weights):
    normalize_direction(d,w,'filter')

In [None]:
# Route 2: the same filter normalisation written with tensor operations.
# The scaled result is kept twice: appended to `temp` and written back into
# the direction tensor through `.data`.
temp = []
for d,w in zip(random_direction2,copy_of_the_weights):
    # Flatten everything but the first axis and take one norm per row; the
    # [:,:,None,None] indexing turns the (rows, 1) result into (rows, 1, 1, 1).
    d_re   = d.view((d.shape[0],-1))
    d_norm = d_re.norm(dim=(1),keepdim=True)[:,:,None,None]
    
    w_re   = w.view((w.shape[0],-1))
    w_norm = w_re.norm(dim=(1),keepdim=True)[:,:,None,None]
    # 1e-10 guards the division against a zero norm.
    temp.append(d.cuda() * (w_norm.cuda()/(d_norm.cuda()+1e-10)))
    d.data      =  d.cuda() * (w_norm.cuda()/(d_norm.cuda()+1e-10))

In [None]:
# Sanity check: print, per tensor, whether the helper-normalised directions
# match the tensor-op results (first against random_direction2, then against temp).
for x, xx in zip(random_direction1,random_direction2):
    print(np.allclose(x.cpu().numpy(),xx.cpu().numpy()))
    
for x, xx in zip(random_direction1,temp):
    print(np.allclose(x.cpu().numpy(),xx.cpu().numpy()))

In [None]:
# Overlay histograms of the weights and both normalised directions, one panel per tensor.
viz_histogram_weights(copy_of_the_weights,random_direction1,random_direction2)

# ==== 1. Random - Filter Normalization ====

In [None]:
# Two independent random directions, each with one tensor per weight tensor.
random_direction1 = get_random_weights(copy_of_the_weights)
random_direction2 = get_random_weights(copy_of_the_weights)

for d1,d2,w in zip(random_direction1,random_direction2,copy_of_the_weights):
    
    # One norm per first-axis slice, reshaped to (rows, 1, 1, 1) for broadcasting.
    w_norm  = w.view((w.shape[0],-1))  .norm(dim=(1),keepdim=True)[:,:,None,None]
    d_norm1 = d1.view((d1.shape[0],-1)).norm(dim=(1),keepdim=True)[:,:,None,None]
    d_norm2 = d2.view((d2.shape[0],-1)).norm(dim=(1),keepdim=True)[:,:,None,None]
    
    # Rescale each slice of the direction to the norm of the matching weight
    # slice; the result is moved to the GPU and written back through .data.
    d1.data = d1.cuda() * (w_norm/(d_norm1.cuda()+1e-10))
    d2.data = d2.cuda() * (w_norm/(d_norm2.cuda()+1e-10))

In [None]:
# Overlay histograms of the weights and the two filter-normalised random directions.
viz_histogram_weights(copy_of_the_weights,random_direction1,random_direction2)

In [None]:
# start the evaluation
# One loss value and one accuracy value per grid point.
loss_list = np.zeros((numebr_of_points, numebr_of_points))
acc_list = np.zeros((numebr_of_points, numebr_of_points))
col_value = 0

for count, ind in enumerate(inds):

    # Set the parameters to: reference weights + x * direction1 + y * direction2.
    coord = coordinate[count]
    changes = [
        d0.cuda()*coord[0] + d1.cuda()*coord[1]
        for (d0, d1) in zip(random_direction1, random_direction2)
    ]
    for (p, w, d) in zip(net.parameters(), weight, changes):
        p.data = w + d

    # Score the perturbed network on the first 11 batches of trainloader only.
    correct = 0
    total_loss = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            batch_size = inputs.size(0)
            total += batch_size

            inputs, targets = Variable(inputs).cuda(), Variable(targets).cuda()

            outputs = net(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item() * batch_size
            _, predicted = torch.max(outputs.data, 1)
            correct += predicted.eq(targets).sum().item()
            sys.stdout.write(
                f"Coord: {coord!s}"
                f"\tAcc: {predicted.eq(targets).sum().item()!s}"
                f"\tLoss: {np.around(loss.item(), 3)!s}\r"
            )
            sys.stdout.flush()

            if batch_idx == 10:
                break

        # A summary line is written for every second grid point.
        if count % 2 == 0:
            sys.stdout.write(
                f"count: {count!s}\tCoord: {coord!s}"
                f"\t\tAcc: {100. * correct / total!s}"
                f"\tLoss: {np.around(total_loss / total, 3)!s}\n"
            )

    # Store the result; col_value moves to the next row after each full row of points.
    loss_list[col_value][ind % numebr_of_points] = total_loss / total
    acc_list[col_value][ind % numebr_of_points] = 100. * correct / total
    ind_compare = ind + 1
    if ind_compare % numebr_of_points == 0:
        col_value += 1

In [None]:
# Plot the contour figure and 3-D surfaces for this sweep (saved under the default title "none").
create_viz(loss_list,acc_list)

# ==== 2. Random - Orthogonal - whole norm ====

In [None]:
# np.save('a_loss_list_1.npy', np.asarray(loss_list))
# np.save('a_acc_list_1.npy', np.asarray(acc_list))
# Two independent random directions, each with one tensor per weight tensor.
random_direction1 = get_random_weights(copy_of_the_weights)
random_direction2 = get_random_weights(copy_of_the_weights)

for d1,d2,w in zip(random_direction1,random_direction2,copy_of_the_weights):
    
    # 1-D parameters get a zero direction, i.e. they are left unperturbed.
    if w.dim() == 1:
        d1.data = torch.zeros_like(w)
        d2.data = torch.zeros_like(w)
        
    # Tensors whose first dimension is 10 (for the network above, the class_conv weight).
    elif w.shape[0] == 10:
        # Direction 1: Q factor of the tensor as laid out.
        # presumably tf.qr factorises the two trailing axes -- TODO confirm.
        d11,_ = tf.qr(d1.cpu().numpy())
        d11   = d11.eval()
        # Direction 2: axes permuted to (2,3,1,0) before the QR, then permuted
        # back to the original axis order.
        d22,_ = tf.qr(np.transpose(d2.cpu().numpy(),(2,3,1,0)))
        d22   = np.transpose(d22.eval(),(3,2,0,1))
        

        d1.data   = torch.from_numpy(d11).cuda()
        d2.data   = torch.from_numpy(d22).cuda()     
        
        # Rescale each first-axis slice to the norm of the matching weight slice.
        w_norm  = w.view((w.shape[0],-1))  .norm(dim=(1),keepdim=True)[:,:,None,None]
        d_norm1 = d1.view((d1.shape[0],-1)).norm(dim=(1),keepdim=True)[:,:,None,None]
        d_norm2 = d2.view((d2.shape[0],-1)).norm(dim=(1),keepdim=True)[:,:,None,None]

        d1.data = d1.cuda() * (w_norm/(d_norm1.cuda()+1e-10))
        d2.data = d2.cuda() * (w_norm/(d_norm2.cuda()+1e-10))
        
    else:
        # Same recipe as above, but direction 2 is permuted with (2,3,0,1),
        # a permutation that is its own inverse.
        d11,_ = tf.qr(d1.cpu().numpy())
        d11   = d11.eval()
        d22,_ = tf.qr(np.transpose(d2.cpu().numpy(),(2,3,0,1)))
        d22   = np.transpose(d22.eval(),(2,3,0,1))
        print(d11.shape,d22.shape)

        d1.data   = torch.from_numpy(d11).cuda()
        d2.data   = torch.from_numpy(d22).cuda()     
        
        # Rescale each first-axis slice to the norm of the matching weight slice.
        w_norm  = w.view((w.shape[0],-1))  .norm(dim=(1),keepdim=True)[:,:,None,None]
        d_norm1 = d1.view((d1.shape[0],-1)).norm(dim=(1),keepdim=True)[:,:,None,None]
        d_norm2 = d2.view((d2.shape[0],-1)).norm(dim=(1),keepdim=True)[:,:,None,None]

        d1.data = d1.cuda() * (w_norm/(d_norm1.cuda()+1e-10))
        d2.data = d2.cuda() * (w_norm/(d_norm2.cuda()+1e-10))

In [None]:
# Overlay histograms of the weights and the two QR-derived directions.
viz_histogram_weights(copy_of_the_weights,random_direction1,random_direction2)

In [None]:
# start the evaluation
# One loss value and one accuracy value per grid point.
loss_list = np.zeros((numebr_of_points, numebr_of_points))
acc_list = np.zeros((numebr_of_points, numebr_of_points))
col_value = 0

for count, ind in enumerate(inds):

    # Set the parameters to: reference weights + x * direction1 + y * direction2.
    coord = coordinate[count]
    changes = [
        d0.cuda()*coord[0] + d1.cuda()*coord[1]
        for (d0, d1) in zip(random_direction1, random_direction2)
    ]
    for (p, w, d) in zip(net.parameters(), weight, changes):
        p.data = w + d

    # Score the perturbed network on the first 11 batches of trainloader only.
    correct = 0
    total_loss = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            batch_size = inputs.size(0)
            total += batch_size

            inputs, targets = Variable(inputs).cuda(), Variable(targets).cuda()

            outputs = net(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item() * batch_size
            _, predicted = torch.max(outputs.data, 1)
            correct += predicted.eq(targets).sum().item()
            sys.stdout.write(
                f"Coord: {coord!s}"
                f"\tAcc: {predicted.eq(targets).sum().item()!s}"
                f"\tLoss: {np.around(loss.item(), 3)!s}\r"
            )
            sys.stdout.flush()

            if batch_idx == 10:
                break

        # A summary line is written for every second grid point.
        if count % 2 == 0:
            sys.stdout.write(
                f"count: {count!s}\tCoord: {coord!s}"
                f"\t\tAcc: {100. * correct / total!s}"
                f"\tLoss: {np.around(total_loss / total, 3)!s}\n"
            )

    # Store the result; col_value moves to the next row after each full row of points.
    loss_list[col_value][ind % numebr_of_points] = total_loss / total
    acc_list[col_value][ind % numebr_of_points] = 100. * correct / total
    ind_compare = ind + 1
    if ind_compare % numebr_of_points == 0:
        col_value += 1

In [None]:
# Plot the contour figure and 3-D surfaces for this sweep (saved under the default title "none").
create_viz(loss_list,acc_list)

# ==== 3. Weight - Orthogonal - whole norm ====

In [None]:
# np.save('a_loss_list_2.npy', np.asarray(loss_list))
# np.save('a_acc_list_2.npy', np.asarray(acc_list))
# Build two directions from the trained weights themselves: each weight
# tensor is centred, whitened with U diag(1/sqrt(s)) U^T taken from the SVD
# of its covariance-like matrix, and rescaled to the per-filter weight norms.
random_direction1 = []
random_direction2 = []

for w in copy_of_the_weights:

    # 1-D parameters get a zero direction, i.e. they are left unperturbed.
    if w.dim() == 1:
        random_direction1.append(torch.zeros_like(w))
        random_direction2.append(torch.zeros_like(w))

    else:
        random_vector = w.clone().cpu().numpy()

        # Direction 1 is centred over axes (2,3), direction 2 over axes (0,1);
        # direction 2 is then permuted so its former axes (0,1) come last.
        random_vector1 = random_vector - random_vector.mean((2,3),keepdims=True)
        random_vector2 = random_vector - random_vector.mean((0,1),keepdims=True)
        random_vector2 = np.transpose(random_vector2,(2,3,0,1))

        # X^T X over the two trailing axes, divided by the size of the last axis.
        sigma1 = tf.matmul(tf.transpose(random_vector1,(0,1,3,2)),random_vector1) / random_vector1.shape[3]
        sigma2 = tf.matmul(tf.transpose(random_vector2,(0,1,3,2)),random_vector2) / random_vector2.shape[3]

        s1,u1,v1 = tf.linalg.svd(sigma1,False)
        s2,u2,v2 = tf.linalg.svd(sigma2,False)

        # Fixed: the reciprocal is taken on the singular values BEFORE the
        # diagonal matrix is built. The former 1/(sqrt(diag(s))+1e-5) turned
        # every off-diagonal zero into 1e5.
        tmp1 = tf.matmul(u1,tf.linalg.diag(1/(tf.sqrt(s1)+1e-5)))
        tmp1 = tmp1 @ tf.transpose(u1,(0,1,3,2))

        tmp2 = tf.matmul(u2,tf.linalg.diag(1/(tf.sqrt(s2)+1e-5)))
        tmp2 = tmp2 @ tf.transpose(u2,(0,1,3,2))

        random_vector1 = random_vector1 @ tf.transpose(tmp1,(0,1,3,2))
        random_vector2 = random_vector2 @ tf.transpose(tmp2,(0,1,3,2))
        # Undo the axis permutation applied to direction 2.
        random_vector2 = tf.transpose(random_vector2,(2,3,0,1))

        random_vector1 = torch.from_numpy(random_vector1.eval()).cuda()
        random_vector2 = torch.from_numpy(random_vector2.eval()).cuda()

        # Rescale each first-axis slice to the norm of the matching weight slice.
        w_norm  = w.view((w.shape[0],-1))  .norm(dim=(1),keepdim=True)[:,:,None,None]
        d_norm1 = random_vector1.view((random_vector1.shape[0],-1)).norm(dim=(1),keepdim=True)[:,:,None,None]
        d_norm2 = random_vector2.view((random_vector2.shape[0],-1)).norm(dim=(1),keepdim=True)[:,:,None,None]

        random_vector1 = random_vector1 * (w_norm/(d_norm1.cuda()+1e-10))
        random_vector2 = random_vector2 * (w_norm/(d_norm2.cuda()+1e-10))

        print(random_vector1.shape)
        print(random_vector2.shape)

        random_direction1.append(random_vector1)
        random_direction2.append(random_vector2)

In [None]:
# Overlay histograms of the weights and the two whitened-weight directions.
viz_histogram_weights(copy_of_the_weights,random_direction1,random_direction2)

In [None]:
# start the evaluation
# One loss value and one accuracy value per grid point.
loss_list = np.zeros((numebr_of_points, numebr_of_points))
acc_list = np.zeros((numebr_of_points, numebr_of_points))
col_value = 0

for count, ind in enumerate(inds):

    # Set the parameters to: reference weights + x * direction1 + y * direction2.
    coord = coordinate[count]
    changes = [
        d0.cuda()*coord[0] + d1.cuda()*coord[1]
        for (d0, d1) in zip(random_direction1, random_direction2)
    ]
    for (p, w, d) in zip(net.parameters(), weight, changes):
        p.data = w + d

    # Score the perturbed network on the first 11 batches of trainloader only.
    correct = 0
    total_loss = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            batch_size = inputs.size(0)
            total += batch_size

            inputs, targets = Variable(inputs).cuda(), Variable(targets).cuda()

            outputs = net(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item() * batch_size
            _, predicted = torch.max(outputs.data, 1)
            correct += predicted.eq(targets).sum().item()
            sys.stdout.write(
                f"Coord: {coord!s}"
                f"\tAcc: {predicted.eq(targets).sum().item()!s}"
                f"\tLoss: {np.around(loss.item(), 3)!s}\r"
            )
            sys.stdout.flush()

            if batch_idx == 10:
                break

        # A summary line is written for every second grid point.
        if count % 2 == 0:
            sys.stdout.write(
                f"count: {count!s}\tCoord: {coord!s}"
                f"\t\tAcc: {100. * correct / total!s}"
                f"\tLoss: {np.around(total_loss / total, 3)!s}\n"
            )

    # Store the result; col_value moves to the next row after each full row of points.
    loss_list[col_value][ind % numebr_of_points] = total_loss / total
    acc_list[col_value][ind % numebr_of_points] = 100. * correct / total
    ind_compare = ind + 1
    if ind_compare % numebr_of_points == 0:
        col_value += 1

In [None]:
# Plot the contour figure and 3-D surfaces for this sweep (saved under the default title "none").
create_viz(loss_list,acc_list)

# ==== 4. Weight - PCA - whole norm ====

In [None]:
# np.save('a_loss_list_3.npy', np.asarray(loss_list))
# np.save('a_acc_list_3.npy', np.asarray(acc_list))
# Build two directions from the trained weights: each centred weight tensor is
# replaced by a reconstruction that keeps only the first singular component.
random_direction1 = []
random_direction2 = []

for w in copy_of_the_weights:
    
    # 1-D parameters get a zero direction, i.e. they are left unperturbed.
    if w.dim() == 1: 
        random_direction1.append(torch.zeros_like(w))
        random_direction2.append(torch.zeros_like(w))
        
    else:
        random_vector = w.clone().cpu().numpy()
        
        # Direction 1 is centred over axes (2,3), direction 2 over axes (0,1);
        # direction 2 is then permuted so its former axes (0,1) come last.
        random_vector1 = random_vector - random_vector.mean((2,3),keepdims=True)
        random_vector2 = random_vector - random_vector.mean((0,1),keepdims=True)
        random_vector2 = np.transpose(random_vector2,(2,3,0,1))
        
        # presumably the SVD is taken over the two trailing axes -- TODO confirm.
        s1,u1,v1 = tf.linalg.svd(random_vector1,False)
        s2,u2,v2 = tf.linalg.svd(random_vector2,False)
        
        # Keep only the first column of diag(s) and the first row of V^T,
        # i.e. the product u * s * v^T restricted to the first component.
        random_vector1 = u1 @ tf.linalg.diag(s1)[:,:,:,:1] @ tf.transpose(v1,(0,1,3,2))[:,:,:1,:]
        random_vector2 = u2 @ tf.linalg.diag(s2)[:,:,:,:1] @ tf.transpose(v2,(0,1,3,2))[:,:,:1,:]
        # Undo the axis permutation applied to direction 2.
        random_vector2 = tf.transpose(random_vector2,(2,3,0,1))
        
        random_vector1 = torch.from_numpy(random_vector1.eval()).cuda()
        random_vector2 = torch.from_numpy(random_vector2.eval()).cuda()

        # Rescale each first-axis slice to the norm of the matching weight slice.
        w_norm  = w.view((w.shape[0],-1))  .norm(dim=(1),keepdim=True)[:,:,None,None]
        d_norm1 = random_vector1.view((random_vector1.shape[0],-1)).norm(dim=(1),keepdim=True)[:,:,None,None]
        d_norm2 = random_vector2.view((random_vector2.shape[0],-1)).norm(dim=(1),keepdim=True)[:,:,None,None]

        random_vector1 = random_vector1 * (w_norm/(d_norm1.cuda()+1e-10))
        random_vector2 = random_vector2 * (w_norm/(d_norm2.cuda()+1e-10))  
        
        print(random_vector1.shape)
        print(random_vector2.shape)

        random_direction1.append(random_vector1)
        random_direction2.append(random_vector2)

In [None]:
# Overlay histograms of the weights and the two first-component (PCA-style) directions.
viz_histogram_weights(copy_of_the_weights,random_direction1,random_direction2)

In [None]:
# start the evaluation
# One loss value and one accuracy value per grid point.
loss_list = np.zeros((numebr_of_points, numebr_of_points))
acc_list = np.zeros((numebr_of_points, numebr_of_points))
col_value = 0

for count, ind in enumerate(inds):

    # Set the parameters to: reference weights + x * direction1 + y * direction2.
    coord = coordinate[count]
    changes = [
        d0.cuda()*coord[0] + d1.cuda()*coord[1]
        for (d0, d1) in zip(random_direction1, random_direction2)
    ]
    for (p, w, d) in zip(net.parameters(), weight, changes):
        p.data = w + d

    # Score the perturbed network on the first 11 batches of trainloader only.
    correct = 0
    total_loss = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            batch_size = inputs.size(0)
            total += batch_size

            inputs, targets = Variable(inputs).cuda(), Variable(targets).cuda()

            outputs = net(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item() * batch_size
            _, predicted = torch.max(outputs.data, 1)
            correct += predicted.eq(targets).sum().item()
            sys.stdout.write(
                f"Coord: {coord!s}"
                f"\tAcc: {predicted.eq(targets).sum().item()!s}"
                f"\tLoss: {np.around(loss.item(), 3)!s}\r"
            )
            sys.stdout.flush()

            if batch_idx == 10:
                break

        # A summary line is written for every second grid point.
        if count % 2 == 0:
            sys.stdout.write(
                f"count: {count!s}\tCoord: {coord!s}"
                f"\t\tAcc: {100. * correct / total!s}"
                f"\tLoss: {np.around(total_loss / total, 3)!s}\n"
            )

    # Store the result; col_value moves to the next row after each full row of points.
    loss_list[col_value][ind % numebr_of_points] = total_loss / total
    acc_list[col_value][ind % numebr_of_points] = 100. * correct / total
    ind_compare = ind + 1
    if ind_compare % numebr_of_points == 0:
        col_value += 1

In [None]:
# Plot the contour figure and 3-D surfaces for this sweep (saved under the default title "none").
create_viz(loss_list,acc_list)

In [None]:
# Persist the loss and accuracy grids of the last sweep as .npy files.
np.save('a_loss_list_4.npy', np.asarray(loss_list))
np.save('a_acc_list_4.npy', np.asarray(acc_list))