In [1]:
import torch as t 
import torchvision
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pylab as plt
from torch.nn.utils import clip_grad_value_
%matplotlib inline
import pickle
import json
from torchvision import datasets, transforms
import tqdm
import os


In [2]:
device = 'cuda' # cuda or cpu
device = t.device(device)
if device == 'cuda':
    t.backends.cudnn.deterministic = True
    t.backends.cudnn.benchmark = False

In [3]:
batch_size = 256
init_log_sigma = -3.0 # логарифм дисперсии вариационного распределения при инициализации
prior_sigma = 0.1 # априорная дисперсия
epoch_num = 25 #количество эпох
lamb = [0.1, 1,  10, 100]
hidden_num = 100 #количество нейронов на скрытом слое
acc_delete = [] 
start_num = 5


lam_hidden_num = 50
lambda_sample_num = 5
path_to_save = 'saved'


In [4]:
if not os.path.exists(path_to_save):
    os.mkdir(path_to_save)

In [5]:
# загрузка данных
train_data = torchvision.datasets.MNIST('./files/', train=True, download=True,
                             transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                                  torchvision.transforms.Lambda(lambda x: x.view(-1))
                              ]))

test_data = torchvision.datasets.MNIST('./files/', train=False, download=True,
                             transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                                  torchvision.transforms.Lambda(lambda x: x.view(-1))
                              ]))


train_loader = t.utils.data.DataLoader(train_data, batch_size=batch_size, pin_memory=True )
test_loader = t.utils.data.DataLoader(test_data, batch_size=batch_size)


In [6]:
class VarLayer(nn.Module): # вариационная однослойная сеть
    def __init__(self, in_,  out_,   act=F.relu):         
        nn.Module.__init__(self)                    
        self.mean = nn.Parameter(t.randn(in_, out_, device=device)) # параметры средних
        t.nn.init.xavier_uniform(self.mean) 
        self.log_sigma = nn.Parameter(t.ones(in_, out_, device = device)*init_log_sigma) # логарифм дисперсии
        self.mean_b = nn.Parameter(t.randn(out_, device=device)) # то же самое для свободного коэффициента
        self.log_sigma_b = nn.Parameter(t.ones(out_, device=device) * init_log_sigma)
                
        self.in_ = in_
        self.out_ = out_
        self.act = act
        
    def forward(self,x):
        if self.training: # во время обучения - сэмплируем из нормального распределения
            W = self.mean
            mu = x.matmul(W)
            log_alpha = self.log_sigma
            eps = 1e-8
            si = t.sqrt((x * x).matmul(((t.exp(2*log_alpha+eps)))))
            activation = mu + t.normal(t.zeros_like(mu), t.ones_like(mu)) * si + \
                t.exp(2*self.log_sigma_b) * t.normal(t.zeros_like(mu), t.ones_like(mu))   
        
            return self.act(activation + self.mean_b)
             
        else:  # во время контроля - смотрим средние значения параметра        
            w = self.mean 
            b = self.mean_b
            # функция активации 
            
            return self.act(t.matmul(x, w)+b)

    def KLD(self):        
        # подсчет дивергенции
        size = self.in_, self.out_
        out = self.out_
        self.eps_w = t.distributions.Normal(self.mean, t.exp(self.log_sigma))
        self.eps_b = t.distributions.Normal(self.mean_b,  t.exp(self.log_sigma_b))
        self.h_w = t.distributions.Normal(t.zeros(size, device=device), t.ones(size, device=device)*prior_sigma)
        self.h_b = t.distributions.Normal(t.zeros(out, device=device), t.ones(out, device=device)*prior_sigma)                
        k1 = t.distributions.kl_divergence(self.eps_w,self.h_w).sum()        
        k2 = t.distributions.kl_divergence(self.eps_b,self.h_b).sum()        
        return k1+k2
    

In [22]:
class LowRankNet(nn.Module):
    def __init__(self, size, hidden, gain_const = 1.0, gain_lamb = 1.0, gain_lowrank = .0001,  act= F.relu): 
        nn.Module.__init__(self)        
        self.w = nn.Linear(1, hidden).to(device)
        t.nn.init.xavier_uniform(self.w.weight, gain_lamb)
        # проверка на вектор или матрица
        if isinstance(size, tuple) and len(size) == 2: # если сайз неизменяемый список и его длина 2
            self.in_, self.out_ = size
            self.diagonal = False
        else:
            self.out_ = size
            self.diagonal = True
            
        
        self.one = t.ones(1,device=device) # для упрощения работы с лямбдой. Костыль, можно сделать проще
        self.act = act
        
        if self.diagonal:
            self.w_d = nn.Linear(hidden, self.out_).to(device)
            t.nn.init.xavier_uniform_(self.w_d.weight, gain_lowrank)
            # независимая от параметра lambda часть
            self.const = nn.Parameter(t.randn(self.out_, device=device)) 

            
        else:
            self.w_a1 = nn.Linear(hidden, self.in_).to(device)
            t.nn.init.xavier_uniform_(self.w_a1.weight, gain_lowrank)
            
            self.w_a2 = nn.Linear(hidden, self.out_).to(device)
            t.nn.init.xavier_uniform_(self.w_a2.weight, gain_lowrank)
            
            self.const = nn.Parameter(t.randn(self.in_, self.out_, device=device)) 
            t.nn.init.xavier_uniform_(self.const,  gain_const)
            
        
            
            
    def forward(self, lam):
        h = self.act(self.w(self.one * lam))        
        if self.diagonal:
            return self.const + self.w_d(h)
        else:
            a1 = self.w_a1(h)
            a2 = self.w_a2(h)
         
            return self.const +  t.matmul(a1.view(-1, 1), a2.view(1, -1)) 

        
class LinearApprNet(nn.Module):
    def __init__(self, size,  gain_const = 1.0, gain_const2 = 1.0,  act= lambda x: x):    
        nn.Module.__init__(self)        
        if isinstance(size, tuple) and len(size) == 2:
            self.in_, self.out_ = size
            self.diagonal = False
        else:
            self.out_ = size
            self.diagonal = True
            
        
        self.one = t.ones(1, device=device) # для упрощения работы с лямбдой. Костыль, можно сделать проще
        self.act = act
        
        if self.diagonal:
            # независимая от параметра lambda часть
            self.const = nn.Parameter(t.randn(self.out_, device=device)) 
            self.const2 = nn.Parameter(t.ones(self.out_, device=device) * gain_const2) 
            
            
        else:
            self.const = nn.Parameter(t.randn(self.in_, self.out_, device=device)) 
            t.nn.init.xavier_uniform_(self.const,  gain_const)
            self.const2 = nn.Parameter(t.randn(self.in_, self.out_, device=device)) 
            t.nn.init.xavier_uniform_(self.const2,  gain_const2)
            
            
    def forward(self, lam):        
        if self.diagonal:
            return self.const + self.const2 * lam
        else:
            return self.const + self.const2 * lam 

        
class VarLayerLowRank(nn.Module): # вариационная однослойная сеть
    def __init__(self, in_,  out_,   act=F.relu):         
        nn.Module.__init__(self)                    
        self.mean = LowRankNet((in_, out_), lam_hidden_num) # параметры средних            
        self.log_sigma = LowRankNet((in_, out_), lam_hidden_num) # логарифм дисперсии
        self.mean_b = LowRankNet( out_, lam_hidden_num) # то же самое для свободного коэффициента
        self.log_sigma_b = LowRankNet( out_, lam_hidden_num)
     
        self.log_sigma.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma.const.data+= init_log_sigma
     
        self.log_sigma_b.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma_b.const.data+= init_log_sigma
        
        
                
        self.in_ = in_
        self.out_ = out_
        self.act = act
        
    def forward(self,x, l):
        if self.training: # во время обучения - сэмплируем из нормального распределения
            W = self.mean(l)
            mu = x.matmul(W)
            log_alpha = self.log_sigma(l)
            eps = 1e-6
            if (t.isinf(2*log_alpha +eps)).any():
                print('inf')
            else:
                print(t.min(log_alpha))
            si = t.sqrt((x * x).matmul(((t.exp(2*log_alpha+eps)))))
            activation = mu + t.normal(t.zeros_like(mu), t.ones_like(mu)) * si + \
                t.exp(2*self.log_sigma_b(l)) * t.normal(t.zeros_like(mu), t.ones_like(mu))   
        
            return self.act(activation + self.mean_b(l))

             
        else:  # во время контроля - смотрим средние значения параметра        
            w = self.mean(l) 
            b = self.mean_b(l)
            
            # функция активации 
            return self.act(t.matmul(x, w)+b)
            
            

    def KLD(self, l):        
        # подсчет дивергенции
        size = self.in_, self.out_
        out = self.out_
        self.eps_w = t.distributions.Normal(self.mean(l), t.exp(self.log_sigma(l)))
        self.eps_b = t.distributions.Normal(self.mean_b(l),  t.exp(self.log_sigma_b(l)))
        self.h_w = t.distributions.Normal(t.zeros(size, device=device), t.ones(size, device=device)*prior_sigma)
        self.h_b = t.distributions.Normal(t.zeros(out, device=device), t.ones(out, device=device)*prior_sigma)                
        k1 = t.distributions.kl_divergence(self.eps_w,self.h_w).sum()        
        k2 = t.distributions.kl_divergence(self.eps_b,self.h_b).sum()        
        return k1+k2
    

class VarLayerLinearAppr(nn.Module): # вариационная однослойная сеть
    def __init__(self, in_,  out_,   act=F.relu):         
        nn.Module.__init__(self)                    
        self.mean = LinearApprNet((in_, out_)) # параметры средних            
        self.log_sigma = LinearApprNet((in_, out_)) # логарифм дисперсии
        self.mean_b = LinearApprNet( out_) # то же самое для свободного коэффициента
        self.log_sigma_b = LinearApprNet( out_)
     
        self.log_sigma.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma.const.data+= init_log_sigma
     
        self.log_sigma_b.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma_b.const.data+= init_log_sigma
        
        
                
        self.in_ = in_
        self.out_ = out_
        self.act = act
        
    def forward(self, x, l):
        
        if self.training: # во время обучения - сэмплируем из нормального распределения
            W = self.mean(l)
            mu = x.matmul(W)
            log_alpha = self.log_sigma(l)
            eps = 1e-8
            si = t.sqrt((x * x).matmul(((t.exp(2*log_alpha+eps)))))
            activation = mu + t.normal(t.zeros_like(mu), t.ones_like(mu)) * si + \
                t.exp(2*self.log_sigma_b(l)) * t.normal(t.zeros_like(mu), t.ones_like(mu))   
        
            return self.act(activation + self.mean_b(l))

             
        else:  # во время контроля - смотрим средние значения параметра        
            w = self.mean(l) 
            b = self.mean_b(l)
            
            # функция активации 
            return self.act(t.matmul(x, w)+b)
    
    def KLD(self, l):        
        # подсчет дивергенции
        size = self.in_, self.out_
        out = self.out_
        self.eps_w = t.distributions.Normal(self.mean(l), t.exp(self.log_sigma(l)))
        self.eps_b = t.distributions.Normal(self.mean_b(l),  t.exp(self.log_sigma_b(l)))
        self.h_w = t.distributions.Normal(t.zeros(size, device=device), t.ones(size, device=device)*prior_sigma)
        self.h_b = t.distributions.Normal(t.zeros(out, device=device), t.ones(out, device=device)*prior_sigma)                
        k1 = t.distributions.kl_divergence(self.eps_w,self.h_w).sum()        
        k2 = t.distributions.kl_divergence(self.eps_b,self.h_b).sum()        
        return k1+k2    

In [23]:
class VarSeqNet(nn.Sequential):    
    # класс-обертка на случай, если у нас многослойная нейронная сеть
    def KLD(self, lam = None):
        k = 0
        for l in self: 
            if lam is None:
                k+=l.KLD()
            else:
                k+=l.KLD(lam)
                
        return k
    
    def forward(self, x, lam = None):
        if lam is None:
            for l in self:
                x = l(x)
            return x
        else:
            for l in self:
                x = l(x, lam)
            return x
    


In [24]:
def train_batches(net, loss_fn, optimizer, lam, label):
    tq = tqdm.tqdm(train_loader)
    losses = []
    for x,y in tq:            
        x = x.to(device)
        y = y.to(device)          
        optimizer.zero_grad()  
        loss = 0
        if lam is None:
            for _ in range(lambda_sample_num):  
                p = t.rand(1).to(device)*3 -1
                lam_param = 10**p[0]
                
                #t.rand(1).to(device)[0]*100.0                  
                out = net(x, t.log(lam_param))
                loss = loss + loss_fn(out, y)/lambda_sample_num
                loss += net.KLD(lam_param)*(lam_param)/len(train_data)/lambda_sample_num
                losses+=[loss.cpu().detach().numpy()]
            # правдоподобие должно суммироваться по всей обучающей выборке
            # в случае батчей - она приводится к тому же порядку 
        else:
            out = net(x)
            loss = loss + loss_fn(out, y)
            loss += net.KLD()*lam/len(train_data)
            losses+=[loss.cpu().detach().numpy()]
        tq.set_description(label+str(np.mean(losses)))
        loss.backward()       
        clip_grad_value_(net.parameters(), 1.0) # для стабильности градиента. С этим можно играться
        optimizer.step()

In [14]:
t.manual_seed(0)
for lam in lamb:
    for start in range(start_num):                    
            net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
            net = net.to(device)
            optim = t.optim.Adam(net.parameters(), lr=5e-4)
            loss_fn = nn.CrossEntropyLoss().to(device)            
            for e in range(epoch_num):
                label = 'lambda {}, epoch {}: '.format(lam, e)                
                train_batches(net, loss_fn, optim, lam, label)
            t.save(net.state_dict(), os.path.join(path_to_save, 'var_net_lam_{}_start_{}.cpk'.format(lam, start)))

  """
lambda 0.1, epoch 0: 2.024342:  22%|██▏       | 51/235 [00:03<00:11, 16.35it/s] 


KeyboardInterrupt: 

In [27]:
for mode in ['linear']:
    t.manual_seed(0)
    for start in range(start_num): 
        if mode == 'lowrank':
            net = VarSeqNet(VarLayerLowRank(784,  hidden_num), VarLayerLowRank(hidden_num, 10, act=lambda x:x))
        else:
            net = VarSeqNet(VarLayerLinearAppr(784,  hidden_num), VarLayerLinearAppr(hidden_num, 10, act=lambda x:x))
            
        net = net.to(device)
        optim = t.optim.Adam(net.parameters(), lr=5e-8)
        loss_fn = nn.CrossEntropyLoss().to(device)            
        for e in range(epoch_num):
            label = '{}, epoch {}: '.format(mode, e)                
            train_batches(net, loss_fn, optim, None, label)
        t.save(net.state_dict(), os.path.join(path_to_save, '{}_start_{}.cpk'.format(mode, start)))

linear, epoch 0: nan:  11%|█         | 25/235 [00:02<00:19, 10.60it/s]


KeyboardInterrupt: 

In [10]:
def test_acc(net): # точность классификации
    acc = []    
    correct = 0
    net.eval()
    for x,y in test_loader: 
        x = x.to(device)
        y = y.to(device)  
        out = net(x)    
        correct += out.argmax(1).eq(y).sum().cpu().numpy()
    acc = (correct / len(test_data))

    return acc


# будем удалять по 10% от модели и смотреть качество
def delete_10(net):
    acc_delete = []
    mu = net[0].mean
    sigma = t.exp(2*net[0].log_sigma)
    prune_coef = (mu**2/sigma).cpu().detach().numpy()    
    sorted_coefs = np.sort(prune_coef.flatten())
    mu2 = net[1].mean
    sigma2 = t.exp(2*net[1].log_sigma)
    prune_coef2 = (mu2**2/sigma2).cpu().detach().numpy()    
    sorted_coefs2 = np.sort(prune_coef2.flatten())
    
    
    for j in range(10):
        
        ids = (prune_coef <= sorted_coefs[round(j/10*len(sorted_coefs))]) 
        net[0].mean.data*=(1-t.tensor(ids*1.0, device=device, dtype=t.float))
        
        ids2 = (prune_coef2 <= sorted_coefs2[round(j/10*len(sorted_coefs2))]) 
        net[1].mean.data*=(1-t.tensor(ids2*1.0, device=device, dtype=t.float))
        
        print ('nonzero params: ', (abs(net[0].mean)>0).float().mean())
        acc_delete.append(test_acc(net))
    return acc_delete    


In [11]:
net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
net = net.to(device)

In [12]:
lam_results = {}
print(type(lam_results))
for lam in lamb:
    lam_results[lam] = []
    for s in range(start_num):
        print (lam, s)
        net.load_state_dict(t.load(os.path.join('saved/', 'var_net_lam_{}_start_{}.cpk'.format(lam, s))))
        lam_results[lam].append(delete_10(net))


In [13]:
with open('results_var.json','w') as out:
    out.write(json.dumps(lam_results))

In [14]:
lam_results = {}
with open('results_var.json','r') as out:
    lam_results = json.loads(out.read())

In [15]:
proc = [0,10,20,30,40,50,60,70,80,90]
ls = {
    '0.1':'-',
    '1':'--',
    '10':':',
    '100':'-.'
}
lamb = [0.1, 1,  10, 100]
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams.update({'font.size': 27})
plt.rc('lines', linewidth=4)
    
for lam in lamb:
    lam = str(lam)
    print(lam)
    plt.fill_between(proc, np.min(lam_results[lam], 0), np.max(lam_results[lam], 0), alpha=0.2)
    plt.plot(proc, np.mean(lam_results[lam], 0), label='$\lambda={}$'.format(lam))
    #plt.errorbar(proc, np.mean(lam_results[lam], 0), 
     #                       yerr = np.std(lam_results[lam], 0), 
      #                      ls = ls[str(lam)],  elinewidth=0, ecolor='black', color=(0,0,0,0), lw=1)
    #plt.plot(proc, np.mean(lam_results[lam], 0), label='$\lambda={}$'.format(lam), ls = ls[str(lam)], c='k',
     #                  lw=2)
plt.ylabel('Точность классификации', fontsize = 27)
plt.xlabel('Процент удаления', fontsize = 27)
plt.tick_params(axis='both', which='major', labelsize=27)
plt.legend(loc='lower left')
plt.autoscale(enable=True, axis='x', tight=True)
plt.savefig('hypernets_var.eps', bbox_inches = 'tight')
plt.savefig('H_var_log_onestr')
plt.show()


In [16]:
def net_copy(net, new_net, lam):
    lam_param = lam/100.0
    for j in range(0, 2): # бежим по слоям        
        new_net[j].mean.data*=0
        new_net[j].mean.data+=net[j].mean(lam_param)
        new_net[j].mean_b.data*=0
        new_net[j].mean_b.data+=net[j].mean_b(lam_param)
        new_net[j].log_sigma.data*=0
        new_net[j].log_sigma.data+=net[j].log_sigma(lam_param)
        new_net[j].log_sigma_b.data*=0
        new_net[j].log_sigma_b.data+=net[j].log_sigma_b(lam_param)
    


In [17]:
hnet = VarSeqNet(VarLayerLowRank(784,  hidden_num), VarLayerLowRank(hidden_num, 10, act=lambda x:x))
hnet = hnet.to(device)
net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
net = net.to(device)
    
lam_results = {}
for lam in lamb:
    lam_results[lam] = []
    for s in range(start_num):
        print (lam, s)
        hnet.load_state_dict(t.load(os.path.join('saved/', 'lowrank_start_{}.cpk'.format(s))))
        
        net_copy(hnet, net, lam)
        lam_results[lam].append(delete_10(net))

In [18]:
import json
with open('results_lowrank.json','w') as out:
    out.write(json.dumps(lam_results))

In [19]:
with open('results_lowrank.json') as inp:
        lam_results = json.loads(inp.read())

In [21]:
proc = [0,10,20,30,40,50,60,70,80,90]
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams.update({'font.size': 27})
plt.rc('lines', linewidth=4)
    
    
for lam in lamb:
    lam = str(lam)
    plt.fill_between(proc, np.min(lam_results[lam], 0), np.max(lam_results[lam], 0), alpha=0.2)
    plt.plot(proc, np.mean(lam_results[lam], 0), label='$\lambda={}$'.format(lam))
plt.ylabel('Точность классификации', fontsize = 27)
plt.xlabel('Процент удаления', fontsize = 27)
plt.tick_params(axis='both', which='major', labelsize=27)
plt.legend(loc='lower left')
plt.autoscale(enable=True, axis='x', tight=True)
plt.savefig('H_lowrank_log_onestr')
#plt.show()


In [23]:
hnet = VarSeqNet(VarLayerLinearAppr(784,  hidden_num), VarLayerLinearAppr(hidden_num, 10, act=lambda x:x))
hnet = hnet.to(device)
net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
net = net.to(device)
    
lam_results = {}
for lam in lamb:
    lam_results[lam] = []
    for s in range(start_num):
        print (lam, s)
        hnet.load_state_dict(t.load(os.path.join('saved/', 'linear_start_{}.cpk'.format(s))))
        
        net_copy(hnet, net, lam)
        lam_results[lam].append(delete_10(net))

In [24]:
import json
with open('results_linear.json','w') as out:
    out.write(json.dumps(lam_results))

In [26]:
proc = [0,10,20,30,40,50,60,70,80,90]
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams.update({'font.size': 27})
plt.rc('lines', linewidth=4)
    
    
for lam in lamb:
    #lam = str(lam)
    plt.fill_between(proc, np.min(lam_results[lam], 0), np.max(lam_results[lam], 0), alpha=0.2)
    plt.plot(proc, np.mean(lam_results[lam], 0), label='$\lambda={}$'.format(lam))
plt.ylabel('Точность классификации', fontsize = 27)
plt.xlabel('Процент удаления', fontsize = 27)
plt.tick_params(axis='both', which='major', labelsize=27)
plt.legend(loc='lower left')
plt.autoscale(enable=True, axis='x', tight=True)
plt.savefig('H_linear_log2')
#plt.show()


In [27]:
hnet = VarSeqNet(VarLayerLowRank(784,  hidden_num), VarLayerLowRank(hidden_num, 10, act=lambda x:x))
hnet = hnet.to(device)
net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
net = net.to(device)
    
lam_results = {}
t.manual_seed(0)
for lam in lamb:    
    lam_results[lam] = []
    for s in range(start_num):
        
        hnet.load_state_dict(t.load(os.path.join('saved/', 'lowrank_start_{}.cpk'.format(s))))        
        net_copy(hnet, net, lam)
        
        optim = t.optim.Adam(net.parameters(), lr=5e-4)
        loss_fn = nn.CrossEntropyLoss().to(device)            
        label = 'finetune lowrank, lam:{} '.format(lam)                
        train_batches(net, loss_fn, optim, lam, label)        
        lam_results[lam].append(delete_10(net))
        

In [28]:
import json
with open('results_lowrank_finetune.json','w') as out:
    out.write(json.dumps(lam_results))

In [29]:
proc = [0,10,20,30,40,50,60,70,80,90]
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams.update({'font.size': 27})
plt.rc('lines', linewidth=4)
    
    
for lam in lamb:
    plt.fill_between(proc, np.min(lam_results[lam], 0), np.max(lam_results[lam], 0), alpha=0.2)
    plt.plot(proc, np.mean(lam_results[lam], 0), label='$\lambda={}$'.format(lam))
plt.ylabel('Точность классификации', fontsize = 27)
plt.xlabel('Процент удаления', fontsize = 27)
plt.tick_params(axis='both', which='major', labelsize=27)
plt.legend(loc='lower left')
plt.autoscale(enable=True, axis='x', tight=True)
plt.savefig('H_low_log+1')
#plt.show()


In [30]:
hnet = VarSeqNet(VarLayerLinearAppr(784,  hidden_num), VarLayerLinearAppr(hidden_num, 10, act=lambda x:x))
hnet = hnet.to(device)
net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
net = net.to(device)
t.manual_seed(0)    
lam_results = {}
for lam in lamb:
    lam_results[lam] = []
    for s in range(start_num):
        hnet.load_state_dict(t.load(os.path.join('saved/', 'linear_start_{}.cpk'.format(s))))        
        net_copy(hnet, net, lam)
        
        optim = t.optim.Adam(net.parameters(), lr=5e-4)
        loss_fn = nn.CrossEntropyLoss().to(device)            
        label = 'finetune linear, lam:{} '.format(lam)                
        train_batches(net, loss_fn, optim, lam, label)        
        lam_results[lam].append(delete_10(net))
        

In [31]:
import json
with open('results_linear_finetune.json','w') as out:
    out.write(json.dumps(lam_results))

In [32]:
proc = [0,10,20,30,40,50,60,70,80,90]
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams.update({'font.size': 27})
plt.rc('lines', linewidth=4)
    
    
for lam in lamb:
    plt.fill_between(proc, np.min(lam_results[lam], 0), np.max(lam_results[lam], 0), alpha=0.2)
    plt.plot(proc, np.mean(lam_results[lam], 0), label='$\lambda={}$'.format(lam))
plt.ylabel('Точность классификации', fontsize = 27)
plt.xlabel('Процент удаления', fontsize = 27)
plt.tick_params(axis='both', which='major', labelsize=27)
plt.legend(loc='lower left')
plt.autoscale(enable=True, axis='x', tight=True)
plt.savefig('Hypernet_linear + 1_log')
#plt.show()


In [33]:
for lam in lamb:
    t.manual_seed(0) 
    for start in range(start_num):                    
            net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
            net = net.to(device)
            optim = t.optim.Adam(net.parameters(), lr=5e-4)
            loss_fn = nn.CrossEntropyLoss().to(device)                        
            label = 'lambda {} '.format(lam)                
            train_batches(net, loss_fn, optim, lam, label)
            t.save(net.state_dict(), os.path.join(path_to_save, 'var_1e_net_lam_{}_start_{}.cpk'.format(lam, start)))

In [34]:
lam_results = {}
for lam in lamb:
    lam_results[lam] = []
    for s in range(start_num):
        print (lam, s)
        net.load_state_dict(t.load(os.path.join('saved/', 'var_1e_net_lam_{}_start_{}.cpk'.format(lam, s))))
        lam_results[lam].append(delete_10(net))

In [35]:
import json
with open('results_1e.json','w') as out:
    out.write(json.dumps(lam_results))

In [38]:
proc = [0,10,20,30,40,50,60,70,80,90]
plt.rcParams['figure.figsize'] = 12, 12
plt.rcParams.update({'font.size': 27})
plt.rc('lines', linewidth=4)
    
    
for lam in lamb:
    plt.fill_between(proc, np.min(lam_results[lam], 0), np.max(lam_results[lam], 0), alpha=0.2)
    plt.plot(proc, np.mean(lam_results[lam], 0), label='$\lambda={}$'.format(lam))
plt.ylabel('Точность классификации', fontsize = 27)
plt.xlabel('Процент удаления', fontsize = 27)
plt.tick_params(axis='both', which='major', labelsize=27)
plt.legend(loc='lower left')
plt.autoscale(enable=True, axis='x', tight=True)
plt.savefig('1e.eps', bbox_inches = 'tight')
#plt.show()


In [39]:
#calculating accuracy
for mode in ['results_var','results_1e', 'results_lowrank', 'results_linear', 'results_lowrank_finetune', 'results_linear_finetune']:
    print ('mode:', mode)
    with open(mode+'.json') as inp:
        data = json.loads(inp.read())
    for lam in lamb:
        lam = str(lam)
        print (lam, np.mean(data[lam][0]), np.std(data[lam][0]))
    print()

In [80]:
#calculating stability
for mode in ['results_var','results_1e', 'results_lowrank', 'results_linear', 'results_lowrank_finetune', 'results_linear_finetune']:
    print ('mode:', mode)
    with open(mode+'.json') as inp:
        data = json.loads(inp.read())
        
    for lam in lamb:
        lam = str(lam)
        v_0 = np.array(data[lam])[:,0]
        v_last = np.array(data[lam])[:,-1]
        
        stab = v_0/v_last
        print (lam, np.mean(stab), np.std(stab))
    print()

In [87]:
print ('ordinal model')
net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x)).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)   

for lam in lamb:
    lam_results[lam] = []
    k = []
    ll = []
    for s in range(start_num):        
        net.load_state_dict(t.load(os.path.join('saved/', 'var_net_lam_{}_start_{}.cpk'.format(lam, s))))                 
        t.manual_seed(0)
        k+= [net.KLD().cpu().detach().numpy()]
        for x,y in train_loader:
            x = x.to(device)
            y = y.to(device)
            ll += [loss_fn(net(x), y).cpu().detach().numpy()]
    print (lam, np.mean(ll)*len(train_data) + lam * np.mean(k))
            
        

In [90]:
print ('1e model')
net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x)).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)   

for lam in lamb:
    lam_results[lam] = []
    k = []
    ll = []
    for s in range(start_num):        
        net.load_state_dict(t.load(os.path.join('saved/', 'var_1e_net_lam_{}_start_{}.cpk'.format(lam, s))))                 
        t.manual_seed(0)
        k+= [net.KLD().cpu().detach().numpy()]
        for x,y in train_loader:
            x = x.to(device)
            y = y.to(device)
            ll += [loss_fn(net(x), y).cpu().detach().numpy()]
    print (lam, np.mean(ll)*len(train_data) + lam * np.mean(k))
            
        

In [92]:
print ('lr model')
net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x)).to(device)
hnet = VarSeqNet(VarLayerLowRank(784,  hidden_num), VarLayerLowRank(hidden_num, 10, act=lambda x:x)).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)   

for lam in lamb:
    lam_results[lam] = []
    k = []
    ll = []
    for s in range(start_num):     
        hnet.load_state_dict(t.load(os.path.join('saved/', 'lowrank_start_{}.cpk'.format(s))))        
        net_copy(hnet, net, lam)
        t.manual_seed(0)
        k+= [net.KLD().cpu().detach().numpy()]
        for x,y in train_loader:
            x = x.to(device)
            y = y.to(device)
            ll += [loss_fn(net(x), y).cpu().detach().numpy()]
    print (lam, np.mean(ll)*len(train_data) + lam * np.mean(k))
            
print ('lr ft model')
for lam in lamb:
    lam_results[lam] = []
    k = []
    ll = []
    for s in range(start_num):     
        hnet.load_state_dict(t.load(os.path.join('saved/', 'lowrank_start_{}.cpk'.format(s))))        
        net_copy(hnet, net, lam)
        t.manual_seed(0)
        optim = t.optim.Adam(net.parameters(), lr=5e-4)        
        train_batches(net, loss_fn, optim, lam, label)   
        
        k+= [net.KLD().cpu().detach().numpy()]
        for x,y in train_loader:
            x = x.to(device)
            y = y.to(device)
            ll += [loss_fn(net(x), y).cpu().detach().numpy()]  
        
    print (lam, np.mean(ll)*len(train_data) + lam * np.mean(k))
            
                

In [93]:
print ('linear model')
net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x)).to(device)
hnet = VarSeqNet(VarLayerLinearAppr(784,  hidden_num), VarLayerLinearAppr(hidden_num, 10, act=lambda x:x)).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)   

for lam in lamb:
    lam_results[lam] = []
    k = []
    ll = []
    for s in range(start_num):     
        hnet.load_state_dict(t.load(os.path.join('saved/', 'linear_start_{}.cpk'.format(s))))        
        net_copy(hnet, net, lam)
        t.manual_seed(0)
        k+= [net.KLD().cpu().detach().numpy()]
        for x,y in train_loader:
            x = x.to(device)
            y = y.to(device)
            ll += [loss_fn(net(x), y).cpu().detach().numpy()]
    print (lam, np.mean(ll)*len(train_data) + lam * np.mean(k))
            
print ('lr ft model')
for lam in lamb:
    lam_results[lam] = []
    k = []
    ll = []
    for s in range(start_num):     
        hnet.load_state_dict(t.load(os.path.join('saved/', 'linear_start_{}.cpk'.format(s))))        
        net_copy(hnet, net, lam)
        t.manual_seed(0)
        optim = t.optim.Adam(net.parameters(), lr=5e-4)        
        train_batches(net, loss_fn, optim, lam, label)   
        
        k+= [net.KLD().cpu().detach().numpy()]
        for x,y in train_loader:
            x = x.to(device)
            y = y.to(device)
            ll += [loss_fn(net(x), y).cpu().detach().numpy()]  
        
    print (lam, np.mean(ll)*len(train_data) + lam * np.mean(k))
            
                

In [112]:
# количество оптимизаций параметров
# количество батчей * количество эпох * количество параметров, домноженное на количество элементов в lamb
net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
p_num = 0
for p in hnet.parameters():
    size = p.size()
    if len(size)==1:
        p_num += size[0]
    elif len(size) == 2:
        p_num += np.prod(size)    
p_num_var = p_num
print ('ordinal', len(train_loader)*epoch_num*(p_num)*len(lamb))
print ('1e', len(train_loader)*p_num * len(lamb))

hnet = VarSeqNet(VarLayerLowRank(784,  hidden_num), VarLayerLowRank(hidden_num, 10, act=lambda x:x))
p_num = 0
for p in hnet.parameters():
    size = p.size()
    if len(size)==1:
        p_num += size[0]
    elif len(size) == 2:
        p_num += np.prod(size)    
    
print ('low rank', len(train_loader)*epoch_num*(p_num))
print ('low rank finetune', len(train_loader)*epoch_num*(p_num) + len(train_loader)*p_num_var)

hnet = VarSeqNet(VarLayerLinearAppr(784,  hidden_num), VarLayerLinearAppr(hidden_num, 10, act=lambda x:x))
p_num = 0
for p in hnet.parameters():
    size = p.size()
    if len(size)==1:
        p_num += size[0]
    elif len(size) == 2:
        p_num += np.prod(size)    
    
print ('linear', len(train_loader)*epoch_num*(p_num))
print ('linear fine tune', len(train_loader)*epoch_num*(p_num) + len(train_loader)*p_num_var)

In [25]:
#new plots
for mode in ['results_var','results_1e', 'results_lowrank', 'results_linear', 'results_lowrank_finetune', 'results_linear_finetune']:
    print ('mode:', mode)
    with open(mode+'.json') as inp:
        data = json.loads(inp.read())
    for lam in lamb:
        lam = str(lam)
        print (lam, np.mean(data[lam][0]), np.std(data[lam][0]))
    print()

In [44]:
plt.rcParams['figure.figsize'] = 20, 12
plt.rcParams.update({'font.size': 27})
plt.rc('lines', linewidth=4)
    
fig, axs = plt.subplots(2, 3)
coords = [(0,0), (1,0), (0,1), (1,1), (0,2), (1,2)]
titles = ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)']
ls = {
    '0.1':'-',
    '1':'--',
    '10':':',
    '100':'-.'
}
roc = np.array([0,10,20,30,40,50,60,70,80,90])
for id, mode in enumerate(['results_var','results_1e', 'results_lowrank', 'results_lowrank_finetune', 'results_linear', 'results_linear_finetune']):
    print ('mode:', mode)
    with open(mode+'.json') as inp:
        lam_results = json.loads(inp.read())
    current_ax = axs[coords[id][0], coords[id][1]]


    for lam in lamb:
        lam = str(lam)
        current_ax.fill_between(proc, np.min(lam_results[lam], 0), np.max(lam_results[lam], 0), alpha=0.2)
        current_ax.plot(proc, np.mean(lam_results[lam], 0), label='$\lambda={}$'.format(lam))
        
        # вызываем plot и errorbar два раза:
        # (здесь это оказалось ненужным, но оставил на случай переделок:
        # если ошибки разных графиков сильно накладываются, их разносят небольшим сдвигом
        # а сами линии средних значений оставляют на месте)
       # current_ax.errorbar(proc, np.mean(lam_results[lam], 0), 
        #                    yerr = np.std(lam_results[lam], 0), 
         #                   ls = ls[lam],  elinewidth=0)#, ecolor='black', color=(0,0,0,0), lw=1)
        #current_ax.plot(proc, np.mean(lam_results[lam], 0), label='$\lambda={}$'.format(lam), ls = ls[lam], c='k',
         #              lw=2)
        
        #current_ax.set_title(titles[id], y=-0.05)
        if id in [1,3,5]:
            current_ax.text(40, .38, titles[id])
        else:
            current_ax.text(40, .42, titles[id])
        current_ax.set_ylim((0.5, 1.0))
        if id != 0 and id != 1:
            current_ax.set_yticklabels([])
        if id not in [1,3,5]:
            current_ax.set_xticklabels([])
        if id == 5:
            current_ax.legend(bbox_to_anchor=(1.1, 1.3))
fig.add_subplot(111, frame_on=False)
plt.tick_params(labelcolor="none", bottom=False, left=False)

plt.ylabel('Accuracy', fontsize = 27)
plt.xlabel('Percentage of removed parameters', fontsize = 27)
plt.tick_params(axis='both', which='major', labelsize=27)
ax = plt.gca() 
ax.xaxis.set_label_coords(0.5, -0.15) 
#plt.autoscale(enable=True, axis='x', tight=False)
#fig.tight_layout()
plt.savefig('hypernets_colour_begin.png', bbox_inches = 'tight')
#plt.show()



In [42]:
plt.rcParams['figure.figsize'] = 20, 12
plt.rcParams.update({'font.size': 27})
plt.rc('lines', linewidth=4)
    
fig, axs = plt.subplots(2, 3)
coords = [(0,0), (1,0), (0,1), (1,1), (0,2), (1,2)]
titles = ['(а)', '(б)', '(в)', '(г)', '(д)', '(е)']
ls = {
    '0.1':'-',
    '1':'--',
    '10':':',
    '100':'-.'
}
roc = np.array([0,10,20,30,40,50,60,70,80,90])
for id, mode in enumerate(['results_var','results_1e', 'results_lowrank', 'results_lowrank_finetune', 'results_linear', 'results_linear_finetune']):
    print ('mode:', mode)
    with open(mode+'.json') as inp:
        lam_results = json.loads(inp.read())
    current_ax = axs[coords[id][0], coords[id][1]]

    for lam in lamb:
        lam = str(lam)
        
        # вызываем plot и errorbar два раза:
        # (здесь это оказалось ненужным, но оставил на случай переделок:
        # если ошибки разных графиков сильно накладываются, их разносят небольшим сдвигом
        # а сами линии средних значений оставляют на месте)
        current_ax.errorbar(proc, np.mean(lam_results[lam], 0), 
                            yerr = np.std(lam_results[lam], 0), 
                            ls = ls[lam],  elinewidth=0, ecolor='black', color=(0,0,0,0), lw=1)
        current_ax.plot(proc, np.mean(lam_results[lam], 0), label='$\lambda={}$'.format(lam), ls = ls[lam], c='k',
                       lw=2)
        
        #current_ax.set_title(titles[id], y=-0.05)
        if id in [1,3,5]:
            current_ax.text(40, .38, titles[id])
        else:
            current_ax.text(40, .42, titles[id])
        current_ax.set_ylim((0.5, 1.0))
        if id != 0 and id != 1:
            current_ax.set_yticklabels([])
        if id not in [1,3,5]:
            current_ax.set_xticklabels([])
        if id == 5:
            current_ax.legend(bbox_to_anchor=(1.1, 1.3))
fig.add_subplot(111, frame_on=False)
plt.tick_params(labelcolor="none", bottom=False, left=False)

plt.ylabel('Точность классификации', fontsize = 27)
plt.xlabel('Процент удаления параметров', fontsize = 27)
plt.tick_params(axis='both', which='major', labelsize=27)
ax = plt.gca() 
ax.xaxis.set_label_coords(0.5, -0.15) 
#plt.autoscale(enable=True, axis='x', tight=False)
#fig.tight_layout()
plt.savefig('hypernets.eps', bbox_inches = 'tight')
#plt.show()

In [230]:
class BaseLayer(nn.Module): #однослойная сеть
    def __init__(self, in_,  out_,   act=F.relu):         
        nn.Module.__init__(self)                    
        self.mean = nn.Parameter(t.randn(in_, out_, device=device)) # параметры средних
        t.nn.init.xavier_uniform(self.mean) 
        self.log_sigma = nn.Parameter(t.ones(in_, out_, device = device)*init_log_sigma) # логарифм дисперсии
        self.mean_b = nn.Parameter(t.randn(out_, device=device)) # то же самое для свободного коэффициента
        self.log_sigma_b = nn.Parameter(t.ones(out_, device=device) * init_log_sigma)
                
        self.in_ = in_
        self.out_ = out_
        self.act = act
        
    def forward(self,x):    
        w = self.mean 
        b = self.mean_b
            
        # функция активации 
        return self.act(t.matmul(x, w)+b)

    def KLD(self):        
        # подсчет дивергенции
        size = self.in_, self.out_
        out = self.out_
        self.eps_w = t.distributions.Normal(self.mean, t.exp(self.log_sigma))
        self.eps_b = t.distributions.Normal(self.mean_b,  t.exp(self.log_sigma_b))
        self.h_w = t.distributions.Normal(t.zeros(size, device=device), t.ones(size, device=device)*prior_sigma)
        self.h_b = t.distributions.Normal(t.zeros(out, device=device), t.ones(out, device=device)*prior_sigma)                
        k1 = t.distributions.kl_divergence(self.eps_w,self.h_w).sum()        
        k2 = t.distributions.kl_divergence(self.eps_b,self.h_b).sum()        
        return k1+k2
    