In [3]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import copy
from networks import DQN

In [6]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Record the runtime environment alongside the results.
print("Device: ", device)
print("Torch Version: ", torch.__version__)

Device:  cuda
Torch Version:  1.0.1.post2


In [5]:
def increase_capacity_keep_lr(network, capacity, optimizer, device):
    """Grow ``network`` and carry the optimizer's per-parameter statistics over.

    ``network.increase_capacity(capacity)`` is called, then a fresh
    ``optim.Adam(..., amsgrad=True)`` is built for the grown network and loaded
    with the old optimizer's state. Every state tensor that had the shape of
    its (old) parameter is zero-padded to the new parameter shape, with the old
    values kept in the leading corner; all other state entries (e.g. ``step``)
    are copied unchanged. Parameters that have no state yet (the optimizer has
    never stepped) simply start without state.

    Assumptions (same as the code always made):
      * ``increase_capacity`` keeps the number and order of parameters and only
        ever grows each dimension, with the old units in the leading slice.
      * all network parameters live in the optimizer's first param group.

    Args:
        network: module exposing ``increase_capacity(capacity)``.
        capacity: forwarded verbatim to ``network.increase_capacity``.
        optimizer: the optimizer currently attached to ``network``.
        device: device the grown network and the padded state are moved to.

    Returns:
        ``(network, optimizer)``: the grown network and the new Adam optimizer.

    Raises:
        ValueError: if the number of parameters changed during the growth.
        KeyError: if a network parameter is not in the optimizer's first group.
    """
    old_params = list(network.parameters())
    old_param_sizes = [p.size() for p in old_params]

    # state_dict() identifies parameters by an opaque key (id or index,
    # depending on the PyTorch release). Pair every live parameter with its
    # key positionally so the lookup does not depend on which one is used.
    opt_state_dict = optimizer.state_dict()
    saved_state = opt_state_dict['state']
    saved_group = opt_state_dict['param_groups'][0]
    key_of = {
        id(param): key
        for param, key in zip(optimizer.param_groups[0]['params'], saved_group['params'])
    }
    ordered_keys = [key_of[id(param)] for param in old_params]

    network.increase_capacity(capacity)

    new_param_sizes = [p.size() for p in network.parameters()]
    if len(new_param_sizes) != len(old_param_sizes):
        raise ValueError(
            'increase_capacity changed the number of parameters from {} to {}'.format(
                len(old_param_sizes), len(new_param_sizes)))

    for key, old_size, new_size in zip(ordered_keys, old_param_sizes, new_param_sizes):
        old_entry = saved_state.get(key)
        if old_entry is None:
            # No statistics recorded yet for this parameter; nothing to carry.
            continue

        # Leading block of the grown tensor that corresponds to the old units.
        region = tuple(slice(0, n) for n in old_size)
        grown_entry = {}
        for name, value in old_entry.items():
            if torch.is_tensor(value) and value.size() == old_size:
                grown = torch.zeros(new_size, dtype=value.dtype)
                grown[region] = value
                value = grown.to(device)
            grown_entry[name] = value
        saved_state[key] = grown_entry

    # load_state_dict matches saved keys to parameters by position, so list the
    # keys in the same order as network.parameters().
    saved_group['params'] = ordered_keys

    network.to(device)
    optimizer = optim.Adam(network.parameters(), amsgrad=True)
    optimizer.load_state_dict(opt_state_dict)

    return network, optimizer

In [9]:
def generate_zero():
    """Sample a noisy "logical 0" input: uniform on [0, 49], scaled to [0, 0.49]."""
    percent = random.uniform(0, 49)
    return percent / 100

def generate_one():
    """Sample a noisy "logical 1" input: uniform on [50, 100], scaled to [0.5, 1.0]."""
    percent = random.uniform(50, 100)
    return percent / 100

def generate_both(num_data_points, p):
    """Build a mixed OR/XOR batch of noisy truth-table samples.

    For each of the ``num_data_points`` draws, one full four-row truth table is
    appended: with probability ``p`` it is the OR table (task flag 0), otherwise
    the XOR table (task flag 1). Each input row is ``[a, b, task_flag]`` with
    ``a``/``b`` sampled by ``generate_zero``/``generate_one``; each target is a
    one-element list holding the 0/1 label.

    Returns:
        ``(Xs, Ys)``: two parallel lists with ``4 * num_data_points`` entries.
    """
    # (sampler for a, sampler for b, OR label, XOR label), in table order.
    truth_table = (
        (generate_zero, generate_zero, 0, 0),
        (generate_one, generate_zero, 1, 1),
        (generate_zero, generate_one, 1, 1),
        (generate_one, generate_one, 1, 0),
    )

    Xs, Ys = [], []
    for _ in range(num_data_points):
        task_flag = 0 if random.random() < p else 1
        for sample_a, sample_b, or_label, xor_label in truth_table:
            Xs.append([sample_a(), sample_b(), task_flag])
            Ys.append([xor_label if task_flag else or_label])
    return Xs, Ys

def generate_or_XY(num_data_points):
    """Build ``num_data_points`` noisy copies of the OR truth table.

    Every input row is ``[a, b, 0]`` (task flag 0 marks OR) and every target is
    a one-element list with the OR label.

    Returns:
        ``(Xs, Ys)``: two parallel lists with ``4 * num_data_points`` entries.
    """
    # (sampler for a, sampler for b, or(a, b)), in table order.
    or_rows = (
        (generate_zero, generate_zero, 0),
        (generate_one, generate_zero, 1),
        (generate_zero, generate_one, 1),
        (generate_one, generate_one, 1),
    )

    Xs, Ys = [], []
    for _ in range(num_data_points):
        for sample_a, sample_b, label in or_rows:
            Xs.append([sample_a(), sample_b(), 0])
            Ys.append([label])
    return Xs, Ys

def generate_xor_XY(num_data_points):
    """Build ``num_data_points`` noisy copies of the XOR truth table.

    Every input row is ``[a, b, 1]`` (task flag 1 marks XOR) and every target
    is a one-element list with the XOR label.

    Returns:
        ``(Xs, Ys)``: two parallel lists with ``4 * num_data_points`` entries.
    """
    Xs, Ys = [], []
    remaining = num_data_points
    while remaining > 0:
        # Rows in order: xor(0,0), xor(1,0), xor(0,1), xor(1,1).
        Xs += [
            [generate_zero(), generate_zero(), 1],
            [generate_one(), generate_zero(), 1],
            [generate_zero(), generate_one(), 1],
            [generate_one(), generate_one(), 1],
        ]
        Ys += [[0], [1], [1], [0]]
        remaining -= 1
    return Xs, Ys

In [7]:
def _truth_table_score(network, criterion, inputs, targets):
    """Return ``1 / (1 + criterion(network(inputs), targets))`` as a float.

    With a non-negative criterion the score lies in (0, 1]; higher is better.
    """
    with torch.no_grad():
        return (1.0 / (1.0 + criterion(network(inputs), targets))).item()


def _fit_one_batch(network, optimizer, criterion, p_or):
    """Run one optimizer step on a fresh ``generate_both(25, p_or)`` batch."""
    optimizer.zero_grad()

    Xs, Ys = generate_both(25, p_or)
    Xs = torch.tensor(Xs)
    Ys = torch.tensor(Ys, dtype=torch.float)

    loss = criterion(network(Xs), Ys)
    loss.backward()
    optimizer.step()


def xor_experiments(initial_capacity, train_or, capacity, non_linearity):
    """Train OR-then-XOR over several seeds and print summary statistics.

    For every seed a ``DQN(3, initial_capacity, 1, non_linearity)`` is trained
    on CPU in up to two phases, each stopped early once its score exceeds 0.95:

      1. (only if ``train_or``) up to 127k steps, stopped on the OR score;
      2. up to 28k steps (155k if phase 1 was skipped), stopped on the XOR
         score. If ``capacity`` is not None the network is grown with
         ``increase_capacity_keep_lr`` between the two phases.

    All reported numbers are scores ``1 / (1 + MSE)`` on the exact four-row
    truth tables (the printed labels call them "loss"); higher is better.

    Args:
        initial_capacity: list of hidden-layer sizes; copied before use.
        train_or: whether to run phase 1.
        capacity: argument for the capacity increase, or None to skip it.
        non_linearity: activation forwarded to ``DQN``.

    Returns:
        None. Statistics and the best seed's settings are printed.
    """
    or_inputs = torch.tensor([[0,0,0],[0,1,0],[1,0,0],[1,1,0]], dtype=torch.float)
    or_targets = torch.tensor([[0],[1],[1],[1]], dtype=torch.float)
    xor_inputs = torch.tensor([[0,0,1],[0,1,1],[1,0,1],[1,1,1]], dtype=torch.float)
    xor_targets = torch.tensor([[0],[1],[1],[0]], dtype=torch.float)

    best_score = 0
    best_settings = []

    # Scores at initialisation, after phase 1, and after phase 2.
    init_scores_or, init_scores_xor = [], []
    pre_scores_or, pre_scores_xor = [], []
    final_scores_or, final_scores_xor = [], []

    for seed in range(100):
        # Set seeds
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        # Initialisation network
        network = DQN(3, initial_capacity.copy(), 1, non_linearity)
        optimizer = optim.Adam(network.parameters(), amsgrad=True)
        criterion = nn.MSELoss()

        init_scores_or.append(_truth_table_score(network, criterion, or_inputs, or_targets))
        init_scores_xor.append(_truth_table_score(network, criterion, xor_inputs, xor_targets))

        if train_or:
            for _ in range(127 * 1000):
                # NOTE(review): the second argument of generate_both is the
                # probability of an OR quadruple, so 0.1 makes this phase ~90%
                # XOR data although it is stopped on the OR score - confirm
                # this is the intended curriculum.
                _fit_one_batch(network, optimizer, criterion, 0.1)
                if _truth_table_score(network, criterion, or_inputs, or_targets) > 0.95:
                    break

        pre_scores_or.append(_truth_table_score(network, criterion, or_inputs, or_targets))
        pre_scores_xor.append(_truth_table_score(network, criterion, xor_inputs, xor_targets))

        if capacity is not None:
            network, optimizer = increase_capacity_keep_lr(network, capacity, optimizer, 'cpu')

        # Total budget is 155k steps: whatever phase 1 did not get goes here.
        iters = (155 - 127) * 1000 if train_or else 155 * 1000

        for _ in range(iters):
            # Each of the 25 draws is an OR quadruple with probability 0.9,
            # otherwise an XOR quadruple.
            _fit_one_batch(network, optimizer, criterion, 0.9)
            if _truth_table_score(network, criterion, xor_inputs, xor_targets) > 0.95:
                break

        or_score = _truth_table_score(network, criterion, or_inputs, or_targets)
        xor_score = _truth_table_score(network, criterion, xor_inputs, xor_targets)
        final_scores_or.append(or_score)
        final_scores_xor.append(xor_score)

        average_score = (or_score + xor_score) / 2
        if average_score > best_score:
            best_score = average_score
            best_settings = [average_score, or_score, xor_score, seed, initial_capacity, train_or, capacity]

        # NOTE(review): this stops the whole experiment at the first seed whose
        # final XOR score exceeds 0.95, so the averages below usually cover far
        # fewer than 100 seeds - confirm this early exit is intended.
        if xor_score > 0.95:
            break

    # Print statistics
    print(initial_capacity, train_or, capacity)

    print('Average loss or before any training: ', np.average(init_scores_or))
    print('Average loss xor before any training: ', np.average(init_scores_xor))

    print('Average loss or before xor training: ', np.average(pre_scores_or))
    print('Average loss xor before xor training: ', np.average(pre_scores_xor))
    print('Average loss or: ', np.average(final_scores_or))
    print('Average loss xor: ', np.average(final_scores_xor))
    print('Average loss: ', (np.average(final_scores_or) +  np.average(final_scores_xor))/2)
    print(best_settings)

# xor_experiments requires the activation as its fourth argument. F.elu is
# assumed here because it is what the other cells in this notebook pass to
# DQN - TODO confirm this is the non-linearity the experiment should use.
xor_experiments([5,2], True, [3,1], F.elu)
# xor_experiments([2], False, None, F.elu)
# xor_experiments([2], True, None, F.elu)
# xor_experiments([1], True, [1], F.elu)

# xor_experiments([3], False, None, F.elu)
# xor_experiments([3], True, None, F.elu)
# xor_experiments([2], True, [1], F.elu)
# xor_experiments([1], True, [2], F.elu)

# xor_experiments([4], False, None, F.elu)
# xor_experiments([4], True, None, F.elu)
# xor_experiments([1], True, [3], F.elu)
# xor_experiments([2], True, [2], F.elu)
# xor_experiments([3], True, [1], F.elu)

[5, 2] True [3, 1]
Average loss or before xor training:  0.30647775530815125
Average loss xor before xor training:  0.36558668315410614
Average loss or before xor training:  0.9143003523349762
Average loss xor before xor training:  0.9034073650836945
Average loss or:  0.9684329926967621
Average loss xor:  0.9590063989162445
Average loss:  0.9637196958065033
[0.9969995021820068, 0.9997853636741638, 0.9942136406898499, 1, [5, 2], True, [3, 1]]


# Animation

In [None]:
# Look up matplotlib's registered ffmpeg movie-writer class and configure one
# instance: 24 frames per second, 1800 bitrate, artist tag in the metadata.
Writer = animation.writers['ffmpeg']
writer = Writer(
    fps=24,
    metadata={'artist': 'Joe Harrison'},
    bitrate=1800,
)

In [None]:
fig = plt.figure()
# One panel per task (left: task flag 0 / OR, right: task flag 1 / XOR),
# created once and redrawn in place for every frame.
ax_or = fig.add_subplot(1, 2, 1)
ax_xor = fig.add_subplot(1, 2, 2)

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

x = np.linspace(0, 1.0, 100)
y = np.linspace(0, 1.0, 100)
ticks = [0.0, 0.5, 1.0]


def _grid_inputs(task_flag):
    """Return all ``[x, y, task_flag]`` grid points as one float tensor.

    Rows are ordered y-major, so reshaping the network output to
    ``(len(y), len(x))`` gives a matrix indexed ``[idx_y, idx_x]``.
    """
    grid_x, grid_y = np.meshgrid(x, y)
    flags = np.full(grid_x.size, task_flag)
    points = np.stack([grid_x.ravel(), grid_y.ravel(), flags], axis=1)
    return torch.tensor(points, dtype=torch.float)


def _predict_grid(network, grid):
    """Evaluate ``network`` on a whole grid in a single gradient-free pass."""
    with torch.no_grad():
        return network(grid).reshape(len(y), len(x)).numpy()


def _draw_panel(ax, values):
    """Redraw one heat-map panel with dashed guides at the 0.5 decision lines."""
    # Clearing first keeps the number of artists constant across frames.
    ax.clear()
    # origin='lower' puts row 0 (y = 0) at the bottom, matching the extent.
    ax.imshow(values, interpolation='none', cmap='inferno',
              extent=(0.0, 1.0, 0.0, 1.0), origin='lower')
    ax.axvline(x=0.5, color='w', linestyle='dashed')
    ax.axhline(y=0.5, color='w', linestyle='dashed')
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)


or_grid = _grid_inputs(0.0)
xor_grid = _grid_inputs(1.0)

network = DQN(3, [6,3], 1, F.elu)
optimizer = optim.Adam(network.parameters(), amsgrad=True)
criterion = nn.MSELoss()

with writer.saving(fig, 'xor.mp4' ,100):
    # Two phases of 500 steps each. The value is the probability that a draw
    # in generate_both is an OR quadruple (otherwise it is an XOR quadruple).
    for p_or in (0.1, 0.9):
        for i in range(500):

            optimizer.zero_grad()

            Xs, Ys = generate_both(25, p_or)

            Xs = torch.tensor(Xs)
            Ys = torch.tensor(Ys, dtype=torch.float)

            prediction = network(Xs)
            loss = criterion(prediction, Ys)

            loss.backward()
            optimizer.step()

            print(i, loss.item())

            OR_mat = _predict_grid(network, or_grid)
            XOR_mat = _predict_grid(network, xor_grid)

            _draw_panel(ax_or, OR_mat)
            _draw_panel(ax_xor, XOR_mat)

            writer.grab_frame()

0 0.8552559614181519
1 0.9450685381889343
2 0.8479112982749939
3 0.8452067971229553
4 0.8431272506713867
5 0.8419037461280823
6 0.8403158783912659
7 0.853485643863678
8 0.8190130591392517
9 0.8330321311950684
10 0.8607609868049622
11 0.8887335658073425
12 0.8406065702438354
13 0.8083235621452332
14 0.8525921106338501
15 0.8209795951843262
16 0.8337359428405762
17 0.8464990854263306
18 0.8744804263114929
19 0.8732001781463623
20 0.8718239068984985
21 0.8252975940704346
22 0.8088929057121277
23 0.8660187125205994
24 0.8351905941963196
25 0.8483043313026428
26 0.8612344861030579
27 0.801175594329834
28 0.8433561325073242
29 0.8272910714149475
30 0.825671374797821
31 0.8386083245277405
32 0.8080980181694031
33 0.8065598011016846
34 0.8194347620010376
35 0.8324432373046875
36 0.8019487857818604
37 0.8004421591758728
38 0.7990315556526184
39 0.7830743193626404
40 0.7959089875221252
41 0.8230062127113342
42 0.8216138482093811
43 0.8199988603591919
44 0.8042615652084351
45 0.8311848640441895
4

359 0.25323063135147095
360 0.2610549330711365
361 0.2551250159740448
362 0.2639027237892151
363 0.25560733675956726
364 0.25369173288345337
365 0.25213244557380676
366 0.2588612735271454
367 0.2515970766544342
368 0.26022061705589294
369 0.25410306453704834
370 0.2586001753807068
371 0.2528402805328369
372 0.252511590719223
373 0.260211706161499
374 0.24943773448467255
375 0.25179004669189453
376 0.2567742168903351
377 0.25429844856262207
378 0.26269152760505676
379 0.2514294981956482
380 0.25524723529815674
381 0.2608136534690857
382 0.25660353899002075
383 0.25541526079177856
384 0.2510131299495697
385 0.2528967261314392
386 0.25573399662971497
387 0.2569952607154846
388 0.25276893377304077
389 0.2513931393623352
390 0.25748172402381897
391 0.2587720453739166
392 0.25585606694221497
393 0.25326260924339294
394 0.2524624168872833
395 0.25284233689308167
396 0.24664589762687683
397 0.2600404620170593
398 0.2571449279785156
399 0.25478118658065796
400 0.25792115926742554
401 0.25859108

KeyboardInterrupt: 

In [54]:
def _mean_abs_diff(after, before):
    """Mean absolute element-wise difference as a float (nan for empty slices)."""
    delta = torch.abs(after - before)
    if delta.numel() == 0:
        # No elements to compare, e.g. "new" units when nothing was added.
        return float('nan')
    return torch.mean(delta).item()


def weight_change(initial_capacity, train_or, capacity, non_linearity):
    """Measure how far parameters move during the second training phase.

    For each of two seeds a ``DQN(3, initial_capacity, 1, non_linearity)`` is
    optionally trained for up to 1000 steps (``train_or``), optionally grown
    with ``increase_capacity_keep_lr`` (``capacity``), snapshotted, and then
    trained for up to 1000 more steps. Per layer, the mean absolute difference
    between the trained parameters and the snapshot is recorded for:

      * "common": the slice indexed by the initial layer sizes;
      * "new": the slice beyond the initial sizes (nan when it is empty). For
        hidden-to-hidden layers this is only the block where both the row and
        the column are new;
      * "total": the whole tensor.

    The last layer has no size entry of its own in ``initial_capacity``, so its
    common/new/total bias values are all computed over the full bias.

    Args:
        initial_capacity: list of hidden-layer sizes; copied before use.
        train_or: whether to run the first training phase.
        capacity: argument for the capacity increase, or None to skip it.
        non_linearity: activation forwarded to ``DQN``.

    Returns:
        None. The per-layer averages over the seeds are printed.
    """
    n_hidden = len(initial_capacity)
    n_layers = n_hidden + 1

    common_weight = [[] for _ in range(n_layers)]
    new_weight = [[] for _ in range(n_layers)]
    total_weight = [[] for _ in range(n_layers)]
    common_bias = [[] for _ in range(n_layers)]
    new_bias = [[] for _ in range(n_layers)]
    total_bias = [[] for _ in range(n_layers)]

    # Exact OR truth table used for the early-stopping score in both phases.
    or_inputs = torch.tensor([[0,0,0],[0,1,0],[1,0,0],[1,1,0]], dtype=torch.float, device=device)
    or_targets = torch.tensor([[0],[1],[1],[1]], dtype=torch.float, device=device)

    for seed in range(2):
        # Set seeds
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        # Initialisation network
        network = DQN(3, initial_capacity.copy(), 1, non_linearity).to(device)
        optimizer = optim.Adam(network.parameters(), amsgrad=True)
        criterion = nn.MSELoss()

        if train_or:
            for _ in range(1000):
                optimizer.zero_grad()

                Xs, Ys = generate_both(25,0.1)

                Xs = torch.tensor(Xs, device=device)
                Ys = torch.tensor(Ys, dtype=torch.float, device=device)

                loss = criterion(network(Xs), Ys)
                loss.backward()
                optimizer.step()

                with torch.no_grad():
                    # Evaluation
                    score = 1.0/(1.0+criterion(network(or_inputs), or_targets))

                if score>0.95:
                    break

        if capacity is not None:
            network, optimizer = increase_capacity_keep_lr(network, capacity, optimizer, device)

        # Reference point: parameters right before the second phase.
        network_after_increase = copy.deepcopy(network)

        iters = 1000

        for _ in range(iters):
            optimizer.zero_grad()

            # Each of the 25 draws is an OR quadruple with probability 0.9,
            # otherwise an XOR quadruple.
            Xs, Ys = generate_both(25,0.9)

            Xs = torch.tensor(Xs, device=device)
            Ys = torch.tensor(Ys, dtype=torch.float, device=device)

            loss = criterion(network(Xs), Ys)
            loss.backward()
            optimizer.step()

            with torch.no_grad():
                # NOTE(review): this phase is stopped on the OR score, whereas
                # xor_experiments stops its second phase on the XOR score -
                # confirm which one is intended here.
                score = 1.0/(1.0+criterion(network(or_inputs), or_targets))

            if score>0.95:
                break

        for idx in range(n_layers):
            trained = network.layers[idx]
            reference = network_after_increase.layers[idx]

            # Output units: split at the layer's initial size; the last layer
            # has no initial size of its own, so all of its rows are used.
            if idx < n_hidden:
                old_rows = slice(0, initial_capacity[idx])
                new_rows = slice(initial_capacity[idx], None)
            else:
                old_rows = new_rows = slice(None)

            # Input units: split at the previous layer's initial size; the
            # first layer reads the raw inputs, so all of its columns are used.
            if idx > 0:
                old_cols = slice(0, initial_capacity[idx - 1])
                new_cols = slice(initial_capacity[idx - 1], None)
            else:
                old_cols = new_cols = slice(None)

            common_weight[idx].append(_mean_abs_diff(trained.weight[old_rows, old_cols], reference.weight[old_rows, old_cols]))
            new_weight[idx].append(_mean_abs_diff(trained.weight[new_rows, new_cols], reference.weight[new_rows, new_cols]))
            total_weight[idx].append(_mean_abs_diff(trained.weight, reference.weight))

            common_bias[idx].append(_mean_abs_diff(trained.bias[old_rows], reference.bias[old_rows]))
            new_bias[idx].append(_mean_abs_diff(trained.bias[new_rows], reference.bias[new_rows]))
            total_bias[idx].append(_mean_abs_diff(trained.bias, reference.bias))

    for idx in range(n_layers):
        print('Layer ', idx)
        print('Common Weights ', np.average(np.array(common_weight[idx])))
        print('New Weights', np.average(np.array(new_weight[idx])))
        print('Total Weights', np.average(np.array(total_weight[idx])))
        print('Common Bias', np.average(np.array(common_bias[idx])))
        print('New Bias', np.average(np.array(new_bias[idx])))
        print('Total Bias', np.average(np.array(total_bias[idx])))
        
# Run 1: network built at [4, 2] and never grown (capacity=None).
train_or = True
non_linearity = F.elu
capacity = None
initial_capacity = [4, 2]
weight_change(initial_capacity, train_or, capacity, non_linearity)

print('----')

# Run 2: network built at [1, 1] and grown with capacity [3, 1] between the
# two training phases (presumably ending at the same [4, 2] size - verify
# against DQN.increase_capacity).
initial_capacity, capacity = [1, 1], [3, 1]
weight_change(initial_capacity, train_or, capacity, non_linearity)

tensor([-0.5850, -0.1173, -0.4558, -0.0853], device='cuda:0',
       grad_fn=<SliceBackward>) tensor([], device='cuda:0', grad_fn=<SliceBackward>)
tensor([ 0.1066, -0.0613], device='cuda:0', grad_fn=<SliceBackward>) tensor([], device='cuda:0', grad_fn=<SliceBackward>)
tensor([-0.0406, -0.4383,  0.2101, -0.2855], device='cuda:0',
       grad_fn=<SliceBackward>) tensor([], device='cuda:0', grad_fn=<SliceBackward>)
tensor([0.0420, 0.2337], device='cuda:0', grad_fn=<SliceBackward>) tensor([], device='cuda:0', grad_fn=<SliceBackward>)
Layer  0
Common Weights  0.15690992027521133
New Weights nan
Total Weights 0.15690992027521133
Common Bias 0.13067696243524551
New Bias 0.13067696243524551
Total Bias 0.13067696243524551
Layer  1
Common Weights  0.07105209305882454
New Weights nan
Total Weights 0.07105209305882454
Common Bias 0.01578469481319189
New Bias 0.01578469481319189
Total Bias 0.01578469481319189
Layer  2
Common Weights  0.07519078254699707
New Weights nan
Total Weights 0.0751907825469