In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch import optim
import pickle
from matplotlib import pyplot as plt
from libs_unet.models import unet_002
from libs_unet.training.libs_train import train_loop, test_loop
from pathlib import Path
#from torch.utils.tensorboard import SummaryWriter
import datetime

top_dir = Path.cwd()
datapath = top_dir / 'data'


In [4]:
#Leverage PyTorch native Dataset and DataLoader 
#Define Train/Test sets from 20 element data samples
with open(datapath / 'training/10k_nomods.pickle', 'rb') as f:
    fracs = pickle.load(f)
    wave = pickle.load(f)
    x_data = pickle.load(f)
    y_data = pickle.load(f)

#create dataset
#input needs a placeholder "channel" dimension since single channel
#learned labels already has max_z + 2 channels from spec_array
#data has to match weights which default to float() so cast data as same
scale_factor = 1
x_data = torch.tensor(x_data[:,None,:].astype('float32'))
y_data = torch.tensor(y_data.astype('float32'))
spec_ds = TensorDataset(scale_factor * x_data, scale_factor * y_data)
#batch sizes
train_bs = 50
test_bs = 100
#create random split for training and validation
train_len = int(0.8 * len(x_data))
test_len = len(x_data) - train_len
train_ds, test_ds = random_split(spec_ds,[train_len, test_len])
train_dl = DataLoader(train_ds, batch_size=train_bs) #took out , shuffle=True for repeatability
test_dl = DataLoader(test_ds, batch_size=test_bs)
#

In [5]:
#set parameters
el_count = 20 #first n elements used to construct model
wl_points = 760 #number of wavelength point measurements in data
learning_rate = 1

#Initialize and run
model = unet_002.LIBSUNet(el_count, wl_points)
loss_fn = nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#code from training module
model.train()
#see initial weights and bias norms by node as-initialized
init_wts = {}
        
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param)}") #all non-zero
    init_wts[name] = param.clone()




down_conv_1.double_conv.0.weight,2.822108268737793
down_conv_1.double_conv.0.bias,1.0788921117782593
down_conv_1.double_conv.2.weight,2.6998066902160645
down_conv_1.double_conv.2.bias,0.24507422745227814
down_conv_2.double_conv.0.weight,3.8295814990997314
down_conv_2.double_conv.0.bias,0.3457043468952179
down_conv_2.double_conv.2.weight,3.7998058795928955
down_conv_2.double_conv.2.bias,0.2413749098777771
down_conv_3.double_conv.0.weight,5.404940128326416
down_conv_3.double_conv.0.bias,0.3505250811576843
down_conv_3.double_conv.2.weight,5.414536952972412
down_conv_3.double_conv.2.bias,0.23484382033348083
down_conv_4.double_conv.0.weight,7.658783435821533
down_conv_4.double_conv.0.bias,0.3364388346672058
down_conv_4.double_conv.2.weight,7.661884784698486
down_conv_4.double_conv.2.bias,0.2388111799955368
down_conv_5.double_conv.0.weight,10.830170631408691
down_conv_5.double_conv.0.bias,0.32977917790412903
down_conv_5.double_conv.2.weight,10.827914237976074
down_conv_5.double_conv.2.bias,0

In [6]:
#what is form or init_wts?
for k,v in init_wts.items():
    print(f"{k}:{type(k)}{v}:{type(v)}")
    break

down_conv_1.double_conv.0.weight:<class 'str'>tensor([[[-0.2845,  0.1972,  0.3026,  0.3516, -0.1351,  0.1973]],

        [[ 0.2245, -0.2457,  0.0517, -0.2012, -0.1708, -0.3756]],

        [[ 0.2991,  0.1787, -0.3559, -0.1404,  0.0261, -0.2756]],

        [[ 0.2749,  0.3289,  0.0658, -0.2488, -0.1459, -0.1965]],

        [[ 0.2255,  0.0286, -0.3641,  0.4031, -0.3786, -0.3634]],

        [[-0.3774,  0.0883,  0.1110,  0.0098,  0.1824, -0.2406]],

        [[ 0.2761, -0.2463, -0.3492,  0.3626, -0.2095,  0.2497]],

        [[ 0.1571, -0.1091,  0.0607, -0.2528,  0.3500, -0.2726]],

        [[ 0.2259, -0.3294,  0.3359, -0.0065,  0.3584,  0.3548]],

        [[ 0.1773, -0.0317,  0.0178, -0.0389,  0.2734, -0.3512]],

        [[ 0.3319, -0.3192, -0.1377, -0.1791, -0.3019,  0.2199]],

        [[ 0.3167, -0.0644,  0.0087, -0.1485, -0.0848, -0.3805]],

        [[ 0.2254, -0.2153, -0.4054,  0.1433,  0.0954, -0.3012]],

        [[-0.3293, -0.3224,  0.4067,  0.2663, -0.1218,  0.3249]],

        [[ 0.250

In [7]:
#now predict / train on first batch from data loader and check gradients
for batch, (X, y) in enumerate(train_dl):
    if batch == 0: #just process first batch
        pred = model(X)
        loss = loss_fn(pred, y)
    else:
        break #exit once past first batch

# clear gradients before back prop
optimizer.zero_grad()
#check if gradients still None or explicit zeros
for name, param in model.named_parameters():
    print(f"{name},{param.grad}") #All None

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (846) must match the size of tensor b (760) at non-singleton dimension 2

In [None]:
#run back-prop and see that gradients are calculated for each node
#note I don't think that state_dict() returns gradients
loss.backward()
for name, param in model.named_parameters():
    print(f"{name},{param.grad}") #we see non-zero and zero gradient tensors through the graph

down_conv_1.0.weight,tensor([[[-1.1317e-08, -1.2171e-08, -1.2392e-08, -1.2194e-08, -1.1554e-08,
          -1.0587e-08]],

        [[-2.1554e-08, -2.1233e-08, -2.0306e-08, -1.8757e-08, -1.7136e-08,
          -1.5872e-08]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00]],

        [[ 5.6630e-10,  8.4667e-10,  8.5680e-10,  3.6165e-10, -9.1834e-11,
          -1.7314e-10]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e

In [None]:
#check the L2 norm for a compact representation of zero/non-zero gradients at each node
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param.grad)}") #all non-zero

down_conv_1.0.weight,1.2865262988270842e-07
down_conv_1.0.bias,6.341264816001058e-05
down_conv_1.2.weight,0.00036249522236175835
down_conv_1.2.bias,0.00029606197495013475
down_conv_2.0.weight,1.3831452179147163e-06
down_conv_2.0.bias,2.9826285299350275e-06
down_conv_2.2.weight,6.713265520374989e-06
down_conv_2.2.bias,9.87112525763223e-06
down_conv_3.0.weight,1.1186593695811098e-07
down_conv_3.0.bias,2.465852730892948e-07
down_conv_3.2.weight,4.092389644938521e-07
down_conv_3.2.bias,6.100741529735387e-07
down_conv_4.0.weight,9.074144102783066e-09
down_conv_4.0.bias,1.6538084679496023e-08
down_conv_4.2.weight,2.225107920139635e-08
down_conv_4.2.bias,3.447769358899677e-08
down_conv_5.0.weight,3.2552520679018926e-09
down_conv_5.0.bias,6.278348507748888e-09
down_conv_5.2.weight,9.457665761658518e-09
down_conv_5.2.bias,1.4888200006168972e-08
up_trans_1.weight,6.695778598242441e-09
up_trans_1.bias,4.61019062925061e-08
up_conv_1.0.weight,1.2784754233052809e-07
up_conv_1.0.bias,1.11049729412116

In [None]:
#step the optimizer and see whether L2 norm of weights/biases change
optimizer.step() #leverages tensor gradients from backward() to update weights
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param)}") #all non-zero, note norm of first node changes so weights update

down_conv_1.0.weight,2.7115249633789062
down_conv_1.0.bias,1.162429928779602
down_conv_1.2.weight,2.6694557666778564
down_conv_1.2.bias,0.222115159034729
down_conv_2.0.weight,3.8454744815826416
down_conv_2.0.bias,0.3513379991054535
down_conv_2.2.weight,3.804018974304199
down_conv_2.2.bias,0.22764228284358978
down_conv_3.0.weight,5.422050476074219
down_conv_3.0.bias,0.31513625383377075
down_conv_3.2.weight,5.412996768951416
down_conv_3.2.bias,0.2484075129032135
down_conv_4.0.weight,7.650272846221924
down_conv_4.0.bias,0.3204376697540283
down_conv_4.2.weight,7.660725116729736
down_conv_4.2.bias,0.2307216078042984
down_conv_5.0.weight,10.84073257446289
down_conv_5.0.bias,0.3296024799346924
down_conv_5.2.weight,10.83821964263916
down_conv_5.2.bias,0.24198462069034576
up_trans_1.weight,10.826815605163574
up_trans_1.bias,0.4167179763317108
up_conv_1.0.weight,7.6619486808776855
up_conv_1.0.bias,0.158700093626976
up_conv_1.2.weight,7.658204555511475
up_conv_1.2.bias,0.22891490161418915
up_tran

In [None]:
#Unrelated type checks for logging
new_dict = {}
for k, v in model.named_parameters():
    new_dict[k] = { 'wt':v, 'grad':v.grad }
#now compare weights to those in init_dict
#diff_dict = {}
for k,v in new_dict.items():
    #diff_dict[k] = v['wt'] - init_wts[k]
    print(f"{k}:init0:{init_wts[k][0]} new0:{new_dict[k]['wt'][0]}")

down_conv_1.0.weight:init0:tensor([[ 0.3751,  0.2476, -0.1833,  0.0470, -0.0355, -0.0275]],
       grad_fn=<SelectBackward0>) new0:tensor([[ 0.3751,  0.2476, -0.1833,  0.0470, -0.0355, -0.0275]],
       grad_fn=<SelectBackward0>)
down_conv_1.0.bias:init0:0.1567375659942627 new0:0.15674932301044464
down_conv_1.2.weight:init0:tensor([[ 0.0010,  0.0348, -0.0657, -0.0616, -0.0540,  0.0673],
        [ 0.0771, -0.0391, -0.0870, -0.0329,  0.0594, -0.0629],
        [ 0.0467,  0.0122,  0.0285,  0.0035,  0.0377,  0.0375],
        [-0.0481,  0.0264,  0.0205,  0.0328,  0.0254,  0.0148],
        [ 0.0252, -0.0148, -0.0781, -0.0302, -0.0052,  0.0473],
        [-0.0005, -0.0742,  0.0504, -0.0228, -0.0693,  0.0368],
        [-0.0335,  0.0377,  0.0470,  0.0723, -0.0857, -0.0393],
        [-0.0768, -0.0235, -0.0334, -0.0755,  0.0054, -0.0441],
        [-0.0605, -0.0337,  0.0643, -0.0650,  0.0115, -0.0013],
        [-0.0564, -0.0341,  0.0655,  0.0571,  0.0204,  0.0069],
        [ 0.0145,  0.0433,  0.0583

In [None]:
#Now process the second batch and see if we are getting updates to everything
for batch, (X, y) in enumerate(train_dl):
    if batch == 0: #skip the batch we already processed
        continue
    if batch == 1: #just process first batch
        pred = model(X)
        loss = loss_fn(pred, y)
    else:
        break #exit once past second batch

In [None]:
# Clear gradients
optimizer.zero_grad()
#check if gradients still None or explicit zeros
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param.grad)}") #All zeros

down_conv_1.0.weight,0.0
down_conv_1.0.bias,0.0
down_conv_1.2.weight,0.0
down_conv_1.2.bias,0.0
down_conv_2.0.weight,0.0
down_conv_2.0.bias,0.0
down_conv_2.2.weight,0.0
down_conv_2.2.bias,0.0
down_conv_3.0.weight,0.0
down_conv_3.0.bias,0.0
down_conv_3.2.weight,0.0
down_conv_3.2.bias,0.0
down_conv_4.0.weight,0.0
down_conv_4.0.bias,0.0
down_conv_4.2.weight,0.0
down_conv_4.2.bias,0.0
down_conv_5.0.weight,0.0
down_conv_5.0.bias,0.0
down_conv_5.2.weight,0.0
down_conv_5.2.bias,0.0
up_trans_1.weight,0.0
up_trans_1.bias,0.0
up_conv_1.0.weight,0.0
up_conv_1.0.bias,0.0
up_conv_1.2.weight,0.0
up_conv_1.2.bias,0.0
up_trans_2.weight,0.0
up_trans_2.bias,0.0
up_conv_2.0.weight,0.0
up_conv_2.0.bias,0.0
up_conv_2.2.weight,0.0
up_conv_2.2.bias,0.0
up_trans_3.weight,0.0
up_trans_3.bias,0.0
up_conv_3.0.weight,0.0
up_conv_3.0.bias,0.0
up_conv_3.2.weight,0.0
up_conv_3.2.bias,0.0
up_trans_4.weight,0.0
up_trans_4.bias,0.0
up_conv_4.0.weight,0.0
up_conv_4.0.bias,0.0
up_conv_4.2.weight,0.0
up_conv_4.2.bias,0.0


In [None]:
loss.backward()
#check the L2 norm and if different than before
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param.grad)}") #all non-zero

down_conv_1.0.weight,1.5333885983181972e-07
down_conv_1.0.bias,6.083563494030386e-05
down_conv_1.2.weight,0.0003466005437076092
down_conv_1.2.bias,0.00028313646907918155
down_conv_2.0.weight,1.31531317038025e-06
down_conv_2.0.bias,2.838158479789854e-06
down_conv_2.2.weight,6.243294137675548e-06
down_conv_2.2.bias,9.186105671687983e-06
down_conv_3.0.weight,1.0400877670235786e-07
down_conv_3.0.bias,2.2931989462904312e-07
down_conv_3.2.weight,3.824099508165091e-07
down_conv_3.2.bias,5.700066481040267e-07
down_conv_4.0.weight,8.535766760076058e-09
down_conv_4.0.bias,1.5556713250930443e-08
down_conv_4.2.weight,2.1027526386774298e-08
down_conv_4.2.bias,3.258200464983929e-08
down_conv_5.0.weight,3.0532845141095777e-09
down_conv_5.0.bias,5.8888027787418196e-09
down_conv_5.2.weight,8.890930658367324e-09
down_conv_5.2.bias,1.399598392737289e-08
up_trans_1.weight,6.338749081180595e-09
up_trans_1.bias,4.353224980491177e-08
up_conv_1.0.weight,1.2107383895454404e-07
up_conv_1.0.bias,1.05161646501983

In [None]:
#step the optimizer and see whether L2 norm of weights/biases change
optimizer.step() #leverages tensor gradients from backward() to update weights
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param)}") #all non-zero, note norm of first node changes so weights update

down_conv_1.0.weight,2.7115249633789062
down_conv_1.0.bias,1.1624234914779663
down_conv_1.2.weight,2.6694529056549072
down_conv_1.2.bias,0.222116619348526
down_conv_2.0.weight,3.8454744815826416
down_conv_2.0.bias,0.3513377606868744
down_conv_2.2.weight,3.8040192127227783
down_conv_2.2.bias,0.22764413058757782
down_conv_3.0.weight,5.422050476074219
down_conv_3.0.bias,0.3151363134384155
down_conv_3.2.weight,5.412996768951416
down_conv_3.2.bias,0.24840745329856873
down_conv_4.0.weight,7.650272846221924
down_conv_4.0.bias,0.3204376995563507
down_conv_4.2.weight,7.660725116729736
down_conv_4.2.bias,0.2307216078042984
down_conv_5.0.weight,10.84073257446289
down_conv_5.0.bias,0.3296024799346924
down_conv_5.2.weight,10.83821964263916
down_conv_5.2.bias,0.24198460578918457
up_trans_1.weight,10.826815605163574
up_trans_1.bias,0.4167180061340332
up_conv_1.0.weight,7.6619486808776855
up_conv_1.0.bias,0.15870007872581482
up_conv_1.2.weight,7.658204555511475
up_conv_1.2.bias,0.22891494631767273
up_