In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch import optim
import pickle
from matplotlib import pyplot as plt
from libs_unet.models import unet_001
from libs_unet.training.libs_train import train_loop, test_loop
from pathlib import Path
#from torch.utils.tensorboard import SummaryWriter
import datetime

top_dir = Path.cwd()
datapath = top_dir / 'data'


In [2]:
#Leverage PyTorch native Dataset and DataLoader 
#Define Train/Test sets from 20 element data samples
with open(datapath / 'training/10k_nomods.pickle', 'rb') as f:
    fracs = pickle.load(f)
    wave = pickle.load(f)
    x_data = pickle.load(f)
    y_data = pickle.load(f)

#create dataset
#input needs a placeholder "channel" dimension since single channel
#learned labels already has max_z + 2 channels from spec_array
#data has to match weights which default to float() so cast data as same
scale_factor = 1
x_data = torch.tensor(x_data[:,None,:].astype('float32'))
y_data = torch.tensor(y_data.astype('float32'))
spec_ds = TensorDataset(scale_factor * x_data, scale_factor * y_data)
#batch sizes
train_bs = 50
test_bs = 100
#create random split for training and validation
train_len = int(0.8 * len(x_data))
test_len = len(x_data) - train_len
train_ds, test_ds = random_split(spec_ds,[train_len, test_len])
train_dl = DataLoader(train_ds, batch_size=train_bs) #took out , shuffle=True for repeatability
test_dl = DataLoader(test_ds, batch_size=test_bs)
#

In [10]:
#set parameters
el_count = 20 #first n elements used to construct model
wl_points = 760 #number of wavelength point measurements in data
learning_rate = 1

#Initialize and run
model = unet_001.LIBSUNet(el_count, wl_points)
loss_fn = nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#code from training module
model.train()
#see initial weights and bias norms by node as-initialized
init_wts = {}
        
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param)}") #all non-zero
    init_wts[name] = param.clone()




down_conv_1.0.weight,2.818425178527832
down_conv_1.0.bias,1.2078570127487183
down_conv_1.2.weight,2.7074482440948486
down_conv_1.2.bias,0.23369815945625305
down_conv_2.0.weight,3.8313000202178955
down_conv_2.0.bias,0.31021618843078613
down_conv_2.2.weight,3.8383705615997314
down_conv_2.2.bias,0.23778246343135834
down_conv_3.0.weight,5.397403717041016
down_conv_3.0.bias,0.31789326667785645
down_conv_3.2.weight,5.414884567260742
down_conv_3.2.bias,0.235641211271286
down_conv_4.0.weight,7.657407760620117
down_conv_4.0.bias,0.32042866945266724
down_conv_4.2.weight,7.671154499053955
down_conv_4.2.bias,0.22540859878063202
down_conv_5.0.weight,10.832119941711426
down_conv_5.0.bias,0.34078899025917053
down_conv_5.2.weight,10.823777198791504
down_conv_5.2.bias,0.23834553360939026
up_trans_1.weight,10.829153060913086
up_trans_1.bias,0.406354695558548
up_conv_1.0.weight,7.6648664474487305
up_conv_1.0.bias,0.1627163290977478
up_conv_1.2.weight,7.645699977874756
up_conv_1.2.bias,0.23467347025871277

In [11]:
#what is form or init_wts?
for k,v in init_wts.items():
    print(f"{k}:{type(k)}{v}:{type(v)}")
    break

down_conv_1.0.weight:<class 'str'>tensor([[[-0.1136, -0.3242, -0.2296,  0.4058, -0.3774, -0.1755]],

        [[-0.3905,  0.2933, -0.2489, -0.3260,  0.2927,  0.2666]],

        [[ 0.4056,  0.2062, -0.3536,  0.0698, -0.1295,  0.2633]],

        [[ 0.0154,  0.1361, -0.3543,  0.0933, -0.1124,  0.3607]],

        [[ 0.0958, -0.2954,  0.1655,  0.3518,  0.3487,  0.2622]],

        [[ 0.1868, -0.1732, -0.4078, -0.2631,  0.2743, -0.2653]],

        [[ 0.1401,  0.2369, -0.2446, -0.1042,  0.2050,  0.1150]],

        [[-0.1284, -0.1603, -0.3410,  0.2337, -0.0380, -0.0131]],

        [[ 0.3780, -0.2316, -0.1529,  0.1582, -0.2055,  0.1811]],

        [[ 0.2529, -0.1692,  0.2499, -0.2820,  0.0627,  0.1665]],

        [[ 0.3863, -0.3666,  0.2084, -0.1522, -0.3035, -0.1462]],

        [[-0.2369,  0.3003, -0.2040,  0.1268,  0.3002,  0.3776]],

        [[ 0.0374, -0.3468,  0.3027,  0.0029,  0.0484,  0.3277]],

        [[-0.4013, -0.1299,  0.2438, -0.1942, -0.2975,  0.3469]],

        [[-0.1551, -0.0938, 

In [12]:
#now predict / train on first batch from data loader and check gradients
for batch, (X, y) in enumerate(train_dl):
    if batch == 0: #just process first batch
        pred = model(X)
        loss = loss_fn(pred, y)
    else:
        break #exit once past first batch

# clear gradients before back prop
optimizer.zero_grad()
#check if gradients still None or explicit zeros
for name, param in model.named_parameters():
    print(f"{name},{param.grad}") #All None

down_conv_1.0.weight,None
down_conv_1.0.bias,None
down_conv_1.2.weight,None
down_conv_1.2.bias,None
down_conv_2.0.weight,None
down_conv_2.0.bias,None
down_conv_2.2.weight,None
down_conv_2.2.bias,None
down_conv_3.0.weight,None
down_conv_3.0.bias,None
down_conv_3.2.weight,None
down_conv_3.2.bias,None
down_conv_4.0.weight,None
down_conv_4.0.bias,None
down_conv_4.2.weight,None
down_conv_4.2.bias,None
down_conv_5.0.weight,None
down_conv_5.0.bias,None
down_conv_5.2.weight,None
down_conv_5.2.bias,None
up_trans_1.weight,None
up_trans_1.bias,None
up_conv_1.0.weight,None
up_conv_1.0.bias,None
up_conv_1.2.weight,None
up_conv_1.2.bias,None
up_trans_2.weight,None
up_trans_2.bias,None
up_conv_2.0.weight,None
up_conv_2.0.bias,None
up_conv_2.2.weight,None
up_conv_2.2.bias,None
up_trans_3.weight,None
up_trans_3.bias,None
up_conv_3.0.weight,None
up_conv_3.0.bias,None
up_conv_3.2.weight,None
up_conv_3.2.bias,None
up_trans_4.weight,None
up_trans_4.bias,None
up_conv_4.0.weight,None
up_conv_4.0.bias,None
up

In [13]:
#run back-prop and see that gradients are calculated for each node
#note I don't think that state_dict() returns gradients
loss.backward()
for name, param in model.named_parameters():
    print(f"{name},{param.grad}") #we see non-zero and zero gradient tensors through the graph

down_conv_1.0.weight,tensor([[[ 4.7137e-08,  4.6298e-08,  4.6492e-08,  4.7106e-08,  4.7825e-08,
           4.8312e-08]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00]],

        [[-1.7114e-08, -1.7306e-08, -1.6963e-08, -1.6897e-08, -1.6941e-08,
          -1.6582e-08]],

        [[-8.8362e-09, -8.8989e-09, -7.6068e-09, -5.7205e-09, -4.7699e-09,
          -5.1922e-09]],

        [[ 5.8551e-08,  5.7902e-08,  5.8012e-08,  5.8727e-08,  5.9593e-08,
           6.0297e-08]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00]],

        [[ 1.7910e-08,  1.9392e-08,  2.0346e-08,  2.0476e-08,  1.9362e-08,
           1.7412e-08]],

        [[ 2.6970e-08,  2.8669e-08,  3.0180e-08,  3.1088e-08,  3.1463e-08,
           3.1387e-08]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e

In [14]:
#check the L2 norm for a compact representation of zero/non-zero gradients at each node
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param.grad)}") #all non-zero

down_conv_1.0.weight,3.6796637914449093e-07
down_conv_1.0.bias,0.00023893079196568578
down_conv_1.2.weight,0.0007881526253186166
down_conv_1.2.bias,0.0005006043356843293
down_conv_2.0.weight,5.014594535168726e-06
down_conv_2.0.bias,5.483371296577388e-06
down_conv_2.2.weight,1.0621945875755046e-05
down_conv_2.2.bias,1.692823934718035e-05
down_conv_3.0.weight,1.2026404760945297e-07
down_conv_3.0.bias,2.518645771942829e-07
down_conv_3.2.weight,4.383779241834418e-07
down_conv_3.2.bias,6.997294121902087e-07
down_conv_4.0.weight,6.005894892524566e-09
down_conv_4.0.bias,1.1346652506460941e-08
down_conv_4.2.weight,1.812578886983829e-08
down_conv_4.2.bias,2.921689912227521e-08
down_conv_5.0.weight,2.6678863562779043e-09
down_conv_5.0.bias,6.113255679451868e-09
down_conv_5.2.weight,9.052015137456237e-09
down_conv_5.2.bias,1.479316313890422e-08
up_trans_1.weight,6.434928145893082e-09
up_trans_1.bias,4.512767759479175e-08
up_conv_1.0.weight,1.1854094594809794e-07
up_conv_1.0.bias,1.077131841498157

In [15]:
#step the optimizer and see whether L2 norm of weights/biases change
optimizer.step() #leverages tensor gradients from backward() to update weights
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param)}") #all non-zero, note norm of first node changes so weights update

down_conv_1.0.weight,2.818425178527832
down_conv_1.0.bias,1.2078545093536377
down_conv_1.2.weight,2.7074472904205322
down_conv_1.2.bias,0.23363976180553436
down_conv_2.0.weight,3.8313000202178955
down_conv_2.0.bias,0.3102169930934906
down_conv_2.2.weight,3.8383708000183105
down_conv_2.2.bias,0.23777882754802704
down_conv_3.0.weight,5.397403717041016
down_conv_3.0.bias,0.31789320707321167
down_conv_3.2.weight,5.414884567260742
down_conv_3.2.bias,0.235641211271286
down_conv_4.0.weight,7.657407760620117
down_conv_4.0.bias,0.32042866945266724
down_conv_4.2.weight,7.671154499053955
down_conv_4.2.bias,0.22540858387947083
down_conv_5.0.weight,10.832119941711426
down_conv_5.0.bias,0.34078899025917053
down_conv_5.2.weight,10.823777198791504
down_conv_5.2.bias,0.23834553360939026
up_trans_1.weight,10.829153060913086
up_trans_1.bias,0.406354695558548
up_conv_1.0.weight,7.6648664474487305
up_conv_1.0.bias,0.162716343998909
up_conv_1.2.weight,7.645699977874756
up_conv_1.2.bias,0.23467350006103516
u

In [16]:
#Unrelated type checks for logging
new_dict = {}
for k, v in model.named_parameters():
    new_dict[k] = { 'wt':v, 'grad':v.grad }
#now compare weights to those in init_dict
#diff_dict = {}
for k,v in new_dict.items():
    #diff_dict[k] = v['wt'] - init_wts[k]
    print(f"{k}:init0:{init_wts[k][0]} new0:{new_dict[k]['wt'][0]}")

down_conv_1.0.weight:init0:tensor([[-0.1136, -0.3242, -0.2296,  0.4058, -0.3774, -0.1755]],
       grad_fn=<SelectBackward0>) new0:tensor([[-0.1136, -0.3242, -0.2296,  0.4058, -0.3774, -0.1755]],
       grad_fn=<SelectBackward0>)
down_conv_1.0.bias:init0:0.05188566446304321 new0:0.05181257054209709
down_conv_1.2.weight:init0:tensor([[-0.0285,  0.0096, -0.0018,  0.0631,  0.0351,  0.0153],
        [ 0.0493,  0.0759,  0.0414,  0.0371,  0.0558, -0.0241],
        [-0.0815,  0.0835, -0.0510,  0.0551,  0.0605,  0.0774],
        [-0.0215, -0.0659,  0.0155, -0.0737,  0.0280, -0.0120],
        [-0.0081, -0.0821, -0.0070,  0.0162,  0.0406,  0.0441],
        [ 0.0365, -0.0474, -0.0597,  0.0860, -0.0820,  0.0650],
        [ 0.0035, -0.0721,  0.0521, -0.0008,  0.0734,  0.0482],
        [ 0.0188,  0.0297,  0.0685, -0.0132,  0.0026,  0.0146],
        [ 0.0246, -0.0153,  0.0041, -0.0352,  0.0199,  0.0700],
        [-0.0404,  0.0522, -0.0175,  0.0008,  0.0347,  0.0467],
        [ 0.0451, -0.0820,  0.034

In [None]:
#Now process the second batch and see if we are getting updates to everything
for batch, (X, y) in enumerate(train_dl):
    if batch == 0: #skip the batch we already processed
        continue
    if batch == 1: #just process first batch
        pred = model(X)
        loss = loss_fn(pred, y)
    else:
        break #exit once past second batch

In [None]:
# Clear gradients
optimizer.zero_grad()
#check if gradients still None or explicit zeros
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param.grad)}") #All zeros

In [None]:
loss.backward()
#check the L2 norm and if different than before
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param.grad)}") #all non-zero

In [None]:
#step the optimizer and see whether L2 norm of weights/biases change
optimizer.step() #leverages tensor gradients from backward() to update weights
for name, param in model.named_parameters():
    print(f"{name},{torch.linalg.vector_norm(param)}") #all non-zero, note norm of first node changes so weights update