# **run load_data.ipynb BEFORE running this!**

In [1]:
import pickle
import os.path
import numpy as np
PATH = 'model_dnn/'
LOG = "log_dnn"
data_file_name = 'data'

with open(data_file_name + '.pickle', 'rb') as handle:
    spectrum, temperature = pickle.load(handle)
    print(f"read data from {data_file_name}.pickle")
print(f"shape of spectrum data: {spectrum.shape}")
print(f"shape of temperature data: {temperature.shape}")
print()
print(f"there are {temperature.shape[0]} spectrums")
print(f"each spectrum is {spectrum.shape[1]} long, which is number of features")

read data from data.pickle
shape of spectrum data: (6000, 10000)
shape of temperature data: (6000, 1)

there are 6000 spectrums
each spectrum is 10000 long, which is number of features


In [2]:
indices_file_name = 'indices'
with open(indices_file_name + '.pickle', 'rb') as handle:
    train_indices, validate_indices, test_indices = pickle.load(handle)
    print(f"got indices from {indices_file_name}.pickle")  
print()
print(f"sets of training indices: {len(train_indices)}")
print(f"number of training indices per set: {len(train_indices[0])}")
print(f"sets of validating indices: {len(validate_indices)}")
print(f"number of validating indices per set: {len(validate_indices[0])}")
print(f"number of testing indices: {len(test_indices)}")
print()
input_dimension = spectrum.shape[1]
print(f"input dimension is:   {input_dimension}")
number_of_samples = spectrum.shape[0]
print(f"number of samples is: {number_of_samples}")
output_dimension = temperature.shape[1]
print(f"output dimension is:  {output_dimension}")

got indices from indices.pickle

sets of training indices: 16
number of training indices per set: 4200
sets of validating indices: 16
number of validating indices per set: 600
number of testing indices: 1200

input dimension is:   10000
number of samples is: 6000
output dimension is:  1


In [3]:
# how many parameters?
#   "a very simple two-layer ReLU network with p = 2n + d parameters 
#   that can express any labeling of any sample of size n in d dimensions
#   https://arxiv.org/pdf/1611.03530.pdf
#   https://stats.stackexchange.com/questions/320383/relationship-between-
#   model-over-fitting-and-number-of-parameters/320387#320387
import torch
import torch.nn as nn
class Model(torch.nn.Module):
    def __init__(self, device, input_dim=input_dimension):
        super().__init__()
        self.relu  = nn.ReLU()
        self.hidden_dim = 32
        self.output_dim = output_dimension
        self.sequential = torch.nn.Sequential(
            torch.nn.Linear(input_dim, self.hidden_dim),
            self.relu,
            # torch.nn.Dropout(0.2),
            torch.nn.Linear(self.hidden_dim, self.hidden_dim),
            self.relu,
            # torch.nn.Dropout(0.2),
            torch.nn.Linear(self.hidden_dim, self.output_dim)
        )
        self.device = device
        self.to(device)
    def forward(self, x):
        y = self.sequential(x)
        # change: remove sigmoid, use y result as is
        # y = torch.sigmoid(y)
        return y
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
import torch
import torch.nn as nn
# change: input dim = 1000, output dim = 1 (temperature value)
class SiameseModel(torch.nn.Module):
    def __init__(self, device, input_dim=input_dimension):
        super().__init__()
        self.relu  = nn.ReLU()
        self.hidden_dim = 500
        self.linear1 = torch.nn.Linear(input_dim, self.hidden_dim)
        self.linear2 = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
        self.linear3 = torch.nn.Linear(self.hidden_dim, 1)
        self.device = device
        self.to(device)
    def forward_siamese(self, x):
        y = self.linear3(self.relu(self.linear2(self.relu(self.linear1(x)))))
        return y
    def forward(self, input):
        # print(f"SiameseModel: input shape: {input.shape}")
        # in training, input shape is: 
        #   [batch size, number of inputs * input dim 1, input dim 2]
        #   e.g. [32, 2, 1000]
        # in actually using, input shape would be:
        #   [input dim 2] (in our example, input is 1*1000)
        # in classification, we don't need the network used to generated embedding
        #   we just need the final result, which is the network's 
        #   output through contrastive loss network
        # here, we DO need the output, which is just one number!
        if(input.dim() == 3): # <- training
            # input is tensor of [x1, x2]
            x1 = input[:,0,:] 
            x2 = input[:,1,:]
            # print(f"SiameseModel: ")
            # print(f"x1 = {x1}, shape{x1.shape}")
            # print(f"x2 = {x2}, shape{x2.shape}")
            y1 = self.forward_siamese(x1)
            y2 = self.forward_siamese(x2)
            # print(f"SiameseModel: y1 = {y1}, y2 = {y2}")
            # change: remove sigmoid, use y result as is
            # y = torch.sigmoid(y)
            difference = y1 - y2 # <- order matter, no absolute value!
            return difference
        else: # <- actually using: 
            # input is just x
            y = self.forward_siamese(input)
            return y

In [15]:
import random
# reference:
# https://colab.research.google.com/github/maticvl/dataHacker/
# blob/master/pyTorch/014_siameseNetwork.ipynb#scrollTo=gD1BFFm_z7aj
class SiameseDataset(torch.utils.data.TensorDataset):
    def __init__(self, X, y):
        # subset_X_IDs: e.g. only use mth to nth examples to train
        self.X, self.y = X, y
        self.indices = range(len(y))
    # length: number of elements in subset of X_IDs
    def __len__(self):
        return len(self.y)
    # get item by index in [m, n]
    def __getitem__(self, index): # assume index lies within subset_X_IDs
        # property: randomly sample two inputs,
        # return: the two inputs, 
        #   and binary label for whethery they have same class
        input_1_id = index
        input_1 = self.X[input_1_id]
        label_1 = self.y[input_1_id]
        input_2_id = random.choice(self.indices)
        input_2 = self.X[input_2_id]
        label_2 = self.y[input_2_id]
        difference = label_1 - label_2 # order matter!
        # turn into array BEFORE turning into tensor?
        #   going from list to tensor is slow
        #   going from array to tensor is fast
        # in this implementation, input (see initialization) already on cuda device
        # turning into array actually needs moving into cpu first
        # so just do: list of tensor -> tensor
        #   which is torch.stack
        return torch.stack([input_1, input_2]), difference

In [16]:
from torch.utils.data import DataLoader
from torch import optim
import numpy as np

loss_function = nn.MSELoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CalculateMSE():
    def __init__(self, net, n_epochs, batch_size):
        super().__init__()
        self.net = net
        #initialize some constants
        self.batch_size = 32
        self.learning_rate = 1e-4
        self.n_epochs = n_epochs
        self.net.apply(self.weights_init)   
    def weights_init(self,layer):
        if type(layer) == nn.Linear:
            nn.init.orthogonal_(layer.weight)
    def get_mse(self,train_data, train_label, test_data, test_label):
        train_set = torch.utils.data.TensorDataset(
            torch.Tensor(train_data).to(device), 
            torch.Tensor(train_label).to(device))
        val_set = torch.utils.data.TensorDataset(
            torch.Tensor(test_data).to(device), 
            torch.Tensor(test_label).to(device))
        loader_args = dict(batch_size=self.batch_size)
        train_loader = DataLoader(train_set, shuffle=True, drop_last=True, **loader_args)
        val_loader = DataLoader(val_set, shuffle=True, drop_last=True, **loader_args)
        train_loss = []
        validate_loss = []
        criterion = loss_function
        optimizer = optim.Adam(self.net.parameters(), lr=self.learning_rate) # weight_decay=0
        
        for epoch in range(0, self.n_epochs):
            # if epoch % 1000 == 0:
            #     print(f"epoch = {epoch}")
            epoch_train_loss=[]
            for i, data in enumerate(train_loader, 0):
                inputs, label = data
                y_pred = self.net(inputs.to(self.net.device))
                loss = criterion(y_pred, label.to(self.net.device))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_train_loss.append(loss.item())
            train_loss.append(np.mean(epoch_train_loss))
            # validation step: need to UN-normalize data!
            epoch_loss=[]

            for i, data in enumerate(val_loader, 0):
                with torch.no_grad():
                    inputs_validate, label_validate = data
                    y_pred_validate = self.net(inputs_validate.to(self.net.device))
                    loss_validate = criterion(y_pred_validate, label_validate.to(self.net.device)) # one number
                    epoch_loss.append(loss_validate.item())
            validate_loss.append(np.mean(epoch_loss))
        return np.min(validate_loss), self.net

In [17]:
from pathlib import Path
from datetime import datetime
# change: turn into 10 right now for development, was 3000
n_epochs=1600
batch_size=32

PATH = 'model_siamese/'
Path(PATH).mkdir(parents=True, exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# change: commented out: alraedy read in X, y data earlier
# response = pd.read_csv("1127_final_data/response.csv", header=None).values #input X
# spectra = pd.read_csv("1127_final_data/spectra.csv", header=None).values #ground truth label Y
# change: input dim = 1000
mdl = SiameseModel(device=device, input_dim=input_dimension)
losses = []
print(f"number of epochs: {n_epochs}, batch size: {batch_size}, device: {mdl.device}")

f = open("log_siamese.txt", "w")
f.write("train START: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
f.write(f"number of epochs: {n_epochs}, batch size: {batch_size}, device: {device}\n")
for i,(train,test) in enumerate(zip(train_indices,test_indices)):
    print(f"we are on fold no.{i}")
    train_data, train_label= spectrum[train],temperature[train]
    test_data, test_label= spectrum[test],temperature[test]
    mse_calculator = CalculateMSE(mdl,n_epochs,batch_size)
    loss,model = mse_calculator.get_mse(train_data, 
                                        train_label, 
                                        test_data, 
                                        test_label)
    losses.append(loss)
    print(f"\tloss: {loss}")
    print("\ttime: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        
    f.write(f"fold = {i}")
    f.write(f"\tloss: {loss}\n")
    f.write("\ttime: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S") +"\n")

    torch.save(model.state_dict(), PATH+'model_'+str(i))
f.write("train END: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
f.close()

number of epochs: 1, batch size: 32, device: cuda
we are on fold no.0
	loss: 0.18194938753102277
	time: 2023-05-20 20:01:56
we are on fold no.1
	loss: 0.18130954475821676
	time: 2023-05-20 20:01:57
we are on fold no.2
	loss: 0.22512434221602776
	time: 2023-05-20 20:01:58
we are on fold no.3
	loss: 0.2567099489875742
	time: 2023-05-20 20:01:59
we are on fold no.4
	loss: 0.18850154892818347
	time: 2023-05-20 20:02:01
we are on fold no.5
	loss: 0.2658860892862887
	time: 2023-05-20 20:02:02
we are on fold no.6
	loss: 0.21993798319552396
	time: 2023-05-20 20:02:03
we are on fold no.7
	loss: 0.2440004256126043
	time: 2023-05-20 20:02:04
we are on fold no.8
	loss: 0.29874687742542577
	time: 2023-05-20 20:02:06
we are on fold no.9
	loss: 0.24145226462467298
	time: 2023-05-20 20:02:07


In [18]:
losses_mean = np.mean(losses)
losses_std = np.std(losses)
print(f"mean losses: {losses_mean}, std: {losses_std}")

mean losses: 0.23036184125655407, std: 0.03685102763696396


In [19]:
number_figures = 10
import matplotlib.pyplot as plt

indices = torch.randint(0, len(spectrum),(number_figures,)).unique()
for i in indices:
    print(f"we use {i}th example")
    # change: cast i to int, since pandas not work with torch.int64
    # changed: removed figure, since the output is just one number
    spec = np.asarray(spectrum[int(i)]).flatten()
    temp = np.asarray(temperature[int(i)]).flatten()

    prediction = model(torch.Tensor(np.asarray(spectrum[int(i)])).to(model.device)).detach().cpu().flatten()
    
    prediction = prediction.item()
    ground_truth = temp.item()
    # recover data from normalization
    print(f"-----------------------------------------------------")
    print(f"\tthe prediction is: {prediction}")
    print(f"\tthe ground truth is: {ground_truth}")
    print(f"\tthe difference is: {prediction - ground_truth}")

we use 556th example
-----------------------------------------------------
	the prediction is: 20.728620529174805
	the ground truth is: 20.7
	the difference is: 0.028620529174805398
we use 716th example
-----------------------------------------------------
	the prediction is: 20.869356155395508
	the ground truth is: 20.7
	the difference is: 0.16935615539550852
we use 1102th example
-----------------------------------------------------
	the prediction is: 20.995031356811523
	the ground truth is: 21.0
	the difference is: -0.0049686431884765625
we use 1201th example
-----------------------------------------------------
	the prediction is: 20.831024169921875
	the ground truth is: 21.0
	the difference is: -0.168975830078125
we use 1617th example
-----------------------------------------------------
	the prediction is: 20.972780227661133
	the ground truth is: 21.0
	the difference is: -0.027219772338867188
we use 2533th example
-----------------------------------------------------
	the predic

# **skip cell 14 for now (it's in template file!)**