# **run load_data.ipynb BEFORE running this!**

In [1]:
import pickle
import os.path
import numpy as np
data_file_name = 'data'
with open(data_file_name + '.pickle', 'rb') as handle:
    spectrum, temperature = pickle.load(handle)
    print(f"read data from {data_file_name}.pickle")
print(f"shape of spectrum data: {spectrum.shape}")
print(f"shape of temperature data: {temperature.shape}")
print()
print(f"there are {temperature.shape[0]} spectrums")
print(f"each spectrum is {spectrum.shape[1]} long, which is number of features")

read data from data.pickle
shape of spectrum data: (6000, 10000)
shape of temperature data: (6000, 1)

there are 6000 spectrums
each spectrum is 10000 long, which is number of features


In [2]:
indices_file_name = 'cross_validation_resample=2_fold=5'
with open(indices_file_name + '.pickle', 'rb') as handle:
    train_indices, test_indices = pickle.load(handle)
    print(f"got indices from {indices_file_name}.pickle")  
print()
print(f"sets of training indices: {len(train_indices)}")
print(f"number of training indices per set: {len(train_indices[0])}")
print(f"sets of testing indices: {len(test_indices)}")
print(f"number of testing indices per set: {len(test_indices[0])}")

got indices from cross_validation_resample=2_fold=5.pickle

sets of training indices: 10
number of training indices per set: 4800
sets of testing indices: 10
number of testing indices per set: 1200


In [3]:
input_dimension = spectrum.shape[1]
print(f"input dimension is:   {input_dimension}")
number_of_samples = spectrum.shape[0]
print(f"number of samples is: {number_of_samples}")
output_dimension = temperature.shape[1]
print(f"output dimension is:  {output_dimension}")

input dimension is:   10000
number of samples is: 6000
output dimension is:  1


In [4]:
# how many parameters?
#   "a very simple two-layer ReLU network with p = 2n + d parameters 
#   that can express any labeling of any sample of size n in d dimensions
#   https://arxiv.org/pdf/1611.03530.pdf
#   https://stats.stackexchange.com/questions/320383/relationship-between-
#   model-over-fitting-and-number-of-parameters/320387#320387
import torch
import torch.nn as nn
class Model(torch.nn.Module):
    def __init__(self, device, input_dim=input_dimension):
        super().__init__()
        self.relu  = nn.ReLU()
        self.hidden_dim = 32
        self.output_dim = output_dimension
        self.sequential = torch.nn.Sequential(
            torch.nn.Linear(input_dim, self.hidden_dim),
            self.relu,
            torch.nn.Linear(self.hidden_dim, self.hidden_dim),
            self.relu,
            torch.nn.Linear(self.hidden_dim, self.output_dim)
        )
        self.device = device
        self.to(device)
    def forward(self, x):
        y = self.sequential(x)
        # change: remove sigmoid, use y result as is
        # y = torch.sigmoid(y)
        return y

In [5]:
from torch.utils.data import DataLoader
from torch import optim
import numpy as np

loss_function = nn.MSELoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# return: train_loss, validation_loss, current model
class CalculateMSE():
    def __init__(self, net, n_epochs, batch_size, learning_rate):
        super().__init__()
        self.net = net
        #initialize some constants
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.net.apply(self.weights_init)   
    def weights_init(self,layer):
        if type(layer) == nn.Linear:
            nn.init.orthogonal_(layer.weight)
    def get_mse(self,train_data, train_label, test_data, test_label):
        train_set = torch.utils.data.TensorDataset(
            torch.Tensor(train_data).to(device), 
            torch.Tensor(train_label).to(device))
        val_set = torch.utils.data.TensorDataset(
            torch.Tensor(test_data).to(device), 
            torch.Tensor(test_label).to(device))
        loader_args = dict(batch_size=self.batch_size)
        train_loader = DataLoader(train_set, shuffle=True, drop_last=True, 
                                  **loader_args)
        val_loader = DataLoader(val_set, shuffle=True, drop_last=True, 
                                **loader_args)
        train_loss = []
        validate_loss = []
        criterion = loss_function
        optimizer = optim.Adam(self.net.parameters(), lr=self.learning_rate)
        
        for epoch in range(0, self.n_epochs):
            # if epoch % 1000 == 0:
            #     print(f"epoch = {epoch}")
            epoch_train_loss=[]
            for i, data in enumerate(train_loader, 0):
                inputs, label = data
                y_pred = self.net(inputs.to(self.net.device))
                loss = criterion(y_pred, label.to(self.net.device))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_train_loss.append(loss.item())
            train_loss.append(np.mean(epoch_train_loss))
            # validation step: need to UN-normalize data!
            epoch_validate_loss=[]

            for i, data in enumerate(val_loader, 0):
                with torch.no_grad():
                    inputs_validate, label_validate = data
                    y_pred_validate = self.net(
                        inputs_validate.to(self.net.device))
                    loss_validate = criterion(
                        y_pred_validate, 
                        label_validate.to(self.net.device)) # one number
                    epoch_validate_loss.append(loss_validate.item())
            validate_loss.append(np.mean(epoch_validate_loss))
        return np.min(train_loss), np.min(validate_loss), self.net

In [6]:
from pathlib import Path
from datetime import datetime
from sklearn.preprocessing import StandardScaler

n_epochs = 2000
batch_size = 32
learning_rate = 1e-6

PATH = 'model_dnn/'
LOG = "log_dnn"
Path(PATH).mkdir(parents=True, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mdl = Model(device=device, input_dim=input_dimension)
print("how many params we need?")
print(f"\tthere are {number_of_samples} samples with {input_dimension} features")
print(f"\twe need just {2 * number_of_samples + input_dimension} parameters")
# show number of parameters!
print("number of parameters we have:")
total_params = sum(p.numel() for p in mdl.parameters())
print(f"\ttotal parameters: {total_params}")
total_params_trainable = sum(p.numel() for p in mdl.parameters() if p.requires_grad)
print(f"\ttotal trainable parameters: {total_params_trainable}")

train_losses = []
validate_losses = []
print(f"number of epochs: {n_epochs}, batch size: {batch_size}, device: {mdl.device}, learning rate {learning_rate}")
print(f"\tin each cross-validation round, I normalize the training data with sklearn scaler")
print(f"\tthen, I use the same scaler on testing data to precent data snooping")

print()
print("training starts:")
f = open(LOG + ".txt", "w")
f.write("training START: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
f.write(f"number of epochs: {n_epochs}, batch size: {batch_size}, device: {device}, learning rate {learning_rate}\n")

for cross_validation_round, (train, test) in enumerate(zip(train_indices, test_indices)):
    print(f"we are on validation round no.{cross_validation_round}")
    # use indices to extract training & testing set for current cross-validation round
    train_data, train_label= spectrum[train], temperature[train]
    test_data, test_label= spectrum[test], temperature[test]
    # normalize training data, scaler normalize direction = column
    scaler = StandardScaler()
    scaler.fit(train_data)
    train_data = scaler.transform(train_data)
    # 'normalize' testing data, using transformation for training data (prevents data snooping)
    test_data = scaler.transform(test_data)
    mse_calculator = CalculateMSE(mdl, n_epochs, batch_size=batch_size, 
                                  learning_rate=learning_rate)
    train_loss, validate_loss, model = mse_calculator.get_mse(
                                        train_data, 
                                        train_label, 
                                        test_data, 
                                        test_label)
    train_losses.append(train_loss)
    validate_losses.append(validate_loss)
    print(f"\ttraining loss:   {train_loss}")
    print(f"\tvalidation loss: {validate_loss}")
    print("\ttime: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        
    f.write(f"cross-validation round = {cross_validation_round}\n")
    f.write(f"\ttraining loss:   {train_loss}\n")
    f.write(f"\tvalidation loss: {validate_loss}\n")
    f.write("\ttime: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S") +"\n")

    torch.save(model.state_dict(), PATH + 'model_' + str(cross_validation_round) + '.pth')
f.write("training END: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
f.close()

how many params we need?
	there are 6000 samples with 10000 features
	we need just 22000 parameters
number of parameters we have:
	total parameters: 321121
	total trainable parameters: 321121
number of epochs: 2000, batch size: 32, device: cuda, learning rate 1e-06
	in each cross-validation round, I normalize the training data with sklearn scaler
	then, I use the same scaler on testing data to precent data snooping

training starts:
we are on validation round no.0
	training loss:   0.0024756366627601287
	validation loss: 0.021117446289674657
	time: 2023-05-23 17:46:49
we are on validation round no.1
	training loss:   0.0031832191642994683
	validation loss: 0.018764695932937635
	time: 2023-05-23 18:08:06
we are on validation round no.2
	training loss:   0.0038658664903293053
	validation loss: 0.020107779626709385
	time: 2023-05-23 18:34:14
we are on validation round no.3
	training loss:   0.0039847012205670275
	validation loss: 0.019049227162188775
	time: 2023-05-23 19:00:18
we are on v

In [7]:
losses_mean = np.mean(validate_losses)
losses_std = np.std(validate_losses)
print(f"mean validation losses: {losses_mean}, std: {losses_std}")

mean validation losses: 0.019062931373765742, std: 0.002208505248363594


In [8]:
number_figures = 10
import matplotlib.pyplot as plt

indices = torch.randint(0, len(spectrum),(number_figures,)).unique()
for i in indices:
    print(f"we use {i}th example")
    # change: cast i to int, since pandas not work with torch.int64
    # changed: removed figure, since the output is just one number
    spec = np.asarray(spectrum[int(i)]).flatten()
    temp = np.asarray(temperature[int(i)]).flatten()

    prediction = model(torch.Tensor(np.asarray(spectrum[int(i)])).to(model.device)).detach().cpu().flatten()
    
    prediction = prediction.item()
    ground_truth = temp.item()
    # recover data from normalization
    print(f"-----------------------------------------------------")
    print(f"\tthe prediction is: {prediction}")
    print(f"\tthe ground truth is: {ground_truth}")
    print(f"\tthe difference is: {prediction - ground_truth}")


we use 159th example
-----------------------------------------------------
	the prediction is: 24.31614112854004
	the ground truth is: 21.7
	the difference is: 2.6161411285400398
we use 1228th example
-----------------------------------------------------
	the prediction is: 24.2984561920166
	the ground truth is: 21.2
	the difference is: 3.0984561920166023
we use 2818th example
-----------------------------------------------------
	the prediction is: 19.420440673828125
	the ground truth is: 21.8
	the difference is: -2.3795593261718757
we use 3209th example
-----------------------------------------------------
	the prediction is: 19.346473693847656
	the ground truth is: 21.8
	the difference is: -2.4535263061523445
we use 3404th example
-----------------------------------------------------
	the prediction is: 24.218372344970703
	the ground truth is: 21.2
	the difference is: 3.018372344970704
we use 3664th example
-----------------------------------------------------
	the prediction is: 20

# **skip cell 14 for now (it's in template file!)**