# Neural Network

## Preliminaries

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import itertools
import torch
from torch import nn
torch.backends.cudnn.benchmark = False
from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data import Dataset, DataLoader, Subset, TensorDataset
from torch.utils.tensorboard import SummaryWriter
from torch.utils.tensorboard.summary import hparams
from pprint import pprint
from sklearn.metrics import precision_recall_fscore_support as score

from torchinfo import summary
from textwrap import dedent

from urllib.request import urlretrieve

import os

We want to exploit the parallel computing offered by CUDA on the GPU, if available. 

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")

Device: cuda


The following function assures the reproducibility of experiments.

In [3]:
def set_reproducibility(seed = 42):
    """Seed PyTorch and NumPy and switch PyTorch to deterministic algorithms.

    Args:
        seed: value passed to both ``torch.manual_seed`` and ``np.random.seed``.
    """
    # Workspace setting exported for cuBLAS before deterministic mode is enabled
    # (see the PyTorch reproducibility notes).
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    for seed_generator in (torch.manual_seed, np.random.seed):
        seed_generator(seed)
    torch.use_deterministic_algorithms(True)

## Design

### Class for data loading and pre-processing

By instantiating this class, we load the dataset produced by the *Data Manipulation* section of the notebook, clean it with the same operations performed in the *Data Cleaning* section, split it into `X`, `y` and `ratings_count` (used as `weights`) and, finally, discretize the continuous label into 5 discrete classes.

In [4]:
class MoviesDataset(Dataset):
    """Movies dataset exposing features, discretized rating class and weight.

    On construction the CSV at ``datasets/df.csv`` is loaded (and downloaded
    first when missing), cleaned, split into features / label / weights and
    the continuous ``rating_mean`` label is binned into discrete classes.

    Attributes set by ``__init__``:
        num_classes: number of distinct classes found in the binned label.
        X: float tensor with one row of features per sample.
        y: long tensor with the class of each sample.
        weights: float tensor holding ``ratings_count`` of each sample.
    """

    def __init__(self):
        csv_path = "datasets/df.csv"
        try:
            df = pd.read_csv(csv_path)
        except FileNotFoundError:
            print("Download in progress of df.csv")
            # urlretrieve does not create missing directories.
            os.makedirs(os.path.dirname(csv_path), exist_ok=True)
            file, _ = urlretrieve(url = "http://github.com/MickPerl/DataAnalyticsProject/releases/download/datasets/df.csv", filename=csv_path)
            df = pd.read_csv(file)

        df = self.cleaning(df)

        X, y, weights = self.split_XYweights(df)

        y = self.discretization(y)

        self.num_classes = y.nunique()
        self.X = torch.FloatTensor(X.values)
        # Convert through NumPy: cleaning drops rows without resetting the
        # index, and building a tensor straight from a label-indexed Series
        # is fragile when label 0 is gone.
        self.y = torch.tensor(y.to_numpy(), dtype=torch.long)
        self.weights = torch.tensor(weights.to_numpy(), dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, class, weight)`` triple at position ``idx``."""
        return self.X[idx, :], self.y[idx], self.weights[idx]

    def split_XYweights(self, df):
        """Split ``df`` into features, ``rating_mean`` and ``ratings_count``.

        Returns:
            Tuple ``(X, y, weights)`` where ``X`` is ``df`` without the two
            columns returned as ``y`` and ``weights``.
        """
        y = df['rating_mean']
        weights = df['ratings_count']
        X = df.drop(columns=['ratings_count', 'rating_mean'])
        return X, y, weights

    def cleaning(self, df):
        """Clean ``df`` in place and return it.

        Rows without ``rating_mean`` are dropped, as are rows that have
        neither values in columns ``23:-2`` (presumably the tag columns) nor
        a genre. Remaining gaps in those columns are filled with 0, missing
        years with the median year, and the helper columns and duplicated
        rows are removed.
        """
        df.dropna(subset = ['rating_mean'], inplace=True)

        df_without_tags = df[df.iloc[:, 23:-2].isna().all(axis=1)]
        df_without_tags_nor_genres = df_without_tags[df_without_tags['(no genres listed)'] == 1]
        rows_to_be_deleted = df.loc[df["movieId"].isin(df_without_tags_nor_genres["movieId"])].index
        df.drop(rows_to_be_deleted, axis=0, inplace=True)

        df.iloc[:, 23:-2] = df.iloc[:, 23:-2].fillna(0)
        df.drop(['(no genres listed)'], inplace=True, axis=1)

        # Series.median skips missing values, so no explicit NA mask is needed.
        df.year = df.loc[:, 'year'].fillna(df['year'].median()).astype('int')

        df.drop('movieId', inplace=True, axis=1)
        df.drop_duplicates(inplace=True)
        return df

    def discretization(self, series, bins=5):
        """Bin ``series`` into ``bins`` equal-width intervals, returned as integer codes."""
        return pd.cut(series, bins=bins, labels=False)

### Class for the network architecture

By instantiating this class, we build the network architecture.\
The architecture is highly parametrized: in particular, some of the parameters that it is possible to specify are the activation functions of the first layer, that of the hidden layers and that of the output layer as well as the number of hidden layers, the probability of dropout and batch normalization. 

In [5]:
class Feedforward(nn.Module):
    """Configurable fully connected classifier.

    The network is an input ``Linear`` layer followed by ``af_first_layer``,
    then ``num_hidden_layers`` hidden blocks and a final ``Linear`` layer
    mapping to ``num_classes``, optionally followed by ``af_output_layer``.
    Each hidden block is ``Linear`` -> optional ``BatchNorm1d`` ->
    ``af_hidden_layers`` -> optional ``Dropout``.
    """

    def __init__(self, input_size, hidden_size, num_classes, af_first_layer, af_hidden_layers, af_output_layer, num_hidden_layers, dropout, batch_norm):
        super().__init__()

        layers = [nn.Linear(input_size, hidden_size), af_first_layer]

        for _ in range(num_hidden_layers):
            hidden_block = [nn.Linear(hidden_size, hidden_size)]
            if batch_norm:
                hidden_block.append(nn.BatchNorm1d(hidden_size))
            hidden_block.append(af_hidden_layers)
            # A dropout probability of 0 means "no dropout layer at all".
            if dropout != 0:
                hidden_block.append(nn.Dropout(dropout))
            layers.extend(hidden_block)

        layers.append(nn.Linear(hidden_size, num_classes))

        # A falsy value (e.g. None) leaves the raw output of the last layer.
        if af_output_layer:
            layers.append(af_output_layer)

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the output."""
        return self.model(x)


## Training function

We implement by hand the **early stopping** mechanism; in detail, we trigger it after the fifth epoch and we set to 3 the number of consecutive epochs we tolerate an increase of the loss (`n_bad_epochs`): every time the loss decreases with respect to the last min value, the counter of bad epochs is reset.

We log on **TensorBoard** some values such as the loss and the accuracy every batch, the loss and the accuracy every epoch as well as the weights and the bias every batch.

Furthermore, we check for the **vanishing and exploding gradient phenomenon**; even though the architecture is well designed, some batches may contain bad examples which cause a NaN or inf gradient: ideally, these samples should be removed, but we simply skip those batches and continue training.

In [6]:
def get_num_correct(preds, labels):
    """Return how many rows of ``preds`` have their arg-max equal to ``labels``."""
    return preds.argmax(dim=1).eq(labels).sum().item()


def _non_finite_gradients(model):
    """Return ``(parameter name, "nan" | "inf")`` pairs for non-finite gradients."""
    offending = []
    for name, param in model.named_parameters():
        if param.grad is None:
            continue
        if torch.isnan(param.grad).any():
            offending.append((name, "nan"))
        if torch.isinf(param.grad).any():
            offending.append((name, "inf"))
    return offending


def _log_parameter_histograms(tb, model, prefix, step):
    """Log a histogram of every parameter (and of its gradient, if any) at ``step``."""
    for name, param in model.named_parameters():
        tag = prefix + name.replace('.', '/')
        tb.add_histogram(tag, param.data.cpu().detach().numpy(), step)
        # The gradient is missing when the last batch was skipped and reset.
        if param.grad is not None:
            tb.add_histogram(tag + '/grad', param.grad.data.cpu().numpy(), step)


def train_model(model, criterion, optimizer, data_loader, epochs, n_bad_epochs, device, tb, cardinality_training_set):
    """Train ``model`` with early stopping and TensorBoard logging.

    Early stopping is evaluated from the sixth epoch (index 5) on: training
    stops once the mean batch loss has failed to improve on the running
    minimum ``n_bad_epochs`` times since the last improvement.

    Args:
        model: network to train (put in training mode here).
        criterion: loss module; ``nn.CrossEntropyLoss`` receives class
            indices, any other loss receives one-hot encoded targets.
        optimizer: optimizer bound to the parameters of ``model``.
        data_loader: iterable of batches whose first two items are the
            inputs and the integer targets.
        epochs: maximum number of epochs (at least 1).
        n_bad_epochs: patience of the early stopping.
        device: device the batches are moved to.
        tb: writer exposing ``add_scalar`` and ``add_histogram``.
        cardinality_training_set: denominator of the per-epoch accuracy.

    Returns:
        Tuple ``(model, loss_values, epoch, mean_loss_current_epoch,
        accuracy_current_epoch)`` where ``loss_values`` lists the loss of
        every batch and the remaining items refer to the last epoch run.

    Raises:
        ValueError: if ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    model.train()

    loss_values = []  # loss of every batch, across all epochs
    patience = 0
    min_loss = np.inf
    batches_per_epoch = len(data_loader)

    for epoch in range(epochs):
        losses_batches_current_epoch = []  # feeds the early-stopping check
        correct_batches_current_epoch = []

        for batch_idx, samples in enumerate(data_loader):
            # 1-based batch counter that keeps growing across epochs.
            global_step = epoch * batches_per_epoch + batch_idx + 1
            data, targets = samples[0].to(device), samples[1].to(device)
            optimizer.zero_grad()

            y_pred = model(data)

            if isinstance(criterion, nn.CrossEntropyLoss):
                loss = criterion(y_pred, targets)
            else:  # e.g. KLDivLoss, which needs a distribution as target
                targets_one_hot_encoded = torch.nn.functional.one_hot(targets, num_classes=y_pred.shape[1]).float()
                loss = criterion(y_pred, targets_one_hot_encoded)

            loss_value = loss.item()
            correct = get_num_correct(y_pred, targets)

            tb.add_scalar("Loss every batch", loss_value, global_step)
            tb.add_scalar("Correct every batch", correct, global_step)
            tb.add_scalar("Accuracy every batch", correct / len(data), global_step)

            loss_values.append(loss_value)
            losses_batches_current_epoch.append(loss_value)
            correct_batches_current_epoch.append(correct)

            # Backward pass
            loss.backward()

            # Inspect all gradients first and reset them once afterwards, so
            # that no check ever runs on an already cleared gradient.
            offending = _non_finite_gradients(model)
            if offending:
                for name, kind in offending:
                    print(f"{name} is {kind}, so model parameters are not going to be updated: this batch is skipped and the gradient is reset.")
                optimizer.zero_grad()
                continue

            optimizer.step()

            _log_parameter_histograms(tb, model, 'every batch_', global_step)

        total_correct_current_epoch = np.sum(correct_batches_current_epoch)
        tb.add_scalar("Correct every epoch", total_correct_current_epoch, epoch)

        accuracy_current_epoch = total_correct_current_epoch / cardinality_training_set
        tb.add_scalar("Accuracy every epoch", accuracy_current_epoch, epoch)

        _log_parameter_histograms(tb, model, 'every epoch_', epoch)

        mean_loss_current_epoch = np.mean(losses_batches_current_epoch)
        tb.add_scalar("Loss every epoch", mean_loss_current_epoch, epoch)

        # Warm-up: the first five epochs never count towards early stopping.
        if epoch < 5:
            print(f"Epoch: {epoch}\t Mean Loss: {mean_loss_current_epoch}")
            continue

        if epoch == 5:
            print("Waiting for three consecutive epochs during which the mean loss over batches does not decrease...")

        if mean_loss_current_epoch < min_loss:
            patience = 0
            min_loss = mean_loss_current_epoch
        else:
            patience += 1

        print(f"Epoch: {epoch}\t Mean Loss: {mean_loss_current_epoch}\t Current min mean loss: {min_loss}")

        if patience == n_bad_epochs:
            print(f"Early stopped at {epoch}-th epoch, since the mean loss over batches didn't decrease during the last {n_bad_epochs} epochs")
            break

    return model, loss_values, epoch, mean_loss_current_epoch, accuracy_current_epoch

## Testing function

In [7]:
def test_model(model, data_loader, device, output_dict = False):
    """Evaluate ``model`` on ``data_loader`` and return a classification report.

    Args:
        model: trained network (put in evaluation mode here).
        data_loader: iterable of batches whose first two items are the
            inputs and the integer targets; any batch size is supported.
        device: device the batches are moved to.
        output_dict: forwarded to ``classification_report``; when true the
            report is returned as a dict instead of a string.

    Returns:
        The report produced by ``sklearn.metrics.classification_report``.
    """
    model.eval()
    predicted_batches = []
    target_batches = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for samples in data_loader:
            data, targets = samples[0].to(device), samples[1].to(device)
            # Reduce to class indices per batch so that batches of different
            # sizes can simply be concatenated afterwards.
            predicted_batches.append(model(data).argmax(dim=1))
            target_batches.append(targets)

    y_pred = torch.cat(predicted_batches)
    y_test = torch.cat(target_batches)
    return classification_report(y_test.cpu(), y_pred.cpu(), zero_division=0, output_dict=output_dict)

## Utilities

The following utility function lets us obtain the samples' weights from the classes' weights: this output is going to be used in the sampler of the `DataLoader` object in order to manage the data imbalance. 

In [8]:
def class_weights(y):
    """Return, for every label in ``y``, the inverse frequency of its class.

    Args:
        y: 1-D tensor of non-negative integer class labels.

    Returns:
        Tensor shaped like ``y`` whose i-th entry is ``1 / count(y[i])``.
    """
    inverse_frequency = 1. / torch.bincount(y)
    # Indexing with the labels maps each sample to the weight of its class.
    return inverse_frequency[y]

Due to a bug in the TensorBoard porting to PyTorch, we subclass the `SummaryWriter` class and override the `add_hparams` method with some modifications.

In [9]:
class SummaryWriter(SummaryWriter):
    """``SummaryWriter`` subclass (shadowing the imported name) with a custom ``add_hparams``."""

    def add_hparams(self, hparam_dict, metric_dict):
        """Write the hyperparameter summaries through this writer's own file writer.

        Args:
            hparam_dict: plain ``dict`` of hyperparameter names and values.
            metric_dict: plain ``dict`` of metric names and values; entries
                whose value is ``None`` are declared but not logged as scalars.

        Raises:
            TypeError: if either argument is not exactly a ``dict``.
        """
        torch._C._log_api_usage_once("tensorboard.logging.add_hparams")
        if not (type(hparam_dict) is dict and type(metric_dict) is dict):
            raise TypeError('hparam_dict and metric_dict should be dictionary.')

        # hparams() yields the experiment, session-start and session-end summaries.
        for hparams_summary in hparams(hparam_dict, metric_dict):
            self.file_writer.add_summary(hparams_summary)

        for metric_name, metric_value in metric_dict.items():
            if metric_value is None:
                continue
            self.add_scalar(metric_name, metric_value)

We define a function to extract dictionaries containing a hyperparameters' configuration from the cartesian product of values of the hyperparameters; in detail, before creating a dictionary we check some condition in order to skip pointless or incorrect configurations.\
Examples of skipped configurations are those with:
- batch_size < 32 and batch norm, since batches aren't statistically significant;
- CrossEntropy as loss function and whichever activation function in the output layer, since CrossEntropy always contains SoftMax as activation function of output layer;
- Kullback-Leibler divergence as loss function and whichever activation function in the output layer other than LogSoftmax: since `KLDivLoss` expects its input to be log-probabilities, LogSoftmax as the activation function of the output layer is a suitable choice in that it returns a log-probability distribution over classes for each feature vector in input;
- high probability of dropout (0.5) and a hidden layer size less than 64;
- low probability of dropout (0.2) and hidden layer size greater than 32;  

In [10]:
def dict_configs_from_params_cartesian_product(hyperparams) :
    """Expand a hyperparameter space into the list of admissible configurations.

    Args:
        hyperparams: dict mapping each hyperparameter name to the list of
            its candidate values. It must contain the keys ``batch_norm``,
            ``batch_size``, ``loss_function``, ``af_output_layer``,
            ``dropout`` and ``hidden_size``.

    Returns:
        List of dicts, one per element of the cartesian product of the
        candidate values (in ``itertools.product`` order), excluding the
        combinations rejected by the rules below.
    """
    name_params = list(hyperparams)
    cartesian_product_filtered = []

    for conf_params in itertools.product(*hyperparams.values()):
        config = dict(zip(name_params, conf_params))
        loss_name = str(config['loss_function'])

        # Batch normalization with batches smaller than 32.
        if config['batch_norm'] and config['batch_size'] < 32:
            continue

        # Cross entropy is only paired with no output activation.
        if loss_name == "CrossEntropyLoss()" and config['af_output_layer'] is not None:
            continue

        # KL divergence is only paired with a LogSoftmax output.
        if loss_name == "KLDivLoss()" and str(config['af_output_layer']) != "LogSoftmax(dim=1)":
            continue

        # Strong dropout on small hidden layers.
        if config['dropout'] == 0.5 and config['hidden_size'] < 64:
            continue

        # Weak dropout on large hidden layers.
        if config['dropout'] == 0.2 and config['hidden_size'] > 32:
            continue

        cartesian_product_filtered.append(config)

    return cartesian_product_filtered

Since the number of parameters' configurations is really high (~ 6000), we implement a function to split them into `nr_sets` subsets, so that we are able to execute the hyperparameters optimization in parallel.

In [11]:
def split_configs_params(dict_configs, nr_sets = 4):
    """Split the configurations into ``nr_sets`` equally sized subsets.

    Each subset is published as the module-level variable ``configs_set<i>``
    (``i`` from 0 to ``nr_sets - 1``) and its name is printed.

    Args:
        dict_configs: sequence of configuration dicts.
        nr_sets: number of subsets; must divide ``len(dict_configs)``.

    Returns:
        The list of subsets, in the same order as the ``configs_set<i>``
        variables.

    Raises:
        ValueError: if ``nr_sets`` does not divide ``len(dict_configs)``.
    """
    if len(dict_configs) % nr_sets != 0:
        raise ValueError("The number of configs params sets has to be a divisor of the cardinality of all configs.")
    print(f"Newly created sets (ratio {nr_sets}:1 to all {len(dict_configs)} configs):")

    # Split once instead of recomputing the whole partition for every subset.
    subsets = np.array_split(dict_configs, nr_sets)
    for i, subset in enumerate(subsets):
        globals()[f"configs_set{i}"] = subset
        print(f"configs_set{i}")

    return subsets

## Neural Network in action

### Creation training, validation and test set

In [12]:
dataset = MoviesDataset()

# Stratified 80/20 split into (train + validation) and test indices, followed
# by a stratified 90/10 split of the former into train and validation indices.
all_idx = np.arange(len(dataset))
train_idx, test_idx = train_test_split(all_idx, test_size=0.2, stratify=dataset.y, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.1, stratify=dataset.y[train_idx], random_state=42)

# Feature rows of each split, selected by index.
X_train, X_val, X_test = (dataset.X[idx] for idx in (train_idx, val_idx, test_idx))

We min-max scale `year` and `title_length` on the training, validation and test sets, using the minimum and maximum computed on the training set.

In [13]:
# Column 0 (`year`): min-max scale every split with the training-set extremes.
# The scaled values are written into dataset.X; X_train / X_val / X_test are
# only read here.
train_year_max = torch.max(X_train[:,0])
train_year_min = torch.min(X_train[:,0])
train_year_range = train_year_max - train_year_min
for split_idx, X_split in ((train_idx, X_train), (val_idx, X_val), (test_idx, X_test)):
    dataset.X[split_idx, 0] = (X_split[:,0] - train_year_min) / train_year_range

# Column 1 (`title_length`): same procedure.
train_title_length_max = torch.max(X_train[:,1])
train_title_length_min = torch.min(X_train[:,1])
train_title_length_range = train_title_length_max - train_title_length_min
for split_idx, X_split in ((train_idx, X_train), (val_idx, X_val), (test_idx, X_test)):
    dataset.X[split_idx, 1] = (X_split[:,1] - train_title_length_min) / train_title_length_range

### Managing imbalance

We create two samplers which we are going to pass to the `DataLoader` object in order to manage the data imbalance: 
- `sampler_class_frequency` which, as its name reveals, weights each sample depending on the frequency of the class it belongs to.

In [14]:
# Classes of the training samples only.
y_train = dataset.y[train_idx]

# One weight per training sample (inverse frequency of its class); the
# sampler draws as many samples per epoch as there are training indices.
sample_weights = class_weights(y_train)
sampler_class_frequency = WeightedRandomSampler(weights=sample_weights, num_samples=len(train_idx))

The following code shows the classes' distribution over a subsets of batches. 

In [16]:
train_subset = Subset(dataset, train_idx)
print(type(train_subset))
train_loader = DataLoader(train_subset, batch_size=128, shuffle=False, sampler=sampler_class_frequency, drop_last=True)

# For every tenth batch, print how many samples of each of the 5 classes it
# contains (tab separated, class 0 first).
for i, samples in enumerate(train_loader):
    if i % 10 != 0:
        continue
    batch_labels = samples[1].numpy()
    print(*(int((batch_labels == class_id).sum()) for class_id in range(5)), sep = "\t")

<class 'torch.utils.data.dataset.Subset'>
26	22	25	28	27
35	18	28	26	21
29	24	27	27	21
23	28	28	27	22
28	22	23	23	32
28	30	18	31	21
31	21	18	29	29
20	28	36	13	31
22	27	29	26	24
17	23	29	30	29
31	25	22	28	22
39	19	19	24	27
30	27	25	28	18
31	17	28	20	32
27	22	29	29	21
23	29	35	22	19
28	30	23	24	23
22	29	30	23	24
21	28	25	32	22
31	23	25	22	27
22	35	23	18	30
24	28	16	31	29
25	30	24	28	21


- `sampler_ratings_count` which weights each sample depending on the `ratings_count` values.

In [17]:
# Min-max scale ratings_count on every split, using the training-set extremes.
weights_train, weights_val, weights_test = (dataset.weights[idx] for idx in (train_idx, val_idx, test_idx))

weights_train_max = torch.max(weights_train)
weights_train_min = torch.min(weights_train)
weights_train_range = weights_train_max - weights_train_min

dataset.weights[train_idx] = (weights_train - weights_train_min) / weights_train_range
dataset.weights[val_idx] = (weights_val - weights_train_min) / weights_train_range
dataset.weights[test_idx] = (weights_test - weights_train_min) / weights_train_range

# Sampler drawing training samples proportionally to their scaled ratings_count.
sampler_ratings_count = WeightedRandomSampler(weights=dataset.weights[train_idx], num_samples=len(train_idx))

We have conducted some experiments with both and the performance has consistently been better with `sampler_class_frequency`; therefore we have always adopted it during the following fine tuning. 

### Defining first hyperparameters space

Within the first hyperparameters optimization, we set the number of epochs to a very high value (500) as the early stopping assures that the training continues as long as the loss decreases and no further (in detail, the patience is set to 3). For remaining hyperparameters we define a wide space. 

In [13]:
# First search space: each key maps to the list of candidate values for that
# hyperparameter. The key order matters, since it fixes the order in which
# the cartesian product is enumerated and the values are joined into run names.
first_hyperparams = {
	# Upper bound on the epochs and patience of the early stopping.
	'num_epochs' : [500],
	'n_bad_epochs': [3],
	# Architecture.
	'num_hidden_layers' : [1, 3, 5, 7],
	'hidden_size' : [8, 16, 32, 64, 128],
	'batch_size' : [16, 32, 64, 128, 256],
	# Activation modules; the very same instances are shared by every
	# configuration that selects them.
	'af_first_layer' : [nn.Tanh(), nn.LeakyReLU()],
	'af_hidden_layers' : [nn.LeakyReLU()],
	'af_output_layer' : [None, nn.LogSoftmax(dim=1)],
	'loss_function' : [nn.CrossEntropyLoss(), nn.KLDivLoss(reduction = 'batchmean')], 
	'dropout' : [0, 0.2, 0.5],
	'batch_norm' : [False, True],
	'learning_rate' : [0.01, 0.001], 
	# Optimizer classes are given as dotted names and resolved in the training cell.
	'optimizer': ["torch.optim.SGD", "torch.optim.Adam"],
	# NOTE(review): the training cell builds the optimizer with the learning
	# rate only, so this value appears to be logged but never applied - confirm.
	'weight_decay': [1e-4]		
}

### First training

We split the parameters' configurations into 6 sets and then we execute scripts specifying the index of the sets we want to consider. 

In [None]:
first_configs = dict_configs_from_params_cartesian_product(first_hyperparams)
nr_sets = 6
# Publishes the subsets as the module-level variables configs_set0 ... configs_set5.
split_configs_params(first_configs, nr_sets)

# Index of the subset this run is responsible for.
idx_set = 1
if not 0 <= idx_set < nr_sets:
    raise ValueError(f"You can specify a set with an index until {nr_sets-1}")
# Plain variable lookup; no need to evaluate a string as code.
config_set = globals()[f"configs_set{idx_set}"]

# Number of trainings belonging to the preceding subsets, so that the training
# counter is unique across subsets. It is 0 for the first subset, which also
# covers the case of a single subset holding every configuration.
nr_train = len(configs_set0) * idx_set

We log to TensorBoard the architecture of the network and the various hyperparameters' configurations.

In [None]:
set_reproducibility()

columns = ["nr_train"] + list(first_configs[0].keys()) + ["epoch_stopped", "loss", "accuracy", "precision", "precision_total", "recall", "recall_total", "f1_score", "f1_score_total", "support"]
results_first_ft = pd.DataFrame(columns=columns)
result_rows = []

# The splits and the evaluation loaders do not depend on the configuration,
# so they are built once.
train_subset = Subset(dataset, train_idx)
val_subset = Subset(dataset, val_idx)
test_subset = Subset(dataset, test_idx)
val_loader = DataLoader(val_subset, batch_size=1, shuffle=False, drop_last=True)
test_loader = DataLoader(test_subset, batch_size=1, shuffle=False, drop_last=True)

for config_params in config_set:
    nr_train += 1
    print(f"{nr_train}° training with params:")
    pprint(config_params)

    batch_size = config_params['batch_size']
    list_params_config = list(map(str, config_params.values()))
    name_run = '__'.join(list_params_config)
    log_dir = os.path.join('tensorboard_logs', f"{idx_set}_out_of_{nr_sets - 1}", 'Train_' + str(nr_train), name_run)

    # The context manager closes the writer when the block is left.
    with SummaryWriter(log_dir=log_dir) as tb:
        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=False, sampler=sampler_class_frequency, drop_last=True)

        model = Feedforward(
            dataset.X.shape[1],
            config_params['hidden_size'],
            dataset.num_classes,
            config_params['af_first_layer'],
            config_params['af_hidden_layers'],
            config_params['af_output_layer'],
            config_params['num_hidden_layers'],
            config_params['dropout'],
            config_params['batch_norm'])

        model.to(device)
        input_model = dataset.X[train_idx][:batch_size].to(device)
        tb.add_graph(model, input_model)

        # Summarize with the real input shape (one batch of feature vectors)
        # instead of hard-coded dataset sizes.
        summary(model, input_size=(batch_size, dataset.X.shape[1]), col_names= ["input_size","output_size", "num_params"], verbose=1)

        loss_func = config_params['loss_function']

        # Resolve e.g. "torch.optim.SGD" to the class without evaluating code.
        # NOTE(review): config_params['weight_decay'] is not passed to the
        # optimizer; kept as is so that results stay comparable - confirm
        # whether it should be applied.
        optimizer_class = getattr(torch.optim, config_params['optimizer'].rsplit('.', 1)[-1])
        optim = optimizer_class(model.parameters(), lr=config_params['learning_rate'])

        # With drop_last=True only full batches are seen during an epoch.
        cardinality_training_set = len(train_loader) * batch_size
        model, loss_values, epoch_stopped, loss_value_last_epoch, accuracy_last_epoch = train_model(model, loss_func, optim, train_loader, config_params['num_epochs'], config_params['n_bad_epochs'], device, tb, cardinality_training_set)

        print(f"Loss: {loss_value_last_epoch}", end="\n\n")

        report = test_model(model, val_loader, device, True)
        # The report holds one entry per class plus three aggregate entries.
        index_classes = len(report) - 3
        f1_score = [float(report[str(i)]['f1-score']) for i in range(index_classes)]
        f1_score_total = np.sum(f1_score)
        precision = [float(report[str(i)]['precision']) for i in range(index_classes)]
        precision_total = np.sum(precision)
        recall = [float(report[str(i)]['recall']) for i in range(index_classes)]
        recall_total = np.sum(recall)
        support = [int(report[str(i)]['support']) for i in range(index_classes)]
        accuracy = report['accuracy']

        row_values = [nr_train] + list_params_config + [epoch_stopped, loss_value_last_epoch, accuracy, precision, precision_total, recall, recall_total, f1_score, f1_score_total, support]
        # DataFrame.append no longer exists in pandas 2; rebuild the frame from
        # the accumulated rows so partial results survive an interruption.
        result_rows.append(row_values)
        results_first_ft = pd.DataFrame(result_rows, columns=columns)

        dict_params_config = dict(zip(config_params, list_params_config))
        tb.add_hparams(hparam_dict = dict_params_config, metric_dict = {"Accuracy every epoch": None, "Loss every epoch": None})
        tb.flush()
    del model, optim, train_loader

In [None]:
# to_csv does not create missing directories.
os.makedirs("tuning_hyperparams", exist_ok=True)

# A single subset holds every configuration, so the results are complete;
# otherwise they are stored under a name identifying the subset.
if nr_sets == 1:
    results_first_ft.to_csv("tuning_hyperparams/results_first_ft.csv", index=False)
else:
    results_first_ft.to_csv(f"tuning_hyperparams/results_nrSets{nr_sets}_idxSet{idx_set}.csv", index=False)

In [8]:
# Merge the six per-subset result files into a single frame.
# NOTE(review): these files are read from results_hyperparams_optimization/NN/,
# while the cell above writes them to tuning_hyperparams/ - confirm the path.
results_first_ft = pd.concat([pd.read_csv(f"results_hyperparams_optimization/NN/results_nrSets6_idxSet{i}.csv") for i in range(6)], ignore_index=True)

In [104]:
# Persist the merged results of the first fine tuning (without the index column).
results_first_ft.to_csv("tuning_hyperparams/results_first_ft.csv", index=False)

In [105]:
# Reload the results of the first fine tuning from disk.
results_first_ft = pd.read_csv("tuning_hyperparams/results_first_ft.csv")

We display the first 10 trainings sorted in descending order by accuracy: we note null precisions and recalls regarding class with lower frequency. 

In [108]:
# Ten runs with the highest accuracy, restricted to the last six columns.
results_first_ft.sort_values(by='accuracy', ascending=False).head(10).iloc[:, -6:]

Unnamed: 0,loss,accuracy,precision,recall,f1_score,support
4041,1.715587,0.523795,"[0.0, 0.0, 0.74194, 0.49482, 0.0]","[0.0, 0.0, 0.24147, 0.97881, 0.0]","[0.0, 0.0, 0.36436, 0.65734, 0.0]","[122, 335, 1143, 1416, 157]"
4248,1.609664,0.47526,"[0.0, 0.0, 0.39234, 0.68404, 0.0]","[0.0, 0.0, 0.77953, 0.43573, 0.0]","[0.0, 0.0, 0.52197, 0.53236, 0.0]","[122, 335, 1143, 1416, 157]"
5489,32.183064,0.453829,"[0.07336, 0.0, 0.41394, 0.72903, 0.07955]","[0.15574, 0.0, 0.74278, 0.39901, 0.04459]","[0.09974, 0.0, 0.53162, 0.51575, 0.05714]","[122, 335, 1143, 1416, 157]"
3000,1.609313,0.452884,"[0.0, 0.30435, 0.0, 0.45847, 0.0]","[0.0, 0.10448, 0.0, 0.99011, 0.0]","[0.0, 0.15556, 0.0, 0.62673, 0.0]","[122, 335, 1143, 1416, 157]"
2442,1.609575,0.446896,"[0.0, 0.0, 0.37011, 0.60171, 0.0]","[0.0, 0.0, 0.68679, 0.44703, 0.0]","[0.0, 0.0, 0.481, 0.51297, 0.0]","[122, 335, 1143, 1416, 157]"
3016,1.609671,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 0.61713, 0.0]","[122, 335, 1143, 1416, 157]"
3498,1.609638,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 0.61713, 0.0]","[122, 335, 1143, 1416, 157]"
3034,1.61222,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 0.61713, 0.0]","[122, 335, 1143, 1416, 157]"
5282,1.609482,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 0.61713, 0.0]","[122, 335, 1143, 1416, 157]"
4626,1.609466,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 0.61713, 0.0]","[122, 335, 1143, 1416, 157]"


We display the first 10 trainings sorted in ascending order by loss: the precision and recall regarding class with lower frequency are still quite imbalanced with respect to class with higher frequency but to a lesser extent. 

In [107]:
# Ten runs with the lowest loss, restricted to the last six columns.
results_first_ft.sort_values(by='loss').head(10).iloc[:, -6:]

Unnamed: 0,loss,accuracy,precision,recall,f1_score,support
5587,0.859668,0.410652,"[0.06918, 0.22021, 0.6506, 0.80118, 0.09045]","[0.27049, 0.25373, 0.37795, 0.48093, 0.4586]","[0.11018, 0.23578, 0.47814, 0.60106, 0.1511]","[122, 335, 1143, 1416, 157]"
2627,0.889824,0.414119,"[0.07783, 0.21042, 0.61134, 0.80196, 0.11001]","[0.27049, 0.31343, 0.3867, 0.46328, 0.49682]","[0.12088, 0.2518, 0.47374, 0.58729, 0.18014]","[122, 335, 1143, 1416, 157]"
5751,0.899446,0.414434,"[0.07598, 0.2367, 0.61605, 0.78498, 0.09413]","[0.30328, 0.26567, 0.3762, 0.48729, 0.43949]","[0.12151, 0.25035, 0.46714, 0.60131, 0.15506]","[122, 335, 1143, 1416, 157]"
5735,0.901097,0.414434,"[0.07246, 0.22685, 0.64984, 0.81316, 0.09857]","[0.2459, 0.29254, 0.36045, 0.4887, 0.52866]","[0.11194, 0.25554, 0.4637, 0.6105, 0.16617]","[122, 335, 1143, 1416, 157]"
4115,0.901097,0.409392,"[0.0751, 0.24148, 0.65263, 0.82911, 0.10116]","[0.31148, 0.25373, 0.3797, 0.46257, 0.55414]","[0.12102, 0.24745, 0.48009, 0.59383, 0.17109]","[122, 335, 1143, 1416, 157]"
2691,0.907835,0.424519,"[0.08578, 0.21218, 0.6767, 0.80765, 0.11053]","[0.31148, 0.30149, 0.37358, 0.49223, 0.53503]","[0.13451, 0.24908, 0.4814, 0.61167, 0.18321]","[122, 335, 1143, 1416, 157]"
4067,0.912362,0.414434,"[0.08416, 0.23211, 0.63663, 0.83014, 0.10101]","[0.27869, 0.35821, 0.37708, 0.45904, 0.50955]","[0.12928, 0.28169, 0.47363, 0.59118, 0.1686]","[122, 335, 1143, 1416, 157]"
4163,0.912675,0.417271,"[0.07919, 0.22922, 0.63526, 0.80392, 0.1026]","[0.28689, 0.27164, 0.3657, 0.49223, 0.52866]","[0.12411, 0.24863, 0.46419, 0.6106, 0.17184]","[122, 335, 1143, 1416, 157]"
2659,0.918037,0.402143,"[0.07795, 0.20388, 0.69039, 0.79138, 0.10307]","[0.33607, 0.25075, 0.33946, 0.47952, 0.53503]","[0.12654, 0.2249, 0.45513, 0.59719, 0.17284]","[122, 335, 1143, 1416, 157]"
4211,0.92003,0.41538,"[0.07743, 0.22654, 0.67213, 0.79472, 0.101]","[0.28689, 0.29552, 0.35871, 0.48941, 0.51592]","[0.12195, 0.25648, 0.46777, 0.60577, 0.16893]","[122, 335, 1143, 1416, 157]"


By analysing the corresponding hyperparameters, we understand that the best performance is obtained with hidden sizes greater than 16: therefore we extend the space of hidden size values. Moreover, we want to experiment with a greater value of batch_size (512) and a smaller learning rate.

### Defining second hyperparameters space

In [14]:
# Second search space: each key maps to the list of candidate values for that
# hyperparameter; the key order fixes the enumeration order of the
# configurations.
second_hyperparams = {
	# Upper bound on the epochs and patience of the early stopping.
	'num_epochs' : [500],
	'n_bad_epochs': [3],
	# Compared with the first space: deeper networks, hidden sizes up to 256
	# and batch sizes up to 512.
	'num_hidden_layers' : [3, 5, 7, 10],
	'hidden_size' : [16, 64, 128, 256],
	'batch_size' : [16, 64, 256, 512],
	'af_first_layer' : [nn.Tanh(), nn.LeakyReLU()],
	'af_hidden_layers' : [nn.LeakyReLU()],
	'af_output_layer' : [None, nn.LogSoftmax(dim=1)],
	'loss_function' : [nn.CrossEntropyLoss(), nn.KLDivLoss(reduction = 'batchmean')], 
	'dropout' : [0, 0.5],
	'batch_norm' : [False, True],
	# A much smaller learning rate (1e-5) replaces 0.001.
	'learning_rate' : [0.01, 1e-5], 
	'optimizer': ["torch.optim.SGD", "torch.optim.Adam"],
	# NOTE(review): presumably handled like in the first fine tuning, where
	# the optimizer is built with the learning rate only - confirm.
	'weight_decay': [1e-4]		
}

For a better readability, we do not present once again the code implementing the training and the testing.

Within this second fine tuning, we enhance the performance analysis by computing also the sums of precision, recall and f1_score which are conditional to single classes. 

In [111]:
# Persist the results of the second fine tuning (without the index column).
# results_second_ft comes from the training code that is not repeated here.
results_second_ft.to_csv("tuning_hyperparams/results_second_ft.csv", index=False)

In [36]:
# Reload the results of the second fine tuning from disk.
results_second_ft = pd.read_csv("tuning_hyperparams/results_second_ft.csv")

We display the first 10 trainings sorted in descending order by accuracy: we note null precisions and recalls regarding class with lower frequency. 

In [116]:
# Ten runs with the highest accuracy, restricted to the last nine columns.
results_second_ft.sort_values(by='accuracy', ascending=False).head(10).iloc[:, -9:]

Unnamed: 0,loss,accuracy,precision,precision_total,recall,recall_total,f1_score,f1_score_total,support
41,1.316624,0.448471,"[0.11633, 0.20382, 0.48902, 0.93333, 0.15126]",1.893765,"[0.46721, 0.38209, 0.56518, 0.40537, 0.11465]",1.934499,"[0.18627, 0.26584, 0.52435, 0.56524, 0.13043]",1.672135,"[122, 335, 1143, 1416, 157]"
56,1.611139,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
9,1.626771,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
101,1.609784,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
104,1.61196,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
48,1.612005,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
29,1.612719,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
53,1.610725,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
12,1.623199,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
64,1.611939,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"


We display the first 10 trainings sorted in ascending order by loss: the precision and recall regarding class with lower frequency are still quite imbalanced with respect to class with higher frequency but to a lesser extent. However, the second fine tuning generally leads to a lower accuracy and a higher loss.

In [117]:
# Ten runs with the lowest loss, restricted to the last nine columns.
results_second_ft.sort_values(by='loss').head(10).iloc[:, -9:]

Unnamed: 0,loss,accuracy,precision,precision_total,recall,recall_total,f1_score,f1_score_total,support
91,1.08097,0.38607,"[0.08269, 0.22753, 0.64634, 0.80328, 0.1]",1.859841,"[0.35246, 0.24179, 0.32458, 0.44986, 0.59236]",1.96105,"[0.13396, 0.23444, 0.43215, 0.57673, 0.17111]",1.548393,"[122, 335, 1143, 1416, 157]"
83,1.098394,0.381658,"[0.08611, 0.22701, 0.63393, 0.83508, 0.09596]",1.878084,"[0.36066, 0.23582, 0.31059, 0.45056, 0.6051]",1.962723,"[0.13902, 0.23133, 0.41691, 0.58532, 0.16565]",1.538235,"[122, 335, 1143, 1416, 157]"
107,1.103853,0.377246,"[0.08961, 0.2125, 0.63194, 0.8099, 0.09779]",1.841738,"[0.40984, 0.20299, 0.31846, 0.43927, 0.59236]",1.962904,"[0.14706, 0.20763, 0.4235, 0.5696, 0.16787]",1.515662,"[122, 335, 1143, 1416, 157]"
75,1.155672,0.36716,"[0.08722, 0.23214, 0.6568, 0.84637, 0.09555]",1.918089,"[0.35246, 0.27164, 0.29134, 0.4202, 0.65605]",1.991688,"[0.13984, 0.25034, 0.40364, 0.56159, 0.1668]",1.522205,"[122, 335, 1143, 1416, 157]"
59,1.160277,0.364324,"[0.0775, 0.23843, 0.64272, 0.8209, 0.09389]",1.873449,"[0.33607, 0.2, 0.29746, 0.42726, 0.65605]",1.916839,"[0.12596, 0.21753, 0.4067, 0.56201, 0.16427]",1.476472,"[122, 335, 1143, 1416, 157]"
99,1.160518,0.364324,"[0.07356, 0.2153, 0.65577, 0.80345, 0.09291]",1.84099,"[0.30328, 0.22687, 0.29834, 0.42726, 0.61783]",1.873576,"[0.1184, 0.22093, 0.4101, 0.55786, 0.16153]",1.468825,"[122, 335, 1143, 1416, 157]"
51,1.163139,0.375355,"[0.09091, 0.24501, 0.68952, 0.84203, 0.09242]",1.9599,"[0.37705, 0.25672, 0.31671, 0.42161, 0.63694]",2.009029,"[0.1465, 0.25073, 0.43405, 0.56188, 0.16142]",1.554581,"[122, 335, 1143, 1416, 157]"
67,1.165624,0.365585,"[0.09091, 0.2112, 0.64245, 0.82639, 0.09657]",1.867512,"[0.31967, 0.24776, 0.29396, 0.4202, 0.68153]",1.963123,"[0.14156, 0.22802, 0.40336, 0.55712, 0.16917]",1.49923,"[122, 335, 1143, 1416, 157]"
19,1.251483,0.352978,"[0.08277, 0.25455, 0.64646, 0.83843, 0.09342]",1.915625,"[0.40164, 0.20896, 0.27997, 0.40678, 0.66879]",1.966129,"[0.13725, 0.22951, 0.39072, 0.54779, 0.16393]",1.469207,"[122, 335, 1143, 1416, 157]"
57,1.277507,0.363694,"[0.08945, 0.31383, 0.78864, 0.93596, 0.09351]",2.221392,"[0.54918, 0.17612, 0.30359, 0.40254, 0.70701]",2.138436,"[0.15385, 0.22562, 0.43841, 0.56296, 0.16518]",1.546017,"[122, 335, 1143, 1416, 157]"


We display the first 10 trainings sorted in descending order by total F1 score (`f1_score_total`, the sum of the per-class F1 scores), which synthesizes precision and recall.

In [128]:
# Ten trainings of the second fine tuning with the highest total F1 score (last nine columns only).
results_second_ft.sort_values(by='f1_score_total', ascending=False).iloc[:, -9:].head(10)

Unnamed: 0,loss,accuracy,precision,precision_total,recall,recall_total,f1_score,f1_score_total,support
41,1.316624,0.448471,"[0.11633, 0.20382, 0.48902, 0.93333, 0.15126]",1.893765,"[0.46721, 0.38209, 0.56518, 0.40537, 0.11465]",1.934499,"[0.18627, 0.26584, 0.52435, 0.56524, 0.13043]",1.672135,"[122, 335, 1143, 1416, 157]"
105,1.291049,0.379136,"[0.08971, 0.34146, 0.72505, 0.94617, 0.09413]",2.196519,"[0.61475, 0.20896, 0.33683, 0.4096, 0.59236]",2.162503,"[0.15658, 0.25926, 0.45998, 0.57171, 0.16245]",1.609967,"[122, 335, 1143, 1416, 157]"
73,1.29885,0.364639,"[0.09821, 0.36111, 0.78005, 0.9479, 0.09183]",2.279099,"[0.54098, 0.19403, 0.30096, 0.39831, 0.75159]",2.185873,"[0.16625, 0.25243, 0.43434, 0.56091, 0.16366]",1.577594,"[122, 335, 1143, 1416, 157]"
51,1.163139,0.375355,"[0.09091, 0.24501, 0.68952, 0.84203, 0.09242]",1.9599,"[0.37705, 0.25672, 0.31671, 0.42161, 0.63694]",2.009029,"[0.1465, 0.25073, 0.43405, 0.56188, 0.16142]",1.554581,"[122, 335, 1143, 1416, 157]"
91,1.08097,0.38607,"[0.08269, 0.22753, 0.64634, 0.80328, 0.1]",1.859841,"[0.35246, 0.24179, 0.32458, 0.44986, 0.59236]",1.96105,"[0.13396, 0.23444, 0.43215, 0.57673, 0.17111]",1.548393,"[122, 335, 1143, 1416, 157]"
57,1.277507,0.363694,"[0.08945, 0.31383, 0.78864, 0.93596, 0.09351]",2.221392,"[0.54918, 0.17612, 0.30359, 0.40254, 0.70701]",2.138436,"[0.15385, 0.22562, 0.43841, 0.56296, 0.16518]",1.546017,"[122, 335, 1143, 1416, 157]"
89,1.285883,0.355184,"[0.10094, 0.70312, 0.80535, 0.94359, 0.09051]",2.643521,"[0.61475, 0.13433, 0.28959, 0.38983, 0.78981]",2.218311,"[0.17341, 0.22556, 0.426, 0.55172, 0.16241]",1.539106,"[122, 335, 1143, 1416, 157]"
83,1.098394,0.381658,"[0.08611, 0.22701, 0.63393, 0.83508, 0.09596]",1.878084,"[0.36066, 0.23582, 0.31059, 0.45056, 0.6051]",1.962723,"[0.13902, 0.23133, 0.41691, 0.58532, 0.16565]",1.538235,"[122, 335, 1143, 1416, 157]"
97,1.290532,0.354554,"[0.09446, 0.42424, 0.79268, 0.94898, 0.08887]",2.349234,"[0.61475, 0.16716, 0.28434, 0.39407, 0.70701]",2.167332,"[0.16376, 0.23983, 0.41854, 0.55689, 0.15789]",1.53691,"[122, 335, 1143, 1416, 157]"
81,1.280429,0.360227,"[0.09852, 0.75472, 0.80048, 0.94676, 0.09138]",2.691841,"[0.59836, 0.1194, 0.29484, 0.40184, 0.78981]",2.204247,"[0.16918, 0.20619, 0.43095, 0.5642, 0.1638]",1.534318,"[122, 335, 1143, 1416, 157]"


The first training seems to be a good candidate for the best hyperparameter configuration, since it presents an acceptable accuracy (0.45, given that a random classifier over 5 classes has an accuracy of 0.2) and a low loss (1.32).

In [147]:
# Full record (hyperparameters and metrics) of the training with the highest total F1 score.
results_second_ft.sort_values(by='f1_score_total', ascending=False).iloc[0]

nr_train                                                      5803
num_epochs                                                     500
n_bad_epochs                                                     3
num_hidden_layers                                               10
hidden_size                                                     64
batch_size                                                     512
af_first_layer                      LeakyReLU(negative_slope=0.01)
af_hidden_layers                    LeakyReLU(negative_slope=0.01)
af_output_layer                                  LogSoftmax(dim=1)
loss_function                                          KLDivLoss()
dropout                                                        0.0
batch_norm                                                   False
learning_rate                                              0.00001
optimizer                                         torch.optim.Adam
weight_decay                                                0.

### Defining third hyperparameters space

By analysing the corresponding hyperparameters, we see that the best performances are obtained with a higher number of hidden layers and bigger batches, so we extend the search ranges of both; moreover, we remove 0.01 as learning rate and 0.2 as dropout probability, since they do not lead to good performance.

In [None]:
# Search space of the second fine tuning: every key maps to the list of
# candidate values tried for that hyperparameter.
second_hyperparams = dict(
	num_epochs=[500],
	n_bad_epochs=[3],
	num_hidden_layers=[3, 5, 7, 10],
	hidden_size=[16, 64, 128, 256],
	batch_size=[16, 64, 256, 512],
	af_first_layer=[nn.Tanh(), nn.LeakyReLU()],
	af_hidden_layers=[nn.LeakyReLU()],
	af_output_layer=[None, nn.LogSoftmax(dim=1)],
	loss_function=[nn.CrossEntropyLoss(), nn.KLDivLoss(reduction='batchmean')],
	dropout=[0, 0.5],
	batch_norm=[False, True],
	learning_rate=[0.01, 1e-5],
	optimizer=["torch.optim.SGD", "torch.optim.Adam"],
	weight_decay=[1e-4],
)

In [22]:
# Search space of the third fine tuning: every key maps to the list of
# candidate values tried for that hyperparameter.
new_new_hyperparams = dict(
	num_epochs=[500],
	n_bad_epochs=[3],
	num_hidden_layers=[12, 15, 18],
	hidden_size=[64, 128, 256],
	batch_size=[256, 512, 1024, 2048],
	af_first_layer=[nn.LeakyReLU()],
	af_hidden_layers=[nn.LeakyReLU()],
	af_output_layer=[None, nn.LogSoftmax(dim=1)],
	loss_function=[nn.CrossEntropyLoss(), nn.KLDivLoss(reduction='batchmean')],
	dropout=[0, 0.5],
	batch_norm=[False, True],
	learning_rate=[1e-5],
	optimizer=["torch.optim.Adam"],
	weight_decay=[1e-4],
)

In [122]:
# Persist the results of the third fine tuning (row index not written).
results_third_ft.to_csv(path_or_buf="tuning_hyperparams/results_third_ft.csv", index=False)

In [123]:
# Reload the stored results of the third fine tuning.
results_third_ft = pd.read_csv(filepath_or_buffer="tuning_hyperparams/results_third_ft.csv")

We display the first 10 trainings sorted in descending order by accuracy: we note null precision and recall for every class except the most frequent one.

In [126]:
# Ten most accurate trainings of the third fine tuning (last nine columns only).
results_third_ft.sort_values(by='accuracy', ascending=False).iloc[:, -9:].head(10)

Unnamed: 0,loss,accuracy,precision,precision_total,recall,recall_total,f1_score,f1_score_total,support
0,1.610409,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
244,1.609764,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
240,1.60974,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
28,1.6134,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
262,1.609661,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
264,1.609421,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
56,1.611898,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
232,1.609957,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
60,1.609902,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"
230,1.610333,0.446265,"[0.0, 0.0, 0.0, 0.44627, 0.0]",0.446265,"[0.0, 0.0, 0.0, 1.0, 0.0]",1.0,"[0.0, 0.0, 0.0, 0.61713, 0.0]",0.617128,"[122, 335, 1143, 1416, 157]"


We display the first 10 trainings sorted in ascending order by loss: the precision and recall of the lower-frequency classes are still quite imbalanced with respect to those of the higher-frequency classes, but to a lesser extent. However, the second fine tuning generally leads to a lower accuracy and a higher loss.

In [127]:
# Ten lowest-loss trainings of the third fine tuning (last nine columns only).
results_third_ft.sort_values(by='loss').iloc[:, -9:].head(10)

Unnamed: 0,loss,accuracy,precision,precision_total,recall,recall_total,f1_score,f1_score_total,support
189,1.063103,0.34573,"[0.08015, 0.20163, 0.55172, 0.73881, 0.0898]",1.662111,"[0.36066, 0.2209, 0.26597, 0.41949, 0.51592]",1.782933,"[0.13115, 0.21083, 0.35891, 0.53514, 0.15297]",1.388997,"[122, 335, 1143, 1416, 157]"
177,1.070294,0.340372,"[0.0748, 0.17344, 0.54919, 0.72795, 0.0912]",1.616592,"[0.31148, 0.19104, 0.26859, 0.41384, 0.5414]",1.726355,"[0.12063, 0.18182, 0.36075, 0.52769, 0.15611]",1.347002,"[122, 335, 1143, 1416, 157]"
285,1.074537,0.328396,"[0.08368, 0.1687, 0.50333, 0.72118, 0.09894]",1.575835,"[0.32787, 0.20597, 0.26422, 0.37994, 0.59236]",1.770356,"[0.13333, 0.18548, 0.34653, 0.49769, 0.16955]",1.332587,"[122, 335, 1143, 1416, 157]"
77,1.079831,0.371257,"[0.08268, 0.22781, 0.62101, 0.78315, 0.09726]",1.811913,"[0.34426, 0.22985, 0.28959, 0.44633, 0.61146]",1.921494,"[0.13333, 0.22883, 0.39499, 0.5686, 0.16783]",1.493581,"[122, 335, 1143, 1416, 157]"
93,1.094534,0.344469,"[0.07773, 0.19841, 0.56228, 0.78219, 0.09153]",1.712142,"[0.30328, 0.22388, 0.27647, 0.40325, 0.59873]",1.805599,"[0.12375, 0.21038, 0.37067, 0.53215, 0.15878]",1.395736,"[122, 335, 1143, 1416, 157]"
85,1.098131,0.35676,"[0.07629, 0.18378, 0.57841, 0.80518, 0.09384]",1.737509,"[0.30328, 0.20299, 0.30009, 0.41737, 0.59236]",1.816081,"[0.12191, 0.19291, 0.39516, 0.54977, 0.16202]",1.421768,"[122, 335, 1143, 1416, 157]"
73,1.103233,0.357706,"[0.08299, 0.21902, 0.60256, 0.78129, 0.09336]",1.779222,"[0.32787, 0.22687, 0.28784, 0.41879, 0.61783]",1.879193,"[0.13245, 0.22287, 0.38958, 0.54529, 0.16221]",1.452399,"[122, 335, 1143, 1416, 157]"
273,1.104342,0.326505,"[0.07963, 0.16316, 0.51646, 0.73978, 0.09554]",1.594575,"[0.35246, 0.18507, 0.26072, 0.38347, 0.57325]",1.754974,"[0.12991, 0.17343, 0.34651, 0.50512, 0.16379]",1.318749,"[122, 335, 1143, 1416, 157]"
165,1.106367,0.364639,"[0.08408, 0.22164, 0.64466, 0.82434, 0.09829]",1.873013,"[0.38525, 0.25075, 0.29046, 0.4209, 0.6242]",1.971564,"[0.13803, 0.23529, 0.40048, 0.55727, 0.16984]",1.500923,"[122, 335, 1143, 1416, 157]"
81,1.107266,0.348566,"[0.08485, 0.21833, 0.57268, 0.77217, 0.08973]",1.737757,"[0.34426, 0.24179, 0.28609, 0.39972, 0.57325]",1.845109,"[0.13614, 0.22946, 0.38156, 0.52676, 0.15517]",1.429097,"[122, 335, 1143, 1416, 157]"


We display the first 10 trainings sorted in descending order by f1_score; however, the top ones have accuracies lower than that of the aforementioned good configuration.

In [130]:
# Ten trainings of the third fine tuning with the highest total F1 score (last nine columns only).
results_third_ft.sort_values(by='f1_score_total', ascending=False).iloc[:, -9:].head(10)

Unnamed: 0,loss,accuracy,precision,precision_total,recall,recall_total,f1_score,f1_score_total,support
76,1.283165,0.37693,"[0.13428, 0.22107, 0.78251, 0.93677, 0.09335]",2.167984,"[0.31148, 0.38209, 0.30534, 0.3976, 0.75159]",2.148093,"[0.18765, 0.28009, 0.43927, 0.55825, 0.16608]",1.631347,"[122, 335, 1143, 1416, 157]"
4,1.303822,0.370942,"[0.12058, 0.26269, 0.77427, 0.94676, 0.09063]",2.194923,"[0.47541, 0.26269, 0.30009, 0.40184, 0.75796]",2.197982,"[0.19237, 0.26269, 0.43253, 0.5642, 0.1619]",1.613702,"[122, 335, 1143, 1416, 157]"
36,1.295256,0.363063,"[0.10375, 0.35417, 0.80145, 0.96055, 0.09373]",2.313641,"[0.59016, 0.20299, 0.28959, 0.39548, 0.7707]",2.248919,"[0.17647, 0.25806, 0.42545, 0.56028, 0.16713]",1.587392,"[122, 335, 1143, 1416, 157]"
68,1.254475,0.357075,"[0.08079, 0.74667, 0.82915, 0.9527, 0.09144]",2.700744,"[0.60656, 0.16716, 0.28871, 0.39831, 0.69427]",2.155008,"[0.14258, 0.27317, 0.42829, 0.56175, 0.1616]",1.5674,"[122, 335, 1143, 1416, 157]"
72,1.2865,0.368736,"[0.0922, 0.49485, 0.7584, 0.91733, 0.09422]",2.356999,"[0.63934, 0.14328, 0.31584, 0.40749, 0.67516]",2.181108,"[0.16116, 0.22222, 0.44595, 0.5643, 0.16537]",1.559003,"[122, 335, 1143, 1416, 157]"
160,1.265158,0.363694,"[0.09439, 0.66154, 0.76068, 0.94684, 0.08852]",2.551971,"[0.60656, 0.12836, 0.31146, 0.40254, 0.70701]",2.155925,"[0.16336, 0.215, 0.44196, 0.56492, 0.15734]",1.542568,"[122, 335, 1143, 1416, 157]"
100,1.385441,0.363063,"[0.07456, 0.30423, 0.74121, 0.83062, 0.0832]",2.033813,"[0.27869, 0.32239, 0.25809, 0.43291, 0.64968]",1.94176,"[0.11765, 0.31304, 0.38287, 0.56917, 0.14751]",1.530238,"[122, 335, 1143, 1416, 157]"
64,1.261881,0.354869,"[0.09517, 0.44348, 0.83721, 0.94463, 0.08836]",2.408849,"[0.53279, 0.15224, 0.28346, 0.3976, 0.78344]",2.149529,"[0.16149, 0.22667, 0.42353, 0.55964, 0.15881]",1.530141,"[122, 335, 1143, 1416, 157]"
32,1.318057,0.350772,"[0.09498, 0.72464, 0.78325, 0.92916, 0.08198]",2.614008,"[0.57377, 0.14925, 0.27822, 0.39831, 0.70701]",2.106551,"[0.16298, 0.24752, 0.41059, 0.55759, 0.14692]",1.525603,"[122, 335, 1143, 1416, 157]"
65,1.133167,0.366215,"[0.08863, 0.21727, 0.66102, 0.82633, 0.09238]",1.885631,"[0.37705, 0.23284, 0.30709, 0.41667, 0.61783]",1.951473,"[0.14353, 0.22478, 0.41935, 0.55399, 0.16073]",1.502384,"[122, 335, 1143, 1416, 157]"


So the best model is the following:

In [37]:
# Full record of the overall best training: the top row of the second fine
# tuning when ranked by total F1 score.
results_second_ft.sort_values(by='f1_score_total', ascending=False).iloc[0]

nr_train                                                      5803
num_epochs                                                     500
n_bad_epochs                                                     3
num_hidden_layers                                               10
hidden_size                                                     64
batch_size                                                     512
af_first_layer                      LeakyReLU(negative_slope=0.01)
af_hidden_layers                    LeakyReLU(negative_slope=0.01)
af_output_layer                                  LogSoftmax(dim=1)
loss_function                                          KLDivLoss()
dropout                                                        0.0
batch_norm                                                   False
learning_rate                                              0.00001
optimizer                                         torch.optim.Adam
weight_decay                                                0.

## Save model

In [None]:
# Store only the learned parameters (state dict) of the current model.
torch.save(obj=model.state_dict(), f="best_model.pth")

In [121]:
# Each per-class metric cell is a string such as "[0.1, 0.2, ...]": strip the
# brackets, split on ", " and turn it into a list of floats rounded to 5 decimals.
for metric_column in ("f1_score", "recall", "precision"):
	results_third_ft[metric_column] = [
		[round(float(token), 5) for token in cell[1:-1].split(", ")]
		for cell in results_third_ft[metric_column]
	]

## Defining fourth hyperparameters space for regression task

In [15]:
# Search space for the regression task: every key maps to the list of
# candidate values tried for that hyperparameter.
regression_hyperparams = dict(
	num_epochs=[500],
	n_bad_epochs=[3],
	num_hidden_layers=[3, 5, 7, 10],
	hidden_size=[64, 128, 256],
	batch_size=[16, 64, 256, 512],
	af_first_layer=[nn.Tanh(), nn.LeakyReLU()],
	af_hidden_layers=[nn.LeakyReLU()],
	af_output_layer=[None],
	loss_function=[nn.MSELoss()],
	dropout=[0, 0.5],
	batch_norm=[False, True],
	learning_rate=[0.01, 1e-5],
	optimizer=["torch.optim.Adam"],
	weight_decay=[1e-4],
)

In [29]:
set_reproducibility()


def MSE(metrics_per_class):
	"""Return the spread of per-class metric values around their mean.

	NOTE: despite the name, this is the square root of the *sum* of squared
	deviations from the mean (there is no division by the number of values).

	Args:
		metrics_per_class: sequence holding one metric value per class.

	Returns:
		The square root of the summed squared deviations from the mean.
	"""
	values = np.asarray(metrics_per_class, dtype=float)
	return np.sqrt(np.sum(np.square(values - np.mean(values))))


# The training counter starts from zero for the first set of configurations,
# otherwise from len(configs_set0) * idx_set.
if config_set == first_configs:
	nr_train = 0
else:
	nr_train = len(configs_set0) * idx_set

columns = ["nr_train"] + list(first_configs[0].keys()) + ["epoch_stopped", "loss", "accuracy", "precision", "precision_total", "recall", "recall_total", "f1_score", "f1_score_total", "support"]
results_regression_ft = pd.DataFrame(columns=columns)
# Rows are collected in a plain list and the frame is rebuilt from it, because
# DataFrame.append no longer exists in pandas >= 2.0.
result_rows = []

# Loop-invariant objects: the same splits are used for every configuration.
train_subset = Subset(dataset, train_idx)
val_subset = Subset(dataset, val_idx)
test_subset = Subset(dataset, test_idx)
n_train_samples = len(train_subset)
n_features = dataset.X.shape[1]
cardinality_training_set = len(X_train)

for config_params in config_set:
	nr_train += 1
	print(f"{nr_train}° training with params:")
	pprint(config_params)

	list_params_config = [str(value) for value in config_params.values()]
	name_run = '__'.join(list_params_config)
	log_dir = os.path.join('tensorboard_logs', f"{idx_set}_out_of_{nr_sets - 1}", 'Train_' + str(nr_train), name_run)
	with SummaryWriter(log_dir=log_dir) as tb:
		train_loader = DataLoader(train_subset, batch_size=config_params['batch_size'], shuffle=False, sampler=sampler_class_frequency, drop_last=True)
		val_loader = DataLoader(val_subset, batch_size=1, shuffle=False, drop_last=True)
		test_loader = DataLoader(test_subset, batch_size=1, shuffle=False, drop_last=True)

		model = Feedforward(
			n_features,
			config_params['hidden_size'],
			dataset.num_classes,
			config_params['af_first_layer'],
			config_params['af_hidden_layers'],
			config_params['af_output_layer'],
			config_params['num_hidden_layers'],
			config_params['dropout'],
			config_params['batch_norm'])

		model.to(device)
		input_model = dataset.X[train_idx][:config_params['batch_size']].to(device)
		tb.add_graph(model, input_model)

		# The shape is derived from the data instead of the former hard-coded
		# 35850 training rows and 1149 features; try verbose=2 to also show
		# weights and biases.
		summary(
			model,
			input_size=(config_params['batch_size'], n_train_samples // config_params['batch_size'], n_features),
			col_names=["input_size", "output_size", "num_params"],
			verbose=1)

		loss_func = config_params['loss_function']

		# Resolve names such as "torch.optim.Adam" with an attribute lookup
		# rather than eval().
		optimizer_class = getattr(torch.optim, config_params['optimizer'].rsplit('.', 1)[-1])
		# NOTE(review): config_params['weight_decay'] is recorded in the results
		# but is not passed to the optimizer here — confirm whether it should be.
		optim = optimizer_class(model.parameters(), lr=config_params['learning_rate'])

		model, loss_values, epoch_stopped, loss_value_last_epoch, accuracy_last_epoch = train_model(model, loss_func, optim, train_loader, config_params['num_epochs'], config_params['n_bad_epochs'], device, tb, cardinality_training_set)

		print(f"Loss: {loss_value_last_epoch}", end="\n\n")

		report = test_model(model, val_loader, device, True)
		# Presumably `report` is a classification_report-style dict whose three
		# non-class entries are accuracy, macro avg and weighted avg — TODO confirm.
		index_classes = len(report) - 3

		f1_score = [float(report[str(i)]['f1-score']) for i in range(index_classes)]
		f1_score_total = np.sum(f1_score)
		f1_score_mse = MSE(f1_score)

		precision = [float(report[str(i)]['precision']) for i in range(index_classes)]
		precision_total = np.sum(precision)
		precision_mse = MSE(precision)

		recall = [float(report[str(i)]['recall']) for i in range(index_classes)]
		recall_total = np.sum(recall)
		recall_mse = MSE(recall)

		support = [int(report[str(i)]['support']) for i in range(index_classes)]
		accuracy = report['accuracy']

		row_values = [nr_train] + list_params_config + [epoch_stopped, loss_value_last_epoch, accuracy, precision, precision_total, recall, recall_total, f1_score, f1_score_total, support]
		result_rows.append(row_values)
		# Rebuilt after every training so partial results survive an interruption.
		results_regression_ft = pd.DataFrame(result_rows, columns=columns)

		dict_params_config = dict(zip(config_params, list_params_config))
		tb.add_hparams(hparam_dict=dict_params_config, metric_dict={"Accuracy every epoch": None, "Loss every epoch": None})
		# No explicit flush()/close(): leaving the with-block closes the writer.
	del model, optim, train_loader, val_loader

NameError: name 'config_set' is not defined

In [17]:
# Persist the results of the regression fine tuning (row index not written).
results_regression_ft.to_csv(path_or_buf="tuning_hyperparams/results_regression_ft.csv", index=False)

In [145]:
# Reload the stored results of the regression fine tuning.
results_regression_ft = pd.read_csv(filepath_or_buffer="tuning_hyperparams/results_regression_ft.csv")

In [None]:
def _spread_around_mean(values):
	"""Return the square root of the summed squared deviations of `values` from their mean."""
	values = np.asarray(values, dtype=float)
	return np.sqrt(np.sum(np.square(values - np.mean(values))))


# The original draft of this line was not valid Python (`[1:-1]` had no operand).
# NOTE(review): it iterated `results.f1_score` while filling 'mse_precision';
# here the precision column of `new_results` is used instead, assuming each cell
# is a string such as "[0.1, 0.2, ...]" — confirm this is the intended source.
new_results['mse_precision'] = [
	_spread_around_mean([float(token) for token in cell[1:-1].split(", ")])
	for cell in new_results.precision
]

In [21]:
# For every stored precision cell (a string such as "[0.1, 0.2, ...]"), compute
# the square root of the summed squared deviations of its values from their mean.
mse_list = []
for raw_precision in new_results.precision:
	per_class = [float(token) for token in raw_precision[1:-1].split(", ")]
	center = np.mean(per_class)
	squared_deviations = (np.square(value - center) for value in per_class)
	mse_list.append(np.sqrt(sum(squared_deviations)))

## Testing with best hyperparams

In [None]:
# Run `test_model` on the current model using the test loader.
test_model(model, test_loader, device)

# Save model

In [None]:
# Store only the learned parameters (state dict) of the current model.
torch.save(obj=model.state_dict(), f="best_model.pth")

## Predicting

In [None]:
def predict(row, model):
    """Run ``model`` on a single sample and return its output as a NumPy array.

    The sample is wrapped into a one-element batch and moved to the device of
    the model's parameters, so the call also works when the model lives on the
    GPU. Inference runs in evaluation mode and without gradient tracking; the
    model's previous training/evaluation mode is restored afterwards.

    Args:
        row: One sample as a flat sequence of numbers (list, NumPy array or
            1-D tensor).
        model: The model to query. Usually a ``torch.nn.Module``; any callable
            accepting a ``(1, n_features)`` float tensor is also supported.

    Returns:
        numpy.ndarray: The model output for the one-sample batch.
    """
    inputs = torch.as_tensor(row, dtype=torch.float32).unsqueeze(0)

    # Plain callables have no parameters or modes to manage.
    if not isinstance(model, torch.nn.Module):
        with torch.no_grad():
            return model(inputs).detach().cpu().numpy()

    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = inputs.to(first_param.device)

    # Evaluation mode disables dropout and lets batch normalization use its
    # running statistics, which is required for a batch of one sample.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            yhat = model(inputs)
    finally:
        model.train(was_training)

    # .cpu() is needed before .numpy() when the output lives on the GPU.
    return yhat.detach().cpu().numpy()

## Saving model