# Run experiment multi output regression


---


The aim of this script is to run the experiment that trains a multi-output regression neural network to predict the size of the cell parameters of different crystal structures. Users have to load the crystal structure dataset, read it, and select which columns are the features used to train the model and which are the response variables.

After that, the dataset is split into 80% for training and 20% for testing. During hyperparameter optimization, the training dataset is split again into 75% for training and 25% for validation. Finally, the best hyperparameters are used to fit the neural network again on the full training dataset.

## Import library

In [3]:
pip install keras-tuner --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import os

# Data Manipulation
import numpy as np
import pandas as pd
from scipy.stats import reciprocal

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# TensorFlow / Keras 
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError, MeanAbsolutePercentageError 
import keras_tuner as kt
import tensorflow as tf

# Scikit-learn
from sklearn.model_selection import train_test_split

In [None]:
import warnings 
# Suppress every warning for the rest of the session to keep cell outputs readable.
warnings.filterwarnings('ignore')

## Load the Data


---


The user must enter the path of the directory in which the dataset is stored (relative to `/content/`). We then check that the directory exists and that it is not empty.

### *Check directory and files*

In [None]:
def read_dataset():
  """Interactively load the dataset CSV from a directory under /content/.

  The user is asked for a directory name (appended to '/content/') and, if
  that directory exists and is not empty, for a file name (without the
  '.csv' extension). The file is read with ';' as separator and the
  'ID_Observations' column as index.

  Returns:
    The loaded pandas DataFrame, or an empty string when the path does not
    exist or the directory is empty.
  """
  directory = '/content/' + input('Enter the path name for dataset: ')

  # Guard clause: nothing to load from a path that does not exist.
  if not os.path.exists(directory):
    print('Error! Invalid path selected.')
    return ''

  print(f'{directory} is a valid path.')

  # Guard clause: an empty directory cannot contain the dataset file.
  if not os.listdir(directory):
    print("Warning! Empty directory.")
    return ''

  csv_name = input('Enter the file name for dataset: ')
  return pd.read_csv(f'{directory}/{csv_name}.csv', sep=';', index_col='ID_Observations')

### *Load the dataset*

In [None]:
# Interactively load the dataset; read_dataset returns '' when the path is
# invalid or the directory is empty, so check the prompts' output before going on.
y_coord_dataset = read_dataset()

In [None]:
# Quick overview of the loaded dataset: shape, column dtypes and content.
print(f'y_coord_dataset shape: {y_coord_dataset.shape}')
print(f'\n data types: \n{y_coord_dataset.dtypes}')
print('\ny_coord_dataset content: \n')
y_coord_dataset

## Insert details on dataset features


---


Users have to specify which variables will be used as features and which will be response features.

In [None]:
def _ask_yes_no(question):
    """Ask `question` repeatedly until the user answers 'y' or 'n'.

    A separator line is printed before every attempt. The answer is compared
    case-insensitively and surrounding whitespace is ignored.

    Returns:
        True for 'y', False for 'n'.
    """
    while True:
        print('----------------------------------------------------------------------------')
        answer = input(question).strip().lower()
        if answer == 'y':
            return True
        if answer == 'n':
            return False


def insert_dataset_structure_details(final_dataset):
    """Interactively select the input features and response features.

    The user answers six yes/no questions: whether 'Volume', 'Total_n_peaks'
    and 'Max_peaks_position' are used as input features, and whether 'a', 'b'
    and 'c' are response features. Every column answered with "no" is dropped,
    and 'Crystal_Structure_Type' is always dropped.

    Args:
        final_dataset: pandas DataFrame containing all the columns named
            above. It is modified in place, so pass a copy to keep the
            original intact.

    Returns:
        A tuple (final_dataset, response_features_name): the same DataFrame
        object after the drops, and the list of selected response columns
        (a subset of ['a', 'b', 'c'], in that order).

    Raises:
        KeyError: if a column that has to be dropped is not in the DataFrame.
    """
    print('============================================================================')
    print('Insert dataset structure details')

    use_volume = _ask_yes_no('\n1) Do you want to use volume as a feature in the experiment? [Y|N]: ')
    use_total_n_peaks = _ask_yes_no('\n2) Do you want to use total_n_peaks as a feature in the experiment? [Y|N]: ')
    use_max_peaks = _ask_yes_no('\n3) Do you want to use max_peaks as a feature in the experiment? [Y|N]: ')
    a_is_response = _ask_yes_no('\n4) "a" is a response feature ? [Y|N]: ')
    b_is_response = _ask_yes_no('\n5) "b" is a response feature ? [Y|N]: ')
    c_is_response = _ask_yes_no('\n6) "c" is a response feature ? [Y|N]: ')

    print('============================================================================')

    # Optional input features: drop the ones the user decided not to use.
    optional_features = (
        ('Total_n_peaks', use_total_n_peaks),
        ('Max_peaks_position', use_max_peaks),
        ('Volume', use_volume),
    )
    for column, keep in optional_features:
        if not keep:
            final_dataset.drop(column, axis=1, inplace=True)

    # Candidate responses: keep and record the selected ones, drop the others.
    response_features_name = []
    candidate_responses = (
        ('a', a_is_response),
        ('b', b_is_response),
        ('c', c_is_response),
    )
    for column, is_response in candidate_responses:
        if is_response:
            response_features_name.append(column)
        else:
            final_dataset.drop(column, axis=1, inplace=True)

    # The structure type is never used, neither as a feature nor as a response.
    final_dataset.drop('Crystal_Structure_Type', axis=1, inplace=True)

    return final_dataset, response_features_name

In [None]:
# Work on a copy so the in-place column drops do not alter the loaded dataset.
final_dataset_for_experiment, response_features_name = insert_dataset_structure_details(
    y_coord_dataset.copy()
)
print(f'The response features for this crystal structure are: {response_features_name}')
print('The final dataset with features is:\n')
final_dataset_for_experiment

## Splitting data for regression task

### *Separating Input Features and Output Features*

In [None]:
# Input features are all the columns that were not selected as responses.
x = final_dataset_for_experiment[
    final_dataset_for_experiment.columns.difference(response_features_name)
]
print(f'The dataset has the following number of input features: {x.shape[1]:d}')
print('Input features to be used for training are: ')
x

In [None]:
# Output (response) features, in the order in which they were selected.
y = final_dataset_for_experiment[response_features_name]
print(f'The dataset has the following number of output features: {y.shape[1]:d}')
print('Output features are:')
y

### *Splitting the dataset into training set, validation set and test set*
We use 80% of the original dataset as the training dataset and the remaining 20% to test the model.
The training set (80%) is split again into a training dataset (75%) and a validation dataset (25%). 

**`random_state` is set for reproducibility**
**to be removed**

In [None]:
# Hold out 20% of the observations as the test set; the seed is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [None]:
# Split the remaining 80% into 75% training / 25% validation (60% / 20% of the whole dataset).
# x_train and y_train are overwritten and from here on hold only the reduced training split.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=8)

In [None]:
# Report how many observations ended up in each split.
print('Training-Validation-Test dataset shape')
print('------------------------------------------------------------------------')
print(f'- The training set has the following number of observations: {y_train.shape[0]:d}')
print(f'- The validation set has the following number of observations: {y_val.shape[0]:d}')
print(f'- The test set has the following number of observations: {y_test.shape[0]:d}')

## Training multi-output regression neural network model

---

After splitting the data into training and testing sets, it's time to train our neural network model. 

***Keras models accept three types of inputs:***
* NumPy arrays, just like Scikit-Learn and many other Python-based libraries. This is a good option if your data fits in memory.
* TensorFlow Dataset objects. This is a high-performance option that is more suitable for datasets that do not fit in memory and that are streamed from disk or from a distributed filesystem.
* Python generators that yield batches of data (such as custom subclasses of the keras.utils.Sequence class).

### *Define the neural network architectures and search space for hyperparameters*

In [None]:
def build_model(hp, n_outputs=None):
  """Build and compile the tunable multi-output regression network.

  The search space covers the number of hidden layers (1 to 5) and, for each
  hidden layer, its width (50 to 500 units, step 25), its activation
  ('relu' or 'tanh') and the rate of the dropout layer that follows it
  (0.0 to 0.9), plus the Adam learning rate (1e-4 to 1e-2, log-sampled).

  Args:
    hp: keras_tuner HyperParameters object used to sample the values.
    n_outputs: number of units of the output layer. When None (the default,
      which is how the tuner calls this function) it is taken from the
      notebook-level `response_features_name` list, so the network emits one
      value per selected response feature.

  Returns:
    The compiled Sequential model.
  """
  if n_outputs is None:
    n_outputs = len(response_features_name)

  model = Sequential()

  # Optimize the number of hidden layers; every hidden layer is followed by
  # its own dropout layer, matching the per-layer 'drop_rate_layer_{i}' name.
  for i in range(hp.Int('num_layers', 1, 5)):
    model.add(Dense(units=hp.Int(f'units_{i}', min_value=50, max_value=500, step=25),
                    activation=hp.Choice(f'activation_{i}', ['relu', 'tanh'])))
    model.add(Dropout(rate=hp.Choice(f'drop_rate_layer_{i}',
                                     [0.0, 0.1, 0.2, 0.3, 0.4,
                                      0.5, 0.6, 0.7, 0.8, 0.9])))

  # Output layer: no activation function (linear), one neuron per response
  # feature so that the predictions have the same shape as the targets.
  model.add(Dense(n_outputs))

  # Define the optimizer learning rate as a hyperparameter.
  learning_rate = hp.Float('lr', min_value=1e-4, max_value=1e-2, sampling='log')
  model.compile(
      optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
      loss='mean_squared_error',
      metrics=[RootMeanSquaredError(), MeanAbsoluteError(), MeanAbsolutePercentageError()],
      )
  return model

In [None]:
# Smoke test: build the model once with a fresh HyperParameters object to check that it builds and compiles.
build_model(kt.HyperParameters())

In [None]:
# Random search over the space defined in build_model, minimising the RMSE
# measured on the validation data.
tuner = kt.RandomSearch(
    hypermodel=build_model,
    objective=kt.Objective('val_root_mean_squared_error', direction='min'),
    # 5 hyperparameter combinations are tried, each one trained 5 times.
    max_trials=5,
    executions_per_trial=5,
    # Start from scratch instead of resuming results already stored in `directory`.
    overwrite=True,
    distribution_strategy=tf.distribute.MirroredStrategy(),
    directory='neural-network-opt-2',
    project_name='cubic-opt-neural-network'
)

In [None]:
# Print the hyperparameters registered by build_model and their ranges.
tuner.search_space_summary()

In [None]:
# Run the hyperparameter search: each execution trains for 100 epochs on the
# training split and is scored on the validation split.
tuner.search(x_train, y_train, 
             epochs=100, 
             validation_data=(x_val, y_val)
             )
# Early stopping is currently disabled. To enable it, pass this to tuner.search:
#   callbacks=[tf.keras.callbacks.EarlyStopping(
#       monitor='val_root_mean_squared_error', patience=5)]

In [None]:
# Print the trials ranked by the search objective.
tuner.results_summary()

In [None]:
# Hyperparameter values of the best trial, as a plain dict.
tuner.get_best_hyperparameters()[0].values

In [None]:
# Best model found during the search, with the weights it had at the end of its trial.
# NOTE(review): `best_model` is reassigned by the retraining cell that follows.
best_model = tuner.get_best_models(num_models=1)[0]

In [None]:
# Rebuild a fresh, untrained model from the best hyperparameters and retrain
# it on the full training dataset. After the hyperparameter search the
# validation split is no longer needed for model selection, so it is merged
# back with the training split (x_train alone is only 60% of the data).
best_model = tuner.hypermodel.build(tuner.get_best_hyperparameters()[0])
x_train_full = pd.concat([x_train, x_val])
y_train_full = pd.concat([y_train, y_val])
best_model.fit(x_train_full, y_train_full, batch_size=32, epochs=100, initial_epoch=0)

In [None]:
# Show the architecture of the retrained model, then compute its loss and
# metrics on the held-out test set.
best_model.summary()
best_model.evaluate(x_test,y_test)

## Plot regression results

### *Perfect fit plot*
The *perfect fit plot* displays a straight black line, on which the predicted values are equal to the real observed values, and blue points, which represent the observations. The real observed values are plotted against the predicted values.

In [5]:
import random
# Synthetic data used to try out the plotting functions: random "observed"
# values (mean 1, std 0.125) and independent random "predicted" values
# (mean 1, std 0.264). The same two arrays are reused for a, b and c.
# NOTE(review): df is not built from best_model's predictions - presumably a placeholder.
trX = 0.5 * np.random.randn(1500) * 0.25 +1
trY = 0.8* np.random.randn(*trX.shape) * 0.33 +1
# Columns are laid out as consecutive (observed, predicted) pairs, one pair per output.
df = pd.DataFrame({'Obs_a':trX,'pred_a': trY,'Obs_b':trX,'pred_b': trY,'Obs_c':trX,'pred_c': trY})
df

Unnamed: 0,Obs_a,pred_a,Obs_b,pred_b,Obs_c,pred_c
0,0.996720,1.416517,0.996720,1.416517,0.996720,1.416517
1,0.809764,0.979116,0.809764,0.979116,0.809764,0.979116
2,0.999454,0.611874,0.999454,0.611874,0.999454,0.611874
3,1.339207,1.154983,1.339207,1.154983,1.339207,1.154983
4,1.140176,0.536163,1.140176,0.536163,1.140176,0.536163
...,...,...,...,...,...,...
1495,1.200987,1.209838,1.200987,1.209838,1.200987,1.209838
1496,1.048898,0.753572,1.048898,0.753572,1.048898,0.753572
1497,1.105199,0.732943,1.105199,0.732943,1.105199,0.732943
1498,0.892052,0.714583,0.892052,0.714583,0.892052,0.714583


In [None]:
def plot_perfect_fit(df, output_names, title_fig):
  """Plot predicted against true values, one subplot per output.

  Each subplot shows the observations as blue markers and the black
  "perfect prediction" line, on which predicted and true values coincide.

  Args:
    df: DataFrame whose columns are consecutive (observed, predicted) pairs,
      one pair per output.
    output_names: subplot titles, one per output, in column order.
    title_fig: title of the whole figure.
  """
  n_output = df.shape[1]

  fig = make_subplots(
      rows=1, cols=n_output // 2,
      subplot_titles=[f'<b>{name_output}</b>' for name_output in output_names],
  )

  # Common limits for both axes of every subplot; the lower limit is never above 0.
  max_lim_axis = round(df.max(axis=0).max() + 1)
  min_lim_axis = min(round(df.min(axis=0).min()), 0)

  # A straight line only needs its two end points. Spanning the whole axis
  # range keeps it visible for negative values too, and it no longer depends
  # on the axis limit being usable as a number of points.
  fit_point = [min_lim_axis, max_lim_axis]

  for j, i in enumerate(range(0, n_output, 2), start=1):
    obs = df.iloc[:, i].to_numpy()
    pred = df.iloc[:, i+1].to_numpy()

    # Only the first subplot contributes legend entries, to avoid duplicates.
    in_legend = j == 1

    fig.add_trace(go.Scatter(x=obs, y=pred, mode='markers', marker_color='#1F77B4',
                             marker_size=8, name='Observations',
                             showlegend=in_legend), row=1, col=j)

    fig.add_trace(go.Scatter(x=fit_point, y=fit_point, mode='lines',
                             line={'color': 'black'}, name='Perfect prediction',
                             showlegend=in_legend), row=1, col=j)

    fig.layout.annotations[j-1].update(font=dict(size=20))

  # Axis styling is shared by all subplots, so it is applied once.
  fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                   showline=True, linewidth=2, linecolor='black', mirror=True,
                   title_text='True response', range=[min_lim_axis, max_lim_axis])

  fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                   showline=True, linewidth=2, linecolor='black', mirror=True,
                   title_text='Predicted response', range=[min_lim_axis, max_lim_axis])

  fig.update_layout(
      template='simple_white',
      width=1780,
      height=650,
      title_text='<b>'+title_fig+'</b>',
      title_x=0.5,
      font=dict(size=16),
      legend_tracegroupgap=360,
      legend=dict(
          font=dict(size=16),
          bordercolor="Black",
          borderwidth=2
          )
  )

  fig.show()

In [None]:
# Perfect fit plot for the three cell parameters.
plot_perfect_fit(df, output_names=['a', 'b', 'c'], title_fig='Cube')

### *Residual bar plot*
The *residual bar plot* displays blue points, which represent the real observations, orange points, which represent the predicted values, and red bars, which represent the residuals between the real and predicted values.

In [None]:
def plot_residual_bar(df, output_names, title_fig):
  """Plot true values, predicted values and their residuals, one subplot per output.

  For every output the records are sorted by true value; each record is drawn
  as a blue marker (true), an orange marker (predicted) and a red vertical
  bar joining the two (the residual).

  Args:
    df: DataFrame whose columns are consecutive (observed, predicted) pairs,
      one pair per output.
    output_names: subplot titles, one per output, in column order.
    title_fig: title of the whole figure.
  """
  n_observations = df.shape[0]
  n_output = df.shape[1]

  fig = make_subplots(
      rows=1, cols=n_output // 2,
      subplot_titles=[f'<b>{name_output}</b>' for name_output in output_names],
  )

  # Common y limits for every subplot; the lower limit is never above 0.
  max_lim_axis = round(df.max(axis=0).max() + 1)
  min_lim_axis = min(round(df.min(axis=0).min()), 0)

  # Integer record numbers 0..n-1, shared by the markers and the error bars
  # so that each bar is drawn exactly under its two markers.
  index_row = np.arange(n_observations)

  # All the error bars of a subplot go into ONE trace: every record
  # contributes the points (k, obs), (k, pred) and a NaN that breaks the line
  # before the next bar. This avoids adding one trace per observation.
  error_x = np.repeat(index_row, 3)
  bar_breaks = np.full(n_observations, np.nan)

  for j, i in enumerate(range(0, n_output, 2), start=1):
    param_df = df.iloc[:, [i, i+1]]
    param_df = param_df.sort_values(by=param_df.columns[0])

    obs = param_df.iloc[:, 0].to_numpy()
    pred = param_df.iloc[:, 1].to_numpy()

    # Only the first subplot contributes legend entries, to avoid duplicates.
    in_legend = j == 1

    error_y = np.column_stack((obs, pred, bar_breaks)).ravel()
    fig.add_trace(go.Scatter(x=error_x, y=error_y, name="Error", mode='lines',
                             line={'color': '#D62728', 'width': 1},
                             connectgaps=False, showlegend=in_legend), row=1, col=j)

    fig.add_trace(go.Scatter(x=index_row, y=obs, mode='markers', marker_color='#1F77B4',
                             marker_size=8, name='True',
                             showlegend=in_legend), row=1, col=j)

    fig.add_trace(go.Scatter(x=index_row, y=pred, mode='markers', marker_color='#FF7F0E',
                             marker_size=8, name='Predicted',
                             showlegend=in_legend), row=1, col=j)

    fig.layout.annotations[j-1].update(font=dict(size=20))

  # Axis styling is shared by all subplots, so it is applied once.
  fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                   showline=True, linewidth=2, linecolor='black', mirror=True,
                   title_text='Record number', range=[0, n_observations])

  fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                   showline=True, linewidth=2, linecolor='black', mirror=True,
                   title_text='Response', range=[min_lim_axis, max_lim_axis])

  fig.update_layout(
      template='simple_white',
      width=1780,
      height=650,
      title_text='<b>'+title_fig+'</b>',
      title_x=0.5,
      font=dict(size=16),
      legend_tracegroupgap=360,
      legend=dict(
          font=dict(size=16),
          bordercolor="Black",
          borderwidth=2
          )
  )

  fig.show()

In [None]:
# Residual bar plot for the three cell parameters.
plot_residual_bar(df, output_names=['a', 'b', 'c'], title_fig='cube')

### *Compare observations plot*

In [8]:
def plot_compare_observations(df, output_names, title_fig):
  """Plot true and predicted values as two lines, one stacked subplot per output.

  Records are drawn in the order in which they appear in `df`, with the true
  values in blue and the predicted values in orange.

  Args:
    df: DataFrame whose columns are consecutive (observed, predicted) pairs,
      one pair per output.
    output_names: subplot titles, one per output, in column order.
    title_fig: title of the whole figure.
  """
  n_observations = df.shape[0]
  n_output = df.shape[1]

  fig = make_subplots(
      rows=n_output // 2, cols=1,
      subplot_titles=[f'<b>{name_output}</b>' for name_output in output_names],
  )

  # Common y limits for every subplot; the lower limit is never above 0.
  max_lim_axis = round(df.max(axis=0).max() + 1)
  min_lim_axis = min(round(df.min(axis=0).min()), 0)

  # Integer record numbers 0..n-1 for the x axis.
  index_row = np.arange(n_observations)

  for j, i in enumerate(range(0, n_output, 2), start=1):
    obs = df.iloc[:, i].to_numpy()
    pred = df.iloc[:, i+1].to_numpy()

    # Only the first subplot contributes legend entries, to avoid duplicates.
    in_legend = j == 1

    fig.add_trace(go.Scatter(x=index_row, y=obs,
                             mode='lines', line={'color': '#1F77B4', 'width': 1},
                             name='True', showlegend=in_legend), row=j, col=1)

    fig.add_trace(go.Scatter(x=index_row, y=pred,
                             mode='lines', line={'color': '#FF7F0E', 'width': 1},
                             name='Predicted', showlegend=in_legend), row=j, col=1)

    fig.layout.annotations[j-1].update(font=dict(size=20))

  # Axis styling is shared by all subplots, so it is applied once.
  fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                   showline=True, linewidth=2, linecolor='black', mirror=True,
                   title_text='Record number', range=[0, n_observations])

  fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey',
                   showline=True, linewidth=2, linecolor='black', mirror=True,
                   title_text='Response', range=[min_lim_axis, max_lim_axis])

  fig.update_layout(
      template='simple_white',
      width=1780,
      height=1500,
      title_text='<b>'+title_fig+'</b>',
      title_x=0.5,
      font=dict(size=16),
      legend_tracegroupgap=360,
      legend=dict(
          font=dict(size=16),
          bordercolor="Black",
          borderwidth=2
          )
  )

  fig.show()

In [None]:
# True vs. predicted line plot for the three cell parameters.
plot_compare_observations(df, output_names=['a', 'b', 'c'], title_fig='cube')