# Run experiment multi output regression


---


This notebook runs the experiment that trains a multi-output regression neural network to predict the size of the cell parameters of different crystal structures. Users load the crystal structure dataset, inspect it, and select which columns are used as input features and which are the response variables.

After that, the dataset will be split into 80% for training and 20% for testing. Repeated K-fold cross-validation will be used with *k* and *n* both equal to 10. Hyperparameter optimization will also be used.

## Import library

In [None]:
pip install keras-tuner --upgrade

In [None]:
import os

# Data Manipulation
import numpy as np
import pandas as pd
from scipy.stats import reciprocal

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# TensorFlow / Keras 
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError, MeanAbsolutePercentageError 
import kerastuner as kt
import tensorflow as tf

# Scikit-learn
from sklearn.model_selection import train_test_split

In [None]:
import warnings 
# Suppress every warning issued from this point on, to keep the cell outputs
# readable. Note that this also hides deprecation warnings.
warnings.filterwarnings('ignore')

## Load the Data


---


The user must enter the path of the directory in which the dataset is stored. We then check that the directory exists and that it is not empty.

### *Check directory and files*

In [None]:
def read_dataset(path_name=None, file_name=None, base_dir='/content/'):
    """Load the crystal structure dataset from a ';'-separated CSV file.

    Args:
        path_name: Directory holding the dataset, relative to ``base_dir``.
            When ``None`` the user is prompted for it.
        file_name: Name of the CSV file without the ``.csv`` extension.
            When ``None`` the user is prompted for it (only after the
            directory has been validated).
        base_dir: Root directory that ``path_name`` is resolved against.

    Returns:
        A DataFrame indexed by the ``ID_Observations`` column, or the empty
        string ``''`` when the directory is invalid or empty, or when the
        CSV file does not exist. A message is printed in each failure case.
    """
    dataset = ''

    if path_name is None:
        path_name = input('Enter the path name for dataset: ')
    # Strip leading separators so the entered path always stays under base_dir.
    path_name = os.path.join(base_dir, path_name.lstrip('/'))

    # isdir (rather than exists) so that a regular file is rejected here
    # instead of making os.listdir raise NotADirectoryError below.
    if not os.path.isdir(path_name):
        print('Error! Invalid path selected.')
        return dataset
    print(path_name + ' is a valid path.')

    if not os.listdir(path_name):
        print("Warning! Empty directory.")
        return dataset

    if file_name is None:
        file_name = input('Enter the file name for dataset: ')
    csv_path = os.path.join(path_name, file_name + '.csv')

    # Report a missing file the same way as the other failures instead of
    # letting pandas raise FileNotFoundError.
    if not os.path.isfile(csv_path):
        print('Error! File not found: ' + csv_path)
        return dataset

    return pd.read_csv(csv_path, sep=';', index_col='ID_Observations')

### *Load the dataset*

In [None]:
# Interactively load the dataset. read_dataset() returns '' (not a DataFrame)
# when the entered path is invalid or the directory is empty.
y_coord_dataset = read_dataset()

In [None]:
# Summarise the loaded dataset: dimensions, per-column dtypes, then the table
# itself (rendered by the notebook as the cell's last expression).
print(f'y_coord_dataset shape: {y_coord_dataset.shape}')
print(f'\n data types: \n{y_coord_dataset.dtypes}')
print('\ny_coord_dataset content: \n')
y_coord_dataset

## Insert details on dataset features


---


Users have to specify which variables will be used as features and which will be response features.

In [None]:
def _ask_yes_no(question):
    """Prompt with *question* until the answer is 'y' or 'n' (any case).

    A separator line is printed before every prompt, including the re-prompts
    that follow an invalid answer. Returns True for 'y' and False for 'n'.
    """
    while True:
        print('----------------------------------------------------------------------------')
        answer = input(question).lower()
        if answer == 'y':
            return True
        if answer == 'n':
            return False


def insert_dataset_structure_details(final_dataset):
    """Interactively choose the input features and the response variables.

    The user is asked, in order, whether ``Volume``, ``Total_n_peaks`` and
    ``Max_peaks_position`` are used as features and whether ``a``, ``b`` and
    ``c`` are response variables. Every column answered with "no" is removed,
    and ``Crystal_Structure_Type`` is always removed.

    Args:
        final_dataset: DataFrame containing the columns listed above. It is
            modified in place; pass a copy to keep the original intact.

    Returns:
        A tuple ``(final_dataset, response_features_name)`` where the first
        item is the same (now reduced) DataFrame object and the second is the
        list of selected response columns, in ``a``, ``b``, ``c`` order.

    Raises:
        KeyError: If a column that has to be dropped is not in the DataFrame.
    """
    print('============================================================================')
    print('Insert dataset structure details')

    use_volume = _ask_yes_no(
        '\n1) Do you want to use volume as a feature in the experiment? [Y|N]: ')
    use_total_n_peaks = _ask_yes_no(
        '\n2) Do you want to use total_n_peaks as a feature in the experiment? [Y|N]: ')
    use_max_peaks = _ask_yes_no(
        '\n3) Do you want to use max_peaks as a feature in the experiment? [Y|N]: ')

    # Questions 4-6 share one template; only the number and the name change.
    is_response = {}
    for number, name in enumerate(('a', 'b', 'c'), start=4):
        is_response[name] = _ask_yes_no(
            f'\n{number}) "{name}" is a response feature ? [Y|N]: ')

    print('============================================================================')

    columns_to_drop = []
    if not use_total_n_peaks:
        columns_to_drop.append('Total_n_peaks')
    if not use_max_peaks:
        columns_to_drop.append('Max_peaks_position')
    if not use_volume:
        columns_to_drop.append('Volume')

    response_features_name = [name for name, chosen in is_response.items() if chosen]
    columns_to_drop.extend(name for name, chosen in is_response.items() if not chosen)

    # The structure type is never used as a feature or as a response.
    columns_to_drop.append('Crystal_Structure_Type')

    final_dataset.drop(columns=columns_to_drop, inplace=True)

    return final_dataset, response_features_name

In [None]:
# Ask the user for the dataset structure on a copy, so the dataset loaded
# above keeps all of its original columns.
(final_dataset_for_experiment,
 response_features_name) = insert_dataset_structure_details(y_coord_dataset.copy())
print(f'The response features for this crystal structure are: {response_features_name}')
print('The final dataset with features is:\n')
final_dataset_for_experiment

## Splitting data for regression task

### *Separating Input Features and Output Features*

In [None]:
# Every column that is not a response variable is an input feature.
# Index.difference returns the remaining labels in sorted order, so the
# columns of x are sorted by name.
x = final_dataset_for_experiment.loc[
    :, final_dataset_for_experiment.columns.difference(response_features_name)]
print(f'The dataset has the following number of input features: {x.shape[1]:d}')
print('Input features to be used for training are: ')
x

In [None]:
# The response variables selected by the user are the regression targets.
y = final_dataset_for_experiment.loc[:, response_features_name]
print(f'The dataset has the following number of output features: {y.shape[1]:d}')
print('Output features are:')
y

### *Splitting the dataset into training set and test set*
We use 80% of the original dataset as training dataset and the remaining 20% to test the model.

**random_state is set for reproducibility**
**to be removed**

In [None]:
# Hold out 20% of the observations as the test set; the fixed random_state
# makes the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [None]:
# Report how many observations ended up in each split.
print('Training/Test dataset shape')
print('------------------------------------------------------------------------')
print(f'- The training set has the following number of observations: {len(y_train):d}')
print(f'- The test set has the following number of observations: {len(y_test):d}')

## Training multi-output regression neural network model

---

After splitting the data into training and testing sets, it's time to train our neural network model. 

***Keras models accept three types of inputs:***
* NumPy arrays, just like Scikit-Learn and many other Python-based libraries. This is a good option if your data fits in memory.
* TensorFlow Dataset objects. This is a high-performance option that is more suitable for datasets that do not fit in memory and that are streamed from disk or from a distributed filesystem.
* Python generators that yield batches of data (such as custom subclasses of the keras.utils.Sequence class).

### *Define the neural network architectures and search space for hyperparameters*

In [None]:
def build_model(hp, n_outputs=None):
    """Build and compile a tunable fully-connected regression network.

    The search space covers the number of hidden layers (1-10), the width
    (50-500 units, step 25) and activation ('relu' or 'tanh') of each hidden
    layer, and the Adam learning rate (1e-4 to 1e-2, log-sampled).

    Args:
        hp: Keras Tuner ``HyperParameters`` object used to sample the values.
        n_outputs: Number of units in the linear output layer, i.e. the
            number of response variables. When ``None`` it defaults to the
            length of the notebook-level ``response_features_name`` list, or
            to 1 if that list is undefined or empty.

    Returns:
        The compiled ``Sequential`` model (MSE loss, RMSE metric).
    """
    if n_outputs is None:
        # Keras Tuner calls this function with `hp` only, so the output width
        # is taken from the response columns selected earlier in the notebook.
        n_outputs = len(globals().get('response_features_name', [])) or 1

    model = Sequential()

    # Optimize the number of hidden layers, and the size/activation of each.
    for i in range(hp.Int('num_layers', 1, 10)):
        model.add(Dense(
            units=hp.Int(f'units_{i}', min_value=50, max_value=500, step=25),
            activation=hp.Choice(f'activation_{i}', ['relu', 'tanh']),
        ))

    # Output layer: no activation (linear), one unit per response variable so
    # that the prediction shape matches the multi-column target.
    model.add(Dense(n_outputs))

    # Define the optimizer learning rate as a hyperparameter.
    learning_rate = hp.Float('lr', min_value=1e-4, max_value=1e-2, sampling='log')
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mean_squared_error',
        metrics=[RootMeanSquaredError()],
    )
    return model

In [None]:
def plot_learning_curves(history):
    """Plot the learning curves stored in a training history.

    Args:
        history: Object whose ``history`` attribute can be turned into a
            DataFrame; each resulting column is drawn as one curve.
    """
    curves = pd.DataFrame(history.history)
    curves.plot(figsize=(8, 5))
    plt.grid(True)
    # To clamp the vertical range to [0, 1], call plt.gca().set_ylim(0, 1) here.
    plt.show()

In [None]:
# Smoke test: build the model once with a fresh HyperParameters container to
# check that build_model runs before handing it to the tuner.
build_model(kt.HyperParameters())

In [None]:
# Random search over the search space defined in build_model, selecting the
# configuration with the lowest validation loss.
tuner = kt.RandomSearch(
    hypermodel=build_model,
    objective=kt.Objective('val_loss', direction='min'),
    # Number of hyperparameter configurations to try.
    max_trials=2,
    # Each configuration is trained this many times to reduce result variance.
    executions_per_trial=5,
    # Discard results from a previous run stored under the same directory/project.
    overwrite=True,
    directory='neural-network-opt-2',
    project_name='cubic-opt-neural-network'
)

In [None]:
# Print the hyperparameter search space registered with the tuner.
tuner.search_space_summary()

In [None]:
# Hold out part of the training data for model selection and early stopping.
# Tuning against (x_test, y_test) would leak the test set into the choice of
# hyperparameters and make the final evaluation on it optimistic.
x_tune, x_val, y_tune, y_val = train_test_split(
    x_train, y_train, test_size=0.20, random_state=42)

tuner.search(x_tune, y_tune,
             epochs=10,
             validation_data=(x_val, y_val),
             callbacks=[tf.keras.callbacks.EarlyStopping(
                 monitor='val_loss', patience=5)])

In [None]:
# Print a summary of the search results.
tuner.results_summary()

In [None]:
# Display the values of the best hyperparameter set found by the search.
(tuner.get_best_hyperparameters()[0].values)

In [None]:
# Retrieve the best model found during the search.
# NOTE(review): best_model is reassigned in the following cell, so this value
# is not used afterwards.
best_model = tuner.get_best_models(num_models=1)[0]

In [None]:
# Build a fresh model from the best hyperparameters and train it on the
# whole training set for 100 epochs.
best_model = tuner.hypermodel.build(tuner.get_best_hyperparameters()[0])
best_model.fit(x_train, y_train, batch_size=32, epochs=100, initial_epoch=0)

In [None]:
# Show the architecture of the retrained model, then evaluate it on the
# held-out test set.
best_model.summary()
best_model.evaluate(x_test,y_test)