# Run experiment multi output regression
The aim of this notebook is to run an experiment that trains a multi-output regression neural network to predict the size of the cell parameters of different crystal structures. The user loads the crystal structure dataset, then selects which columns are used as features to train the model and which are the response variables.

After that, the dataset is split into 80% for training and 20% for testing. Repeated K-fold cross-validation will be used, with *k* and *n* both equal to 10. Hyperparameter optimization will also be used.

## Import library

In [1]:
import os

# Data Manipulation
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sn
import matplotlib.pyplot as plt
import plotly.express as px

# Machine Learning
from sklearn.model_selection import train_test_split

In [2]:
# Ignore every warning from here on: filterwarnings('ignore') is called without
# a category or module filter, so it applies to all warnings in the session.
import warnings 
warnings.filterwarnings('ignore')

## Load the Data
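The user must enter the name of the directory in which the dataset is stored (relative to `/content/`) and then the name of the CSV file, without the `.csv` extension.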

In [3]:
def read_dataset(base_dir='/content/'):
  """Interactively locate and load the semicolon-separated dataset CSV.

  The user is prompted first for a directory name, which is resolved
  against ``base_dir``, and then for a file name without the ``.csv``
  extension. The file is read with ``ID_Observations`` as the index column.

  Parameters
  ----------
  base_dir : str, optional
      Directory the entered path is resolved against. Defaults to
      ``'/content/'``. An absolute path typed at the prompt takes
      precedence over ``base_dir``.

  Returns
  -------
  pandas.DataFrame or str
      The loaded dataset, or the empty string ``''`` when the directory is
      missing, is not a directory, is empty, or does not contain the
      requested file. A message describing the problem is printed in each
      of those cases.
  """
  dataset = ''
  path_name = input('Enter the path name for dataset: ')
  path_name = os.path.join(base_dir, path_name)

  # isdir (rather than exists) also rejects a path that points at a plain
  # file, which would otherwise make os.listdir raise NotADirectoryError.
  if not os.path.isdir(path_name):
    print('Error! Invalid path selected.')
    return dataset
  print(path_name + ' is a valid path.')

  if not os.listdir(path_name):
    print("Warning! Empty directory.")
    return dataset

  file_name = input('Enter the file name for dataset: ')
  file_path = os.path.join(path_name, file_name + '.csv')

  # Report a missing file the same way as the other failures instead of
  # letting pd.read_csv raise FileNotFoundError.
  if not os.path.isfile(file_path):
    print('Error! File not found: ' + file_path)
    return dataset

  return pd.read_csv(file_path, sep=';', index_col='ID_Observations')

### *Load the dataset*

In [4]:
# Prompt for the dataset directory and file name and load the CSV.
# read_dataset() returns '' when nothing could be loaded, in which case the
# inspection cells below will fail.
y_coord_dataset = read_dataset()

Enter the path name for dataset: crystal-dataset
/content/crystal-dataset is a valid path.
Enter the file name for dataset: final_dataset_hexagonal


In [5]:
# Report the shape and per-column dtypes of the loaded frame in one call
# (sep='\n' puts each message on its own line), then display the frame
# itself as the cell output.
print(
    'y_coord_dataset shape: {}'.format(y_coord_dataset.shape),
    '\n data types: \n{}'.format(y_coord_dataset.dtypes),
    '\ny_coord_dataset content: \n',
    sep='\n',
)
y_coord_dataset

y_coord_dataset shape: (2465, 17)

 data types: 
x_1                       float64
x_2                       float64
x_3                       float64
x_4                       float64
x_5                       float64
x_6                       float64
x_7                       float64
x_8                       float64
x_9                       float64
x_10                      float64
Total_n_peaks               int64
Max_peaks_position        float64
Volume                    float64
Crystal_Structure_Type     object
a                         float64
b                         float64
c                         float64
dtype: object

y_coord_dataset content: 



Unnamed: 0_level_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,Total_n_peaks,Max_peaks_position,Volume,Crystal_Structure_Type,a,b,c
ID_Observations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1000100,10.96,19.02,22.00,22.42,25.02,29.26,29.58,31.62,33.28,37.16,50,19.02,298.027,Hexagonal,9.3200,9.3200,3.9618
1000115,7.92,13.74,19.52,23.94,27.72,32.10,36.06,36.94,38.76,39.60,25,7.92,714.180,Hexagonal,12.8720,12.8720,4.9772
1000134,20.38,21.10,26.74,27.50,30.40,36.56,36.74,36.88,37.30,41.44,30,27.50,754.378,Hexagonal,10.1760,10.1760,8.4121
1000155,20.64,28.14,33.10,38.40,39.32,41.98,43.98,54.48,56.14,58.20,18,39.32,217.903,Hexagonal,5.4090,5.4090,8.6000
1000156,18.92,20.64,28.14,33.10,38.40,39.32,41.98,43.98,46.44,51.58,21,39.32,217.903,Hexagonal,5.4090,5.4090,8.6000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8103008,23.54,26.34,28.90,31.32,35.62,39.46,43.16,44.86,46.46,48.16,31,26.34,352.717,Hexagonal,8.7199,8.7199,5.3564
8103058,20.52,22.00,23.32,30.26,35.94,39.42,41.74,42.54,43.28,44.88,26,42.54,174.295,Hexagonal,4.9935,4.9935,8.0713
8103181,8.30,9.54,12.56,14.14,15.16,16.42,16.66,17.24,19.16,20.24,32,9.54,2452.274,Hexagonal,12.2812,12.2812,18.7740
8103183,24.42,27.08,33.96,42.98,43.26,47.48,50.04,51.54,54.10,55.86,17,33.96,117.247,Hexagonal,4.2060,4.2060,7.6530


## Insert details on dataset features
Users have to specify which variables will be used as features and which will be response features.

In [6]:
def _ask_yes_no(question):
    """Repeat ``question`` until the user answers 'y' or 'n'; return a bool."""
    while True:
        print('----------------------------------------------------------------------------')
        # Case and surrounding whitespace are ignored so ' Y ' counts as yes.
        answer = input(question).strip().lower()
        if answer == 'y':
            return True
        if answer == 'n':
            return False


def insert_dataset_structure_details(final_dataset):
    """Ask which columns are features / responses and trim the dataset.

    Six yes/no questions are asked on standard input: whether ``Volume``,
    ``Total_n_peaks`` and ``Max_peaks_position`` are used as features, and
    whether ``a``, ``b`` and ``c`` are response variables. Every column the
    user rejects is dropped, and ``Crystal_Structure_Type`` is always dropped.

    Parameters
    ----------
    final_dataset : pandas.DataFrame
        Frame containing the columns named above. It is modified in place;
        pass a copy to keep the original intact.

    Returns
    -------
    tuple
        ``(final_dataset, response_features_name)``: the same frame object
        after the drop, and the list of selected response column names in
        ``a``, ``b``, ``c`` order.

    Raises
    ------
    KeyError
        If a column that has to be dropped is not present. All columns are
        dropped in a single call, so the frame is left untouched in that case.
    """
    print('============================================================================')
    print('Insert dataset structure details')

    use_volume = _ask_yes_no('\n1) Do you want to use volume as a feature in the experiment? [Y|N]: ')
    use_total_n_peaks = _ask_yes_no('\n2) Do you want to use total_n_peaks as a feature in the experiment? [Y|N]: ')
    use_max_peaks = _ask_yes_no('\n3) Do you want to use max_peaks as a feature in the experiment? [Y|N]: ')
    a_is_response = _ask_yes_no('\n4) "a" is a response feature ? [Y|N]: ')
    b_is_response = _ask_yes_no('\n5) "b" is a response feature ? [Y|N]: ')
    c_is_response = _ask_yes_no('\n6) "c" is a response feature ? [Y|N]: ')

    print('============================================================================')

    # Optional feature columns paired with the user's decision to keep them.
    optional_features = (
        ('Total_n_peaks', use_total_n_peaks),
        ('Max_peaks_position', use_max_peaks),
        ('Volume', use_volume),
    )
    # Candidate response columns paired with the user's decision.
    response_candidates = (
        ('a', a_is_response),
        ('b', b_is_response),
        ('c', c_is_response),
    )

    response_features_name = [name for name, keep in response_candidates if keep]

    columns_to_drop = [name for name, keep in optional_features if not keep]
    columns_to_drop.extend(name for name, keep in response_candidates if not keep)
    # The structure-type label is never used as a feature or as a response.
    columns_to_drop.append('Crystal_Structure_Type')

    # One drop call: a missing column raises before anything is removed.
    final_dataset.drop(columns=columns_to_drop, inplace=True)

    return final_dataset, response_features_name

In [7]:
# Work on a copy so the loaded frame stays intact: the helper drops the
# rejected columns from the frame it receives.
final_dataset_for_experiment, response_features_name = insert_dataset_structure_details(
    y_coord_dataset.copy()
)
print(
    'The response features for this crystal structure are: {}'.format(response_features_name),
    'The final dataset with features is:\n',
    sep='\n',
)
final_dataset_for_experiment

Insert dataset structure details
----------------------------------------------------------------------------

1) Do you want to use volume as a feature in the experiment? [Y|N]: n
----------------------------------------------------------------------------

2) Do you want to use total_n_peaks as a feature in the experiment? [Y|N]: n
----------------------------------------------------------------------------

3) Do you want to use max_peaks as a feature in the experiment? [Y|N]: n
----------------------------------------------------------------------------

4) "a" is a response feature ? [Y|N]: y
----------------------------------------------------------------------------

5) "b" is a response feature ? [Y|N]: n
----------------------------------------------------------------------------

6) "c" is a response feature ? [Y|N]: y
The response features for this crystal structure are: ['a', 'c']
The final dataset with features is:



Unnamed: 0_level_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,a,c
ID_Observations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000100,10.96,19.02,22.00,22.42,25.02,29.26,29.58,31.62,33.28,37.16,9.3200,3.9618
1000115,7.92,13.74,19.52,23.94,27.72,32.10,36.06,36.94,38.76,39.60,12.8720,4.9772
1000134,20.38,21.10,26.74,27.50,30.40,36.56,36.74,36.88,37.30,41.44,10.1760,8.4121
1000155,20.64,28.14,33.10,38.40,39.32,41.98,43.98,54.48,56.14,58.20,5.4090,8.6000
1000156,18.92,20.64,28.14,33.10,38.40,39.32,41.98,43.98,46.44,51.58,5.4090,8.6000
...,...,...,...,...,...,...,...,...,...,...,...,...
8103008,23.54,26.34,28.90,31.32,35.62,39.46,43.16,44.86,46.46,48.16,8.7199,5.3564
8103058,20.52,22.00,23.32,30.26,35.94,39.42,41.74,42.54,43.28,44.88,4.9935,8.0713
8103181,8.30,9.54,12.56,14.14,15.16,16.42,16.66,17.24,19.16,20.24,12.2812,18.7740
8103183,24.42,27.08,33.96,42.98,43.26,47.48,50.04,51.54,54.10,55.86,4.2060,7.6530


## Split data for regression task

### *Split final dataset in x (features) and y (response)*

In [8]:
# Features are every column that is not a response variable.
# NOTE: Index.difference returns the labels sorted, so the feature columns come
# out in lexicographic order (x_1, x_10, x_2, ...) rather than dataset order.
x = final_dataset_for_experiment[final_dataset_for_experiment.columns.difference(response_features_name)]
print('Features to be used for training are: ')
x

Features to be used for training are: 


Unnamed: 0_level_0,x_1,x_10,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9
ID_Observations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000100,10.96,37.16,19.02,22.00,22.42,25.02,29.26,29.58,31.62,33.28
1000115,7.92,39.60,13.74,19.52,23.94,27.72,32.10,36.06,36.94,38.76
1000134,20.38,41.44,21.10,26.74,27.50,30.40,36.56,36.74,36.88,37.30
1000155,20.64,58.20,28.14,33.10,38.40,39.32,41.98,43.98,54.48,56.14
1000156,18.92,51.58,20.64,28.14,33.10,38.40,39.32,41.98,43.98,46.44
...,...,...,...,...,...,...,...,...,...,...
8103008,23.54,48.16,26.34,28.90,31.32,35.62,39.46,43.16,44.86,46.46
8103058,20.52,44.88,22.00,23.32,30.26,35.94,39.42,41.74,42.54,43.28
8103181,8.30,20.24,9.54,12.56,14.14,15.16,16.42,16.66,17.24,19.16
8103183,24.42,55.86,27.08,33.96,42.98,43.26,47.48,50.04,51.54,54.10


In [9]:
# Response matrix: only the columns listed in response_features_name,
# in the order they appear in that list.
y = final_dataset_for_experiment[response_features_name]
print('Response features are:')
y

Response features are:


Unnamed: 0_level_0,a,c
ID_Observations,Unnamed: 1_level_1,Unnamed: 2_level_1
1000100,9.3200,3.9618
1000115,12.8720,4.9772
1000134,10.1760,8.4121
1000155,5.4090,8.6000
1000156,5.4090,8.6000
...,...,...
8103008,8.7199,5.3564
8103058,4.9935,8.0713
8103181,12.2812,18.7740
8103183,4.2060,7.6530


### *Split dataset in training and test set*
We use 80% of the original dataset as training dataset and the remaining 20% to test the model.

**`random_state` is set for reproducibility**
**(to be removed)**

In [10]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [11]:
# Print the shapes of the four split frames in a single call; sep='\n'
# places each message on its own line.
print(
    'Training dataset shape',
    '- The x_train dataframe has the following shape: {}'.format(x_train.shape),
    '- The y_train dataframe has the following shape: {}'.format(y_train.shape),
    '------------------------------------------------------------------------',
    'Test dataset shape',
    '- The x_test dataframe has the following shape: {}'.format(x_test.shape),
    '- The y_test dataframe has the following shape: {}'.format(y_test.shape),
    sep='\n',
)

Training dataset shape
- The x_train dataframe has the following shape: (1972, 10)
- The y_train dataframe has the following shape: (1972, 2)
------------------------------------------------------------------------
Test dataset shape
- The x_test dataframe has the following shape: (493, 10)
- The y_test dataframe has the following shape: (493, 2)


## Overview of the data

### *Descriptive Statistics*
Using the `describe()` method we can get a summary of each column (feature) of the dataset:
*   Mean, median (the 50% row) and standard deviation.
*   Min and Max.
*   Count.
*   Quartiles.








In [12]:
# Summary statistics for the training features and responses side by side
# (join combines the two frames on their shared index).
x_train.join(y_train).describe()

Unnamed: 0,x_1,x_10,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,a,c
count,1972.0,1972.0,1972.0,1972.0,1972.0,1972.0,1972.0,1972.0,1972.0,1972.0,1972.0,1972.0
mean,10.752535,31.177404,14.775497,17.416278,20.045649,22.267698,24.171379,26.035213,27.736988,29.489939,12.033345,14.559464
std,5.585539,12.807594,6.728961,7.699028,8.578355,9.456592,10.188328,10.955881,11.504324,12.16415,5.851602,8.438074
min,2.92,9.32,3.14,4.28,5.42,5.84,6.26,6.62,6.92,8.3,2.921,2.5862
25%,6.68,20.97,9.3,11.14,13.16,14.66,15.915,17.355,18.58,19.78,7.398975,7.75885
50%,9.4,28.41,12.86,15.61,18.23,20.12,21.96,23.43,25.25,27.0,10.7807,12.78795
75%,13.34,39.01,19.225,22.82,26.105,28.66,31.06,33.1,34.92,36.99,15.610675,19.398025
max,35.1,88.3,40.82,47.54,56.82,62.98,69.56,76.58,81.42,83.38,39.039,39.4397


In [13]:
# Summary statistics for the test features and responses side by side
# (join combines the two frames on their shared index).
x_test.join(y_test).describe()

Unnamed: 0,x_1,x_10,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,a,c
count,493.0,493.0,493.0,493.0,493.0,493.0,493.0,493.0,493.0,493.0,493.0,493.0
mean,10.98069,31.462637,14.94929,17.462759,20.39217,22.551034,24.45432,26.274077,27.996836,29.685396,12.184837,13.728938
std,5.810416,12.844726,6.766273,7.758621,9.003983,9.729747,10.416384,11.085005,11.642506,12.150079,5.699764,7.760897
min,2.9,11.78,4.88,5.88,6.34,7.62,8.04,9.5,10.48,11.18,3.1919,2.906
25%,6.38,20.92,9.54,11.08,13.04,14.62,16.18,17.2,18.32,19.68,7.7959,7.5566
50%,9.44,29.56,13.32,15.8,18.32,20.36,22.32,24.46,25.74,27.58,10.702,12.24
75%,14.06,40.68,19.54,22.94,26.92,29.84,31.32,33.66,35.96,37.82,16.142,18.479
max,32.36,82.74,39.9,47.54,51.74,57.72,63.62,67.74,70.74,77.68,35.098,37.102


### *Boxplot*

In [31]:
def plot_boxplot(df, boxplot_title):
  """Draw a plotly express box plot of ``df`` and show it.

  Parameters
  ----------
  df : pandas.DataFrame
      Data handed to ``px.box`` as-is.
  boxplot_title : str
      Text used as the figure title.
  """
  # Layout settings gathered in one place: title text and horizontal
  # position, axis titles and the font size.
  layout_options = dict(
      title_text=boxplot_title,
      title_x=0.5,
      xaxis_title='features',
      yaxis_title='',
      font=dict(size=16),
  )

  figure = px.box(df, template='simple_white')
  figure.update_layout(**layout_options)
  figure.show()

In [32]:
# Box plots of features plus responses, one figure for each split.
plot_boxplot(df=x_train.join(y_train), boxplot_title='Training dataset distribution')
plot_boxplot(df=x_test.join(y_test), boxplot_title='Test dataset distribution')