# Data preparation and peaks extraction
The aim of this script is to prepare the crystal structure dataset for the regression task. We read the **x_coord_dataset** and the **y_coord_dataset**_***crystal_structure***, and then the user must enter the details of the dataset configuration. In particular, the user must enter the following information:


*   *n_peaks_position* in **x_coord_dataset** to be used 
*   *threshold* used to keep a peak
*   whether *n_total_peaks* has to be used as a feature
*   whether *max_peak* has to be used as a feature
*   whether missing peaks need to be replaced with zeroes
*   which features must be used as *response*

After that, for each observation in y_coord_dataset, we extract the positions of the first n_peaks peaks that exceed the threshold and use them as features for the regression task.







## Import library

In [166]:
import numpy as np
import pandas as pd
import os
from scipy.signal import find_peaks

In [167]:
import warnings 
# Suppress every warning from here on: the 'ignore' action is installed
# without any category or module restriction.
warnings.filterwarnings('ignore')

## Select path name
User must enter the name of the path in which dataset is stored.

In [168]:
def read_dataset():
  """Interactively load a ';'-separated CSV file located under /content/.

  The user is asked for a folder name (relative to /content/) and then,
  if that folder exists and is not empty, for a file name without the
  '.csv' extension.

  Returns:
      The DataFrame read from the file, or the empty string '' when the
      folder does not exist or contains no entries.
  """
  folder = '/content/' + input('Enter the path name for dataset: ')

  # Guard clause: unknown path -> report it and hand back the placeholder.
  if not os.path.exists(folder):
      print('Error! Invalid path selected.')
      return ''

  print(folder + ' is a valid path.')

  # Guard clause: nothing to read from an empty folder.
  if not os.listdir(folder):
      print("Warning! Empty directory.")
      return ''

  csv_name = input('Enter the file name for dataset: ')
  return pd.read_csv(folder + '/' + csv_name + '.csv', sep = ';')

## Read new dataset

In [169]:
# Load the x-coordinate table by prompting for its folder and file name.
x_coord_dataset = read_dataset()

Enter the path name for dataset: crystal-dataset
/content/crystal-dataset is a valid path.
Enter the file name for dataset: x_coord_dataset


In [170]:
# Summarise the x-coordinate table: shape, column dtypes, then the frame itself.
print(f'x_coord_dataset shape: {x_coord_dataset.shape}')
print(f'\n data types: \n{x_coord_dataset.dtypes}')
print('\nx_coord_dataset content: \n')
x_coord_dataset

x_coord_dataset shape: (1, 4501)

 data types: 
x_1       float64
x_2       float64
x_3       float64
x_4       float64
x_5       float64
           ...   
x_4497    float64
x_4498    float64
x_4499    float64
x_4500    float64
x_4501    float64
Length: 4501, dtype: object

x_coord_dataset content: 



Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,...,x_4492,x_4493,x_4494,x_4495,x_4496,x_4497,x_4498,x_4499,x_4500,x_4501
0,0.0,0.02,0.04,0.06,0.08,0.1,0.12,0.14,0.16,0.18,...,89.82,89.84,89.86,89.88,89.9,89.92,89.94,89.96,89.98,90.0


In [171]:
# Load the y-coordinate (spectrum) table through the same interactive prompts.
y_coord_dataset = read_dataset()

Enter the path name for dataset: crystal-dataset
/content/crystal-dataset is a valid path.
Enter the file name for dataset: y_coord_dataset_hexagonal


In [172]:
# Summarise the y-coordinate table: shape, column dtypes, then the frame itself.
print(f'y_coord_dataset shape: {y_coord_dataset.shape}')
print(f'\n data types: \n{y_coord_dataset.dtypes}')
print('\ny_coord_dataset content: \n')
y_coord_dataset

y_coord_dataset shape: (2543, 4510)

 data types: 
y_1                       float64
y_2                       float64
y_3                       float64
y_4                       float64
y_5                       float64
                           ...   
alpha                     float64
beta                      float64
gamma                     float64
Volume                    float64
Crystal_Structure_Type     object
Length: 4510, dtype: object

y_coord_dataset content: 



Unnamed: 0,y_1,y_2,y_3,y_4,y_5,y_6,y_7,y_8,y_9,y_10,...,y_4501,ID_Observations,a,b,c,alpha,beta,gamma,Volume,Crystal_Structure_Type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,1000100,9.3200,9.3200,3.9618,90.0,90.0,120.0,298.027,Hexagonal
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001,1000115,12.8720,12.8720,4.9772,90.0,90.0,120.0,714.180,Hexagonal
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.183,1000134,10.1760,10.1760,8.4121,90.0,90.0,120.0,754.378,Hexagonal
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,1000155,5.4090,5.4090,8.6000,90.0,90.0,120.0,217.903,Hexagonal
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,1000156,5.4090,5.4090,8.6000,90.0,90.0,120.0,217.903,Hexagonal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.473,8103008,8.7199,8.7199,5.3564,90.0,90.0,120.0,352.717,Hexagonal
2539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.037,8103058,4.9935,4.9935,8.0713,90.0,90.0,120.0,174.295,Hexagonal
2540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.133,8103181,12.2812,12.2812,18.7740,90.0,90.0,120.0,2452.274,Hexagonal
2541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017,8103183,4.2060,4.2060,7.6530,90.0,90.0,120.0,117.247,Hexagonal


## Read information on dataset and feature to be used in regression

In [173]:
_PROMPT_SEPARATOR = '----------------------------------------------------------------------------'
_SECTION_SEPARATOR = '============================================================================'


def _parse_peak_count(text):
    """Return `text` as a non-negative int, raising ValueError otherwise."""
    value = int(text)
    if value < 0:
        raise ValueError('the number of peaks cannot be negative')
    return value


def _parse_threshold(text):
    """Return `text` as a finite, non-negative float, raising ValueError otherwise."""
    import math

    value = float(text)
    if not math.isfinite(value) or value < 0:
        raise ValueError('the threshold must be a finite, non-negative number')
    return value


def _ask_value(prompt, parse):
    """Repeat `prompt` until `parse` accepts the answer; return the parsed value."""
    while True:
        print(_PROMPT_SEPARATOR)
        answer = input(prompt).strip()
        try:
            return parse(answer)
        except ValueError:
            # Rejected answer: ask the same question again.
            continue


def _ask_yes_no(prompt):
    """Repeat `prompt` until the answer is 'y' or 'n' (any case); return a bool."""
    while True:
        print(_PROMPT_SEPARATOR)
        answer = input(prompt).strip().lower()
        if answer == 'y':
            return True
        if answer == 'n':
            return False


def insert_dataset_structure_details():
    """Ask the user how the regression dataset must be laid out.

    The questions are asked in this order: number of peaks to keep,
    peak threshold, whether to add the total number of peaks, whether to
    add the position of the maximum peak, and whether "a", "b" and "c"
    are response columns. Every question is repeated until a valid answer
    is given.

    Unlike a plain ``str.isnumeric`` check, the threshold accepts decimal
    values such as ``0.5``; negative and non-finite numbers are rejected.

    Returns:
        A tuple ``(threshold, n_peaks_to_keep, use_total_n_peaks,
        use_max_peaks, a_is_response, b_is_response, c_is_response,
        final_dataset)`` where ``threshold`` is a float,
        ``n_peaks_to_keep`` an int, the five flags are bools and
        ``final_dataset`` is an empty DataFrame whose columns follow the
        chosen layout.
    """
    print(_SECTION_SEPARATOR)
    print('Insert dataset structure details')

    # The prompt texts (including their numbering) are kept verbatim.
    n_peaks_to_keep = _ask_value(
        '\n1) Insert the number of peaks to use as features in the experiment: ',
        _parse_peak_count)
    threshold = _ask_value(
        '\n2) Insert the threshold used to select the peaks in the experiment: ',
        _parse_threshold)
    use_total_n_peaks = _ask_yes_no(
        '\n4) Do you want to use total_n_peaks as a feature in the experiment? [Y|N]: ')
    use_max_peaks = _ask_yes_no(
        '\n3) Do you want to use max_peaks as a feature in the experiment? [Y|N]: ')
    a_is_response = _ask_yes_no('\n6) "a" is a response feature ? [Y|N]: ')
    b_is_response = _ask_yes_no('\n6) "b" is a response feature ? [Y|N]: ')
    c_is_response = _ask_yes_no('\n6) "c" is a response feature ? [Y|N]: ')

    print(_SECTION_SEPARATOR)

    # Column order: identifier, peak positions, optional features,
    # fixed descriptors, then the selected response columns.
    columns = ['ID_Observations']
    columns += [f'x_{i}' for i in range(1, n_peaks_to_keep + 1)]
    if use_total_n_peaks:
        columns.append('Total_n_peaks')
    if use_max_peaks:
        columns.append('Max_peaks_position')
    columns += ['Volume', 'Crystal_Structure_Type']
    if a_is_response:
        columns.append('a')
    if b_is_response:
        columns.append('b')
    if c_is_response:
        columns.append('c')

    final_dataset = pd.DataFrame(columns=columns)

    return threshold, n_peaks_to_keep, use_total_n_peaks, use_max_peaks,\
     a_is_response, b_is_response, c_is_response, final_dataset

In [174]:
# Run the interactive questionnaire and keep every answer in its own name;
# the last element is the empty frame that will receive the extracted rows.
(
    peak_threshold,
    n_peaks_to_keep,
    use_total_n_peaks,
    use_max_peaks,
    a_is_response,
    b_is_response,
    c_is_response,
    final_dataset,
) = insert_dataset_structure_details()

Insert dataset structure details
----------------------------------------------------------------------------

1) Insert the number of peaks to use as features in the experiment: 10
----------------------------------------------------------------------------

2) Insert the threshold used to select the peaks in the experiment: 1
----------------------------------------------------------------------------

4) Do you want to use total_n_peaks as a feature in the experiment? [Y|N]: y
----------------------------------------------------------------------------

3) Do you want to use max_peaks as a feature in the experiment? [Y|N]: y
----------------------------------------------------------------------------

6) "a" is a response feature ? [Y|N]: y
----------------------------------------------------------------------------

6) "b" is a response feature ? [Y|N]: n
----------------------------------------------------------------------------

6) "c" is a response feature ? [Y|N]: y


In [175]:
# Display the column layout chosen above; no rows have been added yet.
print('The final dataset has the following structure: \n')
final_dataset

The final dataset has the following structure: 



Unnamed: 0,ID_Observations,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,Total_n_peaks,Max_peaks_position,Volume,Crystal_Structure_Type,a,c


## Peaks extraction

In [176]:
# For every observation with at least n_peaks_to_keep detected peaks, build a
# one-row frame holding the x coordinates of its first peaks plus the selected
# extra columns, then append all rows to final_dataset in a single concat
# (concatenating inside the loop re-copies the growing frame every iteration).

# One intensity sample per x coordinate, so the spectrum spans as many leading
# columns of y_coord_dataset as x_coord_dataset has columns.
n_spectrum_points = x_coord_dataset.shape[1]
peak_columns = [f'x_{j}' for j in range(1, n_peaks_to_keep + 1)]
extracted_rows = []

for _, obs in y_coord_dataset.iterrows():
    spectrum = obs.iloc[:n_spectrum_points].to_numpy(dtype=float)

    # NOTE(review): find_peaks' `threshold` is the required vertical distance
    # between a peak and its neighbouring samples; selecting peaks by their
    # absolute height would need `height=` instead. Confirm which is intended.
    peaks_position = find_peaks(spectrum, threshold=peak_threshold)[0]
    total_n_peaks = peaks_position.shape[0]

    if total_n_peaks < n_peaks_to_keep:
        continue   # observations with too few peaks are left out

    first_peaks = peaks_position[:n_peaks_to_keep]   # keep only the first n_peaks_to_keep peaks
    # Copy so the assignments below write to an independent frame rather
    # than to a slice of x_coord_dataset.
    x_coord_peaks_obs = x_coord_dataset.iloc[:, first_peaks].copy()
    x_coord_peaks_obs.columns = peak_columns
    x_coord_peaks_obs['ID_Observations'] = obs.ID_Observations
    x_coord_peaks_obs['Volume'] = obs.Volume
    x_coord_peaks_obs['Crystal_Structure_Type'] = obs.Crystal_Structure_Type

    if a_is_response:
        x_coord_peaks_obs['a'] = obs.a

    if b_is_response:
        x_coord_peaks_obs['b'] = obs.b

    if c_is_response:
        x_coord_peaks_obs['c'] = obs.c

    if use_total_n_peaks:
        x_coord_peaks_obs['Total_n_peaks'] = total_n_peaks

    if use_max_peaks:
        # argmax returns the first index of the largest sample. This is the
        # same index as searching for the literal value 1000 whenever the
        # maximum is 1000, and it does not raise when no sample equals 1000.
        idx_max_peak = int(np.argmax(spectrum))
        x_coord_peaks_obs['Max_peaks_position'] = x_coord_dataset.iloc[0, idx_max_peak]

    extracted_rows.append(x_coord_peaks_obs)

final_dataset = pd.concat([final_dataset, *extracted_rows], ignore_index=True)

In [177]:
# Report the size of the assembled dataset, then display it.
print(f'The final dataset has the following shape: {final_dataset.shape}')
print('The final dataset has the following data: \n')
final_dataset

The final dataset has the following shape: (2465, 17)
The final dataset has the following data: 



Unnamed: 0,ID_Observations,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,Total_n_peaks,Max_peaks_position,Volume,Crystal_Structure_Type,a,c
0,1000100,10.96,19.02,22.00,22.42,25.02,29.26,29.58,31.62,33.28,37.16,50,19.02,298.027,Hexagonal,9.3200,3.9618
1,1000115,7.92,13.74,19.52,23.94,27.72,32.10,36.06,36.94,38.76,39.60,25,7.92,714.180,Hexagonal,12.8720,4.9772
2,1000134,20.38,21.10,26.74,27.50,30.40,36.56,36.74,36.88,37.30,41.44,30,27.50,754.378,Hexagonal,10.1760,8.4121
3,1000155,20.64,28.14,33.10,38.40,39.32,41.98,43.98,54.48,56.14,58.20,18,39.32,217.903,Hexagonal,5.4090,8.6000
4,1000156,18.92,20.64,28.14,33.10,38.40,39.32,41.98,43.98,46.44,51.58,21,39.32,217.903,Hexagonal,5.4090,8.6000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,8103008,23.54,26.34,28.90,31.32,35.62,39.46,43.16,44.86,46.46,48.16,31,26.34,352.717,Hexagonal,8.7199,5.3564
2461,8103058,20.52,22.00,23.32,30.26,35.94,39.42,41.74,42.54,43.28,44.88,26,42.54,174.295,Hexagonal,4.9935,8.0713
2462,8103181,8.30,9.54,12.56,14.14,15.16,16.42,16.66,17.24,19.16,20.24,32,9.54,2452.274,Hexagonal,12.2812,18.7740
2463,8103183,24.42,27.08,33.96,42.98,43.26,47.48,50.04,51.54,54.10,55.86,17,33.96,117.247,Hexagonal,4.2060,7.6530


## Save final dataset

In [None]:
def save_dataset(dataset):
    """Interactively write `dataset` to a ';'-separated CSV file under /content/.

    The user is first asked whether to save. On 'y' (any case) the user is
    asked for a folder name relative to /content/ and a file name without
    the '.csv' extension; the file is written without the index and with a
    header row. On any other answer nothing is written and a message says so.

    Args:
        dataset: object exposing ``to_csv`` (a DataFrame in this notebook).

    Returns:
        None. All feedback is printed.
    """
    # The prompt names "the dataset" because the function saves whatever
    # frame it is given, not specifically x_coord_dataset.
    save_data = input('Do you want to save the dataset [y|n]: ').strip().lower()

    if save_data == 'n':
        print('Dataset will NOT be saved!')
        return
    if save_data != 'y':
        # Previously an unrecognised answer ended the function silently.
        print('Unrecognised answer: dataset will NOT be saved!')
        return

    path_dataset = input('Enter path in which to store dataset: ')
    name_dataset = input('Enter dataset name you want to save: ')
    valid_dataset_path = '/content/' + path_dataset

    # isdir rather than exists: a regular file at this path cannot hold the CSV.
    if not os.path.isdir(valid_dataset_path):
        print('Error! Invalid path: ' + valid_dataset_path + ' is not an existing directory.\n'
              + 'Please, enter a valid path to continue.')
        return

    complete_path = valid_dataset_path + '/' + name_dataset + '.csv'
    dataset.to_csv(complete_path, sep=';', index=False, header=True)
    print('Dataset stored in : ', complete_path)

In [None]:
# Offer to write the assembled dataset to a ';'-separated CSV file.
save_dataset(final_dataset)