# Data preparation and peaks extraction


---


The aim of this script is to prepare the crystal structure dataset for the regression task. We read the **x_coord_dataset** and the **y_coord_dataset**_***crystal_structure***; after that, the user must enter the details of the dataset configuration. In particular, the user must enter the following information:


*   *n_peaks_position*: the number of peak positions in **x_coord_dataset** to be used as features
*   *threshold*: the threshold used to keep a peak

After that, for each observation in y_coord_dataset, we extract the positions of the first n_peaks peaks that are bigger than the threshold, and we use them as features for the regression task.







## Import libraries

In [None]:
import os
from google.colab import files

# Data Manipulation
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sns

# Peak Extraction
from scipy.signal import find_peaks

In [None]:
import warnings 
# Install a global "ignore" filter: no warning is shown from this point on.
warnings.filterwarnings('ignore')

## Load the data


---


The user must enter the path of the directory in which the dataset is stored. We then check whether the directory exists and whether it is empty.

### *Check directory and files*

In [None]:
def read_dataset():
    """Interactively load a ``;``-separated CSV file stored under ``/content``.

    The user is first asked for a directory name (taken relative to
    ``/content/``) and, when that directory exists and is not empty, for the
    name of the CSV file without its ``.csv`` extension.

    Returns:
        The DataFrame read from the file, or an empty string when the
        directory does not exist or contains no entries.
    """
    directory = '/content/' + input('Enter the path name for dataset: ')

    # Guard 1: the directory must exist.
    if not os.path.exists(directory):
        print('Error! Invalid path selected.')
        return ''

    print(f'{directory} is a valid path.')

    # Guard 2: there must be something in it to load.
    if not os.listdir(directory):
        print("Warning! Empty directory.")
        return ''

    csv_name = input('Enter the file name for dataset: ')
    csv_path = directory + '/' + csv_name + '.csv'
    return pd.read_csv(csv_path, sep=';')

### *Load the dataset*

In [None]:
# read_dataset() prompts for a directory under /content and a CSV file name;
# it yields an empty string instead of a DataFrame when the directory is
# missing or empty.
x_coord_dataset = read_dataset()

In [None]:
# Quick sanity check of what was loaded: dimensions, per-column dtypes and
# finally the frame itself (rendered by the notebook as the cell output).
print(f'x_coord_dataset shape: {x_coord_dataset.shape}')
print(f'\n data types: \n{x_coord_dataset.dtypes}')
print('\nx_coord_dataset content: \n')
x_coord_dataset

In [None]:
# Second interactive load, bound to y_coord_dataset. read_dataset() yields an
# empty string instead of a DataFrame when the directory is missing or empty.
y_coord_dataset = read_dataset()

In [None]:
# Quick sanity check of what was loaded: dimensions, per-column dtypes and
# finally the frame itself (rendered by the notebook as the cell output).
print(f'y_coord_dataset shape: {y_coord_dataset.shape}')
print(f'\n data types: \n{y_coord_dataset.dtypes}')
print('\ny_coord_dataset content: \n')
y_coord_dataset

## Read information on the dataset and the features to be used in regression

In [None]:
def insert_dataset_structure_details():
    """Prompt for the experiment configuration and build the empty output frame.

    The user is asked, repeatedly until a valid answer is given, for:

    1. the number of peaks to use as features: a strictly positive integer;
    2. the threshold used to select the peaks: any finite real number, so
       decimal values such as ``0.5`` are accepted.

    Returns:
        A ``(threshold, n_peaks_to_keep, final_dataset)`` tuple, where
        ``threshold`` is a float, ``n_peaks_to_keep`` is an int and
        ``final_dataset`` is a zero-row DataFrame with the columns
        ``ID_Observations``, ``x_1`` ... ``x_<n_peaks_to_keep>``,
        ``Total_n_peaks``, ``Max_peaks_position``, ``Volume``,
        ``Crystal_Structure_Type``, ``a``, ``b`` and ``c`` (in that order).
    """
    print('============================================================================')
    print('Insert dataset structure details')

    while True:
        print('----------------------------------------------------------------------------')
        answer = input('\n1) Insert the number of peaks to use as features in the experiment: ')
        # Let int() decide what is a valid integer: str.isnumeric() also
        # accepts characters (e.g. superscript digits) that int() rejects.
        try:
            n_peaks_to_keep = int(answer)
        except ValueError:
            continue
        # Zero or negative counts would produce a dataset without features.
        if n_peaks_to_keep > 0:
            break

    while True:
        print('----------------------------------------------------------------------------')
        answer = input('\n2) Insert the threshold used to select the peaks in the experiment: ')
        # str.isnumeric() is False for any string containing '.' or '-', so
        # it can never validate a decimal threshold; parse with float().
        try:
            threshold = float(answer)
        except ValueError:
            continue
        # float() also parses 'nan' and 'inf', which are useless as thresholds.
        if np.isfinite(threshold):
            break
    print('============================================================================')

    # Build the zero-row frame from the full column list in one step rather
    # than assigning a scalar to not-yet-existing columns one by one.
    feature_columns = [f'x_{i}' for i in range(1, n_peaks_to_keep + 1)]
    columns = (
        ['ID_Observations']
        + feature_columns
        + [
            'Total_n_peaks',
            'Max_peaks_position',
            'Volume',
            'Crystal_Structure_Type',
            'a',
            'b',
            'c',
        ]
    )
    final_dataset = pd.DataFrame(columns=columns)

    return threshold, n_peaks_to_keep, final_dataset

In [None]:
# Interactive step: asks for the number of peaks and the threshold, and
# returns them together with the empty frame that will receive the features.
peak_threshold, n_peaks_to_keep, final_dataset = insert_dataset_structure_details()

In [None]:
# Display the frame before any observation is added, to check its column layout.
print('The final dataset has the following structure: \n')
final_dataset

## Peaks extraction

In [None]:
# Number of leading columns of every y_coord_dataset row that are read as the
# spectrum; the metadata columns are read by name below.
N_SPECTRUM_POINTS = 4501

# Metadata copied unchanged from each observation into its feature row(s).
METADATA_COLUMNS = ['ID_Observations', 'Volume', 'Crystal_Structure_Type', 'a', 'b', 'c']

feature_columns = [f'x_{j}' for j in range(1, n_peaks_to_keep + 1)]
extracted_frames = []

for _, obs in y_coord_dataset.iterrows():
    spectrum = obs.iloc[:N_SPECTRUM_POINTS].to_numpy(dtype=float)

    # Keep the peaks whose value is at least peak_threshold. This uses
    # `height`; find_peaks' own `threshold` argument would instead filter on
    # the vertical distance between a peak and its two neighbouring samples.
    peaks_position = find_peaks(spectrum, height=peak_threshold)[0]
    total_n_peaks = peaks_position.shape[0]

    # Observations with fewer than n_peaks_to_keep peaks are left out.
    if total_n_peaks < n_peaks_to_keep:
        continue

    # Keep only the first n_peaks_to_keep peaks and look up their x coordinates.
    first_peaks = peaks_position[:n_peaks_to_keep]
    x_coord_peaks_obs = x_coord_dataset.iloc[:, first_peaks].copy()
    x_coord_peaks_obs.columns = feature_columns

    for column in METADATA_COLUMNS:
        x_coord_peaks_obs[column] = obs[column]
    x_coord_peaks_obs['Total_n_peaks'] = total_n_peaks

    # Position of the highest point of the spectrum. argmax returns the first
    # maximum, so it matches a search for the value 1000 whenever 1000 is the
    # maximum, and does not fail when no sample equals exactly 1000.
    # NOTE(review): row 0 of x_coord_dataset is assumed to hold the x
    # coordinate of every spectrum point - confirm.
    idx_max_peak = int(np.argmax(spectrum))
    x_coord_peaks_obs['Max_peaks_position'] = x_coord_dataset.iloc[0, idx_max_peak]

    extracted_frames.append(x_coord_peaks_obs)

# A single concat instead of one per observation avoids re-copying the
# growing frame on every iteration.
final_dataset = pd.concat([final_dataset, *extracted_frames], ignore_index=True)
final_dataset = final_dataset.set_index('ID_Observations')

In [None]:
# Report the size of the extracted dataset, then let the notebook render it.
print(f'The final dataset has the following shape: {final_dataset.shape}')
print('The final dataset has the following data: \n')
final_dataset

## Save final dataset

In [None]:
def save_dataset(dataset):
    """Interactively save ``dataset`` as a ``;``-separated CSV under ``/content``.

    The user is asked whether to save (the question is repeated until the
    answer is ``y`` or ``n``, case-insensitively). On ``y`` the user is asked
    for a directory, taken relative to ``/content/``, and for a file name
    without the ``.csv`` extension. Nothing is written when the directory
    does not exist; an error message is printed instead.

    Args:
        dataset: DataFrame to write.

    Returns:
        None.
    """
    # Repeat the question until the answer is one of the two offered choices,
    # so that a typo is not silently treated as "do nothing".
    while True:
        save_data = input('Do you want to save final_dataset [y|n]: ').strip().lower()
        if save_data in ('y', 'n'):
            break

    if save_data == 'n':
        print('Dataset will NOT be saved!')
        return

    path_dataset = input('Enter path in which to store dataset: ')
    name_dataset = input('Enter dataset name you want to save: ')
    valid_dataset_path = '/content/' + path_dataset

    # The target must be an existing directory; an existing regular file
    # would make the write below fail.
    if not os.path.isdir(valid_dataset_path):
        print('Error! Invalid path selected: ' + valid_dataset_path
              + ' is not an existing directory.\n'
              + 'Please, enter a valid path to continue.')
        return

    complete_path = valid_dataset_path + '/' + name_dataset + '.csv'
    # index=False: the DataFrame index is not written to the file.
    # NOTE(review): if an identifier column was moved into the index before
    # calling this function it will be absent from the CSV - confirm intended.
    dataset.to_csv(complete_path, sep=';', index=False, header=True)
    print('Dataset stored in : ', complete_path)

In [None]:
# Interactive step: asks whether and where to store the extracted dataset.
save_dataset(final_dataset)