# Merging raw data from crystal dataset


---


The aim of this script is to merge the huge raw dataset into a more convenient form. In particular, for each crystal structure (there are 7 different crystal structures), the related dataset is loaded from a large text file. As output of this script we want a dataframe (X) which collects all the spectrum positions (from 0 to 90, in steps of 0.02) and 7 different dataframes (one for each crystal structure), which collect the intensities related to each X position in the spectrum. Moreover, each Y dataset also stores the cell parameter sizes, which will be used as target features in future tasks. Lastly, the datasets are saved.

## Import library

In [None]:
import os
from google.colab import files

# Data Manipulation
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sns

In [None]:
import warnings 
# Ignore every warning category from here on. Note that this also hides the
# deprecation and performance warnings pandas may emit in the cells below.
warnings.filterwarnings('ignore')

## Create dataframes to store dataset 
---
For each dataset, the script produces two different dataframes as output, as follows:
*   ***x_coord_dataset***, a dataframe of size 1-by-4501 holding the x-coordinates of the spectrum. In particular, we store the points of the interval [0,90] in steps of 0.02;
*   ***y_coord_dataset***, a dataframe of size num_obs-by-4502 storing the recorded intensity related to each point stored in x_coord_dataset, together with the observation ID.





### *Create dataframe to store the x-coordinates of the spectrum for each observation*

In [None]:
# Build the shared x-axis of the spectrum as a single row of 4501 evenly spaced
# positions covering [0, 90] (step 0.02).
# np.linspace is used instead of np.arange(0, 90.02, 0.02): with a non-integer
# step np.arange can return one element more or fewer than expected because of
# floating-point rounding, whereas linspace fixes the number of points and
# includes both end points exactly.
x_coord_dataset = pd.DataFrame(np.linspace(0, 90, 4501)).T
x_coord_dataset.columns = [f'x_{i}' for i in range(1, x_coord_dataset.shape[1] + 1)]   # Rename columns to x_1 ... x_4501
print('The x_coord_dataset has the following shape: ', x_coord_dataset.shape)          # Check the shape of x_coord_dataset
x_coord_dataset

### *Create a dataframe to store the y-coordinates of the spectrum for each observation*

In [None]:
# Empty frame for the recorded intensities: one 'y_<k>' column per spectrum
# position (k = 1 .. 4501), followed by a column for the observation identifier.
intensity_columns = [f'y_{position}' for position in range(1, 4502)]
y_coord_dataset = pd.DataFrame(columns=intensity_columns)
y_coord_dataset['ID_Observations'] = ''
y_coord_dataset

### *Create dataframe to store additional information*

In [None]:
# Empty frame for the per-observation metadata: the identifier, the cell
# parameters (a, b, c, alpha, beta, gamma), the volume and the crystal
# structure type.
additional_information_columns = [
    'ID_Observations',
    'a', 'b', 'c',
    'alpha', 'beta', 'gamma',
    'Volume',
    'Crystal_Structure_Type',
]
additional_information_dataset = pd.DataFrame(columns=additional_information_columns)
additional_information_dataset

## Load the data


---


The user must enter the path of the folder in which the dataset is stored. After that, we check whether the directory exists and whether it is empty.

### *Check directory and files*

In [None]:
# Ask the user to upload files through google.colab's files.upload(); whatever
# it returns is kept in UploadedFiles.
UploadedFiles = files.upload()

In [None]:
# Ask for the dataset folder (relative to /content/) and list its entries.
# dir_list stays an empty list when the path is invalid or the folder is empty,
# so a later loop over it simply does nothing.
path_name = input('Enter the path name for dataset: ')
path_name = '/content/' + path_name
dir_list = []

# isdir (rather than exists) also rejects a path that points to a regular file,
# for which os.listdir would raise NotADirectoryError.
if not os.path.isdir(path_name):
  print('Error! Invalid path selected.')
else:
  print(path_name + ' is a valid path.')

  # List the directory only once. os.listdir returns the entries in arbitrary
  # order, so the names are sorted to make the processing order reproducible.
  dir_list = sorted(os.listdir(path_name))
  if not dir_list:
    print("Warning! Empty directory.")

In [None]:
# Show the directory entries collected in the previous cell.
dir_list

### *Load the dataset in chunks*

In [None]:
# Read every file listed in dir_list in chunks of `chunksize` rows and split
# each row into (a) the metadata columns and (b) the intensity values of the
# spectrum.
chunksize = 150

# Per-chunk results are collected in lists and concatenated once after the
# loop: calling pd.concat inside the loop copies everything accumulated so far
# on every iteration, which is quadratic in the number of chunks. The (empty)
# frames created earlier go first so that their column layout is preserved.
additional_information_parts = [additional_information_dataset]
y_coord_parts = [y_coord_dataset]

for k, file in enumerate(dir_list, start=1):
  print('============================================================================')
  print('WORKING ON FILE NUMBER: {:d}'.format(k))

  with pd.read_csv(os.path.join(path_name, file), sep='|', header=None, index_col=None,
                   names=["Spectrum_Data", "Cell_Parameters", "Volume",
                          "ID_Observations", "Crystal_Structure_Type"],
                   chunksize=chunksize, usecols=[0, 1, 2, 3, 4]) as reader:
    for i, chunk in enumerate(reader, start=1):
      print('----------------------------------------------------------------------------')
      print('Working of chunk number: {:d}'.format(i))
      print('----------------------------------------------------------------------------')
      print("DataType: {} \nShape: {} \nMemory: {}".format(type(chunk),
                                                          chunk.shape,
                                                          chunk.memory_usage().sum()))

      # Retrieve the fields of additional_information_dataset: Cell_Parameters
      # holds six space-separated values, one per cell parameter column.
      chunk[['a', 'b', 'c', 'alpha', 'beta', 'gamma']] = chunk.Cell_Parameters.str.split(' ', expand=True)
      chunk.pop('Cell_Parameters')
      additional_information_parts.append(chunk[['ID_Observations',
                                                 'a', 'b', 'c',
                                                 'alpha', 'beta',
                                                 'gamma', 'Volume',
                                                 'Crystal_Structure_Type']])

      # Retrieve and split the spectrum field: Spectrum_Data is a sequence of
      # space-separated 'x;y' tokens, one token per spectrum position.
      xy_chunk = chunk['Spectrum_Data'].str.split(' ', expand=True)

      # Keep only the y part of every token. All columns are built first and
      # the frame is created in one go, instead of inserting thousands of
      # columns one at a time (which fragments the frame). A token without a
      # y part yields NaN.
      y_columns = {
          f'y_{j}': xy_chunk[col].str.split(';').str[1]
          for j, col in enumerate(xy_chunk.columns, start=1)
      }
      y_columns['ID_Observations'] = chunk['ID_Observations']
      y_coord_parts.append(pd.DataFrame(y_columns))

additional_information_dataset = pd.concat(additional_information_parts, ignore_index=True)
y_coord_dataset = pd.concat(y_coord_parts, ignore_index=True)

In [None]:
# Use the observation identifier as the row index of both frames.
y_coord_dataset = y_coord_dataset.set_index(keys='ID_Observations')
additional_information_dataset = additional_information_dataset.set_index(keys='ID_Observations')

In [None]:
# Report the dimensions and the per-column dtypes of x_coord_dataset.
x_shape, x_dtypes = x_coord_dataset.shape, x_coord_dataset.dtypes
print('x_coord_dataset shape: {}'.format(x_shape))
print('x_coord_dataset data types: \n{}'.format(x_dtypes))

In [None]:
# Report the dimensions and the per-column dtypes of y_coord_dataset.
y_shape, y_dtypes = y_coord_dataset.shape, y_coord_dataset.dtypes
print('y_coord_dataset shape: {}'.format(y_shape))
print('y_coord_dataset data types: \n{}'.format(y_dtypes))

In [None]:
# Report the dimensions and the per-column dtypes of additional_information_dataset.
info_shape, info_dtypes = additional_information_dataset.shape, additional_information_dataset.dtypes
print('additional_information_dataset shape: {}'.format(info_shape))
print('additional_information_dataset data types: \n{}'.format(info_dtypes))

## Basic checks

### *Casting of numeric fields from object to float*

In [None]:
# Convert y_coord_dataset to numeric values by running pd.to_numeric on each
# column, then show the resulting dtypes.
y_coord_dataset = y_coord_dataset.apply(pd.to_numeric, axis=0)
y_dtypes_after_cast = y_coord_dataset.dtypes
print('y_coord_dataset data types after casting operation: \n{}'.format(y_dtypes_after_cast))

In [None]:
# Cast the metadata columns that are still stored as strings (object dtype) to
# numbers, leaving the Crystal_Structure_Type label column untouched.
index_name = additional_information_dataset.select_dtypes(include='object').columns
# errors='ignore': Crystal_Structure_Type is part of this selection only when it
# was read as text; without the flag Index.drop raises KeyError for a missing label.
index_name = index_name.drop('Crystal_Structure_Type', errors='ignore')
# Nothing to convert when no object column is left.
if len(index_name) > 0:
  additional_information_dataset[index_name] = additional_information_dataset[index_name].apply(pd.to_numeric)
print('additional_information_dataset data types after casting operation: \n{}'.format(additional_information_dataset.dtypes))

### *Merging additional_information_dataset and y_coord_dataset on ID_Observations*

In [None]:
# Append the metadata columns to the intensities by joining the two frames on
# their index; 'left' (join's default) keeps every row of y_coord_dataset.
# NOTE(review): an index-on-index join multiplies rows whose index value is
# repeated — confirm that the ID_Observations values are unique.
y_coord_dataset = y_coord_dataset.join(additional_information_dataset, how='left')
joined_shape, joined_dtypes = y_coord_dataset.shape, y_coord_dataset.dtypes
print('y_coord_dataset shape: {}'.format(joined_shape))
print('y_coord_dataset data types: \n{}'.format(joined_dtypes))

### *Check for* ***null*** *fields*

In [None]:
# Print, per column, whether at least one value is missing, then draw the
# missing-value mask of the whole frame as a heatmap.
null_mask = y_coord_dataset.isnull()
print('Null field in y_coord_dataset: \n{}'.format(null_mask.any()))
sns.heatmap(null_mask, cbar=False)

### *Check for NaN rows and drop them*

In [None]:
# Drop every row containing at least one missing value and report how many
# rows were removed.
n_samples = y_coord_dataset.shape[0]
y_coord_dataset = y_coord_dataset.dropna()
n_dropped = n_samples - y_coord_dataset.shape[0]
# Guard the percentage: an empty dataset (n_samples == 0) would otherwise raise
# ZeroDivisionError.
dropped_pct = n_dropped / n_samples * 100 if n_samples else 0.0
print('Number of NaN rows dropped: {}/{} ({:.2f}%)'.format(n_dropped, n_samples, dropped_pct))

### *Check for duplicate rows and drop them, if any*

In [None]:
# Remove rows that repeat an earlier row (column values only; the index is not
# part of the comparison) and report how many were removed.
n_samples = y_coord_dataset.shape[0]

# duplicated() marks every repetition after the first occurrence, i.e. exactly
# the rows drop_duplicates() would remove. The mask is computed once and reused.
duplicate_mask = y_coord_dataset.duplicated()
duplicated_rows = y_coord_dataset[duplicate_mask]
y_coord_dataset = y_coord_dataset[~duplicate_mask]

# The count must be taken AFTER the rows are removed: computing it before the
# drop compares the frame with itself and always reports 0.
duplicates = n_samples - y_coord_dataset.shape[0]
# Guard the percentage against an empty dataset (division by zero).
duplicates_pct = duplicates / n_samples * 100 if n_samples else 0.0
print('Number of canceled duplicates: {}/{} ({:.2f}%)'.format(duplicates, n_samples, duplicates_pct))

## Show dataset content

In [None]:
# Print a caption, then let the notebook render x_coord_dataset as the cell output.
print('x_coord_dataset content: ')
x_coord_dataset

In [None]:
# Print a caption, then let the notebook render y_coord_dataset as the cell output.
print('y_coord_dataset content: ')
y_coord_dataset

## Saving the new dataset

In [None]:
def save_dataset(dataset, base_dir='/content/'):
  """Interactively save *dataset* as a ';'-separated CSV file.

  The user is first asked whether the dataset should be saved. On 'y'
  (case-insensitive) the target folder, relative to *base_dir*, and the file
  name without extension are requested, and the file is written with index
  and header. On 'n', on any other answer, on a non-existing folder or on an
  empty file name a message is printed and nothing is written.

  Args:
    dataset: object exposing ``to_csv`` (a pandas DataFrame) to store.
    base_dir: folder the entered path is resolved against; defaults to
      '/content/'.

  Returns:
    None.
  """
  # The prompt is generic because the function is used for several datasets.
  save_data = input('Do you want to save the dataset [y|n]: ').strip().lower()
  if save_data == 'n':
    print('Dataset will NOT be saved!')
    return
  if save_data != 'y':
    # Previously any other answer was ignored without feedback.
    print("Error! Invalid answer, expected 'y' or 'n'. Dataset will NOT be saved!")
    return

  path_dataset = input('Enter path in which to store dataset: ')
  name_dataset = input('Enter dataset name you want to save: ').strip()

  # Leading slashes are stripped because os.path.join discards base_dir when
  # the second component is an absolute path.
  valid_dataset_path = os.path.join(base_dir, path_dataset.lstrip('/'))

  if not os.path.isdir(valid_dataset_path):
    print('Error! Directory does not exist: ' + valid_dataset_path + '\n'
          + 'Please, enter a valid path to continue.')
    return
  if not name_dataset:
    print('Error! Empty dataset name.\n'
          + 'Please, enter a valid dataset name to continue.')
    return

  complete_path = os.path.join(valid_dataset_path, name_dataset + '.csv')
  dataset.to_csv(complete_path, sep=';', index=True, header=True)
  print('Dataset stored in : ', complete_path)

### *Saving x_coord_dataset*

In [None]:
# Run the interactive save dialog for the x-coordinates frame.
save_dataset(x_coord_dataset)

### *Saving y_coord_dataset*

In [None]:
# Run the interactive save dialog for the intensities frame.
save_dataset(y_coord_dataset)