# Data analysis overview

---


The aim of this Jupyter notebook is to quickly investigate our data using descriptive statistics, as well as a boxplot, a correlation matrix and a pairplot.

## Import libraries

In [None]:
import os

# Data Manipulation
import numpy as np
import pandas as pd
from scipy.stats import reciprocal
import re                                  

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Install a blanket 'ignore' filter so that no warning is shown in the cell
# outputs from here on.
# NOTE(review): this hides every warning category, deprecation notices included.
import warnings 
warnings.filterwarnings('ignore')

## Load the Data


---


The user must enter the path of the directory in which the dataset is stored. The notebook then checks that the directory exists and that it is not empty.

### *Check directory and files*

In [None]:
def read_dataset(path_name=None, file_name=None, base_dir='/content/'):
  """Load the semicolon-separated dataset CSV into a DataFrame.

  The directory and the file name are asked for interactively unless they are
  passed as arguments, so the function can also be used without a prompt.

  Args:
    path_name: Directory holding the dataset, relative to ``base_dir``.
      Prompted for with ``input()`` when ``None``.
    file_name: Name of the CSV file, with or without the ``.csv`` extension.
      Prompted for with ``input()`` when ``None``, and only once the
      directory has been validated.
    base_dir: Root directory that ``path_name`` is resolved against.

  Returns:
    The DataFrame indexed by its ``ID_Observations`` column, or an empty
    string when the directory is invalid, the directory is empty, or the
    file cannot be found (a message is printed in each of these cases).
  """
  dataset = ''
  if path_name is None:
    path_name = input('Enter the path name for dataset: ')
  # Drop leading slashes so the entry is always resolved under base_dir,
  # exactly as plain concatenation with the prefix would do.
  path_name = os.path.join(base_dir, path_name.lstrip('/'))

  # isdir (rather than exists) also rejects a path that points at a regular
  # file, for which os.listdir would raise.
  if not os.path.isdir(path_name):
    print('Error! Invalid path selected.')
    return dataset
  print(path_name + ' is a valid path.')

  if not os.listdir(path_name):
    print("Warning! Empty directory.")
    return dataset

  if file_name is None:
    file_name = input('Enter the file name for dataset: ')

  # The extension is appended first (historical behaviour); a name that was
  # typed with its extension is accepted as a fallback.
  file_path = os.path.join(path_name, file_name + '.csv')
  if not os.path.isfile(file_path) and file_name.lower().endswith('.csv'):
    file_path = os.path.join(path_name, file_name)

  # Report a missing file the same way as the other failures instead of
  # letting the reader raise.
  if not os.path.isfile(file_path):
    print('Error! File not found: ' + file_path)
    return dataset

  return pd.read_csv(file_path, sep=';', index_col='ID_Observations')

### *Load the dataset*

In [None]:
# Ask for the dataset location and load it; read_dataset() hands back an
# empty string instead of a DataFrame when its directory checks fail.
y_coord_dataset = read_dataset()

In [None]:
# Report the dataset dimensions and the dtype of every column, then leave the
# frame as the last expression so the notebook renders it.
print(f'y_coord_dataset shape: {y_coord_dataset.shape}')
print(f'\n data types: \n{y_coord_dataset.dtypes}')
print('\ny_coord_dataset content: \n')
y_coord_dataset

## Overview of the data

### *Descriptive Statistics*
Using the `describe()` method, we can get summary information about the dataset and a picture of each column (feature):
*   Mean, median, mode, standard deviation.
*   Min and Max.
*   Count.
*   Quartiles.

In [None]:
# Summary statistics of the dataset columns; as the last expression of the
# cell, the resulting table is rendered as the cell output.
y_coord_dataset.describe()

### *Boxplot*

In [None]:
def plot_boxplot(df, boxplot_title):
  """Show a Plotly box plot of ``df`` with framed axes and a bold title.

  Args:
    df: Data handed straight to ``px.box``.
    boxplot_title: Title text; rendered in bold and centred above the plot.

  Returns:
    None. The figure is displayed with ``fig.show()``.
  """
  box_fig = px.box(df, template='simple_white')

  # Both axes share the same black, mirrored frame; only x carries a label.
  frame_style = dict(showline=True, linewidth=2, linecolor='black', mirror=True)
  box_fig.update_xaxes(title_text='Variables', **frame_style)
  box_fig.update_yaxes(**frame_style)

  bold_title = '<b>' + boxplot_title + '</b>'
  box_fig.update_layout(title_text=bold_title, title_x=0.5, font={'size': 16})
  box_fig.show()

In [None]:
# Box plot of every column whose name starts with 'x_', together with the
# peak columns and the a/b/c columns.
plot_boxplot(
    y_coord_dataset[
        [col for col in y_coord_dataset.columns if re.search(r'^x_', col)]
        + ['Total_n_peaks', 'Max_peaks_position', 'a', 'b', 'c']
    ],
    'Boxplot of dataset features distribution',
)

In [None]:
# Box plot of the 'Volume' column on its own figure.
plot_boxplot(y_coord_dataset['Volume'], 'Boxplot of Volume distribution')


### *Correlation matrix*

In [None]:
def plot_correlation_matrix(df, corr_matrix_title):
  """Show an annotated Plotly heatmap of the pairwise column correlations.

  Only numeric and boolean columns take part in the computation; any other
  column is left out of the matrix.

  Args:
    df: pandas DataFrame whose columns are correlated with
      ``DataFrame.corr()`` (pandas' default correlation method).
    corr_matrix_title: Title text; rendered in bold and centred.

  Returns:
    None. The figure is displayed with ``fig.show()``.
  """
  # pandas >= 2.0 raises on non-numeric columns instead of silently dropping
  # them, so the numeric subset is selected explicitly to behave the same on
  # every pandas version.
  df_corr = df.select_dtypes(include=['number', 'bool']).corr()
  corr_values = df_corr.values

  fig = go.Figure()
  fig.add_trace(
      go.Heatmap(
          x=df_corr.columns,
          y=df_corr.index,
          z=corr_values,
          # Each cell is annotated with its coefficient, two decimals.
          text=corr_values,
          texttemplate='%{text:.2f}',
          colorscale='Viridis'
      )
  )

  fig.update_layout(title_text='<b>'+corr_matrix_title+'</b>',
                    xaxis_title='',
                    yaxis_title='',
                    title_x=0.5,
                    font=dict(
                        size=16
                        )
                    )
  fig.show()

In [None]:
# Correlation heatmap computed over the loaded dataset.
plot_correlation_matrix(y_coord_dataset, 'Correlation matrix of dataset')

### *Pairplot*

In [None]:
# Scatter-plot grid: the 'x_' columns plus Volume and the peak columns on the
# x axis, against a, b and c on the y axis.
print('Pairplot of dataset')
sns.pairplot(
    y_coord_dataset,
    x_vars=[col for col in y_coord_dataset.columns if re.search(r'^x_', col)]
    + ['Volume', 'Total_n_peaks', 'Max_peaks_position'],
    y_vars=['a', 'b', 'c'],
)