In [None]:
# default_exp dataset

# dataset

> Module allowing the creation of the dataset and data statistics. 

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
import pandas as pd

To illustrate the functions defined in this module, we will use the Ishigami function, defined as:

$$\begin{array}{ccccc}
f & : & [-\pi,\pi]^3 & \to & \mathbb{R} \\
 & & (x_1,x_2,x_3) & \mapsto & \sin(x_1) + a\times\sin(x_2) + b\times x_3^4\times\sin(x_1) \\
\end{array}$$

where $a = 7$ and $b = 0.1$.

We create a database for this function using a uniform distribution.

In [None]:
import os

import numpy as np

# Coefficients of the test function given in the markdown above.
# NOTE(review): the textbook Ishigami function uses sin(x2)**2 for its second
# term, whereas this notebook's formula uses sin(x2) -- confirm this is intended.
a_, b_ = 7, 0.1

# Bounds of the input domain [-pi, pi], expressed in hundredths so that the
# samples can be drawn as integers and scaled back by 1/100.
min_ = -np.pi*100
max_ = np.pi*100

# Fixed seed and a local generator: the CSV (and every statistic derived from
# it further down) is identical after each "Restart & Run All", and the global
# NumPy random state is left untouched.
seed_ = 0
rng_ = np.random.RandomState(seed_)

# The bounds are truncated to integers explicitly (randint only draws
# integers); samples therefore lie on a 0.01 grid inside [-3.14, 3.13].
data = rng_.randint(low=int(min_), high=int(max_), size=(100,3))/100
output = np.sin(data[:,0]) + a_*np.sin(data[:,1]) + b_*np.power(data[:,2],4)*np.sin(data[:,0])
value_ = {'x1':data[:,0], 'x2':data[:,1], 'x3':data[:,2],'y':output}
database = pd.DataFrame(value_)

# Make sure the target folder exists so the cell also runs on a fresh checkout.
os.makedirs('data', exist_ok=True)
database.to_csv('data/ishigami.csv',index=False)

In [None]:
#export
def open_data(file,
              info = False):
    """ Read a CSV file into a pandas DataFrame
        Arguments :
            :file: CSV to read; passed unchanged to `pandas.read_csv`
            :info: default = False : when truthy, print the first five rows and the shape of the created DataFrame
        Output :
            A pandas DataFrame with all the data from the CSV file
    """
    df = pd.read_csv(file)
    # Truthiness test instead of `is True`, so that values such as `1` or
    # `numpy.bool_(True)` also enable the summary.
    if info:
        print('Five first rows of the generated DataFrame : \n {}'.format(df.head()))
        print('\nDataFrame shape : {}\n'.format(df.shape))
    return df

This helper loads a CSV file and returns it as a DataFrame. Setting `info` to *True* additionally prints the first five rows and the shape of the dataset.

In [None]:
# Reload the CSV written above; info=True also prints the first rows and the shape.
database = open_data('data/ishigami.csv',info=True)

Five first rows of the generated DataFrame : 
      x1    x2    x3         y
0 -3.06  1.66 -0.46  6.890301
1  2.19 -2.24  0.73 -4.652745
2 -1.22  1.19  1.95  4.201639
3 -2.08  1.69 -1.11  5.944645
4  1.89 -2.14  2.72  0.250308

DataFrame shape : (100, 4)



In [None]:
#export
def create_train_test_set(dataframe,
                          train_frac,
                          test_frac,
                          target,
                          random_state = 123):
    """ Randomly split a DataFrame into training and test features / labels
        Arguments :
            :dataframe: pandas DataFrame containing the data to split (it is not modified)
            :train_frac: float, fraction of the rows of `dataframe` sampled for training
            :test_frac: float, fraction of the rows NOT selected for training that is sampled for the test set (1. keeps all of them)
            :target: string, name of the target column
            :random_state: default = 123 : seed used for both random samplings
        Outputs : 
            :train_features: pandas DataFrame of the training points selected randomly, without the target column
            :train_labels: pandas Series, target values of the training points
            :test_features: pandas DataFrame of the test points selected randomly, without the target column
            :test_labels: pandas Series, target values of the test points
    """
    train_dataset = dataframe.sample(frac = train_frac, random_state = random_state)
    # Rows left once the training rows are removed; the test set is drawn from
    # these only. Rows are removed by index label, so `dataframe` is expected
    # to carry unique index labels.
    remaining = dataframe.drop(train_dataset.index)
    test_dataset = remaining.sample(frac = test_frac, random_state = random_state)
    # `sample` returns new objects, so popping the target column out of them
    # leaves the caller's `dataframe` untouched.
    train_labels = train_dataset.pop(target)
    test_labels = test_dataset.pop(target)
    return train_dataset, train_labels, test_dataset, test_labels


Let's turn this database into a training set and a test set using the `create_train_test_set` function.

In [None]:
# 80% of the rows go to the training set; every remaining row goes to the test set.
train_features, train_labels, test_features, test_labels = create_train_test_set(
    database,
    train_frac = 0.8,
    test_frac = 1.,
    target = 'y',
)

## Get insight on the data

We can now check some statistics on the training set using `get_statistics`.

In [None]:
#export
def get_statistics(dataframe,
                   *argv):
    """ Print the descriptive statistics of one or several datasets
        Arguments :
            :dataframe: pandas DataFrame whose summary is printed first, under a header line
            :*argv: any number of additional pandas objects, each summarized in turn
        Output : None, the summaries are only printed
    """
    main_summary = dataframe.describe().T
    print('Statistics Computed : \n {}'.format(main_summary))
    for extra in argv :
        extra_summary = extra.describe().T
        print(extra_summary)


In [None]:
# Summary statistics of the training inputs, followed by those of the training target.
get_statistics(train_features,train_labels)

Statistics Computed : 
     count      mean       std   min     25%    50%     75%   max
x1   80.0 -0.257250  1.615413 -3.12 -1.5725 -0.220  0.7350  2.96
x2   80.0 -0.277500  1.806109 -3.04 -1.9900 -0.325  1.1450  3.11
x3   80.0  0.146125  1.861266 -3.04 -1.4450  0.220  1.7125  3.12
count    80.000000
mean     -0.773725
std       5.645724
min     -15.798305
25%      -5.385894
50%      -0.858053
75%       4.292206
max      10.253852
Name: y, dtype: float64


## Scaling the data

The next two functions shift and scale the data in two different ways:

*  `norm`: subtract the mean and divide by the standard deviation.
*  `minmaxscaler`: subtract the max and divide by $(\max-\min)$, so the scaled values lie in $[-1, 0]$.

In [None]:
#export
def norm(x):
    """ Center and scale a dataset with its own mean and standard deviation
        Arguments :
            :x: pandas DataFrame containing the data to standardize
        Output :
            A pandas DataFrame holding (x - mean) / std, where mean and std are the ones reported by `x.describe()`
    """
    summary = x.describe().transpose()
    centered = x - summary['mean']
    return centered / summary['std']




In [None]:
#export
def minmaxscaler(x):
    """ Shift a dataset by its maximum and scale it by its range
        Arguments :
            :x: pandas DataFrame containing the data to scale
        Output :
            A pandas DataFrame holding (x - max) / (max - min); the maximum maps to 0 and the minimum to -1
    """
    summary = x.describe().transpose()
    upper = summary['max']
    # NOTE(review): the shift uses the max (not the min), as stated in the
    # notebook text, so values end up in [-1, 0] rather than [0, 1].
    shifted = x - upper
    value_range = upper - summary['min']
    return shifted / value_range


In [None]:
# Standardize the training features, then print their statistics to check the result.
normed_train_features = norm(train_features)
get_statistics(normed_train_features)

Statistics Computed : 
     count          mean  std       min       25%       50%       75%       max
x1   80.0  2.775558e-17  1.0 -1.772147 -0.814188  0.023059  0.614239  1.991596
x2   80.0  5.551115e-18  1.0 -1.529531 -0.948171 -0.026300  0.787605  1.875579
x3   80.0  2.220446e-17  1.0 -1.711806 -0.854862  0.039691  0.841564  1.597770
