# Split

In [14]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Silence every warning for the remainder of the session.
# NOTE(review): this is a blanket filter, so library deprecation and numerical
# warnings are hidden as well; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# define functions

In [15]:
# During sampling, we check that each column's distribution in the sample is similar to the original data,
# using the two-sample Kolmogorov-Smirnov test at a chosen significance level.

def get_sample(df, significance = 0.05, sample_size = 5000, iterations = 100):
    """Draw a random sample whose columns are distributed like the full data.

    Up to ``iterations`` candidate samples of ``sample_size`` rows are drawn
    from ``df``. A candidate is accepted only if, for every column, the
    two-sample Kolmogorov-Smirnov test between the full column and the sampled
    column gives a p-value that is not below ``significance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to sample from; every column is tested.
    significance : float
        Threshold below which a column's p-value rejects the candidate.
    sample_size : int
        Number of rows per candidate, passed to ``DataFrame.sample``.
    iterations : int
        Maximum number of candidates to try.

    Returns
    -------
    pandas.DataFrame
        The first candidate sample that passes the test on all columns.

    Raises
    ------
    ValueError
        If no candidate is accepted within ``iterations`` attempts (this
        includes ``iterations <= 0``). ``DataFrame.sample`` itself may also
        raise when ``sample_size`` is larger than the number of rows.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed first.
    from scipy.stats import ks_2samp

    for attempt in range(1, iterations + 1):
        sample = df.sample(sample_size)
        rejected = False
        for col in range(df.shape[1]):
            col_sample = np.array(sample.iloc[:, col])
            # Compare the raw p-value: rounding it first could lift a value
            # slightly below the threshold up to the threshold and accept it.
            # NOTE(review): a column with missing values may produce a NaN
            # p-value, which never compares below the threshold - confirm.
            pvalue = ks_2samp(df.iloc[:, col], col_sample)[1]
            if pvalue < significance:
                rejected = True
                break
        if not rejected:
            print('found sample after {} iterations'.format(attempt) )
            return sample
    # Reached only when every attempt was rejected or no attempt was made.
    raise ValueError("Could not build samples with {} iterations, significane={}, and sample_size={}"
                     .format(iterations,significance,sample_size))

# Define paths and capture data

In [16]:
# All project data lives under ../data; inputs and outputs of this notebook
# share the same intermediate layer.
data_root = os.path.join('..', 'data')
inputs = os.path.join(data_root, '02_intermediate')
outputs = os.path.join(data_root, '02_intermediate')
reports = os.path.join(data_root, '06_reporting')

# Load the intermediate dataset, using its 'id' column as the row index.
data_path = os.path.join(inputs, 'data.csv')
data = pd.read_csv(data_path, index_col='id')

In [17]:
# Report the (rows, columns) shape, then preview the first rows of the frame.
print('Dataset dimensions:', data.shape)
data.head()

Dataset dimensions: (799, 13)


Unnamed: 0_level_0,cycle,preset_1,preset_2,temperature,pressure,vibrationx,vibrationy,vibrationz,frequency,y,lag_1,lag_2,lag_3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1,3,6,44.235186,47.657254,46.441769,64.820327,66.45452,44.48325,False,,,
1,2,2,4,60.807234,63.172076,62.005951,80.714431,81.246405,60.228715,False,,,
2,3,2,1,79.027536,83.03219,82.64211,98.254386,98.785196,80.993479,False,,,
3,4,2,3,79.716242,100.508634,122.362321,121.363429,118.652538,80.315567,False,0.0,0.0,0.0
4,5,2,5,39.989054,51.764833,42.514302,61.03791,50.716469,64.245166,False,0.0,0.0,0.0


# data sampling
If `sampling_data` is `True`, the project runs faster, but it works on a sample of the data instead of the full dataset.

During sampling, we need to check that the sample's distributions are similar to those of the original data, using the Kolmogorov-Smirnov test at a chosen significance level. See the function defined at the beginning of the notebook.

In [18]:
from scipy.stats import ks_2samp

In [19]:
# Switch for the optional sampling step below: False keeps the full dataset.
sampling_data = False

In [20]:
# Optionally replace the full dataset with a distribution-preserving sample.
if sampling_data:
    # The sample is drawn without replacement, so the requested size is capped
    # at the number of available rows; asking for more rows than exist fails.
    data = get_sample(data, significance=0.05, sample_size=min(5000, len(data)), iterations=10)
    # Sampling returns rows in random order; restore index order so the later
    # shuffle=False train/test split still follows the original sequence.
    data = data.sort_index()
data.shape

(799, 13)

# final description

In [21]:
# Show the last rows of the dataset that goes into the split.
data.tail()

Unnamed: 0_level_0,cycle,preset_1,preset_2,temperature,pressure,vibrationx,vibrationy,vibrationz,frequency,y,lag_1,lag_2,lag_3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
794,795,2,7,107.452875,101.617775,121.179986,80.182308,96.035159,94.125167,True,1.0,1.0,0.0
795,796,2,6,50.469522,98.235421,151.585252,99.34138,148.838481,49.8414,True,1.0,1.0,1.0
796,797,1,4,49.98527,160.43364,110.95301,160.777175,109.917566,110.91929,True,1.0,1.0,1.0
797,798,1,4,79.777294,110.535405,61.334995,149.577811,129.463843,70.853353,True,1.0,1.0,1.0
798,799,2,8,79.360314,159.985443,61.762879,169.773831,130.213426,80.11354,True,1.0,1.0,1.0


In [22]:
# Summary statistics of the numeric columns (count, mean, std, quartiles, min/max).
data.describe()

Unnamed: 0,cycle,preset_1,preset_2,temperature,pressure,vibrationx,vibrationy,vibrationz,frequency,lag_1,lag_2,lag_3
count,799.0,799.0,799.0,799.0,799.0,799.0,799.0,799.0,799.0,796.0,796.0,796.0
mean,400.0,1.987484,4.555695,69.248987,78.958463,73.801236,72.71312,71.843101,68.219647,0.080402,0.067839,0.056533
std,230.795725,0.805584,2.291225,25.548948,32.50299,31.204488,32.693677,27.854379,29.156755,0.272086,0.251628,0.231093
min,1.0,1.0,1.0,2.089354,3.480279,3.846343,10.057744,18.784169,4.380101,0.0,0.0,0.0
25%,200.5,1.0,3.0,51.014242,55.480793,50.698623,48.498192,50.77607,45.788589,0.0,0.0,0.0
50%,400.0,2.0,5.0,65.885436,75.000457,69.380923,65.406638,69.308268,65.646454,0.0,0.0,0.0
75%,599.5,3.0,7.0,80.488039,99.25192,90.044297,94.009832,88.861568,90.118268,0.0,0.0,0.0
max,799.0,3.0,8.0,255.607829,189.995681,230.861142,193.569947,230.951134,178.090303,1.0,1.0,1.0


# Split data
The appropriate `test_size` depends on the size of the dataset. For instance, with 1 million entries, `test_size=0.1` would work fine.

In [23]:
# Features: every column except the target 'y'.
X = data.drop(columns='y')
print('dimensions of X:', X.shape)

# Target: the 'y' column, cast to float.
y = data['y'].astype('float')
print('dimensions of y:', y.shape)

dimensions of X: (799, 12)
dimensions of y: (799,)


In [24]:
# Hold out the last 30% of the rows as the test set. shuffle=False keeps the
# original row order, so the split is a plain cut of the sequence.
# NOTE(review): random_state should have no effect while shuffle=False.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
    random_state=42,
)

# Report the shape of each partition.
for label, part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print('dimensions of {}:'.format(label), part.shape)

dimensions of X_train: (559, 12)
dimensions of y_train: (559,)
dimensions of X_test: (240, 12)
dimensions of y_test: (240,)


In [25]:
# Display the training features for a visual check.
X_train

Unnamed: 0_level_0,cycle,preset_1,preset_2,temperature,pressure,vibrationx,vibrationy,vibrationz,frequency,lag_1,lag_2,lag_3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,3,6,44.235186,47.657254,46.441769,64.820327,66.454520,44.483250,,,
1,2,2,4,60.807234,63.172076,62.005951,80.714431,81.246405,60.228715,,,
2,3,2,1,79.027536,83.032190,82.642110,98.254386,98.785196,80.993479,,,
3,4,2,3,79.716242,100.508634,122.362321,121.363429,118.652538,80.315567,0.0,0.0,0.0
4,5,2,5,39.989054,51.764833,42.514302,61.037910,50.716469,64.245166,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
554,555,3,8,44.926109,45.128385,55.084454,44.829898,54.419924,62.405853,1.0,1.0,1.0
555,556,1,5,59.080631,60.468110,75.435527,59.438649,75.087669,83.777042,0.0,0.0,0.0
556,557,1,7,75.732972,74.688730,95.181253,74.645044,95.769773,115.773835,0.0,0.0,0.0
557,558,1,8,65.867720,62.089807,44.304732,44.142188,45.676147,45.099925,0.0,0.0,0.0


# save train and test sets

In [26]:
# Wrap each target in a single-column frame named 'y' before writing it out.
y_train = pd.DataFrame(y_train, columns=['y'])
y_test = pd.DataFrame(y_test, columns=['y'])

# Write the four partitions as CSV files into the output folder.
for filename, frame in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    frame.to_csv(os.path.join(outputs, filename))