# Split

In [71]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

In [72]:
# Add ../src to the module search path so modules stored there can be imported.
# Guarded so that re-running this cell does not append duplicate entries.
src_dir = os.path.join('..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [73]:
import importlib
import utils
# Reload utils so this session uses the module's current source (useful while
# utils is being edited), then bind capture_data from the reloaded module.
# NOTE(review): utils is presumably found via the '../src' entry added to sys.path — confirm.
importlib.reload(utils)
from utils import capture_data

# define functions

In [74]:
def save_csv(sufix, label=False, obj=None, out_dir=None):
    """Write an object to ``<out_dir>/<sufix>.csv``.

    Parameters
    ----------
    sufix : str
        Base name of the output file (``.csv`` is appended). When ``obj`` is
        not given, it is also the name of the module-level variable to save.
    label : bool, default False
        If False, the object is written as-is (it must provide ``to_csv``).
        If True, the object is first wrapped in a one-column DataFrame
        whose column is named ``'y'``.
    obj : optional
        Object to save. Defaults to the module-level variable named ``sufix``.
    out_dir : str, optional
        Target directory. Defaults to the module-level ``outputs`` path.

    Raises
    ------
    NameError
        If ``obj`` is omitted and no module-level variable named ``sufix`` exists.
    """
    if obj is None:
        # Plain namespace lookup instead of eval(): same result for a variable
        # name, but it cannot execute arbitrary expressions.
        try:
            obj = globals()[sufix]
        except KeyError:
            raise NameError("name '{}' is not defined".format(sufix)) from None
    if out_dir is None:
        # Resolved at call time, so `outputs` only has to exist when saving.
        out_dir = outputs

    if label:
        df = pd.DataFrame(obj, columns=['y'])
    else:
        df = obj
    df.to_csv(os.path.join(out_dir, sufix + '.csv'))

# Set sampling to True/False
* If `sampling_data == True`, the project runs faster, but it works on a random sample of `data` (`sample_size` rows) instead of the full set.

In [75]:
# Toggle: when True, `data` is reduced to a random sample in the sampling cell below.
sampling_data = True

In [76]:
# Number of rows drawn from `data` when sampling_data is True.
sample_size = 5000

# Define paths and capture data

In [77]:
# All project data folders live under ../data.
data_root = os.path.join('..', 'data')
inputs = os.path.join(data_root, '02_intermediate')   # where the CSVs are read from
outputs = os.path.join(data_root, '02_intermediate')  # where the split sets are written
reports = os.path.join(data_root, '06_reporting')

# Load the train and test files from the inputs folder, in that order.
data, data_test = (
    capture_data(inputs, file_name) for file_name in ('data.csv', 'data_test.csv')
)

In [78]:
# Print the (rows, columns) shape of `data`, then display its first rows.
print('Dataset dimensions:', data.shape)
data.head()

Dataset dimensions: (32561, 14)


Unnamed: 0_level_0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0.0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0.0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0.0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0.0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0.0


In [79]:
# Print the (rows, columns) shape of `data_test`, then display its first rows.
print('Dataset dimensions:', data_test.shape)
data_test.head()

Dataset dimensions: (16281, 14)


Unnamed: 0_level_0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,25,State-gov,Bachelors,7,Never-married,Adm-clerical,Not-in-family,White,Male,0,0,40,United-States,0.0
1,38,Self-emp-not-inc,Bachelors,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,0.0
2,28,Private,HS-grad,12,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0.0
3,44,Private,11th,10,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,7688,0,40,United-States,0.0
4,18,Private,Bachelors,10,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,30,Cuba,0.0


# Data sampling
If `sampling_data == True`, the notebook runs faster, but `data` is replaced by a random sample of `sample_size` rows; `data_test` is left untouched.

In [80]:
# Optionally shrink `data` to speed up the rest of the project.
# The requested size is capped at the number of available rows because
# DataFrame.sample (without replacement) raises ValueError when asked for
# more rows than exist. random_state is fixed so the sample is reproducible.
if sampling_data:
    data = data.sample(min(sample_size, len(data)), random_state=42)
data.shape

(5000, 14)

# final description

In [81]:
# Display the last rows of `data` (after the optional sampling step).
data.tail()

Unnamed: 0_level_0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
27453,32,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,48,United-States,0.0
27161,43,Private,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1.0
17926,35,Private,HS-grad,9,Divorced,Craft-repair,Unmarried,White,Male,0,0,40,United-States,0.0
25713,32,Private,Assoc-voc,11,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,99,United-States,1.0
3334,26,Private,HS-grad,9,Never-married,Handlers-cleaners,Own-child,Black,Male,0,0,40,United-States,0.0


In [82]:
# Summary statistics (count, mean, std, quartiles, min/max) for `data`.
data.describe()

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week,y
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,38.7936,10.065,1130.1594,87.18,40.6796,0.2456
std,13.688304,2.555246,7583.026928,406.812837,12.26164,0.430485
min,17.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,12.0,0.0,0.0,45.0,0.0
max,90.0,16.0,99999.0,4356.0,99.0,1.0


# Split data
The right `test_size` depends on how much data is available. For instance, with 1 million entries, `test_size=0.1` would work fine.

Note, however, that we already have separate train and test sets, so we only need to separate the response variable `y` from the features.

If we wanted to split a single dataset with a helper function instead, the approach would be:
```python
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
```

In [83]:
# Training set: every column except the response 'y' is a feature.
X_train, y_train = data.drop(columns='y'), data['y']
print('dimensions of X:', X_train.shape)
print('dimensions of y:', y_train.shape)

# Test set: same separation applied to data_test.
X_test, y_test = data_test.drop(columns='y'), data_test['y']
print('\r\ndimensions of X:', X_test.shape)
print('dimensions of y:', y_test.shape)

dimensions of X: (5000, 13)
dimensions of y: (5000,)

dimensions of X: (16281, 13)
dimensions of y: (16281,)


# save train and test sets

In [84]:
# (variable name, is_label) pairs: features are saved as-is, targets are
# wrapped in a one-column frame by save_csv. Order matches the original loops.
sets_to_save = [
    ('X_train', False),
    ('X_test', False),
    ('y_train', True),
    ('y_test', True),
]
for set_name, is_label in sets_to_save:
    save_csv(set_name, label=is_label)