# Data Preprocessing

This notebook loads the raw heart-disease dataset, re-encodes the `heart_disease` target column from 1/2 to 0/1, saves the processed data as CSV, and creates 3 stratified train/validation splits from the result.

In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from lib.definitions import RANDOM_SEED, numerical_column_names, categorical_column_names, binary_column_names
from lib.definitions import RAW_TRAINING_DATA, PROCESSED_DATA_OUTPUT_PATH, SPLITS_BASE_PATH

In [3]:
# Seed every random number generator used by the stack so that results
# (in particular the shuffled splits created below) are reproducible.
# NOTE(review): this assignment shadows the RANDOM_SEED imported from
# lib.definitions in the previous cell - confirm both values agree, or drop
# one of the two definitions.
RANDOM_SEED = 42

import random
random.seed(RANDOM_SEED)

import numpy.random
numpy.random.seed(RANDOM_SEED)

import os
# The running interpreter reads PYTHONHASHSEED only at startup, so this
# assignment can only influence child processes spawned from the kernel.
os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)

import tensorflow
# TensorFlow 2.x exposes the global seed setter as tensorflow.random.set_seed;
# TensorFlow 1.x only provides tensorflow.set_random_seed. Support both so the
# cell does not raise AttributeError after a TensorFlow upgrade.
_tf_random = getattr(tensorflow, 'random', None)
if hasattr(_tf_random, 'set_seed'):
    _tf_random.set_seed(RANDOM_SEED)
else:
    tensorflow.set_random_seed(RANDOM_SEED)

In [4]:
import joblib
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [5]:
# Fail fast - before any output directory is created - if the raw input file
# is missing.
assert RAW_TRAINING_DATA.exists(), \
    'Raw training data not found: {}'.format(RAW_TRAINING_DATA)

# Both locations are written to further down (processed CSV and per-split
# CSVs), so make sure they exist up front.
# NOTE(review): assumes PROCESSED_DATA_OUTPUT_PATH is a pathlib.Path like
# SPLITS_BASE_PATH - confirm in lib.definitions.
PROCESSED_DATA_OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
SPLITS_BASE_PATH.mkdir(parents=True, exist_ok=True)

## 1. Load Data

In [6]:
from lib.definitions import column_definitions

In [7]:
# The raw file is space-delimited; because column names are supplied
# explicitly, the first line of the file is read as data rather than a header.
df_train = pd.read_csv(
    RAW_TRAINING_DATA,
    names=column_definitions,
    sep=' ',
)

Change the encoding of the `heart_disease` target column from 1/2 to 0/1 and save the resulting data as CSV.

In [8]:
# Re-encode the target from 1/2 to 0/1.
# The shift is guarded by the values actually present so that the cell is
# idempotent: re-running it after the labels have already been shifted must
# not subtract 1 a second time (which would silently produce -1/0 labels).
_label_values = set(df_train['heart_disease'].unique())
if _label_values == {1, 2}:
    df_train['heart_disease'] = df_train['heart_disease'] - 1
elif _label_values != {0, 1}:
    # Neither the raw nor the processed encoding: refuse to guess.
    raise ValueError(
        'Unexpected heart_disease labels {!r}; expected 1/2 (raw) or 0/1 (processed)'
        .format(_label_values)
    )

# Persist the full processed dataset (without the index column).
df_train.to_csv(PROCESSED_DATA_OUTPUT_PATH / 'heart.csv', index=False)

## 2. Create Splits

In [9]:
# Shuffled, stratified 3-fold splitter; seeding it makes the folds repeatable.
splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)

# Separate the target column from the feature columns.
df_training_labels = df_train['heart_disease']
df_training_features = df_train.drop('heart_disease', axis='columns')

fold_indices = splitter.split(df_training_features, df_training_labels)
for fold_number, (train_rows, val_rows) in enumerate(fold_indices, start=1):
    # Each fold gets its own directory named 1, 2, 3 under SPLITS_BASE_PATH.
    fold_dir = SPLITS_BASE_PATH / str(fold_number)
    fold_dir.mkdir(parents=True, exist_ok=True)

    # (file name, rows to persist) pairs, written in this order.
    fold_outputs = (
        ('train_values.csv', df_training_features.iloc[train_rows]),
        ('train_labels.csv', df_training_labels.iloc[train_rows]),
        ('val_values.csv', df_training_features.iloc[val_rows]),
        ('val_labels.csv', df_training_labels.iloc[val_rows]),
    )
    for file_name, fold_data in fold_outputs:
        fold_data.to_csv(fold_dir / file_name, index=False, header=True)