# Data Preprocessing

In [154]:
import os
import os.path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [155]:
# Future Improvement Ideas
"""
feature engineering: height and weight are probably highly correlated (multicollinearity problem) => replace them with BMI?
outlier handling: check the numeric columns for outliers and implement ways to handle them
normalization: support more normalization methods than min-max
"""


"""
This method provides a preprocessed data set.

Parameters:
enable_feature_engineering (bool): describes whether some features get replaced by artifical ones (calculated from the replaced ones), possible values:
    True - enabled
    False - disabled
    
enable_outlier_handling (bool): describes whether some extreme data entries will be deleted or not, possbible values:
    True - enabled
    False - disabled
    
normalize (str): describes whether the numeric values should be normalized (between 0 and 1), possible values:
    'minmax' - uses min max to normalize the data
    'median' - uses median to normalize the data
    None - data won't be normalized

use_one_hot_encoding (bool): describes whether one hot encoding should be applied to categorical data, possible values are:
    True  - enabled
    False - disabled
    
split_size tuple(numeric, numeric, numeric): describes which porportion of the data should be used for the split, possible values:
    the input should be tuple with three numeric values which sum up to 1
        -> (0.7, 0.2, 0.1) in this case 70% will be used for training, 20% for validation and 10% for testing
    In case you want to use more complex cross validation algorithms like k-fold you should only split into train
    and apply your cross validation algorithm to the train data
    
Returns: 
(df): train set (y)
(df): train set (x)
(df): validation set (y)
(df): validation set (x)
(df): test set (y)
(df): test set (x)
"""
def get_data(enable_feature_engineering, enable_outlier_handling, normalize, enable_one_hot_encoding, split_size):
    # load the data
    data_file_folder = 'data'
    data_file_name = 'cardio_data.csv' 
    data_df = pd.read_csv(os.path.join('..' , data_file_folder, data_file_name), sep=';')       
      
    # drop unnecessary columns
    data_df = data_df.drop(['id'], axis=1)   
            
    # set dtypes
    data_df = data_df.astype({
        'age': 'int64',
        'gender': 'int64',
        'height': 'int64',
        'weight': 'int64',
        'ap_hi': 'int64',
        'ap_lo': 'int64',
        'cholesterol': 'category',
        'gluc': 'category',
        'smoke': 'bool',
        'alco': 'bool',
        'active': 'bool',
        'cardio': 'bool'
    })
    
    # drop duplicate rows
    data_df.drop_duplicates(inplace=True)
    
    # outlier handline
    if enable_outlier_handling:
        # remove extreme cases of height, weight and blood presure (height, weight, ap_hi, ap_lo)
        data_df = data_df[data_df['height'] > 120]
        data_df = data_df[data_df['height'] < 250]
        data_df = data_df[data_df['weight'] > 20]
        data_df = data_df[data_df['weight'] < 250]
       
        # normal systolic blood preasure ranges from ~80 to ~120 but values till 240(?) are imaginable 
        data_df = data_df[data_df['ap_hi'] > 40]
        data_df = data_df[data_df['ap_hi'] < 240]
        
        # normal diastolic blood preasure ranges from ~40 to ~80  but values till 200(?) are imaginable
        data_df = data_df[data_df['ap_lo'] > 20]
        data_df = data_df[data_df['ap_lo'] < 200]
        
        # systolic blood preasure should always be higher then diastolic
        data_df = data_df[data_df['ap_lo'] < data_df['ap_hi']]
        
    # feature engineering
    if enable_feature_engineering:
        # replace gender with is female
        data_df['is_female'] = data_df['gender'] == 1
        data_df.drop(['gender'], axis=1, inplace=True)
        
        # combine height and weight to BMI to reduce the multicollinearity problem
        data_df['bmi'] = data_df['weight']/(data_df['height']**2)
           
    # normalization
    if normalize is None:
        pass    
    elif normalize in ['minmax']:
        columns_to_normalize = ['age', 'weight' , 'height', 'ap_hi', 'ap_lo']
        
        for column_name in columns_to_normalize:
            if normalize == 'minmax':
                data_df[column_name] = ((data_df[column_name] - data_df[column_name].min()) / (data_df[column_name].max() - data_df[column_name].min())).astype('float64')
    else:
        raise Exception('Invalid value for normalization!')
    
    
    # encoding (0 corresponds to ediable and 1 to poisonous)       
    # this part also splits in x and y, it was convienient to do at one step
    if enable_one_hot_encoding is True:
        # the y column should never be one hot encoded but encoded to numeric values instead
        y = data_df['cardio']   
        x = pd.get_dummies(data_df.drop(['cardio'], axis=1), columns=['cholesterol', 'gluc'])
    else:               
        # split in x and y         
        y = data_df.drop(data_df.columns.difference(['cardio']), axis=1)    
        x = data_df.drop(['cardio'], axis=1)

    # train/ validation/ test split
    x_train, x_val_and_test, y_train, y_val_and_test = train_test_split(x, y, test_size = split_size[1] + split_size[2], random_state=42)
    
    if split_size[1] > 0.0:
        x_val, x_test, y_val, y_test = train_test_split(x_val_and_test, y_val_and_test, test_size=(split_size[1]/(split_size[1]+split_size[2])), random_state=42)
    else:
        x_val = None
        y_val = None
        x_test = x_val_and_test
        y_test = y_val_and_test        
        
    return y_train, x_train, y_val, x_val, y_test, x_test