# Data Preprocessing

In [4]:
import os
import os.path
import pandas as pd
import numpy as np

In [69]:
"""
This method provides a preprocessed data set.

Parameters:
handle_missing_data (str): describes how to handle missing data, possible values are: 
    rr - remove rows with missing data
    rc - remove columns with missing data
    uu - use unknown as its own value 
    dn - do nothing, in this case onehot encoding isn't possible (replaces missing values with np.nan)
         and it only should be used if the prediction algorithm can handle missing data/ None values
         
use_one_hot_encoding (bool): describes whether one hot encoding should be applied, possible values are:
    True  - one hot encoding will be applied
    False - one hot encoding won't be applied, but categorical values will be converted to numeric ones
    
split_size tuple(numeric, numeric, numeric): describes which porportion of the data should be used for the split, possible values:
    the input should be tuple with three numeric values which sum up to 1
        -> (0.7, 0.2, 0.1) in this case 70% will be used for training, 20% for validation and 10% for testing
    In case you want to use more complex cross validation algorithms like k-fold you should only split into train
    and apply your cross validation algorithm to the train data
    
Returns: 
y_train (df): train set (y)
x_train (df): train set (x)
y_val   (df): validation set (y)
x_val   (df): validation set (x)
y_test  (df): test set (y)
x_test  (df): test set (x)
"""
def get_data(handle_missing_data, use_one_hot_encoding, split_size):
    # load the data
    data_file_folder = 'data'
    data_file_name = 'retrieved_data.csv' 
    data_df = pd.read_csv(os.path.join('..' , data_file_folder, data_file_name), dtype='category')
            
    # drop unnecessary columns from x
    data_df = data_df.drop(['veil-type'], axis=1)
    
    # handle missing values
    if handle_missing_data is 'rr':
        data_df = data_df[data_df['stalk-root'] != 0]
    elif handle_missing_data is 'rc':
        data_df = data_df.drop(['stalk-root'], axis=1)
    elif handle_missing_data is 'uu':
        data_df['stalk-root'] = data_df['stalk-root'].replace('?', 'u').astype('category')
    elif handle_missing_data is 'dn':
        data_df['stalk-root'] = data_df['stalk-root'].replace('?', np.nan).astype('category')
    
    # encoding       
    if use_one_hot_encoding is True:
        # the y column should never be one hot encoded but encoded to numeric values instead
        class_df = data_df.drop(data_df.columns.difference(['class']), axis=1)
        class_df['class'] = class_df['class'].cat.codes
        
        data_df = class_df.join(pd.get_dummies(data_df.drop(['class'], axis=1)))        
    else:
        for (columnName, columnData) in data_df.iteritems(): 
            data_df[columnName] = data_df[columnName].cat.codes
            
    # train/ validation/ test split
    train, val, test = np.split(data_df.sample(frac=1, random_state=42), [int(split_size[0]*len(data_df)), int((split_size[0] + split_size[1])*len(data_df))])
    
    print(train-)
    # split in x and y         
    y_train = train.drop(train.columns.difference(['class']), axis=1)    
    x_train = train.drop(['class'], axis=1)
    
    y_val = val.drop(val.columns.difference(['class']), axis=1)    
    x_val = val.drop(['class'], axis=1)
        
    y_test = test.drop(test.columns.difference(['class']), axis=1)    
    x_test = test.drop(['class'], axis=1)
    
    return y_train, x_train, y_val, x_val, y_test, x_test

In [70]:
y_train, x_train, y_val, x_val, y_test, x_test = get_data('rr', True, (0.7, 0.2, 0.1))

      class  cap-shape_b  cap-shape_c  cap-shape_f  cap-shape_k  cap-shape_s  \
1971      0            0            0            1            0            0   
6654      1            0            0            1            0            0   
5606      1            0            0            0            0            0   
3332      0            0            0            1            0            0   
6988      1            0            0            1            0            0   

      cap-shape_x  cap-surface_f  cap-surface_g  cap-surface_s    ...      \
1971            0              1              0              0    ...       
6654            0              0              0              1    ...       
5606            1              0              0              0    ...       
3332            0              0              0              0    ...       
6988            0              0              0              1    ...       

      population_s  population_v  population_y  habitat_

In [71]:
y_train.head(5)

Unnamed: 0,class
1971,0
6654,1
5606,1
3332,0
6988,1


In [72]:
x_train.head(5)

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
1971,0,0,1,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
6654,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
5606,0,0,0,0,0,1,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
3332,0,0,1,0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
6988,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
