# Data Preprocessing

## Imports

In [69]:
import os
import os.path
import pandas as pd
import numpy as np

## Load the Data

In [70]:
# Location of the raw CSV: <parent dir>/data/retrieved_data.csv
data_file_folder = 'data'
data_file_name = 'retrieved_data.csv'

# Read every column as a pandas categorical.
data_df = pd.read_csv(
    os.path.join('..', data_file_folder, data_file_name),
    dtype='category',
)

## Data Preview

In [71]:
# Preview the first five rows of the loaded frame.
data_df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [72]:
# Show the dtype pandas assigned to each column.
data_df.dtypes

class                       category
cap-shape                   category
cap-surface                 category
cap-color                   category
bruises                     category
odor                        category
gill-attachment             category
gill-spacing                category
gill-size                   category
gill-color                  category
stalk-shape                 category
stalk-root                  category
stalk-surface-above-ring    category
stalk-surface-below-ring    category
stalk-color-above-ring      category
stalk-color-below-ring      category
veil-type                   category
veil-color                  category
ring-number                 category
ring-type                   category
spore-print-color           category
population                  category
habitat                     category
dtype: object

In [73]:
# Walk over every column, remember its values and print the distinct
# categories that occur in it.
# NOTE(review): the dict stores the full per-row value array of each column,
# not only the distinct values that are printed below.
possible_values_per_column = {}
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
for columnName, columnData in data_df.items():
    possible_values_per_column[columnName] = columnData.values
    print('Colunm Name : ', columnName)
    print('Column Contents : ', sorted(set(columnData.values)))
    print()

Colunm Name :  class
Column Contents :  ['e', 'p']

Colunm Name :  cap-shape
Column Contents :  ['b', 'c', 'f', 'k', 's', 'x']

Colunm Name :  cap-surface
Column Contents :  ['f', 'g', 's', 'y']

Colunm Name :  cap-color
Column Contents :  ['b', 'c', 'e', 'g', 'n', 'p', 'r', 'u', 'w', 'y']

Colunm Name :  bruises
Column Contents :  ['f', 't']

Colunm Name :  odor
Column Contents :  ['a', 'c', 'f', 'l', 'm', 'n', 'p', 's', 'y']

Colunm Name :  gill-attachment
Column Contents :  ['a', 'f']

Colunm Name :  gill-spacing
Column Contents :  ['c', 'w']

Colunm Name :  gill-size
Column Contents :  ['b', 'n']

Colunm Name :  gill-color
Column Contents :  ['b', 'e', 'g', 'h', 'k', 'n', 'o', 'p', 'r', 'u', 'w', 'y']

Colunm Name :  stalk-shape
Column Contents :  ['e', 't']

Colunm Name :  stalk-root
Column Contents :  ['?', 'b', 'c', 'e', 'r']

Colunm Name :  stalk-surface-above-ring
Column Contents :  ['f', 'k', 's', 'y']

Colunm Name :  stalk-surface-below-ring
Column Contents :  ['f', 'k', 's'

As we can see, stalk-root is missing some values (they are marked with '?'). Furthermore, veil-type contains only one possible value and is therefore useless to us.

In [74]:
# Number of rows whose stalk-root is the '?' placeholder vs. total row count.
print(
    data_df['stalk-root'].value_counts()['?'],
    '/ ',
    len(data_df),
)

2480 /  8124


In [75]:
# Drop veil-type (noted above as holding only a single value).
data_df = data_df.drop(columns=['veil-type'])

In [141]:
def get_data(handle_missing_data, use_one_hot_encoding, split_size, data_path=None):
    """
    Provide a preprocessed data set, split into train, validation and test parts.

    The CSV is read with every column as a categorical, the 'veil-type' column
    is dropped, the missing 'stalk-root' values (marked with '?') are handled,
    the columns are encoded and the shuffled rows are split.

    Parameters:
    handle_missing_data (str): describes how to handle missing data, possible values are:
        rr - remove rows with missing data
        rc - remove columns with missing data
        uu - use unknown as its own value (the '?' category is renamed to 'u')
        dn - do nothing: missing values become NaN. With one hot encoding those rows
             simply get no stalk-root indicator set; with numeric codes they become -1.
             Only use this if the prediction algorithm can handle missing data.
        Any other value leaves the column untouched, i.e. '?' stays an ordinary category.

    use_one_hot_encoding (bool): describes whether one hot encoding should be applied, possible values are:
        True  - one hot encoding will be applied (to every column, including 'class')
        False - one hot encoding won't be applied, but categorical values will be
                converted to their numeric category codes

    split_size tuple(numeric, numeric, numeric): describes which proportion of the data should be
        used for the split. The input should be a tuple with three numeric values which sum up to 1
            -> (0.7, 0.2, 0.1) in this case 70% will be used for training, 20% for validation
               and 10% for testing
        Only the first two entries are used to compute the boundaries; the test set
        receives all remaining rows.
        In case you want to use more complex cross validation algorithms like k-fold you should
        only split into train and apply your cross validation algorithm to the train data.

    data_path (str, optional): path of the CSV file to load. Defaults to
        '../data/retrieved_data.csv' relative to the current working directory.

    Returns:
    train (df): train set
    val (df): validation set
    test (df): test set
    """
    missing_marker = '?'
    missing_column = 'stalk-root'

    # load the data
    if data_path is None:
        data_path = os.path.join('..', 'data', 'retrieved_data.csv')
    data_df = pd.read_csv(data_path, dtype='category')

    # drop unnecessary columns
    data_df = data_df.drop(columns=['veil-type'])

    # handle missing values (strings are compared with ==, never with `is`)
    if handle_missing_data == 'rr':
        data_df = data_df[data_df[missing_column] != missing_marker].copy()
        # Forget the now-empty '?' category so that it neither produces an
        # all-zero dummy column nor shifts the numeric codes.
        data_df[missing_column] = data_df[missing_column].cat.remove_unused_categories()
    elif handle_missing_data == 'rc':
        data_df = data_df.drop(columns=[missing_column])
    elif handle_missing_data == 'uu':
        # Keys that are not an existing category are ignored by rename_categories.
        data_df[missing_column] = data_df[missing_column].cat.rename_categories(
            {missing_marker: 'u'}
        )
    elif handle_missing_data == 'dn':
        # remove_categories raises for unknown categories, hence the guard.
        if missing_marker in data_df[missing_column].cat.categories:
            # Values of a removed category are turned into NaN.
            data_df[missing_column] = data_df[missing_column].cat.remove_categories(
                [missing_marker]
            )

    # encoding
    if use_one_hot_encoding:
        data_df = pd.get_dummies(data_df)
    else:
        for column_name in data_df.columns:
            data_df[column_name] = data_df[column_name].cat.codes

    # train/ validation/ test split on a reproducibly shuffled copy
    shuffled_df = data_df.sample(frac=1, random_state=42)
    row_count = len(shuffled_df)
    train_end = int(split_size[0] * row_count)
    val_end = int((split_size[0] + split_size[1]) * row_count)

    train = shuffled_df.iloc[:train_end]
    val = shuffled_df.iloc[train_end:val_end]
    test = shuffled_df.iloc[val_end:]

    return train, val, test