# Data preparation
This notebook takes as input the original dataset and generates a clean dataset. It can run automatically or with some user inputs. User inputs generate a more consistent dataset, but for the dataset used it has no effect on model accuracy. 
In addition to the automatic mode, the numerical encoding and type of scaler are also parameters. 

In [None]:
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler

# When True the notebook runs without prompting; when False the noise-removal
# and encoding cells ask the user for input.
auto_mode = True
# Scaler instance applied to the dataset in the "Scaling" section.
scaler = RobustScaler()
# Strategy for missing numerical values: 'mode', 'mean' or 'knn'.
numerical_null_values = 'knn'
# Strategy for missing categorical values: 'frequent' or 'knn'.
categorical_null_values = 'knn'
# Base name of the input file, read from 'Data/<dataset_name>.csv'.
dataset_name = 'kidney_disease'
# Columns removed from the dataset before any other processing.
useless_columns = ['id']

## Parameter check

In [None]:
import os

# Validate the notebook parameters up front so that a bad value fails here,
# with a message naming the offending parameter, rather than in a later cell.

# isinstance is used because `1 in [True, False]` is True in Python, which
# would let the integers 0 and 1 pass a membership test.
if not isinstance(auto_mode, bool):
    raise ValueError('"auto_mode" is not of type boolean')
# Only instances of the four supported scaler classes are accepted.
if not isinstance(
    scaler, (MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler)
):
    raise ValueError('"scaler" is not of a scaler class')
if numerical_null_values not in ['mode', 'mean', 'knn']:
    raise ValueError('"numerical_null_values" has to be in [mode, mean, knn]')
if categorical_null_values not in ['knn', 'frequent']:
    # The message must name the parameter that is actually being checked.
    raise ValueError('"categorical_null_values" has to be in [frequent, knn]')
# The dataset is expected at Data/<dataset_name>.csv relative to the
# working directory.
if not os.path.exists('Data/'+dataset_name+'.csv'):
    raise ValueError('No dataset found for name "'+dataset_name+'"')

## Import original dataset

In [None]:
import pandas as pd

# Load the raw dataset from 'Data/<dataset_name>.csv'.
original_df = pd.read_csv('Data/'+dataset_name+'.csv')
# Last expression of the cell: shows the first rows as a quick preview.
original_df.head()

## Drop irrelevant columns

In [None]:
# Remove the columns listed in `useless_columns` (axis=1), modifying the
# frame in place.
original_df.drop(useless_columns, inplace=True, axis=1)

## Drop duplicated rows

In [None]:
# Remove duplicated rows, using the default drop_duplicates settings.
original_df = original_df.drop_duplicates()

## Split Categorical and Numerical columns

In [None]:
# Partition the column names by dtype: float64/int64 columns are treated as
# numerical, object columns as categorical.
numerical_cols = list(
    original_df.select_dtypes(include=['float64', 'int64']).columns
)
categorical_cols = list(
    original_df.select_dtypes(include=['object']).columns
)

# Print both lists of column names under their headings.
for heading, names in (
    ("Numerical Columns:\n", numerical_cols),
    ("\nCategorical Columns:\n", categorical_cols),
):
    print(heading, "  - ".join(names))

### Convert false categorical to numerical

In [None]:
# A column read as object is considered "falsely categorical" as soon as one
# of its non-null values is a string made only of numeric characters. Such a
# column is converted to numeric; values that cannot be parsed become NaN.
modified = False
for name in categorical_cols:
    non_null_values = original_df[name].dropna()
    has_numeric_string = any(
        isinstance(value, str) and value.isnumeric()
        for value in non_null_values
    )
    if not has_numeric_string:
        continue
    modified = True
    print(name+' was categorical')
    original_df[name] = pd.to_numeric(original_df[name], errors='coerce')

# Recompute both column lists only when at least one column changed dtype.
if modified:
    numerical_cols = list(
        original_df.select_dtypes(include=['float64', 'int64']).columns
    )
    categorical_cols = list(
        original_df.select_dtypes(include=['object']).columns
    )

    print("\nNew numerical Columns:\n", "  - ".join(numerical_cols))
    print("\nNew categorical Columns:\n", "  - ".join(categorical_cols))

## Categorical columns standardization
### Noise removing
Some of the categorical values contain misspelled data (example: '\tyes' for 'yes'). Here, we want to make sure that every misspelling is replaced by its correct value.
#### User controlled way

In [None]:
# Interactive clean-up: for each categorical column, show its distinct
# non-null values and let the user type a replacement for any of them.
if not auto_mode:
    for name in categorical_cols:
        distinct_values = original_df[name].dropna().unique()
        print(distinct_values)
        print('\n' + name + ':')
        for current in distinct_values:
            answer = input("\t- '" + current + "' : replace? (y/N)")
            # Anything other than an explicit 'y' keeps the value as it is.
            if answer != 'y':
                continue
            substitute = input('\t  replace by: ')
            original_df[name] = original_df[name].replace(current, substitute)
        # Refresh the distinct values after the replacements.
        distinct_values = original_df[name].dropna().unique()

    print('\nBoolean columns now all contain only 2 distinct not-null values')

#### Automatic way

In [None]:
# Automatic clean-up: remove every tab and space from the categorical values,
# then lower-case them so that variants such as '\tyes' collapse to 'yes'.
if auto_mode:
    whitespace_removal = {'\t': '', ' ': ''}
    stripped = original_df[categorical_cols].replace(whitespace_removal, regex=True)
    original_df[categorical_cols] = stripped
    for name in categorical_cols:
        original_df[name] = original_df[name].str.lower()

#### Check correction
In this dataset, all the categorical columns express a boolean value (examples: [yes, no], [present, notpresent]). We can verify that no column contains misspellings by checking that each one has at most 2 distinct non-null values.

In [None]:
# Report every categorical column that has more than two distinct non-null
# values; the check is cleared only when no such column exists.
all_col_check = True
for name in categorical_cols:
    distinct_values = original_df[name].dropna().unique()
    if len(distinct_values) <= 2:
        continue
    all_col_check = False
    print(name+' contains misspells : '+str(distinct_values))

if all_col_check:
    print('Check is cleared')

### String encoding
#### User controlled way
This approach keeps the encoding consistent across columns. For example, if yes is encoded as 1 in one column, it can also be encoded as 1 in every other column, which automatic encoding cannot guarantee.

In [None]:
# Interactive encoding: for every categorical column with exactly two distinct
# non-null values, propose (first value -> True, second value -> False) and
# let the user reverse that order before the values are replaced by 1 / 0.
if not auto_mode:
    boolean_columns = [
        name for name in categorical_cols
        if len(original_df[name].dropna().unique()) == 2
    ]
    for column in boolean_columns:
        distinct_values = original_df[column].dropna().unique()
        order = True
        print(column + ' : ' + str(distinct_values) + ' --> ' + str([order, not order]))
        if input("Confirm order or reverse ? (C/r)") == 'r':
            order = not order
        # (value, boolean it maps to, text appended to its report line)
        encodings = (
            (distinct_values[0], order, ''),
            (distinct_values[1], not order, '\n'),
        )
        for value, flag, trailer in encodings:
            original_df[column] = original_df[column].replace(value, int(flag))
            print('\t- ' + value + ' --> ' + str(flag) + trailer)

#### Automatic way

In [None]:
# Automatic encoding of the categorical columns to 0 / 1.
#
# The first distinct value met in a column is encoded as 0 when it looks
# negative ('no', or a value starting with 'not' or 'ab'), and as 1
# otherwise; the other value gets the opposite code. Null values stay null
# here and are handled by the null-value cells.
if auto_mode:
    for col in categorical_cols:
        uniques = original_df[col].dropna().unique()
        if len(uniques) == 0:
            # Nothing to encode: the column only holds null values.
            continue
        if len(uniques) > 2:
            # Fail here with an explicit message instead of leaving strings
            # in the column for a later numeric step to reject.
            raise ValueError(
                col + ' cannot be encoded as boolean, it has more than 2 '
                'distinct values: ' + str(uniques)
            )
        first_is_negative = (
            uniques[0] == 'no' or uniques[0].startswith(('not', 'ab'))
        )
        codes = [0, 1] if first_is_negative else [1, 0]
        # zip() stops at the shortest input, so a column with a single
        # distinct value is encoded too instead of raising IndexError.
        mapping = dict(zip(uniques, codes))
        # One map() call replaces both values and yields a numeric column.
        original_df[col] = original_df[col].map(mapping)

## Scaling

In [None]:
# Fit the configured scaler on the whole frame and rebuild a DataFrame that
# carries the original column names.
# NOTE(review): every column is scaled here, including the categorical
# columns that were just encoded as 0/1 — confirm this is intended.
original_df = pd.DataFrame(scaler.fit_transform(original_df), columns=original_df.columns)

## Null values handling
### Numerical columns
We have 3 different ways to replace null values in numerical columns:
- Mean
- Mode
- KNN prediction  

As our dataset has a large number of outliers, mode seems like a better choice than mean.

In [None]:
# Fill the nulls of each numerical column with that column's mode or mean.
# Nothing happens here when the strategy is 'knn'.
if numerical_null_values in ['mode', 'mean']:
    use_mode = numerical_null_values == 'mode'
    for name in numerical_cols:
        series = original_df[name]
        # mode() can return several values; the first one is used.
        fill_value = series.mode()[0] if use_mode else series.mean()
        original_df[name] = series.fillna(fill_value)

### Categorical columns
- most frequent value
- KNN prediction  

In [None]:
# Fill the nulls of each categorical column with its most frequent value.
# Nothing happens here when the strategy is 'knn'.
if categorical_null_values == 'frequent':
    for name in categorical_cols:
        counts = original_df[name].value_counts()
        original_df[name] = original_df[name].fillna(counts.idxmax())

### KNN prediction
KNN prediction is more precise but more computationally expensive, for both numerical and categorical columns. Because it imputes the whole dataset at once, it is run last; otherwise it would also fill the null values meant to be handled by the other strategies.

In [None]:
from sklearn.impute import KNNImputer

# Run the KNN imputer (1 neighbour) over the whole frame when either
# null-value strategy is 'knn', then rebuild the DataFrame with its
# original column names.
if categorical_null_values == 'knn' or numerical_null_values == 'knn':
    imputer = KNNImputer(n_neighbors=1)
    imputed_values = imputer.fit_transform(original_df)
    original_df = pd.DataFrame(imputed_values, columns=original_df.columns)

### No Null value check

In [None]:
# Final safety net: stop if any null value is left in the dataset.
contains_nulls = original_df.isna().values.any()
if contains_nulls:
    raise Exception('Dataset still contains null values')

## Save clean dataset into csv file

In [None]:
# Write the cleaned frame to 'Data/<dataset_name>-clean.csv', without the
# index column.
original_df.to_csv('Data/'+dataset_name+'-clean.csv', index=False)