In [282]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

import re
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [187]:
# Load the kidney-disease CSV; index_col=0 uses the file's first column as the row index.
df = pd.read_csv('data/kidney_disease.csv', index_col=0)
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0_level_0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [61]:
# Print each column's dtype and non-null count to spot missing values and
# columns that were read as object.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             330 non-null    object 
 16  wc              295 non-null    object 
 17  rc              270 non-null    obj

In [179]:
# Per-column overview: missing-value counts for every column, plus the unique
# values and any unexpected characters found in object (categorical) columns.
def check_overview(df):
    """Summarise missing data and suspicious characters for each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per input column with the fields:
        ``Name``      - column name;
        ``Type``      - ``'Object'`` when the column dtype is object, else ``'Numeric'``;
        ``Total_na``  - number of null entries;
        ``Percent``   - null entries as a percentage of the column length;
        ``Unique``    - array of unique values (object columns) or ``''``;
        ``Weird_obj`` - concatenated ``str()`` of the per-value lists of characters
        that are not ASCII letters, digits or ``'.'`` (object columns) or ``''``.
    """
    # Anything that is not a letter, a digit or a dot counts as "weird".
    weird_pattern = re.compile(r'[^A-Za-z\d\.]')
    column_names = ['Name', 'Type', 'Total_na', 'Percent', 'Unique', 'Weird_obj']

    rows = []
    for col in df.columns:
        series = df[col]

        if df.dtypes[col] == 'object':
            kind = 'Object'
            uniques = series.unique()
            # Scan the string form of every value; keep only non-empty hit lists.
            hits = (weird_pattern.findall(str(value)) for value in series)
            weird = ''.join(str(hit) for hit in hits if hit != [])
        else:
            kind, uniques, weird = 'Numeric', '', ''

        n_missing = series.isnull().sum()
        pct_missing = n_missing / len(series) * 100
        rows.append([col, kind, n_missing, pct_missing, uniques, weird])

    return pd.DataFrame(rows, columns=column_names)

# Build the per-column overview for the freshly loaded frame and display it.
check = check_overview(df)
check

Unnamed: 0,Name,Type,Total_na,Percent,Unique,Weird_obj
0,age,Numeric,9,2.25,,
1,bp,Numeric,12,3.0,,
2,sg,Numeric,47,11.75,,
3,al,Numeric,46,11.5,,
4,su,Numeric,49,12.25,,
5,rbc,Object,152,38.0,"[nan, normal, abnormal]",
6,pc,Object,65,16.25,"[normal, abnormal, nan]",
7,pcc,Object,4,1.0,"[notpresent, present, nan]",
8,ba,Object,4,1.0,"[notpresent, present, nan]",
9,bgr,Numeric,44,11.0,,


In [188]:
# Clean weird characters first: strip everything that is not a letter, a digit
# or '.' from the affected text columns.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the stray characters in place.
weird_chars = r'[^A-Za-z\d\.]'

for col in ['pcv', 'wc', 'rc']:
    # Skip columns that are already numeric so re-running this cell does not
    # fail on the .str accessor of a converted column.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = pd.to_numeric(df[col].str.replace(weird_chars, '', regex=True))
for col in ['dm', 'cad', 'classification']:
    df[col] = df[col].str.replace(weird_chars, '', regex=True)

In [191]:
# Some columns have too many NaNs to be useful for prediction or to impute
# reliably, so drop them (stated cut-off: more than 25% missing).
# NOTE(review): df.info() reports 313 of 400 non-null values for 'sod'
# (21.75% missing), which is below the stated cut-off - confirm that dropping
# it is intended.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second run no longer raises KeyError for already-dropped columns.
df = df.drop(columns=['rbc', 'sod', 'wc', 'rc'], errors='ignore')

In [189]:
# Rebuild the per-column overview on the cleaned frame and display it.
# NOTE(review): the saved output below still lists 'rbc', which the drop cell
# removes, and this cell's execution count (189) precedes the drop cell's (191);
# the output appears stale - re-run with Restart & Run All to refresh it.
check = check_overview(df)
check

Unnamed: 0,Name,Type,Total_na,Percent,Unique,Weird_obj
0,age,Numeric,9,2.25,,
1,bp,Numeric,12,3.0,,
2,sg,Numeric,47,11.75,,
3,al,Numeric,46,11.5,,
4,su,Numeric,49,12.25,,
5,rbc,Object,152,38.0,"[nan, normal, abnormal]",
6,pc,Object,65,16.25,"[normal, abnormal, nan]",
7,pcc,Object,4,1.0,"[notpresent, present, nan]",
8,ba,Object,4,1.0,"[notpresent, present, nan]",
9,bgr,Numeric,44,11.0,,


In [308]:
# --- Split features / target -------------------------------------------------
X = df.drop(columns='classification')
y = df['classification']

# 'number' (rather than only 'float64') so that an integer-typed feature is not
# silently dropped by the ColumnTransformer below.
num_col = X.select_dtypes(include='number').columns
cate_col = X.select_dtypes(include='object').columns
all_col = num_col.append(cate_col)

# --- Encode so that KNNImputer can work on a purely numeric matrix -------------
# Numeric features are only scaled here; their NaNs are filled later by KNNImputer.
num_pipeline = Pipeline([
        ('scaler', StandardScaler())])

# Categorical features: missing -> sentinel '_' -> ordinal code -> back to NaN.
# The categories are listed explicitly with the sentinel first, so '_' is always
# code 0 and real categories start at 1. Relying on the encoder's automatic
# (sorted) categories would turn a genuine category into NaN whenever a column
# has no missing value or a category sorts before '_'.
missing_token = '_'
cate_categories = [[missing_token] + sorted(X[col].dropna().unique()) for col in cate_col]

cate_pipeline = Pipeline([
        ('fill_missing', SimpleImputer(strategy='constant', fill_value=missing_token)),
        ('ordinal_encoder', OrdinalEncoder(categories=cate_categories)),
        ('reverse_missing', SimpleImputer(strategy='constant', missing_values=0, fill_value=np.nan))])

prepare_for_imputer = ColumnTransformer([
        ('num_var', num_pipeline, num_col),
        ('cate_var', cate_pipeline, cate_col)])

X_prepare_for_imputer = pd.DataFrame(prepare_for_imputer.fit_transform(X), columns=all_col)

# --- Impute ---------------------------------------------------------------------
knn_imputer = KNNImputer(n_neighbors=3)

X_imputed = pd.DataFrame(knn_imputer.fit_transform(X_prepare_for_imputer), columns=X_prepare_for_imputer.columns)

# Imputed categorical codes are neighbour averages (e.g. 1.67). Round to the
# nearest code before casting; a bare astype('int64') truncates and would bias
# every imputed value towards the lower category.
X_imputed[cate_col] = X_imputed[cate_col].round().astype('int64')

# --- One-hot encode the categorical codes ------------------------------------------
X_final = pd.get_dummies(X_imputed, columns=cate_col, drop_first=True)
X_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 20 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   age      400 non-null    float64
 1   bp       400 non-null    float64
 2   sg       400 non-null    float64
 3   al       400 non-null    float64
 4   su       400 non-null    float64
 5   bgr      400 non-null    float64
 6   bu       400 non-null    float64
 7   sc       400 non-null    float64
 8   pot      400 non-null    float64
 9   hemo     400 non-null    float64
 10  pcv      400 non-null    float64
 11  pc_2     400 non-null    uint8  
 12  pcc_2    400 non-null    uint8  
 13  ba_2     400 non-null    uint8  
 14  htn_2    400 non-null    uint8  
 15  dm_2     400 non-null    uint8  
 16  cad_2    400 non-null    uint8  
 17  appet_2  400 non-null    uint8  
 18  pe_2     400 non-null    uint8  
 19  ane_2    400 non-null    uint8  
dtypes: float64(11), uint8(9)
memory usage: 38.0 KB
