In [34]:
import numpy as np
import pandas as pd 

In [4]:
#!pip install impyute

In [35]:
from impyute.imputation.cs import mice
from sklearn.preprocessing import OrdinalEncoder

In [36]:
df = pd.read_csv('kidney_disease.csv')

In [37]:
cols_names={"bp":"blood_pressure",
          "sg":"specific_gravity",
          "al":"albumin",
          "su":"sugar",
          "rbc":"red_blood_cells",
          "pc":"pus_cell",
          "pcc":"pus_cell_clumps",
          "ba":"bacteria",
          "bgr":"blood_glucose_random",
          "bu":"blood_urea",
          "sc":"serum_creatinine",
          "sod":"sodium",
          "pot":"potassium",
          "hemo":"haemoglobin",
          "pcv":"packed_cell_volume",
          "wc":"white_blood_cell_count",
          "rc":"red_blood_cell_count",
          "htn":"hypertension",
          "dm":"diabetes_mellitus",
          "cad":"coronary_artery_disease",
          "appet":"appetite",
          "pe":"pedal_edema",
          "ane":"anemia"}

df.rename(columns=cols_names, inplace=True)

In [38]:
df['red_blood_cell_count'] = pd.to_numeric(df['red_blood_cell_count'], errors='coerce')
df['packed_cell_volume'] = pd.to_numeric(df['packed_cell_volume'], errors='coerce')
df['white_blood_cell_count'] = pd.to_numeric(df['white_blood_cell_count'], errors='coerce')

In [39]:
df.drop(["id"],axis=1,inplace=True)

In [40]:
numerical_features = []
categorical_features = []

for i in df.drop('classification', axis=1).columns:
    if df[i].nunique()>7:
        numerical_features.append(i)
    else:
        categorical_features.append(i)

In [41]:
#Replace incorrect values
df['diabetes_mellitus'] = df['diabetes_mellitus'].replace(to_replace = {'\tno':'no','\tyes':'yes',' yes':'yes'})
df['coronary_artery_disease'] = df['coronary_artery_disease'].replace(to_replace = '\tno', value='no')
df['classification'] = df['classification'].replace(to_replace = 'ckd\t', value = 'ckd')

In [42]:
df.loc[:,categorical_features].isnull().sum().sort_values(ascending=False)

red_blood_cells            152
pus_cell                    65
sugar                       49
specific_gravity            47
albumin                     46
pus_cell_clumps              4
bacteria                     4
hypertension                 2
diabetes_mellitus            2
coronary_artery_disease      2
appetite                     1
pedal_edema                  1
anemia                       1
dtype: int64

In [43]:
df.loc[:,numerical_features].isnull().sum().sort_values(ascending=False)

red_blood_cell_count      131
white_blood_cell_count    106
potassium                  88
sodium                     87
packed_cell_volume         71
haemoglobin                52
blood_glucose_random       44
blood_urea                 19
serum_creatinine           17
blood_pressure             12
age                         9
dtype: int64

## Codificación de características categóricas con tipo de objeto

In [44]:
to_encode = [feat for feat in categorical_features if df[feat].dtype=='object']

In [45]:
to_encode

['red_blood_cells',
 'pus_cell',
 'pus_cell_clumps',
 'bacteria',
 'hypertension',
 'diabetes_mellitus',
 'coronary_artery_disease',
 'appetite',
 'pedal_edema',
 'anemia']

In [46]:
ode = OrdinalEncoder(dtype = int)

In [47]:
def encode(data):
    '''función para codificar datos que no son nano y reemplazarlos en los datos originales'''
    #retains only non-null values
    nonulls = np.array(data.dropna())
    #reshapes the data for encoding
    impute_reshape = nonulls.reshape(-1,1)
    #encode date
    impute_ordinal = ode.fit_transform(impute_reshape)
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

#create a for loop to iterate through each column in the data
for columns in to_encode:
    encode(df[columns])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.notnull()] = np.squeeze(impute_ordinal)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.notnull()] = np.squeeze(impute_ordinal)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.notnull()] = np.squeeze(impute_ordinal)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.l

In [48]:
df.loc[:, categorical_features].head(10)

Unnamed: 0,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,pedal_edema,anemia
0,1.02,1.0,0.0,,1.0,0,0,1,1,0,0,0,0
1,1.02,4.0,0.0,,1.0,0,0,0,0,0,0,0,0
2,1.01,2.0,3.0,1.0,1.0,0,0,0,1,0,1,0,1
3,1.005,4.0,0.0,1.0,0.0,1,0,1,0,0,1,1,1
4,1.01,2.0,0.0,1.0,1.0,0,0,0,0,0,0,0,0
5,1.015,3.0,0.0,,,0,0,1,1,0,0,1,0
6,1.01,0.0,0.0,,1.0,0,0,0,0,0,0,0,0
7,1.015,2.0,4.0,1.0,0.0,0,0,0,1,0,0,1,0
8,1.015,3.0,0.0,1.0,0.0,1,0,1,1,0,0,0,1
9,1.02,2.0,0.0,0.0,0.0,1,0,1,1,0,1,0,1


So, they're Label encoded now.

In [49]:
X = df.drop('classification', axis=1)

In [51]:
df.head(20)

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,pedal_edema,anemia,classification
0,48.0,80.0,1.02,1.0,0.0,,1.0,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,0,0,0,ckd
1,7.0,50.0,1.02,4.0,0.0,,1.0,0,0,,...,38.0,6000.0,,0,0,0,0,0,0,ckd
2,62.0,80.0,1.01,2.0,3.0,1.0,1.0,0,0,423.0,...,31.0,7500.0,,0,1,0,1,0,1,ckd
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,1,1,1,ckd
4,51.0,80.0,1.01,2.0,0.0,1.0,1.0,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,0,0,0,ckd
5,60.0,90.0,1.015,3.0,0.0,,,0,0,74.0,...,39.0,7800.0,4.4,1,1,0,0,1,0,ckd
6,68.0,70.0,1.01,0.0,0.0,,1.0,0,0,100.0,...,36.0,,,0,0,0,0,0,0,ckd
7,24.0,,1.015,2.0,4.0,1.0,0.0,0,0,410.0,...,44.0,6900.0,5.0,0,1,0,0,1,0,ckd
8,52.0,100.0,1.015,3.0,0.0,1.0,0.0,1,0,138.0,...,33.0,9600.0,4.0,1,1,0,0,0,1,ckd
9,53.0,90.0,1.02,2.0,0.0,0.0,0.0,1,0,70.0,...,29.0,12100.0,3.7,1,1,0,1,0,1,ckd


In [52]:
X_train = X.loc[:300,]
X_test = X.loc[300:,]

In [53]:
X_train.shape

(301, 24)

In [54]:
X_test.shape

(100, 24)

## Imputing numerical features using MICE

In [55]:
# MICE requiere valores flotantes
X_train_numerical = X_train.loc[:,numerical_features].astype('float64')

In [56]:
# Pasar las matrices numpy a los mice
X_train_numerical_imputed = mice(X_train_numerical.values)

In [57]:
X_train.loc[:,numerical_features].isna().sum().sort_values(ascending=False)

red_blood_cell_count      127
white_blood_cell_count    102
potassium                  84
sodium                     83
packed_cell_volume         68
haemoglobin                47
blood_glucose_random       40
blood_urea                 15
serum_creatinine           14
blood_pressure             11
age                         9
dtype: int64

In [58]:
X_train.loc[:,numerical_features] = X_train_numerical_imputed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:,numerical_features] = X_train_numerical_imputed


In [25]:
X_train.loc[:,numerical_features].isna().sum().sort_values(ascending=False)

age                       0
blood_pressure            0
blood_glucose_random      0
blood_urea                0
serum_creatinine          0
sodium                    0
potassium                 0
haemoglobin               0
packed_cell_volume        0
white_blood_cell_count    0
red_blood_cell_count      0
dtype: int64

Now, all the numerical features for training data are imputed. Let's take a look at the categorical features now.

## Imputing Categorical features

Here I'll be using the KNN function from FancyImpute for the task. Note that KNN outputs float values, so I'll round them to intergers to preserve categorical nature

In [62]:
from fancyimpute import KNN

In [61]:
#!pip install fancyimpute

In [63]:
imputer = KNN()

Con el backend de tensorflow, el proceso es rápido y los resultados se imprimirán a medida que itera a través de cada 100 filas. Necesitamos redondear los valores porque KNN producirá flotantes. Esto significa que nuestras columnas categóricas también se redondearán, así que asegúrese de dejar fuera de los datos las características que no desea que se redondeen.

In [64]:
X_train_imputed = pd.DataFrame(np.round(imputer.fit_transform(X_train)),columns = X_train.columns)

Imputing row 1/301 with 1 missing, elapsed time: 0.031
Imputing row 101/301 with 0 missing, elapsed time: 0.031
Imputing row 201/301 with 1 missing, elapsed time: 0.031
Imputing row 301/301 with 2 missing, elapsed time: 0.031


In [65]:
X_train_imputed.isnull().sum()

age                        0
blood_pressure             0
specific_gravity           0
albumin                    0
sugar                      0
red_blood_cells            0
pus_cell                   0
pus_cell_clumps            0
bacteria                   0
blood_glucose_random       0
blood_urea                 0
serum_creatinine           0
sodium                     0
potassium                  0
haemoglobin                0
packed_cell_volume         0
white_blood_cell_count     0
red_blood_cell_count       0
hypertension               0
diabetes_mellitus          0
coronary_artery_disease    0
appetite                   0
pedal_edema                0
anemia                     0
dtype: int64

Ahora, los datos son imputados.

## Scaling Data

In [66]:
X_train_imputed.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,301.0,52.833887,16.980155,2.0,45.0,55.0,65.0,90.0
blood_pressure,301.0,78.574751,14.164929,50.0,70.0,80.0,90.0,180.0
specific_gravity,301.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
albumin,301.0,1.38206,1.327985,0.0,0.0,1.0,2.0,5.0
sugar,301.0,0.598007,1.214302,0.0,0.0,0.0,0.0,5.0
red_blood_cells,301.0,0.747508,0.435165,0.0,0.0,1.0,1.0,1.0
pus_cell,301.0,0.714286,0.452506,0.0,0.0,1.0,1.0,1.0
pus_cell_clumps,301.0,0.139535,0.347081,0.0,0.0,0.0,0.0,1.0
bacteria,301.0,0.07309,0.260717,0.0,0.0,0.0,0.0,1.0
blood_glucose_random,301.0,162.116279,82.598203,22.0,106.0,133.0,193.0,490.0


Escalemos los datos ahora, ya que las distribuciones varían mucho para algunas características. Aquí usaré MinMaxScaler porque no quiero cambiar la distribución subyacente ni los valores atípicos.

In [67]:
from sklearn.preprocessing import MinMaxScaler

In [68]:
scaler = MinMaxScaler()
scaler.fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)

In [69]:
X_train_scaled = pd.DataFrame(data=X_train_scaled, columns = X_train.columns)

In [70]:
X_train_scaled.describe()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,pedal_edema,anemia
count,301.0,301.0,301.0,301.0,301.0,301.0,301.0,301.0,301.0,301.0,...,301.0,301.0,301.0,301.0,301.0,301.0,301.0,301.0,301.0,301.0
mean,0.577658,0.219806,0.0,0.276412,0.119601,0.747508,0.714286,0.139535,0.07309,0.299394,...,0.610346,0.597931,0.277562,0.366556,0.495017,0.458472,0.112957,0.272425,0.252492,0.199336
std,0.192956,0.108961,0.0,0.265597,0.24286,0.435165,0.452506,0.347081,0.260717,0.176492,...,0.175674,0.176329,0.111761,0.149084,0.500808,0.499102,0.317067,0.445949,0.435165,0.400166
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.488636,0.153846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.179487,...,0.5,0.5,0.219424,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.602273,0.230769,0.0,0.2,0.0,1.0,1.0,0.0,0.0,0.237179,...,0.642857,0.590909,0.281583,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.715909,0.307692,0.0,0.4,0.0,1.0,1.0,0.0,0.0,0.365385,...,0.714286,0.704545,0.309806,0.5,1.0,1.0,0.0,1.0,1.0,0.0
max,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Ahora, los datos están en escalas similares y son lo suficientemente buenos para ser modelados. Los mismos pasos también se aplicarán en el conjunto de prueba.

## Test Data

In [71]:
# MICE requires float values
X_test_numerical = X_test.loc[:,numerical_features].astype('float64')

In [72]:
X_test_numerical_imputed = mice(X_test_numerical.values)
X_test.loc[:,numerical_features] = X_test_numerical_imputed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.loc[:,numerical_features] = X_test_numerical_imputed


In [73]:
X_test_imputed = pd.DataFrame(np.round(imputer.fit_transform(X_test)),columns = X_test.columns)

Imputing row 1/100 with 2 missing, elapsed time: 0.007


In [74]:
scaler.fit(X_test_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [75]:
X_test_scaled = pd.DataFrame(data=X_test_scaled, columns = X_test.columns)

Ahora, los datos de entrenamiento y prueba están listos.

In [76]:
X_test_scaled.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,pedal_edema,anemia
0,0.485294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.628571,...,0.4,0.214286,0.731343,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.470588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.371429,...,0.8,0.071429,0.432836,0.5,0.0,0.0,0.0,0.0,0.0,0.0
2,0.25,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.814286,...,0.4,0.571429,0.521045,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.632353,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.528571,...,0.8,0.714286,0.358209,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.308824,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.828571,...,0.0,0.357143,0.283582,0.0,0.0,0.0,0.0,0.0,0.0,0.0
