# Import the libraries

In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter as c
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

# Read the Dataset

In [3]:
data = pd.read_csv('chronickidneydisease.csv')

In [4]:
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


# Understanding datatypes and summery of features 

In [41]:
data.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [42]:
data.shape

(400, 25)

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age               400 non-null float64
bp                400 non-null float64
sg                400 non-null float64
al                400 non-null float64
su                351 non-null float64
rbc               400 non-null object
pc                400 non-null object
pcc               400 non-null object
ba                400 non-null object
bgr               400 non-null float64
bu                400 non-null float64
sc                400 non-null float64
sod               400 non-null float64
pot               400 non-null float64
hemo              400 non-null float64
pcv               400 non-null float64
wc                400 non-null float64
rc                400 non-null float64
htn               400 non-null object
dm                400 non-null object
cad               400 non-null object
appet             400 non-null object
pe                400 non-null object
ane    

In [44]:
data['classification'].unique()

array(['ckd', 'notckd'], dtype=object)

In [45]:
data['classification']=data['classification'].replace("ckd\t","ckd")
data['classification'].unique()

array(['ckd', 'notckd'], dtype=object)

#### Categorical Columns 

In [46]:
catcol = set(data.dtypes[data.dtypes=='O'].index.values)
print(catcol)

{'htn', 'appet', 'dm', 'pcc', 'rbc', 'pe', 'pc', 'ane', 'classification', 'ba', 'cad'}


In [47]:
for i in catcol:
    print("Columns: ",i)
    print(c(data[i]))
    print("*"*120+"\n")

Columns:  htn
Counter({'no': 253, 'yes': 147})
************************************************************************************************************************

Columns:  appet
Counter({'good': 318, 'poor': 82})
************************************************************************************************************************

Columns:  dm
Counter({'no': 260, 'yes': 134, '\tno': 3, '\tyes': 2, ' yes': 1})
************************************************************************************************************************

Columns:  pcc
Counter({'notpresent': 358, 'present': 42})
************************************************************************************************************************

Columns:  rbc
Counter({'normal': 353, 'abnormal': 47})
************************************************************************************************************************

Columns:  pe
Counter({'no': 324, 'yes': 76})
******************************************************

#### Numerical Columns 

In [48]:
numcol = set(data.dtypes[data.dtypes!='O'].index.values)
print(numcol)

{'su', 'sod', 'bgr', 'pot', 'wc', 'hemo', 'sg', 'sc', 'age', 'bu', 'rc', 'pcv', 'bp', 'al'}


#### Rectifying the categorical columns

In [52]:
data['cad'] = data.cad.replace('\tno','no')
c(data['cad'])

Counter({'no': 366, 'yes': 34})

In [53]:
data['dm'] = data.dm.replace(to_replace={'\tno':'no','\tyes':'yes',' yes':'yes'})
c(data['dm'])

Counter({'no': 263, 'yes': 137})

# Handling the missing values

In [11]:
data.isnull().any()

id                False
age                True
bp                 True
sg                 True
al                 True
su                 True
rbc                True
pc                 True
pcc                True
ba                 True
bgr                True
bu                 True
sc                 True
sod                True
pot                True
hemo               True
pcv                True
wc                 True
rc                 True
htn                True
dm                 True
cad                True
appet              True
pe                 True
ane                True
classification    False
dtype: bool

In [12]:
data.isnull().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [14]:
data.pcv = pd.to_numeric(data.pcv,errors='coerce')
data.wc = pd.to_numeric(data.wc,errors='coerce')
data.rc = pd.to_numeric(data.rc,errors='coerce')

# Replacing the missing values

In [17]:
data['bgr'].fillna(data['bgr'].mean(),inplace=True)
data['bp'].fillna(data['bp'].mean(),inplace=True)
data['bu'].fillna(data['bu'].mean(),inplace=True)
data['hemo'].fillna(data['hemo'].mean(),inplace=True)
data['pcv'].fillna(data['pcv'].mean(),inplace=True)
data['pot'].fillna(data['pot'].mean(),inplace=True)
data['rc'].fillna(data['rc'].mean(),inplace=True)
data['sc'].fillna(data['sc'].mean(),inplace=True)
data['sod'].fillna(data['sod'].mean(),inplace=True)
data['wc'].fillna(data['wc'].mean(),inplace=True)

data['age'].fillna(data['age'].mode()[0],inplace=True)
data['htn'].fillna(data['htn'].mode()[0],inplace=True)
data['pcc'].fillna(data['pcc'].mode()[0],inplace=True)
data['appet'].fillna(data['appet'].mode()[0],inplace=True)
data['al'].fillna(data['al'].mode()[0],inplace=True)
data['pc'].fillna(data['pc'].mode()[0],inplace=True)
data['rbc'].fillna(data['rbc'].mode()[0],inplace=True)
data['cad'].fillna(data['cad'].mode()[0],inplace=True)
data['ba'].fillna(data['ba'].mode()[0],inplace=True)
data['ane'].fillna(data['ane'].mode()[0],inplace=True)
data['dm'].fillna(data['dm'].mode()[0],inplace=True)
data['pe'].fillna(data['pe'].mode()[0],inplace=True)
data['sg'].fillna(data['sg'].mode()[0],inplace=True)

# Label Encoding

In [54]:
for i in catcol:
    print("Label Encoding of: ",i)
    LE = LabelEncoder()
    print(c(data[i]))
    data[i] = LE.fit_transform(data[i])
    print(c(data[i]))
    print("*"*100)

Label Encoding of:  htn
Counter({'no': 253, 'yes': 147})
Counter({0: 253, 1: 147})
****************************************************************************************************
Label Encoding of:  appet
Counter({'good': 318, 'poor': 82})
Counter({0: 318, 1: 82})
****************************************************************************************************
Label Encoding of:  dm
Counter({'no': 263, 'yes': 137})
Counter({0: 263, 1: 137})
****************************************************************************************************
Label Encoding of:  pcc
Counter({'notpresent': 358, 'present': 42})
Counter({0: 358, 1: 42})
****************************************************************************************************
Label Encoding of:  rbc
Counter({'normal': 353, 'abnormal': 47})
Counter({1: 353, 0: 47})
****************************************************************************************************
Label Encoding of:  pe
Counter({'no': 324, 'yes': 76})
Counte