## predict whether the person has chronic kidney disease or not?

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import sklearn
import imblearn


plt.rcParams['figure.figsize'] = (16, 5)

plt.style.use('fivethirtyeight')

In [2]:
df= pd.read_csv("kidney_disease.csv")

In [3]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


##
### features:
 age - age
    bp - blood pressure
    sg - specific gravity
    al - albumin
    su - sugar
    rbc - red blood cells
    pc - pus cell
    pcc - pus cell clumps
    ba - bacteria
    bgr - blood glucose random
    bu - blood urea
    sc - serum creatinine
    sod - sodium
    pot - potassium
    hemo - haemoglobin
    pcv - packed cell volume
    wc - white blood cell count
    rc - red blood cell count
    htn - hypertension
    dm - diabetes mellitus
    cad - coronary artery disease
    appet - appetite
    pe - pedal edema
    ane - anemia
    classification - class

In [4]:
df.shape

(400, 26)

In [5]:
missing = df.isnull().sum()

In [None]:
missing

In [6]:
missing_percent = ((df.isnull().sum()/df.shape[0])*100).round(2)

In [7]:
missing_data = pd.concat([missing,missing_percent,],
                                axis=1, 
                                keys=['missing_Total', 'missing_Percent %',],
                                sort = True)

In [8]:
missing_data

Unnamed: 0,missing_Total,missing_Percent %
age,9,2.25
al,46,11.5
ane,1,0.25
appet,1,0.25
ba,4,1.0
bgr,44,11.0
bp,12,3.0
bu,19,4.75
cad,2,0.5
classification,0,0.0


In [9]:
columns=pd.read_csv('data_description.txt',sep='-')
columns=columns.reset_index()
columns.columns=['cols','abbrevationnames']

In [10]:
columns

Unnamed: 0,cols,abbrevationnames
0,id,id
1,age,age
2,bp,blood pressure
3,sg,specific gravity
4,al,albumin
5,su,sugar
6,rbc,red blood cells
7,pc,pus cell
8,pcc,pus cell clumps
9,ba,bacteria


In [None]:
df.info()

In [None]:
features=['rc','pcv','wc']

In [None]:
def convert_dtype(df,feature):
    df[feature] = pd.to_numeric(df[feature])

In [11]:
df.drop(["id"],axis=1,inplace=True) 

In [20]:
df[["rc"]].value_counts()

rc 
5.2    18
4.5    16
4.9    14
4.7    11
4.8    10
3.9    10
3.4     9
4.6     9
6.1     8
3.7     8
5.0     8
5.5     8
5.9     8
5.3     7
5.4     7
5.8     7
3.8     7
5.6     6
4.2     6
4.3     6
4.1     5
5.1     5
6.2     5
6.5     5
4.4     5
6.4     5
5.7     5
3.2     5
6.0     4
6.3     4
3.6     4
4.0     3
3.3     3
3.5     3
4       3
3.0     2
5       2
3.1     2
2.9     2
2.8     2
2.7     2
2.6     2
2.5     2
2.1     2
\t?     1
3       1
2.4     1
2.3     1
8.0     1
dtype: int64

In [64]:
def cat_num(df):
    categorical_col=[col for col in df.columns if df[col].dtype=='object']
    numerical_col=[col for col in df.columns if df[col].dtype!='object']
    return categorical_col,numerical_col

In [65]:
categorical_col,numerical_col=cat_num(df)

In [66]:
categorical_col

['rbc',
 'pc',
 'pcc',
 'ba',
 'htn',
 'dm',
 'cad',
 'appet',
 'pe',
 'ane',
 'classification']

In [67]:
numerical_col

['age',
 'bp',
 'sg',
 'al',
 'su',
 'bgr',
 'bu',
 'sc',
 'sod',
 'pot',
 'hemo',
 'pcv',
 'wc',
 'rc']

In [68]:
for col in categorical_col:
    print(col,"*****",df[col].unique())
    print('\n')

rbc ***** [nan 'normal' 'abnormal']


pc ***** ['normal' 'abnormal' nan]


pcc ***** ['notpresent' 'present' nan]


ba ***** ['notpresent' 'present' nan]


htn ***** ['yes' 'no' nan]


dm ***** ['yes' 'no' nan]


cad ***** ['no' 'yes' nan]


appet ***** ['good' 'poor' nan]


pe ***** ['no' 'yes' nan]


ane ***** ['no' 'yes' nan]


classification ***** ['ckd' 'notckd']




In [33]:
#Replace incorrect values

df['dm'].replace(to_replace = {'\tno':'no','\tyes':'yes',' yes':'yes'},inplace=True)

df['cad'] = df['cad'].replace(to_replace = '\tno', value='no')

df['classification'] = df['classification'].replace(to_replace = 'ckd\t', value = 'ckd')

In [34]:
df['rc'].replace(to_replace = {'\t?':'0'},inplace=True)

## cleaning wc,pcv,rc

In [36]:
df['wc'].replace(to_replace = {'\t?':'0','\t6200':"6200","\t8400":"8400"},inplace=True)

In [38]:
df['pcv'].replace(to_replace = {'\t?':'0','\t43':"43"},inplace=True)

In [44]:
df1= (df['rc']).isnull().sum()

In [45]:
df1

130

In [49]:
df['rc'].fillna(np.nan, inplace = True)
df['rc'].isnull().sum()

130

In [56]:
df['rc'] = df['rc'].astype(float)

In [57]:
df['rc'].fillna(df['rc'].mean(), inplace = True)
df['rc'].isnull().sum()

0

In [59]:
df['wc'] = df['wc'].astype(float)

In [60]:
df['wc'].fillna(df['wc'].mean(), inplace = True)
df['wc'].isnull().sum()

0

In [61]:
df['pcv'] = df['pcv'].astype(float)

In [62]:
df['pcv'].fillna(df['pcv'].mean(), inplace = True)
df['pcv'].isnull().sum()

0

## Looking for numerical columns

In [69]:
for col in numerical_col:
    print(col,"*****",df[col].unique())
    print('\n')

age ***** [48.  7. 62. 51. 60. 68. 24. 52. 53. 50. 63. 40. 47. 61. 21. 42. 75. 69.
 nan 73. 70. 65. 76. 72. 82. 46. 45. 35. 54. 11. 59. 67. 15. 55. 44. 26.
 64. 56.  5. 74. 38. 58. 71. 34. 17. 12. 43. 41. 57.  8. 39. 66. 81. 14.
 27. 83. 30.  4.  3.  6. 32. 80. 49. 90. 78. 19.  2. 33. 36. 37. 23. 25.
 20. 29. 28. 22. 79.]


bp ***** [ 80.  50.  70.  90.  nan 100.  60. 110. 140. 180. 120.]


sg ***** [1.02  1.01  1.005 1.015   nan 1.025]


al ***** [ 1.  4.  2.  3.  0. nan  5.]


su ***** [ 0.  3.  4.  1. nan  2.  5.]


bgr ***** [121.  nan 423. 117. 106.  74. 100. 410. 138.  70. 490. 380. 208.  98.
 157.  76.  99. 114. 263. 173.  95. 108. 156. 264. 123.  93. 107. 159.
 140. 171. 270.  92. 137. 204.  79. 207. 124. 144.  91. 162. 246. 253.
 141. 182.  86. 150. 146. 425. 112. 250. 360. 163. 129. 133. 102. 158.
 165. 132. 104. 127. 415. 169. 251. 109. 280. 210. 219. 295.  94. 172.
 101. 298. 153.  88. 226. 143. 115.  89. 297. 233. 294. 323. 125.  90.
 308. 118. 224. 128. 122. 214. 213. 268

In [70]:
numerical_col

['age',
 'bp',
 'sg',
 'al',
 'su',
 'bgr',
 'bu',
 'sc',
 'sod',
 'pot',
 'hemo',
 'pcv',
 'wc',
 'rc']

In [72]:
for col in numerical_col:
    #print('{} has {} categories'.format(col, data[col].nunique()))
    df[col].fillna(df[col].mean(), inplace = True)
    

In [73]:
df['age'].isnull().sum()

0

In [74]:
df['bp'].isnull().sum()

0

In [75]:
df['sg'].isnull().sum()

0

In [77]:
df["al"].isnull().sum()

0

In [78]:
df.isna().sum().sort_values(ascending=False)

rbc               152
pc                 65
pcc                 4
ba                  4
cad                 2
dm                  2
htn                 2
ane                 1
pe                  1
appet               1
age                 0
pcv                 0
rc                  0
wc                  0
sod                 0
hemo                0
pot                 0
bp                  0
sc                  0
bu                  0
bgr                 0
su                  0
al                  0
sg                  0
classification      0
dtype: int64

In [79]:

df['rbc'] = df['rbc'].fillna(df['rbc'].mode()[0])


 #df.isnull().sum()

In [80]:
df['pc'] = df['pc'].fillna(df['pc'].mode()[0])

In [81]:
df['pcc'] = df['pcc'].fillna(df['pcc'].mode()[0])

In [82]:
df['ba'] = df['ba'].fillna(df['ba'].mode()[0])

In [83]:
df['cad'] = df['cad'].fillna(df['cad'].mode()[0])

In [84]:
df['dm'] = df['dm'].fillna(df['dm'].mode()[0])

In [None]:
df['dm'] = df['dm'].fillna(df['dm'].mode()[0])

In [85]:
df['htn'] = df['htn'].fillna(df['htn'].mode()[0])

In [86]:
df['ane'] = df['ane'].fillna(df['ane'].mode()[0])

In [87]:
df['pe'] = df['pe'].fillna(df['pe'].mode()[0])

In [88]:
df['appet'] = df['appet'].fillna(df['appet'].mode()[0])

In [89]:
df.isnull().sum().sum()

0

## Feature encoding

In [93]:
for col in categorical_col:
    print(col,"**********", df[col].nunique())
    

rbc ********** 2
pc ********** 2
pcc ********** 2
ba ********** 2
htn ********** 2
dm ********** 2
cad ********** 2
appet ********** 2
pe ********** 2
ane ********** 2
classification ********** 2


In [94]:
from sklearn.preprocessing import LabelEncoder

In [95]:
le = LabelEncoder()

In [96]:
for col in categorical_col:
    df[col]=le.fit_transform(df[col])

## Model building

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             400 non-null    float64
 1   bp              400 non-null    float64
 2   sg              400 non-null    float64
 3   al              400 non-null    float64
 4   su              400 non-null    float64
 5   rbc             400 non-null    int32  
 6   pc              400 non-null    int32  
 7   pcc             400 non-null    int32  
 8   ba              400 non-null    int32  
 9   bgr             400 non-null    float64
 10  bu              400 non-null    float64
 11  sc              400 non-null    float64
 12  sod             400 non-null    float64
 13  pot             400 non-null    float64
 14  hemo            400 non-null    float64
 15  pcv             400 non-null    float64
 16  wc              400 non-null    float64
 17  rc              400 non-null    flo

In [99]:
df.shape

(400, 25)

In [102]:
x = df.iloc[:,0:25]
y = df['classification']

In [103]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,train_size=0.75)

In [106]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_test = sc.fit_transform(X_test)
X_train = sc.fit_transform(X_train)

In [108]:
from sklearn import tree
# from sklearn.tree import DecisionTreeClassifier
dt = tree.DecisionTreeClassifier(max_depth = 4 , criterion = 'entropy')
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0])

In [109]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test,y_pred)
score

1.0