Link of deployed app: https://med7diagnosis.herokuapp.com/

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df = pd.read_csv('datasets/kidney_disease.csv')
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
df['classification'].value_counts()

ckd       248
notckd    150
ckd\t       2
Name: classification, dtype: int64

In [4]:
df.isnull().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [5]:
df.isnull().mean()

id                0.0000
age               0.0225
bp                0.0300
sg                0.1175
al                0.1150
su                0.1225
rbc               0.3800
pc                0.1625
pcc               0.0100
ba                0.0100
bgr               0.1100
bu                0.0475
sc                0.0425
sod               0.2175
pot               0.2200
hemo              0.1300
pcv               0.1750
wc                0.2625
rc                0.3250
htn               0.0050
dm                0.0050
cad               0.0050
appet             0.0025
pe                0.0025
ane               0.0025
classification    0.0000
dtype: float64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [7]:
df.describe()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,400.0,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,199.5,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,115.614301,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,0.0,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,99.75,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,299.25,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [8]:
df[['htn','dm','cad','pe','ane']] = df[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})
df[['rbc','pc']] = df[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
df[['pcc','ba']] = df[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
df[['appet']] = df[['appet']].replace(to_replace={'good':1,'poor':0,'no':np.nan})
df['classification'] = df['classification'].replace(to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0})
df.rename(columns={'classification':'class'},inplace=True)

# Further cleaning
df['pe'] = df['pe'].replace(to_replace='good',value=0) # Not having pedal edema is good
df['appet'] = df['appet'].replace(to_replace='no',value=0)
df['cad'] = df['cad'].replace(to_replace='\tno',value=0)
df['dm'] = df['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})
df.drop('id',axis=1,inplace=True)

In [9]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,0.0,0.0,0.0,121.0,36.0,1.2,,,15.4,44,7800,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1,7.0,50.0,1.02,4.0,0.0,,0.0,0.0,0.0,,18.0,0.8,,,11.3,38,6000,,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,423.0,53.0,1.8,,,9.6,31,7500,,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1.0
4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,106.0,26.0,1.4,,,11.6,35,7300,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [10]:
for i in range(len(df)):
    if type(df.iloc[i, 15]) == str:
        df.iloc[i, 15] = df.iloc[i, 15].strip()
        if df.iloc[i, 15] == '?':
            df.iloc[i, 15] = np.NaN
    if type(df.iloc[i, 16]) == str:
        df.iloc[i, 16] = df.iloc[i, 16].strip()
        if df.iloc[i, 16] == '?':
            df.iloc[i, 16] = np.NaN
    if type(df.iloc[i, 17]) == str:
        df.iloc[i, 17] = df.iloc[i, 17].strip()
        if df.iloc[i, 16] == '?':
            df.iloc[i, 16] = np.NaN

In [11]:
df['pcv'] = pd.to_numeric(df['pcv'], errors='coerce')
df['wc'] = pd.to_numeric(df['wc'], errors='coerce')
df['rc'] = pd.to_numeric(df['rc'], errors='coerce')
df['pcv'] = df['pcv'].astype('float')
df['wc'] = df['wc'].astype('float')
df['rc'] = df['rc'].astype('float')

In [12]:
from fancyimpute import KNN

df_new = KNN(k=5).fit_transform(df)
df_new = pd.DataFrame(df_new, columns=df.columns)
df_new['class'] = df_new['class'].astype('int')
df_new.head()

Imputing row 1/400 with 3 missing, elapsed time: 0.035
Imputing row 101/400 with 4 missing, elapsed time: 0.040
Imputing row 201/400 with 1 missing, elapsed time: 0.046
Imputing row 301/400 with 2 missing, elapsed time: 0.049


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,0.207691,0.0,0.0,0.0,121.0,36.0,1.2,138.020076,4.297237,15.4,44.0,7800.0,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1
1,7.0,50.0,1.02,4.0,0.0,0.136343,0.0,0.0,0.0,103.140405,18.0,0.8,137.852793,3.630794,11.3,38.0,6000.0,5.547415,0.0,0.0,0.0,1.0,0.0,0.0,1
2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,423.0,53.0,1.8,132.44627,4.155945,9.6,31.0,7500.0,3.793092,0.0,1.0,0.0,0.0,0.0,1.0,1
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1
4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,106.0,26.0,1.4,138.605288,4.088994,11.6,35.0,7300.0,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1


In [13]:
df_new.isnull().sum()

age      0
bp       0
sg       0
al       0
su       0
rbc      0
pc       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wc       0
rc       0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

X = df_new.drop(columns=['class'])
y = df_new['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [15]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
lr = LogisticRegression()
rc = RidgeClassifier()
knn = KNeighborsClassifier()
rfc = RandomForestClassifier()
nbc = GaussianNB()
svc = SVC()

lr.fit(X_train, y_train)
rc.fit(X_train, y_train)
knn.fit(X_train, y_train)
rfc.fit(X_train, y_train)
nbc.fit(X_train, y_train)
svc.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SVC()

In [16]:
y_preds_lr = lr.predict(X_test)
y_preds_rc = rc.predict(X_test)
y_preds_knn = knn.predict(X_test)
y_preds_rfc = rfc.predict(X_test)
y_preds_nbc = nbc.predict(X_test)
y_preds_svc = svc.predict(X_test)

In [17]:
cm = confusion_matrix(y_test, y_preds_lr)
ac = accuracy_score(y_test, y_preds_lr)
cr = classification_report(y_test, y_preds_lr)

print('Logistic Regression:')
print('Confusion Matrix:\n', cm)
print('Accuracy Score:', ac)
print('Classification Report:\n', cr)

cm = confusion_matrix(y_test, y_preds_rc)
ac = accuracy_score(y_test, y_preds_rc)
cr = classification_report(y_test, y_preds_rc)

print('Ridge Classifier:')
print('Confusion Matrix:\n', cm)
print('Accuracy Score:', ac)
print('Classification Report:\n', cr)

cm = confusion_matrix(y_test, y_preds_knn)
ac = accuracy_score(y_test, y_preds_knn)
cr = classification_report(y_test, y_preds_knn)

print('KNN Classifier:')
print('Confusion Matrix:\n', cm)
print('Accuracy Score:', ac)
print('Classification Report:\n', cr)

cm = confusion_matrix(y_test, y_preds_rfc)
ac = accuracy_score(y_test, y_preds_rfc)
cr = classification_report(y_test, y_preds_rfc)

print('Random Forest Classifier:')
print('Confusion Matrix:\n', cm)
print('Accuracy Score:', ac)
print('Classification Report:\n', cr)

cm = confusion_matrix(y_test, y_preds_nbc)
ac = accuracy_score(y_test, y_preds_nbc)
cr = classification_report(y_test, y_preds_nbc)

print('Naive Bayes Classifier:')
print('Confusion Matrix:\n', cm)
print('Accuracy Score:', ac)
print('Classification Report:\n', cr)

cm = confusion_matrix(y_test, y_preds_svc)
ac = accuracy_score(y_test, y_preds_svc)
cr = classification_report(y_test, y_preds_svc)

print('Support Vector Classifier:')
print('Confusion Matrix:\n', cm)
print('Accuracy Score:', ac)
print('Classification Report:\n', cr)

Logistic Regression:
Confusion Matrix:
 [[24  4]
 [ 5 47]]
Accuracy Score: 0.8875
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84        28
           1       0.92      0.90      0.91        52

    accuracy                           0.89        80
   macro avg       0.87      0.88      0.88        80
weighted avg       0.89      0.89      0.89        80

Ridge Classifier:
Confusion Matrix:
 [[27  1]
 [ 4 48]]
Accuracy Score: 0.9375
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.92        28
           1       0.98      0.92      0.95        52

    accuracy                           0.94        80
   macro avg       0.93      0.94      0.93        80
weighted avg       0.94      0.94      0.94        80

KNN Classifier:
Confusion Matrix:
 [[20  8]
 [23 29]]
Accuracy Score: 0.6125
Classification Report:
               precision    recall  f

  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'n_estimators': [10, 50, 100, 200, 300, 400, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth':[3, 4, 5, 6, 7, 8, 9, 10]
}

model = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, cv=5, verbose=2)
model.fit(X_train, y_train)

Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   31.6s finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                         'n_estimators': [10, 50, 100, 200, 300, 400, 500]},
             verbose=2)

In [19]:
model.best_estimator_

RandomForestClassifier(max_depth=3, n_estimators=300)

In [20]:
y_preds = model.predict(X_test)
cm = confusion_matrix(y_test, y_preds)
ac = accuracy_score(y_test, y_preds)
cr = classification_report(y_test, y_preds)

print('Confusion Matrix:\n', cm)
print('Accuracy Score:', ac)
print('Classification Report:\n', cr)

Confusion Matrix:
 [[25  3]
 [ 0 52]]
Accuracy Score: 0.9625
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.89      0.94        28
           1       0.95      1.00      0.97        52

    accuracy                           0.96        80
   macro avg       0.97      0.95      0.96        80
weighted avg       0.96      0.96      0.96        80



In [21]:
final_model = model.best_estimator_.fit(X, y)

import pickle
pickle.dump(final_model, open('kidney_model.pkl', 'wb'))