In [1]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Reading the Dataset
dataset = pd.read_csv('CKD.csv')
dataset=pd.get_dummies(dataset,dtype=int,drop_first=True)
dataset

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3,0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.000000,76.459948,2,0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,1,0,0,0,0,0,1,0,0,1
2,4.000000,76.459948,1,0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,1,0,0,0,0,0,1,0,0,1
3,5.000000,76.459948,1,0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,1,0,0,0,0,0,1,0,1,1
4,5.000000,50.000000,0,0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0,0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,1,0,0,0,0,0,1,0,0,1
395,51.492308,70.000000,0,2,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,1,0,0,1,1,0,1,0,1,1
396,51.492308,70.000000,3,0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,1,0,0,1,1,0,0,0,0,1
397,51.492308,90.000000,0,0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,1,0,0,1,1,0,1,0,1,1


In [3]:
#Checking dataset for balanced or imbalanced:
dataset["classification_yes"].value_counts()

classification_yes
1    249
0    150
Name: count, dtype: int64

In [4]:
dataset.columns

Index(['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes', 'classification_yes'],
      dtype='object')

In [5]:
#split into independent and dependent variable:
indep=dataset[['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes']]
dep=dataset['classification_yes']

In [6]:
#split into training set and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(indep, dep, test_size = 1/3, random_state = 0)


In [7]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

param_grid = {'criterion':['gini','entropy'],
              'max_features': ['auto','sqrt','log2'],
              'splitter':['best','random']}

grid = GridSearchCV(DecisionTreeClassifier(), param_grid, refit = True, verbose = 3,n_jobs=-1,scoring='f1_weighted')

grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [8]:
# print best parameter after tuning 
print(grid.best_params_) 

{'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'random'}


In [9]:
re=grid.cv_results_
#print(re)

In [10]:
grid_predictions = grid.predict(X_test)  
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, grid_predictions)

In [11]:
from sklearn.metrics import classification_report
clf_report = classification_report(y_test, grid_predictions)

In [12]:
from sklearn.metrics import f1_score
f1_macro=f1_score(y_test,grid_predictions,average='weighted')
print("The f1_macro value for best parameter {}:".format(grid.best_params_),f1_macro)

The f1_macro value for best parameter {'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'random'}: 0.9699248120300752


In [13]:
print("The confusion Matrix:\n",cm)

The confusion Matrix:
 [[49  2]
 [ 2 80]]


In [14]:
print("The report:\n",clf_report)

The report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96        51
           1       0.98      0.98      0.98        82

    accuracy                           0.97       133
   macro avg       0.97      0.97      0.97       133
weighted avg       0.97      0.97      0.97       133



In [15]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test,grid.predict_proba(X_test)[:,1])

0.9681970349115256

In [16]:
table=pd.DataFrame.from_dict(re)

In [17]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004187,0.001714,0.0,0.0,gini,auto,best,"{'criterion': 'gini', 'max_features': 'auto', ...",,,,,,,,9
1,0.003193,0.000976,0.0,0.0,gini,auto,random,"{'criterion': 'gini', 'max_features': 'auto', ...",,,,,,,,9
2,0.037643,0.028946,0.007483,0.001005,gini,sqrt,best,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.908877,0.961755,0.925524,0.981217,0.981217,0.951718,0.029538,8
3,0.00738,0.001849,0.007779,0.001162,gini,sqrt,random,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.981569,0.981014,0.981217,0.961826,1.0,0.981125,0.012074,1
4,0.007181,0.001465,0.009408,0.002358,gini,log2,best,"{'criterion': 'gini', 'max_features': 'log2', ...",1.0,0.868632,0.981031,0.923652,1.0,0.954663,0.051321,7
5,0.00738,0.001492,0.006784,0.000751,gini,log2,random,"{'criterion': 'gini', 'max_features': 'log2', ...",1.0,0.981014,0.962264,0.981031,0.962264,0.977315,0.014108,2
6,0.003193,0.000748,0.0,0.0,entropy,auto,best,"{'criterion': 'entropy', 'max_features': 'auto...",,,,,,,,9
7,0.002997,0.000549,0.0,0.0,entropy,auto,random,"{'criterion': 'entropy', 'max_features': 'auto...",,,,,,,,9
8,0.007373,0.001343,0.006983,0.001546,entropy,sqrt,best,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.962963,0.92351,0.944023,0.962264,0.981031,0.954758,0.019522,5
9,0.006783,0.002632,0.007978,0.000891,entropy,sqrt,random,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.944707,0.961755,0.962573,0.923652,0.981217,0.954781,0.019384,4


In [18]:
table.loc[:,"params"]

0     {'criterion': 'gini', 'max_features': 'auto', ...
1     {'criterion': 'gini', 'max_features': 'auto', ...
2     {'criterion': 'gini', 'max_features': 'sqrt', ...
3     {'criterion': 'gini', 'max_features': 'sqrt', ...
4     {'criterion': 'gini', 'max_features': 'log2', ...
5     {'criterion': 'gini', 'max_features': 'log2', ...
6     {'criterion': 'entropy', 'max_features': 'auto...
7     {'criterion': 'entropy', 'max_features': 'auto...
8     {'criterion': 'entropy', 'max_features': 'sqrt...
9     {'criterion': 'entropy', 'max_features': 'sqrt...
10    {'criterion': 'entropy', 'max_features': 'log2...
11    {'criterion': 'entropy', 'max_features': 'log2...
Name: params, dtype: object

In [19]:
age = float(input("Age:")) #age = 1 to 100
bp = float(input("Blood pressure:")) #bp = 10 to 150
al = float(input("Alostatic load:")) #al = 0 to 4
su = float(input("Substance Use:")) #su = 0 to 4
bgr = float(input("Blood Glucose Regulator:")) #bg = 10 to 500
bu = float(input("Buruli ulcer:")) #bu = 10 to 500
sc = float(input("Sickle Cell:")) #sc = 1 to 10
sod = float(input("Sphincter of Oddi Dysfunction:")) #sod = 10 to 500
pot = float(input("Postural Orthostatic Tachycardia:")) #pod = 1 to 10 
hrmo = float(input("Highly Resistant Microorganism:")) #hrmo = 1 to 15
pcv = float(input("Packed Cell Volume:")) #pcv = 1 to 100    
wc = float(input("Waist Circumference:")) #wc = 100 to 10000 
rc = float(input("radical cystectomy:")) #rc = 1 to 10
sg_b = float(input("Specific Gravity b:")) #sg (a = 0, b = 1, c = 2, d = 3)
sg_c = float(input("Specific Gravity c:")) 
sg_d = float(input("Specific Gravity d:")) 
sg_e = float(input("Specific Gravity e:"))
rbc_normal = float(input("Red Blood Cell:")) #rbc (yes= 1, no= 0)
pc_normal = float(input("Palliative Care:")) #pc (normal = 1, abnormal = 0)
pcc_present = float(input("Patient Centered Care:")) #pcc (present = 1, not present= 0)
ba_present = float(input("Bronchial Asthma:")) #ba (present = 1, not present= 0)
htn_yes = float(input("Hypertension:")) #htn (yes=1, no=0)
dm_yes = float(input("Diabetes Mellitus:")) #dm (yes= 1, no= 0)
cad_yes = float(input("Coronary artery disease :")) #cad (yes= 1, no= 0)
appet_yes = float(input("Amiodarone-induced Pulmonary Toxicity:")) #appet (yes= 1, no= 0)
pe_yes = float(input("Pulmonary Embolism:")) #pe (yes= 1, no= 0)
ane_yes = float(input("Anemia of Chronic:")) #ane (yes= 1, no= 0)

Age: 22
Blood pressure: 20
Alostatic load: 3
Substance Use: 2
Blood Glucose Regulator: 250
Buruli ulcer: 300
Sickle Cell: 8
Sphincter of Oddi Dysfunction: 280
Postural Orthostatic Tachycardia: 5
Highly Resistant Microorganism: 7
Packed Cell Volume: 25
Waist Circumference: 8000
radical cystectomy: 5
Specific Gravity b: 1
Specific Gravity c: 0
Specific Gravity d: 0
Specific Gravity e: 0
Red Blood Cell: 1
Palliative Care: 0
Patient Centered Care: 1
Bronchial Asthma: 0
Hypertension: 0
Diabetes Mellitus: 1
Coronary artery disease : 1
Amiodarone-induced Pulmonary Toxicity: 0
Pulmonary Embolism: 1
Anemia of Chronic: 1


In [25]:
Future_Prediction=grid.predict([[age, bp, al, su, bgr, bu, sc, sod, pot, hrmo, pcv,
       wc, rc, sg_b, sg_c, sg_d, sg_e, rbc_normal, pc_normal,
       pcc_present, ba_present, htn_yes, dm_yes, cad_yes,
       appet_yes, pe_yes, ane_yes]])# change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[1]


In [26]:
# Deployement Phase:
import pickle
filename="finalized_model_CKD.sav"
pickle.dump(grid,open(filename,'wb'))
loaded_model=pickle.load(open("finalized_model_CKD.sav",'rb'))
result=loaded_model.predict([[22,20,3,2,250,300,8,280,5,7,25,8000,5,1,0,0,0,1,0,1,0,0,1,1,0,1,1]])
result

array([1])