In [None]:
# import libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from pandas_profiling import ProfileReport 
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif # use this for classification tasks
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline


In [None]:
# Read the raw kidney-disease table from disk and preview its first ten rows.
DATA_PATH = '../input/ckdisease/kidney_disease.csv'
df = pd.read_csv(DATA_PATH)
df.head(10)

In [None]:
# Explore the data: size of the table as a (rows, columns) tuple.
df.shape

In [None]:
# List the column labels of the table as an array.
df.columns.values

In [None]:
# Remove the 'id' column before any analysis.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# idempotent: running it a second time no longer raises KeyError once 'id' is gone.
df = df.drop(columns='id', errors='ignore')

# Data contains

1. age - age
2. bp - blood pressure
3. sg - specific gravity
4. al - albumin
5. su - sugar
6. rbc - red blood cells
7. pc - pus cell
8. pcc - pus cell clumps
9. ba - bacteria
10. bgr - blood glucose random
11. bu - blood urea
12. sc - serum creatinine
13. sod - sodium
14. pot - potassium
15. hemo - haemoglobin
16. pcv - packed cell volume
17. wc - white blood cell count
18. rc - red blood cell count
19. htn - hypertension
20. dm - diabetes mellitus
21. cad - coronary artery disease
22. appet - appetite
23. pe - pedal edema
24. ane - anemia
25. classification - class

# Feature description

1. Age(numerical) --> age in years
2. Blood Pressure(numerical) bp in mm/Hg
3. Specific Gravity(nominal) sg - (1.005,1.010,1.015,1.020,1.025)
4. Albumin(nominal)al - (0,1,2,3,4,5)
5. Sugar(nominal) su - (0,1,2,3,4,5)
6. Red Blood Cells(nominal) rbc - (normal,abnormal)
7. Pus Cell (nominal)pc - (normal,abnormal)
8. Pus Cell clumps(nominal)pcc - (present,notpresent)
9. Bacteria(nominal) ba - (present,notpresent)
10. Blood Glucose Random(numerical) bgr in mgs/dl
11. Blood Urea(numerical) bu in mgs/dl
12. Serum Creatinine(numerical) sc in mgs/dl
13. Sodium(numerical) sod in mEq/L
14. Potassium(numerical) pot in mEq/L
15. Haemoglobin(numerical) hemo in gms
16. Packed Cell Volume(numerical) pcv
17. White Blood Cell Count(numerical) wc in cells/cumm
18. Red Blood Cell Count(numerical) rc in millions/cmm
19. Hypertension(nominal) htn - (yes,no)
20. Diabetes Mellitus(nominal) dm - (yes,no)
21. Coronary Artery Disease(nominal) cad - (yes,no)
22. Appetite(nominal) appet - (good,poor)
23. Pedal Edema(nominal) pe - (yes,no)
24. Anemia(nominal)ane - (yes,no)
25. Class (nominal) class - (ckd,notckd)

# EDA

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# First five records, transposed so every feature is shown as a row.
df.head().T


In [None]:
# Encode the yes/no flags (hypertension, diabetes, coronary artery disease,
# pedal edema, anemia) as 1/0.
# String values are whitespace-stripped first: a padded entry such as '\tno' or
# ' yes' would otherwise escape the mapping and be turned into NaN by the later
# pd.to_numeric(errors='coerce') step on dm/cad, discarding a known answer.
# Only str values are touched, so NaN and already-encoded numbers pass through
# unchanged and the cell can safely be re-run.
yes_no_cols = ['htn', 'dm', 'cad', 'pe', 'ane']
df[yes_no_cols] = (
    df[yes_no_cols]
    .apply(lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v))
    .replace(to_replace={'yes': 1, 'no': 0})
)

In [None]:
# Red blood cells / pus cell findings: normal -> 0, abnormal -> 1.
cell_state_codes = {'normal': 0, 'abnormal': 1}
cell_state_cols = ['rbc', 'pc']
df[cell_state_cols] = df[cell_state_cols].replace(to_replace=cell_state_codes)

In [None]:
# Bacteria / pus cell clumps: notpresent -> 0, present -> 1.
presence_codes = {'notpresent': 0, 'present': 1}
presence_cols = ['ba', 'pcc']
df[presence_cols] = df[presence_cols].replace(to_replace=presence_codes)

In [None]:
# Appetite: good -> 1, poor -> 0; a stray 'no' entry is treated as missing.
appetite_codes = {'good': 1, 'poor': 0, 'no': np.nan}
df[['appet']] = df[['appet']].replace(to_replace=appetite_codes)

In [None]:
# Encode the target: 1 for chronic kidney disease ("ckd"), 0 otherwise.
# Labels are whitespace-stripped before comparison so a padded value such as
# "ckd\t" is not silently encoded as 0.
# The dtype guard makes the cell safe to re-run: once the column is numeric it is
# left alone instead of being overwritten with all zeros.
label_col = df["classification"]
if not pd.api.types.is_numeric_dtype(label_col):
    df["classification"] = label_col.astype(str).str.strip().eq("ckd").astype(int)

In [None]:
# Re-inspect the first five records (transposed) after the categorical encoding.
df.head().T


In [None]:
# Check which columns are still stored with a non-numeric dtype.
df.dtypes

In [None]:
# Force the listed columns to a numeric dtype; any entry that cannot be parsed
# as a number becomes NaN (errors='coerce').
for col in ['pcv', 'pc', 'dm', 'cad', 'wc', 'rc']:
    df[col] = pd.to_numeric(df[col], errors='coerce')


In [None]:
# Summary statistics of the numeric columns, one feature per row.
df.describe().T

In [None]:
# Number of fully duplicated rows.
# Series.sum() counts the True flags in vectorized code instead of iterating the
# boolean Series element by element with the builtin sum().
df.duplicated().sum()

In [None]:
# Missing-value count per column, sorted from fewest to most.
df.isna().sum().sort_values()

In [None]:
# Bar chart of the percentage of missing values per column, worst column first.
missing_pct = df.isnull().sum() / df.shape[0] * 100
missing_pct.sort_values(ascending=False).plot(kind='bar', figsize=(10, 10))

In [None]:
# Show where values are missing across the whole dataframe using missingno's
# matrix plot.
import missingno as msno

msno.matrix(df)
plt.show()

# DATA VISUALIZATION

In [None]:
# Select the seaborn dark-palette style sheet.
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# releases dropped the old names, so prefer the new name and fall back to the
# legacy one on older matplotlib versions.
style_name = "seaborn-v0_8-dark-palette"
if style_name not in plt.style.available:
    style_name = "seaborn-dark-palette"
plt.style.use(style_name)


In [None]:
# Class balance: number of patients with (1) and without (0) chronic kidney disease.
# The column is passed by keyword; recent seaborn versions interpret the first
# positional argument as `data`, not `x`, so a positional Series is not plotted
# as intended.
sns.countplot(x='classification', data=df)
plt.xlabel('Chronic Kidney Disease')
plt.title("patients Classification",fontsize=15)
plt.show()


In [None]:
# Blood pressure graph: number of patients per recorded blood-pressure value.
# sns.factorplot and its `size` argument were removed from seaborn; catplot with
# `height` is the direct replacement. The x-axis shows blood pressure, not the
# disease class, so the label now says so.
sns.catplot(data=df, x='bp', kind='count', height=6, aspect=2)
plt.xlabel('Blood pressure (mm/Hg)')
plt.title("blood pressure graph",fontsize=15)
plt.show()


In [None]:
# Density-frequency graph: number of patients per specific-gravity value.
# sns.factorplot and its `size` argument were removed from seaborn; catplot with
# `height` is the direct replacement. The x-axis shows specific gravity, not the
# disease class, so the label now says so.
sns.catplot(data=df, x='sg', kind='count', height=6, aspect=2)
plt.xlabel('Specific gravity')
plt.title("density-frequency graph",fontsize=15)
plt.show()

In [None]:
# Sugar-frequency graph: number of patients per sugar level.
# sns.factorplot and its `size` argument were removed from seaborn; catplot with
# `height` is the direct replacement. The x-axis shows the sugar level, not the
# disease class, so the label now says so.
sns.catplot(data=df, x='su', kind='count', height=6, aspect=2)
plt.xlabel('Sugar')
plt.title("sugar-frequency graph",fontsize=15)
plt.show()

In [None]:
# How many patients share each age, sorted from rarest to most common age.
df.age.value_counts().sort_values()


In [None]:
# Age-frequency graph: number of patients per age.
# The plotted column is 'age', so the title and x-label that referred to packed
# cell volume / the disease class were wrong and are corrected here.
# sns.factorplot was removed from seaborn; catplot is the direct replacement.
sns.catplot(data=df, x='age', kind='count', aspect=5)
plt.xlabel('Age (years)')
plt.title("age-frequency graph",fontsize=15)
plt.show()

In [None]:
# Pairwise relationships between the columns of df.
# NOTE(review): with this many columns the grid is large and slow to render —
# consider restricting it to a subset of features.
sns.pairplot(df )


In [None]:
# Correlation map: annotated heatmap of the pairwise correlations in df.
corr_matrix = df.corr()
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_matrix, ax=ax, annot=True, fmt='.1f', linewidths=.5)
ax.set_title('Correlations between different predictors')
plt.show()

The pandas_profiling library generates profile reports from a pandas DataFrame. The pandas df.describe() function is great but a little basic for serious exploratory data analysis; pandas_profiling extends the pandas DataFrame with df.profile_report() for quick data analysis.

For each column the following statistics - if relevant for the column type - are presented in an interactive HTML report:

Type inference: detect the types of columns in a dataframe.

* Essentials: type, unique values, missing values

* Quantile statistics like minimum value, Q1, median, Q3, maximum, range, interquartile range

* Descriptive statistics like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness

* Most frequent values

* Histogram

* Correlations highlighting of highly correlated variables, Spearman, Pearson and Kendall matrices

* Missing values matrix, count, heatmap and dendrogram of missing values

* Text analysis learn about categories (Uppercase, Space), scripts (Latin, Cyrillic) and blocks (ASCII) of text data.

In [None]:

# Build the interactive profiling report and embed it in the notebook.
# The report is generated from the dataset itself: profiling df.corr() would
# describe the correlation matrix (one row per feature) rather than the patient
# records, so none of the per-column statistics would refer to the real data.
profile = ProfileReport(df, title='Pandas profiling report ' , html={'style':{'full_width':True}})

profile.to_notebook_iframe()

In [None]:
# Fill the remaining missing values with k-nearest-neighbour imputation (k=2)
# instead of the column mean, because some features only take a few specific values.
#
# The target column is kept out of the imputer: if it took part in the neighbour
# search, the label would leak into the imputed feature values. It is appended
# again as the last column so the array keeps the layout the later cells index
# by position.
# NOTE(review): assumes 'classification' is the last column of df — confirm.
target_col = 'classification'
feature_cols = [c for c in df.columns if c != target_col]

imputer = KNNImputer(n_neighbors=2)
features_filled = imputer.fit_transform(df[feature_cols])
df_filled = np.column_stack([features_filled, df[target_col].to_numpy()])

In [None]:
# Preview the first rows of the imputed array.
# Showing a slice instead of df_filled.tolist() avoids dumping every row of the
# dataset into the notebook output.
df_filled[:5]


In [None]:
# Wrap the imputed array in a DataFrame; columns get default integer labels.
df2 = pd.DataFrame(df_filled)


In [None]:
# Dtypes and non-null counts of the imputed table.
df2.info()

In [None]:
# Count the missing values left in each column after imputation.
df2.isna().sum()

In [None]:
# First five rows of the imputed table.
df2.head()


In [None]:
# Persist the imputed table to CSV in the working directory, without the row index.
df2.to_csv('Chronic_KIdney_Disease_data.csv',index=False)

In [None]:
# Target vector: the column labelled 24 of the imputed table, as a NumPy array.
y = df2[24].to_numpy()


In [None]:
# Display the target vector.
y

In [None]:
# Feature matrix: every column of the imputed table except the target (label 24).
x_data = df2.drop(columns=[24])

In [None]:
# Min-max scale every feature to the [0, 1] range, column by column.
# DataFrame.min()/max() are used instead of np.min/np.max: NumPy forwards
# axis=None to the DataFrame method, and pandas >= 2.0 then reduces over BOTH
# axes and returns a single scalar, which silently turns per-column scaling into
# one global scaling dominated by the largest-valued feature.
col_min = x_data.min()
col_max = x_data.max()
x = (x_data - col_min) / (col_max - col_min)


In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)


In [None]:
# Import Libraries
import sklearn
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
#----------------------------------------------------

#----------------------------------------------------
#Applying VotingClassifier Model
#
# Signature reminder:
# ensemble.VotingClassifier(estimators, voting='hard', weights=None, n_jobs=None,
#                           flatten_transform=None)

# The logistic loss of SGDClassifier was renamed from 'log' to 'log_loss' in
# scikit-learn 1.1 and the old spelling is rejected by later releases; choose the
# name that the installed version understands.
_sk_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
sgd_log_loss = 'log_loss' if _sk_version >= (1, 1) else 'log'

#loading models for Voting Classifier
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases and is
# not needed for a two-class target — consider dropping it.
LRModel_ = LogisticRegression(solver='lbfgs', multi_class='multinomial',random_state=33)
RFModel_ = RandomForestClassifier(n_estimators=100, criterion='gini',max_depth=1, random_state=33)
# `degree` only applies to the polynomial kernel, so it is omitted for kernel='rbf'.
SVCModel_ = SVC(kernel = 'rbf', random_state = 33,C = 0.9)
# Seeded like the other estimators so repeated runs produce the same ensemble.
SGDModel_ = SGDClassifier(loss=sgd_log_loss, penalty='l2', max_iter=10000, tol=1e-5, random_state=33)

#loading Voting Classifier
# Hard voting: the ensemble predicts the class chosen by the majority of the models.
VotingClassifierModel = VotingClassifier(estimators=[('LRModel',LRModel_),('RFModel',RFModel_),('SVCModel',SVCModel_),('SGDModel',SGDModel_)], voting='hard')
VotingClassifierModel.fit(x_train, y_train)

#Calculating Details
print('VotingClassifierModel Train Score is : ' , VotingClassifierModel.score(x_train, y_train))

In [None]:

# Accuracy of the voting ensemble on the training and the held-out data.
voting_train_score = VotingClassifierModel.score(x_train, y_train)
voting_test_score = VotingClassifierModel.score(x_test, y_test)
print('VotingClassifierModel Train Score is : ' , voting_train_score)
print('VotingClassifierModel Test Score is : ' , voting_test_score)
print('----------------------------------------------------')

# Predict the test set and show the first ten predicted labels.
y_pred = VotingClassifierModel.predict(x_test)
first_predictions = y_pred[:10]
print('Predicted Value for VotingClassifierModel is : ' , first_predictions)

In [None]:
# Confusion matrix of the voting ensemble on the test set.
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = VotingClassifierModel.predict(x_test)
cm = confusion_matrix(y_true, y_pred)

# Draw the matrix as an annotated heatmap (rows: true class, columns: predicted).
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, ax=ax, annot=True, fmt=".0f", linewidths=0.5, linecolor="red")
ax.set_xlabel("y_pred")
ax.set_ylabel("y_true")
ax.set_title("Voting Classifier Model Matrix")
plt.show()

In [None]:

# Saving the model
# The file is opened in a `with` block so the handle is flushed and closed even if
# pickling fails; the bare open() call left it open until garbage collection.
import pickle

with open('klney_clf.pkl', 'wb') as model_file:
    pickle.dump(VotingClassifierModel, model_file)

In [None]:
# Gradient boosting classifier: fit, score and predict.
from sklearn.ensemble import GradientBoostingClassifier

# Signature reminder:
# ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100,
#     subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
#     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
#     min_impurity_split=None, init=None, random_state=None, max_features=None,
#     verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto',
#     validation_fraction=0.1, n_iter_no_change=None, tol=0.0001)

# 500 boosting stages of depth-3 trees, seeded for reproducibility.
GBCModel = GradientBoostingClassifier(
    n_estimators=500,
    max_depth=3,
    random_state=33,
)
GBCModel.fit(x_train, y_train)

# Accuracy on both splits and the fitted per-feature importances.
gbc_train_score = GBCModel.score(x_train, y_train)
gbc_test_score = GBCModel.score(x_test, y_test)
print('GBCModel Train Score is : ' , gbc_train_score)
print('GBCModel Test Score is : ' , gbc_test_score)
print('GBCModel features importances are : ' , GBCModel.feature_importances_)

# Class predictions and class probabilities for the test set (first ten shown).
y_pred = GBCModel.predict(x_test)
y_pred_prob = GBCModel.predict_proba(x_test)
print('Predicted Value for GBCModel is : ' , y_pred[:10])
print('Prediction Probabilities Value for GBCModel is : ' , y_pred_prob[:10])

In [None]:
# Confusion matrix of the gradient boosting model on the test set.
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = GBCModel.predict(x_test)
cm = confusion_matrix(y_true, y_pred)

# Draw the matrix as an annotated heatmap (rows: true class, columns: predicted).
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, ax=ax, annot=True, fmt=".0f", linewidths=0.5, linecolor="red")
ax.set_xlabel("y_pred")
ax.set_ylabel("y_true")
ax.set_title("GBCModel Matrix")
plt.show()