In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
%matplotlib inline
pd.set_option('display.max_columns', 26)

In [3]:
df= pd.read_csv('kidney_disease.csv')
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [4]:
# dropping id column
df.drop('id', axis = 1, inplace = True)

In [5]:
df.columns = ['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell',
              'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
              'potassium', 'haemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
              'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'peda_edema',
              'aanemia', 'class']

In [6]:
df = df[["age", "blood_pressure", "blood_glucose_random", "haemoglobin", "hypertension", "diabetes_mellitus", "appetite", "class"]]

In [7]:
cat_cols = [col for col in df.columns if df[col].dtype == 'object']
cat_cols

['hypertension', 'diabetes_mellitus', 'appetite', 'class']

In [8]:
for col in cat_cols:
    unique_values = df[col].unique()
    print(f"Unique values in column '{col}': {unique_values}")

Unique values in column 'hypertension': ['yes' 'no' nan]
Unique values in column 'diabetes_mellitus': ['yes' 'no' ' yes' '\tno' '\tyes' nan]
Unique values in column 'appetite': ['good' 'poor' nan]
Unique values in column 'class': ['ckd' 'ckd\t' 'notckd']


In [9]:
# Check for mixed data types in each column
mixed_data_types = {}
for col in df.columns:
    data_types = df[col].apply(type).unique()
    if len(data_types) > 1:
        mixed_data_types[col] = data_types

# Print columns with mixed data types
if mixed_data_types:
    print("Columns with mixed data types:")
    for col, data_types in mixed_data_types.items():
        print(f"{col}: {data_types}")
else:
    print("No columns with mixed data types found.")

Columns with mixed data types:
hypertension: [<class 'str'> <class 'float'>]
diabetes_mellitus: [<class 'str'> <class 'float'>]
appetite: [<class 'str'> <class 'float'>]


In [10]:
# Define columns with mixed data types
mixed_columns = ['hypertension', 'diabetes_mellitus', 'appetite']

# Iterate over each column with mixed data types
for col in mixed_columns:
    # Filter the DataFrame for entries where the data type is string
    str_entries = df[df[col].apply(lambda x: isinstance(x, str))]
    
    # Filter the DataFrame for entries where the data type is float
    float_entries = df[df[col].apply(lambda x: isinstance(x, float))]

    # Print out the entries for each data type
    print(f"Entries for column '{col}' where data type is string:")
    print(str_entries[col])
    print(f"Entries for column '{col}' where data type is float:")
    print(float_entries[col])

Entries for column 'hypertension' where data type is string:
0      yes
1       no
2       no
3      yes
4       no
      ... 
395     no
396     no
397     no
398     no
399     no
Name: hypertension, Length: 398, dtype: object
Entries for column 'hypertension' where data type is float:
288    NaN
297    NaN
Name: hypertension, dtype: object
Entries for column 'diabetes_mellitus' where data type is string:
0      yes
1       no
2      yes
3       no
4       no
      ... 
395     no
396     no
397     no
398     no
399     no
Name: diabetes_mellitus, Length: 398, dtype: object
Entries for column 'diabetes_mellitus' where data type is float:
288    NaN
297    NaN
Name: diabetes_mellitus, dtype: object
Entries for column 'appetite' where data type is string:
0      good
1      good
2      poor
3      poor
4      good
       ... 
395    good
396    good
397    good
398    good
399    good
Name: appetite, Length: 399, dtype: object
Entries for column 'appetite' where data type is float:
29

In [11]:
# Drop all rows containing NaN values
df.dropna(inplace=True)

In [12]:
# Print unique values in all categorical columns
for col in cat_cols:
    unique_values = df[col].unique()
    print(f"Unique values in column '{col}': {unique_values}")

Unique values in column 'hypertension': ['yes' 'no']
Unique values in column 'diabetes_mellitus': ['yes' 'no']
Unique values in column 'appetite': ['good' 'poor']
Unique values in column 'class': ['ckd' 'ckd\t' 'notckd']


In [13]:
df['class'] = df['class'].astype(str)

# Print the data type of the 'class' column after conversion
print(f"Data type of column 'class' after conversion: {df['class'].dtype}")

Data type of column 'class' after conversion: object


In [14]:
mixed_data_types_class = df['class'].apply(type).nunique() > 1

if mixed_data_types_class:
    print("The 'class' column has mixed data types.")
else:
    print("The 'class' column does not have mixed data types.")


The 'class' column does not have mixed data types.


In [15]:
# Check if there are any null values in the 'class' column
class_null_values = df['class'].isnull().any()

if class_null_values:
    print("There are null values in the 'class' column.")
else:
    print("There are no null values in the 'class' column.")

There are no null values in the 'class' column.


In [16]:
# Check unique values in the 'class' column
unique_values_class = df['class'].unique()
print(f"Unique values in the 'class' column: {unique_values_class}")

# Check if there are any non-string values
non_string_values = [value for value in unique_values_class if not isinstance(value, str)]
if non_string_values:
    print("Non-string values found in the 'class' column.")
    print("Inspect and clean these values before converting to string.")
else:
    print("All values in the 'class' column are strings.")

Unique values in the 'class' column: ['ckd' 'ckd\t' 'notckd']
All values in the 'class' column are strings.


In [17]:
# Standardize values in the 'class' column
df['class'] = df['class'].replace({'ckd\t': 'ckd'})

# Check the unique values in the 'class' column again
unique_values_class = df['class'].unique()
print(f"Unique values in the 'class' column after standardization: {unique_values_class}")

Unique values in the 'class' column after standardization: ['ckd' 'notckd']


In [18]:
df.shape

(296, 8)

In [19]:
from sklearn.preprocessing import LabelEncoder

# Assuming cat_cols is the list of object columns
for col in cat_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])


In [20]:
ind_col = [col for col in df.columns if col != 'class']
dep_col = 'class'

X = df[ind_col]
y = df[dep_col]

In [21]:
# splitting data intp training and test set

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

In [22]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

# accuracy score, confusion matrix and classification report of decision tree

dtc_acc = accuracy_score(y_test, dtc.predict(X_test))

print(f"Training Accuracy of Decision Tree Classifier is {accuracy_score(y_train, dtc.predict(X_train))}")
print(f"Test Accuracy of Decision Tree Classifier is {dtc_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, dtc.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, dtc.predict(X_test))}")

Training Accuracy of Decision Tree Classifier is 1.0
Test Accuracy of Decision Tree Classifier is 0.9662921348314607 

Confusion Matrix :- 
[[43  2]
 [ 1 43]]

Classification Report :- 
               precision    recall  f1-score   support

           0       0.98      0.96      0.97        45
           1       0.96      0.98      0.97        44

    accuracy                           0.97        89
   macro avg       0.97      0.97      0.97        89
weighted avg       0.97      0.97      0.97        89



In [23]:
from sklearn.model_selection import GridSearchCV
grid_param = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 7, 10],
    'splitter' : ['best', 'random'],
    'min_samples_leaf' : [1, 2, 3, 5, 7],
    'min_samples_split' : [1, 2, 3, 5, 7],
    'max_features' : ['auto', 'sqrt', 'log2']
}

grid_search_dtc = GridSearchCV(dtc, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search_dtc.fit(X_train, y_train)

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


In [24]:
# best estimator

dtc = grid_search_dtc.best_estimator_

# accuracy score, confusion matrix and classification report of decision tree

dtc_acc = accuracy_score(y_test, dtc.predict(X_test))

print(f"Training Accuracy of Decision Tree Classifier is {accuracy_score(y_train, dtc.predict(X_train))}")
print(f"Test Accuracy of Decision Tree Classifier is {dtc_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, dtc.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, dtc.predict(X_test))}")

Training Accuracy of Decision Tree Classifier is 0.9710144927536232
Test Accuracy of Decision Tree Classifier is 0.9550561797752809 

Confusion Matrix :- 
[[41  4]
 [ 0 44]]

Classification Report :- 
               precision    recall  f1-score   support

           0       1.00      0.91      0.95        45
           1       0.92      1.00      0.96        44

    accuracy                           0.96        89
   macro avg       0.96      0.96      0.96        89
weighted avg       0.96      0.96      0.95        89



In [25]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

# accuracy score, confusion matrix and classification report of ada boost

ada_acc = accuracy_score(y_test, ada.predict(X_test))

print(f"Training Accuracy of Ada Boost Classifier is {accuracy_score(y_train, ada.predict(X_train))}")
print(f"Test Accuracy of Ada Boost Classifier is {ada_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, ada.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, ada.predict(X_test))}")

Training Accuracy of Ada Boost Classifier is 1.0
Test Accuracy of Ada Boost Classifier is 0.9662921348314607 

Confusion Matrix :- 
[[43  2]
 [ 1 43]]

Classification Report :- 
               precision    recall  f1-score   support

           0       0.98      0.96      0.97        45
           1       0.96      0.98      0.97        44

    accuracy                           0.97        89
   macro avg       0.97      0.97      0.97        89
weighted avg       0.97      0.97      0.97        89



In [26]:
import pickle

with open('adaboost_model.pkl', 'wb') as f:
    pickle.dump(ada, f)

In [27]:
df.columns

Index(['age', 'blood_pressure', 'blood_glucose_random', 'haemoglobin',
       'hypertension', 'diabetes_mellitus', 'appetite', 'class'],
      dtype='object')

In [28]:
df.info

<bound method DataFrame.info of       age  blood_pressure  blood_glucose_random  haemoglobin  hypertension  \
0    48.0            80.0                 121.0         15.4             1   
2    62.0            80.0                 423.0          9.6             0   
3    48.0            70.0                 117.0         11.2             1   
4    51.0            80.0                 106.0         11.6             0   
5    60.0            90.0                  74.0         12.2             1   
..    ...             ...                   ...          ...           ...   
395  55.0            80.0                 140.0         15.7             0   
396  42.0            70.0                  75.0         16.5             0   
397  12.0            80.0                 100.0         15.8             0   
398  17.0            60.0                 114.0         14.2             0   
399  58.0            80.0                 131.0         15.8             0   

     diabetes_mellitus  appetit

In [29]:
df.describe


<bound method NDFrame.describe of       age  blood_pressure  blood_glucose_random  haemoglobin  hypertension  \
0    48.0            80.0                 121.0         15.4             1   
2    62.0            80.0                 423.0          9.6             0   
3    48.0            70.0                 117.0         11.2             1   
4    51.0            80.0                 106.0         11.6             0   
5    60.0            90.0                  74.0         12.2             1   
..    ...             ...                   ...          ...           ...   
395  55.0            80.0                 140.0         15.7             0   
396  42.0            70.0                  75.0         16.5             0   
397  12.0            80.0                 100.0         15.8             0   
398  17.0            60.0                 114.0         14.2             0   
399  58.0            80.0                 131.0         15.8             0   

     diabetes_mellitus  appet

In [30]:
df.describe()

Unnamed: 0,age,blood_pressure,blood_glucose_random,haemoglobin,hypertension,diabetes_mellitus,appetite,class
count,296.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0
mean,52.726351,76.385135,146.378378,12.669595,0.371622,0.351351,0.195946,0.445946
std,15.618637,13.558673,78.944899,2.858793,0.484056,0.478201,0.397599,0.497911
min,6.0,50.0,22.0,3.1,0.0,0.0,0.0,0.0
25%,44.0,70.0,99.0,10.4,0.0,0.0,0.0,0.0
50%,55.0,80.0,120.0,13.0,0.0,0.0,0.0,0.0
75%,64.25,80.0,158.25,15.0,1.0,1.0,0.0,1.0
max,90.0,180.0,490.0,17.8,1.0,1.0,1.0,1.0
