In [None]:
# import numpy as np
# import pandas as pd

# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler

# import seaborn as sns

# import tensorflow as tf
# from tensorflow import keras

In [None]:
# Read the kidney-disease CSV into a DataFrame and shuffle the row order.

import pandas as pd

df = pd.read_csv('./kidney_disease.csv')
# Shuffle with a fixed seed so a fresh "Restart & Run All" yields the same
# row order; reset_index(drop=True) discards the pre-shuffle index.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
# head has to be called: printing the bare attribute shows the bound-method
# repr instead of the first rows.
print(df.head())

In [None]:
# Number of missing entries per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Drop the 'id' column in place, then give the remaining columns
# descriptive names (assigned positionally).
df.drop(columns = 'id', inplace = True)
df.columns = [
    'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
    'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
    'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
    'potassium', 'haemoglobin', 'packed_cell_volume',
    'white_blood_cell_count', 'red_blood_cell_count',
    'hypertension', 'diabetes_mellitus', 'coronary_artery_disease',
    'appetite', 'peda_edema', 'aanemia', 'class',
]
print(len(df.columns))
print(df.head())
# Last expression of the cell: summary statistics are displayed as its output.
df.describe()

In [None]:
# Force these three columns to a numeric dtype; with errors='coerce' any
# entry that cannot be parsed as a number becomes NaN.
for lab_col in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    df[lab_col] = pd.to_numeric(df[lab_col], errors='coerce')

In [None]:
# Partition the column names in a single pass: object-dtype columns go to
# cat_cols, every other dtype goes to num_cols (column order is preserved).
cat_cols, num_cols = [], []
for name, dtype in df.dtypes.items():
    if dtype == 'object':
        cat_cols.append(name)
    else:
        num_cols.append(name)

# Show the distinct values of each categorical column.
for col in cat_cols:
    print(f"{col} has {df[col].unique()} values\n")

In [None]:
# Encode the target as 1 ('ckd') / 0 ('notckd').
# Labels are stripped first: a label carrying stray whitespace (for example a
# trailing tab) would otherwise miss the mapping, turn into NaN and later be
# overwritten by the mode imputation. Labels that still do not match stay NaN.
df['class'] = df['class'].str.strip().map({'ckd': 1, 'notckd': 0})
# Keep the column numeric regardless of what the mapping produced.
df['class'] = pd.to_numeric(df['class'], errors='coerce')

In [None]:
# Collect the distinct values of the selected columns, then print them.
cols = ['diabetes_mellitus', 'coronary_artery_disease', 'class']
distinct_values = {name: df[name].unique() for name in cols}

for col, values in distinct_values.items():
    print(f"{col} has {values} values\n")

In [None]:
# for col in num_cols:
#     print(f"{col} has {df[col].unique()} values\n")

# cat_cols.append("specific_gravity")
# cat_cols.append("albumin")
# cat_cols.append("sugar")

# num_cols.remove("specific_gravity")
# num_cols.remove("albumin")
# num_cols.remove("sugar")

In [None]:
# checking numerical features distribution

import matplotlib.pyplot as plt
import seaborn as sns

# One histogram per numeric column on a 2 x 7 grid; the grid holds at most
# 14 panels, so any further columns are not drawn.
plt.figure(figsize = (20, 5))

for plotnumber, column in enumerate(num_cols, start = 1):
    if plotnumber > 14:
        break
    ax = plt.subplot(2, 7, plotnumber)
    sns.histplot(df[column])
    plt.xlabel(column)

plt.tight_layout()
plt.show()

In [None]:
# looking at categorical columns

# One count plot per categorical column on a 2 x 6 grid (at most 11 panels).
plt.figure(figsize = (20, 5))

for plotnumber, column in enumerate(cat_cols[:11], start = 1):
    ax = plt.subplot(2, 6, plotnumber)
    # The column must be passed as x=: in current seaborn releases the first
    # positional argument of countplot is `data`, so a bare positional Series
    # is not interpreted as the variable to count.
    sns.countplot(x = df[column], ax = ax)
    ax.set_xlabel(column)

plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
correlations = df[num_cols].corr()

plt.figure(figsize = (15, 8))
sns.heatmap(correlations, annot = True, linewidths = 2, linecolor = 'lightgrey')
plt.show()

In [None]:
# Scratch check on "age": draw as many observed values as there are gaps and
# relabel the draw with the index of the missing rows.
age_missing = df["age"].isnull()
print(age_missing.sum())
r = df["age"].dropna().sample(age_missing.sum())
print(r)
r.index = df[age_missing].index
print(r.index)
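# Filling null values with two methods: random sampling from a column's observed
# values (intended for features with many nulls) and mode imputation (intended for
# features with few nulls). The cells below apply random sampling to the numeric
# columns and mode imputation to the categorical columns.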

def random_value_imputation(feature, frame=None, random_state=None):
    """Fill the missing entries of one column with values drawn from that column.

    Parameters
    ----------
    feature : column label to impute.
    frame : DataFrame to modify in place. Defaults to the notebook-level ``df``.
    random_state : forwarded to ``Series.sample``; pass an int for a
        reproducible draw. ``None`` keeps the original unseeded behaviour.

    Raises
    ------
    ValueError
        If the column has missing entries but no observed value to draw from.
    """
    if frame is None:
        frame = df

    missing = frame[feature].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # Nothing to fill.
        return

    observed = frame[feature].dropna()
    if observed.empty:
        raise ValueError(f"cannot impute {feature!r}: the column has no observed values")

    # Draw without replacement whenever enough observed values exist (as
    # before); fall back to drawing with replacement only when the gaps
    # outnumber the observed values, a case in which sample() would raise.
    sampled = observed.sample(
        n_missing,
        replace=n_missing > len(observed),
        random_state=random_state,
    )
    # Assign positionally so the index labels of the drawn values do not matter.
    frame.loc[missing, feature] = sampled.to_numpy()
    
def impute_mode(feature, frame=None):
    """Fill the missing entries of one column with its most frequent value.

    Parameters
    ----------
    feature : column label to impute.
    frame : DataFrame to modify in place. Defaults to the notebook-level ``df``.

    Raises
    ------
    ValueError
        If the column has missing entries but no observed value, so that no
        mode exists.
    """
    if frame is None:
        frame = df

    if not frame[feature].isna().any():
        # Nothing to fill.
        return

    modes = frame[feature].mode()
    if modes.empty:
        raise ValueError(f"cannot impute {feature!r}: the column has no observed values")

    # mode() can return several tied values; the first one is used, as before.
    frame[feature] = frame[feature].fillna(modes.iloc[0])

In [None]:
# def random_value_imputation_by_class(feature):
#     # Take N number of random samples from the feature where N = number of NA values
#     random_sample = df[feature].dropna().sample(df[feature].isna().sum())
#     random_sample.index = df[df[feature].isnull()].index
#     df.loc[df[feature].isnull(), feature] = random_sample
    
# def impute_mode_by_class(feature):
#     mode = df[feature].mode()[0]
#     df[feature] = df[feature].fillna(mode)

# from sklearn.impute import KNNImputer

# imputer = KNNImputer(n_neighbors=5, weights="uniform")
# imputer.fit_transform(df)

In [None]:
# Fill the gaps of every numeric column by random sampling.
for numeric_col in num_cols:
    random_value_imputation(numeric_col)

In [None]:
# Remaining missing entries per numeric column (isna is the same check as isnull).
df[num_cols].isna().sum()

In [None]:
# Fill the gaps of every categorical column with that column's mode.
for col in cat_cols:
    print(col)
    # Report how many entries are about to be filled. The column is selected
    # with df[col]: df.loc[col] treats `col` as a row label and raises KeyError.
    print(df[col].isna().sum())
    impute_mode(col)

In [None]:
# Count the distinct (non-missing) values of every categorical column at once.
categories_per_column = df[cat_cols].nunique()

for col, n_categories in categories_per_column.items():
    print(f"{col} has {n_categories} categories\n")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column.
# fit_transform refits the encoder on each call, so a single shared encoder
# only remembers the last column. One fitted encoder per column is kept in
# `encoders`, so each mapping can be inspected (classes_) or applied again
# (transform / inverse_transform) later.
encoders = {}
le = LabelEncoder()

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [None]:
# Display the first five rows of the encoded frame.
df.head(n = 5)

In [None]:
# 'class' is the target; every other column is a feature.
dep_col = 'class'
ind_col = [name for name in df.columns if name != 'class']

y = df[dep_col]
x = df[ind_col]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [None]:
# hyper parameter tuning of decision tree

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# The tree is seeded so that the random parts of training ('random' splitter,
# feature sub-sampling via max_features) are repeatable between runs.
# No fit is needed before the search: GridSearchCV clones the estimator for
# every candidate, so a pre-fitted model would simply be discarded.
dt = DecisionTreeClassifier(random_state = 0)

grid_param = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'splitter' : ['best', 'random'],
    'max_depth' : [3, 5, 7, 10, 12, 15, 20],
    'min_samples_split' : [2, 3, 5, 7],
    'min_samples_leaf' : [1, 2, 3, 5, 7],
    'max_features' : ['sqrt', 'log2']
}

# 3 * 2 * 7 * 4 * 5 * 2 = 1680 candidates x 5 folds = 8400 fits; verbose = 1
# reports the totals instead of logging every single fit.
grid_search_dt = GridSearchCV(dt, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search_dt.fit(x_train, y_train)

# best parameters and best score
print(grid_search_dt.best_params_)
print(grid_search_dt.best_score_)
print(grid_search_dt.best_estimator_)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# GridSearchCV refits the winning configuration on the whole training split
# (refit=True is its default), so best_estimator_ is used as it is. Fitting it
# again would only replace the selected model with a second training run.
dt = grid_search_dt.best_estimator_

# Predict once per split and reuse the predictions for every metric.
dt_train_pred = dt.predict(x_train)
dt_test_pred = dt.predict(x_test)

# accuracy score, confusion matrix and classification report of decision tree

dt_acc = accuracy_score(y_test, dt_test_pred)

print(f"Training Accuracy of Decision Tree is {accuracy_score(y_train, dt_train_pred)}")
print(f"Test Accuracy of Decision Tree is {dt_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, dt_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, dt_test_pred)}")

In [None]:
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# dt_acc = accuracy_score(y_test, dt.predict(x_test))

# print(f"Training Accuracy of Decision Tree is {accuracy_score(y_train, dt.predict(x_train))}")
# print(f"Testing Accuracy of Decision Tree is {dt_acc} \n")

# print(f"Confusion Matrix :- \n{confusion_matrix(y_test, dt.predict(x_test))}\n")
# print(f"Classification Report :- \n {classification_report(y_test, dt.predict(x_test))}")

In [None]:
# # hyper parameter tuning of random forest

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# import matplotlib.pyplot as plt

# rf = RandomForestClassifier()
# rf.fit(x_train, y_train)


# from sklearn.model_selection import GridSearchCV
# grid_param = {
#     'criterion' : ['gini', 'entropy', 'log_loss'],
#     'max_depth' : [3, 5, 7, 10],
#     'class_weight': ['balanced', 
#                      'balanced_subsample'
#                      ],
#     'min_samples_leaf' : [1, 3, 5, 7],
#     'min_samples_split' : [2, 3, 5, 7],
#     'max_features' : ['sqrt', 'log2']
# }

# grid_search_rf = GridSearchCV(rf, grid_param, cv = 5, n_jobs = -1, verbose = 3)
# grid_search_rf.fit(x_train, y_train)



# # best parameters and best score
# print(grid_search_rf.best_params_)
# print(grid_search_rf.best_score_)
# print(grid_search_rf.best_estimator_)

In [None]:
# from sklearn.metrics import accuracy_score
# from sklearn.tree import DecisionTreeClassifier

# dt = DecisionTreeClassifier(max_depth=5, max_features='sqrt', min_samples_split=7)
# dt.fit(x_train, y_train)
# dt_acc = accuracy_score(y_test, dt.predict(x_test))
# print(dt_acc)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Random forest with fixed hyper-parameters. random_state pins the bootstrap
# samples and the feature sub-sampling so the reported scores are repeatable.
rf = RandomForestClassifier(
    class_weight='balanced',
    max_depth=7,
    max_features='log2',
    min_samples_leaf=3,
    min_samples_split=5,
    random_state=0,
)
# rf = grid_search_rf.best_estimator_
rf.fit(x_train, y_train)

# Predict once per split and reuse the predictions for every metric.
rf_train_pred = rf.predict(x_train)
rf_test_pred = rf.predict(x_test)
rf_acc = accuracy_score(y_test, rf_test_pred)

print(f"Training Accuracy of Random Forest Classifier is {accuracy_score(y_train, rf_train_pred)}")
print(f"Test Accuracy of Random Forest Classifier is {rf_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, rf_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, rf_test_pred)}")

In [None]:
from sklearn.model_selection import cross_val_score, KFold
# 5-fold cross-validation of the decision tree on the full data set
# (shuffled folds, fixed seed); cross_val_score uses the estimator's default scorer.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
cross_val_results = cross_val_score(dt, x, y, cv=kf)
mean_accuracy = cross_val_results.mean()
print(f'Cross-Validation Results (Accuracy) DT: {cross_val_results}')
print(f'Mean Accuracy: {mean_accuracy}')

In [None]:
from sklearn.model_selection import cross_val_score, KFold
# 5-fold cross-validation of the random forest on the full data set
# (shuffled folds, fixed seed); cross_val_score uses the estimator's default scorer.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
cross_val_results = cross_val_score(rf, x, y, cv=kf)
mean_accuracy = cross_val_results.mean()
print(f'Cross-Validation Results (Accuracy): {cross_val_results}')
print(f'Mean Accuracy: {mean_accuracy}')

In [None]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
def error_report(model, features=None, targets=None):
    """Print the MSE, RMSE and MAE of a fitted model's predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    features : inputs to predict on. Defaults to the notebook-level ``x_test``.
    targets : true values to compare against. Defaults to the notebook-level
        ``y_test``.
    """
    if features is None:
        features = x_test
    if targets is None:
        targets = y_test

    # Predict once and reuse the result for all three metrics.
    predictions = model.predict(features)

    mse = mean_squared_error(targets, predictions)
    print(f"Mean Squared Error: {mse}")
    rmse = root_mean_squared_error(targets, predictions)
    print(f"Root Mean Squared Error: {rmse}")
    mae = mean_absolute_error(targets, predictions)
    print(f"Mean Absolute Error: {mae}")
# Report the decision tree first, then the random forest, with a blank line between.
for position, fitted_model in enumerate((dt, rf)):
    if position > 0:
        print()
    error_report(fitted_model)

In [None]:
# SAVING THE MODEL USING PICKLE PACKAGE
import pickle

# Serialise the decision tree model (dt) to a pickle file.
model_pkl_file = "./ckd-dt.pkl"

with open(model_pkl_file, mode = 'wb') as pkl_out:
    pickle.dump(dt, pkl_out)

In [None]:
# SAVING THE MODEL USING PICKLE PACKAGE
import pickle

# Serialise the random forest model (rf) to a pickle file.
model_pkl_file = "./ckd-rf.pkl"

with open(model_pkl_file, mode = 'wb') as pkl_out:
    pickle.dump(rf, pkl_out)