<a href="https://colab.research.google.com/github/MUHAMMADALIAFZAL/Data_Science_Project_kidney_disease_Classification/blob/main/Project_Code_CKD_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the Libraries

In [1]:
import math
import itertools
import numpy as np                         # Data Manipulation
import pandas as pd                        # Data Manipulation
import seaborn as sns                      # For Plotting
import matplotlib.pyplot as plt            # For Plotting
from sklearn.impute import SimpleImputer   # Predict Missing Values to Fill Nulls in the Dataset
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier

# Loading the Dataset

In [2]:
# Load the raw records from the CSV file and discard the 'id' column in one chained step
dataframe = pd.read_csv('/content/drive/MyDrive/kidney_disease.csv').drop(columns=['id'])
dataframe.head()  # Preview the first rows of the DataFrame

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


# Data Statistics

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
dataframe.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


# Datatypes

In [4]:
# Per-column overview: column name, non-null count and dtype
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             330 non-null    object 
 16  wc              295 non-null    object 
 17  rc              270 non-null    obj

# Renaming the Columns

In [5]:
# Mapping from the abbreviated CSV headers to descriptive column names
renamed_columns = dict(
    age="Age",
    bp="Blood_Pressure",
    sg="Specific_Gravity",
    al="Albumin",
    su="Sugar",
    rbc="Red_Blood_Cells",
    pc="Pus_Cells",
    pcc="Pus_Cell_Clumps",
    ba="Bacteria",
    bgr="Blood_Glucose_Random",
    bu="Blood_Urea",
    sc="Serum_Creatinine",
    sod="Sodium",
    pot="Potassium",
    hemo="Hemoglobin",
    pcv="Packed_Cell_Volume",
    wc="White_Blood_Cell_Count",
    rc="Red_Blood_Cell_Count",
    htn="Hypertension",
    dm="Diabetes_Mellitus",
    cad="Coronary_Artery_Disease",
    appet="Appetite",
    pe="Pedal_Edema",
    ane="Anemia",
    classification="Classification",
)

# Apply the mapping along the column axis; headers absent from the mapping stay as they are
dataframe = dataframe.rename(renamed_columns, axis='columns')

# Preview the frame with its new headers
dataframe.head()

Unnamed: 0,Age,Blood_Pressure,Specific_Gravity,Albumin,Sugar,Red_Blood_Cells,Pus_Cells,Pus_Cell_Clumps,Bacteria,Blood_Glucose_Random,...,Packed_Cell_Volume,White_Blood_Cell_Count,Red_Blood_Cell_Count,Hypertension,Diabetes_Mellitus,Coronary_Artery_Disease,Appetite,Pedal_Edema,Anemia,Classification
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


# NULL Value Count for Each Column

In [6]:
# Tally the missing entries per column (isna is the canonical alias of isnull)
null_values_count = dataframe.isna().sum()

# Show the per-column totals
null_values_count

Unnamed: 0,0
Age,9
Blood_Pressure,12
Specific_Gravity,47
Albumin,46
Sugar,49
Red_Blood_Cells,152
Pus_Cells,65
Pus_Cell_Clumps,4
Bacteria,4
Blood_Glucose_Random,44


# Imputing Values in the Columns

In [7]:
# --- Numerical columns: replace missing entries with the column mean ---
numerical_columns = dataframe.select_dtypes(include=['float64', 'int64']).columns
numerical_imputer = SimpleImputer(strategy='mean')
dataframe[numerical_columns] = numerical_imputer.fit_transform(
    dataframe[numerical_columns]
)

# --- Object-typed columns: replace missing entries with the most frequent value ---
# Every object-dtype column is selected here, whatever its content looks like.
categorical_columns = dataframe.select_dtypes(include=['object']).columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
dataframe[categorical_columns] = categorical_imputer.fit_transform(
    dataframe[categorical_columns]
)

# Preview the frame once both imputers have been applied
dataframe.head()

Unnamed: 0,Age,Blood_Pressure,Specific_Gravity,Albumin,Sugar,Red_Blood_Cells,Pus_Cells,Pus_Cell_Clumps,Bacteria,Blood_Glucose_Random,...,Packed_Cell_Volume,White_Blood_Cell_Count,Red_Blood_Cell_Count,Hypertension,Diabetes_Mellitus,Coronary_Artery_Disease,Appetite,Pedal_Edema,Anemia,Classification
0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,148.036517,...,38,6000,5.2,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [8]:
# Re-tally the missing entries per column to verify the imputation step
null_values_count = dataframe.isna().sum(axis=0)

# Show the per-column totals
null_values_count

Unnamed: 0,0
Age,0
Blood_Pressure,0
Specific_Gravity,0
Albumin,0
Sugar,0
Red_Blood_Cells,0
Pus_Cells,0
Pus_Cell_Clumps,0
Bacteria,0
Blood_Glucose_Random,0


# Data Cleaning

In [11]:
# Collect the distinct values of every categorical column, keyed by column name
unique_categorical_values = {
    name: values.unique()
    for name, values in dataframe[categorical_columns].items()
}

unique_categorical_values

{'Red_Blood_Cells': array(['normal', 'abnormal'], dtype=object),
 'Pus_Cells': array(['normal', 'abnormal'], dtype=object),
 'Pus_Cell_Clumps': array(['notpresent', 'present'], dtype=object),
 'Bacteria': array(['notpresent', 'present'], dtype=object),
 'Packed_Cell_Volume': array(['44', '38', '31', '32', '35', '39', '36', '33', '29', '28', '41',
        '16', '24', '37', '30', '34', '40', '45', '27', '48', '\t?', '52',
        '14', '22', '18', '42', '17', '46', '23', '19', '25', '26', '15',
        '21', '43', '20', '\t43', '47', '9', '49', '50', '53', '51', '54'],
       dtype=object),
 'White_Blood_Cell_Count': array(['7800', '6000', '7500', '6700', '7300', '9800', '6900', '9600',
        '12100', '4500', '12200', '11000', '3800', '11400', '5300', '9200',
        '6200', '8300', '8400', '10300', '9100', '7900', '6400', '8600',
        '18900', '21600', '4300', '8500', '11300', '7200', '7700', '14600',
        '6300', '\t6200', '7100', '11800', '9400', '5500', '5800', '13200',
     

In [12]:
# Fixing malformed values in the columns by normalizing them.
#
# The raw file contains entries with stray whitespace/tab characters (e.g. '\t43',
# ' yes', 'ckd\t') and a '?' placeholder (e.g. '\t?'). Instead of enumerating every
# malformed token by hand, whitespace is stripped generically and the numeric
# columns are parsed with pd.to_numeric(errors='coerce'). Anything that is not a
# number becomes NaN, so a single unexpected token can no longer silently leave
# the whole column as strings (which astype(float, errors='ignore') would do).

# Columns that hold numbers but were read as text
numeric_text_columns = [
    'Packed_Cell_Volume',
    'White_Blood_Cell_Count',
    'Red_Blood_Cell_Count',
]

# Label columns whose values carry stray leading/trailing whitespace
whitespace_label_columns = [
    'Diabetes_Mellitus',
    'Coronary_Artery_Disease',
    'Classification',
]

for column in numeric_text_columns:
    # astype(str) keeps this cell safe to re-run after the column is already float
    stripped = dataframe[column].astype(str).str.strip()
    # '?' placeholders (and any other non-numeric token) are coerced to NaN
    dataframe[column] = pd.to_numeric(stripped, errors='coerce').astype(float)

for column in whitespace_label_columns:
    # ' yes' / '\tyes' -> 'yes', '\tno' -> 'no', 'ckd\t' -> 'ckd'
    dataframe[column] = dataframe[column].str.strip()

# NOTE(review): the '?' placeholders turn into NaN here, i.e. after the imputation
# cell has already run, so these three numeric columns can contain missing values
# again from this point on. Confirm that later steps handle them.

# Displaying the DataFrame to confirm corrections
dataframe.head()


Unnamed: 0,Age,Blood_Pressure,Specific_Gravity,Albumin,Sugar,Red_Blood_Cells,Pus_Cells,Pus_Cell_Clumps,Bacteria,Blood_Glucose_Random,...,Packed_Cell_Volume,White_Blood_Cell_Count,Red_Blood_Cell_Count,Hypertension,Diabetes_Mellitus,Coronary_Artery_Disease,Appetite,Pedal_Edema,Anemia,Classification
0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,148.036517,...,38.0,6000.0,5.2,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,5.2,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
