In [10]:
# Notebook setup: imports plus session-wide display/plotting configuration.
# Silence every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pandas as pd  # NOTE(review): repeats the import on the previous line

# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)


import matplotlib
matplotlib.use('Agg')  # Select the Agg backend through the top-level matplotlib package

import matplotlib.pyplot as plt
plt.switch_backend('Agg')  # Select Agg again, this time through pyplot

# NOTE(review): the inline magic below runs after both Agg selections above;
# confirm which backend is actually meant to be active for this notebook's plots.
get_ipython().run_line_magic('matplotlib', 'inline')

import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Seed constant; not referenced in this cell — presumably consumed by later cells (verify).
RANDOM_SEED = 42

# Set default style for seaborn
sns.set(style='whitegrid')


In [11]:
# Mount Google Drive at /content/drive; the dataset path used for loading lives under this mount.
# Only works inside Google Colab, where the google.colab package is available.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Data Loading: read the CKD dataset from the mounted Drive and report its dimensions
data_path = '/content/drive/MyDrive/Colab Notebooks/india_ckd.csv'
df = pd.read_csv(data_path, sep=',', encoding='ascii')

# One call, two lines of output: status message followed by the (rows, columns) tuple
print('Dataset loaded successfully.', f'Shape: {df.shape}', sep='\n')

Dataset loaded successfully.
Shape: (1100, 26)


In [13]:
# Work on an independent (deep) copy so the frame as loaded stays untouched in `df`
new_df = df.copy(deep=True)

In [14]:
# Replace the raw headers with descriptive snake_case names.
# The assignment is positional: the 26 names below are applied left-to-right,
# so their order must match the column order of the loaded file.
new_df.columns = """
    id
    age
    blood_pressure
    specific_gravity
    albumin
    sugar
    red_blood_cells
    pus_cell
    pus_cell_clumps
    bacteria
    blood_glucose_random
    blood_urea
    serum_creatinine
    sodium
    potassium
    haemoglobin
    packed_cell_volume
    white_blood_cell_count
    red_blood_cell_count
    hypertension
    diabetes_mellitus
    coronary_artery_disease
    appetite
    peda_edema
    anemia
    class
""".split()

## 4. Fix Typos (data-entry errors)

In [15]:
# Before the fix: list the distinct labels of the three columns that should be binary,
# to expose stray tabs/spaces introduced during data entry.
for prefix, column in (
    ("diabetes_mellitus : ", 'diabetes_mellitus'),
    ("coronary_artery_disease :", 'coronary_artery_disease'),
    ("class : ", 'class'),
):
    print(prefix + str(new_df[column].unique()))

diabetes_mellitus : ['no' 'yes' '\tno' '\tyes' nan ' yes']
coronary_artery_disease :['no' 'yes' '\tno' nan]
class : ['ckd' 'notckd' 'ckd\t']


In [16]:
# Preview at most 20 distinct values for every object-dtype column
object_columns = new_df.select_dtypes(include=['object']).columns
for col in object_columns:
    distinct_values = new_df[col].unique()
    print(col, distinct_values[:20])


red_blood_cells [nan 'normal' 'abnormal']
pus_cell ['normal' nan 'abnormal']
pus_cell_clumps ['notpresent' 'present' nan]
bacteria ['notpresent' 'present' nan]
packed_cell_volume ['52' '44' '41' '17' '28' '38' '24' '46' '\t43' '42' '32' '53' '30' '22'
 '48' '50' '26' '35' '29' '33']
white_blood_cell_count ['7000' '7300' '7200' '6500' '14600' nan '9200' '4900' '5800' '10500'
 '10400' '6000' '16300' '6900' '9100' '5500' '4700' '8000' '4500' '9400']
red_blood_cell_count [nan '6.4' '5.0' '3.2' '5.2' '5' '5.5' '4.2' '4.5' '2.7' '4.9' '5.8' '5.7'
 '5.4' '5.1' '3.4' '2.4' '4.8' '3.6' '4.7']
hypertension ['no' 'yes' nan]
diabetes_mellitus ['no' 'yes' '\tno' '\tyes' nan ' yes']
coronary_artery_disease ['no' 'yes' '\tno' nan]
appetite ['good' 'poor' nan]
peda_edema ['no' 'yes' nan]
anemia ['no' 'yes' nan]
class ['ckd' 'notckd' 'ckd\t']


In [17]:
# Normalise label typos in the three categorical columns inspected above.
# Stripping surrounding whitespace collapses every padded variant (" yes", "\tyes",
# "\tno", "ckd\t", and any other leading/trailing space or tab) onto its clean label,
# instead of enumerating only the variants seen so far. Missing values stay NaN.
for label_col in ('diabetes_mellitus', 'coronary_artery_disease', 'class'):
    new_df[label_col] = new_df[label_col].str.strip()

# Keep the human-readable spelling used by the rest of the notebook for the negative class.
new_df['class'] = new_df['class'].replace(to_replace={"notckd": "not ckd"})

In [18]:
# After the fix: re-inspect the same three label columns to confirm
# that only the clean label values remain.
for column, separator in (
    ('diabetes_mellitus', " : "),
    ('coronary_artery_disease', " :"),
    ('class', " : "),
):
    print(f"{column}{separator}{new_df[column].unique()}")

diabetes_mellitus : ['no' 'yes' nan]
coronary_artery_disease :['no' 'yes' nan]
class : ['ckd' 'not ckd']
