In [2]:
import pandas as pd
import numpy as np

In [18]:


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
from ucimlrepo import fetch_ucirepo

# fetch dataset
chronic_kidney_disease = fetch_ucirepo(id=336)

# data (as pandas dataframes)
X = chronic_kidney_disease.data.features
y = chronic_kidney_disease.data.targets

# metadata
# print(chronic_kidney_disease.metadata)

# variable information

# print(chronic_kidney_disease.variables)

age_bp_df = pd.DataFrame({
    'age': X['age'],
    'bp': X['bp'],
    'sg': X['sg'],
    'al': X['al'],
    'su': X['su'],
    'bgr': X['bgr'],
    'bu': X['bu'],
    'pot': X['pot'],
    'hemo': X['hemo'],
    'pcv': X['pcv'],
    'wbcc': X['wbcc'],
    'rbcc': X['rbcc']
})

y = y.copy()
y.loc[:, 'class'] = y['class'].replace({'ckd': 'a', 'ckd\t': 'a', 'notckd': 'c'})

combined_df = age_bp_df.join(y['class'])

#Counts the missing values in each row
missing_count = combined_df.isnull().sum(axis=1)

filtered_df = combined_df[missing_count < 3]

filtered_df.reset_index(drop=True, inplace=True)

# Convert hemoglobin values from g/dl to g/l
age_bp_df['hemo'] = age_bp_df['hemo'].apply(lambda x: x * 10 if pd.notnull(x) else x)

# Combine the datasets after conversion
combined_df = age_bp_df.join(y['class'])

# Split the filtered_df into two DataFrames based on the 'class' column
affected_df = filtered_df[filtered_df['class'] == 'a']
control_df = filtered_df[filtered_df['class'] == 'c']

control_df.reset_index(drop=True, inplace=True)

# Display the updated dataframe
print(combined_df.to_string(index=False))

print(f"NNumber of rows left before filtering: {len(combined_df)}")

print(f"Number of rows left after filtering: {len(filtered_df)}")

print(filtered_df.to_string(index=False))

# Display the DataFrames
print("Affected Individuals DataFrame:")
display(affected_df)
print(f"Number of rows in affected DataFrame: {len(affected_df)}")

print("\nControl Individuals DataFrame:")
display(control_df)
print(f"Number of rows in control DataFrame: {len(control_df)}")


 age    bp    sg  al  su   bgr    bu  pot  hemo  pcv    wbcc  rbcc class
48.0  80.0 1.020 1.0 0.0 121.0  36.0  NaN 154.0 44.0  7800.0   5.2     a
 7.0  50.0 1.020 4.0 0.0   NaN  18.0  NaN 113.0 38.0  6000.0   NaN     a
62.0  80.0 1.010 2.0 3.0 423.0  53.0  NaN  96.0 31.0  7500.0   NaN     a
48.0  70.0 1.005 4.0 0.0 117.0  56.0  2.5 112.0 32.0  6700.0   3.9     a
51.0  80.0 1.010 2.0 0.0 106.0  26.0  NaN 116.0 35.0  7300.0   4.6     a
60.0  90.0 1.015 3.0 0.0  74.0  25.0  3.2 122.0 39.0  7800.0   4.4     a
68.0  70.0 1.010 0.0 0.0 100.0  54.0  4.0 124.0 36.0     NaN   NaN     a
24.0   NaN 1.015 2.0 4.0 410.0  31.0  NaN 124.0 44.0  6900.0   5.0     a
52.0 100.0 1.015 3.0 0.0 138.0  60.0  NaN 108.0 33.0  9600.0   4.0     a
53.0  90.0 1.020 2.0 0.0  70.0 107.0  3.7  95.0 29.0 12100.0   3.7     a
50.0  60.0 1.010 2.0 4.0 490.0  55.0  NaN  94.0 28.0     NaN   NaN     a
63.0  70.0 1.010 3.0 0.0 380.0  60.0  4.2 108.0 32.0  4500.0   3.8     a
68.0  70.0 1.015 3.0 1.0 208.0  72.0  5.8  97.0 28.

Unnamed: 0,age,bp,sg,al,su,bgr,bu,pot,hemo,pcv,wbcc,rbcc,class
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,,15.4,44.0,7800.0,5.2,a
1,62.0,80.0,1.01,2.0,3.0,423.0,53.0,,9.6,31.0,7500.0,,a
2,48.0,70.0,1.005,4.0,0.0,117.0,56.0,2.5,11.2,32.0,6700.0,3.9,a
3,51.0,80.0,1.01,2.0,0.0,106.0,26.0,,11.6,35.0,7300.0,4.6,a
4,60.0,90.0,1.015,3.0,0.0,74.0,25.0,3.2,12.2,39.0,7800.0,4.4,a
5,68.0,70.0,1.01,0.0,0.0,100.0,54.0,4.0,12.4,36.0,,,a
6,24.0,,1.015,2.0,4.0,410.0,31.0,,12.4,44.0,6900.0,5.0,a
7,52.0,100.0,1.015,3.0,0.0,138.0,60.0,,10.8,33.0,9600.0,4.0,a
8,53.0,90.0,1.02,2.0,0.0,70.0,107.0,3.7,9.5,29.0,12100.0,3.7,a
9,63.0,70.0,1.01,3.0,0.0,380.0,60.0,4.2,10.8,32.0,4500.0,3.8,a


Number of rows in affected DataFrame: 138

Control Individuals DataFrame:


Unnamed: 0,age,bp,sg,al,su,bgr,bu,pot,hemo,pcv,wbcc,rbcc,class
0,40.0,80.0,1.025,0.0,0.0,140.0,10.0,5.0,15.0,48.0,10400.0,4.5,c
1,23.0,80.0,1.025,0.0,0.0,70.0,36.0,4.6,17.0,52.0,9800.0,5.0,c
2,45.0,80.0,1.025,0.0,0.0,82.0,49.0,4.4,15.9,46.0,9100.0,4.7,c
3,57.0,80.0,1.025,0.0,0.0,119.0,17.0,4.7,15.4,42.0,6200.0,6.2,c
4,51.0,60.0,1.025,0.0,0.0,99.0,38.0,3.7,13.0,49.0,8300.0,5.2,c
5,34.0,80.0,1.025,0.0,0.0,121.0,27.0,3.9,13.6,52.0,9200.0,6.3,c
6,60.0,80.0,1.025,0.0,0.0,131.0,10.0,5.0,14.5,41.0,10700.0,5.1,c
7,38.0,60.0,1.02,0.0,0.0,91.0,36.0,3.7,14.0,46.0,9100.0,5.8,c
8,42.0,80.0,1.02,0.0,0.0,98.0,20.0,3.5,13.9,44.0,8400.0,5.5,c
9,35.0,80.0,1.02,0.0,0.0,104.0,31.0,5.0,16.1,45.0,4300.0,5.2,c


Number of rows in control DataFrame: 139
