In [2]:
import pandas as pd
import numpy as np

In [3]:


# Lift pandas' display limits so printed DataFrames show every row and column.
pd.set_option('display.max_rows', None, 'display.max_columns', None)
from ucimlrepo import fetch_ucirepo

# Download the UCI repository dataset with id 336.
chronic_kidney_disease = fetch_ucirepo(id=336)

# Feature table (X) and target table (y), both pandas DataFrames.
X, y = (
    chronic_kidney_disease.data.features,
    chronic_kidney_disease.data.targets,
)

# metadata
# print(chronic_kidney_disease.metadata)

# variable information

# print(chronic_kidney_disease.variables)

# Build a frame from a fixed subset of the feature columns of X.
# The tuple order below is the column order of the resulting frame.
age_bp_df = pd.DataFrame({
    column: X[column]
    for column in (
        'age', 'bp', 'sg', 'al', 'su', 'bgr',
        'bu', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc',
    )
})

# Rebind y to a private copy so the recoding below does not touch the
# frame held by the fetched dataset object.
y = y.copy()

# Recode the target labels: 'ckd' (also the variant with a trailing tab)
# becomes 'a', and 'notckd' becomes 'c'. Other values are left unchanged.
y['class'] = y['class'].replace({'ckd': 'a', 'ckd\t': 'a', 'notckd': 'c'})

# Attach the recoded label column to the selected features (index-aligned join).
combined_df = age_bp_df.join(y['class'])

# Show the combined table.
print(combined_df)


      age     bp     sg   al   su    bgr     bu   pot  hemo   pcv     wbcc  \
0    48.0   80.0  1.020  1.0  0.0  121.0   36.0   NaN  15.4  44.0   7800.0   
1     7.0   50.0  1.020  4.0  0.0    NaN   18.0   NaN  11.3  38.0   6000.0   
2    62.0   80.0  1.010  2.0  3.0  423.0   53.0   NaN   9.6  31.0   7500.0   
3    48.0   70.0  1.005  4.0  0.0  117.0   56.0   2.5  11.2  32.0   6700.0   
4    51.0   80.0  1.010  2.0  0.0  106.0   26.0   NaN  11.6  35.0   7300.0   
5    60.0   90.0  1.015  3.0  0.0   74.0   25.0   3.2  12.2  39.0   7800.0   
6    68.0   70.0  1.010  0.0  0.0  100.0   54.0   4.0  12.4  36.0      NaN   
7    24.0    NaN  1.015  2.0  4.0  410.0   31.0   NaN  12.4  44.0   6900.0   
8    52.0  100.0  1.015  3.0  0.0  138.0   60.0   NaN  10.8  33.0   9600.0   
9    53.0   90.0  1.020  2.0  0.0   70.0  107.0   3.7   9.5  29.0  12100.0   
10   50.0   60.0  1.010  2.0  4.0  490.0   55.0   NaN   9.4  28.0      NaN   
11   63.0   70.0  1.010  3.0  0.0  380.0   60.0   4.2  10.8  32.

In [24]:
# Quick sanity check that the data looks right; this cell can be removed
# once the notebook is finished.
# Pull the feature row and the target row that share index label 0.
data_id_0, data_0 = X.loc[0], y.loc[0]
print(data_id_0, data_0)

age            48.0
bp             80.0
sg             1.02
al              1.0
su              0.0
rbc             NaN
pc           normal
pcc      notpresent
ba       notpresent
bgr           121.0
bu             36.0
sc              1.2
sod             NaN
pot             NaN
hemo           15.4
pcv            44.0
wbcc         7800.0
rbcc            5.2
htn             yes
dm              yes
cad              no
appet          good
pe               no
ane              no
Name: 0, dtype: object class    a
Name: 0, dtype: object


In [4]:
# Count the missing values in each row.
missing_count = combined_df.isna().sum(axis=1)

# Keep only the rows with fewer than 3 missing values (i.e. at most 2) and
# renumber them from 0. The reset is chained instead of done with
# inplace=True so the filtered frame is built in a single expression and the
# cell gives the same result every time it is re-run.
filtered_df = combined_df[missing_count < 3].reset_index(drop=True)

# Report how many rows the filter removed.
print(f"Number of rows before filtering: {len(combined_df)}")

print(f"Number of rows left after filtering: {len(filtered_df)}")

# Full dump of the filtered table, without the index column.
print(filtered_df.to_string(index=False))

NNumber of rows left before filtering: 400
Number of rows left after filtering: 277
 age    bp    sg  al  su   bgr    bu  pot  hemo  pcv    wbcc  rbcc class
48.0  80.0 1.020 1.0 0.0 121.0  36.0  NaN  15.4 44.0  7800.0   5.2     a
62.0  80.0 1.010 2.0 3.0 423.0  53.0  NaN   9.6 31.0  7500.0   NaN     a
48.0  70.0 1.005 4.0 0.0 117.0  56.0  2.5  11.2 32.0  6700.0   3.9     a
51.0  80.0 1.010 2.0 0.0 106.0  26.0  NaN  11.6 35.0  7300.0   4.6     a
60.0  90.0 1.015 3.0 0.0  74.0  25.0  3.2  12.2 39.0  7800.0   4.4     a
68.0  70.0 1.010 0.0 0.0 100.0  54.0  4.0  12.4 36.0     NaN   NaN     a
24.0   NaN 1.015 2.0 4.0 410.0  31.0  NaN  12.4 44.0  6900.0   5.0     a
52.0 100.0 1.015 3.0 0.0 138.0  60.0  NaN  10.8 33.0  9600.0   4.0     a
53.0  90.0 1.020 2.0 0.0  70.0 107.0  3.7   9.5 29.0 12100.0   3.7     a
63.0  70.0 1.010 3.0 0.0 380.0  60.0  4.2  10.8 32.0  4500.0   3.8     a
68.0  70.0 1.015 3.0 1.0 208.0  72.0  5.8   9.7 28.0 12200.0   3.4     a
68.0  80.0 1.010 3.0 2.0 157.0  90.0  6.

In [None]:
# Convert hemoglobin values from g/dl to g/l (multiply by 10).
# The converted column is computed from the raw feature table X rather than
# from age_bp_df['hemo'] itself, so re-running this cell does not multiply the
# values by 10 a second time. Missing values need no special handling:
# NaN * 10 is still NaN.
# NOTE(review): any frame derived from combined_df before this cell ran
# (e.g. filtered_df) still holds the unconverted values - confirm whether it
# should be recomputed afterwards.
age_bp_df['hemo'] = X['hemo'] * 10

# Combine the datasets after conversion
combined_df = age_bp_df.join(y['class'])

# Display the updated dataframe
print(combined_df.to_string(index=False))