# Cho dữ liệu chronic_kidney_disease.csv chứa thông tin của các bệnh nhân. 
## Bộ dữ liệu này có thể được sử dụng để dự đoán bệnh thận mãn tính và nó được thu thập trong bệnh viện gần 2 tháng.

## Thông tin dữ liệu:
## Dữ liệu có thể tham khảo và download tại: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease

## Data Information:
- age - age
- bp - blood pressure
- sg - specific gravity
- al - albumin
- su - sugar
- rbc - red blood cells
- pc - pus cell
- pcc - pus cell clumps
- ba - bacteria
- bgr - blood glucose random
- bu - blood urea
- sc - serum creatinine
- sod - sodium
- pot - potassium
- hemo - hemoglobin
- pcv - packed cell volume
- wc - white blood cell count
- rc - red blood cell count
- htn - hypertension
- dm - diabetes mellitus
- cad - coronary artery disease
- appet - appetite
- pe - pedal edema
- ane - anemia
- class - class

## Yêu cầu
### Đọc dữ liệu, tìm hiểu sơ bộ về dữ liệu
### Chọn phương pháp để chuẩn hóa dữ liệu và thực hiện việc chuẩn hóa.

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn

In [19]:
col_names = ['age', 'bp', 'sg', 'al','su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']

ckd = pd.read_csv('data5/Ex2_Chonic_Kidney_Disease_Dataset/chronic_kidney_disease.csv', header=None, names=col_names)
#header=None to command the pd.read_csv not to treat the first row as header row

ckd

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,?,normal,notpresent,notpresent,121,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,?,normal,notpresent,notpresent,?,...,38,6000,?,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,?,no,yes,no,poor,no,yes,ckd
3,48,70,1.01,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55,80,1.02,0,0,normal,normal,notpresent,notpresent,140,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,42,70,1.03,0,0,normal,normal,notpresent,notpresent,75,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,12,80,1.02,0,0,normal,normal,notpresent,notpresent,100,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,17,60,1.03,0,0,normal,normal,notpresent,notpresent,114,...,51,7200,5.9,no,no,no,good,no,no,notckd


## Predict a patient is CKD or NotCKD ("class" column") from other information
## Output: "class" column
## Input: other columns

In [22]:
ckd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   age     400 non-null    object
 1   bp      400 non-null    object
 2   sg      400 non-null    object
 3   al      400 non-null    object
 4   su      400 non-null    object
 5   rbc     400 non-null    object
 6   pc      400 non-null    object
 7   pcc     400 non-null    object
 8   ba      400 non-null    object
 9   bgr     400 non-null    object
 10  bu      400 non-null    object
 11  sc      400 non-null    object
 12  sod     400 non-null    object
 13  pot     400 non-null    object
 14  hemo    400 non-null    object
 15  pcv     400 non-null    object
 16  wc      400 non-null    object
 17  rc      400 non-null    object
 18  htn     400 non-null    object
 19  dm      400 non-null    object
 20  cad     400 non-null    object
 21  appet   400 non-null    object
 22  pe      400 non-null    ob

In [30]:
for col in col_names:
    print(f"{col} _______ {len(ckd[col].unique())} unique values: {ckd[col].unique()}")
    print()

age _______ 77 unique values: ['48' '7' '62' '51' '60' '68' '24' '52' '53' '50' '63' '40' '47' '61' '21'
 '42' '75' '69' '?' '73' '70' '65' '76' '72' '82' '46' '45' '35' '54' '11'
 '59' '67' '15' '55' '44' '26' '64' '56' '5' '74' '38' '58' '71' '34' '17'
 '12' '43' '41' '57' '8' '39' '66' '81' '14' '27' '83' '30' '4' '3' '6'
 '32' '80' '49' '90' '78' '19' '2' '33' '36' '37' '23' '25' '20' '29' '28'
 '22' '79']

bp _______ 11 unique values: ['80' '50' '70' '90' '?' '100' '60' '110' '140' '180' '120']

sg _______ 4 unique values: ['1.02' '1.01' '?' '1.03']

al _______ 7 unique values: ['1' '4' '2' '3' '0' '?' '5']

su _______ 7 unique values: ['0' '3' '4' '1' '?' '2' '5']

rbc _______ 3 unique values: ['?' 'normal' 'abnormal']

pc _______ 3 unique values: ['normal' 'abnormal' '?']

pcc _______ 3 unique values: ['notpresent' 'present' '?']

ba _______ 3 unique values: ['notpresent' 'present' '?']

bgr _______ 147 unique values: ['121' '?' '423' '117' '106' '74' '100' '410' '138' '70' '490

## There many columns have "?" value, therefore their type is "object" => Replace "?" with np.nan
## Also, numeric data is written within '...', which makes them become a string => Use pd.numeric(..., errors='coerce')

In [72]:
ckd_clean = ckd.replace('?', np.nan)

col_numeric = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']

for col in col_numeric:
    ckd_clean[col] = pd.to_numeric(ckd_clean[col])

ckd_clean

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.01,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47.0,6700.0,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.03,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54.0,7800.0,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49.0,6600.0,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.03,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51.0,7200.0,5.9,no,no,no,good,no,no,notckd


In [74]:
ckd_clean.isnull().sum()

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wc       106
rc       131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64

## There are many rows with NaN values
## For numeric columns => replace NaN with median
## For categorical columns => drop

## Preprocess the numeric columns first, then dropna in categorical ones to reduce the number of deleted rows

In [77]:
for col in col_numeric:
    ckd_clean[col] = ckd_clean[col].fillna(ckd_clean[col].median())

ckd_no_na = ckd_clean.dropna()
ckd_no_na

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,4.8,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.01,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
7,24.0,80.0,1.02,2.0,4.0,normal,abnormal,notpresent,notpresent,410.0,...,44.0,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.02,3.0,0.0,normal,abnormal,present,notpresent,138.0,...,33.0,9600.0,4.0,yes,yes,no,good,no,yes,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47.0,6700.0,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.03,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54.0,7800.0,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49.0,6600.0,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.03,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51.0,7200.0,5.9,no,no,no,good,no,no,notckd


In [81]:
ckd_no_na.info()

<class 'pandas.core.frame.DataFrame'>
Index: 233 entries, 2 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     233 non-null    float64
 1   bp      233 non-null    float64
 2   sg      233 non-null    float64
 3   al      233 non-null    float64
 4   su      233 non-null    float64
 5   rbc     233 non-null    object 
 6   pc      233 non-null    object 
 7   pcc     233 non-null    object 
 8   ba      233 non-null    object 
 9   bgr     233 non-null    float64
 10  bu      233 non-null    float64
 11  sc      233 non-null    float64
 12  sod     233 non-null    float64
 13  pot     233 non-null    float64
 14  hemo    233 non-null    float64
 15  pcv     233 non-null    float64
 16  wc      233 non-null    float64
 17  rc      233 non-null    float64
 18  htn     233 non-null    object 
 19  dm      233 non-null    object 
 20  cad     233 non-null    object 
 21  appet   233 non-null    object 
 22  pe     

## In ckd_no_na, there are categorical columns whose dtypes==object
## => Set a boolean conditon (mask) to filter out these categorical columns

In [88]:
categorical_mask = ckd_no_na.dtypes=="object" #Set boolean conditon
 
col_categorical = ckd_no_na.columns[categorical_mask] #get categorical "object" columns using boolean condition

col_categorical

Index(['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane',
       'class'],
      dtype='object')

## Problem: the two ouput values "ckd" and "notckd" are not so balanced in quantities
## Keep using Logistic Regression model, if the result is not as good as expected, then apply Data Balancing method

In [100]:
ckd_no_na['class'].value_counts()

class
notckd    135
ckd        98
Name: count, dtype: int64

## Since the output have only <span style='color:red;'>two values (binary values)</span> = 'ckd' or 'notckd'
## Can build <span style='color:red;'>Logistic Regression model</span> to predict

## Also, we <span style='color:yellow;'>only need to encode CATEGORICAL features/columns</span>, while numeric ones do not

In [149]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
#from sklearn.utils.validation import column_or_1d

x = ckd_no_na.drop(['class'], axis=1)
x = pd.get_dummies(x, columns=col_categorical[:-1], drop_first=True).astype(int) #use dummies method to encode categorical columns
                                                                                #set col_categorical[:-1] to exclude the 'class' column

y = pd.get_dummies(ckd_no_na[['class']],).drop('class_notckd', axis=1).astype(int)
#encode y 'class' with dummies too

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

In [151]:
model_log_reg = LogisticRegression(solver='liblinear')

'''
solver='liblinear': This specifies the algorithm used to fit the logistic regression model. 
The 'liblinear' solver is one of the solvers available in Scikit-learn, which is particularly efficient for smaller datasets 
or when you need an implementation that works well for binary classification. It uses coordinate descent or a trust region method to optimize the model.
'''

model_log_reg.fit(x_train, y_train)

print("Log_reg model's score on Train set =", model_log_reg.score(x_train, y_train))
print("Log_reg model's score on Test set =", model_log_reg.score(x_test, y_test))

Log_reg model's score on Train set = 1.0
Log_reg model's score on Test set = 0.9574468085106383


  y = column_or_1d(y, warn=True)


In [153]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred_test = model_log_reg.predict(x_test)

print(confusion_matrix(y_test, y_pred_test))

print()

print(classification_report(y_test, y_pred_test))

[[26  0]
 [ 2 19]]

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        26
           1       1.00      0.90      0.95        21

    accuracy                           0.96        47
   macro avg       0.96      0.95      0.96        47
weighted avg       0.96      0.96      0.96        47



## <span style='color:red'>Results:
### + The model has good results on both train and test set
### + Confusion matrix also reports high results