In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler

In [2]:
data = pd.read_csv("./data/2020/heart_2020_cleaned.csv")
data.head(10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
5,Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,75-79,Black,No,No,Fair,12.0,No,No,No
6,No,21.63,No,No,No,15.0,0.0,No,Female,70-74,White,No,Yes,Fair,4.0,Yes,No,Yes
7,No,31.64,Yes,No,No,5.0,0.0,Yes,Female,80 or older,White,Yes,No,Good,9.0,Yes,No,No
8,No,26.45,No,No,No,0.0,0.0,No,Female,80 or older,White,"No, borderline diabetes",No,Fair,5.0,No,Yes,No
9,No,40.69,No,No,No,0.0,0.0,Yes,Male,65-69,White,No,Yes,Good,10.0,No,No,No


In [3]:
data['SleepTime'].unique()

array([ 5.,  7.,  8.,  6., 12.,  4.,  9., 10., 15.,  3.,  2.,  1., 16.,
       18., 14., 20., 11., 13., 17., 24., 19., 21., 22., 23.])

# 1. Lấy các cột dữ liệu đề cập trong bài báo

In [4]:
post_data = data.loc[:, ["HeartDisease","BMI", "PhysicalHealth", "DiffWalking", "AgeCategory", "PhysicalActivity", "GenHealth", "SleepTime"]]

In [5]:
post_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   PhysicalHealth    319795 non-null  float64
 3   DiffWalking       319795 non-null  object 
 4   AgeCategory       319795 non-null  object 
 5   PhysicalActivity  319795 non-null  object 
 6   GenHealth         319795 non-null  object 
 7   SleepTime         319795 non-null  float64
dtypes: float64(3), object(5)
memory usage: 19.5+ MB


In [6]:
post_data['BMI'].unique()

array([16.6 , 20.34, 26.58, ..., 62.42, 51.46, 46.56])

In [7]:
post_data['PhysicalHealth'].unique()

array([ 3.,  0., 20., 28.,  6., 15.,  5., 30.,  7.,  1.,  2., 21.,  4.,
       10., 14., 18.,  8., 25., 16., 29., 27., 17., 24., 12., 23., 26.,
       22., 19.,  9., 13., 11.])

In [8]:
post_data['SleepTime'].unique()

array([ 5.,  7.,  8.,  6., 12.,  4.,  9., 10., 15.,  3.,  2.,  1., 16.,
       18., 14., 20., 11., 13., 17., 24., 19., 21., 22., 23.])

In [9]:
post_data.shape

(319795, 8)

In [10]:
post_data['HeartDisease'].value_counts()

HeartDisease
No     292422
Yes     27373
Name: count, dtype: int64

# 2. Tiền xử lý và train dữ liệu không xử lý mất cần bằng

In [11]:
def categorize_bmi_detailed(bmi):
    """
    Phân loại chỉ số BMI dựa trên bảng chi tiết.

    Args:
        bmi (float): Chỉ số khối cơ thể (Body Mass Index).

    Returns:
        str: Danh mục BMI tương ứng.
    """
    if bmi < 16.0:
        return 'Underweight'
    elif 16.0 <= bmi < 17.0: # Diễn giải: 16.0 đến dưới 17.0
        return 'Underweight'
    elif 17.0 <= bmi < 18.5: # Diễn giải: 17.0 đến dưới 18.5
        return 'Underweight'
    elif 18.5 <= bmi < 25.0: # Diễn giải: 18.5 đến dưới 25.0
        return 'Normal range'
    elif 25.0 <= bmi < 30.0: # Diễn giải: 25.0 đến dưới 30.0
        return 'Overweight'
    elif 30.0 <= bmi < 35.0: # Diễn giải: 30.0 đến dưới 35.0
        return 'Obese'
    elif 35.0 <= bmi < 40.0: # Diễn giải: 35.0 đến dưới 40.0
        return 'Obese (Class II)'
    elif bmi >= 40.0:
        return 'Obese (Class III)'
    else:
        return 'Invalid BMI value' # Xử lý trường hợp giá trị không hợp lệ (ví dụ: âm)

In [12]:
post_data["BMI"] = post_data["BMI"].apply(categorize_bmi_detailed)

In [13]:
post_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  object 
 2   PhysicalHealth    319795 non-null  float64
 3   DiffWalking       319795 non-null  object 
 4   AgeCategory       319795 non-null  object 
 5   PhysicalActivity  319795 non-null  object 
 6   GenHealth         319795 non-null  object 
 7   SleepTime         319795 non-null  float64
dtypes: float64(2), object(6)
memory usage: 19.5+ MB


In [14]:
post_data['SleepTime'] = post_data['SleepTime'].astype('category')
post_data['PhysicalHealth'] = post_data['PhysicalHealth'].astype('category')

In [15]:
post_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   HeartDisease      319795 non-null  object  
 1   BMI               319795 non-null  object  
 2   PhysicalHealth    319795 non-null  category
 3   DiffWalking       319795 non-null  object  
 4   AgeCategory       319795 non-null  object  
 5   PhysicalActivity  319795 non-null  object  
 6   GenHealth         319795 non-null  object  
 7   SleepTime         319795 non-null  category
dtypes: category(2), object(6)
memory usage: 15.3+ MB


In [16]:
cols_to_encode = [
    'HeartDisease',
    'BMI',
    'PhysicalHealth',
    'DiffWalking',
    'AgeCategory',
    'PhysicalActivity',
    'GenHealth',
    'SleepTime'
]

# Khởi tạo LabelEncoder
le = LabelEncoder()

# Áp dụng LabelEncoder cho từng cột
encoders = {}
for col in cols_to_encode:
    post_data[col] = le.fit_transform(post_data[col])
    encoders[col] = le

In [17]:
post_data.head(10)

Unnamed: 0,HeartDisease,BMI,PhysicalHealth,DiffWalking,AgeCategory,PhysicalActivity,GenHealth,SleepTime
0,0,5,3,0,7,1,4,4
1,0,0,0,0,12,1,4,6
2,0,4,20,0,9,1,1,7
3,0,0,0,0,11,0,2,5
4,0,0,28,1,4,1,4,7
5,1,4,6,1,11,0,1,11
6,0,0,15,0,10,1,1,3
7,0,1,5,1,12,0,2,8
8,0,4,0,0,12,0,1,4
9,0,3,0,1,9,1,2,9


In [18]:
X = post_data.drop(columns=["HeartDisease"])
y = post_data["HeartDisease"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train CategoricalNB Model
model = CategoricalNB()
model.fit(X_train, y_train)

# Evaluate Model
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nĐộ chính xác của mô hình: {accuracy*100:.2f}%")

print("\nMa trận nhầm lẫn (Confusion Matrix):")
print(confusion_matrix(y_test, y_pred))


Độ chính xác của mô hình: 87.88%

Ma trận nhầm lẫn (Confusion Matrix):
[[54454  3998]
 [ 3756  1751]]


# 3. Tiền xử lý và train dữ liệu không xử lý mất cần bằng

In [19]:
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
X_train_resample, X_test_resample, y_train_resample, y_test_resample = train_test_split(X_resampled, y_resampled, test_size=0.2)

# Train CategoricalNB Model
model = CategoricalNB()
model.fit(X_train_resample, y_train_resample)

# Evaluate Model
y_pred_resample = model.predict(X_test_resample)

accuracy = accuracy_score(y_test_resample, y_pred_resample)
print(f"\nĐộ chính xác của mô hình: {accuracy*100:.2f}%")

print("\nMa trận nhầm lẫn (Confusion Matrix):")
print(confusion_matrix(y_test_resample, y_pred_resample))



Độ chính xác của mô hình: 71.77%

Ma trận nhầm lẫn (Confusion Matrix):
[[4220 1303]
 [1788 3639]]
