In [1]:
# For classic ML models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer


import numpy as np
import pandas as pd

# For deep learning models
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [2]:
# Load the dataset
# The path is relative, so it is resolved against the notebook's working directory.
data = pd.read_csv('./HeartDisease/2022/heart_2022_with_nans.csv')

# Display the first few rows to understand the structure
data.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [3]:
# Summarise the raw frame: per-column non-null counts and dtypes (shows where values are missing).
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      445132 non-null  object 
 1   Sex                        445132 non-null  object 
 2   GeneralHealth              443934 non-null  object 
 3   PhysicalHealthDays         434205 non-null  float64
 4   MentalHealthDays           436065 non-null  float64
 5   LastCheckupTime            436824 non-null  object 
 6   PhysicalActivities         444039 non-null  object 
 7   SleepHours                 439679 non-null  float64
 8   RemovedTeeth               433772 non-null  object 
 9   HadHeartAttack             442067 non-null  object 
 10  HadAngina                  440727 non-null  object 
 11  HadStroke                  443575 non-null  object 
 12  HadAsthma                  443359 non-null  object 
 13  HadSkinCancer              44

In [4]:
# Collect the distinct values of every column (NaN is reported as a value too).
unique_values_per_column = {column: data[column].unique() for column in data.columns}

# Print the distinct values column by column.
for column, unique_values in unique_values_per_column.items():
    print(f"{column} 列的唯一值：{unique_values}")

State 列的唯一值：['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'
 'Virgin Islands']
Sex 列的唯一值：['Female' 'Male']
GeneralHealth 列的唯一值：['Very good' 'Excellent' 'Fair' 'Poor' 'Good' nan]
PhysicalHealthDays 列的唯一值：[ 0.  2.  1.  8.  5. 30.  4. 23. 14. nan 15.  3. 10.  7. 25.  6. 21. 20.
 29. 16.  9. 27. 28. 12. 13. 11. 26. 17. 24. 19. 18. 22.]
MentalHealthDays 列的唯一值：[ 0.  3.  9.  5. 15. 20. 14. 10. 18.  1. nan  2. 30.  4.  6.  7. 2

In [5]:
# Flag every row that exactly repeats an earlier row (the first occurrence stays False),
# then count flagged vs. unflagged rows.
duplicates = data.duplicated()
print(duplicates.value_counts())

False    444975
True        157
Name: count, dtype: int64


In [6]:
# keep=False flags every member of a duplicate group, including the first occurrence,
# so all copies are listed together for inspection.
duplicate_rows = data[data.duplicated(keep=False)]
print(duplicate_rows)

             State     Sex GeneralHealth  PhysicalHealthDays  \
4712        Alaska    Male     Very good                 0.0   
7310        Alaska    Male     Very good                 0.0   
10696      Arizona  Female     Excellent                 0.0   
11503      Arizona  Female     Excellent                 0.0   
26789   California    Male     Excellent                 0.0   
...            ...     ...           ...                 ...   
428370   Wisconsin    Male          Good                 0.0   
431351   Wisconsin    Male          Good                 0.0   
431653   Wisconsin    Male     Excellent                 0.0   
433340     Wyoming  Female          Good                 0.0   
433508     Wyoming  Female          Good                 0.0   

        MentalHealthDays                                    LastCheckupTime  \
4712                 0.0  Within past year (anytime less than 12 months ...   
7310                 0.0  Within past year (anytime less than 12 months .

In [7]:
# Remove exact duplicate rows (the first occurrence of each is kept) into a new frame,
# leaving `data` untouched, and report how many rows remain.
data_unique = data.drop_duplicates()
print(data_unique.shape[0])

444975


In [8]:
# Row count before any incomplete records are removed.
original_row_count = len(data_unique)

# Keep only the rows that have a value in every column.
data_cleaned = data_unique.dropna()
cleaned_row_count = len(data_cleaned)

# Report how many rows survived and how many were discarded.
deleted_row_count = original_row_count - cleaned_row_count
print(f"填充并删除缺失值行后，剩余的行数: {cleaned_row_count}")
print(f"删除的行数: {deleted_row_count}")

填充并删除缺失值行后，剩余的行数: 246013
删除的行数: 198962


In [9]:
# Confirm that every column of the cleaned frame is now fully populated.
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 246013 entries, 342 to 445130
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246013 non-null  object 
 1   Sex                        246013 non-null  object 
 2   GeneralHealth              246013 non-null  object 
 3   PhysicalHealthDays         246013 non-null  float64
 4   MentalHealthDays           246013 non-null  float64
 5   LastCheckupTime            246013 non-null  object 
 6   PhysicalActivities         246013 non-null  object 
 7   SleepHours                 246013 non-null  float64
 8   RemovedTeeth               246013 non-null  object 
 9   HadHeartAttack             246013 non-null  object 
 10  HadAngina                  246013 non-null  object 
 11  HadStroke                  246013 non-null  object 
 12  HadAsthma                  246013 non-null  object 
 13  HadSkinCancer              24601

In [10]:
# List the distinct labels of each text (object-dtype) column; these are the
# categories that need an integer code before modelling.
for col in data_cleaned.columns:
    column_values = data_cleaned[col]
    # Numeric columns need no encoding, so skip them.
    if column_values.dtype != 'object':
        continue
    unique_values = column_values.unique()
    print(f"Column '{col}' unique values: {unique_values}")

Column 'State' unique values: ['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'
 'Virgin Islands']
Column 'Sex' unique values: ['Female' 'Male']
Column 'GeneralHealth' unique values: ['Very good' 'Fair' 'Good' 'Excellent' 'Poor']
Column 'LastCheckupTime' unique values: ['Within past year (anytime less than 12 months ago)'
 '5 or more years ago'
 'Within past 2 years (1 year but less than 2 years ago)'
 'Within past 5 year

In [11]:
# Label -> integer code lookup tables, one per categorical column.
# NOTE(review): the codes for GeneralHealth, LastCheckupTime, RemovedTeeth, SmokerStatus,
# ECigaretteUsage and TetanusLast10Tdap are arbitrary, not ranked (e.g. 'Very good' = 0,
# 'Poor' = 4, 'Excellent' = 3). A later cell feeds these codes to a Pearson correlation,
# which treats them as magnitudes — confirm whether an ordered or one-hot encoding is wanted.
state_mapping = {state: idx for idx, state in enumerate([
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
    'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming', 'Guam', 'Puerto Rico',
    'Virgin Islands'])}

sex_mapping = {'Female': 0, 'Male': 1}

general_health_mapping = {
    'Very good': 0, 'Fair': 1, 'Good': 2, 'Excellent': 3, 'Poor': 4
}

last_checkup_time_mapping = {
    'Within past year (anytime less than 12 months ago)': 0,
    '5 or more years ago': 1,
    'Within past 2 years (1 year but less than 2 years ago)': 2,
    'Within past 5 years (2 years but less than 5 years ago)': 3
}

physical_activities_mapping = {'Yes': 1, 'No': 0}

removed_teeth_mapping = {
    'None of them': 0, '6 or more, but not all': 1, '1 to 5': 2, 'All': 3
}

# Every plain Yes/No column shares this mapping.
yes_no_mapping = {'Yes': 1, 'No': 0}

# Columns with additional answer options.
had_diabetes_mapping = {
    'No': 0, 'Yes': 1,
    'Yes, but only during pregnancy (female)': 2,
    'No, pre-diabetes or borderline diabetes': 3
}

smoker_status_mapping = {
    'Former smoker': 0, 'Never smoked': 1,
    'Current smoker - now smokes every day': 2,
    'Current smoker - now smokes some days': 3
}

e_cigarette_usage_mapping = {
    'Never used e-cigarettes in my entire life': 0,
    'Use them some days': 1, 'Not at all (right now)': 2,
    'Use them every day': 3
}

race_ethnicity_mapping = {
    'White only, Non-Hispanic': 0, 'Black only, Non-Hispanic': 1,
    'Other race only, Non-Hispanic': 2, 'Multiracial, Non-Hispanic': 3,
    'Hispanic': 4
}

age_category_mapping = {
    'Age 18 to 24': 0, 'Age 25 to 29': 1, 'Age 30 to 34': 2, 'Age 35 to 39': 3,
    'Age 40 to 44': 4, 'Age 45 to 49': 5, 'Age 50 to 54': 6, 'Age 55 to 59': 7,
    'Age 60 to 64': 8, 'Age 65 to 69': 9, 'Age 70 to 74': 10, 'Age 75 to 79': 11,
    'Age 80 or older': 12
}

tetanus_last_10_td_mapping = {
    'Yes, received Tdap': 0, 'Yes, received tetanus shot but not sure what type': 1,
    'No, did not receive any tetanus shot in the past 10 years': 2,
    'Yes, received tetanus shot, but not Tdap': 3
}

covid_pos_mapping = {
    'No': 0, 'Yes': 1,
    'Tested positive using home test without a health professional': 2
}

# Columns whose only answers are 'Yes' / 'No'.
yes_no_columns = [
    'PhysicalActivities', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma',
    'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease',
    'HadArthritis', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
    'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing',
    'DifficultyErrands', 'ChestScan', 'AlcoholDrinkers', 'HIVTesting',
    'FluVaxLast12', 'PneumoVaxEver', 'HighRiskLastYear',
]

# Single table of column -> mapping, replacing one hand-written assignment per column.
column_mappings = {
    'State': state_mapping,
    'Sex': sex_mapping,
    'GeneralHealth': general_health_mapping,
    'LastCheckupTime': last_checkup_time_mapping,
    'RemovedTeeth': removed_teeth_mapping,
    'HadDiabetes': had_diabetes_mapping,
    'SmokerStatus': smoker_status_mapping,
    'ECigaretteUsage': e_cigarette_usage_mapping,
    'RaceEthnicityCategory': race_ethnicity_mapping,
    'AgeCategory': age_category_mapping,
    'TetanusLast10Tdap': tetanus_last_10_td_mapping,
    'CovidPos': covid_pos_mapping,
}
column_mappings.update({column: yes_no_mapping for column in yes_no_columns})

# Work on an owned copy so the column assignments below cannot raise
# SettingWithCopyWarning or write through to the frame data_cleaned was derived from.
data_cleaned = data_cleaned.copy()

for column, mapping in column_mappings.items():
    values = data_cleaned[column]
    # Re-running this cell must not map already-encoded integers to NaN, so only
    # translate columns that still hold something other than their target codes.
    if not values.isin(list(mapping.values())).all():
        encoded = values.map(mapping)
        # Series.map yields NaN for labels missing from the mapping; fail loudly
        # instead of silently introducing missing values.
        unmapped = values[encoded.isna()].unique()
        if len(unmapped) > 0:
            raise ValueError(f"Column '{column}' has values without a mapping: {list(unmapped)}")
        values = encoded
    # Store real integers rather than an object-dtype column of Python ints.
    data_cleaned[column] = values.astype('int64')

# Check the encoded result.
print(data_cleaned.head())


    State Sex GeneralHealth  PhysicalHealthDays  MentalHealthDays  \
342     0   0             0                 4.0               0.0   
343     0   1             0                 0.0               0.0   
345     0   1             0                 0.0               0.0   
346     0   0             1                 5.0               0.0   
347     0   0             2                 3.0              15.0   

    LastCheckupTime PhysicalActivities  SleepHours RemovedTeeth  \
342               0                  1         9.0            0   
343               0                  1         6.0            0   
345               0                  0         8.0            1   
346               0                  1         9.0            0   
347               0                  1         5.0            2   

    HadHeartAttack  ... HeightInMeters WeightInKilograms    BMI  \
342              0  ...           1.60             71.67  27.99   
343              0  ...           1.78          

In [12]:
# Persist the cleaned frame. index=True writes the row labels as the first CSV column,
# so each saved row keeps the label it had before rows were dropped.
data_cleaned.to_csv('data_cleaned.csv', index=True)

In [18]:
# Pairwise correlations between all columns of the cleaned frame.
correlation_matrix = data_cleaned.corr()

# Correlation of every other column with the target, in column order.
target_correlations = correlation_matrix.loc['HadHeartAttack'].drop('HadHeartAttack')

# Keep the columns whose absolute correlation with the target is at least 0.05.
strong_correlation_cols = target_correlations.index[target_correlations.abs() >= 0.05].tolist()

# Select features (X) and target (y)
y = data_cleaned['HadHeartAttack']
X = data_cleaned[strong_correlation_cols]

strong_correlation_cols

['Sex',
 'GeneralHealth',
 'PhysicalHealthDays',
 'LastCheckupTime',
 'PhysicalActivities',
 'RemovedTeeth',
 'HadAngina',
 'HadStroke',
 'HadCOPD',
 'HadKidneyDisease',
 'HadArthritis',
 'HadDiabetes',
 'DeafOrHardOfHearing',
 'BlindOrVisionDifficulty',
 'DifficultyConcentrating',
 'DifficultyWalking',
 'DifficultyDressingBathing',
 'DifficultyErrands',
 'ChestScan',
 'AgeCategory',
 'AlcoholDrinkers',
 'PneumoVaxEver']

In [None]:
def remove_outliers_from_dataframe(df, columns=None, iqr_factor=1.5):
    """Drop rows whose values lie outside the IQR fences of selected columns.

    For each selected column present in ``df`` (visited in ``df``'s column
    order) the quartiles Q1 and Q3 are computed on the rows still remaining,
    and only rows with ``Q1 - iqr_factor * IQR <= value <= Q3 + iqr_factor * IQR``
    are kept. Rows holding NaN in a checked column fail that comparison and
    are therefore dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    columns : iterable of str, optional
        Names of the numeric columns to check. Names absent from ``df`` are
        ignored. Defaults to ``BMI`` plus the health-day and sleep columns
        under both the names used by the dataset loaded in this notebook
        (``PhysicalHealthDays``, ``MentalHealthDays``, ``SleepHours``) and the
        previously hard-coded names (``PhysicalHealth``, ``MentalHealth``,
        ``SleepTime``), which match no column of that dataset.
    iqr_factor : float, optional
        Multiple of the inter-quartile range used for the fences (default 1.5).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass every column's fence check.
    """
    if columns is None:
        columns = (
            'BMI',
            # Names used by the dataset this notebook loads.
            'PhysicalHealthDays', 'MentalHealthDays', 'SleepHours',
            # Previously hard-coded names, kept so frames using them are still filtered.
            'PhysicalHealth', 'MentalHealth', 'SleepTime',
        )
    selected = set(columns)

    for column in df.columns:
        if column not in selected:
            continue
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - iqr_factor * IQR
        upper_bound = Q3 + iqr_factor * IQR

        # between() is inclusive on both ends, matching >= lower and <= upper.
        df = df[df[column].between(lower_bound, upper_bound)]
    return df

# NOTE(review): the input here is data_unique (only de-duplicated) and the result rebinds
# data_cleaned, so the NaN-dropped, integer-encoded frame built in earlier cells is replaced
# by one that still holds text labels and missing values — confirm this is intended.
data_cleaned = remove_outliers_from_dataframe(data_unique)
print(data_cleaned.shape[0])