# Heart Disease - Preprocessing & Analysis

## Contents


In [1]:
# # Install Python libraries
# %pip install -U numpy
# %pip install -U pandas
# %pip install -U seaborn
# %pip install -U matplotlib
# %pip install -U scikit-learn
# %pip install -U imbalanced-learn

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import statistics as stats # https://docs.python.org/3/library/statistics.html#statistics.fmean
#import scipy.stats as spstats
import matplotlib.pyplot as plt

### Validation & Normalization methods ###
from sklearn.model_selection import cross_validate, StratifiedKFold, RepeatedStratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

### ML models ###
from sklearn.linear_model import LogisticRegression, SGDClassifier # C1 loss: log_loss => LogisticRegression with SGD
from sklearn.linear_model import Perceptron # C2
from sklearn.svm import SVC # C3
from sklearn.svm import LinearSVC # C4
from sklearn.tree import DecisionTreeClassifier # C5
from sklearn.ensemble import RandomForestClassifier # C6
from sklearn.neural_network import MLPClassifier # C7

### Metrics ###
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, make_scorer
from imblearn.metrics import geometric_mean_score # https://imbalanced-learn.org/stable/references/generated/imblearn.metrics.geometric_mean_score.html
import time
import timeit # https://stackoverflow.com/questions/17579357/time-time-vs-timeit-timeit

### Pipeline ###
from sklearn.pipeline import make_pipeline , Pipeline # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

### Custom Modules ###
from functions.data_types import optimize_dtypes
from functions.dataframe_actions import df_info, df_clean, show_value_counts, fill_missing_values
from functions.ml_training import train_classifiers, train_classifiers_tuned

### Other configurations ###
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
# #import warnings library
# import warnings
# # ignore all warnings
# warnings.filterwarnings('ignore')



In [3]:
# to read .csv files from another directory
data_location = "../Data/" # "/<path>"

df = pd.read_csv(data_location + "heart_2022_with_nans.csv")
df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis,HadDiabetes,DeafOrHardOfHearing,BlindOrVisionDifficulty,DifficultyConcentrating,DifficultyWalking,DifficultyDressingBathing,DifficultyErrands,SmokerStatus,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,Never smoked,Not at all (right now),No,"White only, Non-Hispanic",Age 80 or older,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 80 or older,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 55 to 59,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,No,No,Yes,No,No,No,No,Yes,No,No,No,No,No,No,No,Current smoker - now smokes some days,Never used e-cigarettes in my entire life,Yes,"White only, Non-Hispanic",,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,"White only, Non-Hispanic",Age 40 to 44,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


* Drop `State` column as it is irrelevant

In [4]:
df = df.drop('State', axis = 1)

* Put label column `HadHeartAttack` in the end of the dataframe

In [5]:
df.insert(len(df.columns)-1, 'HadHeartAttack', df.pop('HadHeartAttack'))

* Drop missing values from the label column

In [6]:
# Drop rows with missing values in 'HadHeartAttack' column
df.dropna(subset=['HadHeartAttack'], inplace=True)

In [7]:
# Separate target variable from feature variables
X = df.drop('HadHeartAttack', axis=1, inplace=False)  # Features
y = df['HadHeartAttack']     

* Split the data into train and test set, in order to investigate missing values only in the training set.

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=13)

* DataFrame information

In [9]:
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

In [10]:
dataframe = [df_train]
dataframe_name = ["cvd"]
df_info(dataframe,dataframe_name)

----- information for  cvd  -----
cvd  :  (353653, 39) (rows, columns)
cvd  :  705758 missing values
cvd  :  1215 duplicate values
cvd  : Value counts for  HadHeartAttack
HadHeartAttack
No     333567
Yes     20086
Name: count, dtype: int64


In [11]:
dataframe = [df_test]
dataframe_name = ["cvd"]
df_info(dataframe,dataframe_name)

----- information for  cvd  -----
cvd  :  (88414, 39) (rows, columns)
cvd  :  176628 missing values
cvd  :  115 duplicate values
cvd  : Value counts for  HadHeartAttack
HadHeartAttack
No     83392
Yes     5022
Name: count, dtype: int64


### Fill the missing values

* Investigate the missing values

In [12]:
# Concatenate isna().sum() and dtypes
info_df = pd.concat([df_train.isna().sum(), df_train.dtypes], axis=1)
info_df.columns = ['Missing Values', 'Data Type']
info_df
#print(info_df)

Unnamed: 0,Missing Values,Data Type
Sex,0,object
GeneralHealth,874,object
PhysicalHealthDays,8419,float64
MentalHealthDays,7015,float64
LastCheckupTime,6406,object
PhysicalActivities,801,object
SleepHours,4161,float64
RemovedTeeth,8809,object
HadAngina,2867,object
HadStroke,821,object


In [13]:
df_train_filled = df_train.copy()
df_train_filled = fill_missing_values(df_train_filled, 'HadHeartAttack')

Filled values:
Column: Sex
Values filled: {'No': 'Female', 'Yes': 'Male'}

Column: GeneralHealth
Values filled: {'No': 'Very good', 'Yes': 'Good'}

Column: PhysicalHealthDays
Values filled: {'No': 4.026982992184935, 'Yes': 9.411798188874515}

Column: MentalHealthDays
Values filled: {'No': 4.309441852787067, 'Yes': 5.433094262295082}

Column: LastCheckupTime
Values filled: {'No': 'Within past year (anytime less than 12 months ago)', 'Yes': 'Within past year (anytime less than 12 months ago)'}

Column: PhysicalActivities
Values filled: {'No': 'Yes', 'Yes': 'Yes'}

Column: SleepHours
Values filled: {'No': 7.022429124512262, 'Yes': 7.055462270391289}

Column: RemovedTeeth
Values filled: {'No': 'None of them', 'Yes': '1 to 5'}

Column: HadAngina
Values filled: {'No': 'No', 'Yes': 'No'}

Column: HadStroke
Values filled: {'No': 'No', 'Yes': 'No'}

Column: HadAsthma
Values filled: {'No': 'No', 'Yes': 'No'}

Column: HadSkinCancer
Values filled: {'No': 'No', 'Yes': 'No'}

Column: HadCOPD
Values 

In [14]:
# # Separate object and numeric columns
# object_columns = df.select_dtypes(include=['object']).columns
# numeric_columns = df.select_dtypes(include=['float64']).columns

In [15]:
# # Concatenate isna().sum() and dtypes
# info_df = pd.concat([df_train_filled.isna().sum(), df_train_filled.dtypes], axis=1)
# info_df.columns = ['Missing Values', 'Data Type']
# info_df

df_train_filled.isnull().sum().sum()

0

### Convert object columns to numeric and investigate missing values

In [16]:
show_value_counts(df_train_filled)

Data Type: object
Sex
Female    187765
Male      165888
Name: count, dtype: int64


Data Type: object
GeneralHealth
Very good    119134
Good         114088
Excellent     57344
Fair          47688
Poor          15399
Name: count, dtype: int64


Data Type: float64
PhysicalHealthDays
0.000000     213204
30.000000     26171
2.000000      20122
1.000000      13710
3.000000      12731
5.000000      12119
10.000000      8478
4.026983       7658
7.000000       7382
15.000000      6942
4.000000       6729
20.000000      4217
14.000000      3984
6.000000       1995
25.000000      1722
8.000000       1399
12.000000       891
21.000000       843
9.411798        761
28.000000       606
9.000000        318
29.000000       270
18.000000       207
16.000000       205
27.000000       156
17.000000       153
11.000000       138
13.000000       134
22.000000       105
24.000000        98
26.000000        86
23.000000        80
19.000000        39
Name: count, dtype: int64


Data Type: float64
MentalHealt

In [17]:
# Map the "size" ordinal feature to an integer value
map_lexicon = {'Female': 0, 'Male': 1}
df_train_filled['Sex'] = df_train_filled['Sex'].map(map_lexicon)

map_lexicon = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4}
df_train_filled['GeneralHealth'] = df_train_filled['GeneralHealth'].map(map_lexicon)

map_lexicon = {'5 or more years ago': 0, 'Within past 5 years (2 years but less than 5 years ago)': 1, 'Within past 2 years (1 year but less than 2 years ago)': 2, 'Within past year (anytime less than 12 months ago)': 3}
df_train_filled['LastCheckupTime'] = df_train_filled['LastCheckupTime'].map(map_lexicon)

map_lexicon = {'All': 0, '6 or more, but not all': 1, '1 to 5': 2, 'None of them': 3}
df_train_filled['RemovedTeeth'] = df_train_filled['RemovedTeeth'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HadAngina'] = df_train_filled['HadAngina'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HadStroke'] = df_train_filled['HadStroke'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HadAsthma'] = df_train_filled['HadAsthma'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HadSkinCancer'] = df_train_filled['HadSkinCancer'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HadCOPD'] = df_train_filled['HadCOPD'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HadDepressiveDisorder'] = df_train_filled['HadDepressiveDisorder'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HadKidneyDisease'] = df_train_filled['HadKidneyDisease'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HadArthritis'] = df_train_filled['HadArthritis'].map(map_lexicon)

map_lexicon = {'No': 0, 'No, pre-diabetes or borderline diabetes': 1, 'Yes, but only during pregnancy (female)': 2, 'Yes': 3}
df_train_filled['HadDiabetes'] = df_train_filled['HadDiabetes'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['DeafOrHardOfHearing'] = df_train_filled['DeafOrHardOfHearing'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['BlindOrVisionDifficulty'] = df_train_filled['BlindOrVisionDifficulty'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['DifficultyConcentrating'] = df_train_filled['DifficultyConcentrating'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['DifficultyWalking'] = df_train_filled['DifficultyWalking'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['DifficultyDressingBathing'] = df_train_filled['DifficultyDressingBathing'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['DifficultyErrands'] = df_train_filled['DifficultyErrands'].map(map_lexicon)

map_lexicon = {'Never smoked': 0, 'Former smoker': 1, 'Current smoker - now smokes some days': 2, 'Current smoker - now smokes every day': 3}
df_train_filled['SmokerStatus'] = df_train_filled['SmokerStatus'].map(map_lexicon)

map_lexicon = {'Never used e-cigarettes in my entire life': 0, 'Not at all (right now)': 1, 'Use them some days': 2, 'Use them every day': 3}
df_train_filled['ECigaretteUsage'] = df_train_filled['ECigaretteUsage'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['ChestScan'] = df_train_filled['ChestScan'].map(map_lexicon)

map_lexicon = {'White only, Non-Hispanic': 0, 'Hispanic': 1, 'Other race only, Non-Hispanic': 2, 'Multiracial, Non-Hispanic': 3, 'Black only, Non-Hispanic': 4}
df_train_filled['RaceEthnicityCategory'] = df_train_filled['RaceEthnicityCategory'].map(map_lexicon)

map_lexicon = {'Age 18 to 24': 1824, 'Age 25 to 29': 2529, 'Age 30 to 34': 3034, 'Age 35 to 39': 3539, 'Age 40 to 44': 4044, 'Age 45 to 49': 4549, 'Age 50 to 54': 5054, 'Age 55 to 59': 5559, 'Age 60 to 64': 6064, 'Age 65 to 69': 6569, 'Age 70 to 74': 7074, 'Age 75 to 79': 7079, 'Age 80 or older': 8000}
df_train_filled['AgeCategory'] = df_train_filled['AgeCategory'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['AlcoholDrinkers'] = df_train_filled['AlcoholDrinkers'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HIVTesting'] = df_train_filled['HIVTesting'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['FluVaxLast12'] = df_train_filled['FluVaxLast12'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['PneumoVaxEver'] = df_train_filled['PneumoVaxEver'].map(map_lexicon)

map_lexicon = {'No, did not receive any tetanus shot in the past 10 years': 0, 'Yes, received tetanus shot but not sure what type': 1, 'Yes, received tetanus shot, but not Tdap': 2, 'Yes, received Tdap': 3}
df_train_filled['TetanusLast10Tdap'] = df_train_filled['TetanusLast10Tdap'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HighRiskLastYear'] = df_train_filled['HighRiskLastYear'].map(map_lexicon)

map_lexicon = {'No': 0, 'Tested positive using home test without a health professional': 1, 'Yes': 2}
df_train_filled['CovidPos'] = df_train_filled['CovidPos'].map(map_lexicon)

map_lexicon = {'No': 0, 'Yes': 1}
df_train_filled['HadHeartAttack'] = df_train_filled['HadHeartAttack'].map(map_lexicon)

In [18]:
df_train_filled = optimize_dtypes(df_train_filled)

In [21]:
df_train_filled.dtypes

Sex                            uint8
GeneralHealth                  uint8
PhysicalHealthDays           float16
MentalHealthDays             float16
LastCheckupTime                uint8
PhysicalActivities            object
SleepHours                   float16
RemovedTeeth                   uint8
HadAngina                      uint8
HadStroke                      uint8
HadAsthma                      uint8
HadSkinCancer                  uint8
HadCOPD                        uint8
HadDepressiveDisorder          uint8
HadKidneyDisease               uint8
HadArthritis                   uint8
HadDiabetes                    uint8
DeafOrHardOfHearing            uint8
BlindOrVisionDifficulty        uint8
DifficultyConcentrating        uint8
DifficultyWalking              uint8
DifficultyDressingBathing      uint8
DifficultyErrands              uint8
SmokerStatus                   uint8
ECigaretteUsage                uint8
ChestScan                      uint8
RaceEthnicityCategory          uint8
A