# Heart Disease - Preprocessing & Analysis

## Contents


In [1]:
# # Install Python libraries
# %pip install -U numpy
# %pip install -U pandas
# %pip install -U seaborn
# %pip install -U matplotlib
# %pip install -U scikit-learn
# %pip install -U imbalanced-learn

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import statistics as stats # https://docs.python.org/3/library/statistics.html#statistics.fmean
#import scipy.stats as spstats
import matplotlib.pyplot as plt

### Validation & Normalization methods ###
from sklearn.model_selection import cross_validate, StratifiedKFold, RepeatedStratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

### ML models ###
from sklearn.linear_model import LogisticRegression, SGDClassifier # C1 loss: log_loss => LogisticRegression with SGD
from sklearn.linear_model import Perceptron # C2
from sklearn.svm import SVC # C3
from sklearn.svm import LinearSVC # C4
from sklearn.tree import DecisionTreeClassifier # C5
from sklearn.ensemble import RandomForestClassifier # C6
from sklearn.neural_network import MLPClassifier # C7

### Metrics ###
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, make_scorer
from imblearn.metrics import geometric_mean_score # https://imbalanced-learn.org/stable/references/generated/imblearn.metrics.geometric_mean_score.html
import time
import timeit # https://stackoverflow.com/questions/17579357/time-time-vs-timeit-timeit

### Pipeline ###
from sklearn.pipeline import make_pipeline , Pipeline # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

### Custom Modules ###
from functions.data_types import optimize_dtypes
from functions.dataframe_actions import df_info, df_clean, show_value_counts
from functions.ml_training import train_classifiers, train_classifiers_tuned

### Other configurations ###
# Render every column when a wide DataFrame is displayed.
pd.options.display.max_columns = None
# Turn off pandas' chained-assignment warning (the default setting is 'warn').
pd.set_option('mode.chained_assignment', None)
# To silence every warning as well, uncomment the two lines below:
# import warnings
# warnings.filterwarnings('ignore')



In [3]:
# Directory (relative to this notebook) that holds the raw .csv files
data_location = "../Data/" # "/<path>"

df = pd.read_csv(f"{data_location}heart_2022_with_nans.csv")
df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis,HadDiabetes,DeafOrHardOfHearing,BlindOrVisionDifficulty,DifficultyConcentrating,DifficultyWalking,DifficultyDressingBathing,DifficultyErrands,SmokerStatus,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,Never smoked,Not at all (right now),No,"White only, Non-Hispanic",Age 80 or older,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 80 or older,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 55 to 59,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,No,No,Yes,No,No,No,No,Yes,No,No,No,No,No,No,No,Current smoker - now smokes some days,Never used e-cigarettes in my entire life,Yes,"White only, Non-Hispanic",,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,"White only, Non-Hispanic",Age 40 to 44,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [4]:
# Remove the 'State' column from the frame
df = df.drop(columns=['State'])

* Move the label column `HadHeartAttack` to the end of the DataFrame

In [5]:
# Popping the label and assigning it back appends it as the last column
df['HadHeartAttack'] = df.pop('HadHeartAttack')

* Drop missing values from the label column

In [6]:
# Keep only the rows whose 'HadHeartAttack' label is present
df = df.dropna(subset=['HadHeartAttack'])

In [7]:
# Split the frame into the target (y) and the remaining feature columns (X)
y = df['HadHeartAttack']
X = df.drop(columns=['HadHeartAttack'])

* Split the data into train and test sets first, so that missing values are investigated on the training set only.

In [8]:
# Stratified 80/20 hold-out split; the fixed random_state keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
    stratify=y,
)

* DataFrame information

In [9]:
# Re-attach the label to each split so features and target can be inspected together
df_train = pd.concat((X_train, y_train), axis='columns')
df_test = pd.concat((X_test, y_test), axis='columns')

In [10]:
# Summarise the training split with the project's df_info helper
dataframe, dataframe_name = [df_train], ["cvd"]
df_info(dataframe, dataframe_name)

----- information for  cvd  -----
cvd  :  (353653, 39) (rows, columns)
cvd  :  705758 missing values
cvd  :  1215 duplicate values
cvd  : Value counts for  HadHeartAttack
HadHeartAttack
No     333567
Yes     20086
Name: count, dtype: int64


In [11]:
# Summarise the test split with the project's df_info helper
# NOTE(review): the label "cvd" is the same one used for the training split,
# so the two printed reports are not distinguishable by name
dataframe, dataframe_name = [df_test], ["cvd"]
df_info(dataframe, dataframe_name)

----- information for  cvd  -----
cvd  :  (88414, 39) (rows, columns)
cvd  :  176628 missing values
cvd  :  115 duplicate values
cvd  : Value counts for  HadHeartAttack
HadHeartAttack
No     83392
Yes     5022
Name: count, dtype: int64


* Investigate the missing values

In [12]:
# Per-column table of missing-value counts next to each column's dtype
info_df = pd.DataFrame({
    'Missing Values': df_train.isna().sum(),
    'Data Type': df_train.dtypes,
})
print(info_df)

                           Missing Values Data Type
Sex                                     0    object
GeneralHealth                         874    object
PhysicalHealthDays                   8419   float64
MentalHealthDays                     7015   float64
LastCheckupTime                      6406    object
PhysicalActivities                    801    object
SleepHours                           4161   float64
RemovedTeeth                         8809    object
HadAngina                            2867    object
HadStroke                             821    object
HadAsthma                            1161    object
HadSkinCancer                        2212    object
HadCOPD                              1497    object
HadDepressiveDisorder                1930    object
HadKidneyDisease                     1276    object
HadArthritis                         1839    object
HadDiabetes                           662    object
DeafOrHardOfHearing                 16250    object
BlindOrVisio

### Convert object columns to numeric and investigate missing values

In [13]:
# Print each training-set column's dtype and value counts
# (show_value_counts is imported from functions.dataframe_actions)
show_value_counts(df_train)

Column: Sex | Data Type: object
Sex
Female    187765
Male      165888
Name: count, dtype: int64


Column: GeneralHealth | Data Type: object
GeneralHealth
Very good    118350
Good         113998
Excellent     57344
Fair          47688
Poor          15399
Name: count, dtype: int64


Column: PhysicalHealthDays | Data Type: float64
PhysicalHealthDays
0.0     213204
30.0     26171
2.0      20122
1.0      13710
3.0      12731
5.0      12119
10.0      8478
7.0       7382
15.0      6942
4.0       6729
20.0      4217
14.0      3984
6.0       1995
25.0      1722
8.0       1399
12.0       891
21.0       843
28.0       606
9.0        318
29.0       270
18.0       207
16.0       205
27.0       156
17.0       153
11.0       138
13.0       134
22.0       105
24.0        98
26.0        86
23.0        80
19.0        39
Name: count, dtype: int64


Column: MentalHealthDays | Data Type: float64
MentalHealthDays
0.0     210781
30.0     21310
2.0      18965
5.0      15919
10.0     12286
3.0      12259
15.0 

In [None]:
# Encode the binary/ordinal categorical features of the training set as integers.
#
# Series.map turns every value that is not a key of the lexicon into NaN without
# any warning, so the keys must match the raw labels exactly (case-sensitive):
# the dataset spells the health level 'Very good', not 'Very Good'.
ordinal_encodings = {
    'Sex': {'Female': 0, 'Male': 1},
    'GeneralHealth': {'Poor': 0, 'Fair': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4},
    'LastCheckupTime': {
        '5 or more years ago': 0,
        'Within past 5 years (2 years but less than 5 years ago)': 1,
        'Within past 2 years (1 year but less than 2 years ago)': 2,
        'Within past year (anytime less than 12 months ago)': 3,
    },
    # NOTE(review): '1 to 5' and 'None of them' are taken from the public dataset
    # description, not from output shown in this notebook; the check below
    # raises if any label is spelled differently.
    'RemovedTeeth': {'All': 0, '6 or more, but not all': 1, '1 to 5': 2, 'None of them': 3},
}

for column, map_lexicon in ordinal_encodings.items():
    # Skip columns that are already numeric so that re-running this cell does
    # not map the integer codes to NaN.
    if pd.api.types.is_numeric_dtype(df_train[column]):
        continue

    # Fail loudly instead of silently converting unknown labels to NaN.
    unmapped_labels = set(df_train[column].dropna().unique()) - set(map_lexicon)
    if unmapped_labels:
        raise ValueError(
            f"Column '{column}' contains labels without an encoding: {sorted(unmapped_labels)}"
        )

    # Genuinely missing values (NaN) stay NaN and are handled later.
    df_train[column] = df_train[column].map(map_lexicon)

In [None]:
# null_data = df[df.isnull().any(axis=1)]
# null_data.head()

In [None]:
# Collect the names of the object-typed and the float64-typed columns
# NOTE(review): the dtypes are read from `df`, not from `df_train` — confirm
# this is intended
object_columns, numeric_columns = (
    df.select_dtypes(include=[dtype_name]).columns
    for dtype_name in ('object', 'float64')
)

* Impute each missing value with the attribute mean computed over all samples belonging to the same class — a smarter alternative to using a single global mean.