# Heart Disease - Preprocessing & Analysis

## Contents


In [11]:
# # Install Python libraries
# %pip install -U numpy
# %pip install -U pandas
# %pip install -U seaborn
# %pip install -U matplotlib
# %pip install -U scikit-learn
# %pip install -U imbalanced-learn

In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import statistics as stats # https://docs.python.org/3/library/statistics.html#statistics.fmean
#import scipy.stats as spstats
import matplotlib.pyplot as plt

### Validation & Normalization methods ###
from sklearn.model_selection import cross_validate, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

### ML models ###
from sklearn.linear_model import LogisticRegression, SGDClassifier # C1 loss: log_loss => LogisticRegression with SGD
from sklearn.linear_model import Perceptron # C2
from sklearn.svm import SVC # C3
from sklearn.svm import LinearSVC # C4
from sklearn.tree import DecisionTreeClassifier # C5
from sklearn.ensemble import RandomForestClassifier # C6
from sklearn.neural_network import MLPClassifier # C7

### Metrics ###
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, make_scorer
from imblearn.metrics import geometric_mean_score # https://imbalanced-learn.org/stable/references/generated/imblearn.metrics.geometric_mean_score.html
import time
import timeit # https://stackoverflow.com/questions/17579357/time-time-vs-timeit-timeit

### Pipeline ###
from sklearn.pipeline import make_pipeline , Pipeline # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

### Custom Modules ###
from functions.data_types import optimize_dtypes
from functions.dataframe_actions import df_info, df_clean
from functions.ml_training import train_classifiers, train_classifiers_tuned

### Other configurations ###
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
# #import warnings library
# import warnings
# # ignore all warnings
# warnings.filterwarnings('ignore')

In [13]:
# to read .csv files from another directory
data_location = "../Data/" # "/<path>"

df = pd.read_csv(data_location + "heart_2022_with_nans.csv")
df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis,HadDiabetes,DeafOrHardOfHearing,BlindOrVisionDifficulty,DifficultyConcentrating,DifficultyWalking,DifficultyDressingBathing,DifficultyErrands,SmokerStatus,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,Never smoked,Not at all (right now),No,"White only, Non-Hispanic",Age 80 or older,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 80 or older,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 55 to 59,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,No,No,Yes,No,No,No,No,Yes,No,No,No,No,No,No,No,Current smoker - now smokes some days,Never used e-cigarettes in my entire life,Yes,"White only, Non-Hispanic",,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,"White only, Non-Hispanic",Age 40 to 44,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


* Put label column `HadHeartAttack` in the end of the dataframe

In [14]:
df.insert(len(df.columns)-1, 'HadHeartAttack', df.pop('HadHeartAttack'))

* Drop missing values from the label column

In [38]:
# Drop rows with missing values in 'HadHeartAttack' column
df.dropna(subset=['HadHeartAttack'], inplace=True)

* DataFrame information

In [39]:
dataframe = [df]
dataframe_name = ["cvd"]
df_info(dataframe,dataframe_name)

----- information for  cvd  -----
cvd  :  (442067, 40) (rows, columns)
cvd  :  882386 missing values
cvd  :  152 duplicate values
cvd  : Value counts for  HadHeartAttack
HadHeartAttack
No     416959
Yes     25108
Name: count, dtype: int64


* Investigate the missing values

In [35]:
# Concatenate isna().sum() and dtypes
info_df = pd.concat([df.isna().sum(), df.dtypes], axis=1)
info_df.columns = ['Missing Values', 'Data Type']
info_df
print(info_df)

                           Missing Values Data Type
State                                   0    object
Sex                                     0    object
GeneralHealth                        1095    object
PhysicalHealthDays                  10597   float64
MentalHealthDays                     8792   float64
LastCheckupTime                      8041    object
PhysicalActivities                    972    object
SleepHours                           5196   float64
RemovedTeeth                        11010    object
HadAngina                            3588    object
HadStroke                            1070    object
HadAsthma                            1437    object
HadSkinCancer                        2764    object
HadCOPD                              1838    object
HadDepressiveDisorder                2421    object
HadKidneyDisease                     1614    object
HadArthritis                         2313    object
HadDiabetes                           813    object
DeafOrHardOf

* analyse missing values for each column separately

In [36]:
df['GeneralHealth'].value_counts()

GeneralHealth
Very good    147967
Good         142550
Excellent     71623
Fair          59528
Poor          19304
Name: count, dtype: int64

In [29]:
# null_data = df[df.isnull().any(axis=1)]
# null_data.head()

In [31]:
# Separate object and numeric columns
object_columns = df.select_dtypes(include=['object']).columns
numeric_columns = df.select_dtypes(include=['float64']).columns

* the attribute mean for all samples belonging to the same class: smarter