In [1]:
%matplotlib inline

In [2]:
#Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

In [3]:
# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (14, 6)

# Heart Disease Analysis and Prediction - Draft

### 1. Data Acquisition

In [4]:
# TODO: use the other dataset (processed.cleveland.data) -> rename it and clean it much more carefully
data_path = os.path.join("data", "heart-disease.data")
# The file has no header row, so pandas labels the columns 0..13.
heart_data = pd.read_csv(data_path, header = None)
# Fail fast with a clear message if the file is not the expected table;
# the column renaming further down assigns exactly 14 names.
assert not heart_data.empty, f"No rows were read from {data_path}"
assert heart_data.shape[1] == 14, f"Expected 14 columns in {data_path}, got {heart_data.shape[1]}"

In [5]:
# Peek at five random rows; random_state pins the draw so the displayed sample
# is the same on every Restart & Run All.
heart_data.sample(5, random_state = 42)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
217,46.0,0.0,4.0,138.0,243.0,0.0,2.0,152.0,1.0,0.0,2.0,0.0,3.0,0
81,53.0,0.0,4.0,130.0,264.0,0.0,2.0,143.0,0.0,0.4,2.0,0.0,3.0,0
280,57.0,1.0,4.0,110.0,335.0,0.0,0.0,143.0,1.0,3.0,2.0,1.0,7.0,2
104,49.0,1.0,3.0,120.0,188.0,0.0,0.0,139.0,0.0,2.0,2.0,3.0,7.0,3
26,58.0,0.0,3.0,120.0,340.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0,3.0,0


In [6]:
# Total memory footprint of the raw dataframe in kilobytes (bytes / 1000).
# memory_usage() is called without deep=True, so the object columns are
# counted by their pointers only, not by the strings they hold.
heart_data.memory_usage().sum() / 1000

34.064

In [7]:
# Replace the positional labels 0..13 with descriptive column names.
# NOTE(review): presumably these follow the attribute order of the UCI Cleveland
# file (age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak,
# slope, ca, thal, num) - confirm against the dataset documentation.
# TODO: describe what each variable means (see the documentation and the research notes)
# TODO: rename some columns
heart_data.columns = [
    "age", "sex", "chest_pain_type", "rest_blood_pressure", "cholesterol", "diabetes", "ecg_rest", "heart_rate_max",
    "exercise_angina", "oldpeak", "ST_slope", "vessels", "blood_flow", "heart_disease"]

In [8]:
# Inspect the inferred dtypes. Everything is read as float64 except "vessels"
# and "blood_flow" (object) and "heart_disease" (int64); many of the float
# columns are really categorical codes.
# TODO: check the range and validity of each variable step by step (see Data Cleaning).
heart_data.dtypes

age                    float64
sex                    float64
chest_pain_type        float64
rest_blood_pressure    float64
cholesterol            float64
diabetes               float64
ecg_rest               float64
heart_rate_max         float64
exercise_angina        float64
oldpeak                float64
ST_slope               float64
vessels                 object
blood_flow              object
heart_disease            int64
dtype: object

In [9]:
# Count NaN values per column.
# Note: this reports 0 everywhere, but missing entries in this file are encoded
# as the string "?" (see the "vessels" and "blood_flow" cells below), which
# isnull() does not detect.
heart_data.isnull().sum()

age                    0
sex                    0
chest_pain_type        0
rest_blood_pressure    0
cholesterol            0
diabetes               0
ecg_rest               0
heart_rate_max         0
exercise_angina        0
oldpeak                0
ST_slope               0
vessels                0
blood_flow             0
heart_disease          0
dtype: int64

### 2. Data Cleaning

In [10]:
# First pass: missing-value cleaning.
# "vessels": inspect the value distribution; "?" marks a missing entry.
heart_data["vessels"].value_counts() # Missing entries are replaced with 0, the most frequent value

0.0    176
1.0     65
2.0     38
3.0     20
?        4
Name: vessels, dtype: int64

In [11]:
# Four rows carry the "?" missing-value marker; impute them with the most
# frequent value (0 vessels) and convert the column to integers.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: that chained in-place form emits a FutureWarning on
# recent pandas and leaves the frame unchanged when Copy-on-Write is enabled.
heart_data["vessels"] = pd.to_numeric(heart_data["vessels"].replace("?", "0.0")).astype(np.int64)

In [12]:
# "blood_flow" is stored as strings; list its distinct values to spot the missing-value marker.
heart_data["blood_flow"].unique()

array(['6.0', '3.0', '7.0', '?'], dtype=object)

In [13]:
# Show the rows whose "blood_flow" value is missing ("?").
heart_data.loc[heart_data["blood_flow"] == "?"]

Unnamed: 0,age,sex,chest_pain_type,rest_blood_pressure,cholesterol,diabetes,ecg_rest,heart_rate_max,exercise_angina,oldpeak,ST_slope,vessels,blood_flow,heart_disease
87,53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0,?,0
266,52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0,?,2


In [14]:
# Overall frequency of each "blood_flow" value.
heart_data["blood_flow"].value_counts()

3.0    166
7.0    117
6.0     18
?        2
Name: blood_flow, dtype: int64

In [15]:
# Frequency of each "blood_flow" value among the rows with heart_disease == 0.
heart_data.loc[heart_data["heart_disease"] == 0, "blood_flow"].value_counts()

3.0    129
7.0     28
6.0      6
?        1
Name: blood_flow, dtype: int64

In [16]:
# Frequency of each "blood_flow" value among the rows with heart_disease == 2.
heart_data.loc[heart_data["heart_disease"] == 2, "blood_flow"].value_counts()

7.0    22
3.0     7
6.0     6
?       1
Name: blood_flow, dtype: int64

In [17]:
# Impute the two missing "blood_flow" entries with the most frequent value of
# their diagnosis group: "3.0" for heart_disease == 0 and "7.0" for heart_disease == 2.
# NOTE(review): this uses the target to fill in a feature, which may leak label
# information into a later model - confirm this is acceptable.
heart_data.loc[(heart_data["blood_flow"] == "?") & (heart_data["heart_disease"] == 0), "blood_flow"] = "3.0"
heart_data.loc[(heart_data["blood_flow"] == "?") & (heart_data["heart_disease"] == 2), "blood_flow"] = "7.0"

In [18]:
# Clean continuous variables
# Age: loaded as float64; cast to integers and check the observed range.
heart_data["age"] = heart_data["age"].astype(np.int64)
heart_data["age"].min(), heart_data["age"].max() # -> The range looks valid

(29, 77)

In [19]:
# Resting blood pressure (continuous variable): cast to integers and check the observed range.
heart_data["rest_blood_pressure"] = heart_data["rest_blood_pressure"].astype(np.int64)
heart_data["rest_blood_pressure"].min(), heart_data["rest_blood_pressure"].max()

(94, 200)

In [20]:
# Cholesterol: cast to integers and check the observed range.
heart_data["cholesterol"] = heart_data["cholesterol"].astype(np.int64)
heart_data["cholesterol"].min(), heart_data["cholesterol"].max()   # TODO: interpret the range - research normal, high and low values

(126, 564)

In [21]:
# Maximum heart rate: cast to integers and check the observed range.
heart_data["heart_rate_max"] = heart_data["heart_rate_max"].astype(np.int64)
heart_data["heart_rate_max"].min(), heart_data["heart_rate_max"].max() # OK

(71, 202)

In [22]:
#TODO -> clean oldpeak - research it too

In [23]:
# Clean categorical variables

# Sex: list the distinct codes present in the column.
heart_data["sex"].unique()  # TODO: interpret the values

array([1., 0.])

In [24]:
# Transform sex to a categorical variable: cast the float codes to int32,
# then wrap them in pd.Categorical.
sex_clean = heart_data["sex"].astype(np.int32)
heart_data["sex"] = pd.Categorical(sex_clean)

In [25]:
# Chest pain type: four distinct codes -> treat it as a category.
heart_data["chest_pain_type"].unique()  # TODO: interpret the values

array([1., 4., 3., 2.])

In [26]:
# Cast the float codes to int32 and store the column as a categorical.
chest_pain_clean = heart_data["chest_pain_type"].astype(np.int32)
heart_data["chest_pain_type"] = pd.Categorical(chest_pain_clean)

In [27]:
# TODO -> clean resting ECG
# List the distinct resting-ECG codes present in the column.
heart_data["ecg_rest"].unique()

array([2., 0., 1.])

In [28]:
# Cast the float codes to int32 and store the column as a categorical.
ecg_clean = heart_data["ecg_rest"].astype(np.int32)
heart_data["ecg_rest"] = pd.Categorical(ecg_clean)

In [29]:
# List the distinct ST-slope codes present in the column.
heart_data["ST_slope"].unique() # TODO: interpret the values -> show an image

array([3., 2., 1.])

In [30]:
# Cast the float codes to int32 and store the column as a categorical.
st_slope_clean = heart_data["ST_slope"].astype(np.int32)
heart_data["ST_slope"] = pd.Categorical(st_slope_clean)

In [31]:
# "blood_flow" still holds strings such as "3.0": parse them to numbers,
# cast to int32 and store the column as a categorical.
blood_flow_clean = pd.to_numeric(heart_data["blood_flow"]).astype(np.int32)
heart_data["blood_flow"] = pd.Categorical(blood_flow_clean)

In [32]:
# Clean boolean variables

# Diabetes flag: list the distinct codes present in the column.
heart_data["diabetes"].unique()

array([1., 0.])

In [33]:
# Cast the float codes to int32 and store the column as a categorical.
diabetes_clean = heart_data["diabetes"].astype(np.int32)
heart_data["diabetes"] = pd.Categorical(diabetes_clean)

In [34]:
# Exercise-induced angina flag: list the distinct codes present in the column.
heart_data["exercise_angina"].unique()

array([0., 1.])

In [35]:
# Cast the float codes to int32 and store the column as a categorical.
exercise_angina_clean = heart_data["exercise_angina"].astype(np.int32)
heart_data["exercise_angina"] = pd.Categorical(exercise_angina_clean)

In [36]:
# Clean the target
# The raw target is integer valued from 0 (no presence) to 4. Experiments with the Cleveland database have
#      concentrated on simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0).


# Diagnosis of heart disease (angiographic disease status)
# -- Value 0: < 50% diameter narrowing
# -- Value 1: > 50% diameter narrowing
# Presence: values 1-4; absence: value 0

heart_data["heart_disease"].unique()

array([0, 2, 1, 3, 4], dtype=int64)

In [37]:
def check_heart_disease_presence(heart_disease_rate):
    """Collapse a diagnosis score into a binary presence flag.

    Returns 1 when the score differs from 0 and 0 otherwise.
    """
    if heart_disease_rate != 0:
        return 1
    return 0

# Binarize the target: 0 stays 0 (absence), every other score becomes 1 (presence).
heart_data["heart_disease"] = heart_data["heart_disease"].apply(check_heart_disease_presence)

In [38]:
# Summarize dtypes, non-null counts and memory usage after cleaning.
heart_data.info() # TODO: compare with the initial result right after reading the file

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  303 non-null    int64   
 1   sex                  303 non-null    category
 2   chest_pain_type      303 non-null    category
 3   rest_blood_pressure  303 non-null    int64   
 4   cholesterol          303 non-null    int64   
 5   diabetes             303 non-null    category
 6   ecg_rest             303 non-null    category
 7   heart_rate_max       303 non-null    int64   
 8   exercise_angina      303 non-null    category
 9   oldpeak              303 non-null    float64 
 10  ST_slope             303 non-null    category
 11  vessels              303 non-null    int64   
 12  blood_flow           303 non-null    category
 13  heart_disease        303 non-null    int64   
dtypes: category(7), float64(1), int64(6)
memory usage: 19.5 KB


In [39]:
# Display the cleaned frame (pandas truncates the rendering of long frames).
heart_data

Unnamed: 0,age,sex,chest_pain_type,rest_blood_pressure,cholesterol,diabetes,ecg_rest,heart_rate_max,exercise_angina,oldpeak,ST_slope,vessels,blood_flow,heart_disease
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0,7,1
299,68,1,4,144,193,1,0,141,0,3.4,2,2,7,1
300,57,1,4,130,131,0,0,115,1,1.2,2,1,7,1
301,57,0,2,130,236,0,2,174,0,0.0,2,1,3,1


In [40]:
# Check for duplicates: count the rows that repeat an earlier row.
# duplicated().sum() is used instead of `.size` on the filtered frame, because
# `.size` counts cells (rows x columns), so a single duplicated row would have
# been reported as 14 rather than 1.
heart_data.duplicated().sum()

0

In [41]:
# Data cleaning for the exploratory analysis.
# Work on a deep copy so that the text labels assigned to heart_data_eda below
# do not alter the numerically coded heart_data.
heart_data_eda = heart_data.copy(deep = True)

In [42]:

# Transform sex to categorical variable
def convert_sex_to_text(sex):
    """Translate the numeric sex code into a label.

    Returns "Male" when the code equals 1 and "Female" for any other value.
    """
    if sex == 1:
        return "Male"
    return "Female"

# Replace the 0/1 codes with text labels and store the result as a categorical.
sex_clean = heart_data_eda["sex"].apply(convert_sex_to_text)
heart_data_eda["sex"] = pd.Categorical(sex_clean)

In [43]:
def convert_chest_pain_type_to_text(pain_type):
    """Translate a chest pain code (1-4) into its text label.

    Returns None for any code outside 1-4.
    """
    labels_in_code_order = (
        "typical angina",
        "atypical angina",
        "non-anginal pain",
        "asymptomatic",
    )
    # Codes start at 1, so number the labels from 1.
    label_by_code = dict(enumerate(labels_in_code_order, start=1))
    return label_by_code.get(pain_type)

# Replace the numeric chest pain codes with text labels and store the result as a categorical.
chest_pain_clean = heart_data_eda["chest_pain_type"].apply(convert_chest_pain_type_to_text)
heart_data_eda["chest_pain_type"] = pd.Categorical(chest_pain_clean)

In [44]:
def convert_ecg_rest_to_text(ecg_rest):
    """Translate a resting ECG code (0-2) into its text label.

    Returns None for any code outside 0-2.
    """
    labels_in_code_order = (
        "normal",
        "ST-T wave abnormality",
        "left ventricular hypertrophy",
    )
    # Codes start at 0, matching the default numbering of enumerate.
    label_by_code = dict(enumerate(labels_in_code_order))
    return label_by_code.get(ecg_rest)

# Replace the numeric resting ECG codes with text labels and store the result as a categorical.
ecg_clean = heart_data_eda["ecg_rest"].apply(convert_ecg_rest_to_text)
heart_data_eda["ecg_rest"] = pd.Categorical(ecg_clean)

In [45]:
def transform_ST_slope_to_text(ST_slope):
    """Translate an ST slope code (1-3) into its text label.

    Returns None for any code outside 1-3.
    """
    labels_in_code_order = ("upsloping", "flat", "downsloping")
    # Codes start at 1, so number the labels from 1.
    label_by_code = dict(enumerate(labels_in_code_order, start=1))
    return label_by_code.get(ST_slope)

# Replace the numeric ST slope codes with text labels and store the result as a categorical.
st_slope_clean = heart_data_eda["ST_slope"].apply(transform_ST_slope_to_text)
heart_data_eda["ST_slope"] = pd.Categorical(st_slope_clean)

In [46]:
def convert_blood_flow_to_text(blood_flow):
    """Translate a blood flow code (3, 6 or 7) into its text label.

    Returns None for any other code.
    """
    codes = (3, 6, 7)
    labels = ("normal", "fixed defect", "reversible defect")
    label_by_code = dict(zip(codes, labels))
    return label_by_code.get(blood_flow)

# Replace the numeric blood flow codes with text labels and store the result as a categorical.
blood_flow_clean = heart_data_eda["blood_flow"].apply(convert_blood_flow_to_text)
heart_data_eda["blood_flow"] = pd.Categorical(blood_flow_clean)

In [47]:

# Convert the 0/1 diabetes flag to booleans for the exploratory analysis.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
heart_data_eda["diabetes"] = heart_data_eda["diabetes"].astype(bool)

In [48]:
# Convert the 0/1 exercise-induced angina flag to booleans for the exploratory analysis.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
heart_data_eda["exercise_angina"] = heart_data_eda["exercise_angina"].astype(bool)

In [49]:
# Verify the dtypes after relabelling: "diabetes" and "exercise_angina" should now be bool.
heart_data_eda.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  303 non-null    int64   
 1   sex                  303 non-null    category
 2   chest_pain_type      303 non-null    category
 3   rest_blood_pressure  303 non-null    int64   
 4   cholesterol          303 non-null    int64   
 5   diabetes             303 non-null    bool    
 6   ecg_rest             303 non-null    category
 7   heart_rate_max       303 non-null    int64   
 8   exercise_angina      303 non-null    bool    
 9   oldpeak              303 non-null    float64 
 10  ST_slope             303 non-null    category
 11  vessels              303 non-null    int64   
 12  blood_flow           303 non-null    category
 13  heart_disease        303 non-null    int64   
dtypes: bool(2), category(5), float64(1), int64(6)
memory usage: 19.4 KB


In [50]:
# Preview the first five rows of the relabelled frame.
heart_data_eda.head(5)

Unnamed: 0,age,sex,chest_pain_type,rest_blood_pressure,cholesterol,diabetes,ecg_rest,heart_rate_max,exercise_angina,oldpeak,ST_slope,vessels,blood_flow,heart_disease
0,63,Male,typical angina,145,233,True,left ventricular hypertrophy,150,False,2.3,downsloping,0,fixed defect,0
1,67,Male,asymptomatic,160,286,False,left ventricular hypertrophy,108,True,1.5,flat,3,normal,1
2,67,Male,asymptomatic,120,229,False,left ventricular hypertrophy,129,True,2.6,flat,2,reversible defect,1
3,37,Male,non-anginal pain,130,250,False,normal,187,False,3.5,downsloping,0,normal,0
4,41,Female,atypical angina,130,204,False,left ventricular hypertrophy,172,False,1.4,upsloping,0,normal,0


In [51]:
# Persist the EDA-ready frame as CSV (without the index column).
heart_disease_eda_path = os.path.join("output", "heart-disease-processed.data")
# to_csv does not create missing directories, so make sure the target folder
# exists; exist_ok keeps the cell re-runnable.
os.makedirs(os.path.dirname(heart_disease_eda_path), exist_ok = True)
# Note: CSV does not store the category dtypes; they must be restored when the file is read back.
heart_data_eda.to_csv(heart_disease_eda_path, index = False)

### 3. Explore the data
#### 3.1. Structure - investigate the structure of the dataset
Data structure analysis helps us gain insight into the dataset and uncover its underlying structure.

In [52]:
# Shape - how many observations (rows) and variables (columns) we have
heart_data_eda.shape   # TODO - write it as text and in a function

(303, 14)

In [53]:
# Column dtypes, non-null counts and memory usage of the cleaned frame.
heart_data_eda.info()  # TODO: describe it once again after the cleaning process

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  303 non-null    int64   
 1   sex                  303 non-null    category
 2   chest_pain_type      303 non-null    category
 3   rest_blood_pressure  303 non-null    int64   
 4   cholesterol          303 non-null    int64   
 5   diabetes             303 non-null    bool    
 6   ecg_rest             303 non-null    category
 7   heart_rate_max       303 non-null    int64   
 8   exercise_angina      303 non-null    bool    
 9   oldpeak              303 non-null    float64 
 10  ST_slope             303 non-null    category
 11  vessels              303 non-null    int64   
 12  blood_flow           303 non-null    category
 13  heart_disease        303 non-null    int64   
dtypes: bool(2), category(5), float64(1), int64(6)
memory usage: 19.4 KB


In [54]:
# Display several observations in order to get a sense of the data -> TODO: describe what you see in one or two sentences
heart_data_eda.head()

Unnamed: 0,age,sex,chest_pain_type,rest_blood_pressure,cholesterol,diabetes,ecg_rest,heart_rate_max,exercise_angina,oldpeak,ST_slope,vessels,blood_flow,heart_disease
0,63,Male,typical angina,145,233,True,left ventricular hypertrophy,150,False,2.3,downsloping,0,fixed defect,0
1,67,Male,asymptomatic,160,286,False,left ventricular hypertrophy,108,True,1.5,flat,3,normal,1
2,67,Male,asymptomatic,120,229,False,left ventricular hypertrophy,129,True,2.6,flat,2,reversible defect,1
3,37,Male,non-anginal pain,130,250,False,normal,187,False,3.5,downsloping,0,normal,0
4,41,Female,atypical angina,130,204,False,left ventricular hypertrophy,172,False,1.4,upsloping,0,normal,0


In [55]:
# Also look at the last three observations.
heart_data_eda.tail(3)

Unnamed: 0,age,sex,chest_pain_type,rest_blood_pressure,cholesterol,diabetes,ecg_rest,heart_rate_max,exercise_angina,oldpeak,ST_slope,vessels,blood_flow,heart_disease
300,57,Male,asymptomatic,130,131,False,normal,115,True,1.2,flat,1,reversible defect,1
301,57,Female,atypical angina,130,236,False,left ventricular hypertrophy,174,False,0.0,flat,1,normal,1
302,38,Male,non-anginal pain,138,175,False,normal,173,False,0.0,upsloping,0,normal,0


#### 3.2. Summary Statistics

In [56]:
# 3.2.1. First show summary statistics for all of the numeric variables -> make some assumptions and expectations about their distributions
# When exploring the distribution of each numeric variable, calculate other summary statistics manually, for example range, variance, percentiles etc.
# TODO: plot some of the summary statistics on the distributions
heart_data_eda.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.438944,9.038662,29.0,48.0,56.0,61.0,77.0
rest_blood_pressure,303.0,131.689769,17.599748,94.0,120.0,130.0,140.0,200.0
cholesterol,303.0,246.693069,51.776918,126.0,211.0,241.0,275.0,564.0
heart_rate_max,303.0,149.607261,22.875003,71.0,133.5,153.0,166.0,202.0
oldpeak,303.0,1.039604,1.161075,0.0,0.0,0.8,1.6,6.2
vessels,303.0,0.663366,0.934375,0.0,0.0,0.0,1.0,3.0
heart_disease,303.0,0.458746,0.49912,0.0,0.0,0.0,1.0,1.0


In [57]:
# 3.2.2. Summary statistics for the categorical and boolean features

# TODO: include the proportions and counts when visualizing the categorical variables - use a bar chart

heart_data_eda.describe(include = ["category", bool]).T

Unnamed: 0,count,unique,top,freq
sex,303,2,Male,206
chest_pain_type,303,4,asymptomatic,144
diabetes,303,2,False,258
ecg_rest,303,3,normal,151
exercise_angina,303,2,False,204
ST_slope,303,3,upsloping,142
blood_flow,303,3,normal,167


In [58]:
# Counts of each level of a categorical variable - TODO: remove from here!
heart_data_eda["sex"].value_counts()

Male      206
Female     97
Name: sex, dtype: int64

In [59]:
# Proportions of each level of a categorical variable (normalize = True) - TODO: remove from here!
heart_data_eda["sex"].value_counts(normalize = True)

Male      0.679868
Female    0.320132
Name: sex, dtype: float64

#### 3.3. Explore numeric variables - univariate distributions