In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
import sklearn

## Dataset fields description
1. age: age in years
2. sex: sex (1 = male; 0 = female)
3. cp: chest pain type
    - Value 1: typical angina
    - Value 2: atypical angina
    - Value 3: non-anginal pain
    - Value 4: asymptomatic
4. trestbps: resting blood pressure (in mm Hg on admission to the 
    hospital)
5. chol: serum cholesterol in mg/dl
6. fbs: fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
7. restecg: resting electrocardiographic results
    - Value 0: normal
    - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
8. thalach: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak: ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
    - Value 1: upsloping
    - Value 2: flat
    - Value 3: downsloping
12. ca: number of major vessels (0-3) colored by fluoroscopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
14. num: diagnosis of heart disease (angiographic disease status)
    - Value 0: < 50% diameter narrowing
    - Value 1: > 50% diameter narrowing
    (in any major vessel: attributes 59 through 68 are vessels)

In [None]:
# Load the processed Cleveland heart-disease data into `heart_df`.
# The file has no header row, so the column names are supplied here in file order.
RAW_COLUMN_NAMES = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
                    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"]

# More readable names for the tersest raw columns.
COLUMN_RENAMES = {"cp": "chest_pain",
                  "thalach": "max_heart_rate",
                  "oldpeak": "st_dep_induced",
                  "ca": "num_maj_ves"}

# Columns holding coded/discrete values (named after the rename above).
CATEGORICAL_COLUMNS = ["sex", "chest_pain", "fbs",
                       "restecg", "exang", "slope",
                       "num_maj_ves", "thal", "num"]

heart_df = (
    pd.read_csv(
        "../data/processed.cleveland.data",
        delimiter=",",
        names=RAW_COLUMN_NAMES,
        # The UCI file marks missing entries with "?". Without this, the affected
        # columns load as strings and "?" shows up as a bogus category.
        na_values="?",
    )
    .rename(columns=COLUMN_RENAMES)
    # Single dtype mapping instead of a per-column apply; the column list is
    # written once rather than on both sides of an assignment.
    .astype({col: "category" for col in CATEGORICAL_COLUMNS})
)

heart_df.head()

## Statistical exploratory analysis

In [None]:
# Summary statistics for heart_df. With pandas' defaults on a mixed-dtype frame,
# describe() reports the numeric columns only, so the category columns are left out.
heart_df.describe()

### Plotting the dataset

In [None]:
# One panel per column: density curves for the numeric columns first, followed
# by bar charts of value counts for the categorical columns.
numeric_cols = heart_df.select_dtypes(exclude="category").columns
categorical_cols = heart_df.select_dtypes(include="category").columns

# Size the grid from the data instead of hard-coding 7x2 and an offset of 5,
# so the figure stays correct if columns are added, dropped or re-typed.
n_grid_cols = 2
n_panels = len(numeric_cols) + len(categorical_cols)
n_grid_rows = max(1, -(-n_panels // n_grid_cols))  # ceiling division

fig, ax = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols,
                       figsize=(10, 25), squeeze=False)
ax = ax.reshape(-1)  # flatten the 2-D grid so panels can be indexed linearly
plt.subplots_adjust(wspace=0.4, hspace=0.5)

for panel, col in zip(ax, numeric_cols):
    heart_df[col].plot.density(ax=panel)
    panel.set_xlabel(col)

# Categorical panels start right after the last numeric panel.
for panel, col in zip(ax[len(numeric_cols):], categorical_cols):
    heart_df[col].value_counts().plot.bar(ax=panel)
    panel.set_xlabel(col)
    panel.set_ylabel("count")

# Hide any leftover panel when the column count does not fill the grid.
for panel in ax[n_panels:]:
    panel.set_visible(False)