# Heart Disease Analysis

## 0. Preparation

In [None]:
import duckdb
from pathlib import Path

from utils.data_info import analyze_data, get_top_correlations, detect_outliers_iqr

In [None]:
# Locate the project root by walking up from the working directory to the
# first folder that actually contains pyproject.toml. The previous expression
# never checked that the file exists and always returned the parent of the
# working directory, so starting the kernel from the project root (or from a
# deeper folder) silently pointed at the wrong data directory.
_cwd = Path.cwd().resolve()
project_dir = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "pyproject.toml").is_file()
    ),
    _cwd.parent,  # no marker found: keep the old "one level up" behaviour
)
# Intermediate artefacts (including the DuckDB file) live under data/interim.
interim_dir = project_dir / "data" / "interim"

In [3]:
# Open the DuckDB file inside a context manager so the connection is closed
# even when the query raises (for example if the `heart` table is missing).
# The full table is fetched into a DataFrame before the connection is released.
with duckdb.connect(str(interim_dir / "db.db")) as conn:
    df_raw = conn.execute("SELECT * FROM heart").fetchdf()

In [None]:
# Schema overview of the raw table: column names and their dtypes.
analyze_data(df_raw, columns_info=True, dtypes_info=True)


{'columns_info': {'total_columns': 14,
  'column_names': ['age',
   'sex',
   'cp',
   'trestbps',
   'chol',
   'fbs',
   'restecg',
   'thalach',
   'exang',
   'oldpeak',
   'slope',
   'ca',
   'thal',
   'target']},
 'dtypes_info': {'age': 'int64',
  'sex': 'int64',
  'cp': 'int64',
  'trestbps': 'int64',
  'chol': 'int64',
  'fbs': 'int64',
  'restecg': 'int64',
  'thalach': 'int64',
  'exang': 'int64',
  'oldpeak': 'float64',
  'slope': 'int64',
  'ca': 'int64',
  'thal': 'int64',
  'target': 'int64'},
 'nan_info': {'total_nan': np.int64(0),
  'nan_by_column': {},
  'nan_percentage_by_column': {}}}

In [14]:
# Missing-value report: total NaN count plus per-column counts and percentages.
analyze_data(df_raw, nan_info=True)

{'nan_info': {'total_nan': np.int64(0),
  'nan_by_column': {},
  'nan_percentage_by_column': {}}}

In [15]:
# Descriptive statistics per column (count, mean, std, min, quartiles, max).
analyze_data(df_raw, stats_info=True)

{'statistics': {'numerical_stats': {'age': {'count': 1025.0,
    'mean': 54.43414634146342,
    'std': 9.072290233244278,
    'min': 29.0,
    '25%': 48.0,
    '50%': 56.0,
    '75%': 61.0,
    'max': 77.0},
   'sex': {'count': 1025.0,
    'mean': 0.6956097560975609,
    'std': 0.4603733241196493,
    'min': 0.0,
    '25%': 0.0,
    '50%': 1.0,
    '75%': 1.0,
    'max': 1.0},
   'cp': {'count': 1025.0,
    'mean': 0.9424390243902439,
    'std': 1.029640743645865,
    'min': 0.0,
    '25%': 0.0,
    '50%': 1.0,
    '75%': 2.0,
    'max': 3.0},
   'trestbps': {'count': 1025.0,
    'mean': 131.61170731707318,
    'std': 17.516718005376408,
    'min': 94.0,
    '25%': 120.0,
    '50%': 130.0,
    '75%': 140.0,
    'max': 200.0},
   'chol': {'count': 1025.0,
    'mean': 246.0,
    'std': 51.59251020618206,
    'min': 126.0,
    '25%': 211.0,
    '50%': 240.0,
    '75%': 275.0,
    'max': 564.0},
   'fbs': {'count': 1025.0,
    'mean': 0.14926829268292682,
    'std': 0.3565266897271575,
   

In [16]:
# Full pairwise correlation matrix between all columns, including `target`.
analyze_data(df_raw, correlation_info=True)

{'correlation': {'correlation_matrix': {'age': {'age': 1.0,
    'sex': -0.10324029759435553,
    'cp': -0.07196627394150716,
    'trestbps': 0.27112140631607556,
    'chol': 0.21982253466576054,
    'fbs': 0.12124347870535487,
    'restecg': -0.13269616796729033,
    'thalach': -0.3902270750315936,
    'exang': 0.08816338342423417,
    'oldpeak': 0.20813667742811404,
    'slope': -0.16910511083470267,
    'ca': 0.2715505294833465,
    'thal': 0.07229744773281882,
    'target': -0.2293235512676108},
   'sex': {'age': -0.10324029759435553,
    'sex': 1.0,
    'cp': -0.04111908876342355,
    'trestbps': -0.07897376914509617,
    'chol': -0.19825787170698494,
    'fbs': 0.02720046098910731,
    'restecg': -0.05511721329926214,
    'thalach': -0.049365243032544345,
    'exang': 0.13915680922171658,
    'oldpeak': 0.08468655858668672,
    'slope': -0.026666292452793054,
    'ca': 0.11172891299897979,
    'thal': 0.19842425379861423,
    'target': -0.2795007572922623},
   'cp': {'age': -0.071

In [17]:
# Outlier report: for each flagged column, the lower/upper bounds and the
# number and percentage of rows falling outside them.
analyze_data(df_raw, outlier_info=True)

  case np.number:
  case np.number:


{'outliers': {'oldpeak': {'lower_bound': -2.7,
   'upper_bound': 4.5,
   'num_outliers': 7,
   'outlier_percentage': 0.6829268292682927}}}

In [18]:
# Target breakdown: class counts and percentages, the class-balance label,
# and the mean of every feature within each target class.
analyze_data(df_raw, target_analysis=True)

{'target_analysis': {'count': {1: 526, 0: 499},
  'percentage': {1: 51.32, 0: 48.68},
  'class_balance': 'balanced',
  'mean_values_by_class': {'age': {0: 56.569138276553105,
    1: 52.40874524714829},
   'sex': {0: 0.8276553106212425, 1: 0.5703422053231939},
   'cp': {0: 0.48296593186372744, 1: 1.3783269961977187},
   'trestbps': {0: 134.1062124248497, 1: 129.24524714828897},
   'chol': {0: 251.2925851703407, 1: 240.97908745247148},
   'fbs': {0: 0.16432865731462926, 1: 0.13498098859315588},
   'restecg': {0: 0.45691382765531063, 1: 0.5988593155893536},
   'thalach': {0: 139.1302605210421, 1: 158.58555133079847},
   'exang': {0: 0.5490981963927856, 1: 0.13498098859315588},
   'oldpeak': {0: 1.6002004008016033, 1: 0.5699619771863118},
   'slope': {0: 1.1663326653306614, 1: 1.5931558935361216},
   'ca': {0: 1.1583166332665331, 1: 0.37072243346007605},
   'thal': {0: 2.5390781563126255, 1: 2.1197718631178706},
   'target': {0: 0.0, 1: 1.0}}}}

In [19]:
# Rank the ten strongest column pairs from the correlation matrix.
# NOTE(review): the values shown look like magnitudes -- e.g. age/thalach is
# negative in the matrix printed earlier but positive here -- confirm that
# get_top_correlations drops the sign before interpreting direction.
corr_matrix = df_raw.corr()
get_top_correlations(corr_matrix, 10)

{('oldpeak', 'slope'): 0.5751885364375171,
 ('oldpeak', 'target'): 0.438441270111945,
 ('exang', 'target'): 0.4380285500558459,
 ('cp', 'target'): 0.4348542500527371,
 ('thalach', 'target'): 0.4228954964828723,
 ('cp', 'exang'): 0.40151271399953703,
 ('thalach', 'slope'): 0.395307843482417,
 ('age', 'thalach'): 0.3902270750315936,
 ('ca', 'target'): 0.3820852890386709,
 ('thalach', 'exang'): 0.3802808723919557}

In [20]:
# Direct call to the IQR outlier helper on the raw frame; the result shown
# below is the same as the 'outliers' entry from analyze_data(outlier_info=True).
detect_outliers_iqr(df_raw)

  case np.number:
  case np.number:


{'oldpeak': {'lower_bound': -2.7,
  'upper_bound': 4.5,
  'num_outliers': 7,
  'outlier_percentage': 0.6829268292682927}}