# Data preparation and exploration

## Setup and imports

In [None]:
# Resolve PROJECT_ROOT, the directory prefix used to build every data path in
# this notebook. On Google Colab, Drive is mounted and the root is read from
# the notebook's user secrets; anywhere else the filesystem root is used.
try:
  # Only the imports are guarded, so an ImportError here reliably means
  # "not running on Colab" rather than a failure inside mount() / get().
  from google.colab import drive
  from google.colab import userdata
except ImportError:
  PROJECT_ROOT = '/'
else:
  drive.mount('/content/drive')
  PROJECT_ROOT = userdata.get('PROJECT_ROOT')

# Data paths are built as f'{PROJECT_ROOT}data/...' with no separator in
# between, so make sure the root always ends with one.
if not PROJECT_ROOT.endswith('/'):
  PROJECT_ROOT += '/'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pymc as pm

# Colab-only widget configuration. The import is guarded because the setup
# cell's PROJECT_ROOT fallback already allows this notebook to run outside
# Colab, where the google.colab package is not available.
try:
  from google.colab import output
except ImportError:
  # Not on Colab: there is no custom widget manager to configure.
  output = None
else:
  # The custom widget manager is switched off here; call
  # output.enable_custom_widget_manager() instead to switch it on.
  output.disable_custom_widget_manager()

# Apply seaborn's 'whitegrid' style and 'paper' plotting context (font scale
# 1) to every plot drawn after this cell.
sns.set_style(style='whitegrid')
sns.set_context(context='paper', font_scale=1)

## Pre-processing

In [None]:
# Load the raw heart-disease table from the project's data directory.
raw_csv_path = f'{PROJECT_ROOT}data/heart_disease_cleveland_hungary.csv'
heart_disease = pd.read_csv(raw_csv_path)

# Remove duplicates and null values, as per Straw et al.
# Nulls are identified here as a value of 0 in any of the three columns
# below; for repeated rows only the first occurrence is kept.
has_zero_placeholder = (
    heart_disease[['ST slope', 'cholesterol', 'resting bp s']] == 0
).any(axis=1)
is_repeat = heart_disease.duplicated(keep='first')
rows_to_drop = has_zero_placeholder | is_repeat
heart_disease = heart_disease.loc[~rows_to_drop].copy()

# Shorten the column names to the abbreviations used in the rest of the
# notebook.
short_names = {
    'sex': 'sex',
    'chest pain type': 'cp',
    'resting bp s': 'bp',
    'cholesterol': 'chol',
    'fasting blood sugar': 'fbs',
    'resting ecg': 'ecg',
    'max heart rate': 'mhr',
    'exercise angina': 'ang',
    'oldpeak': 'st',
    'ST slope': 'slope',
    'target': 'cvd',
}
heart_disease = heart_disease.rename(columns=short_names)

# Summary statistics of the filtered table, as a quick sanity check.
print(heart_disease.describe())

In [None]:
# Feature scaling, re-indexing of categorical codes, and export.
def _standardise(values):
  """Return *values* minus its mean, divided by its standard deviation."""
  return (values - values.mean()) / values.std()


# Negative st (oldpeak) readings are raised to 0.
heart_disease['st'] = heart_disease['st'].clip(lower=0)

# mhr and age are overwritten in place with their z-scores.
cont_variables = ['mhr', 'age']
for column in cont_variables:
  heart_disease[column] = _standardise(heart_disease[column])

# chol and bp are log-transformed first, then standardised into new
# '*_scaled' columns; the raw columns are left untouched.
log_chol = np.log(heart_disease['chol'])
heart_disease['chol_scaled'] = _standardise(log_chol)

log_bp = np.log(heart_disease['bp'])
heart_disease['bp_scaled'] = _standardise(log_bp)

# Shift the cp and slope codes down by one so that they start at 0.
for column in ('cp', 'slope'):
  heart_disease[column] = heart_disease[column].sub(1)

# Dropping rows earlier left gaps in the index; renumber it from 0.
heart_disease = heart_disease.reset_index(drop=True)

# NOTE(review): to_csv is called with its default index handling, so the
# index is written as the first column -- confirm that downstream readers
# expect it.
heart_disease.to_csv(f'{PROJECT_ROOT}data/heart_disease_cleaned.csv')