In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import time as t

In [2]:
# Load the raw survey extract. The path is relative, so it is resolved against
# the kernel's current working directory.
heart = pd.read_csv(filepath_or_buffer='heart_2020_cleaned.csv')

In [None]:
# Preview the first five rows of the raw frame.
heart.head(n=5)

# Stratified Subsample

In [None]:
# Draw a 10% sample inside every combination of the stratification columns so
# the subsample roughly preserves their joint distribution.
stratify_list = ['HeartDisease', 'AgeCategory', 'Stroke', 'Sex', 'GenHealth']

# GroupBy.sample returns the sampled rows with every column intact, including
# the grouping keys. The earlier groupby(...).apply(lambda x: x.sample(...))
# form depends on apply operating on the grouping columns, which pandas 2.2
# deprecates. Per-group sample sizes are unchanged (round(0.1 * group size));
# the specific rows drawn differ from the per-group reseeded version.
heart2 = heart.groupby(stratify_list, group_keys=False).sample(frac=0.1, random_state=42)

# 50-50 split Subsample

In [3]:
# Build the two halves of the roughly balanced subsample:
#   * every row with HeartDisease == 'Yes' is kept;
#   * rows with HeartDisease == 'No' are down-sampled to 10%, stratified on
#     the columns below.
stratify_list = ['AgeCategory', 'Stroke', 'Sex', 'GenHealth']
heartYes = heart[heart.HeartDisease == 'Yes']

heartNo = heart[heart.HeartDisease == 'No']
# GroupBy.sample keeps all columns (grouping keys included) in the result.
# The earlier groupby(...).apply(lambda x: x.sample(...)) form depends on apply
# operating on the grouping columns, which pandas 2.2 deprecates. Per-group
# sample sizes are unchanged; the specific rows drawn differ.
no_sample = heartNo.groupby(stratify_list, group_keys=False).sample(frac=0.1, random_state=42)

In [4]:
# Stack all 'Yes' rows on top of the down-sampled 'No' rows.
heart50 = pd.concat([heartYes, no_sample], axis=0)

# Shuffle the rows and rebuild a 0..n-1 index. The shuffle is seeded so that a
# fresh "Restart & Run All" reproduces the same row order, and therefore the
# same downstream train/test CSVs.
heart50 = heart50.sample(frac=1, random_state=42).reset_index(drop=True)

In [5]:
# Sanity check: number of rows per HeartDisease class in the combined sample.
disease_labels = np.asarray(heart50['HeartDisease'])
_, count2 = np.unique(disease_labels, return_counts=True)
print(count2)

[29230 27373]


In [None]:
#heart50.to_csv('heart2.csv', index=False)

## Preprocessing

In [7]:
# Convert the ordered categorical columns into numeric (ordinal) codes.
#
# Each label is paired with its value explicitly. Pairing `Series.unique()`
# (whose order follows first appearance in the shuffled frame) with a
# hard-coded list of values assigns the wrong value to a label whenever the
# row order changes.
# NOTE(review): the label spellings below are assumed to match the raw CSV;
# _encode_ordinal raises if the data holds a label that is not listed, so a
# mismatch fails loudly instead of producing silent NaNs.


def _encode_ordinal(column, mapping):
    """Return `column` with every label replaced by its value in `mapping`.

    Parameters
    ----------
    column : pandas.Series
        Column holding the categorical labels.
    mapping : dict
        Label -> numeric value.

    Raises
    ------
    ValueError
        If `column` contains a value that is not a key of `mapping` (this also
        triggers when the cell is re-run on an already encoded column).
    """
    unknown = set(column.unique()) - set(mapping)
    if unknown:
        raise ValueError(
            f"{column.name}: unmapped values {sorted(unknown, key=str)}"
        )
    return column.map(mapping)


# Age bracket -> representative age used as the ordinal value.
age_midpoint_by_category = {
    '18-24': 21,
    '25-29': 27,
    '30-34': 32,
    '35-39': 37,
    '40-44': 42,
    '45-49': 47,
    '50-54': 52,
    '55-59': 57,
    '60-64': 62,
    '65-69': 67,
    '70-74': 72,
    '75-79': 77,
    '80 or older': 90,
}
# Legacy list names kept so later cells that reference them still work.
agecategories = list(age_midpoint_by_category)
ageMidpoints = list(age_midpoint_by_category.values())
# Assign the column back instead of calling replace(..., inplace=True) on
# `heart50.AgeCategory`: that form mutates an intermediate Series and does not
# update the frame under pandas Copy-on-Write.
heart50['AgeCategory'] = _encode_ordinal(heart50['AgeCategory'], age_midpoint_by_category)

# Self-reported general health -> 1 (worst) .. 5 (best).
health_level_by_category = {
    'Poor': 1,
    'Fair': 2,
    'Good': 3,
    'Very good': 4,
    'Excellent': 5,
}
healthcategories = list(health_level_by_category)
healthlevels = list(health_level_by_category.values())
heart50['GenHealth'] = _encode_ordinal(heart50['GenHealth'], health_level_by_category)

In [8]:
# Encode the Yes/No columns as 1/0.

bin_columns = ["HeartDisease", "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking", "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"]

yes_no_codes = {'Yes': 1, 'No': 0}
# Map each listed column separately and write them back in one assignment.
# Any value other than 'Yes'/'No' becomes NaN, as Series.map does for
# missing keys.
heart50[bin_columns] = pd.DataFrame(
    {name: heart50[name].map(yes_no_codes) for name in bin_columns}
)

In [9]:
# One-hot encode the remaining object-dtype (categorical) columns.

# Columns that are still object dtype at this point.
categoricals = heart50.select_dtypes(include=['object'])

# drop_first=True omits the first level of each variable, so a k-level
# column yields k-1 indicator columns.
cat_dummies = pd.get_dummies(categoricals, drop_first=True)

# Swap the original categorical columns for their indicator columns.
non_categorical = heart50.drop(columns=list(categoricals.columns))
heart50 = pd.concat([non_categorical, cat_dummies], axis=1)


In [28]:
#heart50.head()

## Train-Test Split

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. Stratifying on the target keeps the
# HeartDisease class proportions the same in both partitions; the seed makes
# the split repeatable for a given row order.
train50, test50 = train_test_split(
    heart50,
    test_size=0.2,
    stratify=heart50['HeartDisease'],
    random_state=42,
)

In [12]:
# Write both partitions to CSV without the index column.
for split_frame, csv_name in ((train50, 'train50.csv'), (test50, 'test50.csv')):
    split_frame.to_csv(csv_name, index=False)

In [13]:
# Separate the target column from the feature columns for each partition.
train50_set = train50.drop(columns='HeartDisease')
train50_label = train50.loc[:, 'HeartDisease']

test50_set = test50.drop(columns='HeartDisease')
test50_label = test50.loc[:, 'HeartDisease']

# SMOTE

NOTE: SMOTE should be applied after the sampling and preprocessing steps, and only to the training split — the test split must be left as sampled.

In [None]:
# Oversample the minority class of the stratified subsample with SMOTE.
#
# NOTE(review): in the cells shown, `heart2` is sampled straight from the raw
# frame and the ordinal / binary / dummy encoding is applied to `heart50` only.
# SMOTE interpolates between numeric feature vectors, so confirm `heart2` has
# been preprocessed the same way before running this cell.
from imblearn.over_sampling import SMOTE

# Select the target by name so it cannot drift apart from the drop() below if
# the column order ever changes.
y = heart2['HeartDisease']
X = heart2.drop(columns=['HeartDisease'])

# `train_test_split` is imported in the Train-Test Split cell. The split is
# stratified on the target, like the heart50 split, so the class ratio is the
# same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
columns = X_train.columns

# Named `smote` rather than `os` to avoid shadowing the standard-library module.
smote = SMOTE(random_state=42)
# Resample the training split only; the test split is left untouched.
os_train_X, os_train_y = smote.fit_resample(X_train, y_train)

In [None]:
# Rows per class in the full label vector `y` (before splitting and resampling).
_, count1 = np.unique(np.asarray(y), return_counts=True)
print(count1)

# Rows per class in the SMOTE-resampled training labels.
_, count2 = np.unique(np.asarray(os_train_y), return_counts=True)
print(count2)

# Export data

In [None]:
# Reassemble the resampled training data with the target as the first column,
# then write it to disk without the index.
os_y = pd.DataFrame(os_train_y, columns=['HeartDisease'])
os_X = pd.DataFrame(os_train_X, columns=columns)
train_smoted = pd.concat([os_y, os_X], axis='columns')

train_smoted.to_csv('train_smoted.csv', index=False)

In [None]:
# The test split is exported as-is (not resampled), target column first.
test_smoted = pd.concat([y_test, X_test], axis='columns')
test_smoted.to_csv('test_smoted.csv', index=False)