In [14]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)

In [2]:
data = pd.read_csv('heart_num_0228.csv')
Q1 = data[['SleepHours', 'BMI']].quantile(q = 0.25)
Q3 = data[['SleepHours', 'BMI']].quantile(q = 0.75)
IQR = Q3-Q1
data_iqr = data[(data['SleepHours'] <= Q3['SleepHours']+IQR['SleepHours']*1.5)&(data['SleepHours'] >= Q1['SleepHours']-IQR['SleepHours']*1.5)]
data_iqr = data_iqr[(data_iqr['BMI'] <= Q3['BMI']+IQR['BMI']*1.5)&(data_iqr['BMI'] >= Q1['BMI']-IQR['BMI']*1.5)]
print('Original data:', len(data))
print('Remove Outlier data:', len(data_iqr))

Original data: 274034
Remove Outlier data: 261812


In [5]:
X = data_iqr.drop('HadHeartAttack', axis = 1)
y = data_iqr['HadHeartAttack']

In [6]:
from imblearn.over_sampling import SMOTE
smo = SMOTE(random_state = 1234)
X_re, y_re = smo.fit_resample(X, y)
print('Original Data:', len(X))
print('Resampled Data:', len(X_re))

Original Data: 261812
Resampled Data: 494774


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_re, y_re, test_size = 0.2, random_state = 1234)

In [12]:
from sklearn.preprocessing import RobustScaler
num_features = ['PhysicalHealthDays', 'MentalHealthDays', 'SleepHours', 'AgeCategory', 'HeightInMeters', 'WeightInKilograms', 'BMI']
X_trn_num = X_train[num_features]
X_tst_num = X_test[num_features]
X_trn_cat = X_train.drop(num_features, axis = 1)
X_tst_cat = X_test.drop(num_features, axis = 1)

rbs = RobustScaler()
X_trn_num_scaled = rbs.fit_transform(X_trn_num)
X_tst_num_scaled = rbs.transform(X_tst_num)

In [16]:
print(X_trn_num_scaled.shape)
print(np.array(X_trn_cat).shape)

(395819, 7)
(395819, 39)


In [24]:
X_trn_scaled = np.concatenate((X_trn_num_scaled, np.array(X_trn_cat)), axis = 1)
X_tst_scaled = np.concatenate((X_tst_num_scaled, np.array(X_tst_cat)), axis = 1)

In [28]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [26]:
sgd = SGDClassifier(random_state = 1234)
sgd.fit(X_trn_scaled, y_train)

In [29]:
sgd_pred = sgd.predict(X_tst_scaled)
sgd_acc = accuracy_score(y_test, sgd_pred)
sgd_f1 = f1_score(y_test, sgd_pred)
sgd_cm = confusion_matrix(y_test, sgd_pred)

In [30]:
print(f'SGD Classifier Accuracy: {sgd_acc*100:.2f}%')
print(f'SGD Classifier F1 score: {sgd_f1:.4f}')

SGD Classifier Accuracy: 90.16%
SGD Classifier F1 score: 0.8940


In [31]:
print(classification_report(y_test, sgd_pred))

              precision    recall  f1-score   support

           0       0.85      0.98      0.91     49308
           1       0.97      0.83      0.89     49647

    accuracy                           0.90     98955
   macro avg       0.91      0.90      0.90     98955
weighted avg       0.91      0.90      0.90     98955



In [32]:
print(sgd_cm)

[[48147  1161]
 [ 8577 41070]]


In [33]:
base_model = DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 10)
ada = AdaBoostClassifier(estimator = base_model)
ada.fit(X_trn_scaled, y_train)

In [34]:
ada_pred = ada.predict(X_tst_scaled)
ada_acc = accuracy_score(y_test, ada_pred)
ada_f1 = f1_score(y_test, ada_pred)
ada_cm = confusion_matrix(y_test, ada_pred)

In [35]:
print(f'AdaBoost Accuracy: {ada_acc*100:.2f}%')
print(f'AdaBoost F1 score: {ada_f1:.4f}')

AdaBoost Accuracy: 96.97%
AdaBoost F1 score: 0.9694


In [36]:
print(classification_report(y_test, ada_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     49308
           1       0.98      0.96      0.97     49647

    accuracy                           0.97     98955
   macro avg       0.97      0.97      0.97     98955
weighted avg       0.97      0.97      0.97     98955



In [37]:
print(ada_cm)

[[48490   818]
 [ 2182 47465]]
