In [85]:
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import math
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, SMOTENC, ADASYN
from sklearn.linear_model import LogisticRegression, SGDClassifier

Data:

In [41]:
data = pd.read_csv('data/heart_disease_health_indicators_BRFSS2015.csv')
data.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


Train test split:

In [42]:
X, y = data.drop('HeartDiseaseorAttack', axis=1), data['HeartDiseaseorAttack']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(202944, 21) (50736, 21)
(202944,) (50736,)


## 1. Dealing with imbalanced dataset

#### 1.1. Oversampling:

a. Random oversampling

In [43]:
imbalanced_data = X_train.copy()
imbalanced_data['HeartDiseaseorAttack'] = y_train.copy()

minority_class = imbalanced_data[imbalanced_data['HeartDiseaseorAttack'] == 1.0]
majority_class = imbalanced_data[imbalanced_data['HeartDiseaseorAttack'] == 0.0]

# upsample the minority class
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)

# combine the upsampled minority class with the majority class
balanced_data = pd.concat([majority_class, minority_upsampled])
X_train_ro, y_train_ro = balanced_data.drop('HeartDiseaseorAttack', axis=1), balanced_data['HeartDiseaseorAttack']

b. SMOTE (Synthetic Minority Over-sampling Technique)

In [50]:
# smote treat all features as numerical
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [62]:
# smotenc for numerical and categorical features
smotenc = SMOTENC(random_state=42, categorical_features=['GenHlth', 'Age', 'Education','Income'])
X_train_smotenc, y_train_smotenc = smotenc.fit_resample(X_train, y_train)

c. ADASYN

In [65]:
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

## 2. Models

#### 2.1. KNN

Baseline:

In [36]:
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

k = int(math.sqrt(X_train_scaled.shape[0]))
knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95     45957
         1.0       0.44      0.05      0.09      4779

    accuracy                           0.90     50736
   macro avg       0.68      0.52      0.52     50736
weighted avg       0.87      0.90      0.87     50736



Neighborhood Components Analysis:

In [13]:
k = int(math.sqrt(X_train.shape[0]))
nca = NeighborhoodComponentsAnalysis(init='pca')

X_train_nca = nca.fit_transform(X_train_pca, y_train)
X_test_nca = nca.transform(X_test_pca)

knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
knn.fit(X_train_nca, y_train)

y_pred = knn.predict(X_test_nca)

print(classification_report(y_test, y_pred))

MemoryError: Unable to allocate 38.4 GiB for an array with shape (202944, 202944) and data type bool

Random oversampling:

In [None]:
std_scaler = StandardScaler()
X_train_ro_scaled = std_scaler.fit_transform(X_train_ro)
X_test_scaled = std_scaler.transform(X_test)

k = int(math.sqrt(X_train_ro.shape[0]))
knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
knn.fit(X_train_ro_scaled, y_train_ro)

y_pred = knn.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.73      0.83     45957
         1.0       0.23      0.79      0.36      4779

    accuracy                           0.73     50736
   macro avg       0.60      0.76      0.59     50736
weighted avg       0.90      0.73      0.79     50736



SMOTE:

In [46]:
std_scaler = StandardScaler()
X_train_smote_scaled = std_scaler.fit_transform(X_train_smote)
X_test_scaled = std_scaler.transform(X_test)

k = int(math.sqrt(X_train_smote.shape[0]))
knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
knn.fit(X_train_smote_scaled, y_train_smote)

y_pred = knn.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.74      0.84     45957
         1.0       0.23      0.77      0.36      4779

    accuracy                           0.74     50736
   macro avg       0.60      0.75      0.60     50736
weighted avg       0.90      0.74      0.79     50736



SMOTENC:

In [63]:
std_scaler = StandardScaler()
X_train_smotenc_scaled = std_scaler.fit_transform(X_train_smotenc)
X_test_scaled = std_scaler.transform(X_test)

k = int(math.sqrt(X_train_smotenc.shape[0]))
knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
knn.fit(X_train_smotenc_scaled, y_train_smotenc)

y_pred = knn.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.73      0.83     45957
         1.0       0.23      0.77      0.35      4779

    accuracy                           0.74     50736
   macro avg       0.60      0.75      0.59     50736
weighted avg       0.90      0.74      0.79     50736



ADASYN:

In [66]:
std_scaler = StandardScaler()
X_train_adasyn_scaled = std_scaler.fit_transform(X_train_adasyn)
X_test_scaled = std_scaler.transform(X_test)

k = int(math.sqrt(X_train_adasyn.shape[0]))
knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
knn.fit(X_train_adasyn_scaled, y_train_adasyn)

y_pred = knn.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.73      0.83     45957
         1.0       0.23      0.78      0.35      4779

    accuracy                           0.73     50736
   macro avg       0.60      0.75      0.59     50736
weighted avg       0.90      0.73      0.79     50736



#### 2.2. Logistic regression

##### a. Without resampling

In [82]:
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

log_reg = LogisticRegression(class_weight='balanced')
log_reg.fit(X_train_scaled, y_train)

y_pred = log_reg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.75      0.85     45957
         1.0       0.25      0.80      0.38      4779

    accuracy                           0.75     50736
   macro avg       0.61      0.77      0.61     50736
weighted avg       0.90      0.75      0.80     50736



In [88]:
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

sgd_clf = SGDClassifier(loss='log_loss', class_weight='balanced')
sgd_clf.fit(X_train_scaled, y_train)

y_pred = sgd_clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.74      0.84     45957
         1.0       0.24      0.80      0.37      4779

    accuracy                           0.75     50736
   macro avg       0.61      0.77      0.61     50736
weighted avg       0.90      0.75      0.80     50736



##### b. Using resampling

Random oversampling:

In [76]:
std_scaler = StandardScaler()
X_train_ro_scaled = std_scaler.fit_transform(X_train_ro)
X_test_scaled = std_scaler.transform(X_test)

log_reg = LogisticRegression()
log_reg.fit(X_train_ro_scaled, y_train_ro)

y_pred = log_reg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.75      0.85     45957
         1.0       0.25      0.80      0.38      4779

    accuracy                           0.75     50736
   macro avg       0.61      0.77      0.61     50736
weighted avg       0.90      0.75      0.80     50736



In [89]:
std_scaler = StandardScaler()
X_train_ro_scaled = std_scaler.fit_transform(X_train_ro)
X_test_scaled = std_scaler.transform(X_test)

sgd_clf = SGDClassifier(loss='log_loss')
sgd_clf.fit(X_train_ro_scaled, y_train_ro)

y_pred = sgd_clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.75      0.85     45957
         1.0       0.25      0.80      0.38      4779

    accuracy                           0.75     50736
   macro avg       0.61      0.77      0.61     50736
weighted avg       0.90      0.75      0.80     50736



SMOTE:

In [77]:
std_scaler = StandardScaler()
X_train_smote_scaled = std_scaler.fit_transform(X_train_smote)
X_test_scaled = std_scaler.transform(X_test)

log_reg = LogisticRegression()
log_reg.fit(X_train_smote_scaled, y_train_smote)

y_pred = log_reg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.75      0.85     45957
         1.0       0.25      0.78      0.38      4779

    accuracy                           0.76     50736
   macro avg       0.61      0.77      0.61     50736
weighted avg       0.90      0.76      0.80     50736



In [90]:
std_scaler = StandardScaler()
X_train_smote_scaled = std_scaler.fit_transform(X_train_smote)
X_test_scaled = std_scaler.transform(X_test)

sgd_clf = SGDClassifier(loss='log_loss')
sgd_clf.fit(X_train_smote_scaled, y_train_smote)

y_pred = sgd_clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.77      0.86     45957
         1.0       0.26      0.77      0.38      4779

    accuracy                           0.77     50736
   macro avg       0.61      0.77      0.62     50736
weighted avg       0.90      0.77      0.81     50736



ADASYN:

In [93]:
std_scaler = StandardScaler()
X_train_adasyn_scaled = std_scaler.fit_transform(X_train_adasyn)
X_test_scaled = std_scaler.transform(X_test)

log_reg = LogisticRegression()
log_reg.fit(X_train_adasyn_scaled, y_train_adasyn)

y_pred = log_reg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.74      0.84     45957
         1.0       0.24      0.80      0.37      4779

    accuracy                           0.75     50736
   macro avg       0.61      0.77      0.61     50736
weighted avg       0.90      0.75      0.80     50736



In [None]:
std_scaler = StandardScaler()
X_train_adasyn_scaled = std_scaler.fit_transform(X_train_adasyn)
X_test_scaled = std_scaler.transform(X_test)

sgd_clf = SGDClassifier(loss='log_loss')
log_reg.fit(X_train_adasyn_scaled, y_train_adasyn)

y_pred = log_reg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))