# import libaries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline 
from imblearn.combine import SMOTETomek, SMOTEENN 
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours
from imblearn.over_sampling import SMOTEN
from collections import Counter

# import the dataset and balance labels

In [2]:
df=pd.read_csv('../heart_disease/heart_disease.csv')
#df.isnull().sum() #check for null values 
#df.columns

In [3]:
model = DecisionTreeClassifier()

# Combined: SmoteTomek

In [9]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1)
y = df['HeartDiseaseorAttack']

smt= SMOTEN(sampling_strategy=0.3, n_jobs=-1)
tomek= TomekLinks(sampling_strategy='majority', n_jobs=-1)

resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))

pipeline = Pipeline([('r', resample), ('model', model)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.92      0.91      0.28      0.92      0.50      0.27     68840
        1.0       0.24      0.28      0.91      0.26      0.50      0.24      7264

avg / total       0.86      0.85      0.34      0.85      0.50      0.27     76104



# Combined: SMOTEENN

In [None]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1)
y = df['HeartDiseaseorAttack']

smt=SMOTEN(sampling_strategy=0.3, n_jobs=-1)
enn1=EditedNearestNeighbours(sampling_strategy= 'majority')

resample = SMOTEENN(random_state=42, n_jobs=-1, enn=enn1, smote=smt)

pipeline = Pipeline([('r', resample), ('model', model)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

# without pipeline (for testing)

In [None]:
from collections import Counter

print(f'Original dataset shape {Counter(y)}')

X_train, X_test, y_train , y_test = train_test_split(X,y,test_size=0.3, random_state=0) 

print(f'Y train {Counter(y_train)}')
print(f'Y test {Counter(y_test)}')

smt=SMOTEN(sampling_strategy=0.3, n_jobs=-1)
enn1=EditedNearestNeighbours(sampling_strategy= 'majority')

resample = SMOTEENN(random_state=42, n_jobs=-1, enn=enn1, smote=smt)

X_balanced, y_balanced = resample.fit_resample(X_train, y_train)

print(f'Y balanced {Counter(y_balanced)}')

In [None]:
# define model
model = DecisionTreeClassifier()
# define resampling

model.fit(X_balanced, y_balanced)
y_pred = model.predict(X_test)

# Classify and report the results
print(classification_report_imbalanced(y_test, y_pred))