# Import libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report
from imblearn.metrics import classification_report_imbalanced

In [3]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# import the dataset and balance labels

In [4]:
# Read the heart-disease survey data from a CSV file (relative path) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../heart_disease/heart_disease.csv')
# Handy checks while exploring: df.isnull().sum() for missing values, df.columns for the schema.

In [20]:
# Labels: the 'HeartDiseaseorAttack' column.
y = df['HeartDiseaseorAttack']
# Features: every remaining column of the frame.
X = df.drop(columns=['HeartDiseaseorAttack'])

In [21]:
from collections import Counter

# Class counts of the full dataset, to show how imbalanced the labels are.
print(f'Original dataset shape {Counter(y)}')

# Hold out 30% of the rows for evaluation. Only the training part is resampled
# below; the test part keeps its original class distribution.
X_train, X_test, y_train , y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(f'Y train {Counter(y_train)}')
print(f'Y test {Counter(y_test)}')

# Both samplers pick rows at random, so they are seeded (same seed as the split)
# to make the balanced training set -- and every metric derived from it --
# reproducible across kernel restarts.
# With a float sampling_strategy the value is the desired minority/majority ratio
# after resampling: first grow the minority class to 0.3 of the majority, then
# shrink the majority class until the ratio reaches 0.7.
over = RandomOverSampler(sampling_strategy=0.3, random_state=0)
under = RandomUnderSampler(sampling_strategy=0.7, random_state=0)

# Step 1: duplicate minority rows; step 2: drop majority rows.
X_over, y_over = over.fit_resample(X_train, y_train)
X_balanced, y_balanced = under.fit_resample(X_over, y_over)

print(f'Y balanced {Counter(y_balanced)}')

Original dataset shape Counter({0.0: 229787, 1.0: 23893})
Y train Counter({0.0: 160947, 1.0: 16629})
Y balanced Counter({0.0: 68977, 1.0: 48284})
Y test Counter({0.0: 68840, 1.0: 7264})


In [22]:
# Train a decision tree on the rebalanced training data.
# random_state is fixed so that the tree (and the reports below) are identical
# on every run; an unseeded tree breaks ties between equally good splits randomly.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_balanced, y_balanced)

# Predict on the held-out test split, which was not resampled.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 and overall accuracy.
print(classification_report(y_test, y_pred))

# imbalanced-learn's report: adds specificity (spe), geometric mean (geo) and
# index balanced accuracy (iba) to the usual columns.
print(classification_report_imbalanced(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.85      0.89     68840
         1.0       0.22      0.41      0.29      7264

    accuracy                           0.81     76104
   macro avg       0.58      0.63      0.59     76104
weighted avg       0.86      0.81      0.83     76104

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.93      0.85      0.41      0.89      0.59      0.36     68840
        1.0       0.22      0.41      0.85      0.29      0.59      0.33      7264

avg / total       0.86      0.81      0.45      0.83      0.59      0.36     76104



In [30]:
from imblearn.pipeline import Pipeline

# Rebuild features and labels from `df` so this cell does not rely on X / y
# left over from earlier cells.
X = df.drop(labels=['HeartDiseaseorAttack'], axis=1)
y = df['HeartDiseaseorAttack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Seeded samplers: oversample the minority class to a 0.3 ratio, then
# undersample the majority class to a 0.7 ratio. Seeding makes the run reproducible.
over = RandomOverSampler(sampling_strategy=0.3, random_state=0)
under = RandomUnderSampler(sampling_strategy=0.7, random_state=0)

# Use a fresh, unfitted, seeded tree as the final step. Passing the `model`
# object fitted in an earlier cell would refit (and overwrite) that estimator
# in place when the pipeline is fitted.
# The imbalanced-learn Pipeline applies the samplers while fitting only;
# predict() sends the data straight to the classifier.
pipeline = Pipeline([
    ('over', over),
    ('under', under),
    ('model', DecisionTreeClassifier(random_state=0)),
])
pipeline.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_hat = pipeline.predict(X_test)
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

         0.0       0.93      0.85      0.89     68840
         1.0       0.22      0.41      0.29      7264

    accuracy                           0.81     76104
   macro avg       0.58      0.63      0.59     76104
weighted avg       0.86      0.81      0.83     76104

