# import libaries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from collections import Counter

# import the dataset and balance labels

In [2]:
df=pd.read_csv('../heart_disease/heart_disease.csv')
#df.isnull().sum() #check for null values 
#df.columns

In [3]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1)
y = df['HeartDiseaseorAttack']

# Feature Selection

In [4]:
from sklearn.feature_selection import RFECV

#balanced_accuracy, or f1_weighted
rfecv = RFECV(estimator= DecisionTreeClassifier(), step = 1, cv = 10, scoring="balanced_accuracy", n_jobs=-1)
rfecv = rfecv.fit(X, y)

print("The optimal number of features:", rfecv.n_features_)
print("Best features:", X.columns[rfecv.support_])


X_new = rfecv.transform(X)


The optimal number of features: 17
Best features: Index(['HighBP', 'HighChol', 'BMI', 'Smoker', 'Stroke', 'Diabetes',
       'PhysActivity', 'Fruits', 'Veggies', 'GenHlth', 'MentHlth', 'PhysHlth',
       'DiffWalk', 'Sex', 'Age', 'Education', 'Income'],
      dtype='object')


# Resampling the data

In [16]:
X_train, X_test, y_train , y_test = train_test_split(X_new,y,test_size=0.3, random_state=1234) 

over=  RandomOverSampler(sampling_strategy=0.2)
under= RepeatedEditedNearestNeighbours(sampling_strategy='majority', max_iter=100,n_neighbors=7, kind_sel='all', n_jobs=-1)

X_balanced, y_balanced = over.fit_resample(X_train, y_train)
X_balanced, y_balanced = under.fit_resample(X_balanced, y_balanced)

print(f'Y balanced {Counter(y_balanced)}')

Y balanced Counter({0.0: 96314, 1.0: 32189})


# Scaling the data

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_balanced)

scaleddata=scaler.transform(X_balanced)

In [11]:
print(scaleddata.dtype.names)

None


In [18]:
# calculate heuristic class weighting
from sklearn.utils.class_weight import compute_class_weight

weighting = compute_class_weight(class_weight='balanced', classes=[0,1], y=y)
print(weighting)

[0.55198945 5.30866781]


# Logistic regression

In [26]:
from sklearn.linear_model import LogisticRegression

#hyperparameter tunning
#grid search 

from sklearn.model_selection import GridSearchCV
penalty=["None", "l2"]
C = [0.001, 0.01, 0.1, 1, 10]
class_weight=[{0:0.55,1:5.3},{0:5.3,1:0.55},{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
solver= ["lbfgs", "newton-cg", "newton-cholesky", "sag", "saga"]

hyperparameters=dict(penalty=penalty, C=C, class_weight=class_weight, solver=solver )

model=LogisticRegression()

param_search=GridSearchCV(model,hyperparameters,cv=10, scoring="roc_auc", n_jobs=-1)

#knn_param_search.fit(X_train,y_train)

best_model = param_search.fit(scaleddata,y_balanced)

print('Best leaf_size:', best_model.best_estimator_.get_params()['penalty'])
print('Best p:', best_model.best_estimator_.get_params()['C'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['class_weight'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['solver'])

2100 fits failed out of a total of 3500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1400 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\lucas\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\lucas\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\lucas\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 55, in _check_solver
    raise ValueError(
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got None.


Best leaf_size: l2
Best p: 0.01
Best n_neighbors: {0: 1, 1: 1}
Best n_neighbors: lbfgs


# Decision Tree check training score

In [None]:
from sklearn.tree import DecisionTreeClassifier
#hyperparameter tunning
#grid search 

from sklearn.model_selection import GridSearchCV
criterion =["None", "l2"]
max_depth  = [0.001, 0.01, 0.1, 1, 10]
max_leaf_nodes =[{0:0.55,1:5.3},{0:5.3,1:0.55},{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
min_samples_split = ["lbfgs", "newton-cg", "newton-cholesky", "sag", "saga"]
min_samples_leaf 

hyperparameters=dict(penalty=penalty, C=C, class_weight=class_weight, solver=solver )

model=DecisionTreeClassifier()

param_search=GridSearchCV(model,hyperparameters,cv=10, scoring="roc_auc", n_jobs=-1)

#knn_param_search.fit(X_train,y_train)

best_model = param_search.fit(scaleddata,y_balanced)

print('Best leaf_size:', best_model.best_estimator_.get_params()['penalty'])
print('Best p:', best_model.best_estimator_.get_params()['C'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['class_weight'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['solver'])

In [8]:
from sklearn.tree import DecisionTreeClassifier

model2= DecisionTreeClassifier()

pipeline = Pipeline([('StandardScaler', StandardScaler()), ('over', over), ('under', under), ('model2', model2)])

pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.93      0.76      0.47      0.84      0.60      0.37     68840
        1.0       0.17      0.47      0.76      0.25      0.60      0.35      7264

avg / total       0.86      0.73      0.50      0.78      0.60      0.37     76104



# SVM

In [None]:
from sklearn.svm import SVC

X_train, X_test, y_train , y_test = train_test_split(X_new,y,test_size=0.3, random_state=0) 

model2= SVC(random_state=0)

pipeline = Pipeline([('StandardScaler', StandardScaler()), ('over', over), ('under', under), ('model2', model2)])


pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

# KNN

In [38]:
from sklearn.neighbors import KNeighborsClassifier 

X_train, X_test, y_train , y_test = train_test_split(X_new,y,test_size=0.3, random_state=0) 

model2= KNeighborsClassifier(n_neighbors=5, n_jobs= -1)

pipeline = Pipeline([('StandardScaler', StandardScaler()), ('over', over), ('under', under), ('model2', model2)])


pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.96      0.76      0.69      0.85      0.72      0.52     68840
        1.0       0.23      0.69      0.76      0.34      0.72      0.52      7264

avg / total       0.89      0.75      0.70      0.80      0.72      0.52     76104



# NB classifier

In [13]:
from sklearn.naive_bayes import CategoricalNB
    
X_train, X_test, y_train , y_test = train_test_split(X_new,y,test_size=0.3, random_state=0) 

model2= CategoricalNB()

pipeline = Pipeline([('over', over), ('under', under), ('model2', model2)])


pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat)) 
print(confusion_matrix(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.96      0.76      0.72      0.85      0.74      0.55     68840
        1.0       0.24      0.72      0.76      0.36      0.74      0.54      7264

avg / total       0.89      0.76      0.72      0.80      0.74      0.55     76104

[[52552 16288]
 [ 2062  5202]]
