In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier

# Train a tuned decision tree on a 10,000-row sample of the heart-disease
# data and compare its test accuracy with a majority-class baseline.

# read_csv already returns a DataFrame, so no extra wrapping is needed.
data = pd.read_csv('heart_2020_cleaned.csv')

# Fixed-seed subsample keeps the grid search tractable and repeatable.
df = data.sample(n=10000, random_state=42)

# separate X and y
X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']

# get dummies
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# get_dummies(columns=...) only replaces the listed columns and leaves every
# other column in the result, so the numeric features are already present.
# Concatenating df[numerical_columns] again would duplicate each of them.
df_encoded = pd.get_dummies(X, columns=categorical_columns)
X = df_encoded

# split data BEFORE scaling so that no test-set statistics leak into the
# preprocessing step.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# scale: fit on the training portion only, then apply the same transform to
# the held-out portion.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# define the model; a fixed seed makes the 'sqrt' feature sub-sampling (and
# therefore the whole search) reproducible.
classifier = DecisionTreeClassifier(random_state=42)

# NOTE(review): the larger ccp_alpha candidates (1, 10) look big enough to
# prune the tree down to a single node -- confirm they are intended.
params_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 5, 8],
    'max_leaf_nodes': [10, 15, 20],
    'max_features': [1.0, 'sqrt'],
    'ccp_alpha': [0.001, 0.01, 0.1, 1, 10],
}

# define a grid search model
grid_search = GridSearchCV(classifier, params_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

# define new classifier model using the best parameters in grid model.
# Unpacking best_params forwards EVERY tuned hyper-parameter (including
# max_leaf_nodes), so the refit model matches what the search selected.
best_model = DecisionTreeClassifier(**best_params, random_state=42)

best_model.fit(X_train, y_train)

# predict using the new model
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

# define a baseline model that always predicts the most frequent class
baseline_model = DummyClassifier(strategy='most_frequent')
baseline_model.fit(X_train, y_train)

baseline_pred = baseline_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_pred)

# print
print('My model accuracy= ', accuracy)
print('baseline accuracy = ', baseline_accuracy)

My model accuracy=  0.9045
baseline accuracy =  0.9045
