In [1]:
import pandas as pd
import numpy as np


In [4]:
# Load the heart-disease dataset. A forward slash is used because '\h' is not
# a valid string escape and a backslash separator only resolves on Windows;
# 'data/heart.csv' works on every platform.
df = pd.read_csv('data/heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [5]:
# Split the frame by position: every column except the last one is a feature,
# and the last column is the label to predict.
last_col = df.shape[1] - 1

X, y = df.iloc[:, :last_col], df.iloc[:, last_col]

In [6]:
# Display the feature matrix (every column of df except the last one).
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2


In [7]:
# Display the label vector (the last column of df).
y

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64

In [9]:
# Train test split

from sklearn.model_selection import train_test_split

# Reserve 20% of the rows as a hold-out set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
x_train.shape

(820, 13)

In [10]:
# List the feature names; the column list used for preprocessing is written out by hand from these.
X.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [11]:
# Automated Feature Scaling

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Columns that are routed through the numeric preprocessing steps below.
numerical_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]

# Fill missing values with the column median, then standardise each column.
numerical_pipeline = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(strategy='median')),
        ('Scaler', StandardScaler()),
    ]
)

# Apply the numeric pipeline to the listed columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('Numerical_pipeline', numerical_pipeline, numerical_columns),
    ]
)


In [15]:
# Learn the imputation and scaling statistics from the training split only,
# then apply that same fitted transformation to both splits.
preprocessor.fit(x_train)

x_train_scaled = preprocessor.transform(x_train)
x_test_scaled = preprocessor.transform(x_test)

In [42]:
# Automated model training

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Candidate classifiers, keyed by the name that appears in the accuracy report.
# The tree-based estimators are stochastic, so they are seeded (with the same
# value as the train/test split) to keep the reported scores and the chosen
# "best" model reproducible from one run to the next.
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(),
    'NaiveBayes': GaussianNB(),
    'RandomForest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier()}

# Hyper-parameter grid for each candidate, keyed by the same names as `models`.
# An empty grid means only the estimator's current settings are evaluated.
params = {
    'LogisticRegression': {},
    'DecisionTree': {},
    'SVM': {},
    'NaiveBayes': {},
    'RandomForest': {},
    'KNN': {}}

def evaluate_model(x_train_scaled, y_train, x_test_scaled, y_test, models, params, cv=5):
    """Grid-search every candidate model and report its test-set accuracy.

    Parameters
    ----------
    x_train_scaled, y_train
        Training features and labels used for the cross-validated search.
    x_test_scaled, y_test
        Held-out features and labels used to compute the reported accuracy.
    models : dict
        Maps a model name to the estimator to tune.
    params : dict
        Maps a model name to its hyper-parameter grid. A name with no entry
        is searched with an empty grid, i.e. the estimator's current settings.
    cv : int, optional
        Number of cross-validation folds passed to ``GridSearchCV`` (default 5).

    Returns
    -------
    dict
        Maps each model name to the accuracy of its tuned estimator on the
        test split.
    """
    report = {}
    for model_name, model in models.items():
        param_grid = params.get(model_name, {})

        gs = GridSearchCV(model, param_grid=param_grid, cv=cv)
        gs.fit(x_train_scaled, y_train)

        # With the default refit=True the search has already retrained the best
        # parameter combination on the full training data, and gs.predict
        # delegates to that estimator. Pushing best_params_ into the search
        # object via set_params would fail for any non-empty grid (they are
        # estimator parameters, not GridSearchCV parameters), and fitting the
        # search a second time would only repeat the same work.
        prediction = gs.predict(x_test_scaled)

        report[model_name] = accuracy_score(y_test, prediction)

    return report
    


model_report = evaluate_model(x_train_scaled, y_train, x_test_scaled, y_test, models, params)

# Minimum test accuracy a model must reach to be announced as the winner.
MIN_ACCEPTABLE_ACCURACY = 0.6

# Pick the name with the highest reported accuracy; on a tie the first name in
# the report wins.
best_modal_name = max(model_report, key=model_report.get)
best_modal_score = model_report[best_modal_name]
# The estimator registered under the winning name in `models`.
# NOTE(review): GridSearchCV fits clones, so this object is presumably still
# unfitted and needs a .fit() call before it can predict -- confirm before reuse.
best_modal = models[best_modal_name]

if best_modal_score < MIN_ACCEPTABLE_ACCURACY:
    print("No best model found")
else:
    print(f'The best model is {best_modal_name} with a accuracy score of {best_modal_score}')

The best model is DecisionTree with a accuracy score of 0.9853658536585366
