import libraries



In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


import models

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

read the data and separate it into X (features) and Y (target)

In [6]:
# Load heart.csv, then split it into the label column Y ('target')
# and the feature frame X (every remaining column).
heart_df = pd.read_csv('heart.csv')
Y = heart_df['target']
X = heart_df.drop(columns=['target'])

get some info about the dataset

In [7]:
# Number of missing values in each column.
print(heart_df.isna().sum())

# Summary statistics (count, mean, std, quartiles, ...) of the columns.
print(heart_df.describe())

# Row count per target value; left as the last expression so the
# notebook renders it as the cell output.
Y.value_counts()


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
              age         sex          cp    trestbps        chol         fbs  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  303.000000   
mean    54.366337    0.683168    0.966997  131.623762  246.264026    0.148515   
std      9.082101    0.466011    1.032052   17.538143   51.830751    0.356198   
min     29.000000    0.000000    0.000000   94.000000  126.000000    0.000000   
25%     47.500000    0.000000    0.000000  120.000000  211.000000    0.000000   
50%     55.000000    1.000000    1.000000  130.000000  240.000000    0.000000   
75%     61.000000    1.000000    2.000000  140.000000  274.500000    0.000000   
max     77.000000    1.000000    3.000000  200.000000  564.000000    1.000000   

          restecg     thalach       exang     oldpeak       s

1    165
0    138
Name: target, dtype: int64

# model selection

# compare different models without hyperparameter tuning

create the models

In [8]:
# Baseline candidates compared without any hyperparameter tuning.
# random_state pins the random forest's internal randomness so the
# comparison prints the same score on every "Restart & Run All"
# (42 is an arbitrary fixed seed).
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    RandomForestClassifier(random_state=42),
    KNeighborsClassifier(),
]


train and evaluate the models

In [9]:

# Score every baseline model with 5-fold cross-validation and report the
# average of the per-fold scores.
for model in models:
    # One score per fold (cv=5 -> five values).
    all_accuracy = cross_val_score(model, X, Y, cv=5)
    mean_accuracy = np.mean(all_accuracy)
    print('accuracy of model', model, '=', mean_accuracy)


accuracy of model LogisticRegression(max_iter=1000) = 0.8282513661202187
accuracy of model SVC(kernel='linear') = 0.8283060109289618
accuracy of model RandomForestClassifier() = 0.8249726775956283
accuracy of model KNeighborsClassifier() = 0.643879781420765


# compare different models with hyperparameter tuning

In [25]:
# Tune each candidate with an exhaustive grid search (5-fold CV), then print
# the mean test score of every parameter combination and the best one found.

# random_state pins the random forest's internal randomness so the reported
# scores are identical on every "Restart & Run All" (42 is an arbitrary seed).
models_list = [
    LogisticRegression(max_iter = 10000),
    SVC(),
    RandomForestClassifier(random_state = 42),
    KNeighborsClassifier(),
]

# One parameter grid per estimator, listed in the same order as models_list.
models_hyperparameters = {
    'log_regression_params': {
        'C': [1, 5, 10, 20]
    },
    'svc_params': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 5, 10, 20]
    },
    'random_forest_params': {
        'n_estimators': [10, 20, 50, 100]
    },
    'kneighbors_params': {
        'n_neighbors': [3, 5, 10]
    }

}

models_keys = list(models_hyperparameters.keys())

# Estimators and grids are matched by position; zip() would silently drop
# unmatched entries, so fail loudly if the two collections drift apart.
if len(models_list) != len(models_keys):
    raise ValueError(
        'models_list and models_hyperparameters must have the same number of entries'
    )

# Fitted GridSearchCV objects, in the same order as models_list, kept so that
# the tuned estimators stay available after this cell has run.
models_classifiers = []
for model, params_key in zip(models_list, models_keys):
    classifier = GridSearchCV(model, models_hyperparameters[params_key], cv = 5)
    classifier.fit(X, Y)
    models_classifiers.append(classifier)

    # Keep only the columns needed to compare parameter combinations.
    result = pd.DataFrame(classifier.cv_results_)
    result = result[['mean_test_score', 'params']]
    print(result)
    print('best accuracy of model', model, classifier.best_score_)
    print('best params of model', model, classifier.best_params_)
    print('-------------------------------------------------------------')

   mean_test_score     params
0         0.828251   {'C': 1}
1         0.831585   {'C': 5}
2         0.828306  {'C': 10}
3         0.828306  {'C': 20}
best accuracy of model LogisticRegression(max_iter=10000) 0.831584699453552
best params of model LogisticRegression(max_iter=10000) {'C': 5}
-------------------------------------------------------------
    mean_test_score                          params
0          0.828306    {'C': 1, 'kernel': 'linear'}
1          0.653443      {'C': 1, 'kernel': 'poly'}
2          0.643497       {'C': 1, 'kernel': 'rbf'}
3          0.541311   {'C': 1, 'kernel': 'sigmoid'}
4          0.818306    {'C': 5, 'kernel': 'linear'}
5          0.699945      {'C': 5, 'kernel': 'poly'}
6          0.686503       {'C': 5, 'kernel': 'rbf'}
7          0.518033   {'C': 5, 'kernel': 'sigmoid'}
8          0.818470   {'C': 10, 'kernel': 'linear'}
9          0.699891     {'C': 10, 'kernel': 'poly'}
10         0.699781      {'C': 10, 'kernel': 'rbf'}
11         0.508142  {'