In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import os
os.environ['LOKY_MAX_CPU_COUNT'] = "6"

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [5]:
y = df["HeartDisease"]
x = df.drop("HeartDisease", axis="columns")

In [6]:
x = pd.get_dummies(x, drop_first=True)
x.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,True,True,False,False,True,False,False,False,True
1,49,160,180,0,156,1.0,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,True,True,False,False,False,True,False,False,True
3,48,138,214,0,108,1.5,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,True,False,True,False,True,False,False,False,True


In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

numerical_columns = x.select_dtypes(include=['int', 'float']).columns
x[numerical_columns] = scaler.fit_transform(df[numerical_columns])
x

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,-1.433140,0.410909,0.825070,-0.551341,1.382928,-0.832432,True,True,False,False,True,False,False,False,True
1,-0.478484,1.491752,-0.171961,-0.551341,0.754157,0.105664,False,False,True,False,True,False,False,True,False
2,-1.751359,-0.129513,0.770188,-0.551341,-1.525138,-0.832432,True,True,False,False,False,True,False,False,True
3,-0.584556,0.302825,0.139040,-0.551341,-1.132156,0.574711,False,False,False,False,True,False,True,True,False
4,0.051881,0.951331,-0.034755,-0.551341,-0.581981,-0.832432,True,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,-0.902775,-1.210356,0.596393,-0.551341,-0.188999,0.293283,True,False,False,True,True,False,False,True,False
914,1.536902,0.627078,-0.053049,1.813758,0.164684,2.357094,True,False,False,False,True,False,False,True,False
915,0.370100,-0.129513,-0.620168,-0.551341,-0.857069,0.293283,True,False,False,False,True,False,True,True,False
916,0.370100,-0.129513,0.340275,-0.551341,1.461525,-0.832432,False,True,False,False,False,False,False,True,False


In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
x = pca.fit_transform(x)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [10]:
models = {
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'knn': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
    },
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    }
}

In [11]:
from sklearn.model_selection import RandomizedSearchCV

scores = []
for name, info in models.items():
    model =  RandomizedSearchCV(info['model'], info['params'], cv=5, return_train_score=False)
    model.fit(x_train, y_train)
    scores.append({
        'model': name,
        'best_score': model.best_score_,
        'best_params': model.best_params_
    })
    
df = pd.DataFrame(scores, columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.85964,{'C': 10}
1,knn,0.840574,"{'n_neighbors': 7, 'algorithm': 'auto'}"
2,svm,0.855568,"{'kernel': 'linear', 'C': 10}"


In [14]:
best_model = models["logistic_regression"]["model"].set_params(C=10)
best_model.fit(x_train, y_train)
best_model.score(x_test, y_test)

0.8641304347826086

In [16]:
import joblib
joblib.dump(best_model, "model.pkl")

['model.pkl']