In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
df = pd.read_csv("cleaned_heart.csv")

In [3]:
df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,-1.43314,0.414885,0.834754,0,1.382928,-0.727592,0,1,1,0,0,1,0,0,0,1
1,-0.478484,1.527224,-1.210675,0,0.754157,0.282891,1,0,0,1,0,1,0,0,1,0
2,-1.751359,-0.141284,0.722161,0,-1.525138,-0.727592,0,1,1,0,0,0,1,0,0,1
3,-0.584556,0.303651,-0.572651,0,-1.132156,0.282891,1,0,0,0,0,1,0,1,1,0
4,0.051881,0.971054,-0.929194,0,-0.581981,-0.727592,0,1,0,1,0,1,0,0,0,1


In [5]:
df.columns

Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'HeartDisease', 'Sex_M', 'ChestPainType_ATA', 'ChestPainType_NAP',
       'ChestPainType_TA', 'RestingECG_Normal', 'RestingECG_ST',
       'ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up'],
      dtype='object')

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [9]:
X = df.drop('HeartDisease', axis=1)
y = df["HeartDisease"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) 

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_Scaled = scaler.fit_transform(X_test)

In [12]:
models = {
    "Logistic Regression" : LogisticRegression(),
    "KNN" : KNeighborsClassifier(), 
    "SVM" : SVC(), 
    "Decision tree" : DecisionTreeClassifier(),
    "Naive Bayes" : GaussianNB()
}

In [13]:
result = []

In [15]:
for name,model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_Scaled)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    result.append({
        'model': name,
        'Accuracy': round(acc, 4),
        'f1 score': round(f1, 2)
    })

In [16]:
result

[{'model': 'Logistic Regression', 'Accuracy': 0.8713, 'f1 score': 0.89},
 {'model': 'KNN', 'Accuracy': 0.8449, 'f1 score': 0.86},
 {'model': 'SVM', 'Accuracy': 0.8614, 'f1 score': 0.88},
 {'model': 'Decision tree', 'Accuracy': 0.7492, 'f1 score': 0.77},
 {'model': 'Naive Bayes', 'Accuracy': 0.8581, 'f1 score': 0.87}]

In [17]:
import joblib
joblib.dump(models['KNN'], 'KNN_heart.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(X.columns.tolist(), 'columns.pkl')

['columns.pkl']

In [18]:
df = sns.load_dataset('iris')

In [19]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [20]:
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X = df.drop('species', axis = 1)
y = df['species']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
model_knn = KNeighborsClassifier(n_neighbors=5)

In [26]:
model_knn.fit(X_train,y_train)

In [27]:
model_knn.score(X_test, y_test)

0.98

In [28]:
from sklearn.svm import SVC

In [32]:
model_svm = SVC(gamma='auto')

In [33]:
model_svm.fit(X_train, y_train)

In [34]:
model_svm.score(X_test, y_test)

1.0

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
classifier = GridSearchCV((model_svm), {
    'C': [10, 20, 30, 40 ,50],
    'kernel': ['rbf', 'linear'],
}, cv = 5, return_train_score= False)

In [37]:
classifier.fit(X,y)

In [38]:
classifier.cv_results_

{'mean_fit_time': array([0.00304537, 0.00431571, 0.00330253, 0.00310564, 0.0025363 ,
        0.00282502, 0.00202732, 0.0026073 , 0.0026135 , 0.00337324]),
 'std_fit_time': array([0.00063992, 0.00363883, 0.00087033, 0.00020014, 0.00042696,
        0.00076465, 0.00064361, 0.00059249, 0.00048047, 0.00073548]),
 'mean_score_time': array([0.002917  , 0.00271325, 0.00204167, 0.00202866, 0.00160317,
        0.00219827, 0.00180054, 0.00180068, 0.00218773, 0.00250769]),
 'std_score_time': array([0.00082278, 0.00134917, 0.00029658, 0.00063375, 0.00048513,
        0.00073385, 0.00077028, 0.00073687, 0.000602  , 0.00045663]),
 'param_C': masked_array(data=[10, 10, 20, 20, 30, 30, 40, 40, 50, 50],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear',
                    'rbf', 'linear', 'rbf', 'linear'],
 

In [39]:
results = pd.DataFrame(classifier.cv_results_)

In [40]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003045,0.00064,0.002917,0.000823,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.004316,0.003639,0.002713,0.001349,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,2
2,0.003303,0.00087,0.002042,0.000297,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,3
3,0.003106,0.0002,0.002029,0.000634,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,4
4,0.002536,0.000427,0.001603,0.000485,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.933333,1.0,0.96,0.038873,6
5,0.002825,0.000765,0.002198,0.000734,30,linear,"{'C': 30, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,6
6,0.002027,0.000644,0.001801,0.00077,40,rbf,"{'C': 40, 'kernel': 'rbf'}",1.0,0.966667,0.9,0.933333,1.0,0.96,0.038873,6
7,0.002607,0.000592,0.001801,0.000737,40,linear,"{'C': 40, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,6
8,0.002613,0.00048,0.002188,0.000602,50,rbf,"{'C': 50, 'kernel': 'rbf'}",1.0,0.966667,0.9,0.933333,1.0,0.96,0.038873,6
9,0.003373,0.000735,0.002508,0.000457,50,linear,"{'C': 50, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,4


In [41]:
from sklearn.model_selection import RandomizedSearchCV

In [42]:
classifier = RandomizedSearchCV((model_svm), {
    'C': [10, 20, 30, 40 ,50],
    'kernel': ['rbf', 'linear'],
}, n_iter=4, cv = 5, return_train_score= False)

In [43]:
classifier.fit(X,y)

In [44]:
results_ = classifier.cv_results_

In [46]:
results_

{'mean_fit_time': array([0.00239782, 0.00330315, 0.00310874, 0.0025116 ]),
 'std_fit_time': array([0.00048767, 0.00087105, 0.00066497, 0.00043658]),
 'mean_score_time': array([0.00211096, 0.00230355, 0.00209579, 0.00160713]),
 'std_score_time': array([0.00036796, 0.00040446, 0.00067573, 0.0004965 ]),
 'param_kernel': masked_array(data=['linear', 'linear', 'linear', 'rbf'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_C': masked_array(data=[30, 20, 10, 40],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'kernel': 'linear', 'C': 30},
  {'kernel': 'linear', 'C': 20},
  {'kernel': 'linear', 'C': 10},
  {'kernel': 'rbf', 'C': 40}],
 'split0_test_score': array([1., 1., 1., 1.]),
 'split1_test_score': array([1.        , 1.        , 1.        , 0.96666667]),
 'split2_test_score': array([0.9, 0.9, 0.9, 0.9]),
 'split3_test_score': array([0.9       , 0.93333333, 0.96