In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.neighbors import KNeighborsClassifier

In [2]:
heart_data = pd.read_csv('heart_disease_data.csv')
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [4]:
heart_data = heart_data[['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']]
heart_data.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex,cp,fbs,restecg,exang,slope,ca,thal,target
0,63,145,233,150,2.3,1,3,1,0,0,0,0,1,1
1,37,130,250,187,3.5,1,2,0,1,0,0,0,2,1
2,41,130,204,172,1.4,0,1,0,0,0,2,0,2,1
3,56,120,236,178,0.8,1,1,0,1,0,2,0,2,1
4,57,120,354,163,0.6,0,0,0,1,1,2,0,2,1


In [5]:
y = heart_data['target']
X = heart_data.drop(['target'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)

In [6]:
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore", drop='first'))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [8]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", KNeighborsClassifier())]
)

clf.fit(X_train, y_train)
# newarr = [[50,136,300,165,2.6,0,1,0,0,0,1,0,1]]
arr = {
            'age':50,
            'sex':1,
            'cp':0,
            'trestbps':120,
            'chol':250,
            'fbs':0,
            'restecg':1,
            'thalach':80,
            'exang':0,
            'oldpeak':52,
            'slope':1,
            'ca':2,
            'thal':2
        }
predictions=clf.predict(pd.DataFrame(arr,index=[0]))
print(predictions)

print("model score: %.3f" % clf.score(X_test, y_test))

[0]
model score: 0.810


In [9]:
clf.score(X_train, y_train)

0.8423645320197044

In [10]:
from joblib import dump

In [11]:
dump(clf,'model.joblib')

['model.joblib']