In [33]:
#ML Model
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pickle
%matplotlib inline

In [21]:
df = pd.read_csv('heart_cleveland_upload.csv')

In [22]:
columns = df.columns[1:-1]
y = df.columns[-1:]
columns

Index(['sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
       'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [23]:
#Create dummies for categorical columns
catColumns = ['sex','cp','fbs','restecg','exang','slope','ca','thal']
new_to_produce = []
for col in catColumns: 
    new_to_produce.append(pd.get_dummies(df[col], drop_first=False, prefix=col, dtype=int))
dataLog = pd.concat(new_to_produce, axis = 1).sort_index()

In [24]:
dataLog['condition'] = df['condition']
columns_to_fill = dataLog.columns
columns_to_fill = columns_to_fill[:-1]
df_to_fill = pd.DataFrame(columns = columns_to_fill)

In [25]:
X = dataLog.drop(['condition'],axis=1)
y = dataLog['condition']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                                  y, 
                                                                  test_size=0.2, 
                                                                  stratify=y,
                                                                  random_state=2)

In [26]:
#A Logistic Regression is more appropriate for this type of model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,r2_score
from sklearn.model_selection import GridSearchCV
logmod=LogisticRegression()
logmod.fit(X,y)

LogisticRegression()

In [27]:
logmod_pred=logmod.predict(X_test)

In [28]:
print(confusion_matrix(y_test,logmod_pred))
print(classification_report(y_test,logmod_pred))
accuracy_score(y_test,logmod_pred)*100

[[31  1]
 [ 4 24]]
              precision    recall  f1-score   support

           0       0.89      0.97      0.93        32
           1       0.96      0.86      0.91        28

    accuracy                           0.92        60
   macro avg       0.92      0.91      0.92        60
weighted avg       0.92      0.92      0.92        60



91.66666666666666

In [30]:
#Let's do a grid search
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

#pipLog = make_pipeline(StandardScaler(), LogisticRegression(random_state=2))
parameters = [{'penalty':['l1','l2']}, 
              {'C': np.logspace(-3,3,7)}]
grid_search = GridSearchCV(estimator = logmod,  
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           verbose=0)


grid_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "C:\Users\Kamen\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kamen\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Kamen\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid=[{'penalty': ['l1', 'l2']},
                         {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])}],
             scoring='accuracy')

In [31]:
print(grid_search.best_score_)
print(grid_search.best_params_)
bestestim = grid_search.best_estimator_
print('Test accuracy: %.3f' % bestestim.score(X_test, y_test))

0.835144927536232
{'C': 0.01}
Test accuracy: 0.867


In [35]:
#Pickle the model to be used in the app
pickle.dump(logmod, open('logmod.pkl', 'wb'))