In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('heart_cleveland_upload.csv')

In [3]:
df.slope.unique()

array([1, 0, 2], dtype=int64)

In [4]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


In [5]:
df.groupby(df.condition).count()

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,160,160,160,160,160,160,160,160,160,160,160,160,160
1,137,137,137,137,137,137,137,137,137,137,137,137,137


In [6]:
df['age'].nunique()

41

In [7]:
columns = df.columns[1:-1]
y = df.columns[-1:]
columns

Index(['sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
       'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [8]:
target = df['condition']

In [9]:
#Create dummies
to_produce = []
for col in columns: 
    to_produce.append(pd.get_dummies(df[col], drop_first=False, prefix=col, dtype=int))
data = pd.concat(to_produce, axis = 1).sort_index()

In [10]:
data['condition'] = target 
data

Unnamed: 0,sex_0,sex_1,cp_0,cp_1,cp_2,cp_3,trestbps_94,trestbps_100,trestbps_101,trestbps_102,...,slope_1,slope_2,ca_0,ca_1,ca_2,ca_3,thal_0,thal_1,thal_2,condition
0,0,1,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,0
3,0,1,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1
4,0,1,1,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,1
293,0,1,0,0,0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,1,1
294,0,1,0,0,0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,1,1
295,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [11]:
#Train models
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [12]:
X = data.drop(['condition'],axis=1)
y = data['condition']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=66)
rfcModel =  RandomForestClassifier(n_estimators=20, max_depth=100,max_features=5)

In [13]:
rfcModel.fit(X_train,y_train)

RandomForestClassifier(max_depth=100, max_features=5, n_estimators=20)

In [14]:
rfcModel_pred = rfcModel.predict(X_test)

In [15]:
print(confusion_matrix(y_test,rfcModel_pred))
print(classification_report(y_test,rfcModel_pred))

[[33  4]
 [ 7 31]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.86        37
           1       0.89      0.82      0.85        38

    accuracy                           0.85        75
   macro avg       0.86      0.85      0.85        75
weighted avg       0.86      0.85      0.85        75



In [16]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


In [17]:
#Create dummies for categorical columns
catColumns = ['sex','cp','fbs','restecg','exang','slope','ca','thal']
new_to_produce = []
for col in catColumns: 
    new_to_produce.append(pd.get_dummies(df[col], drop_first=False, prefix=col, dtype=int))
dataLog = pd.concat(new_to_produce, axis = 1).sort_index()

In [18]:
dataLog['condition'] = df['condition']
columns_to_fill = dataLog.columns
columns_to_fill = columns_to_fill[:-1]
df_to_fill = pd.DataFrame(columns = columns_to_fill)

In [19]:
Ex = dataLog.drop(['condition'],axis=1)
Ouaille = dataLog['condition']
Ex_train, Ex_test, Ouaille_train, Ouaille_test = train_test_split(Ex, Ouaille, test_size=0.2, stratify=Ouaille,random_state=2)

In [20]:
#A Logistic Regression is more appropriate for this type of model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,r2_score
from sklearn.model_selection import GridSearchCV
logmod=LogisticRegression()
logmod.fit(Ex,Ouaille)

LogisticRegression()

In [21]:
logmod_pred=logmod.predict(Ex_test)

In [22]:
print(confusion_matrix(Ouaille_test,logmod_pred))
print(classification_report(Ouaille_test,logmod_pred))
accuracy_score(Ouaille_test,logmod_pred)*100

[[31  1]
 [ 4 24]]
              precision    recall  f1-score   support

           0       0.89      0.97      0.93        32
           1       0.96      0.86      0.91        28

    accuracy                           0.92        60
   macro avg       0.92      0.91      0.92        60
weighted avg       0.92      0.92      0.92        60



91.66666666666666

In [23]:
#Let's do a grid search
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

#pipLog = make_pipeline(StandardScaler(), LogisticRegression(random_state=2))
parameters = [{'penalty':['l1','l2']}, 
              {'C': np.logspace(-3,3,7)}]
grid_search = GridSearchCV(estimator = logmod,  
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           verbose=0)


grid_search.fit(Ex_train, Ouaille_train)

Traceback (most recent call last):
  File "C:\Users\Kamen\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kamen\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Kamen\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid=[{'penalty': ['l1', 'l2']},
                         {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])}],
             scoring='accuracy')

In [24]:
print(grid_search.best_score_)
print(grid_search.best_params_)
bestestim = grid_search.best_estimator_
print('Test accuracy: %.3f' % bestestim.score(Ex_test, Ouaille_test))

0.835144927536232
{'C': 0.01}
Test accuracy: 0.867


In [25]:
#new_input_log = [[1,2,130,10,130,0,90,1,0,1,0,0]]
#logmod.predict(new_input_log)

In [26]:
#Let's do a grid search
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [27]:
#pip = make_pipeline(StandardScaler(), RandomForestClassifier(criterion='gini', random_state=1))
#param_grid_rfc = [{
#    'randomforestclassifier__max_depth':[2,5,10,100,200],
#    'randomforestclassifier__max_features':[2, 3, 4, 5, 6,8,10],
#    'randomforestclassifier__n_estimators':[1,5,10,20,100]
#}]

#gs = GridSearchCV(estimator=pip,
#                     param_grid = param_grid_rfc,
#                     scoring='accuracy',
#                     cv=10,
#                     refit=True,
#                     n_jobs=1)

In [28]:
#Test the model 

#Comment out when running app

#gs = gs.fit(X_train, y_train)
# Print the training score of the best model
#print(gs.best_score_)

# Print the model parameters of the best model
#print(gs.best_params_)

# Print the test score of the best model
#clfRFC = gs.best_estimator_
#print('Test accuracy: %.3f' % clfRFC.score(X_test, y_test))

In [29]:
columns

Index(['sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
       'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [30]:
new_input = [[1,2,130,10,130,0,90,1,0,1,0,0]]
input_df = pd.DataFrame(new_input, columns = columns)

In [31]:
input_df

Unnamed: 0,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,1,2,130,10,130,0,90,1,0,1,0,0


In [32]:
#Create dummies
to_produce_input = []
for col in columns: 
    to_produce_input.append(pd.get_dummies(input_df[col], drop_first=False, prefix=col, dtype=int))
data_input = pd.concat(to_produce_input, axis = 1).sort_index()

In [33]:
data_input

Unnamed: 0,sex_1,cp_2,trestbps_130,chol_10,fbs_130,restecg_0,thalach_90,exang_1,oldpeak_0,slope_1,ca_0,thal_0
0,1,1,1,1,1,1,1,1,1,1,1,1


In [34]:
#Create an empty df to have all columns
to_fill = pd.DataFrame(columns = X.columns)
to_fill

Unnamed: 0,sex_0,sex_1,cp_0,cp_1,cp_2,cp_3,trestbps_94,trestbps_100,trestbps_101,trestbps_102,...,slope_0,slope_1,slope_2,ca_0,ca_1,ca_2,ca_3,thal_0,thal_1,thal_2


In [35]:
data_input

Unnamed: 0,sex_1,cp_2,trestbps_130,chol_10,fbs_130,restecg_0,thalach_90,exang_1,oldpeak_0,slope_1,ca_0,thal_0
0,1,1,1,1,1,1,1,1,1,1,1,1


In [36]:
final_input_df = data_input.join(to_fill[to_fill.columns.difference(data_input.columns)])
final_input_df = final_input_df.fillna(0)
final_input_df = final_input_df.reindex(columns=to_fill.columns)

In [37]:
to_iterate = df.columns[:-1]
data_to_check= []
for col in to_iterate: 
    data_to_check.append(df[col].unique()) 
data_to_check

[array([69, 66, 65, 64, 63, 61, 60, 59, 58, 56, 52, 51, 45, 42, 40, 38, 34,
        74, 71, 70, 62, 57, 55, 54, 50, 49, 48, 46, 44, 41, 35, 29, 76, 68,
        67, 53, 47, 43, 39, 37, 77], dtype=int64),
 array([1, 0], dtype=int64),
 array([0, 1, 2, 3], dtype=int64),
 array([160, 140, 150, 138, 110, 170, 145, 134, 178, 120, 118, 152, 125,
        148, 156, 128, 136, 130, 124, 154, 135, 132, 108, 192, 101, 105,
        112, 126, 122, 180, 115, 146, 155, 102, 172, 100,  94, 129, 142,
        144, 106, 117, 158, 174, 164, 114, 165, 200, 123, 104], dtype=int64),
 array([234, 239, 226, 282, 211, 227, 233, 240, 270, 288, 273, 204, 283,
        193, 186, 298, 213, 264, 244, 199, 231, 182, 269, 302, 245, 246,
        195, 281, 208, 221, 284, 319, 236, 261, 232, 294, 250, 262, 342,
        309, 325, 201, 205, 271, 266, 229, 197, 160, 308, 263, 219, 220,
        295, 203, 198, 235, 306, 157, 192, 210, 265, 254, 274, 277, 564,
        212, 278, 417, 360, 335, 313, 252, 243, 185, 318, 178, 218, 230

In [38]:
to_feed = final_input_df.iloc[0].to_numpy()
to_feed

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [39]:
rfcModel.predict(to_feed.reshape(1,-1))

array([0], dtype=int64)

In [40]:
columns

Index(['sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
       'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [41]:
columns_to_fill

Index(['sex_0', 'sex_1', 'cp_0', 'cp_1', 'cp_2', 'cp_3', 'fbs_0', 'fbs_1',
       'restecg_0', 'restecg_1', 'restecg_2', 'exang_0', 'exang_1', 'slope_0',
       'slope_1', 'slope_2', 'ca_0', 'ca_1', 'ca_2', 'ca_3', 'thal_0',
       'thal_1', 'thal_2'],
      dtype='object')

In [42]:
# WebApp starts here
from pywebio.input import input, FLOAT, input_group, checkbox, radio
from pywebio.output import put_text
from pywebio.output import *

#define block functions
#def sexBlock(x): 
#    if x != 1 or x != 0: 
#        return "Input not valid"

input_data = []
def heart():
    #Age input start
    #Age = input("Input your age：", type=FLOAT)
    #input_data.append(Age)
    info = input_group("Fill this form",
        [radio('Input your sex', options=['Male','Female'],name='Sex'),
         checkbox('Input chest paint type', options=['Typical angina',\
                                                     'Atypical angina',\
                                                    'Non-anginal pain',\
                                                    'Asymptomatic'],\
                 name='ChestPain'),
         input("Input resting blood pressure (in mm Hg): ",name='RestingBloodPressure',type=FLOAT),
         input("Input serum cholestoral in mg/dl: ", name='Cholesterol',type=FLOAT),
         checkbox("Fasting blood sugar > 120 mg/dl",options=['Yes','No'],name='FastingBloodSugar'),
        checkbox('Resting electrocardiographic results',\
                 options=['Normal',\
                          'Having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)',\
                          'Showing probable or definite left ventricular hypertrophy by Estes\' criteria'],\
                name = 'Resting_electrocardiographic'),
        input("Input maximum heart rate achieved: ",name='MaxHeartRate',type=FLOAT),
        checkbox("Does the patient have exercise induced angina: ", options=['Yes','No'],name ='ExerciseInducedAngina'),
        input("Input ST depression induced by exercise relative to rest (Value should vary between 0 and 7): "\
                    ,type=FLOAT,\
             name='STDepression'),
        checkbox("Input the slope of the peak exercise ST segment: ",options=['Upsloping',\
                                                                              'Flat',\
                                                                              'Downsloping'],name='Slope'),
        checkbox("Number of major vessels colored by fluorosopy", options=['0','1','2','3'],
                name='MajorVessels'),
        checkbox("Input thalessemia level", options=['Normal',\
                                                     'Fixed defect',\
                                                     'Reversable defect'],
                name='Thalessemia')])
    
    #Create dictionaries
    sex_dict = {
        'Male': 1,
        'Female': 0
    }
    
    cp_dict = {
        'Typical angina' : 0,
         'Atypical angina' : 1,
         'Non-anginal pain': 2,
         'Asymptomatic': 3
    }
    
    fbs_dict = {
        'Yes' : 1,
        'No' : 0
    }
    
    restecg_dict = {
        'Normal' : 0,
        'Having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)' : 1, 
        'Showing probable or definite left ventricular hypertrophy by Estes\' criteria' : 2
    }
    
    exang_dict = { 
        'Yes' : 1,
        'No' : 0
    }
    
    slope_dict ={
        'Upsloping' : 0,
        'Flat' : 1,
        'Downsloping' : 2
    }
    
    thal_dict = {
        'Normal' : 0,
        'Fixed defect' : 1,
        'Reversable defect' : 2
    }
    
    input_data = [[sex_dict[info['Sex']],\
                    cp_dict[info['ChestPain'][0]],\
                    info['RestingBloodPressure'],\
                    info['Cholesterol'],\
                    fbs_dict[info['FastingBloodSugar'][0]],\
                    restecg_dict[info['Resting_electrocardiographic'][0]],\
                    info['MaxHeartRate'],\
                    exang_dict[info['ExerciseInducedAngina'][0]],\
                    info['STDepression'],\
                   slope_dict[info['Slope'][0]],\
                   int(info['MajorVessels'][0]),\
                   thal_dict[info['Thalessemia'][0]]]]
   # print(input_data)
    
    input_df = pd.DataFrame(input_data, columns = columns)
   # print(input_df)
    
    catColumns = ['sex','cp','fbs','restecg','exang','slope','ca','thal']
    new_input_to_produce = []
    for col in columns:
        if col not in catColumns: 
            new_input_to_produce.append(input_df[col])
        else: 
            new_input_to_produce.append(pd.get_dummies(input_df[col], drop_first=False, prefix=col, dtype=int))
    dataLogInput = pd.concat(new_input_to_produce, axis = 1).sort_index()
    #print(dataLogInput)
    
    final_new_input_df = dataLogInput.join(df_to_fill[df_to_fill.columns.difference(dataLogInput.columns)])
    final_new_input_df = final_new_input_df.fillna(0)
    final_new_input_df = final_new_input_df.reindex(columns=df_to_fill.columns)
    #print(final_new_input_df)
    
    to_feed = final_new_input_df.iloc[0].to_numpy()
    
    if logmod.predict(to_feed.reshape(1,-1)) == [1]: 
        put_text("You have a heart disease (accuracy: 91%)")
    else: 
        put_text("You do not have a heart disease (accuracy: 91%)")

if __name__ == '__main__':
    heart()