In [571]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeRegressor
import pickle
import numpy as np


In [572]:
# Read the preprocessed records from the CSV file and preview the first rows.
data = pd.read_csv(filepath_or_buffer='preprocessed.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,age,chest_pain,rest_bpress,blood_sugar,rest_electro,max_heart_rate,exercice_angina,disease
0,0,43.0,asympt,140.0,0,normal,135.0,1.0,1.0
1,1,39.0,atyp_angina,130.0,0,normal,160.0,1.0,0.0
2,2,39.0,non_anginal,160.0,1,normal,160.0,0.0,0.0
3,5,50.0,asympt,140.0,0,normal,135.0,0.0,0.0
4,6,59.0,asympt,140.0,1,left_vent_hyper,119.0,1.0,1.0


In [573]:
# Number of model features: every column except the target ('disease') and
# the unnamed column ('Unnamed: 0').
n_features = data.shape[1] - 2

# Separate the features from the target/outcome.
# `columns=` is used instead of a positional axis argument: DataFrame.drop
# only accepts `labels` positionally from pandas 2.0 onwards.
x_feats = data.drop(columns=['disease', 'Unnamed: 0'])

# The target is stored as float in the CSV; the classifier needs integer labels.
y_feat = data['disease'].astype('int64')
y_feat.dtypes



dtype('int64')

In [574]:
#pre process the features
#x-dataframe containing the training features
def preprocess_features(x):
    """One-hot encode the categorical (text) columns of a feature DataFrame.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns. Columns holding text (object or pandas string
        dtype) are treated as categorical.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing ``x``'s index. Non-text columns are carried over
        unchanged; each text column ``col`` is replaced by indicator columns
        named ``col_<value>``. Original column order is preserved.
    """
    # Start from an empty frame on the same index so every join lines up row-wise.
    output = pd.DataFrame(index=x.index)
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for col, col_data in x.items():
        # Newer pandas may store text as a dedicated string dtype rather than
        # object, so accept both when deciding what is categorical.
        is_text = col_data.dtype == object or pd.api.types.is_string_dtype(col_data)
        if is_text:
            # Convert categorical data to dummy variables (one-hot encoding).
            col_data = pd.get_dummies(col_data, prefix=col)
        output = output.join(col_data)
    return output

In [575]:
 
    
#the function makes predictions and returns the F1 score and the accuracy of the model
#the F1 score is the harmonic mean of precision and recall; a value closer to 1 means better results

def predict_label(classifier,features,outcome):
    """Predict labels for ``features`` and score them against ``outcome``.

    Returns a ``(f1, accuracy)`` tuple: the value of ``f1_score`` for the
    predictions and the fraction of predictions equal to ``outcome``
    (a ratio, not a percentage).
    """
    predictions = classifier.predict(features)
    score = f1_score(outcome, predictions)
    # Element-wise comparison; summing the booleans counts the matches.
    n_correct = sum(outcome == predictions)
    accuracy = n_correct / float(len(predictions))
    return score, accuracy
def save_model(classifier, filename='model.pkl'):
    """Serialize ``classifier`` to ``filename`` with pickle.

    Parameters
    ----------
    classifier : object
        The (fitted) estimator to persist.
    filename : str, optional
        Destination path; defaults to ``'model.pkl'``. An existing file is
        overwritten.
    """
    # The context manager closes the file handle even if pickling fails.
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)


#the function calls the train method to train the model and predict
def model(classifier,x_train,y_train,x_test,y_test):
    """Fit ``classifier``, persist it, and print train/test scores.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``
    x_train, y_train : training features and labels
    x_test, y_test : held-out features and labels

    Side effects: fits ``classifier`` in place, writes it to ``model.pkl``
    via ``save_model`` and prints the F1 score and accuracy (as a percentage)
    for both datasets. Returns ``None``.
    """
    print("training dataset size",len(x_train))
    classifier.fit(x_train,y_train)
    # Persist right after fitting. Scoring below only calls predict, so one
    # save is enough.
    save_model(classifier)
    f1,accuracy=predict_label(classifier,x_train,y_train)
    print ("f1 score and accuracy of training dataset")
    print(f1,accuracy*100)
    f1,accuracy=predict_label(classifier,x_test,y_test)
    print ("f1 score and accuracy of testing dataset")
    print(f1,accuracy*100)
    
    
    
    
    
    
    
    
    

In [576]:
# One-hot encode the categorical feature columns, then cast every column
# to int64 before inspecting the resulting dtypes.
x_feats = preprocess_features(x_feats).astype('int64')
x_feats.dtypes

age                                   int64
chest_pain_asympt                     int64
chest_pain_atyp_angina                int64
chest_pain_non_anginal                int64
chest_pain_typ_angina                 int64
rest_bpress                           int64
blood_sugar                           int64
rest_electro_left_vent_hyper          int64
rest_electro_normal                   int64
rest_electro_st_t_wave_abnormality    int64
max_heart_rate                        int64
exercice_angina                       int64
dtype: object

In [577]:
# Split the data into training and testing datasets (20% held out); the fixed
# seed keeps the split identical across runs.
x_train,x_test,y_train,y_test=train_test_split(x_feats,y_feat,test_size=0.2, random_state=0)

# LinearSVC's solver uses a random number generator, so seed it as well;
# otherwise repeated runs can report slightly different scores.
classifier=svm.LinearSVC(random_state=0)
model(classifier,x_train,y_train,x_test,y_test)

training dataset size 161
f1 score and accuracy of training dataset
0.7770700636942676 78.26086956521739
f1 score and accuracy of testing dataset
0.7999999999999999 75.60975609756098




In [578]:
import coremltools

# Convert the fitted scikit-learn classifier to Core ML and save it.
# NOTE(review): the input names are listed in the same order as the encoded
# x_feats columns shown above; presumably they must stay in sync with the
# columns the classifier was trained on - confirm when features change.
coreml_model = coremltools.converters.sklearn.convert(
    classifier,
    input_features=[
        "age",
        "chest_pain_asympt",
        "chest_pain_atyp_angina",
        "chest_pain_non_anginal",
        "chest_pain_typ_angina",
        "rest_bpress",
        "blood_sugar",
        "rest_electro_left_vent_hyper",
        "rest_electro_normal",
        "rest_electro_st_t_wave_abnormality",
        "max_heart_rate",
        "exercice_angina",
    ],
    output_feature_names='disease',
)
coreml_model.save('model_svc.mlmodel')


In [579]:
#predict for a new record coming in
def predict_new_record(d, classifier, feature_columns=None):
    """Classify one raw (un-encoded) record with a fitted classifier.

    Parameters
    ----------
    d : dict
        Raw record. Must contain ``'blood_sugar'`` (``'t'`` means 1, anything
        else 0) and ``'exercice_angina'`` (``'yes'`` means 1, anything else 0).
        String values are treated as categories and mapped to the one-hot
        column ``<key>_<value>``; other values are copied to the column named
        ``key``. The dict is not modified.
    classifier : object exposing ``predict``
        Fitted model that accepts a one-row DataFrame of encoded features.
    feature_columns : iterable of str, optional
        Encoded column names, in training order. Defaults to the columns of
        the notebook-level ``x_feats`` frame.

    Returns
    -------
    str
        ``'Positive'`` if the predicted label is greater than 0.5, otherwise
        ``'Negative'``.

    Raises
    ------
    KeyError
        If ``'blood_sugar'`` or ``'exercice_angina'`` is missing from ``d``.
    ValueError
        If a key or category does not correspond to a known feature column.
    """
    if feature_columns is None:
        feature_columns = x_feats.columns
    feature_columns = list(feature_columns)

    # Work on a copy: encoding the flags in the caller's dict would make a
    # second call with the same dict see 1 instead of 't'/'yes' and encode 0.
    record = dict(d)
    record['blood_sugar'] = 1 if record['blood_sugar'] == 't' else 0
    record['exercice_angina'] = 1 if record['exercice_angina'] == 'yes' else 0

    # Every feature starts at 0, so one-hot columns not mentioned stay unset.
    row = dict.fromkeys(feature_columns, 0)
    for key, value in record.items():
        if isinstance(value, str):
            col_name, cell = str(key) + '_' + str(value), 1
        else:
            col_name, cell = key, value
        if col_name not in row:
            # Fail here with a clear message instead of handing the
            # classifier a frame with an unexpected extra column.
            raise ValueError(
                'unknown feature %r derived from %r=%r' % (col_name, key, value)
            )
        if pd.api.types.is_scalar(cell) and pd.isna(cell):
            # Missing values are encoded as 0.
            cell = 0
        row[col_name] = cell

    temp = pd.DataFrame([row], columns=feature_columns).astype('int64')
    #classifier = pickle.load(open('model.sav', 'rb'))
    result = classifier.predict(temp)
    print(result)
    # predict returns one label per row; exactly one row was passed in.
    if result[0] > 0.5:
        return 'Positive'
    return 'Negative'
    

In [580]:
# Build one raw (un-encoded) sample record and print its predicted class.
d = {
    'age': 29,
    'chest_pain': 'asympt',
    'rest_bpress': 120,
    'blood_sugar': 't',
    'rest_electro': 'normal',
    'max_heart_rate': 180,
    'exercice_angina': 'yes',
}

print(predict_new_record(d, classifier))

    
    
    
        

[1]
Positive
