In [16]:
!pip install coremltools

Collecting coremltools
[?25l  Downloading https://files.pythonhosted.org/packages/08/e0/22d3659411724228a7c96eb6d8f813cca0e9e9f8f0647b1163f1aa7965b2/coremltools-2.0-cp36-none-macosx_10_13_intel.whl (3.1MB)
[K    100% |████████████████████████████████| 3.1MB 6.4kB/s ta 0:00:02
Collecting six==1.10.0 (from coremltools)
  Using cached https://files.pythonhosted.org/packages/c8/0a/b6723e1bc4c516cb687841499455a8505b44607ab535be01091c0f24f079/six-1.10.0-py2.py3-none-any.whl
[31mtensorflow-tensorboard 1.5.1 has requirement bleach==1.5.0, but you'll have bleach 3.0.2 which is incompatible.[0m
[31mtensorflow-tensorboard 1.5.1 has requirement html5lib==0.9999999, but you'll have html5lib 1.0.1 which is incompatible.[0m
[31mdocker-compose 1.23.1 has requirement texttable<0.10,>=0.9.0, but you'll have texttable 1.5.0 which is incompatible.[0m
Installing collected packages: six, coremltools
  Found existing installation: six 1.11.0
    Uninstalling six-1.11.0:
      Successfully uninstalled

In [17]:
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle
import numpy as np
import time
import coremltools



In [18]:
# load the data file
data = pd.read_csv('../data/preprocessed.csv', index_col=0)
data.head()

Unnamed: 0,age,chest_pain,rest_bpress,blood_sugar,rest_electro,max_heart_rate,exercice_angina,disease
0,43.0,asympt,140.0,f,normal,135.0,yes,1.0
1,39.0,atyp_angina,130.0,f,normal,160.0,yes,0.0
2,39.0,non_anginal,160.0,t,normal,160.0,no,0.0
5,50.0,asympt,140.0,f,normal,135.0,no,0.0
6,59.0,asympt,140.0,t,left_vent_hyper,119.0,yes,1.0


In [19]:
# find number of features available and remove one as that is the target feature and one is the unnamed column
n_features = data.shape[1]-2

# separate the features and the target/outcome
x_feats = data.drop(['disease'], 1)
y_feat = data['disease']
y_feat = y_feat.astype('int64')
# y_feat.dtypes

In [20]:
# pre process the features
# x-dataframe containing the training features


def preprocess_features(x):
    # new output dataframe
    output = pd.DataFrame(index=x.index)
    # iterate through each column in features
    for col, col_data in x.iteritems():
        # convert categorical data to dummy variables/ one hot encoding of the categorical variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)
        output = output.join(col_data)
    return output

In [21]:
# trains the model and generats a report for the performance metrics


def train(model, x_train, y_train, x_test, y_test):

    print(' ')
    print("training dataset size", len(x_train))
    start = time.time()
    model.fit(x_train, y_train)
    end = time.time()
    y_pred = model.predict(x_train)
    acc_train = model.score(x_train, y_train)
    y_pred = model.predict(x_test)
    acc_test = model.score(x_test, y_test)

    print('time for training: ', end-start)
    print('Accuracy of model on train dataset:  {:.2f} %'.format(
        acc_train*100))
    print('Accuracy of model on test dataset:  {:.2f} %'.format(acc_test*100))

    print('CONFUSION MATRIX:')
    print(confusion_matrix(y_test, y_pred))
    print('RESULTS')
    report = classification_report(y_pred, y_test)
    print(report)

In [22]:
# predict for a new record coming in
def predict_new_record(d, classifier):
    if d['blood_sugar'] == 't':
        d['blood_sugar'] = 1
    else:
        d['blood_sugar'] = 0

    if d['exercice_angina'] == 'yes':
        d['exercice_angina'] = 1
    else:
        d['exercice_angina'] = 0

    temp = pd.DataFrame(columns=x_feats.columns)

    for key, value in d.items():
        if(type(value) == str):
            col_name = str(key)+'_'+str(value)

            temp.loc[0, col_name] = 1
        else:
            temp.loc[0, key] = value
    temp.fillna(0, inplace=True)
    temp = temp.astype('int64')
    # print(temp.dtypes)
    #classifier = pickle.load(open('model.sav', 'rb'))
    result = classifier.predict(temp)
    # print(result)
    if result > 0.5:
        return 'Positive'
    else:
        return 'Negative'

In [23]:
# converted to one hot encodings for categorical variable
x_feats = preprocess_features(x_feats)
x_feats = x_feats.astype('int64')

In [24]:
# SMOTE an algorith used for oversampling. It generates a dataset that has equal proportions of data samples for positive and negative classes
# create new random samples from the minor class for equal distribution
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
# split data into training and testing datasets
x_train, x_test, y_train, y_test = train_test_split(
    x_feats, y_feat, test_size=0.2, random_state=0)

columns = x_train.columns
os_data_x, os_data_y = os.fit_sample(x_train, y_train)
os_data_x = pd.DataFrame(data=os_data_x, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['disease'])
os_data_x = os_data_x.astype('int64')
os_data_y = os_data_y.astype('int64')
# we can Check the numbers of our data
print("length of oversampled data is ", len(os_data_x))

print('no of positives in over sampled data:',
      len(os_data_y[os_data_y['disease'] == 1]))
print('no of positives in over sampled data:',
      len(os_data_y[os_data_y['disease'] == 0]))

x_train = os_data_x
y_train = os_data_y

length of oversampled data is  188
no of positives in over sampled data: 94
no of positives in over sampled data: 94


In [25]:
# define models
print('--------------LOGISTIC REGRESSION----------------------')
classifier_lr = LogisticRegression(multi_class='ovr')
train(classifier_lr, x_train, y_train, x_test, y_test)
coreml_model = coremltools.converters.sklearn.convert(
    classifier_lr, input_features=list(x_train.columns), output_feature_names='disease')
coreml_model.save('model_lr.mlmodel')

print('--------------SUPPORT VECTOR MACHINE----------------------')
classifier_svc = LinearSVC()
train(classifier_svc, x_train, y_train, x_test, y_test)
coreml_model = coremltools.converters.sklearn.convert(
    classifier_svc, input_features=list(x_train.columns), output_feature_names='disease')
coreml_model.save('model_svc.mlmodel')

print('--------------RANDOM FOREST CLASSIFIER----------------------')
classifier_rf = RandomForestClassifier()
train(classifier_rf, x_train, y_train, x_test, y_test)
coreml_model = coremltools.converters.sklearn.convert(
    classifier_rf, input_features=list(x_train.columns), output_feature_names='disease')
coreml_model.save('model_randomforest.mlmodel')

print('--------------DECISION TREE CLASSIFIER----------------------')
classifier_dc = DecisionTreeClassifier()
train(classifier_dc, x_train, y_train, x_test, y_test)
coreml_model = coremltools.converters.sklearn.convert(
    classifier_dc, input_features=list(x_train.columns), output_feature_names='disease')
coreml_model.save('model_decisiontree.mlmodel')

--------------LOGISTIC REGRESSION----------------------
 
training dataset size 188
time for training:  0.0034270286560058594
Accuracy of model on train dataset:  82.45 %
Accuracy of model on test dataset:  73.17 %
CONFUSION MATRIX:
[[12  6]
 [ 5 18]]
RESULTS
              precision    recall  f1-score   support

           0       0.67      0.71      0.69        17
           1       0.78      0.75      0.77        24

   micro avg       0.73      0.73      0.73        41
   macro avg       0.72      0.73      0.73        41
weighted avg       0.73      0.73      0.73        41

--------------SUPPORT VECTOR MACHINE----------------------
 
training dataset size 188
time for training:  0.013113021850585938
Accuracy of model on train dataset:  51.60 %
Accuracy of model on test dataset:  56.10 %
CONFUSION MATRIX:
[[ 1 17]
 [ 1 22]]
RESULTS
              precision    recall  f1-score   support

           0       0.06      0.50      0.10         2
           1       0.96      0.56      0.7

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


--------------RANDOM FOREST CLASSIFIER----------------------
 
training dataset size 188
time for training:  0.032037973403930664
Accuracy of model on train dataset:  99.47 %
Accuracy of model on test dataset:  73.17 %
CONFUSION MATRIX:
[[14  4]
 [ 7 16]]
RESULTS
              precision    recall  f1-score   support

           0       0.78      0.67      0.72        21
           1       0.70      0.80      0.74        20

   micro avg       0.73      0.73      0.73        41
   macro avg       0.74      0.73      0.73        41
weighted avg       0.74      0.73      0.73        41

--------------DECISION TREE CLASSIFIER----------------------
 
training dataset size 188
time for training:  0.0031061172485351562
Accuracy of model on train dataset:  100.00 %
Accuracy of model on test dataset:  68.29 %
CONFUSION MATRIX:
[[13  5]
 [ 8 15]]
RESULTS
              precision    recall  f1-score   support

           0       0.72      0.62      0.67        21
           1       0.65      0.75 

  


In [27]:
# accept new record and predict
d = {'age': 43, 'chest_pain': 'asympt', 'rest_bpress': 140, 'blood_sugar': 'f',
     'rest_electro': 'normal', 'max_heart_rate': 120, 'exercice_angina': 'no'}
print('Logistic Regression')
print(predict_new_record(d, classifier_lr))
print('SVM')
print(predict_new_record(d, classifier_svc))
print('Random Forest')
print(predict_new_record(d, classifier_rf))
print('Decision Tree')
print(predict_new_record(d, classifier_dc))

Logistic Regression


ValueError: X has 16 features per sample; expecting 14