## Test RF model and save accuracy as a baseline

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
#load the heart disease dataset

'''
/mnt/data/raw/heart.csv

attribute documentation:
      age: age in years
      sex: sex (1 = male; 0 = female)
      cp: chest pain type
        -- Value 1: typical angina
        -- Value 2: atypical angina
        -- Value 3: non-anginal pain
        -- Value 4: asymptomatic
     trestbps: resting blood pressure (in mm Hg on admission to the 
        hospital)
     chol: serum cholestoral in mg/dl
     fbs: (fasting blood sugar > 120 mg/dl)  (1 = true; 0 = false)
     restecg: resting electrocardiographic results
        -- Value 0: normal
        -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST 
                    elevation or depression of > 0.05 mV)
        -- Value 2: showing probable or definite left ventricular hypertrophy
                    by Estes' criteria
     thalach: maximum heart rate achieved
     exang: exercise induced angina (1 = yes; 0 = no)
     oldpeak = ST depression induced by exercise relative to rest
     slope: the slope of the peak exercise ST segment
        -- Value 1: upsloping
        -- Value 2: flat
        -- Value 3: downsloping
     ca: number of major vessels (0-3) colored by flourosopy
     thal: 
         3 = normal; 
         6 = fixed defect; 
         7 = reversable defect
     target: diagnosis of heart disease (angiographic disease status)
        -- Value 0: < 50% diameter narrowing
        -- Value 1: > 50% diameter narrowing
 '''

#column names
names = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang', \
         'oldpeak','slope','ca','thal','target']

#load data from Domino project directory
hd_data = pd.read_csv("/mnt/data/raw/heart.csv", header=None, names=names)

#some data came in as string
#convert to numeric and coerce errors to NaN
for col in hd_data.columns:  # Iterate over chosen columns
    hd_data[col] = pd.to_numeric(hd_data[col], errors='coerce')
    
#a function to do one hot encoding for categorical columns
def create_dummies(data, cols, drop1st=True):
    for c in cols:
        dummies_df = pd.get_dummies(data[c], prefix=c, drop_first=drop1st)  
        data=pd.concat([data, dummies_df], axis=1)
        data = data.drop([c], axis=1)
    return data
cats = ['cp', 'restecg', 'slope', 'ca', 'thal']
hd_data = create_dummies(hd_data, cats)

#drop nulls
hd_data.dropna(inplace=True)
    
#load the X set as a numpy array
X_hd = hd_data.drop('target', axis=1).values

#load the y set as a numpy array
y_hd = hd_data['target'].values

#build the train and test sets
X_hd_train, X_hd_test, y_hd_train, y_hd_test = \
    sklearn.model_selection.train_test_split(X_hd, y_hd, random_state=1)

In [3]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(n_estimators=100,criterion="gini")

clf_rf.fit(X_hd_train,y_hd_train)

y_hd_pred_rf = clf_rf.predict(X_hd_test)

sklearn.metrics.accuracy_score(y_hd_test,y_hd_pred_rf)

  from numpy.core.umath_tests import inner1d


0.7894736842105263

In [4]:
#load breast cancer data

from sklearn.datasets import load_breast_cancer

'''
Attribute Information:

1) ID number 
2) Diagnosis (M = malignant, B = benign) 
3-32) 

Ten real-valued features are computed for each cell nucleus: 

a) radius (mean of distances from center to points on the perimeter) 
b) texture (standard deviation of gray-scale values) 
c) perimeter 
d) area 
e) smoothness (local variation in radius lengths) 
f) compactness (perimeter^2 / area - 1.0) 
g) concavity (severity of concave portions of the contour) 
h) concave points (number of concave portions of the contour) 
i) symmetry 
j) fractal dimension ("coastline approximation" - 1)
'''

#load from sklearn
X_bc, y_bc = sklearn.datasets.load_breast_cancer(return_X_y=True)

#build the train and test sets
X_bc_train, X_bc_test, y_bc_train, y_bc_test = \
    sklearn.model_selection.train_test_split(X_bc, y_bc, random_state=1)

In [5]:
#Random Forest

clf_rf.fit(X_bc_train,y_bc_train)

y_bc_pred_rf = clf_rf.predict(X_bc_test)

sklearn.metrics.accuracy_score(y_bc_test,y_bc_pred_rf)

0.958041958041958

## Save Domino Stat Metrics

In [6]:
#this is charted automatically when run as a batch job

hd_acc = sklearn.metrics.accuracy_score(y_hd_test,y_hd_pred_rf)
bc_acc = sklearn.metrics.accuracy_score(y_bc_test,y_bc_pred_rf)

import json
with open('../dominostats.json', 'w') as f:
    f.write(json.dumps( {"HD_ACC": hd_acc, "BC_ACC": bc_acc}))