In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:

import ibmos2spark

# @hidden_cell
credentials = {
    'endpoint': '',
    'api_key': '',
    'service_id': '',
    'iam_service_endpoint': ''}

configuration_name = ''
cos = ibmos2spark.CloudObjectStorage(sc, credentials, configuration_name, 'bluemix_cos')

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df_data_1 = spark.read\
  .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
  .option('header', 'true')\
  .option('inferSchema','True')\
  .load(cos.url('patientdataV6.csv', 'donotdelete-pr-qxutelwivjjtz4'))
df_data_1.take(5)


[Row(AVGHEARTBEATSPERMIN=93, PALPITATIONSPERDAY=22, CHOLESTEROL=163, BMI=25, HEARTFAILURE='N', AGE=49, SEX='F', FAMILYHISTORY='N', SMOKERLAST5YRS='N', EXERCISEMINPERWEEK=110),
 Row(AVGHEARTBEATSPERMIN=108, PALPITATIONSPERDAY=22, CHOLESTEROL=181, BMI=24, HEARTFAILURE='N', AGE=32, SEX='F', FAMILYHISTORY='N', SMOKERLAST5YRS='N', EXERCISEMINPERWEEK=192),
 Row(AVGHEARTBEATSPERMIN=86, PALPITATIONSPERDAY=0, CHOLESTEROL=239, BMI=20, HEARTFAILURE='N', AGE=60, SEX='F', FAMILYHISTORY='N', SMOKERLAST5YRS='N', EXERCISEMINPERWEEK=121),
 Row(AVGHEARTBEATSPERMIN=80, PALPITATIONSPERDAY=36, CHOLESTEROL=164, BMI=31, HEARTFAILURE='Y', AGE=45, SEX='F', FAMILYHISTORY='Y', SMOKERLAST5YRS='N', EXERCISEMINPERWEEK=141),
 Row(AVGHEARTBEATSPERMIN=66, PALPITATIONSPERDAY=36, CHOLESTEROL=185, BMI=23, HEARTFAILURE='N', AGE=39, SEX='F', FAMILYHISTORY='N', SMOKERLAST5YRS='N', EXERCISEMINPERWEEK=63)]

In [3]:
df_data_1.printSchema()

root
 |-- AVGHEARTBEATSPERMIN: integer (nullable = true)
 |-- PALPITATIONSPERDAY: integer (nullable = true)
 |-- CHOLESTEROL: integer (nullable = true)
 |-- BMI: integer (nullable = true)
 |-- HEARTFAILURE: string (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- SEX: string (nullable = true)
 |-- FAMILYHISTORY: string (nullable = true)
 |-- SMOKERLAST5YRS: string (nullable = true)
 |-- EXERCISEMINPERWEEK: integer (nullable = true)



In [4]:
# from above, HEARTFAILURE field is the one we would like to predict (label)
df_data_1.show()

+-------------------+------------------+-----------+---+------------+---+---+-------------+--------------+------------------+
|AVGHEARTBEATSPERMIN|PALPITATIONSPERDAY|CHOLESTEROL|BMI|HEARTFAILURE|AGE|SEX|FAMILYHISTORY|SMOKERLAST5YRS|EXERCISEMINPERWEEK|
+-------------------+------------------+-----------+---+------------+---+---+-------------+--------------+------------------+
|                 93|                22|        163| 25|           N| 49|  F|            N|             N|               110|
|                108|                22|        181| 24|           N| 32|  F|            N|             N|               192|
|                 86|                 0|        239| 20|           N| 60|  F|            N|             N|               121|
|                 80|                36|        164| 31|           Y| 45|  F|            Y|             N|               141|
|                 66|                36|        185| 23|           N| 39|  F|            N|             N|            

In [5]:
df_data_1.describe().show()

+-------+-------------------+------------------+------------------+------------------+------------+------------------+-----+-------------+--------------+------------------+
|summary|AVGHEARTBEATSPERMIN|PALPITATIONSPERDAY|       CHOLESTEROL|               BMI|HEARTFAILURE|               AGE|  SEX|FAMILYHISTORY|SMOKERLAST5YRS|EXERCISEMINPERWEEK|
+-------+-------------------+------------------+------------------+------------------+------------+------------------+-----+-------------+--------------+------------------+
|  count|              10800|             10800|             10800|             10800|       10800|             10800|10800|        10800|         10800|             10800|
|   mean|  87.11509259259259|20.423148148148147|195.08027777777778| 26.35972222222222|        null|49.965185185185184| null|         null|          null|119.72953703703703|
| stddev| 19.744375148984474|12.165320351622993|26.136731865042325|3.8201472810942136|        null|13.079280962015586| null|         nu

In [6]:
#split data into train and test 
split_data = df_data_1.randomSplit([0.80, 0.20], 24)
train_data = split_data[0]
test_data = split_data[1]


print ("Number of training records: " + str(train_data.count()))
print ("Number of testing records : " + str(test_data.count()))

Number of training records: 8637
Number of testing records : 2163


In [7]:
#import ApacheSpark packages
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline, Model

In [8]:
#convert string fields to numeric ones
stringIndexer_label = StringIndexer(inputCol="HEARTFAILURE", outputCol="label").fit(df_data_1)
stringIndexer_sex = StringIndexer(inputCol="SEX", outputCol="SEX_IX")
stringIndexer_famhist = StringIndexer(inputCol="FAMILYHISTORY", outputCol="FAMILYHISTORY_IX")
stringIndexer_smoker = StringIndexer(inputCol="SMOKERLAST5YRS", outputCol="SMOKERLAST5YRS_IX")

In [9]:
#create a feature vector by combining all features together
vectorAssembler_features = VectorAssembler(inputCols=["AVGHEARTBEATSPERMIN","PALPITATIONSPERDAY","CHOLESTEROL","BMI","AGE","SEX_IX","FAMILYHISTORY_IX","SMOKERLAST5YRS_IX","EXERCISEMINPERWEEK"], outputCol="features")

In [10]:
#random forest classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

In [11]:
# indexed labels back to original labels

labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=stringIndexer_label.labels)

In [12]:
transform_df_pipeline = Pipeline(stages=[stringIndexer_label, stringIndexer_sex, stringIndexer_famhist, stringIndexer_smoker, vectorAssembler_features])
transformed_df = transform_df_pipeline.fit(df_data_1).transform(df_data_1)
transformed_df.show()

+-------------------+------------------+-----------+---+------------+---+---+-------------+--------------+------------------+-----+------+----------------+-----------------+--------------------+
|AVGHEARTBEATSPERMIN|PALPITATIONSPERDAY|CHOLESTEROL|BMI|HEARTFAILURE|AGE|SEX|FAMILYHISTORY|SMOKERLAST5YRS|EXERCISEMINPERWEEK|label|SEX_IX|FAMILYHISTORY_IX|SMOKERLAST5YRS_IX|            features|
+-------------------+------------------+-----------+---+------------+---+---+-------------+--------------+------------------+-----+------+----------------+-----------------+--------------------+
|                 93|                22|        163| 25|           N| 49|  F|            N|             N|               110|  0.0|   1.0|             0.0|              0.0|[93.0,22.0,163.0,...|
|                108|                22|        181| 24|           N| 32|  F|            N|             N|               192|  0.0|   1.0|             0.0|              0.0|[108.0,22.0,181.0...|
|                 86|    

In [13]:

pipeline_rf = Pipeline(stages=[stringIndexer_label, stringIndexer_sex, stringIndexer_famhist, stringIndexer_smoker, vectorAssembler_features, rf, labelConverter])

In [14]:
model_rf = pipeline_rf.fit(train_data)

In [15]:
#check model accuracy and to evaluate we use test data

predictions = model_rf.transform(test_data)
evaluatorRF = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

#evaluatorRF = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
#f1score = evaluatorRF.evaluate(predictions)
#print("F1 score = %g" % f1score)

Accuracy = 0.866852
Test Error = 0.133148


In [16]:
from repository.mlrepositoryclient import MLRepositoryClient
from repository.mlrepositoryartifact import MLRepositoryArtifact

In [17]:
#Specify username, password, and instance_id credientials for Watson ML
service_path = ''
username = ''
password = ''
instance_id = ''

In [18]:
ml_repository_client = MLRepositoryClient(service_path)
ml_repository_client.authorize(username, password)

In [19]:
#Create model artifact (abstraction layer)
pipeline_artifact = MLRepositoryArtifact(pipeline_rf, name="pipeline")

model_artifact = MLRepositoryArtifact(model_rf, training_data=train_data, name="Heart Failure Prediction Model", pipeline_artifact=pipeline_artifact)

In [20]:
#save pipeline and model artifacts 
saved_model = ml_repository_client.models.save(model_artifact)

In [21]:
#Get saved model metadata from Watson Machine Learning
saved_model.meta.available_props()

dict_keys(['trainingDataSchema', 'lastUpdated', 'inputDataSchema', 'evaluationMetrics', 'version', 'pipelineType', 'modelVersionHref', 'evaluationMethod', 'trainingDataRef', 'label', 'pipelineVersionHref', 'authorEmail', 'creationTime', 'modelType', 'authorName', 'runtime'])

In [22]:
print("modelType: " + saved_model.meta.prop("modelType"))
print("trainingDataSchema: " + str(saved_model.meta.prop("trainingDataSchema")))
print("creationTime: " + str(saved_model.meta.prop("creationTime")))
print("modelVersionHref: " + saved_model.meta.prop("modelVersionHref"))
print("label: " + saved_model.meta.prop("label"))

modelType: sparkml-model-2.1
trainingDataSchema: {'type': 'struct', 'fields': [{'metadata': {}, 'nullable': True, 'type': 'integer', 'name': 'AVGHEARTBEATSPERMIN'}, {'metadata': {}, 'nullable': True, 'type': 'integer', 'name': 'PALPITATIONSPERDAY'}, {'metadata': {}, 'nullable': True, 'type': 'integer', 'name': 'CHOLESTEROL'}, {'metadata': {}, 'nullable': True, 'type': 'integer', 'name': 'BMI'}, {'metadata': {}, 'nullable': True, 'type': 'string', 'name': 'HEARTFAILURE'}, {'metadata': {}, 'nullable': True, 'type': 'integer', 'name': 'AGE'}, {'metadata': {}, 'nullable': True, 'type': 'string', 'name': 'SEX'}, {'metadata': {}, 'nullable': True, 'type': 'string', 'name': 'FAMILYHISTORY'}, {'metadata': {}, 'nullable': True, 'type': 'string', 'name': 'SMOKERLAST5YRS'}, {'metadata': {}, 'nullable': True, 'type': 'integer', 'name': 'EXERCISEMINPERWEEK'}]}
creationTime: 2019-02-08 04:33:17.204000+00:00
modelVersionHref: https://us-south.ml.cloud.ibm.com/v2/artifacts/models/ad80e40b-e0ec-4dcb-ab

In [23]:
#we can load model to make sure that it was saved correctly
loadedModelArtifact = ml_repository_client.models.get(saved_model.uid)

In [24]:
#Print the model name to make sure that model artifact has been loaded correctly

print(str(loadedModelArtifact.name))

Heart Failure Prediction Model


In [26]:
#Import Python WatsonML Repository SDK
# Save model to WML Service
from repository.mlrepositoryclient import MLRepositoryClient
from repository.mlrepositoryartifact import MLRepositoryArtifact

#Authenticate
ml_repository_client = MLRepositoryClient(service_path)
ml_repository_client.authorize(username, password)

#Deploy a new model.  I renamed the existing model as it has already been created above
pipeline_artifact = MLRepositoryArtifact(pipeline_rf, name="pipeline")

model_artifact = MLRepositoryArtifact(model_rf, training_data=train_data, name="Heart Failure Prediction Model", pipeline_artifact=pipeline_artifact)

In [27]:
#The Watson ML API authenticates all requests through a token, start by requesting the token from our Watson ML Service
import json
import requests
from base64 import b64encode

token_url = service_path + "/v3/identity/token"

# NOTE: for python 2.x, uncomment below, and comment out the next line of code:
#userAndPass = b64encode(bytes(username + ':' + password)).decode("ascii")
# Use below for python 3.x, comment below out for python 2.x
userAndPass = b64encode(bytes(username + ':' + password, "utf-8")).decode("ascii")
headers = { 'Authorization' : 'Basic %s' %  userAndPass }

response = requests.request("GET", token_url, headers=headers)

watson_ml_token = json.loads(response.text)['token']
print(watson_ml_token)

eyJhbGciOiJSUzUxMiIsInR5cCI6IkpXVCJ9.eyJ0ZW5hbnRJZCI6IjQxYzQ4MDI0LThmMzMtNDNmYy04MTBhLTRlZTc4N2ViNzhkOSIsImluc3RhbmNlSWQiOiI0MWM0ODAyNC04ZjMzLTQzZmMtODEwYS00ZWU3ODdlYjc4ZDkiLCJwbGFuSWQiOiIzZjZhY2Y0My1lZGU4LTQxM2EtYWM2OS1mOGFmM2JiMGNiZmUiLCJyZWdpb24iOiJ1cy1zb3V0aCIsInVzZXJJZCI6ImQzM2NkZjY3LTljNzgtNDI2Mi05MDZhLTA4NGZmZmNhOWU5YyIsImlzcyI6Imh0dHBzOi8vdXMtc291dGgubWwuY2xvdWQuaWJtLmNvbS92My9pZGVudGl0eSIsImlhdCI6MTU0OTYwNTg2NiwiZXhwIjoxNTQ5NjM0NjY2LCJjcmVhdGVkVGltZSI6MTU0OTYwNTg2Nn0.RKd6fcjDr8Y1cYqImnnTXg9eEZdD7T74z9F2cwcCLI4W5xcI9NGBdmnyI-7IsnZr2nIUGDiI9MZjTkPNciJj5_lkrTndmspOVp2rFvaqa0V-2YzYibk_aNdd1FzOQ-r_rMkw2pgOpJ4hztnSsKfQaoBkeBL01Mk5U1oO09GE2icsujV2M9kIgQHItT51ctDci39QtlDoSh4zg3JBCqAVqnKCNaWIwAeEDuuZAuOjweUxidG_F-vy6TcrX-5cmsZEqEqgBZ1GMVkjH3wgidkv09TP2-XEGaURTWzKnnT3Lt0eiT8_SOrjKbHLI7VPLVgsJVFexRd0o3NF9GsiQsjVnQ


In [28]:
# Preview currenly published models
model_url = service_path + "/v3/wml_instances/" + instance_id + "/published_models"

headers = {'authorization': 'Bearer ' + watson_ml_token }
response = requests.request("GET", model_url, headers=headers)

published_models = json.loads(response.text)
print(json.dumps(published_models, indent=2))

{
  "limit": 1000,
  "first": {
    "url": "https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/published_models?limit=1000"
  },
  "resources": [
    {
      "metadata": {
        "created_at": "2019-02-08T04:33:17.204Z",
        "url": "https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/published_models/ad80e40b-e0ec-4dcb-ab44-f41dee23aea3",
        "modified_at": "2019-02-08T04:33:17.312Z",
        "guid": "ad80e40b-e0ec-4dcb-ab44-f41dee23aea3"
      },
      "entity": {
        "latest_version": {
          "created_at": "2019-02-08T04:33:17.312Z",
          "url": "https://ibm-watson-ml.mybluemix.net/v3/ml_assets/models/ad80e40b-e0ec-4dcb-ab44-f41dee23aea3/versions/7067be23-3a5a-4672-a5e7-d8a6298edcb3",
          "guid": "7067be23-3a5a-4672-a5e7-d8a6298edcb3"
        },
        "runtime_environment": "spark-2.1",
        "name": "Heart Failure Prediction Model",
        "learning_configuration_url": "http

In [29]:
#Read the details of any returned models
print('{} model(s) are available in your Watson ML Service'.format(len(published_models['resources'])))
for model in published_models['resources']:
    print('\t- name:        {}'.format(model['entity']['name']))
    print('\t  model_id:    {}'.format(model['metadata']['guid']))
    print('\t  deployments: {}'.format(model['entity']['deployments']['count']))

1 model(s) are available in your Watson ML Service
	- name:        Heart Failure Prediction Model
	  model_id:    ad80e40b-e0ec-4dcb-ab44-f41dee23aea3
	  deployments: 0


In [30]:
#Create a new deployment of the Model
model_id = 'ad80e40b-e0ec-4dcb-ab44-f41dee23aea3'

deployment_url = service_path + "/v3/wml_instances/" + instance_id + "/published_models/" + model_id + "/deployments"

payload = "{\"name\": \"Heart Failure Prediction Model Deployment\", \"description\": \"First deployment of Heart Failure Prediction Model\", \"type\": \"online\"}"
headers = {'authorization': 'Bearer ' + watson_ml_token, 'content-type': "application/json" }

response = requests.request("POST", deployment_url, data=payload, headers=headers)

print(response.text)

{
  "metadata": {
    "guid": "d71567c5-0b0b-4019-bfb5-f9308ed10eca",
    "url": "https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/published_models/ad80e40b-e0ec-4dcb-ab44-f41dee23aea3/deployments/d71567c5-0b0b-4019-bfb5-f9308ed10eca",
    "created_at": "2019-02-08T06:07:56.986Z",
    "modified_at": "2019-02-08T06:08:00.247Z"
  },
  "entity": {
    "runtime_environment": "spark-2.1",
    "name": "Heart Failure Prediction Model Deployment",
    "scoring_url": "https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/published_models/ad80e40b-e0ec-4dcb-ab44-f41dee23aea3/deployments/d71567c5-0b0b-4019-bfb5-f9308ed10eca/online",
    "deployable_asset": {
      "name": "Heart Failure Prediction Model",
      "url": "https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/published_models/ad80e40b-e0ec-4dcb-ab44-f41dee23aea3",
      "guid": "ad80e40b-e0ec-4dcb-ab44-f41dee23aea3",
     

In [31]:
deployment = json.loads(response.text)

print('Model {} deployed.'.format(model_id))
print('\tname: {}'.format(deployment['entity']['name']))
print('\tdeployment_id: {}'.format(deployment['metadata']['guid']))
print('\tstatus: {}'.format(deployment['entity']['status']))
print('\tscoring_url: {}'.format(deployment['entity']['scoring_url']))

Model ad80e40b-e0ec-4dcb-ab44-f41dee23aea3 deployed.
	name: Heart Failure Prediction Model Deployment
	deployment_id: d71567c5-0b0b-4019-bfb5-f9308ed10eca
	status: DEPLOY_SUCCESS
	scoring_url: https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/published_models/ad80e40b-e0ec-4dcb-ab44-f41dee23aea3/deployments/d71567c5-0b0b-4019-bfb5-f9308ed10eca/online


In [32]:
#Monitor the status of deployment
deployment_id = "d71567c5-0b0b-4019-bfb5-f9308ed10eca"
deployment_details_url = service_path + "/v3/wml_instances/" + instance_id + "/published_models/" + model_id + "/deployments/" + deployment_id

headers = {'authorization': 'Bearer ' + watson_ml_token, 'content-type': "application/json" }

response = requests.request("GET", deployment_url, headers=headers)
print(response.text)

{
  "count": 1,
  "resources": [{
    "metadata": {
      "guid": "d71567c5-0b0b-4019-bfb5-f9308ed10eca",
      "url": "https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/deployments/d71567c5-0b0b-4019-bfb5-f9308ed10eca",
      "created_at": "2019-02-08T06:07:56.986Z",
      "modified_at": "2019-02-08T06:08:00.247Z"
    },
    "entity": {
      "runtime_environment": "spark-2.1",
      "name": "Heart Failure Prediction Model Deployment",
      "scoring_url": "https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/deployments/d71567c5-0b0b-4019-bfb5-f9308ed10eca/online",
      "deployable_asset": {
        "name": "Heart Failure Prediction Model",
        "url": "https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/published_models/ad80e40b-e0ec-4dcb-ab44-f41dee23aea3",
        "guid": "ad80e40b-e0ec-4dcb-ab44-f41dee23aea3",
        "created_at": "2019-02-08T06:07:56.915Z",
   

In [33]:
deployment_details = json.loads(response.text)

for resources in deployment_details['resources']:
    print('name: {}'.format(resources['entity']['name']))
    print('status: {}'.format(resources['entity']['status']))
    print('scoring url: {}'.format(resources['entity']['scoring_url']))

name: Heart Failure Prediction Model Deployment
status: DEPLOY_SUCCESS
scoring url: https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/deployments/d71567c5-0b0b-4019-bfb5-f9308ed10eca/online


In [2]:
#Invoke prediction model deployment
#Define a method to call scoring url. Replace the scoring_url in the method below with the scoring_url returned from above
def get_prediction_ml(ahb, ppd, chol, bmi, age, sex, fh, smoker, exercise_minutes ):
    scoring_url = 'https://ibm-watson-ml.mybluemix.net/v3/wml_instances/41c48024-8f33-43fc-810a-4ee787eb78d9/deployments/d71567c5-0b0b-4019-bfb5-f9308ed10eca/online'
    scoring_payload = { "fields":["AVGHEARTBEATSPERMIN","PALPITATIONSPERDAY","CHOLESTEROL","BMI","AGE","SEX","FAMILYHISTORY","SMOKERLAST5YRS","EXERCISEMINPERWEEK"],"values":[[ahb, ppd, chol, bmi, age, sex, fh, smoker, exercise_minutes]]}
    header = {'authorization': 'Bearer ' + watson_ml_token, 'content-type': "application/json" }
    scoring_response = requests.post(scoring_url, json=scoring_payload, headers=header)
    return (json.loads(scoring_response.text).get("values")[0][16])

In [35]:
#Call get_prediction_ml method exercising our prediction model
print('Is a 44 year old female that smokes with a low BMI at risk of Heart Failure?: {}'.format(get_prediction_ml(100,85,242,24,44,"F","Y","Y",125)))

Is a 44 year old female that smokes with a low BMI at risk of Heart Failure?: Y
