In [215]:
import pandas as pd

In [216]:
# Location of the raw health dataset in S3.
bucket = 'health-data-v1'
data_key = 'health_data.csv'

# An S3 object URI is "s3://<bucket>/<key>" with no trailing slash; a trailing
# "/" denotes a prefix ("folder") and only resolves to the object if the
# filesystem layer happens to strip it.
data_location = 's3://{}/{}'.format(bucket, data_key)

# Load the CSV straight from S3 and preview the first rows.
df = pd.read_csv(data_location)
df.head()

Unnamed: 0,Dehydration,Medicine Overdose,Acidious,Cold,Cough,Temperature,Heart Rate,Pulse,BPSYS,BPDIA,Respiratory Rate,Oxygen Saturation,PH,Causes Respiratory Imbalance
0,1,0,0,1,0,99,196,94,98,57,26,0.9,3,1
1,1,1,0,0,0,103,170,135,96,84,48,0.98,1,2
2,1,1,1,0,0,96,171,169,95,97,40,0.96,13,2
3,0,0,1,0,0,98,85,69,106,79,16,1.0,8,3
4,0,1,0,0,0,99,78,69,106,70,16,0.99,8,2


In [217]:
# Columns used as model inputs.
# NOTE(review): the 'Cold' column shown in the preview is not included here;
# confirm that the omission is intentional.
feature_columns = [
    'Dehydration',
    'Medicine Overdose',
    'Acidious',
    'Cough',
    'Temperature',
    'Heart Rate',
    'Pulse',
    'BPSYS',
    'BPDIA',
    'Respiratory Rate',
    'Oxygen Saturation',
    'PH',
]

# Feature matrix and the target column to predict.
X = df[feature_columns]
y = df['Causes Respiratory Imbalance']

In [220]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across kernel restarts.
# (A second split on 'SalePrice'/'CentralAir' used to follow here. Those
# columns are not in the feature/target selection for this dataset, and the
# line silently replaced the seeded split above, so it has been removed.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [221]:
# Carve a validation set out of the training portion: 25% of the 80% left
# after the test split, i.e. 20% of all rows.
# NOTE: this cell reassigns X_train/y_train from themselves, so running it
# more than once keeps shrinking the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=1,
)


In [222]:
def _label_first(labels, features):
    """Return one frame with the label as the first column, followed by the features.

    The label series is aligned to the feature frame's index before the two
    are joined column-wise.
    """
    aligned_labels = pd.Series(labels, index=features.index)
    return pd.concat([aligned_labels, features], axis=1)


# Label-first layout for each of the three splits.
train = _label_first(y_train, X_train)

validation = _label_first(y_val, X_val)

test = _label_first(y_test, X_test)

In [225]:
# Write the training and validation frames as bare CSV: no index column and
# no header row, label in the first column.
for csv_name, split_frame in (('train.csv', train), ('validation.csv', validation)):
    split_frame.to_csv(csv_name, index=False, header=False)

In [None]:
import sagemaker, boto3, os

bucket = 'health-data-v1'
prefix = 'sagemaker-xgboost-patient-prioritisation'

# One S3 bucket handle is enough for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

for local_file in ('train.csv', 'validation.csv'):
    # S3 keys always use '/' as the separator, so the key is built explicitly
    # instead of with os.path.join (which follows the local OS convention).
    object_key = '{}/data/{}'.format(prefix, local_file)
    s3_bucket.Object(object_key).upload_file(local_file)

# Training the model

In [None]:
# Show the installed SageMaker SDK version for reference.
sagemaker.__version__

In [None]:
import sagemaker

# Region comes from the default SageMaker session.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
print("AWS Region: {}".format(region))

# Execution role that the training job and endpoint will run under.
role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))

In [None]:
import sagemaker
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput
import boto3
from sagemaker import image_uris
from sagemaker.session import Session

# Where the training job writes its output (model artifact, rule output).
s3_output_location = 's3://{}/{}/{}'.format(bucket, prefix, 'xgboost_model')

# Look up the XGBoost container image for this region and version.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-1",
)
print(container)

# Debugger rule that generates the XGBoost training report.
report_rules = [Rule.sagemaker(rule_configs.create_xgboost_report())]

# Estimator describing the training job: image, role, hardware and output path.
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
    rules=report_rules,
)

In [230]:
# Multi-class setup: softmax objective with classification error as the
# evaluation metric.
# NOTE(review): num_class=4 presumably corresponds to target labels 0-3;
# confirm against the values in the target column.
xgb_hyperparameters = {
    "objective": 'multi:softmax',
    "eval_metric": 'merror',
    "num_class": 4,
    "max_depth": 6,
    "learning_rate": 0.1,
    "num_round": 1000,
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

In [231]:
from sagemaker.session import TrainingInput

# S3 URIs of the CSV files uploaded under the data/ prefix.
train_s3_uri = "s3://{}/{}/{}".format(bucket, prefix, "data/train.csv")
validation_s3_uri = "s3://{}/{}/{}".format(bucket, prefix, "data/validation.csv")

# Wrap each URI as a CSV input channel for the training job.
train_input = TrainingInput(train_s3_uri, content_type="csv")

validation_input = TrainingInput(validation_s3_uri, content_type="csv")

In [None]:
# Launch the training job once and block until it finishes. (The notebook
# previously had two identical fit cells, so a full run trained the same
# model twice.)
xgb_model.fit({"train": train_input, "validation": validation_input}, wait=True)

In [None]:
rule_output_path = xgb_model.output_path + "/" + xgb_model.latest_training_job.name + "/rule-output"
! aws s3 ls {rule_output_path} --recursive

In [None]:
# Copy the rule output from S3 into the current working directory.
! aws s3 cp {rule_output_path} ./ --recursive

In [None]:
from IPython.display import FileLink, FileLinks
# Link to the report file copied into the working directory.
report_link = FileLink("CreateXgboostReport/xgboost_report.html")
display("Click link below to view the XGBoost Training report", report_link)

In [None]:
# Show the estimator's `model_data` attribute for the job that was just run
# (presumably the S3 location of the trained model artifact).
xgb_model.model_data

# Deploying the model

In [None]:
import sagemaker
from sagemaker.serializers import CSVSerializer
# Requests to the endpoint are serialized as CSV.
csv_serializer = CSVSerializer()

# Deploy the trained model to a single-instance endpoint.
xgb_predictor = xgb_model.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
    serializer=csv_serializer,
)

In [None]:
# Show the name of the endpoint backing the predictor.
xgb_predictor.endpoint_name

# Evaluating the model

In [242]:
import numpy as np
def predict(data, rows=1000, predictor=None):
    """Run inference in chunks and return all predictions as one 1-D float array.

    Args:
        data: 2-D array of feature rows (no label column); must expose
            ``shape`` and be splittable by ``np.array_split``.
        rows: Maximum number of rows sent to the predictor per request.
        predictor: Object with a ``predict(array)`` method returning UTF-8
            bytes of comma- and/or newline-separated numbers. Defaults to the
            notebook-level ``xgb_predictor``.

    Returns:
        numpy.ndarray of floats, one per input row, in input order. Empty
        input yields an empty array without calling the predictor.

    Raises:
        ValueError: if ``rows`` is not positive.
    """
    if rows <= 0:
        raise ValueError("rows must be a positive number")
    if predictor is None:
        # Resolved at call time so the function can be defined before deployment.
        predictor = xgb_predictor

    n_rows = data.shape[0]
    if n_rows == 0:
        return np.array([], dtype=float)

    # Ceiling division: exactly as many requests as needed to stay <= rows each.
    n_chunks = int(np.ceil(n_rows / float(rows)))

    values = []
    for chunk in np.array_split(data, n_chunks):
        response = predictor.predict(chunk).decode('utf-8')
        # Accept both "a,b,c" and one-value-per-line responses.
        for token in response.replace('\n', ',').split(','):
            if token.strip():
                values.append(float(token))
    return np.array(values, dtype=float)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Column 0 of `test` is the label, so only the remaining feature columns are
# sent to the endpoint.
predictions = predict(test.to_numpy()[:, 1:])

# The endpoint returns the predicted values as floats (e.g. 2.0); cast them
# straight to integers instead of round-tripping through strings.
pred_int = predictions.astype(int)

# Distribution of predicted values over the test set.
plt.hist(pred_int)
plt.xlabel('Predicted class')
plt.ylabel('Number of test rows')
plt.title('Distribution of predicted classes')
plt.show()

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is loaded (this form still binds the name `sklearn`).
import sklearn.metrics

# Use the same label set for both metrics so the confusion matrix always has
# one row/column per class, matching the classification report.
class_labels = [0, 1, 2, 3]

cm = sklearn.metrics.confusion_matrix(y_test, pred_int, labels=class_labels)
print(sklearn.metrics.classification_report(y_test, pred_int, labels=class_labels))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the confusion matrix as an annotated heatmap with integer cell counts.
heatmap_ax = sns.heatmap(cm, fmt="d", cmap="Blues", annot=True)

heatmap_ax.set_xlabel('Predicted labels')
heatmap_ax.set_ylabel('True labels')
heatmap_ax.set_title('Confusion Matrix')

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the integer predictions against the held-out test labels.
accuracy_score(y_test, pred_int)