In [50]:
import os

import numpy as np
import pandas as pd


import matplotlib.pyplot as plt

import sklearn.model_selection

import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

In [51]:
session = sagemaker.Session()  #Sagemaker Necessities, can be ignored
role = get_execution_role()    #Sagemaker Necessities, can be ignored

In [52]:
df=pd.read_csv('heart_attack_dataset.csv')      # Reading the dataset and dropping a column that serves no purpose
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [53]:
df.columns.values  # Just getting an idea of the column titles we have

array(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
       'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype=object)

In [54]:
df= df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,69,0,3,140,239,0,1,151,0,1.8,2,2,2,1
1,49,1,2,118,149,0,0,126,0,0.8,2,3,2,0
2,65,0,2,155,269,0,1,148,0,0.8,2,0,2,1
3,44,1,2,130,233,0,1,179,1,0.4,2,0,2,1
4,45,1,0,104,208,0,0,148,1,3.0,1,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,40,1,0,152,223,0,1,181,0,0.0,2,0,3,0
299,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
300,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
301,44,1,0,120,169,0,1,144,1,2.8,0,0,1,0


In [62]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

dfs=pd.DataFrame(scaler.fit_transform(df.astype(float)))#Applying standardization to our dataset
dfs.columns=df.columns.values   #Making sure we add back the previous column names

labels=dfs.loc[:,'target'].values    #Giving the object 'labels' the classification of each sample ( 1 or 0)
data= dfs.drop(['target'], axis=1)   #retreiving the dataset, dropping the Y-value, to only have features
data=data.values

X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(data, labels, test_size=0.33)
X_train, X_val, Y_train, Y_val = sklearn.model_selection.train_test_split(X_train, Y_train, test_size=0.33)

In [63]:
X_test  # Just have an idea of what we are working with 

array([[0.60416667, 1.        , 0.        , ..., 0.        , 0.75      ,
        0.33333333],
       [0.54166667, 1.        , 0.33333333, ..., 1.        , 0.        ,
        0.66666667],
       [0.27083333, 1.        , 0.66666667, ..., 1.        , 0.        ,
        0.66666667],
       ...,
       [0.85416667, 1.        , 0.33333333, ..., 1.        , 0.        ,
        0.66666667],
       [0.5625    , 0.        , 0.        , ..., 0.5       , 0.5       ,
        1.        ],
       [0.60416667, 0.        , 0.66666667, ..., 1.        , 0.        ,
        0.66666667]])

In [64]:
data_dir = '../data/training_files'      #We are making sure the pathway we will add training,valid, and testing files exist
if not os.path.exists(data_dir):
    os.makedirs(data_dir)


In [65]:
X_train= pd.DataFrame(X_train)
X_val= pd.DataFrame(X_val)
X_test= pd.DataFrame(X_test)
    #Seperating the dataset into X and Y variables for their specific type (train, validation, testing)
Y_train= pd.DataFrame(Y_train)
Y_val= pd.DataFrame(Y_val)
Y_test= pd.DataFrame(Y_test)

In [66]:
X_test.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

#Here we prepare our data to the files we specified, in S3, this is sagemaker specific, it can be ignored

pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)
pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)

In [67]:
prefix = 'xgboost-heart-attack-model'

#Here we upload our data to the files we specified, in S3, this is sagemaker specific, it can be ignored


test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

In [68]:
# we use this utility method to construct the image name for the training container. This can be ignored
container = get_image_uri(session.boto_region_name, 'xgboost')

# Now that we know which container to use, we can construct the estimator object.Can be ignored

xgb = sagemaker.estimator.Estimator(container, # The name of the training container
                                    role,      # The IAM role to use (our current role in this case)
                                    train_instance_count=1, # The number of instances to use for training
                                    train_instance_type='ml.m4.xlarge', # The type of instance ot use for training
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                                                        # Where to save the output (the model artifacts)
                                    sagemaker_session=session) # The current SageMaker session

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
	get_image_uri(region, 'xgboost', '1.0-1').


In [69]:
#Here we have our hyperparameters, which are extremely similar to the solution code that got 94% accuracy

xgb.set_hyperparameters(max_depth=3,
                        eta=0.2,
                        gamma=0,
                        min_child_weight=1,
                        subsample=1,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=300)

In [70]:
#This is the location of our training and validation set
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

#Here we fit our model to the training and validation sets
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})



2020-07-16 22:42:22 Starting - Starting the training job...
2020-07-16 22:42:24 Starting - Launching requested ML instances......
2020-07-16 22:43:28 Starting - Preparing the instances for training......
2020-07-16 22:44:25 Downloading - Downloading input data...
2020-07-16 22:45:22 Training - Training image download completed. Training in progress.
2020-07-16 22:45:22 Uploading - Uploading generated training model.[34mArguments: train[0m
[34m[2020-07-16:22:45:18:INFO] Running standalone xgboost training.[0m
[34m[2020-07-16:22:45:18:INFO] File size need to be processed in the node: 0.03mb. Available memory size in the node: 8497.6mb[0m
[34m[2020-07-16:22:45:18:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:45:18] S3DistributionType set as FullyReplicated[0m
[34m[22:45:18] 136x13 matrix with 1768 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-07-16:22:45:18:INFO] Determined delimiter of CSV input is ','[0m
[34m[2

In [71]:
#This is sagemaker specific, we are just specifying the training job name for our model
xgb_attached = sagemaker.estimator.Estimator.attach('xgboost-2020-07-16-22-42-22-778')



2020-07-16 22:45:29 Starting - Preparing the instances for training
2020-07-16 22:45:29 Downloading - Downloading input data
2020-07-16 22:45:29 Training - Training image download completed. Training in progress.
2020-07-16 22:45:29 Uploading - Uploading generated training model
2020-07-16 22:45:29 Completed - Training job completed[34mArguments: train[0m
[34m[2020-07-16:22:45:18:INFO] Running standalone xgboost training.[0m
[34m[2020-07-16:22:45:18:INFO] File size need to be processed in the node: 0.03mb. Available memory size in the node: 8497.6mb[0m
[34m[2020-07-16:22:45:18:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:45:18] S3DistributionType set as FullyReplicated[0m
[34m[22:45:18] 136x13 matrix with 1768 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-07-16:22:45:18:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:45:18] S3DistributionType set as FullyReplicated[0m
[34m[22:45:18] 67x13 matrix w

In [72]:
xgb_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
#Making it available for BatchTesting



In [73]:
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
#Now to actually test the model on the test data

In [74]:
xgb_transformer.wait()

......................[34mArguments: serve[0m
[34m[2020-07-16 22:53:14 +0000] [1] [INFO] Starting gunicorn 19.7.1[0m
[34m[2020-07-16 22:53:14 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2020-07-16 22:53:14 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2020-07-16 22:53:14 +0000] [37] [INFO] Booting worker with pid: 37[0m
[34m[2020-07-16 22:53:14 +0000] [38] [INFO] Booting worker with pid: 38[0m
[34m[2020-07-16 22:53:14 +0000] [39] [INFO] Booting worker with pid: 39[0m
[34m[2020-07-16 22:53:14 +0000] [40] [INFO] Booting worker with pid: 40[0m
[34m[2020-07-16:22:53:14:INFO] Model loaded successfully for worker : 37[0m
[34m[2020-07-16:22:53:14:INFO] Model loaded successfully for worker : 38[0m
[34m[2020-07-16:22:53:14:INFO] Model loaded successfully for worker : 39[0m
[34m[2020-07-16:22:53:14:INFO] Model loaded successfully for worker : 40[0m
[34m[2020-07-16:22:53:30:INFO] Sniff delimiter as ','[0m
[34m[2020-07-16:22:53:30:INFO] Determined de

In [76]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir 
# This is just allows us to have access to the predictions file

Completed 1.4 KiB/1.4 KiB (8.8 KiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-2-262034113260/xgboost-2020-07-16-22-42-22-778-2020-07-16-22-49-48-468/test.csv.out to ../data/training_files/test.csv.out


In [77]:
#This is the object that holds all of the predictions data
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)

#Here we will just convert that data to rounded integers so that they can ve classified as either 1 or 0
predictions = [round(num) for num in predictions.squeeze().values]

In [79]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, predictions)

0.85