In [1]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error

# Define IAM role
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'
my_region = boto3.session.Session().region_name # set the region of the instance

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "latest")

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + xgboost_container + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the eu-central-1 region. You will use the 813361260812.dkr.ecr.eu-central-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [2]:
bucket_name = 'cs550sensordatabucket' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    else: 
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 error:  An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [12]:
#try:
#    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
#    print('Success: downloaded bank_clean.csv.')
#except Exception as e:
#    print('Data load error: ',e)

try:
    model_data = pd.read_csv('./sensor-data-proper_processed.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: Data loaded into dataframe.


In [17]:
train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])
print(train_data.shape, test_data.shape)

(270, 7) (117, 7)


In [14]:
train_data

Unnamed: 0,door_open,ldr,vibration,temp,humi,sonar,working
0,0,1024,0,22,48,2,1
0,1,207,0,22,49,182,1
0,1,206,0,22,45,2983,1
0,1,199,0,22,45,97,1
0,1,206,0,22,46,181,1
...,...,...,...,...,...,...,...
0,1,200,0,22,44,257,1
0,1,206,0,22,45,2971,1
0,0,1024,0,25,37,2,1
0,0,1024,0,22,48,2,1


In [15]:
test_data

Unnamed: 0,door_open,ldr,vibration,temp,humi,sonar,working
0,1,199,0,22,45,181,1
0,1,200,0,21,47,176,1
0,1,207,0,23,43,2954,1
0,0,1024,0,22,49,0,1
0,0,1024,0,22,44,2,1
...,...,...,...,...,...,...,...
0,1,234,0,25,35,180,1
0,1,207,0,24,38,182,1
0,0,1024,0,22,48,2,1
0,1,207,0,22,43,255,1


In [42]:
try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: Data loaded into dataframe.


In [16]:
train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])
print(train_data.shape, test_data.shape)

(270, 7) (117, 7)


In [28]:
train_data

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
40949,54,3,999,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
9332,56,2,999,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
32286,32,2,999,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
3925,46,3,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
9406,35,2,999,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3871,32,2,999,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
16681,26,2,999,0,1,0,0,1,0,0,...,0,0,0,0,1,0,1,0,1,0
39272,33,1,6,2,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,1
7717,32,4,999,0,1,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,1


In [18]:
pd.concat([train_data['door_open'], train_data.drop(['door_open'], axis=1)], axis=1).to_csv('sensor-data-train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/sensor-data-train.csv')).upload_file('sensor-data-train.csv')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train/sensor-data-train'.format(bucket_name, prefix), content_type='csv')

In [19]:
#ml.m4.xlarge
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(xgboost_container,role, instance_count=1, instance_type='ml.m5.large',output_path='s3://{}/{}/output'.format(bucket_name, prefix),sagemaker_session=sess)
xgb.set_hyperparameters(max_depth=5,eta=0.2,gamma=4,min_child_weight=6,subsample=0.8,silent=0,objective='reg:logistic',num_round=100)

In [20]:
xgb.fit({'train': s3_input_train})

2022-01-06 17:30:43 Starting - Starting the training job...
2022-01-06 17:31:07 Starting - Launching requested ML instancesProfilerReport-1641490242: InProgress
......
2022-01-06 17:32:11 Starting - Preparing the instances for training.........
2022-01-06 17:33:27 Downloading - Downloading input data...
2022-01-06 17:34:07 Training - Training image download completed. Training in progress.
2022-01-06 17:34:07 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2022-01-06:17:34:03:INFO] Running standalone xgboost training.[0m
[34m[2022-01-06:17:34:03:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-01-06:17:34:03:INFO] File size need to be processed in the node: 0.01mb. Available memory size in the node: 42.95mb[0m
[34m[2022-01-06:17:34:03:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:34:03] S3DistributionType set as FullyReplicated[0m
[34m[17:34:03] 270x6 matrix with 1620 entries loaded from /opt/ml/input/data/train?f


2022-01-06 17:34:28 Completed - Training job completed
Training seconds: 58
Billable seconds: 58


In [10]:
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m5.large')

----!

In [21]:
from sagemaker.serializers import CSVSerializer

test_data_array = test_data.drop(['door_open'], axis=1).values #load the data into an array
xgb_predictor.serializer = CSVSerializer() # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(test_data_array)
print(predictions)
print(predictions_array)
print(predictions_array.shape)

[[ 199    0   22   45  181    1]
 [ 200    0   21   47  176    1]
 [ 207    0   23   43 2954    1]
 [1024    0   22   49    0    1]
 [1024    0   22   44    2    1]
 [ 199    0   21   47  177    1]
 [1024    0   23   43    2    1]
 [ 207    0   25   41  174    1]
 [1024    0   23   41    0    1]
 [1024    0   22   48    2    1]
 [ 207    0   25   36  182    1]
 [1024    1   22   48    1    1]
 [ 206    0   22   47  181    1]
 [ 200    0   22   44  256    1]
 [1024    0   25   41    2    1]
 [1024    0   22   53    2    1]
 [ 199    0   22   46  180    1]
 [ 201    0   22   49  178    1]
 [ 199    0   22   49  181    1]
 [ 207    0   23   46  181    1]
 [ 200    0   22   45 3015    1]
 [1024    0   22   45    2    1]
 [ 199    0   22   50  197    1]
 [ 220    0   23   41  178    1]
 [ 206    0   22   47 2972    1]
 [1024    0   24   36    2    1]
 [ 207    0   24   38  182    1]
 [1024    0   25   35    0    1]
 [1024    0   22   47    2    1]
 [ 207    0   24   42  181    1]
 [ 200    

In [22]:
predictions_array_for_metrics = np.round(predictions_array)
test_data_true_values = [1,1,1,0,0,1,0,1,0,0,1,0,1,1,0,0,1,1,1,1,1,0,1,1,1,0,1,0,0,1,1,0,0,1,0,1,1,1,0,1,0,0,1,1,1,1,0,1,0,0,1,1,1,1,0,0,1,0,0,0,0,1,1,0,0,1,0,1,0,1,1,0,1,1,0,0,1,1,0,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,1,0,1,1,1,0,0,1,0,0,1,0,1,0,0,0,1,1,1,0,1,0]
print('Mean squared error: ', mean_squared_error(predictions_array_for_metrics, test_data_true_values))
#print('Mean absolute error: ', mean_absolute_error(predictions_array_for_metrics, test_data_true_values))
#print('Median Absolute error: ', median_absolute_error(predictions_array_for_metrics, test_data_true_values))
predictions_array_for_metrics

Mean squared error:  0.0


array([1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1.,
       0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1.,
       1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1.,
       0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0.,
       1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0.])

In [23]:
cm = pd.crosstab(index=test_data['door_open'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
print(cm)
print(cm.iloc[0,0])
print(cm.iloc[1,0])

Predicted  0.0  1.0
Observed           
0           54    0
1            0   63
54
0


In [24]:
cm = pd.crosstab(index=test_data['door_open'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
print(cm)
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "Not Open", "Open"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("Not Open", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Open", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))

Predicted  0.0  1.0
Observed           
0           54    0
1            0   63

Overall Classification Rate: 100.0%

Predicted      Not Open           Open
Observed
Not Open       100% (54)     0% (0)
Open            0% (0)    100% (63) 

