# Imports

In [1]:
import sagemaker
import boto3
from sagemaker.predictor import Predictor
import pandas as pd
import numpy as np
from urllib import request
import os

# Create the S3 bucket

In [2]:
bucket_name = 'newbucket'  # add bucket name
my_region = boto3.session.Session().region_name
print(my_region)

eu-north-1


In [3]:
s3 = boto3.resource('s3')
try:
    if my_region == 'eu-north-1':
        s3.create_bucket(Bucket=bucket_name, 
                         CreateBucketConfiguration={
                             'LocationConstraint': 'eu-north-1'})
    print('S3 bucket created')
except Exception as e:
    print(f'S3 error: {e}')

S3 error: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [4]:
prefix = 'xgboost-algo'
output_path = f"s3://{bucket_name}/{prefix}/output"
print(output_path)

s3://newbucket-199/xgboost-algo/output


## Download the Dataset and store to S3

#### Download the Dataset and create a Dataframe

In [5]:
try:
    request.urlretrieve(
        "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv",
        "bank_clean.csv")
    print('The bank_clean.csv was downloaded .')
except Exception as e:
    print('Data load error: ', e)

try:
    df = pd.read_csv('./bank_clean.csv', index_col=0)
    print('Dataframe was created')
except Exception as e:
    print('Data load error: ', e)

The bank_clean.csv was downloaded .
Dataframe was created


In [6]:
df.head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


#### Split the Data into train and test

In [7]:
# Train Test split

splitting_index = int(len(df) * 0.7)  # 0.7 = 70 % for train

train_data, test_data = np.split(df.sample(frac=1, random_state=42),
                                 [splitting_index])

print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [8]:
# Train Data to CSV

# make the target (y) column first
train_csv = pd.concat([train_data['y_yes'],
                       train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1)

# save to csv and drop index and headers
train_csv.to_csv('train.csv', index=False, header=False)

# Test Data to CSV

# make the target (y) column first
test_csv = pd.concat([test_data['y_yes'],
                      test_data.drop(['y_no', 'y_yes'], axis=1)], axis=1)

# save to csv and drop index and headers
test_csv.to_csv('test.csv', index=False, header=False)


#### Upload the train.csv and test.csv to S3

In [9]:
# Upload CSVs to S3 bucket

# train.csv
boto3.Session().resource('s3').Bucket(bucket_name).Object(
    os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')

s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://{bucket_name}/{prefix}/train",
    content_type='csv')

# test.csv
boto3.Session().resource('s3').Bucket(bucket_name).Object(
    os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')

s3_input_test = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://{bucket_name}/{prefix}/test",
    content_type='csv')

## Model: XGboost

#### Initiate the model and estimator

In [10]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
xgboost_container = sagemaker.image_uris.retrieve("xgboost",
                                                  my_region,
                                                  "1.7-1")
# initialize hyperparameters
hyperparameters = {
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "subsample": "0.7",
        "objective": "binary:logistic",
        "num_round": "50"
}


In [11]:
# construct a SageMaker estimator that calls the xgboost-container

estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container,
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1,
                                          instance_type='ml.m5.xlarge',
                                          volume_size=5,  # 5GB
                                          output_path=output_path,
                                          use_spot_instances=True,
                                          max_run=300,
                                          max_wait=600)

#### Train the model

In [12]:
estimator.fit({'train': s3_input_train, 'validation': s3_input_test})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-07-26-11-37-53-555


2023-07-26 11:37:53 Starting - Starting the training job...
2023-07-26 11:38:08 Starting - Preparing the instances for training.........
2023-07-26 11:39:42 Downloading - Downloading input data...
2023-07-26 11:40:07 Training - Downloading the training image...
2023-07-26 11:40:48 Uploading - Uploading generated training model[34m[2023-07-26 11:40:42.155 ip-10-0-185-208.eu-north-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-07-26 11:40:42.179 ip-10-0-185-208.eu-north-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-07-26:11:40:42:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-07-26:11:40:42:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2023-07-26:11:40:42:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-07-26:11:40:42:INFO] Running XGBoost Sagemaker in algorithm mode

## Deploy the model

In [13]:
xgb_predictor = estimator.deploy(initial_instance_count=1,
                                 instance_type='ml.m5.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-07-26-11-42-06-933
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-07-26-11-42-06-933
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-07-26-11-42-06-933


----!

## Prediction

In [15]:
# Create a predictor
endpoint = xgb_predictor.endpoint_name
predictor = Predictor(endpoint)

# Set the content type and serializer type
predictor.content_type = 'text/csv'
predictor.serializer = sagemaker.serializers.CSVSerializer()

# Define the test data
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values

# Make predictions
predictions = predictor.predict(test_data_array).decode('utf-8')

# Process the prediction result
predictions_array = np.fromstring(predictions[1:], sep='\n')

# Print the shape of the predictions array
print(predictions_array.shape)

(12357,)


In [16]:
predictions_array

array([0.02587362, 0.02734977, 0.08641617, ..., 0.68751293, 0.04483907,
       0.10314494])

In [17]:
# Initialize the confusion matrix
cm = pd.crosstab(index=test_data['y_yes'],
                 columns=np.round(predictions_array),
                 rownames=['Observed'],
                 colnames=['Predicted'])

tn = cm.iloc[0, 0]  # true negative
fn = cm.iloc[1, 0]  # false negtive
tp = cm.iloc[1, 1]  # true positive
fp = cm.iloc[0, 1]  # false positive

# metrics
accuracy = (tp+tn)/(tp+tn+fp+fn) * 100
precision = tp / (tp+fp) * 100
recall = tp / (tp + fn) * 100
f1 = 2*(precision*recall)/(precision+recall)

print("Metrics\n")
print(f"accuracy: {accuracy:<4.1f}%")
print(f"precision: {precision: <4.1f}%")
print(f"recall:    {recall: <4.1f}%")
print(f"f1 score:  {f1: <4.1f}%")
print("\n")
print(f"{'Predicted':<15}{'No Purchase':<15}{'Purchase':>8}")
print("Observed")
print(f"{'No Purchase':<15}{tn/(tn+fn)*100:<2.0f}% ({tn:<}){fp/(tp+fp)*100:>6.0f}% ({fp:<})")
print(f"{'Purchase':<16}{fn/(tn+fn)*100:<1.0f}% ({fn:<}){tp/(tp+fp)*100:>7.0f}% ({tp:<}) \n")

Metrics

accuracy: 89.8%
precision: 65.8%
recall:    21.0%
f1 score:  31.8%


Predicted      No Purchase    Purchase
Observed
No Purchase    91% (10809)    34% (152)
Purchase        9% (1103)     66% (293) 



the metrics indicate an imbalanced dataset

## Delete Endpoint

In [18]:
# delete the endpoint
predictor.delete_endpoint()

# delete the files from bucket
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2023-07-26-11-42-06-933
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-07-26-11-42-06-933


[{'ResponseMetadata': {'RequestId': 'DKBXZZBE1EG42HT6',
   'HostId': 'DyMF9qWX7Au17CMZwhER0D8/lHBOMkuNerqfmOAoLFjXkMrSLb5O6WMkfKBv7d5YCC0gDQBpMDw=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'DyMF9qWX7Au17CMZwhER0D8/lHBOMkuNerqfmOAoLFjXkMrSLb5O6WMkfKBv7d5YCC0gDQBpMDw=',
    'x-amz-request-id': 'DKBXZZBE1EG42HT6',
    'date': 'Wed, 26 Jul 2023 11:45:42 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-algo/test/test.csv'},
   {'Key': 'xgboost-algo/output/sagemaker-xgboost-2023-07-26-11-37-53-555/debug-output/index/000000000/000000000030_worker_0.json'},
   {'Key': 'xgboost-algo/output/sagemaker-xgboost-2023-07-26-11-37-53-555/debug-output/collections/000000000/worker_0_collections.json'},
   {'Key': 'xgboost-algo/output/sagemaker-xgboost-2023-07-26-11-37-53-555/debug-output/index/000000000/000000000020_worker_0.json'},
   {'Key': 'x