**Steps to be followed**
1. Importing necessary libraries
2. Creating S3 bucket (to save the models into AWS sagemaker - scaleable)
3. Mapping train and test data in S3 (map the output path of the model on S3)
4. Mapping the path of the models in S3

## Data Preparation

#### 1. Import Necessary Libraries

In [1]:
# Import necessary libraries

import sagemaker
import boto3 #read the S3 buckets if it's public
from botocore.exceptions import ClientError
import logging
import os

from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.amazon.amazon_estimator import get_image_uri

from sagemaker.session import s3_input, Session

#### 2. Create a new S3 bucket

In [3]:
bucket_name = 'atp-tennis-prediction'
my_region = boto3.session.Session().region_name # set the region of the instance
print(my_region)

None


In [3]:
# Create an S3 bucket
# Source: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-example-creating-buckets.html

def create_bucket(bucket_name, region=None):
    """Create an S3 bucket in a specified region

    If a region is not specified, the bucket is created in the S3 default
    region (us-east-1).

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False
    """

    # Create bucket
    try:
        if region is None:
            s3_client = boto3.client('s3')
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            s3_client = boto3.client('s3', region_name=region)
            location = {'LocationConstraint': region}
            s3_client.create_bucket(Bucket=bucket_name,
                                    CreateBucketConfiguration=location)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [4]:
# Call the function to create S3 bucket
create_bucket(bucket_name, my_region)

ERROR:root:An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


False

In [5]:
# Retrieve the list of existing buckets
s3 = boto3.client('s3')
response = s3.list_buckets()

# Output the bucket names
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')

Existing buckets:
  atp-tennis-prediction
  dev-terraform-state-bucket-1


#### 3. Mapping the output path

In [6]:
# Set an output path where the trained model will be saved
prefix = 'xgboost-as-a-built-in-algo'
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://atp-tennis-prediction/xgboost-as-a-built-in-algo/output


#### 4. Downloading the Dataset and Save it on S3

In [7]:
import pandas as pd
import urllib
try:
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ',e)

try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [8]:
### Train Test split

import numpy as np
train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [9]:
### Saving Train And Test Into Buckets
## We start with Train Data
import os
pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], 
                                                axis=1)], 
                                                axis=1).to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [10]:
# Test Data Into Buckets
pd.concat([test_data['y_yes'], test_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('test.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

## Model Building & Training

In [12]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the version depending on your preference.

container = get_image_uri(boto3.Session().region_name,
                         'xgboost',
                         repo_version='1.0-1')

#container = sagemaker.image_uris.retrieve(my_region,'xgboost',version='1.0-1')

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [13]:
# initialize hyperparameters
# Hyperparameter tuning should be donee separately
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":50
        }

In [15]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          train_instance_count=1, 
                                          train_instance_type='ml.m5.2xlarge', #2x large --> faster
                                          train_volume_size=5, # 5 GB 
                                          output_path=output_path,
                                          train_use_spot_instances=True, # important parameters to limit the building time/reduce the building hours
                                          train_max_run=300, # in seconds
                                          train_max_wait=600)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [16]:
# Input is from the S3 bucket

estimator.fit({'train': s3_input_train,'validation': s3_input_test}) # The input value is the path of the dataset

2022-04-17 12:44:46 Starting - Starting the training job...
2022-04-17 12:44:48 Starting - Launching requested ML instancesProfilerReport-1650199486: InProgress
......
2022-04-17 12:46:16 Starting - Preparing the instances for training......
2022-04-17 12:47:07 Downloading - Downloading input data
2022-04-17 12:47:07 Training - Downloading the training image...
2022-04-17 12:47:42 Uploading - Uploading generated training model
2022-04-17 12:47:42 Completed - Training job completed
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV 

### Deploy Machine Learning Model

In [57]:
xgb_predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type='ml.m4.xlarge')

# xgb_predictor = estimator.deploy(
#             initial_instance_count=1,
#             instance_type='ml.m4.xlarge')

# instance_count helps to have parallel instances (to get faster response)
# xlarge is selected so that the response time is less

-------!

**Prediction of the Test Data**

In [58]:
# Import a library to serialize a tabular dataset to be passed to the model
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# Drop the dependent feature from test data and convert into an array
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array

In [59]:
# Use .content_type everytime we use csv_serializer
xgb_predictor.CONTENT_TYPE = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = CSVSerializer() # set the serializer type
# xgb_predictor.deserializer = None

In [60]:
# Predict the test data
# Decoding is required because when we are doing the prediction, the format was encoded, so we need to decode
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!

In [61]:
# Take the first part of the particular data
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

(12357,)


In [62]:
predictions_array

array([0.05214286, 0.05660191, 0.05096195, ..., 0.03436061, 0.02942475,
       0.03715819])

In [63]:
# Creating a confusion matrix

# Crosstab 
cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")

# Based on the purchase/no purchase, check the classification matrix
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 89.7%

Predicted      No Purchase    Purchase
Observed
No Purchase    91% (10785)    34% (151)
Purchase        9% (1124)     66% (297) 



### Deleting the End Points

In [None]:
# # Delete endpoints of the estimator
# sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

# # Specify bucket to delete
# bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)

# # Everything will be deleted
# bucket_to_delete.objects.all().delete()

### Backup Code

In [64]:
# # Import a library to serialize a tabular dataset to be passed to the model
# from sagemaker.serializers import CSVSerializer
# from sagemaker.deserializers import CSVDeserializer

# # Drop the dependent feature from test data and convert into an array
# test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array

# class FMSerializer(CSVSerializer):
#     def serialize(self, data):
#         c = {'instances': []}
#         for row in data:
#             c['instances'].append({'features': row.tolist()})
#         return csv.dumps(c)


# xgb_predictor = estimator.deploy(
#             initial_instance_count=1,
#             instance_type='ml.m4.xlarge',
#             serializer=FMSerializer(),
#             deserializer=JSONDeserializer()
# )

# # instance_count helps to have parallel instances (to get faster response)
# # xlarge is selected so that the response time is less