### Steps to be followed
1. importing necessary libraries
2. Create S3 Bucket
3. mapping train and test data in S3
4. Mapping the path of the model in S3

### The use of S3 Bucket is, 
1. In this we will be able to save the training and test data of our model.
2. What ever model we have trained in the sage maker we will save in the S3 Bucket
3. It acts as a stroage unit

In [1]:
import sagemaker
import boto3 # when ever we work with the sagemaker we have to import BOTO3
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session

In [15]:
import boto3

bucket_name = "bankapplications"  # Example bucket name, adjust as needed
my_region = boto3.session.Session().region_name
print("Selected region:", my_region)

s3 = boto3.resource("s3")

try:
    if my_region == "us-east-1":
            s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': my_region})
        print("S3 bucket '{}' successfully created.".format(bucket_name))
except Exception as e:
    print("S3 error:", e)


Selected region: us-east-1


In [16]:
#set an output path where the trained model will be saved
prefix = "xgboost-as-a-built-in-algo"
output_path = "s3://{}/{}/output".format(bucket_name, prefix)
print(output_path)

s3://bankapplications/xgboost-as-a-built-in-algo/output


In [17]:
import pandas as pd
import urllib

try:
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print("Sucess: Downloaded bank_clean.csv")
except Exception as e:
    print("data load eerror: ", e)

try:
    model_name = pd.read_csv("./bank_clean.csv", index_col = 0)
    print("Sucess: data loaded into the dataframe")
except Exception as e:
    print("data load error: ", e)


Sucess: Downloaded bank_clean.csv
Sucess: data loaded into the dataframe


In [19]:
# train test split using numpy library

import numpy as np

train_data, test_data = np.split(model_name.sample(frac=1, random_state=124), [int(0.7*len(model_name))])
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [21]:
### Saving Train And Test Into Buckets
## We start with Train Data
import os
pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], 
                                                axis=1)], 
                                                axis=1).to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [22]:
### Saving Test Into Buckets
## We start with Test Data
import os
pd.concat([test_data['y_yes'], test_data.drop(['y_no', 'y_yes'], 
                                                axis=1)], 
                                                axis=1).to_csv('test.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

## Building the xgboost inbuilt algorithm

In [30]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.

import sagemaker
container = sagemaker.image_uris.retrieve(
    "xgboost", region=boto3.Session().region_name, version = "1.2-1"
)

In [31]:

# initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":50
        }

In [36]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          train_instance_count=1, 
                                          train_instance_type='ml.m5.2xlarge', 
                                          train_volume_size=5, 
                                          output_path=output_path,
                                          train_use_spot_instances=True,
                                          train_max_run=300,
                                          train_max_wait=600)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [33]:
from sagemaker.estimator import Estimator
from sagemaker.image_uris import retrieve

In [37]:
estimator.fit({"train": s3_input_train, "validation": s3_input_test})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-08-20-17-19-56-480


2023-08-20 17:19:56 Starting - Starting the training job...
2023-08-20 17:20:12 Starting - Preparing the instances for training......
2023-08-20 17:21:09 Downloading - Downloading input data...
2023-08-20 17:21:50 Training - Training image download completed. Training in progress....
2023-08-20 17:22:21 Uploading - Uploading generated training model[34m[2023-08-20 17:22:13.263 ip-10-0-213-242.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m

## Deploy Machine Learning Model As Endpoints

In [38]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-08-20-17-26-24-566
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-08-20-17-26-24-566
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-08-20-17-26-24-566


---------!

### prediction of the test data 

In [41]:

from sagemaker.serializers import CSVSerializer

# Assuming you already have the xgb_predictor object created

test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = CSVSerializer()  # Use the CSVSerializer from sagemaker.serializers

predictions = xgb_predictor.predict(test_data_array).decode('utf-8')
predictions_array = np.fromstring(predictions[1:], sep=',')
print(predictions_array.shape)

(12357,)


In [42]:
predictions_array

array([0.03087804, 0.04877723, 0.07058939, ..., 0.05399493, 0.33494297,
       0.03112546])

In [43]:
cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))



Overall Classification Rate: 89.8%

Predicted      No Purchase    Purchase
Observed
No Purchase    91% (10787)    34% (163)
Purchase        9% (1092)     66% (315) 



In [44]:
# deleteling the end points
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-08-20-17-26-24-566


[{'ResponseMetadata': {'RequestId': 'F5N0WRSKD53FF0J3',
   'HostId': 'MoVrFokLLNAzlvTQmMV9BlxeGL/fGt0UID73sK2vz1CU4sHmx0YvHnLx1xXgb2dzn8501XM+tbKp7CazeSQVjQ==',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'MoVrFokLLNAzlvTQmMV9BlxeGL/fGt0UID73sK2vz1CU4sHmx0YvHnLx1xXgb2dzn8501XM+tbKp7CazeSQVjQ==',
    'x-amz-request-id': 'F5N0WRSKD53FF0J3',
    'date': 'Sun, 20 Aug 2023 17:38:30 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-08-20-17-19-56-480/debug-output/events/000000000020/000000000020_worker_0.tfevents'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-08-20-17-19-56-480/debug-output/index/000000000/000000000030_worker_0.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-08-20-17-19-56-480/debug-output/events/000000000010/000000000010