In [27]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input,Session

In [15]:
# Name of the S3 bucket used for the training data and the model artifacts.
bucket_name = 'app-bucket1-kundan16'

# Region configured for the default boto3 session; printed so it can be checked.
my_region = boto3.session.Session().region_name
print(my_region)

us-east-1


In [16]:
# Create the S3 bucket that holds the training data and the model output.
s3 = boto3.resource('s3')
try:
    if my_region == 'us-east-1':
        # us-east-1 is the default location and must NOT be passed as a
        # LocationConstraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region has to be named explicitly; previously nothing
        # was created here while the success message was still printed.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    print('S3 bucket created successfully')
except Exception as e:
    # Best-effort: report the failure (e.g. bucket already exists) and carry on.
    print("Error",e)

S3 bucket created successfully


In [17]:
# S3 location under which the trained model artifacts will be written.
prefix = 'xgboost-as-a-built-in-algo'
out_path = f's3://{bucket_name}/{prefix}/output'
print(out_path)

s3://app-bucket1-kundan16/xgboost-as-a-built-in-algo/output


In [18]:
import pandas as pd
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded.
import urllib.request

# Download the bank marketing CSV next to the notebook.
try:
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ',e)

# Load the downloaded file; the first CSV column is used as the index.
try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [19]:
import numpy as np

# Shuffle all rows reproducibly, then take the first 70% for training and the
# remaining 30% for testing. Positional slicing is used instead of np.split,
# which triggers a pandas FutureWarning when applied to a DataFrame.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(model_data))
train_data, test_data = shuffled_data.iloc[:split_at], shuffled_data.iloc[split_at:]
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


  return bound(*args, **kwds)


In [24]:
#Putting training data into bucket
import os

# Write the label column (y_yes) first, followed by every feature column
# (both one-hot label columns dropped), with no index and no header row.
pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'],
                                                axis=1)],
                                                axis=1).to_csv('train.csv', index=False, header=False)

# S3 object keys always use '/', so build the key explicitly instead of with
# os.path.join, whose separator depends on the operating system.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training channel pointing at the uploaded CSV prefix.
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [25]:
#Putting test data into bucket
# Write the label column (y_yes) first, followed by every feature column
# (both one-hot label columns dropped), with no index and no header row.
pd.concat([test_data['y_yes'], test_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('test.csv', index=False, header=False)

# S3 object keys always use '/', so build the key explicitly instead of with
# os.path.join, whose separator depends on the operating system.
test_key = '{}/test/test.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(test_key).upload_file('test.csv')

# Channel pointing at the uploaded held-out CSV prefix.
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')
     

In [29]:
from sagemaker import image_uris

# Look up the XGBoost container image URI for the current session's region.
container = image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='1.0-1',
)


In [43]:
# Hyperparameters handed to the estimator; every value is passed as a string.
hyperparameters = dict(
    max_depth="6",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round="50",
)

In [44]:
# Build a SageMaker estimator that runs the XGBoost container image.
estimator = sagemaker.estimator.Estimator(
    # What to run and with which settings.
    image_uri=container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    # Where to run it.
    instance_type='ml.m5.large',
    instance_count=1,
    volume_size=5,  # 5 GB
    # Where the model artifacts are written.
    output_path=out_path,
    # Spot training: max_run is the run limit, max_wait the overall wait limit.
    use_spot_instances=True,
    max_run=300,
    max_wait=800,
)
     

In [41]:
# Bare expression: displays the estimator object's repr as the cell output (inspection only).
estimator

<sagemaker.estimator.Estimator at 0x7fe7bb85e5c0>

In [45]:
# Launch the training job: the train split feeds the 'train' channel and the
# held-out split feeds the 'validation' channel.
estimator.fit(
    {
        'train': s3_input_train,
        'validation': s3_input_test,
    }
)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-08-07-14-14-02-531


2024-08-07 14:14:02 Starting - Starting the training job...
2024-08-07 14:14:18 Starting - Preparing the instances for training...
2024-08-07 14:14:49 Downloading - Downloading input data......
2024-08-07 14:15:34 Downloading - Downloading the training image...
2024-08-07 14:16:20 Training - Training image download completed. Training in progress.[34m[2024-08-07 14:16:23.362 ip-10-0-215-22.ec2.internal:8 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[

In [None]:
#The final error is 9.615% on the train data and 10.326% on the test data

#The two errors are close, which suggests the model is not noticeably overfitting or underfitting



In [46]:
# Deploy the trained model to a real-time endpoint and keep the predictor handle.
xgb_predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-08-07-14-25-54-491
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-08-07-14-25-54-491
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-08-07-14-25-54-491


------!