<a href="https://colab.research.google.com/github/ManognaKachiraju/Sagemaker-XgBoost/blob/main/sagemaker_implementation_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
Steps To Be Followed:

1) Importing necessary Libraries
2) Creating S3 bucket
3) Mapping train And Test Data in S3
4) Mapping The path of the models in S3

1) CREATING S3 BUCKETS

In [None]:
import sagemaker
import boto3
# boto3 is the AWS SDK for Python; it is used below to create, write to and empty the S3 bucket
from sagemaker.amazon.amazon_estimator import get_image_uri #get_image_uri to downlaod the pre-available algorithm image for our use
from sagemaker.session import s3_input, Session #we need a session to use s3 in a sagemaker instance

In [None]:
# Name of the S3 bucket that the following cells create and write to.
# CHANGE THIS VALUE: the bucket name has to be unique, so pick your own.
bucket_name = 'bankbucket'

# Region resolved by a fresh boto3 session; the bucket-creation cell branches on it.
my_region = boto3.session.Session().region_name
print(my_region)

In [None]:
# Create the S3 bucket named by `bucket_name` (previous cell) in the notebook's region.
s3 = boto3.resource('s3')
try:
    if my_region == 'us-east-1':
        # us-east-1 is the default location: create_bucket is called without a
        # LocationConstraint for it.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region has to be named explicitly; previously no bucket was
        # created here at all, yet the success message was still printed.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    print('S3 bucket created successfully')
except Exception as e:
    # Best-effort: report the failure (e.g. the name is already taken) and carry on.
    print('S3 error: ',e)

In [None]:
# Key prefix under which this notebook keeps everything it writes to the bucket.
prefix = 'xgboost-as-a-built-in-algo'

# S3 location handed to the estimator as its output path:
# s3://<bucket_name>/<prefix>/output
output_path = f's3://{bucket_name}/{prefix}/output'
print(output_path)

2) ADDING FILES IN BUCKETS

In [None]:
import pandas as pd

# Import the submodule explicitly: a bare `import urllib` does not guarantee that
# `urllib.request` has been loaded, so `urllib.request.urlretrieve` could fail
# with AttributeError.
import urllib.request

# Download the dataset and store it locally as bank_clean.csv.
try:
    urllib.request.urlretrieve(
        "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv",
        "bank_clean.csv",
    )
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ',e)

# Read the downloaded file into the `model_data` dataframe, using the first
# column of the CSV as the index.
try:
    model_data = pd.read_csv('./bank_clean.csv', index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    # NOTE: if this fails, `model_data` stays undefined and later cells will raise.
    print('Data load error: ',e)

In [None]:
### Train Test split
# The frame is split row-wise into train and test only; each part still holds
# both the features and the label columns (no separate x/y frames).
import numpy as np

# Shuffle all rows reproducibly, then cut at 70% of the row count.
# Positional slicing with .iloc replaces np.split(DataFrame, ...): np.split goes
# through DataFrame.swapaxes, which is deprecated in recent pandas releases.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(model_data))
train_data = shuffled_data.iloc[:split_at]   # first 70% of the shuffled rows
test_data = shuffled_data.iloc[split_at:]    # remaining 30%
print(train_data.shape, test_data.shape)

In [None]:
### Saving Train And Test Into Buckets
## Train data first

# os is used to assemble the object key for the upload
import os

# y_yes and y_no are the two one-hot encoded label columns, so only one of them
# is needed: y_yes is kept as the label and moved to the first column, y_no is
# dropped. The CSV is written without index and without header row, i.e. label
# first, followed by the feature columns.
pd.concat(
    [train_data['y_yes'], train_data.drop(columns=['y_no', 'y_yes'])],
    axis=1,
).to_csv('train.csv', index=False, header=False)

# Upload the local file to <prefix>/train/train.csv inside the bucket.
boto3.Session().resource('s3').Bucket(bucket_name).Object(
    os.path.join(prefix, 'train/train.csv')
).upload_file('train.csv')

# TrainingInput describing the S3 location (and CSV content type) of the
# training channel; it is passed to estimator.fit later.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=f's3://{bucket_name}/{prefix}/train',
    content_type='csv',
)

In [None]:
# Same layout as the training file: y_yes first, y_no dropped, no index/header.
pd.concat(
    [test_data['y_yes'], test_data.drop(columns=['y_no', 'y_yes'])],
    axis=1,
).to_csv('test.csv', index=False, header=False)

# Upload the local file to <prefix>/test/test.csv inside the bucket.
boto3.Session().resource('s3').Bucket(bucket_name).Object(
    os.path.join(prefix, 'test/test.csv')
).upload_file('test.csv')

# TrainingInput is also the class used for non-training channels; this one
# points at the test data and is later passed to fit() as 'validation'.
s3_input_test = sagemaker.inputs.TrainingInput(
    s3_data=f's3://{bucket_name}/{prefix}/test',
    content_type='csv',
)

3) Building Models using XGBoost - Inbuilt Algorithm

In [None]:
# Look up the image URI of the built-in XGBoost algorithm for the current
# region. image_uris.retrieve is the sagemaker>=2 replacement for the former
# get_image_uri, see https://sagemaker.readthedocs.io/en/stable/v2.html
from sagemaker import image_uris

container = image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="latest",
)

In [None]:
# Fixed hyperparameters for the XGBoost training job, as key/value pairs.
# No tuning happens in this notebook: the values are assumed to have been
# chosen elsewhere and are simply handed to the estimator below.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round=50,
)

In [None]:
# Construct a SageMaker estimator that runs the XGBoost container retrieved above.
# The keyword names follow the sagemaker>=2 API (instance_count, instance_type,
# volume_size, use_spot_instances, max_run, max_wait); the former train_* names
# are deprecated aliases and did not match the v2-style `image_uri` argument.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),  # IAM role of this notebook instance
    instance_count=1,                     # train on a single instance
    instance_type='ml.m5.2xlarge',
    volume_size=5,                        # 5 GB
    output_path=output_path,              # S3 output location defined earlier
    # The next three settings enable managed spot training to lower the bill;
    # max_wait (600) must not be smaller than max_run (300).
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)

In [None]:
# Start the training job. Both channels are the TrainingInput objects created
# in the upload cells: 'train' points at the training CSV, 'validation' at the
# test CSV in the same bucket.
estimator.fit(
    {
        'train': s3_input_train,
        'validation': s3_input_test,
    }
)

4) DEPLOY THE MODEL

In [None]:
# The training job above stored its model artifact under `output_path` in the
# bucket. Deploy that model as an endpoint backed by one ml.m4.xlarge instance
# and keep the returned predictor for the inference cells.
xgb_predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

5) USING THE DEPLOYED MODEL FOR PREDICTIONS

In [None]:
# The endpoint is fed CSV, so requests are serialized with CSVSerializer.
from sagemaker.serializers import CSVSerializer

# Feature matrix for inference: the test rows without the two label columns
# (axis=1 drops columns), as a plain array.
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values

# Setting the serializer is sufficient to send CSV. The former
# `xgb_predictor.content_type = 'text/csv'` assignment was removed: it is
# redundant with CSVSerializer and raises AttributeError on sagemaker>=2
# releases where content_type is a read-only property.
xgb_predictor.serializer = CSVSerializer()

# The endpoint answers with bytes; decode them into text.
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')

# Parse the comma-separated response into a float array. The slice skips the
# first character of the response text (it does not select a column).
# NOTE(review): confirm the response really starts with a character to drop.
predictions_array = np.fromstring(predictions[1:], sep=',')
print(predictions_array.shape)

In [None]:
predictions_array # display the array of predictions parsed from the endpoint response in the previous cell

In [None]:
# Confusion matrix and overall accuracy for the binary classifier (adapted from
# the AWS SageMaker documentation). Predictions are rounded to 0/1 and
# cross-tabulated against the observed y_yes labels.
cm = pd.crosstab(
    index=test_data['y_yes'],
    columns=np.round(predictions_array),
    rownames=['Observed'],
    colnames=['Predicted'],
)

# Rows are observed classes, columns are predicted classes.
tn, fp = cm.iloc[0, 0], cm.iloc[0, 1]
fn, tp = cm.iloc[1, 0], cm.iloc[1, 1]

# Share of correctly classified rows, in percent.
p = (tp + tn) / (tp + tn + fp + fn) * 100

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
# Percentages are relative to each predicted column (tn+fn and tp+fp).
print(
    "{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(
        "No Purchase", tn / (tn + fn) * 100, tn, fp / (tp + fp) * 100, fp
    )
)
print(
    "{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(
        "Purchase", fn / (tn + fn) * 100, fn, tp / (tp + fp) * 100, tp
    )
)

6) DELETING THE ENDPOINTS


In [None]:
# Delete the deployed endpoint. `endpoint_name` is the sagemaker>=2 attribute;
# `endpoint` is its deprecated v1 name.
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint_name)

# Handle to the bucket used throughout this notebook.
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)

# Remove every object in the bucket (models, folders and data). The bucket
# itself is NOT deleted here; that has to be done manually in S3.
bucket_to_delete.objects.all().delete()

In [None]:
1) MANUALLY DELETE THE BUCKET INSIDE S3
SELECT NOTEBOOK INSTANCE -> STOP -> (AFTER IT STOPS) DELETE