<a href="https://colab.research.google.com/github/Joykareko/AWS-Udacity/blob/main/Training_Batch_transform_processing_job_endpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Training Job

In [None]:
  #Training Job
  import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris
from sagemaker.predictor import csv_serializer

session = sagemaker.Session()

role = get_execution_role()

# If you're following along, you'll need to upload these datasets to your own bucket in S3. 

test_location = 's3://sagemaker-us-west-2-565094796913/boston-xgboost-HL/test.csv'
val_location = 's3://sagemaker-us-west-2-565094796913/boston-xgboost-HL/validation.csv'
train_location = 's3://sagemaker-us-west-2-565094796913/boston-xgboost-HL/train.csv'

# We use this prefix to help us determine where the output will go. 

prefix = 's3://sagemaker-us-west-2-565094796913/'

# We need to get the location of the container. 

container = image_uris.retrieve('xgboost', session.boto_region_name, version='latest')

# Now that we know which container to use, we can construct the estimator object.
xgb = sagemaker.estimator.Estimator(container, # The image name of the training container
                                    role,      # The IAM role to use (our current role in this case)
                                    instance_count=1, # The number of instances to use for training
                                    instance_type='ml.m4.xlarge', # The type of instance to use for training
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                                                        # Where to save the output (the model artifacts)
                                    sagemaker_session=session) # The current SageMaker session
             
# These hyperparameters are beyond the scope of this course, but you can research the algoirthm here: 
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html    
    
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=10,
                        num_round=200)
                        
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

# The fit method launches the training job. 

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})


### Endpoint

In [None]:
# Endpoint
#
# Deploys a previously trained XGBoost model artifact to a real-time
# SageMaker endpoint.
from sagemaker import get_execution_role
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker import image_uris

role = get_execution_role()

# You'll need to confirm that this region is located in the same place as the S3 uri of your training job.
# (Check the upper right-hand side of the console.)

image_uri = image_uris.retrieve(framework='xgboost',region='us-west-2', version='latest')

# You'll need to replace this model data with the output S3 uri of your training job.

model_data = "s3://sagemaker-us-west-2-565094796913/boston-xgboost-HL/output/xgboost-2021-08-31-23-02-30-970/output/model.tar.gz"

# predictor_cls is passed so that deploy() returns a usable predictor object;
# a generic Model without predictor_cls makes deploy() return None.
model = Model(image_uri=image_uri, model_data=model_data, role=role, predictor_cls=Predictor)

predictor = model.deploy(initial_instance_count=1, instance_type="ml.m5.large")


### Batch Transform Job

In [None]:
from sagemaker import get_execution_role
from sagemaker.model import Model
from sagemaker import image_uris

# Batch Transform Job: runs offline inference over a CSV dataset in S3 using a
# previously trained model artifact.

role = get_execution_role()

# The region below must match the region of the S3 uri of your training job.
# (Check the upper right-hand side of the console.)
image_uri = image_uris.retrieve(framework='xgboost', region='us-west-2', version='latest')

# Replace this with the output uri of your own training job.
model_data = "s3://sagemaker-us-west-2-565094796913/boston-xgboost-HL/output/xgboost-2021-08-31-23-02-30-970/output/model.tar.gz"

# Replace this with the S3 location where the batch transform results should go.
batch_transform_output_path = "s3://sagemaker-us-west-2-565094796913/boston-xgboost-HL/test_batch_output-2"

# Replace this with the S3 uri of the dataset you want to run inference on.
batch_input_data = "s3://sagemaker-us-west-2-565094796913/boston-xgboost-HL/test.csv"

model = Model(image_uri=image_uri, model_data=model_data, role=role)

transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=batch_transform_output_path,
)

# The input is treated as an S3 prefix of CSV data, split line by line.
transform_options = {
    'data_type': 'S3Prefix',
    'content_type': 'text/csv',
    'split_type': 'Line',
}
transformer.transform(data=batch_input_data, **transform_options)


### Processing Job

In [None]:
#Processing job
%%writefile xgboost_process_script.py

# Execute this cell first to write this script to your local directory. 

import pandas

# This method filters out the column at index 1, which is the crime data. 

def filter_crime_data(input_data_path):
    """Load a CSV file and return it without the column at index 1.

    Args:
        input_data_path: Path of the CSV file to read. The first row is
            parsed as the header (pandas default).

    Returns:
        A new pandas.DataFrame holding every column of the input except the
        one at positional index 1 (the crime data, per the notebook).

    Raises:
        IndexError: If the CSV has fewer than two columns.
    """
    with open(input_data_path, 'r') as f:
        df = pandas.read_csv(f)
    # DataFrame.drop returns a new frame rather than mutating in place, so the
    # result has to be returned; otherwise the column is silently kept.
    return df.drop(columns=df.columns[[1]])

# Script entry point: reads the CSV at '/opt/ml/processing/input/data/train.csv',
# filters it, and writes the result as a csv to
# '/opt/ml/processing/output/data_processed'.

if __name__ == "__main__":
    input_csv = '/opt/ml/processing/input/data/train.csv'
    output_csv = '/opt/ml/processing/output/data_processed'
    filter_crime_data(input_csv).to_csv(output_csv)



In [None]:
import boto3

from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Runs xgboost_process_script.py as a SageMaker processing job on a
# scikit-learn processing container.

role = get_execution_role()

processor_settings = {
    'framework_version': '0.20.0',
    'role': role,
    'instance_type': 'ml.m5.large',
    'instance_count': 1,
}
sklearn_processor = SKLearnProcessor(**processor_settings)

# Replace 'source' below with the S3 location of the dataset you want to
# process; it is made available to the script under 'destination'.
train_input = ProcessingInput(
    source='s3://sagemaker-us-west-2-565094796913/boston-xgboost-HL/train.csv',
    destination='/opt/ml/processing/input/data/',
)

# Container directory whose contents are collected as the job's output.
processed_output = ProcessingOutput(source='/opt/ml/processing/output')

sklearn_processor.run(
    code='xgboost_process_script.py',
    inputs=[train_input],
    outputs=[processed_output],
)
