In [1]:
import pandas as pd
import numpy as np
import os
import joblib
from sagemaker.sklearn.estimator import SKLearn

In [2]:
# Locate the training CSV inside ./input_data, relative to the notebook's working directory.
cur_path = os.getcwd()
training_data_path = os.path.join(cur_path, 'input_data')

# os.listdir() returns entries in arbitrary order and also lists non-data entries
# (for example a .ipynb_checkpoints directory), so keep only CSV files and sort
# them to make the choice of training file deterministic.
training_files = sorted(
    name for name in os.listdir(training_data_path)
    if name.lower().endswith('.csv')
)
if not training_files:
    raise FileNotFoundError(
        "no .csv training file found in {}".format(training_data_path)
    )

# The first CSV (alphabetically) is the file that gets uploaded for training.
file = os.path.join(training_data_path, training_files[0])

In [3]:
# Display the path of the selected training file.
file

'/root/dynamic_pricing/v3/input_data/greenx_container_booking_raw.csv'

## Prepare the SageMaker session

In [4]:
import sagemaker
from sagemaker import get_execution_role
import boto3

# Create the SageMaker session object used by the rest of this notebook.
sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

## Upload the data for training

In [5]:
import boto3

In [6]:
# Destination S3 bucket and key prefix for the uploaded training data.
s3_bucket = 'bluex-booking-pridict'
s3_prefix = 'train_data'

In [7]:
# Display the configured key prefix.
s3_prefix

'train_data'

In [8]:
# S3 object keys always use '/' as the separator, so build the key explicitly
# rather than with os.path.join (which uses '\\' on Windows).
training_key = '{}/training.csv'.format(s3_prefix)

# Upload the local training file to s3://<s3_bucket>/<s3_prefix>/training.csv.
boto3.Session().resource('s3').Bucket(s3_bucket).Object(training_key).upload_file(file)

## Create SageMaker Scikit Estimator 
To run our Scikit-learn training script on SageMaker, we construct a `sagemaker.sklearn.estimator.SKLearn` estimator, which accepts several constructor arguments:

entry_point: The path to the Python script SageMaker runs for training and prediction.
role: The ARN of the IAM role used by the training job.

instance_type (optional): The type of SageMaker instance used for training. Note: Because Scikit-learn does not natively support GPU training, SageMaker Scikit-learn does not currently support training on GPU instance types.

sagemaker_session (optional): The session used to train on Sagemaker.

hyperparameters (optional): A dictionary passed to the train function as hyperparameters.

In [20]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"
script_path = 'train.py'

# Derive the training-data URI from the same bucket/prefix that the upload cell
# uses, so the uploaded object and the training input cannot drift apart.
s3_training_data = "s3://{}/{}/training.csv".format(s3_bucket, s3_prefix)

# Estimator that runs `script_path` in the SageMaker scikit-learn container.
# NOTE: the variable name shadows the `sklearn` package name within this notebook;
# it is kept because later cells refer to it.
sklearn = SKLearn(
    entry_point=script_path,
    framework_version=FRAMEWORK_VERSION,
    py_version='py3',
    instance_type="ml.c4.xlarge",
    role=role,
    base_job_name="kneighbor-regression-byo",
    sagemaker_session=sagemaker_session,
)

# Start the training job on the "training" channel. wait=False means this call
# does not block until the job finishes; the job status is checked separately.
inputs = {'training': s3_training_data}
sklearn.fit(inputs=inputs, wait=False)

In [21]:
# Name of the training job most recently started by the estimator; displayed as the cell output.
job_name = sklearn.latest_training_job.name
job_name

'kneighbor-regression-byo-2020-09-21-06-44-14-154'

In [22]:
import time
sm = boto3.client('sagemaker')

# Poll the training job every 20 seconds until it reaches a terminal state.
# 'Completed' ends the loop normally; 'Failed' and 'Stopped' raise instead of
# polling forever, because the job can never become 'Completed' afterwards.
while True:
    job_description = sm.describe_training_job(TrainingJobName=job_name)
    training_status = job_description['TrainingJobStatus']
    print("job status:{}".format(training_status))
    if training_status == 'Completed':
        break
    if training_status in ('Failed', 'Stopped'):
        raise RuntimeError(
            "training job {} ended with status {}: {}".format(
                job_name,
                training_status,
                job_description.get('FailureReason', 'no failure reason reported'),
            )
        )
    # Only sleep while the job is still running (no extra wait after completion).
    time.sleep(20)
print('all job completed')

job status:InProgress
job status:InProgress
job status:InProgress
job status:InProgress
job status:InProgress
job status:InProgress
job status:InProgress
job status:InProgress
job status:InProgress
job status:InProgress
job status:InProgress
job status:Completed
all job completed


## Deploy your trained model

In [24]:
# NOTE(review): in the recorded run the endpoint was named
# 'kneighbor-regression-byo-<timestamp>', so this assignment does not appear to
# set the endpoint name — confirm whether it is still needed.
sklearn.name = 'Kneighbor-regression_byo'
# Deploy the trained model to a single ml.c4.xlarge instance. wait=False is
# passed, so endpoint readiness has to be checked separately before predicting.
my_predictor = sklearn.deploy(initial_instance_count = 1, instance_type = 'ml.c4.xlarge', wait=False)

In [26]:
# Keep the endpoint name for the status checks and the final clean-up; displayed as the cell output.
endpoint_name = my_predictor.endpoint_name
endpoint_name

'kneighbor-regression-byo-2020-09-21-06-49-18-664'

In [27]:
# Check endpoint status: poll every 20 seconds until the endpoint is 'InService'.
# A 'Failed' endpoint raises instead of polling forever, because it can never
# become 'InService' afterwards.
while True:
    endpoint_description = sm.describe_endpoint(EndpointName=endpoint_name)
    endpoint_status = endpoint_description['EndpointStatus']
    print("job status:{}".format(endpoint_status))
    if endpoint_status == 'InService':
        break
    if endpoint_status == 'Failed':
        raise RuntimeError(
            "endpoint {} failed to deploy: {}".format(
                endpoint_name,
                endpoint_description.get('FailureReason', 'no failure reason reported'),
            )
        )
    # Only sleep while the endpoint is still being created or updated.
    time.sleep(20)
print('all endpoints in service')

job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:Creating
job status:InService
all endpoints in service


## Prediction on Test Dataset

In [39]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# NOTE(review): `content_types` (plural) is set as a plain attribute here — confirm
# the predictor actually reads it; the CSV serializer below may already
# determine the request content type.
my_predictor.content_types = 'text/csv'
# Send request payloads as CSV and parse responses as JSON.
my_predictor.serializer = CSVSerializer()
my_predictor.deserializer = JSONDeserializer()


In [40]:
# Column order of every row: fbx_diff, d_quote_search_amount, conversion_rate, peak_season
# The first four rows share fbx_diff '-190' with peak_season '1'; the last row
# is a single peak_season '0' sample.
_peak_rows = [
    ['-190', amount, rate, '1']
    for amount, rate in (('150', '95'), ('120', '90'), ('110', '80'), ('100', '70'))
]
_off_peak_rows = [['-500', '10', '30', '0']]

dummy_data = _peak_rows + _off_peak_rows

# All values stay strings; the frame uses the default integer column labels.
dummy_df = pd.DataFrame(data=dummy_data)

In [43]:
# Display the sample feature rows.
dummy_df

Unnamed: 0,0,1,2,3
0,-190,150,95,1
1,-190,120,90,1
2,-190,110,80,1
3,-190,100,70,1
4,-500,10,30,0


In [51]:
def predict(data, predictor=None):
    """Send ``data`` to a deployed endpoint and return the response wrapped in a list.

    Parameters
    ----------
    data
        Payload handed unchanged to the predictor's ``predict`` method.
    predictor : optional
        Any object exposing ``predict(data)``. When omitted, the notebook-level
        ``my_predictor`` is used, which keeps existing ``predict(data)`` calls
        working unchanged.

    Returns
    -------
    list
        A single-element list whose only item is the predictor's response.
    """
    if predictor is None:
        # Fall back to the endpoint deployed earlier in the notebook.
        predictor = my_predictor
    return [predictor.predict(data)]

In [52]:
# Convert the sample frame to a NumPy array and request predictions for all rows in one call.
result_out = predict(dummy_df.to_numpy())

In [53]:
# Display the predictions returned for the sample rows.
result_out

[[4.8, 5.4, 4.6, 4.6, 1.6]]

## Delete endpoints

In [54]:
import boto3
# Delete the endpoint created above; the API response is displayed as the cell output.
# NOTE(review): only delete_endpoint is called here — the endpoint configuration
# and model created during deployment are presumably left in place; confirm
# whether they should be removed as well.
sm = boto3.client('sagemaker')
sm.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'be8af933-aa68-4053-ae6d-abcbceca4d27',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'be8af933-aa68-4053-ae6d-abcbceca4d27',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Mon, 21 Sep 2020 07:21:48 GMT'},
  'RetryAttempts': 0}}