In [43]:
import pandas as pd
import boto3
import numpy as np
from sklearn.model_selection import train_test_split
from preprocessing import datetime, read_dataset, find_season, create_weekday, scale_outliers, drop
import os
from sagemaker import get_execution_role
from sagemaker import image_uris
import io
import json

In [2]:
# Where the raw spending dataset lives in S3.
bucket = 'cost-prediction-sports-equipment'
data_key = 'spending.csv'
data_location = f's3://{bucket}/{data_key}'

# The s3:// URL is handed straight to pandas; the frame is previewed below.
df = pd.read_csv(data_location)
df.head()

Unnamed: 0.1,Unnamed: 0,Date,mode_of_purchase,sporting_equipment,store,estimated_spending,number_of_items_bought
0,0,2019-08-01,order,fitness,east,119.412,5
1,1,2019-08-01,order,jog/run,south,212.506,6
2,2,2019-08-01,order,pilates,south,107.512,4
3,3,2019-08-01,order,pilates,east,65.651,5
4,4,2019-08-01,physical_collection,diving,west,64.871,2


In [3]:
# Feature engineering with the project-local helpers imported from `preprocessing`.
# Note: `datetime` here is that module's helper, not the standard-library module.
df = datetime(df)

# One season label per row, derived from the 'month' column
# (presumably added by datetime() above -- TODO confirm in preprocessing.py).
season_list = [find_season(month) for month in df['month']]
df['seasons'] = season_list

df = create_weekday(df)
df = scale_outliers(df)
df = drop(df)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['estimated_spending'][df['estimated_spending']>3*UL]=3*UL


Unnamed: 0,mode_of_purchase,sporting_equipment,store,estimated_spending,number_of_items_bought,seasons,weekday
0,order,fitness,east,119.412,5,Summer,weekday
1,order,jog/run,south,212.506,6,Summer,weekday
2,order,pilates,south,107.512,4,Summer,weekday
3,order,pilates,east,65.651,5,Summer,weekday
4,physical_collection,diving,west,64.871,2,Summer,weekday


In [4]:
def split_data(df,test_size,valid_size):
    """Split ``df`` into train / validation / test features and targets.

    The target is the 'estimated_spending' column; every other column is a
    feature. The test set is carved out first, then the validation set is
    taken from what remains, so ``valid_size`` is a fraction of the
    remainder, not of the full frame. Both splits use ``random_state=0``.

    Parameters:
        df: frame containing an 'estimated_spending' column.
        test_size: passed to the first ``train_test_split`` call.
        valid_size: passed to the second ``train_test_split`` call.

    Returns:
        (X_train, X_valid, y_train, y_valid, X_test, y_test)
    """
    target = 'estimated_spending'
    features = df.drop(columns=[target])
    labels = df[target]

    # First cut: hold out the test set.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=0
    )
    # Second cut: take validation out of the remaining rows.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest, y_rest, test_size=valid_size, random_state=0
    )
    return X_train, X_valid, y_train, y_valid, X_test, y_test

# 15% test; validation is 20% of the remaining 85% (i.e. 17% of all rows).
X_train,X_valid,y_train,y_valid,X_test,y_test = split_data(df, test_size=0.15, valid_size=0.2)

In [5]:
# Row counts of the train / validation / test feature frames.
tuple(map(len, (X_train, X_valid, X_test)))

(34003, 8501, 7501)

In [6]:
def one_hot_encode(df):
    """Return ``pd.get_dummies(df)``: the frame with its categorical columns
    expanded into indicator columns.

    Only categories that actually occur in ``df`` produce a column, so frames
    encoded separately are not guaranteed to share the same columns.
    """
    return pd.get_dummies(df)

In [7]:
# Encode the training features first; their columns define the feature layout.
X_train=one_hot_encode(X_train)

# Encoding each split on its own only creates columns for categories present
# in that split. The CSVs written later carry no header, so any difference in
# column set or order would silently shift features. Align validation and test
# to the training columns: missing indicators become 0, unseen ones are dropped.
X_valid=one_hot_encode(X_valid).reindex(columns=X_train.columns, fill_value=0)
X_test=one_hot_encode(X_test).reindex(columns=X_train.columns, fill_value=0)

In [8]:
# Local staging directory for the CSV files that are uploaded to S3.
data_dir = 'data'
# exist_ok=True avoids the check-then-create race and raises immediately if
# 'data' already exists as a regular file instead of failing later on write.
os.makedirs(data_dir, exist_ok=True)

In [9]:
# Write header-less, index-less CSVs.
# test.csv holds features only; validation.csv and train.csv put the target
# in the first column, followed by the features.
X_test.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)
pd.concat([y_valid, X_valid], axis='columns').to_csv(
    os.path.join(data_dir, 'validation.csv'), header=False, index=False
)
pd.concat([y_train, X_train], axis='columns').to_csv(
    os.path.join(data_dir, 'train.csv'), header=False, index=False
)

In [10]:
# Preview the first rows of the encoded test features.
X_test.head()

Unnamed: 0,number_of_items_bought,mode_of_purchase_order,mode_of_purchase_physical_collection,sporting_equipment_badminton,sporting_equipment_basketball,sporting_equipment_boxing,sporting_equipment_climbing,sporting_equipment_crosstrain,sporting_equipment_cycling,sporting_equipment_diving,...,sporting_equipment_yoga,store_east,store_south,store_west,seasons_Autumn,seasons_Spring,seasons_Summer,seasons_Winter,weekday_weekday,weekday_weekend
15638,3,0,1,0,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,1,0
27513,8,1,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
50034,3,1,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
10114,5,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
12564,20,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1


In [11]:
import sagemaker

# SageMaker session used for the uploads below and for training later on.
session = sagemaker.Session()
bucket = 'cost-prediction-sports-equipment'

# Upload the three local CSVs to the bucket (no key_prefix is passed, so the
# SDK's default applies) and keep the returned S3 locations.
test_location, val_location, train_location = (
    session.upload_data(os.path.join(data_dir, filename), bucket=bucket)
    for filename in ('test.csv', 'validation.csv', 'train.csv')
)


In [12]:
# The current execution role is required when creating the model, because the
# training and inference code will need to access the model artifacts.
role = get_execution_role()

In [13]:
# Look up the URI of the Amazon-provided XGBoost container for this session's region.
# For convenience, training and inference both use this same container.
# NOTE(review): 'latest' leaves the actual XGBoost version up to AWS's tag
# mapping -- consider pinning an explicit version for reproducibility.
container = image_uris.retrieve(
    framework='xgboost',
    region=session.boto_region_name,
    version='latest',
)
prefix = 'cost-prediction-sports-equipment'

In [14]:
# Estimator wrapping the XGBoost container: one ml.m4.xlarge training instance.
# Artifacts go to <session default bucket>/<prefix>/output, which is not
# necessarily the bucket the input CSVs were uploaded to.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{session.default_bucket()}/{prefix}/output',
    sagemaker_session=session,
)

xgb.set_hyperparameters(
    # Tree shape and regularisation.
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    # Logging and evaluation.
    silent=0,
    eval_metric='rmse',
    # Up to 500 rounds, with early stopping after 10 rounds.
    early_stopping_rounds=10,
    num_round=500,
)

In [15]:
# Wrap the uploaded train / validation CSV locations as CSV training channels.
s3_input_train, s3_input_validation = (
    sagemaker.TrainingInput(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
)

In [16]:
# Launch the training job with the 'train' and 'validation' channels.
xgb.fit(
    inputs={
        'train': s3_input_train,
        'validation': s3_input_validation,
    }
)

2022-05-21 09:04:16 Starting - Starting the training job...
2022-05-21 09:04:43 Starting - Preparing the instances for trainingProfilerReport-1653123856: InProgress
............
2022-05-21 09:06:40 Downloading - Downloading input data...
2022-05-21 09:07:12 Training - Downloading the training image......
2022-05-21 09:08:08 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2022-05-21:09:08:12:INFO] Running standalone xgboost training.[0m
[34m[2022-05-21:09:08:12:INFO] File size need to be processed in the node: 3.71mb. Available memory size in the node: 8449.37mb[0m
[34m[2022-05-21:09:08:12:INFO] Determined delimiter of CSV input is ','[0m
[34m[09:08:12] S3DistributionType set as FullyReplicated[0m
[34m[09:08:12] 34003x40 matrix with 1360120 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-05-21:09:08:12:INFO] Determined delimiter of CSV input is ','[0m
[34m[09:08:12] S3Distribut

In [17]:
# Build a batch-transform object from the fitted estimator (one ml.m4.xlarge instance).
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [18]:
# Start a batch transform over the uploaded test CSV. The content is declared
# as text/csv and split_type='Line' asks for the input to be split on lines.
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

....................................
[34mArguments: serve[0m
[34m[2022-05-21 09:14:51 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2022-05-21 09:14:51 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2022-05-21 09:14:51 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2022-05-21 09:14:51 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2022-05-21 09:14:51 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2022-05-21 09:14:51 +0000] [23] [INFO] Booting worker with pid: 23[0m
[34m[2022-05-21 09:14:51 +0000] [24] [INFO] Booting worker with pid: 24[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-05-21:09:14:51:INFO] Model loaded successfully for worker : 21[0m
[34m[2022-05-21:09:14:51:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-05-21:09:14:51:INFO] Model loaded successfully for worker : 23[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2

In [19]:
# Block until the batch transform job has finished.
# NOTE(review): the log output repeats what the transform() cell already
# printed, so transform() may already have waited -- confirm whether this
# call is still needed.
xgb_transformer.wait()

[34mArguments: serve[0m
[35mArguments: serve[0m
[34m[2022-05-21 09:14:51 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2022-05-21 09:14:51 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2022-05-21 09:14:51 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2022-05-21 09:14:51 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2022-05-21 09:14:51 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2022-05-21 09:14:51 +0000] [23] [INFO] Booting worker with pid: 23[0m
[34m[2022-05-21 09:14:51 +0000] [24] [INFO] Booting worker with pid: 24[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-05-21:09:14:51:INFO] Model loaded successfully for worker : 21[0m
[34m[2022-05-21:09:14:51:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-05-21:09:14:51:INFO] Model loaded successfully for worker : 23[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-05-21:0

In [20]:
# Deploy the trained model behind a real-time endpoint (one ml.m4.xlarge
# instance) and keep the returned predictor.
# NOTE(review): no endpoint clean-up is visible in this notebook -- remember
# to delete the endpoint when finished to avoid ongoing charges.
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

-----------!

In [31]:
# Read the endpoint name from the predictor returned by deploy() instead of
# pasting a run-specific literal: auto-generated names embed a creation
# timestamp, so a hard-coded value goes stale on the next Restart & Run All.
endpoint_name = xgb_predictor.endpoint_name

In [23]:
def np2csv(arr):
    """Serialise an array-like to comma-separated text.

    Values are formatted with ``%g`` via ``np.savetxt`` (one row per line),
    and trailing whitespace -- including the final newline -- is stripped.

    Parameters:
        arr: array-like accepted by ``np.savetxt``.

    Returns:
        str: the CSV text.
    """
    with io.BytesIO() as buffer:
        np.savetxt(buffer, arr, delimiter=",", fmt="%g")
        text = buffer.getvalue().decode()
    return text.rstrip()

In [44]:
# Low-level runtime client used to call the deployed endpoint directly.
runtime = boto3.client("runtime.sagemaker")

# Send the first test row as a single CSV line.
payload = np2csv(X_test.iloc[:1])
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="text/csv",
    Body=payload,
)

# The response body is parsed as JSON; the prediction is shown as the cell output.
result = json.loads(response["Body"].read().decode())
result

71.19747924804688