In [1]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl (173.6 MB)
     |████████████████████████████████| 173.6 MB 5.5 kB/s              
Installing collected packages: xgboost
Successfully installed xgboost-1.5.2


In [7]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator, Framework
from sagemaker.xgboost.estimator import XGBoost
import seaborn as sns
import matplotlib.pyplot as plt
import os
import json
import xgboost as xgb

from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)


In [8]:
prefix = "sagemaker/binaryforecaster"

sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

s3_data_path = f"{bucket}/{prefix}/data"
s3_output_path = f"{bucket}/{prefix}/output"
# s3_data_train_path = sagemaker.s3_input(s3_data=f"s3://{s3_data_path}/train/", content_type='application/json')
s3_data_train_path = sagemaker.TrainingInput(s3_data=f"s3://{s3_data_path}/train/", content_type='text/csv')
# s3_data_valid_path = sagemaker.s3_input(s3_data=f"s3://{s3_data_path}/valid/", content_type='application/json')
# s3_data_valid_path = sagemaker.TrainingInput(s3_data=f"s3://{s3_data_path}/validation/", content_type='text/csv')
data_channels = {"train": s3_data_train_path, 
#                  "validation": s3_data_valid_path
                }
# data_channels = {"train": f"s3://{s3_data_path}/train/", "validation": f"s3://{s3_data_path}/validation/"}

In [36]:
from sagemaker.xgboost.estimator import Framework
from sagemaker.sklearn.estimator import SKLearn
definition = "xgboost"
if definition == "sklearn":
    estimator = SKLearn(
        entry_point = "hpo.py",
        framework_version="0.23-1",
        sagemaker_session=sagemaker_session,
        role=role,
        instance_count=1,
    #     instance_type="ml.m5.xlarge",
        instance_type = "ml.m5.4xlarge",
        base_job_name="XGBForecaster",
        output_path=f"s3://{s3_output_path}",
    )
elif definition == "xgboost":
    estimator = XGBoost(
        entry_point = "hpo1.py",
        framework_version="1.2-1",
        sagemaker_session=sagemaker_session,
        role=role,
        instance_count=1,
    #     instance_type="ml.m5.xlarge",
        instance_type = "ml.m5.4xlarge",
        base_job_name="XGBForecaster",
        output_path=f"s3://{s3_output_path}",
    )

hyperparameter_ranges = {
    "alpha": ContinuousParameter(0.01, 10, scaling_type="Logarithmic"),
#     "lambda": ContinuousParameter(0.01, 10, scaling_type="Logarithmic"),
    "gamma" : ContinuousParameter(0.01, 10, scaling_type = "Logarithmic"),
    "eta" : ContinuousParameter(0.01, 0.5, scaling_type = "Logarithmic"), 
    "max_depth" : IntegerParameter(0, 10), 
#     "early_stopping_rounds": []
#     "num_rounds" : IntegerParameter(10, 500, scaling_type= "Logarithmic")
}


# objective_metric_name = "Validation Accuracy"
objective_metric_name = "validation:auc"
objective_type = "Maximize"
metric_definitions = [{"Name": "Validation Accuracy", 
                       "Regex": "Validation Accuracy: ([0-9\\.]+)"}]


tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
#     metric_definitions,
    max_jobs=20,
    max_parallel_jobs=5,
    objective_type=objective_type
)

os.environ['SM_CHANNEL_TRAINING']=f"s3://{s3_data_path}/train/"
# os.environ['SM_MODEL_DIR']=f's3://{bucket}/dogImages/model/'
os.environ['SM_OUTPUT_DATA_DIR']=s3_output_path+'/'

tuner.fit(data_channels)


..........................................................................................................................................................................................................!


In [57]:
from sagemaker.xgboost.estimator import XGBoost
BetterTrainingJobName=tuner.best_training_job()
print("Best training job:", BetterTrainingJobName)


Best training job: sagemaker-xgboost-220203-2313-016-3fdd822f


In [40]:
# Note that the training hyperparameters are manually collected, due to a bug on Sagemaker 
# The above cell has shown that a best training job is obtained from the tuning jobs we submitted
# The bug on Sagemaker makes it hard to retrieve the hyperparameters. The bug is caused by how 
# Sagemaker devs code up the regular expression in XGBoost.attach() method. 

# Problem specified here: https://github.com/aws/sagemaker-python-sdk/issues/2202

from sagemaker import image_uris
xgb_image = image_uris.retrieve(framework='xgboost',
                                region=region,
                                version='latest')

hyperparameters = {
    "alpha" : 0.015533316376102427, 
    "eta" : 0.1747233886055575,
    "gamma" : 0.010016366689420794,
    "max_depth" : int(7)
}

estimator = sagemaker.estimator.Estimator( # Define another estimator so that we can set hyperparameters. 
    sagemaker_session=sagemaker_session,
    image_uri=xgb_image, # built in image_uri of XGB. 
    role=role,
    instance_count=1,
#     instance_type="ml.m5.xlarge",
    instance_type = "ml.m5.4xlarge",
    base_job_name="XGBForecaster",
    output_path=f"s3://{s3_output_path}",
)

estimator.set_hyperparameters(
    learning_rate = hyperparameters['eta'], 
    min_child_weight = 7, # this is from EDA
    reg_alpha = hyperparameters['alpha'], 
    eval_metric = "auc", 
    num_round=200, # same as tuning script
    objective = "binary:logistic"
)

estimator.fit(data_channels) # Fit data again. 

2022-02-03 23:37:43 Starting - Starting the training job...
2022-02-03 23:38:12 Starting - Launching requested ML instancesProfilerReport-1643931463: InProgress
......
2022-02-03 23:39:12 Starting - Preparing the instances for training.........
2022-02-03 23:40:43 Downloading - Downloading input data
2022-02-03 23:40:43 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-02-03:23:40:44:INFO] Running standalone xgboost training.[0m
[34m[2022-02-03:23:40:44:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-02-03:23:40:44:INFO] File size need to be processed in the node: 3.43mb. Available memory size in the node: 55362.62mb[0m
[34m[2022-02-03:23:40:44:INFO] Determined delimiter of CSV input is ','[0m
[34m[23:40:44] S3DistributionType set as FullyReplicated[0m
[34m[23:40:44] 971x192 matrix with 186432 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[23:40:44] src/t

In [41]:
predictor = estimator.deploy(initial_instance_count = 1, instance_type="ml.m5.4xlarge")

-----!

In [73]:
# Endpoint created. Now we read local data to invoke endpoint. 
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer, CSVDeserializer
from sagemaker.predictor import Predictor

predictor = Predictor(endpoint_name = predictor.endpoint_name, 
                     sagemaker_session=sagemaker_session, 
                     serializer = CSVSerializer(content_type='text/csv'), 
                     )

import numpy as np
import pandas as pd
df_test = pd.read_csv(os.path.join("data", "test.csv"),header = None).values
# print(df_test.head(5))
# X_test =
X_test = df_test[:, 1:]
y_test = df_test[:, 0]
# print(X)
# response = predictor.predict(data = X_test, initial_args={'ContentType': 'text/csv'})

response = predictor.predict(data = X_test, initial_args ={'ContentType': 'text/csv'})
result = np.fromstring(response.decode("utf-8"), sep = ",")

In [74]:
# See prediction below. The results from Sagemaker XGBoost is returned in form of logit probabilities. 
print(result[:20])
print(len(result))

[0.05404769 0.62995422 0.70535654 0.92642754 0.8985942  0.88099909
 0.45903829 0.95404589 0.40508902 0.32953069 0.88420665 0.92696571
 0.50839639 0.88147193 0.97923511 0.92640024 0.30640066 0.92987418
 0.54030359 0.25016189]
271


In [75]:
y_pred = (result > 0.5)*1
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score
print("Precision:", precision_score(y_pred, y_test))
print("Accuracy:", accuracy_score(y_pred, y_test))

Precision: 0.764102564102564
Accuracy: 0.6457564575645757


In [91]:
# Now, for an interesting evaluation, we see how well will we do when we apply our predictions in actual trading. 
# Refer to the split function in processing notebook to see that we used all data up to 2020 year end for training.
# We fit the entirety of available 2021 data into the endpoint to see how well we can do with our robot trader! 

# This is exactly the same data as df_test in the last cell, but with headers. 
# This allows us to access return of spy. 

data = pd.read_csv(os.path.join("data", "test_header.csv")) 
portfolio_return = (data.loc[y_pred == 0, 'ret_spy']+1).prod()
print("Robot trader yields return:")
print(portfolio_return*100,"%")

Robot trader yields return:
75.41649007701396 %


In [92]:
# If you don't move your portfolio at all, the reuturn is
print("If you don't move position at all:")
print((data.loc[:,'ret_spy']+1).prod()*100,"%")

If you don't move position at all:
119.78205380582835 %


In [104]:
y_pred = (result > 0.95)*1
data = pd.read_csv(os.path.join("data", "test_header.csv")) 
portfolio_return = (data.loc[y_pred == 0, 'ret_spy']+1).prod()
print("Robot trader yields return:")
print(portfolio_return*100,"%")
print("If you don't move position at all:")
print((data.loc[:,'ret_spy']+1).prod()*100,"%")

Robot trader yields return:
97.35611578530875 %
If you don't move position at all:
119.78205380582835 %


In [105]:
# We visualize different portfolio returns: 
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv(os.path.join("data", "test_header.csv")) 

thresholds = [0.5, 0.8, 0.9, 0.95]
for t in thresholds:
    data.loc[:,f'strat_{t}'] = [0]*len(data)
    y_pred = (result > t)
    data.loc[,f'strat_{t}'] = 

Unnamed: 0,target,ret_fb,ret_amzn,ret_aapl,ret_nflx,ret_goog,ret_spy,vol_fb,vol_amzn,vol_aapl,...,obv_aapl_lag_1,obv_nflx_lag_1,obv_goog_lag_1,obv_spy_lag_1,retbbk_fb_lag_1,retbbk_amzn_lag_1,retbbk_aapl_lag_1,retbbk_nflx_lag_1,retbbk_goog_lag_1,retbbk_spy_lag_1
0,0,-0.015449,-0.021585,-0.024719,-0.033048,-0.013494,-0.013614,0.301214,0.240926,0.360196,...,-1.594205e+07,1.189331e+06,-201247.277824,7.592349e+06,0.409283,-0.924786,-0.663642,1.470403,0.828912,0.651770
1,0,0.007548,0.010004,0.012364,-0.003940,0.007337,0.006887,0.288611,0.232408,0.362710,...,-3.292336e+07,4.381665e+05,-428000.974114,-8.114738e+06,-1.018081,-1.555281,-1.361245,-1.797984,-1.123047,-2.186590
2,0,-0.028269,-0.024897,-0.033661,-0.038998,-0.003234,0.005978,0.338522,0.271443,0.398363,...,-1.551159e+07,-3.810905e+04,-218227.510899,1.824054e+06,0.582784,0.575340,0.327373,-0.223171,0.809468,0.798741
3,0,0.020622,0.007577,0.034123,0.016784,0.029943,0.014858,0.343200,0.280128,0.368714,...,-3.412178e+07,-7.458545e+05,-536077.176112,1.598054e+07,-1.582687,-1.489289,-1.713915,-1.808889,-0.197946,0.843009
4,1,-0.004354,0.006496,0.008631,0.002967,0.011168,0.005698,0.344004,0.287937,0.380134,...,-1.496178e+07,-1.548872e+05,-162600.219297,2.301871e+07,1.303926,0.613495,1.502291,0.912561,2.238565,1.863985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,1,0.018274,0.013327,-0.004864,-0.026038,0.002152,0.004247,0.454760,0.417358,0.331411,...,-3.558229e+07,-9.922929e+06,-603369.591065,-6.841834e+07,-1.611755,-2.289869,-0.430995,-3.340417,-1.071391,-1.465027
267,1,-0.027728,-0.031534,-0.011385,-0.053545,-0.027893,-0.012209,0.451350,0.417850,0.347210,...,-5.247727e+07,-1.291267e+07,-155266.978923,-2.572471e+07,1.089652,1.222696,0.288352,0.048202,0.568829,1.078758
268,1,-0.018391,-0.007954,-0.000563,-0.018340,0.019762,-0.002509,0.482324,0.462628,0.377919,...,-6.080990e+07,-1.321042e+07,-374618.048400,-4.469439e+07,-0.852119,-0.906138,-0.254525,-0.381672,-1.057674,-0.590197
269,0,0.000034,0.005509,-0.002943,0.075063,-0.000921,-0.004938,0.488932,0.485341,0.397033,...,-6.713862e+07,-1.314023e+07,-60748.975280,-6.358728e+07,-0.536568,0.181758,0.528621,0.210383,1.542154,0.344516
