# Process Pipeline for StockPicksAI using Amazon SageMaker
Processing job downloads input from Amazon Simple Storage Service (Amazon S3), then uploads outputs to Amazon S3 during or after the processing job.

In [1]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ScriptProcessor

region = boto3.session.Session().region_name

#role = get_execution_role()
# each person needs to create their own sagemaker roles (https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html)
role = 'arn:aws:iam::142074230768:role/w210sagemaker'


In [2]:
import pandas as pd
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [3]:
import json
from sagemaker.s3 import S3Downloader

In [4]:
#!cp preprocessing.py ./code
from boto3 import Session
import os

session = Session()#profile_name='rrj')
credentials = session.get_credentials()
# Credentials are refreshable, so accessing your access key / secret key
# separately can lead to a race condition. Use this to get an actual matched
# set.
current_credentials = credentials.get_frozen_credentials()
wrds_username = os.environ.get('WRDS_USERNAME')
# I would not recommend actually printing these. Generally unsafe.
# print(current_credentials.access_key)
# print(current_credentials.secret_key)
# print(current_credentials.token)
# print(wrds_username)

In [5]:
# You must have a Wharton password set up and a .pgpass file created
#
wrdspass = !cat ~/.pgpass

In [5]:
!mkdir docker

mkdir: cannot create directory ‘docker’: File exists


In [6]:
!cp s3finance.py docker

In [7]:
%%writefile docker/Dockerfile

FROM python:3.7-slim-buster

RUN pip3 install boto3 joblib matplotlib numpy pandas pandas_market_calendars psutil python_dateutil s3fs \
scikit_learn tqdm wrds fastparquet pyarrow matplotlib matplotlib-label-lines sklearn datetime python-dateutil \
statistics pandas_market_calendars joblib scipy shap xgboost category_encoders seaborn ipython

RUN pwd
COPY s3finance.py /tmp
ENV PYTHONUNBUFFERED=TRUE

ENTRYPOINT ["python3"]

Overwriting docker/Dockerfile


In [8]:
import boto3

account_id = boto3.client("sts").get_caller_identity().get("Account")
ecr_repository = "sagemaker-processing-container"
tag = ":latest"

uri_suffix = "amazonaws.com"
if region in ["cn-north-1", "cn-northwest-1"]:
    uri_suffix = "amazonaws.com.cn"
processing_repository_uri = "{}.dkr.ecr.{}.{}/{}".format(
    account_id, region, uri_suffix, ecr_repository + tag
)

# Create ECR repository and push docker image
!docker build -t $ecr_repository docker


Sending build context to Docker daemon  14.34kB
Step 1/6 : FROM python:3.7-slim-buster
 ---> 7eb290bee268
Step 2/6 : RUN pip3 install boto3 joblib matplotlib numpy pandas pandas_market_calendars psutil python_dateutil s3fs scikit_learn tqdm wrds fastparquet pyarrow matplotlib matplotlib-label-lines sklearn datetime python-dateutil statistics pandas_market_calendars joblib scipy shap xgboost category_encoders seaborn ipython
 ---> Using cache
 ---> ed264f71c5c8
Step 3/6 : RUN pwd
 ---> Using cache
 ---> 872bef0a717d
Step 4/6 : COPY s3finance.py /tmp
 ---> Using cache
 ---> fb05e90d9346
Step 5/6 : ENV PYTHONUNBUFFERED=TRUE
 ---> Using cache
 ---> 48772d07d26c
Step 6/6 : ENTRYPOINT ["python3"]
 ---> Using cache
 ---> e078aea4dfd6
Successfully built e078aea4dfd6
Successfully tagged sagemaker-processing-container:latest


In [9]:
!$(aws ecr get-login --region $region --registry-ids $account_id --no-include-email)
!aws ecr create-repository --repository-name $ecr_repository
!docker tag {ecr_repository + tag} $processing_repository_uri
!docker push $processing_repository_uri

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded

An error occurred (RepositoryAlreadyExistsException) when calling the CreateRepository operation: The repository with name 'sagemaker-processing-container' already exists in the registry with id '142074230768'
The push refers to repository [142074230768.dkr.ecr.us-west-2.amazonaws.com/sagemaker-processing-container]

[1Bbe57e6ac: Preparing 
[1Bd9c091f4: Preparing 
[1B30a9ebbb: Preparing 
[1B39f02cb7: Preparing 
[1B9b2c8960: Preparing 
[1Bb6c0b614: Preparing 
[1Bdb357ed5: Layer already exists [2A[2Klatest: digest: sha256:9f8f066206ef61e5aabf8a1b62b7c7967fdaf3bd1e998aaed4fb5effc35cc429 size: 1792


In [6]:
processing_repository_uri='142074230768.dkr.ecr.us-west-2.amazonaws.com/sagemaker-processing-container'

In [8]:
# do id mapping
script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=processing_repository_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

In [14]:
# get company ID mappings
# Takes about 15-20 minutes
script_processor.run(
    code="getidmapping.py",
    inputs=[],
    outputs=[],
    arguments=["--aws-access-key-id", current_credentials.access_key,
               "--aws-secret-access-key", current_credentials.secret_key,
               "--wrds-username", wrdspass[0].split(':')[3],
               "--wrds-pgpass",wrdspass[0].split(':')[4],
               "--write-data","1"]
)
script_processor_job_description = script_processor.jobs[-1].describe()
#print(script_processor_job_description)

In [None]:
script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=processing_repository_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.12xlarge",
#    instance_type="ml.m5.xlarge",
)

In [7]:
processing_repository_uri

'142074230768.dkr.ecr.us-west-2.amazonaws.com/sagemaker-processing-container'

In [None]:
# this job gets all the price, and fundamental and quarterly data from Wharton Research Data Service
# It runs a long time (8-10) hours, so be aware - generates stuff in the raw summary area of S3
#
script_processor.run(
    code="getfeaturedata.py",
    inputs=[],
    outputs=[],
    arguments=["--aws-access-key-id", current_credentials.access_key,
               "--aws-secret-access-key", current_credentials.secret_key,
               "--wrds-username", wrdspass[0].split(':')[3],
               "--wrds-pgpass",wrdspass[0].split(':')[4],
               "--write-data","1"]
)
script_processor_job_description = script_processor.jobs[-1].describe()
#print(script_processor_job_description)

In [None]:
print(script_processor_job_description)

In [None]:

# This job reads all the raw feature summaries and create final joined features that we will use
# for training.
#
script_processor.run(
    code="generatefeatures.py",
    inputs=[],
    outputs=[],
    arguments=["--aws-access-key-id", current_credentials.access_key,
               "--aws-secret-access-key", current_credentials.secret_key,
               "--write-data","1"]
)
script_processor_job_description = script_processor.jobs[-1].describe()
#print(script_processor_job_description)

In [None]:
print(script_processor_job_description)

In [8]:
script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=processing_repository_uri,
    role=role,
    instance_count=1,
#    instance_type="ml.m5.xlarge",
#    instance_type="ml.r5.16xlarge", # 6:27
#    instance_type="ml.c5.18xlarge", # 6:27
#    instance_type="ml.m5.24xlarge", # 11:00 hours
#    instance_type="ml.m5.12xlarge", #10: hours
    instance_type="ml.c5.18xlarge", #7: hours
)

In [13]:
#
# This job trains the models. We can dictate what features we look for and other parameters to choose
# appropriate stocks.
# When we run over long times from 1990 as in this case, it could take up to 10 or more hours to run.
# The default limit on Sagemaker is 24 hours so something to be careful of.
#

script_processor.run(
    code="trainmodels.py",
    inputs=[],
    outputs=[],
    arguments=["--aws-access-key-id", current_credentials.access_key,
               "--aws-secret-access-key", current_credentials.secret_key,
               "--write-data","1",
               "--period", "10",
               "--cycles", "300",
               "--min-price", "5",
               "--market-cap","300",
               "--output-analysis","0",
               "--train-start", "1990-01-01",
               "--end-period", "2023-08-01",
               "--key-features1", "adjusted_price,rd_sale,gsubind,fcf_ocf,stko,pe_op_basic,peg_trailing,volatility,gprof,inv_turn,ocf_lct,gpm,debt_ebitda,ggroup,peg_ltgforward,divyiel\
d,at_turn,pe_exi,pretret_earnat,sic,eps,rect_act,cash_debt,rect_turn,evm,pe_inc,invt_act",
               "--key-features2","naics,pay_turn,lt_ppent,de_ratio,sale_nwc,intcov_ratio,debt_assets,cash_conversion,roa,capei,spcindcd,totdebt_invcap,efftax,dpr,pretret_noa,debt_at,roce,profit_lct,cshoc,sal\
e_equity,prccd,prchd,prcld,prcod,prcstd,pe_op_basic",
               "--version","1659216789.930134",
               "--savemodel-cycle","12",
               "--scoring","f1"]
)
script_processor_job_description_train = script_processor.jobs[-1].describe()
#print(script_processor_job_description)

In [None]:
print(script_processor_job_description_train)

In [None]:
# Now we are going to do the model evaluations so that 
# we can see overall how our models predict outcomes over an xtended period
# and which features are most important.
#
script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=processing_repository_uri,
    role=role,
    instance_count=1,
#    instance_type="ml.m5.xlarge",
#    instance_type="ml.r5.16xlarge", 
#    instance_type="ml.c5.18xlarge",
#    instance_type="ml.m5.24xlarge",
    instance_type="ml.m5.12xlarge", #10: hours
)

In [None]:
script_processor.run(
#    code="generatefeatures.py",
    inputs=[],
    outputs=[],
    arguments=["--aws-access-key-id", current_credentials.access_key,
               "--aws-secret-access-key", current_credentials.secret_key,
               "--write-data","1"]
)
script_processor_job_description = script_processor.jobs[-1].describe()
#print(script_processor_job_description)