# Objective
Productionize maching learning model using weights.pkl

In [33]:
import joblib
import pandas as pd

In [34]:
# Get current working directory
import os
cwd = os.getcwd()
cwd

'/Users/yibin/Documents/MachineLearningDeployment/xgboost_model'

# Write config file

In [35]:
!mkdir config

mkdir: config: File exists


In [36]:
%%writefile ./config/config.yml
data_downloaded_path: "/data/VTI_downloaded.csv" # specify the path of your downloaded file here
data_processed_path: "/data/VTI.csv"             # specify the path of your processed file here
stock_code: "VTI"                                # stock code

Overwriting ./config/config.yml


In [37]:
# Load config file
import yaml

with open("./config/config.yml", 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
config

{'data_downloaded_path': '/data/VTI_downloaded.csv',
 'data_processed_path': '/data/VTI.csv',
 'stock_code': 'VTI'}

# Process data into right format

In [38]:
# Load data
df = pd.read_csv(cwd+config['data_downloaded_path'])

# Remove unneccesary columns
df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)

# Change all column headings to be lower case, and remove spacing
df.columns = [str(x).lower().replace(' ', '_') for x in df.columns]

df.head()

Unnamed: 0,date,adj_close
0,2014-11-17,95.533783
1,2014-11-18,96.097031
2,2014-11-19,95.851738
3,2014-11-20,96.124275
4,2014-11-21,96.633034


In [39]:
# Save df to file
df.to_csv(cwd+config['data_processed_path'], index=False)

# Write job to extract latest price

In [40]:
%%writefile ./extract.py
import csv
import os
import requests
import yaml

from bs4 import BeautifulSoup
from datetime import date
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Disable InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Get current working directory
cwd = os.getcwd()

with open(cwd+"/config/config.yml", 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

def parse(stock_code):
    """
    Parse yahoo finance webpage
    :return:
    stock year low, stock year high, stock price
    """
    url = "https://finance.yahoo.com/quote/%s" % (stock_code)
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, "lxml")

    # Find current price
    y = soup.findAll('span', attrs={'class': 'Trsdu(0.3s) Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(b)', 'data-reactid': "14"})[0]

    return float(y.text.replace(',', ''))

def insert(path, date, price):
    """
    Insert new data into the data file specified by path
    """
    with open(path, 'a') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow([str(date.today()), str(price)])
    
# Get latest price
price = parse(config['stock_code'])

# Insert into data file
insert(cwd+config['data_processed_path'], date, price)

Overwriting ./extract.py


In [41]:
!python extract.py

# Write requirements file

In [42]:
%%writefile ./requirements.txt
beautifulsoup4==4.7.1
joblib==0.13.2
pandas==0.24.2
pyyaml==5.1.1
xgboost==0.90

Overwriting ./requirements.txt


# Write model file

In [43]:
%%writefile ./model.py

import joblib
import os
import pandas as pd
import numpy as np
import xgboost

from collections import defaultdict

# Get current working directory
cwd = os.getcwd()

class Xgboost_model:
    def __init__(self, N, H):
        """
        Initialize model.
        Inputs
            N: For feature at day t, we use lags from t-1, t-2, ..., t-N as features
            H: Forecast horizon, in days. Note there are about 252 trading days in a year
        """
        # Load model. This is what you get when you do joblib.dump(model, "weights.pkl")
        self.model = joblib.load(cwd+"/weights.pkl")

        # Load parameters
        self.N = N
        self.H = H

        # Get list of features
        self.features = []
        for n in range(self.N, 0, -1):
            self.features.append("adj_close_lag_"+str(n))

    def add_lags(self, df, N, lag_cols):
        """
        Add lags up to N number of days to use as features
        The lag columns are labelled as 'adj_close_lag_1', 'adj_close_lag_2', ... etc.
        """
        # Use lags up to N number of days to use as features
        df_w_lags = df.copy()
        # Add a column 'order_day' to indicate the order of the rows by date
        df_w_lags.loc[:, 'order_day'] = [x for x in list(range(len(df)))]
        merging_keys = ['order_day']  # merging_keys
        shift_range = [x+1 for x in range(N)]
        for shift in shift_range:
            train_shift = df_w_lags[merging_keys + lag_cols].copy()

            # E.g. order_day of 0 becomes 1, for shift = 1.
            # So when this is merged with order_day of 1 in df_w_lags, this will represent lag of 1.
            train_shift['order_day'] = train_shift['order_day'] + shift

            def foo(x): return '{}_lag_{}'.format(
                x, shift) if x in lag_cols else x
            train_shift = train_shift.rename(columns=foo)

            df_w_lags = pd.merge(df_w_lags, train_shift,
                                 on=merging_keys, how='left')  # .fillna(0)
        del train_shift

        return df_w_lags

    def do_scaling(self, df, N):
        """
        Do scaling for the adj_close and lag cols
        """
        df.loc[:, 'adj_close_scaled'] = (
            df['adj_close'] - df['adj_close_mean']) / df['adj_close_std']
        for n in range(N, 0, -1):
            df.loc[:, 'adj_close_scaled_lag_'+str(n)] = \
                (df['adj_close_lag_'+str(n)] - df['adj_close_mean']) / df['adj_close_std']

            # Remove adj_close_lag column which we don't need anymore
            df.drop(['adj_close_lag_'+str(n)], axis=1, inplace=True)

        return df

    def get_mov_avg_std(self, df, col, N):
        """
        Given a dataframe, get mean and std dev at timestep t using values from t-1, t-2, ..., t-N.
        Inputs
            df         : dataframe. Can be of any length.
            col        : name of the column you want to calculate mean and std dev
            N          : get mean and std dev at timestep t using values from t-1, t-2, ..., t-N
        Outputs
            df_out     : same as df but with additional column containing mean and std dev
        """
        mean_list = df[col].rolling(
            window=N, min_periods=1).mean()  # len(mean_list) = len(df)
        # first value will be NaN, because normalized by N-1
        std_list = df[col].rolling(window=N, min_periods=1).std()

        # Add one timestep to the predictions
        mean_list = np.concatenate(
            (np.array([np.nan]), np.array(mean_list[:-1])))
        std_list = np.concatenate(
            (np.array([np.nan]), np.array(std_list[:-1])))

        # Append mean_list to df
        df_out = df.copy()
        df_out[col + '_mean'] = mean_list
        df_out[col + '_std'] = std_list

        return df_out

    def transform(self, data):
        """
        Do transformation on data
        """
        data0 = data.copy(deep=True)
        
        # Add lags up to N number of days to use as features
        data0 = self.add_lags(data0, self.N, ['adj_close'])

        # Get mean and std dev at timestamp t using values from t-1, ..., t-N
        data0 = self.get_mov_avg_std(data0, 'adj_close', self.N)

        # Do scaling
        data0 = self.do_scaling(data0, self.N)
        
        # Drop the NaNs
        data0.dropna(axis=0, how='any', inplace=True)

        return data0

    def pred_xgboost(self, model, N, H, prev_vals, prev_mean_val, prev_std_val):
        """
        Do recursive forecasting using xgboost
        Inputs
            model              : the xgboost model
            N                  : for feature at day t, we use lags from t-1, t-2, ..., t-N as features
            H                  : forecast horizon
            prev_vals          : numpy array. If predict at time t, 
                                 prev_vals will contain the N unscaled values at t-1, t-2, ..., t-N
            prev_mean_val      : the mean of the unscaled values at t-1, t-2, ..., t-N
            prev_std_val       : the std deviation of the unscaled values at t-1, t-2, ..., t-N
        Outputs
            Times series of predictions. Numpy array of shape (H,). This is unscaled.
        """
        forecast = prev_vals.copy()

        for n in range(H):
            forecast_scaled = (forecast[-N:] - prev_mean_val) / prev_std_val

            # Create the features dataframe
            X = defaultdict(list)
            for n in range(N, 0, -1):
                X['adj_close_scaled_lag_'+str(n)] = [forecast_scaled[-n]]
            X = pd.DataFrame(X)

            # Do prediction
            est_scaled = self.model.predict(X)

            # Unscale the prediction
            forecast = np.concatenate([forecast,
                                       np.array((est_scaled * prev_std_val) + prev_mean_val).reshape(1,)])

            # Comp. new mean and std
            prev_mean_val = np.mean(forecast[-N:])
            prev_std_val = np.std(forecast[-N:])

        return forecast[-H:]

    def predict(self, df):
        """
        Do predictions
        """
        prev_vals = df[-self.N:]['adj_close'].to_numpy()
        prev_mean_val = np.mean(prev_vals)
        prev_std_val = np.std(prev_vals)

        # Get predicted labels and scale back to original range
        est = self.pred_xgboost(self.model, self.N, self.H, prev_vals,
                                prev_mean_val, prev_std_val)

        return est

Overwriting ./model.py


# Test model locally

In [44]:
!mkdir out

mkdir: out: File exists


In [45]:
%%writefile ./predict.py

import model as mod
import os
import pandas as pd
import yaml

from datetime import date
from numpy import savetxt

# Get current working directory
cwd = os.getcwd()

# Load config
with open(cwd+"/config/config.yml", 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

# Load test file
data = pd.read_csv(cwd+config['data_processed_path'], sep=",")

# Create an instance of xgboost_model
xgb_model = mod.Xgboost_model(N=10, H=21)

# Do transformation
data = xgb_model.transform(data)

# Do prediction
est = xgb_model.predict(data)

# Save predictions to file
savetxt(cwd+'/out/est_' + str(date.today()) + '.csv', est, delimiter=',')

Overwriting ./predict.py


In [46]:
# Run the prediction script
!python predict.py

In [47]:
# Check output
import csv

from datetime import date

est2 = []
with open(cwd+'/out/est_' + str(date.today()) + '.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        est2.append(row[0])
est2

['1.596570281982421875e+02',
 '1.590828552246093750e+02',
 '1.591213989257812500e+02',
 '1.588684539794921875e+02',
 '1.595322875976562500e+02',
 '1.592567596435546875e+02',
 '1.595471191406250000e+02',
 '1.595873870849609375e+02',
 '1.597397155761718750e+02',
 '1.601928558349609375e+02',
 '1.601864318847656250e+02',
 '1.601270446777343750e+02',
 '1.601215057373046875e+02',
 '1.601946411132812500e+02',
 '1.602317962646484375e+02',
 '1.602755584716796875e+02',
 '1.603481292724609375e+02',
 '1.603082427978515625e+02',
 '1.602938079833984375e+02',
 '1.602314300537109375e+02',
 '1.602135467529296875e+02']

In [48]:
%%writefile ./plot.py
import chart_studio.plotly as py
import csv
import os
import pandas as pd
import plotly.graph_objs as go
import yaml

from collections import defaultdict
from datetime import date, timedelta

def gen_plotly_url():
    # Sign in to plotly if you haven't done so
    py.sign_in('<YOUR-PLOTLY-USERNAME>', '<YOUR-PLOTLY-PASSWORD>')  # Be careful with this, don't put it on Github!!!

    # Get current working directory
    cwd = os.getcwd()

    # Load config
    with open(cwd+"/config/config.yml", 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    # Load test file
    df = pd.read_csv(cwd+config['data_processed_path'], sep=",")

    # Load the predictions
    est_dict = {'date': [df[-1:]['date'].values[0]],
                'forecast': [df[-1:]['adj_close'].values[0]]}
    day = 1
    with open(cwd+'/out/est_' + str(date.today()) + '.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            est_dict['date'].append(str(date.today() + timedelta(days=day)))
            est_dict['forecast'].append(float(row[0]))
            day = day + 1
    est_df = pd.DataFrame(est_dict)

    # Plot with plotly
    miny = min(min(df[-63:]['adj_close']), min(est_df['forecast']))-1 # min y-value of the plot
    maxy = max(max(df[-63:]['adj_close']), max(est_df['forecast']))+1 # max y-value of the plot
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df[-63:]['date'],
                             y=df[-63:]['adj_close'],
                             mode='lines',
                             name='actual',
                             line=dict(color='blue')))
    fig.add_trace(go.Scatter(x=est_df['date'],
                             y=est_df['forecast'],
                             mode='lines',
                             name='predictions',
                             line=dict(color='red')))
    fig.add_trace(go.Scatter(x=[str(date.today()), str(date.today())],
                             y=[miny, maxy],
                             mode='lines',
                             line=dict(color='black', dash="dot"),
                             showlegend=False))
    fig.update_layout(yaxis=dict(title='USD'),
                      xaxis=dict(title='date'))
    fig.update_yaxes(range=[miny, maxy])
    
    url=py.plot(fig, filename='est_'+str(date.today()), auto_open=False)
    
    return url

Overwriting ./plot.py


In [49]:
# Run the plot script
import plot as pl

url = pl.gen_plotly_url()
print(url)

https://plot.ly/~ngyibin/377/


Check that the plotly image is generated at the above url.

In [50]:
# Check HTML scipt for plotly plot
from IPython.display import display, HTML

graphs = [url]

template = (''
    '<a href="{graph_url}" target="_blank">' # Open the interactive graph when you click on the image
        '<img src="{graph_url}.png">'        # Use the ".png" magic url so that the latest, most-up-to-date image is included
    '</a>'
    '{caption}'                              # Optional caption to include below the graph
    '<br>'                                   # Line break
    '<a href="{graph_url}" style="color: rgb(190,190,190); text-decoration: none; font-weight: 200;" target="_blank">'
        'Click to comment and see the interactive graph'  # Direct readers to Plotly for commenting, interactive graph
    '</a>'
    '<br>'
    '<hr>'                                   # horizontal line
'')

email_body = ''
for graph in graphs:
    _ = template
    _ = _.format(graph_url=graph, caption='')
    email_body += _

display(HTML(email_body))

# Send results locally

In [None]:
import csv
import smtplib

from datetime import date
from email.mime.text import MIMEText

TO_LIST=["<TO-EMAIL-ADDRESSES>"]
FROM_EMAIL="<YOUR-EMAIL-ADDRESS>"

est2 = []
with open(cwd+'/out/est_' + str(date.today()) + '.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        est2.append(row[0])

def sendEmail(subj, body):
    msg = MIMEText(str(body))
    msg['Subject'] = subj
    msg['From'] = FROM_EMAIL
    
    s = smtplib.SMTP_SSL('smtp.gmail.com', 465)
    s.login("<YOUR-USER-NAME>", "<YOUR-PASSWORD>") # Enter your username and password here. Be careful with this, don't put it on Github!!!
    s.sendmail(FROM_EMAIL, TO_LIST, msg.as_string())
    s.quit()
    
sendEmail('Results for ' + str(date.today()), est2)
# Check you got the email

# Write send_email file

In [52]:
%%writefile ./send_email.py

import csv
import os
import pandas as pd
import plot as pl
import smtplib
import email.utils
import yaml
from datetime import date
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

# Get current working directory
cwd = os.getcwd()

# Load config
with open(cwd+"/config/config.yml", 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

# Replace sender@example.com with your "From" address.
# This address must be verified.
SENDER = '<YOUR-EMAIL-ADDRESS>'
SENDERNAME = '<YOUR-COMPANYS-NAME>'

# Replace recipient@example.com with a "To" address. If your account
# is still in the sandbox, this address must be verified.
RECIPIENT = ['<TO-EMAIL-ADDRESS1>', '<TO-EMAIL-ADDRESS2>']

# Replace smtp_username with your Amazon SES SMTP user name.
USERNAME_SMTP = "<SMTP-USERNAME>" # Be careful with this, don't put it on Github!!!

# Replace smtp_password with your Amazon SES SMTP password.
PASSWORD_SMTP = "<SMTP-PASSWORD>" # Be careful with this, don't put it on Github!!!

# If you're using Amazon SES in an AWS Region other than US West (Oregon),
# replace email-smtp.us-west-2.amazonaws.com with the Amazon SES SMTP
# endpoint in the appropriate region.
HOST = "email-smtp.us-west-2.amazonaws.com"
PORT = 587

# Load predictions into a list
est = []
with open(cwd+'/out/est_' + str(date.today()) + '.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        est.append(row[0])

# The subject line of the email.
SUBJECT = 'Results for ' + str(date.today())

# Load test file
data = pd.read_csv(cwd+config['data_processed_path'], sep=",")

# Run the plot script
url = pl.gen_plotly_url()

# The email body for recipients with non-HTML email clients.
BODY_TEXT = ("Current price: " + 
             str(data[-1:]['adj_close'].values) + 
             "\n Forecast for next 21 days using XGBoost: " +
             str(est))

# The HTML body of the email.
BODY_HTML = """<html>
<head>
<style>
table, th, td {{
  border: 1px solid black;
  border-collapse: collapse;
}}
th, td {{
  padding: 15px;
}}
</style>
</head>
<body>
  <h1>Current price</h1>
  <p>""" + "{0:.2f}".format(float(data[-1:]['adj_close'].values)) + """</p>
  <h1>Forecast for next 21 days using XGBoost</h1>
  <table>
    <tr><th>Day</th><th>Forecast</th></tr>
    <tr><td>1</td><td>""" + "{0:.2f}".format(float(est[0])) + """</td></tr>
    <tr><td>2</td><td>""" + "{0:.2f}".format(float(est[1])) + """</td></tr>
    <tr><td>3</td><td>""" + "{0:.2f}".format(float(est[2])) + """</td></tr>
    <tr><td>4</td><td>""" + "{0:.2f}".format(float(est[3])) + """</td></tr>
    <tr><td>5</td><td>""" + "{0:.2f}".format(float(est[4])) + """</td></tr>
    <tr><td>6</td><td>""" + "{0:.2f}".format(float(est[5])) + """</td></tr>
    <tr><td>7</td><td>""" + "{0:.2f}".format(float(est[6])) + """</td></tr>
    <tr><td>8</td><td>""" + "{0:.2f}".format(float(est[7])) + """</td></tr>
    <tr><td>9</td><td>""" + "{0:.2f}".format(float(est[8])) + """</td></tr>
    <tr><td>10</td><td>""" + "{0:.2f}".format(float(est[9])) + """</td></tr>
    <tr><td>11</td><td>""" + "{0:.2f}".format(float(est[10])) + """</td></tr>
    <tr><td>12</td><td>""" + "{0:.2f}".format(float(est[11])) + """</td></tr>
    <tr><td>13</td><td>""" +"{0:.2f}".format(float(est[12])) + """</td></tr>
    <tr><td>14</td><td>""" + "{0:.2f}".format(float(est[13])) + """</td></tr>
    <tr><td>15</td><td>""" + "{0:.2f}".format(float(est[14])) + """</td></tr>
    <tr><td>16</td><td>""" + "{0:.2f}".format(float(est[15])) + """</td></tr>
    <tr><td>17</td><td>""" + "{0:.2f}".format(float(est[16])) + """</td></tr>
    <tr><td>18</td><td>""" + "{0:.2f}".format(float(est[17])) + """</td></tr>
    <tr><td>19</td><td>""" + "{0:.2f}".format(float(est[18])) + """</td></tr>
    <tr><td>20</td><td>""" + "{0:.2f}".format(float(est[19])) + """</td></tr>
    <tr><td>21</td><td>""" + "{0:.2f}".format(float(est[20])) + """</td></tr>
  </table>
  <a href="{graph_url}" target="_blank">
    <img src="{graph_url}.png">
  </a>
  {caption}
  <br>
  <a href="{graph_url}" style="color: rgb(190,190,190); text-decoration: none; font-weight: 200;" target="_blank">
    Click to comment and see the interactive graph  
  </a>
  <br>
  <hr>  
</body>
</html>
"""
BODY_HTML = BODY_HTML.format(graph_url=url, caption='')

# Create message container - the correct MIME type is multipart/alternative.
msg = MIMEMultipart('alternative')
msg['Subject'] = SUBJECT
msg['From'] = email.utils.formataddr((SENDERNAME, SENDER))
msg['To'] = ", ".join(RECIPIENT)
# Comment or delete the next line if you are not using a configuration set
#msg.add_header('X-SES-CONFIGURATION-SET',CONFIGURATION_SET)

# Record the MIME types of both parts - text/plain and text/html.
part1 = MIMEText(BODY_TEXT, 'plain')
part2 = MIMEText(BODY_HTML, 'html')

# Attach parts into message container.
# According to RFC 2046, the last part of a multipart message, in this case
# the HTML message, is best and preferred.
msg.attach(part1)
msg.attach(part2)

# Try to send the message.
try:
    server = smtplib.SMTP(HOST, PORT)
    server.ehlo()
    server.starttls()
    #stmplib docs recommend calling ehlo() before & after starttls()
    server.ehlo()
    server.login(USERNAME_SMTP, PASSWORD_SMTP)
    server.sendmail(SENDER, RECIPIENT, msg.as_string())
    server.close()
# Display an error message if something goes wrong.
except Exception as e:
    print ("Error: ", e)
else:
    print ("Email sent!")

Overwriting ./send_email.py
