# Flask & Docker

In [1]:
import json
import os
import pandas as pd
import re as re
import matplotlib.pyplot as plt
import statsmodels.api as sm
import joblib
import itertools

import warnings
# Silence every warning for the whole kernel session (this also hides deprecation notices)
warnings.filterwarnings("ignore")

# Default relative locations; both names are re-bound to absolute paths in the next cell
MODEL_DIR = 'models'
DATA_DIR = 'data'

In [2]:
## Project root: taken from the CAPSTONE_HOME environment variable when set, otherwise the
## directory the notebook was started from. A machine-specific absolute path would make the
## notebook fail on every other computer.
current_path = os.path.abspath(os.environ.get("CAPSTONE_HOME", os.getcwd()))
os.chdir(current_path)

MODEL_DIR = os.path.join(current_path, "models")
DATA_DIR = os.path.join(current_path, "data")

## exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(MODEL_DIR, exist_ok=True)

In [3]:
## preprocessing pipeline
#### There is not really any preprocessing task, except differencing

def load_invoices_data(data_dir=None):
    """Load every invoice JSON file and aggregate it into a monthly time series.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains the invoice JSON files. Defaults to the
        module-level ``DATA_DIR`` so existing zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per (year, month), indexed by ``date`` (first day of the month),
        with the columns ``price`` (summed), ``purchase`` (number of distinct
        country/invoice/day groups) and ``times_viewed`` (summed).
    """
    column_names = ["country","customer_id","invoice","price","stream_id","times_viewed","year","month","day"]
    if data_dir is None:
        data_dir = DATA_DIR

    # Read every file first and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and growing a frame inside a loop copies it on every iteration.
    frames = []
    for file_name in sorted(os.listdir(data_dir)):
        file_path = os.path.join(data_dir, file_name)
        if not os.path.isfile(file_path):
            continue  # skip sub-directories such as .ipynb_checkpoints
        with open(file_path, encoding="utf-8") as f:
            df_new = pd.DataFrame(json.load(f))
        # Columns are mapped to the canonical names by position (presumably because the
        # header spelling varies between files -- verify against the data). A single
        # mapping avoids the collisions that renaming one column at a time can cause
        # when a target name already exists further right.
        df_new = df_new.rename(columns=dict(zip(df_new.columns, column_names)))
        frames.append(df_new)

    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=column_names)

    # Post preparation
    # Keep only the digits of the invoice id (e.g. "A100" -> "100"); str() guards
    # against ids that were parsed as numbers.
    df['invoice'] = df['invoice'].apply(lambda x: " ".join(re.findall(r'[0-9]+', str(x))))
    for col in ('price', 'times_viewed', 'year', 'month', 'day'):
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop rows with a negative price or without a valid year. The previous code zeroed
    # the entire row for negative prices and relied on the year filter to remove it;
    # the outcome is the same, the intent is now explicit.
    df = df.loc[~(df['price'] < 0) & (df['year'] > 0)]

    # monthly aggregation
    ## One row per purchase: a purchase is identified by its invoice id
    per_invoice = pd.pivot_table(df, index=['country','invoice','year','month','day'],
                                 values=['price','times_viewed'],
                                 aggfunc='sum').round(1)
    per_invoice['purchase'] = 1
    ## Aggregation per month
    monthly = pd.pivot_table(per_invoice.reset_index(), index=['year','month'],
                             values=['price','times_viewed','purchase'],
                             aggfunc='sum').round(2)

    # Create the time series: every month is stamped with its first day
    monthly = monthly.reset_index()
    monthly['day'] = 1
    monthly['date'] = pd.to_datetime(monthly[['year','month','day']])
    ts = monthly.set_index('date').drop(columns=['year','month','day'])

    return ts

## Persist a machine learning model


In [4]:
## build the monthly invoice series (adjust the data location to match your system)
### and keep only the price column, as a one-column DataFrame
X = load_invoices_data()[['price']]

In [5]:
## fit the final model and report its quality (hyper-parameters were tuned by an earlier grid search)
#### parameters with the minimum AIC value: [((0, 1, 0), (1, 1, 0, 12))]

model = sm.tsa.statespace.SARIMAX(
    X,
    order=(0, 1, 0),
    seasonal_order=(1, 1, 0, 12),
)
result = model.fit()
print('aic value :', result.aic)

## six-step-ahead forecast as a quick plausibility check
forecast = result.get_forecast(steps=6)
print(forecast.summary_frame())

## Persist the fitted results object; the saved path list is the cell output
saved_model = 'capstone.joblib'
joblib.dump(
    result,
    os.path.join(MODEL_DIR, saved_model),
)

aic value : 206.00039008233236
price                mean        mean_se  mean_ci_lower  mean_ci_upper
2019-08-01  192886.235867   93527.682511    9575.346588  376197.125145
2019-09-01  202789.107543  132268.117064  -56451.638205  462029.853291
2019-10-01  217184.051976  161994.698023 -100319.721835  534687.825787
2019-11-01  254691.015519  187055.365016 -111930.763028  621312.794066
2019-12-01  357755.247842  209132.529577  -52136.978125  767647.473809
2020-01-01  353706.521672  229091.947203  -95305.443994  802718.487337


['C:\\Users\\ERICMenguy\\Documents\\AIAcademy\\Part B\\AI in Production\\ai-workflow-capstone-master\\models\\capstone.joblib']

In [13]:
# sanity check: the persisted model must reload and forecast before app.py relies on it

saved_model = 'capstone.joblib'
loaded_model = joblib.load(
    os.path.join(MODEL_DIR, saved_model)
)
forecast = loaded_model.get_forecast(steps=6)
print(forecast.summary_frame())


price                mean        mean_se  mean_ci_lower  mean_ci_upper
2019-08-01  192886.235867   93527.682511    9575.346588  376197.125145
2019-09-01  202789.107543  132268.117064  -56451.638205  462029.853291
2019-10-01  217184.051976  161994.698023 -100319.721835  534687.825787
2019-11-01  254691.015519  187055.365016 -111930.763028  621312.794066
2019-12-01  357755.247842  209132.529577  -52136.978125  767647.473809
2020-01-01  353706.521672  229091.947203  -95305.443994  802718.487337


## Create a simple flask app

In [7]:
%%writefile app.py

from flask import Flask, jsonify, request
import joblib
import socket
import json
import pandas as pd
import os

MODEL_DIR = 'models'
DATA_DIR = 'data'

app = Flask(__name__)

@app.route("/")
#def hello():
#    html = "<h3>Capstone {name}!</h3>" \
#           "<b>Hostname:</b> {hostname}<br/>"
#    return html.format(name=os.getenv("NAME", "world"), hostname=socket.gethostname())

@app.route('/get_forecast', methods=['GET','POST'])

def get_forecast():   
    ## input checking
    if not request.json:
        print("ERROR: API (predict): did not receive request data")
        return jsonify([])
    
    query = request.json
    query_init = query
    query = pd.DataFrame.from_dict(query,orient = 'index')

    if len(query.shape) == 1:
         query = query.reshape(1, -1)

    forecast = model.get_forecast(query[0][0])
    
    result = forecast.summary_frame()
    result.drop(columns=['mean_se','mean_ci_lower','mean_ci_upper'],inplace=True)
    result.rename(columns={"mean": "forecast"},inplace=True)
    result.index = result.index.set_names(['date'])
    result = result.reset_index()
    result['date'] = result['date'].astype(str)

    dict=result.to_dict('records')
    
    return(jsonify(dict))

            
if __name__ == '__main__':
    saved_model = 'capstone.joblib'
    model = joblib.load(os.path.join(MODEL_DIR, saved_model))
    app.run(host='127.0.0.1', port=5000,debug=True)


Overwriting app.py


## Test the flask app

Move into the project directory (the one that contains `app.py`) and start the app:


```bash
$ python app.py
```

Then go to [http://127.0.0.1:5000/](http://127.0.0.1:5000/)

Stop the server.  We will relaunch it in a few moments from within Docker.

## Create the Dockerfile

Before building the Docker image, we first need to create a `requirements.txt` file that lists the Python dependencies.

In [8]:
%%writefile requirements.txt

# Distribution names as published on PyPI (not import paths such as statsmodels.api).
# TODO: pin the versions used to train the model so the saved artifact stays loadable.
cython
numpy
flask
pandas
scikit-learn
statsmodels
joblib


Overwriting requirements.txt


In [9]:
%%writefile Dockerfile

# Use an official Python runtime as a parent image
# NOTE(review): Debian stretch and Python 3.7 are end-of-life; if `apt-get update` fails,
# move to a maintained tag -- after checking that the saved model still loads there.
FROM python:3.7.3-stretch

# Build tools for packages that compile C extensions; the apt lists are removed in the
# same layer to keep the image small
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3-dev \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory to /app
WORKDIR /app

# Install the dependencies first so this layer stays cached until requirements.txt changes
COPY requirements.txt /app/requirements.txt
RUN pip install --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Copy the rest of the current directory (app.py, models/, ...) into the container at /app
COPY . /app

# app.py serves on port 5000
EXPOSE 5000

# Define environment variables.
# FLASK_RUN_HOST / FLASK_RUN_PORT are the standard Flask bind settings: a server inside a
# container has to listen on 0.0.0.0 to be reachable through a published port. They only
# take effect if the start command (app.py or `flask run`) reads them.
ENV NAME=World \
    FLASK_RUN_HOST=0.0.0.0 \
    FLASK_RUN_PORT=5000

# Run app.py when the container launches
CMD ["python", "app.py"]

Overwriting Dockerfile


## Test the running app

In [24]:
## forecast horizon sent to the API (here a date string; the response below runs through that month)
query = dict(steps='2019-11-01')


In [25]:
import requests

## data needs to be in dict format for JSON

## test the Flask API
port = 5000
r = requests.post('http://127.0.0.1:{}/get_forecast'.format(port), json=query, timeout=10)

## test the Docker API
#port = 5000
#r = requests.post('http://127.0.0.1:{}/get_forecast'.format(port),json=query)

## Fail loudly on HTTP errors, then decode the body as JSON. ast.literal_eval parses
## Python literals, not JSON, and breaks on null / true / false.
r.raise_for_status()
response = r.json()
print(response)

[{'date': '2019-08-01', 'forecast': 192886.23586675752}, {'date': '2019-09-01', 'forecast': 202789.10754276463}, {'date': '2019-10-01', 'forecast': 217184.0519760289}, {'date': '2019-11-01', 'forecast': 254691.01551877992}]
