In [1]:
import plotly
from plotly import tools
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor as GPR

In [2]:
def plot_gp(mu, lb, ub, test_x, test_y, train_x=None, train_y=None, name='', samples=(), samples_names=None,
            layout='v', xaxis_title='Time', yaxis_title='Sales', fig_size=(1000, 500), w=3, f=10):
    """Build a dark-themed Plotly figure of a forecast and its uncertainty band.

    Args:
        mu: Forecast values drawn against ``test_x`` (legend entry 'Skyolia Forecast').
        lb: Lower edge of the shaded band, aligned with ``test_x``.
        ub: Upper edge of the shaded band, aligned with ``test_x``.
        test_x: x-values shared by the forecast, the band, the observations and the samples.
        test_y: Observed values over ``test_x`` (legend entry 'Observed').
        train_x: Optional x-values of the history; the history trace is skipped when None.
        train_y: y-values of the history, only used when ``train_x`` is given.
        name: Figure title.
        samples: Sized iterable of extra curves over ``test_x``; one trace per item.
        samples_names: Legend names for ``samples``; defaults to 'sample 0', 'sample 1', ...
        layout: Accepted for backward compatibility; not used by this function.
        xaxis_title: Label of the x axis.
        yaxis_title: Label of the y axis.
        fig_size: Accepted for backward compatibility; not used by this function.
        w: Line width applied to every curve except the band edges.
        f: Font size.

    Returns:
        The figure created by ``make_subplots`` with all traces and styling applied.
    """
    # subplot_titles has to be a sequence of titles. The previous `("Samples")` is a
    # plain string, which is indexed per character and titles the subplot "S".
    fig = make_subplots(rows=1, cols=1, subplot_titles=("Samples",))
    if samples is None:
        samples = ()
    if samples_names is None:
        samples_names = ['sample ' + str(i) for i in range(len(samples))]

    if train_x is not None:
        fig.add_trace(go.Scatter(x=train_x, y=train_y, mode='lines', name='History',
                                 line=dict(width=w, color='#1a76ff')))  # plot training data

    # The band is drawn as two traces: the upper edge first, then the lower edge filled
    # up to it ('tonexty'). Only the lower trace is listed in the legend so the entry is
    # not duplicated; the shared legendgroup makes that entry toggle both edges.
    band_edge_color = 'rgba(199, 19, 19, 0.3)'
    band_fill_color = 'rgba(249, 129, 37, 0.3)'
    band_name = '95% uncertainty interval'
    fig.add_trace(
        go.Scatter(x=test_x, y=ub, fill=None, mode='lines', line_color=band_edge_color,
                   fillcolor=band_fill_color, showlegend=False, legendgroup=band_name, name=band_name))
    fig.add_trace(
        go.Scatter(x=test_x, y=lb, fill='tonexty', mode='lines', line_color=band_edge_color,
                   fillcolor=band_fill_color, showlegend=True, legendgroup=band_name, name=band_name))

    fig.add_trace(go.Scatter(x=test_x, y=mu, line=dict(color='#c71313', width=w), mode='lines',
                             name='Skyolia Forecast'))  # plot the mean
    fig.add_trace(go.Scatter(x=test_x, y=test_y, line=dict(color='#1a76ff', width=w), mode='lines',
                             name='Observed'))
    for i, s in enumerate(samples):
        fig.add_trace(go.Scatter(x=test_x, y=s, name=samples_names[i], mode='lines',
                                 line=dict(width=w, color='#00ff00')))  # plot samples

    fig.update_layout(title_text=name, paper_bgcolor='#343434', plot_bgcolor='#343434',
                      xaxis_title=xaxis_title, yaxis_title=yaxis_title,
                      font=dict(family="Montserrat", color="#fff", size=f), title_x=0.5, hovermode="x")
    fig.update_xaxes(showgrid=True, showline=False, gridcolor='#c9c9c9', gridwidth=0.0005)
    fig.update_yaxes(showgrid=True, showline=False, gridcolor='#c9c9c9', gridwidth=0.0005)
    return fig

def dtypes_validation(df, time_col, categorical, binary, numerical):
    """Coerce the columns of ``df`` to the dtypes the pipeline expects.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame holding every column named below.
        time_col: Name of the date column; converted with ``pd.to_datetime``.
        categorical: List of column names cast to ``object``.
        binary: List of column names cast to ``int``.
        numerical: List of column names cast to ``float``.

    Returns:
        The same ``df`` object with the converted columns.
    """
    for columns, dtype in ((binary, int), (categorical, 'O'), (numerical, float)):
        df[columns] = df[columns].astype(dtype)
    # exact=False lets the date format match at the start of strings that carry extra
    # text such as '2021-05-09 00:00:00'. With the default exact=True those strings
    # raise "unconverted data remains" on pandas >= 2.0.
    df[time_col] = pd.to_datetime(df[time_col], format="%Y-%m-%d", exact=False)
    return df

def categorical_encoding(df, OE, categorical):
    """Append one indicator column per known category for each categorical column.

    Args:
        df: DataFrame containing the columns listed in ``categorical``; modified in place.
        OE: Mapping from column name to a fitted encoder exposing ``transform`` and
            ``categories_`` (presumably scikit-learn one-hot encoders - verify).
        categorical: Iterable of column names to encode.

    Returns:
        The same ``df`` with columns named ``<column>_<category>`` added.
    """
    for column in categorical:
        encoder = OE[column]
        # Values are stringified before encoding, so the encoders are expected to have
        # been fitted on strings - TODO confirm against the training flow.
        encoded = encoder.transform(df[[column]].astype(str))
        n_outputs = encoded.shape[1]
        for position in range(n_outputs):
            category = encoder.categories_[0][position]
            # Column slicing assumes `encoded` is a dense 2-D array.
            df[column + '_' + str(category)] = encoded[:, position]
    return df

def numerical_scaling(df, MS, numerical):
    """Replace the ``numerical`` columns of ``df`` with their scaled values.

    Args:
        df: DataFrame containing the columns in ``numerical``; modified in place.
        MS: Fitted scaler exposing ``transform``.
        numerical: List of column names handed to the scaler, in this order.

    Returns:
        The same ``df`` with the listed columns overwritten.
    """
    df[numerical] = MS.transform(df[numerical])
    return df

def get_explaination(expl_kernels, X_train, Y_train, X_test):
    """Predict ``X_test`` once per explanation kernel.

    For every entry of ``expl_kernels`` a Gaussian-process regressor is fitted on the
    training data with that kernel held fixed (``optimizer=None``) and its predictive
    mean on ``X_test`` is collected.

    Args:
        expl_kernels: Mapping from component name to kernel.
        X_train: Training inputs.
        Y_train: Training targets.
        X_test: Inputs to predict.

    Returns:
        Array with one row of predictive means per kernel, in mapping order.
    """
    component_means = [
        GPR(kernel=expl_kernels[label], optimizer=None, alpha=1e-5)
        .fit(X_train, Y_train)
        .predict(X_test, return_std=False)
        for label in expl_kernels
    ]
    return np.stack(component_means)
def feature_processing(row, flow_data, flow_res, time_col, sku):
    """Turn one raw input row into the feature frame used to infer a new data point.

    Args:
        row: Record for a single observation (e.g. a pandas Series).
        flow_data: Object exposing the ``categorical`` and ``numerical`` column lists.
        flow_res: Per-SKU training artefacts; the keys read here are 'start_date',
            'Features', 'OHEncoders' and 'MMScalers'.
        time_col: Name of the date column in ``row``.
        sku: Key selecting the artefacts inside ``flow_res``.

    Returns:
        One-row DataFrame restricted to the stored feature columns for ``sku``.
    """
    origin = flow_res['start_date'][sku]
    selected = flow_res['Features'][sku]
    frame = pd.DataFrame([row])

    # Date encoding: days elapsed since the stored start date.
    frame['delta_t'] = (frame[time_col] - origin) / np.timedelta64(1, 'D')
    # 'norm_delta_t' starts as a copy and is scaled together with the numerical columns.
    frame['norm_delta_t'] = frame['delta_t']

    encoders = flow_res['OHEncoders'][sku]
    frame = categorical_encoding(frame, encoders, flow_data.categorical)

    scaled_columns = flow_data.numerical + ['norm_delta_t']
    frame = numerical_scaling(frame, flow_res['MMScalers'][sku], scaled_columns)

    # Keep only the stored feature columns.
    return frame[selected]
def predict(df, flow_res, sku):
    """Refit the stored model for ``sku`` and predict the rows of ``df``.

    Args:
        df: Processed feature frame to predict on.
        flow_res: Per-SKU training artefacts; the keys read here are 'X_train',
            'Y_train', 'YS', 'GPRS' and 'Expl'.
        sku: Key selecting the artefacts inside ``flow_res``.

    Returns:
        2-D array with one row per row of ``df``. Columns are: forecast, lower bound,
        upper bound, one column per explanation kernel (all inverse-scaled), followed
        by the mean and standard deviation in the scaled target space.
    """
    X_train = flow_res['X_train'][sku]
    raw_targets = flow_res['Y_train'][sku]
    y_scaler = flow_res['YS'][sku]

    # Scale the training targets; the scaler works on a column vector.
    scaled_targets = y_scaler.transform(raw_targets[:, None]).ravel()

    # The stored regressor is fitted again on every call.
    model = flow_res['GPRS'][sku].fit(X_train, scaled_targets)
    mu, std = model.predict(df, return_std=True)

    # 2.5% and 97.5% quantiles of a normal distribution with the predicted mean/std.
    lb = norm.ppf(0.025, mu, std)
    ub = norm.ppf(0.975, mu, std)

    component_means = get_explaination(flow_res['Expl'][sku], X_train, scaled_targets, df)

    # NOTE(review): the scaler used on a single target column above is applied to a
    # multi-column matrix here - presumably relying on broadcasting; confirm.
    scaled_outputs = np.concatenate((mu[:, None], lb[:, None], ub[:, None], component_means.T), 1)
    outputs = y_scaler.inverse_transform(scaled_outputs)

    # mu and std stay in the scaled space so the uncertainty can be computed from them.
    return np.concatenate((outputs, mu[:, None], std[:, None]), 1)

def infer(row, flow_data, flow_res):
    """Run feature processing and prediction for one input row.

    Args:
        row: pandas Series holding one observation, including the time and SKU columns.
        flow_data: Object exposing ``time_col`` and ``sku_col`` (plus whatever
            ``feature_processing`` reads from it).
        flow_res: Per-SKU training artefacts.

    Returns:
        Series made of the row's time and SKU values followed by 'Forecast', 'lb',
        'ub', one entry per explanation kernel, 'mu' and 'std'.
    """
    time_col = flow_data.time_col
    sku_col = flow_data.sku_col
    sku = row[sku_col]

    features = feature_processing(row, flow_data, flow_res, time_col, sku)
    values = predict(features, flow_res, sku).ravel()

    labels = ['Forecast', 'lb', 'ub', *flow_res['Expl'][sku].keys(), 'mu', 'std']
    outputs = pd.Series(dict(zip(labels, values)))
    identifiers = row[[time_col, sku_col]]
    return pd.concat((identifiers, outputs))

In [3]:
from metaflow import Flow, get_metadata

# Stored artefacts of the latest successful TrainTestFlow run, for a single SKU.
flow_res = Flow('TrainTestFlow').latest_successful_run.data.flow_res
sku = 1

# Training and test splits. The T_* entries are later used as plot x-values.
X_train, T_train, Y_train = (flow_res[key][sku] for key in ('X_train', 'T_train', 'Y_train'))
X_test, T_test, Y_test = (flow_res[key][sku] for key in ('X_test', 'T_test', 'Y_test'))

# Stored predictions and error metrics; the first three prediction rows are the
# forecast, lower bound and upper bound.
pred, errors = flow_res['Pred'][sku], flow_res['Errors'][sku]
mu, lb, ub = pred[0], pred[1], pred[2]
errors

(287.3236052168993, 404.01876428675666, 4.0)

In [7]:
# Stack the two raw frames stored for this SKU (presumably the train and test parts - verify).
pd.concat([flow_res['Raw'][sku][part] for part in (0, 1)])

Unnamed: 0,Date,Store,Sales,TV_3,Radio_3,Banners_3,TV_2,Radio_2,Banners_2,TV_1,Radio_1,Banners_1,TV,Radio,Banners
3,2018-01-28,1,8846.95,13528.10,0.00,0.00,0.00,5349.65,2218.93,0.00,4235.86,2046.96,0.00,3562.21,0.00
4,2018-02-04,1,9797.07,0.00,5349.65,2218.93,0.00,4235.86,2046.96,0.00,3562.21,0.00,0.00,0.00,2187.29
5,2018-02-11,1,13527.65,0.00,4235.86,2046.96,0.00,3562.21,0.00,0.00,0.00,2187.29,8045.44,4310.55,1992.98
6,2018-02-18,1,9635.43,0.00,3562.21,0.00,0.00,0.00,2187.29,8045.44,4310.55,1992.98,0.00,0.00,2253.02
7,2018-02-25,1,15355.11,0.00,0.00,2187.29,8045.44,4310.55,1992.98,0.00,0.00,2253.02,9697.29,4478.81,2042.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2021-10-03,1,9030.17,0.00,5213.48,1715.62,7663.81,0.00,2399.59,0.00,4757.98,0.00,0.00,0.00,1691.68
196,2021-10-10,1,15904.11,7663.81,0.00,2399.59,0.00,4757.98,0.00,0.00,0.00,1691.68,11543.58,4615.35,2518.88
197,2021-10-17,1,12839.29,0.00,4757.98,0.00,0.00,0.00,1691.68,11543.58,4615.35,2518.88,0.00,4556.16,1919.19
198,2021-10-24,1,9063.45,0.00,0.00,1691.68,11543.58,4615.35,2518.88,0.00,4556.16,1919.19,0.00,0.00,1707.65


In [8]:
# Display the stored prediction array for this SKU (the whole array is printed).
flow_res['Pred'][sku]

array([[ 8151.58737757,  7756.39173356,  7479.5295702 , 10970.61796071,
         7929.73804268, 10904.92784284, 11287.2690022 , 12207.66642643,
         9783.51254428, 10022.75577927, 10557.0880655 , 13063.73910748,
        10602.40491136, 11181.62317766, 10771.72697456,  9043.72518755,
        13791.0444246 , 13755.92878292, 13777.01061525, 13385.46123165,
        13165.11246068,  9882.03438129,  8698.23351668, 15406.52324726,
        12648.13410301,  9092.63967833,  8113.21068494],
       [ 7252.11952849,  6873.6761306 ,  6577.51212084, 10075.10570008,
         7040.35980587,  9994.14595963, 10371.37447432, 11280.6740256 ,
         8877.74451623,  9129.22182289,  9645.20329796, 12119.40797543,
         9692.17909501, 10275.95147524,  9865.77174012,  8095.55038453,
        12775.13242212, 12774.92697376, 12783.95691475, 12423.32396032,
        12230.61120689,  8959.8123523 ,  7801.56283839, 14441.55969671,
        11726.48804828,  8185.98398653,  7212.37548995],
       [ 9051.05522665

In [14]:
# Shape of the stored prediction array for this SKU.
flow_res['Pred'][sku].shape

(6, 27)

In [15]:
# Plot history, observations, the forecast with its band, and the remaining stored
# prediction rows as extra curves.
# The bounds come from `mu`/`lb`/`ub` and the extra rows straight from `flow_res`, so
# this cell keeps working after the global `pred` is rebound to the inference DataFrame
# further down the notebook.
fig = plot_gp(mu, lb, ub, T_test, Y_test, T_train, Y_train, samples=flow_res['Pred'][sku][3:], layout='h')
fig.show()

In [5]:
from metaflow import Flow, get_metadata

# Data object of the latest successful TrainTestFlow run; the cells below read its
# column lists (binary / categorical / numerical / time_col) and pass it to `infer`.
flow_data = Flow('TrainTestFlow').latest_successful_run.data
# Per-SKU results dictionary stored on that run.
flow_res = flow_data.flow_res
# Five hand-written example records for store 1, one dict per record. Dates are given
# as strings carrying a midnight time component.
inputs_dict = [
    dict(zip(
        ('Date', 'Store', 'Sales',
         'TV_3', 'Radio_3', 'Banners_3',
         'TV_2', 'Radio_2', 'Banners_2',
         'TV_1', 'Radio_1', 'Banners_1',
         'TV', 'Radio', 'Banners'),
        record,
    ))
    for record in (
        ('2021-05-09 00:00:00', 1, 7405.54,
         0.0, 6742.67, 1824.36,
         0.0, 0.0, 2139.88,
         0.0, 0.0, 1787.76,
         0.0, 0.0, 1955.5),
        ('2021-07-25 00:00:00', 1, 9745.9,
         0.0, 2760.4, 1847.58,
         5553.19, 0.0, 1822.44,
         11250.46, 0.0, 2187.28,
         0.0, 0.0, 1894.87),
        ('2021-09-26 00:00:00', 1, 10237.24,
         10534.1, 0.0, 1964.94,
         0.0, 5213.48, 1715.62,
         7663.81, 0.0, 2399.59,
         0.0, 4757.98, 0.0),
        ('2021-05-23 00:00:00', 1, 10888.41,
         0.0, 0.0, 1787.76,
         0.0, 0.0, 1955.5,
         0.0, 4111.03, 0.0,
         0.0, 5242.12, 2010.19),
        ('2021-10-31 00:00:00', 1, 7250.21,
         11543.58, 4615.35, 2518.88,
         0.0, 4556.16, 1919.19,
         0.0, 0.0, 1707.65,
         0.0, 0.0, 2863.31),
    )
]
df = pd.DataFrame(inputs_dict)
# Coerce the column dtypes through the shared helper instead of repeating its four
# statements inline, so the notebook has a single definition of the expected dtypes.
df = dtypes_validation(df, flow_data.time_col, flow_data.categorical, flow_data.binary, flow_data.numerical)
df

Unnamed: 0,Date,Store,Sales,TV_3,Radio_3,Banners_3,TV_2,Radio_2,Banners_2,TV_1,Radio_1,Banners_1,TV,Radio,Banners
0,2021-05-09,1,7405.54,0.0,6742.67,1824.36,0.0,0.0,2139.88,0.0,0.0,1787.76,0.0,0.0,1955.5
1,2021-07-25,1,9745.9,0.0,2760.4,1847.58,5553.19,0.0,1822.44,11250.46,0.0,2187.28,0.0,0.0,1894.87
2,2021-09-26,1,10237.24,10534.1,0.0,1964.94,0.0,5213.48,1715.62,7663.81,0.0,2399.59,0.0,4757.98,0.0
3,2021-05-23,1,10888.41,0.0,0.0,1787.76,0.0,0.0,1955.5,0.0,4111.03,0.0,0.0,5242.12,2010.19
4,2021-10-31,1,7250.21,11543.58,4615.35,2518.88,0.0,4556.16,1919.19,0.0,0.0,1707.65,0.0,0.0,2863.31


In [6]:
# Run the full inference pipeline on every input row; each call returns one Series, so
# the result is a DataFrame with one line per input.
# NOTE: this rebinds `pred`, which earlier held the array loaded from flow_res['Pred'][sku].
pred = df.apply(lambda row: infer(row, flow_data=flow_data, flow_res=flow_res), axis=1)
pred


X does not have valid feature names, but MinMaxScaler was fitted with feature names


X does not have valid feature names, but MinMaxScaler was fitted with feature names


X does not have valid feature names, but MinMaxScaler was fitted with feature names


X does not have valid feature names, but MinMaxScaler was fitted with feature names


X does not have valid feature names, but MinMaxScaler was fitted with feature names



Unnamed: 0,Date,Store,Forecast,lb,ub,poly,year,trend,mu,std
0,2021-05-09,1,7756.391734,6873.676131,8639.107337,7656.75266,11244.755565,10669.189637,0.245437,0.034285
1,2021-07-25,1,10602.404911,9692.179095,11512.630728,10606.240181,10461.800538,10677.094387,0.462094,0.035354
2,2021-09-26,1,9882.034381,8959.812352,10804.25641,10472.824046,9919.804825,10683.607213,0.407255,0.03582
3,2021-05-23,1,10970.617961,10075.1057,11866.130221,10985.659271,11151.691328,10670.622333,0.490125,0.034782
4,2021-10-31,1,9298.410785,8361.825001,10234.99657,9092.256081,9883.929088,10687.243066,0.362826,0.036378
