In [None]:
import numpy as np
import plotly
from plotly import tools
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd
from scipy.stats import poisson
from statsmodels.tsa.seasonal import seasonal_decompose, STL
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, pairwise, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import gplearn as gpl        
from gplearn.genetic import SymbolicRegressor
import dask
import time
from dateutil.relativedelta import relativedelta
import datetime
from operator import itemgetter
from copy import copy, deepcopy
%matplotlib inline

In [None]:
def plot_gp(mu, lb, ub, test_x, test_y, train_x=None, train_y=None, name='', samples=[], layout='v'):
    fig = make_subplots(rows=1, cols=1, subplot_titles=("Samples"))
    if train_x is not None:
        fig.add_trace(go.Scatter(x=train_x, y=train_y, mode='lines', name='train', marker={'size':10})
                      , row=1, col=1) #plot training data
    
    fig.add_trace(
        go.Scatter(x=test_x, y=ub, fill=None, mode='lines', line_color='rgba(128, 128,128,0.7)',
                  fillcolor='rgba(128, 128,128,0.7)', showlegend=True, name='uncertainty'), row=1, col=1)
    fig.add_trace(
        go.Scatter(x=test_x, y=lb, fill='tonexty',mode='lines', line_color='rgba(128, 128,128,0.7)',
                  fillcolor='rgba(128, 128,128,0.7)', showlegend=True, name='uncertainty'), row=1, col=1)
    
    fig.add_trace(go.Scatter(x=test_x, y=mu, line_color='rgb(0,0,0)', mode='lines', name='mean'), row=1, col=1) #plot the mean
    fig.add_trace(go.Scatter(x=test_x, y=test_y, line_color='rgb(29, 181, 22)',mode='lines', name='test'), row=1, col=1)
    for i, s in enumerate(samples):
        fig.add_trace(go.Scatter(x=test_x, y=s, name='sample '+str(i), mode='lines'), row=1, col=1) #plot samples
    fig.update_layout(title_text=name,paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)', height=500)
    return fig

def plot_ts_decomposition(df, index, obs, model="additive", freq=None, samples=None):
    df.index = df[index]
    decompose = df[[index, obs]]
    decompose.index = df[index]
    decompose = decompose[[obs]]
    
    decomposition = seasonal_decompose(decompose, model=model, freq=freq)
    trend, seasonal, residual = decomposition.trend, decomposition.seasonal, decomposition.resid
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=decompose.index, y=decompose.iloc[:,0], mode='lines', name='observed')) #plot the observed
    fig.add_trace(go.Scatter(x=decompose.index, y=trend.tolist(), mode='lines', name='trend')) #plot the trend
    fig.add_trace(go.Scatter(x=decompose.index, y=seasonal.tolist(), mode='lines', name='seasonal')) #plot the seasonal
    fig.add_trace(go.Scatter(x=decompose.index, y=residual.tolist(), mode='lines', name='residual')) #plot the residual
    if samples is not None:
        for i, s in enumerate(samples):
            fig.add_trace(go.Scatter(x=decompose.index, y=s, name='sample '+str(i), mode='lines')) #plot samples
    fig.update_layout(title_text='Decomposition')
    return fig, trend.dropna().values, seasonal.dropna().values, residual.dropna().values

def plot_stl_decomposition(df, index, obs, model="additive", period=None, seasonal=7, samples=None):
    df.index = df[index]
    decompose = df[[index, obs]]
    decompose.index = df[index]
    decompose = decompose[[obs]]
    
    decomposition = STL(decompose, period=period, seasonal=seasonal).fit()
    trend, seasonal, residual = decomposition.trend, decomposition.seasonal, decomposition.resid
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=decompose.index, y=decompose.iloc[:,0], mode='lines', name='observed')) #plot the observed
    fig.add_trace(go.Scatter(x=decompose.index, y=trend.tolist(), mode='lines', name='trend')) #plot the trend
    fig.add_trace(go.Scatter(x=decompose.index, y=seasonal.tolist(), mode='lines', name='seasonal')) #plot the seasonal
    fig.add_trace(go.Scatter(x=decompose.index, y=residual.tolist(), mode='lines', name='residual')) #plot the residual
    if samples is not None:
        for i, s in enumerate(samples):
            fig.add_trace(go.Scatter(x=decompose.index, y=s, name='sample '+str(i), mode='lines')) #plot samples
    fig.update_layout(title_text='Decomposition')
    return fig, trend.dropna().values, seasonal.dropna().values, residual.dropna().values

In [None]:
df = pd.read_csv(data_path)
df = df.loc[df['Store'] == 1045]
df = df.drop(columns=[col for col in df.columns if len(df[col].unique()) == 1])
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")
df

Unnamed: 0,Date,Sales,Open,Promo,StateHoliday,SchoolHoliday,Competition
952351,2013-01-01,0,0,0,a,1,0.0
952352,2013-01-02,8282,1,0,0,1,0.0
952353,2013-01-03,7582,1,0,0,1,0.0
952354,2013-01-04,6351,1,0,0,1,0.0
952355,2013-01-05,4854,1,0,0,0,0.0
...,...,...,...,...,...,...,...
953288,2015-07-27,14919,1,1,0,1,1.0
953289,2015-07-28,10901,1,1,0,1,1.0
953290,2015-07-29,11185,1,1,0,1,1.0
953291,2015-07-30,12042,1,1,0,1,1.0


In [None]:
df['dayofweek'] = df['Date'].dt.dayofweek 
df['sin_dayofweek'] = np.sin(2*np.pi*df['dayofweek']/np.max(df['dayofweek']))
df['cos_dayofweek'] = np.cos(2*np.pi*df['dayofweek']/np.max(df['dayofweek']))
df.drop(columns=['dayofweek'], inplace=True)

df['dayofyear'] = df['Date'].dt.dayofyear
df['sin_dayofyear'] = np.sin(2*np.pi*df['dayofyear']/np.max(df['dayofyear']))
df['cos_dayofyear'] = np.cos(2*np.pi*df['dayofyear']/np.max(df['dayofyear']))
df.drop(columns=['dayofyear'], inplace=True)
df

Unnamed: 0,Date,Sales,Open,Promo,StateHoliday,SchoolHoliday,Competition,sin_dayofweek,cos_dayofweek,sin_dayofyear,cos_dayofyear
952351,2013-01-01,0,0,0,a,1,0.0,8.660254e-01,0.5,0.017213,0.999852
952352,2013-01-02,8282,1,0,0,1,0.0,8.660254e-01,-0.5,0.034422,0.999407
952353,2013-01-03,7582,1,0,0,1,0.0,1.224647e-16,-1.0,0.051620,0.998667
952354,2013-01-04,6351,1,0,0,1,0.0,-8.660254e-01,-0.5,0.068802,0.997630
952355,2013-01-05,4854,1,0,0,0,0.0,-8.660254e-01,0.5,0.085965,0.996298
...,...,...,...,...,...,...,...,...,...,...,...
953288,2015-07-27,14919,1,1,0,1,1.0,0.000000e+00,1.0,-0.425000,-0.905193
953289,2015-07-28,10901,1,1,0,1,1.0,8.660254e-01,0.5,-0.440519,-0.897743
953290,2015-07-29,11185,1,1,0,1,1.0,8.660254e-01,-0.5,-0.455907,-0.890028
953291,2015-07-30,12042,1,1,0,1,1.0,1.224647e-16,-1.0,-0.471160,-0.882048


In [None]:
output_col = 'Sales'

train = df[df['Date'] < '2015-06-14']
test = df[df['Date']>='2015-06-14']

OE = OneHotEncoder(sparse=False)
train_ohe = OE.fit_transform(train[['StateHoliday']])
test_ohe = OE.transform(test[['StateHoliday']])
for i, c in enumerate(OE.categories_[0]):
    train['StateHoliday_'+str(c)] = train_ohe[:, i]
    test['StateHoliday_'+str(c)] = test_ohe[:, i]
train.drop(columns=['StateHoliday'], inplace=True)
test.drop(columns=['StateHoliday'], inplace=True)

#y_scaler = MinMaxScaler(feature_range=(0, 1))
#train_y, test_y = y_scaler.fit_transform(train[[output_col]]).ravel(), test[output_col].values
train_y, test_y = train[output_col].values, test[output_col].values
train_idx, test_idx = train['Date'].tolist(), test['Date'].tolist()
train_x, test_x = train.drop(columns=['Date',output_col]), test.drop(columns=['Date',output_col])

train_x.shape, train_y.shape, test_x.shape, test_y.shape



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



((894, 12), (894,), (48, 12), (48,))

In [None]:
fig, trend, seasonal, residual = plot_ts_decomposition(train, 'Date', 'Sales', freq=364)
fig.show()
print(np.mean(trend), np.var(trend), np.std(trend))
print(np.mean(seasonal), np.var(seasonal), np.std(seasonal))
print(np.mean(residual), np.var(residual), np.std(residual))


the 'freq'' keyword is deprecated, use 'period' instead



7074.4603177482895 2501.2026243456376 50.012024797498825
-148.3344271910146 9035222.455295222 3005.864676810189
2.830836402608049 377683.10011558555 614.5592730693969


In [None]:
train_x

Unnamed: 0,Open,Promo,SchoolHoliday,Competition,sin_dayofweek,cos_dayofweek,sin_dayofyear,cos_dayofyear,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
952351,0,0,1,0.0,8.660254e-01,0.5,0.017213,0.999852,0.0,1.0,0.0,0.0
952352,1,0,1,0.0,8.660254e-01,-0.5,0.034422,0.999407,1.0,0.0,0.0,0.0
952353,1,0,1,0.0,1.224647e-16,-1.0,0.051620,0.998667,1.0,0.0,0.0,0.0
952354,1,0,1,0.0,-8.660254e-01,-0.5,0.068802,0.997630,1.0,0.0,0.0,0.0
952355,1,0,0,0.0,-8.660254e-01,0.5,0.085965,0.996298,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
953240,1,0,0,1.0,8.660254e-01,0.5,0.377708,-0.925925,1.0,0.0,0.0,0.0
953241,1,0,0,1.0,8.660254e-01,-0.5,0.361714,-0.932289,1.0,0.0,0.0,0.0
953242,1,0,0,1.0,1.224647e-16,-1.0,0.345612,-0.938377,1.0,0.0,0.0,0.0
953243,1,0,0,1.0,-8.660254e-01,-0.5,0.329408,-0.944188,1.0,0.0,0.0,0.0


In [None]:
def power_f(x, d):
    d = np.round(np.clip(d, 1, 2)).astype(int)
    return np.power(x, d)

def _tanh(x, a, b):
    return np.tanh(a*x+b)

def _mle_poisson(true, pred, w=None):
    pred = np.where(pred>0, pred, np.exp(pred) - 1)
    pred += 1
    ll = -np.mean(poisson.logpmf(true, pred))
    ll = np.nan_to_num(ll, nan=1e5)
    return ll

power = gpl.functions.make_function(power_f, 'pow', 2, wrap=True)
tanh = gpl.functions.make_function(_tanh, 'tanh', 3, wrap=True)
function_set = ['add', 'sub', 'mul', 'div', 'sqrt', power, 'log', 'min', 'max', 'sin', 'cos', 'abs', tanh]
mle_poisson = gpl.fitness.make_fitness(_mle_poisson, greater_is_better=False)

model = SymbolicRegressor(population_size = 2000, tournament_size=500, const_range=(-5,5),
                          generations = 500, stopping_criteria=-1000,
                          function_set = function_set, metric=mle_poisson,
                          p_crossover=0.65, p_subtree_mutation=0.15,
                          p_hoist_mutation=0.05, p_point_mutation=0.1, low_memory=True,
                          verbose = 1, random_state = None, n_jobs = -1)

model.fit(train_x, train_y)
print(model._program)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left



overflow encountered in reduce



   0    16.16              inf       45          41194.4              N/A     11.10m
   1    26.68              inf       85          29155.8              N/A     10.37m
   2    54.74          35182.4       91          7281.05              N/A     13.73m
   3    70.14          68582.7       46          483.557              N/A     13.47m
   4    60.30           374377       46           470.58              N/A     13.63m
   5    41.89           777767       42          413.688              N/A     13.02m
   6    43.27      1.00397e+06       41          346.434              N/A     14.38m
   7    43.14           810917       44          341.209              N/A     13.56m
   8    39.60              inf       40          318.927              N/A     12.92m
   9    41.47      1.15042e+06       45          313.468              N/A     12.48m
  10    41.10     8.98847e+304       45          295.525              N/A     14.38m
  11    42.10           802939       47          280.458         

  97   787.99          78576.4      775          188.202              N/A     29.35m
  98   766.67          74984.7      756          187.737              N/A     30.45m
  99   755.32          19751.3      800          187.391              N/A     28.98m
 100   752.62          7689.55      801          186.626              N/A     30.67m
 101   775.28          44557.1      805          186.377              N/A     30.48m
 102   784.57           120181      910          186.227              N/A     29.37m
 103   806.15          79053.3      791            185.8              N/A     29.49m
 104   808.16            91373      830          185.712              N/A     30.04m
 105   789.90          27351.5      808          185.325              N/A     31.33m
 106   793.30            32711      808          184.981              N/A     29.72m
 107   793.98     8.98847e+304      799          183.651              N/A     32.51m
 108   799.05          9049.94      803          183.355         

 194  1596.99          95896.8     1606          145.176              N/A     42.99m
 195  1590.90          38428.6     1631          144.856              N/A     39.91m
 196  1609.28          6505.31     1631          144.563              N/A     37.30m
 197  1610.03           104345     1688          144.457              N/A     37.08m
 198  1628.69          9857.34     1665           144.29              N/A     39.26m
 199  1655.37          20309.1     1663          143.868              N/A     38.51m
 200  1657.74          9541.38     1675          143.812              N/A     38.23m
 201  1644.50          9940.92     1701          143.339              N/A     38.01m
 202  1651.74          19251.6     1704          143.227              N/A     36.38m
 203  1695.48          7915.24     1695          142.506              N/A     39.17m
 204  1699.33           112953     1744          142.342              N/A     37.67m
 205  1702.39          12372.8     1760          142.168         

 291  2492.67          18595.1     2510          126.882              N/A     39.47m
 292  2496.68          15575.2     2579          126.716              N/A     40.36m
 293  2493.68          12480.1     2514          126.661              N/A     39.46m
 294  2550.23          7874.15     2614          126.209              N/A     38.14m
 295  2522.36          15980.9     2617          126.131              N/A     39.68m
 296  2600.72          6035.66     2634          125.917              N/A     38.57m
 297  2609.40          85570.3     2635          125.907              N/A     39.51m
 298  2617.82      1.39982e+07     2636          125.631              N/A     40.39m
 299  2614.84          4241.78     2635          125.483              N/A     37.87m
 300  2616.08          5924.38     2643          125.464              N/A     40.72m
 301  2627.26          32331.1     2624          125.353              N/A     37.47m
 302  2622.60            40157     2654          125.334         

 388  3499.33          8436.73     3573          106.455              N/A     30.82m
 389  3537.36          10836.1     3570          106.415              N/A     27.41m
 390  3541.57          5598.76     3581          106.331              N/A     27.56m
 391  3506.66          3393.18     3670          106.146              N/A     26.48m
 392  3530.67           485273     3667          106.113              N/A     28.57m
 393  3479.58          3910.26     3577          106.079              N/A     29.31m
 394  3507.25          6880.74     3718          106.033              N/A     26.90m
 395  3447.26          6535.63     3474          105.959              N/A     26.27m
 396  3452.65          10356.5     3476          105.812              N/A     26.27m
 397  3453.28          9351.03     3646          105.757              N/A     25.94m
 398  3455.84          44383.3     3473          105.672              N/A     25.30m
 399  3458.02      2.45768e+07     3471          105.654         

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [None]:
pred = model.predict(test_x)
pred = np.where(pred>0, pred, np.exp(pred) - 1)
pred += 1
pred


overflow encountered in exp



array([ 5008.90689659, 11144.54494225,  9543.94286254,  7912.85937943,
        9156.70452073,  8490.21545831,  6073.98259545,  5034.65858923,
        5037.30040346,  6050.74327682,  6574.07409195,  7958.74656345,
        7057.53266882,  6216.75815637,  5047.62848386, 10469.64279244,
        8798.20160508,  9255.66269756,  9352.00646199,  8147.89631409,
        7003.76127913,  5267.47864345,  5295.80475102,  7401.00111835,
        6470.45351296,  8463.14131226,  7464.50376887,  7727.02023302,
        5414.528712  , 11051.6831926 ,  9349.20230714,  8946.16211842,
        9961.47330652,  9822.08202416,  8039.75289095,  5496.9394672 ,
       10569.24329949,  9243.59328172,  8874.72378856, 10071.99535808,
        9318.63487604,  8289.90416905,  5803.15364464, 13677.79090627,
       11813.9311642 , 11383.87608086, 12819.01133508, 11422.55950626])

In [None]:
lb, ub = poisson.ppf(0.025, pred), poisson.ppf(0.975, pred)
fig = plot_gp(pred, lb, ub, test_idx, test_y, train_idx, train_y, samples=[], layout='h')
fig.show()

In [None]:
def mase(train_y, test_y, pred):
    n = train_y.shape[0]
    d = np.abs(np.diff(train_y)).sum()/(n-1)
    errors = np.abs(test_y - pred)
    return errors.mean()/d

def mape(test_y, pred):
    return np.round(np.mean(np.abs(100*(test_y-pred)/(test_y + 1e-9))), 0)

def rmspe(test_y, pred):
    return (np.sqrt(np.mean(np.square((test_y - pred) / (test_y + 1e-9))))) * 100

def persistence(train_y, test_y):
    predictions, history = [], list(np.copy(train_y))
    for i in test_y:
        predictions.append(history[-1])
        history.append(i)
    return np.asarray(predictions)

naive = persistence(train_y, test_y)
errors = {'MAE':[mean_absolute_error(test_y, pred), mean_absolute_error(test_y, naive)], 
        'RMSE':[mean_squared_error(test_y, pred), mean_squared_error(test_y, naive)], 
        'RMSPE': [rmspe(test_y, pred), rmspe(test_y, naive)],
        'MAPE':[mape(test_y, pred), mape(test_y, naive)],
        'MASE':[mase(train_y, test_y, pred), mase(train_y, test_y, naive)]} 
errors = pd.DataFrame(errors, index =['THIS', 'NAIVE']) 
errors

Unnamed: 0,MAE,RMSE,RMSPE,MAPE,MASE
THIS,1113.439454,2397036.0,20.296135,14.0,0.411405
NAIVE,2392.166667,11489310.0,42.119587,30.0,0.883881


In [None]:
pred = model.predict(train_x)
#pred = y_scaler.inverse_transform(pred[:,np.newaxis]).ravel()
mean_absolute_error(train_y, pred)

899.5163123524347