In [1]:
!pip install einops
!pip install statsmodels --upgrade

Collecting einops
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.1
Collecting statsmodels
  Downloading statsmodels-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: statsmodels
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.13.5
    Uninstalling statsmodels-0.13.5:
      Successfully uninstalled statsmodels-0.13.5
Successfully installed statsmodels-0.14.0


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.optim as opt
from torch.distributions.normal import Normal
from torch.distributions.log_normal import LogNormal
from torch.distributions.poisson import Poisson
from torch.distributions.negative_binomial import NegativeBinomial
from torch.distributions.categorical import Categorical
from torch.distributions.mixture_same_family import MixtureSameFamily
from einops import rearrange
import pandas as pd
import numpy as np
from scipy.optimize import dual_annealing, minimize, fmin_bfgs
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statsmodels.tsa.seasonal import seasonal_decompose
from copy import copy, deepcopy
import plotly
from plotly import tools
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from copy import copy, deepcopy
import dask
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
'''torch.cuda.set_device(0)
torch.backends.cudnn.benchmark = True'''

Mounted at /content/drive


'torch.cuda.set_device(0)\ntorch.backends.cudnn.benchmark = True'

In [3]:
def plot_gp(mu, lb, ub, test_x, test_y, train_x=None, train_y=None, name='', samples={},
            layout='v', xaxis_title='Time', yaxis_title='Sales', fig_size=[1000,500], w=3, f=10):
    fig = make_subplots(rows=1, cols=1, subplot_titles=("Samples"))
    samples = {'sample '+str(i): s for i, s in enumerate(samples)} if not isinstance(samples, dict) else samples
    if train_x is not None:
        fig.add_trace(go.Scatter(x=train_x, y=train_y, mode='lines', name='History', line=dict(width=w), line_color='#1a76ff'))  # plot training data

    fig.add_trace(
        go.Scatter(x=test_x, y=ub, fill=None, mode='lines', line_color='rgba(199, 19, 19, 0.3)',
                   fillcolor='rgba(249, 129, 37, 0.3)', showlegend=True, name='95% uncertainty interval'))
    fig.add_trace(
        go.Scatter(x=test_x, y=lb, fill='tonexty', mode='lines', line_color='rgba(199, 19, 19, 0.3)',
                   fillcolor='rgba(249, 129, 37, 0.3)', showlegend=True, name='95% uncertainty interval'))

    fig.add_trace(go.Scatter(x=test_x, y=mu, line=dict(color='#c71313', width=w), mode='lines', name='Skyolia Forecast'))  # plot the mean
    fig.add_trace(go.Scatter(x=test_x, y=test_y, line=dict(color='#1a76ff', width=w), mode='lines', name='Observed'))
    for k, v in samples.items():
        fig.add_trace(go.Scatter(x=test_x, y=v, name=k, mode='lines',
                                 line=dict(width=w)))  # plot samples
    fig.update_layout(title_text=name, paper_bgcolor='#343434', plot_bgcolor='#343434', xaxis_title=xaxis_title, yaxis_title=yaxis_title,
                          font=dict(family="Montserrat", color="#fff", size=f), title_x=0.5, hovermode="x")
    fig.update_xaxes(showgrid=True, showline=False, gridcolor='#c9c9c9', gridwidth=0.0005)
    fig.update_yaxes(showgrid=True, showline=False, gridcolor='#c9c9c9', gridwidth=0.0005)
    return fig

def confidence_interval(mu, cov):
    std = np.sqrt(np.diag(cov)) #compute std
    uncertainty = 1.96 * std
    return mu, std, mu-uncertainty, mu+uncertainty

def order_quantity(mu, std, cu, co):
    cf = cu/(cu+co)
    return scipy.stats.norm.ppf(cf, loc=mu, scale=std)

def plot_cov(covs, cols, subplot_titles, labels=None):
    fig = make_subplots(rows=int(len(covs)/cols) + 1, cols=cols, subplot_titles=subplot_titles)
    height = (1000/cols)*2
    for i, cov in enumerate(covs):
        row, col = int(i / cols)+1, (i%cols)+1
        fig.add_trace(go.Heatmap(z=cov, x=labels, y=labels, colorscale='Greys'), row=row, col=col)
    fig.update_layout(title_text='Cov matrix', height=height)#, yaxis1=dict(domain=[0, 1]), yaxis1=dict(domain=[0, 1])
    return fig

def plot_ts_decomposition(df, index, obs, model="additive", features=False, period=None, samples=None):
    df.index = df[index]
    decompose = df[[index, obs]]
    decompose.index = df[index]
    decompose = decompose[[obs]]

    decomposition = seasonal_decompose(decompose, model=model, period=period)
    trend, seasonal, residual = decomposition.trend, decomposition.seasonal, decomposition.resid
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=decompose.index, y=decompose.iloc[:,0], mode='lines', name='observed')) #plot the observed
    fig.add_trace(go.Scatter(x=decompose.index, y=trend.tolist(), mode='lines', name='trend')) #plot the trend
    fig.add_trace(go.Scatter(x=decompose.index, y=seasonal.tolist(), mode='lines', name='seasonal')) #plot the seasonal
    fig.add_trace(go.Scatter(x=decompose.index, y=residual.tolist(), mode='lines', name='residual')) #plot the residual
    if features:
        features = [col for col in list(df.columns) if col not in [index, obs]]
        for col in features:
            fig.add_trace(go.Scatter(x=decompose.index, y=df[col].values, name=col, mode='lines'))
    if samples is not None:
        for i, s in enumerate(samples):
            fig.add_trace(go.Scatter(x=decompose.index, y=s, name='sample '+str(i), mode='lines')) #plot samples
    fig.update_layout(title_text='Decomposition')
    return fig, trend.dropna().values, seasonal.dropna().values, residual.dropna().values

def plot_stl_decomposition(df, index, obs, model="additive", period=None, seasonal=7, samples=None):
    df.index = df[index]
    decompose = df[[index, obs]]
    decompose.index = df[index]
    decompose = decompose[[obs]]

    decomposition = STL(decompose, period=period, seasonal=seasonal).fit()
    trend, seasonal, residual = decomposition.trend, decomposition.seasonal, decomposition.resid
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=decompose.index, y=decompose.iloc[:,0], mode='lines', name='observed')) #plot the observed
    fig.add_trace(go.Scatter(x=decompose.index, y=trend.tolist(), mode='lines', name='trend')) #plot the trend
    fig.add_trace(go.Scatter(x=decompose.index, y=seasonal.tolist(), mode='lines', name='seasonal')) #plot the seasonal
    fig.add_trace(go.Scatter(x=decompose.index, y=residual.tolist(), mode='lines', name='residual')) #plot the residual
    if samples is not None:
        for i, s in enumerate(samples):
            fig.add_trace(go.Scatter(x=decompose.index, y=s, name='sample '+str(i), mode='lines')) #plot samples
    fig.update_layout(title_text='Decomposition')
    return fig, trend.dropna().values, seasonal.dropna().values, residual.dropna().values

In [4]:
def cyclic_column_processing(df, dt_column, hourly=True, daily=True, yearly=True):
    df[dt_column], periodic_column = df[dt_column].astype('datetime64[ns]'), []
    if hourly:
        df['hourofday'] = df[dt_column].dt.hour
        df['sin_hourofday'] = np.sin(2*np.pi*df['hourofday']/np.max(df['hourofday']))
        df['cos_hourofday'] = np.cos(2*np.pi*df['hourofday']/np.max(df['hourofday']))
        df.drop(columns=['hourofday'], inplace=True), periodic_column.extend(['sin_hourofday', 'cos_hourofday'])

    if daily:
        df['dayofweek'] = df[dt_column].dt.dayofweek
        df['sin_dayofweek'] = np.sin(2*np.pi*df['dayofweek']/np.max(df['dayofweek']))
        df['cos_dayofweek'] = np.cos(2*np.pi*df['dayofweek']/np.max(df['dayofweek']))
        df.drop(columns=['dayofweek'], inplace=True), periodic_column.extend(['sin_dayofweek', 'cos_dayofweek'])
    if yearly:
        df['dayofyear'] = df[dt_column].dt.dayofyear
        df['sin_dayofyear'] = np.sin(2*np.pi*df['dayofyear']/np.max(df['dayofyear']))
        df['cos_dayofyear'] = np.cos(2*np.pi*df['dayofyear']/np.max(df['dayofyear']))
        df.drop(columns=['dayofyear'], inplace=True), periodic_column.extend(['sin_dayofyear', 'cos_dayofyear'])
    return df, periodic_column


def categorical_encoding(train, valid, test, categorical):
    new_cat, LEncoders = [], {}
    for cat in categorical:
        LE = LabelEncoder()
        train[cat] = LE.fit_transform(train[cat]).astype(int)
        valid[cat] = LE.transform(valid[cat]).astype(int)
        test[cat] = LE.transform(test[cat]).astype(int)
        LEncoders[cat] = LE
    return train, valid, test, LEncoders


def numerical_scaling(train, valid, test, numerical):
    MS = MinMaxScaler(feature_range=(-1, 1))
    scaled_train = MS.fit_transform(train[numerical])
    scaled_valid = MS.transform(valid[numerical])
    scaled_test = MS.transform(test[numerical])
    train[numerical] = scaled_train
    valid[numerical] = scaled_valid
    test[numerical] = scaled_test
    return train, valid, test, MS


def output_scaling(train, valid, test, output_col):
    YScaler = MinMaxScaler(feature_range=(0, 1))
    Y_train = YScaler.fit_transform(train[[output_col]]).ravel()+1e-5
    Y_valid = YScaler.transform(valid[[output_col]]).ravel()+1e-5
    Y_test = test[output_col]
    return Y_train, Y_valid, Y_test, YScaler


def shift_df(df, shift, dropna=True):
    origin = df.copy()
    for i in range(1, shift+1):
        shifted_df = origin.shift(i)
        shifted_df = shifted_df.rename(columns=dict(zip(shifted_df.columns, [str(c)+'_'+str(i) for c in shifted_df.columns])))
        df = pd.concat([shifted_df, df], axis=1)
    return df.dropna() if dropna else df

In [5]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/time series/nytaxi_demand/nytaxi_demand.csv')
df.drop(columns=['sin_hourofday', 'cos_hourofday', 'sin_dayofweek', 'cos_dayofweek', 'sin_dayofyear', 'cos_dayofyear'], inplace=True)
df

Unnamed: 0,temp,dew,humidity,precip,precipprob,snow,snowdepth,windspeed,winddir,sealevelpressure,cloudcover,visibility,conditions,icon,description,trips,passengers,pickup_datetime
0,5.8,-3.3,52.23,0.0,0.0,0.0,0.0,13.0,300.0,1017.9,100.0,16.0,Overcast,cloudy,Partly cloudy throughout the day.,566.0,1115.0,2016-01-01 01:00:00
1,5.3,-3.4,53.48,0.0,0.0,0.0,0.0,10.9,320.0,1017.8,100.0,16.0,Overcast,cloudy,Partly cloudy throughout the day.,503.0,928.0,2016-01-01 02:00:00
2,5.2,-3.2,54.94,0.0,0.0,0.0,0.0,11.0,296.0,1017.7,100.0,16.0,Overcast,cloudy,Partly cloudy throughout the day.,479.0,850.0,2016-01-01 03:00:00
3,5.2,-3.2,54.94,0.0,0.0,0.0,0.0,16.3,285.0,1017.7,100.0,16.0,Overcast,cloudy,Partly cloudy throughout the day.,323.0,615.0,2016-01-01 04:00:00
4,4.5,-3.0,58.21,0.0,0.0,0.0,0.0,17.6,270.0,1017.0,100.0,16.0,Overcast,cloudy,Partly cloudy throughout the day.,142.0,256.0,2016-01-01 05:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4352,26.1,14.8,49.94,0.0,0.0,0.0,0.0,10.4,190.0,1015.8,50.0,16.0,Partially cloudy,partly-cloudy-day,Clear conditions throughout the day.,484.0,773.0,2016-06-30 19:00:00
4353,26.1,15.8,53.13,0.0,0.0,0.0,0.0,11.8,200.0,1015.8,33.2,14.5,Partially cloudy,partly-cloudy-day,Clear conditions throughout the day.,440.0,733.0,2016-06-30 20:00:00
4354,25.3,16.3,57.46,0.0,0.0,0.0,0.0,7.9,190.0,1015.9,13.4,16.0,Clear,clear-night,Clear conditions throughout the day.,484.0,845.0,2016-06-30 21:00:00
4355,24.7,16.3,59.56,0.0,0.0,0.0,0.0,3.5,295.0,1016.5,13.4,16.0,Clear,clear-night,Clear conditions throughout the day.,462.0,723.0,2016-06-30 22:00:00


In [None]:
pd.DataFrame(data={'Dtypes': df.dtypes, 'Isnull': df.isnull().sum(), 'Nunique': df.nunique()}, index=df.columns)

Unnamed: 0,Dtypes,Isnull,Nunique
temp,float64,0,386
dew,float64,0,430
humidity,float64,0,3035
precip,float64,0,234
precipprob,float64,0,2
snow,float64,0,7
snowdepth,float64,0,335
windspeed,float64,0,317
winddir,float64,0,355
sealevelpressure,float64,0,427


In [6]:
out_column, time_column = 'trips', 'pickup_datetime'
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

to_remove = ['passengers','description','conditions'] + [c for c in df.columns if df[c].nunique()==1]
features = [c for c in df.columns if (c not in [out_column]) and (c not in to_remove)]
categorical = [c for c in features if (df[c].dtype=='object') and (df[c].nunique() >= 2)]
numerical = [c for c in features if df[c].dtype=='float']

fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.03, subplot_titles=('Label Distribution', "Features Correlation"))
fig.append_trace(go.Histogram(x=df[out_column]), row=1, col=1)
fig.append_trace(go.Heatmap(z=df[numerical+[out_column]].corr(),x=numerical+[out_column],y=numerical+[out_column]), row=1, col=2)
fig.show()

In [None]:
#sub = df.loc[(df['Store'] == 1) & (df['Dept'] == 23)]
fig, trend, seasonal, residual = plot_ts_decomposition(df, time_column, out_column, features=True, period=168)
fig.show()
print(np.mean(trend), np.var(trend), np.std(trend))
print(np.mean(seasonal), np.var(seasonal), np.std(seasonal))
print(np.mean(residual), np.var(residual), np.std(residual))

Output hidden; open in https://colab.research.google.com to view.

In [7]:
df, periodic = cyclic_column_processing(df, time_column, hourly=True, daily=True, yearly=True)
train = df[df[time_column] < '2016-06-01']
valid = df[(df[time_column] >= '2016-06-01') & (df[time_column] < '2016-06-15')]
test = df[df[time_column]>='2016-06-15']


train, valid, test, LEncoders = categorical_encoding(train, valid, test, categorical)  # CATEGORICAL ENCODING
if len(numerical) > 0:
    train, valid, test, MS = numerical_scaling(train, valid, test, numerical)  # NUMERCIAL SCALING
features = categorical + numerical + periodic
X_train, T_train = train[features], train[time_column]
X_valid, T_valid = valid[features], valid[time_column]
X_test, T_test = test[features], test[time_column]
#Y_train, Y_valid, Y_test, YScaler = output_scaling(train, valid, test, out_column)
Y_train, Y_valid, Y_test = train[out_column].values, valid[out_column].values, test[out_column].values
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

((3637, 19), (384, 19), (3637,), (384,))

In [None]:
X_train

Unnamed: 0,icon,temp,dew,humidity,precip,precipprob,snow,snowdepth,windspeed,winddir,sealevelpressure,cloudcover,visibility,sin_hourofday,cos_hourofday,sin_dayofweek,cos_dayofweek,sin_dayofyear,cos_dayofyear
0,2,-0.063241,-0.002088,-0.033337,-1.0,-1.0,-1.0,-1.0,-0.388235,0.666667,0.198312,1.000,1.0000,2.697968e-01,0.962917,-0.866025,-0.5,0.034516,0.999404
1,2,-0.083004,-0.006263,-0.004892,-1.0,-1.0,-1.0,-1.0,-0.487059,0.777778,0.194093,1.000,1.0000,5.195840e-01,0.854419,-0.866025,-0.5,0.034516,0.999404
2,2,-0.086957,0.002088,0.028331,-1.0,-1.0,-1.0,-1.0,-0.482353,0.644444,0.189873,1.000,1.0000,7.308360e-01,0.682553,-0.866025,-0.5,0.034516,0.999404
3,2,-0.086957,0.002088,0.028331,-1.0,-1.0,-1.0,-1.0,-0.232941,0.583333,0.189873,1.000,1.0000,8.878852e-01,0.460065,-0.866025,-0.5,0.034516,0.999404
4,2,-0.114625,0.010438,0.102742,-1.0,-1.0,-1.0,-1.0,-0.171765,0.500000,0.160338,1.000,1.0000,9.790841e-01,0.203456,-0.866025,-0.5,0.034516,0.999404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3632,0,0.727273,0.845511,0.100011,-1.0,-1.0,-1.0,-1.0,-0.581176,0.005556,0.000000,-0.616,0.7750,-8.878852e-01,0.460065,0.866025,0.5,-0.860214,0.509933
3633,0,0.656126,0.895616,0.368301,-1.0,-1.0,-1.0,-1.0,-0.769412,0.811111,0.021097,-0.718,0.6125,-7.308360e-01,0.682553,0.866025,0.5,-0.860214,0.509933
3634,4,0.644269,0.883090,0.371715,-1.0,-1.0,-1.0,-1.0,-0.882353,0.722222,0.029536,-0.316,0.7750,-5.195840e-01,0.854419,0.866025,0.5,-0.860214,0.509933
3635,4,0.640316,0.895616,0.413130,-1.0,-1.0,-1.0,-1.0,-0.802353,0.955556,0.059072,-0.442,0.7750,-2.697968e-01,0.962917,0.866025,0.5,-0.860214,0.509933


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from torch import autograd

def sinc(x):
    x = x * np.pi
    return torch.where(x != 0., torch.sin(x) / x, 1.)


class RegrDataLoader(Dataset):
    def __init__(self, X, Y, numerical_col, categorical_col):
        self.X1, self.X2, self.Y = X[numerical_col].values.astype(np.float32), X[categorical_col].values, Y.astype(np.float32)

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        return self.X1[idx], self.X2[idx], self.Y[idx]

class Embedder(nn.Module):
    def __init__(self, vocab_size, dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, dim)

    def forward(self, x):
        return self.embeddings(x)

class NNModel(nn.Module):
    def __init__(self, input_shape, units=None, factors=None, activ=True, norm=False, dropout=False, slops=None):
        super().__init__()
        self.input_shape = input_shape
        self.units = units
        self.factors = factors
        self.activ, self.norm = activ, norm
        self.network = nn.ModuleList()
        if self.factors:
            self.units = np.round(self.input_shape * np.asarray(self.factors)).astype(int)
        if self.units is not None:
            self.dropout = np.zeros_like(self.units) if not dropout else dropout
            self.slops = np.full(len(self.units), 1) if slops is None else slops
            for i, j, k in zip(self.units, self.dropout, self.slops):
                if i >= 1:
                    block = self.__build_block__(input_shape, i, p=j, slop=k)
                    self.network.extend(block)
                    input_shape = i
        self.output_shape = input_shape
        self.reset_parameters()

    def __build_block__(self, input_shape, units, p, slop):
        block = []
        block.append(nn.Linear(input_shape, units, bias=not self.norm))
        if self.norm:
            #block.append(nn.BatchNorm1d(units))
            block.append(nn.LayerNorm(units, eps=1e-5))
        if self.activ:
            #block.append(nn.LeakyReLU())
            block.append(nn.ELU(slop))
            #block.append(nn.GELU())
        if p > 0:
            block.append(nn.Dropout(p))
        return block

    def forward(self, x):
        for layer in self.network:
          tmp = layer(x)
          x = tmp
        return x

    def reset_parameters(self):
        for layer in self.network:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_normal_(layer.weight)
                if not self.norm:
                    layer.bias.data.fill_(0.1)

class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        self.pe = torch.zeros(1, max_len, d_model).to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
        self.pe[0,:, 0::2] = torch.sin(position * div_term)
        self.pe[0,:, 1::2] = torch.cos(position * div_term)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, dim, heads=8, dim_head=None):
        """
        Implementation of multi-head attention layer of the original transformer model.
        einsum and einops.rearrange is used whenever possible
        Args:
            dim: token's dimension, i.e. word embedding vector size
            heads: the number of distinct representations to learn
            dim_head: the dim of the head. In general dim_head<dim.
            However, it may not necessary be (dim/heads)
        """
        super().__init__()
        self.dim_head = (int(dim / heads)) if dim_head is None else dim_head
        _dim = self.dim_head * heads
        self.heads = heads
        self.to_qvk = nn.Linear(dim, _dim * 3, bias=False)
        self.W_0 = nn.Linear( _dim, dim, bias=False)
        self.scale_factor = self.dim_head ** -0.5
    def forward(self, x, mask=None):
        assert x.dim() == 3
        # Step 1
        qkv = self.to_qvk(x)  # [batch, tokens, dim*3*heads ]
        # Step 2
        # decomposition to q,v,k and cast to tuple
        # the resulted shape before casting to tuple will be:
        # [3, batch, heads, tokens, dim_head]
        q, k, v = tuple(rearrange(qkv, 'b t (d k h) -> k b h t d ', k=3, h=self.heads))
        # Step 3
        # resulted shape will be: [batch, heads, tokens, tokens]
        scaled_dot_prod = torch.einsum('b h i d , b h j d -> b h i j', q, k) * self.scale_factor
        if mask is not None:
            assert mask.shape == scaled_dot_prod.shape[2:]
            scaled_dot_prod = scaled_dot_prod.masked_fill(mask, -np.inf)
        attention = torch.softmax(scaled_dot_prod, dim=-1)
        # Step 4. Calc result per batch and per head h
        out = torch.einsum('b h i j , b h j d -> b h i d', attention, v)
        # Step 5. Re-compose: merge heads with dim_head d
        out = rearrange(out, "b h t d -> b t (h d)")
        # Step 6. Apply final linear transformation layer
        return self.W_0(out)

class TransformerBlock(nn.Module):
   """
   Vanilla transformer block from the original paper "Attention is all you need"
   Detailed analysis: https://theaisummer.com/transformer/
   """
   def __init__(self, dim, heads=8, dim_head=None, dim_linear_block=1024, dropout=0.1):
       """
       Args:
           dim: token's vector length
           heads: number of heads
           dim_head: if none dim/heads is used
           dim_linear_block: the inner projection dim
           dropout: probability of droppping values
       """
       super().__init__()
       self.mhsa = MultiHeadSelfAttention(dim=dim, heads=heads, dim_head=dim_head)
       self.drop = nn.Dropout(dropout)
       self.norm_1 = nn.LayerNorm(dim)
       self.norm_2 = nn.LayerNorm(dim)
       #self.linear = NNModel(dim, units=[dim_linear_block, dim], factors=None, dropout=[dropout, dropout], norm=False)
       self.linear = nn.Sequential(
           nn.Linear(dim, dim_linear_block), nn.ReLU(), nn.Dropout(dropout),
           nn.Linear(dim_linear_block, dim), nn.Dropout(dropout)
       )
   def forward(self, x, mask=None):
       y = self.norm_1(self.drop(self.mhsa(x, mask)) + x)
       return self.norm_2(self.linear(y) + y)

class Transformer(nn.Module):
    def __init__(self, blocks, dim, heads=8, dim_head=None, dim_linear_block=1024, dropout=0.1):
       super().__init__()
       self.block_list = [TransformerBlock(dim, heads, dim_head, dim_linear_block=dim_linear_block, dropout=dropout) for _ in range(blocks)]
       self.layers = nn.ModuleList(self.block_list)

    def forward(self, x, mask=None):
       for layer in self.layers:
           x = layer(x, mask)
       return x

class TabTransformer(nn.Module):
    def __init__(self, categories, numerical_nb, blocks, dim, mlp_units, heads=8, dim_head=None, dim_linear_block=1024, dropout=0.1, mlp_dropout=0.00001, apply_pos=False):
        """
        categories: tuple containing the number of unique values within each category
        """
        super().__init__()
        self.embed = Embedder(sum(categories), dim)
        #self.cont_embed = nn.ModuleList([NNModel(1, units=[dim], factors=None, norm=False) for i in range(numerical_nb)])
        self.cont_embed = NNModel(numerical_nb, units=[dim], factors=None, norm=False, activ=False)
        self.pe = PositionalEncoder(dim)
        self.transformer = Transformer(blocks, dim, heads, dim_head, dim_linear_block, dropout)
        #self.input_size = dim * (len(categories) + numerical_nb) + numerical_nb
        self.input_size = dim * (len(categories) + 1) + numerical_nb
        self.mlp = NNModel(self.input_size, units=mlp_units, factors=None, dropout=[mlp_dropout]*len(mlp_units))
        self.norm = nn.LayerNorm(numerical_nb)
        self.output_shape = mlp_units[-1]
        self.apply_pos = apply_pos

    def forward(self, x_cont, x_cat):
        #x = torch.stack([e(x_cont[:,[i]]) for i,e in enumerate(self.cont_embed)], 1)#[batch_shape, num_cont, dim]
        x1_ = self.cont_embed(x_cont)#[batch_shape, dim]
        x = torch.unsqueeze(x1_, 1)
        if x_cat.nelement() != 0: #skipped if there's no categorical feature
            x2 = self.embed(x_cat)#[batch_shape, num_cat, dim]
            #x = torch.cat((x, x2), 1)#[batch_shape, num_cat+num_cont, dim]
            x = torch.cat((x, x2), 1)#[batch_shape, num_cat+1, dim]
        x = self.pe(x) if self.apply_pos else x
        x = self.transformer(x)
        x = x.flatten(1)
        x = torch.cat((x, self.norm(x_cont)), dim = -1)
        return self.mlp(x)

class MDN(nn.Module):
    def __init__(self, shared, clf_nn, n_comp):
        super(MDN, self).__init__()
        self.shared = shared
        self.clf_nn = clf_nn
        self.n_comp = n_comp
        self.pi = nn.Linear(self.clf_nn.output_shape, self.n_comp) if self.n_comp > 1 else None

    def proba_model(self, x):
        model = self.clf_nn(x)
        model = self.pi(model)
        model = nn.Softmax(dim=-1)(model)
        return model

class OneParamMDN(MDN):
    def __init__(self, shared, clf_nn, n_comp, alpha_nn):
        super(OneParamMDN, self).__init__(shared, clf_nn, n_comp)
        self.alpha_nn = alpha_nn
        self.ai = nn.Linear(self.alpha_nn.output_shape, self.n_comp)

    def forward(self, x_cont, x_cat):
        x = self.shared(x_cont, x_cat)
        proba = self.proba_model(x) if self.n_comp > 1 else torch.ones((len(x),1)).to(device)
        alpha = self.alpha_model(x)
        return proba, alpha

    def alpha_model(self, x):
        model = self.alpha_nn(x)
        model = self.ai(model)
        return model

class TwoParamMDN(MDN):
    def __init__(self, shared, clf_nn, n_comp, alpha_nn, beta_nn):
        super(TwoParamMDN, self).__init__(shared, clf_nn, n_comp)
        self.alpha_nn = alpha_nn
        self.beta_nn = beta_nn
        self.ai = nn.Linear(self.alpha_nn.output_shape, self.n_comp)
        self.bi = nn.Linear(self.beta_nn.output_shape, self.n_comp)

    def forward(self, x_cont, x_cat):
        x = self.shared(x_cont, x_cat)
        proba = self.proba_model(x) if self.n_comp > 1 else torch.ones((len(x),1)).to(device)
        alpha = self.alpha_model(x)
        beta = self.beta_model(x)
        return proba, alpha, beta

    def alpha_model(self, x):
        model = self.alpha_nn(x)
        model = self.ai(model)
        return model

    def beta_model(self, x):
        model = self.beta_nn(x)
        model = self.bi(model)
        return model

class BaseParametric:
    def __init__(self, model, numerical_col, categorical_col, resume=None):
        self.model = model.to(device)
        self.losses = {'Epoch': [], 'Train': [], 'Test': [], 'BState': [], 'LState': [], 'LR': []}
        self.numerical_col, self.categorical_col = numerical_col, categorical_col
        self.times = None
        self.optim = resume

    def train_model(self, optim, train_loader, grad_clip, l2_reg):
          total_loss = 0
          self.model = self.model.train()
        #with autograd.detect_anomaly():
          for i, (X1, X2, Y) in enumerate(train_loader):
              X1, X2, Y = X1.to(device), X2.to(device), Y.to(device)
              #self.model.get_weight()
              self.optim.zero_grad()
              loss = self.loss_function(X1, X2, Y, l2_reg)
              loss.backward()
              torch.nn.utils.clip_grad_norm_(self.model.parameters(), grad_clip)
              self.optim.step()
              '''self.model.get_grad()
              print('_'*50)'''
              total_loss += loss.item()
          return total_loss/(i+1)


    def eval_model(self, test_loader):
        self.model = self.model.eval()
        total_loss = 0
        for i, (X1, X2, Y) in enumerate(test_loader):
            X1, X2, Y = X1.to(device), X2.to(device), Y.to(device)
            loss = self.loss_function(X1, X2, Y, l2_reg=0)
            total_loss += loss.item()
        return total_loss/(i+1)#np.abs(-100. - total_loss)


    def fit(self, X_train, Y_train, epoch, lr, opt_kwarg, batch_size=None,  grad_clip=100, momentum=0.9, X_test=None, Y_test=None, l2_reg=0, eval=True, verbose=True, save=True):

        batch_size = len(X_train) if batch_size is None else batch_size
        #X_train_uncens, Y_train_uncens, T_train_uncens, X_train_cens, Y_train_cens, T_train_cens = self.process_data(X_train, T_train, E_train)
        train_load = DataLoader(RegrDataLoader(X_train, Y_train, self.numerical_col, self.categorical_col), batch_size=batch_size, shuffle=True)  # DATALOADER obj
        if X_test is not None:
            #X_test_uncens, Y_test_uncens, T_test_uncens, X_test_cens, Y_test_cens, T_test_cens = self.process_data(X_test, T_test, E_test)
            test_load = DataLoader(RegrDataLoader(X_test, Y_test, self.numerical_col, self.categorical_col), batch_size=batch_size, shuffle=True)  # DATALOADER obj

        best_loss = 1e100
        #self.optim = opt.Adam(self.model.parameters(), lr=lr)
        self.optim = opt.SGD(self.model.parameters(), lr=lr, momentum=momentum, nesterov=True)

        #scheduler = None
        scheduler = opt.lr_scheduler.CyclicLR(self.optim, **opt_kwarg)
        #scheduler = opt.lr_scheduler.ReduceLROnPlateau(self.optim, **opt_kwarg)
        #scheduler = opt.lr_scheduler.MultiStepLR(self.optim, milestones=[28, 120], gamma=0.1)

        eval_score = ''
        for i in range(epoch):
            if verbose:
                print('##### EPOCH ' + str(i) + ' #####')

            train_loss = self.train_model(self.optim, train_load, grad_clip, l2_reg)
            self.losses['LState'] = deepcopy(self.model.state_dict())

            if verbose:
                print('train loss : ', train_loss)
            self.losses['Epoch'].append(i), self.losses['Train'].append(train_loss)

            if X_test is not None:
                valid_loss = self.eval_model(test_load)

                if verbose:
                    print('test loss : ', valid_loss)
                self.losses['Test'].append(valid_loss)

                if scheduler is not None:
                    '''scheduler.step(valid_loss)
                    self.losses['LR'].append(self.optim.param_groups[0]['lr'])'''
                    scheduler.step()
                    self.losses['LR'].append(scheduler.get_last_lr()[0])

                if valid_loss < best_loss:
                    self.losses['BState'] = deepcopy(self.model.state_dict())
                    best_loss = valid_loss
                    print('===========SAVE===========')




    def feature_importance(self, rep, X_test, Y_test, batch_size=None):
        res = np.zeros((rep, X_test.shape[1]))
        batch_size = len(X_train) if batch_size is None else batch_size
        test_load = DataLoader(RegrDataLoader(X_test, Y_test, self.numerical_col, self.categorical_col), batch_size=batch_size, shuffle=True)  # DATALOADER obj
        base_loss = self.eval_model(test_load)
        origin = X_test.copy()
        for i, col in enumerate(X_test.columns):
            for j in range(rep):
                X_test.loc[:,col] = np.random.permutation(X_test.loc[:, col])
                test_load = DataLoader(RegrDataLoader(X_test, Y_test, self.numerical_col, self.categorical_col), batch_size=batch_size, shuffle=True)  # DATALOADER obj
                loss = self.eval_model(test_load)
                res[j, i] = base_loss - loss
                X_test = origin
        res = np.abs(res)
        return {'importances': res, 'importances_mean': np.mean(res, 0), 'importances_std': np.std(res, 0)}


class PoissonModel(BaseParametric):
    def __init__(self, model, numerical_col, categorical_col,resume=None):
        super(PoissonModel, self).__init__(model, numerical_col, categorical_col,)

    def loss_function(self, X1, X2, Y, l2_reg):
        pi, rate = self.model(X1, X2)
        rate = nn.ELU()(rate) + 1 + 1e-15 #POSITIVE
        mix = Categorical(pi)
        comp = Poisson(rate, validate_args=None)
        mixture = MixtureSameFamily(mix, comp)
        loss = mixture.log_prob(Y)
        loss = -torch.mean(loss)
        return loss

    def prdict(self, X, batch_size):
        self.model.eval()
        outputs = {'pi': [], 'rate': [], 'pred': []}
        X1 = torch.tensor(X[self.numerical_col].values.astype(np.float32)).to(device)
        X2 = torch.tensor(X[self.categorical_col].values).to(device)
        data_load = DataLoader(TensorDataset(X1, X2), batch_size=batch_size)
        for i, (X1, X2) in enumerate(data_load):
            X1, X2 = X1.to(device), X2.to(device)
            pi, rate = self.model(X1, X2)
            rate = nn.ELU()(rate) + 1 + 1e-15 #POSITIVE
            mix = Categorical(pi)
            comp = Poisson(rate, validate_args=None)
            mixture = MixtureSameFamily(mix, comp)
            pi, rate, pred = pi.cpu().data.numpy(), rate.cpu().data.numpy(), mixture.mean.cpu().data.numpy()
            outputs['pi'].extend(pi), outputs['rate'].extend(rate), outputs['pred'].extend(pred)
        return outputs

class NegativeBinomialModel(BaseParametric):
    def __init__(self, model, numerical_col, categorical_col,resume=None):
        super(NegativeBinomialModel, self).__init__(model, numerical_col, categorical_col,)

    def loss_function(self, X1, X2, Y, l2_reg):
        pi, count, probs = self.model(X1, X2)
        count = nn.ELU()(count) + 1 + 1e-15 #POSITIVE
        probs = nn.Sigmoid()(probs) #[0, 1)
        mix = Categorical(pi)
        comp = NegativeBinomial(count, probs, validate_args=None)
        mixture = MixtureSameFamily(mix, comp)
        loss = mixture.log_prob(Y)
        loss = -torch.mean(loss)
        return loss

    def prdict(self, X, batch_size):
        self.model.eval()
        outputs = {'pi': [], 'count': [], 'probs': [], 'pred': []}
        X1 = torch.tensor(X[self.numerical_col].values.astype(np.float32)).to(device)
        X2 = torch.tensor(X[self.categorical_col].values).to(device)
        data_load = DataLoader(TensorDataset(X1, X2), batch_size=batch_size)
        for i, (X1, X2) in enumerate(data_load):
            X1, X2 = X1.to(device), X2.to(device)
            pi, count, probs = self.model(X1, X2)
            count = nn.ELU()(count) + 1 + 1e-15 #POSITIVE
            probs = nn.Sigmoid()(probs) #[0, 1)
            mix = Categorical(pi)
            comp = NegativeBinomial(count, probs, validate_args=None)
            mixture = MixtureSameFamily(mix, comp)
            pi, count, probs, pred = pi.cpu().data.numpy(), count.cpu().data.numpy(), probs.cpu().data.numpy(), mixture.mean.cpu().data.numpy()
            outputs['pi'].extend(pi), outputs['count'].extend(count), outputs['probs'].extend(probs), outputs['pred'].extend(pred)
        return outputs

class NormalModel(BaseParametric):
    def __init__(self, model, numerical_col, categorical_col,resume=None):
        super(NormalModel, self).__init__(model, numerical_col, categorical_col,)

    def loss_function(self, X1, X2, Y, l2_reg):
        pi, mu, std = self.model(X1, X2)
        std = nn.ELU()(std) + 1 + 1e-15 #POSITIVE
        mix = Categorical(pi)
        comp = Normal(mu, std, validate_args=None)
        mixture = MixtureSameFamily(mix, comp)
        loss = mixture.log_prob(Y)
        loss = -torch.mean(loss)
        return loss

    def prdict(self, X, batch_size):
        self.model.eval()
        outputs = {'pi': [], 'mu': [], 'std': [], 'pred': []}
        X1 = torch.tensor(X[self.numerical_col].values.astype(np.float32)).to(device)
        X2 = torch.tensor(X[self.categorical_col].values).to(device)
        data_load = DataLoader(TensorDataset(X1, X2), batch_size=batch_size)
        for i, (X1, X2) in enumerate(data_load):
            X1, X2 = X1.to(device), X2.to(device)
            pi, mu, std = self.model(X1, X2)
            std = nn.ELU()(std) + 1 + 1e-15 #POSITIVE
            mix = Categorical(pi)
            comp = Normal(mu, std, validate_args=None)
            mixture = MixtureSameFamily(mix, comp)
            pi, mu, std, pred = pi.cpu().data.numpy(), mu.cpu().data.numpy(), std.cpu().data.numpy(), mixture.mean.cpu().data.numpy()
            outputs['pi'].extend(pi), outputs['mu'].extend(mu), outputs['std'].extend(std), outputs['pred'].extend(pred)
        return outputs

class LogNormalModel(BaseParametric):
    def __init__(self, model, numerical_col, categorical_col,resume=None):
        super(LogNormalModel, self).__init__(model, numerical_col, categorical_col,)

    def loss_function(self, X1, X2, Y, l2_reg):
        pi, mu, std = self.model(X1, X2)
        std = nn.ELU()(std) + 1 + 1e-15 #LOGNORMAL
        mix = Categorical(pi)
        comp = LogNormal(mu, std, validate_args=None)
        mixture = MixtureSameFamily(mix, comp)
        loss = mixture.log_prob(Y)
        loss = -torch.mean(loss)
        return loss

    def prdict(self, X, batch_size):
        self.model.eval()
        outputs = {'pi': [], 'mu': [], 'std': [], 'pred': []}
        X1 = torch.tensor(X[self.numerical_col].values.astype(np.float32)).to(device)
        X2 = torch.tensor(X[self.categorical_col].values).to(device)
        data_load = DataLoader(TensorDataset(X1, X2), batch_size=batch_size)
        for i, (X1, X2) in enumerate(data_load):
            X1, X2 = X1.to(device), X2.to(device)
            pi, mu, std = self.model(X1, X2)
            std = nn.ELU()(std) + 1 + 1e-15 #LOGNORMAL
            mix = Categorical(pi)
            comp = LogNormal(mu, std, validate_args=None)
            mixture = MixtureSameFamily(mix, comp)
            pi, mu, std, pred = pi.cpu().data.numpy(), mu.cpu().data.numpy(), std.cpu().data.numpy(), mixture.mean.cpu().data.numpy()
            outputs['pi'].extend(pi), outputs['mu'].extend(mu), outputs['std'].extend(std), outputs['pred'].extend(pred)
        return outputs

    def mixture_quantile(self, pi, mu, std, q):
        n, solutions, evaluations = pi.shape[0], [], []
        for i in range(n):
            mix = Categorical(pi[i])
            comp = LogNormal(mu[i], std[i], validate_args=None)
            pdf = MixtureSameFamily(mix, comp)

            def objf(x):
                x = torch.from_numpy(x)
                return torch.abs(pdf.cdf(x) - q).data.numpy()
            def bounds():
                dists = list(zip(pdf.component_distribution.mean, pdf.component_distribution.stddev))
                res = [LogNormal(d[0], d[1], validate_args=None).icdf(torch.tensor(q)).data.numpy() for d in dists]
                return np.min(res), np.max(res)

            lb, ub = bounds()
            result = dual_annealing(objf, list(zip([lb], [ub])),maxiter=1000)
            solution = result['x']
            evaluation = objf(solution)
            solutions.append(solution), evaluations.append(evaluation)
        return np.asarray(solutions), np.asarray(evaluations)

def gradient_clipper(model: nn.Module, val: float) -> nn.Module:
    def process_grad(grad):
        grad[grad != grad] = 1e-10
        return torch.clamp(grad, -val, val)
    for parameter in model.parameters():
        parameter.register_hook(lambda grad: process_grad(grad))

    return model

In [9]:
n_comp, epoch, lr, batch_size, d, mlp_d = 3, 50000, 1e-5, 1024, 0.000001, 1e-6
cyclic_kwarg = {'base_lr': lr, 'max_lr': 1e-2, 'step_size_up':250, 'step_size_down':250}
plateau_kwarg = {'factor':0.5, 'patience':200, 'verbose':True, 'min_lr':1e-7, 'mode':'min'}

categories = list(X_train[categorical].nunique())
shared_nn = TabTransformer(categories, len(numerical+periodic), blocks=6, dim=64, mlp_units=[512], heads=8, dim_head=None, dim_linear_block=1024, dropout=0.1, mlp_dropout=mlp_d)
clf_nn = NNModel(shared_nn.output_shape , units=[64], factors=None, dropout=[mlp_d])
alpha_nn = NNModel(shared_nn.output_shape, units=[64], factors=None, dropout=[mlp_d])
beta_nn = NNModel(shared_nn.output_shape, units=[64], factors=None, dropout=[mlp_d])#, dropout=[d,d,d,d]

nn_model = gradient_clipper(TwoParamMDN(shared_nn, clf_nn, n_comp, alpha_nn, beta_nn), 10)
#nn_model = gradient_clipper(OneParamMDN(shared_nn, clf_nn, n_comp, alpha_nn), 10)
#nn_model.load_state_dict(best_state)
print(nn_model)
print(sum(p.numel() for p in nn_model.parameters() if p.requires_grad))
mdn = NegativeBinomialModel(nn_model, numerical+periodic, categorical)
mdn.fit(X_train, Y_train, epoch, lr, cyclic_kwarg, batch_size=batch_size, grad_clip=10, momentum=0.9, X_test=X_valid, Y_test=Y_valid, l2_reg=0, eval=False, verbose=True)

TwoParamMDN(
  (shared): TabTransformer(
    (embed): Embedder(
      (embeddings): Embedding(7, 64)
    )
    (cont_embed): NNModel(
      (network): ModuleList(
        (0): Linear(in_features=18, out_features=64, bias=True)
      )
    )
    (pe): PositionalEncoder()
    (transformer): Transformer(
      (layers): ModuleList(
        (0-5): 6 x TransformerBlock(
          (mhsa): MultiHeadSelfAttention(
            (to_qvk): Linear(in_features=64, out_features=192, bias=False)
            (W_0): Linear(in_features=64, out_features=64, bias=False)
          )
          (drop): Dropout(p=0.1, inplace=False)
          (norm_1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm_2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (linear): Sequential(
            (0): Linear(in_features=64, out_features=1024, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.1, inplace=False)
            (3): Linear(in_features=1024, out_features=64, bias=T

KeyboardInterrupt: ignored

In [10]:
best_state = deepcopy(mdn.losses['BState'])
mdn.model.load_state_dict(best_state), np.min(mdn.losses['Test'])
print(np.min(mdn.losses['Test']))
fig = make_subplots(rows=2, cols=1)
s = 0
fig.append_trace(go.Scatter(x=mdn.losses['Epoch'][s:], y=mdn.losses['Train'][s:],mode='lines',name='Train'), row=1, col=1)
fig.append_trace(go.Scatter(x=mdn.losses['Epoch'][s:], y=mdn.losses['Test'][s:],mode='lines',name='Test'), row=1, col=1)
fig.append_trace(go.Scatter(x=mdn.losses['Epoch'][s:], y=mdn.losses['LR'][s:],mode='lines',name='LR'), row=2, col=1)
fig.update_layout(height=1000, width=1500, title_text="Stacked Subplots")
fig.show()

5.509681701660156


In [None]:
outputs = mdn.prdict(X_test, len(X_test))
pi, rate, mdn_mu = np.asarray(outputs['pi']), np.asarray(outputs['rate']), np.asarray(outputs['pred'])
#nn_output = pd.DataFrame(torch.cat((pi, mu, std), 1).data.cpu().numpy())
'''mdn_lb, evaluations_lb = mdn.mixture_quantile(pi, mu, std, 0.025)
mdn_ub, evaluations_ub = mdn.mixture_quantile(pi, mu, std, 0.975)'''
'''mdn_lb = y_scaler.inverse_transform(mdn_lb).ravel()
mdn_ub = y_scaler.inverse_transform(mdn_ub).ravel()'''
pred = pd.DataFrame({"mu": mdn_mu, "lb": mdn_mu, "ub": mdn_mu})
pred = pd.concat((test.reset_index(drop=True), pred), 1)
pred


In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only.



Unnamed: 0,temp,dew,humidity,precip,precipprob,snow,snowdepth,windspeed,winddir,sealevelpressure,...,pickup_datetime,sin_hourofday,cos_hourofday,sin_dayofweek,cos_dayofweek,sin_dayofyear,cos_dayofyear,mu,lb,ub
0,0.549407,0.507307,-0.198089,-1.0,-1.0,-1.0,-1.0,-0.882353,0.894444,0.067511,...,2016-06-15 00:00:00,0.000000e+00,1.000000,8.660254e-01,-0.5,-4.950088e-01,0.868888,213.274323,213.274323,213.274323
1,0.498024,0.515658,-0.095005,-1.0,-1.0,-1.0,-1.0,-0.717647,0.638889,0.071730,...,2016-06-15 01:00:00,2.697968e-01,0.962917,8.660254e-01,-0.5,-4.950088e-01,0.868888,87.504013,87.504013,87.504013
2,0.486166,0.511482,-0.081579,-1.0,-1.0,-1.0,-1.0,-0.882353,0.916667,0.071730,...,2016-06-15 02:00:00,5.195840e-01,0.854419,8.660254e-01,-0.5,-4.950088e-01,0.868888,51.908752,51.908752,51.908752
3,0.470356,0.486430,-0.101149,-1.0,-1.0,-1.0,-1.0,-0.905882,0.916667,0.050633,...,2016-06-15 03:00:00,7.308360e-01,0.682553,8.660254e-01,-0.5,-4.950088e-01,0.868888,48.767738,48.767738,48.767738
4,0.486166,0.453027,-0.182615,-1.0,-1.0,-1.0,-1.0,-0.581176,0.650000,0.050633,...,2016-06-15 04:00:00,8.878852e-01,0.460065,8.660254e-01,-0.5,-4.950088e-01,0.868888,61.324760,61.324760,61.324760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,0.739130,0.753653,-0.085448,-1.0,-1.0,-1.0,-1.0,-0.510588,0.055556,0.109705,...,2016-06-30 19:00:00,-8.878852e-01,0.460065,1.224647e-16,-1.0,-2.449294e-16,1.000000,523.279236,523.279236,523.279236
380,0.739130,0.795407,-0.012857,-1.0,-1.0,-1.0,-1.0,-0.444706,0.111111,0.109705,...,2016-06-30 20:00:00,-7.308360e-01,0.682553,1.224647e-16,-1.0,-2.449294e-16,1.000000,505.275818,505.275818,505.275818
381,0.707510,0.816284,0.085675,-1.0,-1.0,-1.0,-1.0,-0.628235,0.055556,0.113924,...,2016-06-30 21:00:00,-5.195840e-01,0.854419,1.224647e-16,-1.0,-2.449294e-16,1.000000,483.331543,483.331543,483.331543
382,0.683794,0.816284,0.133462,-1.0,-1.0,-1.0,-1.0,-0.835294,0.638889,0.139241,...,2016-06-30 22:00:00,-2.697968e-01,0.962917,1.224647e-16,-1.0,-2.449294e-16,1.000000,425.263153,425.263153,425.263153


In [12]:
outputs = mdn.prdict(X_test, len(X_test))
pi, mu, std, mdn_mu = np.asarray(outputs['pi']), np.asarray(outputs['count']), np.asarray(outputs['probs']), np.asarray(outputs['pred'])
#nn_output = pd.DataFrame(torch.cat((pi, mu, std), 1).data.cpu().numpy())
'''pi, mu, std = torch.from_numpy(pi), torch.from_numpy(mu), torch.from_numpy(std)
mdn_lb, evaluations_lb = mdn.mixture_quantile(pi, mu, std, 0.025)
mdn_ub, evaluations_ub = mdn.mixture_quantile(pi, mu, std, 0.975)'''
#mdn_mu = YScaler.inverse_transform(mdn_mu[:,np.newaxis]).ravel()
'''mdn_lb = y_scaler.inverse_transform(mdn_lb).ravel()
mdn_ub = y_scaler.inverse_transform(mdn_ub).ravel()'''
pred = pd.DataFrame({"mu": mdn_mu, "lb": mdn_mu, "ub": mdn_mu})
pred = pd.concat((test.reset_index(drop=True), pred), 1)
pred


In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only.



Unnamed: 0,temp,dew,humidity,precip,precipprob,snow,snowdepth,windspeed,winddir,sealevelpressure,...,pickup_datetime,sin_hourofday,cos_hourofday,sin_dayofweek,cos_dayofweek,sin_dayofyear,cos_dayofyear,mu,lb,ub
0,0.549407,0.507307,-0.198089,-1.0,-1.0,-1.0,-1.0,-0.882353,0.894444,0.067511,...,2016-06-15 00:00:00,0.000000e+00,1.000000,8.660254e-01,-0.5,-4.950088e-01,0.868888,293.415558,293.415558,293.415558
1,0.498024,0.515658,-0.095005,-1.0,-1.0,-1.0,-1.0,-0.717647,0.638889,0.071730,...,2016-06-15 01:00:00,2.697968e-01,0.962917,8.660254e-01,-0.5,-4.950088e-01,0.868888,118.446564,118.446564,118.446564
2,0.486166,0.511482,-0.081579,-1.0,-1.0,-1.0,-1.0,-0.882353,0.916667,0.071730,...,2016-06-15 02:00:00,5.195840e-01,0.854419,8.660254e-01,-0.5,-4.950088e-01,0.868888,67.490303,67.490303,67.490303
3,0.470356,0.486430,-0.101149,-1.0,-1.0,-1.0,-1.0,-0.905882,0.916667,0.050633,...,2016-06-15 03:00:00,7.308360e-01,0.682553,8.660254e-01,-0.5,-4.950088e-01,0.868888,51.926243,51.926243,51.926243
4,0.486166,0.453027,-0.182615,-1.0,-1.0,-1.0,-1.0,-0.581176,0.650000,0.050633,...,2016-06-15 04:00:00,8.878852e-01,0.460065,8.660254e-01,-0.5,-4.950088e-01,0.868888,51.876167,51.876167,51.876167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,0.739130,0.753653,-0.085448,-1.0,-1.0,-1.0,-1.0,-0.510588,0.055556,0.109705,...,2016-06-30 19:00:00,-8.878852e-01,0.460065,1.224647e-16,-1.0,-2.449294e-16,1.000000,482.316284,482.316284,482.316284
380,0.739130,0.795407,-0.012857,-1.0,-1.0,-1.0,-1.0,-0.444706,0.111111,0.109705,...,2016-06-30 20:00:00,-7.308360e-01,0.682553,1.224647e-16,-1.0,-2.449294e-16,1.000000,503.014923,503.014923,503.014923
381,0.707510,0.816284,0.085675,-1.0,-1.0,-1.0,-1.0,-0.628235,0.055556,0.113924,...,2016-06-30 21:00:00,-5.195840e-01,0.854419,1.224647e-16,-1.0,-2.449294e-16,1.000000,509.656433,509.656433,509.656433
382,0.683794,0.816284,0.133462,-1.0,-1.0,-1.0,-1.0,-0.835294,0.638889,0.139241,...,2016-06-30 22:00:00,-2.697968e-01,0.962917,1.224647e-16,-1.0,-2.449294e-16,1.000000,421.112579,421.112579,421.112579


In [13]:
def mase(train_y, test_y, pred):
    n = train_y.shape[0]
    d = np.abs(np.diff(train_y)).sum()/(n-1)
    errors = np.abs(test_y - pred)
    return errors.mean()/d

def mape(test_y, pred):
    return np.round(np.mean(np.abs(100*(test_y-pred)/(test_y + 1e-9))), 0)

def rmspe(test_y, pred):
    return (np.sqrt(np.mean(np.square((test_y - pred) / (test_y + 1e-9))))) * 100

def persistence(train_y, test_y):
    predictions, history = [], list(np.copy(train_y))
    for i in test_y:
        predictions.append(history[-1])
        history.append(i)
    return np.asarray(predictions)

def mda(actual, predicted):
    """ Mean Directional Accuracy """
    return np.mean((np.sign(actual[1:] - actual[:-1]) == np.sign(predicted[1:] - predicted[:-1])).astype(int))

def wape(true, pred):
    return np.sum(np.abs(true - pred))/np.sum(true)

naive = persistence(Y_train, Y_test)
errors = {'MAE':[mean_absolute_error(Y_test, pred['mu'].values)],
        'RMSE':[mean_squared_error(Y_test, pred['mu'].values)],
        'RMSPE': [rmspe(Y_test, pred['mu'].values)],
        'MAPE':[mape(Y_test, pred['mu'].values)],
        'R2': [r2_score(Y_test, pred['mu'].values)],
        'MASE':[mase(Y_train, Y_test, pred['mu'].values)],
        'MDA': [mda(Y_test, pred['mu'].values)],
        'WAPE':[wape(Y_test, pred['mu'].values)]}
errors = pd.DataFrame(errors, index =['THIS', 'NAIVE'])
errors

Unnamed: 0,MAE,RMSE,RMSPE,MAPE,R2,MASE,MDA,WAPE
THIS,48.587022,3933.484105,36.077478,20.0,0.779908,0.884865,0.673629,0.151431
NAIVE,48.587022,3933.484105,36.077478,20.0,0.779908,0.884865,0.673629,0.151431


In [14]:
fig = plot_gp(pred['mu'], pred['lb'], pred['ub'], pred[time_column], pred[out_column], train[time_column], train[out_column], name='', samples=[], layout='h')
fig.show()

In [None]:
imp = mdn.feature_importance(10, X_test, Y_test, batch_size=4096)
fig = go.Figure()
for i in range(X_test.shape[1]):
    fig.add_trace(go.Box(x=imp['importances'][:, i], name=X_test.columns[i]))
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

