In [2]:
# standard library
import os

import pandas as pd
import seaborn as sns
import numpy as np
from highstreets.data import make_dataset as mhsd
from dotenv import load_dotenv, find_dotenv

import numpyro
from numpyro.infer import MCMC, NUTS, Predictive, init_to_feasible
from numpyro.infer.reparam import TransformReparam
import numpyro.distributions as dist
from jax import random
import arviz as az

import dill

assert numpyro.__version__.startswith("0.9.2")

load_dotenv(find_dotenv())

YOY_FILE = os.environ.get("YOY_FILE")
PROFILE_FILE = os.environ.get("PROFILE_FILE")
PROJECT_ROOT = os.environ.get("PROJECT_ROOT")

%load_ext autoreload
%autoreload 2

sns.set_theme(style="darkgrid")
sns.set_context("notebook")

### Load mastercard spend data along with high street profiles and setup data arrays and time vectors for convenience


In [3]:
hsp = pd.read_excel(PROFILE_FILE)
hsd_yoy = pd.read_csv(YOY_FILE, parse_dates=["week_start"])

# some important dates
nb_dates = pd.to_datetime(
    [
        "2020-03-24",  # first lockdown starts
        "2020-06-15",  # shops reopen
        "2020-11-05",  # second lockdown starts
        "2020-12-02",  # back to 'tier 2' (i.e. partial reopening)
        "2021-01-05",  # third lockdown starts
        "2021-04-12",  # shops reopen
    ]
)

# average weekday and weekend expenditure (should probably relax this
# later - no need to lose information)
hsd_yoy_minimal = mhsd.avg_retail_wd_we(hsd_yoy, "yoy_").dropna(how="any", axis="rows")

dates_2020 = ("2020-04-15", "2020-10-31")
dates_2020_full = ("2020-01-01", "2020-12-31")
dates_2021 = ("2021-02-12", "2021-08-31")
dates_full = ("2020-01-01", "2021-12-31")

data_2020 = mhsd.extract_data_array(hsd_yoy_minimal, dates_2020, "txn_amt")
data_2021 = mhsd.extract_data_array(hsd_yoy_minimal, dates_2021, "txn_amt")
data_2020_full = mhsd.extract_data_array(hsd_yoy_minimal, dates_2020_full, "txn_amt")
data_full = mhsd.extract_data_array(hsd_yoy_minimal, dates_full, "txn_amt")

start_times = {"2020": "2020-04-01", "2021": "2021-04-12", "full": "2020-04-01"}
tvecs = {"2020": data_2020.index, "2021": data_2021.index, "full": data_full.index}
arrays = {
    "2020": np.transpose(data_2020.to_numpy()),
    "2021": np.transpose(data_2021.to_numpy()),
    "full": np.transpose(data_full.to_numpy()),
}

### Set up data to be used in hierarcical regressions

In [4]:
predictors = [
    "percentage of commercial addresses (%)",
    "total estimated number of home workers",
    "Sum_y2019_07wd",
]
full_data = (
    hsd_yoy_minimal.join(
        hsp[["highstreet_id"] + predictors],
        on="highstreet_id",
        how="left",
        lsuffix="_left",
        rsuffix="_right",
    )
    .drop(["highstreet_id_right", "txn_cnt"], axis=1)
    .rename(columns={"highstreet_id_left": "highstreet_id"})
)
full_data["weeks_since_start"] = (
    full_data.index - pd.to_datetime(nb_dates[0])
) / pd.Timedelta(1, "W")
train = full_data.loc[nb_dates[0] : nb_dates[1]]
train.head()

Unnamed: 0_level_0,week_start,highstreet_id,highstreet_name,txn_amt,percentage of commercial addresses (%),total estimated number of home workers,Sum_y2019_07wd,weeks_since_start
week_start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-30,2020-03-30,1,"Pimlico Road, Belgravia",0.039,10.004793,10374.0,7354.840627,0.857143
2020-03-30,2020-03-30,2,"Queensway, Westbourne Grove, Bayswater",0.198,6.657224,4388.0,1141.060698,0.857143
2020-03-30,2020-03-30,3,"Carshalton Road, Carshalton.",0.0,3.311258,3632.0,1410.542551,0.857143
2020-03-30,2020-03-30,4,"Mitcham Road, Croydon.",1.297,8.472856,2879.0,2159.100879,0.857143
2020-03-30,2020-03-30,5,"Bridge Road, Chessington.",0.0,7.789256,3672.0,2693.80858,0.857143


### Define hierarchical regression model

In [6]:
def model(highstreet_id, weeks, hs_obs=None):
    mu_a = numpyro.sample("mu_a", dist.Normal(0.0, 1.0))
    sigma_a = numpyro.sample("sigma_a", dist.HalfNormal(1.0))
    mu_b = numpyro.sample("mu_b", dist.Normal(0.0, 1.0))
    sigma_b = numpyro.sample("sigma_b", dist.HalfNormal(1.0))

    unique_hs_ids = np.unique(highstreet_id)
    n_hs = len(unique_hs_ids)

    with numpyro.plate("plate_i", n_hs):
        a = numpyro.sample("a", dist.Normal(mu_a, sigma_a))
        b = numpyro.sample("b", dist.Normal(mu_b, sigma_b))

    sigma = numpyro.sample("sigma", dist.HalfNormal(1.0))
    hs_est = a[highstreet_id] + b[highstreet_id] * weeks

    with numpyro.plate("data", len(highstreet_id)):
        numpyro.sample("obs", dist.Normal(hs_est, sigma), obs=hs_obs)

### Define non-centered version of the same model 

In [17]:
def model_noncentered(highstreet_id, weeks, hs_obs=None):
    mu_a = numpyro.sample("mu_a", dist.Normal(0.0, 1.0))
    sigma_a = numpyro.sample("sigma_a", dist.HalfNormal(1.0))
    mu_b = numpyro.sample("mu_b", dist.Normal(0.0, 1.0))
    sigma_b = numpyro.sample("sigma_b", dist.HalfNormal(1.0))

    unique_hs_ids = np.unique(highstreet_id)
    n_hs = len(unique_hs_ids)

    with numpyro.plate("plate_i", n_hs):
        with numpyro.handlers.reparam(
            config={
                "a": TransformReparam(),
                "b": TransformReparam(),
            }
        ):
            a = numpyro.sample(
                "a",
                dist.TransformedDistribution(
                    dist.Normal(0.0, 1.0),
                    dist.transforms.AffineTransform(mu_a, sigma_a),
                ),
            )
            b = numpyro.sample(
                "b",
                dist.TransformedDistribution(
                    dist.Normal(0.0, 1.0),
                    dist.transforms.AffineTransform(mu_b, sigma_b),
                ),
            )

    sigma = numpyro.sample("sigma", dist.HalfNormal(1.0))
    hs_est = a[highstreet_id] + b[highstreet_id] * weeks

    with numpyro.plate("data", len(highstreet_id)):
        numpyro.sample("obs", dist.Normal(hs_est, sigma), obs=hs_obs)

In [18]:
hs_obs = train["txn_amt"].values
weeks = train["weeks_since_start"].values
highstreet_id = train["highstreet_id"].values

### Sample from the posterior

In [None]:
nuts_kernel = NUTS(model, init_strategy=init_to_feasible())

mcmc = MCMC(nuts_kernel, num_samples=2000, num_warmup=2000)
rng_key = random.PRNGKey(0)
mcmc.run(rng_key, highstreet_id, weeks, hs_obs=hs_obs)
posterior_samples = mcmc.get_samples()

#### Save the model samples

In [23]:
dill_file = PROJECT_ROOT + "/models/bayesian/posterior_samples_basic.pkl"
with open(dill_file, "wb") as f:
    dill.dump(
        posterior_samples,
        f,
    )

In [19]:
nuts_kernel_noncentered = NUTS(model_noncentered, init_strategy=init_to_feasible())

mcmc_noncentered = MCMC(nuts_kernel_noncentered, num_samples=2000, num_warmup=2000)
rng_key = random.PRNGKey(0)
mcmc_noncentered.run(rng_key, highstreet_id, weeks, hs_obs=hs_obs)

posterior_samples_noncentered = mcmc_noncentered.get_samples()

sample: 100%|██████████| 4000/4000 [12:12<00:00,  5.46it/s, 1023 steps of size 9.10e-04. acc. prob=0.78]


In [None]:
dill_file = PROJECT_ROOT + "/models/bayesian/posterior_samples_noncentered_basic.pkl"
with open(dill_file, "wb") as f:
    dill.dump(
        posterior_samples_noncentered,
        f,
    )

### Load previous samples

In [5]:
dill_file = PROJECT_ROOT + "models/bayesian/posterior_samples_basic.pkl"
with open(dill_file, "rb") as f:
    posterior_samples = dill.load(f)

EOFError: Ran out of input

### Trace plot for centered model

In [None]:
data = az.from_numpyro(mcmc)
az.plot_trace(data, compact=True, figsize=(14, 32))

### Trace plot for unnoncentered model

In [None]:
data_noncentered = az.from_numpyro(mcmc_noncentered)
az.plot_trace(data_noncentered, compact=True, figsize=(14, 32))

### Examine model output

In [None]:
mcmc.print_summary()

In [None]:
mcmc_noncentered.print_summary()

### Compare predictions to data

In [122]:
weeks_pred = train[train["highstreet_id"] == 1]["weeks_since_start"]
pred_template = []
for i in np.unique(train["highstreet_id"]):
    df = pd.DataFrame(columns=["highstreet_id", "weeks"])
    df["weeks"] = weeks_pred
    df["highstreet_id"] = i
    pred_template.append(df)
pred_template = pd.concat(pred_template, ignore_index=True)

highstreet_id = pred_template["highstreet_id"].values
weeks = pred_template["weeks"].values
predictive = Predictive(model, posterior_samples, return_sites=["sigma", "obs"])
samples_predictive = predictive(random.PRNGKey(0), highstreet_id, weeks, None)

df = pd.DataFrame(
    columns=["highstreet_id", "weeks_since_start", "txn_amt_pred", "sigma"]
)
df["highstreet_id"] = pred_template["highstreet_id"]
df["weeks_since_start"] = pred_template["weeks"]
df["txn_amt_pred"] = samples_predictive["obs"].T.mean(axis=1)
df["sigma"] = samples_predictive["obs"].T.std(axis=1)
df["hs_inf"] = df["txn_amt_pred"] - df["sigma"]
df["hs_sup"] = df["txn_amt_pred"] + df["sigma"]
df = pd.merge(
    df,
    train[["highstreet_id", "weeks_since_start", "txn_amt"]],
    how="left",
    on=["highstreet_id", "weeks_since_start"],
)
df = df.rename(columns={"txn_amt": "txn_amt_true"})
df.head()