In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn import tree
from sklearn.svm import SVR

import mlflow

import missingno as msno

from highstreets.models import train_model as tm
from highstreets.data import make_dataset as mhsd
from highstreets.visualisation import visualise as vhsd
from highstreets.features import build_features as bf

from dotenv import load_dotenv, find_dotenv
import os

load_dotenv(find_dotenv())

YOY_FILE = os.environ.get("YOY_FILE")
PROFILE_FILE = os.environ.get("PROFILE_FILE")
PROJECT_ROOT = os.environ.get("PROJECT_ROOT")
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI")

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("High street profile regression experiments")
mlflow.sklearn.autolog(disable=True)

%load_ext autoreload
%autoreload 2

sns.set_theme(style="darkgrid")
sns.set_context("notebook")

### Load Mastercard spend data along with high street profiles, and set up data arrays and time vectors for convenience

In [None]:
# High street profiles (Excel) and the year-on-year spend series (CSV).
hsp = pd.read_excel(PROFILE_FILE)
hsd_yoy = pd.read_csv(YOY_FILE, parse_dates=["week_start"])

# Key dates used to slice the series and passed to the grouped plots.
nb_dates = pd.to_datetime(
    [
        "2020-03-24",  # first lockdown starts
        "2020-06-15",  # shops reopen
        "2020-11-05",  # second lockdown starts
        "2020-12-02",  # back to 'tier 2' (i.e. partial reopening)
        "2021-01-05",  # third lockdown starts
        "2021-04-12",  # shops reopen
    ]
)

# Average weekday and weekend expenditure (should probably relax this
# later - no need to lose information).
hsd_yoy_minimal = mhsd.stack_retail_we_wd(hsd_yoy, "yoy_")

# (start, end) windows for each period of interest.
dates_2020 = ("2020-04-15", "2020-10-31")
dates_2020_full = ("2020-01-01", "2020-12-31")
dates_2021 = ("2021-02-12", "2021-08-31")
dates_full = ("2020-01-01", "2021-12-31")

# One "txn_amt" data frame per window, extracted in the order listed.
data_2020, data_2021, data_2020_full, data_full = (
    mhsd.extract_data_array(hsd_yoy_minimal, window, "txn_amt")
    for window in (dates_2020, dates_2021, dates_2020_full, dates_full)
)

# Per-period lookups used by the regression cells: regression start time,
# time vector (the frame's index) and the transposed value array.
start_times = {
    "2020": "2020-04-01",
    "2021": "2021-04-12",
    "full": "2020-04-01",
}
_period_frames = {"2020": data_2020, "2021": data_2021, "full": data_full}
tvecs = {period: frame.index for period, frame in _period_frames.items()}
arrays = {period: frame.to_numpy().T for period, frame in _period_frames.items()}

### Run k-means on 2020, 2021, and full data separately:

In [None]:
n_clus = 3
max_iter = 50
tol = 1e-2
# Fixed seed: with init="random" and no seed, cluster labels and centroids
# change on every kernel restart, so results could not be reproduced.
kmeans_seed = 0


def fit_kmeans(profiles, seed=kmeans_seed):
    """Fit k-means to one period's array and return the fitted estimator.

    Parameters
    ----------
    profiles : array-like
        Samples to cluster, one row per sample (here: the transposed
        period arrays built in the data-loading cell).
    seed : int, optional
        Value passed as ``random_state``; defaults to ``kmeans_seed``.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted model (``KMeans.fit`` returns the estimator itself).
    """
    model = KMeans(
        init="random",
        n_clusters=n_clus,
        random_state=seed,
        max_iter=max_iter,
        tol=tol,
        copy_x=True,
        verbose=0,
        n_init=10,
    )
    return model.fit(profiles)


# Same settings for every period; only the input array differs.
kmeans20 = fit_kmeans(arrays["2020"])
kmeans21 = fit_kmeans(arrays["2021"])
kmeansfull = fit_kmeans(arrays["full"])

## Regressions: trying to summarise trends across time

In [None]:
# Unpooled regression - fit slope and intercept independently for each high street
# Unpooled regression - slope and intercept are fitted independently for
# each high street, once per period.
fit_lines = {}
reg_model = {}

for period in ("2020", "2021", "full"):
    reg_model[period], fit_lines[period] = mhsd.get_fit_lines(
        start_times[period], tvecs[period], arrays[period], robust=False
    )

### Plot each highstreet with lines fit to each recovery period

In [None]:
# NOTE(review): this call is deliberately left commented out, so the cell does
# nothing when run. Uncomment to plot every high street together with the
# lines fitted above (presumably disabled because of the number of figures -
# confirm before re-enabling or deleting).
# vhsd.plot_all_profiles_full(
#     {"2020": data_2020, "2021": data_2021, "full": data_full}, fit_lines
# )

### Sort highstreets by their 2020 mean and 2020 fit slope and plot by group

In [None]:
# Sort keys for the high streets:
#   1. mean of each column of the 2020 data between nb_dates[0] and nb_dates[3]
#   2. coefficients of the line fitted to the 2020 period
mean_2020 = data_2020_full.loc[nb_dates[0] : nb_dates[3]].mean().to_numpy()
sort_cols = (mean_2020, reg_model["2020"].coef_)

# The full-period series is plotted; xlim restricts the view to 2020.
plot_array = data_full.to_numpy().T
plot_tvec = data_full.index
filename = "2020-sorted-by-mean-slope.pdf"

vhsd.plot_highstreets_grouped(
    plot_array,
    plot_tvec,
    sort_cols,
    nb_dates,
    filename,
    xlim=("2020-01-01", "2020-12-31"),
    figure_title="2020",
    n_grp=6,
)

### Sort highstreets by their 2021 mean and slope and plot in groups

In [None]:
# Sort keys for the high streets:
#   1. mean of each column of the 2021 data from 2021-03-01 up to the last
#      entry of nb_dates
#   2. coefficients of the line fitted to the 2021 period
mean_2021 = data_2021.loc["2021-03-01" : nb_dates[-1]].mean().to_numpy()
sort_cols = (mean_2021, reg_model["2021"].coef_)

# The full-period series is plotted; xlim restricts the view to 2021.
plot_array = data_full.to_numpy().T
plot_tvec = data_full.index
filename = "2021-sorted-by-mean-slope.pdf"

vhsd.plot_highstreets_grouped(
    plot_array,
    plot_tvec,
    sort_cols,
    nb_dates,
    filename,
    xlim=("2021-03-01", "2021-09-01"),
    figure_title="2021",
    n_grp=6,
)

### Sort highstreets by their 2020 means and slopes and plot across full period sorted into groups

In [None]:
# Sort keys for the high streets (2020 parameters, applied to the full period):
#   1. mean of each column of the 2020 data between 2020-03-14 and 2020-11-01
#   2. coefficients of the line fitted to the 2020 period
mean_2020_window = data_2020_full.loc["2020-03-14":"2020-11-01"].mean().to_numpy()
sort_cols = (mean_2020_window, reg_model["2020"].coef_)

plot_array = data_full.to_numpy().T
plot_tvec = data_full.index
filename = "full-sorted-by-2020-mean-slope.pdf"

vhsd.plot_highstreets_grouped(
    plot_array,
    plot_tvec,
    sort_cols,
    nb_dates,
    filename,
    xlim=("2020-01-01", "2021-09-01"),
    figure_title="Full period (sorted by 2020 params)",
    n_grp=6,
)

### Append 2020 & 2021 means and fit lines to High Street Profiles for further analysis

In [None]:
# Append the spend-derived features to the high street profiles, then pass
# the result through the profile cleaning step.
stats = bf.clean_hs_profiles(
    mhsd.append_profile_features(hsp, data_full, reg_model)
)
# Show which columns are available for the analysis below.
stats.columns

In [None]:
# Nullity matrix of the profile features, to see where values are missing.
msno.matrix(stats);  # trailing ';' keeps the returned object out of the cell output

### Look at correlations between features

In [None]:
# Correlate only the numeric/boolean columns. `stats` still carries the
# categorical features at this point (they are one-hot encoded later), and
# pandas >= 2.0 raises on non-numeric columns in DataFrame.corr() instead of
# silently dropping them as older versions did.
feature_corr = stats.select_dtypes(include=["number", "bool"]).corr()

# Explicit figure/axes so the heatmap and the saved file refer to the same
# figure regardless of pyplot's current-figure state.
fig_corr, ax_corr = plt.subplots(figsize=(30, 16))
sns.heatmap(feature_corr, annot=True, cmap="viridis", ax=ax_corr)

# Build the output path from components rather than by string concatenation.
fig_corr.savefig(
    os.path.join(PROJECT_ROOT, "reports", "figures", "feature-correlations.pdf")
)

### Fit some simple regressions

In [None]:
# One-hot encode categorical features
data = pd.get_dummies(stats)

target_variables = [
    "mean 2020",
    "mean 2021",
    "slope 2020",
    "slope 2021",
]

target_col = "mean 2020"

y = data[target_col]
X = data.drop(columns=target_variables)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

### Baseline model is ridge regression with regularization parameter chosen with cross-validation

In [None]:
with mlflow.start_run():
    # Baseline: ridge regression, alpha searched over 200, 300, ..., 900.
    ridge_model = Ridge()
    tuned_params = {"model__alpha": list(range(200, 1000, 100))}

    best_rrm, fig_train_test, _ = tm.run_experiment_w_cv(
        ridge_model, tuned_params, X_train, X_test, y_train, y_test
    )

    # Record held-out metrics, the target used, the model and the figure
    # against this MLflow run.
    ridge_metrics = tm.evaluate(best_rrm, X_test, y_test)
    mlflow.log_metrics(ridge_metrics)
    mlflow.log_param("regression target", target_col)
    mlflow.sklearn.log_model(best_rrm, "Ridge Regression Model")
    mlflow.log_figure(fig_train_test, "train_test_results.png")

### Huber regression to see if robustness to outliers in the target variable helps

In [None]:
with mlflow.start_run():
    # Huber loss variant; epsilon is held at a single value while alpha is
    # searched over three candidates.
    huber_model = HuberRegressor()
    tuned_params = {"model__epsilon": [1.35], "model__alpha": [300, 400, 600]}

    best_huber, fig, ax = tm.run_experiment_w_cv(
        huber_model, tuned_params, X_train, X_test, y_train, y_test
    )

    # Record held-out metrics, the target used, the model and the figure
    # against this MLflow run.
    huber_metrics = tm.evaluate(best_huber, X_test, y_test)
    mlflow.log_metrics(huber_metrics)
    mlflow.log_param("regression target", target_col)
    mlflow.sklearn.log_model(best_huber, "Huber Regression Model")
    mlflow.log_figure(fig, "train_test_results.png")

### Next try Support Vector Regression with hyperparameters chosen by cross-validation

In [None]:
# Hyperparameter grid for support vector regression.
tuned_parameters = {
    "model__kernel": ["rbf", "poly"],
    "model__C": [0.001, 0.01, 0.1, 0.2],
    "model__epsilon": [0.0009, 0.001, 0.0015, 0.002],
}

svr_model = SVR(gamma="scale")

# run_experiment_w_cv is unpacked into three values (best model, figure, and
# a third item) by the ridge and Huber cells. Binding the whole result to
# `best_svr`, as this cell used to, handed a tuple to evaluate() instead of
# the fitted model.
best_svr, fig_svr, _ = tm.run_experiment_w_cv(
    svr_model, tuned_parameters, X_train, X_test, y_train, y_test
)

# evaluate()'s result is passed directly to mlflow.log_metrics in the ridge
# and Huber cells, i.e. it is used as a metrics mapping, so keep it whole
# rather than unpacking it into three separate names.
svr_metrics = tm.evaluate(best_svr, X_test, y_test)
svr_metrics

### Decision tree regressors

In [None]:
# Hyperparameter grid for the decision tree regressor.
# NOTE(review): the "poisson" criterion places constraints on the target's
# sign in scikit-learn - confirm the chosen target satisfies them, otherwise
# those grid candidates will fail to fit.
tuned_parameters = {
    "model__max_depth": [2, 4, 6, 8],
    "model__criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
}

dtr_model = tree.DecisionTreeRegressor()

# run_experiment_w_cv is unpacked into three values (best model, figure, and
# a third item) by the ridge and Huber cells; unpack here as well so that
# `best_dtr` is the fitted model rather than the whole tuple.
best_dtr, fig_dtr, _ = tm.run_experiment_w_cv(
    dtr_model, tuned_parameters, X_train, X_test, y_train, y_test
)

### Here we try sorting and grouping highstreets by 2020 mean and slope and seeing if these groupings can be predicted from profile data

In [None]:
# Derive the grouping columns from `stats` (presumably adds "group_*" columns
# such as the "group_mean 2020" column used in the next cell - confirm
# against bf.add_split_group_vals).
stats_w_grps = bf.add_split_group_vals(stats)

In [None]:
# Show which row labels fall into each value of the "group_mean 2020" column.
stats_w_grps.groupby(["group_mean 2020"]).groups

In [None]:
# Preview the first rows of `stats` (the frame passed to add_split_group_vals
# above, not the returned `stats_w_grps`).
stats.head()