In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn import tree
from sklearn.svm import SVR
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import (
    silhouette_samples,
    silhouette_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB


import mlflow

from highstreets.models import train_model as tm
from highstreets.data import make_dataset as mhsd
from highstreets.visualisation import visualise as vhsd
from highstreets.features import build_features as bf

from dotenv import load_dotenv, find_dotenv
import os

# Load settings from the .env file located by find_dotenv() into the process
# environment before any of the lookups below.
load_dotenv(find_dotenv())

# File locations and service URIs come from the environment.
# os.environ.get returns None for any variable that is not set, so a missing
# entry only surfaces later, when the value is first used.
YOY_FILE = os.environ.get("YOY_FILE")
PROFILE_FILE = os.environ.get("PROFILE_FILE")
PROJECT_ROOT = os.environ.get("PROJECT_ROOT")
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI")
TC_LOOKUP = os.environ.get("TC_LOOKUP")
O2_CLUSTERS = os.environ.get("O2_CLUSTERS")

# Point MLflow at the tracking server and experiment used by the regression
# cells; sklearn autologging is switched off, so those cells log explicitly.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("High street profile regression experiments")
mlflow.sklearn.autolog(disable=True)

# Reload imported modules automatically so edits to the highstreets package
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Plot styling shared by all figures in the notebook.
sns.set_theme(style="darkgrid")
sns.set_context("notebook")

### Load Mastercard spend data along with high street profiles, and set up data arrays and time vectors for convenience

In [None]:
# Key dates of the pandemic timeline, kept as (date, event) pairs so each
# date carries its meaning alongside it.
_key_events = [
    ("2020-03-24", "first lockdown starts"),
    ("2020-06-15", "shops reopen"),
    ("2020-11-05", "second lockdown starts"),
    ("2020-12-02", "back to 'tier 2' (i.e. partial reopening)"),
    ("2021-01-05", "third lockdown starts"),
    ("2021-04-12", "shops reopen"),
]
nb_dates = pd.to_datetime([event_date for event_date, _ in _key_events])

# High street profiles (Excel) and year-on-year spend data (CSV); the
# week_start column is parsed as datetimes.
hsp = pd.read_excel(PROFILE_FILE)
hsd_yoy = pd.read_csv(YOY_FILE, parse_dates=["week_start"])

# Reshape the spend data with the project helper, passing the "yoy_" prefix.
hsd_yoy_minimal = mhsd.stack_retail_we_wd(hsd_yoy, "yoy_")

# Preview the first rows of the reshaped frame.
hsd_yoy_minimal.head()

In [None]:
# (start, end) date windows used to slice the spend data.
dates_2020 = ("2020-04-15", "2020-10-31")
dates_2020_full = ("2020-01-01", "2020-12-31")
dates_2021 = ("2021-02-12", "2021-08-31")
dates_full = ("2020-01-01", "2021-12-31")

# Extract the "txn_amt" data for every window with the project helper.
data_2020, data_2021, data_2020_full, data_full = (
    mhsd.extract_data_array(hsd_yoy_minimal, window, "txn_amt")
    for window in (dates_2020, dates_2021, dates_2020_full, dates_full)
)

# Per-period lookups consumed by the regression cells below.
start_times = {"2020": "2020-04-01", "2021": "2021-04-12", "full": "2020-04-01"}

_period_frames = {"2020": data_2020, "2021": data_2021, "full": data_full}
# Time vector = index of each extracted frame.
tvecs = {period: frame.index for period, frame in _period_frames.items()}
# Transposed values: the frame's columns become the rows of each array.
arrays = {period: frame.to_numpy().T for period, frame in _period_frames.items()}

### Run k-means on 2020, 2021, and full data separately:

In [None]:
# Clustering hyper-parameters shared by every k-means run in this notebook.
n_clus = 3
max_iter = 50
tol = 1e-2

# One settings dict instead of three copies of the same constructor call.
# random_state=None means every run starts from a fresh random initialisation.
_kmeans_settings = dict(
    init="random",
    n_clusters=n_clus,
    random_state=None,
    max_iter=max_iter,
    tol=tol,
    copy_x=True,
    verbose=0,
    n_init=10,
)

# Each model is fitted on the transposed frame values (frame columns as samples).
# 2020 data:
kmeans20 = KMeans(**_kmeans_settings)
kmeans20.fit(data_2020.to_numpy().T)

# 2021 data:
kmeans21 = KMeans(**_kmeans_settings)
kmeans21.fit(data_2021.to_numpy().T)

# full data:
kmeansfull = KMeans(**_kmeans_settings)
kmeansfull.fit(data_full.to_numpy().T)

## Regressions: trying to summarise trends across time

In [None]:
# Unpooled regression: slope and intercept are fitted independently for each
# high street, once per period, using the non-robust fit.
fit_lines = {}
reg_model = {}

for period in ("2020", "2021", "full"):
    reg_model[period], fit_lines[period] = bf.get_fit_lines(
        start_times[period], tvecs[period], arrays[period], robust=False
    )

### Run K-means on fit parameters only for comparison with results of k-means run on the full timeseries

In [None]:
n_clus = 3

# 2020: slopes (coef_) and intercepts of the fitted lines, joined column-wise
# into one parameter matrix.
x_cluster_20 = reg_model["2020"].coef_
y_cluster_20 = reg_model["2020"].intercept_.reshape(-1, 1)
fit_params_20 = np.concatenate([x_cluster_20, y_cluster_20], axis=1)

kmeans_lines = KMeans(
    init="random",
    n_clusters=n_clus,
    random_state=None,
    max_iter=max_iter,
    tol=tol,
    copy_x=True,
    verbose=0,
    n_init=10,
).fit(fit_params_20)

# 2021: same construction on the 2021 regression parameters.
x_cluster_21 = reg_model["2021"].coef_
y_cluster_21 = reg_model["2021"].intercept_.reshape(-1, 1)
fit_params_21 = np.concatenate([x_cluster_21, y_cluster_21], axis=1)

kmeans_lines_21 = KMeans(
    init="random",
    n_clusters=n_clus,
    random_state=None,
    max_iter=max_iter,
    tol=tol,
    copy_x=True,
    verbose=0,
    n_init=10,
).fit(fit_params_21)

In [None]:
# 2x2 grid: rows are 2020 / 2021, columns are the two clusterings
# (fit parameters vs. full time series) shown on the same slope/intercept axes.
fig, axes = plt.subplots(2, 2, figsize=(14, 12), sharey=True)

x_20 = x_cluster_20.flatten()
y_20 = y_cluster_20.flatten()
x_21 = x_cluster_21.flatten()
y_21 = y_cluster_21.flatten()

_panels = [
    (axes[0][0], x_20, y_20, kmeans_lines.labels_),
    (axes[0][1], x_20, y_20, kmeans20.labels_),
    (axes[1][0], x_21, y_21, kmeans_lines_21.labels_),
    (axes[1][1], x_21, y_21, kmeans21.labels_),
]
for panel_ax, slopes, intercepts, cluster_ids in _panels:
    sns.scatterplot(x=slopes, y=intercepts, hue=cluster_ids, ax=panel_ax)

for ax in axes.reshape(-1):
    yl = ax.get_ylim()
    # Dashed red vertical line at zero slope, spanning the current y-range.
    ax.plot([0, 0], yl, "--r")
    ax.set_ylim([yl[0], 4])
    ax.set_xlim([-0.01, 0.016])
    ax.set_xlabel("Best fit slope")
    ax.set_ylabel("Intercept")

axes[0][0].set_title("Clustering run on slopes and intercepts")
axes[0][1].set_title("Clustering run on full timeseries")

In [None]:
# Change in (slope, intercept) from the 2020 fit to the 2021 fit, coloured by
# the cluster each high street received from its 2020 fit parameters.
param_shifts = np.subtract(fit_params_21, fit_params_20)
slope_shift = param_shifts[:, 0]
intercept_shift = param_shifts[:, 1]
sns.scatterplot(x=slope_shift, y=intercept_shift, hue=kmeans_lines.labels_)

In [None]:
# Agreement between the 2020 and 2021 time-series clusterings.
adjusted_rand_score(labels_true=kmeans20.labels_, labels_pred=kmeans21.labels_)

In [None]:
# Cluster the high streets on their parameter shifts and colour the same
# scatter by the resulting labels.
_diff_settings = dict(
    init="random",
    n_clusters=n_clus,
    random_state=None,
    max_iter=max_iter,
    tol=tol,
    copy_x=True,
    verbose=0,
    n_init=10,
)
kmeans_diff = KMeans(**_diff_settings)
kmeans_diff.fit(param_shifts)

shift_x, shift_y = param_shifts[:, 0], param_shifts[:, 1]
sns.scatterplot(x=shift_x, y=shift_y, hue=kmeans_diff.labels_)

In [None]:
# Arrow plot of how each high street's fit parameters moved from 2020 to 2021.
# Arrows start at the raw 2020 (slope, intercept); their components are the
# shifts measured in units standardised by the 2020 parameter distribution.
fit_params_20 = np.concatenate((x_cluster_20, y_cluster_20), axis=1)
fit_params_21 = np.concatenate((x_cluster_21, y_cluster_21), axis=1)

# Standardise both years with the 2020 statistics so the shifts are comparable
# across the two parameters.
scaler = StandardScaler()
scaler.fit(fit_params_20)

fit_params_scaled_20 = scaler.transform(fit_params_20)
fit_params_scaled_21 = scaler.transform(fit_params_21)

# Arrow origins: unscaled 2020 slope and intercept.
X = fit_params_20[:, 0]
Y = fit_params_20[:, 1]

# NOTE(review): this rebinds `param_shifts` (unscaled in the earlier cells) to
# the scaled shifts; re-running earlier cells afterwards will see this version.
param_shifts = fit_params_scaled_21 - fit_params_scaled_20

# Arrow components: scaled slope shift and scaled intercept shift.
U = param_shifts[:, 0]
V = param_shifts[:, 1]

# plt.subplots returns (figure, axes); draw on the axes explicitly and pass the
# colormap to this artist only, rather than changing the global default
# colormap for every later plot.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
ax.quiver(X, Y, U, V, kmeans_lines.labels_, cmap="Set1")
ax.set_xlim([-0.0075, 0.0175])
ax.set_ylim([-1, 3])
ax.set_ylabel("Intercept")
ax.set_xlabel("Slope")
plt.show()

### Plot each highstreet with lines fit to each recovery period

In [None]:
# vhsd.plot_all_profiles_full(
#     {"2020": data_2020, "2021": data_2021, "full": data_full}, fit_lines
# )

### Sort highstreets by their 2020 hit percentage and 2020 fit slope and plot by group

In [None]:
# Sort keys for the grouped 2020 plot: the size of the first-lockdown hit and
# the slope of the 2020 best-fit line.

# Hit = mean over the first lockdown (nb_dates[0] to nb_dates[1]) divided by
# the mean over 2020-01-04 to 2020-03-01, as a column vector.
_lockdown_mean = data_2020_full.loc[nb_dates[0] : nb_dates[1]].mean()
_baseline_mean = data_2020_full.loc["2020-01-04":"2020-03-01"].mean()
hit_percent_2020 = (_lockdown_mean / _baseline_mean).to_numpy().reshape(-1, 1)

# Mean from the first lockdown to the December 2020 reopening (computed but not
# used as a sort key in this cell).
mean_2020 = (
    data_2020_full.loc[nb_dates[0] : nb_dates[3], :].mean().to_numpy().reshape(-1, 1)
)

sort_cols = (hit_percent_2020, reg_model["2020"].coef_)

# Plot the full-period series; the x-limits below restrict the view to 2020.
plot_array = data_full.to_numpy().T
plot_tvec = data_full.index
filename = "2020-sorted-by-hit-slope.pdf"

_plot_options = dict(
    xlim=("2020-01-01", "2020-12-31"),
    figure_title="2020",
    n_grp=4,
)
vhsd.plot_highstreets_grouped(
    plot_array, plot_tvec, sort_cols, nb_dates, filename, **_plot_options
)

In [None]:
# Variant of the 2020 grouping: the hit is the 20th percentile of the
# first-lockdown values (rather than their mean), relative to the
# 2020-01-04 to 2020-03-01 mean.
_lockdown_q20 = data_2020_full.loc[nb_dates[0] : nb_dates[1]].quantile(q=0.2, axis=0)
_baseline_mean = data_2020_full.loc["2020-01-04":"2020-03-01"].mean()
hit_percent_2020 = (_lockdown_q20 / _baseline_mean).to_numpy().reshape(-1, 1)

mean_2020 = (
    data_2020_full.loc[nb_dates[0] : nb_dates[3], :].mean().to_numpy().reshape(-1, 1)
)

# multiply by 365 to convert slopes into units of MRLI/year
_slope_per_year = 365 * reg_model["2020"].coef_
sort_cols = (hit_percent_2020, _slope_per_year)

# plot_array, plot_tvec and filename are reused from the previous cell.
_plot_options = dict(
    xlim=("2020-01-01", "2020-12-31"),
    figure_title="2020",
    n_grp=4,
    equal_hs_per_bin=False,
    low_pct=10,
    high_pct=90,
)
ret = vhsd.plot_highstreets_grouped(
    plot_array, plot_tvec, sort_cols, nb_dates, filename, **_plot_options
)

### Sort highstreets by their 2021 mean and slope and plot in groups

In [None]:
# Sort keys for the grouped 2021 plot: the mean from 2021-03-01 up to the last
# key date (nb_dates[-1]) and the slope of the 2021 best-fit line.
_mean_2021 = data_2021.loc["2021-03-01" : nb_dates[-1], :].mean()
sort_cols = (
    _mean_2021.to_numpy().reshape(-1, 1),
    reg_model["2021"].coef_,
)

plot_array = data_full.to_numpy().T
plot_tvec = data_full.index
filename = "2021-sorted-by-mean-slope.pdf"

_plot_options = dict(
    xlim=("2021-01-05", "2021-09-01"),
    figure_title="2021",
    n_grp=6,
)
vhsd.plot_highstreets_grouped(
    plot_array, plot_tvec, sort_cols, nb_dates, filename, **_plot_options
)

### Sort highstreets by their 2020 hit percentage and slopes and plot across full period sorted into groups

In [None]:
# Map column-index level 1 to level 2 of data_2020_full
# (presumably high street id -> name, judging by the variable name).
_level_1 = data_2020_full.columns.get_level_values(1)
_level_2 = data_2020_full.columns.get_level_values(2)
hs_id_name_lookup = {key: value for key, value in zip(_level_1, _level_2)}

In [None]:
# Plot the full period, grouped by the 2020 sort keys (hit percentage as
# computed by the most recent cell that set hit_percent_2020, and 2020 slope).

# Ratio of the December 2021 mean to the August-September 2021 mean
# (computed here but not used as a sort key in this cell).
_late_2021_mean = data_full.loc["2021-12-01":"2022-01-01"].mean()
_autumn_2021_mean = data_full.loc["2021-08-01":"2021-10-01"].mean()
hit_percent_2021 = (_late_2021_mean / _autumn_2021_mean).to_numpy().reshape(-1, 1)

# Mean over 2020-03-14 to 2020-11-01 (also not used as a sort key here).
mean_2020 = (
    data_2020_full.loc["2020-03-14":"2020-11-01", :].mean().to_numpy().reshape(-1, 1)
)

sort_cols = (hit_percent_2020, reg_model["2020"].coef_)

plot_array = data_full.to_numpy().T
plot_tvec = data_full.index
filename = "full-sorted-by-2020-hit-slope.pdf"

_plot_options = dict(
    xlim=("2020-01-01", "2021-09-01"),
    figure_title="Full period (sorted by 2020 params)",
    n_grp=4,
)
vhsd.plot_highstreets_grouped(
    plot_array, plot_tvec, sort_cols, nb_dates, filename, **_plot_options
)

### Append 2020 & 2021 means and fit lines to High Street Profiles for further analysis

In [None]:
# Grouping configuration: the two columns to bin on, the percentile bounds,
# and the names of the group columns passed to the binning helper.
group_cols = ["hit percent 2020", "slope 2020"]
low_pct, high_pct = 10, 90
rcg_names = [
    "mrli_hp_2020_group",
    "mrli_slope_2020_group",
    "group",
]
n_grp = 1

# Build the per-high-street table from the profiles, the full-period data and
# the fitted regressions, then clean it with the project helper.
stats = bf.clean_hs_profiles(bf.append_profile_features(hsp, data_full, reg_model))

# Assign each high street to a group with the project's 2-D histogram helper.
stats = bf.hist2d_highstreets(
    stats,
    n_grp=n_grp,
    group_cols=group_cols,
    rcg_names=rcg_names,
    low_pct=low_pct,
    high_pct=high_pct,
)

# Export the high street name together with its group columns.
stats_out_paul = stats[["highstreet_name", *rcg_names]]
_out_path = f"{PROJECT_ROOT}/data/HS_mrli_by_hitpct_slope_group_2020_{n_grp+2}.csv"
stats_out_paul.to_csv(_out_path)

### Histogram of numerical feature distributions to look for any further skewed features that should be transformed

In [None]:
# One histogram per column of `stats` on an 18x15 inch figure; the returned
# axes are kept in `feature_hist`, which also stops the cell echoing them.
feature_hist = stats.hist(figsize=(18, 15))

### Look at correlations between features

In [None]:
# Annotated heatmap of pairwise correlations between the features in `stats`,
# saved to the project's reports folder.
plt.figure(figsize=(30, 16))
sns.set(font_scale=1.6)

# `stats` also holds non-numeric columns (e.g. "highstreet_name"), and
# DataFrame.corr() raises on those in pandas >= 2.0. Select the numeric and
# boolean columns explicitly, which is what older pandas did implicitly.
numeric_stats = stats.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_stats.corr(), annot=True, cmap="viridis")
plt.savefig(PROJECT_ROOT + "/reports/figures/feature-correlations.pdf")

In [None]:
# Numerical columns of `stats` used for the PCA / clustering analysis.
numerical_features = [
    "mean 2020",
    "mean 2021",
    "hit percent 2020",
    "hit percent 2021",
    "percent_eating",
    "percent_apparel",
    "percent_retail",
    "percent_we",
    "percent_wd",
    "slope 2020",
    "slope 2021",
    "num_addresses",
    "pct residential addresses",
    "average IMD2019 score",
    "pct JSA 2021",
    "pct work age",
    "log_pct commercial addresses",
    "log_Pop",
    "log_2019 scale",
    "log_pct offices",
    "log_pct HW",
    "log_pct employees",
    "cluster_hourly",
    "cluster_daily",
    "cluster_size",
]

# Feature matrix: rows with any missing value are dropped.
X = stats[numerical_features].dropna().to_numpy()

# Standardise every feature, then keep the first 10 principal components.
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=10)),
    ]
)

# Fit once and keep the projection; the original fitted the pipeline twice
# (fit, then fit_transform) on the same data.
X_red = pipe.fit_transform(X)

# Scree plot: variance captured by each retained component.
fig, ax = plt.subplots()
ax.plot(pipe.named_steps["pca"].explained_variance_, marker="o")
ax.set_xlabel("Principal component (0-based index)")
ax.set_ylabel("Explained variance")
plt.show()

In [None]:
# Four-cluster k-means on the raw (unscaled) feature matrix, scored with the
# mean Euclidean silhouette coefficient.
kmeans_profiles = KMeans(n_clusters=4)
kmeans_profiles.fit(X)

profile_labels = kmeans_profiles.labels_
metrics.silhouette_score(X, profile_labels, metric="euclidean")

In [None]:
# Silhouette analysis of k-means on the feature matrix X for several cluster
# counts. For each count one figure is drawn: the per-sample silhouette plot on
# the left, and the samples in the first two principal components (X_red,
# computed earlier) coloured by cluster on the right.
range_n_clusters = [2, 3, 4, 5]

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    # y_lower / y_upper track the vertical band occupied by each cluster's bars
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # leave a gap of 10 before the next cluster

    ax1.set_title("Silhouette plot for the various clusters.")
    ax1.set_xlabel("Silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        X_red[:, 0],
        X_red[:, 1],
        marker=".",
        s=30,
        lw=0,
        alpha=0.7,
        c=colors,
        edgecolor="k",
    )

    # Labeling the clusters
    # The centres live in the original feature space; push them through the
    # fitted scaler + PCA pipeline so they land on the same axes as X_red.
    centers = clusterer.cluster_centers_
    centers = pipe.transform(centers)
    # Draw white circles at cluster centers
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    # Write each cluster's number inside its white circle
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("High street clusters in PC projection")
    ax2.set_xlabel("1st PC")
    ax2.set_ylabel("2nd PC")

    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )

plt.show()

In [None]:
def scramble(a, axis=-1):
    """
    Return a copy of `a` with its values independently shuffled along `axis`.

    Every 1-D slice taken along `axis` receives its own random permutation.
    For a 2-D array and ``axis=0`` each column is shuffled separately, which
    keeps every column's set of values but breaks the association between
    columns. The previous implementation applied one shared permutation to
    all slices, which only reordered whole rows and left those associations
    intact.

    Parameters
    ----------
    a : array_like
        Input data; it is not modified.
    axis : int, optional
        Axis along which to shuffle. Defaults to the last axis.

    Returns
    -------
    numpy.ndarray
        New array with the same shape and dtype as `a`.

    Notes
    -----
    Randomness comes from NumPy's global random state, so results are
    reproducible after ``np.random.seed``.
    """
    a = np.asarray(a)
    # Ranking a matrix of i.i.d. uniform draws along `axis` yields an
    # independent, uniformly random permutation for every slice.
    order = np.argsort(np.random.random(a.shape), axis=axis)
    return np.take_along_axis(a, order, axis=axis)

In [None]:
# Repeat the silhouette analysis on a scrambled copy of the feature matrix
# (presumably as a randomised baseline for the analysis above).
# The scrambled data gets its own name so that X is left untouched and this
# cell gives the same kind of result each time it is re-run.
X_scrambled = scramble(X, axis=0)

# Project the scrambled rows with the already-fitted scaler + PCA pipeline.
# The original plotted X_red (computed from unscrambled X), whose rows do not
# line up with the labels computed from the scrambled data.
X_scrambled_red = pipe.transform(X_scrambled)

kmeans_profiles_rand = KMeans(n_clusters=4).fit(X_scrambled)

range_n_clusters = [2, 3, 4, 5]

for n_clusters in range_n_clusters:
    # One figure per cluster count: silhouette plot (left), PC scatter (right)
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The silhouette coefficient can range over [-1, 1]; the axis shows
    # [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X_scrambled) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 12 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=12)
    cluster_labels = clusterer.fit_predict(X_scrambled)

    # The silhouette_score gives the average value for all the samples.
    silhouette_avg = silhouette_score(X_scrambled, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X_scrambled, cluster_labels)

    # y_lower / y_upper track the vertical band occupied by each cluster's bars
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a gap of 10 before the next cluster's band
        y_lower = y_upper + 10

    ax1.set_title("Silhouette plot for the various clusters.")
    ax1.set_xlabel("Silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot: the scrambled samples in the first two principal components,
    # coloured by the cluster assigned to that same row.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        X_scrambled_red[:, 0],
        X_scrambled_red[:, 1],
        marker=".",
        s=30,
        lw=0,
        alpha=0.7,
        c=colors,
        edgecolor="k",
    )

    # Cluster centres live in the original feature space; project them with
    # the same pipeline so they share the axes of the scatter.
    centers = clusterer.cluster_centers_
    centers = pipe.transform(centers)
    # Draw white circles at cluster centers
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    # Write each cluster's number inside its white circle
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("High street clusters in PC projection")
    ax2.set_xlabel("1st PC")
    ax2.set_ylabel("2nd PC")

    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )

plt.show()

### Fit some simple regressions

In [None]:
# Regression target (kept as a one-element list so y stays a DataFrame).
target_col = ["slope 2020"]

# Profile columns used as predictors.
feature_cols = [
    "percent_eating",
    "percent_apparel",
    "percent_retail",
    "percent_we",
    "percent_wd",
    "num_addresses",
    "pct residential addresses",
    "average IMD2019 score",
    "loac rank 1",
    "ptal",
    "pct JSA 2021",
    "pct work age",
    "log_pct commercial addresses",
    "log_Pop",
    "log_2019 scale",
    "log_pct offices",
    "log_pct HW",
    "log_pct employees",
    "cluster_daily",
    "cluster_hourly",
    "cluster_size",
]

# Keep only features + target, one-hot encode the categorical columns and
# drop rows that still contain missing values.
data = stats[[*feature_cols, *target_col]]
data = pd.get_dummies(data).dropna()

# Split into predictors and target.
X = data.drop(columns=target_col)
y = data[target_col]

# Hold out a third of the rows for testing (the split is unseeded, so it
# differs between runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

### Baseline model is ridge regression with regularization parameter chosen with cross-validation

In [None]:
with mlflow.start_run() as run:

    # Regularisation strengths to cross-validate: 200, 300, ..., 900.
    tuned_params = {"model__alpha": list(range(200, 1000, 100))}
    ridge_model = Ridge()

    # The project helper returns the best model and a train/test figure.
    best_rrm, fig_train_test = tm.run_experiment_w_cv(
        ridge_model, tuned_params, X_train, X_test, y_train, y_test
    )

    # Record test metrics, the target, the model and the figure in this run.
    ridge_test_metrics = tm.evaluate(best_rrm, X_test, y_test)
    mlflow.log_metrics(ridge_test_metrics)
    mlflow.log_param("regression target", target_col)
    mlflow.sklearn.log_model(best_rrm, "Ridge Regression Model")
    mlflow.log_figure(fig_train_test, "train_test_results.png")

### Huber regression to see if robustness to outliers in the target variable helps

In [None]:
with mlflow.start_run():
    # Grid for the Huber loss: a single epsilon, three regularisation strengths.
    tuned_params = dict(
        model__epsilon=[1.35],
        model__alpha=[300, 400, 600],
    )
    huber_model = HuberRegressor()

    # The project helper returns the best model and a train/test figure.
    best_huber, fig = tm.run_experiment_w_cv(
        huber_model, tuned_params, X_train, X_test, y_train, y_test
    )

    # Record test metrics, the target, the model and the figure in this run.
    huber_test_metrics = tm.evaluate(best_huber, X_test, y_test)
    mlflow.log_metrics(huber_test_metrics)
    mlflow.log_param("regression target", target_col)
    mlflow.sklearn.log_model(best_huber, "Huber Regression Model")
    mlflow.log_figure(fig, "train_test_results.png")

### Next try Support Vector Regression with hyperparameters chosen by cross-validation

In [None]:
with mlflow.start_run():

    svr_model = SVR(gamma="scale")

    # Kernel, regularisation strength C and tube width epsilon to search over.
    tuned_parameters = dict(
        model__kernel=["rbf", "poly"],
        model__C=[0.0001, 0.001, 0.01, 0.1, 0.2],
        model__epsilon=[0.0009, 0.001, 0.0015, 0.002],
    )

    # The project helper returns the best model and a train/test figure.
    best_svr, fig = tm.run_experiment_w_cv(
        svr_model, tuned_parameters, X_train, X_test, y_train, y_test
    )

    # Record test metrics, the target, the model and the figure in this run.
    svr_test_metrics = tm.evaluate(best_svr, X_test, y_test)
    mlflow.log_metrics(svr_test_metrics)
    mlflow.log_param("regression target", target_col)
    mlflow.sklearn.log_model(best_svr, "SVR Model")
    mlflow.log_figure(fig, "train_test_results.png")

### Decision tree regressors

In [None]:
with mlflow.start_run():

    dtr_model = tree.DecisionTreeRegressor()

    # Tree depth and split criterion to search over.
    tuned_parameters = dict(
        model__max_depth=[2, 4, 6, 8],
        model__criterion=["squared_error", "friedman_mse", "absolute_error"],
    )

    # The project helper returns the best model and a train/test figure.
    best_dtr, fig = tm.run_experiment_w_cv(
        dtr_model, tuned_parameters, X_train, X_test, y_train, y_test
    )

    # Record test metrics, the target, the model and the figure in this run.
    dtr_test_metrics = tm.evaluate(best_dtr, X_test, y_test)
    mlflow.log_metrics(dtr_test_metrics)
    mlflow.log_param("regression target", target_col)
    mlflow.sklearn.log_model(best_dtr, "Decision tree regressor Model")
    mlflow.log_figure(fig, "train_test_results.png")

### Here we try sorting and grouping highstreets by 2020 mean and slope and seeing if these groupings can be predicted from profile data

In [None]:
# Show the distinct (sorted) values of the hit-percent group column, i.e.
# which group labels are available to select from in the next cell.
np.unique(stats["mrli_hp_2020_group"])

In [None]:
# Columns that are regression outcomes rather than predictors.
target_variables = [
    "mean 2020",
    "mean 2021",
    "slope 2020",
    "slope 2021",
]
target_col = "slope 2020"

# Further columns removed from the predictors.
# NOTE(review): these names are not created anywhere in the visible notebook;
# confirm they exist in `stats`, otherwise the drop below raises a KeyError.
drop_cols = [
    "mean_slope_group",
    "mean_group",
    "slope_group",
    "mrli_yoy_mean_2020_recovery",
    "mrli_fit_slope_2020_recovery",
]

# Restrict to the high streets in hit-percent group 0.
stats_target = stats.groupby("mrli_hp_2020_group").get_group(0)

# Drop the name column, one-hot encode categorical features and remove rows
# with missing values.
_group_features = stats_target.drop(columns="highstreet_name")
data = pd.get_dummies(_group_features).dropna()

X = data.drop(columns=[*target_variables, *drop_cols])
y = data[target_col]

# Hold out 20% of the rows for testing (unseeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Support Vector Regression on the single-group subset prepared above.
tuned_parameters = {
    "model__kernel": ["rbf", "poly"],
    "model__C": [0.001, 0.01, 0.1, 0.2],
    "model__epsilon": [0.0009, 0.001, 0.0015, 0.002],
}

svr_model = SVR(gamma="scale")

# Every other call site in this notebook unpacks run_experiment_w_cv as
# (best_model, figure). Binding the whole return value to best_svr would pass
# a tuple, not an estimator, to tm.evaluate.
best_svr, fig = tm.run_experiment_w_cv(
    svr_model, tuned_parameters, X_train, X_test, y_train, y_test
)

# tm.evaluate's result is passed straight to mlflow.log_metrics in the cells
# above, i.e. it is a mapping of metric name -> value. Unpacking it into three
# variables would yield the keys rather than the scores.
# NOTE(review): svr_r2 / svr_mae / svr_mse are no longer defined; read the
# values from svr_metrics by key if they are needed later.
svr_metrics = tm.evaluate(best_svr, X_test, y_test)
svr_metrics

### Try classification with hit/slope group as target

In [None]:
# Work only with complete rows from here on.
stats = stats.dropna()

# The combined group column (third entry of rcg_names) is the class label.
target_col = rcg_names[2]

# Outcome-derived and identifier columns that must not be used as predictors.
non_feature_cols = [
    "mean 2020",
    "mean 2021",
    "slope 2021",
    "hit percent 2020",
    "hit percent 2021",
    "highstreet_name",
    *rcg_names,
]

# Encode the class labels as integers.
y = stats[target_col]
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# Predictors: remove the non-feature columns, then one-hot encode the
# categorical ones.
X = pd.get_dummies(stats.drop(columns=non_feature_cols))

# Hold out a third of the rows for testing (unseeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.33)

In [None]:
# Gaussian naive Bayes baseline for predicting the hit/slope group.
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)

y_pred = gnb_classifier.predict(X_test)

# Named `conf_mat` rather than `cm`: `cm` is the alias of matplotlib.cm
# imported at the top of the notebook and used by the silhouette cells
# (cm.nipy_spectral). Rebinding it here would break those cells when they are
# re-run in the same kernel.
conf_mat = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat)
disp.plot()

plt.show()
