# London Lockdown Multilevel Model

We build a separate linear mixed-effects model for each target output variable (e.g. `Pleasant`).

In [None]:
import os
import sys

# Make the project root importable so that `from scripts import ...` resolves.
# `sys` must be imported before its `path` list can be extended.
sys.path.append("C:\\Users\\Andrew\\OneDrive - University College London\\_PhD\\Papers - Drafts\\J5_JASA_Lockdown-SS")

from scripts import lockdown_mlm as mlm
import pandas as pd
from pathlib import Path
import numpy as np
import time
import statsmodels.api as sm
import statsmodels.formula.api as smf

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks")
sns.set_context("paper", font_scale=1.4)

# Constants and options used by the rest of the notebook
## variables
# Response variables; one model is built per entry.
dep_vars = [
    "Natural",
    "Traffic",
    "Human",
    "Other",
    "loudness",
    "overall",
    "Pleasant",
    "Eventful",
]

# Feature groups that are excluded from the candidate predictors.
remove = [
    "FS_TEMP",
    "LAeq_TEMP",
    "LCeq_TEMP",
    "LZeq_TEMP",
    "I_TEMP",
    "N_TEMP",
    "R_TEMP",
    "S_TEMP",
    "SIL_TEMP",
    "THD_TEMP",
    "T_TEMP",
]

# NOTE: this is the same object as mlm.FEATS_LISTS, so the pops below also
# change what the mlm module sees.
FEATS_LISTS = mlm.FEATS_LISTS
for feat_group in remove:
    # A default is supplied so that an already-missing group is not an error.
    FEATS_LISTS.pop(feat_group, None)

# Unique feature names across all remaining groups, in sorted order.
acoustic_vars = sorted(set().union(*FEATS_LISTS.values()))

## processing options
nonlinear_transformations = []
criterion = "aic"

## Load Data

In [None]:
# Hard-coded absolute path to the project's data directory.
DATA_DIR = Path("C:\\Users\\Andrew\\OneDrive - University College London\\_PhD\\Papers - Drafts\\J5_JASA_Lockdown-SS\\data")
results_csv = DATA_DIR.joinpath("2020-08-13\\LondonVeniceBINResults_2020-08-13_4.csv")
ssidData = pd.read_csv(results_csv)

# Store the listed columns as pandas categoricals.
categorical_cols = ["Lockdown"]
for col_name in categorical_cols:
    as_category = ssidData[col_name].astype('category')
    ssidData[col_name] = as_category

ssidData.head()

In [None]:
# Cut the dataset down to the identifier, response and acoustic feature columns
cols = ["GroupID", "LocationID", "SessionID", "Lockdown"] + dep_vars + acoustic_vars
ssidData = ssidData[cols]

# Compress to mean of each GroupID
# numeric_only=True keeps this working on pandas >= 2.0, where averaging a
# non-numeric column (e.g. the categorical "Lockdown") raises a TypeError
# instead of silently dropping that column.
compressData = ssidData.groupby(["GroupID"]).mean(numeric_only=True)
# Re-attach the identifier columns, one row per unique combination.
compressData = compressData.merge(ssidData[["GroupID", "LocationID", "SessionID", "Lockdown"]].drop_duplicates(),  on="GroupID")

location_codes = pd.Categorical(compressData["LocationID"]).codes
compressData["LocationID_codes"] = location_codes
# NOTE: a former `compressData.loc[compressData["Lockdown"] == 1].dropna(inplace=True)`
# stood here. It dropped rows from a temporary copy only and never changed
# compressData, so it has been removed rather than left as a misleading no-op.
# Rows of the Lockdown == 1 subset with missing values are dropped where that
# subset is created.
compressData = compressData.dropna(subset=acoustic_vars)

print(compressData.shape)
compressData.head()

In [None]:
# Run the project helper with the configured transformation list. It returns a
# replacement data frame and a replacement feature-name list.
# NOTE(review): presumably it appends transformed copies of the acoustic
# features -- confirm against scripts/lockdown_mlm.
compressData, acoustic_vars = mlm.nonlinear_features(
    compressData,
    acoustic_vars,
    transformations=nonlinear_transformations,
)
print(acoustic_vars)
compressData.head()

In [None]:
# Standardise the acoustic features
from sklearn.preprocessing import StandardScaler

# Turn +/-inf into NaN so that the dropna below removes those rows as well.
non_finite = [np.inf, -np.inf]
compressData = compressData.replace(non_finite, np.nan).dropna(subset=acoustic_vars)

# The fitted scaler is kept in `scaler`.
scaler = StandardScaler()
standardised = scaler.fit_transform(compressData[acoustic_vars])
compressData[acoustic_vars] = standardised

print(compressData.shape)
compressData.head()

Split into pre- and during-lockdown datasets.

In [None]:
# Rows with Lockdown == 1, keeping only those without any missing value.
is_prelockdown = compressData["Lockdown"] == 1
prelockdownData = compressData.loc[is_prelockdown].dropna()
print(prelockdownData.shape)
prelockdownData.head()

In [None]:
# Rows with Lockdown == 2.
# Take an explicit copy: prediction columns are assigned to this frame later in
# the notebook, and assigning to a bare slice of compressData triggers pandas'
# SettingWithCopyWarning.
lockdownData = compressData.loc[compressData["Lockdown"] == 2].copy()
print(lockdownData.shape)
lockdownData.head()

## Checking distribution of variables

In [None]:
# One panel per LocationID, with the Pleasant and Eventful density curves
# overlaid in each panel (rug marks on, histogram off).
pleasant_eventful = prelockdownData[["LocationID", "Pleasant", "Eventful"]]
g = sns.FacetGrid(
    pleasant_eventful,
    col="LocationID",
    col_wrap=4,
    xlim=(-1, 1),
    legend_out=True,
)
curve_only = {"rug": True, "hist": False}
g.map(sns.distplot, "Pleasant", **curve_only)
g.map(sns.distplot, "Eventful", color="y", **curve_only)

In [None]:
# Distribution of every dependent variable, one panel each on a 2 x 4 grid.
fig = plt.figure(figsize=(10, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for panel, dep_var in enumerate(dep_vars, start=1):
    ax = fig.add_subplot(2, 4, panel)
    if dep_var in ("Pleasant", "Eventful"):
        # Leave binning and KDE settings at seaborn's defaults.
        bins, kde_kws = None, None
    else:
        # NOTE(review): presumably these are coarse rating scales, hence the
        # fixed 5 bins and KDE bandwidth of 0.5 -- confirm.
        bins, kde_kws = 5, {"bw": 0.5}
    sns.distplot(prelockdownData[dep_var], ax=ax, bins=bins, kde_kws=kde_kws, rug=True)

In [None]:
# Loudness distribution per LocationID, with unit-wide bins from 0.5 to 5.5.
loudness_by_location = prelockdownData[["LocationID", "loudness"]]
unit_bin_edges = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]
g = sns.FacetGrid(
    loudness_by_location,
    col="LocationID",
    col_wrap=4,
    xlim=(0.5, 5.5),
)
g.map(sns.distplot, "loudness", kde_kws={"bw": 0.5}, bins=unit_bin_edges)

In [None]:
# Linear fit of Pleasant against LCeq_10, drawn separately for each LocationID.
# Each panel gets its own x-axis range (sharex=False).
grid = sns.lmplot(
    x="LCeq_10",
    y="Pleasant",
    col="LocationID",
    col_wrap=4,
    sharex=False,
    height=4,
    data=prelockdownData,
)

## Feature Selection and Model Building

### Backward step feature selection
In order to filter out the massive number of potential features, we want to determine which ones significantly contribute to the final model. To do this, we use backward step feature selection which starts by including all possible features and gradually reduces them based on their p-values. This follows 6 steps:

![](https://miro.medium.com/max/700/1*Jub_nEYtN0htxFpTRzRtBQ.png)

#### Step 1
Select a significance level to use as the criterion for selection. Typically, this will be 0.05.

#### Step 2
Fit the model with all the features selected.

#### Step 3
Identify the feature with the highest p-value (i.e. least statistically significant).

#### Step 4
If the p-value of this feature is greater than the significance level (e.g. p-value > 0.05), it is not significant, so continue to step 5. If the highest p-value is not greater than the significance level, skip to step 6 and finish.

#### Step 5
Remove the feature from the set and fit a new model. Return to step 3 and repeat until all features have a p-value below the significance level.

#### Step 6
We have now identified the feature set, so fit the final model.

This results in a drastically cut down feature set, which is good, but it's still a very complex model, resulting in a low adjusted r-squared value. We've identified all of the potentially significant features, but we still need to do further feature selection. For this, we move onto forward step feature selection with the Akaike Information Criterion as our criterion.

### Forward Step Feature selection
https://planspace.org/20150423-forward_selection_with_statsmodels/

In forward step selection, we build a model for each potential feature individually, calculate the best performing model, and select that feature. We then add every other feature to it and build n-1 two-feature models, then calculate the best performing model and select those two features. This continues until adding features to the model no longer improves its criterion performance. 

It is important in this method to use a criterion which punishes model complexity, otherwise the model will always improve by adding new features.

Success! We can see that this method has reduced the features even further. However, I suspect there are some issues with multicollinearity, so we'll tackle that next.

### Reducing multi-collinearity
We've identified multi-collinearity among several of the features which were selected by the backward-forward feature selection. This is identified through the Variance Inflation Factor (VIF). We've set the max acceptable VIF at a fairly high level of 10 to be very lenient to our potential features. To address this, we remove the highest VIF feature and re-build the model, then check the VIF again. We do this until the max VIF feature is below our set criterion.

## Feature selection and model building

In [None]:
# Results of the selection pipeline, keyed by dependent variable name.
# A variable whose pipeline fails has no entry in any of these dicts.
models = {}
back_models = {}
forward_models = {}
vifs = {}
for var in dep_vars:
    print("\n###########################################################")
    print(f"\nPERFORMING FEATURE SELECTION AND MODEL BUILDING FOR {var}.")
    print("\n###########################################################\n")

    try:
        # Use the `criterion` option defined with the other processing options
        # instead of repeating the literal here.
        model, back, forward, vif = mlm.mlm_feature_selection(
            prelockdownData, var, acoustic_vars, "LocationID", criterion=criterion, verbose=0
        )

        print("\n=========================================================")
        print(f"\nFINAL MODEL FOR {var}.\n")
        mlm.summarise_model(model, prelockdownData)
    except Exception as err:
        # Best effort: one failing variable must not stop the others. Catch
        # Exception rather than using a bare except so that KeyboardInterrupt
        # and SystemExit still propagate, and report what actually went wrong.
        print(f"Ran into an unresolvable error for {var} model. Moving on.")
        print(f"    {type(err).__name__}: {err}")
        continue

    # Only reached when both the selection and the summary succeeded.
    models[var] = model
    back_models[var] = back
    forward_models[var] = forward
    vifs[var] = vif


## Use the models to predict the values for the during lockdown data

In [None]:
# Add one "<dependent variable>_pred" column per fitted model.
for dep_var, fitted_model in models.items():
    lockdownData[f"{dep_var}_pred"] = fitted_model.predict(lockdownData)

# Pleasant vs. Eventful per location from the pre-lockdown data.
# `height` replaces the deprecated `size` keyword of pairplot.
p = sns.pairplot(x_vars=["Pleasant"], y_vars=["Eventful"], data=prelockdownData, hue="LocationID", height=8)
p.set(xlim=(-1, 1))
p.set(ylim=(-1, 1))
plt.show()

# The same view built from the predicted columns of the lockdown data.
l = sns.pairplot(x_vars=["Pleasant_pred"], y_vars=["Eventful_pred"], data=lockdownData, hue="LocationID", height=8)
l.set(xlim=(-1, 1))
l.set(ylim=(-1, 1))
plt.show()

In [None]:
from statsmodels.regression.mixed_linear_model import MixedLMResults

# Load a previously saved Pleasant model from a hard-coded results path.
# Every backslash in the path is escaped; the former "_PhD\Papers" relied on the
# invalid escape sequence "\P", which Python warns about.
# NOTE: the file is a pickle, so only load results files from a trusted source.
Pleasant_model = MixedLMResults.load("C:\\Users\\Andrew\\OneDrive - University College London\\_PhD\\Papers - Drafts\\J5_JASA_Lockdown-SS\\results\\2020-08-21\\Pleasant_RI-only_2020-08-21.pickle")

In [None]:
# Number of entries in the loaded model's exog_names list.
exog_names = Pleasant_model.model.exog_names
len(exog_names)