# London Lockdown Multilevel Model

## Load Data

In [47]:
import itertools
import os
from pathlib import Path

import pandas as pd
import statsmodels.api as sm

# Folder holding the survey/acoustic data. The default is the original
# author's local folder; set the LOCKDOWN_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "LOCKDOWN_DATA_DIR",
        "C:\\Users\\Andrew\\OneDrive - University College London\\_PhD\\Papers - Drafts\\J5_JASA_Lockdown-SS\\data",
    )
)

# Join the path components individually so the file location does not rely on
# Windows-style backslash separators embedded in a single string.
ssidData = pd.read_csv(DATA_DIR / "2020-08-11" / "LondonBINResults_2020-08-10.csv")
ssidData.head()

Unnamed: 0.1,Unnamed: 0,GroupID,SessionID,LocationID,record_id,recording,start_time,end_time,latitude,longitude,...,Peak_2500,Peak_3150,Peak_4000,Peak_5000,Peak_6300,Peak_8000,Peak_10000,Peak_12500,Peak_16000,Peak_20000
0,0,CT101,CamdenTown1,CamdenTown,525.0,11-42_1,2019-05-02 11:40,2019-05-02 11:43,51.539124,-0.142624,...,61.87,61.63,58.22,56.82,57.17,53.24,57.93,60.33,57.48,48.83
1,1,CT101,CamdenTown1,CamdenTown,526.0,11-42_1,2019-05-02 11:41,2019-05-02 11:44,51.539124,-0.142624,...,61.87,61.63,58.22,56.82,57.17,53.24,57.93,60.33,57.48,48.83
2,2,CT101,CamdenTown1,CamdenTown,561.0,11-42_1,2019-05-02 11:40,2019-05-02 11:43,,,...,61.87,61.63,58.22,56.82,57.17,53.24,57.93,60.33,57.48,48.83
3,3,CT102,CamdenTown1,CamdenTown,560.0,11-50_2,2019-05-02 11:50,2019-05-02 11:53,,,...,66.16,65.49,63.22,58.21,57.16,59.01,55.79,53.71,50.08,43.81
4,4,CT103,CamdenTown1,CamdenTown,527.0,11-52_3,2019-05-02 11:49,2019-05-02 11:54,51.539124,-0.142624,...,62.84,56.87,63.46,57.45,49.32,48.77,60.07,47.32,45.77,42.35


## Reshape, standardise, and filter the data

In [82]:
from sklearn.preprocessing import StandardScaler

# Store the lockdown indicator as a categorical label rather than a number.
for col_name in ["Lockdown"]:
    ssidData[col_name] = ssidData[col_name].astype('category')

# Acoustic / psychoacoustic predictor columns kept for modelling.
acoustic_vars = [
    "FS", "FS_5", "FS_10", "FS_50", "FS_90", "FS_95", "FS_Min", "FS_Max",
    "FS_5-FS_95", "FS_10-FS_90", "FS_Max-FS_Min",
    "LAeq", "LAeq_5", "LAeq_10", "LAeq_50", "LAeq_90", "LAeq_95", "LAeq_Max", "LAeq_Min",
    "LAeq_10-LAeq_90", "LAeq_5-LAeq_95", "LAeq_Max-LAeq_Min",
    "N_5", "N_10", "N_50", "N_90", "N_95", "N_Min", "N_Max",
    "N_5-N_95", "N_10-N_90", "N_Max-N_Min",
    "R", "R_5", "R_10", "R_50", "R_90", "R_95", "R_Min", "R_Max",
    "R_5-R_95", "R_10-R_90", "R_Max-R_Min",
    "S", "S_5", "S_10", "S_50", "S_90", "S_95", "S_Min", "S_Max",
    "S_5-S_95", "S_10-S_90", "S_Max-S_Min",
    "SIL_5", "SIL_10", "SIL_50", "SIL_90", "SIL_95", "SIL_Min", "SIL_Max",
    "SIL_5-SIL_95", "SIL_10-SIL_90", "SIL_Max-SIL_Min",
    "T_5", "T_10", "T_50", "T_90", "T_95", "T_Max", "T_5-T_95", "T_10-T_90",
    "SpectralCentroid", "MaxFreq", "PeakSpectralCentroid", "PeakMaxFreq",
    "FS_M0", "FS_nrmse0", "LAeq_M0", "LAeq_nrmse0",
    "N_M0", "N_nrmse0", "N_M1", "N_nrmse1", "N_M2", "N_nrmse2",
    "R_M0", "R_nrmse0",
    "S_M0", "S_nrmse0", "S_M1", "S_nrmse1", "S_M2", "S_nrmse2",
    "SIL_M0", "SIL_nrmse0", "T_M0", "T_nrmse0",
]

# Response columns that are averaged per GroupID below.
dep_vars = ["Natural", "overall", "Pleasant", "Eventful"]

# Identifier / grouping columns carried alongside the numeric data.
id_vars = ["GroupID", "LocationID", "SessionID", "Lockdown"]

# Cut the dataset down to the columns used from here on. The copy avoids
# chained-assignment warnings if this cell is re-run on the reduced frame.
cols = id_vars + dep_vars + acoustic_vars
ssidData = ssidData[cols].copy()

# Compress to the mean of each GroupID. Only the numeric columns are averaged:
# calling .mean() on the whole frame raises a TypeError on pandas >= 2.0
# because LocationID/SessionID are strings and Lockdown is categorical.
compressData = (
    ssidData.groupby("GroupID")[dep_vars + acoustic_vars]
    .mean()
    .reset_index()
)

# Re-attach the identifier columns, one row per distinct combination.
# NOTE(review): a GroupID with more than one LocationID/SessionID/Lockdown
# combination would be duplicated by this merge - confirm the keys are unique.
compressData = compressData.merge(ssidData[id_vars].drop_duplicates(), on="GroupID")

# Standardise the acoustic predictors. The scaler is fit on every row of
# compressData, i.e. before the frame is split by Lockdown value.
scaler = StandardScaler()
compressData[acoustic_vars] = scaler.fit_transform(compressData[acoustic_vars])

# Integer code for each LocationID.
location_codes = pd.Categorical(compressData["LocationID"]).codes
compressData["LocationID_codes"] = location_codes
compressData.head()


Unnamed: 0,GroupID,Natural,overall,Pleasant,Eventful,FS,FS_5,FS_10,FS_50,FS_90,...,S_M2,S_nrmse2,SIL_M0,SIL_nrmse0,T_M0,T_nrmse0,LocationID,SessionID,Lockdown,LocationID_codes
0,CT101,1.666667,2.666667,-0.22978,0.284518,0.158062,-0.134407,0.0603,0.394966,0.005514,...,0.093033,-0.98662,0.097,-0.10406,-0.327324,-0.048898,CamdenTown,CamdenTown1,1,0
1,CT102,1.0,2.0,0.103553,-0.75,0.217614,0.064081,0.204233,0.290698,0.399773,...,-0.558087,-1.371447,-1.077617,-0.100549,-0.277098,-0.049036,CamdenTown,CamdenTown1,1,0
2,CT103,1.0,3.0,0.25,0.75,-0.378574,-0.442926,-0.408919,-0.289194,-0.17229,...,-0.546989,0.054116,-0.240687,-0.104391,-0.343367,-0.048971,CamdenTown,CamdenTown1,1,0
3,CT104,1.0,2.0,0.073223,0.676777,-0.000746,-0.127934,-0.080753,0.026017,0.269899,...,0.074412,-1.10999,-1.444458,-0.087578,0.727215,-0.049255,CamdenTown,CamdenTown1,1,0
4,CT107,2.0,4.0,0.073223,0.366117,0.667568,0.269042,0.466189,0.731832,0.965651,...,-0.722239,-0.523214,0.42935,-0.104047,-0.918465,-0.031732,CamdenTown,CamdenTown1,1,0


Split into pre-lockdown (`Lockdown == 1`) and during-lockdown (`Lockdown == 2`) datasets

In [83]:
# Rows labelled Lockdown == 1 make up the pre-lockdown dataset.
is_prelockdown = compressData["Lockdown"] == 1
prelockdownData = compressData[is_prelockdown]
print(prelockdownData.shape)
prelockdownData.head()

(620, 107)


Unnamed: 0,GroupID,Natural,overall,Pleasant,Eventful,FS,FS_5,FS_10,FS_50,FS_90,...,S_M2,S_nrmse2,SIL_M0,SIL_nrmse0,T_M0,T_nrmse0,LocationID,SessionID,Lockdown,LocationID_codes
0,CT101,1.666667,2.666667,-0.22978,0.284518,0.158062,-0.134407,0.0603,0.394966,0.005514,...,0.093033,-0.98662,0.097,-0.10406,-0.327324,-0.048898,CamdenTown,CamdenTown1,1,0
1,CT102,1.0,2.0,0.103553,-0.75,0.217614,0.064081,0.204233,0.290698,0.399773,...,-0.558087,-1.371447,-1.077617,-0.100549,-0.277098,-0.049036,CamdenTown,CamdenTown1,1,0
2,CT103,1.0,3.0,0.25,0.75,-0.378574,-0.442926,-0.408919,-0.289194,-0.17229,...,-0.546989,0.054116,-0.240687,-0.104391,-0.343367,-0.048971,CamdenTown,CamdenTown1,1,0
3,CT104,1.0,2.0,0.073223,0.676777,-0.000746,-0.127934,-0.080753,0.026017,0.269899,...,0.074412,-1.10999,-1.444458,-0.087578,0.727215,-0.049255,CamdenTown,CamdenTown1,1,0
4,CT107,2.0,4.0,0.073223,0.366117,0.667568,0.269042,0.466189,0.731832,0.965651,...,-0.722239,-0.523214,0.42935,-0.104047,-0.918465,-0.031732,CamdenTown,CamdenTown1,1,0


In [84]:
# Rows labelled Lockdown == 2 make up the during-lockdown dataset.
is_lockdown = compressData["Lockdown"] == 2
lockdownData = compressData[is_lockdown]
print(lockdownData.shape)
lockdownData.head()

(481, 107)


Unnamed: 0,GroupID,Natural,overall,Pleasant,Eventful,FS,FS_5,FS_10,FS_50,FS_90,...,S_M2,S_nrmse2,SIL_M0,SIL_nrmse0,T_M0,T_nrmse0,LocationID,SessionID,Lockdown,LocationID_codes
57,CT501,,,,,-0.503635,-0.5508,-0.526943,-0.408701,-0.286702,...,-0.11911,0.659046,-0.425823,-0.104331,1.205492,-0.049256,CamdenTown,CamdenTown5,2,0
58,CT502,,,,,-0.53275,-0.52491,-0.475128,-0.513771,-0.39493,...,-1.111193,-0.805798,1.504482,-0.105003,2.105115,-0.049257,CamdenTown,CamdenTown5,2,0
59,CT503,,,,,-0.047064,0.497735,0.169689,-0.350953,-0.275879,...,0.623476,-1.850262,1.438965,-0.104789,-0.210116,-0.049109,CamdenTown,CamdenTown5,2,0
60,CT504,,,,,0.237465,0.051137,0.169689,0.394966,-0.003763,...,0.262401,-1.017468,-0.046886,-0.101261,1.687615,-0.049257,CamdenTown,CamdenTown5,2,0
61,CT505,,,,,-0.212489,-0.205603,-0.187263,-0.136802,-0.203212,...,-1.38914,-0.968412,-0.72131,-0.100731,-0.151282,-0.049207,CamdenTown,CamdenTown5,2,0


## `overall` model
### Partial correlation filtering

In [133]:
import pingouin as pg

def max_pcor(feature_list, target_feature, covar, data):
    """Pick the feature with the largest absolute partial correlation.

    For every name in ``feature_list`` the partial correlation with
    ``target_feature`` is computed with ``pg.partial_corr``, passing ``covar``
    as the covariate(s), and the ``r`` value of the result is collected.

    Parameters
    ----------
    feature_list : iterable of str
        Candidate column names in ``data``. Must contain at least one entry.
    target_feature : str
        Column name of the variable the candidates are correlated with.
    covar : str or list of str
        Passed unchanged as the ``covar`` argument of ``pg.partial_corr``.
    data : pandas.DataFrame
        Frame containing all of the columns named above.

    Returns
    -------
    tuple
        ``(max_feature, max_val)``: the name of the feature whose ``r`` has
        the largest absolute value, and that (signed) ``r``.

    Raises
    ------
    ValueError
        If ``feature_list`` is empty.
    """
    feature_list = list(feature_list)
    if not feature_list:
        raise ValueError("feature_list must contain at least one feature")

    # Collect one r value per feature in a Series. Unlike building a DataFrame
    # and calling .T.squeeze(), this stays a Series for a single-feature list,
    # so .idxmax() below keeps working.
    cors = pd.Series(
        {
            feature: pg.partial_corr(
                data, x=feature, y=target_feature, covar=covar
            )["r"].iloc[0]
            for feature in feature_list
        },
        dtype=float,
    )

    # Rank by magnitude, but report the signed coefficient.
    max_feature = cors.abs().idxmax()
    max_val = cors.loc[max_feature]

    return max_feature, max_val

# Candidate predictors, grouped per metric into level statistics ("*_stats")
# and variation measures ("*_variation"), plus a group of frequency features.
FS_stats = [
    "FS", "FS_5", "FS_10", "FS_50", "FS_90", "FS_95", "FS_Min", "FS_Max",
]
FS_variation = ["FS_5-FS_95", "FS_10-FS_90", "FS_Max-FS_Min"]
LAeq_stats = [
    "LAeq", "LAeq_5", "LAeq_10", "LAeq_50", "LAeq_90", "LAeq_95", "LAeq_Max", "LAeq_Min",
]
LAeq_variation = ["LAeq_10-LAeq_90", "LAeq_5-LAeq_95", "LAeq_Max-LAeq_Min"]
N_stats = [
    "N_5", "N_10", "N_50", "N_90", "N_95", "N_Min", "N_Max",
]
N_variation = ["N_5-N_95", "N_10-N_90", "N_Max-N_Min"]
R_stats = [
    "R", "R_5", "R_10", "R_50", "R_90", "R_95", "R_Min", "R_Max",
]
R_variation = ["R_5-R_95", "R_10-R_90", "R_Max-R_Min"]
S_stats = [
    "S", "S_5", "S_10", "S_50", "S_90", "S_95", "S_Min", "S_Max",
]
S_variation = ["S_5-S_95", "S_10-S_90", "S_Max-S_Min"]
SIL_stats = [
    "SIL_5", "SIL_10", "SIL_50", "SIL_90", "SIL_95", "SIL_Min", "SIL_Max",
]
SIL_variation = ["SIL_5-SIL_95", "SIL_10-SIL_90", "SIL_Max-SIL_Min"]
T_stats = [
    "T_5", "T_10", "T_50", "T_90", "T_95", "T_Max",
]
T_variation = ["T_5-T_95", "T_10-T_90"]
frequency_features = [
    "SpectralCentroid", "MaxFreq", "PeakSpectralCentroid", "PeakMaxFreq",
]

# One entry per feature group; a single representative is picked from each.
all_features = [
    FS_stats, FS_variation,
    LAeq_stats, LAeq_variation,
    N_stats, N_variation,
    R_stats, R_variation,
    S_stats, S_variation,
    SIL_stats, SIL_variation,
    T_stats, T_variation,
    frequency_features,
]

# For each group, keep the feature with the strongest absolute partial
# correlation with `overall` in the pre-lockdown data, mapped to its r value.
# NOTE(review): the covariate is the integer LocationID code, which is treated
# as a numeric variable here - confirm this is the intended way to control for
# location.
overall_features = dict(
    max_pcor(feature_group, "overall", ["LocationID_codes"], prelockdownData)
    for feature_group in all_features
)
overall_features

{'FS_90': -0.1558819152084901,
 'FS_Max-FS_Min': -0.030898698219446605,
 'LAeq_10': -0.35610804837366805,
 'LAeq_10-LAeq_90': -0.19280570733641145,
 'N_5': -0.33848420675283586,
 'N_10-N_90': -0.35104200723741347,
 'R_95': -0.4375346924391178,
 'R_10-R_90': -0.15844501931370966,
 'S_Max': -0.12412961495139185,
 'S_10-S_90': -0.22351738003765428,
 'SIL_5': -0.35221911874401,
 'SIL_5-SIL_95': -0.23612992298568908,
 'T_50': -0.23492286716061253,
 'T_10-T_90': -0.2072290531418789,
 'PeakSpectralCentroid': -0.07668712957767493}

In [123]:
# FS statistic with the strongest absolute partial correlation with `overall`.
# The original cell read `FS_stats_corr`, which no cell in the notebook
# defines, so it failed on a fresh kernel. The value is recomputed here from
# names that are defined in earlier cells.
max_pcor(FS_stats, "overall", ["LocationID_codes"], prelockdownData)[0]


'FS_90'

dtype('int8')