## Setup

In [None]:
import pandas as pd 
from matplotlib import pyplot as plt 
import numpy as np 
import matplotlib
from statsmodels.tsa.stattools import acf, adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose

# Default to a wide, short canvas for every figure in this notebook.
plt.rcParams.update({'figure.figsize': [16, 4]})

# Seed NumPy's global random generator so reruns are repeatable.
np.random.seed(1000)

## Config

In [None]:
# Relative path of the directory the CSV inputs are read from.
path_data = "../data"

## Fetch dataset

In [None]:
# Raw training data; later cells work on a copy of this frame.
df_ref = pd.read_csv(filepath_or_buffer=f'{path_data}/train.csv')
df_ref

In [None]:
# Work on a copy so the raw frame stays available unchanged.
df = df_ref.copy()
df['datetime'] = pd.to_datetime(df['datetime'])

# Index by (timestamp, site) and sort so slices by either level are cheap.
df = df.set_index(['datetime', 'site_id']).sort_index()

# Every remaining column is treated as a sensor; sites come from the index.
sensors = df.columns
sites = df.index.get_level_values('site_id').unique()

df.head()

In [None]:
# Load the nox forecast, parse its timestamps and index it chronologically.
df_forecast = (
    pd.read_csv(f'{path_data}/nox_forecast.csv')
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .set_index('datetime')
    .sort_index()
)
df_forecast.head()

## Visualize sensors and sites

In [None]:
# One figure per (sensor, site) pair showing the raw series.
for sensor in sensors:
    for loc in sites:
        site_series = df.loc[pd.IndexSlice[:, loc], sensor]
        site_series.plot(title=f"{sensor} at {loc}")
        plt.show()

## 1) Data clean-up - Handle extreme values

The monitoring-station vendor states that `pm25` values range from `0` to `125`.

In [None]:
# Print the observed value range of every sensor at every site.
for sensor in sensors:
    for loc in sites:
        readings = df.loc[pd.IndexSlice[:, loc], sensor]
        my_min, my_max = readings.min(), readings.max()
        print(f"{sensor} at {loc}: Min: {my_min} Max: {my_max}")

In [None]:
# Rows whose pm25 reading exceeds this cut-off are discarded below.
# NOTE(review): the text above this cell quotes a vendor range of 0-125,
# while the cut-off used here is 165 — confirm which value is intended.
upper_threshold = 165


def _show_pm25_against_threshold(frame):
    """Plot ``frame.pm25`` with the cut-off drawn as a dashed red line."""
    frame.pm25.plot()
    plt.axhline(upper_threshold, color='red', linestyle='--', label='Upper Threshold')
    # Without a legend the 'Upper Threshold' label is never displayed.
    plt.legend()
    plt.show()


# Before filtering.
_show_pm25_against_threshold(df)

# Keep only rows at or below the cut-off. The comparison is False for NaN,
# so rows with a missing pm25 value are removed as well (all columns).
df = df[df['pm25'] <= upper_threshold]

# After filtering.
_show_pm25_against_threshold(df)

## 2a) Visualize sensors and sites - Daily view

In [None]:
# Daily mean per site. Level 0 of the index is the timestamp; 'D' is the
# canonical calendar-day alias (the lowercase spelling is deprecated in
# recent pandas releases).
df_freq_d = df.groupby('site_id').resample('D', level=0).mean()

# One figure per sensor, with one line per site.
for sensor in sensors:
    for loc in sites:
        df_freq_d.loc[pd.IndexSlice[loc, :], sensor].plot(title=f"Daily Average {sensor}", label=loc)

    # Draw the legend once, after all site lines have been added.
    plt.legend()
    plt.show()

## 2b) Spatial aggregation

In [None]:
# Average across sites: one row per timestamp.
df_spatial = df.groupby(level='datetime').mean()

for sensor in sensors:
    spatial_series = df_spatial[sensor]
    spatial_series.plot(title=f"Spatial Average {sensor}")
    plt.show()

## 2c) Spatial aggregation - Daily view

In [None]:
# Average the per-site daily means across sites: one row per day.
df_spatial_d = df_freq_d.groupby('datetime').mean()

for sensor in sensors:
    df_spatial_d[sensor].plot(title=f"Spatial & Daily Average {sensor}", label=sensor)
    if sensor == 'nox':
        # Overlay the forecast, averaged to the same daily resolution.
        # 'D' is the canonical calendar-day alias (lowercase is deprecated
        # in recent pandas releases).
        plt.plot(df_forecast.resample('D').mean(), label='forecast')

    plt.legend()
    plt.show()

## `nox` as exogenous candidate

In [None]:
sensor1, sensor2 = 'pm25', 'nox'

# Standardise both series (zero mean, unit std) so they share one scale.
sensor1_norm = df_spatial[sensor1].sub(df_spatial[sensor1].mean()).div(df_spatial[sensor1].std())
sensor2_norm = df_spatial[sensor2].sub(df_spatial[sensor2].mean()).div(df_spatial[sensor2].std())

plt.title(f'{sensor1} vs {sensor2}')
# Only the period from 1 December 2023 onwards is shown.
for name, series in ((sensor1, sensor1_norm), (sensor2, sensor2_norm)):
    series.loc['2023-12-01':].plot(label=name)
plt.legend()
plt.show()

In [None]:
sensor1, sensor2 = 'pm25', 'nox'

# Standardise the daily series (zero mean, unit std) so they share one scale.
sensor1_norm = df_spatial_d[sensor1].sub(df_spatial_d[sensor1].mean()).div(df_spatial_d[sensor1].std())
sensor2_norm = df_spatial_d[sensor2].sub(df_spatial_d[sensor2].mean()).div(df_spatial_d[sensor2].std())

plt.title(f'{sensor1} vs {sensor2}')
for name, series in ((sensor1, sensor1_norm), (sensor2, sensor2_norm)):
    series.plot(label=name)
plt.legend()
plt.show()

## 2d) Spatial aggregation - Time of day

In [None]:
# Tag every row with its hour of day (adds an 'hour' column to df_spatial).
df_spatial['hour'] = df_spatial.index.hour

for sensor in sensors:
    # Mean reading per hour of day, drawn as a bar chart.
    hourly_avg = df_spatial.groupby('hour')[sensor].mean()
    hourly_avg.plot.bar()
    plt.xlabel('Hour of Day')
    plt.ylabel(sensor)
    plt.title(f'Spatial Average {sensor} by Time of Day')
    plt.xticks(range(0, 24))
    plt.show()

## 2e) Spatial aggregation - Day of week

In [None]:
for sensor in sensors:
    # Mean of the daily values per weekday (0 = Monday ... 6 = Sunday).
    by_weekday = df_spatial_d.groupby(df_spatial_d.index.dayofweek)
    weekday_avg = by_weekday[sensor].mean()
    # Replace the numeric weekday index with readable labels.
    weekday_avg.index = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    weekday_avg.plot.bar()
    plt.xlabel('Day of week')
    plt.ylabel(sensor)
    plt.title(f'Average {sensor} by Day of week')
    plt.show()

## 3a) Autocorrelation

In [None]:
# Lags 1..47 on the spatially averaged series (lag 0 is skipped).
lags = range(1, 48)
for sensor in sensors:

    data = df_spatial[sensor].dropna()

    # Same figure layout for the ACF and then the PACF.
    for draw, label in ((plot_acf, 'Autocorrelation'), (plot_pacf, 'Partial Autocorrelation')):
        draw(data, lags=lags)
        plt.xlabel('Lag (h)')
        plt.ylabel(label)
        plt.title(f'{label} of {sensor}')
        plt.ylim([-0.25, 1])
        plt.show()

In [None]:
# Lags 1..29 on the daily, spatially averaged series (lag 0 is skipped).
lags = range(1, 30)
for sensor in sensors:

    data = df_spatial_d[sensor].dropna()

    # Same figure layout for the ACF and then the PACF.
    for draw, label in ((plot_acf, 'Autocorrelation'), (plot_pacf, 'Partial Autocorrelation')):
        draw(data, lags=lags)
        plt.xlabel('Lag (d)')
        plt.ylabel(label)
        plt.title(f'{label} of {sensor}')
        plt.ylim([-0.75, 0.75])
        plt.show()

## 3b) Cross-Correlation

In [None]:
# The two sensors compared by the cross-correlation cells that follow.
sensor1, sensor2 = 'pm25', 'nox'

In [None]:
lags = range(-47, 48)

# Correlate sensor1 with sensor2 shifted by each lag. The shift is applied to
# the full series and Series.corr then ignores incomplete pairs on its own.
# Dropping NaNs *before* shifting would move values by row position across
# any gap, so the lag would no longer be a fixed offset in the original index.
# NOTE(review): this still assumes df_spatial has evenly spaced hourly rows.
cross_corr_values = [
    df_spatial[sensor1].corr(df_spatial[sensor2].shift(lag))
    for lag in lags
]

plt.bar(lags, cross_corr_values)
plt.xlabel('Lag (h)')
plt.ylabel('Cross-Correlation')
plt.title(f'Cross-Correlation {sensor1} - {sensor2}')
plt.show()

# NOTE(review): the two plots below are computed from the list of
# cross-correlation values themselves and are given a lag range that includes
# negative values — confirm this is the intended analysis.
plot_acf(cross_corr_values, lags=lags)
plt.xlabel('Lag (h)')
plt.ylabel('Cross-Correlation')
plt.title(f'Cross-Correlation {sensor1} - {sensor2}')
plt.show()

plot_pacf(cross_corr_values, lags=lags)
plt.xlabel('Lag (h)')
plt.ylabel('Partial Cross-Correlation')
plt.title(f'Partial Cross-Correlation {sensor1} - {sensor2}')
plt.show()

In [None]:
lags = range(-29, 30)

# Correlate sensor1 with sensor2 shifted by each lag. The shift is applied to
# the full series and Series.corr then ignores incomplete pairs on its own.
# Dropping NaNs *before* shifting would move values by row position across
# any gap, so the lag would no longer be a fixed offset in the original index.
# NOTE(review): this still assumes df_spatial_d has one row per calendar day.
cross_corr_values = [
    df_spatial_d[sensor1].corr(df_spatial_d[sensor2].shift(lag))
    for lag in lags
]

plt.bar(lags, cross_corr_values)
plt.xlabel('Lag (d)')
plt.ylabel('Cross-Correlation')
plt.title(f'Cross-Correlation {sensor1} - {sensor2}')
plt.show()

# NOTE(review): the two plots below are computed from the list of
# cross-correlation values themselves and are given a lag range that includes
# negative values — confirm this is the intended analysis.
plot_acf(cross_corr_values, lags=lags)
plt.xlabel('Lag (d)')
plt.ylabel('Cross-Correlation')
plt.title(f'Cross-Correlation {sensor1} - {sensor2}')
plt.show()

plot_pacf(cross_corr_values, lags=lags)
# Daily data: the axis unit is days (this label previously said hours).
plt.xlabel('Lag (d)')
plt.ylabel('Partial Cross-Correlation')
plt.title(f'Partial Cross-Correlation {sensor1} - {sensor2}')
plt.show()

## Stationarity - Visual estimation

In [None]:
# Window length (in rows of the daily frame) for the rolling statistics.
days = 7
for sensor in sensors:
    rolling_mean = df_spatial_d[sensor].rolling(window=days).mean()
    rolling_mean.plot()
    plt.title(f'Rolling Average {days}d {sensor}')
    plt.show()

In [None]:
# Rolling standard deviation over the same `days` window as the rolling mean.
for sensor in sensors:
    rolling_std = df_spatial_d[sensor].rolling(window=days).std()
    rolling_std.plot()
    plt.title(f'Rolling Standard Deviation {days}d {sensor}')
    plt.show()

## Seasonal decomposition - `multiplicative`

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Multiplicative decomposition of each daily, spatially averaged series.
for sensor in sensors:
    daily_series = df_spatial_d[sensor]
    decomposition = seasonal_decompose(daily_series, model='multiplicative')

    fig = decomposition.plot()
    first_ax, fourth_ax = fig.axes[0], fig.axes[3]
    first_ax.set_ylabel('Original')
    # Shrink the markers of the fourth panel and label the shared time axis.
    fourth_ax.lines[0].set_markersize(3)
    fourth_ax.set_xlabel('Time')

## Seasonal decomposition - `additive`

In [None]:
# Additive decomposition of each daily, spatially averaged series.
for sensor in sensors:
    daily_series = df_spatial_d[sensor]
    decomposition = seasonal_decompose(daily_series, model='additive')

    fig = decomposition.plot()
    first_ax, fourth_ax = fig.axes[0], fig.axes[3]
    first_ax.set_ylabel('Original')
    # Shrink the markers of the fourth panel and label the shared time axis.
    fourth_ax.lines[0].set_markersize(3)
    fourth_ax.set_xlabel('Time')

## Weather seasonality - split 1st November

In [None]:
# First day that belongs to the "winter" subset.
day = '2023-11-01'

# Label-based slices include both endpoints, so `.loc[:day]` and `.loc[day:]`
# would both contain 1 November. Take winter from `day` onwards and define
# summer as every remaining row, so each day lands in exactly one subset.
df_spatial_d_winter = df_spatial_d.loc[day:]
df_spatial_d_summer = df_spatial_d.drop(index=df_spatial_d_winter.index)

In [None]:
# Additive decomposition restricted to the summer subset.
for sensor in sensors:
    summer_series = df_spatial_d_summer[sensor]
    decomposition = seasonal_decompose(summer_series, model='additive')

    fig = decomposition.plot()
    first_ax, fourth_ax = fig.axes[0], fig.axes[3]
    first_ax.set_title(f'Summer {sensor}')
    first_ax.set_ylabel('Original')
    # Shrink the markers of the fourth panel and label the shared time axis.
    fourth_ax.lines[0].set_markersize(3)
    fourth_ax.set_xlabel('Time')

In [None]:
# Additive decomposition restricted to the winter subset.
for sensor in sensors:
    winter_series = df_spatial_d_winter[sensor]
    decomposition = seasonal_decompose(winter_series, model='additive')

    fig = decomposition.plot()
    first_ax, fourth_ax = fig.axes[0], fig.axes[3]
    first_ax.set_title(f'Winter {sensor}')
    first_ax.set_ylabel('Original')
    # Shrink the markers of the fourth panel and label the shared time axis.
    fourth_ax.lines[0].set_markersize(3)
    fourth_ax.set_xlabel('Time')

In [None]:
# Augmented Dickey-Fuller test on each spatially averaged series.
for sensor in sensors:

    series = df_spatial[sensor].dropna()
    adfstat, pvalue, usedlag, nobs, critvalues, icbest = adfuller(series)

    # Sensor name, test statistic, p-value, then a blank separator line.
    report = [str(sensor), 'Statistic: %0.02f' % adfstat, 'pvalue:    %0.03f' % pvalue + ' \n']
    print('\n'.join(report))

In [None]:
# Augmented Dickey-Fuller test on each daily, spatially averaged series.
for sensor in sensors:

    series = df_spatial_d[sensor].dropna()
    adfstat, pvalue, usedlag, nobs, critvalues, icbest = adfuller(series)

    # Sensor name, test statistic, p-value, then a blank separator line.
    report = [str(sensor), 'Statistic: %0.02f' % adfstat, 'pvalue:    %0.03f' % pvalue + ' \n']
    print('\n'.join(report))

In [None]:
# Augmented Dickey-Fuller test on each daily series of the summer subset.
for sensor in sensors:

    series = df_spatial_d_summer[sensor].dropna()
    adfstat, pvalue, usedlag, nobs, critvalues, icbest = adfuller(series)

    # Sensor name, test statistic, p-value, then a blank separator line.
    report = [str(sensor), 'Statistic: %0.02f' % adfstat, 'pvalue:    %0.03f' % pvalue + ' \n']
    print('\n'.join(report))

In [None]:
# Augmented Dickey-Fuller test on each daily series of the winter subset.
for sensor in sensors:

    series = df_spatial_d_winter[sensor].dropna()
    adfstat, pvalue, usedlag, nobs, critvalues, icbest = adfuller(series)

    # Sensor name, test statistic, p-value, then a blank separator line.
    report = [str(sensor), 'Statistic: %0.02f' % adfstat, 'pvalue:    %0.03f' % pvalue + ' \n']
    print('\n'.join(report))