[Chapter 4] Setting a Strong Baseline Forecast

1. setting up a test harness

In [None]:
# A test harness is a collection of code and inputs configured to test a program under various conditions.

import numpy as np
import pandas as pd

from pathlib import Path
from tqdm.auto import tqdm

import random
# Seed the stdlib and NumPy random generators so the run is repeatable.
random.seed(42)
np.random.seed(42)
# Enable the tqdm-backed `progress_apply` methods on pandas objects (used further down).
tqdm.pandas()

In [16]:
# Load the household lookup table and draw a stratified sample (data stays in compact form)

# lookup table; the columns used here are LCLid, Acorn_grouped and file
lclid_acorn_map = pd.read_pickle("./data/london_smart_meters/preprocessed/london_smart_meters_lclid_acorn_map.pkl")

# stratification based on ACORN classification: one frame per group,
# keeping only the household id and the block file name
affluent_households = lclid_acorn_map.loc[
    lclid_acorn_map["Acorn_grouped"].eq("Affluent"), ["LCLid", "file"]
]
adversity_households = lclid_acorn_map.loc[
    lclid_acorn_map["Acorn_grouped"].eq("Adversity"), ["LCLid", "file"]
]
comfortable_households = lclid_acorn_map.loc[
    lclid_acorn_map["Acorn_grouped"].eq("Comfortable"), ["LCLid", "file"]
]

# sampling: randomly select 50 households for each classification (150 in total) given RAM constraints
# <= 50 for 4GB RAM, 50-100 for 8GB RAM, 100-150 for 16GB RAM, 250 for 32GB
selected_households = pd.concat(
    [
        group.sample(50, random_state=76)
        for group in (affluent_households, comfortable_households, adversity_households)
    ]
)

# the block number is the second "_"-separated token of the file name (e.g. "block_1" -> 1)
file_parts = selected_households["file"].str.split("_", expand=True)
selected_households["block"] = file_parts.iloc[:, 1].astype(int)

# Collect the merged block files together with the first and last block number each one covers.
# The parsing expects names shaped like "london_smart_meters_merged_block_<start>-<end>.<ext>":
# the 6th "_"-separated token, minus the extension, split on "-".
# Glob order is filesystem-dependent, so sort by (start, end) to keep the row order of the
# concatenated frame reproducible across machines.
path_blocks = sorted(
    (
        (p, *map(int, p.name.split("_")[5].split(".")[0].split("-")))
        for p in Path("data/london_smart_meters/preprocessed").glob(
            "london_smart_meters_merged_block*"
        )
    ),
    key=lambda entry: entry[1:],
)

# Read every block file and keep only the rows of the sampled households stored in it.
household_df_l = []
for path, start_b, end_b in tqdm(path_blocks):
    # separate name for the per-file frame so it does not shadow the final `block_df`
    file_df = pd.read_parquet(path, engine='fastparquet')
    # households whose block number falls inside this file's range (`between` is inclusive by default)
    mask = selected_households['block'].between(start_b, end_b)
    lclids = selected_households.loc[mask, "LCLid"]
    household_df_l.append(file_df.loc[file_df.LCLid.isin(lclids)])

block_df = pd.concat(household_df_l)
# drop the per-file pieces; only the concatenated frame is needed from here on
del household_df_l
block_df.head(2)

  0%|          | 0/14 [00:00<?, ?it/s]

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
57,MAC000768,2012-04-21,30min,"[0.8440000000000001, 0.265, 0.262, 0.233999999...",32544,Std,ACORN-A,Affluent,block_1,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[251, 251, 251, 251, 246, 246, 242, 242, 244, ...","[6.42, 6.42, 6.2, 6.2, 5.68, 5.68, 5.16, 5.16,...","[3.54, 3.54, 3.61, 3.61, 3.52, 3.52, 3.11, 3.1...","[994.96, 994.96, 994.98, 994.98, 994.82, 994.8...","[3.79, 3.79, 3.67, 3.67, 3.15, 3.15, 2.61, 2.6...","[3.64, 3.64, 3.42, 3.42, 3.25, 3.25, 3.13, 3.1...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, par...","[0.82, 0.82, 0.83, 0.83, 0.86, 0.86, 0.87, 0.8...","[Partly Cloudy, Partly Cloudy, Partly Cloudy, ..."
63,MAC000948,2012-05-02,30min,"[0.008, 0.009, 0.008, 0.008, 0.008, 0.009, 0.0...",32016,Std,ACORN-A,Affluent,block_1,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[351, 351, 0, 0, 0, 0, 351, 351, 348, 348, 3, ...","[11.81, 11.81, 11.12, 11.12, 11.2, 11.2, 11.18...","[10.47, 10.47, 10.15, 10.15, 9.89, 9.89, 9.29,...","[1021.42, 1021.42, 1021.44, 1021.44, 1021.33, ...","[11.81, 11.81, 11.12, 11.12, 11.2, 11.2, 11.18...","[2.53, 2.53, 2.41, 2.41, 2.06, 2.06, 2.98, 2.9...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, par...","[0.91, 0.91, 0.94, 0.94, 0.92, 0.92, 0.88, 0.8...","[Mostly Cloudy, Mostly Cloudy, Mostly Cloudy, ..."


In [17]:
# convert from compact form (one row per household, series stored as arrays)
# to expanded form (one row per household and timestamp)

# `compact_to_expanded` is not imported or defined anywhere else in this notebook, so this
# cell only ran thanks to leftover kernel state and fails after a kernel restart.
# NOTE(review): imported from the same module as `reduce_memory_footprint` — confirm that
# this is where the helper lives.
from src.utils.data_utils import compact_to_expanded

exp_block_df = compact_to_expanded(
    block_df,
    timeseries_col="energy_consumption",
    # per-household attributes
    static_cols=["frequency", "series_length", "stdorToU", "Acorn", "Acorn_grouped", "file"],
    # columns that hold one value per time step
    time_varying_cols=[
        "holidays", "visibility", "windBearing", "temperature", "dewPoint",
        "pressure", "apparentTemperature", "windSpeed", "precipType", "icon",
        "humidity", "summary",
    ],
    ts_identifier="LCLid",
)

exp_block_df.head(2)

  0%|          | 0/150 [00:00<?, ?it/s]

Unnamed: 0,timestamp,LCLid,energy_consumption,frequency,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
0,2012-04-21 00:00:00,MAC000768,0.844,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251,6.42,3.54,994.96,3.79,3.64,rain,partly-cloudy-night,0.82,Partly Cloudy
1,2012-04-21 00:30:00,MAC000768,0.265,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251,6.42,3.54,994.96,3.79,3.64,rain,partly-cloudy-night,0.82,Partly Cloudy


In [None]:
# shrink the memory footprint of the expanded frame

from src.utils.data_utils import reduce_memory_footprint

# print the deep memory usage before and after the conversion
# (recorded run: 3.3 GB -> 301.1 MB)
exp_block_df.info(verbose=False, memory_usage="deep")
exp_block_df = reduce_memory_footprint(exp_block_df)
exp_block_df.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 4711440 entries, 0 to 33263
Columns: 21 entries, timestamp to summary
dtypes: datetime64[ns](1), float64(8), int64(2), object(10)
memory usage: 3.3 GB
<class 'pandas.core.frame.DataFrame'>
Index: 4711440 entries, 0 to 33263
Columns: 21 entries, timestamp to summary
dtypes: category(10), datetime64[ns](1), float32(8), int32(2)
memory usage: 301.1 MB


In [19]:
# train / validation / test split (chronological hold-out)

# validation data: used in the modeling process to assess the quality of the model
#                  (to select between different models, tune the hyperparameters, perform feature selection)
# test data: final test of your chosen model to measure how well the model is doing on unseen data

# hold-out months: January 2014 -> validation, February 2014 -> test
test_mask = (exp_block_df.timestamp.dt.year==2014) & (exp_block_df.timestamp.dt.month==2)
val_mask = (exp_block_df.timestamp.dt.year==2014) & (exp_block_df.timestamp.dt.month==1)

# everything outside the two hold-out months goes to train
train = exp_block_df[~(val_mask|test_mask)]
val = exp_block_df[val_mask]
test = exp_block_df[test_mask]

# Leakage guard: train is defined as the complement of the hold-out months, so rows dated
# after February 2014 would silently land in train. Fail loudly instead (also trips when
# a split is empty, because comparisons against NaT are False).
assert train.timestamp.max() < val.timestamp.min() < test.timestamp.min(), (
    "train/val/test are not in chronological order (or one of the splits is empty)"
)

print(f"# of Training samples: {len(train)} | # of Validation samples: {len(val)} | # of Test samples: {len(test)}")
print(f"Max Date in Train: {train.timestamp.max()} | Min Date in Validation: {val.timestamp.min()} | Min Date in Test: {test.timestamp.min()}")

# persist the three splits (no imputation applied) as parquet files
for split_df, out_path in (
    (train, "data/london_smart_meters/preprocessed/selected_blocks_train.parquet"),
    (val, "data/london_smart_meters/preprocessed/selected_blocks_val.parquet"),
    (test, "data/london_smart_meters/preprocessed/selected_blocks_test.parquet"),
):
    split_df.to_parquet(out_path)

# of Training samples: 4293840 | # of Validation samples: 223200 | # of Test samples: 194400
Max Date in Train: 2013-12-31 23:30:00 | Min Date in Validation: 2014-01-01 00:00:00 | Min Date in Test: 2014-02-01 00:00:00


In [20]:
# train test split after filling in missing values

from src.imputation.interpolation import SeasonalInterpolation


def _impute_seasonally(series):
    """Run a fresh SeasonalInterpolation over one household's consumption array."""
    # the transformer is fed a single-column 2-D array
    as_column = np.array(series).reshape(-1, 1)
    # 48 * 7 half-hourly readings = one week
    imputer = SeasonalInterpolation(seasonal_period=48*7)
    # squeeze the single-column result back to 1-D
    return imputer.fit_transform(as_column).squeeze()


# overwrite the compact series in place, household by household (with a progress bar)
block_df["energy_consumption"] = block_df["energy_consumption"].progress_apply(_impute_seasonally)

# convert to expanded form
exp_block_df = compact_to_expanded(
    block_df,
    timeseries_col="energy_consumption",
    static_cols=["frequency", "series_length", "stdorToU", "Acorn", "Acorn_grouped", "file"],
    time_varying_cols=[
        "holidays", "visibility", "windBearing", "temperature", "dewPoint",
        "pressure", "apparentTemperature", "windSpeed", "precipType", "icon",
        "humidity", "summary",
    ],
    ts_identifier="LCLid",
)

exp_block_df.head(2)

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

Unnamed: 0,timestamp,LCLid,energy_consumption,frequency,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
0,2012-04-21 00:00:00,MAC000768,0.844,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251,6.42,3.54,994.96,3.79,3.64,rain,partly-cloudy-night,0.82,Partly Cloudy
1,2012-04-21 00:30:00,MAC000768,0.265,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251,6.42,3.54,994.96,3.79,3.64,rain,partly-cloudy-night,0.82,Partly Cloudy


In [21]:
# shrink the memory footprint of the imputed, expanded frame

# print the deep memory usage before and after the conversion
exp_block_df.info(verbose=False, memory_usage="deep")
exp_block_df = reduce_memory_footprint(exp_block_df)
exp_block_df.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 4711440 entries, 0 to 33263
Columns: 21 entries, timestamp to summary
dtypes: datetime64[ns](1), float64(8), int64(2), object(10)
memory usage: 3.3 GB
<class 'pandas.core.frame.DataFrame'>
Index: 4711440 entries, 0 to 33263
Columns: 21 entries, timestamp to summary
dtypes: category(10), datetime64[ns](1), float32(8), int32(2)
memory usage: 301.1 MB


In [22]:
# train / validation / test split of the imputed data (chronological hold-out)

# hold-out months: January 2014 -> validation, February 2014 -> test
test_mask = (exp_block_df.timestamp.dt.year==2014) & (exp_block_df.timestamp.dt.month==2)
val_mask = (exp_block_df.timestamp.dt.year==2014) & (exp_block_df.timestamp.dt.month==1)

# everything outside the two hold-out months goes to train
train = exp_block_df[~(val_mask|test_mask)]
val = exp_block_df[val_mask]
test = exp_block_df[test_mask]

# Leakage guard: train is defined as the complement of the hold-out months, so rows dated
# after February 2014 would silently land in train. Fail loudly instead (also trips when
# a split is empty, because comparisons against NaT are False).
assert train.timestamp.max() < val.timestamp.min() < test.timestamp.min(), (
    "train/val/test are not in chronological order (or one of the splits is empty)"
)

print(f"# of Training samples: {len(train)} | # of Validation samples: {len(val)} | # of Test samples: {len(test)}")
print(f"Max Date in Train: {train.timestamp.max()} | Min Date in Validation: {val.timestamp.min()} | Min Date in Test: {test.timestamp.min()}")

# of Training samples: 4293840 | # of Validation samples: 223200 | # of Test samples: 194400
Max Date in Train: 2013-12-31 23:30:00 | Min Date in Validation: 2014-01-01 00:00:00 | Min Date in Test: 2014-02-01 00:00:00


In [23]:
# persist the three imputed splits as parquet files
for split_df, out_path in (
    (train, "data/london_smart_meters/preprocessed/selected_blocks_train_missing_imputed.parquet"),
    (val, "data/london_smart_meters/preprocessed/selected_blocks_val_missing_imputed.parquet"),
    (test, "data/london_smart_meters/preprocessed/selected_blocks_test_missing_imputed.parquet"),
):
    split_df.to_parquet(out_path)