In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# tqdm.auto selects the same notebook/console progress bar as tqdm.autonotebook
# but does not emit TqdmExperimentalWarning when imported inside a notebook.
from tqdm.auto import tqdm

from lib.feature_engineering_utils import add_lags, LogTime, add_temporal_features

# Seed NumPy's global random number generator.
np.random.seed(42)
# Register the tqdm progress_* methods on pandas objects.
tqdm.pandas()

  from tqdm.autonotebook import tqdm


In [2]:
# Move the working directory up two levels; the relative project paths defined
# right after this line are resolved against the resulting directory.
# NOTE(review): the target is relative to the current directory, so running this
# cell a second time moves up two more levels — restart the kernel before re-running.
%cd ../..
# Project root, expressed relative to the current working directory.
project_folder = Path("AI/london_smart_meters")

# Raw inputs live under data/; everything generated goes under output/.
source_data = project_folder.joinpath("data")
preprocessed_data = project_folder.joinpath("output", "preprocessed_data")
dst_img = project_folder.joinpath("output", "img")
dst_result = project_folder.joinpath("output", "result")

/home/hien/Work


In [3]:
# Load the three imputed splits (train, validation, test) in one pass.
train_df, val_df, test_df = map(
    pd.read_parquet,
    (
        preprocessed_data / "selected_blocks_train_missing_imputed.parquet",
        preprocessed_data / "selected_blocks_val_missing_imputed.parquet",
        preprocessed_data / "selected_blocks_test_missing_imputed.parquet",
    ),
)

In [4]:
# Tag every row with the split it came from so the combined frame can be
# separated again when the results are written out.
train_df["type"], val_df["type"], test_df["type"] = "train", "val", "test"

# Stack the splits, then order the rows by LCLid and, within each id, by timestamp.
full_df = pd.concat([train_df, val_df, test_df])
full_df = full_df.sort_values(by=["LCLid", "timestamp"])

# Drop the now-redundant per-split names.
del train_df, val_df, test_df

In [5]:
# Inspect the column names of the combined frame before any features are added.
full_df.columns

Index(['timestamp', 'LCLid', 'frequency', 'series_length', 'stdorToU', 'Acorn',
       'Acorn_grouped', 'file', 'holidays', 'visibility', 'windBearing',
       'temperature', 'dewPoint', 'pressure', 'apparentTemperature',
       'windSpeed', 'precipType', 'icon', 'humidity', 'summary', 'hour',
       'weekday', 'day_hourly_profile', 'energy_consumption_imputed', 'type'],
      dtype='object')

In [6]:
# Give the imputed series the plain target name that the lag step refers to.
full_df = full_df.rename(
    {"energy_consumption_imputed": "energy_consumption"},
    axis="columns",
)

# Lag Features

In [7]:
# Three windows of five consecutive lags each, starting at 1, 46 and 334.
# With 30-minute steps, 48 lags span one day and 48 * 7 lags span one week,
# so the second and third windows are centred on "one day ago" and "one week ago".
lags = [
    window_start + offset
    for window_start in (1, 46, 48 * 7 - 2)
    for offset in range(5)
]
lags

[1, 2, 3, 4, 5, 46, 47, 48, 49, 50, 334, 335, 336, 337, 338]

In [8]:
# Add one lagged copy of the target per entry in `lags`; the new columns are
# named energy_consumption_lag_<n> and their names are returned in added_features.
# NOTE(review): ts_id="LCLid" presumably makes add_lags shift within each LCLid
# rather than across id boundaries — confirm in lib.feature_engineering_utils.
# NOTE(review): use_32_bit=True presumably stores the new columns in a 32-bit dtype — confirm.
with LogTime():  # reports how long the wrapped block took ("Time Elapsed: ...")
    full_df, added_features = add_lags(
        full_df, lags=lags, column="energy_consumption", ts_id="LCLid", use_32_bit=True
    )
print(f"Features Created: {','.join(added_features)}")

Time Elapsed: 2 seconds
Features Created: energy_consumption_lag_1,energy_consumption_lag_2,energy_consumption_lag_3,energy_consumption_lag_4,energy_consumption_lag_5,energy_consumption_lag_46,energy_consumption_lag_47,energy_consumption_lag_48,energy_consumption_lag_49,energy_consumption_lag_50,energy_consumption_lag_334,energy_consumption_lag_335,energy_consumption_lag_336,energy_consumption_lag_337,energy_consumption_lag_338


# Temporal Features

In [9]:
# Derive calendar features from the "timestamp" column; the new columns are
# prefixed with "timestamp_" (Month, Quarter, Day, Hour, Minute, Week, ...).
# drop=False keeps the original "timestamp" column in the frame.
# NOTE(review): add_elapsed=True presumably adds timestamp_Elapsed as seconds
# since the Unix epoch — confirm in lib.feature_engineering_utils.
# Note that added_features is rebound here, so the list of lag feature names
# returned by the previous step is no longer available under that name.
with LogTime():  # reports how long the wrapped block took ("Time Elapsed: ...")
    full_df, added_features = add_temporal_features(
        full_df,
        field_name="timestamp",
        frequency="30min",
        add_elapsed=True,
        drop=False,
        use_32_bit=True,
    )
print(f"Features Created: {','.join(added_features)}")

Time Elapsed: 0 microseconds
Features Created: timestamp_Month,timestamp_Quarter,timestamp_Is_quarter_end,timestamp_Is_quarter_start,timestamp_Is_year_end,timestamp_Is_year_start,timestamp_Is_month_start,timestamp_Day,timestamp_Dayofweek,timestamp_Dayofyear,timestamp_Hour,timestamp_Minute,timestamp_Week,timestamp_Elapsed


In [10]:
# Preview the engineered frame; the notebook renders a truncated view
# (first and last rows, with the middle columns elided).
full_df

Unnamed: 0,timestamp,LCLid,frequency,timestamp_Week,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,timestamp_Is_quarter_start,timestamp_Is_year_end,timestamp_Is_year_start,timestamp_Is_month_start,timestamp_Day,timestamp_Dayofweek,timestamp_Dayofyear,timestamp_Hour,timestamp_Minute,timestamp_Elapsed
4437792,2012-01-01 00:00:00,MAC000061,30min,52,37872,Std,ACORN-Q,Adversity,block_96,NO_HOLIDAY,...,1,0,1,1,1,6,1,0,0,1325376000
4437793,2012-01-01 00:30:00,MAC000061,30min,52,37872,Std,ACORN-Q,Adversity,block_96,NO_HOLIDAY,...,1,0,1,1,1,6,1,0,30,1325377800
4437794,2012-01-01 01:00:00,MAC000061,30min,52,37872,Std,ACORN-Q,Adversity,block_96,NO_HOLIDAY,...,1,0,1,1,1,6,1,1,0,1325379600
4437795,2012-01-01 01:30:00,MAC000061,30min,52,37872,Std,ACORN-Q,Adversity,block_96,NO_HOLIDAY,...,1,0,1,1,1,6,1,1,30,1325381400
4437796,2012-01-01 02:00:00,MAC000061,30min,52,37872,Std,ACORN-Q,Adversity,block_96,NO_HOLIDAY,...,1,0,1,1,1,6,1,2,0,1325383200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22027,2014-02-27 21:30:00,MAC005529,30min,9,32688,ToU,ACORN-L,Adversity,block_82,NO_HOLIDAY,...,0,0,0,0,27,3,58,21,30,1393536600
22028,2014-02-27 22:00:00,MAC005529,30min,9,32688,ToU,ACORN-L,Adversity,block_82,NO_HOLIDAY,...,0,0,0,0,27,3,58,22,0,1393538400
22029,2014-02-27 22:30:00,MAC005529,30min,9,32688,ToU,ACORN-L,Adversity,block_82,NO_HOLIDAY,...,0,0,0,0,27,3,58,22,30,1393540200
22030,2014-02-27 23:00:00,MAC005529,30min,9,32688,ToU,ACORN-L,Adversity,block_82,NO_HOLIDAY,...,0,0,0,0,27,3,58,23,0,1393542000


In [11]:
# List all columns after feature engineering (original columns plus the lag
# and timestamp_* features).
full_df.columns

Index(['timestamp', 'LCLid', 'frequency', 'timestamp_Week', 'series_length',
       'stdorToU', 'Acorn', 'Acorn_grouped', 'file', 'holidays', 'visibility',
       'windBearing', 'temperature', 'dewPoint', 'pressure',
       'apparentTemperature', 'windSpeed', 'precipType', 'icon', 'humidity',
       'summary', 'hour', 'weekday', 'day_hourly_profile',
       'energy_consumption', 'type', 'energy_consumption_lag_1',
       'energy_consumption_lag_2', 'energy_consumption_lag_3',
       'energy_consumption_lag_4', 'energy_consumption_lag_5',
       'energy_consumption_lag_46', 'energy_consumption_lag_47',
       'energy_consumption_lag_48', 'energy_consumption_lag_49',
       'energy_consumption_lag_50', 'energy_consumption_lag_334',
       'energy_consumption_lag_335', 'energy_consumption_lag_336',
       'energy_consumption_lag_337', 'energy_consumption_lag_338',
       'timestamp_Month', 'timestamp_Quarter', 'timestamp_Is_quarter_end',
       'timestamp_Is_quarter_start', 'timestamp_Is_

In [12]:
# Write each split back to its own parquet file. The "type" column only exists
# to tell the splits apart inside full_df, so it is dropped before saving.
for split_name, file_name in (
    ("train", "selected_blocks_train_missing_imputed_feature_engg.parquet"),
    ("val", "selected_blocks_val_missing_imputed_feature_engg.parquet"),
    ("test", "selected_blocks_test_missing_imputed_feature_engg.parquet"),
):
    (
        full_df.loc[full_df["type"] == split_name]
        .drop(columns="type")
        .to_parquet(preprocessed_data / file_name)
    )