# From DP to FE
Rewrite all the data cleaning, data preprocessing, and feature generation utilities so that they align with the inference phase, which reduces the risk of errors.


## Ref
https://www.kaggle.com/code/vitalykudelya/enefit-object-oriented-gbdt/notebook

## Import Packages

In [1]:
%load_ext autoreload
%autoreload 2
import json
import re
import math
import pickle
import warnings
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, Tuple, Iterator
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from polars.testing import assert_series_equal
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import GroupKFold
from matplotlib.axes import Axes

from metadata import *
from cv.build import TSCV

pd.options.display.max_columns = None

In [2]:
!cat metadata.py

"""
Project metadata for global access.
Author: JiaWei
"""
import polars as pl

# == Data ==
UNIT_ID_COL = "prediction_unit_id"
TGT_COL = "target"
DBI = "data_block_id"
TGT_PK_COLS = ["county", "is_business", "product_type"]
REVEALED_TGT_COLS = [UNIT_ID_COL, "datetime", TGT_COL, "is_consumption"]
COORD_COL2ABBR = {"latitude": "lat", "longitude": "lon"}
LOC_COLS = ["lat", "lon"]
PRODUCT_TYPE2NAME = {0: "Combined", 1: "Fixed", 2: "General service", 3: "Spot"}

# == Join keys ==
CLI_JOIN_KEYS = ["county", "is_business", "product_type", "date"]
FWTH_JOIN_KEYS = ["county", "datetime"]

HWTH_JOIN_KEYS = ["county", "datetime"]
REVEALED_TGT_JOIN_KEYS = [UNIT_ID_COL, "datetime", "is_consumption"]
REVEALED_TGT_ROLLING_JOIN_KEYS = [UNIT_ID_COL, "datetime", "is_consumption"]

# == Groupby keys ==
FWTH_LGP_KEYS = ["county", "datetime"]
FWTH_GGP_KEYS = ["datetime"]
HWTH_LGP_KEYS = ["county", "datetime"]
HWTH_GGP_KEYS = ["datetime"]
REVEALED_TGT_ROLLING_GP_KEYS = [UNIT_ID_COL, "is_consumption"]

# ==

## Define Local Data Paths

In [3]:
# Directory of the raw CSV tables and directory of processed/derived files
RAW_DATA_PATH = Path("./data/raw/")
PROC_DATA_PATH = Path("./data/processed/")

## Define Utilities

In [8]:
def _reduce_memory_usage(
    df: pl.DataFrame,
    cols_to_skip: Optional[List[str]] = None,
    data_name: str = ""
) -> pl.DataFrame:
    """Reduce memory usage by dtype casting.

    Signed integer columns are cast to the first of Int8/Int16/Int32/Int64
    whose range strictly contains the column's min and max. Float columns
    are cast to Float32 when min and max lie strictly inside the Float32
    range. Utf8 columns are cast to Categorical. Every other dtype
    (including unsigned integers) is left untouched.

    Args:
        df: raw DataFrame
        cols_to_skip: columns to skip dtype casting, None means no column
            is skipped
        data_name: name of the DataFrame, only used in the printed report

    Returns:
        df: DataFrame with reduced memory footprint
    """
    skipped = set(cols_to_skip) if cols_to_skip is not None else set()

    start_mem = df.estimated_size("mb")
    print(f"Memory usage of {data_name} DataFrame is {start_mem:.2f} MB.")

    # pl.Uint8, pl.UInt16, pl.UInt32, pl.UInt64 are not handled
    NUM_INT_TYPES = [pl.Int8, pl.Int16, pl.Int32, pl.Int64]
    NUM_FLOAT_TYPES = [pl.Float32, pl.Float64]
    # Candidate integer targets, narrowest first
    INT_LADDER = [
        (pl.Int8, np.int8),
        (pl.Int16, np.int16),
        (pl.Int32, np.int32),
        (pl.Int64, np.int64),
    ]
    for col in df.columns:
        if col in skipped:
            continue

        col_type = df[col].dtype
        if col_type == pl.Utf8:
            df = df.with_columns(df[col].cast(pl.Categorical))
            continue

        is_int = col_type in NUM_INT_TYPES
        is_float = col_type in NUM_FLOAT_TYPES
        if not (is_int or is_float):
            # The value range is only needed for numeric columns
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if c_min is None or c_max is None:
            # Empty or all-null column: there is no value range to compare
            # against, so keep the dtype instead of failing on `None > int`
            continue

        if is_int:
            for pl_type, np_type in INT_LADDER:
                bounds = np.iinfo(np_type)
                # Strict comparisons: a value equal to a type's limit moves
                # the column to the next wider type
                if c_min > bounds.min and c_max < bounds.max:
                    df = df.with_columns(df[col].cast(pl_type))
                    break
        else:
            bounds = np.finfo(np.float32)
            if c_min > bounds.min and c_max < bounds.max:
                df = df.with_columns(df[col].cast(pl.Float32))

    end_mem = df.estimated_size("mb")
    print(f"Memory usage became: {end_mem:.2f} MB.")
    # Guard the ratio against a zero-sized input frame
    reduced_pct = (start_mem - end_mem) / start_mem * 100 if start_mem else 0.0
    print(f"-> Total {reduced_pct}% reduced.")

    return df

In [9]:
def _shift_and_join(
    ldf: Union[pl.DataFrame, pl.LazyFrame],
    rdf: Union[pl.DataFrame, pl.LazyFrame],
    join_keys: List[str],
    dt_col: str,
    feats: List[str],
    shift_amt: str
) -> Tuple[Union[pl.DataFrame, pl.LazyFrame], List[str]]:
    """Perform DataFrame join with the shifted features.

    The `dt_col` of the right DataFrame is moved forward by `shift_amt`
    before a left join, so each left row receives the right-side values
    observed `shift_amt` earlier. Joined `feats` are then renamed to
    `{feat}_lag{shift_amt}`.

    Left DataFrame is usually the main DataFrame, but this utility can
    also be used to extend multiple lookbacks of a homogeneous feat.

    *Note: Every non-key column of `rdf` is carried into the result, not
    only `feats`. Columns whose names collide with `ldf` receive the join
    suffix (e.g., "data_block_id_right"), and dropping them is left to
    the caller.

    Args:
        ldf: left DataFrame, usually the main DataFrame
        rdf: right DataFrame
        join_keys: join keys
        dt_col: datetime column, either "datetime" or "date"
        feats: feature list
        shift_amt: shift amount (e.g., 1h, 2d)
            *Note: Only "h" and "d" are supported.

    Returns:
        df: joined DataFrame
        new_feats: new feature list

    Raises:
        ValueError: if the unit of `shift_amt` is neither "h" nor "d",
            or its amount isn't an integer
    """
    # Slicing (instead of indexing) also routes an empty string to the
    # ValueError below
    unit, amount = shift_amt[-1:], shift_amt[:-1]
    if unit == "h":
        # Hour resolution
        offset = pl.duration(hours=int(amount))
    elif unit == "d":
        # Day resolution
        offset = pl.duration(days=int(amount))
    else:
        # Fail early rather than hitting an unbound `shifted_dt` later
        raise ValueError(
            f"Unsupported shift amount {shift_amt!r}, only 'h' and 'd' units are supported."
        )

    # The expression keeps the name `dt_col`, so it overwrites that column
    shifted_dt = pl.col(dt_col) + offset
    if dt_col == "date":
        shifted_dt = shifted_dt.cast(pl.Date)

    feats_map = {feat: f"{feat}_lag{shift_amt}" for feat in feats}
    df = (
        ldf
        .join(rdf.with_columns(shifted_dt), on=join_keys, how="left")
        .rename(feats_map)
    )
    new_feats = list(feats_map.values())

    return df, new_feats

In [10]:
def _get_fillnul_exprs(dst_cols: List[str], src_cols: List[str]) -> List[pl.Expr]:
    """Return expressions of null filling logics, from source column to
    destination column.

    Columns are paired by position: the i-th destination column takes the
    value of the i-th source column wherever the destination is null, and
    keeps its own value otherwise. Each expression is aliased to its
    destination column, so it overwrites that column when applied.

    Args:
        dst_cols: destination column list
        src_cols: source column list, aligned with `dst_cols` by position

    Returns:
        fillnul_exprs: expressions of null filling logics

    Raises:
        ValueError: if `dst_cols` and `src_cols` differ in length
    """
    if len(dst_cols) != len(src_cols):
        # `zip` would silently truncate and leave some columns unfilled
        raise ValueError(
            f"dst_cols and src_cols must have the same length, "
            f"got {len(dst_cols)} and {len(src_cols)}."
        )

    fillnul_exprs = [
        pl.when(pl.col(dst_col).is_null())
        .then(pl.col(src_col))
        .otherwise(pl.col(dst_col))
        .alias(dst_col)
        for dst_col, src_col in zip(dst_cols, src_cols)
    ]

    return fillnul_exprs

## Load Data

In [4]:
# True: load the processed parquet; False: load the raw CSV tables
load_proc_df = True

In [5]:
# Raw branch reads all six CSV tables (with date parsing); the other branch
# reads only the processed training frame.
# NOTE(review): in the parquet branch `client`, `fwth`, `hwth`, `elec` and
# `gas` are not assigned here, so cells that use them need the raw branch.
if not load_proc_df:
    train = pl.read_csv(RAW_DATA_PATH / "train.csv", try_parse_dates=True)
    client = pl.read_csv(RAW_DATA_PATH / "client.csv", try_parse_dates=True)
    fwth = pl.read_csv(RAW_DATA_PATH / "forecast_weather.csv", try_parse_dates=True)
    hwth = pl.read_csv(RAW_DATA_PATH / "historical_weather.csv", try_parse_dates=True)
    elec = pl.read_csv(RAW_DATA_PATH / "electricity_prices.csv", try_parse_dates=True)
    gas = pl.read_csv(RAW_DATA_PATH / "gas_prices.csv", try_parse_dates=True)
else:
    train = pl.read_parquet(PROC_DATA_PATH / "data_eager_new.parquet")

## Data Preprocess

In [8]:
# Downcast every loaded table; `data_name` only labels the printed report
train = _reduce_memory_usage(train, data_name="train")
client = _reduce_memory_usage(client, data_name="client")
fwth = _reduce_memory_usage(fwth, data_name="fwth")
hwth = _reduce_memory_usage(hwth, data_name="hwth")
elec = _reduce_memory_usage(elec, data_name="elec")
gas = _reduce_memory_usage(gas, data_name="gas")

Memory usage of train DataFrame is 138.83 MB.
Memory usage became: 44.51 MB.
-> Total 67.93760831889082% reduced.
Memory usage of client DataFrame is 2.08 MB.
Memory usage became: 0.60 MB.
-> Total 71.15384615384616% reduced.
Memory usage of fwth DataFrame is 470.69 MB.
Memory usage became: 245.35 MB.
-> Total 47.875108412836084% reduced.
Memory usage of hwth DataFrame is 234.94 MB.
Memory usage became: 101.16 MB.
-> Total 56.94444444444444% reduced.
Memory usage of elec DataFrame is 0.47 MB.
Memory usage became: 0.32 MB.
-> Total 31.25% reduced.
Memory usage of gas DataFrame is 0.02 MB.
Memory usage became: 0.01 MB.
-> Total 43.75% reduced.


In [9]:
# Day-level "date" column, used as a join key for the client features
train = train.with_columns(pl.col("datetime").dt.date().alias("date"))

### *Client*
> Join **day-level** `installed_capacity` and `eic_count` from 2 days ago.
1. Get `installed_capacity` and `eic_count` from 2 days ago.
    * For modeling with **transformed targets**, the available features come from 2 days ago.

<div class="alert alert-block alert-danger">
    <p>For features, lagged <code>target_div_cap</code> can be generated by dividing the target by <strong>the actual cap and eic values</strong>, because they're already accessible. Alternatively, we can keep things simple and use the 2-day lagged ones, as we do now.</p>
</div>

In [11]:
feats = ["installed_capacity", "eic_count"]
# The join also brings in client's own data_block_id, which collides with
# train's and is therefore suffixed with "_right"; neither is kept
cols_to_drop = [DBI, "data_block_id_right"]

# Attach client features from 2 days earlier (day resolution)
train, cli_feats = _shift_and_join(train, client, join_keys=CLI_JOIN_KEYS, feats=feats, dt_col="date", shift_amt="2d")
train = train.drop(cols_to_drop)
train.head()

county,is_business,product_type,target,is_consumption,datetime,row_id,prediction_unit_id,date,eic_count_lag2d,installed_capacity_lag2d
i8,i8,i8,f32,i8,datetime[μs],i32,i8,date,i16,f32
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,2021-09-01,,
0,0,1,96.589996,1,2021-09-01 00:00:00,1,0,2021-09-01,,
0,0,2,0.0,0,2021-09-01 00:00:00,2,1,2021-09-01,,
0,0,2,17.313999,1,2021-09-01 00:00:00,3,1,2021-09-01,,
0,0,3,2.904,0,2021-09-01 00:00:00,4,2,2021-09-01,,


In [12]:
# Percentage of rows whose lagged client features are null
train.select(cli_feats).null_count() / len(train) * 100

installed_capacity_lag2d,eic_count_lag2d
f64,f64
0.428072,0.428072


### *FWTH*
> Join **non-shifted** local and global weather stats, because we have the corresponding **forecast timestamp**.
1. Some weather stations are scattered outside the counties of Estonia.
2. County 12 is always unknown.
3. There exist duplicated entries on `2021-10-31 03:00:00` and `2022-10-30 03:00:00`.
    * Two rows per loc-datetime.

<div class="alert alert-block alert-danger">
    <p>Filter the 24-hour forecast subset per day to prevent overlapping. If rolling features are taken into consideration, we can also use the earlier forecasts, but this is trickier. For now, we concatenate all 24-hour forecast subsets to form a gap-free sequence! Note that <code>origin_datetime</code> is at 02:00, so we take <code>hours_ahead</code> from 22 to 45 (inclusive) to align with the prediction horizons.</p>
    <p>We can derive global stats over all stations or over stations with a non-null county only. Also, we can consider lagged fwth features.</p>
</div>

In [13]:
def _get_wstn_loc2county() -> pl.DataFrame:
    """Load the weather station location-to-county lookup table.

    The CSV is read from the processed data directory, the column with
    an empty name is dropped, coordinate columns are renamed to their
    abbreviations, and the `CAST_COUNTY` and `CAST_COORDS` expressions
    are applied.

    Returns:
        lookup table with abbreviated coordinate columns
    """
    lookup_path = PROC_DATA_PATH / "wth_station_latlon2county.csv"
    cast_exprs = CAST_COUNTY + CAST_COORDS

    lookup = pl.read_csv(lookup_path)
    lookup = lookup.drop("")
    lookup = lookup.rename(COORD_COL2ABBR)

    return lookup.with_columns(cast_exprs)

In [14]:
wstn_loc2county = _get_wstn_loc2county()
# Sanity check: every (lat, lon) pair appears exactly once in the lookup
assert len(wstn_loc2county) == wstn_loc2county.n_unique(["lat", "lon"])
# The trailing number in the message is the count of distinct `county` values
print(f"There are {wstn_loc2county.n_unique(['lat', 'lon'])} unique weather stations"
      f" scattered in {wstn_loc2county['county'].n_unique()}.")

There are 75 unique weather stations scattered in 15.


In [15]:
fwth = (
    fwth
    # Time
    .rename({"forecast_datetime": "datetime"})
    # .filter(pl.col("hours_ahead") > 24)
    # Keep hours_ahead in [22, 45], i.e. a 24-value window per forecast
    .filter((pl.col("hours_ahead") >= 22) & (pl.col("hours_ahead") < 46))
    .drop(["origin_datetime", "hours_ahead", DBI])
    # Location
    .rename(COORD_COL2ABBR)
    .with_columns(CAST_COORDS)
    # Left join: locations missing from the lookup get a null county
    .join(wstn_loc2county, on=LOC_COLS, how="left")
)
fwth.head(1)

lat,lon,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,datetime,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation,county
f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,datetime[μs],f32,f32,f32,f32,i8
57.599998,21.700001,13.957666,7.513818,0.0,0.224258,0.412949,0.476897,1.517155,-11.050412,2021-09-02 00:00:00,0.0,0.0,0.0,1.3e-05,


In [16]:
# Add other fwth features (e.g., use coord to derive angle and interact with radiation)
cols_to_skip = ["lat", "lon", "county", "datetime"]
fwth_feats = [c for c in fwth.columns if c not in cols_to_skip]

In [17]:
# fwth_demo = fwth.to_pandas()
# fwth_demo_dup = fwth_demo[fwth_demo[LOC_COLS + ["datetime"]].duplicated()]
# fwth_demo_dup["datetime"].unique()

In [18]:
# Handle duplicated rows on daylight savings tweaking days
print(f"Temporary workaround on duplicated rows...")
print(f"Before taking unique, fwth shape {fwth.shape}")
fwth = fwth.unique(LOC_COLS + ["datetime"])
print(f"After taking unique, fwth shape {fwth.shape}")

Temporary workaround on duplicated rows...
Before taking unique, fwth shape (1712256, 16)
After taking unique, fwth shape (1712032, 16)


In [19]:
# Which county, which predicting horiz (we already take the non-overlapping 24-hour forecast subset,
# so no need to consider DBI due to no overlapping now)
# Local stats: mean of each forecast feature per FWTH_LGP_KEYS group
agg_stats = [
    *[pl.col(feat).mean().alias(f"{feat}_local_mean") for feat in fwth_feats],
    # ===
    # Add other stats
    # ===
]
fwth_stats_by_county = (
    fwth
    # Rows without a county can't contribute to any county-level stat
    .filter(pl.col("county").is_not_null())
    .group_by(FWTH_LGP_KEYS)
    .agg(agg_stats)
)
display(fwth_stats_by_county.head(1))

county,datetime,temperature_local_mean,dewpoint_local_mean,cloudcover_high_local_mean,cloudcover_low_local_mean,cloudcover_mid_local_mean,cloudcover_total_local_mean,10_metre_u_wind_component_local_mean,10_metre_v_wind_component_local_mean,direct_solar_radiation_local_mean,surface_solar_radiation_downwards_local_mean,snowfall_local_mean,total_precipitation_local_mean
i8,datetime[μs],f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
1,2021-09-02 02:00:00,13.115319,5.85,0.0,0.106145,0.244115,0.276258,0.91408,-9.736794,0.0,0.0,0.0,1.7e-05


In [20]:
# Global stats: mean of each forecast feature per FWTH_GGP_KEYS group,
# computed over all rows (no county filter is applied here)
agg_stats = [
    *[pl.col(feat).mean().alias(f"{feat}_global_mean") for feat in fwth_feats],
    # ===
    # Add other stats
    # ===
]
fwth_stats = (
    fwth
    # ===
    # Filter non-null county or not
    # ===
    .group_by(FWTH_GGP_KEYS)
    .agg(agg_stats)
)

In [23]:
# Fall back to the global mean wherever the local mean is null;
# both lists follow the order of `fwth_feats`, so they pair up by position
fillnul_exprs = _get_fillnul_exprs(
    [f"{feat}_local_mean" for feat in fwth_feats],
    [f"{feat}_global_mean" for feat in fwth_feats]
)
# [
#     pl.when(pl.col(f"{feat}_local_mean").is_null())
#         .then(pl.col(f"{feat}_global_mean"))
#         .otherwise(pl.col(f"{feat}_local_mean")).alias(f"{feat}_local_mean") for feat in fwth_feats        
# ]

# Non-shifted joins: local stats first, then global stats, then null filling
train = (
    train
    .join(fwth_stats_by_county, on=FWTH_LGP_KEYS, how="left")
    .join(fwth_stats, on=FWTH_GGP_KEYS, how="left")
    .with_columns(fillnul_exprs)
)
train.head()

county,is_business,product_type,target,is_consumption,datetime,row_id,prediction_unit_id,date,eic_count_lag2d,installed_capacity_lag2d,temperature_local_mean,dewpoint_local_mean,cloudcover_high_local_mean,cloudcover_low_local_mean,cloudcover_mid_local_mean,cloudcover_total_local_mean,10_metre_u_wind_component_local_mean,10_metre_v_wind_component_local_mean,direct_solar_radiation_local_mean,surface_solar_radiation_downwards_local_mean,snowfall_local_mean,total_precipitation_local_mean,temperature_global_mean,dewpoint_global_mean,cloudcover_high_global_mean,cloudcover_low_global_mean,cloudcover_mid_global_mean,cloudcover_total_global_mean,10_metre_u_wind_component_global_mean,10_metre_v_wind_component_global_mean,direct_solar_radiation_global_mean,surface_solar_radiation_downwards_global_mean,snowfall_global_mean,total_precipitation_global_mean
i8,i8,i8,f32,i8,datetime[μs],i32,i8,date,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,1,96.589996,1,2021-09-01 00:00:00,1,0,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,2,0.0,0,2021-09-01 00:00:00,2,1,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,2,17.313999,1,2021-09-01 00:00:00,3,1,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,3,2.904,0,2021-09-01 00:00:00,4,2,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,


In [24]:
# Per-county count of rows whose local temperature is still null after the fill
train.filter(pl.col("temperature_local_mean").is_null())["county"].value_counts()

county,counts
i8,u32
9,208
0,316
8,156
10,258
2,112
12,52
4,212
1,156
14,208
15,212


### *HWTH*
1. Two deviated weather stations should be dropped, one mapped to county 10 and one null.
2. No duplicated rows observed in hwth.
3. Some original columns that were downcast to integers become F64 once the corresponding stats are derived (the mean of an integer column generally isn't an integer, so the result is returned as F64).

<div class="alert alert-block alert-danger">
    <p>More lags of historical weather can be joined, e.g., the closest 10 hours.</p>
    <p>Also, DL handling is slightly different. Because we can use pointers X_s and X_e to access the available data chunks, we don't need to use <code>+ pl.duration(days=n)</code> to shift the features. DL can also consider the closest 10 hours.</p>
</div>

In [25]:
# Reload the raw historical weather table and downcast it
# (no `data_name` is passed, so the printed report shows an empty name)
hwth = pl.read_csv(RAW_DATA_PATH / "historical_weather.csv", try_parse_dates=True)
hwth = _reduce_memory_usage(hwth)

Memory usage of  DataFrame is 234.94 MB.
Memory usage became: 101.16 MB.
-> Total 56.94444444444444% reduced.


In [26]:
# Stations to exclude, written as the concatenated lat/lon string built below
stns_to_drop = ["57.624.2", "57.623.2"]

hwth = (
    hwth
    .rename(COORD_COL2ABBR)
    .with_columns(
        *CAST_COORDS,
        # Temporary string key "<lat><lon>" (no separator), used only for filtering
        pl.concat_str([pl.col("lat"), pl.col("lon")], separator="").alias("loc")
    )
    .filter(~pl.col("loc").is_in(stns_to_drop))
    # Left join: locations missing from the lookup get a null county
    .join(wstn_loc2county, on=LOC_COLS, how="left")
    .drop([DBI, "loc"])
)
hwth.head(1)
# Each (lat, lon, datetime) must appear exactly once
assert len(hwth) == hwth.n_unique(LOC_COLS + ["datetime"]) # Pass

In [27]:
# Add other hwth features (e.g., use coord to derive angle and interact with radiation)
cols_to_skip = ["datetime", "lat", "lon", "county"]
hwth_feats = [c for c in hwth.columns if c not in cols_to_skip]

In [28]:
# Which county, which predicting horiz
agg_stats = [
    *[pl.col(feat).mean().alias(f"{feat}_local_mean_hist") for feat in hwth_feats],
    # ===
    # Add other stats
    # ===
]
hwth_stats_by_county = (
    hwth
    .filter(pl.col("county").is_not_null())
    .group_by(HWTH_LGP_KEYS)
    .agg(agg_stats)
)
display(hwth_stats_by_county.head(1))

county,datetime,temperature_local_mean_hist,dewpoint_local_mean_hist,rain_local_mean_hist,snowfall_local_mean_hist,surface_pressure_local_mean_hist,cloudcover_total_local_mean_hist,cloudcover_low_local_mean_hist,cloudcover_mid_local_mean_hist,cloudcover_high_local_mean_hist,windspeed_10m_local_mean_hist,winddirection_10m_local_mean_hist,shortwave_radiation_local_mean_hist,direct_solar_radiation_local_mean_hist,diffuse_radiation_local_mean_hist
i8,datetime[μs],f32,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f32,f32
4,2021-09-01 01:00:00,11.875,9.875,0.0,0.0,1004.950012,62.5,64.75,3.5,8.25,3.194444,326.5,0.0,0.0,0.0


In [29]:
# Global stats: mean of each historical feature per HWTH_GGP_KEYS group,
# computed over all remaining rows (no county filter is applied here)
agg_stats = [
    *[pl.col(feat).mean().alias(f"{feat}_global_mean_hist") for feat in hwth_feats],
    # ===
    # Add other stats
    # ===
]
hwth_stats = (
    hwth
    # ===
    # Filter non-null county or not
    # ===
    .group_by(HWTH_GGP_KEYS)
    .agg(agg_stats)
)

hwth_stats.head()

datetime,temperature_global_mean_hist,dewpoint_global_mean_hist,rain_global_mean_hist,snowfall_global_mean_hist,surface_pressure_global_mean_hist,cloudcover_total_global_mean_hist,cloudcover_low_global_mean_hist,cloudcover_mid_global_mean_hist,cloudcover_high_global_mean_hist,windspeed_10m_global_mean_hist,winddirection_10m_global_mean_hist,shortwave_radiation_global_mean_hist,direct_solar_radiation_global_mean_hist,diffuse_radiation_global_mean_hist
datetime[μs],f32,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f32,f32
2023-01-04 14:00:00,-0.979091,-3.087273,0.002727,0.005727,1011.885498,88.345455,81.145455,5.872727,51.290909,2.608081,228.727273,51.136364,8.445455,42.69091
2022-06-13 22:00:00,13.760904,9.902726,0.0,0.0,1005.671021,34.136364,4.709091,5.072727,89.490909,2.835606,196.490909,2.236364,0.218182,2.018182
2021-10-30 02:00:00,8.086363,6.424544,0.0,0.0,1011.020935,47.8,42.927273,0.0,34.590909,5.175506,220.745455,0.0,0.0,0.0
2021-10-17 14:00:00,8.533635,1.422727,0.028182,0.0,1003.339966,28.454545,17.381818,20.990909,0.927273,6.004798,274.881818,260.418182,179.55455,80.86364
2022-08-31 00:00:00,12.851819,9.111819,0.045455,0.0,1012.987305,58.409091,49.054545,40.690909,2.536364,5.79899,88.436364,0.0,0.0,0.0


In [30]:
# Stat columns to be lagged, selected by their name suffix
hwth_l_stats = [c for c in hwth_stats_by_county.columns if c.endswith("_local_mean_hist")]
hwth_g_stats = [c for c in hwth_stats.columns if c.endswith("_global_mean_hist")]

# Attach local, then global, historical stats from 2 days earlier
train, hwth_l_feats = _shift_and_join(
    train, 
    hwth_stats_by_county, 
    join_keys=HWTH_LGP_KEYS, 
    dt_col="datetime",
    feats=hwth_l_stats,
    shift_amt="2d"
)
train, hwth_g_feats = _shift_and_join(
    train,
    hwth_stats,
    join_keys=HWTH_GGP_KEYS, 
    dt_col="datetime", 
    feats=hwth_g_stats,
    shift_amt="2d"
)

# Fall back to the lagged global stat wherever the lagged local stat is null;
# the two lists are paired by position
fillnul_exprs = _get_fillnul_exprs(hwth_l_feats, hwth_g_feats)
train = train.with_columns(fillnul_exprs)
train.head(2)

county,is_business,product_type,target,is_consumption,datetime,row_id,prediction_unit_id,date,eic_count_lag2d,installed_capacity_lag2d,temperature_local_mean,dewpoint_local_mean,cloudcover_high_local_mean,cloudcover_low_local_mean,cloudcover_mid_local_mean,cloudcover_total_local_mean,10_metre_u_wind_component_local_mean,10_metre_v_wind_component_local_mean,direct_solar_radiation_local_mean,surface_solar_radiation_downwards_local_mean,snowfall_local_mean,total_precipitation_local_mean,temperature_global_mean,dewpoint_global_mean,cloudcover_high_global_mean,cloudcover_low_global_mean,cloudcover_mid_global_mean,cloudcover_total_global_mean,10_metre_u_wind_component_global_mean,10_metre_v_wind_component_global_mean,direct_solar_radiation_global_mean,surface_solar_radiation_downwards_global_mean,snowfall_global_mean,total_precipitation_global_mean,temperature_local_mean_hist_lag2d,dewpoint_local_mean_hist_lag2d,rain_local_mean_hist_lag2d,snowfall_local_mean_hist_lag2d,surface_pressure_local_mean_hist_lag2d,cloudcover_total_local_mean_hist_lag2d,cloudcover_low_local_mean_hist_lag2d,cloudcover_mid_local_mean_hist_lag2d,cloudcover_high_local_mean_hist_lag2d,windspeed_10m_local_mean_hist_lag2d,winddirection_10m_local_mean_hist_lag2d,shortwave_radiation_local_mean_hist_lag2d,direct_solar_radiation_local_mean_hist_lag2d,diffuse_radiation_local_mean_hist_lag2d,temperature_global_mean_hist_lag2d,dewpoint_global_mean_hist_lag2d,rain_global_mean_hist_lag2d,snowfall_global_mean_hist_lag2d,surface_pressure_global_mean_hist_lag2d,cloudcover_total_global_mean_hist_lag2d,cloudcover_low_global_mean_hist_lag2d,cloudcover_mid_global_mean_hist_lag2d,cloudcover_high_global_mean_hist_lag2d,windspeed_10m_global_mean_hist_lag2d,winddirection_10m_global_mean_hist_lag2d,shortwave_radiation_global_mean_hist_lag2d,direct_solar_radiation_global_mean_hist_lag2d,diffuse_radiation_global_mean_hist_lag2d
i8,i8,i8,f32,i8,datetime[μs],i32,i8,date,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f32,f32
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,1,96.589996,1,2021-09-01 00:00:00,1,0,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [31]:
# Fraction (not multiplied by 100) of rows whose lagged hwth features are null
train.select(hwth_l_feats+hwth_g_feats).null_count() / len(train)

temperature_local_mean_hist_lag2d,dewpoint_local_mean_hist_lag2d,rain_local_mean_hist_lag2d,snowfall_local_mean_hist_lag2d,surface_pressure_local_mean_hist_lag2d,cloudcover_total_local_mean_hist_lag2d,cloudcover_low_local_mean_hist_lag2d,cloudcover_mid_local_mean_hist_lag2d,cloudcover_high_local_mean_hist_lag2d,windspeed_10m_local_mean_hist_lag2d,winddirection_10m_local_mean_hist_lag2d,shortwave_radiation_local_mean_hist_lag2d,direct_solar_radiation_local_mean_hist_lag2d,diffuse_radiation_local_mean_hist_lag2d,temperature_global_mean_hist_lag2d,dewpoint_global_mean_hist_lag2d,rain_global_mean_hist_lag2d,snowfall_global_mean_hist_lag2d,surface_pressure_global_mean_hist_lag2d,cloudcover_total_global_mean_hist_lag2d,cloudcover_low_global_mean_hist_lag2d,cloudcover_mid_global_mean_hist_lag2d,cloudcover_high_global_mean_hist_lag2d,windspeed_10m_global_mean_hist_lag2d,winddirection_10m_global_mean_hist_lag2d,shortwave_radiation_global_mean_hist_lag2d,direct_solar_radiation_global_mean_hist_lag2d,diffuse_radiation_global_mean_hist_lag2d
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901,0.002901


### *Elec*
<div class="alert alert-block alert-warning">
    Postponed...
</div>

In [35]:
# Peek at the electricity price table
elec.head()

forecast_date,euros_per_mwh,origin_date,data_block_id
datetime[μs],f32,datetime[μs],i16
2021-09-01 00:00:00,92.510002,2021-08-31 00:00:00,1
2021-09-01 01:00:00,88.900002,2021-08-31 01:00:00,1
2021-09-01 02:00:00,87.349998,2021-08-31 02:00:00,1
2021-09-01 03:00:00,86.879997,2021-08-31 03:00:00,1
2021-09-01 04:00:00,88.43,2021-08-31 04:00:00,1


### *Gas*
<div class="alert alert-block alert-warning">
    Postponed...
</div>

### *Revealed Targets*

In [38]:
?_shift_and_join

[0;31mSignature:[0m
[0m_shift_and_join[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mldf[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mpolars[0m[0;34m.[0m[0mdataframe[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m [0mpolars[0m[0;34m.[0m[0mlazyframe[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mLazyFrame[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrdf[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mpolars[0m[0;34m.[0m[0mdataframe[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m [0mpolars[0m[0;34m.[0m[0mlazyframe[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mLazyFrame[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mjoin_keys[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdt_col[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfeats[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m 

In [79]:
class RevealedFeatFE(object):
    """Revealed feature engineer.

    Builds lagged ("revealed") copies of `base_feats` by calling
    `_shift_and_join` once per entry of `shift_amts`. Accepts all
    features, including targets as features.

    Args:
        base_df: base DataFrame, selecting used columns beforehand is
            recommended (e.g., selecting `join_keys` + `base_feats`)
        join_keys: key columns forwarded to `_shift_and_join`
        base_feats: columns to look back on; they are dropped from the
            returned frame and only their shifted versions remain
        shift_amts: look-back amounts forwarded one at a time to
            `_shift_and_join` (e.g., `["2d", "3d"]`)
        dt_col: name of the datetime column
        cross_pc: if True, use prod/cons revealed targets as features
            for cons/prod
            *Note: Only used when `base_feats` are targets themselves;
            requires an `is_consumption` column in `base_df`
    """

    def __init__(
        self,
        base_df: pl.DataFrame,
        join_keys: List[str],
        base_feats: List[str],
        shift_amts: List[str],
        dt_col: str = "datetime",
        cross_pc: bool = False
    ) -> None:
        self.base_df = base_df
        self.join_keys = join_keys
        self.base_feats = base_feats
        self.shift_amts = shift_amts
        self.dt_col = dt_col
        self.cross_pc = cross_pc

        self._feats = []

    @property
    def feats(self) -> List[str]:
        """Names of the features generated by the latest `run` call."""
        return self._feats

    def run(self) -> pl.LazyFrame:
        """Run revealed feature engineering.

        Returns:
            revealed_df: LazyFrame containing revealed features
        """
        df_raw = self.base_df.lazy()
        # Keep only the keys on the left side; shifted features are joined back in
        df_lookback = df_raw.drop(self.base_feats)

        # Collect names in a fresh list so that calling `run` more than once
        # doesn't accumulate duplicated feature names
        feats: List[str] = []
        for shift_amt in self.shift_amts:
            # `_shift_and_join` is defined elsewhere; from its use here it returns
            # the joined frame and the names of the columns it added
            df_lookback, new_feats = _shift_and_join(
                df_lookback,
                df_raw,
                self.join_keys,
                self.dt_col,
                self.base_feats,
                shift_amt=shift_amt
            )
            feats.extend(new_feats)
        self._feats = feats

        if self.cross_pc:
            # Prod/Cons long to wide
            df_yp_lookback = df_lookback.filter(pl.col("is_consumption") == 0)
            df_yc_lookback = df_lookback.filter(pl.col("is_consumption") == 1)
            df_lookback = (
                df_yp_lookback
                # Join on the configured datetime column rather than a hard-coded name
                .join(df_yc_lookback, on=[self.dt_col, UNIT_ID_COL], suffix="_cons")
                .rename({feat: f"{feat}_prod" for feat in feats})
                .drop(["is_consumption", "is_consumption_cons"])
            )
            # Escape feature names so regex metacharacters in a column name
            # can't change what is matched
            lag_pattern = re.compile(
                "|".join(re.escape(f"{tgt_col}_lag") for tgt_col in self.base_feats)
            )
            # ===
            # Be careful for inference
            self._feats = [c for c in df_lookback.columns if lag_pattern.search(c)]
            # ===

        return df_lookback

<div class="alert alert-block alert-danger">
    <p>We can also use the target divided by <strong>the exact cap and eic</strong>, because they're available. The only difference is how we interpret the resulting features.</p>
</div>

In [39]:
# Add the target divided by the 2-day-lagged installed capacity and EIC count.
# Each pair is (new ratio column, denominator column).
train = train.with_columns([
    (pl.col("target") / pl.col(denom_col)).alias(ratio_name)
    for ratio_name, denom_col in [
        ("target_div_cap_lag2d", "installed_capacity_lag2d"),
        ("target_div_eic_lag2d", "eic_count_lag2d"),
    ]
])
train.head()

county,is_business,product_type,target,is_consumption,datetime,row_id,prediction_unit_id,date,eic_count_lag2d,installed_capacity_lag2d,temperature_local_mean,dewpoint_local_mean,cloudcover_high_local_mean,cloudcover_low_local_mean,cloudcover_mid_local_mean,cloudcover_total_local_mean,10_metre_u_wind_component_local_mean,10_metre_v_wind_component_local_mean,direct_solar_radiation_local_mean,surface_solar_radiation_downwards_local_mean,snowfall_local_mean,total_precipitation_local_mean,temperature_global_mean,dewpoint_global_mean,cloudcover_high_global_mean,cloudcover_low_global_mean,cloudcover_mid_global_mean,cloudcover_total_global_mean,10_metre_u_wind_component_global_mean,10_metre_v_wind_component_global_mean,direct_solar_radiation_global_mean,surface_solar_radiation_downwards_global_mean,snowfall_global_mean,total_precipitation_global_mean,temperature_local_mean_hist_lag2d,dewpoint_local_mean_hist_lag2d,rain_local_mean_hist_lag2d,snowfall_local_mean_hist_lag2d,surface_pressure_local_mean_hist_lag2d,cloudcover_total_local_mean_hist_lag2d,cloudcover_low_local_mean_hist_lag2d,cloudcover_mid_local_mean_hist_lag2d,cloudcover_high_local_mean_hist_lag2d,windspeed_10m_local_mean_hist_lag2d,winddirection_10m_local_mean_hist_lag2d,shortwave_radiation_local_mean_hist_lag2d,direct_solar_radiation_local_mean_hist_lag2d,diffuse_radiation_local_mean_hist_lag2d,temperature_global_mean_hist_lag2d,dewpoint_global_mean_hist_lag2d,rain_global_mean_hist_lag2d,snowfall_global_mean_hist_lag2d,surface_pressure_global_mean_hist_lag2d,cloudcover_total_global_mean_hist_lag2d,cloudcover_low_global_mean_hist_lag2d,cloudcover_mid_global_mean_hist_lag2d,cloudcover_high_global_mean_hist_lag2d,windspeed_10m_global_mean_hist_lag2d,winddirection_10m_global_mean_hist_lag2d,shortwave_radiation_global_mean_hist_lag2d,direct_solar_radiation_global_mean_hist_lag2d,diffuse_radiation_global_mean_hist_lag2d,target_div_cap_lag2d,target_div_eic_lag2d
i8,i8,i8,f32,i8,datetime[μs],i32,i8,date,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f32,f64,f32,f32,f32,f32,f32
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,1,96.589996,1,2021-09-01 00:00:00,1,0,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,2,0.0,0,2021-09-01 00:00:00,2,1,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,2,17.313999,1,2021-09-01 00:00:00,3,1,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,3,2.904,0,2021-09-01 00:00:00,4,2,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [80]:
?RevealedFeatFE

[0;31mInit signature:[0m
[0mRevealedFeatFE[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mbase_df[0m[0;34m:[0m [0mpolars[0m[0;34m.[0m[0mdataframe[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mjoin_keys[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbase_feats[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshift_amts[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdt_col[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'datetime'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcross_pc[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Revealed feature engineer.

Accept all features, including targets as features.

Arg

In [81]:
# Display the join keys used for revealed targets (imported from `metadata`).
REVEALED_TGT_JOIN_KEYS

['prediction_unit_id', 'datetime', 'is_consumption']

In [82]:
# Preview the key + feature columns handed to `RevealedFeatFE` as `base_df`.
# NOTE(review): `join_keys`/`base_feats` are assigned in the cell below, so this
# cell relies on an earlier execution of that cell.
train.select(join_keys + base_feats).head()

prediction_unit_id,datetime,is_consumption,target
i8,datetime[μs],i8,f32
0,2021-09-01 00:00:00,0,0.713
0,2021-09-01 00:00:00,1,96.589996
1,2021-09-01 00:00:00,0,0.0
1,2021-09-01 00:00:00,1,17.313999
2,2021-09-01 00:00:00,0,2.904


In [83]:
# Revealed-target features: lag the raw target and its capacity/EIC ratios
# by 2..7 days, with prod and cons lags placed side by side (`cross_pc`).
join_keys = REVEALED_TGT_JOIN_KEYS
base_feats = [
    "target",
    "target_div_cap_lag2d",
    "target_div_eic_lag2d",
]
revealed_tgt_fe = RevealedFeatFE(
    train.select([*join_keys, *base_feats]),
    join_keys,
    base_feats,
    [f"{d}d" for d in range(2, 8)],
    dt_col="datetime",
    cross_pc=True,
)
# The engineer returns a LazyFrame, so materialize it before joining
df_tmp = revealed_tgt_fe.run().collect()

# The wide frame no longer carries `is_consumption`, hence the two-key join
train = train.join(df_tmp, how="left", on=[UNIT_ID_COL, "datetime"])
train.head()

county,is_business,product_type,target,is_consumption,datetime,row_id,prediction_unit_id,date,eic_count_lag2d,installed_capacity_lag2d,temperature_local_mean,dewpoint_local_mean,cloudcover_high_local_mean,cloudcover_low_local_mean,cloudcover_mid_local_mean,cloudcover_total_local_mean,10_metre_u_wind_component_local_mean,10_metre_v_wind_component_local_mean,direct_solar_radiation_local_mean,surface_solar_radiation_downwards_local_mean,snowfall_local_mean,total_precipitation_local_mean,temperature_global_mean,dewpoint_global_mean,cloudcover_high_global_mean,cloudcover_low_global_mean,cloudcover_mid_global_mean,cloudcover_total_global_mean,10_metre_u_wind_component_global_mean,10_metre_v_wind_component_global_mean,direct_solar_radiation_global_mean,surface_solar_radiation_downwards_global_mean,snowfall_global_mean,total_precipitation_global_mean,temperature_local_mean_hist_lag2d,dewpoint_local_mean_hist_lag2d,…,target_div_eic_lag2d,target_lag2d_prod,target_div_cap_lag2d_lag2d_prod,target_div_eic_lag2d_lag2d_prod,target_lag3d_prod,target_div_cap_lag2d_lag3d_prod,target_div_eic_lag2d_lag3d_prod,target_lag4d_prod,target_div_cap_lag2d_lag4d_prod,target_div_eic_lag2d_lag4d_prod,target_lag5d_prod,target_div_cap_lag2d_lag5d_prod,target_div_eic_lag2d_lag5d_prod,target_lag6d_prod,target_div_cap_lag2d_lag6d_prod,target_div_eic_lag2d_lag6d_prod,target_lag7d_prod,target_div_cap_lag2d_lag7d_prod,target_div_eic_lag2d_lag7d_prod,target_lag2d_cons,target_div_cap_lag2d_lag2d_cons,target_div_eic_lag2d_lag2d_cons,target_lag3d_cons,target_div_cap_lag2d_lag3d_cons,target_div_eic_lag2d_lag3d_cons,target_lag4d_cons,target_div_cap_lag2d_lag4d_cons,target_div_eic_lag2d_lag4d_cons,target_lag5d_cons,target_div_cap_lag2d_lag5d_cons,target_div_eic_lag2d_lag5d_cons,target_lag6d_cons,target_div_cap_lag2d_lag6d_cons,target_div_eic_lag2d_lag6d_cons,target_lag7d_cons,target_div_cap_lag2d_lag7d_cons,target_div_eic_lag2d_lag7d_cons
i8,i8,i8,f32,i8,datetime[μs],i32,i8,date,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,1,96.589996,1,2021-09-01 00:00:00,1,0,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,2,0.0,0,2021-09-01 00:00:00,2,1,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,2,17.313999,1,2021-09-01 00:00:00,3,1,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0,0,3,2.904,0,2021-09-01 00:00:00,4,2,2021-09-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### *Time Identifiers*

In [25]:
# Make sure `train` has a calendar `date` column derived from `datetime`
if "date" not in train.columns:
    train = train.with_columns(pl.col("datetime").dt.date().alias("date"))
# Raw holiday table
holidays = pl.read_csv("./data/raw/open_src/holidays.csv")

In [26]:
# Time-identifier expressions derived from the `datetime` column.
# Each pair is (output column name, polars `.dt` accessor method).
tid_feats = [
    getattr(pl.col("datetime").dt, dt_method)().alias(feat_name)
    for feat_name, dt_method in [
        ("quarter", "quarter"),
        ("month", "month"),
        ("day", "day"),
        ("weekday", "weekday"),
        ("hour", "hour"),
        # ===
        # Can convert to other encoding (e.g., sin/cos)
        ("dayofyear", "ordinal_day"),
        # ===
    ]
]

In [27]:
# Build a dense daily calendar over the fixed range 2021-09-01 .. 2024-12-31
ds, de = datetime(2021, 9, 1), datetime(2024, 12, 31)
date_range = [ds + timedelta(days=d) for d in range((de - ds).days + 1)]
date_df = pl.DataFrame({"date": date_range}).with_columns(pl.col("date").cast(pl.Date))
# Keep only the columns needed and align the key dtype with the calendar
holidays = (
    holidays
    .select(["date", "holiday_type"])
    .with_columns(pl.col("date").cast(pl.Date))
)
holidays = (
    date_df
    .join(holidays, on="date", how="left")
    # Days absent from the holiday table get type 0
    .fill_null(0)
    # One row per date. The default `keep="any"` picks an arbitrary row when a
    # date has several holiday entries; keep the first one so the result is
    # reproducible between runs
    .unique(subset=["date"], keep="first", maintain_order=True)
    # Renamed so the key can be dropped after joining onto `train`
    .rename({"date": "holiday_date"})
)

In [28]:
# Add the time-identifier columns, then attach `holiday_type` by calendar day.
# `date` is renamed to match the holiday table's key and dropped afterwards.
train = (
    train
    .with_columns(tid_feats)
    .rename({"date": "holiday_date"})
    .join(holidays, how="left", on="holiday_date")
    .drop("holiday_date")
)

county,is_business,product_type,target,is_consumption,datetime,row_id,prediction_unit_id,eic_count_lag2d,installed_capacity_lag2d,temperature_local_mean,dewpoint_local_mean,cloudcover_high_local_mean,cloudcover_low_local_mean,cloudcover_mid_local_mean,cloudcover_total_local_mean,10_metre_u_wind_component_local_mean,10_metre_v_wind_component_local_mean,direct_solar_radiation_local_mean,surface_solar_radiation_downwards_local_mean,snowfall_local_mean,total_precipitation_local_mean,temperature_global_mean,dewpoint_global_mean,cloudcover_high_global_mean,cloudcover_low_global_mean,cloudcover_mid_global_mean,cloudcover_total_global_mean,10_metre_u_wind_component_global_mean,10_metre_v_wind_component_global_mean,direct_solar_radiation_global_mean,surface_solar_radiation_downwards_global_mean,snowfall_global_mean,total_precipitation_global_mean,temperature_local_mean_hist_lag2d,dewpoint_local_mean_hist_lag2d,rain_local_mean_hist_lag2d,…,target_lag4d_prod,target_div_cap_lag2d_lag4d_prod,target_div_eic_lag2d_lag4d_prod,target_lag5d_prod,target_div_cap_lag2d_lag5d_prod,target_div_eic_lag2d_lag5d_prod,target_lag6d_prod,target_div_cap_lag2d_lag6d_prod,target_div_eic_lag2d_lag6d_prod,target_lag7d_prod,target_div_cap_lag2d_lag7d_prod,target_div_eic_lag2d_lag7d_prod,target_lag2d_cons,target_div_cap_lag2d_lag2d_cons,target_div_eic_lag2d_lag2d_cons,target_lag3d_cons,target_div_cap_lag2d_lag3d_cons,target_div_eic_lag2d_lag3d_cons,target_lag4d_cons,target_div_cap_lag2d_lag4d_cons,target_div_eic_lag2d_lag4d_cons,target_lag5d_cons,target_div_cap_lag2d_lag5d_cons,target_div_eic_lag2d_lag5d_cons,target_lag6d_cons,target_div_cap_lag2d_lag6d_cons,target_div_eic_lag2d_lag6d_cons,target_lag7d_cons,target_div_cap_lag2d_lag7d_cons,target_div_eic_lag2d_lag7d_cons,quarter,month,day,weekday,hour,dayofyear,holiday_type
i8,i8,i8,f32,i8,datetime[μs],i32,i8,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u32,u32,u32,u32,u32,u32,i64
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,9,1,3,0,244,0
0,0,1,96.589996,1,2021-09-01 00:00:00,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,9,1,3,0,244,0
0,0,2,0.0,0,2021-09-01 00:00:00,2,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,9,1,3,0,244,0
0,0,2,17.313999,1,2021-09-01 00:00:00,3,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,9,1,3,0,244,0
0,0,3,2.904,0,2021-09-01 00:00:00,4,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,9,1,3,0,244,0


In [29]:
# Persist the feature-engineered training frame as parquet.
train.write_parquet("./data/processed/data_eager_new.parquet")

## Feature Version Control

In [None]:
# Column names of the time identifiers (rebinds `tid_feats` from expressions to names)
tid_feats = [
    "quarter",
    "month",
    "day",
    "weekday",
    "hour",
    "dayofyear",
    "holiday_type",
]

In [16]:
# Feature set "nv1": primary-key columns, raw revealed-target lags (prod and
# cons), local-mean forecast weather columns and time identifiers.
# The lag columns are named `target_lag{d}d_{prod,cons}` (shift amounts are
# "2d".."7d"), so the trailing "d" after the day count is required to match
# the columns actually present in `train`.
num_feats = (
    TGT_PK_COLS
    + [f"target_lag{d}d_prod" for d in range(2, 8)]
    + [f"target_lag{d}d_cons" for d in range(2, 8)]
    + [c for c in train.columns if c.endswith("_local_mean")]
    + tid_feats
)
# No features are treated as categorical in this version
cat_feats = []
feats = {"num": num_feats, "cat": cat_feats}

# Persist the feature lists to disk
with open("./data/processed/nv1.pkl", "wb") as f:
    pickle.dump(feats, f)