# Preparation

In [52]:
import datetime as dt

# Wall-clock time at which this notebook run started.
START_NOTEBOOK = dt.datetime.now()
# Last expression of the cell: echoes the start time as a string.
str(START_NOTEBOOK)

'2024-05-03 10:07:44.322476'

In [53]:
import os
import warnings
import itertools
import functools
import gc
import logging
import pickle
import pytz
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import VotingRegressor

import lightgbm as lgb
import catboost as cb

import optuna

from tqdm.auto import tqdm

In [54]:
from sklearn import set_config
# Configure scikit-learn globally with transform_output="pandas"
# (transformers then return pandas DataFrames instead of NumPy arrays).
set_config(transform_output="pandas")

In [55]:
# Allow up to 500 columns / 500 rows when pandas renders a frame in the notebook.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [56]:
# Notebook-wide logger writing "<time> - <name> - <level> - <message>" lines to stderr.
logger = logging.getLogger("enefit")
logger.setLevel(logging.INFO)

c_handler = logging.StreamHandler()
c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
c_handler.setFormatter(c_format)

# logging.getLogger returns the same object on every call, so re-running this
# cell used to stack one more handler each time and duplicate every message.
# Only attach the handler when the logger does not have one yet.
if not logger.handlers:
    logger.addHandler(c_handler)

In [57]:
# (input, output, holidays) directories, keyed by whether we run on Kaggle.
_DIRS_BY_ENV = {
    True: (
        '/kaggle/input/predict-energy-behavior-of-prosumers',
        '/kaggle/working',
        '/kaggle/input/estonian-holidays-2021-2024',
    ),
    False: (
        'input/predict-energy-behavior-of-prosumers',
        'working',
        'input/estonian-holidays-2021-2024',
    ),
}
# The presence of a /kaggle directory is used as the "running on Kaggle" signal.
INPUT_DIR, OUTPUT_DIR, HOLIDAYS_DIR = (
    Path(location) for location in _DIRS_BY_ENV[os.path.exists('/kaggle')]
)

# Run-mode switches read by later cells.
GPU, DEBUG, VALIDATE = True, False, False

# Read data

In [58]:
# Columns to load from each CSV (passed as `columns=` when reading the files).
data_cols = [
    'target', 'county', 'is_business', 'product_type',
    'is_consumption', 'datetime', 'row_id',
]
client_cols = [
    'product_type', 'county', 'eic_count',
    'installed_capacity', 'is_business', 'date',
]
gas_cols = ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh']
electricity_cols = ['forecast_date', 'euros_per_mwh']
forecast_cols = [
    'latitude',
    'longitude',
    'origin_datetime',
    'hours_ahead',
    'temperature',
    'dewpoint',
    'cloudcover_high',
    'cloudcover_low',
    'cloudcover_mid',
    'cloudcover_total',
    '10_metre_u_wind_component',
    '10_metre_v_wind_component',
    'forecast_datetime',
    'direct_solar_radiation',
    'surface_solar_radiation_downwards',
    'snowfall',
    'total_precipitation',
]
historical_cols = [
    'datetime',
    'temperature',
    'dewpoint',
    'rain',
    'snowfall',
    'surface_pressure',
    'cloudcover_total',
    'cloudcover_low',
    'cloudcover_mid',
    'cloudcover_high',
    'windspeed_10m',
    'winddirection_10m',
    'shortwave_radiation',
    'direct_solar_radiation',
    'diffuse_radiation',
    'latitude',
    'longitude',
]
location_cols = ['longitude', 'latitude', 'county']
# Same as data_cols without 'row_id'; used to select the lag-target frame.
target_cols = [
    'target', 'county', 'is_business', 'product_type',
    'is_consumption', 'datetime',
]
holidays_cols = ['date', 'holiday']

In [59]:
# All CSVs are loaded the same way: date parsing on, restricted to the listed columns.
_read_csv = functools.partial(pl.read_csv, try_parse_dates=True)

df_data        = _read_csv(INPUT_DIR / "train.csv", columns=data_cols)
df_client      = _read_csv(INPUT_DIR / "client.csv", columns=client_cols)
df_gas         = _read_csv(INPUT_DIR / "gas_prices.csv", columns=gas_cols)
df_electricity = _read_csv(INPUT_DIR / "electricity_prices.csv", columns=electricity_cols)
df_forecast    = _read_csv(INPUT_DIR / "forecast_weather.csv", columns=forecast_cols)
df_historical  = _read_csv(INPUT_DIR / "historical_weather.csv", columns=historical_cols)
df_location    = _read_csv(INPUT_DIR / "weather_station_to_county_mapping.csv", columns=location_cols)
df_holidays    = _read_csv(HOLIDAYS_DIR / 'holidays.csv', columns=holidays_cols)

# The lag-target source is the training table narrowed to the target columns.
df_target = df_data.select(target_cols)

# Remember the dtypes each frame was loaded with.
schema_data, schema_client = df_data.schema, df_client.schema
schema_gas, schema_electricity = df_gas.schema, df_electricity.schema
schema_forecast, schema_historical = df_forecast.schema, df_historical.schema
schema_target = df_target.schema

In [60]:
def generate_possible_units():
    """Enumerate every (county, is_business, product_type) combination.

    Counties run 0-15, ``is_business`` is 0/1 and ``product_type`` is 0-3, which
    gives 16 * 2 * 4 = 128 rows. ``unit_id`` is the row's position in that
    enumeration.

    Returns:
        A polars DataFrame with columns ``county``, ``is_business``,
        ``product_type`` and ``unit_id``.
    """
    key_columns = ["county", "is_business", "product_type"]
    combinations = list(itertools.product(range(15 + 1), (0, 1), (0, 1, 2, 3)))
    units = pd.DataFrame(combinations, columns=key_columns)
    units['unit_id'] = units.index
    return pl.from_pandas(units)

# Lookup table of all unit key combinations with their unit_id.
df_units = generate_possible_units()

# Feature engineering

- Client data 2 days ago.
- Forecast weather for the current datetime and for 7 and 14 days ago, averaged per datetime and county and per datetime only.
- Historical weather 2, 7 and 14 days ago, averaged per datetime and county and per datetime only.
- Holidays.
- Datetime features.
- Target 2-21 days ago for both consumption and production, so each row has both `target_cons_{lag}` and `target_prod_{lag}`, plus statistics over the last 7, 14 and 21 days (mean, min, max, std, mean / std) and ratios between selected lags.
- is_consumption, is_business, product_type etc.

In [61]:
def feature_eng(
    df_data,
    df_client,
    df_gas,
    df_electricity,
    df_forecast,
    df_historical,
    df_location,
    df_target,
    df_holidays,
    df_units,
):
    """Join every auxiliary table onto ``df_data`` and add calendar features.

    All inputs are polars DataFrames. Each auxiliary table has its time key
    shifted forward before the left join, so a row at time ``t`` receives
    values recorded at ``t - shift``.

    Args:
        df_data: Rows to build features for; needs ``datetime``, ``county``,
            ``is_business`` and ``product_type``.
        df_client: Client table; ``date`` is shifted by +2 days and joined on
            the unit keys plus ``date``.
        df_gas: Gas prices; ``forecast_date`` becomes ``date``, shifted +1 day.
        df_electricity: Electricity prices; ``forecast_date`` becomes
            ``datetime``, shifted +1 day.
        df_forecast: Weather forecasts. Only rows whose origin date + 1 day
            equals the forecast date are kept. Joined unshifted and shifted by
            7 and 14 days.
        df_historical: Observed weather, shifted +2 days; additionally joined
            with a further +5 and +12 days (7 and 14 days in total).
        df_location: (longitude, latitude) -> county mapping for weather rows.
        df_target: Past targets; split by ``is_consumption`` and joined with
            lags of 2..21 days as ``target_prod_{lag}`` / ``target_cons_{lag}``.
        df_holidays: Joined on ``date``.
        df_units: Joined on the unit keys (county, is_business, product_type).

    Returns:
        ``df_data`` with all joined columns plus ``hour``, ``day``, ``weekday``,
        ``month``, ``year`` and sin/cos encodings of day-of-year and hour.
        Float64 columns are cast to Float32 and the helper columns ``date``,
        ``datetime`` and ``dayofyear`` are dropped.
    """
    unit_keys = ["county", "is_business", "product_type"]
    target_lags = range(2, 21 + 1)

    df_data = df_data.with_columns(
        pl.col("datetime").cast(pl.Date).alias("date"),
    )

    df_client = df_client.with_columns(
        (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
    )

    df_gas = (
        df_gas
        .rename({"forecast_date": "date"})
        .with_columns(
            (pl.col("date") + pl.duration(days=1)).cast(pl.Date)
        )
    )

    df_electricity = (
        df_electricity
        .rename({"forecast_date": "datetime"})
        .with_columns(
            pl.col("datetime") + pl.duration(days=1)
        )
    )

    # Coordinates are cast to Float32 on both sides so the location joins match.
    df_location = df_location.with_columns(
        pl.col("latitude").cast(pl.datatypes.Float32),
        pl.col("longitude").cast(pl.datatypes.Float32),
    )

    df_forecast = (
        df_forecast
        .rename({"forecast_datetime": "datetime"})
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
            # pl.col('origin_datetime').dt.convert_time_zone("Europe/Bucharest").dt.replace_time_zone(None).cast(pl.Datetime("us")),
            # pl.col('datetime').dt.convert_time_zone("Europe/Bucharest").dt.replace_time_zone(None).cast(pl.Datetime("us")),
            (pl.col('surface_solar_radiation_downwards') - pl.col('direct_solar_radiation')).alias("diffuse_solar_radiation")
        )
        .filter((pl.col('origin_datetime') + pl.duration(days=1)).cast(pl.Date) == pl.col('datetime').cast(pl.Date)) # Avoid look-ahead bias
        .join(df_location, how="left", on=["longitude", "latitude"])
        .drop("longitude", "latitude", 'origin_datetime')
    )

    df_historical = (
        df_historical
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
            pl.col("datetime") + pl.duration(days=2),
            (pl.col('direct_solar_radiation') + pl.col('diffuse_radiation')).alias("surface_solar_radiation_downwards")
        )
        .join(df_location, how="left", on=["longitude", "latitude"])
        .drop("longitude", "latitude")
    )

    # Weather aggregated two ways: over all stations per datetime, and per county.
    df_forecast_date = (
        df_forecast
        .group_by("datetime").mean()
        .drop("county")
    )

    df_forecast_local = (
        df_forecast
        .filter(pl.col("county").is_not_null())
        .group_by("county", "datetime").mean()
    )

    df_historical_date = (
        df_historical
        .group_by("datetime").mean()
        .drop("county")
    )

    df_historical_local = (
        df_historical
        .filter(pl.col("county").is_not_null())
        .group_by("county", "datetime").mean()
    )

    df_target_consumption = df_target.filter(pl.col("is_consumption") == 1).drop('is_consumption')
    df_target_production = df_target.filter(pl.col("is_consumption") == 0).drop('is_consumption')

    # NOTE: the join suffixes below (including the irregular "_f2lw") end up in
    # the feature column names, so they are kept exactly as they were.
    df_data = (
        df_data
        .join(df_gas, on="date", how="left")
        .join(df_electricity, on="datetime", how="left")

        .join(df_client, on=["county", "is_business", "product_type", "date"], how="left")

        .join(df_forecast_date, on="datetime", how="left", suffix="_fd")
        .join(df_forecast_local, on=["county", "datetime"], how="left", suffix="_fl")
        .join(df_historical_date, on="datetime", how="left", suffix="_hd")
        .join(df_historical_local, on=["county", "datetime"], how="left", suffix="_hl")

        .join(df_forecast_date.with_columns(pl.col("datetime") + pl.duration(days=7)), on="datetime", how="left", suffix="_fdw")
        .join(df_forecast_local.with_columns(pl.col("datetime") + pl.duration(days=7)), on=["county", "datetime"], how="left", suffix="_flw")
        .join(df_historical_date.with_columns(pl.col("datetime") + pl.duration(days=5)), on="datetime", how="left", suffix="_hdw")
        .join(df_historical_local.with_columns(pl.col("datetime") + pl.duration(days=5)), on=["county", "datetime"], how="left", suffix="_hlw")

        .join(df_forecast_date.with_columns(pl.col("datetime") + pl.duration(days=14)), on="datetime", how="left", suffix="_fd2w")
        .join(df_forecast_local.with_columns(pl.col("datetime") + pl.duration(days=14)), on=["county", "datetime"], how="left", suffix="_f2lw")
        .join(df_historical_date.with_columns(pl.col("datetime") + pl.duration(days=12)), on="datetime", how="left", suffix="_hd2w")
        .join(df_historical_local.with_columns(pl.col("datetime") + pl.duration(days=12)), on=["county", "datetime"], how="left", suffix="_hl2w")
    )

    # Lagged targets: all production lags first, then all consumption lags,
    # which keeps the resulting column order unchanged.
    lag_sources = (
        ("target_prod", df_target_production),
        ("target_cons", df_target_consumption),
    )
    for prefix, df_lag_source in lag_sources:
        for lag in target_lags:
            df_data = df_data.join(
                df_lag_source
                .with_columns(pl.col("datetime") + pl.duration(days=lag))
                .rename({"target": f"{prefix}_{lag}"}),
                on=unit_keys + ["datetime"],
                how="left",
            )

    df_data = (
        df_data
        .join(df_holidays, on='date', how='left')

        .join(df_units, on=unit_keys, how='left')

        .with_columns(
            pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
            pl.col("datetime").dt.hour().alias("hour"),
            pl.col("datetime").dt.day().alias("day"),
            pl.col("datetime").dt.weekday().alias("weekday"),
            pl.col("datetime").dt.month().alias("month"),
            pl.col("datetime").dt.year().alias("year"),
        )

        # Cyclical encodings: pi * x / 183 has period 366, pi * x / 12 has period 24.
        .with_columns(
            (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
            (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
            (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
            (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
        )

        .with_columns(
            pl.col(pl.Float64).cast(pl.Float32),
        )

        .drop("date", "datetime", "dayofyear")
    )

    return df_data

In [62]:
# Columns that `to_categorical` converts to pandas Categoricals.
cat_cols = ["county", "is_business", "product_type", "is_consumption", 'holiday']

# Fixed category list per column, so category codes do not depend on which
# values happen to be present in a given frame.
col_to_cats = {
    'county': tuple(range(15+1)),
    'is_business': (0, 1),
    'product_type': tuple(range(3+1)),
    'is_consumption': (0, 1),
    # unique() does not guarantee an output order unless maintain_order=True;
    # without it the holiday category codes could change from run to run.
    # '-' is the placeholder `to_categorical` fills in for rows with no holiday.
    'holiday': df_holidays['holiday'].unique(maintain_order=True).to_list() + ['-'],
    # NOTE(review): 'unit_id' has categories here but is not in cat_cols, so
    # `to_categorical` leaves it untouched — confirm this is intended.
    'unit_id': df_units['unit_id'].to_list()
}

def to_categorical(X):
    """Convert the columns listed in ``cat_cols`` to fixed-category Categoricals.

    Missing ``holiday`` values are first replaced by ``'-'``. The frame is
    modified in place and also returned.

    Args:
        X: pandas DataFrame containing every column named in ``cat_cols``.

    Returns:
        The same DataFrame object, with the categorical columns converted.
    """
    X['holiday'] = X['holiday'].fillna('-')
    for name in cat_cols:
        allowed_values = col_to_cats[name]
        X[name] = pd.Categorical(X[name], categories=allowed_values)
    return X
        
def to_pandas(X, y=None):
    """Convert the engineered frame to pandas and add lagged-target summaries.

    For each window of 7, 14 and 21 days (lags 2..window) and for both
    consumption and production, the row-wise mean, std, min, max and mean / std
    of the lagged targets are appended. Ratios between selected lags follow.
    Column names and column order are identical to the previous hand-written
    version (the 21-day window appends ``mean_std`` last).

    Args:
        X: Object exposing ``to_pandas()`` whose result contains ``row_id`` and
            ``target_cons_{lag}`` / ``target_prod_{lag}`` for lags 2..21.
        y: Optional object exposing ``to_pandas()``; its columns are appended
            to ``X`` before the features are computed.

    Returns:
        pandas DataFrame indexed by ``row_id``.
    """
    if y is not None:
        df = pd.concat([X.to_pandas(), y.to_pandas()], axis=1)
    else:
        df = X.to_pandas()

    df = df.set_index("row_id")

    kinds = ("cons", "prod")

    # Order in which the statistics are appended for each window. It differs for
    # the 21-day window and is preserved so the feature column order is stable.
    default_order = ("mean", "std", "mean_std", "min", "max")
    stat_order_by_window = {
        7: default_order,
        14: default_order,
        21: ("mean", "std", "min", "max", "mean_std"),
    }

    for window, stat_order in stat_order_by_window.items():
        lag_cols = {
            kind: [f"target_{kind}_{lag}" for lag in range(2, window + 1)]
            for kind in kinds
        }
        for stat in stat_order:
            for kind in kinds:
                name = f"target_{kind}_{stat}_{window}"
                if stat == "mean_std":
                    # No epsilon here: a zero std yields inf or NaN.
                    df[name] = df[f"target_{kind}_mean_{window}"] / df[f"target_{kind}_std_{window}"]
                else:
                    # Row-wise DataFrame.mean / .std / .min / .max over the lag columns.
                    df[name] = getattr(df[lag_cols[kind]], stat)(axis=1)

    # (numerator lag, denominator lag); 1e-3 keeps the denominator away from zero.
    ratio_lags = ((7, 21), (7, 14), (2, 21), (2, 14), (2, 7), (2, 3))
    for num_lag, den_lag in ratio_lags:
        for kind in kinds:
            df[f"target_{kind}_ratio_{num_lag}_{den_lag}"] = (
                df[f"target_{kind}_{num_lag}"] / (df[f"target_{kind}_{den_lag}"] + 1e-3)
            )

    return df

In [63]:
%%time
# Separate the target column from the inputs before feature engineering.
X, y = df_data.drop("target"), df_data.select("target")

# Join all auxiliary tables and lagged targets onto the training rows.
X = feature_eng(X, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target, df_holidays, df_units)

# Convert to pandas (indexed by row_id) and append the lag-window statistics and ratios.
df_train = to_pandas(X, y)

# Normalise infinities (e.g. mean / std with a zero std) and None to NaN.
df_train = df_train.replace([np.inf, -np.inf, None], np.nan)
# Rebuild one timestamp per row from the calendar columns
# (feature_eng drops the original "datetime" column).
timestamps = pd.to_datetime(df_train[['year', 'month', 'day', 'hour']])

CPU times: user 1min 27s, sys: 17 s, total: 1min 44s
Wall time: 50 s


In [64]:
# Keep only rows that have a target and are dated 2021-09-15 or later.
# NOTE(review): the cut-off presumably skips the first weeks, where the lagged
# targets are not yet available — confirm.
df_train = df_train[
    df_train["target"].notnull() \
    & (timestamps >= '2021-09-15')
]

X = df_train.drop(columns=["target"])
# Fills missing holidays with '-' and casts the cat_cols to fixed-category Categoricals.
X = to_categorical(X)
y=df_train["target"]
# Last expression of the cell: echoes (n_rows, n_features).
X.shape

(1976832, 276)

# Time series split

In [65]:
class TimeSeriesKFold:
    """Time-ordered cross-validation splitter usable as a scikit-learn ``cv``.

    Row timestamps are built from the ``year``, ``month``, ``day`` and ``hour``
    columns of ``X`` and bucketed with ``resample_rule``. Each of the last
    ``n_splits`` buckets becomes one test fold. The matching training rows are
    those whose timestamp lies in the half-open interval
    ``[fold_start - train_max_months_depth months, fold_start - days_gap days)``,
    where ``fold_start`` is the earliest timestamp in the bucket.

    Args:
        n_splits: Number of trailing buckets used as test folds.
        resample_rule: pandas offset alias defining the buckets (e.g. ``'MS'``).
        train_max_months_depth: How many months before ``fold_start`` the
            training window reaches back.
        days_gap: Days left out between the end of training and ``fold_start``.
    """

    def __init__(self, n_splits=3, resample_rule='MS', train_max_months_depth=24, days_gap=1):
        self.n_splits = n_splits
        self.resample_rule = resample_rule
        self.train_max_months_depth = train_max_months_depth
        self.days_gap = days_gap

    def split(self, X, y=None, groups=None):
        """Yield ``(train_idx, test_idx)`` pairs of positional row indices.

        ``X`` is not copied or modified; only its four calendar columns are
        read. ``y`` and ``groups`` are accepted for scikit-learn compatibility
        and ignored. If there are fewer buckets than ``n_splits``, fewer folds
        are produced.
        """
        # Build only the timestamp series rather than copying the whole frame.
        # Resetting the index makes the yielded indices positional.
        timestamps = pd.to_datetime(X[["year", "month", "day", "hour"]]).reset_index(drop=True)

        # First and last timestamp actually present in every bucket.
        periods = (
            pd.Series(timestamps.to_numpy(), index=pd.DatetimeIndex(timestamps))
            .resample(self.resample_rule)
            .agg(['min', 'max'])
        )

        for _, period_limits in periods.iloc[-self.n_splits:].iterrows():
            start, end = period_limits['min'], period_limits['max']
            train_mask = timestamps.between(
                start - pd.DateOffset(months=self.train_max_months_depth),
                start - pd.DateOffset(days=self.days_gap),
                inclusive='left',
            )
            test_mask = timestamps.between(start, end, inclusive='both')

            idx_train = timestamps.index[train_mask.to_numpy()]
            idx_test = timestamps.index[test_mask.to_numpy()]

            yield idx_train, idx_test

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of folds."""
        return self.n_splits

In [66]:
# Timestamps aligned with the filtered X, used only to print the fold boundaries below.
timestamps = pd.to_datetime(X[['year', 'month', 'day', 'hour']])
# Alternative splitters kept for reference (disabled):
# cv = TimeSeriesKFold(5, resample_rule='2D', train_max_months_depth=14)
# cv = TimeSeriesKFold(1, resample_rule='AS', train_max_months_depth=14)
# Active splitter: a single fold made of the last 13-month resampling bucket.
cv = TimeSeriesKFold(1, resample_rule='13M', train_max_months_depth=14)

# Print the time range covered by each fold's train and test rows.
for k, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    print(f'\nFOLD {k+1}')
    print('Train', timestamps.iloc[train_idx].min(), timestamps.iloc[train_idx].max())
    print('Test', timestamps.iloc[test_idx].min(), timestamps.iloc[test_idx].max())


FOLD 1
Train 2021-09-15 00:00:00 2022-10-30 23:00:00
Test 2022-11-01 00:00:00 2023-05-31 23:00:00


# Hyper parameter optimization

In [67]:
# Number of Optuna trials per study.
n_trials = 150
# Seed passed as `random_state` to the models built in the objective functions.
SEED = 5

# LightGBM

In [68]:
def lgb_objective(trial, X, y, cv):
    """Optuna objective: cross-validated MAE of a LightGBM regressor.

    Note that some hyper-parameters are registered with Optuna under a
    different name than the keyword passed to the model (``lambda_l1``,
    ``lambda_l2``, ``min_data_in_leaf``), so ``study.best_params`` uses those
    Optuna names.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Feature frame.
        y: Target values aligned with ``X``.
        cv: Cross-validation splitter forwarded to ``cross_val_score``.

    Returns:
        Mean absolute error averaged over the folds (lower is better).
    """
    search_space = {
        'objective': trial.suggest_categorical('objective', ['l1', 'l2']),
        'n_estimators': trial.suggest_categorical('n_estimators', [800]),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 100),
        'reg_alpha': trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'reg_lambda': trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'min_child_samples': trial.suggest_int('min_data_in_leaf', 4, 256),
        'max_depth': trial.suggest_int('max_depth', 5, 11),
        "max_bin": trial.suggest_categorical("max_bin", [63, 127, 255]),

        # Fixed (not tuned) settings.
        'verbose': -1,
        'random_state': SEED,
    }
    # GPU training is currently disabled for LightGBM:
    # if GPU:
    #     search_space['device'] = 'gpu'
    #     search_space['gpu_platform_id'] = 0
    #     search_space['gpu_device_id'] = 0

    regressor = lgb.LGBMRegressor(**search_space)
    # The scorer returns negated MAE, so flip the sign back for a minimising study.
    fold_scores = cross_val_score(regressor, X, y, cv=cv, scoring='neg_mean_absolute_error')

    return -np.mean(fold_scores)

In [69]:
# Disabled: the LightGBM consumption study is wrapped in a string literal, so it
# is not executed; the cell only echoes the string.
"""study_lgb_cons = optuna.create_study(direction='minimize', study_name='LightGBMConsumption')
study_lgb_cons.optimize(functools.partial(lgb_objective, X=X[X['is_consumption']==1], y=y[X['is_consumption']==1], cv=cv), n_trials=n_trials, show_progress_bar=True)
print(study_lgb_cons.best_params)"""

"study_lgb_cons = optuna.create_study(direction='minimize', study_name='LightGBMConsumption')\nstudy_lgb_cons.optimize(functools.partial(lgb_objective, X=X[X['is_consumption']==1], y=y[X['is_consumption']==1], cv=cv), n_trials=n_trials, show_progress_bar=True)\nprint(study_lgb_cons.best_params)"

In [70]:
# Disabled: the LightGBM production study is wrapped in a string literal, so it
# is not executed; the cell only echoes the string.
"""study_lgb_prod = optuna.create_study(direction='minimize', study_name='LightGBMProduction')
study_lgb_prod.optimize(functools.partial(lgb_objective, X=X[X['is_consumption']==0], y=y[X['is_consumption']==0], cv=cv), n_trials=n_trials, show_progress_bar=True)
print(study_lgb_prod.best_params)"""

"study_lgb_prod = optuna.create_study(direction='minimize', study_name='LightGBMProduction')\nstudy_lgb_prod.optimize(functools.partial(lgb_objective, X=X[X['is_consumption']==0], y=y[X['is_consumption']==0], cv=cv), n_trials=n_trials, show_progress_bar=True)\nprint(study_lgb_prod.best_params)"

# CatBoost

In [71]:
def cb_objective(trial, X, y, cv):
    """Optuna objective: cross-validated MAE of a CatBoost regressor.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame forwarded to ``cross_val_score``.
        y: Target values aligned with ``X``.
        cv: Cross-validation spec forwarded to ``cross_val_score``.

    Returns:
        Mean absolute error averaged over the CV folds (lower is better).
    """
    # Poisson bootstrap is implemented only by CatBoost's GPU backend, so it is
    # offered to the sampler only when training on GPU; a CPU trial that drew it
    # could not be fitted.
    bootstrap_types = ['Bernoulli', 'Poisson'] if GPU else ['Bernoulli']

    params = {
        'boosting_type': 'Plain',

        # Trees are fitted with an RMSE loss even though trials are ranked by MAE.
        'objective': 'RMSE',
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', bootstrap_types),
        # Single-choice categorical: the tree count is fixed during the search.
        'n_estimators': trial.suggest_categorical('n_estimators', [800]),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        # "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "l2_leaf_reg": trial.suggest_float('l2_leaf_reg', 0.5, 1.0, log=True),
        "max_bin": trial.suggest_int("max_bin", 64, 1024),

        'random_state' : SEED
    }
    if GPU:
        params['task_type'] = 'GPU'

    model = cb.CatBoostRegressor(**params, silent=True, cat_features=cat_cols)
    # The scorer returns negated MAE (higher is better); the sign is flipped so
    # the value can be minimised.
    scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_error')
    return -1 * np.mean(scores)

In [72]:
"""study_cb_cons = optuna.create_study(direction='minimize', study_name='CatBoostConsumption')
study_cb_cons.optimize(functools.partial(cb_objective, X=X[X['is_consumption']==1], y=y[X['is_consumption']==1], cv=cv), n_trials=n_trials, show_progress_bar=True)
print(study_cb_cons.best_params)"""

"study_cb_cons = optuna.create_study(direction='minimize', study_name='CatBoostConsumption')\nstudy_cb_cons.optimize(functools.partial(cb_objective, X=X[X['is_consumption']==1], y=y[X['is_consumption']==1], cv=cv), n_trials=n_trials, show_progress_bar=True)\nprint(study_cb_cons.best_params)"

In [73]:
"""study_cb_prod = optuna.create_study(direction='minimize', study_name='CatBoostProduction')
study_cb_prod.optimize(functools.partial(cb_objective, X=X[X['is_consumption']==0], y=y[X['is_consumption']==0], cv=cv), n_trials=n_trials, show_progress_bar=True)
print(study_cb_prod.best_params)"""

"study_cb_prod = optuna.create_study(direction='minimize', study_name='CatBoostProduction')\nstudy_cb_prod.optimize(functools.partial(cb_objective, X=X[X['is_consumption']==0], y=y[X['is_consumption']==0], cv=cv), n_trials=n_trials, show_progress_bar=True)\nprint(study_cb_prod.best_params)"

# Models params

In [74]:
# Hyperparameters handed to get_model() by the Model* classes.
# Layout: target ('consumption' / 'production') -> model family ('lgb' / 'cb')
# -> list of keyword-argument dicts, one dict per ensemble member.
# NOTE(review): these look like tuned values, but some fall outside the Optuna
# search ranges visible in this notebook (e.g. subsample_freq 403 vs. 1..100,
# CatBoost n_estimators 2500 vs. the fixed 800) — confirm their provenance.
PARAMS = {
    'consumption': {
        'lgb': [
            {
                'objective': 'tweedie', 
                'n_estimators': 2180, 
                'learning_rate': 0.05703984137040211, 
                'colsample_bytree': 0.3012440013056294, 
                'subsample': 0.8983901318884009, 
                'subsample_freq': 403, 
                'lambda_l1': 5.758927193756652, 
                'lambda_l2': 0.6834387065270533, 
                'min_data_in_leaf': 247, 
                'max_depth': 12, 
                'max_bin': 255
            }, 
        ], 
        # The 'cb' lists are only read by the CatBoost branch of get_model().
        'cb': [
            {
                'boosting_type': 'Plain',
                'bootstrap_type': 'Bernoulli', 
                'n_estimators': 2500, 
                'learning_rate': 0.056270110753975935, 
                'depth': 9, 
                'subsample': 0.9714729455236966, 
                'min_data_in_leaf': 43, 
                'l2_leaf_reg': 0.6088529683065117, 
                'max_bin': 396
            }, 
        ]
    },
    'production': {
        'lgb': [
            {
                'objective': 'tweedie', 
                'n_estimators': 3998, 
                'learning_rate': 0.014965163788474123, 
                'colsample_bytree': 0.5135868922095641, 
                'subsample': 0.35433323304753966, 
                'subsample_freq': 32, 
                'lambda_l1': 7.345717566670195, 
                'lambda_l2': 1.2369626798060307, 
                'min_data_in_leaf': 190, 
                'max_depth': 11, 
                'max_bin': 255
            }, 
        ],
        'cb': [
            {
                'boosting_type': 'Plain',
                'bootstrap_type': 'Poisson', 
                'n_estimators': 2500, 
                'learning_rate': 0.059813649721763486, 
                'depth': 8, 
                'subsample': 0.5828519349744947, 
                'min_data_in_leaf': 15, 
                'l2_leaf_reg': 0.9988509260613393, 
                'max_bin': 984
            }, 
        ]
    }
}

In [75]:
# Hyperparameters used by ModelDiff, which fits on the target minus a lagged
# target. Same layout as PARAMS (target -> model family -> list of kwargs dicts)
# and the same values; the only difference is the LightGBM objective, 'l1'
# instead of 'tweedie'.
# NOTE(review): presumably 'l1' is used because differenced targets can be
# negative — confirm. Keep the remaining values in sync with PARAMS by hand.
PARAMS_DIFF = {
    'consumption': {
        'lgb': [
            {
                'objective': 'l1', 
                'n_estimators': 2180, 
                'learning_rate': 0.05703984137040211, 
                'colsample_bytree': 0.3012440013056294, 
                'subsample': 0.8983901318884009, 
                'subsample_freq': 403, 
                'lambda_l1': 5.758927193756652, 
                'lambda_l2': 0.6834387065270533, 
                'min_data_in_leaf': 247, 
                'max_depth': 12, 
                'max_bin': 255
            }, 
        ], 
        'cb': [
            {
                'boosting_type': 'Plain',
                'bootstrap_type': 'Bernoulli', 
                'n_estimators': 2500, 
                'learning_rate': 0.056270110753975935, 
                'depth': 9, 
                'subsample': 0.9714729455236966, 
                'min_data_in_leaf': 43, 
                'l2_leaf_reg': 0.6088529683065117, 
                'max_bin': 396
            }, 
        ]
    },
    'production': {
        'lgb': [
            {
                'objective': 'l1', 
                'n_estimators': 3998, 
                'learning_rate': 0.014965163788474123, 
                'colsample_bytree': 0.5135868922095641, 
                'subsample': 0.35433323304753966, 
                'subsample_freq': 32, 
                'lambda_l1': 7.345717566670195, 
                'lambda_l2': 1.2369626798060307, 
                'min_data_in_leaf': 190, 
                'max_depth': 11, 
                'max_bin': 255
            }, 
        ],
        'cb': [
            {
                'boosting_type': 'Plain',
                'bootstrap_type': 'Poisson', 
                'n_estimators': 2500, 
                'learning_rate': 0.059813649721763486, 
                'depth': 8, 
                'subsample': 0.5828519349744947, 
                'min_data_in_leaf': 15, 
                'l2_leaf_reg': 0.9988509260613393, 
                'max_bin': 984
            }, 
        ]
    }
}

In [76]:
def get_model(params, cat_cols=cat_cols, verbose=False):
    """Build an unfitted VotingRegressor from a per-family hyperparameter dict.

    Args:
        params: Mapping of model family name to a list of keyword-argument
            dicts, e.g. ``PARAMS['consumption']``. Only the ``'lgb'`` family is
            built; other keys are ignored while the CatBoost branch is disabled.
        cat_cols: Categorical column names. Currently unused: only the disabled
            CatBoost branch consumes it.
        verbose: Forwarded to ``VotingRegressor``.

    Returns:
        ``VotingRegressor`` holding one ``lgb{i}`` LightGBM regressor per dict
        in ``params['lgb']`` (numbered from 1).
    """
    # GPU settings are merged into a copy of each dict, so the caller's
    # (module-level) parameter dicts are never mutated and do not keep
    # GPU keys after GPU is switched off.
    lgb_gpu_overrides = {
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
    } if GPU else {}

    estimators = []
    for model_name in params:
        if model_name == 'lgb':
            for i, p in enumerate(params[model_name]):
                lgb_params = {**p, **lgb_gpu_overrides}
                estimators.append((f'lgb{i+1}', lgb.LGBMRegressor(**lgb_params, random_state=SEED, verbose=-1)))
        # Disabled CatBoost branch, kept for reference:
        # elif model_name == 'cb':
        #     for i, p in enumerate(params[model_name]):
        #         cb_params = {**p, 'task_type': "GPU", 'devices': '0:1'} if GPU else dict(p)
        #         estimators.append((f'cb{i+1}', cb.CatBoostRegressor(**cb_params, silent=True, cat_features=cat_cols, random_state=SEED)))
    return VotingRegressor(estimators, verbose=verbose)

# Feature selection

Simplified forward feature selection. 

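Features are tried one at a time: if adding a feature improves the cross-validated MAE, it is kept; otherwise it is skipped. Passes over the feature list are repeated until a full pass adds no new feature, i.e. until the MAE stops decreasing.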

In [77]:
def select_columns(is_consumption = True):
    """Greedy forward feature selection for one target type.

    Starting from the seed calendar columns, every remaining column of the
    global ``X`` is tried in turn; a column is kept when adding it lowers the
    cross-validated MAE below the best value seen so far. Passes over the
    columns repeat until a full pass adds nothing.

    Args:
        is_consumption: Select features for the consumption rows when True,
            for the production rows when False.

    Returns:
        ``pd.Series`` indexed by selected column name. Each value is the CV MAE
        measured when the column was accepted; the seed columns carry the
        placeholder value 1e5.
    """
    # Seed features are always part of the model; 1e5 is a placeholder loss that
    # the first real CV score is expected to beat.
    selected_columns = pd.Series({"year": 1e5, "month": 1e5, "day":1e5, 'hour':1e5})

    # Loop-invariant inputs: hyperparameters and the rows of the chosen target.
    model_params = PARAMS['consumption'] if is_consumption else PARAMS['production']
    target_rows = X['is_consumption'] == int(is_consumption)
    X_target = X[target_rows]
    y_target = y[target_rows]

    while True:
        n_selected = len(selected_columns)
        for col in X.columns:
            if col in selected_columns.index:
                continue
            cols = selected_columns.index.tolist() + [col]

            res = cross_validate(
                estimator=get_model(
                    model_params,
                    verbose=False, cat_cols = [c for c in cat_cols if c in cols]),
                X=X_target[cols],
                y=y_target,
                scoring="neg_mean_absolute_error",
                cv= cv,
            )
            current_loss = -res['test_score'].mean()

            if current_loss < selected_columns.min():
                selected_columns[col] = current_loss
                # This "<n>, <column>, <loss>" format is parsed back into the
                # feature lists further down; keep it unchanged.
                logger.info(f"{len(selected_columns)}, {col}, {current_loss}")
        # A full pass without any accepted column ends the search.
        if n_selected == len(selected_columns):
            break
    return selected_columns

In [78]:
"""consumption_cols_stats = select_columns(is_consumption=True)"""

'consumption_cols_stats = select_columns(is_consumption=True)'

In [79]:
"""production_cols_stats = select_columns(is_consumption=False)"""

'production_cols_stats = select_columns(is_consumption=False)'

# Consumption columns

In [80]:
consumption_features_output = """
2023-12-16 10:30:26,359 - enefit - INFO - 5, county, 500.1379218338758
2023-12-16 10:30:59,400 - enefit - INFO - 6, is_business, 476.6330678278262
2023-12-16 10:31:35,695 - enefit - INFO - 7, product_type, 211.95768526946662
2023-12-16 10:32:11,892 - enefit - INFO - 8, is_consumption, 211.9317140791662
2023-12-16 10:34:42,904 - enefit - INFO - 9, eic_count, 209.23428829252236
2023-12-16 10:35:22,411 - enefit - INFO - 10, installed_capacity, 187.71582801887513
2023-12-16 10:36:01,856 - enefit - INFO - 11, hours_ahead, 186.71683660395962
2023-12-16 10:36:42,196 - enefit - INFO - 12, temperature, 172.15629625683817
2023-12-16 10:37:22,925 - enefit - INFO - 13, dewpoint, 165.74197624187005
2023-12-16 10:38:43,649 - enefit - INFO - 14, cloudcover_low, 164.06772153505673
2023-12-16 10:39:24,421 - enefit - INFO - 15, cloudcover_mid, 163.72814821078867
2023-12-16 10:42:14,635 - enefit - INFO - 16, direct_solar_radiation, 163.0856016518288
2023-12-16 10:42:56,875 - enefit - INFO - 17, surface_solar_radiation_downwards, 159.80825347723413
2023-12-16 10:45:04,227 - enefit - INFO - 18, diffuse_solar_radiation, 159.27746082538818
2023-12-16 10:58:28,988 - enefit - INFO - 19, surface_pressure, 158.77838832920014
2023-12-16 10:59:56,626 - enefit - INFO - 20, cloudcover_low_hd, 158.51160911698688
2023-12-16 11:00:40,428 - enefit - INFO - 21, cloudcover_mid_hd, 158.00528452458673
2023-12-16 12:07:25,578 - enefit - INFO - 22, direct_solar_radiation_fd2w, 157.73743517433488
2023-12-16 12:08:10,192 - enefit - INFO - 23, surface_solar_radiation_downwards_fd2w, 157.6962410678005
2023-12-16 12:11:10,691 - enefit - INFO - 24, hours_ahead_f2lw, 156.9872137763231
2023-12-16 12:11:55,758 - enefit - INFO - 25, temperature_f2lw, 156.3101470361462
2023-12-16 12:53:22,837 - enefit - INFO - 26, target_prod_14, 156.0687973163967
2023-12-16 12:59:28,138 - enefit - INFO - 27, target_cons_2, 128.73932370031085
2023-12-16 13:00:13,726 - enefit - INFO - 28, target_cons_3, 122.5232398476195
2023-12-16 13:00:59,618 - enefit - INFO - 29, target_cons_4, 106.49748966680414
2023-12-16 13:01:45,580 - enefit - INFO - 30, target_cons_5, 99.46544470275539
2023-12-16 13:02:33,963 - enefit - INFO - 31, target_cons_6, 92.9786598141034
2023-12-16 13:03:21,684 - enefit - INFO - 32, target_cons_7, 74.73607825329586
2023-12-16 13:04:56,727 - enefit - INFO - 33, target_cons_9, 73.45557439924212
2023-12-16 13:05:44,595 - enefit - INFO - 34, target_cons_10, 73.31700683763857
2023-12-16 13:06:34,265 - enefit - INFO - 35, target_cons_11, 72.67885989682085
2023-12-16 13:08:10,856 - enefit - INFO - 36, target_cons_13, 71.56461238671281
2023-12-16 13:08:59,996 - enefit - INFO - 37, target_cons_14, 66.30044259045607
2023-12-16 13:09:49,667 - enefit - INFO - 38, target_cons_15, 65.95116547212169
2023-12-16 13:10:38,885 - enefit - INFO - 39, target_cons_16, 65.83768152877161
2023-12-16 13:11:29,090 - enefit - INFO - 40, target_cons_17, 65.73795058246121
2023-12-16 13:13:11,497 - enefit - INFO - 41, target_cons_19, 65.22004930957402
2023-12-16 13:15:51,236 - enefit - INFO - 42, holiday, 64.44480003002035
2023-12-16 13:16:48,243 - enefit - INFO - 43, weekday, 61.45608745712063
2023-12-16 13:17:46,374 - enefit - INFO - 44, sin(dayofyear), 61.11365603957031
2023-12-16 13:18:43,871 - enefit - INFO - 45, cos(dayofyear), 60.62251938088358
2023-12-16 13:19:41,725 - enefit - INFO - 46, sin(hour), 60.54735091390976
2023-12-16 13:20:39,604 - enefit - INFO - 47, cos(hour), 60.459193076967225
2023-12-16 13:22:35,714 - enefit - INFO - 48, target_prod_mean_7, 60.34914133863077
2023-12-16 13:23:33,600 - enefit - INFO - 49, target_cons_std_7, 60.334592022878724
2023-12-16 13:24:31,222 - enefit - INFO - 50, target_prod_std_7, 60.142034898135215
2023-12-16 13:25:29,305 - enefit - INFO - 51, target_cons_mean_std_7, 59.90132776872127
2023-12-16 14:15:27,873 - enefit - INFO - 52, cloudcover_low_fl, 59.62410294872592
2023-12-16 14:20:21,685 - enefit - INFO - 53, direct_solar_radiation_fl, 59.21665663820572
2023-12-16 14:21:20,478 - enefit - INFO - 54, surface_solar_radiation_downwards_fl, 59.20630111316062
2023-12-16 14:23:19,519 - enefit - INFO - 55, total_precipitation_fl, 59.023485151611155
2023-12-16 14:24:19,009 - enefit - INFO - 56, diffuse_solar_radiation_fl, 58.62762080834745
2023-12-16 14:53:36,739 - enefit - INFO - 57, temperature_fdw, 58.330358536905315
2023-12-16 14:54:38,550 - enefit - INFO - 58, dewpoint_fdw, 58.28197080310644
2023-12-16 14:55:39,158 - enefit - INFO - 59, cloudcover_high_fdw, 57.978737138431306
2023-12-16 15:26:53,062 - enefit - INFO - 60, cloudcover_total_hdw, 57.932946194610366
2023-12-16 15:27:54,983 - enefit - INFO - 61, cloudcover_low_hdw, 57.8180649100694
2023-12-16 15:44:28,629 - enefit - INFO - 62, cloudcover_mid_hlw, 57.80570583920305
2023-12-16 15:49:41,112 - enefit - INFO - 63, direct_solar_radiation_hlw, 57.57314766336697
2023-12-16 16:54:13,273 - enefit - INFO - 64, target_prod_7, 57.479506745624775
2023-12-16 16:59:26,122 - enefit - INFO - 65, target_prod_12, 57.339574614822254
2023-12-16 17:04:39,930 - enefit - INFO - 66, target_prod_18, 57.312292460089466
2023-12-16 17:09:58,541 - enefit - INFO - 67, target_cons_12, 57.234556267672545
2023-12-16 17:15:21,701 - enefit - INFO - 68, target_prod_mean_std_7, 57.16981436547856
2023-12-16 20:59:55,912 - enefit - INFO - 69, target_cons_mean_7, 57.032884137656445
2023-12-16 21:03:11,979 - enefit - INFO - 70, target_cons_max_7, 56.81722758298316
2023-12-16 21:09:46,860 - enefit - INFO - 71, target_cons_mean_std_14, 56.772616016267996
2023-12-16 21:15:15,828 - enefit - INFO - 72, target_prod_max_14, 56.689172315478295
2023-12-16 21:36:07,506 - enefit - INFO - 73, target_cons_ratio_2_7, 56.64842982550197
2023-12-16 21:37:13,578 - enefit - INFO - 74, target_prod_ratio_2_7, 56.64180710519355
2023-12-16 21:38:20,164 - enefit - INFO - 75, target_cons_ratio_2_3, 56.52014436159403
2023-12-16 21:48:19,998 - enefit - INFO - 76, snowfall, 56.46282207881825
2023-12-16 21:50:34,477 - enefit - INFO - 77, hours_ahead_fl, 56.36946298238
2023-12-16 21:51:41,509 - enefit - INFO - 78, temperature_fl, 56.315556658878585
2023-12-16 22:06:22,767 - enefit - INFO - 79, cloudcover_high_hd, 56.2749412285748
2023-12-16 22:15:29,381 - enefit - INFO - 80, dewpoint_hl, 56.26571063915242
2023-12-16 22:58:49,272 - enefit - INFO - 81, diffuse_solar_radiation_flw, 56.24523602775786
2023-12-17 00:54:40,794 - enefit - INFO - 82, target_cons_min_7, 56.15117204944836
2023-12-17 00:56:58,759 - enefit - INFO - 83, target_prod_max_7, 56.05035411699943
"""
# Rebuild the feature list from the pasted selection log: every non-empty line
# reads "<timestamp> - enefit - INFO - <n>, <feature>, <loss>", so the feature
# name is the second ', '-separated field. The seed calendar columns and
# 'unit_id' are not in the log and are prepended by hand.
consumption_cols = ["year", "month", "day", 'hour', 'unit_id'] + [
    log_line.split(', ')[1]
    for log_line in consumption_features_output.split('\n')
    if log_line
]
consumption_cols

['year',
 'month',
 'day',
 'hour',
 'unit_id',
 'county',
 'is_business',
 'product_type',
 'is_consumption',
 'eic_count',
 'installed_capacity',
 'hours_ahead',
 'temperature',
 'dewpoint',
 'cloudcover_low',
 'cloudcover_mid',
 'direct_solar_radiation',
 'surface_solar_radiation_downwards',
 'diffuse_solar_radiation',
 'surface_pressure',
 'cloudcover_low_hd',
 'cloudcover_mid_hd',
 'direct_solar_radiation_fd2w',
 'surface_solar_radiation_downwards_fd2w',
 'hours_ahead_f2lw',
 'temperature_f2lw',
 'target_prod_14',
 'target_cons_2',
 'target_cons_3',
 'target_cons_4',
 'target_cons_5',
 'target_cons_6',
 'target_cons_7',
 'target_cons_9',
 'target_cons_10',
 'target_cons_11',
 'target_cons_13',
 'target_cons_14',
 'target_cons_15',
 'target_cons_16',
 'target_cons_17',
 'target_cons_19',
 'holiday',
 'weekday',
 'sin(dayofyear)',
 'cos(dayofyear)',
 'sin(hour)',
 'cos(hour)',
 'target_prod_mean_7',
 'target_cons_std_7',
 'target_prod_std_7',
 'target_cons_mean_std_7',
 'cloudcover_

# Production columns

In [81]:
production_features_output = """
2023-12-15 19:48:00,467 - enefit - INFO - 5, county, 124.15651210080429
2023-12-15 19:48:28,603 - enefit - INFO - 6, is_business, 121.20772056434865
2023-12-15 19:49:00,896 - enefit - INFO - 7, product_type, 92.68979880981117
2023-12-15 19:49:33,367 - enefit - INFO - 8, is_consumption, 92.65604927152384
2023-12-15 19:51:48,423 - enefit - INFO - 9, eic_count, 90.38026798661473
2023-12-15 19:52:22,952 - enefit - INFO - 10, installed_capacity, 88.03690993003886
2023-12-15 19:52:57,544 - enefit - INFO - 11, hours_ahead, 86.23870932242907
2023-12-15 19:53:32,632 - enefit - INFO - 12, temperature, 85.60573737444231
2023-12-15 19:54:08,582 - enefit - INFO - 13, dewpoint, 75.7232262636399
2023-12-15 19:54:45,119 - enefit - INFO - 14, cloudcover_high, 72.74785490674934
2023-12-15 19:55:21,114 - enefit - INFO - 15, cloudcover_low, 62.76119235053672
2023-12-15 19:55:58,495 - enefit - INFO - 16, cloudcover_mid, 61.80701610978085
2023-12-15 19:56:35,826 - enefit - INFO - 17, cloudcover_total, 61.49053519970857
2023-12-15 19:57:51,725 - enefit - INFO - 18, 10_metre_v_wind_component, 60.94309133863345
2023-12-15 19:58:30,539 - enefit - INFO - 19, direct_solar_radiation, 53.87075372150773
2023-12-15 19:59:09,556 - enefit - INFO - 20, surface_solar_radiation_downwards, 48.633650592843345
2023-12-15 19:59:49,381 - enefit - INFO - 21, snowfall, 48.3369650433416
2023-12-15 20:00:28,989 - enefit - INFO - 22, total_precipitation, 47.713101226936125
2023-12-15 20:01:10,004 - enefit - INFO - 23, diffuse_solar_radiation, 47.458059482148066
2023-12-15 20:04:37,522 - enefit - INFO - 24, cloudcover_low_fl, 46.273454250762185
2023-12-15 20:05:20,288 - enefit - INFO - 25, cloudcover_mid_fl, 46.259132849130616
2023-12-15 20:06:02,518 - enefit - INFO - 26, cloudcover_total_fl, 46.077201076327064
2023-12-15 20:08:09,476 - enefit - INFO - 27, direct_solar_radiation_fl, 45.71983207044323
2023-12-15 20:08:52,309 - enefit - INFO - 28, surface_solar_radiation_downwards_fl, 45.4061581792446
2023-12-15 20:09:35,308 - enefit - INFO - 29, snowfall_fl, 45.21174307018654
2023-12-15 20:10:18,485 - enefit - INFO - 30, total_precipitation_fl, 45.022433088505025
2023-12-15 20:11:47,026 - enefit - INFO - 31, temperature_hd, 44.9442836397696
2023-12-15 20:13:16,000 - enefit - INFO - 32, rain, 44.92834827044291
2023-12-15 20:14:00,877 - enefit - INFO - 33, snowfall_hd, 44.84847128807503
2023-12-15 20:18:29,086 - enefit - INFO - 34, windspeed_10m, 44.840975875333946
2023-12-15 20:25:13,487 - enefit - INFO - 35, snowfall_hl, 44.792867554522054
2023-12-15 21:28:02,634 - enefit - INFO - 36, diffuse_solar_radiation_fd2w, 44.75085400091216
2023-12-15 21:38:02,139 - enefit - INFO - 37, total_precipitation_f2lw, 44.73921185760579
2023-12-15 22:03:12,995 - enefit - INFO - 38, target_prod_2, 44.574743085609676
2023-12-15 22:03:59,626 - enefit - INFO - 39, target_prod_3, 44.38799857327207
2023-12-15 22:04:46,428 - enefit - INFO - 40, target_prod_4, 44.384476923297505
2023-12-15 22:06:22,075 - enefit - INFO - 41, target_prod_6, 43.978415939209974
2023-12-15 22:07:10,258 - enefit - INFO - 42, target_prod_7, 43.465636357486154
2023-12-15 22:07:58,510 - enefit - INFO - 43, target_prod_8, 43.44079416058617
2023-12-15 22:12:51,036 - enefit - INFO - 44, target_prod_14, 43.0817563100863
2023-12-15 22:22:46,535 - enefit - INFO - 45, target_cons_6, 43.06252847330742
2023-12-15 22:23:36,512 - enefit - INFO - 46, target_cons_7, 42.684133606889624
2023-12-15 22:35:19,998 - enefit - INFO - 47, target_cons_21, 42.632302107761355
2023-12-15 22:36:14,741 - enefit - INFO - 48, holiday, 42.559001424559234
2023-12-15 22:40:48,600 - enefit - INFO - 49, cos(hour), 42.55083962652696
2023-12-15 22:44:28,778 - enefit - INFO - 50, target_prod_std_7, 41.7984418387088
2023-12-15 22:50:01,332 - enefit - INFO - 51, target_prod_max_7, 41.00738422365394
2023-12-15 22:53:45,617 - enefit - INFO - 52, target_prod_std_14, 40.879598318803644
2023-12-15 22:55:37,611 - enefit - INFO - 53, target_prod_mean_std_14, 40.81052778955865
2023-12-15 22:59:22,082 - enefit - INFO - 54, target_prod_max_14, 40.305945728978365
2023-12-15 23:06:52,876 - enefit - INFO - 55, target_prod_max_21, 40.20175541031205
2023-12-15 23:15:39,542 - enefit - INFO - 56, target_cons_ratio_2_14, 40.199122803420394
2023-12-15 23:16:37,548 - enefit - INFO - 57, target_prod_ratio_2_14, 40.07847970966332
2023-12-15 23:29:16,906 - enefit - INFO - 58, 10_metre_u_wind_component_fl, 40.05304091162581
2023-12-15 23:30:15,358 - enefit - INFO - 59, 10_metre_v_wind_component_fl, 40.00083517136362
2023-12-15 23:39:13,343 - enefit - INFO - 60, shortwave_radiation, 39.96561082265238
2023-12-15 23:45:12,281 - enefit - INFO - 61, rain_hl, 39.90446257214588
2023-12-16 00:02:07,485 - enefit - INFO - 62, cloudcover_mid_fdw, 39.86308317319391
2023-12-16 00:05:09,078 - enefit - INFO - 63, 10_metre_v_wind_component_fdw, 39.80925766707845
2023-12-16 00:08:09,195 - enefit - INFO - 64, snowfall_fdw, 39.796950303304385
2023-12-16 00:11:11,137 - enefit - INFO - 65, hours_ahead_flw, 39.68899159566499
2023-12-16 00:16:16,068 - enefit - INFO - 66, cloudcover_mid_flw, 39.65985217694088
2023-12-16 02:23:55,567 - enefit - INFO - 67, weekday, 39.65522643409212
2023-12-16 02:28:01,240 - enefit - INFO - 68, target_cons_mean_7, 39.6481885073438
2023-12-16 02:30:06,187 - enefit - INFO - 69, target_cons_std_7, 39.59014298616953
2023-12-16 02:35:23,155 - enefit - INFO - 70, target_cons_max_7, 39.58914752988496
2023-12-16 02:52:22,572 - enefit - INFO - 71, target_prod_mean_std_21, 39.57912850647163
2023-12-16 02:53:26,210 - enefit - INFO - 72, target_cons_ratio_7_21, 39.5258184169848
"""
# Rebuild the feature list from the pasted selection log: every non-empty line
# reads "<timestamp> - enefit - INFO - <n>, <feature>, <loss>", so the feature
# name is the second ', '-separated field. The seed calendar columns and
# 'unit_id' are not in the log and are prepended by hand.
production_cols = ["year", "month", "day", 'hour', 'unit_id'] + [
    log_line.split(', ')[1]
    for log_line in production_features_output.split('\n')
    if log_line
]
production_cols

['year',
 'month',
 'day',
 'hour',
 'unit_id',
 'county',
 'is_business',
 'product_type',
 'is_consumption',
 'eic_count',
 'installed_capacity',
 'hours_ahead',
 'temperature',
 'dewpoint',
 'cloudcover_high',
 'cloudcover_low',
 'cloudcover_mid',
 'cloudcover_total',
 '10_metre_v_wind_component',
 'direct_solar_radiation',
 'surface_solar_radiation_downwards',
 'snowfall',
 'total_precipitation',
 'diffuse_solar_radiation',
 'cloudcover_low_fl',
 'cloudcover_mid_fl',
 'cloudcover_total_fl',
 'direct_solar_radiation_fl',
 'surface_solar_radiation_downwards_fl',
 'snowfall_fl',
 'total_precipitation_fl',
 'temperature_hd',
 'rain',
 'snowfall_hd',
 'windspeed_10m',
 'snowfall_hl',
 'diffuse_solar_radiation_fd2w',
 'total_precipitation_f2lw',
 'target_prod_2',
 'target_prod_3',
 'target_prod_4',
 'target_prod_6',
 'target_prod_7',
 'target_prod_8',
 'target_prod_14',
 'target_cons_6',
 'target_cons_7',
 'target_cons_21',
 'holiday',
 'cos(hour)',
 'target_prod_std_7',
 'target_prod_ma

In [82]:
# Keep only the columns used by at least one of the two per-target feature
# lists, preserving the column order of X. The union is built once as a set
# instead of concatenating both lists again for every column.
selected_features = set(consumption_cols) | set(production_cols)
all_columns = [col for col in X.columns if col in selected_features]
X = X[all_columns]

# Validation

Models:
1) by is_consumption

2) by is_consumption and is_business

3) by is_consumption and product_type

4) 2-day diffs by is_consumption

Equal weights, single lightgbm for each model of the ensemble.

In [83]:
class Model:
    """One regressor for consumption rows and one for production rows.

    Rows are routed by the ``is_consumption`` column; each regressor is given
    only its own feature list (``consumption_cols`` / ``production_cols``).
    """

    def __init__(self):
        self.model_consumption = get_model(PARAMS['consumption'], cat_cols=[col for col in cat_cols if col in consumption_cols])
        self.model_production = get_model(PARAMS['production'], cat_cols=[col for col in cat_cols if col in production_cols])

    def _partitions(self, X):
        """Return ``(row mask, model, feature columns)`` for each sub-model."""
        is_consumption = X['is_consumption'] == 1
        return (
            (is_consumption, self.model_consumption, consumption_cols),
            (~is_consumption, self.model_production, production_cols),
        )

    def fit(self, X, y):
        """Fit both sub-models on their own rows of ``X`` / ``y``; returns ``self``.

        Partitions are passed to the estimators as-is, so a partition without
        training rows is not skipped here.
        """
        for mask, model, cols in self._partitions(X):
            model.fit(X.loc[mask, cols], y[mask])
        return self

    def predict(self, X):
        """Predict every row of ``X`` with the sub-model of its partition.

        Returns:
            1-D numpy array aligned with the rows of ``X``, clipped at zero.
        """
        predictions = np.zeros(len(X))
        for mask, model, cols in self._partitions(X):
            # A batch may hold only one target type; an empty partition is
            # skipped because there is nothing to fill in and estimators may
            # reject zero-row input.
            if mask.any():
                predictions[mask.values] = model.predict(X.loc[mask, cols])

        return np.clip(predictions, 0, np.inf)

In [84]:
class ModelBusiness:
    """Four regressors: one per (is_consumption, is_business) combination.

    Consumption sub-models use ``consumption_cols`` and production sub-models
    use ``production_cols``; the ``*_business`` models handle rows with
    ``is_business == 1``.
    """

    def __init__(self):
        self.model_consumption = get_model(PARAMS['consumption'], cat_cols=[col for col in cat_cols if col in consumption_cols])
        self.model_consumption_business = get_model(PARAMS['consumption'], cat_cols=[col for col in cat_cols if col in consumption_cols])

        self.model_production = get_model(PARAMS['production'], cat_cols=[col for col in cat_cols if col in production_cols])
        self.model_production_business = get_model(PARAMS['production'], cat_cols=[col for col in cat_cols if col in production_cols])

    def _partitions(self, X):
        """Return ``(row mask, model, feature columns)`` for each sub-model."""
        is_consumption = X['is_consumption'] == 1
        is_business = X['is_business'] == 1
        return (
            (is_consumption & ~is_business, self.model_consumption, consumption_cols),
            (is_consumption & is_business, self.model_consumption_business, consumption_cols),
            (~is_consumption & ~is_business, self.model_production, production_cols),
            (~is_consumption & is_business, self.model_production_business, production_cols),
        )

    def fit(self, X, y):
        """Fit every sub-model on its own rows of ``X`` / ``y``; returns ``self``.

        Partitions are passed to the estimators as-is, so a partition without
        training rows is not skipped here.
        """
        for mask, model, cols in self._partitions(X):
            model.fit(X.loc[mask, cols], y[mask])

        return self

    def predict(self, X):
        """Predict every row of ``X`` with the sub-model of its partition.

        Returns:
            1-D numpy array aligned with the rows of ``X``, clipped at zero.
        """
        predictions = np.zeros(len(X))
        for mask, model, cols in self._partitions(X):
            # Empty partitions are skipped: there is nothing to fill in and
            # estimators may reject zero-row input.
            if mask.any():
                predictions[mask.values] = model.predict(X.loc[mask, cols])

        return np.clip(predictions, 0, np.inf)

In [85]:
class ModelProductType:
    """Eight regressors: one per (is_consumption, product_type in 0..3) pair.

    Consumption sub-models use ``consumption_cols`` and production sub-models
    use ``production_cols``. Rows whose ``product_type`` is not 0, 1, 2 or 3
    belong to no partition: they are ignored by ``fit`` and predicted as 0.
    """

    def __init__(self):
        consumption_cat_cols = [col for col in cat_cols if col in consumption_cols]
        production_cat_cols = [col for col in cat_cols if col in production_cols]

        self.model_consumption_0 = get_model(PARAMS['consumption'], cat_cols=consumption_cat_cols)
        self.model_consumption_1 = get_model(PARAMS['consumption'], cat_cols=consumption_cat_cols)
        self.model_consumption_2 = get_model(PARAMS['consumption'], cat_cols=consumption_cat_cols)
        self.model_consumption_3 = get_model(PARAMS['consumption'], cat_cols=consumption_cat_cols)

        self.model_production_0 = get_model(PARAMS['production'], cat_cols=production_cat_cols)
        self.model_production_1 = get_model(PARAMS['production'], cat_cols=production_cat_cols)
        self.model_production_2 = get_model(PARAMS['production'], cat_cols=production_cat_cols)
        self.model_production_3 = get_model(PARAMS['production'], cat_cols=production_cat_cols)

    def _partitions(self, X):
        """Return ``(row mask, model, feature columns)`` for each sub-model.

        Order: consumption product types 0..3, then production product types 0..3.
        """
        is_consumption = X['is_consumption'] == 1
        type_masks = [X['product_type'] == product_type for product_type in range(4)]
        consumption_models = (
            self.model_consumption_0, self.model_consumption_1,
            self.model_consumption_2, self.model_consumption_3,
        )
        production_models = (
            self.model_production_0, self.model_production_1,
            self.model_production_2, self.model_production_3,
        )

        partitions = [
            (is_consumption & type_mask, model, consumption_cols)
            for type_mask, model in zip(type_masks, consumption_models)
        ]
        partitions += [
            (~is_consumption & type_mask, model, production_cols)
            for type_mask, model in zip(type_masks, production_models)
        ]
        return partitions

    def fit(self, X, y):
        """Fit every sub-model on its own rows of ``X`` / ``y``; returns ``self``.

        Partitions are passed to the estimators as-is, so a partition without
        training rows is not skipped here.
        """
        for mask, model, cols in self._partitions(X):
            model.fit(X.loc[mask, cols], y[mask])

        return self

    def predict(self, X):
        """Predict every row of ``X`` with the sub-model of its partition.

        Returns:
            1-D numpy array aligned with the rows of ``X``, clipped at zero.
        """
        predictions = np.zeros(len(X))
        for mask, model, cols in self._partitions(X):
            # A batch need not contain every product type; empty partitions are
            # skipped because there is nothing to fill in and estimators may
            # reject zero-row input.
            if mask.any():
                predictions[mask.values] = model.predict(X.loc[mask, cols])

        return np.clip(predictions, 0, np.inf)

In [86]:
class ModelDiff:
    """Two regressors that learn the target minus a lagged target column.

    Consumption rows use ``target_cons_<lag>`` and production rows use
    ``target_prod_<lag>`` as the baseline (missing baselines count as 0).
    The models are fitted on ``y - baseline`` and the baseline is added
    back at prediction time.
    """

    def __init__(self, lag=2):
        self.lag = lag
        consumption_cat_cols = [col for col in cat_cols if col in consumption_cols]
        production_cat_cols = [col for col in cat_cols if col in production_cols]
        self.model_consumption = get_model(PARAMS_DIFF['consumption'], cat_cols=consumption_cat_cols)
        self.model_production = get_model(PARAMS_DIFF['production'], cat_cols=production_cat_cols)

    def _segments(self, X):
        """Yield (row mask, model, feature columns, baseline column): consumption first, then production."""
        consumption_rows = X['is_consumption'] == 1
        yield consumption_rows, self.model_consumption, consumption_cols, f'target_cons_{self.lag}'
        yield ~consumption_rows, self.model_production, production_cols, f'target_prod_{self.lag}'

    def fit(self, X, y):
        """Fit both models on the difference between ``y`` and the lagged baseline; returns ``self``."""
        for rows, model, feature_cols, baseline_col in self._segments(X):
            baseline = X.loc[rows, baseline_col].fillna(0)
            model.fit(X.loc[rows, feature_cols], y[rows] - baseline)
        return self

    def predict(self, X):
        """Return predicted difference plus baseline for every row, clipped at 0."""
        predictions = np.zeros(len(X))
        for rows, model, feature_cols, baseline_col in self._segments(X):
            baseline = X.loc[rows, baseline_col].fillna(0).values
            predictions[rows.values] = model.predict(X.loc[rows, feature_cols]) + baseline
        return np.clip(predictions, 0, np.inf)

In [87]:
class ModelCombined:
    """Weighted blend of the notebook's sub-models.

    Parameters
    ----------
    weights : sequence of float, optional
        One weight per sub-model, in the insertion order of ``self.models``.
        Defaults to equal weights.
    """

    def __init__(self, weights = None):
        self.models = {
            # 'general': get_model(PARAMS['consumption'], cat_cols=[col for col in cat_cols if col in all_columns]),
            'is_consumption': Model(),
            'is_business': ModelBusiness(),
            'product_type': ModelProductType(),
            'diff_2': ModelDiff(2),
#             'diff_7': ModelDiff(7), 
        }
        if weights is None:
            self.weights = [1. / len(self.models) for _ in range(len(self.models))]
        else:
            assert len(weights) == len(self.models), \
                f"expected {len(self.models)} weights, got {len(weights)}"
            self.weights = weights

    def fit(self, X, y):
        """Fit every sub-model on the same ``X`` and ``y``; returns ``self``."""
        for model in self.models.values():
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted average of all sub-model predictions for ``X``."""
        # Stacking the per-model predictions gives shape (n_models, n_rows);
        # axis=0 averages across models with self.weights.
        return np.average(
            [model.predict(X) for model in self.models.values()],
            weights=self.weights,
            axis=0,
        )

In [88]:
def scatterplot_results(y_test, y_pred):
    """Plot predicted against actual values, coloured by range-normalised absolute error.

    The dashed red diagonal marks perfect predictions. The figure is shown
    and then closed.
    """
    lowest, highest = y_test.min(), y_test.max()
    relative_error = abs(y_test - y_pred) / (highest - lowest)

    plt.scatter(y_test, y_pred, alpha=0.5, c=relative_error, cmap='RdYlGn_r')
    diagonal = [lowest, highest]
    plt.plot(diagonal, diagonal, 'r--', lw=2)

    plt.xlabel("Actual values")
    plt.ylabel("Predicted values")
    plt.title("Scatter plot of predicted values against actual values")

    plt.show()
    plt.close()

def validate_model(model):
    """Cross-validate ``model`` on the global ``X``/``y`` using the global ``cv`` splitter.

    The model is refitted on every fold; out-of-fold predictions are pooled,
    then overall, consumption and production MAE are logged and a scatter plot
    is drawn for each direction. Returns the model as fitted on the last fold.
    """
    fold_frames = []
    for train_idx, val_idx in tqdm(cv.split(X, y), total=cv.n_splits):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])

        X_val = X.iloc[val_idx]
        # assign() returns a copy, so the global X is left untouched.
        fold_frames.append(
            X_val.assign(y=y.iloc[val_idx].values, y_hat=model.predict(X_val))
        )

    df_res = pd.concat(fold_frames)

    consumption = df_res[df_res['is_consumption'] == 1]
    production = df_res[df_res['is_consumption'] == 0]

    logger.info(f"MAE {mean_absolute_error(df_res['y'], df_res['y_hat']):.3f}")

    logger.info(f"Consumption MAE {mean_absolute_error(consumption['y'], consumption['y_hat']):.3f}")
    scatterplot_results(consumption['y'], consumption['y_hat'])

    logger.info(f"Production MAE {mean_absolute_error(production['y'], production['y_hat']):.3f}")
    scatterplot_results(production['y'], production['y_hat'])

    return model

In [89]:
# Optional cross-validation of the Model() sub-model (skipped unless VALIDATE is set).
if VALIDATE:
    model = validate_model(Model())

In [90]:
# Optional cross-validation of the per-product-type sub-model (skipped unless VALIDATE is set).
if VALIDATE:
    model = validate_model(ModelProductType())

In [91]:
# Optional cross-validation of the ModelBusiness() sub-model (skipped unless VALIDATE is set).
if VALIDATE:
    model = validate_model(ModelBusiness())

In [92]:
# Optional cross-validation of the lag-difference sub-model with its default lag (skipped unless VALIDATE is set).
if VALIDATE:
    model = validate_model(ModelDiff())

In [93]:
# Optional cross-validation of the equally weighted blend of all sub-models (skipped unless VALIDATE is set).
if VALIDATE:
    model = validate_model(ModelCombined())

# Model training

In [94]:
def train_model(X, y):
    """Fit a fresh ``ModelCombined`` on the most recent 14 months of labelled rows.

    Only index labels present in both ``X`` and ``y`` are used. Row timestamps
    are assembled from the ``year``/``month``/``day``/``hour`` columns, and rows
    not strictly newer than (latest timestamp - 14 months) are dropped.
    """
    revealed_rows = np.intersect1d(X.index, y.index).tolist()

    features = to_categorical(X.loc[revealed_rows])
    targets = y.loc[revealed_rows]

    row_times = pd.to_datetime(features[['year', 'month', 'day', 'hour']])
    window_start = row_times.max() - pd.DateOffset(months=14)
    is_recent = row_times > window_start

    return ModelCombined().fit(features[is_recent], targets[is_recent])

# Prediction

In [95]:
from collections import deque

In [96]:
import enefit

# Create the competition API environment and the iterator consumed by the prediction loop.
env = enefit.make_env()
# NOTE(review): presumably resets the API's "already called" guard so this cell can be
# re-run without restarting the kernel — confirm against the enefit module.
enefit.make_env.func_dict['__called__'] = False
iter_test = env.iter_test()

In [97]:
def update_X_y(X, y, X_revealed, y_revealed):
    """Append newly revealed rows to ``X`` and ``y``, keeping existing index labels.

    When an index label is already present, the earlier (existing) entry wins
    and the incoming duplicate is discarded.
    """
    def _append_unseen(existing, incoming):
        merged = pd.concat([existing, incoming])
        already_seen = merged.index.duplicated(keep='first')
        return merged[~already_seen]

    return _append_unseen(X, X_revealed), _append_unseen(y, y_revealed)

In [98]:
# Number of most recently trained models kept for blending at prediction time.
n_models_stored = 1
# Bounded deque: appending beyond maxlen discards the oldest stored model.
models = deque(maxlen=n_models_stored)

In [99]:
def trim_historical_data(df, ts, ts_column='datetime'):
    """Return the rows of ``df`` whose ``ts_column`` value is at or after ``ts``."""
    is_recent_enough = df[ts_column] >= ts
    return df.filter(is_recent_enough)

In [100]:
def get_model_weights(n, step=1):
    """Return ``n`` exponentially spaced weights that sum to 1.

    Weight ``i`` (1-based) is proportional to ``exp(i * step)``: a positive
    ``step`` favours later entries, a negative one earlier entries, and
    ``step=0`` gives uniform weights.

    Parameters
    ----------
    n : int
        Number of weights to produce; ``n <= 0`` yields an empty array.
    step : float, default 1
        Spacing between consecutive exponents.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n``.
    """
    if n <= 0:
        return np.zeros(0)
    # Integer ranks times step always give exactly n exponents; a float
    # arange with an epsilon stop does not for negative, zero or tiny steps.
    exponents = np.arange(1, n + 1) * step
    # Subtracting the max leaves the normalised weights unchanged but keeps
    # exp() from overflowing to inf (and the result from becoming NaN).
    w = np.exp(exponents - exponents.max())
    return w / w.sum()
    
# Exponent spacing passed to get_model_weights by the prediction loop.
step = 1
# Display the blend weights for the configured number of stored models.
get_model_weights(n_models_stored, step)

array([1.])

In [101]:
%%time
train_every_n_days = 4
def is_time_to_retrain(current_ts, last_train_ts):
    """Return True once at least ``train_every_n_days`` days separate the two timestamps."""
    retrain_interval = pd.Timedelta(days=train_every_n_days)
    since_last_training = current_ts - last_train_ts
    return since_last_training >= retrain_interval

def is_enough_time_for_training(max_time_elapsed = 8.5 * 60 * 60):
    """
    Return True while the notebook has been running for less than
    `max_time_elapsed` seconds (default: 8 hours 30 minutes) since START_NOTEBOOK.

    Per the author's note, a submission should take less than 9 hours; once the
    elapsed time passes the limit, models are no longer re-trained to avoid a
    timeout, as training takes a lot of time.
    """
    elapsed = dt.datetime.now() - START_NOTEBOOK
    return elapsed.total_seconds() < max_time_elapsed

# Main inference loop: one iteration per batch yielded by the competition API.
# Starts at a date far before any batch so the first is_time_to_retrain check passes.
last_train_timestamp = pd.Timestamp(2020, 1, 1)
for i, (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction) in enumerate(iter_test):
    
    test = test.rename(columns={"prediction_datetime": "datetime"})
    current_timestamp = test['datetime'].min()
    # Rolling raw-data tables are trimmed to the last 4 weeks before the current batch.
    buffer_timestamp = current_timestamp - pd.DateOffset(weeks=4)

    logger.info(current_timestamp)

    # A batch counts as scored if at least one of its rows is flagged.
    currently_scored = False
    
    if test['currently_scored'].sum() > 0:
        currently_scored = True

    # Outside DEBUG, unscored batches up to 2023-05-31 are answered with the
    # untouched sample prediction and skipped entirely.
    if not DEBUG:
        if not currently_scored and current_timestamp <= pd.Timestamp(2023, 5, 31): # Skip train set
            env.predict(sample_prediction)
            continue
    
    # Convert the pandas frames from the API to polars with the notebook's schemas.
    # data_cols[1:] drops the leading 'target' column.
    df_test           = pl.from_pandas(test[data_cols[1:]], schema_overrides=schema_data)
    df_client         = pl.from_pandas(client[client_cols], schema_overrides=schema_client)
    df_gas            = pl.from_pandas(gas_prices[gas_cols], schema_overrides=schema_gas)
    df_electricity    = pl.from_pandas(electricity_prices[electricity_cols], schema_overrides=schema_electricity)
    df_new_forecast   = pl.from_pandas(forecast_weather[forecast_cols], schema_overrides=schema_forecast)
    df_new_historical = pl.from_pandas(historical_weather[historical_cols], schema_overrides=schema_historical)
    df_new_target     = pl.from_pandas(revealed_targets[target_cols], schema_overrides=schema_target)
    
    # Append the new rows to the accumulated tables, drop exact duplicates and
    # discard everything older than the buffer window.
    df_forecast       = trim_historical_data(pl.concat([df_forecast, df_new_forecast]).unique(), buffer_timestamp, 'forecast_datetime')
    df_historical     = trim_historical_data(pl.concat([df_historical, df_new_historical]).unique(), buffer_timestamp, 'datetime')
    df_target         = trim_historical_data(pl.concat([df_target, df_new_target]).unique(), buffer_timestamp, 'datetime')

    X_test = feature_eng(df_test, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target, df_holidays, df_units)
    X_test = to_pandas(X_test)
    
    # Normalise infinities and None to NaN before the categorical conversion.
    X_test = X_test.replace([np.inf, -np.inf, None], np.nan)
    
    X_test = to_categorical(X_test)
    X_test = X_test[all_columns]

    # Grow the training set: today's feature rows go into X, the revealed
    # (non-null) targets keyed by row_id go into y.
    X, y = update_X_y(X, y, X_test, revealed_targets.set_index('row_id')['target'].dropna())

    if currently_scored or DEBUG:
        # Retrain only when the retrain interval has elapsed and the runtime budget allows it.
        if is_time_to_retrain(current_timestamp, last_train_timestamp) and is_enough_time_for_training():
            model = train_model(X, y)
            models.append(model)
            last_train_timestamp = current_timestamp
        
        # NOTE(review): if no model has been trained yet (`models` empty), the
        # average below would fail — presumably unreachable because the first
        # scored batch always triggers training; confirm.
        preds = [model.predict(X_test) for model in models]
        
        # Blend the stored models with get_model_weights and clip negatives to 0.
        sample_prediction["target"] = np.average(preds, weights=get_model_weights(len(models), step), axis=0).clip(0)

    # A prediction is submitted for every batch that was not skipped above.
    env.predict(sample_prediction)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


2024-05-03 10:08:44,839 - enefit - INFO - 2023-05-28 00:00:00
2024-05-03 10:08:44,839 - enefit - INFO - 2023-05-28 00:00:00
2024-05-03 10:08:44,846 - enefit - INFO - 2023-05-29 00:00:00
2024-05-03 10:08:44,846 - enefit - INFO - 2023-05-29 00:00:00
2024-05-03 10:08:44,855 - enefit - INFO - 2023-05-30 00:00:00
2024-05-03 10:08:44,855 - enefit - INFO - 2023-05-30 00:00:00
2024-05-03 10:08:44,863 - enefit - INFO - 2023-05-31 00:00:00
2024-05-03 10:08:44,863 - enefit - INFO - 2023-05-31 00:00:00


CPU times: user 228 ms, sys: 2.48 ms, total: 231 ms
Wall time: 242 ms


In [102]:
# NOTE(review): no CSV is written in the visible cells; presumably the submission file is
# produced by the enefit environment via env.predict — confirm. The message also
# misspells "Successfully".
print("Sucessfully saved as CSV file")

Sucessfully saved as CSV file
