# Libraries

In [16]:
import numpy as np
import pandas as pd
import polars as pl
import holidays
import datetime
import os
import warnings

In [17]:
import enefit

In [18]:
from neuralforecast import NeuralForecast

# Load trained models

In [19]:
nf_production = NeuralForecast.load(path='/kaggle/input/upd-neural-models/production')
nf_consumption = NeuralForecast.load(path='/kaggle/input/upd-neural-models/consumption')

In [20]:
nf_production.models, nf_consumption.models

([TFT, PatchTST, NHITS], [TFT, PatchTST, NHITS])

# Data preprocessing

In [21]:
def feature_eng(df_data, df_client, df_gas, df_electricity, df_forecast, df_location, df_target):
    df_data = (
        df_data
        .join(df_target, on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
    )

    df_consume = (
        df_data
        .filter(pl.col("is_consumption").eq(1))
        .select("target", "county", "is_business", "product_type", "datetime")
    )

    df_data = (
        df_data
        .filter(pl.col("is_consumption").eq(0))
        .join(df_consume, on=["county", "is_business", "product_type", "datetime"], how="left")
        .rename({"target": "production", "target_right": "consumption"})
        .with_columns(pl.col("datetime").cast(pl.Date).alias("date"))
        .drop("is_consumption")
    )

    df_client = (
        df_client
        .with_columns((pl.col("date") + pl.duration(days=2)).cast(pl.Date))
    )

    df_gas = (
        df_gas
        .rename({"forecast_date": "date"})
        .with_columns((pl.col("date") + pl.duration(days=1)).cast(pl.Date))
    )

    df_electricity = (
        df_electricity
        .rename({"forecast_date": "datetime"})
        .with_columns(datetime=pl.col("datetime") + pl.duration(days=1))
    )

    df_location = (
        df_location
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32)
        )
    )

    df_forecast = (
        df_forecast
        .rename({"forecast_datetime": "datetime"})
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
        )
        .join(df_location, how="left", on=["longitude", "latitude"])
        .drop("longitude", "latitude")
        .with_columns(pl.col("county").fill_nan(12))
        .group_by("county", "datetime").mean()
    )

    df_data = (
        df_data
        .join(df_gas, on="date", how="left")
        .join(df_client, on=["county", "is_business", "product_type", "date"], how="left")
        .join(df_electricity, on="datetime", how="left")
        .join(df_forecast, on=["county", "datetime"], how="left")
        .with_columns(
            pl.col(pl.Float64).cast(pl.Float32),
        )
        .drop("date")
    )

    return df_data

In [22]:
def clean_interpolate_timeseries(df_data_to_clean):

    chunks = []
    for (county, is_business, product_type), grp in df_data_to_clean.groupby(
            ["county", "is_business", "product_type"]
    ):
        # interpolate missing values
        interpolated = (
            grp.sort_values(by="datetime")
            .set_index("datetime")
            .interpolate(method="time")
            .reset_index()
        )

        has_gaps = len(interpolated["datetime"].diff().value_counts()) != 1
        if has_gaps:
            # it is assumed that there will be no gaps between observations
            warnings.warn(
                f"{(is_business,product_type,county)=} has gaps in data (there are larger gaps than 1h in datetime column); skipping this series."
            )
            continue
        chunks.append(interpolated)
    
    if len(chunks) != 0:
        df_train_interpolated = pd.concat(chunks).reset_index()
        return df_train_interpolated
    else:
        return df_data_to_clean

In [23]:
class FeaturesGenerator:
    def __init__(self):
        self.estonian_holidays = list(
            holidays.country_holidays("EE", years=range(2021, 2026)).keys()
        )

    def _add_general_features(self, df_features):
        df_features = (
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.year().alias("year"),
            )
            .with_columns(
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    separator="_",
                ).alias("segment"),
            )
            .with_columns(
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )
        return df_features


    def is_country_holiday(self, row):
        return (
            datetime.date(row["year"], row["month"], row["day"])
            in self.estonian_holidays
        )

    def _add_holidays_features(self, df_features):
        df_features = df_features.with_columns(
            pl.struct(["year", "month", "day"])
            .apply(self.is_country_holiday)
            .alias("is_country_holiday")
        )
        return df_features

    def _drop_columns(self, df_features):
        df_features = df_features.drop(
            "date", "hour", "dayofyear"
        )
        return df_features

    def generate_features(self, df_features):

        for add_features in [
            self._add_general_features,
            self._add_holidays_features,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        return df_features

In [24]:
def to_pandas(df):
    df = df.to_pandas()
    df = df.set_index("row_id")

    return df

In [25]:
feat_gen = FeaturesGenerator()

In [26]:
root = "/kaggle/input/predict-energy-behavior-of-prosumers"

data_cols        = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'row_id']
client_cols      = ['product_type', 'county', 'eic_count', 'installed_capacity', 'is_business', 'date']
gas_cols         = ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh']
electricity_cols = ['forecast_date', 'euros_per_mwh']
forecast_cols    = ['latitude', 'longitude', 'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'forecast_datetime', 'direct_solar_radiation', 'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation']
location_cols    = ['longitude', 'latitude', 'county']
target_cols      = ['county', 'is_business', 'product_type', 'target', 'is_consumption', 'datetime']

In [27]:
df_data        = pl.read_csv(os.path.join(root, "train.csv"), columns=data_cols, try_parse_dates=True)
df_client      = pl.read_csv(os.path.join(root, "client.csv"), columns=client_cols, try_parse_dates=True)
df_gas         = pl.read_csv(os.path.join(root, "gas_prices.csv"), columns=gas_cols, try_parse_dates=True)
df_electricity = pl.read_csv(os.path.join(root, "electricity_prices.csv"), columns=electricity_cols, try_parse_dates=True)
df_forecast    = pl.read_csv(os.path.join(root, "forecast_weather.csv"), columns=forecast_cols, try_parse_dates=True)
df_location    = pl.read_csv(os.path.join(root, "weather_station_to_county_mapping.csv"), columns=location_cols, try_parse_dates=True)

df_target      = df_data.select(target_cols)
df_data        = df_data.drop("target")

df_data        = df_data.filter(pl.col("datetime").lt(pd.Timestamp("20230528")))
df_client      = df_client.filter(pl.col("date").lt(pd.Timestamp("20230526")))
df_gas         = df_gas.filter(pl.col("forecast_date").lt(pd.Timestamp("20230527")))
df_electricity = df_electricity.filter(pl.col("forecast_date").lt(pd.Timestamp("20230527")))
df_target      = df_target.filter(pl.col("datetime").lt(pd.Timestamp("20230526")))

schema_data        = df_data.schema
schema_client      = df_client.schema
schema_gas         = df_gas.schema
schema_electricity = df_electricity.schema
schema_forecast    = df_forecast.schema
schema_target      = df_target.schema

# Make predictions

In [28]:
import enefit
enefit.make_env.func_dict['__called__'] = False
env = enefit.make_env()

In [29]:
iter_test = env.iter_test()

i = 0
for (
    test, 
    revealed_targets, 
    client, 
    _, 
    forecast_weather, 
    electricity_prices, 
    gas_prices, 
    sample_prediction
) in iter_test:
    
    print('Iter_test ', i)
    i+=1
    
    test = test.rename(columns={"prediction_datetime": "datetime"})

    print(test.datetime.unique())
    
    test = test.drop_duplicates(subset=["row_id"])
    revealed_targets = revealed_targets.drop_duplicates(subset=["datetime", "county", "is_business", "product_type", "is_consumption"])
    client = client.drop_duplicates(["county", "is_business", "product_type", "date"])
    gas_prices = gas_prices.drop_duplicates(["forecast_date"])
    electricity_prices = electricity_prices.drop_duplicates(["forecast_date"])
    
    row_id = test["row_id"].tolist()

    df_new_data        = pl.from_pandas(test[data_cols[1:]], schema_overrides=schema_data)
    df_new_client      = pl.from_pandas(client[client_cols], schema_overrides=schema_client)
    df_new_gas         = pl.from_pandas(gas_prices[gas_cols].tail(1), schema_overrides=schema_gas)
    df_new_electricity = pl.from_pandas(electricity_prices[electricity_cols], schema_overrides=schema_electricity)
    df_new_forecast    = pl.from_pandas(forecast_weather[forecast_cols], schema_overrides=schema_forecast)
    df_new_target      = pl.from_pandas(revealed_targets[target_cols], schema_overrides=schema_target)

    df_data        = pl.concat([df_data, df_new_data])
    df_client      = pl.concat([df_client, df_new_client])
    df_gas         = pl.concat([df_gas, df_new_gas])
    df_electricity = pl.concat([df_electricity, df_new_electricity])
    df_forecast    = pl.concat([df_forecast, df_new_forecast])
    df_target      = pl.concat([df_target, df_new_target])

    df_test = feature_eng(df_data, df_client, df_gas, df_electricity, df_forecast, df_location, df_target)
    df_test = feat_gen.generate_features(df_test)
    df_test = df_test.to_pandas()
    
    display("Predicting...")
    
    try:
        last_48_h = df_test.datetime.unique()[-2*24:]
        print('Time steps to predict: ', len(last_48_h))

        lookback_h = df_test.datetime.unique()[-9*24:-2*24]
        print('Time steps lookback: ', len(lookback_h))

        segments_to_predict = df_test[df_test['datetime'].isin(last_48_h)]['segment']
        print('Number of unique segments to predict:', segments_to_predict.nunique())
        print('Number of total segments in df_test:', df_test['segment'].nunique())

        segments_in_train_set = nf_production.uids

        static_df = df_test[["segment","county", "is_business", "product_type"]][df_test['segment'].isin(segments_to_predict)].rename(columns={'segment':'unique_id'})
        static_df = static_df[static_df['unique_id'].isin(segments_in_train_set)]
        static_df = static_df.drop_duplicates()
        print('Number of unique segments in static_df:', static_df['unique_id'].nunique())

        df_test_lookback = df_test[df_test['datetime'].isin(lookback_h)]
        df_test_lookback = df_test_lookback[df_test_lookback['segment'].isin(segments_in_train_set)]
        #df_test_lookback = clean_interpolate_timeseries(df_test_lookback)
        df_test_lookback = df_test_lookback.drop(columns=["county", "is_business", "product_type"])

        df_test_lookback = df_test_lookback.interpolate(method='bfill')

        futr_df = df_test[df_test['datetime'].isin(last_48_h)]
        futr_df = futr_df[futr_df['segment'].isin(segments_in_train_set)]
        print(f"Segments in futr_df: {futr_df['segment'].nunique()}")
        futr_df = futr_df.drop(columns=["county", "is_business", "product_type","production","consumption","row_id"])

        preds_production = nf_production.predict(df=df_test_lookback.rename(columns={'datetime':'ds','production':'y','segment':'unique_id'}),
                                                 static_df=static_df,
                                                 futr_df=futr_df.rename(columns={'datetime':'ds','segment':'unique_id'}))

        preds_production[['TFT', 'PatchTST', 'NHITS']] = preds_production[['TFT', 'PatchTST', 'NHITS']].applymap(lambda x: max(0, x))
        preds_production['pred_prod'] = (preds_production['TFT'] + preds_production['PatchTST'] + preds_production['NHITS']) / 3

        preds_cons = nf_consumption.predict(df=df_test_lookback.rename(columns={'datetime':'ds','consumption':'y','segment':'unique_id'}),
                                             static_df=static_df,
                                             futr_df=futr_df.rename(columns={'datetime':'ds','segment':'unique_id'}))

        preds_cons[['TFT', 'PatchTST', 'NHITS']] = preds_cons[['TFT', 'PatchTST', 'NHITS']].applymap(lambda x: max(0, x))
        preds_cons['pred_cons'] = (preds_cons['TFT'] + preds_cons['PatchTST'] + preds_cons['NHITS']) / 3

        test['unique_id'] = test[['county', 'is_business', 'product_type']].astype(int).astype(str).agg('_'.join, axis=1)

        preds_cons = preds_cons.reset_index().rename(columns={'ds':'datetime'})
        preds_prods = preds_production.reset_index().rename(columns={'ds':'datetime'})

        test_with_preds = test.merge(preds_cons[['datetime', 'unique_id', 'pred_cons']], on=['datetime', 'unique_id'], how="left")
        test_with_preds = test_with_preds.merge(preds_prods[['datetime', 'unique_id', 'pred_prod']], on=['datetime', 'unique_id'], how="left")

        # Is consumption
        mask = test_with_preds['is_consumption'] == 1
        preds_consumption = test_with_preds[mask]['pred_cons']
        test_with_preds.loc[mask.values, "target"] = preds_consumption

        # Is production
        mask = test['is_consumption'] == 0
        preds_production = test_with_preds[mask]['pred_prod']
        test_with_preds.loc[mask.values, "target"] = preds_production

        test_with_preds = test_with_preds.sort_values('datetime').interpolate(method='bfill')

        test_with_preds['target'] = test_with_preds['target'].clip(0,15000)

        test_with_preds = test_with_preds.groupby('row_id').agg({'target': np.mean}).reset_index().fillna(0)

        test_with_preds = test_with_preds[['row_id','target']].sort_index()

        test_with_preds["target"] = test_with_preds["target"].astype(float)

        test_with_preds["target"].fillna(0.0, inplace=True)

        # send predictions
        env.predict(test_with_preds)
        
    except:
        sample_prediction['target'] = 0
        env.predict(sample_prediction)
        
    print('Prediction sent.')
    print()
    break

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
Iter_test  0
<DatetimeArray>
['2023-05-28 00:00:00', '2023-05-28 01:00:00', '2023-05-28 02:00:00',
 '2023-05-28 03:00:00', '2023-05-28 04:00:00', '2023-05-28 05:00:00',
 '2023-05-28 06:00:00', '2023-05-28 07:00:00', '2023-05-28 08:00:00',
 '2023-05-28 09:00:00', '2023-05-28 10:00:00', '2023-05-28 11:00:00',
 '2023-05-28 12:00:00', '2023-05-28 13:00:00', '2023-05-28 14:00:00',
 '2023-05-28 15:00:00', '2023-05-28 16:00:00', '2023-05-28 17:00:00',
 '2023-05-28 18:00:00', '2023-05-28 19:00:00', '2023-05-28 20:00:00',
 '2023-05-28 21:00:00', '2023-05-28 22:00:00', '2023-05-28 23:00:00']
Length: 24, dtype: datetime64[ns]


'Predicting...'

Time steps to predict:  48
Time steps lookback:  168
Number of unique segments to predict: 66
Number of total segments in df_test: 69
Number of unique segments in static_df: 62
Segments in futr_df: 62


Predicting: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

Prediction sent.

