# Install darts, load libraries

In [None]:
# Cannot use simple pip install because internet is disabled in this competition
#!pip install -qq darts --target=/kaggle/working/ # See https://unit8co.github.io/darts/
# Using another notebook with internet access as a utility script. See https://www.kaggle.com/code/kononenko/pip-install-no-internet/notebook

In [None]:
import darts

In [None]:
darts.__version__

In [None]:
import torch
import random

from tqdm.notebook import tqdm
from pytorch_lightning.callbacks import Callback, EarlyStopping

In [None]:
import os
import warnings

import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import polars as pl

import torch
from torch import nn

import enefit

warnings.filterwarnings("ignore")

# Data preparation

In [None]:
def feature_eng(df_data, df_client, df_gas, df_electricity, df_forecast, df_location, df_target):
    """Join all raw tables into one wide frame with a row per segment and hour.

    A segment is a (county, is_business, product_type) combination. Only the
    production rows (``is_consumption == 0``) of ``df_data`` are kept; the
    matching consumption target is attached as a second column, so each row
    carries both ``production`` and ``consumption``.

    Args:
        df_data: polars frame with the segment keys, ``is_consumption`` and
            ``datetime`` (any further columns are carried through).
        df_client: polars frame with the segment keys and ``date``.
        df_gas: polars frame with ``forecast_date`` and the gas price columns.
        df_electricity: polars frame with ``forecast_date`` and the
            electricity price column.
        df_forecast: polars frame with ``latitude``, ``longitude``,
            ``forecast_datetime`` and numeric weather columns.
        df_location: polars frame mapping ``longitude``/``latitude`` to
            ``county``.
        df_target: polars frame with the segment keys, ``is_consumption``,
            ``datetime`` and ``target``.

    Returns:
        A polars frame with the joined features; every Float64 column is cast
        to Float32 and the helper ``date`` column is dropped.
    """
    segment_keys = ["county", "is_business", "product_type"]

    # Attach the known target to every (segment, is_consumption, hour) row.
    df_data = df_data.join(
        df_target,
        on=segment_keys + ["is_consumption", "datetime"],
        how="left",
    )

    # Consumption targets, to be placed next to the production rows below.
    df_consume = (
        df_data
        .filter(pl.col("is_consumption").eq(1))
        .select("target", "county", "is_business", "product_type", "datetime")
    )

    df_data = (
        df_data
        .filter(pl.col("is_consumption").eq(0))
        .join(df_consume, on=segment_keys + ["datetime"], how="left")
        # The join suffixes the right-hand duplicate column with "_right".
        .rename({"target": "production", "target_right": "consumption"})
        .with_columns(pl.col("datetime").cast(pl.Date).alias("date"))
        .drop("is_consumption")
    )

    # Client records are joined onto the date two days after their own date.
    # NOTE(review): presumably this mirrors the data-availability lag - confirm.
    df_client = (
        df_client
        .with_columns((pl.col("date") + pl.duration(days=2)).cast(pl.Date))
    )

    # Gas prices are joined onto the day after their forecast date.
    df_gas = (
        df_gas
        .rename({"forecast_date": "date"})
        .with_columns((pl.col("date") + pl.duration(days=1)).cast(pl.Date))
    )

    # Electricity prices are joined onto the same hour of the following day.
    df_electricity = (
        df_electricity
        .rename({"forecast_date": "datetime"})
        .with_columns(datetime=pl.col("datetime") + pl.duration(days=1))
    )

    # Coordinates are cast to Float32 on both sides so the join keys have the
    # same dtype.
    df_location = (
        df_location
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32)
        )
    )

    df_forecast = (
        df_forecast
        .rename({"forecast_datetime": "datetime"})
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
        )
        .join(df_location, how="left", on=["longitude", "latitude"])
        .drop("longitude", "latitude")
        # Grid points without a county come out of the left join as null
        # (not NaN), so fill_null - rather than fill_nan - is what assigns
        # them to the fallback county id 12.
        .with_columns(pl.col("county").fill_null(12))
        # One averaged forecast per county and timestamp.
        .group_by("county", "datetime").mean()
    )

    df_data = (
        df_data
        .join(df_gas, on="date", how="left")
        .join(df_client, on=segment_keys + ["date"], how="left")
        .join(df_electricity, on="datetime", how="left")
        .join(df_forecast, on=["county", "datetime"], how="left")
        .with_columns(
            pl.col(pl.Float64).cast(pl.Float32),
        )
        .drop("date")
    )

    return df_data

In [None]:
def to_pandas(df):
    """Convert *df* via its ``to_pandas()`` method and index the result by ``row_id``."""
    return df.to_pandas().set_index("row_id")

### Global Variables

In [None]:
# Directory holding the competition CSV files.
root = "/kaggle/input/predict-energy-behavior-of-prosumers"

# Columns kept from each table (passed as `columns=` when reading the CSVs
# and used to select columns from the test-time frames).
data_cols = [
    'target',
    'county',
    'is_business',
    'product_type',
    'is_consumption',
    'datetime',
    'row_id',
]
client_cols = [
    'product_type',
    'county',
    'eic_count',
    'installed_capacity',
    'is_business',
    'date',
]
gas_cols = [
    'forecast_date',
    'lowest_price_per_mwh',
    'highest_price_per_mwh',
]
electricity_cols = [
    'forecast_date',
    'euros_per_mwh',
]
forecast_cols = [
    'latitude',
    'longitude',
    'temperature',
    'dewpoint',
    'cloudcover_high',
    'cloudcover_low',
    'cloudcover_mid',
    'cloudcover_total',
    '10_metre_u_wind_component',
    '10_metre_v_wind_component',
    'forecast_datetime',
    'direct_solar_radiation',
    'surface_solar_radiation_downwards',
    'snowfall',
    'total_precipitation',
]
location_cols = [
    'longitude',
    'latitude',
    'county',
]
target_cols = [
    'county',
    'is_business',
    'product_type',
    'target',
    'is_consumption',
    'datetime',
]

### Data I/O

In [None]:
def _read_table(filename, columns):
    """Read one CSV from `root`, keeping only *columns* and parsing dates."""
    return pl.read_csv(os.path.join(root, filename), columns=columns, try_parse_dates=True)


# Load the raw tables.
df_data = _read_table("train.csv", data_cols)
df_client = _read_table("client.csv", client_cols)
df_gas = _read_table("gas_prices.csv", gas_cols)
df_electricity = _read_table("electricity_prices.csv", electricity_cols)
df_forecast = _read_table("forecast_weather.csv", forecast_cols)
df_location = _read_table("weather_station_to_county_mapping.csv", location_cols)

# Split the target out of the main table into its own frame.
df_target = df_data.select(target_cols)
df_data = df_data.drop("target")

# Truncate each table at its own cut-off date (strictly before).
df_data = df_data.filter(pl.col("datetime").lt(pd.Timestamp("20230528")))
df_client = df_client.filter(pl.col("date").lt(pd.Timestamp("20230526")))
df_gas = df_gas.filter(pl.col("forecast_date").lt(pd.Timestamp("20230527")))
df_electricity = df_electricity.filter(pl.col("forecast_date").lt(pd.Timestamp("20230527")))
df_target = df_target.filter(pl.col("datetime").lt(pd.Timestamp("20230526")))

# Remember the schemas so that frames received at test time can be built
# with the same dtypes.
schema_data, schema_client, schema_gas = df_data.schema, df_client.schema, df_gas.schema
schema_electricity, schema_forecast, schema_target = (
    df_electricity.schema,
    df_forecast.schema,
    df_target.schema,
)

### Feature Engineering

In [None]:
# Build the training feature table and move it into pandas (indexed by row_id).
df_train = to_pandas(
    feature_eng(df_data, df_client, df_gas, df_electricity, df_forecast, df_location, df_target)
)

# Keep only rows where both targets are present.
df_train = df_train.dropna(subset=["consumption", "production"])

In [None]:
# Keep only observations strictly after the start of 2022
train_start = pd.to_datetime('2022-01-01 00:00:00')
df_train = df_train.loc[df_train['datetime'] > train_start]

In [None]:
df_train.info()

In [None]:
df_train

In [None]:
# Distinct values of each segment key found in the training data.
county_list, business_list, product_type_list = (
    df_train[key_column].unique()
    for key_column in ('county', 'is_business', 'product_type')
)

for distinct_values in (county_list, business_list, product_type_list):
    display(distinct_values)

In [None]:
# Size of the full cross product of the distinct county, is_business and
# product_type values. This counts every possible combination; not all of
# them actually occur in the data.
len(county_list) * len(business_list) * len(product_type_list)

In [None]:
def to_time_series_group(df_data_in):
    """Build the Darts series of every county listed in ``county_list``.

    The input frame is deep-copied first. For each county, its rows are handed
    to ``TimeSeries.from_group_dataframe`` grouped by (county, is_business,
    product_type) on an hourly ``datetime`` axis, with missing dates filled in.

    Args:
        df_data_in: pandas frame holding ``datetime``, the three group
            columns and all value columns listed below.

    Returns:
        Dict mapping each county in the module-level ``county_list`` to
        whatever ``from_group_dataframe`` returned for that county's rows.
    """
    frame = df_data_in.copy(deep=True)

    static_keys = ["county", "is_business", "product_type"]
    feature_names = [
        "production",
        "consumption",
        "lowest_price_per_mwh",
        "highest_price_per_mwh",
        "eic_count",
        "installed_capacity",
        "euros_per_mwh",
        'temperature',
        'dewpoint',
        'cloudcover_high',
        'cloudcover_low',
        'cloudcover_mid',
        'cloudcover_total',
        '10_metre_u_wind_component',
        '10_metre_v_wind_component',
        'direct_solar_radiation',
        'surface_solar_radiation_downwards',
        'snowfall',
        'total_precipitation',
    ]

    def _series_for(county):
        # All rows of one county, converted group by group.
        return TimeSeries.from_group_dataframe(
            frame.loc[frame['county'] == county],
            group_cols=static_keys,
            time_col="datetime",
            value_cols=feature_names,
            fill_missing_dates=True,
            freq="1h",
        )

    return {county: _series_for(county) for county in county_list}

### Multivariate Time Series Forecasting

In [None]:
from darts import TimeSeries
from darts.models import TiDEModel
from darts.dataprocessing.transformers import Scaler
from darts.utils.missing_values import fill_missing_values

In [None]:
county_TS_dict = to_time_series_group(df_train)

In [None]:
county_TS_dict.keys()

In [None]:
len(county_TS_dict)

In [None]:
# Past covariates: series whose values are only known up to prediction time,
# typically because they have to be observed first.
# Future covariates: series whose values are already known for the whole
# forecast horizon at prediction time, e.g. holidays or weather forecasts.

# Components the model forecasts.
target_col = [
    "production",
    "consumption",
]
past_covariates = [
    "lowest_price_per_mwh",
    "highest_price_per_mwh",
    "eic_count",
    "installed_capacity",
    "euros_per_mwh",
]
future_covariates = [
    "holidays",
    "temperature",
    "dewpoint",
    "cloudcover_high",
    "cloudcover_mid",
    "cloudcover_low",
    "cloudcover_total",
    "total_precipitation",
    "surface_solar_radiation_downwards",
]

In [None]:
# Settings shared by every model trained in this notebook
SEED = 42
MAX_N_EPOCHS = 100  # per the author: Darts default for Vanilla, TiDE and TFT is 100
BATCH_SIZE = 128  # per the author: Darts default is 32
IN_LEN = 24 * 7  # input chunk: number of past time steps fed to the model
OUT_LEN = 24 * 2  # output chunk: number of time steps the internal model predicts at once
NR_EPOCHS_VAL_PERIOD = 1

In [None]:
# Encoder configuration later handed to the model as `add_encoders`
encoders = dict(
    cyclic={'future': ['hour', 'dayofyear']},
    datetime_attribute={'future': ['hour', 'dayofweek']},
    position={'past': ['relative'], 'future': ['relative']},
    transformer=Scaler(),
)

In [None]:
def build_fit_tide_model(
    IN_LEN,
    train_target,
    train_past_covariates,
    train_future_covariates,
    val_target,
    val_past_covariates,
    val_future_covariates,
    num_encoder_layers,
    num_decoder_layers,
    decoder_output_dim,
    temporal_decoder_hidden,
    use_layer_norm,
    use_reversible_instance_norm,
    dropout,
    lr,
    likelihood=None,
    callbacks=None,
    encoders=encoders,
    batch_size=BATCH_SIZE,
    n_epochs=MAX_N_EPOCHS,
):
    """Fit a TiDE model with early stopping and return it reloaded from its checkpoint.

    Args:
        IN_LEN: value for the model's ``input_chunk_length``.
        train_target, train_past_covariates, train_future_covariates:
            training series passed to ``fit``.
        val_target, val_past_covariates, val_future_covariates:
            validation series passed to ``fit``.
        num_encoder_layers, num_decoder_layers, decoder_output_dim,
        temporal_decoder_hidden, use_layer_norm,
        use_reversible_instance_norm, dropout, likelihood:
            forwarded unchanged to ``TiDEModel``.
        lr: learning rate, passed as ``optimizer_kwargs={"lr": lr}``.
        callbacks: optional list of extra trainer callbacks; they run after
            the built-in early-stopping callback.
        encoders: value for the model's ``add_encoders``.
        batch_size: training batch size.
        n_epochs: maximum number of epochs.

    Returns:
        The model loaded back from the ``"tide_model"`` checkpoint (the
        original author's note describes this as the best model over the
        course of training).

    The output chunk length is read from the module-level ``OUT_LEN`` and the
    torch seed from ``SEED``.
    """
    # Reproducibility.
    torch.manual_seed(SEED)

    # Early stopping on the validation loss always comes first; any callbacks
    # supplied by the caller are appended after it.
    stop_on_plateau = EarlyStopping("val_loss", min_delta=0.001, patience=3, verbose=True)
    all_callbacks = [stop_on_plateau] + (callbacks if callbacks is not None else [])

    # CPU settings by default; switch to the first GPU when one is available.
    trainer_kwargs = {"callbacks": all_callbacks}
    loader_workers = 0
    if torch.cuda.is_available():
        trainer_kwargs.update(accelerator="gpu", devices=[0])
        loader_workers = 4

    tide_params = dict(
        input_chunk_length=IN_LEN,
        output_chunk_length=OUT_LEN,
        add_encoders=encoders,
        loss_fn=nn.L1Loss(),
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        decoder_output_dim=decoder_output_dim,
        temporal_decoder_hidden=temporal_decoder_hidden,
        dropout=dropout,
        use_layer_norm=use_layer_norm,
        use_reversible_instance_norm=use_reversible_instance_norm,
        optimizer_kwargs={"lr": lr},
        batch_size=batch_size,
        n_epochs=n_epochs,
        random_state=42,
        model_name="tide_model",
        likelihood=likelihood,
        force_reset=True,
        save_checkpoints=True,
        pl_trainer_kwargs=trainer_kwargs,
    )
    tide = TiDEModel(**tide_params)

    tide.fit(
        series=train_target,
        past_covariates=train_past_covariates,
        future_covariates=train_future_covariates,
        val_series=val_target,
        val_past_covariates=val_past_covariates,
        val_future_covariates=val_future_covariates,
        num_loader_workers=loader_workers,
    )

    # Hand back the checkpointed model rather than the in-memory one.
    return tide.load_from_checkpoint("tide_model")

In [None]:
# Train one TiDE model per county. Each model is fitted jointly on all series
# (one per is_business / product_type group) belonging to that county.
trained_models = dict()

display("Training...")

for county in county_list:

    print('County:', county)

    # Fill missing values, then add the Estonian ("EE") holiday component
    # that `future_covariates` refers to as "holidays".
    county_data = [
        fill_missing_values(series).add_holidays("EE")
        for series in county_TS_dict[county]
    ]

    # First 90% of every series is used for training, the rest for validation.
    splitted = [series.split_before(0.90) for series in county_data]
    train_series = [split[0] for split in splitted]
    val_series = [split[1] for split in splitted]

    train_target = [series[target_col] for series in train_series]
    train_past_covariates = [series[past_covariates] for series in train_series]
    train_future_covariates = [series[future_covariates] for series in train_series]

    val_target = [series[target_col] for series in val_series]
    val_past_covariates = [series[past_covariates] for series in val_series]
    val_future_covariates = [series[future_covariates] for series in val_series]

    # Fresh encoder configuration (including its own Scaler) for this county.
    # It has to be passed explicitly: the default of `build_fit_tide_model` is
    # bound once at definition time, so without the argument every county
    # would use that single shared dict and Scaler instance.
    county_encoders = {
        'cyclic': {'future': ['hour', 'dayofyear']},
        'datetime_attribute': {'future': ['hour', 'dayofweek']},
        'position': {'past': ['relative'], 'future': ['relative']},
        'transformer': Scaler(),
    }

    # Ref https://arxiv.org/pdf/2304.08424.pdf

    # Hyperparameters optimized with Optuna on one segment
    model = build_fit_tide_model(
        IN_LEN,
        train_target,
        train_past_covariates,
        train_future_covariates,
        val_target,
        val_past_covariates,
        val_future_covariates,
        num_encoder_layers=2,
        num_decoder_layers=2,
        decoder_output_dim=32,
        temporal_decoder_hidden=64,
        use_layer_norm=False,
        use_reversible_instance_norm=False,
        dropout=0.0,
        lr=0.00032143292251485413,
        likelihood=None,
        encoders=county_encoders,
        n_epochs=MAX_N_EPOCHS,
    )

    trained_models[county] = model

In [None]:
trained_models

### Test Data Construction

In [None]:
import enefit
# Set the `__called__` entry of make_env's func_dict to False before creating
# the environment.
# NOTE(review): presumably this lets make_env() be invoked again after an
# earlier call in the same session - confirm against the enefit module.
enefit.make_env.func_dict['__called__'] = False
env = enefit.make_env()
# Iterator yielding one batch of test-time frames per step (consumed by the
# prediction loop).
iter_test = env.iter_test()

In [None]:
# Inference loop: for every batch served by the environment, extend the
# history tables, rebuild the features, forecast each county and submit.
for i, (
    test,
    revealed_targets,
    client,
    _,
    forecast_weather,
    electricity_prices,
    gas_prices,
    sample_prediction,
) in enumerate(iter_test):

    print('Iter_test ', i)

    test = test.rename(columns={"prediction_datetime": "datetime"})

    # Convert the new pandas frames to polars using the training schemas so
    # that they can be concatenated onto the history tables.
    df_new_data        = pl.from_pandas(test[data_cols[1:]], schema_overrides=schema_data)
    df_new_client      = pl.from_pandas(client[client_cols], schema_overrides=schema_client)
    df_new_gas         = pl.from_pandas(gas_prices[gas_cols], schema_overrides=schema_gas)
    df_new_electricity = pl.from_pandas(electricity_prices[electricity_cols], schema_overrides=schema_electricity)
    df_new_forecast    = pl.from_pandas(forecast_weather[forecast_cols], schema_overrides=schema_forecast)
    df_new_target      = pl.from_pandas(revealed_targets[target_cols], schema_overrides=schema_target)

    df_data        = pl.concat([df_data, df_new_data])
    df_client      = pl.concat([df_client, df_new_client])
    df_gas         = pl.concat([df_gas, df_new_gas])
    df_electricity = pl.concat([df_electricity, df_new_electricity])
    df_forecast    = pl.concat([df_forecast, df_new_forecast])
    df_target      = pl.concat([df_target, df_new_target])

    df_test = feature_eng(df_data, df_client, df_gas, df_electricity, df_forecast, df_location, df_target)
    df_test = to_pandas(df_test)

    # To Darts time series: a dict keyed by county, each value holding the
    # series of that county's groups.
    county_TS_dict_test = to_time_series_group(df_test)

    print(test["datetime"].unique())

    predictions = dict()
    display("Predicting...")

    for county in county_list:

        # Fill missing values, then add the Estonian ("EE") holiday component.
        county_data = [
            fill_missing_values(series).add_holidays("EE")
            for series in county_TS_dict_test[county]
        ]

        # The last 2*24 steps of the targets are cut off and forecast again;
        # the covariates keep their full length so they span the horizon.
        test_target = [series[target_col][:-2*24] for series in county_data]
        test_past_covariates = [series[past_covariates] for series in county_data]
        test_future_covariates = [series[future_covariates] for series in county_data]

        predictions[county] = trained_models[county].predict(
            2*24, test_target,
            past_covariates=test_past_covariates,
            future_covariates=test_future_covariates,
        )

    for county in county_list:

        print('County:', county)

        for preds_segment in predictions[county]:

            # Static covariates hold the group keys in the order
            # (county, is_business, product_type).
            static_values = preds_segment.static_covariates.to_numpy().astype(int)
            is_business = static_values[0][1]
            prod_type = static_values[0][2]
            print(static_values)

            segment_rows = (
                (test['county'] == county)
                & (test['is_business'] == is_business)
                & (test['product_type'] == prod_type)
            )

            # is_consumption == 1 rows take the "consumption" component,
            # is_consumption == 0 rows take the "production" component.
            for flag, component in ((1, 'consumption'), (0, 'production')):
                mask = (segment_rows & (test['is_consumption'] == flag)).values
                if not mask.any():
                    print('Mask len 0')
                    continue
                # Skip the first 24 forecast steps and flatten the (n, 1)
                # array so it is written as a plain 1-D vector.
                segment_preds = preds_segment[component].pd_dataframe().values[24:].ravel()
                sample_prediction.loc[mask, "target"] = segment_preds

    # Prediction cannot be below 0. Whole-column assignment is used instead of
    # chained indexing, which may write to a temporary copy. NaN entries pass
    # through clip unchanged and are handled next.
    sample_prediction['target'] = sample_prediction['target'].clip(lower=0)

    # Fill NAs with the mean of the available predictions, then fall back to
    # 0 for anything still missing (e.g. when every prediction is NaN).
    sample_prediction['target'] = sample_prediction['target'].fillna(sample_prediction['target'].mean())
    sample_prediction.fillna(0, inplace=True)

    # TODO: Prediction cannot be higher than installed capacity?

    # send predictions
    env.predict(sample_prediction)
    print('Prediction sent.')
    print()