In [2]:
import pandas as pd
import catboost
from sklearn.metrics import mean_squared_error

import utils

utils.configs.setup()


In [3]:
def load_data(seed: int):
    raw = utils.datasets.make_electricity_data(
        "2016-01-01", "2024-01-01", freq="15min", random_state=seed
    )
    return raw


def load_splits(seed: int, features: list[str]):
    raw = load_data(seed)
    display(raw.head(3))

    raw = raw.loc[:, features]
    data = utils.transformations.minute_to_daily(raw)
    display(data.head(3))
    train_end_date = "2022-01-01"
    validation_end_date = "2023-01-01"
    train, validation, test = utils.splits.to_train_validation_test_data(
        data, train_end_date, validation_end_date
    )
    return train, validation, test

def delay(df, delays: int | list[int]):
    if isinstance(df, pd.Series):
        df = df.to_frame()
    dfs = [df]
    if isinstance(delays, int):
        delays = range(1, delays + 1)
    for t in delays:
        delayed_df = df.shift(t)
        delayed_df.columns = [f"{c}_m{t}" for c in delayed_df.columns]
        dfs.append(delayed_df)
    vstacked_df = pd.concat(reversed(dfs), axis=1).dropna()
    return vstacked_df


def get_columns_by_time(df, time: str):
    time = time.replace(":", "_")
    columns = [c for c in df.columns if c.endswith(time)]
    selection = df.loc[:, columns]
    return selection


def evaluate(time: str, delays, train, validation, **kwargs):
    train = get_columns_by_time(train, time)
    validation = get_columns_by_time(validation, time)
    train_tf = delay(train, delays)
    val_tf = delay(validation, delays)
    model = catboost.CatBoostRegressor(**kwargs)
    X, y = train_tf.iloc[:, :-1], train_tf.iloc[:, -1]
    model.fit(X, y)
    y_pred = model.predict(X)
    train_mse = mean_squared_error(y, y_pred)
    val_mse = mean_squared_error(val_tf.iloc[:, -1], model.predict(val_tf.iloc[:, :-1]))

    print()
    print(f"{[c for c in X.columns]} -> {y.name}")
    print("Train MSE:\t", train_mse)
    print("Validation MSE:\t", val_mse)

In [6]:
seed = 42
columns = ["electricity"]
train, validation, test = load_splits(seed, columns)

2024-12-18 17:57:01,588 - INFO - Setting numpy seed to: 42
2024-12-18 17:57:01,786 - INFO - Shape: (280512, 6) | Start: 2016-01-01 00:00:00 | End: 2023-12-31 23:45:00
2024-12-18 17:57:01,787 - INFO - Columns: ['electricity', 'wind_speed', 'wind_speed_no_seasonality', 'daily_seasonality', 'weekly_seasonality', 'yearly_seasonality']


Unnamed: 0_level_0,electricity,wind_speed,wind_speed_no_seasonality,daily_seasonality,weekly_seasonality,yearly_seasonality
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-01 00:00:00,6.48,8.1,8.99,0.0,-0.43,0.02
2016-01-01 00:15:00,4.48,6.49,7.72,0.0,-0.43,0.02
2016-01-01 00:30:00,6.55,8.18,9.3,0.0,-0.43,0.02


2024-12-18 17:57:04,330 - INFO - Frequency change: 15min -> 1d
2024-12-18 17:57:04,331 - INFO - Shape change: (280512, 1) -> (2922, 96)


Unnamed: 0_level_0,electricity_00_00,electricity_00_15,electricity_00_30,electricity_00_45,electricity_01_00,electricity_01_15,electricity_01_30,electricity_01_45,electricity_02_00,electricity_02_15,...,electricity_21_30,electricity_21_45,electricity_22_00,electricity_22_15,electricity_22_30,electricity_22_45,electricity_23_00,electricity_23_15,electricity_23_30,electricity_23_45
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01,6.48,4.48,6.55,12.0,4.92,4.38,11.35,7.93,6.13,9.99,...,4.07,4.51,1.8,5.49,4.46,3.81,2.39,3.44,2.66,0.1
2016-01-02,6.1,3.43,2.4,4.4,3.24,3.78,1.64,1.68,6.56,9.16,...,0.94,2.35,0.9,5.39,4.65,2.45,1.41,1.85,2.29,4.65
2016-01-03,4.68,4.61,4.12,3.79,3.37,6.92,5.25,2.99,7.21,7.51,...,2.79,0.99,2.55,4.59,5.08,2.08,8.99,1.33,2.72,5.55


2024-12-18 17:57:04,361 - INFO - # of training observations: 2192 | 75.02%
2024-12-18 17:57:04,362 - INFO - # of validation observations: 365 | 12.49%
2024-12-18 17:57:04,362 - INFO - # of test observations: 365 | 12.49%


In [8]:
class MultiRmseObjective(catboost.MultiTargetCustomObjective):
    def calc_ders_multi(self, approx, target, weight):
        assert len(target) == len(approx)

        w = weight if weight is not None else 1.0
        der1 = [(target[i] - approx[i]) * w for i in range(len(approx))]
        der2 = [-w for i in range(len(approx))]

        return (der1, der2)


In [11]:
from sklearn.datasets import make_regression
X, y = make_regression(random_state=0, n_targets=3)
display(X.shape)
display(y.shape)


(100, 100)

(100, 3)

In [9]:
time = "00:00"
delays = 1
train = get_columns_by_time(train, time)
validation = get_columns_by_time(validation, time)
train_tf = delay(train, delays)
val_tf = delay(validation, delays)
model = catboost.CatBoostRegressor(iterations=10, loss_function=MultiRmseObjective())
X, y = train_tf.iloc[:, :-1], train_tf.iloc[:, -1]
model.fit(X, y)
y_pred = model.predict(X)
train_mse = mean_squared_error(y, y_pred)
val_mse = mean_squared_error(val_tf.iloc[:, -1], model.predict(val_tf.iloc[:, :-1]))

print()
print(f"{[c for c in X.columns]} -> {y.name}")
print("Train MSE:\t", train_mse)
print("Validation MSE:\t", val_mse)


  _check_train_params(params)


CatBoostError: catboost/libs/metrics/metric.cpp:6723: If loss function is a user defined object, then the eval metric must be specified.