In [1]:
import warnings

warnings.filterwarnings("ignore")

import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px

from sklearn.ensemble import VotingRegressor
import lightgbm as lgb


# Classes

### DataStorage

In [2]:
class DataStorage:
    """Holds the competition tables as polars frames and merges in new batches.

    The class-level ``*_cols`` lists name the columns read from each CSV file
    under ``root``; the same lists are reused to select columns from the pandas
    frames received later through ``update_with_new_data``/``preprocess_test``.
    """

    root = "/kaggle/input/predict-energy-behavior-of-prosumers"

    # Columns read from train.csv.
    data_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
        "row_id",
    ]
    # Columns read from client.csv.
    client_cols = [
        "product_type",
        "county",
        "eic_count",
        "installed_capacity",
        "is_business",
        "date",
    ]
    # Columns read from gas_prices.csv.
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    # Columns read from electricity_prices.csv.
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    # Columns read from forecast_weather.csv.
    forecast_weather_cols = [
        "latitude",
        "longitude",
        "hours_ahead",
        "temperature",
        "dewpoint",
        "cloudcover_high",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
    # Columns read from historical_weather.csv.
    historical_weather_cols = [
        "datetime",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
        "latitude",
        "longitude",
    ]
    # Columns read from weather_station_to_county_mapping.csv.
    location_cols = ["longitude", "latitude", "county"]
    # Subset of the training columns kept as the target history.
    target_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
    ]

    def __init__(self):
        """Read every CSV under ``root`` and record the schema of each table."""

        def read_table(file_name, columns):
            # Every file is read with the same settings; only name/columns vary.
            return pl.read_csv(
                os.path.join(self.root, file_name),
                columns=columns,
                try_parse_dates=True,
            )

        self.df_data = read_table("train.csv", self.data_cols)
        self.df_client = read_table("client.csv", self.client_cols)
        self.df_gas_prices = read_table("gas_prices.csv", self.gas_prices_cols)
        self.df_electricity_prices = read_table(
            "electricity_prices.csv", self.electricity_prices_cols
        )
        self.df_forecast_weather = read_table(
            "forecast_weather.csv", self.forecast_weather_cols
        )
        self.df_historical_weather = read_table(
            "historical_weather.csv", self.historical_weather_cols
        )
        station_mapping = read_table(
            "weather_station_to_county_mapping.csv", self.location_cols
        )

        # Only rows from 2022-01-01 onwards are kept for training.
        first_kept_timestamp = pd.to_datetime("2022-01-01")
        self.df_data = self.df_data.filter(pl.col("datetime") >= first_kept_timestamp)
        self.df_target = self.df_data.select(self.target_cols)

        # Schemas are reused as overrides when converting incoming pandas frames.
        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        # Coordinates are stored as Float32 in the station-to-county mapping.
        self.df_weather_station_to_county_mapping = station_mapping.with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
        )

    def update_with_new_data(
        self,
        df_new_client,
        df_new_gas_prices,
        df_new_electricity_prices,
        df_new_forecast_weather,
        df_new_historical_weather,
        df_new_target,
    ):
        """Append a batch of pandas frames to the stored tables.

        Each incoming frame is reduced to the configured columns, converted to
        polars with the stored schema as override, concatenated to the matching
        table and de-duplicated on that table's key columns. Which duplicate
        survives is whatever ``DataFrame.unique`` does by default.
        """

        def as_polars(frame, columns, schema):
            return pl.from_pandas(frame[columns], schema_overrides=schema)

        def merged(current, incoming, key_columns):
            return pl.concat([current, incoming]).unique(key_columns)

        # All conversions happen before any stored table is replaced.
        new_client = as_polars(df_new_client, self.client_cols, self.schema_client)
        new_gas_prices = as_polars(
            df_new_gas_prices, self.gas_prices_cols, self.schema_gas_prices
        )
        new_electricity_prices = as_polars(
            df_new_electricity_prices,
            self.electricity_prices_cols,
            self.schema_electricity_prices,
        )
        new_forecast_weather = as_polars(
            df_new_forecast_weather,
            self.forecast_weather_cols,
            self.schema_forecast_weather,
        )
        new_historical_weather = as_polars(
            df_new_historical_weather,
            self.historical_weather_cols,
            self.schema_historical_weather,
        )
        new_target = as_polars(df_new_target, self.target_cols, self.schema_target)

        self.df_client = merged(
            self.df_client,
            new_client,
            ["date", "county", "is_business", "product_type"],
        )
        self.df_gas_prices = merged(
            self.df_gas_prices, new_gas_prices, ["forecast_date"]
        )
        self.df_electricity_prices = merged(
            self.df_electricity_prices, new_electricity_prices, ["forecast_date"]
        )
        self.df_forecast_weather = merged(
            self.df_forecast_weather,
            new_forecast_weather,
            ["forecast_datetime", "latitude", "longitude", "hours_ahead"],
        )
        self.df_historical_weather = merged(
            self.df_historical_weather,
            new_historical_weather,
            ["datetime", "latitude", "longitude"],
        )
        self.df_target = merged(
            self.df_target,
            new_target,
            ["datetime", "county", "is_business", "product_type", "is_consumption"],
        )

    def preprocess_test(self, df_test):
        """Convert a pandas test batch into a polars frame shaped like the train data.

        ``prediction_datetime`` is renamed to ``datetime`` and every training
        column except the first one (``target``) is selected.
        """
        renamed = df_test.rename(columns={"prediction_datetime": "datetime"})
        columns_without_target = self.data_cols[1:]
        return pl.from_pandas(
            renamed[columns_without_target], schema_overrides=self.schema_data
        )


### FeaturesGenerator

In [3]:
class FeaturesGenerator:
    """Builds the model feature table from the frames held by ``data_storage``.

    ``generate_features`` is the entry point; the ``_add_*`` helpers each take
    the polars feature frame and return it with extra columns joined on.
    """

    def __init__(self, data_storage):
        """Keep a reference to the storage object whose ``df_*`` tables are read."""
        self.data_storage = data_storage

    @staticmethod
    def _forecast_horizon_filter(min_hours_ahead=22, max_hours_ahead=45):
        """Return the predicate keeping forecasts in the inclusive horizon window.

        Each comparison is parenthesised on purpose: Python's ``&`` binds
        tighter than ``<=``, so ``(a >= lo) & a <= hi`` is evaluated as
        ``((a >= lo) & a) <= hi`` and does not express the intended range test.
        """
        return (pl.col("hours_ahead") >= min_hours_ahead) & (
            pl.col("hours_ahead") <= max_hours_ahead
        )

    def _add_general_features(self, df_features):
        """Add calendar parts, the ``segment`` key and sin/cos encodings."""
        df_features = (
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.year().alias("year"),
            )
            .with_columns(
                # One string key per (county, is_business, product_type, is_consumption).
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    separator="_",
                ).alias("segment"),
            )
            .with_columns(
                # Cyclical encodings of day-of-year and hour-of-day.
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )
        return df_features

    def _add_client_features(self, df_features):
        """Left-join client columns, with the client ``date`` shifted by two days."""
        df_client = self.data_storage.df_client

        df_features = df_features.join(
            df_client.with_columns(
                # A client row dated D is matched to feature rows dated D + 2 days.
                (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
            ),
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )
        return df_features

    def _add_forecast_weather_features(self, df_features):
        """Join forecast-weather means (overall and per county) at 0h and 168h lags.

        Only forecasts whose ``hours_ahead`` lies in the window returned by
        ``_forecast_horizon_filter`` are used.

        NOTE(review): before the precedence fix the filter expression did not
        encode that window, so models fitted on earlier feature tables were
        trained on differently filtered forecasts -- refit them if they are
        still in use.
        """
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            .filter(self._forecast_horizon_filter())
            .drop("hours_ahead")
            .with_columns(
                # Match the Float32 coordinates of the station mapping for the join.
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        # Mean over all grid points per timestamp.
        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        # Mean per county per timestamp, using only points mapped to a county.
        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        return df_features

    def _add_historical_weather_features(self, df_features):
        """Join historical-weather means at 48h/168h lags, plus a filtered 24h lag."""
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        # Mean over all stations per timestamp.
        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        # Mean per county per timestamp, using only stations mapped to a county.
        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                # For the 24h lag only observations with hour <= 10 are joined.
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        return df_features

    def _add_target_features(self, df_features):
        """Join lagged targets, lagged aggregate sums, and derived stats/ratios."""
        df_target = self.data_storage.df_target

        # Target summed over product types within (datetime, county, business, consumption).
        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )

        # Target summed over counties and product types.
        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        # Per-segment target lagged by 2 to 14 whole days.
        for hours_lag in [days * 24 for days in range(2, 15)]:
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_{hours_lag}h"}),
                on=[
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    "datetime",
                ],
                how="left",
            )

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            df_features = df_features.join(
                df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )

            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        # Row-wise mean and std over the 2- to 5-day lags.
        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24]
        ]
        # NOTE(review): `DataFrame.mean(axis=1)` is reported as deprecated in
        # newer polars releases in favour of `mean_horizontal` -- confirm the
        # installed version before changing this call.
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias("target_mean"),
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias("target_std"),
        )

        for target_prefix, lag_numerator, lag_denominator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_numerator}h")
                    # The small constant keeps the division defined for a zero lag value.
                    / (pl.col(f"{target_prefix}_{lag_denominator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_numerator}_{lag_denominator}")
            )

        return df_features

    def _reduce_memory_usage(self, df_features):
        """Cast every Float64 column to Float32."""
        df_features = df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))
        return df_features

    def _drop_columns(self, df_features):
        """Drop the helper columns ``date``, ``datetime``, ``hour`` and ``dayofyear``."""
        df_features = df_features.drop(
            "date", "datetime", "hour", "dayofyear"
        )
        return df_features

    def _to_pandas(self, df_features, y):
        """Convert to pandas, append ``y`` if given, index by ``row_id``.

        The identifier columns and ``segment`` are cast to ``category``.
        """
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        df_features = df_features.set_index("row_id")
        df_features[cat_cols] = df_features[cat_cols].astype("category")

        return df_features

    def generate_features(self, df_prediction_items):
        """Run the full feature pipeline and return a pandas frame.

        If ``df_prediction_items`` has a ``target`` column it is split off
        before feature generation and re-attached as the last step.
        """
        if "target" in df_prediction_items.columns:
            df_prediction_items, y = (
                df_prediction_items.drop("target"),
                df_prediction_items.select("target"),
            )
        else:
            y = None

        # `date` is the join key for the client table.
        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        for add_features in [
            self._add_general_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._reduce_memory_usage,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        df_features = self._to_pandas(df_features, y)

        return df_features


### Model

# Initialisation

In [4]:
# Load the competition tables once, then bind the feature builder to that store.
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage)

# Feature Generation

In [5]:
# Build the training feature table, then keep only the rows with a known target.
df_train_features = features_generator.generate_features(data_storage.df_data)
df_train_features = df_train_features.loc[
    lambda frame: frame["target"].notna()
]

In [6]:
import holidays
import datetime

# Estonian holiday dates for the years 2021-2025, materialised as a plain list.
estonian_holidays = list(
    holidays.country_holidays("EE", years=range(2021, 2026)).keys()
)

def add_holidays_as_binary_features(df):
    """Add a 0/1 ``country_holiday`` column marking rows that fall on a holiday.

    The calendar date of each row is assembled from its ``year``, ``month`` and
    ``day`` columns and looked up in the module-level ``estonian_holidays``
    collection of ``datetime.date`` objects.

    The lookup is vectorised (no per-row ``apply``), which matters because the
    training frame has well over a million rows.

    Args:
        df: pandas DataFrame with integer ``year``, ``month`` and ``day`` columns.

    Returns:
        The same DataFrame object, with ``country_holiday`` (int64) added in place.
    """
    row_dates = pd.to_datetime(df[["year", "month", "day"]]).dt.date
    df["country_holiday"] = row_dates.isin(list(estonian_holidays)).astype("int64")

    return df

# Add the `country_holiday` flag to the training features (the helper writes the
# column onto the frame it is given and returns that same frame).
df_train_features = add_holidays_as_binary_features(df_train_features)

In [7]:
# Preview the first rows of the training feature table.
df_train_features.head()

Unnamed: 0_level_0,county,is_business,product_type,is_consumption,day,weekday,month,year,segment,sin(dayofyear),...,target_ratio_168_336,target_ratio_48_216,target_ratio_72_240,target_ratio_48_72,target_all_type_sum_ratio_48_72,target_all_type_sum_ratio_168_336,target_all_county_type_sum_ratio_48_72,target_all_county_type_sum_ratio_168_336,target,country_holiday
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
366048,0,0,1,0,1,6,1,2022,0_0_1_0,0.017166,...,,,,,,,,,0.0,1
366049,0,0,1,1,1,6,1,2022,0_0_1_1,0.017166,...,,,,,,,,,442.226,1
366050,0,0,2,0,1,6,1,2022,0_0_2_0,0.017166,...,,,,,,,,,0.0,1
366051,0,0,2,1,1,6,1,2022,0_0_2_1,0.017166,...,,,,,,,,,44.899,1
366052,0,0,3,0,1,6,1,2022,0_0_3_0,0.017166,...,,,,,,,,,0.015,1


In [8]:
# Print every feature column name as one plain list.
print(df_train_features.columns.to_list())

['county', 'is_business', 'product_type', 'is_consumption', 'day', 'weekday', 'month', 'year', 'segment', 'sin(dayofyear)', 'cos(dayofyear)', 'sin(hour)', 'cos(hour)', 'eic_count', 'installed_capacity', 'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'direct_solar_radiation', 'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation', 'temperature_forecast_local_0h', 'dewpoint_forecast_local_0h', 'cloudcover_high_forecast_local_0h', 'cloudcover_low_forecast_local_0h', 'cloudcover_mid_forecast_local_0h', 'cloudcover_total_forecast_local_0h', '10_metre_u_wind_component_forecast_local_0h', '10_metre_v_wind_component_forecast_local_0h', 'direct_solar_radiation_forecast_local_0h', 'surface_solar_radiation_downwards_forecast_local_0h', 'snowfall_forecast_local_0h', 'total_precipitation_forecast_local_0h', 'temperature_forecast_168h', 'dewpoint_forecast_168h', 'cloudcove

In [9]:
# Count missing values per column (lagged features are null where no history exists).
df_train_features.isnull().sum()

county                                          0
is_business                                     0
product_type                                    0
is_consumption                                  0
day                                             0
                                            ...  
target_all_type_sum_ratio_168_336           43680
target_all_county_type_sum_ratio_48_72       9360
target_all_county_type_sum_ratio_168_336    43680
target                                          0
country_holiday                                 0
Length: 166, dtype: int64

In [10]:
# Summarise row count, dtypes and memory footprint of the training features.
df_train_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1651902 entries, 366048 to 2018351
Columns: 166 entries, county to country_holiday
dtypes: category(5), float32(154), float64(2), int32(1), int64(1), uint32(3)
memory usage: 1.0 GB


## Model Training

In [11]:
# Ten LightGBM hyper-parameter sets, one per ensemble member. They are referenced
# by index (parameter_list[0] ... parameter_list[9]) in the training cell below.
parameter_list = [
    {
        "num_iterations": 500,
        "verbose": -1,
        "colsample_bynode": 0.7103688779357847,
        "colsample_bytree": 0.6759995481832958,
        "learning_rate": 0.08107710302284142,
        "max_depth": 14,
        "min_child_samples": 175,
        "num_leaves": 169,
        "objective": "tweedie",
        "path_smooth": 0.03638766162304646,
        "reg_alpha": 4.241310552615537,
        "reg_lambda": 1.91388003514709,
    },
    {
        "num_iterations": 500,
        "verbose": -1,
        "colsample_bynode": 0.5762487113184288,
        "colsample_bytree": 0.5811355426944578,
        "learning_rate": 0.08597630012152205,
        "max_depth": 26,
        "min_child_samples": 194,
        "num_leaves": 361,
        "objective": "regression",
        "path_smooth": 0.03465551777589707,
        "reg_alpha": 6.996661462358405,
        "reg_lambda": 8.470525016492395,
    },
    {
        "num_iterations": 500,
        "verbose": -1,
        "colsample_bynode": 0.43517350965256685,
        "colsample_bytree": 0.7831532979997649,
        "learning_rate": 0.07025903099500738,
        "max_depth": 23,
        "min_child_samples": 211,
        "num_leaves": 321,
        "objective": "regression",
        "path_smooth": 0.08559566561454247,
        "reg_alpha": 2.6435598455478146,
        "reg_lambda": 4.51993047978816,
    },
    {
        "num_iterations": 500,
        "verbose": -1,
        "colsample_bynode": 0.6069746474662208,
        "colsample_bytree": 0.47723286276461496,
        "learning_rate": 0.06380390287538754,
        "max_depth": 10,
        "min_child_samples": 245,
        "num_leaves": 192,
        "objective": "regression",
        "path_smooth": 0.03607590859625657,
        "reg_alpha": 3.3788113950113328,
        "reg_lambda": 5.702284030317997,
    },
    {
        "num_iterations": 500,
        "verbose": -1,
        "colsample_bynode": 0.360239888533631,
        "colsample_bytree": 0.8355751080647349,
        "learning_rate": 0.07606039804311976,
        "max_depth": 16,
        "min_child_samples": 121,
        "num_leaves": 478,
        "objective": "regression",
        "path_smooth": 0.0997680881977257,
        "reg_alpha": 3.734520254197353,
        "reg_lambda": 9.635057034367291,
    },
    {
        "num_iterations": 500,
        "verbose": -1,
        "colsample_bynode": 0.4174740961644613,
        "colsample_bytree": 0.7508888306232886,
        "learning_rate": 0.07708410483102476,
        "max_depth": 25,
        "min_child_samples": 216,
        "num_leaves": 380,
        "objective": "regression",
        "path_smooth": 0.06390932048435091,
        "reg_alpha": 3.326634605594462,
        "reg_lambda": 4.437207873684814,
    },
    {
        "num_iterations": 500,
        "verbose": -1,
        "colsample_bynode": 0.7381382697935609,
        "colsample_bytree": 0.523576726896179,
        "learning_rate": 0.06981333132164486,
        "max_depth": 11,
        "min_child_samples": 126,
        "num_leaves": 214,
        "objective": "tweedie",
        "path_smooth": 0.07218072257652315,
        "reg_alpha": 9.975905654063325,
        "reg_lambda": 2.551064575107957,
    },
    {
        "num_iterations": 500,
        "verbose": -1,
        "colsample_bynode": 0.9235361356621526,
        "colsample_bytree": 0.43075258164239827,
        "learning_rate": 0.06755536733597006,
        "max_depth": 28,
        "min_child_samples": 243,
        "num_leaves": 213,
        "objective": "regression",
        "path_smooth": 0.020429341100388392,
        "reg_alpha": 2.553972315959381,
        "reg_lambda": 2.4041022554705522,
    },
    {
        "num_iterations": 500,
        "verbose": -1,
        "colsample_bynode": 0.44166912467574293,
        "colsample_bytree": 0.6606565063118934,
        "learning_rate": 0.0897868781748779,
        "max_depth": 18,
        "min_child_samples": 240,
        "num_leaves": 42,
        "objective": "tweedie",
        "path_smooth": 0.01116875211972208,
        "reg_alpha": 2.4423874109911345,
        "reg_lambda": 4.128799810939097,
    },
    {
        "num_iterations": 500,
        "verbose": -1,
        "colsample_bynode": 0.9525732017313245,
        "colsample_bytree": 0.6382630603007415,
        "learning_rate": 0.08270391387009293,
        "max_depth": 12,
        "min_child_samples": 243,
        "num_leaves": 292,
        "objective": "tweedie",
        "path_smooth": 0.08723701882589538,
        "reg_alpha": 6.1157717164194905,
        "reg_lambda": 6.675900725606523,
    },
]

In [12]:
# model_consumption = VotingRegressor([
#         ('lgb_0', lgb.LGBMRegressor(**parameter_list[0], random_state=42)),
#         ('lgb_1', lgb.LGBMRegressor(**parameter_list[1], random_state=42)),
#         ('lgb_2', lgb.LGBMRegressor(**parameter_list[2], random_state=42)), 
#         ('lgb_3', lgb.LGBMRegressor(**parameter_list[3], random_state=42)), 
#         ('lgb_4', lgb.LGBMRegressor(**parameter_list[4], random_state=42)), 
#         ('lgb_5', lgb.LGBMRegressor(**parameter_list[5], random_state=42)), 
#         ('lgb_6', lgb.LGBMRegressor(**parameter_list[6], random_state=42)), 
#         ('lgb_7', lgb.LGBMRegressor(**parameter_list[7], random_state=42)),
#         ('lgb_8', lgb.LGBMRegressor(**parameter_list[8], random_state=42)),
#         ('lgb_9', lgb.LGBMRegressor(**parameter_list[9], random_state=42)),
# ],weights=[0.14,0.13,0.08,0.11,0.09,0.1,0.09,0.07,0.12,0.07])
    
# model_production = VotingRegressor([
#         ('lgb_10', lgb.LGBMRegressor(**parameter_list[0], random_state=42)),
#         ('lgb_11', lgb.LGBMRegressor(**parameter_list[1], random_state=42)),
#         ('lgb_12', lgb.LGBMRegressor(**parameter_list[2], random_state=42)), 
#         ('lgb_13', lgb.LGBMRegressor(**parameter_list[3], random_state=42)), 
#         ('lgb_14', lgb.LGBMRegressor(**parameter_list[4], random_state=42)), 
#         ('lgb_15', lgb.LGBMRegressor(**parameter_list[5], random_state=42)), 
#         ('lgb_16', lgb.LGBMRegressor(**parameter_list[6], random_state=42)), 
#         ('lgb_17', lgb.LGBMRegressor(**parameter_list[7], random_state=42)),
#         ('lgb_18', lgb.LGBMRegressor(**parameter_list[8], random_state=42)),
#         ('lgb_19', lgb.LGBMRegressor(**parameter_list[9], random_state=42)),
# ],weights=[0.14,0.13,0.08,0.11,0.09,0.1,0.09,0.07,0.12,0.07])

# mask = df_train_features['is_consumption'] == 1
# model_consumption.fit(
#     X=df_train_features[mask].drop(columns=["target"]),
#     y=df_train_features[mask]["target"]
# )

# mask = df_train_features['is_consumption'] == 0
# model_production.fit(
#     X=df_train_features[mask].drop(columns=["target"]),
#     y=df_train_features[mask]["target"]
# )

In [13]:
# import joblib

# joblib.dump(model_consumption, 'model_consumption.pkl')
# joblib.dump(model_production, 'model_production.pkl')

In [14]:
import joblib

# Load the two pre-fitted ensembles (one per flow direction) from the attached
# Kaggle dataset instead of refitting them in this notebook.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artefacts from a source you trust.
MODEL_DIR = '/kaggle/input/enefit-trained-model'

model_consumption = joblib.load(
    f'{MODEL_DIR}/voting_regressor_consumption_model.joblib'
)
model_production = joblib.load(
    f'{MODEL_DIR}/voting_regressor_production_model.joblib'
)

# Submit API

In [15]:
import enefit

# Create the competition environment and the iterator that yields one batch of
# test frames per step (consumed by the prediction loop below).
env = enefit.make_env()
iter_test = env.iter_test()

In [16]:
# Inference loop: for every batch served by the competition API, fold the newly
# revealed data into the store, build features for the rows to predict, and
# submit predictions from the consumption / production model as appropriate.
for (df_test, df_new_target, df_new_client, df_new_historical_weather,
     df_new_forecast_weather, df_new_electricity_prices, df_new_gas_prices,
     df_sample_prediction) in iter_test:

    # Lagged features for this batch depend on the data revealed with it.
    data_storage.update_with_new_data(
        df_new_client=df_new_client,
        df_new_gas_prices=df_new_gas_prices,
        df_new_electricity_prices=df_new_electricity_prices,
        df_new_forecast_weather=df_new_forecast_weather,
        df_new_historical_weather=df_new_historical_weather,
        df_new_target=df_new_target
    )

    df_test = data_storage.preprocess_test(df_test)
    df_test_features = features_generator.generate_features(df_test)
    df_test_features = add_holidays_as_binary_features(df_test_features)

    # Put both frames on the same 0..n-1 index so labels can be shared below.
    # NOTE(review): this relies on the feature rows being in the same order as
    # df_sample_prediction -- confirm against the API contract.
    df_test_features = df_test_features.reset_index(drop=True)
    df_sample_prediction = df_sample_prediction.reset_index(drop=True)

    # Start from a float column: the models return floats, and assigning them
    # into an integer column relies on an implicit upcast that newer pandas
    # versions deprecate (the warning is hidden by the global warnings filter).
    df_sample_prediction["target"] = 0.0

    # Split the batch by flow direction.
    consumption_mask = df_test_features["is_consumption"] == 1
    production_mask = ~consumption_mask

    consumption_idx = df_test_features[consumption_mask].index
    production_idx = df_test_features[production_mask].index

    # Each model is only called when the batch contains rows for it.
    if len(consumption_idx) > 0:
        preds_c = model_consumption.predict(df_test_features.loc[consumption_idx])
        df_sample_prediction.loc[consumption_idx, "target"] = preds_c

    if len(production_idx) > 0:
        preds_p = model_production.predict(df_test_features.loc[production_idx])
        df_sample_prediction.loc[production_idx, "target"] = preds_p

    env.predict(df_sample_prediction)


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
