In [14]:
import pandas as pd
import numpy as np
from pycaret.regression import setup, compare_models, tune_model, evaluate_model
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingRegressor, VotingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings

warnings.filterwarnings("ignore")

import os
import gc
import pickle

import polars as pl
import plotly.express as px

In [None]:
class DataStorage:
    root = "./data"

    data_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
        "row_id",
    ]
    client_cols = [
        "product_type",
        "county",
        "eic_count",
        "installed_capacity",
        "is_business",
        "date",
    ]
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    forecast_weather_cols = [
        "latitude",
        "longitude",
        "hours_ahead",
        "temperature",
        "dewpoint",
        "cloudcover_high",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
    historical_weather_cols = [
        "datetime",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
        "latitude",
        "longitude",
    ]
    location_cols = ["longitude", "latitude", "county"]
    target_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
    ]

    def __init__(self):
        self.df_data = pl.read_csv(
            os.path.join(self.root, "train.csv"),
            columns=self.data_cols,
            try_parse_dates=True,
        )
        self.df_client = pl.read_csv(
            os.path.join(self.root, "client.csv"),
            columns=self.client_cols,
            try_parse_dates=True,
        )
        self.df_gas_prices = pl.read_csv(
            os.path.join(self.root, "gas_prices.csv"),
            columns=self.gas_prices_cols,
            try_parse_dates=True,
        )
        self.df_electricity_prices = pl.read_csv(
            os.path.join(self.root, "electricity_prices.csv"),
            columns=self.electricity_prices_cols,
            try_parse_dates=True,
        )
        self.df_forecast_weather = pl.read_csv(
            os.path.join(self.root, "forecast_weather.csv"),
            columns=self.forecast_weather_cols,
            try_parse_dates=True,
        )
        self.df_historical_weather = pl.read_csv(
            os.path.join(self.root, "historical_weather.csv"),
            columns=self.historical_weather_cols,
            try_parse_dates=True,
        )
        self.df_weather_station_to_county_mapping = pl.read_csv(
            os.path.join(self.root, "weather_station_to_county_mapping.csv"),
            columns=self.location_cols,
            try_parse_dates=True,
        )
        self.df_data = self.df_data.filter(
            pl.col("datetime") >= pd.to_datetime("2022-01-01")
        )
        self.df_target = self.df_data.select(self.target_cols)

        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        self.df_weather_station_to_county_mapping = (
            self.df_weather_station_to_county_mapping.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
        )

    def update_with_new_data(
        self,
        df_new_client,
        df_new_gas_prices,
        df_new_electricity_prices,
        df_new_forecast_weather,
        df_new_historical_weather,
        df_new_target,
    ):
        df_new_client = pl.from_pandas(
            df_new_client[self.client_cols], schema_overrides=self.schema_client
        )
        df_new_gas_prices = pl.from_pandas(
            df_new_gas_prices[self.gas_prices_cols],
            schema_overrides=self.schema_gas_prices,
        )
        df_new_electricity_prices = pl.from_pandas(
            df_new_electricity_prices[self.electricity_prices_cols],
            schema_overrides=self.schema_electricity_prices,
        )
        df_new_forecast_weather = pl.from_pandas(
            df_new_forecast_weather[self.forecast_weather_cols],
            schema_overrides=self.schema_forecast_weather,
        )
        df_new_historical_weather = pl.from_pandas(
            df_new_historical_weather[self.historical_weather_cols],
            schema_overrides=self.schema_historical_weather,
        )
        df_new_target = pl.from_pandas(
            df_new_target[self.target_cols], schema_overrides=self.schema_target
        )

        self.df_client = pl.concat([self.df_client, df_new_client]).unique(
            ["date", "county", "is_business", "product_type"]
        )
        self.df_gas_prices = pl.concat([self.df_gas_prices, df_new_gas_prices]).unique(
            ["forecast_date"]
        )
        self.df_electricity_prices = pl.concat(
            [self.df_electricity_prices, df_new_electricity_prices]
        ).unique(["forecast_date"])
        self.df_forecast_weather = pl.concat(
            [self.df_forecast_weather, df_new_forecast_weather]
        ).unique(["forecast_datetime", "latitude", "longitude", "hours_ahead"])
        self.df_historical_weather = pl.concat(
            [self.df_historical_weather, df_new_historical_weather]
        ).unique(["datetime", "latitude", "longitude"])
        self.df_target = pl.concat([self.df_target, df_new_target]).unique(
            ["datetime", "county", "is_business", "product_type", "is_consumption"]
        )

    def preprocess_test(self, df_test):
        df_test = df_test.rename(columns={"prediction_datetime": "datetime"})
        df_test = pl.from_pandas(
            df_test[self.data_cols[1:]], schema_overrides=self.schema_data
        )
        return df_test

In [None]:
class FeaturesGenerator:
    def __init__(self, data_storage):
        self.data_storage = data_storage

    def _add_general_features(self, df_features):
        df_features = (
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.year().alias("year"),
            )
            .with_columns(
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    separator="_",
                ).alias("segment"),
            )
            .with_columns(
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )
        return df_features

    def _add_client_features(self, df_features):
        df_client = self.data_storage.df_client

        df_features = df_features.join(
            df_client.with_columns(
                (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
            ),
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )
        return df_features

    def _add_forecast_weather_features(self, df_features):
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            .filter((pl.col("hours_ahead") >= 22) & pl.col("hours_ahead") <= 45)
            .drop("hours_ahead")
            .with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        return df_features

    def _add_historical_weather_features(self, df_features):
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        return df_features

    def _add_target_features(self, df_features):
        df_target = self.data_storage.df_target

        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )

        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        for hours_lag in [
            2 * 24,
            3 * 24,
            4 * 24,
            5 * 24,
            6 * 24,
            7 * 24,
            8 * 24,
            9 * 24,
            10 * 24,
            11 * 24,
            12 * 24,
            13 * 24,
            14 * 24,
        ]:
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_{hours_lag}h"}),
                on=[
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    "datetime",
                ],
                how="left",
            )

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            df_features = df_features.join(
                df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )

            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24]
        ]
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias(f"target_mean"),
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias(f"target_std"),
        )

        for target_prefix, lag_nominator, lag_denomonator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_nominator}h")
                    / (pl.col(f"{target_prefix}_{lag_denomonator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_nominator}_{lag_denomonator}")
            )

        return df_features

    def _reduce_memory_usage(self, df_features):
        df_features = df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))
        return df_features

    def _drop_columns(self, df_features):
        df_features = df_features.drop(
            "date", "datetime", "hour", "dayofyear"
        )
        return df_features

    def _to_pandas(self, df_features, y):
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        df_features = df_features.set_index("row_id")
        df_features[cat_cols] = df_features[cat_cols].astype("category")

        return df_features

    def generate_features(self, df_prediction_items):
        if "target" in df_prediction_items.columns:
            df_prediction_items, y = (
                df_prediction_items.drop("target"),
                df_prediction_items.select("target"),
            )
        else:
            y = None

        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        for add_features in [
            self._add_general_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._reduce_memory_usage,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        df_features = self._to_pandas(df_features, y)

        return df_features

In [None]:
class Model:
    def __init__(self):
        self.model_parameters = {
            "n_estimators": 2500,
            "learning_rate": 0.06,
            "colsample_bytree": 0.9,
            "colsample_bynode": 0.6,
            "lambda_l1": 3.5,
            "lambda_l2": 1.5,
            "max_depth": 15,
            "num_leaves": 500,
            "min_data_in_leaf": 50,
            "objective": "regression_l1",
            "device": "gpu"
        }

        self.model_consumption = VotingRegressor(
            [
                (
                    f"consumption_lgb_{i}",
                    lgb.LGBMRegressor(**self.model_parameters, random_state=i),
                )
                for i in range(10)
            ]
        )
        self.model_production = VotingRegressor(
            [
                (
                    f"production_lgb_{i}",
                    lgb.LGBMRegressor(**self.model_parameters, random_state=i),
                )
                for i in range(10)
            ]
        )

    def fit(self, df_train_features):
        mask = df_train_features["is_consumption"] == 1
        self.model_consumption.fit(
            X=df_train_features[mask].drop(columns=["target"]),
            y=df_train_features[mask]["target"]
        )

        mask = df_train_features["is_consumption"] == 0
        self.model_production.fit(
            X=df_train_features[mask].drop(columns=["target"]),
            y=df_train_features[mask]["target"]
        )

    def predict(self, df_features):
        predictions = np.zeros(len(df_features))

        mask = df_features["is_consumption"] == 1
        predictions[mask.values] = self.model_consumption.predict(
            df_features[mask]
        ).clip(0)

        mask = df_features["is_consumption"] == 0
        predictions[mask.values] = self.model_production.predict(
            df_features[mask]
        ).clip(0)

        return predictions

In [None]:
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

In [None]:
df_train_features = features_generator.generate_features(data_storage.df_data)
df_train_features = df_train_features[df_train_features['target'].notnull()]

In [None]:
import holidays
import datetime

estonian_holidays = holidays.country_holidays('EE', years=range(2021, 2026))
estonian_holidays = list(estonian_holidays.keys())

def add_holidays_as_binary_features(df):
    df['country_holiday'] = df.apply(lambda row: (datetime.date(row['year'], row['month'], row['day']) in estonian_holidays) * 1, axis=1)
    
    return df

df_train_features = add_holidays_as_binary_features(df_train_features)

In [None]:
df_train_features.head()

Unnamed: 0_level_0,county,is_business,product_type,is_consumption,day,weekday,month,year,segment,sin(dayofyear),...,target_ratio_168_336,target_ratio_48_216,target_ratio_72_240,target_ratio_48_72,target_all_type_sum_ratio_48_72,target_all_type_sum_ratio_168_336,target_all_county_type_sum_ratio_48_72,target_all_county_type_sum_ratio_168_336,target,country_holiday
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
366048,0,0,1,0,1,6,1,2022,0_0_1_0,0.017166,...,,,,,,,,,0.0,1
366049,0,0,1,1,1,6,1,2022,0_0_1_1,0.017166,...,,,,,,,,,442.226,1
366050,0,0,2,0,1,6,1,2022,0_0_2_0,0.017166,...,,,,,,,,,0.0,1
366051,0,0,2,1,1,6,1,2022,0_0_2_1,0.017166,...,,,,,,,,,44.899,1
366052,0,0,3,0,1,6,1,2022,0_0_3_0,0.017166,...,,,,,,,,,0.015,1


In [None]:
df_train_features.to_csv("./data/df_train_features.csv", index = False)

In [None]:
df_train_features.shape

(1651902, 166)

In [None]:
df_train_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1651902 entries, 366048 to 2018351
Columns: 166 entries, county to country_holiday
dtypes: category(5), float32(154), float64(2), int32(1), int64(1), int8(3)
memory usage: 1.0 GB


In [None]:
df_train_features.select_dtypes('category')

Unnamed: 0_level_0,county,is_business,product_type,is_consumption,segment
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
366048,0,0,1,0,0_0_1_0
366049,0,0,1,1,0_0_1_1
366050,0,0,2,0,0_0_2_0
366051,0,0,2,1,0_0_2_1
366052,0,0,3,0,0_0_3_0
...,...,...,...,...,...
2018347,15,1,0,1,15_1_0_1
2018348,15,1,1,0,15_1_1_0
2018349,15,1,1,1,15_1_1_1
2018350,15,1,3,0,15_1_3_0


In [None]:
null_ratio = (df_train_features.isnull().sum()/len(df_train_features)).sort_values(ascending = False)

In [None]:
null_much = list(null_ratio[null_ratio > 0.5].index)

In [None]:
df_train_features.drop(null_much, axis = 1, inplace = True)

In [None]:
(df_train_features.isnull().sum()/len(df_train_features)).sort_values(ascending = False)

county                                 0.0
diffuse_radiation_historical_168h      0.0
cloudcover_low_historical_168h         0.0
cloudcover_mid_historical_168h         0.0
cloudcover_high_historical_168h        0.0
                                      ... 
temperature_forecast_local_168h        0.0
dewpoint_forecast_local_168h           0.0
cloudcover_high_forecast_local_168h    0.0
cloudcover_low_forecast_local_168h     0.0
country_holiday                        0.0
Length: 152, dtype: float64

In [None]:
df_med = df_train_features.median(axis = 0)
df_med

day                                           16.000000
weekday                                        4.000000
month                                          5.000000
year                                        2022.000000
sin(dayofyear)                                 0.416125
                                               ...     
target_all_type_sum_ratio_168_336              0.983894
target_all_county_type_sum_ratio_48_72         0.998563
target_all_county_type_sum_ratio_168_336       1.004637
target                                        35.205500
country_holiday                                0.000000
Length: 147, dtype: float64

In [None]:
cols = df_train_features.select_dtypes(['int64', 'float64', 'float32', 'int8', 'int32']).columns

In [None]:
for i in cols:
    df_train_features[i] = df_train_features[i].fillna(df_med[i])

In [None]:
df_train_features.to_csv("./data/df_train_features_2.csv", index = False)

In [80]:
df = pd.read_csv("./data/df_train_features_2.csv", low_memory = False)

In [81]:
df.isnull().sum().sort_values(ascending = False)

county                                 0
diffuse_radiation_historical_168h      0
cloudcover_low_historical_168h         0
cloudcover_mid_historical_168h         0
cloudcover_high_historical_168h        0
                                      ..
temperature_forecast_local_168h        0
dewpoint_forecast_local_168h           0
cloudcover_high_forecast_local_168h    0
cloudcover_low_forecast_local_168h     0
country_holiday                        0
Length: 152, dtype: int64

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1651902 entries, 0 to 1651901
Columns: 152 entries, county to country_holiday
dtypes: float64(142), int64(9), object(1)
memory usage: 1.9+ GB


In [None]:
df3 = df.drop('segment', axis = 1)

In [88]:
df3.head()

Unnamed: 0,county,is_business,product_type,is_consumption,day,weekday,month,year,sin(dayofyear),cos(dayofyear),...,target_all_type_sum_ratio_48_72,target_all_type_sum_ratio_168_336,target_all_county_type_sum_ratio_48_72,target_all_county_type_sum_ratio_168_336,target,country_holiday,seg_0,seg_1,seg_2,seg_3
0,0,0,1,0,1,6,1,2022,0.017166,0.999853,...,0.981667,0.983894,0.998563,1.004637,0.0,1,0,0,1,0
1,0,0,1,1,1,6,1,2022,0.017166,0.999853,...,0.981667,0.983894,0.998563,1.004637,442.226,1,0,0,1,1
2,0,0,2,0,1,6,1,2022,0.017166,0.999853,...,0.981667,0.983894,0.998563,1.004637,0.0,1,0,0,2,0
3,0,0,2,1,1,6,1,2022,0.017166,0.999853,...,0.981667,0.983894,0.998563,1.004637,44.899,1,0,0,2,1
4,0,0,3,0,1,6,1,2022,0.017166,0.999853,...,0.981667,0.983894,0.998563,1.004637,0.015,1,0,0,3,0


In [89]:
df3.shape

(1651902, 155)

In [90]:
test = setup(data  = df3, target = 'target', train_size = 0.7, fold = 5)

Unnamed: 0,Description,Value
0,Session id,5910
1,Target,target
2,Target type,Regression
3,Original data shape,"(1651902, 155)"
4,Transformed data shape,"(1651902, 155)"
5,Transformed train set shape,"(1156331, 155)"
6,Transformed test set shape,"(495571, 155)"
7,Numeric features,154
8,Preprocess,True
9,Imputation type,simple


In [92]:
top5 = compare_models(n_select = 5, sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,27.3183,9255.764,96.1906,0.9893,0.3799,3.7792,428.548
rf,Random Forest Regressor,31.943,13982.6316,118.2376,0.9838,0.4105,4.3631,982.328
xgboost,Extreme Gradient Boosting,38.0837,11965.2842,109.3831,0.9862,0.7701,34.7636,9.31
lightgbm,Light Gradient Boosting Machine,43.276,16574.6226,128.7391,0.9808,0.7561,30.0212,4.398
dt,Decision Tree Regressor,46.7242,31548.5717,177.6052,0.9635,0.516,4.6373,45.952
gbr,Gradient Boosting Regressor,56.3644,32582.5112,180.4919,0.9623,0.9646,57.4354,407.674
knn,K Neighbors Regressor,57.1386,28364.5176,168.3986,0.9672,0.6804,28.4535,567.514
omp,Orthogonal Matching Pursuit,63.8874,50276.0326,224.2021,0.9419,1.0886,85.967,3.03
llar,Lasso Least Angle Regression,64.9637,43665.5604,208.9478,0.9495,1.4434,169.9884,1.888
en,Elastic Net,64.9757,43904.4394,209.5176,0.9493,1.4468,171.5969,60.064


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [98]:
import joblib
joblib.dump(top5[0], './Study/Kaggle_0104/models/model_et.pkl')
joblib.dump(top5[1], './Study/Kaggle_0104/models/model_rf.pkl')
joblib.dump(top5[2], './Study/Kaggle_0104/models/model_xgb.pkl')
joblib.dump(top5[3], './Study/Kaggle_0104/models/model_lgbm.pkl')
joblib.dump(top5[4], './Study/Kaggle_0104/models/model_dt.pkl')

['./Study/Kaggle_0104/models/model_dt.pkl']

In [116]:
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

In [None]:
df_train_features = features_generator.generate_features(data_storage.df_data)
df_train_features = df_train_features[df_train_features['target'].notnull()]

In [189]:
df_test = pd.read_csv("./Study/Kaggle_0104/data/example_test_files/test.csv")

In [190]:
df_test['prediction_datetime'] = df_test['prediction_datetime'].astype('datetime64')

In [192]:
df_test2 = data_storage.preprocess_test(df_test)

In [193]:
df_test_features = features_generator.generate_features(df_test2)

In [194]:
import holidays
import datetime

estonian_holidays = holidays.country_holidays('EE', years=range(2021, 2026))
estonian_holidays = list(estonian_holidays.keys())

def add_holidays_as_binary_features(df):
    df['country_holiday'] = df.apply(lambda row: (datetime.date(row['year'], row['month'], row['day']) in estonian_holidays) * 1, axis=1)
    
    return df

df_test_features = add_holidays_as_binary_features(df_test_features)

In [195]:
df_test_features.isnull().sum().sort_values(ascending = False)/len(df_test_features)

cloudcover_high_historical_24h                     0.541667
cloudcover_mid_historical_24h                      0.541667
windspeed_10m_historical_24h                       0.541667
winddirection_10m_historical_24h                   0.541667
shortwave_radiation_historical_24h                 0.541667
                                                     ...   
snowfall_forecast_168h                             0.000000
surface_solar_radiation_downwards_forecast_168h    0.000000
direct_solar_radiation_forecast_168h               0.000000
10_metre_v_wind_component_forecast_168h            0.000000
country_holiday                                    0.000000
Length: 165, dtype: float64

In [196]:
null_ratio = (df_test_features.isnull().sum()/len(df_test_features)).sort_values(ascending = False)

In [197]:
null_much = list(null_ratio[null_ratio > 0.5].index)

In [198]:
df_test_features.drop(null_much, axis = 1, inplace = True)

In [199]:
df_med = df_test_features.median(axis = 0)
df_med

day                                           29.500000
weekday                                        2.500000
month                                          5.000000
year                                        2023.000000
sin(dayofyear)                                 0.543899
                                               ...     
target_all_type_sum_ratio_48_72                1.000399
target_all_type_sum_ratio_168_336              1.012225
target_all_county_type_sum_ratio_48_72         1.014979
target_all_county_type_sum_ratio_168_336       1.020643
country_holiday                                0.000000
Length: 146, dtype: float64

In [200]:
cols = df_test_features.select_dtypes(['int64', 'float64', 'float32', 'int8', 'int32']).columns

In [201]:
for i in cols:
    df_test_features[i] = df_test_features[i].fillna(df_med[i])

In [202]:
(df_test_features.isnull().sum()/len(df_test_features)).sort_values(ascending = False)

county                                 0.0
surface_pressure_historical_168h       0.0
cloudcover_low_historical_168h         0.0
cloudcover_mid_historical_168h         0.0
cloudcover_high_historical_168h        0.0
                                      ... 
temperature_forecast_local_168h        0.0
dewpoint_forecast_local_168h           0.0
cloudcover_high_forecast_local_168h    0.0
cloudcover_low_forecast_local_168h     0.0
country_holiday                        0.0
Length: 151, dtype: float64

In [203]:
df_test_features.reset_index(inplace = True)

In [204]:
df_test_features.to_csv("./Study/Kaggle_0104/data/example_test_files/test_2.csv", index = False)

In [205]:
df_test_features.shape

(12480, 152)

In [206]:
df_test_features.head()

Unnamed: 0,row_id,county,is_business,product_type,is_consumption,day,weekday,month,year,segment,...,target_std,target_ratio_168_336,target_ratio_48_216,target_ratio_72_240,target_ratio_48_72,target_all_type_sum_ratio_48_72,target_all_type_sum_ratio_168_336,target_all_county_type_sum_ratio_48_72,target_all_county_type_sum_ratio_168_336,country_holiday
0,2005872,0,0,1,0,28,7,5,2023,0_0_1_0,...,0.400051,1.205243,1.171704,0.879843,1.490251,0.970318,0.677167,1.096825,0.876597,1
1,2005873,0,0,1,1,28,7,5,2023,0_0_1_1,...,23.788008,1.034855,0.767853,0.746407,1.128152,1.168614,1.048807,1.125365,1.017569,1
2,2005874,0,0,2,0,28,7,5,2023,0_0_2_0,...,0.0,0.0,0.0,0.0,0.0,0.970318,0.677167,1.096825,0.876597,1
3,2005875,0,0,2,1,28,7,5,2023,0_0_2_1,...,1.198592,0.453541,1.017669,0.465929,2.008905,1.168614,1.048807,1.125365,1.017569,1
4,2005876,0,0,3,0,28,7,5,2023,0_0_3_0,...,1.325521,0.624666,0.721539,0.946384,0.909204,0.970318,0.677167,1.096825,0.876597,1


In [207]:
tmp_t = (df_test_features['segment'].str.split("_").apply(to_int))

In [208]:
li_t = []

for i in tmp_t:
    li_t.append(i)

In [209]:
df_test_tmp = pd.DataFrame(li_t, columns = ['seg_0', 'seg_1', 'seg_2', 'seg_3'])

In [210]:
df_test_features_2 = pd.concat([df_test_features, df_test_tmp], axis = 1).drop('segment', axis = 1)

In [212]:
test_id = df_test_features_2.pop('row_id')

In [213]:
df_test_features_2.shape

(12480, 154)

In [217]:
df3.shape

(1651902, 155)

In [222]:
df_test.head()

Unnamed: 0,county,is_business,product_type,is_consumption,prediction_datetime,data_block_id,row_id,prediction_unit_id,currently_scored
0,0,0,1,0,2023-05-28,634,2005872,0,False
1,0,0,1,1,2023-05-28,634,2005873,0,False
2,0,0,2,0,2023-05-28,634,2005874,1,False
3,0,0,2,1,2023-05-28,634,2005875,1,False
4,0,0,3,0,2023-05-28,634,2005876,2,False


In [220]:
top5[0]

In [228]:
tg1 = pd.DataFrame(np.array([top5[0].predict(df_test_features_2)]).T, columns = ['target'])

In [232]:
submit1 = pd.concat([df_test[['data_block_id']], test_id, tg1], axis = 1)

In [234]:
submit1.to_csv("./Study/Kaggle_0104/submissions/1st.csv", index = False)