In [1]:
import warnings

warnings.filterwarnings("ignore")

import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px
import datetime as dt

from sklearn.metrics import mean_absolute_error

In [2]:
class DataStorage:
    root = "C:/Users/yjg10/OneDrive/문서/Kaggle_data/Energy"

    data_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
        "row_id",
    ]
    client_cols = [
        "product_type",
        "county",
        "eic_count",
        "installed_capacity",
        "is_business",
        "date",
    ]
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    forecast_weather_cols = [
        "latitude",
        "longitude",
        "hours_ahead",
        "temperature",
        "dewpoint",
        "cloudcover_high",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
    historical_weather_cols = [
        "datetime",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
        "latitude",
        "longitude",
    ]
    location_cols = ["longitude", "latitude", "county"]
    target_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
    ]

    def __init__(self):
        self.df_data = pl.read_csv(
            os.path.join(self.root, "train.csv"),
            columns=self.data_cols,
            try_parse_dates=True,
        )
        self.df_client = pl.read_csv(
            os.path.join(self.root, "client.csv"),
            columns=self.client_cols,
            try_parse_dates=True,
        )
        self.df_gas_prices = pl.read_csv(
            os.path.join(self.root, "gas_prices.csv"),
            columns=self.gas_prices_cols,
            try_parse_dates=True,
        )
        self.df_electricity_prices = pl.read_csv(
            os.path.join(self.root, "electricity_prices.csv"),
            columns=self.electricity_prices_cols,
            try_parse_dates=True,
        )
        self.df_forecast_weather = pl.read_csv(
            os.path.join(self.root, "forecast_weather.csv"),
            columns=self.forecast_weather_cols,
            try_parse_dates=True,
        )
        self.df_historical_weather = pl.read_csv(
            os.path.join(self.root, "historical_weather.csv"),
            columns=self.historical_weather_cols,
            try_parse_dates=True,
        )
        self.df_weather_station_to_county_mapping = pl.read_csv(
            os.path.join(self.root, "weather_station_to_county_mapping.csv"),
            columns=self.location_cols,
            try_parse_dates=True,
        )
        self.df_data = self.df_data.filter(
            pl.col("datetime") >= pd.to_datetime("2022-01-01")
        )
        self.df_target = self.df_data.select(self.target_cols)

        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        self.df_weather_station_to_county_mapping = (
            self.df_weather_station_to_county_mapping.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
        )

    def update_with_new_data(
        self,
        df_new_client,
        df_new_gas_prices,
        df_new_electricity_prices,
        df_new_forecast_weather,
        df_new_historical_weather,
        df_new_target,
    ):
        df_new_client = pl.from_pandas(
            df_new_client[self.client_cols], schema_overrides=self.schema_client
        )
        df_new_gas_prices = pl.from_pandas(
            df_new_gas_prices[self.gas_prices_cols],
            schema_overrides=self.schema_gas_prices,
        )
        df_new_electricity_prices = pl.from_pandas(
            df_new_electricity_prices[self.electricity_prices_cols],
            schema_overrides=self.schema_electricity_prices,
        )
        df_new_forecast_weather = pl.from_pandas(
            df_new_forecast_weather[self.forecast_weather_cols],
            schema_overrides=self.schema_forecast_weather,
        )
        df_new_historical_weather = pl.from_pandas(
            df_new_historical_weather[self.historical_weather_cols],
            schema_overrides=self.schema_historical_weather,
        )
        df_new_target = pl.from_pandas(
            df_new_target[self.target_cols], schema_overrides=self.schema_target
        )

        self.df_client = pl.concat([self.df_client, df_new_client]).unique(
            ["date", "county", "is_business", "product_type"]
        )
        self.df_gas_prices = pl.concat([self.df_gas_prices, df_new_gas_prices]).unique(
            ["forecast_date"]
        )
        self.df_electricity_prices = pl.concat(
            [self.df_electricity_prices, df_new_electricity_prices]
        ).unique(["forecast_date"])
        self.df_forecast_weather = pl.concat(
            [self.df_forecast_weather, df_new_forecast_weather]
        ).unique(["forecast_datetime", "latitude", "longitude", "hours_ahead"])
        self.df_historical_weather = pl.concat(
            [self.df_historical_weather, df_new_historical_weather]
        ).unique(["datetime", "latitude", "longitude"])
        self.df_target = pl.concat([self.df_target, df_new_target]).unique(
            ["datetime", "county", "is_business", "product_type", "is_consumption"]
        )

    def preprocess_test(self, df_test):
        df_test = df_test.rename(columns={"prediction_datetime": "datetime"})
        df_test = pl.from_pandas(
            df_test[self.data_cols[1:]], schema_overrides=self.schema_data
        )
        return df_test

In [3]:
class FeaturesGenerator:
    def __init__(self, data_storage):
        self.data_storage = data_storage

    def _add_general_features(self, df_features):
        df_features = (
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.year().alias("year"),
            )
            .with_columns(
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    separator="_",
                ).alias("segment"),
            )
            .with_columns(
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )
        return df_features

    def _add_client_features(self, df_features):
        df_client = self.data_storage.df_client

        df_features = df_features.join(
            df_client.with_columns(
                (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
            ),
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )
        return df_features

    def _add_forecast_weather_features(self, df_features):
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            .filter((pl.col("hours_ahead") >= 22) & pl.col("hours_ahead") <= 45)
            .drop("hours_ahead")
            .with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        return df_features

    def _add_historical_weather_features(self, df_features):
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        return df_features

    def _add_target_features(self, df_features):
        df_target = self.data_storage.df_target

        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )

        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        for hours_lag in [
            2 * 24,
            3 * 24,
            4 * 24,
            5 * 24,
            6 * 24,
            7 * 24,
            8 * 24,
            9 * 24,
            10 * 24,
            11 * 24,
            12 * 24,
            13 * 24,
            14 * 24,
        ]:
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_{hours_lag}h"}),
                on=[
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    "datetime",
                ],
                how="left",
            )

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            df_features = df_features.join(
                df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )

            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24]
        ]
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias(f"target_mean"),
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias(f"target_std"),
        )

        for target_prefix, lag_nominator, lag_denomonator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_nominator}h")
                    / (pl.col(f"{target_prefix}_{lag_denomonator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_nominator}_{lag_denomonator}")
            )

        return df_features

    def _reduce_memory_usage(self, df_features):
        df_features = df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))
        return df_features

    def _drop_columns(self, df_features):
        df_features = df_features.drop(
            "date", "datetime", "hour", "dayofyear"
        )
        return df_features

    def _to_pandas(self, df_features, y):
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        df_features = df_features.set_index("row_id")
        df_features[cat_cols] = df_features[cat_cols].astype("category")

        return df_features

    def generate_features(self, df_prediction_items):
        if "target" in df_prediction_items.columns:
            df_prediction_items, y = (
                df_prediction_items.drop("target"),
                df_prediction_items.select("target"),
            )
        else:
            y = None

        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        for add_features in [
            self._add_general_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._reduce_memory_usage,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        df_features = self._to_pandas(df_features, y)

        return df_features

In [4]:
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

In [5]:
df_train_features = features_generator.generate_features(data_storage.df_data)
df_train_features = df_train_features[df_train_features['target'].notnull()]

In [6]:
import holidays
import datetime

estonian_holidays = holidays.country_holidays('EE', years=range(2021, 2026))
estonian_holidays = list(estonian_holidays.keys())

def add_holidays_as_binary_features(df):
    df['country_holiday'] = df.apply(lambda row: (datetime.date(row['year'], row['month'], row['day']) in estonian_holidays) * 1, axis=1)
    
    return df

df_train_features = add_holidays_as_binary_features(df_train_features)

In [7]:
null_ratio = (df_train_features.isnull().sum()/len(df_train_features)).sort_values(ascending = False)
null_much = list(null_ratio[null_ratio > 0.5].index)
df_train_features.drop(null_much, axis = 1, inplace = True)
(df_train_features.isnull().sum()/len(df_train_features)).sort_values(ascending = False)
df_med = df_train_features.median(axis = 0, numeric_only = True)

cols = df_train_features.select_dtypes(['int64', 'float64', 'float32', 'int8', 'int32']).columns
for i in cols:
    df_train_features[i] = df_train_features[i].fillna(df_med[i])

In [8]:
df_train_features.head()

Unnamed: 0_level_0,county,is_business,product_type,is_consumption,day,weekday,month,year,segment,sin(dayofyear),...,target_ratio_168_336,target_ratio_48_216,target_ratio_72_240,target_ratio_48_72,target_all_type_sum_ratio_48_72,target_all_type_sum_ratio_168_336,target_all_county_type_sum_ratio_48_72,target_all_county_type_sum_ratio_168_336,target,country_holiday
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
366048,0,0,1,0,1,6,1,2022,0_0_1_0,0.017166,...,0.945753,0.946264,0.946073,0.953341,0.981667,0.983894,0.998563,1.004637,0.0,1
366049,0,0,1,1,1,6,1,2022,0_0_1_1,0.017166,...,0.945753,0.946264,0.946073,0.953341,0.981667,0.983894,0.998563,1.004637,442.226,1
366050,0,0,2,0,1,6,1,2022,0_0_2_0,0.017166,...,0.945753,0.946264,0.946073,0.953341,0.981667,0.983894,0.998563,1.004637,0.0,1
366051,0,0,2,1,1,6,1,2022,0_0_2_1,0.017166,...,0.945753,0.946264,0.946073,0.953341,0.981667,0.983894,0.998563,1.004637,44.899,1
366052,0,0,3,0,1,6,1,2022,0_0_3_0,0.017166,...,0.945753,0.946264,0.946073,0.953341,0.981667,0.983894,0.998563,1.004637,0.015,1


In [12]:
df_train_features.describe(include = 'category')

Unnamed: 0,county,is_business,product_type,is_consumption,segment
count,1651902,1651902,1651902,1651902,1651902
unique,16,2,4,2,138
top,0,1,3,0,0_0_1_0
freq,173334,895846,742860,825951,12381


In [None]:
del_cols = ['county', 'is_business', 'product_type', 'is_consumption']
df_train_features.drop(del_cols, axis = 1, inplace = True)

In [None]:
df_train_features['datetime'] = 0

for i in df_train_features.index:
    df_train_features.loc[i, 'datetime'] = dt.datetime(df_train_features['year'][i], df_train_features['month'][i], df_train_features['day'][i])

In [None]:
df_train_features.head()

Unnamed: 0_level_0,day,weekday,month,year,segment,sin(dayofyear),cos(dayofyear),sin(hour),cos(hour),eic_count,...,target_ratio_48_216,target_ratio_72_240,target_ratio_48_72,target_all_type_sum_ratio_48_72,target_all_type_sum_ratio_168_336,target_all_county_type_sum_ratio_48_72,target_all_county_type_sum_ratio_168_336,target,country_holiday,datetime
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
366048,1,6,1,2022,0_0_1_0,0.017166,0.999853,0.0,1.0,148.0,...,0.946264,0.946073,0.953341,0.981667,0.983894,0.998563,1.004637,0.0,1,2022-01-01 00:00:00
366049,1,6,1,2022,0_0_1_1,0.017166,0.999853,0.0,1.0,148.0,...,0.946264,0.946073,0.953341,0.981667,0.983894,0.998563,1.004637,442.226,1,2022-01-01 00:00:00
366050,1,6,1,2022,0_0_2_0,0.017166,0.999853,0.0,1.0,16.0,...,0.946264,0.946073,0.953341,0.981667,0.983894,0.998563,1.004637,0.0,1,2022-01-01 00:00:00
366051,1,6,1,2022,0_0_2_1,0.017166,0.999853,0.0,1.0,16.0,...,0.946264,0.946073,0.953341,0.981667,0.983894,0.998563,1.004637,44.899,1,2022-01-01 00:00:00
366052,1,6,1,2022,0_0_3_0,0.017166,0.999853,0.0,1.0,739.0,...,0.946264,0.946073,0.953341,0.981667,0.983894,0.998563,1.004637,0.015,1,2022-01-01 00:00:00


In [None]:
del_cols = ['day', 'month', 'year']
df_train_features.drop(del_cols, axis = 1, inplace = True)

In [None]:
df_train_features['weekday'] = df_train_features['weekday'].astype("category")

In [None]:
p_corr = df_train_features.corr()['target']

In [None]:
df_train_features['datetime'] = df_train_features['datetime'].astype('datetime64')

In [None]:
df_train_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1651902 entries, 366048 to 2018351
Columns: 146 entries, weekday to datetime
dtypes: category(2), datetime64[ns](1), float32(140), float64(2), int64(1)
memory usage: 950.0 MB


In [None]:
df_train_features.to_csv("C:/Users/yjg10/OneDrive/문서/Kaggle_data/Energy/ref/refined.csv", index = False)

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
Y = df_train_features.pop('target')

In [19]:
col_n = df_train_features.select_dtypes(['float32', 'float64', 'int64']).columns
col_o = df_train_features.select_dtypes(['category']).columns

In [20]:
df_n = df_train_features[col_n]
df_o = df_train_features[col_o]

In [21]:
ss = StandardScaler()

In [22]:
df_n = pd.DataFrame(ss.fit_transform(df_n), columns = col_n)

In [23]:
df_o = pd.get_dummies(df_o)

In [24]:
df_n.reset_index(drop = True, inplace = True)
df_o.reset_index(drop = True, inplace = True)
df_dt = df_train_features['datetime'].reset_index(drop = True)

In [25]:
df2 = pd.concat([df_dt, df_n, df_o], axis = 1)

In [26]:
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from xgboost import XGBRegressor

In [27]:
?TimeSeriesSplit

[1;31mInit signature:[0m [0mTimeSeriesSplit[0m[1;33m([0m[0mn_splits[0m[1;33m=[0m[1;36m5[0m[1;33m,[0m [1;33m*[0m[1;33m,[0m [0mmax_train_size[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mtest_size[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mgap[0m[1;33m=[0m[1;36m0[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Time Series cross-validator

Provides train/test indices to split time series data samples
that are observed at fixed time intervals, in train/test sets.
In each split, test indices must be higher than before, and thus shuffling
in cross validator is inappropriate.

This cross-validation object is a variation of :class:`KFold`.
In the kth split, it returns first k folds as train set and the
(k+1)th fold as test set.

Note that unlike standard cross-validation methods, successive
training sets are supersets of those that come before them.

Read more in the :ref:`User Guide <time_series_split>`.

.. versionadded:: 0.18

Parameters
---

In [28]:
# df2.drop('datetime', axis = 1, inplace = True)

In [52]:
df2['datetime'] =  pd.to_numeric(df2['datetime']) / 10**18

In [53]:
tscv = TimeSeriesSplit(n_splits = 5, gap = 0)

In [54]:
depth = [3, 5, 7, 9, 11, 13, 15]
n_esti = [300]

for j in depth:
    for k in n_esti:
        avg = np.array([])
        for i, (train_index, test_index) in enumerate(tscv.split(df2)):

            x_tr = df2.iloc[train_index, :]
            y_tr = Y.iloc[train_index]

            x_val = df2.iloc[test_index, :]
            y_val = Y.iloc[test_index]

            xgb = XGBRegressor(random_state = 0, max_depth = j, n_estimators = k,
                               learning_rate = 0.2, subsample = 0.5, colsample_bytree = 0.5,
                               eval_metric = 'mae')
            xgb.fit(x_tr, y_tr)

            mae = mean_absolute_error(y_val, xgb.predict(x_val))
            print(f"fold {i} MAE : {mae}")
            avg = np.append(avg, mae)
        print(f"max_depth : {j}, estimators : {k} average MAE : {avg.mean()}")

fold 0 MAE : 80.98154087418631
fold 1 MAE : 73.85010839911494
fold 2 MAE : 49.21523738857073
fold 3 MAE : 58.4311694308936
fold 4 MAE : 82.13250013260446
max_depth : 3, estimators : 300 average MAE : 68.922111245074
fold 0 MAE : 77.98795956270433
fold 1 MAE : 76.69316824012498
fold 2 MAE : 47.910502929999865
fold 3 MAE : 47.93731763497343
fold 4 MAE : 78.26713641636397
max_depth : 5, estimators : 300 average MAE : 65.75921695683331
fold 0 MAE : 77.52810609688137
fold 1 MAE : 73.92370209910862
fold 2 MAE : 48.167232701796195
fold 3 MAE : 46.015119371682246
fold 4 MAE : 79.02890147901375
max_depth : 7, estimators : 300 average MAE : 64.93261234969644
fold 0 MAE : 77.47144540904547
fold 1 MAE : 71.28483948475908
fold 2 MAE : 49.65545433082948
fold 3 MAE : 44.32718913868249
fold 4 MAE : 78.75376546836202
max_depth : 9, estimators : 300 average MAE : 64.29853876633571
fold 0 MAE : 78.47335174157426
fold 1 MAE : 70.90644522958334
fold 2 MAE : 49.36180759135011
fold 3 MAE : 46.71987138534364


In [55]:
from lightgbm import LGBMRegressor

In [58]:
depth = [3, 5, 7, 9, 11, 13, 15]
n_esti = [300]

for j in depth:
    for k in n_esti:
        avg = np.array([])
        for i, (train_index, test_index) in enumerate(tscv.split(df2)):

            x_tr = df2.iloc[train_index, :]
            y_tr = Y.iloc[train_index]

            x_val = df2.iloc[test_index, :]
            y_val = Y.iloc[test_index]

            lgbm = LGBMRegressor(random_state = 0, max_depth = j, n_estimators = k, force_row_wise = True,
                                 num_leaves = 2**j - 2, verbose = -1)
            lgbm.fit(x_tr, y_tr)

            mae = mean_absolute_error(y_val, lgbm.predict(x_val))
            print(f"fold {i} MAE : {mae}")
            avg = np.append(avg, mae)
        print(f"max_depth : {j}, estimators : {k} average MAE : {avg.mean()}")

fold 0 MAE : 75.72025875241857
fold 1 MAE : 68.54943751851908
fold 2 MAE : 50.90576992133868
fold 3 MAE : 57.723488872399095
fold 4 MAE : 85.95184691366464
max_depth : 3, estimators : 300 average MAE : 67.77016039566801
fold 0 MAE : 71.93508387909478
fold 1 MAE : 65.5089694241525
fold 2 MAE : 45.90355055523187
fold 3 MAE : 46.11468641010669
fold 4 MAE : 78.28003495824615
max_depth : 5, estimators : 300 average MAE : 61.5484650453664
fold 0 MAE : 69.08583481551645
fold 1 MAE : 66.32195263591586
fold 2 MAE : 45.46371996991175
fold 3 MAE : 44.39988797653489
fold 4 MAE : 79.85594480462278
max_depth : 7, estimators : 300 average MAE : 61.02546804050034
fold 0 MAE : 68.57551174452631
fold 1 MAE : 63.83024322173762
fold 2 MAE : 44.94523856023461
fold 3 MAE : 44.19009263186432
fold 4 MAE : 76.99425694828425
max_depth : 9, estimators : 300 average MAE : 59.70706862132943
fold 0 MAE : 70.5683556453778
fold 1 MAE : 65.23540188611213
fold 2 MAE : 44.516310460312546
fold 3 MAE : 44.58014709656504
f