In [72]:
import warnings

warnings.filterwarnings("ignore")

import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error

In [2]:
class DataStorage:
    root = "./Study/Kaggle_0104/data"

    data_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
        "row_id",
    ]
    client_cols = [
        "product_type",
        "county",
        "eic_count",
        "installed_capacity",
        "is_business",
        "date",
    ]
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    forecast_weather_cols = [
        "latitude",
        "longitude",
        "hours_ahead",
        "temperature",
        "dewpoint",
        "cloudcover_high",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
    historical_weather_cols = [
        "datetime",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
        "latitude",
        "longitude",
    ]
    location_cols = ["longitude", "latitude", "county"]
    target_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
    ]

    def __init__(self):
        self.df_data = pl.read_csv(
            os.path.join(self.root, "train.csv"),
            columns=self.data_cols,
            try_parse_dates=True,
        )
        self.df_client = pl.read_csv(
            os.path.join(self.root, "client.csv"),
            columns=self.client_cols,
            try_parse_dates=True,
        )
        self.df_gas_prices = pl.read_csv(
            os.path.join(self.root, "gas_prices.csv"),
            columns=self.gas_prices_cols,
            try_parse_dates=True,
        )
        self.df_electricity_prices = pl.read_csv(
            os.path.join(self.root, "electricity_prices.csv"),
            columns=self.electricity_prices_cols,
            try_parse_dates=True,
        )
        self.df_forecast_weather = pl.read_csv(
            os.path.join(self.root, "forecast_weather.csv"),
            columns=self.forecast_weather_cols,
            try_parse_dates=True,
        )
        self.df_historical_weather = pl.read_csv(
            os.path.join(self.root, "historical_weather.csv"),
            columns=self.historical_weather_cols,
            try_parse_dates=True,
        )
        self.df_weather_station_to_county_mapping = pl.read_csv(
            os.path.join(self.root, "weather_station_to_county_mapping.csv"),
            columns=self.location_cols,
            try_parse_dates=True,
        )
        self.df_data = self.df_data.filter(
            pl.col("datetime") >= pd.to_datetime("2022-01-01")
        )
        self.df_target = self.df_data.select(self.target_cols)

        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        self.df_weather_station_to_county_mapping = (
            self.df_weather_station_to_county_mapping.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
        )

    def update_with_new_data(
        self,
        df_new_client,
        df_new_gas_prices,
        df_new_electricity_prices,
        df_new_forecast_weather,
        df_new_historical_weather,
        df_new_target,
    ):
        df_new_client = pl.from_pandas(
            df_new_client[self.client_cols], schema_overrides=self.schema_client
        )
        df_new_gas_prices = pl.from_pandas(
            df_new_gas_prices[self.gas_prices_cols],
            schema_overrides=self.schema_gas_prices,
        )
        df_new_electricity_prices = pl.from_pandas(
            df_new_electricity_prices[self.electricity_prices_cols],
            schema_overrides=self.schema_electricity_prices,
        )
        df_new_forecast_weather = pl.from_pandas(
            df_new_forecast_weather[self.forecast_weather_cols],
            schema_overrides=self.schema_forecast_weather,
        )
        df_new_historical_weather = pl.from_pandas(
            df_new_historical_weather[self.historical_weather_cols],
            schema_overrides=self.schema_historical_weather,
        )
        df_new_target = pl.from_pandas(
            df_new_target[self.target_cols], schema_overrides=self.schema_target
        )

        self.df_client = pl.concat([self.df_client, df_new_client]).unique(
            ["date", "county", "is_business", "product_type"]
        )
        self.df_gas_prices = pl.concat([self.df_gas_prices, df_new_gas_prices]).unique(
            ["forecast_date"]
        )
        self.df_electricity_prices = pl.concat(
            [self.df_electricity_prices, df_new_electricity_prices]
        ).unique(["forecast_date"])
        self.df_forecast_weather = pl.concat(
            [self.df_forecast_weather, df_new_forecast_weather]
        ).unique(["forecast_datetime", "latitude", "longitude", "hours_ahead"])
        self.df_historical_weather = pl.concat(
            [self.df_historical_weather, df_new_historical_weather]
        ).unique(["datetime", "latitude", "longitude"])
        self.df_target = pl.concat([self.df_target, df_new_target]).unique(
            ["datetime", "county", "is_business", "product_type", "is_consumption"]
        )

    def preprocess_test(self, df_test):
        df_test = df_test.rename(columns={"prediction_datetime": "datetime"})
        df_test = pl.from_pandas(
            df_test[self.data_cols[1:]], schema_overrides=self.schema_data
        )
        return df_test

In [3]:
class FeaturesGenerator:
    def __init__(self, data_storage):
        self.data_storage = data_storage

    def _add_general_features(self, df_features):
        df_features = (
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.year().alias("year"),
            )
            .with_columns(
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    separator="_",
                ).alias("segment"),
            )
            .with_columns(
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )
        return df_features

    def _add_client_features(self, df_features):
        df_client = self.data_storage.df_client

        df_features = df_features.join(
            df_client.with_columns(
                (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
            ),
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )
        return df_features

    def _add_forecast_weather_features(self, df_features):
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            .filter((pl.col("hours_ahead") >= 22) & pl.col("hours_ahead") <= 45)
            .drop("hours_ahead")
            .with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        return df_features

    def _add_historical_weather_features(self, df_features):
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        return df_features

    def _add_target_features(self, df_features):
        df_target = self.data_storage.df_target

        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )

        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        for hours_lag in [
            2 * 24,
            3 * 24,
            4 * 24,
            5 * 24,
            6 * 24,
            7 * 24,
            8 * 24,
            9 * 24,
            10 * 24,
            11 * 24,
            12 * 24,
            13 * 24,
            14 * 24,
        ]:
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_{hours_lag}h"}),
                on=[
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    "datetime",
                ],
                how="left",
            )

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            df_features = df_features.join(
                df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )

            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24]
        ]
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias(f"target_mean"),
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias(f"target_std"),
        )

        for target_prefix, lag_nominator, lag_denomonator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_nominator}h")
                    / (pl.col(f"{target_prefix}_{lag_denomonator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_nominator}_{lag_denomonator}")
            )

        return df_features

    def _reduce_memory_usage(self, df_features):
        df_features = df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))
        return df_features

    def _drop_columns(self, df_features):
        df_features = df_features.drop(
            "date", "datetime", "hour", "dayofyear"
        )
        return df_features

    def _to_pandas(self, df_features, y):
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        df_features = df_features.set_index("row_id")
        df_features[cat_cols] = df_features[cat_cols].astype("category")

        return df_features

    def generate_features(self, df_prediction_items):
        if "target" in df_prediction_items.columns:
            df_prediction_items, y = (
                df_prediction_items.drop("target"),
                df_prediction_items.select("target"),
            )
        else:
            y = None

        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        for add_features in [
            self._add_general_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._reduce_memory_usage,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        df_features = self._to_pandas(df_features, y)

        return df_features

In [4]:
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

In [5]:
df_train_features = features_generator.generate_features(data_storage.df_data)
df_train_features = df_train_features[df_train_features['target'].notnull()]

In [6]:
import holidays
import datetime

estonian_holidays = holidays.country_holidays('EE', years=range(2021, 2026))
estonian_holidays = list(estonian_holidays.keys())

def add_holidays_as_binary_features(df):
    df['country_holiday'] = df.apply(lambda row: (datetime.date(row['year'], row['month'], row['day']) in estonian_holidays) * 1, axis=1)
    
    return df

df_train_features = add_holidays_as_binary_features(df_train_features)

- 결측치 과다 컬럼 제거

In [7]:
null_much = ['windspeed_10m_historical_24h',
 'direct_solar_radiation_historical_24h',
 'dewpoint_historical_24h',
 'temperature_historical_24h',
 'cloudcover_high_historical_24h',
 'snowfall_historical_24h',
 'surface_pressure_historical_24h',
 'diffuse_radiation_historical_24h',
 'cloudcover_total_historical_24h',
 'rain_historical_24h',
 'cloudcover_mid_historical_24h',
 'cloudcover_low_historical_24h',
 'shortwave_radiation_historical_24h',
 'winddirection_10m_historical_24h']
df_train_features.drop(null_much, axis = 1, inplace = True)

- 결측치 중앙값 대체

In [8]:
df_med = df_train_features.median(axis = 0, numeric_only = True)
cols = df_med.index
for i in cols:
    df_train_features[i] = df_train_features[i].fillna(df_med[i])

- 불필요 컬럼 제거

In [9]:
del_cols = ['segment']
df_train_features.drop(del_cols, axis = 1, inplace = True)

- day, month, year 일 변환 및 기존 컬럼 삭제

In [10]:
import datetime as dt

df_train_features['datetime'] = 0

for i in df_train_features.index:
    df_train_features.loc[i, 'datetime'] = dt.datetime(df_train_features['year'][i], df_train_features['month'][i], df_train_features['day'][i])

del_cols = ['day', 'month', 'year']
df_train_features.drop(del_cols, axis = 1, inplace = True)

df_train_features['weekday'] = df_train_features['weekday'].astype("category")
df_train_features['datetime'] = df_train_features['datetime'].astype('datetime64[ns]')
df_train_features['datetime'] =  pd.to_numeric(df_train_features['datetime']) / 10**18

In [11]:
X1 = df_train_features.drop('target', axis = 1)
Y1 = df_train_features['target']

- 스케일링 / 원핫인코딩

In [12]:
col_n = X1.select_dtypes(['float32', 'float64', 'int64']).columns
col_o = X1.select_dtypes(['category']).columns

In [13]:
df_n = X1[col_n]
df_o = X1[col_o]

In [14]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

ss = StandardScaler()
df_n = pd.DataFrame(ss.fit_transform(df_n), columns = col_n)

oh = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')

o1 = oh.fit_transform(df_o)
df_o = pd.DataFrame(o1, columns = oh.get_feature_names_out())

In [15]:
df_n.reset_index(drop = True, inplace = True)
df_o.reset_index(drop = True, inplace = True)

In [16]:
df2 = pd.concat([df_n, df_o], axis = 1)

In [17]:
params = {'n_estimators' : 300, 'max_depth' : 9, 'random_state' : 0, 'force_row_wise' : True, 'num_leaves' : 2**9 - 2, 'verbose' : -1}

In [18]:
from lightgbm import LGBMRegressor

In [19]:
model = LGBMRegressor(**params)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x_tr, x_val, y_tr, y_val = train_test_split(df2, Y1, test_size = 0.3, random_state = 0)

In [22]:
x_tr.shape, x_val.shape, y_tr.shape, y_val.shape

((1156331, 174), (495571, 174), (1156331,), (495571,))

In [23]:
model.fit(x_tr, y_tr)

In [24]:
mean_absolute_error(y_val, model.predict(x_val))

28.48033306055626

In [25]:
len_t = int(len(df2) * 0.7)
x_cvtr, x_cvval = (df2.iloc[:len_t, :], df2.iloc[len_t:, :])
y_cvtr, y_cvval = (Y1.iloc[:len_t], Y1.iloc[len_t:]) 

In [26]:
x_cvtr.shape, x_cvval.shape, y_cvtr.shape, y_cvval.shape

((1156331, 174), (495571, 174), (1156331,), (495571,))

In [28]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, StackingRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

In [29]:
lgbm = LGBMRegressor(random_state=0, bagging_fraction = 0.7, feature_fraction = 0.7, verbose = -1)
xgb = XGBRegressor(random_state=0, eval_metric = 'mae', subsample = 0.7, colsample_bytree = 0.7)
catb = CatBoostRegressor(random_state=0)
rf = RandomForestRegressor(random_state=0)

In [30]:
tscv = TimeSeriesSplit(n_splits = 5, gap = 0)

In [42]:
xgb_params = {
    'min_child_weight' : [0.01, 0.1, 1, 10, 100],
    'max_depth' : [3, 5, 7, 9, 11, 13, 15],
    'max_leaves' : [2**i - 2 for i in range(3, 16, 2)],
    'lambda' : [1, 10, 100]
}

In [33]:
lgbm_params = {
    'num_iterations' : [100, 200, 300],
    'max_depth' : [-1, 3, 5, 7, 9, 11, 13, 15],
    'num_leaves' : [2**i - 2 for i in range(3, 16, 2)]
}

In [34]:
rf_params = {
    'n_estimators' : [100, 200, 300],
    'max_depth' : [3, 5, 7, 9, 11, 13, 15],
    'max_leaf_nodes' : [2**i - 2 for i in range(3, 16, 2)]
}

In [35]:
rs_xgb = RandomizedSearchCV(xgb, param_distributions = xgb_params, n_iter = 10, cv = tscv, n_jobs = -1)
rs_lgbm = RandomizedSearchCV(lgbm, param_distributions = lgbm_params, n_iter = 10, cv = tscv, n_jobs = -1)
rs_rf = RandomizedSearchCV(rf, param_distributions = rf_params, n_iter = 10, cv = tscv, n_jobs = -1)

In [44]:
?VotingRegressor

[1;31mInit signature:[0m [0mVotingRegressor[0m[1;33m([0m[0mestimators[0m[1;33m,[0m [1;33m*[0m[1;33m,[0m [0mweights[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mn_jobs[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mverbose[0m[1;33m=[0m[1;32mFalse[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Prediction voting regressor for unfitted estimators.

A voting regressor is an ensemble meta-estimator that fits several base
regressors, each on the whole dataset. Then it averages the individual
predictions to form a final prediction.

Read more in the :ref:`User Guide <voting_regressor>`.

.. versionadded:: 0.21

Parameters
----------
estimators : list of (str, estimator) tuples
    Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones
    of those original estimators that will be stored in the class attribute
    ``self.estimators_``. An estimator can be set to ``'drop'`` using
    :meth:`set_params`.

    .. versionchanged:: 0.21
    

In [39]:
rs_xgb.fit(x_cvtr, y_cvtr)

In [40]:
xgb_best = rs_xgb.best_estimator_

In [46]:
rs_lgbm.fit(x_cvtr, y_cvtr)

In [48]:
lgbm_best = rs_lgbm.best_estimator_

In [51]:
catb.fit(x_cvtr, y_cvtr)

Learning rate set to 0.12478
0:	learn: 774.2267189	total: 174ms	remaining: 2m 53s
1:	learn: 689.3677193	total: 205ms	remaining: 1m 42s
2:	learn: 615.7966653	total: 236ms	remaining: 1m 18s
3:	learn: 551.4885899	total: 268ms	remaining: 1m 6s
4:	learn: 496.3945906	total: 299ms	remaining: 59.6s
5:	learn: 448.0636974	total: 330ms	remaining: 54.6s
6:	learn: 406.9723544	total: 359ms	remaining: 50.9s
7:	learn: 371.3493198	total: 389ms	remaining: 48.2s
8:	learn: 341.3114479	total: 417ms	remaining: 46s
9:	learn: 315.7031040	total: 447ms	remaining: 44.2s
10:	learn: 293.6031807	total: 479ms	remaining: 43.1s
11:	learn: 275.1760562	total: 510ms	remaining: 42s
12:	learn: 258.7373074	total: 540ms	remaining: 41s
13:	learn: 246.0916337	total: 574ms	remaining: 40.4s
14:	learn: 234.9232152	total: 607ms	remaining: 39.8s
15:	learn: 226.0731720	total: 637ms	remaining: 39.2s
16:	learn: 216.9161982	total: 669ms	remaining: 38.7s
17:	learn: 207.2687167	total: 705ms	remaining: 38.4s
18:	learn: 201.2088456	total: 

<catboost.core.CatBoostRegressor at 0x14706ed3610>

In [52]:
mean_absolute_error(y_cvtr, catb.predict(x_cvtr))

30.384564692098735

In [55]:
models = [('xgb', xgb_best), ('lgbm', lgbm_best),  ('catb', catb)]

In [58]:
model_vo = VotingRegressor(models)

In [59]:
model_vo.fit(x_cvtr, y_cvtr)

Learning rate set to 0.12478
0:	learn: 774.2267189	total: 35.5ms	remaining: 35.4s
1:	learn: 689.3677193	total: 66.4ms	remaining: 33.1s
2:	learn: 615.7966653	total: 97.1ms	remaining: 32.3s
3:	learn: 551.4885899	total: 128ms	remaining: 31.8s
4:	learn: 496.3945906	total: 161ms	remaining: 32.1s
5:	learn: 448.0636974	total: 192ms	remaining: 31.9s
6:	learn: 406.9723544	total: 222ms	remaining: 31.5s
7:	learn: 371.3493198	total: 254ms	remaining: 31.4s
8:	learn: 341.3114479	total: 283ms	remaining: 31.2s
9:	learn: 315.7031040	total: 313ms	remaining: 31s
10:	learn: 293.6031807	total: 347ms	remaining: 31.2s
11:	learn: 275.1760562	total: 381ms	remaining: 31.3s
12:	learn: 258.7373074	total: 411ms	remaining: 31.2s
13:	learn: 246.0916337	total: 446ms	remaining: 31.4s
14:	learn: 234.9232152	total: 480ms	remaining: 31.5s
15:	learn: 226.0731720	total: 510ms	remaining: 31.4s
16:	learn: 216.9161982	total: 543ms	remaining: 31.4s
17:	learn: 207.2687167	total: 580ms	remaining: 31.6s
18:	learn: 201.2088456	tot

In [60]:
mean_absolute_error(y_cvtr, model_vo.predict(x_cvtr))

28.462204713191866

In [61]:
lgbm_st = LGBMRegressor(**params)

In [63]:
model_st = StackingRegressor(models, lgbm_st, cv = 5)

In [64]:
model_st.fit(x_cvtr, y_cvtr)

Learning rate set to 0.12478
0:	learn: 774.2267189	total: 32.9ms	remaining: 32.9s
1:	learn: 689.3677193	total: 63.4ms	remaining: 31.6s
2:	learn: 615.7966653	total: 93.8ms	remaining: 31.2s
3:	learn: 551.4885899	total: 124ms	remaining: 30.8s
4:	learn: 496.3945906	total: 155ms	remaining: 30.8s
5:	learn: 448.0636974	total: 187ms	remaining: 30.9s
6:	learn: 406.9723544	total: 215ms	remaining: 30.5s
7:	learn: 371.3493198	total: 246ms	remaining: 30.5s
8:	learn: 341.3114479	total: 275ms	remaining: 30.3s
9:	learn: 315.7031040	total: 304ms	remaining: 30.1s
10:	learn: 293.6031807	total: 337ms	remaining: 30.3s
11:	learn: 275.1760562	total: 369ms	remaining: 30.4s
12:	learn: 258.7373074	total: 398ms	remaining: 30.2s
13:	learn: 246.0916337	total: 432ms	remaining: 30.4s
14:	learn: 234.9232152	total: 465ms	remaining: 30.5s
15:	learn: 226.0731720	total: 494ms	remaining: 30.4s
16:	learn: 216.9161982	total: 526ms	remaining: 30.4s
17:	learn: 207.2687167	total: 560ms	remaining: 30.6s
18:	learn: 201.2088456	t

In [65]:
mean_absolute_error(y_cvtr, model_st.predict(x_cvtr))

36.394559397145194

In [67]:
xgb_best.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.7,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': 'mae',
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 7,
 'max_leaves': 2046,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': 0,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 0.7,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'sub_sample': 0.7,
 'nrounds': 100,
 'lambda': 1}

In [69]:
rs_xgb.best_params_

{'sub_sample': 0.7,
 'nrounds': 100,
 'min_child_weight': 1,
 'max_leaves': 2046,
 'max_depth': 7,
 'lambda': 1}

In [70]:
rs_lgbm.best_params_

{'num_leaves': 8190, 'num_iterations': 300, 'max_depth': 5}

In [71]:
?CatBoostRegressor

[1;31mInit signature:[0m
[0mCatBoostRegressor[0m[1;33m([0m[1;33m
[0m    [0miterations[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlearning_rate[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mdepth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0ml2_leaf_reg[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmodel_size_reg[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrsm[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mloss_function[0m[1;33m=[0m[1;34m'RMSE'[0m[1;33m,[0m[1;33m
[0m    [0mborder_count[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfeature_border_type[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mper_float_feature_quantization[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0minput_borders[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0moutput_borders[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mf

In [73]:
from tensorflow import test
from tensorflow.python.client.device_lib import list_local_devices

In [74]:
test.is_built_with_cuda()

True

In [82]:
list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 9148454337323634951
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 22395486208
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 1159572900191049654
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:01:00.0, compute capability: 8.9"
 xla_global_id: 416903419]