In [1]:
import warnings

warnings.filterwarnings("ignore")

import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error

In [2]:
class DataStorage:
    root = "C:/Users/yjg10/OneDrive/문서/Kaggle_data/Energy"

    data_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
        "row_id",
    ]
    client_cols = [
        "product_type",
        "county",
        "eic_count",
        "installed_capacity",
        "is_business",
        "date",
    ]
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    forecast_weather_cols = [
        "latitude",
        "longitude",
        "hours_ahead",
        "temperature",
        "dewpoint",
        "cloudcover_high",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
    historical_weather_cols = [
        "datetime",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
        "latitude",
        "longitude",
    ]
    location_cols = ["longitude", "latitude", "county"]
    target_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
    ]

    def __init__(self):
        self.df_data = pl.read_csv(
            os.path.join(self.root, "train.csv"),
            columns=self.data_cols,
            try_parse_dates=True,
        )
        self.df_client = pl.read_csv(
            os.path.join(self.root, "client.csv"),
            columns=self.client_cols,
            try_parse_dates=True,
        )
        self.df_gas_prices = pl.read_csv(
            os.path.join(self.root, "gas_prices.csv"),
            columns=self.gas_prices_cols,
            try_parse_dates=True,
        )
        self.df_electricity_prices = pl.read_csv(
            os.path.join(self.root, "electricity_prices.csv"),
            columns=self.electricity_prices_cols,
            try_parse_dates=True,
        )
        self.df_forecast_weather = pl.read_csv(
            os.path.join(self.root, "forecast_weather.csv"),
            columns=self.forecast_weather_cols,
            try_parse_dates=True,
        )
        self.df_historical_weather = pl.read_csv(
            os.path.join(self.root, "historical_weather.csv"),
            columns=self.historical_weather_cols,
            try_parse_dates=True,
        )
        self.df_weather_station_to_county_mapping = pl.read_csv(
            os.path.join(self.root, "weather_station_to_county_mapping.csv"),
            columns=self.location_cols,
            try_parse_dates=True,
        )
        self.df_data = self.df_data.filter(
            pl.col("datetime") >= pd.to_datetime("2022-01-01")
        )
        self.df_target = self.df_data.select(self.target_cols)

        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        self.df_weather_station_to_county_mapping = (
            self.df_weather_station_to_county_mapping.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
        )

    def update_with_new_data(
        self,
        df_new_client,
        df_new_gas_prices,
        df_new_electricity_prices,
        df_new_forecast_weather,
        df_new_historical_weather,
        df_new_target,
    ):
        df_new_client = pl.from_pandas(
            df_new_client[self.client_cols], schema_overrides=self.schema_client
        )
        df_new_gas_prices = pl.from_pandas(
            df_new_gas_prices[self.gas_prices_cols],
            schema_overrides=self.schema_gas_prices,
        )
        df_new_electricity_prices = pl.from_pandas(
            df_new_electricity_prices[self.electricity_prices_cols],
            schema_overrides=self.schema_electricity_prices,
        )
        df_new_forecast_weather = pl.from_pandas(
            df_new_forecast_weather[self.forecast_weather_cols],
            schema_overrides=self.schema_forecast_weather,
        )
        df_new_historical_weather = pl.from_pandas(
            df_new_historical_weather[self.historical_weather_cols],
            schema_overrides=self.schema_historical_weather,
        )
        df_new_target = pl.from_pandas(
            df_new_target[self.target_cols], schema_overrides=self.schema_target
        )

        self.df_client = pl.concat([self.df_client, df_new_client]).unique(
            ["date", "county", "is_business", "product_type"]
        )
        self.df_gas_prices = pl.concat([self.df_gas_prices, df_new_gas_prices]).unique(
            ["forecast_date"]
        )
        self.df_electricity_prices = pl.concat(
            [self.df_electricity_prices, df_new_electricity_prices]
        ).unique(["forecast_date"])
        self.df_forecast_weather = pl.concat(
            [self.df_forecast_weather, df_new_forecast_weather]
        ).unique(["forecast_datetime", "latitude", "longitude", "hours_ahead"])
        self.df_historical_weather = pl.concat(
            [self.df_historical_weather, df_new_historical_weather]
        ).unique(["datetime", "latitude", "longitude"])
        self.df_target = pl.concat([self.df_target, df_new_target]).unique(
            ["datetime", "county", "is_business", "product_type", "is_consumption"]
        )

    def preprocess_test(self, df_test):
        df_test = df_test.rename(columns={"prediction_datetime": "datetime"})
        df_test = pl.from_pandas(
            df_test[self.data_cols[1:]], schema_overrides=self.schema_data
        )
        return df_test

In [3]:
class FeaturesGenerator:
    def __init__(self, data_storage):
        self.data_storage = data_storage

    def _add_general_features(self, df_features):
        df_features = (
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.year().alias("year"),
            )
            .with_columns(
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    separator="_",
                ).alias("segment"),
            )
            .with_columns(
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )
        return df_features

    def _add_client_features(self, df_features):
        df_client = self.data_storage.df_client

        df_features = df_features.join(
            df_client.with_columns(
                (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
            ),
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )
        return df_features

    def _add_forecast_weather_features(self, df_features):
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            .filter((pl.col("hours_ahead") >= 22) & pl.col("hours_ahead") <= 45)
            .drop("hours_ahead")
            .with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        return df_features

    def _add_historical_weather_features(self, df_features):
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        return df_features

    def _add_target_features(self, df_features):
        df_target = self.data_storage.df_target

        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )

        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        for hours_lag in [
            2 * 24,
            3 * 24,
            4 * 24,
            5 * 24,
            6 * 24,
            7 * 24,
            8 * 24,
            9 * 24,
            10 * 24,
            11 * 24,
            12 * 24,
            13 * 24,
            14 * 24,
        ]:
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_{hours_lag}h"}),
                on=[
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    "datetime",
                ],
                how="left",
            )

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            df_features = df_features.join(
                df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )

            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24]
        ]
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias(f"target_mean"),
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias(f"target_std"),
        )

        for target_prefix, lag_nominator, lag_denomonator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_nominator}h")
                    / (pl.col(f"{target_prefix}_{lag_denomonator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_nominator}_{lag_denomonator}")
            )

        return df_features

    def _reduce_memory_usage(self, df_features):
        df_features = df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))
        return df_features

    def _drop_columns(self, df_features):
        df_features = df_features.drop(
            "date", "datetime", "hour", "dayofyear"
        )
        return df_features

    def _to_pandas(self, df_features, y):
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        df_features = df_features.set_index("row_id")
        df_features[cat_cols] = df_features[cat_cols].astype("category")

        return df_features

    def generate_features(self, df_prediction_items):
        if "target" in df_prediction_items.columns:
            df_prediction_items, y = (
                df_prediction_items.drop("target"),
                df_prediction_items.select("target"),
            )
        else:
            y = None

        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        for add_features in [
            self._add_general_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._reduce_memory_usage,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        df_features = self._to_pandas(df_features, y)

        return df_features

In [4]:
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

In [5]:
df_train_features = features_generator.generate_features(data_storage.df_data)
df_train_features = df_train_features[df_train_features['target'].notnull()]

In [6]:
import holidays
import datetime

estonian_holidays = holidays.country_holidays('EE', years=range(2021, 2026))
estonian_holidays = list(estonian_holidays.keys())

def add_holidays_as_binary_features(df):
    df['country_holiday'] = df.apply(lambda row: (datetime.date(row['year'], row['month'], row['day']) in estonian_holidays) * 1, axis=1)
    
    return df

df_train_features = add_holidays_as_binary_features(df_train_features)

- 결측치 과다 컬럼 제거

In [7]:
null_much = ['windspeed_10m_historical_24h',
 'direct_solar_radiation_historical_24h',
 'dewpoint_historical_24h',
 'temperature_historical_24h',
 'cloudcover_high_historical_24h',
 'snowfall_historical_24h',
 'surface_pressure_historical_24h',
 'diffuse_radiation_historical_24h',
 'cloudcover_total_historical_24h',
 'rain_historical_24h',
 'cloudcover_mid_historical_24h',
 'cloudcover_low_historical_24h',
 'shortwave_radiation_historical_24h',
 'winddirection_10m_historical_24h']
df_train_features.drop(null_much, axis = 1, inplace = True)

- 결측치 중앙값 대체

In [8]:
df_med = df_train_features.median(axis = 0, numeric_only = True)
cols = df_med.index
for i in cols:
    df_train_features[i] = df_train_features[i].fillna(df_med[i])

- 불필요 컬럼 제거

In [9]:
del_cols = ['segment']
df_train_features.drop(del_cols, axis = 1, inplace = True)

- day, month, year 일 변환 및 기존 컬럼 삭제

In [10]:
import datetime as dt

df_train_features['datetime'] = 0

for i in df_train_features.index:
    df_train_features.loc[i, 'datetime'] = dt.datetime(df_train_features['year'][i], df_train_features['month'][i], df_train_features['day'][i])

del_cols = ['day', 'month', 'year']
df_train_features.drop(del_cols, axis = 1, inplace = True)

df_train_features['weekday'] = df_train_features['weekday'].astype("category")
df_train_features['datetime'] = df_train_features['datetime'].astype('datetime64[ns]')
df_train_features['datetime'] =  pd.to_numeric(df_train_features['datetime']) / 10**18

In [11]:
X1 = df_train_features.drop('target', axis = 1)
Y1 = df_train_features['target']

- 스케일링 / 원핫인코딩

In [12]:
col_n = X1.select_dtypes(['float32', 'float64', 'int64']).columns
col_o = X1.select_dtypes(['category']).columns

In [13]:
df_n = X1[col_n]
df_o = X1[col_o]

In [14]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

ss = StandardScaler()
df_n = pd.DataFrame(ss.fit_transform(df_n), columns = col_n)

oh = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')

o1 = oh.fit_transform(df_o)
df_o = pd.DataFrame(o1, columns = oh.get_feature_names_out())

In [15]:
df_n.reset_index(drop = True, inplace = True)
df_o.reset_index(drop = True, inplace = True)

In [16]:
df2 = pd.concat([df_n, df_o], axis = 1)

In [17]:
params = {'n_estimators' : 300, 'max_depth' : 9, 'random_state' : 0, 'force_row_wise' : True, 'num_leaves' : 2**9 - 2, 'verbose' : -1}

In [18]:
from lightgbm import LGBMRegressor

In [19]:
model = LGBMRegressor(**params)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x_tr, x_val, y_tr, y_val = train_test_split(df2, Y1, test_size = 0.3, random_state = 0)

In [22]:
x_tr.shape, x_val.shape, y_tr.shape, y_val.shape

((1156331, 174), (495571, 174), (1156331,), (495571,))

In [23]:
model.fit(x_tr, y_tr)

In [24]:
mean_absolute_error(y_val, model.predict(x_val))

28.480356603491774

In [25]:
len_t = int(len(df2) * 0.7)
x_cvtr, x_cvval = (df2.iloc[:len_t, :], df2.iloc[len_t:, :])
y_cvtr, y_cvval = (Y1.iloc[:len_t], Y1.iloc[len_t:]) 

In [26]:
x_cvtr.shape, x_cvval.shape, y_cvtr.shape, y_cvval.shape

((1156331, 174), (495571, 174), (1156331,), (495571,))

In [27]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, StackingRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

In [28]:
lgbm = LGBMRegressor(random_state=0, bagging_fraction = 0.7, feature_fraction = 0.7, verbose = -1)
xgb = XGBRegressor(random_state=0, eval_metric = 'mae', subsample = 0.7, colsample_bytree = 0.7)
catb = CatBoostRegressor(random_state=0)
rf = RandomForestRegressor(random_state=0)

In [29]:
tscv = TimeSeriesSplit(n_splits = 5, gap = 0)

In [30]:
xgb_params = {
    'nrounds' : [100, 200, 300],
    'min_child_weight' : [0.01, 0.1, 1, 10, 100],
    'max_depth' : [3, 5, 7, 9, 11, 13, 15],
    'max_leaves' : [2**i - 2 for i in range(3, 16, 2)],
    'sub_sample' : [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'lambda' : [1, 10, 100]
}

In [31]:
lgbm_params = {
    'num_iterations' : [100, 200, 300],
    'max_depth' : [-1, 3, 5, 7, 9, 11, 13, 15],
    'num_leaves' : [2**i - 2 for i in range(3, 16, 2)]
}

In [32]:
rf_params = {
    'n_estimators' : [100, 200, 300],
    'max_depth' : [3, 5, 7, 9, 11, 13, 15],
    'max_leaf_nodes' : [2**i - 2 for i in range(3, 16, 2)]
}

In [33]:
rs_xgb = RandomizedSearchCV(xgb, param_distributions = xgb_params, n_iter = 10, cv = tscv, n_jobs = -1)
rs_lgbm = RandomizedSearchCV(lgbm, param_distributions = lgbm_params, n_iter = 10, cv = tscv, n_jobs = -1)
rs_rf = RandomizedSearchCV(rf, param_distributions = rf_params, n_iter = 10, cv = tscv, n_jobs = -1)

In [44]:
?VotingRegressor

[1;31mInit signature:[0m [0mVotingRegressor[0m[1;33m([0m[0mestimators[0m[1;33m,[0m [1;33m*[0m[1;33m,[0m [0mweights[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mn_jobs[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mverbose[0m[1;33m=[0m[1;32mFalse[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Prediction voting regressor for unfitted estimators.

A voting regressor is an ensemble meta-estimator that fits several base
regressors, each on the whole dataset. Then it averages the individual
predictions to form a final prediction.

Read more in the :ref:`User Guide <voting_regressor>`.

.. versionadded:: 0.21

Parameters
----------
estimators : list of (str, estimator) tuples
    Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones
    of those original estimators that will be stored in the class attribute
    ``self.estimators_``. An estimator can be set to ``'drop'`` using
    :meth:`set_params`.

    .. versionchanged:: 0.21
    

In [45]:
import statsmodels.api as sm
X = sm.add_constant(df2)
Y = list(Y1)
model = sm.OLS(Y, X)
model = model.fit()

In [52]:
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.949
Model:,OLS,Adj. R-squared:,0.949
Method:,Least Squares,F-statistic:,182100.0
Date:,"Wed, 24 Jan 2024",Prob (F-statistic):,0.0
Time:,15:51:48,Log-Likelihood:,-11172000.0
No. Observations:,1651902,AIC:,22340000.0
Df Residuals:,1651732,BIC:,22350000.0
Df Model:,169,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.446e+12,4.25e+12,-1.752,0.080,-1.58e+13,8.84e+11
sin(dayofyear),-3.5356,0.405,-8.722,0.000,-4.330,-2.741
cos(dayofyear),4.0550,0.625,6.486,0.000,2.830,5.280
sin(hour),0.2111,0.216,0.979,0.328,-0.212,0.634
cos(hour),2.2818,0.307,7.439,0.000,1.681,2.883
eic_count,-6.4264,0.321,-19.994,0.000,-7.056,-5.796
installed_capacity,15.0977,0.397,38.072,0.000,14.320,15.875
temperature,2.2487,2.282,0.985,0.324,-2.224,6.722
dewpoint,-5.1252,1.765,-2.904,0.004,-8.585,-1.666

0,1,2,3
Omnibus:,1050805.202,Durbin-Watson:,2.482
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2836898536.495
Skew:,1.429,Prob(JB):,0.0
Kurtosis:,205.998,Cond. No.,1.45e+16


In [56]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif["VIF_Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["feature"] = X.columns
vif.round(1)

Unnamed: 0,VIF_Factor,feature
0,inf,const
1,5.6,sin(dayofyear)
2,14.3,cos(dayofyear)
3,1.8,sin(hour)
4,3.5,cos(hour)
...,...,...
170,418278697.7,weekday_3
171,251100838.4,weekday_4
172,389165985.4,weekday_5
173,272972.8,weekday_6


In [72]:
vif['VIF_Factor'].sort_values().iloc[75]

20.639313644914306

In [76]:
list(vif[vif['VIF_Factor'] < 30]['feature'])

['sin(dayofyear)',
 'cos(dayofyear)',
 'sin(hour)',
 'cos(hour)',
 'eic_count',
 'installed_capacity',
 'cloudcover_high',
 'cloudcover_low',
 'cloudcover_mid',
 'cloudcover_total',
 '10_metre_u_wind_component',
 '10_metre_v_wind_component',
 'snowfall',
 'total_precipitation',
 'cloudcover_high_forecast_local_0h',
 'cloudcover_low_forecast_local_0h',
 'cloudcover_mid_forecast_local_0h',
 'cloudcover_total_forecast_local_0h',
 '10_metre_u_wind_component_forecast_local_0h',
 '10_metre_v_wind_component_forecast_local_0h',
 'snowfall_forecast_local_0h',
 'total_precipitation_forecast_local_0h',
 'cloudcover_high_forecast_168h',
 'cloudcover_low_forecast_168h',
 'cloudcover_mid_forecast_168h',
 'cloudcover_total_forecast_168h',
 '10_metre_u_wind_component_forecast_168h',
 '10_metre_v_wind_component_forecast_168h',
 'snowfall_forecast_168h',
 'total_precipitation_forecast_168h',
 'cloudcover_high_forecast_local_168h',
 'cloudcover_low_forecast_local_168h',
 'cloudcover_mid_forecast_local_16

In [83]:
num_cols = 0

for i in range(5, 50000, 10):
    cols = list(vif[vif['VIF_Factor'] < i]['feature'])
    if num_cols == len(cols):
        continue

    num_cols = len(cols)
    x_tr_filter = x_tr[cols]
    x_val_filter = x_val[cols]

    tmp_model = LGBMRegressor(**params)
    tmp_model.fit(x_tr_filter, y_tr)
    
    mae = mean_absolute_error(y_val, tmp_model.predict(x_val_filter))

    print(f"VIF filter MAE at VIF < {i} : {mae}, num_cols = {num_cols}")

VIF filter MAE at VIF < 5 : 66.15031561750317, num_cols = 30
VIF filter MAE at VIF < 15 : 54.38977543828252, num_cols = 66
VIF filter MAE at VIF < 25 : 36.834028384957406, num_cols = 80
VIF filter MAE at VIF < 35 : 33.75632655090032, num_cols = 94
VIF filter MAE at VIF < 45 : 33.41833445756315, num_cols = 96
VIF filter MAE at VIF < 55 : 32.41634739330211, num_cols = 103
VIF filter MAE at VIF < 65 : 32.31193366401612, num_cols = 104
VIF filter MAE at VIF < 75 : 32.077620646838646, num_cols = 108
VIF filter MAE at VIF < 85 : 32.07254063108549, num_cols = 109
VIF filter MAE at VIF < 95 : 31.804099283293574, num_cols = 110
VIF filter MAE at VIF < 105 : 31.932958556310655, num_cols = 112
VIF filter MAE at VIF < 115 : 31.536837686720702, num_cols = 113
VIF filter MAE at VIF < 125 : 31.27323649287137, num_cols = 116
VIF filter MAE at VIF < 135 : 31.3625070473739, num_cols = 117
VIF filter MAE at VIF < 165 : 31.143320561161595, num_cols = 118
VIF filter MAE at VIF < 175 : 31.37244294346778, nu