# LGBM

In [2]:
import pickle
import pandas as pd
import numpy as np

In [3]:
from lightgbm import LGBMRegressor

In [4]:
import pandas as pd
import datetime as dt
import numpy as np

import pandas as pd
import datetime as dt
import numpy as np

class TrainDataProcessor:
    """I am rewriting this training data processor to process a few more variables differently."""

    def __init__(self, train, revealed_targets, client, historical_weather,
                 forecast_weather, electricity_prices, gas_prices, for_testing=False,
                add_log_cols=False):
        self.add_log_cols = add_log_cols
        self.test_orig_dfs = self.get_test_orig_dfs([train, revealed_targets, client, historical_weather,
                 forecast_weather, electricity_prices, gas_prices])
        
        self.weather_mapping = self.init_weather_mapping()
        
        if not for_testing:
            self.train = self.init_train(train)
            self.revealed_targets = self.init_revealed_targets(revealed_targets)
            self.client = self.init_client(client)
            
            self.historical_weather = self.init_historical_weather(historical_weather)
            self.forecast_weather = self.init_forecast_weather(forecast_weather)
            self.electricity_prices = self.init_electricity(electricity_prices)
            self.gas_prices = self.init_gas_prices(gas_prices)
            
            self.df_all_cols = self.join_data(self.train, self.revealed_targets, self.client, self.historical_weather, self.forecast_weather, self.electricity_prices, self.gas_prices)
            if self.add_log_cols:
                self.df_all_cols = self.create_log_cols(self.df_all_cols)
            self.df = self.remove_cols(self.df_all_cols)
            
        
    def get_test_orig_dfs(self, dfs):
        for i, df in enumerate(dfs):
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.datetime)
                col = 'datetime'
            if 'prediction_datetime' in df.columns:
                df['prediction_datetime'] = pd.to_datetime(df.prediction_datetime)
                col = 'prediction_datetime'
            if 'forecast_date' in df.columns:
                df['forecast_date'] = pd.to_datetime(df['forecast_date'])
                col = 'forecast_date'
            if 'forecast_datetime' in df.columns:
                df['forecast_datetime'] = pd.to_datetime(df['forecast_datetime'])
                col = 'forecast_datetime'
            if 'date' in df.columns:
                df['date'] = pd.to_datetime(df.date).dt.date
                col = 'date'

            test_date = df[col].iloc[-1]  # Assuming test is a DataFrame
            start_date = test_date - pd.Timedelta(days=14)
            historical_subset = df[df[col] >= start_date]
            dfs[i] = historical_subset.copy()
        return dfs
        
    def init_train(self, df):
        """Prepares the training data for model training."""
        try:
            df['datetime'] = pd.to_datetime(df.datetime)
        except Exception as e:
            df['datetime'] = pd.to_datetime(df.prediction_datetime)
        df['date'] = df.datetime.dt.date
            
        # df = self.get_data_block_id(df, 'datetime')
        return df
    
    def add_electricity_lag_features(self, df):
        ##### mean from entire last week
        df.set_index('datetime', inplace=True)
        # Use rolling to calculate mean price of the last week
        # The window is 7 days, min_periods can be set as per requirement
        # 'closed' determines which side of the interval is closed; it can be 'right' or 'left'
        df['mean_euros_per_mwh_last_week'] = df['euros_per_mwh'].rolling(window='7D', min_periods=1, closed='right').mean()
        # Shift the results to align with the requirement of lagging
        df['mean_euros_per_mwh_last_week'] = df['mean_euros_per_mwh_last_week'].shift()
        
        ##### mean from last week this hour only
        # Extract hour from datetime
        df['hour'] = df.index.hour

        # Group by hour and apply rolling mean for each group
        hourly_groups = df.groupby('hour')
        dff = hourly_groups['euros_per_mwh'].rolling(window='7D', min_periods=1, closed='right').mean()#.shift()#.reset_index(level=0, drop=True)
        dff = dff.reset_index().set_index('datetime').groupby('hour')['euros_per_mwh'].shift()
        dff = dff.rename('mean_euros_per_mwh_same_hour_last_week')
        df = df.join(dff)
        #### yesterday's power price
        df['yesterdays_euros_per_mwh'] = df['euros_per_mwh'].shift(24)
        
        ### 24h average
        # Calculate the 24-hour rolling average
        df['euros_per_mwh_24h_average_price'] = df['euros_per_mwh'].rolling(window=24, min_periods=1).mean()

        # Resetting the index if needed
        df.reset_index(inplace=True)
        df = df.drop(['forecast_date', 'origin_date', 'hour'], axis=1)
        return df

    def init_electricity(self, df):
        ## LAG = 1 Day
        ## Move forecast datetime ahead by 1 day
        ## change name to datetime
        df['datetime'] = pd.to_datetime(df['forecast_date'])
        df['datetime'] = df['datetime'] + dt.timedelta(days=1)
        # df = self.get_data_block_id(df, 'datetime')
        df = self.add_electricity_lag_features(df)
        return df
    
    def add_historical_weather_lag_features(self, df):
        ##### LATEST WEATHER
        def add_latest_weather(df):
            # Assuming df is your original DataFrame
            # Step 1: Convert datetime to a Datetime Object
            df['datetime'] = pd.to_datetime(df['datetime'])
            df.set_index('datetime', inplace=True)

            # Step 2: Sorting the Data
            df.sort_values(by=['datetime', 'latitude', 'longitude'], inplace=True)

            # Step 3: Creating a Unique Identifier for each location
            df['location_id'] = df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Step 4: Filtering for 10:00 AM Entries
            df.reset_index(inplace=True)
            df_10am = df[df['datetime'].dt.hour == 10]
            df_10am.set_index('datetime', inplace=True)

            # Step 5: Shifting the Features by 1 day
            lagged_features = df_10am.groupby('location_id').shift(periods=1, freq='D')
            
            # grouped = lagged_features.groupby('county')
            # lagged_features = grouped[weather_features].mean()
            
            
            # Renaming columns to indicate lag
            lagged_features = lagged_features.add_suffix('_hw_lagged')
            lagged_features['location_id'] = lagged_features['location_id_hw_lagged']
            lagged_features.reset_index(inplace=True)
            lagged_features['date'] = lagged_features.datetime.dt.date

            df['date'] = df.datetime.dt.date
            return lagged_features
            # Step 6: Merging Lagged Features with Original DataFrame
            df = df.merge(lagged_features, on=['date', 'location_id'], how='left', suffixes=('', '_hw_lagged'))
            return df
        
        ##### mean from last day
        def add_24h_mean_var(df, weather_features):
            # Calculate the start and end times for each row
            # df['start_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=2) + pd.Timedelta(hours=11)
            # df['end_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=1) + pd.Timedelta(hours=10)
            # df['time_code'] = df['start_time'].astype(str) +'_' + df['end_time'].astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)
            # print(df.time_code)

            # Create a helper column for grouping
            # If the time is before 11:00 AM, subtract a day
            df['group'] = df['datetime'].apply(lambda dt: dt if dt.time() >= pd.to_datetime('11:00').time() else dt - pd.Timedelta(days=1))
            df['group'] = df['group'].dt.date  # Keep only the date part for grouping
            df['group'] = (pd.to_datetime(df['group']) + pd.Timedelta(hours=11)).astype(str) + '_' + (pd.to_datetime(df['group']) + pd.Timedelta(days=1, hours=10)).astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Now group by this new column
            grouped = df.groupby('group')
            means = grouped[weather_features].mean()
            variances = grouped[weather_features].var()

            # Merge means and variances into the original DataFrame
            my_df = df.merge(means, on='group', suffixes=('', '_hw_means'), how='left')
            my_df = my_df.merge(variances, on='group', how='left', suffixes=('', '_hw_variances'))

            return my_df
        
        ##### mean from last day all estonia
        def add_24h_mean_var_estonia(df, weather_features):
            # Calculate the start and end times for each row
            # df['start_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=2) + pd.Timedelta(hours=11)
            # df['end_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=1) + pd.Timedelta(hours=10)
            # df['time_code'] = df['start_time'].astype(str) +'_' + df['end_time'].astype(str)
            # print(df.time_code)

            # Create a helper column for grouping
            # If the time is before 11:00 AM, subtract a day
            df['group'] = df['datetime'].apply(lambda dt: dt if dt.time() >= pd.to_datetime('11:00').time() else dt - pd.Timedelta(days=1))
            df['group'] = df['group'].dt.date  # Keep only the date part for grouping
            df['group'] = (pd.to_datetime(df['group']) + pd.Timedelta(hours=11)).astype(str) + '_' + (pd.to_datetime(df['group']) + pd.Timedelta(days=1, hours=10)).astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Now group by this new column
            grouped = df.groupby('group')
            means = grouped[weather_features].mean()
            variances = grouped[weather_features].var()

            # Merge means and variances into the original DataFrame
            my_df = df.merge(means, on='group', suffixes=('', '_hw_means_estonia'), how='left')
            my_df = my_df.merge(variances, on='group', how='left', suffixes=('', '_hw_variances_estonia'))

            return my_df

        df['datetime'] = pd.to_datetime(df['datetime'])
        weather_features = df.columns.drop(['datetime', 'latitude', 'longitude'])

        # Apply the function
        df = add_24h_mean_var(df, weather_features)    
        df = add_24h_mean_var_estonia(df, weather_features)
           
        latest = add_latest_weather(df)
        df = df.merge(latest, on=['date', 'location_id'], how='left', suffixes=('', '_hw_lagged'))
        
        return df

    def init_historical_weather(self, df):
        ## LAG: From 11:00 AM 2 days ago to 10:00 AM 1 day ago
        ## What to do? Give most recent weather forecast? Give average over the last day?
        """
        Processes the historical weather data.
        """
        df['datetime'] = pd.to_datetime(df.datetime)
        
        
        
        df = self.add_historical_weather_lag_features(df)
        
        df = df.merge(self.weather_mapping, how='inner', on=('latitude', 'longitude'))
        
        return df

    def init_forecast_weather(self, df):
        ## LAG: DON't ADJUST
        ##      The forecast is from yesterday, but can forecast today, which is 22 hours ahead
        ## Drop any columns where:
        ##                        hours_ahead < 22 and hours_ahead > 45
        ## Then rename forecast_datetime to datetime and join on datetime
        """
        Processes the forecast weather data.
        """
        df['datetime'] = pd.to_datetime(df['forecast_datetime'])
        # keep only datetimes from our relevant period
        df = df[(df['hours_ahead'] < 46) & (df['hours_ahead'] > 21)]
        df['datetime'] = df['datetime'] + dt.timedelta(days=1)
        df = df.merge(self.weather_mapping, how='inner', on=('latitude', 'longitude'))
        return df
    
    def add_gas_prices_lag_features(self, df):
        df['date'] = pd.to_datetime(df['date'])
        df.set_index('date', inplace=True)

        # Sort the DataFrame by date, if it's not already sorted
        df.sort_index(inplace=True)

        # Calculate rolling averages for different time windows
        df['lowest_price_3d_avg'] = df['lowest_price_per_mwh'].rolling(window=3).mean()
        df['highest_price_3d_avg'] = df['highest_price_per_mwh'].rolling(window=3).mean()

        df['lowest_price_7d_avg'] = df['lowest_price_per_mwh'].rolling(window=7).mean()
        df['highest_price_7d_avg'] = df['highest_price_per_mwh'].rolling(window=7).mean()

        df['lowest_price_14d_avg'] = df['lowest_price_per_mwh'].rolling(window=14).mean()
        df['highest_price_14d_avg'] = df['highest_price_per_mwh'].rolling(window=14).mean()

        # Reset the index if you want the 'date' column back
        df.reset_index(inplace=True)
        return df

    def init_gas_prices(self, df):
        ## LAG: 1 DAY
        ## Predictions are made from 2 days ago and predict for yesterday
        ## add one day to forecast_date
        ## Rename forecast_date to date, join on date
        """
        Processes the gas prices data.
        Implement the logic to handle gas prices data processing here.
        """
        df['date'] = pd.to_datetime(df['forecast_date']).dt.date
        df['date'] = df['date'] + dt.timedelta(days=1)
        df = self.add_gas_prices_lag_features(df)
        return df
    
    def add_revealed_target_features(self, df):
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['hour'] = df.datetime.dt.hour
        df['day'] = df.datetime.dt.dayofweek
        df.set_index('datetime', inplace=True)
        
        # let me add some new features here too
        # Adding lag features
        # Step 2: Sorting the Data
        df.sort_values(by=['datetime'], inplace=True)

        # Step 3: Creating a Unique Identifier for each location
        df['id'] = df['county'].astype(str) + '_' + df['is_business'].astype(str) + '_' + df['product_type'].astype(str) + '_' + df['is_consumption'].astype(str)
        lagged_features = []
        lagged_hours = []
        ### Defining lagged target features

        for lag_hours in range(1, 24):
            lagged_feature = df.groupby('id').shift(periods=lag_hours, freq='H')
            lagged_features.append(lagged_feature)
            lagged_hours.append(lag_hours)

        for lag_hours in ([i*24 for i in range(1,8)] + [24*11, 24*12]):
            lagged_feature = df.groupby('id').shift(periods=lag_hours, freq='H')
            lagged_features.append(lagged_feature)
            lagged_hours.append(lag_hours)
            
        df.reset_index(inplace=True)
        for lagged_feature, lag_hours in zip(lagged_features, lagged_hours):
            lagged_feature.reset_index(inplace=True)
            lagged_feature.dropna(inplace=True)
            df = df.merge(lagged_feature[['datetime', 'target', 'id']], on=['id', 'datetime'], how='left', suffixes=('', f'_lag_{lag_hours}h'))

        df.set_index('datetime', inplace=True)
        
        

        window_size = 7
        # Group by the specified columns and then apply the rolling mean
        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])
        df['target_rolling_avg_24h'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption', 'hour'])
        df['target_rolling_avg_hour_7d'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        # grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption', 'hour', 'day'])
        # df['target_rolling_avg_hour_hour_day_4w'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption'])
        df['target_rolling_allp_avg_24h'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption', 'hour'])
        df['target_rolling_allp_avg_hour_7d'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption', 'hour', 'day'])
        df['target_rolling_allp_avg_hour_hour_day_4w'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())
        
        #All of estonia
        grouped = df.groupby(['is_business', 'product_type', 'is_consumption'])
        df['target_rolling_avg_24h_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['is_business', 'product_type', 'is_consumption', 'hour'])
        df['target_rolling_avg_hour_7d_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        # grouped = df.groupby(['is_business', 'product_type', 'is_consumption', 'hour', 'day'])
        # df['target_rolling_avg_hour_hour_day_4w_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())

        grouped = df.groupby(['is_business', 'is_consumption'])
        df['target_rolling_allp_avg_24h_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['is_business', 'is_consumption', 'hour'])
        df['target_rolling_allp_avg_hour_7d_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        # grouped = df.groupby(['is_business', 'is_consumption', 'hour', 'day'])
        # df['target_rolling_allp_avg_hour_hour_day_4w_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())
        
        df = df.drop(['hour', 'day'], axis=1)

        return df
    
    def init_revealed_targets(self, df):
        df['datetime'] = pd.to_datetime(df.datetime)
        df['datetime'] = df['datetime'] + dt.timedelta(days=2)
        df = self.add_revealed_target_features(df)
        return df
    
    def init_client(self, df):
        ## LAG: 2 days
        ## Add 2 days to date, join on date
        df['date'] = pd.to_datetime(df.date).dt.date
        df['date'] = df['date'] + dt.timedelta(days=2)
        # df = self.get_data_block_id(df, 'date')
        return df

    def init_weather_mapping(self):
        # https://www.kaggle.com/code/tsunotsuno/enefit-eda-baseline/notebook#Baseline
        county_point_map = {
            0: (59.4, 24.7), # "HARJUMAA"
            1 : (58.8, 22.7), # "HIIUMAA"
            2 : (59.1, 27.2), # "IDA-VIRUMAA"
            3 : (58.8, 25.7), # "JÄRVAMAA"
            4 : (58.8, 26.2), # "JÕGEVAMAA"
            5 : (59.1, 23.7), # "LÄÄNE-VIRUMAA"
            6 : (59.1, 23.7), # "LÄÄNEMAA"
            7 : (58.5, 24.7), # "PÄRNUMAA"
            8 : (58.2, 27.2), # "PÕLVAMAA"
            9 : (58.8, 24.7), # "RAPLAMAA"
            10 : (58.5, 22.7),# "SAAREMAA"
            11 : (58.5, 26.7),# "TARTUMAA"
            12 : (58.5, 25.2),# "UNKNOWNN" (center of the map)
            13 : (57.9, 26.2),# "VALGAMAA"
            14 : (58.2, 25.7),# "VILJANDIMAA"
            15 : (57.9, 27.2) # "VÕRUMAA"
        }
        # Convert the dictionary to a list of tuples
        data = [(county_code, lat, lon) for county_code, (lat, lon) in county_point_map.items()]

        # Create DataFrame
        df = pd.DataFrame(data, columns=['county', 'latitude', 'longitude'])
        
        return df
    
    def add_date_features(self, df):
        df['year'] = df['datetime'].dt.year
        df['month'] = df['datetime'].dt.month
        df['day'] = df['datetime'].dt.day
        df['hour'] = df['datetime'].dt.hour
        df['quarter'] = df['datetime'].dt.quarter
        df['day_of_week'] = df['datetime'].dt.day_of_week
        df['day_of_year'] = df['datetime'].dt.dayofyear
        df['week_of_year'] = df['datetime'].dt.isocalendar().week
        df['is_weekend'] = df['datetime'].dt.day_of_week >= 5
        df['is_month_start'] = df['datetime'].dt.is_month_start
        df['is_month_end'] = df['datetime'].dt.is_month_end
        df['is_quarter_start'] = df['datetime'].dt.is_quarter_start
        df['is_quarter_end'] = df['datetime'].dt.is_quarter_end
        df['is_year_start'] = df['datetime'].dt.is_year_start
        df['is_year_end'] = df['datetime'].dt.is_year_end
        df['season'] = df['datetime'].dt.month % 12 // 3 + 1
        df['hour_sin'] = np.sin(df['datetime'].dt.hour * (2. * np.pi / 24))
        df['hour_cos'] = np.cos(df['datetime'].dt.hour * (2. * np.pi / 24))
        # Calculate sin and cos for day of year
        days_in_year = 365.25  # accounts for leap year
        df['day_of_year_sin'] = np.sin((df['day_of_year'] - 1) * (2 * np.pi / days_in_year))
        df['day_of_year_cos'] = np.cos((df['day_of_year'] - 1) * (2 * np.pi / days_in_year))
        return df
    
    def add_ee_holidays(self, df):
        import holidays
        # Define Estonia public holidays
        ee_holidays = holidays.CountryHoliday('EE')
        
        print(df['date'].isna().sum())
        
        def find_problem(x):
            try:
                return x in ee_holidays
            except Exception as e:
                print(x)
                raise e

        # Function to check if the date is a holiday
        df['is_ee_holiday'] = df['date'].apply(lambda x: x in ee_holidays)

        return df
    
    def create_log_cols(self, df):
        log_cols = ['target_lag_1h', 'target_lag_2h', 'target_lag_3h', 'target_lag_4h',
       'target_lag_5h', 'target_lag_6h', 'target_lag_7h', 'target_lag_8h',
       'target_lag_9h', 'target_lag_10h', 'target_lag_11h', 'target_lag_12h',
       'target_lag_13h', 'target_lag_14h', 'target_lag_15h', 'target_lag_16h',
       'target_lag_17h', 'target_lag_18h', 'target_lag_19h', 'target_lag_20h',
       'target_lag_21h', 'target_lag_22h', 'target_lag_23h', 'target_lag_24h',
       'target_lag_48h', 'target_lag_72h', 'target_lag_96h', 'target_lag_120h',
       'target_lag_144h', 'target_lag_168h', 'target_lag_264h',
       'target_lag_288h', 'eic_count', 'installed_capacity', 'temperature', 'dewpoint', 'rain',
       'snowfall', 'surface_pressure', 'cloudcover_total', 'cloudcover_low',
       'cloudcover_mid', 'cloudcover_high', 'windspeed_10m',
       'winddirection_10m', 'shortwave_radiation', 'direct_solar_radiation',
       'diffuse_radiation', 'temperature_hw_means', 'dewpoint_hw_means',
       'rain_hw_means', 'snowfall_hw_means', 'surface_pressure_hw_means',
       'cloudcover_total_hw_means', 'cloudcover_low_hw_means',
       'cloudcover_mid_hw_means', 'cloudcover_high_hw_means',
       'windspeed_10m_hw_means', 'winddirection_10m_hw_means',
       'shortwave_radiation_hw_means', 'direct_solar_radiation_hw_means',
       'diffuse_radiation_hw_means', 'temperature_hw_variances',
       'dewpoint_hw_variances', 'rain_hw_variances', 'snowfall_hw_variances',
       'surface_pressure_hw_variances', 'cloudcover_total_hw_variances',
       'cloudcover_low_hw_variances', 'cloudcover_mid_hw_variances',
       'cloudcover_high_hw_variances', 'windspeed_10m_hw_variances',
       'winddirection_10m_hw_variances', 'shortwave_radiation_hw_variances',
       'direct_solar_radiation_hw_variances', 'diffuse_radiation_hw_variances',
       'temperature_hw_lagged', 'dewpoint_hw_lagged', 'rain_hw_lagged',
       'snowfall_hw_lagged', 'surface_pressure_hw_lagged',
       'cloudcover_total_hw_lagged', 'cloudcover_low_hw_lagged', 'cloudcover_mid_hw_lagged',
       'cloudcover_high_hw_lagged', 'windspeed_10m_hw_lagged',
       'winddirection_10m_hw_lagged', 'shortwave_radiation_hw_lagged',
       'direct_solar_radiation_hw_lagged', 'diffuse_radiation_hw_lagged',
       'temperature_hw_means_hw_lagged', 'dewpoint_hw_means_hw_lagged',
       'rain_hw_means_hw_lagged', 'snowfall_hw_means_hw_lagged',
       'surface_pressure_hw_means_hw_lagged',
       'cloudcover_total_hw_means_hw_lagged',
       'cloudcover_low_hw_means_hw_lagged',
       'cloudcover_mid_hw_means_hw_lagged',
       'cloudcover_high_hw_means_hw_lagged',
       'windspeed_10m_hw_means_hw_lagged',
       'winddirection_10m_hw_means_hw_lagged',
       'shortwave_radiation_hw_means_hw_lagged',
       'direct_solar_radiation_hw_means_hw_lagged',
       'diffuse_radiation_hw_means_hw_lagged',
       'temperature_hw_variances_hw_lagged', 'dewpoint_hw_variances_hw_lagged',
       'rain_hw_variances_hw_lagged', 'snowfall_hw_variances_hw_lagged',
       'surface_pressure_hw_variances_hw_lagged',
       'cloudcover_total_hw_variances_hw_lagged',
       'cloudcover_low_hw_variances_hw_lagged',
       'cloudcover_mid_hw_variances_hw_lagged',
       'cloudcover_high_hw_variances_hw_lagged',
       'windspeed_10m_hw_variances_hw_lagged',
       'winddirection_10m_hw_variances_hw_lagged',
       'shortwave_radiation_hw_variances_hw_lagged',
       'direct_solar_radiation_hw_variances_hw_lagged',
       'diffuse_radiation_hw_variances_hw_lagged', 'temperature_fw', 'dewpoint_fw', 'cloudcover_high_fw',
       'cloudcover_low_fw', 'cloudcover_mid_fw', 'cloudcover_total_fw',
       '10_metre_u_wind_component', '10_metre_v_wind_component',
       'direct_solar_radiation_fw', 'surface_solar_radiation_downwards',
       'snowfall_fw', 'total_precipitation', 'euros_per_mwh', 'mean_euros_per_mwh_last_week',
       'mean_euros_per_mwh_same_hour_last_week', 'yesterdays_euros_per_mwh',
       'euros_per_mwh_24h_average_price', 'lowest_price_per_mwh',
       'highest_price_per_mwh', 'lowest_price_3d_avg', 'highest_price_3d_avg',
       'lowest_price_7d_avg', 'highest_price_7d_avg', 'lowest_price_14d_avg',
       'highest_price_14d_avg']
        
        log_cols = [col for col in log_cols if col in df.columns]
        
        dff = np.log1p(df[log_cols] )
        dff.rename(columns={col: col + "_log" for col in log_cols}, inplace=True)
        return pd.concat([df, dff], axis=1)
        
    
    def remove_cols(self, df):
        col_list = ['datetime',
                   'row_id',
                   'prediction_unit_id',
                    'date_train',
                    'hour_part',
                   'date_client',
                    'forecast_date_elec_price',
                    'origin_date_elec_price',
                    'forecast_date_gas_price',
                    'origin_date_gas_price',
                    'datetime_hist_weath',
                   'hour_part_hist_weath_latest',
                    'datetime_hist_weath_latest',
                   'origin_datetime',
                   'hour_part_fore_weath',
                    'datetime',
                    'id',
                     'data_block_id',
                     'row_id',
                     'prediction_unit_id',
                     'date',
                    'data_block_id_rt',
                     'row_id_rt',
                     'prediction_unit_id_rt',
                    'data_block_id_client',
                    'latitude',
                     'longitude',
                     'data_block_id_hw',
                    'start_time',
                     'end_time',
                     'time_code',
                     'group',
                    'data_block_id_hw_means',
                    'data_block_id_hw_variances',
                     'location_id',
                     'date_hw',
                     'datetime_hw_lagged',
                    'latitude_hw_lagged',
                     'longitude_hw_lagged',
                     'data_block_id_hw_lagged',
                     'start_time_hw_lagged',
                     'end_time_hw_lagged',
                     'time_code_hw_lagged',
                     'group_hw_lagged',
                    'data_block_id_hw_means_hw_lagged',
                    'data_block_id_hw_variances_hw_lagged',
                    'location_id_hw_lagged',
                     'latitude_fw',
                     'longitude_fw',
                     'origin_datetime',
                    'data_block_id_fw',
                     'forecast_datetime',
                    'data_block_id_elec',
                    'forecast_date',
                    'origin_date',
                     'data_block_id_gasp',
                   ]
        columns_to_drop = [col for col in col_list if col in df.columns]
        df = df.drop(columns_to_drop, axis=1)
        return df
    
    def remove_test_cols(self, df):
        col_list = ['datetime',
                   'prediction_unit_id',
                    'date_train',
                    'hour_part',
                   'date_client',
                    'forecast_date_elec_price',
                    'origin_date_elec_price',
                    'forecast_date_gas_price',
                    'origin_date_gas_price',
                    'datetime_hist_weath',
                   'hour_part_hist_weath_latest',
                    'datetime_hist_weath_latest',
                   'origin_datetime',
                   'hour_part_fore_weath',
                    'datetime',
                     'data_block_id',
                     'row_id',
                     'prediction_unit_id',
                     'date',
                    'data_block_id_rt',
                     'row_id_rt',
                     'prediction_unit_id_rt',
                    'data_block_id_client',
                    'latitude',
                     'longitude',
                     'data_block_id_hw',
                    'start_time',
                     'end_time',
                     'time_code',
                     'group',
                    'data_block_id_hw_means',
                    'data_block_id_hw_variances',
                     'location_id',
                     'date_hw',
                     'datetime_hw_lagged',
                    'latitude_hw_lagged',
                     'longitude_hw_lagged',
                     'data_block_id_hw_lagged',
                     'start_time_hw_lagged',
                     'end_time_hw_lagged',
                     'time_code_hw_lagged',
                     'group_hw_lagged',
                    'data_block_id_hw_means_hw_lagged',
                    'data_block_id_hw_variances_hw_lagged',
                    'location_id_hw_lagged',
                     'latitude_fw',
                     'longitude_fw',
                     'origin_datetime',
                    'data_block_id_fw',
                     'forecast_datetime',
                    'data_block_id_elec',
                    'forecast_date',
                    'origin_date',
                     'data_block_id_gasp',
                    'id'
                   ]
        columns_to_drop = [col for col in col_list if col in df.columns]
        df = df.drop(columns_to_drop, axis=1)
        return df
    
    def join_data(self, train, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices):
        df = train
        df = df.merge(revealed_targets, how='left', on=('datetime', 'county', 'is_business', 'product_type', 'is_consumption'), suffixes=('', '_rt'))
        df = df.merge(client, how='left', on=('date', 'county', 'is_business', 'product_type'), suffixes=('', '_client'))
        df = df.merge(historical_weather, how='left', on=('datetime', 'county'), suffixes=('', '_hw'))
        df = df.merge(forecast_weather, how='left', on=('datetime', 'county'), suffixes=('', '_fw'))
        df = df.merge(electricity_prices, how='left', on='datetime', suffixes=('', '_elec'))
        df['date'] = pd.to_datetime(df['date'])
        df = df.merge(gas_prices, how='left', on='date', suffixes=('', '_gasp'))
        df = self.add_date_features(df)
        df = self.add_ee_holidays(df)
        return df
    
    def add_test_data(self, test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices):
        dfs = [test.copy(), revealed_targets, client, historical_weather,
                 forecast_weather, electricity_prices, gas_prices]
        for i, df in enumerate(dfs):
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.datetime)
                col = 'datetime'
            if 'prediction_datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.prediction_datetime)
                col = 'datetime'
            if 'forecast_date' in df.columns:
                df['forecast_date'] = pd.to_datetime(df['forecast_date'])
                col = 'forecast_date'
            if 'forecast_datetime' in df.columns:
                df['forecast_datetime'] = pd.to_datetime(df['forecast_datetime'])
                col = 'forecast_datetime'
                
            self.test_orig_dfs[i] = pd.concat([ self.test_orig_dfs[i], df ])          
        
        
    
    def process_test_data_timestep(self, test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices):
        #append test data to test data cache
        self.add_test_data(test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices)
        # process test data
        test = self.init_train(self.test_orig_dfs[0])
        revealed_targets = self.init_revealed_targets(self.test_orig_dfs[1])
        client = self.init_client(self.test_orig_dfs[2])
        historical_weather = self.init_historical_weather(self.test_orig_dfs[3])
        forecast_weather = self.init_forecast_weather(self.test_orig_dfs[4])
        electricity_prices = self.init_electricity(self.test_orig_dfs[5])
        gas_prices = self.init_gas_prices(self.test_orig_dfs[6])
        df_all_cols = self.join_data(test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices)
        if self.add_log_cols:
            df_all_cols = self.create_log_cols(df_all_cols)
        df = self.remove_test_cols(df_all_cols)
        return df
        

In [5]:
import pickle

with open('data_processor_lgbm2_new_pandas.pkl', 'rb') as f:
    data_processor = pickle.load(f)
data_processor.df

Unnamed: 0,county,is_business,product_type,target,is_consumption,target_rt,target_lag_1h,target_lag_2h,target_lag_3h,target_lag_4h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,0.713,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
1,0,0,1,96.590,1,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
2,0,0,2,0.000,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
3,0,0,2,17.314,1,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
4,0,0,3,2.904,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018609,15,1,0,197.233,1,184.072,171.092,168.933,174.920,170.068,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018610,15,1,1,0.000,0,0.000,0.000,2.501,25.884,83.535,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018611,15,1,1,28.404,1,38.646,47.690,34.806,29.202,21.654,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018612,15,1,3,0.000,0,0.000,0.000,4.512,34.657,122.195,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False


In [6]:
def fill_drop_na(df):
    df = df[~df.target.isna()]
    df = df[~df.target_rolling_avg_24h.isna()]
    means = df.mean()
    # For each column, add an indicator column for NA values
    # for col in df.columns:
    #     if df[col].isna().any():
    #         df[f'{col}_is_na'] = df[col].isna()
    df = df.fillna(means)
    return df, means

In [7]:
%%time
processed_df_no_na, means = fill_drop_na(data_processor.df)
processed_df_no_na.isna().sum()

CPU times: total: 3.69 s
Wall time: 8.25 s


county             0
is_business        0
product_type       0
target             0
is_consumption     0
                  ..
hour_sin           0
hour_cos           0
day_of_year_sin    0
day_of_year_cos    0
is_ee_holiday      0
Length: 240, dtype: int64

In [8]:
from datetime import datetime

cv_ranges_corrected = [
    ('2022-09-01', '2022-10-24'), 
    ('2022-10-25', '2022-12-17'), 
    ('2022-12-18', '2023-02-09'), 
    ('2023-02-10', '2023-04-04'), 
    ('2023-04-05', '2023-05-31')
]

# Function to convert a date string into a datetime object
def to_datetime(date_str):
    return datetime.strptime(date_str, '%Y-%m-%d')

# Converting the date strings in cv_ranges to datetime objects
datetime_cv_ranges = [(to_datetime(start), to_datetime(end)) for start, end in cv_ranges_corrected]
datetime_cv_ranges

date_filter = data_processor.df_all_cols.date[processed_df_no_na.index]
date_filter

cv1_train = processed_df_no_na[date_filter <= datetime_cv_ranges[0][0]]
cv1_test = processed_df_no_na[(date_filter <= datetime_cv_ranges[0][1]) & (date_filter > datetime_cv_ranges[0][0])]

## Hyperparam search

In [9]:
from sklearn.model_selection import TimeSeriesSplit

In [10]:
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV


In [25]:
%%time

# https://www.kaggle.com/code/chaozhuang/enefit-eda-w-fft-ssa-arima-lgbm?scriptVersionId=156414824#Predictive-Modelling
from lightgbm import LGBMRegressor
import random
import lightgbm as lgb
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

def tune_lgbm_model(base_params, X_train, y_train, n_iter=8, cv=3):
    """
    Tune a LightGBM model based on a base set of parameters.

    :param base_params: Dictionary of base parameters for the model
    :param X_train: Training features
    :param y_train: Training target variable
    :param n_iter: Number of iterations for RandomizedSearchCV
    :param cv: Number of cross-validation folds
    :return: Best estimator and best parameters
    """
    # Parameter distributions for random search
    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]
    
    param_dist = {
        'learning_rate': sp_uniform(0.005, 0.5),
        'lambda_l1': sp_uniform(0, 4), 
        'lambda_l2': sp_uniform(0, 4), 
        'max_bin': sp_randint(100, 250),
        'min_data_in_leaf': sp_randint(15, 300),
        'n_estimators': sp_randint(2500, 6000),
        # 'num_leaves': sp_randint(25, 150),
        
        # 'colsample_bytree' : sp_uniform(0.1, 1),
        # 'colsample_bynode' : sp_uniform(0.1, 1),
        # 'data_sample_strategy' : ['bagging', 'goss'],
        
        # 'drop_rate': sp_uniform(0, 1),
        # 'skip_drop': sp_uniform(0, 1),
        # 'min_data_per_group': sp_randint(10, 200),
        # 'max_cat_threshold': sp_randint(10, 100),
        # 'cat_l2': sp_randint(10, 100),
        # 'cat_smooth': sp_randint(10, 100),
    }

    # Create a LightGBM regressor object
    lgb_reg = lgb.LGBMRegressor(**base_params)

    # Create a RandomizedSearchCV object
    random_search = HalvingRandomSearchCV(estimator=lgb_reg, param_distributions=param_dist,
                                       scoring='neg_mean_absolute_error',
                                       cv=TimeSeriesSplit(n_splits=cv), random_state=8888, verbose=1,
                                         aggressive_elimination= True,
                                         max_resources=100, min_resources=5, )

    # producer_mask = X_train['is_consumption'] == 0
    results_dict = {}
    # producer
    # X_train_producer = X_train[producer_mask]
    # y_train_producer = y_train[producer_mask]
    # Fit the random search to the data
    random_search.fit(X_train, y_train, categorical_feature=cat_features)

    # Return the best estimator and best parameters
    results_dict['best_estimator'] = random_search.best_estimator_
    results_dict['best_params'] = random_search.best_params_
    
    
#     random_search = HalvingRandomSearchCV(estimator=lgb_reg, param_distributions=param_dist,
#                                        scoring='neg_mean_absolute_error',
#                                        cv=cv, random_state=2024, verbose=1,
#                                          aggressive_elimination= True,
#                                          max_resources=20000, min_resources=5)
#     # consumer
#     X_train_consumer = X_train[~producer_mask]
#     y_train_consumer = y_train[~producer_mask]
#     # Fit the random search to the data
#     random_search.fit(X_train_consumer, y_train_consumer, categorical_feature=cat_features)

#     # Return the best estimator and best parameters
#     results_dict['consumer_best_estimator'] = random_search.best_estimator_
#     results_dict['consumer_best_params'] = random_search.best_params_
    
    return results_dict

base_params_p1 = {
    'verbose': -1,
    'metric': 'mae',
    'n_jobs': 24,
    'boosting': 'dart',
    'objective': 'tweedie'
}

i=4
train = processed_df_no_na[(date_filter <= to_datetime('2023-03-30'))]
val = processed_df_no_na[(date_filter > to_datetime('2023-03-30'))]
print(f"Fold {i}")
print(f"Train rows: {len(train)}")
print(f"Val rows: {len(val)}")

target_cols = ['target']
drop_cols = ['target', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
            'snowfall_fw', 'snowfall_hw_means']

df_train_target = train[target_cols]
df_train_data = train.drop(drop_cols, axis=1)

df_val_target2 = val[target_cols]
df_val_data2 = val.drop(drop_cols, axis=1)

cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
cat_features = [c for c in cat_features if c in df_train_data.columns]

# Fit the model
results_dict = tune_lgbm_model(base_params_p1, df_train_data, df_train_target["target"])

print("Best parameters:", results_dict['best_params'])

Fold 4
Train rows: 1805830
Val rows: 195264
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 5
max_resources_: 100
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 20
n_resources: 5
Fitting 3 folds for each of 20 candidates, totalling 60 fits
----------
iter: 1
n_candidates: 7
n_resources: 15
Fitting 3 folds for each of 7 candidates, totalling 21 fits
----------
iter: 2
n_candidates: 3
n_resources: 45
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters: {'lambda_l1': 2.388935540285825, 'lambda_l2': 0.6070283817241626, 'learning_rate': 0.2514296168463765, 'max_bin': 188, 'min_data_in_leaf': 124, 'n_estimators': 4569}
CPU times: total: 4h 57min 37s
Wall time: 14min 54s


In [26]:
%%time

# https://www.kaggle.com/code/chaozhuang/enefit-eda-w-fft-ssa-arima-lgbm?scriptVersionId=156414824#Predictive-Modelling
from lightgbm import LGBMRegressor
import random
import lightgbm as lgb
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

def tune_lgbm_model(base_params, X_train, y_train, n_iter=8, cv=3):
    """
    Tune a LightGBM model based on a base set of parameters.

    :param base_params: Dictionary of base parameters for the model
    :param X_train: Training features
    :param y_train: Training target variable
    :param n_iter: Number of iterations for RandomizedSearchCV
    :param cv: Number of cross-validation folds
    :return: Best estimator and best parameters
    """
    # Parameter distributions for random search
    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]
    
    param_dist = {
        'learning_rate': sp_uniform(0.005, 0.5),
        'lambda_l1': sp_uniform(0, 4), 
        'lambda_l2': sp_uniform(0, 4), 
        'max_bin': sp_randint(100, 250),
        'min_data_in_leaf': sp_randint(15, 300),
        'n_estimators': sp_randint(2500, 6000),
        # 'num_leaves': sp_randint(25, 150),
        
        # 'colsample_bytree' : sp_uniform(0.1, 1),
        # 'colsample_bynode' : sp_uniform(0.1, 1),
        # 'data_sample_strategy' : ['bagging', 'goss'],
        
        # 'drop_rate': sp_uniform(0, 1),
        # 'skip_drop': sp_uniform(0, 1),
        # 'min_data_per_group': sp_randint(10, 200),
        # 'max_cat_threshold': sp_randint(10, 100),
        # 'cat_l2': sp_randint(10, 100),
        # 'cat_smooth': sp_randint(10, 100),
    }

    # Create a LightGBM regressor object
    lgb_reg = lgb.LGBMRegressor(**base_params)

    # Create a RandomizedSearchCV object
    random_search = HalvingRandomSearchCV(estimator=lgb_reg, param_distributions=param_dist,
                                       scoring='neg_mean_absolute_error',
                                       cv=TimeSeriesSplit(n_splits=cv), random_state=8888, verbose=1,
                                         aggressive_elimination= True,
                                         max_resources=1000, min_resources=5, )

    # producer_mask = X_train['is_consumption'] == 0
    results_dict = {}
    # producer
    # X_train_producer = X_train[producer_mask]
    # y_train_producer = y_train[producer_mask]
    # Fit the random search to the data
    random_search.fit(X_train, y_train, categorical_feature=cat_features)

    # Return the best estimator and best parameters
    results_dict['best_estimator'] = random_search.best_estimator_
    results_dict['best_params'] = random_search.best_params_
    
    
#     random_search = HalvingRandomSearchCV(estimator=lgb_reg, param_distributions=param_dist,
#                                        scoring='neg_mean_absolute_error',
#                                        cv=cv, random_state=2024, verbose=1,
#                                          aggressive_elimination= True,
#                                          max_resources=20000, min_resources=5)
#     # consumer
#     X_train_consumer = X_train[~producer_mask]
#     y_train_consumer = y_train[~producer_mask]
#     # Fit the random search to the data
#     random_search.fit(X_train_consumer, y_train_consumer, categorical_feature=cat_features)

#     # Return the best estimator and best parameters
#     results_dict['consumer_best_estimator'] = random_search.best_estimator_
#     results_dict['consumer_best_params'] = random_search.best_params_
    
    return results_dict

base_params_p1 = {
    'verbose': -1,
    'metric': 'mae',
    'n_jobs': 24,
    'boosting': 'dart',
    'objective': 'tweedie'
}

i=4
train = processed_df_no_na[(date_filter <= to_datetime('2023-03-30'))]
val = processed_df_no_na[(date_filter > to_datetime('2023-03-30'))]
print(f"Fold {i}")
print(f"Train rows: {len(train)}")
print(f"Val rows: {len(val)}")

target_cols = ['target']
drop_cols = ['target', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
            'snowfall_fw', 'snowfall_hw_means']

df_train_target = train[target_cols]
df_train_data = train.drop(drop_cols, axis=1)

df_val_target2 = val[target_cols]
df_val_data2 = val.drop(drop_cols, axis=1)

cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
cat_features = [c for c in cat_features if c in df_train_data.columns]

# Fit the model
results_dict = tune_lgbm_model(base_params_p1, df_train_data, df_train_target["target"])

print("Best parameters:", results_dict['best_params'])

Fold 4
Train rows: 1805830
Val rows: 195264
n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 5
max_resources_: 1000
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 200
n_resources: 5
Fitting 3 folds for each of 200 candidates, totalling 600 fits
----------
iter: 1
n_candidates: 67
n_resources: 15
Fitting 3 folds for each of 67 candidates, totalling 201 fits
----------
iter: 2
n_candidates: 23
n_resources: 45
Fitting 3 folds for each of 23 candidates, totalling 69 fits
----------
iter: 3
n_candidates: 8
n_resources: 135
Fitting 3 folds for each of 8 candidates, totalling 24 fits
----------
iter: 4
n_candidates: 3
n_resources: 405
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters: {'lambda_l1': 1.9178044505870369, 'lambda_l2': 1.3167397171985806, 'learning_rate': 0.12287415335595941, 'max_bin': 161, 'min_data_in_leaf': 20, 'n_estimators': 5663}
CPU times: total: 7h 25min 1s
Wall time: 25min 56s


In [12]:
%%time

# https://www.kaggle.com/code/chaozhuang/enefit-eda-w-fft-ssa-arima-lgbm?scriptVersionId=156414824#Predictive-Modelling
from lightgbm import LGBMRegressor
import random
import lightgbm as lgb
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

def tune_lgbm_model(base_params, X_train, y_train, n_iter=8, cv=5):
    """
    Tune a LightGBM model based on a base set of parameters.

    :param base_params: Dictionary of base parameters for the model
    :param X_train: Training features
    :param y_train: Training target variable
    :param n_iter: Number of iterations for RandomizedSearchCV
    :param cv: Number of cross-validation folds
    :return: Best estimator and best parameters
    """
    # Parameter distributions for random search
    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]
    
    param_dist = {
        'learning_rate': sp_uniform(0.005, 0.5),
        'lambda_l1': sp_uniform(0, 4), 
        'lambda_l2': sp_uniform(0, 4), 
        'max_bin': sp_randint(100, 250),
        'min_data_in_leaf': sp_randint(15, 300),
        'n_estimators': sp_randint(2500, 6000),
        # 'num_leaves': sp_randint(25, 150),
        
        # 'colsample_bytree' : sp_uniform(0.1, 1),
        # 'colsample_bynode' : sp_uniform(0.1, 1),
        # 'data_sample_strategy' : ['bagging', 'goss'],
        
        # 'drop_rate': sp_uniform(0, 1),
        # 'skip_drop': sp_uniform(0, 1),
        # 'min_data_per_group': sp_randint(10, 200),
        # 'max_cat_threshold': sp_randint(10, 100),
        # 'cat_l2': sp_randint(10, 100),
        # 'cat_smooth': sp_randint(10, 100),
    }

    # Create a LightGBM regressor object
    lgb_reg = lgb.LGBMRegressor(**base_params)

    # Create a RandomizedSearchCV object
    random_search = HalvingRandomSearchCV(estimator=lgb_reg, param_distributions=param_dist,
                                       scoring='neg_mean_absolute_error',
                                       cv=TimeSeriesSplit(n_splits=cv), random_state=8888, verbose=1,
                                         aggressive_elimination= True,
                                         max_resources=20000, min_resources=5, )

    # producer_mask = X_train['is_consumption'] == 0
    results_dict = {}
    # producer
    # X_train_producer = X_train[producer_mask]
    # y_train_producer = y_train[producer_mask]
    # Fit the random search to the data
    random_search.fit(X_train, y_train, categorical_feature=cat_features)

    # Return the best estimator and best parameters
    results_dict['best_estimator'] = random_search.best_estimator_
    results_dict['best_params'] = random_search.best_params_
    
    
#     random_search = HalvingRandomSearchCV(estimator=lgb_reg, param_distributions=param_dist,
#                                        scoring='neg_mean_absolute_error',
#                                        cv=cv, random_state=2024, verbose=1,
#                                          aggressive_elimination= True,
#                                          max_resources=20000, min_resources=5)
#     # consumer
#     X_train_consumer = X_train[~producer_mask]
#     y_train_consumer = y_train[~producer_mask]
#     # Fit the random search to the data
#     random_search.fit(X_train_consumer, y_train_consumer, categorical_feature=cat_features)

#     # Return the best estimator and best parameters
#     results_dict['consumer_best_estimator'] = random_search.best_estimator_
#     results_dict['consumer_best_params'] = random_search.best_params_
    
    return results_dict

base_params_p1 = {
    'verbose': -1,
    'metric': 'mae',
    'n_jobs': 24,
    'boosting': 'dart',
    'objective': 'tweedie'
}

i=4
train = processed_df_no_na[(date_filter <= to_datetime('2023-03-30'))]
val = processed_df_no_na[(date_filter > to_datetime('2023-03-30'))]
print(f"Fold {i}")
print(f"Train rows: {len(train)}")
print(f"Val rows: {len(val)}")

target_cols = ['target']
drop_cols = ['target', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
            'snowfall_fw', 'snowfall_hw_means']

df_train_target = train[target_cols]
df_train_data = train.drop(drop_cols, axis=1)

df_val_target2 = val[target_cols]
df_val_data2 = val.drop(drop_cols, axis=1)

cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
cat_features = [c for c in cat_features if c in df_train_data.columns]

# Fit the model
results_dict = tune_lgbm_model(base_params_p1, df_train_data, df_train_target["target"])

print("Best parameters:", results_dict['best_params'])

Fold 4
Train rows: 1805830
Val rows: 195264
n_iterations: 8
n_required_iterations: 8
n_possible_iterations: 8
min_resources_: 5
max_resources_: 20000
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 4000
n_resources: 5
Fitting 5 folds for each of 4000 candidates, totalling 20000 fits


InvalidParameterError: The 'n_samples' parameter of resample must be an int in the range [1, inf) or None. Got 0 instead.

Best parameters: {'lambda_l1': 1.6305167320916096, 'lambda_l2': 0.6463358547872429, 'learning_rate': 0.3718161124401851, 'max_bin': 140, 'min_data_in_leaf': 39, 'n_estimators': 2869}

Best Parameter runs:

Best parameters of p1: {'colsample_bynode': 0.41017411019547834, 'colsample_bytree': 0.7711664691469922, 'lambda_l1': 0.297793613166748, 'lambda_l2': 0.3614843058449302, 'learning_rate': 0.26360243974444403, 'max_bin': 916, 'min_data_in_leaf': 75, 'n_estimators': 8026}
CPU times: total: 1d 6h 28min 13s
Wall time: 1h 18s

Best parameters of producer: {'cat_l2': 32, 'cat_smooth': 42, 'drop_rate': 0.906209935681394, 'lambda_l1': 0.4237563717896653, 'lambda_l2': 2.5833405443842152, 'learning_rate': 0.10961104370080788, 'max_bin': 479, 'max_cat_threshold': 33, 'min_data_in_leaf': 36, 'min_data_per_group': 194, 'n_estimators': 5089, 'num_leaves': 109, 'skip_drop': 0.5961399837369368}
Best parameters of consumer: {'cat_l2': 94, 'cat_smooth': 47, 'drop_rate': 0.709329079259287, 'lambda_l1': 0.19543861233583915, 'lambda_l2': 3.5465108162748504, 'learning_rate': 0.15102639777091226, 'max_bin': 415, 'max_cat_threshold': 18, 'min_data_in_leaf': 27, 'min_data_per_group': 191, 'n_estimators': 5615, 'num_leaves': 10, 'skip_drop': 0.6854847756175649}


Best parameters of producer: {'lambda_l1': 1.0514735056151499, 'lambda_l2': 0.6904243319535714, 'learning_rate': 0.12970053306048324, 'max_bin': 836, 'min_data_in_leaf': 97, 'n_estimators': 4835, 'num_leaves': 107}

Best parameters: {'lambda_l1': 0.8721717874032273, 'lambda_l2': 0.1830954872642523, 'learning_rate': 0.09272080548316208, 'max_bin': 670, 'min_data_in_leaf': 34, 'n_estimators': 2137, 'num_leaves': 88}

In [43]:
from lightgbm import LGBMRegressor
def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

def train_cv(df):
    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]
        
        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it       
        
        params = {'lambda_l1': 0.8721717874032273, 'lambda_l2': 0.1830954872642523, 'learning_rate': 0.09272080548316208, 'max_bin': 670, 'min_data_in_leaf': 34, 'n_estimators': 2137, 'num_leaves': 88, 
                    'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie'}
        clf = LGBMRegressor(**params, random_state=69, verbose=0, importance_type='gain')

        clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        y_pred = clf.predict(df_train_data)

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error_consumption:", mae)

        y_pred_val = clf.predict(df_val_data2)

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        
        # importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        # importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [44]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
 Train Mean Absolute Error_consumption: 24.71570458901916
Val Mean Absolute Error: 42.74074701715273


Fold 1
Train rows: 1304266
Val rows: 173328
 Train Mean Absolute Error_consumption: 25.58324225568654
Val Mean Absolute Error: 36.02200963137713


Fold 2
Train rows: 1480810
Val rows: 169632
 Train Mean Absolute Error_consumption: 25.684241005132446
Val Mean Absolute Error: 38.6992003907362


Fold 3
Train rows: 1653658
Val rows: 167820
 Train Mean Absolute Error_consumption: 25.868463688973456
Val Mean Absolute Error: 53.53164220498895


Fold 4
Train rows: 1824598
Val rows: 176496
 Train Mean Absolute Error_consumption: 27.06736209195623
Val Mean Absolute Error: 72.42711868608906




In [63]:
d = datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]
d.days//14
dt.timedelta(days=0)

datetime.timedelta(0)

In [66]:
for f in range(((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days//14)):
    start = datetime_cv_ranges[i][0] + dt.timedelta(days=f*14)
    stop = datetime_cv_ranges[i][0] + dt.timedelta(days=(f+1)*14)
    train = processed_df_no_na[date_filter <= start]
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > start)]
    print(start)
    print(stop)
    print(len(train))
    print(len(val))

2023-04-05 00:00:00
2023-04-19 00:00:00
1824598
43392
2023-04-19 00:00:00
2023-05-03 00:00:00
1867990
43296
2023-05-03 00:00:00
2023-05-17 00:00:00
1911286
45312
2023-05-17 00:00:00
2023-05-31 00:00:00
1956598
44496


In [67]:
from lightgbm import LGBMRegressor
def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

train_pred_list = []
train_mae_list = []
train_targets_list = []

pred_list = []
mae_list = []
val_targets_list = []

df = processed_df_no_na
i=4
for f in range(((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days//14)):
    start = datetime_cv_ranges[i][0] + dt.timedelta(days=f*14)
    stop = datetime_cv_ranges[i][0] + dt.timedelta(days=(f+1)*14)
    train = processed_df_no_na[date_filter <= start]
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > start)]
    
    print(f"Fold {i}, period {f}")
    print(f"Train rows: {len(train)}")
    print(f"Val rows: {len(val)}")

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                'snowfall_fw', 'snowfall_hw_means']

    df_train_target = train[target_cols]
    df_train_data = train.drop(drop_cols, axis=1)

    df_val_target2 = val[target_cols]
    df_val_data2 = val.drop(drop_cols, axis=1)

    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
           'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
            'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]

    # We leave max_depth as -1
    # Tune num_leaves, default is 31, let's double it       

    params = {'lambda_l1': 0.8721717874032273, 'lambda_l2': 0.1830954872642523, 'learning_rate': 0.09272080548316208, 'max_bin': 670, 'min_data_in_leaf': 34, 'n_estimators': 2137, 'num_leaves': 88, 
                'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie'}
    
    clf = LGBMRegressor(**params, random_state=69, verbose=0, importance_type='gain')

    clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

    y_pred = clf.predict(df_train_data)
    train_pred_list.append(y_pred)

    from sklearn.metrics import mean_absolute_error

    # Assuming you have two pandas Series: y_true and y_pred
    mae = mean_absolute_error(df_train_target.target, y_pred)
    train_mae_list.append(mae)
    train_targets_list.append(df_train_target.target)
    print(f" Train Mean Absolute Error_consumption:", mae)

    y_pred_val = clf.predict(df_val_data2)
    pred_list.append(y_pred_val)

    mae = mean_absolute_error(df_val_target2.target, y_pred_val)
    val_targets_list.append(df_val_target2.target)
    mae_list.append(mae)
    print("Val Mean Absolute Error:", mae)

# importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
# importance = importance.sort_values('importance', ascending=False)
# display(importance.head(30))
# display(importance.tail(30))
print()
print()

Fold 4, period 0
Train rows: 1824598
Val rows: 43392
 Train Mean Absolute Error_consumption: 27.06736209195623
Val Mean Absolute Error: 59.52748165599775
Fold 4, period 1
Train rows: 1867990
Val rows: 43296
 Train Mean Absolute Error_consumption: 27.42106436505342
Val Mean Absolute Error: 63.889947247905894
Fold 4, period 2
Train rows: 1911286
Val rows: 45312
 Train Mean Absolute Error_consumption: 27.91180558238793
Val Mean Absolute Error: 69.0872864860352
Fold 4, period 3
Train rows: 1956598
Val rows: 44496
 Train Mean Absolute Error_consumption: 28.437498536613166
Val Mean Absolute Error: 79.94249878022536




In [69]:
np.mean(mae_list)

68.11180354254105

In [70]:
for f in range((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days):
    
    start = datetime_cv_ranges[i][0] + dt.timedelta(days=f)
    stop = datetime_cv_ranges[i][0] + dt.timedelta(days=(f+1))
    train = processed_df_no_na[date_filter <= start]
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > start)]
    print(start)
    print(stop)
    print(len(train))
    print(len(val))

2023-04-05 00:00:00
2023-04-19 00:00:00
1824598
43392
2023-04-19 00:00:00
2023-05-03 00:00:00
1867990
43296
2023-05-03 00:00:00
2023-05-17 00:00:00
1911286
45312
2023-05-17 00:00:00
2023-05-31 00:00:00
1956598
44496


In [73]:
date_column = 'datetime'
for f in range((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days):
    
    current_date = datetime_cv_ranges[i][0] + dt.timedelta(days=f)
    start_period = current_date - dt.timedelta(days=30)
    
    # Training data for the current year
    train_current_year = processed_df_no_na[(date_filter <= current_date) & (date_filter > start_period)]
    
    # Training data for the same period in previous years
    train_previous_years = pd.DataFrame()
    for year in range(data_processor.df_all_cols.date.dt.year.min(), current_date.year):
        start_previous = current_date.replace(year=year) - dt.timedelta(days=30)
        end_previous = current_date.replace(year=year) + dt.timedelta(days=30)
        train_previous_year = processed_df_no_na[(date_filter > start_previous) & (date_filter <= end_previous)]
        train_previous_years = pd.concat([train_previous_years, train_previous_year])
    
    # Combine training data
    train = pd.concat([train_current_year, train_previous_years])

    # Validation data (the next day)
    stop = current_date + dt.timedelta(days=1)
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > current_date)]
    
    print(current_date)
    print(stop)
    print(len(train))
    print(len(val))

2023-04-05 00:00:00
2023-04-06 00:00:00
287590
3120
2023-04-06 00:00:00
2023-04-07 00:00:00
287590
3072
2023-04-07 00:00:00
2023-04-08 00:00:00
287542
3120
2023-04-08 00:00:00
2023-04-09 00:00:00
287542
3072
2023-04-09 00:00:00
2023-04-10 00:00:00
287494
3072
2023-04-10 00:00:00
2023-04-11 00:00:00
287446
3120
2023-04-11 00:00:00
2023-04-12 00:00:00
287446
3120
2023-04-12 00:00:00
2023-04-13 00:00:00
287446
3120
2023-04-13 00:00:00
2023-04-14 00:00:00
287446
3120
2023-04-14 00:00:00
2023-04-15 00:00:00
287446
3072
2023-04-15 00:00:00
2023-04-16 00:00:00
287398
3072
2023-04-16 00:00:00
2023-04-17 00:00:00
287350
3072
2023-04-17 00:00:00
2023-04-18 00:00:00
287302
3120
2023-04-18 00:00:00
2023-04-19 00:00:00
287302
3120
2023-04-19 00:00:00
2023-04-20 00:00:00
287302
3120
2023-04-20 00:00:00
2023-04-21 00:00:00
287302
3120
2023-04-21 00:00:00
2023-04-22 00:00:00
287302
3072
2023-04-22 00:00:00
2023-04-23 00:00:00
287254
3072
2023-04-23 00:00:00
2023-04-24 00:00:00
287206
3072
2023-04-24 0

In [76]:
from lightgbm import LGBMRegressor
def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

train_pred_list = []
train_mae_list = []
train_targets_list = []

pred_list = []
mae_list = []
val_targets_list = []

df = processed_df_no_na
i=4
for f in range((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days):
    
    current_date = datetime_cv_ranges[i][0] + dt.timedelta(days=f)
    start_period = current_date - dt.timedelta(days=30)
    
    # Training data for the current year
    train_current_year = processed_df_no_na[(date_filter <= current_date) & (date_filter > start_period)]
    
    # Training data for the same period in previous years
    train_previous_years = pd.DataFrame()
    for year in range(data_processor.df_all_cols.date.dt.year.min(), current_date.year):
        start_previous = current_date.replace(year=year) - dt.timedelta(days=30)
        end_previous = current_date.replace(year=year) + dt.timedelta(days=30)
        train_previous_year = processed_df_no_na[(date_filter > start_previous) & (date_filter <= end_previous)]
        train_previous_years = pd.concat([train_previous_years, train_previous_year])
    
    # Combine training data
    train = pd.concat([train_current_year, train_previous_years])

    # Validation data (the next day)
    stop = current_date + dt.timedelta(days=1)
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > current_date)]
    
    print(current_date)
    print(stop)
    print(len(train))
    print(len(val))
    
    print(f"Fold {i}, period {f}")
    print(f"Train rows: {len(train)}")
    print(f"Val rows: {len(val)}")

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                'snowfall_fw', 'snowfall_hw_means']

    df_train_target = train[target_cols]
    df_train_data = train.drop(drop_cols, axis=1)

    df_val_target2 = val[target_cols]
    df_val_data2 = val.drop(drop_cols, axis=1)

    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
           'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
            'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]

    # We leave max_depth as -1
    # Tune num_leaves, default is 31, let's double it       

    params = {'lambda_l1': 0.8721717874032273, 'lambda_l2': 0.1830954872642523, 'learning_rate': 0.09272080548316208, 'max_bin': 670, 'min_data_in_leaf': 34, 'n_estimators': 2137, 'num_leaves': 88, 
                'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie'}
    
    clf = LGBMRegressor(**params, random_state=69, verbose=0, importance_type='gain')

    clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

    y_pred = clf.predict(df_train_data)
    train_pred_list.append(y_pred)

    from sklearn.metrics import mean_absolute_error

    # Assuming you have two pandas Series: y_true and y_pred
    mae = mean_absolute_error(df_train_target.target, y_pred)
    train_mae_list.append(mae)
    train_targets_list.append(df_train_target.target)
    print(f" Train Mean Absolute Error_consumption:", mae)

    y_pred_val = clf.predict(df_val_data2)
    pred_list.append(y_pred_val)

    mae = mean_absolute_error(df_val_target2.target, y_pred_val)
    val_targets_list.append(df_val_target2.target)
    mae_list.append(mae)
    print("Val Mean Absolute Error:", mae)

# importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
# importance = importance.sort_values('importance', ascending=False)
# display(importance.head(30))
# display(importance.tail(30))
print()
print()

2023-04-05 00:00:00
2023-04-06 00:00:00
287590
3120
Fold 4, period 0
Train rows: 287590
Val rows: 3120
 Train Mean Absolute Error_consumption: 22.681794444446826
Val Mean Absolute Error: 65.97163153719124
2023-04-06 00:00:00
2023-04-07 00:00:00
287590
3072
Fold 4, period 1
Train rows: 287590
Val rows: 3072
 Train Mean Absolute Error_consumption: 22.90125551465204
Val Mean Absolute Error: 141.70186717051894
2023-04-07 00:00:00
2023-04-08 00:00:00
287542
3120
Fold 4, period 2
Train rows: 287542
Val rows: 3120
 Train Mean Absolute Error_consumption: 23.017489379950625
Val Mean Absolute Error: 75.94449504334341
2023-04-08 00:00:00
2023-04-09 00:00:00
287542
3072
Fold 4, period 3
Train rows: 287542
Val rows: 3072
 Train Mean Absolute Error_consumption: 22.92264461464641
Val Mean Absolute Error: 82.1803252048764
2023-04-09 00:00:00
2023-04-10 00:00:00
287494
3072
Fold 4, period 4
Train rows: 287494
Val rows: 3072
 Train Mean Absolute Error_consumption: 22.784107036971125
Val Mean Absolute Er

In [77]:
np.mean(mae_list)

68.21711473593948