# LGBM

In [1]:
import pickle
import pandas as pd
import numpy as np

In [2]:
from lightgbm import LGBMRegressor

In [3]:
import pandas as pd
import datetime as dt
import numpy as np

import pandas as pd
import datetime as dt
import numpy as np

class TrainDataProcessor:
    """Processes Train data, using train data as a warm start, and prepares it for inference."""

    def __init__(self, train, revealed_targets, client, historical_weather,
                 forecast_weather, electricity_prices, gas_prices):
        self.test_orig_dfs = self.get_test_orig_dfs([train.copy(), revealed_targets.copy(), client.copy(), historical_weather.copy(),
                 forecast_weather.copy(), electricity_prices.copy(), gas_prices.copy()])
        self.train = self.init_train(train)
        self.revealed_targets = self.init_revealed_targets(revealed_targets)
        self.client = self.init_client(client)
        self.weather_mapping = self.init_weather_mapping()
        self.historical_weather = self.init_historical_weather(historical_weather)
        self.forecast_weather = self.init_forecast_weather(forecast_weather)
        self.electricity_prices = self.init_electricity(electricity_prices)
        self.gas_prices = self.init_gas_prices(gas_prices)
        
        self.df_all_cols = self.join_data(self.train, self.revealed_targets, self.client, self.historical_weather, self.forecast_weather, self.electricity_prices, self.gas_prices)
        self.df = self.remove_cols(self.df_all_cols)
        
    def get_test_orig_dfs(self, dfs):
        for i, df in enumerate(dfs):
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.datetime)
                col = 'datetime'
            if 'prediction_datetime' in df.columns:
                df['prediction_datetime'] = pd.to_datetime(df.prediction_datetime)
                col = 'prediction_datetime'
            if 'forecast_date' in df.columns:
                df['forecast_date'] = pd.to_datetime(df['forecast_date'])
                col = 'forecast_date'
            if 'forecast_datetime' in df.columns:
                df['forecast_datetime'] = pd.to_datetime(df['forecast_datetime'])
                col = 'forecast_datetime'
            if 'date' in df.columns:
                df['date'] = pd.to_datetime(df.date).dt.date
                col = 'date'

            test_date = df[col].iloc[-1]  # Assuming test is a DataFrame
            start_date = test_date - pd.Timedelta(days=14)
            historical_subset = df[df[col] >= start_date]
            dfs[i] = historical_subset
        return dfs
        
    def init_train(self, df):
        """Prepares the training data for model training."""
        try:
            df['datetime'] = pd.to_datetime(df.datetime)
        except Exception as e:
            df['datetime'] = pd.to_datetime(df.prediction_datetime)
        df['date'] = df.datetime.dt.date
            
        # df = self.get_data_block_id(df, 'datetime')
        return df
    
    def add_electricity_lag_features(self, df):
        """Chatgpt summary:
        Enhances a DataFrame with electricity price lag features:
        - Sets 'datetime' as Index for time series analysis.
        - Calculates rolling 7-day mean price, lagged by one day.
        - Computes rolling 7-day mean for same hour, lagged.
        - Adds column for yesterday's price, shifted by 24 hours.
        - Calculates 24-hour rolling average of electricity prices.
        - Resets index and drops 'forecast_date', 'origin_date', 'hour'.
        """
        ##### mean from entire last week
        df.set_index('datetime', inplace=True)
        # Use rolling to calculate mean price of the last week
        # The window is 7 days, min_periods can be set as per requirement
        # 'closed' determines which side of the interval is closed; it can be 'right' or 'left'
        df['mean_euros_per_mwh_last_week'] = df['euros_per_mwh'].rolling(window='7D', min_periods=1, closed='right').mean()
        # Shift the results to align with the requirement of lagging
        df['mean_euros_per_mwh_last_week'] = df['mean_euros_per_mwh_last_week'].shift()
        
        ##### mean from last week this hour only
        # Extract hour from datetime
        df['hour'] = df.index.hour

        # Group by hour and apply rolling mean for each group
        hourly_groups = df.groupby('hour')
        dff = hourly_groups['euros_per_mwh'].rolling(window='7D', min_periods=1, closed='right').mean()#.shift()#.reset_index(level=0, drop=True)
        dff = dff.reset_index().set_index('datetime').groupby('hour')['euros_per_mwh'].shift()
        dff = dff.rename('mean_euros_per_mwh_same_hour_last_week')
        df = df.join(dff)
        #### yesterday's power price
        df['yesterdays_euros_per_mwh'] = df['euros_per_mwh'].shift(24)
        
        ### 24h average
        # Calculate the 24-hour rolling average
        df['euros_per_mwh_24h_average_price'] = df['euros_per_mwh'].rolling(window=24, min_periods=1).mean()

        # Resetting the index if needed
        df.reset_index(inplace=True)
        df = df.drop(['forecast_date', 'origin_date', 'hour'], axis=1)
        return df

    def init_electricity(self, df):
        ## LAG = 1 Day
        ## Move forecast datetime ahead by 1 day
        ## change name to datetime
        df['datetime'] = pd.to_datetime(df['forecast_date'])
        df['datetime'] = df['datetime'] + dt.timedelta(days=1)
        # df = self.get_data_block_id(df, 'datetime')
        df = self.add_electricity_lag_features(df)
        return df
    
    def add_historical_weather_lag_features(self, df):
        """Chatgpt summary:
        Enhances a DataFrame with historical weather lag features:
        - Converts 'datetime' to Datetime object and sets as index.
        - Sorts data by 'datetime', 'latitude', 'longitude'.
        - Creates 'location_id' as a unique identifier for each location.
        - Filters for 10:00 AM entries and shifts features by 1 day.
        - Merges lagged features with original DataFrame.
        - Calculates mean and variance for weather features over the last 24 hours.
        - Merges these statistical summaries back into the original DataFrame.
        """
        ##### LATEST WEATHER
        def add_latest_weather(df):
            # Assuming df is your original DataFrame
            # Step 1: Convert datetime to a Datetime Object
            df['datetime'] = pd.to_datetime(df['datetime'])
            df.set_index('datetime', inplace=True)

            # Step 2: Sorting the Data
            df.sort_values(by=['datetime', 'latitude', 'longitude'], inplace=True)

            # Step 3: Creating a Unique Identifier for each location
            df['location_id'] = df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Step 4: Filtering for 10:00 AM Entries
            df.reset_index(inplace=True)
            df_10am = df[df['datetime'].dt.hour == 10]
            df_10am.set_index('datetime', inplace=True)

            # Step 5: Shifting the Features by 1 day
            lagged_features = df_10am.groupby('location_id').shift(periods=1, freq='D')

            # Renaming columns to indicate lag
            lagged_features = lagged_features.add_suffix('_hw_lagged')
            lagged_features['location_id'] = lagged_features['location_id_hw_lagged']
            lagged_features.reset_index(inplace=True)
            lagged_features['date'] = lagged_features.datetime.dt.date

            df['date'] = df.datetime.dt.date
            return lagged_features
            # Step 6: Merging Lagged Features with Original DataFrame
            df = df.merge(lagged_features, on=['date', 'location_id'], how='left', suffixes=('', '_hw_lagged'))
            return df
        
        ##### mean from last day
        def add_24h_mean_var(df, weather_features):
            # Calculate the start and end times for each row
            df['start_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=2) + pd.Timedelta(hours=11)
            df['end_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=1) + pd.Timedelta(hours=10)
            df['time_code'] = df['start_time'].astype(str) +'_' + df['end_time'].astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)
            # print(df.time_code)

            # Create a helper column for grouping
            # If the time is before 10:00 AM, subtract a day
            df['group'] = df['datetime'].apply(lambda dt: dt if dt.time() >= pd.to_datetime('11:00').time() else dt - pd.Timedelta(days=1))
            df['group'] = df['group'].dt.date  # Keep only the date part for grouping
            df['group'] = (pd.to_datetime(df['group']) + pd.Timedelta(hours=11)).astype(str) + '_' + (pd.to_datetime(df['group']) + pd.Timedelta(days=1, hours=10)).astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Now group by this new column
            grouped = df.groupby('group')
            means = grouped[weather_features].mean()
            variances = grouped[weather_features].var()

            # Merge means and variances into the original DataFrame
            my_df = df.merge(means, left_on='time_code', right_on='group', suffixes=('', '_hw_means'), how='left')
            my_df = my_df.merge(variances, left_on='time_code', right_on='group', how='left', suffixes=('', '_hw_variances'))

            return my_df

        df['datetime'] = pd.to_datetime(df['datetime'])
        weather_features = df.columns.drop(['datetime', 'latitude', 'longitude'])

        # Apply the function
        df = add_24h_mean_var(df, weather_features)       
        latest = add_latest_weather(df)
        df = df.merge(latest, on=['date', 'location_id'], how='left', suffixes=('', '_hw_lagged'))
        
        return df

    def init_historical_weather(self, df):
        ## LAG: From 11:00 AM 2 days ago to 10:00 AM 1 day ago
        ## What to do? Give most recent weather forecast? Give average over the last day?
        """
        Processes the historical weather data.
        """
        df['datetime'] = pd.to_datetime(df.datetime)
        
        df = self.add_historical_weather_lag_features(df)
        
        df = df.merge(self.weather_mapping, how='inner', on=('latitude', 'longitude'))
        return df

    def init_forecast_weather(self, df):
        """Chatgpt summary:
        Processes forecast weather data:
        - Converts 'forecast_datetime' to 'datetime' and adjusts it forward by 1 day.
        - Filters data to keep records with 'hours_ahead' between 22 and 45.
        - Merges with a weather mapping based on 'latitude' and 'longitude'.
        """
        ## LAG: DON't ADJUST
        ##      The forecast is from yesterday, but can forecast today, which is 22 hours ahead
        ## Drop any columns where:
        ##                        hours_ahead < 22 and hours_ahead > 45
        ## Then rename forecast_datetime to datetime and join on datetime
        """
        Processes the forecast weather data.
        """
        df['datetime'] = pd.to_datetime(df['forecast_datetime'])
        # keep only datetimes from our relevant period
        df = df[(df['hours_ahead'] < 46) & (df['hours_ahead'] > 21)]
        df['datetime'] = df['datetime'] + dt.timedelta(days=1)
        df = df.merge(self.weather_mapping, how='inner', on=('latitude', 'longitude'))
        return df
    
    def add_gas_prices_lag_features(self, df):
        """Chatgpt summary
        Augments a DataFrame with rolling average lag features for gas prices:
        - Converts 'date' to Datetime object and sets as index.
        - Sorts DataFrame by date.
        - Calculates rolling averages for lowest and highest gas prices over 3, 7, and 14 days.
        - Resets the index to include 'date' as a column again.
        """
        df['date'] = pd.to_datetime(df['date'])
        df.set_index('date', inplace=True)

        # Sort the DataFrame by date, if it's not already sorted
        df.sort_index(inplace=True)

        # Calculate rolling averages for different time windows
        df['lowest_price_3d_avg'] = df['lowest_price_per_mwh'].rolling(window=3).mean()
        df['highest_price_3d_avg'] = df['highest_price_per_mwh'].rolling(window=3).mean()

        df['lowest_price_7d_avg'] = df['lowest_price_per_mwh'].rolling(window=7).mean()
        df['highest_price_7d_avg'] = df['highest_price_per_mwh'].rolling(window=7).mean()

        df['lowest_price_14d_avg'] = df['lowest_price_per_mwh'].rolling(window=14).mean()
        df['highest_price_14d_avg'] = df['highest_price_per_mwh'].rolling(window=14).mean()

        # Reset the index if you want the 'date' column back
        df.reset_index(inplace=True)
        return df

    def init_gas_prices(self, df):
        ## LAG: 1 DAY
        ## Predictions are made from 2 days ago and predict for yesterday
        ## add one day to forecast_date
        ## Rename forecast_date to date, join on date
        """
        Processes the gas prices data.
        Implement the logic to handle gas prices data processing here.
        """
        df['date'] = pd.to_datetime(df['forecast_date']).dt.date
        df['date'] = df['date'] + dt.timedelta(days=1)
        df = self.add_gas_prices_lag_features(df)
        return df
    
    def add_revealed_target_features(self, df):
        """Chatgpt summary:
        Enhances DataFrame with rolling average target features:
        - Converts 'datetime' to Datetime object, extracts 'hour' and 'day' of week.
        - Sets 'datetime' as index.
        - Calculates various rolling averages of 'target' based on different groupings:
          - 24-hour rolling average by county, business status, product type, and consumption status.
          - 7-day hourly rolling average by county, business status, product type, consumption status, and hour.
          - 4-week rolling average by county, business status, product type, consumption status, hour, and day.
          - Similar calculations considering all product types.
        - Drops 'hour' and 'day' columns after processing.
        """
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['hour'] = df.datetime.dt.hour
        df['day'] = df.datetime.dt.dayofweek
        df.set_index('datetime', inplace=True)

        window_size = 7
        # Group by the specified columns and then apply the rolling mean
        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])
        df['target_rolling_avg_24h'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption', 'hour'])
        df['target_rolling_avg_hour_7d'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption', 'hour', 'day'])
        df['target_rolling_avg_hour_hour_day_4w'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption'])
        df['target_rolling_allp_avg_24h'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption', 'hour'])
        df['target_rolling_allp_avg_hour_7d'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption', 'hour', 'day'])
        df['target_rolling_allp_avg_hour_hour_day_4w'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())
        
        df = df.drop(['hour', 'day'], axis=1)

        return df
    
    def init_revealed_targets(self, df):
        df['datetime'] = pd.to_datetime(df.datetime)
        df['datetime'] = df['datetime'] + dt.timedelta(days=2)
        df = self.add_revealed_target_features(df)
        return df
    
    def init_client(self, df):
        ## LAG: 2 days
        ## Add 2 days to date, join on date
        df['date'] = pd.to_datetime(df.date).dt.date
        df['date'] = df['date'] + dt.timedelta(days=2)
        # df = self.get_data_block_id(df, 'date')
        return df

    def init_weather_mapping(self):
        # https://www.kaggle.com/code/tsunotsuno/enefit-eda-baseline/notebook#Baseline
        county_point_map = {
            0: (59.4, 24.7), # "HARJUMAA"
            1 : (58.8, 22.7), # "HIIUMAA"
            2 : (59.1, 27.2), # "IDA-VIRUMAA"
            3 : (58.8, 25.7), # "JÄRVAMAA"
            4 : (58.8, 26.2), # "JÕGEVAMAA"
            5 : (59.1, 23.7), # "LÄÄNE-VIRUMAA"
            6 : (59.1, 23.7), # "LÄÄNEMAA"
            7 : (58.5, 24.7), # "PÄRNUMAA"
            8 : (58.2, 27.2), # "PÕLVAMAA"
            9 : (58.8, 24.7), # "RAPLAMAA"
            10 : (58.5, 22.7),# "SAAREMAA"
            11 : (58.5, 26.7),# "TARTUMAA"
            12 : (58.5, 25.2),# "UNKNOWNN" (center of the map)
            13 : (57.9, 26.2),# "VALGAMAA"
            14 : (58.2, 25.7),# "VILJANDIMAA"
            15 : (57.9, 27.2) # "VÕRUMAA"
        }
        # Convert the dictionary to a list of tuples
        data = [(county_code, lat, lon) for county_code, (lat, lon) in county_point_map.items()]

        # Create DataFrame
        df = pd.DataFrame(data, columns=['county', 'latitude', 'longitude'])
        
        return df
    
    def add_date_features(self, df):
        df['year'] = df['datetime'].dt.year
        df['month'] = df['datetime'].dt.month
        df['day'] = df['datetime'].dt.day
        df['hour'] = df['datetime'].dt.hour
        df['quarter'] = df['datetime'].dt.quarter
        df['day_of_week'] = df['datetime'].dt.day_of_week
        df['day_of_year'] = df['datetime'].dt.dayofyear
        df['week_of_year'] = df['datetime'].dt.isocalendar().week
        df['is_weekend'] = df['datetime'].dt.day_of_week >= 5
        df['is_month_start'] = df['datetime'].dt.is_month_start
        df['is_month_end'] = df['datetime'].dt.is_month_end
        df['is_quarter_start'] = df['datetime'].dt.is_quarter_start
        df['is_quarter_end'] = df['datetime'].dt.is_quarter_end
        df['is_year_start'] = df['datetime'].dt.is_year_start
        df['is_year_end'] = df['datetime'].dt.is_year_end
        df['season'] = df['datetime'].dt.month % 12 // 3 + 1
        df['hour_sin'] = np.sin(df['datetime'].dt.hour * (2. * np.pi / 24))
        df['hour_cos'] = np.cos(df['datetime'].dt.hour * (2. * np.pi / 24))
        # Calculate sin and cos for day of year
        days_in_year = 365.25  # accounts for leap year
        df['day_of_year_sin'] = np.sin((df['day_of_year'] - 1) * (2 * np.pi / days_in_year))
        df['day_of_year_cos'] = np.cos((df['day_of_year'] - 1) * (2 * np.pi / days_in_year))
        return df
    
    def add_ee_holidays(self, df):
        import holidays
        # Define Estonia public holidays
        ee_holidays = holidays.CountryHoliday('EE')
        
        print(df['date'].isna().sum())
        
        def find_problem(x):
            try:
                return x in ee_holidays
            except Exception as e:
                print(x)
                raise e

        # Function to check if the date is a holiday
        df['is_ee_holiday'] = df['date'].apply(lambda x: x in ee_holidays)

        return df
    
    def remove_cols(self, df):
        col_list = ['datetime',
                   'row_id',
                   'prediction_unit_id',
                    'date_train',
                    'hour_part',
                   'date_client',
                    'forecast_date_elec_price',
                    'origin_date_elec_price',
                    'forecast_date_gas_price',
                    'origin_date_gas_price',
                    'datetime_hist_weath',
                   'hour_part_hist_weath_latest',
                    'datetime_hist_weath_latest',
                   'origin_datetime',
                   'hour_part_fore_weath',
                    'datetime',
                     'data_block_id',
                     'row_id',
                     'prediction_unit_id',
                     'date',
                    'data_block_id_rt',
                     'row_id_rt',
                     'prediction_unit_id_rt',
                    'data_block_id_client',
                    'latitude',
                     'longitude',
                     'data_block_id_hw',
                    'start_time',
                     'end_time',
                     'time_code',
                     'group',
                    'data_block_id_hw_means',
                    'data_block_id_hw_variances',
                     'location_id',
                     'date_hw',
                     'datetime_hw_lagged',
                    'latitude_hw_lagged',
                     'longitude_hw_lagged',
                     'data_block_id_hw_lagged',
                     'start_time_hw_lagged',
                     'end_time_hw_lagged',
                     'time_code_hw_lagged',
                     'group_hw_lagged',
                    'data_block_id_hw_means_hw_lagged',
                    'data_block_id_hw_variances_hw_lagged',
                    'location_id_hw_lagged',
                     'latitude_fw',
                     'longitude_fw',
                     'origin_datetime',
                    'data_block_id_fw',
                     'forecast_datetime',
                    'data_block_id_elec',
                    'forecast_date',
                    'origin_date',
                     'data_block_id_gasp',
                   ]
        columns_to_drop = [col for col in col_list if col in df.columns]
        df = df.drop(columns_to_drop, axis=1)
        return df
    
    def remove_test_cols(self, df):
        col_list = ['datetime',
                   'prediction_unit_id',
                    'date_train',
                    'hour_part',
                   'date_client',
                    'forecast_date_elec_price',
                    'origin_date_elec_price',
                    'forecast_date_gas_price',
                    'origin_date_gas_price',
                    'datetime_hist_weath',
                   'hour_part_hist_weath_latest',
                    'datetime_hist_weath_latest',
                   'origin_datetime',
                   'hour_part_fore_weath',
                    'datetime',
                     'data_block_id',
                     'prediction_unit_id',
                     'date',
                    'data_block_id_rt',
                     'row_id_rt',
                     'prediction_unit_id_rt',
                    'data_block_id_client',
                    'latitude',
                     'longitude',
                     'data_block_id_hw',
                    'start_time',
                     'end_time',
                     'time_code',
                     'group',
                    'data_block_id_hw_means',
                    'data_block_id_hw_variances',
                     'location_id',
                     'date_hw',
                     'datetime_hw_lagged',
                    'latitude_hw_lagged',
                     'longitude_hw_lagged',
                     'data_block_id_hw_lagged',
                     'start_time_hw_lagged',
                     'end_time_hw_lagged',
                     'time_code_hw_lagged',
                     'group_hw_lagged',
                    'data_block_id_hw_means_hw_lagged',
                    'data_block_id_hw_variances_hw_lagged',
                    'location_id_hw_lagged',
                     'latitude_fw',
                     'longitude_fw',
                     'origin_datetime',
                    'data_block_id_fw',
                     'forecast_datetime',
                    'data_block_id_elec',
                    'forecast_date',
                    'origin_date',
                     'data_block_id_gasp',
                   ]
        columns_to_drop = [col for col in col_list if col in df.columns]
        df = df.drop(columns_to_drop, axis=1)
        return df
    
    def join_data(self, train, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices):
        df = train
        df = df.merge(revealed_targets, how='left', on=('datetime', 'county', 'is_business', 'product_type', 'is_consumption'), suffixes=('', '_rt'))
        df = df.merge(client, how='left', on=('date', 'county', 'is_business', 'product_type'), suffixes=('', '_client'))
        df = df.merge(historical_weather, how='left', on=('datetime', 'county'), suffixes=('', '_hw'))
        df = df.merge(forecast_weather, how='left', on=('datetime', 'county'), suffixes=('', '_fw'))
        df = df.merge(electricity_prices, how='left', on='datetime', suffixes=('', '_elec'))
        df['date'] = pd.to_datetime(df['date'])
        df = df.merge(gas_prices, how='left', on='date', suffixes=('', '_gasp'))
        df = self.add_date_features(df)
        df = self.add_ee_holidays(df)
        return df
    
    def add_test_data(self, test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices):
        dfs = [test.copy(), revealed_targets.copy(), client.copy(), historical_weather.copy(),
                 forecast_weather.copy(), electricity_prices.copy(), gas_prices.copy()]
        for i, df in enumerate(dfs):
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.datetime)
                col = 'datetime'
            if 'prediction_datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.prediction_datetime)
                col = 'datetime'
            if 'forecast_date' in df.columns:
                df['forecast_date'] = pd.to_datetime(df['forecast_date'])
                col = 'forecast_date'
            if 'forecast_datetime' in df.columns:
                df['forecast_datetime'] = pd.to_datetime(df['forecast_datetime'])
                col = 'forecast_datetime'
                
            self.test_orig_dfs[i] = pd.concat([ self.test_orig_dfs[i], df ])          
        
        
    
    def process_test_data_timestep(self, test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices):
        #append test data to test data cache
        self.add_test_data(test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices)
        # process test data
        test = self.init_train(self.test_orig_dfs[0].copy())
        revealed_targets = self.init_revealed_targets(self.test_orig_dfs[1].copy())
        client = self.init_client(self.test_orig_dfs[2].copy())
        historical_weather = self.init_historical_weather(self.test_orig_dfs[3].copy())
        forecast_weather = self.init_forecast_weather(self.test_orig_dfs[4].copy())
        electricity_prices = self.init_electricity(self.test_orig_dfs[5].copy())
        gas_prices = self.init_gas_prices(self.test_orig_dfs[6].copy())
        
        df_all_cols = self.join_data(test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices)
        df = self.remove_test_cols(df_all_cols)
        return df
        


In [4]:
with open('data_processor_lgbm2.pkl', 'rb') as f:
    data_processor = pickle.load(f)
data_processor.df

Unnamed: 0,county,is_business,product_type,target,is_consumption,target_rt,target_lag_1h,target_lag_2h,target_lag_3h,target_lag_4h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,0.713,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
1,0,0,1,96.590,1,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
2,0,0,2,0.000,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
3,0,0,2,17.314,1,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
4,0,0,3,2.904,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018609,15,1,0,197.233,1,184.072,171.092,168.933,174.920,170.068,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018610,15,1,1,0.000,0,0.000,0.000,2.501,25.884,83.535,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018611,15,1,1,28.404,1,38.646,47.690,34.806,29.202,21.654,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018612,15,1,3,0.000,0,0.000,0.000,4.512,34.657,122.195,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False


# Testing

For my experimental CV, I want to take the approach of doing a stratified CV by time - splitting the year into 4 different parts, basically testing the model on each season, 3 months at a time. There was something in the kaggle forums that recommended something like this:

Key: 
= -> training data
+ -> CV data

4 splits in time:
1. =============+++
2. ================+++
3. ===================+++
4. ======================+++



The data starts on 2021-09-01 and ends on 2023-05-31

BUT we don't have enough data to do that properly. So, my CV will instead be:


(Thanks chatgpt)

Splitting the period from 2022-09-01 to 2023-05-31 into five equal parts, here are the date ranges for each segment:

#### First Segment:

From 2022-09-01 to 2022-10-24

#### Second Segment:

From 2022-10-25 to 2022-12-17

#### Third Segment:

From 2022-12-18 to 2023-02-09

#### Fourth Segment:

From 2023-02-10 to 2023-04-04

#### Fifth Segment:

From 2023-04-05 to 2023-05-29


In [5]:
def fill_drop_na(df):
    df = df[~df.target.isna()]
    df = df[~df.target_rolling_avg_24h.isna()]
    means = df.mean()
    # For each column, add an indicator column for NA values
    # for col in df.columns:
    #     if df[col].isna().any():
    #         df[f'{col}_is_na'] = df[col].isna()
    df = df.fillna(means)
    return df, means

In [6]:
%%time
processed_df_no_na, means = fill_drop_na(data_processor.df)
processed_df_no_na.isna().sum()

CPU times: total: 22.1 s
Wall time: 58.5 s


county             0
is_business        0
product_type       0
target             0
is_consumption     0
                  ..
hour_sin           0
hour_cos           0
day_of_year_sin    0
day_of_year_cos    0
is_ee_holiday      0
Length: 180, dtype: int64

In [7]:
processed_df_no_na['target_installed_capacity'] = processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000
processed_df_no_na

  processed_df_no_na['target_installed_capacity'] = processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000


Unnamed: 0,county,is_business,product_type,target,is_consumption,target_rt,target_lag_1h,target_lag_2h,target_lag_3h,target_lag_4h,...,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday,target_installed_capacity
11712,0,0,1,0.930,0,0.713,274.689353,274.69907,274.708302,274.717501,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.975978
11713,0,0,1,123.214,1,96.590,274.689353,274.69907,274.708302,274.717501,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,129.305586
11714,0,0,2,0.000,0,0.000,274.689353,274.69907,274.708302,274.717501,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.000000
11715,0,0,2,21.940,1,17.314,274.689353,274.69907,274.708302,274.717501,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,131.850962
11716,0,0,3,1.611,0,2.904,274.689353,274.69907,274.708302,274.717501,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.223505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018609,15,1,0,197.233,1,184.072,171.092000,168.93300,174.920000,170.068000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,318.117742
2018610,15,1,1,0.000,0,0.000,0.000000,2.50100,25.884000,83.535000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,0.000000
2018611,15,1,1,28.404,1,38.646,47.690000,34.80600,29.202000,21.654000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,45.482786
2018612,15,1,3,0.000,0,0.000,0.000000,4.51200,34.657000,122.195000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,0.000000


In [8]:
from datetime import datetime

cv_ranges_corrected = [
    ('2022-09-01', '2022-10-24'), 
    ('2022-10-25', '2022-12-17'), 
    ('2022-12-18', '2023-02-09'), 
    ('2023-02-10', '2023-04-04'), 
    ('2023-04-05', '2023-05-31')
]

# Function to convert a date string into a datetime object
def to_datetime(date_str):
    return datetime.strptime(date_str, '%Y-%m-%d')

# Converting the date strings in cv_ranges to datetime objects
datetime_cv_ranges = [(to_datetime(start), to_datetime(end)) for start, end in cv_ranges_corrected]
datetime_cv_ranges

date_filter = data_processor.df_all_cols.date[processed_df_no_na.index]
date_filter

cv1_train = processed_df_no_na[date_filter <= datetime_cv_ranges[0][0]]
cv1_test = processed_df_no_na[(date_filter <= datetime_cv_ranges[0][1]) & (date_filter > datetime_cv_ranges[0][0])]

In [9]:
import datetime as dt
print(to_datetime('2023-04-05') + dt.timedelta(days=14))
print(to_datetime('2023-04-05') + dt.timedelta(days=48))

2023-04-19 00:00:00
2023-05-23 00:00:00


In [10]:
cv1_train[['year' ,'month', 'day']]

Unnamed: 0,year,month,day
11712,2021,9,5
11713,2021,9,5
11714,2021,9,5
11715,2021,9,5
11716,2021,9,5
...,...,...,...
1144249,2022,9,1
1144250,2022,9,1
1144251,2022,9,1
1144252,2022,9,1


In [11]:
cv1_test[['year' ,'month', 'day']]

Unnamed: 0,year,month,day
1144254,2022,9,2
1144255,2022,9,2
1144256,2022,9,2
1144257,2022,9,2
1144258,2022,9,2
...,...,...,...
1315849,2022,10,24
1315850,2022,10,24
1315851,2022,10,24
1315852,2022,10,24


In [12]:
processed_df_no_na[['year', 'month', 'day']]

Unnamed: 0,year,month,day
11712,2021,9,5
11713,2021,9,5
11714,2021,9,5
11715,2021,9,5
11716,2021,9,5
...,...,...,...
2018609,2023,5,31
2018610,2023,5,31
2018611,2023,5,31
2018612,2023,5,31


## Train 27

GRID SEARCH

In [13]:
from sklearn.model_selection import TimeSeriesSplit

In [14]:
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV


In [39]:
%%time

# https://www.kaggle.com/code/chaozhuang/enefit-eda-w-fft-ssa-arima-lgbm?scriptVersionId=156414824#Predictive-Modelling
from lightgbm import LGBMRegressor
import random
import lightgbm as lgb
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

def tune_lgbm_model(base_params, X_train, y_train, n_iter=8, cv=3):
    """
    Tune a LightGBM model based on a base set of parameters.

    :param base_params: Dictionary of base parameters for the model
    :param X_train: Training features
    :param y_train: Training target variable
    :param n_iter: Number of iterations for RandomizedSearchCV
    :param cv: Number of cross-validation folds
    :return: Best estimator and best parameters
    """
    # Parameter distributions for random search
    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]
    
    param_dist = {
        'learning_rate': sp_uniform(0.005, 0.5),
        'lambda_l1': sp_uniform(0, 4), 
        'lambda_l2': sp_uniform(0, 4), 
        'max_bin': sp_randint(100, 1000),
        'min_data_in_leaf': sp_randint(15, 300),
        'n_estimators': sp_randint(2000, 3500),
        'num_leaves': sp_randint(25, 150),
        
        # 'colsample_bytree' : sp_uniform(0.1, 1),
        # 'colsample_bynode' : sp_uniform(0.1, 1),
        # 'data_sample_strategy' : ['bagging', 'goss'],
        
        # 'drop_rate': sp_uniform(0, 1),
        # 'skip_drop': sp_uniform(0, 1),
        # 'min_data_per_group': sp_randint(10, 200),
        # 'max_cat_threshold': sp_randint(10, 100),
        # 'cat_l2': sp_randint(10, 100),
        # 'cat_smooth': sp_randint(10, 100),
    }

    # Create a LightGBM regressor object
    lgb_reg = lgb.LGBMRegressor(**base_params)

    # Create a RandomizedSearchCV object
    random_search = HalvingRandomSearchCV(estimator=lgb_reg, param_distributions=param_dist,
                                       scoring='neg_mean_absolute_error',
                                       cv=TimeSeriesSplit(n_splits=cv), random_state=1337, verbose=1,
                                         aggressive_elimination= True,
                                         max_resources=30000, min_resources=5, )

    producer_mask = X_train['is_consumption'] == 0
    results_dict = {}
    # producer
    X_train_producer = X_train[producer_mask]
    y_train_producer = y_train[producer_mask]
    # Fit the random search to the data
    random_search.fit(X_train_producer, y_train_producer, categorical_feature=cat_features)

    # Return the best estimator and best parameters
    results_dict['best_estimator'] = random_search.best_estimator_
    results_dict['best_params'] = random_search.best_params_
    
    
#     random_search = HalvingRandomSearchCV(estimator=lgb_reg, param_distributions=param_dist,
#                                        scoring='neg_mean_absolute_error',
#                                        cv=cv, random_state=2024, verbose=1,
#                                          aggressive_elimination= True,
#                                          max_resources=20000, min_resources=5)
#     # consumer
#     X_train_consumer = X_train[~producer_mask]
#     y_train_consumer = y_train[~producer_mask]
#     # Fit the random search to the data
#     random_search.fit(X_train_consumer, y_train_consumer, categorical_feature=cat_features)

#     # Return the best estimator and best parameters
#     results_dict['consumer_best_estimator'] = random_search.best_estimator_
#     results_dict['consumer_best_params'] = random_search.best_params_
    
    return results_dict

base_params_p1 = {
    'verbose': -1,
    'metric': 'mae',
    'n_jobs': 24,
    'boosting': 'dart',
    'objective': 'tweedie'
}

i=4
train = processed_df_no_na[(date_filter <= to_datetime('2023-03-30'))]
val = processed_df_no_na[(date_filter > to_datetime('2023-03-30'))]
print(f"Fold {i}")
print(f"Train rows: {len(train)}")
print(f"Val rows: {len(val)}")

target_cols = ['target', 'target_installed_capacity']
drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
            'snowfall_fw', 'snowfall_hw_means']

df_train_target = train[target_cols]
df_train_data = train.drop(drop_cols, axis=1)

df_val_target2 = val[target_cols]
df_val_data2 = val.drop(drop_cols, axis=1)

cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
cat_features = [c for c in cat_features if c in df_train_data.columns]

# Fit the model
results_dict = tune_lgbm_model(base_params_p1, df_train_data, df_train_target["target"])

print("Best parameters:", results_dict['producer_best_params'])

Fold 4
Train rows: 1805830
Val rows: 195264
n_iterations: 8
n_required_iterations: 8
n_possible_iterations: 8
min_resources_: 5
max_resources_: 30000
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 6000
n_resources: 5
Fitting 3 folds for each of 6000 candidates, totalling 18000 fits


12000 fits failed out of a total of 18000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12000 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages\lightgbm\sklearn.py", line 1049, in fit
    super().fit(
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages\lightgbm\sklearn.py", line 842, in fit
    self._Booster = train(
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages\lightgbm\engine.py", line 255, in train
    booster = Booster(params=params, train_

----------
iter: 1
n_candidates: 2000
n_resources: 15
Fitting 3 folds for each of 2000 candidates, totalling 6000 fits


2000 fits failed out of a total of 6000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2000 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages\lightgbm\sklearn.py", line 1049, in fit
    super().fit(
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages\lightgbm\sklearn.py", line 842, in fit
    self._Booster = train(
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages\lightgbm\engine.py", line 255, in train
    booster = Booster(params=params, train_set

----------
iter: 2
n_candidates: 667
n_resources: 45
Fitting 3 folds for each of 667 candidates, totalling 2001 fits


 -141.17305466]
 -79.8881595]


----------
iter: 3
n_candidates: 223
n_resources: 135
Fitting 3 folds for each of 223 candidates, totalling 669 fits


 -46.86600736]
 -41.88197547]


----------
iter: 4
n_candidates: 75
n_resources: 405
Fitting 3 folds for each of 75 candidates, totalling 225 fits


 -33.19368677]
 -17.84422974]


----------
iter: 5
n_candidates: 25
n_resources: 1215
Fitting 3 folds for each of 25 candidates, totalling 75 fits


 -46.8807594 ]
 -2.71744746]


----------
iter: 6
n_candidates: 9
n_resources: 3645
Fitting 3 folds for each of 9 candidates, totalling 27 fits


 -36.893484  ]
 -4.57919506]


----------
iter: 7
n_candidates: 3
n_resources: 10935
Fitting 3 folds for each of 3 candidates, totalling 9 fits


 -33.02884732]
 -6.57553852]


Best parameters of producer: {'lambda_l1': 0.8721717874032273, 'lambda_l2': 0.1830954872642523, 'learning_rate': 0.09272080548316208, 'max_bin': 670, 'min_data_in_leaf': 34, 'n_estimators': 2137, 'num_leaves': 88}


KeyError: 'consumer_best_params'

Best Parameter runs:

Best parameters of p1: {'colsample_bynode': 0.41017411019547834, 'colsample_bytree': 0.7711664691469922, 'lambda_l1': 0.297793613166748, 'lambda_l2': 0.3614843058449302, 'learning_rate': 0.26360243974444403, 'max_bin': 916, 'min_data_in_leaf': 75, 'n_estimators': 8026}
CPU times: total: 1d 6h 28min 13s
Wall time: 1h 18s

Best parameters of producer: {'cat_l2': 32, 'cat_smooth': 42, 'drop_rate': 0.906209935681394, 'lambda_l1': 0.4237563717896653, 'lambda_l2': 2.5833405443842152, 'learning_rate': 0.10961104370080788, 'max_bin': 479, 'max_cat_threshold': 33, 'min_data_in_leaf': 36, 'min_data_per_group': 194, 'n_estimators': 5089, 'num_leaves': 109, 'skip_drop': 0.5961399837369368}
Best parameters of consumer: {'cat_l2': 94, 'cat_smooth': 47, 'drop_rate': 0.709329079259287, 'lambda_l1': 0.19543861233583915, 'lambda_l2': 3.5465108162748504, 'learning_rate': 0.15102639777091226, 'max_bin': 415, 'max_cat_threshold': 18, 'min_data_in_leaf': 27, 'min_data_per_group': 191, 'n_estimators': 5615, 'num_leaves': 10, 'skip_drop': 0.6854847756175649}


Best parameters of producer: {'lambda_l1': 1.0514735056151499, 'lambda_l2': 0.6904243319535714, 'learning_rate': 0.12970053306048324, 'max_bin': 836, 'min_data_in_leaf': 97, 'n_estimators': 4835, 'num_leaves': 107}

Best parameters: {'lambda_l1': 0.8721717874032273, 'lambda_l2': 0.1830954872642523, 'learning_rate': 0.09272080548316208, 'max_bin': 670, 'min_data_in_leaf': 34, 'n_estimators': 2137, 'num_leaves': 88}

In [43]:
from lightgbm import LGBMRegressor
def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

def train_cv(df):
    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]
        
        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it       
        
        params = {'lambda_l1': 0.8721717874032273, 'lambda_l2': 0.1830954872642523, 'learning_rate': 0.09272080548316208, 'max_bin': 670, 'min_data_in_leaf': 34, 'n_estimators': 2137, 'num_leaves': 88, 
                    'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie'}
        clf = LGBMRegressor(**params, random_state=69, verbose=0, importance_type='gain')

        clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        y_pred = clf.predict(df_train_data)

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error_consumption:", mae)

        y_pred_val = clf.predict(df_val_data2)

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        
        # importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        # importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [44]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
 Train Mean Absolute Error_consumption: 24.71570458901916
Val Mean Absolute Error: 42.74074701715273


Fold 1
Train rows: 1304266
Val rows: 173328
 Train Mean Absolute Error_consumption: 25.58324225568654
Val Mean Absolute Error: 36.02200963137713


Fold 2
Train rows: 1480810
Val rows: 169632
 Train Mean Absolute Error_consumption: 25.684241005132446
Val Mean Absolute Error: 38.6992003907362


Fold 3
Train rows: 1653658
Val rows: 167820
 Train Mean Absolute Error_consumption: 25.868463688973456
Val Mean Absolute Error: 53.53164220498895


Fold 4
Train rows: 1824598
Val rows: 176496
 Train Mean Absolute Error_consumption: 27.06736209195623
Val Mean Absolute Error: 72.42711868608906




In [63]:
d = datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]
d.days//14
dt.timedelta(days=0)

datetime.timedelta(0)

In [66]:
for f in range(((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days//14)):
    start = datetime_cv_ranges[i][0] + dt.timedelta(days=f*14)
    stop = datetime_cv_ranges[i][0] + dt.timedelta(days=(f+1)*14)
    train = processed_df_no_na[date_filter <= start]
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > start)]
    print(start)
    print(stop)
    print(len(train))
    print(len(val))

2023-04-05 00:00:00
2023-04-19 00:00:00
1824598
43392
2023-04-19 00:00:00
2023-05-03 00:00:00
1867990
43296
2023-05-03 00:00:00
2023-05-17 00:00:00
1911286
45312
2023-05-17 00:00:00
2023-05-31 00:00:00
1956598
44496


In [67]:
from lightgbm import LGBMRegressor
def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

train_pred_list = []
train_mae_list = []
train_targets_list = []

pred_list = []
mae_list = []
val_targets_list = []

df = processed_df_no_na
i=4
for f in range(((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days//14)):
    start = datetime_cv_ranges[i][0] + dt.timedelta(days=f*14)
    stop = datetime_cv_ranges[i][0] + dt.timedelta(days=(f+1)*14)
    train = processed_df_no_na[date_filter <= start]
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > start)]
    
    print(f"Fold {i}, period {f}")
    print(f"Train rows: {len(train)}")
    print(f"Val rows: {len(val)}")

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                'snowfall_fw', 'snowfall_hw_means']

    df_train_target = train[target_cols]
    df_train_data = train.drop(drop_cols, axis=1)

    df_val_target2 = val[target_cols]
    df_val_data2 = val.drop(drop_cols, axis=1)

    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
           'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
            'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]

    # We leave max_depth as -1
    # Tune num_leaves, default is 31, let's double it       

    params = {'lambda_l1': 0.8721717874032273, 'lambda_l2': 0.1830954872642523, 'learning_rate': 0.09272080548316208, 'max_bin': 670, 'min_data_in_leaf': 34, 'n_estimators': 2137, 'num_leaves': 88, 
                'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie'}
    
    clf = LGBMRegressor(**params, random_state=69, verbose=0, importance_type='gain')

    clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

    y_pred = clf.predict(df_train_data)
    train_pred_list.append(y_pred)

    from sklearn.metrics import mean_absolute_error

    # Assuming you have two pandas Series: y_true and y_pred
    mae = mean_absolute_error(df_train_target.target, y_pred)
    train_mae_list.append(mae)
    train_targets_list.append(df_train_target.target)
    print(f" Train Mean Absolute Error_consumption:", mae)

    y_pred_val = clf.predict(df_val_data2)
    pred_list.append(y_pred_val)

    mae = mean_absolute_error(df_val_target2.target, y_pred_val)
    val_targets_list.append(df_val_target2.target)
    mae_list.append(mae)
    print("Val Mean Absolute Error:", mae)

# importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
# importance = importance.sort_values('importance', ascending=False)
# display(importance.head(30))
# display(importance.tail(30))
print()
print()

Fold 4, period 0
Train rows: 1824598
Val rows: 43392
 Train Mean Absolute Error_consumption: 27.06736209195623
Val Mean Absolute Error: 59.52748165599775
Fold 4, period 1
Train rows: 1867990
Val rows: 43296
 Train Mean Absolute Error_consumption: 27.42106436505342
Val Mean Absolute Error: 63.889947247905894
Fold 4, period 2
Train rows: 1911286
Val rows: 45312
 Train Mean Absolute Error_consumption: 27.91180558238793
Val Mean Absolute Error: 69.0872864860352
Fold 4, period 3
Train rows: 1956598
Val rows: 44496
 Train Mean Absolute Error_consumption: 28.437498536613166
Val Mean Absolute Error: 79.94249878022536




In [69]:
np.mean(mae_list)

68.11180354254105

In [70]:
for f in range((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days):
    
    start = datetime_cv_ranges[i][0] + dt.timedelta(days=f)
    stop = datetime_cv_ranges[i][0] + dt.timedelta(days=(f+1))
    train = processed_df_no_na[date_filter <= start]
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > start)]
    print(start)
    print(stop)
    print(len(train))
    print(len(val))

2023-04-05 00:00:00
2023-04-19 00:00:00
1824598
43392
2023-04-19 00:00:00
2023-05-03 00:00:00
1867990
43296
2023-05-03 00:00:00
2023-05-17 00:00:00
1911286
45312
2023-05-17 00:00:00
2023-05-31 00:00:00
1956598
44496


In [73]:
date_column = 'datetime'
for f in range((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days):
    
    current_date = datetime_cv_ranges[i][0] + dt.timedelta(days=f)
    start_period = current_date - dt.timedelta(days=30)
    
    # Training data for the current year
    train_current_year = processed_df_no_na[(date_filter <= current_date) & (date_filter > start_period)]
    
    # Training data for the same period in previous years
    train_previous_years = pd.DataFrame()
    for year in range(data_processor.df_all_cols.date.dt.year.min(), current_date.year):
        start_previous = current_date.replace(year=year) - dt.timedelta(days=30)
        end_previous = current_date.replace(year=year) + dt.timedelta(days=30)
        train_previous_year = processed_df_no_na[(date_filter > start_previous) & (date_filter <= end_previous)]
        train_previous_years = pd.concat([train_previous_years, train_previous_year])
    
    # Combine training data
    train = pd.concat([train_current_year, train_previous_years])

    # Validation data (the next day)
    stop = current_date + dt.timedelta(days=1)
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > current_date)]
    
    print(current_date)
    print(stop)
    print(len(train))
    print(len(val))

2023-04-05 00:00:00
2023-04-06 00:00:00
287590
3120
2023-04-06 00:00:00
2023-04-07 00:00:00
287590
3072
2023-04-07 00:00:00
2023-04-08 00:00:00
287542
3120
2023-04-08 00:00:00
2023-04-09 00:00:00
287542
3072
2023-04-09 00:00:00
2023-04-10 00:00:00
287494
3072
2023-04-10 00:00:00
2023-04-11 00:00:00
287446
3120
2023-04-11 00:00:00
2023-04-12 00:00:00
287446
3120
2023-04-12 00:00:00
2023-04-13 00:00:00
287446
3120
2023-04-13 00:00:00
2023-04-14 00:00:00
287446
3120
2023-04-14 00:00:00
2023-04-15 00:00:00
287446
3072
2023-04-15 00:00:00
2023-04-16 00:00:00
287398
3072
2023-04-16 00:00:00
2023-04-17 00:00:00
287350
3072
2023-04-17 00:00:00
2023-04-18 00:00:00
287302
3120
2023-04-18 00:00:00
2023-04-19 00:00:00
287302
3120
2023-04-19 00:00:00
2023-04-20 00:00:00
287302
3120
2023-04-20 00:00:00
2023-04-21 00:00:00
287302
3120
2023-04-21 00:00:00
2023-04-22 00:00:00
287302
3072
2023-04-22 00:00:00
2023-04-23 00:00:00
287254
3072
2023-04-23 00:00:00
2023-04-24 00:00:00
287206
3072
2023-04-24 0

In [76]:
from lightgbm import LGBMRegressor
def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

train_pred_list = []
train_mae_list = []
train_targets_list = []

pred_list = []
mae_list = []
val_targets_list = []

df = processed_df_no_na
i=4
for f in range((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days):
    
    current_date = datetime_cv_ranges[i][0] + dt.timedelta(days=f)
    start_period = current_date - dt.timedelta(days=30)
    
    # Training data for the current year
    train_current_year = processed_df_no_na[(date_filter <= current_date) & (date_filter > start_period)]
    
    # Training data for the same period in previous years
    train_previous_years = pd.DataFrame()
    for year in range(data_processor.df_all_cols.date.dt.year.min(), current_date.year):
        start_previous = current_date.replace(year=year) - dt.timedelta(days=30)
        end_previous = current_date.replace(year=year) + dt.timedelta(days=30)
        train_previous_year = processed_df_no_na[(date_filter > start_previous) & (date_filter <= end_previous)]
        train_previous_years = pd.concat([train_previous_years, train_previous_year])
    
    # Combine training data
    train = pd.concat([train_current_year, train_previous_years])

    # Validation data (the next day)
    stop = current_date + dt.timedelta(days=1)
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > current_date)]
    
    print(current_date)
    print(stop)
    print(len(train))
    print(len(val))
    
    print(f"Fold {i}, period {f}")
    print(f"Train rows: {len(train)}")
    print(f"Val rows: {len(val)}")

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                'snowfall_fw', 'snowfall_hw_means']

    df_train_target = train[target_cols]
    df_train_data = train.drop(drop_cols, axis=1)

    df_val_target2 = val[target_cols]
    df_val_data2 = val.drop(drop_cols, axis=1)

    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
           'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
            'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]

    # We leave max_depth as -1
    # Tune num_leaves, default is 31, let's double it       

    params = {'lambda_l1': 0.8721717874032273, 'lambda_l2': 0.1830954872642523, 'learning_rate': 0.09272080548316208, 'max_bin': 670, 'min_data_in_leaf': 34, 'n_estimators': 2137, 'num_leaves': 88, 
                'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie'}
    
    clf = LGBMRegressor(**params, random_state=69, verbose=0, importance_type='gain')

    clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

    y_pred = clf.predict(df_train_data)
    train_pred_list.append(y_pred)

    from sklearn.metrics import mean_absolute_error

    # Assuming you have two pandas Series: y_true and y_pred
    mae = mean_absolute_error(df_train_target.target, y_pred)
    train_mae_list.append(mae)
    train_targets_list.append(df_train_target.target)
    print(f" Train Mean Absolute Error_consumption:", mae)

    y_pred_val = clf.predict(df_val_data2)
    pred_list.append(y_pred_val)

    mae = mean_absolute_error(df_val_target2.target, y_pred_val)
    val_targets_list.append(df_val_target2.target)
    mae_list.append(mae)
    print("Val Mean Absolute Error:", mae)

# importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
# importance = importance.sort_values('importance', ascending=False)
# display(importance.head(30))
# display(importance.tail(30))
print()
print()

2023-04-05 00:00:00
2023-04-06 00:00:00
287590
3120
Fold 4, period 0
Train rows: 287590
Val rows: 3120
 Train Mean Absolute Error_consumption: 22.681794444446826
Val Mean Absolute Error: 65.97163153719124
2023-04-06 00:00:00
2023-04-07 00:00:00
287590
3072
Fold 4, period 1
Train rows: 287590
Val rows: 3072
 Train Mean Absolute Error_consumption: 22.90125551465204
Val Mean Absolute Error: 141.70186717051894
2023-04-07 00:00:00
2023-04-08 00:00:00
287542
3120
Fold 4, period 2
Train rows: 287542
Val rows: 3120
 Train Mean Absolute Error_consumption: 23.017489379950625
Val Mean Absolute Error: 75.94449504334341
2023-04-08 00:00:00
2023-04-09 00:00:00
287542
3072
Fold 4, period 3
Train rows: 287542
Val rows: 3072
 Train Mean Absolute Error_consumption: 22.92264461464641
Val Mean Absolute Error: 82.1803252048764
2023-04-09 00:00:00
2023-04-10 00:00:00
287494
3072
Fold 4, period 4
Train rows: 287494
Val rows: 3072
 Train Mean Absolute Error_consumption: 22.784107036971125
Val Mean Absolute Er

In [77]:
np.mean(mae_list)

68.21711473593948

In [78]:
from lightgbm import LGBMRegressor
def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

train_pred_list = []
train_mae_list = []
train_targets_list = []

pred_list = []
mae_list = []
val_targets_list = []

df = processed_df_no_na
i=4
for f in range(((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days//14)):
    start = datetime_cv_ranges[i][0] + dt.timedelta(days=f*14)
    stop = datetime_cv_ranges[i][0] + dt.timedelta(days=(f+1)*14)
    train = processed_df_no_na[date_filter <= start]
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > start)]
    
    print(f"Fold {i}, period {f}")
    print(f"Train rows: {len(train)}")
    print(f"Val rows: {len(val)}")

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                'snowfall_fw', 'snowfall_hw_means']

    df_train_target = train[target_cols]
    df_train_data = train.drop(drop_cols, axis=1)

    df_val_target2 = val[target_cols]
    df_val_data2 = val.drop(drop_cols, axis=1)

    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
           'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
            'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]

    # We leave max_depth as -1
    # Tune num_leaves, default is 31, let's double it       

    params = {'lambda_l1': 0.7466999841658806, 'lambda_l2': 3.2140838539606458, 'learning_rate': 0.13753679743025782, 'max_bin': 723, 'min_data_in_leaf': 150, 'n_estimators': 5593,  
                'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie'}
    
    clf = LGBMRegressor(**params, random_state=42, verbose=0, importance_type='gain')

    clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

    y_pred = clf.predict(df_train_data)
    train_pred_list.append(y_pred)

    from sklearn.metrics import mean_absolute_error

    # Assuming you have two pandas Series: y_true and y_pred
    mae = mean_absolute_error(df_train_target.target, y_pred)
    train_mae_list.append(mae)
    train_targets_list.append(df_train_target.target)
    print(f" Train Mean Absolute Error_consumption:", mae)

    y_pred_val = clf.predict(df_val_data2)
    pred_list.append(y_pred_val)

    mae = mean_absolute_error(df_val_target2.target, y_pred_val)
    val_targets_list.append(df_val_target2.target)
    mae_list.append(mae)
    print("Val Mean Absolute Error:", mae)

# importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
# importance = importance.sort_values('importance', ascending=False)
# display(importance.head(30))
# display(importance.tail(30))
print()
print()

Fold 4, period 0
Train rows: 1824598
Val rows: 43392
 Train Mean Absolute Error_consumption: 22.00299332488253
Val Mean Absolute Error: 56.94965311542513
Fold 4, period 1
Train rows: 1867990
Val rows: 43296
 Train Mean Absolute Error_consumption: 22.219570541125417
Val Mean Absolute Error: 60.67563387821498
Fold 4, period 2
Train rows: 1911286
Val rows: 45312
 Train Mean Absolute Error_consumption: 22.784159094052434
Val Mean Absolute Error: 65.27616255068035
Fold 4, period 3
Train rows: 1956598
Val rows: 44496
 Train Mean Absolute Error_consumption: 23.093817122533373
Val Mean Absolute Error: 76.85483575387101




In [79]:
np.mean(mae_list)

64.93907132454788

In [26]:
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb

def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

def train_cv(df):
    for i in [4]:
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        train = train.dropna()
        val = val.dropna()
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]  
        
        for feature in cat_features:
            df_train_data[feature] = df_train_data[feature].astype('category')
            df_val_data2[feature] = df_val_data2[feature].astype('category')
        
        
        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it       
        
        params = {'lambda_l1': 0.7466999841658806, 'lambda_l2': 3.2140838539606458, 'learning_rate': 0.13753679743025782, 'max_bin': 723, 'min_data_in_leaf': 150, 'n_estimators': 5593,  
                'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie'}
        
        clf_consumer = VotingRegressor([
            ('lgb_0', LGBMRegressor(**params, random_state=42, verbose=-1, )),
            ('lgb_1', LGBMRegressor(**params, random_state=69, verbose=-1, )),
            ('lgb_2', LGBMRegressor(**params, random_state=1337, verbose=-1, )), 
            ('lgb_3', LGBMRegressor(**params, random_state=124, verbose=-1, )),
            ('lgb_4', LGBMRegressor(**params, random_state=12351, verbose=-1, ))
            ], weights=[0.2,0.2,0.2,0.2,0.2])

        clf_consumer.fit(df_train_data, df_train_target.target)
        # clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target, categorical_feature=cat_features)
        
        # clf_consumer = lgb.train(params_consumer, dtrain)
        # preds = gbm.predict(df_val_data2)
        # mae = mean_absolute_error(df_val_target2["target"], preds)

        y_pred = clf_consumer.predict(df_train_data)
        # y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
        # y_pred2 = y_pred.copy()
        # y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error_consumption:", mae)
        # mae = mean_absolute_error(df_train_target.target, y_pred2)
        # print(f" Train Mean w Producer Absolute Error:", mae)

        y_pred_val = clf_consumer.predict(df_val_data2)
        # y_pred_val_producer = clf_producer.predict(df_val_data2[df_val_data2.is_consumption==0])
        # y_pred_val2 = y_pred_val.copy()
        # y_pred_val2[df_val_data2.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        # mae = mean_absolute_error(df_val_target2.target, y_pred_val2)
        # print("Val Mean w Producer Absolute Error:", mae)
        
        # importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        # importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [27]:
train_cv(processed_df_no_na)

Fold 4
Train rows: 1824598
Val rows: 176496
 Train Mean Absolute Error_consumption: 21.690591415319332
Val Mean Absolute Error: 66.56959528179792




### LGBM With linear Trees??

In [19]:
from lightgbm import LGBMRegressor
import lightgbm as lgb

def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

def train_cv(df):
    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        train = train.dropna()
        val = val.dropna()
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]
        
        dtrain = lgb.Dataset(df_train_data, label=df_train_target["target"], params={
                            'linear_tree': True
                        }, categorical_feature=cat_features)
        dtest = lgb.Dataset(df_val_data2, label=df_val_target2["target"], params={
                                'linear_tree': True
                            }, categorical_feature=cat_features)
        
        
        
        
        
        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it       
        
        params_consumer = {'lambda_l1': 0.2, 'lambda_l2': 0.2, 'learning_rate': 0.13753679743025782, 'max_bin': 100, 'min_data_in_leaf': 150, 'n_estimators': 1000, 
                    'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie'}
        params_producer = {'lambda_l1': 1.0514735056151499, 'lambda_l2': 0.6904243319535714, 'learning_rate': 0.12970053306048324, 'max_bin': 836, 'min_data_in_leaf': 97, 'n_estimators': 4835, 'num_leaves': 107, 
                    'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie'}
        # clf_consumer = LGBMRegressor(**params_consumer, random_state=69, verbose=0, importance_type='gain')
        # clf_producer = LGBMRegressor(**params_producer, random_state=1337, verbose=0, importance_type='gain')

        # clf_consumer.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)
        # clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target, categorical_feature=cat_features)
        
        clf_consumer = lgb.train(params_consumer, dtrain)
        # preds = gbm.predict(df_val_data2)
        # mae = mean_absolute_error(df_val_target2["target"], preds)

        y_pred = clf_consumer.predict(df_train_data)
        # y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
        # y_pred2 = y_pred.copy()
        # y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error_consumption:", mae)
        # mae = mean_absolute_error(df_train_target.target, y_pred2)
        # print(f" Train Mean w Producer Absolute Error:", mae)

        y_pred_val = clf_consumer.predict(df_val_data2)
        # y_pred_val_producer = clf_producer.predict(df_val_data2[df_val_data2.is_consumption==0])
        # y_pred_val2 = y_pred_val.copy()
        # y_pred_val2[df_val_data2.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        # mae = mean_absolute_error(df_val_target2.target, y_pred_val2)
        # print("Val Mean w Producer Absolute Error:", mae)
        
        # importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        # importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [20]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11637
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 132
[LightGBM] [Info] Start training from score 5.523564
 Train Mean Absolute Error_consumption: 32.40664367344905
Val Mean Absolute Error: 937.1609359664891


Fold 1
Train rows: 1304266
Val rows: 173328




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044339 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11645
[LightGBM] [Info] Number of data points in the train set: 1304266, number of used features: 132
[LightGBM] [Info] Start training from score 5.525174
 Train Mean Absolute Error_consumption: 33.29096997092183
Val Mean Absolute Error: 1.1387595629733573e+36


Fold 2
Train rows: 1480810
Val rows: 169632




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057728 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11638
[LightGBM] [Info] Number of data points in the train set: 1480810, number of used features: 132
[LightGBM] [Info] Start training from score 5.543922


KeyboardInterrupt: 

In [13]:
%%time

from lightgbm import LGBMRegressor
import random
import lightgbm as lgb
import optuna


i=3
train = processed_df_no_na[date_filter <= datetime_cv_ranges[i][0]]
val = processed_df_no_na[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
print(f"Fold {i}")
print(f"Train rows: {len(train)}")
print(f"Val rows: {len(val)}")

target_cols = ['target', 'target_installed_capacity']
drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
            'snowfall_fw', 'snowfall_hw_means']

df_train_target = train[target_cols]
df_train_data = train.drop(drop_cols, axis=1)

df_val_target2 = val[target_cols]
df_val_data2 = val.drop(drop_cols, axis=1)

cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
cat_features = [c for c in cat_features if c in df_train_data.columns]

def objective_function(trial):
    
    dtrain = lgb.Dataset(df_train_data, label=df_train_target["target"], params={
                            'linear_tree': True
                        }, categorical_feature=cat_features)
    dtest = lgb.Dataset(df_val_data2, label=df_val_target2["target"], params={
                            'linear_tree': True
                        }, categorical_feature=cat_features)

    param = {
        "objective": "tweedie",
        "metric": "mae",
        "verbosity": 1,
        "boosting_type": "dart",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 5.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 5.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5),
        'max_bin': trial.suggest_int('max_bin', 100, 1000),
        'n_estimators': trial.suggest_int('n_estimators', 2000, 3500),
        'random_state': 114
    }

    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(df_val_data2)
    mae = mean_absolute_error(df_val_target2["target"], preds)
    return mae

study = optuna.create_study(direction="minimize")





[I 2024-01-03 14:53:59,382] A new study created in memory with name: no-name-ea7ce979-bc6d-4c1c-91b6-c47a67592a33


Fold 3
Train rows: 1653658
Val rows: 167820
CPU times: total: 375 ms
Wall time: 940 ms


In [None]:
study.optimize(objective_function, n_trials=50, n_jobs=20, show_progress_bar=True)

  0%|          | 0/50 [00:00<?, ?it/s]



[W 2024-01-03 15:11:39,460] Trial 11 failed with parameters: {'lambda_l1': 8.20358307868506e-05, 'lambda_l2': 0.054022428578548505, 'num_leaves': 107, 'feature_fraction': 0.779661219119671, 'bagging_fraction': 0.435566965568534, 'bagging_freq': 1, 'min_child_samples': 25, 'learning_rate': 0.24338865973130844, 'max_bin': 551, 'n_estimators': 2858} because of the following error: LightGBMError('Check failed: (best_split_info.left_count) > (0) at C:\\b\\abs_40p8j029wh\\croot\\lightgbm_1700267980215\\work\\src\\treelearner\\serial_tree_learner.cpp, line 845 .\n').
Traceback (most recent call last):
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "<timed exec>", line 56, in objective_function
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages\lightgbm\engine.py", line 276, in train
    booster.update(fobj=fobj)
  File "C:\Users\mskel\.conda\envs\kaggle310\lib\site-packages

In [15]:
lgb.__version__

'4.1.0'

In [None]:
%%time

# https://www.kaggle.com/code/chaozhuang/enefit-eda-w-fft-ssa-arima-lgbm?scriptVersionId=156414824#Predictive-Modelling
from lightgbm import LGBMRegressor
import random
import lightgbm as lgb
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

def tune_lgbm_model(base_params, X_train, y_train, n_iter=8, cv=3):
    """
    Tune a LightGBM model based on a base set of parameters.

    :param base_params: Dictionary of base parameters for the model
    :param X_train: Training features
    :param y_train: Training target variable
    :param n_iter: Number of iterations for RandomizedSearchCV
    :param cv: Number of cross-validation folds
    :return: Best estimator and best parameters
    """
    # Parameter distributions for random search
    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]
    
    param_dist = {
        'learning_rate': sp_uniform(0.005, 0.5),
        'lambda_l1': sp_uniform(0, 5), 
        'lambda_l2': sp_uniform(0, 5), 
        'max_bin': sp_randint(100, 1000),
        'min_data_in_leaf': sp_randint(15, 300),
        'n_estimators': sp_randint(1000, 3500),
        
        'colsample_bytree' : sp_uniform(0.1, 1),
        'colsample_bynode' : sp_uniform(0.1, 1)
        # 'data_sample_strategy' : ['bagging', 'goss'],
        # 'num_leaves': sp_randint(1, 200),
        # 'drop_rate': sp_uniform(0, 1),
        # 'skip_drop': sp_uniform(0, 1),
        # 'min_data_per_group': sp_randint(10, 200),
        # 'max_cat_threshold': sp_randint(10, 100),
        # 'cat_l2': sp_randint(10, 100),
        # 'cat_smooth': sp_randint(10, 100),
    }

    # Create a LightGBM regressor object
    lgb_reg = lgb.LGBMRegressor(**base_params)

    # Create a RandomizedSearchCV object
    random_search = HalvingRandomSearchCV(estimator=lgb_reg, param_distributions=param_dist,
                                       scoring='neg_mean_absolute_error',
                                       cv=cv, random_state=222, verbose=1,
                                         aggressive_elimination= True,
                                         max_resources=30000, min_resources=5)

    # Fit the random search to the data
    random_search.fit(X_train, y_train, categorical_feature=cat_features)

    # Return the best estimator and best parameters
    return random_search.best_estimator_, random_search.best_params_

base_params_p1 = {
    'verbose': -1,
    'metric': 'mae',
    'n_jobs': 32,
    'boosting': 'dart',
    'objective': 'tweedie'
}

i=4
train = processed_df_no_na#[(date_filter <= to_datetime('2023-03-30'))]
val = processed_df_no_na[(date_filter > to_datetime('2023-03-30'))]
print(f"Fold {i}")
print(f"Train rows: {len(train)}")
print(f"Val rows: {len(val)}")

target_cols = ['target', 'target_installed_capacity']
drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
            'snowfall_fw', 'snowfall_hw_means']

df_train_target = train[target_cols]
df_train_data = train.drop(drop_cols, axis=1)

df_val_target2 = val[target_cols]
df_val_data2 = val.drop(drop_cols, axis=1)

cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
cat_features = [c for c in cat_features if c in df_train_data.columns]

# Fit the model
best_model, best_params = tune_lgbm_model(base_params_p1, df_train_data, df_train_target["target"])

print("Best parameters of p1:", best_params)

In [20]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

def train_cv(df):
    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]
        
        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it
        params = {'boosting': 'dart', 'lambda_l1': 0.6864834137130547, 'lambda_l2': 0.505513780743504, 'learning_rate': 0.2549750586501872, 'max_bin': 467, 'min_data_in_leaf': 107, 'n_estimators': 3559, 
                  'num_iterations': 1142, 'objective': 'regression_l1', 'reg_sqrt': False}
        clf2 = LGBMRegressor(**params, random_state=42, verbose=0, n_jobs=15, importance_type='gain')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        from sklearn.metrics import mean_absolute_error
        
        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        y_pred
        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf2.predict(df_val_data2)
        y_pred_val

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)
        
        importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [21]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264




###############   Target   #################
For fold 0: Train Mean Absolute Error: 32.96459253631031
For fold 0: Fold Val Mean Absolute Error: 45.85703176069207


Fold 1
Train rows: 1304266
Val rows: 173328




###############   Target   #################
For fold 1: Train Mean Absolute Error: 33.59994364426004
For fold 1: Fold Val Mean Absolute Error: 37.43359879861767


Fold 2
Train rows: 1480810
Val rows: 169632




###############   Target   #################
For fold 2: Train Mean Absolute Error: 33.48656296306161
For fold 2: Fold Val Mean Absolute Error: 37.21805932766326


Fold 3
Train rows: 1653658
Val rows: 167820




###############   Target   #################
For fold 3: Train Mean Absolute Error: 33.10615061442309
For fold 3: Fold Val Mean Absolute Error: 56.77602215130679


Fold 4
Train rows: 1824598
Val rows: 176496




###############   Target   #################
For fold 4: Train Mean Absolute Error: 34.82524925935727
For fold 4: Fold Val Mean Absolute Error: 75.67205978019075




In [22]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

def train_cv(df):
    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]
        
        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it
        params = {'boosting': 'dart', 'lambda_l1': 0.5495562797837548, 'lambda_l2': 1.162175586039744, 'learning_rate': 0.27395000326947383, 'max_bin': 466, 'min_data_in_leaf': 136, 'n_estimators': 6132, 'num_iterations': 1891, 'objective': 'tweedie', 'reg_sqrt': True}
        clf2 = LGBMRegressor(**params, random_state=42, verbose=0, n_jobs=15, importance_type='gain')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        from sklearn.metrics import mean_absolute_error
        
        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        y_pred
        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf2.predict(df_val_data2)
        y_pred_val

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)
        
        importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [23]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264




###############   Target   #################
For fold 0: Train Mean Absolute Error: 23.908998906841216
For fold 0: Fold Val Mean Absolute Error: 43.68238625188157


Fold 1
Train rows: 1304266
Val rows: 173328




###############   Target   #################
For fold 1: Train Mean Absolute Error: 25.015406084614412
For fold 1: Fold Val Mean Absolute Error: 35.94291279404007


Fold 2
Train rows: 1480810
Val rows: 169632




###############   Target   #################
For fold 2: Train Mean Absolute Error: 24.93726276349945
For fold 2: Fold Val Mean Absolute Error: 36.87396909225792


Fold 3
Train rows: 1653658
Val rows: 167820




###############   Target   #################
For fold 3: Train Mean Absolute Error: 25.052413785402987
For fold 3: Fold Val Mean Absolute Error: 54.21615383175424


Fold 4
Train rows: 1824598
Val rows: 176496




###############   Target   #################
For fold 4: Train Mean Absolute Error: 26.13208833585263
For fold 4: Fold Val Mean Absolute Error: 70.00967504031478




In [20]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

def train_cv(df):
    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]
        
        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it
        params = {'boosting': 'dart', 'lambda_l1': 0.9463464945326672, 'lambda_l2': 0.5311225677898304, 'learning_rate': 0.42981688122448874, 'max_bin': 701, 'min_data_in_leaf': 295, 'n_estimators': 6289, 
                  'num_iterations': 1887, 'objective': 'tweedie', 'reg_sqrt': True}
        clf2 = LGBMRegressor(**params, random_state=42, verbose=0, n_jobs=32, importance_type='gain')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        from sklearn.metrics import mean_absolute_error
        
        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        y_pred
        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf2.predict(df_val_data2)
        y_pred_val

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)
        
        importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [21]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264




###############   Target   #################
For fold 0: Train Mean Absolute Error: 20.845407931843443
For fold 0: Fold Val Mean Absolute Error: 43.01871825089143


Fold 1
Train rows: 1304266
Val rows: 173328




###############   Target   #################
For fold 1: Train Mean Absolute Error: 21.765199573931195
For fold 1: Fold Val Mean Absolute Error: 35.738415800476595


Fold 2
Train rows: 1480810
Val rows: 169632




###############   Target   #################
For fold 2: Train Mean Absolute Error: 21.92315720828745
For fold 2: Fold Val Mean Absolute Error: 35.52263853867839


Fold 3
Train rows: 1653658
Val rows: 167820




###############   Target   #################
For fold 3: Train Mean Absolute Error: 21.906708132439025
For fold 3: Fold Val Mean Absolute Error: 55.80596846670963


Fold 4
Train rows: 1824598
Val rows: 176496




###############   Target   #################
For fold 4: Train Mean Absolute Error: 22.822691492560484
For fold 4: Fold Val Mean Absolute Error: 72.45696756237643




In [22]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

def train_cv(df):
    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]
        
        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it
        params = {'boosting': 'dart', 'lambda_l1': 0.5350237641949025, 'lambda_l2': 0.2952089055653504, 'learning_rate': 0.11903395875401224, 'max_bin': 638, 'min_data_in_leaf': 145, 'n_estimators': 5120, 
                  'num_iterations': 2053, 'objective': 'tweedie', 'reg_sqrt': True}
        clf2 = LGBMRegressor(**params, random_state=42, verbose=0, n_jobs=32, importance_type='gain')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        from sklearn.metrics import mean_absolute_error
        
        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        y_pred
        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf2.predict(df_val_data2)
        y_pred_val

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)
        
        importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [23]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264




###############   Target   #################
For fold 0: Train Mean Absolute Error: 23.908998906841216
For fold 0: Fold Val Mean Absolute Error: 43.68238625188157


Fold 1
Train rows: 1304266
Val rows: 173328




###############   Target   #################
For fold 1: Train Mean Absolute Error: 25.015406084614412
For fold 1: Fold Val Mean Absolute Error: 35.94291279404007


Fold 2
Train rows: 1480810
Val rows: 169632




###############   Target   #################
For fold 2: Train Mean Absolute Error: 24.93726276349945
For fold 2: Fold Val Mean Absolute Error: 36.87396909225792


Fold 3
Train rows: 1653658
Val rows: 167820




###############   Target   #################
For fold 3: Train Mean Absolute Error: 25.052413785402987
For fold 3: Fold Val Mean Absolute Error: 54.21615383175424


Fold 4
Train rows: 1824598
Val rows: 176496




###############   Target   #################
For fold 4: Train Mean Absolute Error: 26.13208833585263
For fold 4: Fold Val Mean Absolute Error: 70.00967504031478




Best Parameter runs:


```Best parameters of p1: {'boosting': 'dart', 'lambda_l1': 1.7896547008552977, 'lambda_l2': 1.1957999576221703, 'learning_rate': 0.1882811352534675, 'max_bin': 314, 'min_data_in_leaf': 54, 'num_iterations': 824} = LGBMRegressor(random_state=42, boosting='dart', n_estimators=2500, verbose=0, n_jobs=15, objective='l2', importance_type='gain', lambda_l1= 1.7896547008552977, lambda_l2= 1.1957999576221703, learning_rate= 0.1882811352534675, max_bin= 314, min_data_in_leaf= 54, num_iterations= 824)```

```Best parameters of p1: {'boosting': 'gbrt', 'lambda_l1': 0.34672930701554416, 'lambda_l2': 0.7821212151464816, 'learning_rate': 0.07733541316820935, 'max_bin': 487, 'min_data_in_leaf': 16, 'n_estimators': 2021, 'num_iterations': 489, 'objective': 'tweedie', 'reg_sqrt': False}```

```Best parameters of p1: {'boosting': 'gbrt', 'lambda_l1': 1.9207022987271727, 'lambda_l2': 0.10588062856381697, 'learning_rate': 0.12891341726246747, 'max_bin': 401, 'min_data_in_leaf': 76, 'n_estimators': 2300, 'num_iterations': 730, 'objective': 'tweedie', 'reg_sqrt': True}```

```Best parameters of p1: {'boosting': 'dart', 'lambda_l1': 1.1928653738419932, 'lambda_l2': 1.6746474446216606, 'learning_rate': 0.2727459158987165, 'max_bin': 145, 'min_data_in_leaf': 122, 'n_estimators': 6128, 'num_iterations': 1765, 'objective': 'tweedie', 'reg_sqrt': False}```

```Best parameters of p1: {'boosting': 'dart', 'lambda_l1': 0.6864834137130547, 'lambda_l2': 0.505513780743504, 'learning_rate': 0.2549750586501872, 'max_bin': 467, 'min_data_in_leaf': 107, 'n_estimators': 3559, 'num_iterations': 1142, 'objective': 'regression_l1', 'reg_sqrt': False}```

```Best parameters of p1: {'boosting': 'dart', 'lambda_l1': 0.5495562797837548, 'lambda_l2': 1.162175586039744, 'learning_rate': 0.27395000326947383, 'max_bin': 466, 'min_data_in_leaf': 136, 'n_estimators': 6132, 'num_iterations': 1891, 'objective': 'tweedie', 'reg_sqrt': True}```

```Best parameters of p1: {'boosting': 'dart', 'lambda_l1': 0.9463464945326672, 'lambda_l2': 0.5311225677898304, 'learning_rate': 0.42981688122448874, 'max_bin': 701, 'min_data_in_leaf': 295, 'n_estimators': 6289, 'num_iterations': 1887, 'objective': 'tweedie', 'reg_sqrt': True}```

```Best parameters of p1: {'boosting': 'dart', 'lambda_l1': 0.5350237641949025, 'lambda_l2': 0.2952089055653504, 'learning_rate': 0.11903395875401224, 'max_bin': 638, 'min_data_in_leaf': 145, 'n_estimators': 5120, 'num_iterations': 2053, 'objective': 'tweedie', 'reg_sqrt': True}```


```Best parameters of p1: {'lambda_l1': 1.1210308999481333, 'lambda_l2': 1.3202715549769235, 'learning_rate': 0.49368057640644114, 'max_bin': 285, 'min_data_in_leaf': 278, 'n_estimators': 4430, 'reg_sqrt': True, 'verbose': -1, 'metric': 'mae', 'n_jobs': 32, 'boosting': 'dart', 'objective': 'tweedie'}```

```Best parameters of p1: {'lambda_l1': 0.7466999841658806, 'lambda_l2': 3.2140838539606458, 'learning_rate': 0.13753679743025782, 'max_bin': 723, 'min_data_in_leaf': 150, 'n_estimators': 5593, 'verbose': -1, 'metric': 'mae', 'n_jobs': 32, 'boosting': 'dart', 'objective': 'tweedie'}```

```Best parameters of p1: {'lambda_l1': 2.931420537739911, 'lambda_l2': 4.510580686709511, 'learning_rate': 0.058926445091536876, 'max_bin': 961, 'min_data_in_leaf': 115, 'n_estimators': 5188} CPU times: total: 7h 43min 17s Wall time: 15min 5s```



Todo




```Best parameters of p1: {'colsample_bynode': 0.870026791381559, 'colsample_bytree': 0.21026509011420783, 'lambda_l1': 2.383100182236077, 'lambda_l2': 4.450575103732394, 'learning_rate': 0.2509071472970593, 'max_bin': 556, 'min_data_in_leaf': 133, 'n_estimators': 4594}
CPU times: total: 4h 30min 19s
Wall time: 8min 48s```

```Best parameters of p1: {'colsample_bynode': 0.5253603437351759, 'colsample_bytree': 0.6267686233402383, 'lambda_l1': 0.5452526132258517, 'lambda_l2': 1.1998362529626179, 'learning_rate': 0.32940999216169664, 'max_bin': 635, 'min_data_in_leaf': 100, 'n_estimators': 5522}
CPU times: total: 12h 4min 27s
Wall time: 24min 15s```

### Models

In [53]:
params={'n_iter': 2500,'verbose': 1,'objective': 'l2','metric': 'mae','learning_rate': 0.05073909898961407, 'colsample_bytree': 0.726023996436955, 'colsample_bynode': 0.5803681307354022, 
        'lambda_l1': 8.562963348932286, 'lambda_l2': 4.893256185259296, 'min_data_in_leaf': 115, 'max_depth': 23, 'num_leaves':50, 'max_bin': 898}

train = processed_df_no_na[date_filter <= datetime_cv_ranges[-1][0]]
val = processed_df_no_na[(date_filter <= datetime_cv_ranges[-1][1]) & (date_filter > datetime_cv_ranges[-1][0])]

df_train_target = train[['target', 'target_installed_capacity']]
df_train_data = train.drop(['target', 'target_installed_capacity'], axis=1)

df_val_target = val[['target', 'target_installed_capacity']]
df_val_data = val.drop(['target', 'target_installed_capacity'], axis=1)
        
clf = LGBMRegressor(**params, random_state=42)


cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

y_pred = clf.predict(df_train_data)
y_pred

from sklearn.metrics import mean_absolute_error

# Assuming you have two pandas Series: y_true and y_pred
y_pred = clf.predict(df_train_data)
mae = mean_absolute_error(df_train_target.target, y_pred)
print(f" Train Mean Absolute Error:", mae)

y_pred_val = clf.predict(df_val_data)
y_pred_val

mae = mean_absolute_error(df_val_target.target, y_pred_val)
print("Val Mean Absolute Error:", mae)

# y_pred_test = clf.predict(df_test_data)
# y_pred_test

importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
importance = importance.sort_values('importance', ascending=False)
display(importance.head(10))
display(importance.tail(10))
print()
print()



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.219732 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84440
[LightGBM] [Info] Number of data points in the train set: 1824598, number of used features: 144
[LightGBM] [Info] Start training from score 268.598179
 Train Mean Absolute Error: 22.29970221631375
Val Mean Absolute Error: 79.12598942626


Unnamed: 0,importance,name
6,5715,target_rolling_avg_hour_7d
5,4274,target_rolling_avg_24h
126,4055,hour
12,4026,installed_capacity
7,3947,target_rolling_avg_hour_hour_day_4w
11,3284,eic_count
4,3144,target_rt
0,2969,county
8,2359,target_rolling_allp_avg_24h
9,2265,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
123,55,year
138,32,season
127,24,quarter
132,21,is_month_start
58,18,snowfall_hw_lagged
133,5,is_month_end
135,3,is_quarter_end
134,3,is_quarter_start
136,0,is_year_start
137,0,is_year_end






In [54]:
test_submission = pd.read_csv("data/example_test_files/sample_submission.csv")
test_submission

Unnamed: 0,row_id,data_block_id,target
0,2005872,634,0
1,2005873,634,0
2,2005874,634,0
3,2005875,634,0
4,2005876,634,0
...,...,...,...
12475,2018347,637,0
12476,2018348,637,0
12477,2018349,637,0
12478,2018350,637,0


In [55]:
clf.predict(df_val_data)



array([6.39107379e+00, 1.01617424e+03, 6.95058781e-02, ...,
       4.89672200e+01, 5.24024018e+00, 2.66437232e+02])

In [61]:
from data import public_timeseries_testing_util as enefit

with open('data_processor_testing.pkl', 'rb') as f:
    data_processor = pickle.load(f)
data_processor.df

env = enefit.make_env()

for (test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices, sample_submission) in env.iter_test():
    test_data = data_processor.process_test_data_timestep(test, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices)
    display(test_data)
    
    test_data_filtered = test_data[~test_data.currently_scored.fillna(True)]
    test_data_filtered = test_data_filtered.drop('target', axis=1)
    other_cols = test_data_filtered[['prediction_datetime', 'currently_scored', 'row_id']]
    test_data_filtered = test_data_filtered.drop(['prediction_datetime', 'currently_scored', 'row_id'], axis=1)
    preds = clf.predict(test_data_filtered)
    submission = other_cols[['row_id']].copy()
    submission['target'] = preds
    submission = submission.reset_index(drop=True)
    env.predict(submission)
    data_processor.test_orig_dfs[0]['currently_scored'] = True
    display(submission)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'] + dt.timedelta(days=1)


0


Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,prediction_datetime,currently_scored,target_rt,target_rolling_avg_24h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,2.977,0,1960760,,,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
1,0,0,1,601.482,1,1960761,,,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
2,0,0,2,0.000,0,1960762,,,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
3,0,0,2,9.943,1,1960763,,,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
4,0,0,3,50.278,0,1960764,,,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51347,15,1,1,,1,2008989,2023-05-28 23:00:00,False,32.809,26.492250,...,False,False,False,False,2,-0.258819,0.965926,0.575190,-0.818020,True
51348,15,1,3,,0,2008990,2023-05-28 23:00:00,False,0.000,362.492542,...,False,False,False,False,2,-0.258819,0.965926,0.575190,-0.818020,True
51349,15,1,3,,0,2008990,2023-05-28 23:00:00,False,0.000,375.690208,...,False,False,False,False,2,-0.258819,0.965926,0.575190,-0.818020,True
51350,15,1,3,,1,2008991,2023-05-28 23:00:00,False,195.707,299.014875,...,False,False,False,False,2,-0.258819,0.965926,0.575190,-0.818020,True




Unnamed: 0,row_id,target
0,2005872,18.606309
1,2005872,26.932363
2,2005873,574.233623
3,2005873,516.410356
4,2005874,-2.070909
...,...,...
6235,2008989,37.287530
6236,2008990,-8.324577
6237,2008990,-8.324577
6238,2008991,288.868177


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'] + dt.timedelta(days=1)


0


Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,prediction_datetime,currently_scored,target_rt,target_rolling_avg_24h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,2.977,0,1960760,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
1,0,0,1,601.482,1,1960761,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
2,0,0,2,0.000,0,1960762,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
3,0,0,2,9.943,1,1960763,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
4,0,0,3,50.278,0,1960764,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57587,15,1,1,,1,2012109,2023-05-29 23:00:00,False,38.646,18.873583,...,False,False,False,False,2,-0.258819,0.965926,0.561034,-0.827793,False
57588,15,1,3,,0,2012110,2023-05-29 23:00:00,False,0.000,304.133875,...,False,False,False,False,2,-0.258819,0.965926,0.561034,-0.827793,False
57589,15,1,3,,0,2012110,2023-05-29 23:00:00,False,0.000,403.044625,...,False,False,False,False,2,-0.258819,0.965926,0.561034,-0.827793,False
57590,15,1,3,,1,2012111,2023-05-29 23:00:00,False,188.689,267.524667,...,False,False,False,False,2,-0.258819,0.965926,0.561034,-0.827793,False




Unnamed: 0,row_id,target
0,2008992,2.334951
1,2008992,4.308818
2,2008993,543.003451
3,2008993,545.110744
4,2008994,-4.441849
...,...,...
6235,2012109,39.542990
6236,2012110,-1.994870
6237,2012110,3.090400
6238,2012111,290.262709


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'] + dt.timedelta(days=1)


0


Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,prediction_datetime,currently_scored,target_rt,target_rolling_avg_24h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,2.977,0,1960760,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
1,0,0,1,601.482,1,1960761,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
2,0,0,2,0.000,0,1960762,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
3,0,0,2,9.943,1,1960763,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
4,0,0,3,50.278,0,1960764,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63827,15,1,1,,1,2015229,2023-05-30 23:00:00,False,35.217,17.434458,...,False,False,False,False,2,-0.258819,0.965926,0.546711,-0.837321,False
63828,15,1,3,,0,2015230,2023-05-30 23:00:00,False,0.000,375.690208,...,False,False,False,False,2,-0.258819,0.965926,0.546711,-0.837321,False
63829,15,1,3,,0,2015230,2023-05-30 23:00:00,False,0.000,506.681000,...,False,False,False,False,2,-0.258819,0.965926,0.546711,-0.837321,False
63830,15,1,3,,1,2015231,2023-05-30 23:00:00,False,189.933,266.825583,...,False,False,False,False,2,-0.258819,0.965926,0.546711,-0.837321,False




Unnamed: 0,row_id,target
0,2012112,35.103232
1,2012112,18.181895
2,2012113,501.313356
3,2012113,514.270241
4,2012114,-4.603473
...,...,...
6235,2015229,37.730012
6236,2015230,-1.791179
6237,2015230,8.700442
6238,2015231,281.709053


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'] + dt.timedelta(days=1)


0


Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,prediction_datetime,currently_scored,target_rt,target_rolling_avg_24h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,2.977,0,1960760,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
1,0,0,1,601.482,1,1960761,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
2,0,0,2,0.000,0,1960762,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
3,0,0,2,9.943,1,1960763,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
4,0,0,3,50.278,0,1960764,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70067,15,1,1,,1,2018349,2023-05-31 23:00:00,False,31.484,33.753875,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
70068,15,1,3,,0,2018350,2023-05-31 23:00:00,False,0.000,403.044625,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
70069,15,1,3,,0,2018350,2023-05-31 23:00:00,False,0.000,286.517708,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
70070,15,1,3,,1,2018351,2023-05-31 23:00:00,False,183.756,105.720042,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False




Unnamed: 0,row_id,target
0,2015232,1.443170
1,2015232,1.150358
2,2015233,526.388732
3,2015233,545.576868
4,2015234,-8.904476
...,...,...
6235,2018349,40.218498
6236,2018350,2.111896
6237,2018350,-4.018734
6238,2018351,266.094187


In [62]:
def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

def train_cv(df):
    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        df_train_target = train[['target', 'target_installed_capacity']]
        df_train_data = train.drop(['target', 'target_installed_capacity'], axis=1)
        
        df_val_target = val[['target', 'target_installed_capacity']]
        df_val_data = val.drop(['target', 'target_installed_capacity'], axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        
        clf = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )
        clf_producer = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )
        
        clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)
        clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target, categorical_feature=cat_features)

        y_pred = clf.predict(df_train_data)
        y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
        y_pred2 = y_pred.copy()
        y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_train_target.target, y_pred2)
        print(f" Train Mean w Producer Absolute Error:", mae)

        y_pred_val = clf.predict(df_val_data)
        y_pred_val_producer = clf_producer.predict(df_val_data[df_val_data.is_consumption==0])
        y_pred_val2 = y_pred_val.copy()
        y_pred_val2[df_val_data.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_val_target.target, y_pred_val2)
        print("Val Mean w Producer Absolute Error:", mae)

        # y_pred_test = clf.predict(df_test_data)
        # y_pred_test

        # mae = mean_absolute_error(df_test_target.target, y_pred_test)
        # print("Test Mean Absolute Error:", mae)

        importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(10))
        # display(importance.tail(10))
        print()
        print()
        print()
        print()

In [63]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28482
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 167
[LightGBM] [Info] Start training from score 250.526332
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28481
[LightGBM] [Info] Number of data points in the train set: 564869, number of used features: 166
[LightGBM] [Info] Start training from score 87.581421
 Train Mean Absolute Error: 21.623644374411725
 Train Mean w Producer Absolute Error: 18.244623606390444
Val Mean Absolute Error: 45.406346380878844
Val Mean w Producer Absolute Error: 44.0577406652