# LGBM

In [2]:
import pickle
import pandas as pd
import numpy as np

In [3]:
import pandas as pd
import datetime as dt
import numpy as np

import pandas as pd
import datetime as dt
import numpy as np

class TrainDataProcessor:
    """Processes Train data, using train data as a warm start, and prepares it for inference."""

    def __init__(self, train, revealed_targets, client, historical_weather,
                 forecast_weather, electricity_prices, gas_prices):
        """Build the joined feature table from the seven raw frames.

        Copies of the raw frames, trimmed by ``get_test_orig_dfs``, are kept in
        ``self.test_orig_dfs``.  The frames themselves are then passed through
        the ``init_*`` helpers, joined into ``self.df_all_cols`` and reduced to
        ``self.df`` by ``remove_cols``.
        """
        raw_frames = (train, revealed_targets, client, historical_weather,
                      forecast_weather, electricity_prices, gas_prices)
        # Copy before anything else: the init_* helpers below assign columns on
        # the frames they are handed.
        self.test_orig_dfs = self.get_test_orig_dfs([frame.copy() for frame in raw_frames])

        self.train = self.init_train(train)
        self.revealed_targets = self.init_revealed_targets(revealed_targets)
        self.client = self.init_client(client)
        # Both weather helpers merge against this table, so it is built first.
        self.weather_mapping = self.init_weather_mapping()
        self.historical_weather = self.init_historical_weather(historical_weather)
        self.forecast_weather = self.init_forecast_weather(forecast_weather)
        self.electricity_prices = self.init_electricity(electricity_prices)
        self.gas_prices = self.init_gas_prices(gas_prices)

        self.df_all_cols = self.join_data(
            self.train, self.revealed_targets, self.client,
            self.historical_weather, self.forecast_weather,
            self.electricity_prices, self.gas_prices)
        self.df = self.remove_cols(self.df_all_cols)
        
    def get_test_orig_dfs(self, dfs):
        for i, df in enumerate(dfs):
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.datetime)
                col = 'datetime'
            if 'prediction_datetime' in df.columns:
                df['prediction_datetime'] = pd.to_datetime(df.prediction_datetime)
                col = 'prediction_datetime'
            if 'forecast_date' in df.columns:
                df['forecast_date'] = pd.to_datetime(df['forecast_date'])
                col = 'forecast_date'
            if 'forecast_datetime' in df.columns:
                df['forecast_datetime'] = pd.to_datetime(df['forecast_datetime'])
                col = 'forecast_datetime'
            if 'date' in df.columns:
                df['date'] = pd.to_datetime(df.date).dt.date
                col = 'date'

            test_date = df[col].iloc[-1]  # Assuming test is a DataFrame
            start_date = test_date - pd.Timedelta(days=14)
            historical_subset = df[df[col] >= start_date]
            dfs[i] = historical_subset
        return dfs
        
    def init_train(self, df):
        """Prepares the training data for model training."""
        try:
            df['datetime'] = pd.to_datetime(df.datetime)
        except Exception as e:
            df['datetime'] = pd.to_datetime(df.prediction_datetime)
        df['date'] = df.datetime.dt.date
            
        # df = self.get_data_block_id(df, 'datetime')
        return df
    
    def add_electricity_lag_features(self, df):
        """Add rolling and lagged electricity price features.

        ``df`` must contain 'datetime', 'euros_per_mwh', 'forecast_date' and
        'origin_date'.  The caller's frame is modified in place ('datetime'
        becomes the index and extra columns are added) before a new frame is
        returned.

        Columns added:
        - mean_euros_per_mwh_last_week: mean over a trailing 7-day time window,
          then shifted down by ONE ROW (not one day).
        - mean_euros_per_mwh_same_hour_last_week: the same 7-day mean computed
          within each hour-of-day group, shifted down one row within the group.
        - yesterdays_euros_per_mwh: the price 24 rows earlier.
        - euros_per_mwh_24h_average_price: mean of the current row and the 23
          rows before it.

        NOTE(review): the row-based shifts and windows equal 1 hour / 24 hours
        only if rows are hourly, gap-free and sorted by time; confirm.

        Returns:
            A frame with 'datetime' back as a column and 'forecast_date',
            'origin_date' and the helper 'hour' column removed.
        """
        ##### mean over the trailing week
        # Time-based ('7D') rolling windows need a datetime index.
        df.set_index('datetime', inplace=True)
        # closed='right' includes the current row in its own window ...
        df['mean_euros_per_mwh_last_week'] = df['euros_per_mwh'].rolling(window='7D', min_periods=1, closed='right').mean()
        # ... so move every value down one row to leave the current price out.
        df['mean_euros_per_mwh_last_week'] = df['mean_euros_per_mwh_last_week'].shift()
        
        ##### mean over the trailing week, same hour of day only
        # Hour of day is the grouping key.
        df['hour'] = df.index.hour

        # Rolling mean inside each hour group; the result is indexed by (hour, datetime).
        hourly_groups = df.groupby('hour')
        dff = hourly_groups['euros_per_mwh'].rolling(window='7D', min_periods=1, closed='right').mean()
        # Back to a datetime index, then shift one row within each hour group.
        dff = dff.reset_index().set_index('datetime').groupby('hour')['euros_per_mwh'].shift()
        dff = dff.rename('mean_euros_per_mwh_same_hour_last_week')
        # Index join on datetime; `df` is rebound to a new frame from here on.
        df = df.join(dff)
        #### price 24 rows earlier
        df['yesterdays_euros_per_mwh'] = df['euros_per_mwh'].shift(24)
        
        ### 24-row average
        # Row-count window; includes the current row and is not shifted.
        df['euros_per_mwh_24h_average_price'] = df['euros_per_mwh'].rolling(window=24, min_periods=1).mean()

        # Restore 'datetime' as a regular column and drop the helper columns.
        df.reset_index(inplace=True)
        df = df.drop(['forecast_date', 'origin_date', 'hour'], axis=1)
        return df

    def init_electricity(self, df):
        ## LAG = 1 Day
        ## Move forecast datetime ahead by 1 day
        ## change name to datetime
        df['datetime'] = pd.to_datetime(df['forecast_date'])
        df['datetime'] = df['datetime'] + dt.timedelta(days=1)
        # df = self.get_data_block_id(df, 'datetime')
        df = self.add_electricity_lag_features(df)
        return df
    
    def add_historical_weather_lag_features(self, df):
        """Add lagged historical-weather features.

        ``df`` needs 'datetime', 'latitude' and 'longitude'; every other
        column is treated as a weather feature.

        Two groups of columns are attached:
        1. ``*_hw_means`` / ``*_hw_variances``: mean and variance of each
           feature, per location, over the 24-hour window running from 11:00
           two days before the row's date to 10:00 the day before it.
        2. ``*_hw_lagged``: the 10:00 observation of each location, moved
           forward one day and matched on (date, location_id).

        Helper columns (start_time, end_time, time_code, group, location_id,
        date) are left in the result.  The input frame is modified in place.
        """
        ##### LATEST WEATHER
        def add_latest_weather(df):
            """Return the 10:00 rows shifted forward one day, suffixed '_hw_lagged'.

            Side effect: adds 'location_id' and 'date' to ``df`` in place; the
            outer merge relies on both.
            """
            # Step 1: parse 'datetime' and use it as the index.
            df['datetime'] = pd.to_datetime(df['datetime'])
            df.set_index('datetime', inplace=True)

            # Step 2: order rows by time, then location ('datetime' is the index level here).
            df.sort_values(by=['datetime', 'latitude', 'longitude'], inplace=True)

            # Step 3: string key identifying a grid point.
            df['location_id'] = df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Step 4: keep only the 10:00 rows.
            df.reset_index(inplace=True)
            df_10am = df[df['datetime'].dt.hour == 10]
            df_10am.set_index('datetime', inplace=True)

            # Step 5: move each location's index forward one calendar day (freq='D' shifts the index, not the values).
            lagged_features = df_10am.groupby('location_id').shift(periods=1, freq='D')

            # Mark every column as lagged, then restore an unsuffixed join key.
            # NOTE(review): the next line needs the grouping column to survive the
            # grouped shift; that may depend on the pandas version - confirm.
            lagged_features = lagged_features.add_suffix('_hw_lagged')
            lagged_features['location_id'] = lagged_features['location_id_hw_lagged']
            lagged_features.reset_index(inplace=True)
            lagged_features['date'] = lagged_features.datetime.dt.date

            # Join key on the caller's frame (in-place side effect).
            df['date'] = df.datetime.dt.date
            return lagged_features
            # NOTE(review): the two statements below are unreachable (they follow
            # the return above); the equivalent merge is done by the outer function.
            df = df.merge(lagged_features, on=['date', 'location_id'], how='left', suffixes=('', '_hw_lagged'))
            return df
        
        ##### mean / variance over the previous 11:00-to-10:00 window
        def add_24h_mean_var(df, weather_features):
            """Attach per-location window means and variances of ``weather_features``."""
            # Window each row should READ from: 11:00 two days before its date
            # up to 10:00 one day before its date.
            df['start_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=2) + pd.Timedelta(hours=11)
            df['end_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=1) + pd.Timedelta(hours=10)
            # Lookup key: "<window start>_<window end>_<lat>_<lon>".
            df['time_code'] = df['start_time'].astype(str) +'_' + df['end_time'].astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Window each row BELONGS to: rows at 11:00 or later open that day's
            # window, earlier rows fall into the window opened the day before.
            # (The lambda parameter `dt` shadows the `datetime` module alias
            # inside the lambda only.)
            df['group'] = df['datetime'].apply(lambda dt: dt if dt.time() >= pd.to_datetime('11:00').time() else dt - pd.Timedelta(days=1))
            df['group'] = df['group'].dt.date  # Keep only the date part for grouping
            # Same string format as 'time_code', so the two can be matched.
            df['group'] = (pd.to_datetime(df['group']) + pd.Timedelta(hours=11)).astype(str) + '_' + (pd.to_datetime(df['group']) + pd.Timedelta(days=1, hours=10)).astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Per-window statistics; both results are indexed by 'group'.
            grouped = df.groupby('group')
            means = grouped[weather_features].mean()
            variances = grouped[weather_features].var()

            # Match each row's lookup key against the window labels (right_on
            # names the index level of the statistics frames).
            my_df = df.merge(means, left_on='time_code', right_on='group', suffixes=('', '_hw_means'), how='left')
            my_df = my_df.merge(variances, left_on='time_code', right_on='group', how='left', suffixes=('', '_hw_variances'))

            return my_df

        df['datetime'] = pd.to_datetime(df['datetime'])
        # Everything that is not a key column counts as a weather feature.
        weather_features = df.columns.drop(['datetime', 'latitude', 'longitude'])

        # Window statistics first; add_latest_weather then also adds 'date' and
        # 'location_id' to `df`, which the final merge uses as keys.
        df = add_24h_mean_var(df, weather_features)       
        latest = add_latest_weather(df)
        df = df.merge(latest, on=['date', 'location_id'], how='left', suffixes=('', '_hw_lagged'))
        
        return df

    def init_historical_weather(self, df):
        ## LAG: From 11:00 AM 2 days ago to 10:00 AM 1 day ago
        ## What to do? Give most recent weather forecast? Give average over the last day?
        """
        Processes the historical weather data.
        """
        df['datetime'] = pd.to_datetime(df.datetime)
        
        df = self.add_historical_weather_lag_features(df)
        
        df = df.merge(self.weather_mapping, how='inner', on=('latitude', 'longitude'))
        return df

    def init_forecast_weather(self, df):
        """Chatgpt summary:
        Processes forecast weather data:
        - Converts 'forecast_datetime' to 'datetime' and adjusts it forward by 1 day.
        - Filters data to keep records with 'hours_ahead' between 22 and 45.
        - Merges with a weather mapping based on 'latitude' and 'longitude'.
        """
        ## LAG: DON't ADJUST
        ##      The forecast is from yesterday, but can forecast today, which is 22 hours ahead
        ## Drop any columns where:
        ##                        hours_ahead < 22 and hours_ahead > 45
        ## Then rename forecast_datetime to datetime and join on datetime
        """
        Processes the forecast weather data.
        """
        df['datetime'] = pd.to_datetime(df['forecast_datetime'])
        # keep only datetimes from our relevant period
        df = df[(df['hours_ahead'] < 46) & (df['hours_ahead'] > 21)]
        df['datetime'] = df['datetime'] + dt.timedelta(days=1)
        df = df.merge(self.weather_mapping, how='inner', on=('latitude', 'longitude'))
        return df
    
    def add_gas_prices_lag_features(self, df):
        """Chatgpt summary
        Augments a DataFrame with rolling average lag features for gas prices:
        - Converts 'date' to Datetime object and sets as index.
        - Sorts DataFrame by date.
        - Calculates rolling averages for lowest and highest gas prices over 3, 7, and 14 days.
        - Resets the index to include 'date' as a column again.
        """
        df['date'] = pd.to_datetime(df['date'])
        df.set_index('date', inplace=True)

        # Sort the DataFrame by date, if it's not already sorted
        df.sort_index(inplace=True)

        # Calculate rolling averages for different time windows
        df['lowest_price_3d_avg'] = df['lowest_price_per_mwh'].rolling(window=3).mean()
        df['highest_price_3d_avg'] = df['highest_price_per_mwh'].rolling(window=3).mean()

        df['lowest_price_7d_avg'] = df['lowest_price_per_mwh'].rolling(window=7).mean()
        df['highest_price_7d_avg'] = df['highest_price_per_mwh'].rolling(window=7).mean()

        df['lowest_price_14d_avg'] = df['lowest_price_per_mwh'].rolling(window=14).mean()
        df['highest_price_14d_avg'] = df['highest_price_per_mwh'].rolling(window=14).mean()

        # Reset the index if you want the 'date' column back
        df.reset_index(inplace=True)
        return df

    def init_gas_prices(self, df):
        ## LAG: 1 DAY
        ## Predictions are made from 2 days ago and predict for yesterday
        ## add one day to forecast_date
        ## Rename forecast_date to date, join on date
        """
        Processes the gas prices data.
        Implement the logic to handle gas prices data processing here.
        """
        df['date'] = pd.to_datetime(df['forecast_date']).dt.date
        df['date'] = df['date'] + dt.timedelta(days=1)
        df = self.add_gas_prices_lag_features(df)
        return df
    
    def add_revealed_target_features(self, df):
        """Chatgpt summary:
        Enhances DataFrame with rolling average target features:
        - Converts 'datetime' to Datetime object, extracts 'hour' and 'day' of week.
        - Sets 'datetime' as index.
        - Calculates various rolling averages of 'target' based on different groupings:
          - 24-hour rolling average by county, business status, product type, and consumption status.
          - 7-day hourly rolling average by county, business status, product type, consumption status, and hour.
          - 4-week rolling average by county, business status, product type, consumption status, hour, and day.
          - Similar calculations considering all product types.
        - Drops 'hour' and 'day' columns after processing.
        """
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['hour'] = df.datetime.dt.hour
        df['day'] = df.datetime.dt.dayofweek
        df.set_index('datetime', inplace=True)

        window_size = 7
        # Group by the specified columns and then apply the rolling mean
        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])
        df['target_rolling_avg_24h'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption', 'hour'])
        df['target_rolling_avg_hour_7d'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption', 'hour', 'day'])
        df['target_rolling_avg_hour_hour_day_4w'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption'])
        df['target_rolling_allp_avg_24h'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption', 'hour'])
        df['target_rolling_allp_avg_hour_7d'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption', 'hour', 'day'])
        df['target_rolling_allp_avg_hour_hour_day_4w'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())
        
        df = df.drop(['hour', 'day'], axis=1)

        return df
    
    def init_revealed_targets(self, df):
        df['datetime'] = pd.to_datetime(df.datetime)
        df['datetime'] = df['datetime'] + dt.timedelta(days=2)
        df = self.add_revealed_target_features(df)
        return df
    
    def init_client(self, df):
        ## LAG: 2 days
        ## Add 2 days to date, join on date
        df['date'] = pd.to_datetime(df.date).dt.date
        df['date'] = df['date'] + dt.timedelta(days=2)
        # df = self.get_data_block_id(df, 'date')
        return df

    def init_weather_mapping(self):
        # https://www.kaggle.com/code/tsunotsuno/enefit-eda-baseline/notebook#Baseline
        county_point_map = {
            0: (59.4, 24.7), # "HARJUMAA"
            1 : (58.8, 22.7), # "HIIUMAA"
            2 : (59.1, 27.2), # "IDA-VIRUMAA"
            3 : (58.8, 25.7), # "JÄRVAMAA"
            4 : (58.8, 26.2), # "JÕGEVAMAA"
            5 : (59.1, 23.7), # "LÄÄNE-VIRUMAA"
            6 : (59.1, 23.7), # "LÄÄNEMAA"
            7 : (58.5, 24.7), # "PÄRNUMAA"
            8 : (58.2, 27.2), # "PÕLVAMAA"
            9 : (58.8, 24.7), # "RAPLAMAA"
            10 : (58.5, 22.7),# "SAAREMAA"
            11 : (58.5, 26.7),# "TARTUMAA"
            12 : (58.5, 25.2),# "UNKNOWNN" (center of the map)
            13 : (57.9, 26.2),# "VALGAMAA"
            14 : (58.2, 25.7),# "VILJANDIMAA"
            15 : (57.9, 27.2) # "VÕRUMAA"
        }
        # Convert the dictionary to a list of tuples
        data = [(county_code, lat, lon) for county_code, (lat, lon) in county_point_map.items()]

        # Create DataFrame
        df = pd.DataFrame(data, columns=['county', 'latitude', 'longitude'])
        
        return df
    
    def add_date_features(self, df):
        df['year'] = df['datetime'].dt.year
        df['month'] = df['datetime'].dt.month
        df['day'] = df['datetime'].dt.day
        df['hour'] = df['datetime'].dt.hour
        df['quarter'] = df['datetime'].dt.quarter
        df['day_of_week'] = df['datetime'].dt.day_of_week
        df['day_of_year'] = df['datetime'].dt.dayofyear
        df['week_of_year'] = df['datetime'].dt.isocalendar().week
        df['is_weekend'] = df['datetime'].dt.day_of_week >= 5
        df['is_month_start'] = df['datetime'].dt.is_month_start
        df['is_month_end'] = df['datetime'].dt.is_month_end
        df['is_quarter_start'] = df['datetime'].dt.is_quarter_start
        df['is_quarter_end'] = df['datetime'].dt.is_quarter_end
        df['is_year_start'] = df['datetime'].dt.is_year_start
        df['is_year_end'] = df['datetime'].dt.is_year_end
        df['season'] = df['datetime'].dt.month % 12 // 3 + 1
        df['hour_sin'] = np.sin(df['datetime'].dt.hour * (2. * np.pi / 24))
        df['hour_cos'] = np.cos(df['datetime'].dt.hour * (2. * np.pi / 24))
        # Calculate sin and cos for day of year
        days_in_year = 365.25  # accounts for leap year
        df['day_of_year_sin'] = np.sin((df['day_of_year'] - 1) * (2 * np.pi / days_in_year))
        df['day_of_year_cos'] = np.cos((df['day_of_year'] - 1) * (2 * np.pi / days_in_year))
        return df
    
    def add_ee_holidays(self, df):
        """Add a boolean 'is_ee_holiday' column for Estonian public holidays.

        Each value of ``df['date']`` is tested for membership in the 'EE'
        calendar of the third-party ``holidays`` package.  ``df`` is modified
        in place and returned.

        The earlier debugging aids (a print of the NaN count and a helper that
        was never called) have been removed; they did not affect the result.

        NOTE(review): the behaviour for missing dates depends on the
        ``holidays`` package - confirm that 'date' has no NaN/NaT values.
        """
        # Imported here, as before, so the class can be loaded without the package.
        import holidays

        ee_holidays = holidays.CountryHoliday('EE')
        df['is_ee_holiday'] = df['date'].apply(lambda day: day in ee_holidays)
        return df
    
    def remove_cols(self, df):
        col_list = ['datetime',
                   'row_id',
                   'prediction_unit_id',
                    'date_train',
                    'hour_part',
                   'date_client',
                    'forecast_date_elec_price',
                    'origin_date_elec_price',
                    'forecast_date_gas_price',
                    'origin_date_gas_price',
                    'datetime_hist_weath',
                   'hour_part_hist_weath_latest',
                    'datetime_hist_weath_latest',
                   'origin_datetime',
                   'hour_part_fore_weath',
                    'datetime',
                     'data_block_id',
                     'row_id',
                     'prediction_unit_id',
                     'date',
                    'data_block_id_rt',
                     'row_id_rt',
                     'prediction_unit_id_rt',
                    'data_block_id_client',
                    'latitude',
                     'longitude',
                     'data_block_id_hw',
                    'start_time',
                     'end_time',
                     'time_code',
                     'group',
                    'data_block_id_hw_means',
                    'data_block_id_hw_variances',
                     'location_id',
                     'date_hw',
                     'datetime_hw_lagged',
                    'latitude_hw_lagged',
                     'longitude_hw_lagged',
                     'data_block_id_hw_lagged',
                     'start_time_hw_lagged',
                     'end_time_hw_lagged',
                     'time_code_hw_lagged',
                     'group_hw_lagged',
                    'data_block_id_hw_means_hw_lagged',
                    'data_block_id_hw_variances_hw_lagged',
                    'location_id_hw_lagged',
                     'latitude_fw',
                     'longitude_fw',
                     'origin_datetime',
                    'data_block_id_fw',
                     'forecast_datetime',
                    'data_block_id_elec',
                    'forecast_date',
                    'origin_date',
                     'data_block_id_gasp',
                   ]
        columns_to_drop = [col for col in col_list if col in df.columns]
        df = df.drop(columns_to_drop, axis=1)
        return df
    
    def remove_test_cols(self, df):
        col_list = ['datetime',
                   'prediction_unit_id',
                    'date_train',
                    'hour_part',
                   'date_client',
                    'forecast_date_elec_price',
                    'origin_date_elec_price',
                    'forecast_date_gas_price',
                    'origin_date_gas_price',
                    'datetime_hist_weath',
                   'hour_part_hist_weath_latest',
                    'datetime_hist_weath_latest',
                   'origin_datetime',
                   'hour_part_fore_weath',
                    'datetime',
                     'data_block_id',
                     'prediction_unit_id',
                     'date',
                    'data_block_id_rt',
                     'row_id_rt',
                     'prediction_unit_id_rt',
                    'data_block_id_client',
                    'latitude',
                     'longitude',
                     'data_block_id_hw',
                    'start_time',
                     'end_time',
                     'time_code',
                     'group',
                    'data_block_id_hw_means',
                    'data_block_id_hw_variances',
                     'location_id',
                     'date_hw',
                     'datetime_hw_lagged',
                    'latitude_hw_lagged',
                     'longitude_hw_lagged',
                     'data_block_id_hw_lagged',
                     'start_time_hw_lagged',
                     'end_time_hw_lagged',
                     'time_code_hw_lagged',
                     'group_hw_lagged',
                    'data_block_id_hw_means_hw_lagged',
                    'data_block_id_hw_variances_hw_lagged',
                    'location_id_hw_lagged',
                     'latitude_fw',
                     'longitude_fw',
                     'origin_datetime',
                    'data_block_id_fw',
                     'forecast_datetime',
                    'data_block_id_elec',
                    'forecast_date',
                    'origin_date',
                     'data_block_id_gasp',
                   ]
        columns_to_drop = [col for col in col_list if col in df.columns]
        df = df.drop(columns_to_drop, axis=1)
        return df
    
    def join_data(self, train, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices):
        """Left-join every auxiliary table onto ``train`` and add date features.

        Name clashes keep the left-hand column unchanged and suffix the
        right-hand one ('_rt', '_client', '_hw', '_fw', '_elec', '_gasp').
        After the merges, the calendar features and the Estonian holiday flag
        are added.
        """
        # (right frame, join keys, suffix for clashing right-hand columns), applied in order.
        merge_plan = [
            (revealed_targets, ('datetime', 'county', 'is_business', 'product_type', 'is_consumption'), '_rt'),
            (client, ('date', 'county', 'is_business', 'product_type'), '_client'),
            (historical_weather, ('datetime', 'county'), '_hw'),
            (forecast_weather, ('datetime', 'county'), '_fw'),
            (electricity_prices, 'datetime', '_elec'),
        ]
        merged = train
        for right, keys, suffix in merge_plan:
            merged = merged.merge(right, how='left', on=keys, suffixes=('', suffix))

        # Parse 'date' before the gas-price merge, which joins on it.
        merged['date'] = pd.to_datetime(merged['date'])
        merged = merged.merge(gas_prices, how='left', on='date', suffixes=('', '_gasp'))

        merged = self.add_date_features(merged)
        return self.add_ee_holidays(merged)
    
    def add_test_data(self, test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices):
        dfs = [test.copy(), revealed_targets.copy(), client.copy(), historical_weather.copy(),
                 forecast_weather.copy(), electricity_prices.copy(), gas_prices.copy()]
        for i, df in enumerate(dfs):
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.datetime)
                col = 'datetime'
            if 'prediction_datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.prediction_datetime)
                col = 'datetime'
            if 'forecast_date' in df.columns:
                df['forecast_date'] = pd.to_datetime(df['forecast_date'])
                col = 'forecast_date'
            if 'forecast_datetime' in df.columns:
                df['forecast_datetime'] = pd.to_datetime(df['forecast_datetime'])
                col = 'forecast_datetime'
                
            self.test_orig_dfs[i] = pd.concat([ self.test_orig_dfs[i], df ])          
        
        
    
    def process_test_data_timestep(self, test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices):
        """Add one step of test data to the cache and rebuild the feature table.

        The new frames are appended to ``self.test_orig_dfs``.  Copies of all
        seven cached frames are then run through the same ``init_*`` helpers
        and ``join_data`` as in training, and the result is reduced with
        ``remove_test_cols``.

        Note that the whole cache is reprocessed on every call, so the
        returned frame holds a row for every cached test row.
        """
        # Append this step's data to the cache.
        self.add_test_data(test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices)

        # Work on copies so the cached raw frames are not altered by the helpers.
        cached = [frame.copy() for frame in self.test_orig_dfs[:7]]
        joined = self.join_data(
            self.init_train(cached[0]),
            self.init_revealed_targets(cached[1]),
            self.init_client(cached[2]),
            self.init_historical_weather(cached[3]),
            self.init_forecast_weather(cached[4]),
            self.init_electricity(cached[5]),
            self.init_gas_prices(cached[6]),
        )
        return self.remove_test_cols(joined)
        


In [4]:
# Load the pickled data processor saved earlier (presumably a
# TrainDataProcessor instance - confirm against the code that wrote the file).
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load 'data_processor.pkl' from a trusted source.
with open('data_processor.pkl', 'rb') as f:
    data_processor = pickle.load(f)
# Show the feature table held by the processor.
data_processor.df

Unnamed: 0,county,is_business,product_type,target,is_consumption,target_rt,target_rolling_avg_24h,target_rolling_avg_hour_7d,target_rolling_avg_hour_hour_day_4w,target_rolling_allp_avg_24h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,0.713,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
1,0,0,1,96.590,1,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
2,0,0,2,0.000,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
3,0,0,2,17.314,1,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
4,0,0,3,2.904,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018609,15,1,0,197.233,1,184.072,295.118417,278.497143,184.71275,90.640000,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018610,15,1,1,0.000,0,0.000,156.335208,0.000000,0.00000,170.148000,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018611,15,1,1,28.404,1,38.646,18.873583,34.405143,42.90750,92.029875,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018612,15,1,3,0.000,0,0.000,403.044625,0.000000,0.00000,139.132958,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False


# Testing

For my experimental CV, I want to split by time rather than at random: the most recent year is divided into 4 consecutive parts of about 3 months each (roughly one season per part), and the model is tested on each part in turn after being trained only on the data that comes before it. A post on the Kaggle forums recommended a scheme like this:

Key: 
= -> training data
+ -> CV data

4 splits in time:
1. =============+++
2. ================+++
3. ===================+++
4. ======================+++



The data starts on 2021-09-01 and ends on 2023-05-31

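BUT we don't have enough data to do that properly: the data covers only 21 months, so the earliest 3-month test blocks would leave very little history to train on. So, my CV will instead use five shorter test segments taken from the final nine months: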


(Thanks chatgpt)

Splitting the period from 2022-09-01 to 2023-05-31 into five roughly equal parts gives the following date range for each segment:

#### First Segment:

From 2022-09-01 to 2022-10-24

#### Second Segment:

From 2022-10-25 to 2022-12-17

#### Third Segment:

From 2022-12-18 to 2023-02-09

#### Fourth Segment:

From 2023-02-10 to 2023-04-04

#### Fifth Segment:

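From 2023-04-05 to 2023-05-29 (the code below uses 2023-05-31 as the end of this segment, so that it runs to the last day of data)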


In [5]:
def fill_drop_na(df):
    """Drop rows without a target, then mean-impute the remaining gaps.

    Rows with a missing 'target' or a missing 'target_rolling_avg_24h' are
    removed.  Every other NaN is replaced by the mean of its column, computed
    on the rows that remain.

    Returns:
        (filled_frame, column_means)
    """
    usable = df.dropna(subset=['target', 'target_rolling_avg_24h'])
    column_means = usable.mean()
    return usable.fillna(column_means), column_means

In [6]:
%%time
# Clean the processed table with fill_drop_na, then count the NaNs left in
# each column of the result.
processed_df_no_na, means = fill_drop_na(data_processor.df)
processed_df_no_na.isna().sum()

CPU times: total: 2.66 s
Wall time: 5.06 s


county             0
is_business        0
product_type       0
target             0
is_consumption     0
                  ..
hour_sin           0
hour_cos           0
day_of_year_sin    0
day_of_year_cos    0
is_ee_holiday      0
Length: 145, dtype: int64

In [7]:
# Target per unit of installed capacity, scaled by 1000.
# `assign` builds a new frame instead of inserting a column into the existing
# one; the in-place insert is what raised the pandas warning echoed in this
# cell's output.
# NOTE(review): rows with installed_capacity == 0 produce inf/NaN here -
# confirm that they cannot occur or are handled downstream.
processed_df_no_na = processed_df_no_na.assign(
    target_installed_capacity=(
        processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000
    )
)
processed_df_no_na

  processed_df_no_na['target_installed_capacity'] = processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000


Unnamed: 0,county,is_business,product_type,target,is_consumption,target_rt,target_rolling_avg_24h,target_rolling_avg_hour_7d,target_rolling_avg_hour_hour_day_4w,target_rolling_allp_avg_24h,...,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday,target_installed_capacity
11712,0,0,1,0.930,0,0.713,0.713000,0.713000,0.71300,0.713000,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.975978
11713,0,0,1,123.214,1,96.590,96.590000,96.590000,96.59000,96.590000,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,129.305586
11714,0,0,2,0.000,0,0.000,0.000000,0.000000,0.00000,0.356500,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.000000
11715,0,0,2,21.940,1,17.314,17.314000,17.314000,17.31400,56.952000,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,131.850962
11716,0,0,3,1.611,0,2.904,2.904000,2.904000,2.90400,1.205667,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.223505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018609,15,1,0,197.233,1,184.072,295.118417,278.497143,184.71275,90.640000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,318.117742
2018610,15,1,1,0.000,0,0.000,156.335208,0.000000,0.00000,170.148000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,0.000000
2018611,15,1,1,28.404,1,38.646,18.873583,34.405143,42.90750,92.029875,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,45.482786
2018612,15,1,3,0.000,0,0.000,403.044625,0.000000,0.00000,139.132958,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,0.000000


In [8]:
from datetime import datetime

# Five consecutive (start, end) date-string pairs, one per CV fold.
# They are parsed into `datetime` objects further down in this cell.
cv_ranges_corrected = [
    ('2022-09-01', '2022-10-24'),
    ('2022-10-25', '2022-12-17'),
    ('2022-12-18', '2023-02-09'),
    ('2023-02-10', '2023-04-04'),
    ('2023-04-05', '2023-05-31'),
]

# Parse a 'YYYY-MM-DD' date string into a datetime object
def to_datetime(date_str):
    """Return the ``datetime`` for a ``'%Y-%m-%d'`` formatted string.

    The time part of the result is midnight because the format carries no
    time fields. ``datetime.strptime`` raises ``ValueError`` when the string
    does not match the format.
    """
    date_format = '%Y-%m-%d'
    parsed = datetime.strptime(date_str, date_format)
    return parsed

# Converting the date strings in cv_ranges to datetime objects
datetime_cv_ranges = [(to_datetime(start), to_datetime(end)) for start, end in cv_ranges_corrected]

# Date of every row that survived NA filtering. `.loc` makes the lookup
# explicitly label-based: the rows are matched through the index labels of
# `processed_df_no_na`, not through their position.
date_filter = data_processor.df_all_cols['date'].loc[processed_df_no_na.index]

# First fold: train on everything up to and including the fold start date,
# validate on the days after it up to and including the fold end date.
# NOTE(review): the fold start date itself lands in the training split, so it
# is never part of a validation window - confirm this is intended.
fold0_start, fold0_end = datetime_cv_ranges[0]
cv1_train = processed_df_no_na[date_filter <= fold0_start]
cv1_test = processed_df_no_na[(date_filter <= fold0_end) & (date_filter > fold0_start)]

In [9]:
# Show the calendar columns of the first fold's training split.
cv1_train[['year', 'month', 'day']]

Unnamed: 0,year,month,day
11712,2021,9,5
11713,2021,9,5
11714,2021,9,5
11715,2021,9,5
11716,2021,9,5
...,...,...,...
1144249,2022,9,1
1144250,2022,9,1
1144251,2022,9,1
1144252,2022,9,1


In [10]:
# Show the calendar columns of the first fold's validation split.
cv1_test[['year', 'month', 'day']]

Unnamed: 0,year,month,day
1144254,2022,9,2
1144255,2022,9,2
1144256,2022,9,2
1144257,2022,9,2
1144258,2022,9,2
...,...,...,...
1315849,2022,10,24
1315850,2022,10,24
1315851,2022,10,24
1315852,2022,10,24


In [11]:
# Show the calendar columns of the full frame (all rows, not a single fold).
processed_df_no_na[['year', 'month', 'day']]

Unnamed: 0,year,month,day
11712,2021,9,5
11713,2021,9,5
11714,2021,9,5
11715,2021,9,5
11716,2021,9,5
...,...,...,...
2018609,2023,5,31
2018610,2023,5,31
2018611,2023,5,31
2018612,2023,5,31


### Test No. 14

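Testing what scores we get when columns containing NAs are kept: missing values are mean-imputed and a `<column>_is_na` indicator column is added for each affected column.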

In [15]:
def fill_drop_na(df):
    """Drop rows without a usable target, flag remaining NAs and mean-impute them.

    Steps:
      1. Keep only rows where both ``target`` and ``target_rolling_avg_24h``
         are present.
      2. Compute the per-column means of the remaining rows.
      3. For every column that still contains missing values, add a boolean
         ``<column>_is_na`` indicator column.
      4. Replace the missing values with the column means.

    The indicator columns are built in one go and attached with a single
    ``pd.concat``. Inserting them one at a time into the filtered slice is
    what produced the long stream of pandas warnings in the recorded cell
    output.

    Args:
        df: input frame; it is not modified.

    Returns:
        ``(filled_df, means)``: the imputed frame with indicator columns
        appended after the original columns, and the Series of column means
        that was used as fill values.
    """
    keep_rows = df['target'].notna() & df['target_rolling_avg_24h'].notna()
    df = df[keep_rows]
    means = df.mean()

    # Columns that still have gaps after the row filter, in column order.
    na_cols = [col for col in df.columns if df[col].isna().any()]
    if na_cols:
        indicators = df[na_cols].isna().add_suffix('_is_na')
        # Drop any pre-existing indicator of the same name so the result
        # never carries duplicate column labels.
        df = pd.concat(
            [df.drop(columns=indicators.columns, errors='ignore'), indicators],
            axis=1,
        )

    df = df.fillna(means)
    return df, means

In [16]:
%%time
# Filter rows without a target / 24h rolling average, add the `_is_na`
# indicator columns and mean-impute; `means` holds the fill values used.
processed_df_no_na, means = fill_drop_na(data_processor.df)
# Per-column count of missing values left after imputation.
processed_df_no_na.isna().sum()

  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}_is_na'] = df[col].isna()
  df[f'{col}

CPU times: total: 3.28 s
Wall time: 9.36 s


county                                   0
is_business                              0
product_type                             0
target                                   0
is_consumption                           0
                                        ..
euros_per_mwh_24h_average_price_is_na    0
lowest_price_7d_avg_is_na                0
highest_price_7d_avg_is_na               0
lowest_price_14d_avg_is_na               0
highest_price_14d_avg_is_na              0
Length: 255, dtype: int64

In [17]:
# Add the target expressed relative to installed capacity (scaled by 1000);
# `inverse_tic` applies the opposite transformation to predictions.
# `assign` builds the column on a new frame instead of inserting into the
# existing one in place; the recorded output of this cell shows pandas
# emitting a warning that points at the in-place assignment.
processed_df_no_na = processed_df_no_na.assign(
    target_installed_capacity=processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000
)
processed_df_no_na

  processed_df_no_na['target_installed_capacity'] = processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000


Unnamed: 0,county,is_business,product_type,target,is_consumption,target_rt,target_rolling_avg_24h,target_rolling_avg_hour_7d,target_rolling_avg_hour_hour_day_4w,target_rolling_allp_avg_24h,...,euros_per_mwh_is_na,mean_euros_per_mwh_last_week_is_na,mean_euros_per_mwh_same_hour_last_week_is_na,yesterdays_euros_per_mwh_is_na,euros_per_mwh_24h_average_price_is_na,lowest_price_7d_avg_is_na,highest_price_7d_avg_is_na,lowest_price_14d_avg_is_na,highest_price_14d_avg_is_na,target_installed_capacity
11712,0,0,1,0.930,0,0.713,0.713000,0.713000,0.71300,0.713000,...,False,False,False,False,False,True,True,True,True,0.975978
11713,0,0,1,123.214,1,96.590,96.590000,96.590000,96.59000,96.590000,...,False,False,False,False,False,True,True,True,True,129.305586
11714,0,0,2,0.000,0,0.000,0.000000,0.000000,0.00000,0.356500,...,False,False,False,False,False,True,True,True,True,0.000000
11715,0,0,2,21.940,1,17.314,17.314000,17.314000,17.31400,56.952000,...,False,False,False,False,False,True,True,True,True,131.850962
11716,0,0,3,1.611,0,2.904,2.904000,2.904000,2.90400,1.205667,...,False,False,False,False,False,True,True,True,True,0.223505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018609,15,1,0,197.233,1,184.072,295.118417,278.497143,184.71275,90.640000,...,False,False,False,False,False,False,False,False,False,318.117742
2018610,15,1,1,0.000,0,0.000,156.335208,0.000000,0.00000,170.148000,...,False,False,False,False,False,False,False,False,False,0.000000
2018611,15,1,1,28.404,1,38.646,18.873583,34.405143,42.90750,92.029875,...,False,False,False,False,False,False,False,False,False,45.482786
2018612,15,1,3,0.000,0,0.000,403.044625,0.000000,0.00000,139.132958,...,False,False,False,False,False,False,False,False,False,0.000000


In [21]:
from lightgbm import LGBMRegressor

In [24]:
def inverse_tic(preds, train):
    """Undo the ``target / installed_capacity * 1000`` scaling on predictions.

    Args:
        preds: predictions on the ``target_installed_capacity`` scale.
        train: object exposing an ``installed_capacity`` attribute
            (presumably the frame the predictions were made for; it must
            line up with ``preds`` - verify at the call site).

    Returns:
        ``preds / 1000`` multiplied by ``train.installed_capacity``.
    """
    per_capacity_unit = preds / 1000
    return per_capacity_unit * train.installed_capacity

def train_cv(df):
    """Train and score one LightGBM regressor per time-based CV fold.

    For every ``(start, end)`` pair in the module-level ``datetime_cv_ranges``
    the rows of ``df`` are split using the module-level ``date_filter``
    series: rows dated on or before ``start`` form the training set, rows
    dated after ``start`` and up to ``end`` (inclusive) form the validation
    set. A fresh ``LGBMRegressor`` is fitted on the raw ``target`` column;
    ``target_installed_capacity`` is excluded from the features and is not
    used as a label here.

    For each fold the train/validation MAE is printed and the 30 most and 30
    least important features (gain importance) are displayed.

    NOTE(review): the fold start date itself is part of the training split
    and therefore never validated - confirm this is intended.

    Args:
        df: feature frame containing ``target`` and
            ``target_installed_capacity``; it must share its index with
            ``date_filter`` for the boolean masks to line up.

    Returns:
        list of ``(train_mae, val_mae)`` tuples, one per fold, so the scores
        can be compared programmatically instead of only being read off the
        printed log.
    """
    # Imported once per call rather than once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    fold_scores = []
    for i, (fold_start, fold_end) in enumerate(datetime_cv_ranges):
        train = df[date_filter <= fold_start]
        val = df[(date_filter <= fold_end) & (date_filter > fold_start)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(target_cols, axis=1)

        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(target_cols, axis=1)

        # NA-indicator columns are treated as categorical as well.
        is_na_cols = list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = base_cat_features + is_na_cols

        clf2 = LGBMRegressor(random_state=42, n_estimators=2500, verbose=1, n_jobs=32, objective='l2', importance_type='gain')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        train_mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        y_pred_val = clf2.predict(df_val_data2)
        val_mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)
        fold_scores.append((train_mae, val_mae))

        importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(30))
        display(importance.tail(30))
        print()
        print()

    return fold_scores

In [25]:
# Test 14: run the per-fold training/evaluation loop on the mean-imputed frame
# (including its `_is_na` indicator columns).
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.130074 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28482
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 167
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 18.372078157002534
For fold 0: Fold Val Mean Absolute Error: 45.05880103943924


Unnamed: 0,importance,name
6,3709920000000.0,target_rolling_avg_hour_7d
7,76592160000.0,target_rolling_avg_hour_hour_day_4w
131,69554930000.0,is_weekend
3,20715440000.0,is_consumption
25,20310010000.0,direct_solar_radiation
140,17721200000.0,hour_cos
12,16741120000.0,installed_capacity
11,14512560000.0,eic_count
9,14262350000.0,target_rolling_allp_avg_hour_7d
18,11664760000.0,cloudcover_total


Unnamed: 0,importance,name
136,0.0,is_year_start
167,0.0,cloudcover_total_hw_means_is_na
168,0.0,cloudcover_low_hw_means_is_na
169,0.0,cloudcover_mid_hw_means_is_na
183,0.0,cloudcover_mid_hw_variances_is_na
194,0.0,surface_pressure_hw_lagged_is_na
193,0.0,snowfall_hw_lagged_is_na
192,0.0,rain_hw_lagged_is_na
191,0.0,dewpoint_hw_lagged_is_na
190,0.0,temperature_hw_lagged_is_na




Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28496
[LightGBM] [Info] Number of data points in the train set: 1304266, number of used features: 167
[LightGBM] [Info] Start training from score 250.930029
###############   Target   #################
For fold 1: Train Mean Absolute Error: 19.31705648220542
For fold 1: Fold Val Mean Absolute Error: 38.666298360848124


Unnamed: 0,importance,name
6,4349144000000.0,target_rolling_avg_hour_7d
131,85937540000.0,is_weekend
7,47274160000.0,target_rolling_avg_hour_hour_day_4w
3,26788410000.0,is_consumption
25,26422990000.0,direct_solar_radiation
12,19111440000.0,installed_capacity
140,18895130000.0,hour_cos
11,16697080000.0,eic_count
9,15744700000.0,target_rolling_allp_avg_hour_7d
24,14763300000.0,shortwave_radiation


Unnamed: 0,importance,name
136,0.0,is_year_start
167,0.0,cloudcover_total_hw_means_is_na
169,0.0,cloudcover_mid_hw_means_is_na
196,0.0,cloudcover_low_hw_lagged_is_na
183,0.0,cloudcover_mid_hw_variances_is_na
194,0.0,surface_pressure_hw_lagged_is_na
193,0.0,snowfall_hw_lagged_is_na
192,0.0,rain_hw_lagged_is_na
191,0.0,dewpoint_hw_lagged_is_na
190,0.0,temperature_hw_lagged_is_na




Fold 2
Train rows: 1480810
Val rows: 169632
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28562
[LightGBM] [Info] Number of data points in the train set: 1480810, number of used features: 170
[LightGBM] [Info] Start training from score 255.678793
###############   Target   #################
For fold 2: Train Mean Absolute Error: 19.319392285073768
For fold 2: Fold Val Mean Absolute Error: 42.284402565880754


Unnamed: 0,importance,name
6,5279724000000.0,target_rolling_avg_hour_7d
131,102850500000.0,is_weekend
7,31590380000.0,target_rolling_avg_hour_hour_day_4w
9,29439600000.0,target_rolling_allp_avg_hour_7d
140,26127620000.0,hour_cos
25,25352690000.0,direct_solar_radiation
11,24274380000.0,eic_count
12,23612100000.0,installed_capacity
3,23519330000.0,is_consumption
18,16443300000.0,cloudcover_total


Unnamed: 0,importance,name
135,0.0,is_quarter_end
167,0.0,cloudcover_total_hw_means_is_na
168,0.0,cloudcover_low_hw_means_is_na
169,0.0,cloudcover_mid_hw_means_is_na
183,0.0,cloudcover_mid_hw_variances_is_na
194,0.0,surface_pressure_hw_lagged_is_na
193,0.0,snowfall_hw_lagged_is_na
192,0.0,rain_hw_lagged_is_na
191,0.0,dewpoint_hw_lagged_is_na
190,0.0,temperature_hw_lagged_is_na




Fold 3
Train rows: 1653658
Val rows: 167820
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069731 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28579
[LightGBM] [Info] Number of data points in the train set: 1653658, number of used features: 170
[LightGBM] [Info] Start training from score 262.020470
###############   Target   #################
For fold 3: Train Mean Absolute Error: 19.487661880583314
For fold 3: Fold Val Mean Absolute Error: 58.75009506278201


Unnamed: 0,importance,name
6,6242371000000.0,target_rolling_avg_hour_7d
131,123076800000.0,is_weekend
5,46671430000.0,target_rolling_avg_24h
11,37658660000.0,eic_count
140,34186730000.0,hour_cos
7,31474570000.0,target_rolling_avg_hour_hour_day_4w
12,30138470000.0,installed_capacity
25,25450420000.0,direct_solar_radiation
9,23018230000.0,target_rolling_allp_avg_hour_7d
3,19566280000.0,is_consumption


Unnamed: 0,importance,name
148,0.0,temperature_is_na
147,0.0,installed_capacity_is_na
171,0.0,windspeed_10m_hw_means_is_na
172,0.0,winddirection_10m_hw_means_is_na
173,0.0,shortwave_radiation_hw_means_is_na
174,0.0,direct_solar_radiation_hw_means_is_na
197,0.0,cloudcover_mid_hw_lagged_is_na
196,0.0,cloudcover_low_hw_lagged_is_na
195,0.0,cloudcover_total_hw_lagged_is_na
194,0.0,surface_pressure_hw_lagged_is_na




Fold 4
Train rows: 1824598
Val rows: 176496
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28598
[LightGBM] [Info] Number of data points in the train set: 1824598, number of used features: 170
[LightGBM] [Info] Start training from score 268.598179
###############   Target   #################
For fold 4: Train Mean Absolute Error: 20.56015206924154
For fold 4: Fold Val Mean Absolute Error: 82.81459148455711


Unnamed: 0,importance,name
6,7131339000000.0,target_rolling_avg_hour_7d
131,139637400000.0,is_weekend
5,124653200000.0,target_rolling_avg_24h
25,46085460000.0,direct_solar_radiation
12,45656860000.0,installed_capacity
140,38011470000.0,hour_cos
7,36818220000.0,target_rolling_avg_hour_hour_day_4w
11,34675230000.0,eic_count
3,22701630000.0,is_consumption
126,16956870000.0,hour


Unnamed: 0,importance,name
166,0.0,surface_pressure_hw_means_is_na
168,0.0,cloudcover_low_hw_means_is_na
195,0.0,cloudcover_total_hw_lagged_is_na
169,0.0,cloudcover_mid_hw_means_is_na
194,0.0,surface_pressure_hw_lagged_is_na
193,0.0,snowfall_hw_lagged_is_na
192,0.0,rain_hw_lagged_is_na
191,0.0,dewpoint_hw_lagged_is_na
190,0.0,temperature_hw_lagged_is_na
189,0.0,diffuse_radiation_hw_variances_is_na






### Test 15

Removing feature columns that have zero gain importance.

In [28]:
def inverse_tic(preds, train):
    """Undo the ``target / installed_capacity * 1000`` scaling on predictions.

    Args:
        preds: predictions on the ``target_installed_capacity`` scale.
        train: object exposing an ``installed_capacity`` attribute
            (presumably the frame the predictions were made for; it must
            line up with ``preds`` - verify at the call site).

    Returns:
        ``preds / 1000`` multiplied by ``train.installed_capacity``.
    """
    per_capacity_unit = preds / 1000
    return per_capacity_unit * train.installed_capacity

def train_cv(df):
    """Train and score one LightGBM regressor per fold on a reduced feature set.

    Same procedure as the previous experiment, except that a fixed list of
    columns (``zero_gain_cols`` below) is removed from the features in
    addition to the two target columns.

    For every ``(start, end)`` pair in the module-level ``datetime_cv_ranges``
    the rows of ``df`` are split using the module-level ``date_filter``
    series: rows dated on or before ``start`` form the training set, rows
    dated after ``start`` and up to ``end`` (inclusive) form the validation
    set. The model is fitted on the raw ``target`` column. For each fold the
    train/validation MAE is printed and the 30 most and 30 least important
    features (gain importance) are displayed.

    NOTE(review): the fold start date itself is part of the training split
    and therefore never validated - confirm this is intended.

    Args:
        df: feature frame containing ``target``,
            ``target_installed_capacity`` and every column named in
            ``zero_gain_cols`` (``DataFrame.drop`` raises ``KeyError``
            otherwise); it must share its index with ``date_filter``.

    Returns:
        list of ``(train_mae, val_mae)`` tuples, one per fold.
    """
    # Imported once per call rather than once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    # Feature columns excluded in this experiment. The list is loop-invariant,
    # so it is built once; entries that were listed twice appear only once.
    zero_gain_cols = [
        'is_year_end', 'highest_price_14d_avg_is_na', 'diffuse_radiation_hw_means_is_na',
        'temperature_hw_variances_is_na', 'cloudcover_high_hw_means_is_na', 'windspeed_10m_hw_means_is_na',
        'shortwave_radiation_hw_means_is_na', 'direct_solar_radiation_hw_means_is_na',
        'dewpoint_hw_variances_is_na', 'rain_hw_variances_is_na',
        'snowfall_hw_variances_is_na', 'surface_pressure_hw_variances_is_na',
        'cloudcover_total_hw_variances_is_na', 'cloudcover_low_hw_variances_is_na',
        'cloudcover_mid_hw_variances_is_na', 'cloudcover_high_hw_variances_is_na',
        'windspeed_10m_hw_variances_is_na', 'winddirection_10m_hw_variances_is_na',
        'shortwave_radiation_hw_variances_is_na', 'direct_solar_radiation_hw_variances_is_na',
        'diffuse_radiation_hw_variances_is_na',
        'temperature_hw_lagged_is_na', 'dewpoint_hw_lagged_is_na', 'rain_hw_lagged_is_na',
        'snowfall_hw_lagged_is_na', 'surface_pressure_hw_lagged_is_na', 'cloudcover_mid_hw_means_is_na',
        'cloudcover_total_hw_lagged_is_na',
        'cloudcover_low_hw_means_is_na', 'surface_pressure_hw_means_is_na', 'temperature_is_na',
        'installed_capacity_is_na', 'is_quarter_end',
    ]
    drop_cols = target_cols + zero_gain_cols

    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    fold_scores = []
    for i, (fold_start, fold_end) in enumerate(datetime_cv_ranges):
        train = df[date_filter <= fold_start]
        val = df[(date_filter <= fold_end) & (date_filter > fold_start)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)

        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)

        # NA-indicator columns are treated as categorical as well; names that
        # were dropped above are filtered out so LightGBM only sees columns
        # that are actually present.
        is_na_cols = list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in base_cat_features + is_na_cols if c in df_train_data.columns]

        clf2 = LGBMRegressor(random_state=42, n_estimators=2500, verbose=1, n_jobs=32, objective='l2', importance_type='gain')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        train_mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        y_pred_val = clf2.predict(df_val_data2)
        val_mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)
        fold_scores.append((train_mae, val_mae))

        importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(30))
        display(importance.tail(30))
        print()
        print()

    return fold_scores

In [29]:
# Test 15: same frame as before, evaluated with the reduced feature set
# defined inside train_cv above.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046479 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28474
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 164
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 18.412199084311926
For fold 0: Fold Val Mean Absolute Error: 44.80528830531523


Unnamed: 0,importance,name
6,3709790000000.0,target_rolling_avg_hour_7d
7,76674350000.0,target_rolling_avg_hour_hour_day_4w
131,69591260000.0,is_weekend
3,20581370000.0,is_consumption
25,20360790000.0,direct_solar_radiation
138,17700170000.0,hour_cos
12,16550590000.0,installed_capacity
11,14484300000.0,eic_count
9,14278800000.0,target_rolling_allp_avg_hour_7d
18,11729460000.0,cloudcover_total


Unnamed: 0,importance,name
155,0.0,shortwave_radiation_is_na
153,0.0,windspeed_10m_is_na
181,0.0,windspeed_10m_hw_means_hw_lagged_is_na
152,0.0,cloudcover_high_is_na
151,0.0,cloudcover_mid_is_na
150,0.0,cloudcover_low_is_na
149,0.0,cloudcover_total_is_na
148,0.0,surface_pressure_is_na
147,0.0,snowfall_is_na
146,0.0,rain_is_na




Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.139091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28488
[LightGBM] [Info] Number of data points in the train set: 1304266, number of used features: 164
[LightGBM] [Info] Start training from score 250.930029
###############   Target   #################
For fold 1: Train Mean Absolute Error: 19.258163182613707
For fold 1: Fold Val Mean Absolute Error: 38.719374501233766


Unnamed: 0,importance,name
6,4349193000000.0,target_rolling_avg_hour_7d
131,85841730000.0,is_weekend
7,47183200000.0,target_rolling_avg_hour_hour_day_4w
3,26655310000.0,is_consumption
25,26502300000.0,direct_solar_radiation
12,19285420000.0,installed_capacity
138,18934030000.0,hour_cos
11,16777830000.0,eic_count
9,15765410000.0,target_rolling_allp_avg_hour_7d
24,14613450000.0,shortwave_radiation


Unnamed: 0,importance,name
146,0.0,rain_is_na
145,0.0,dewpoint_is_na
144,0.0,eic_count_is_na
143,0.0,target_rolling_allp_avg_hour_hour_day_4w_is_na
142,0.0,target_rt_is_na
136,0.0,season
135,0.0,is_year_start
132,0.0,is_month_start
156,0.0,direct_solar_radiation_is_na
158,0.0,temperature_hw_means_is_na




Fold 2
Train rows: 1480810
Val rows: 169632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.204054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28552
[LightGBM] [Info] Number of data points in the train set: 1480810, number of used features: 166
[LightGBM] [Info] Start training from score 255.678793
###############   Target   #################
For fold 2: Train Mean Absolute Error: 19.319392285072468
For fold 2: Fold Val Mean Absolute Error: 42.28440256587579


Unnamed: 0,importance,name
6,5279724000000.0,target_rolling_avg_hour_7d
131,102850500000.0,is_weekend
7,31590380000.0,target_rolling_avg_hour_hour_day_4w
9,29439600000.0,target_rolling_allp_avg_hour_7d
138,26127620000.0,hour_cos
25,25352690000.0,direct_solar_radiation
11,24274380000.0,eic_count
12,23612100000.0,installed_capacity
3,23519330000.0,is_consumption
18,16443300000.0,cloudcover_total


Unnamed: 0,importance,name
154,0.0,winddirection_10m_is_na
152,0.0,cloudcover_high_is_na
135,0.0,is_year_start
151,0.0,cloudcover_mid_is_na
150,0.0,cloudcover_low_is_na
149,0.0,cloudcover_total_is_na
148,0.0,surface_pressure_is_na
147,0.0,snowfall_is_na
146,0.0,rain_is_na
145,0.0,dewpoint_is_na




Fold 3
Train rows: 1653658
Val rows: 167820
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28569
[LightGBM] [Info] Number of data points in the train set: 1653658, number of used features: 166
[LightGBM] [Info] Start training from score 262.020470
###############   Target   #################
For fold 3: Train Mean Absolute Error: 19.47320944890231
For fold 3: Fold Val Mean Absolute Error: 58.927683949674744


Unnamed: 0,importance,name
6,6256145000000.0,target_rolling_avg_hour_7d
131,122543500000.0,is_weekend
11,36011920000.0,eic_count
138,34345260000.0,hour_cos
5,32854340000.0,target_rolling_avg_24h
7,31211060000.0,target_rolling_avg_hour_hour_day_4w
12,30624380000.0,installed_capacity
25,25641630000.0,direct_solar_radiation
9,22872610000.0,target_rolling_allp_avg_hour_7d
3,20954260000.0,is_consumption


Unnamed: 0,importance,name
150,0.0,cloudcover_low_is_na
149,0.0,cloudcover_total_is_na
148,0.0,surface_pressure_is_na
147,0.0,snowfall_is_na
146,0.0,rain_is_na
145,0.0,dewpoint_is_na
144,0.0,eic_count_is_na
143,0.0,target_rolling_allp_avg_hour_hour_day_4w_is_na
127,0.0,quarter
159,0.0,dewpoint_hw_means_is_na




Fold 4
Train rows: 1824598
Val rows: 176496
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28588
[LightGBM] [Info] Number of data points in the train set: 1824598, number of used features: 166
[LightGBM] [Info] Start training from score 268.598179
###############   Target   #################
For fold 4: Train Mean Absolute Error: 20.590358148002892
For fold 4: Fold Val Mean Absolute Error: 83.24854163068841


Unnamed: 0,importance,name
6,7131308000000.0,target_rolling_avg_hour_7d
131,139656000000.0,is_weekend
5,124618500000.0,target_rolling_avg_24h
25,46103680000.0,direct_solar_radiation
12,45643950000.0,installed_capacity
138,38001400000.0,hour_cos
7,36846430000.0,target_rolling_avg_hour_hour_day_4w
11,34684730000.0,eic_count
3,22695640000.0,is_consumption
126,16929860000.0,hour


Unnamed: 0,importance,name
149,0.0,cloudcover_total_is_na
148,0.0,surface_pressure_is_na
147,0.0,snowfall_is_na
146,0.0,rain_is_na
145,0.0,dewpoint_is_na
144,0.0,eic_count_is_na
143,0.0,target_rolling_allp_avg_hour_hour_day_4w_is_na
127,0.0,quarter
132,0.0,is_month_start
158,0.0,temperature_hw_means_is_na






### Test 16

In [32]:
def inverse_tic(preds, train):
    """Undo the ``target / installed_capacity * 1000`` scaling on predictions.

    Args:
        preds: predictions on the ``target_installed_capacity`` scale.
        train: object exposing an ``installed_capacity`` attribute
            (presumably the frame the predictions were made for; it must
            line up with ``preds`` - verify at the call site).

    Returns:
        ``preds / 1000`` multiplied by ``train.installed_capacity``.
    """
    per_capacity_unit = preds / 1000
    return per_capacity_unit * train.installed_capacity

def train_cv(df):
    """Run the 5-fold time-split CV on ``target`` with a pruned feature set.

    For fold ``i`` the training set is every row whose ``date_filter`` value
    is at or before ``datetime_cv_ranges[i][0]``; the validation set is every
    row after that bound, up to and including ``datetime_cv_ranges[i][1]``.
    An ``LGBMRegressor`` is fitted on ``target`` and the train / validation
    MAE plus the 30 largest and 30 smallest gain importances are shown.

    Relies on notebook globals: ``date_filter``, ``datetime_cv_ranges``,
    ``LGBMRegressor``, ``pd`` and IPython's ``display``.

    Args:
        df: Feature frame containing ``target``, ``target_installed_capacity``
            and every column listed in ``pruned_features`` below
            (``DataFrame.drop`` raises ``KeyError`` if one is missing).

    Returns:
        None. Results are only printed / displayed.
    """
    # Loop-invariant: import and column lists are set up once, not per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']

    # Features excluded from this experiment (duplicates removed).
    # NOTE(review): presumably pruned because they showed zero gain
    # importance in earlier runs - confirm against those importance tables.
    pruned_features = [
        # calendar / price / client / target-lag columns
        'is_year_end',
        'is_quarter_end',
        'is_month_start',
        'quarter',
        'highest_price_14d_avg_is_na',
        'installed_capacity_is_na',
        'eic_count_is_na',
        'target_rolling_allp_avg_hour_hour_day_4w_is_na',
        # plain weather NA flags
        'temperature_is_na',
        'dewpoint_is_na',
        'rain_is_na',
        'snowfall_is_na',
        'surface_pressure_is_na',
        'cloudcover_total_is_na',
        # *_hw_means_is_na
        'temperature_hw_means_is_na',
        'dewpoint_hw_means_is_na',
        'rain_hw_means_is_na',
        'snowfall_hw_means_is_na',
        'surface_pressure_hw_means_is_na',
        'cloudcover_total_hw_means_is_na',
        'cloudcover_low_hw_means_is_na',
        'cloudcover_mid_hw_means_is_na',
        'cloudcover_high_hw_means_is_na',
        'windspeed_10m_hw_means_is_na',
        'winddirection_10m_hw_means_is_na',
        'shortwave_radiation_hw_means_is_na',
        'direct_solar_radiation_hw_means_is_na',
        'diffuse_radiation_hw_means_is_na',
        # *_hw_variances_is_na
        'temperature_hw_variances_is_na',
        'dewpoint_hw_variances_is_na',
        'rain_hw_variances_is_na',
        'snowfall_hw_variances_is_na',
        'surface_pressure_hw_variances_is_na',
        'cloudcover_total_hw_variances_is_na',
        'cloudcover_low_hw_variances_is_na',
        'cloudcover_mid_hw_variances_is_na',
        'cloudcover_high_hw_variances_is_na',
        'windspeed_10m_hw_variances_is_na',
        'winddirection_10m_hw_variances_is_na',
        'shortwave_radiation_hw_variances_is_na',
        'direct_solar_radiation_hw_variances_is_na',
        'diffuse_radiation_hw_variances_is_na',
        # *_hw_lagged_is_na
        'temperature_hw_lagged_is_na',
        'dewpoint_hw_lagged_is_na',
        'rain_hw_lagged_is_na',
        'snowfall_hw_lagged_is_na',
        'surface_pressure_hw_lagged_is_na',
        'cloudcover_total_hw_lagged_is_na',
        'cloudcover_low_hw_lagged_is_na',
        'cloudcover_mid_hw_lagged_is_na',
        'cloudcover_high_hw_lagged_is_na',
        'windspeed_10m_hw_lagged_is_na',
        'winddirection_10m_hw_lagged_is_na',
        'shortwave_radiation_hw_lagged_is_na',
        'direct_solar_radiation_hw_lagged_is_na',
        'diffuse_radiation_hw_lagged_is_na',
        # *_hw_means_hw_lagged_is_na
        'temperature_hw_means_hw_lagged_is_na',
        'dewpoint_hw_means_hw_lagged_is_na',
        'rain_hw_means_hw_lagged_is_na',
        'snowfall_hw_means_hw_lagged_is_na',
        'surface_pressure_hw_means_hw_lagged_is_na',
        'cloudcover_total_hw_means_hw_lagged_is_na',
        'cloudcover_mid_hw_means_hw_lagged_is_na',
    ]
    drop_cols = target_cols + pruned_features

    # Columns treated as categorical when they survive the pruning above.
    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(5):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]

        train = df[date_filter <= train_end]
        val = df[(date_filter > train_end) & (date_filter <= val_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)

        df_val_target = val[target_cols]
        df_val_data = val.drop(drop_cols, axis=1)

        # Remaining NA-indicator columns are categorical too; names that were
        # pruned are filtered out so LightGBM only sees existing columns.
        is_na_cols = list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in base_cat_features + is_na_cols if c in df_train_data.columns]

        clf2 = LGBMRegressor(random_state=42, n_estimators=2500, verbose=1, n_jobs=32, objective='l2', importance_type='gain')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf2.predict(df_val_data)
        mae = mean_absolute_error(df_val_target.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)

        importance = pd.DataFrame({'importance': clf2.feature_importances_, 'name': clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(30))
        display(importance.tail(30))
        print()
        print()

In [33]:
# Run the cross-validation defined in the previous cell on the prepared frame.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28466
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 162
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 18.44684670790685
For fold 0: Fold Val Mean Absolute Error: 44.887473401345865


Unnamed: 0,importance,name
6,3709791000000.0,target_rolling_avg_hour_7d
7,76673310000.0,target_rolling_avg_hour_hour_day_4w
130,69589960000.0,is_weekend
3,20581200000.0,is_consumption
25,20361380000.0,direct_solar_radiation
136,17698770000.0,hour_cos
12,16547320000.0,installed_capacity
11,14484360000.0,eic_count
9,14280130000.0,target_rolling_allp_avg_hour_7d
18,11729920000.0,cloudcover_total


Unnamed: 0,importance,name
186,0.0,yesterdays_euros_per_mwh_is_na
187,0.0,euros_per_mwh_24h_average_price_is_na
189,0.0,highest_price_7d_avg_is_na
166,0.0,winddirection_10m_hw_variances_hw_lagged_is_na
164,0.0,cloudcover_high_hw_variances_hw_lagged_is_na
133,0.0,is_year_start
151,0.0,windspeed_10m_hw_means_hw_lagged_is_na
140,0.0,target_rt_is_na
141,0.0,cloudcover_low_is_na
142,0.0,cloudcover_mid_is_na




Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.168156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28480
[LightGBM] [Info] Number of data points in the train set: 1304266, number of used features: 162
[LightGBM] [Info] Start training from score 250.930029
###############   Target   #################
For fold 1: Train Mean Absolute Error: 19.258163182613707
For fold 1: Fold Val Mean Absolute Error: 38.719374501233766


Unnamed: 0,importance,name
6,4349193000000.0,target_rolling_avg_hour_7d
130,85841730000.0,is_weekend
7,47183200000.0,target_rolling_avg_hour_hour_day_4w
3,26655310000.0,is_consumption
25,26502300000.0,direct_solar_radiation
12,19285420000.0,installed_capacity
136,18934030000.0,hour_cos
11,16777830000.0,eic_count
9,15765410000.0,target_rolling_allp_avg_hour_7d
24,14613450000.0,shortwave_radiation


Unnamed: 0,importance,name
186,0.0,yesterdays_euros_per_mwh_is_na
187,0.0,euros_per_mwh_24h_average_price_is_na
189,0.0,highest_price_7d_avg_is_na
166,0.0,winddirection_10m_hw_variances_hw_lagged_is_na
164,0.0,cloudcover_high_hw_variances_hw_lagged_is_na
133,0.0,is_year_start
151,0.0,windspeed_10m_hw_means_hw_lagged_is_na
140,0.0,target_rt_is_na
141,0.0,cloudcover_low_is_na
142,0.0,cloudcover_mid_is_na




Fold 2
Train rows: 1480810
Val rows: 169632
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28540
[LightGBM] [Info] Number of data points in the train set: 1480810, number of used features: 162
[LightGBM] [Info] Start training from score 255.678793
###############   Target   #################
For fold 2: Train Mean Absolute Error: 19.423965428797374
For fold 2: Fold Val Mean Absolute Error: 42.2610545807069


Unnamed: 0,importance,name
6,5279988000000.0,target_rolling_avg_hour_7d
130,102926400000.0,is_weekend
7,31383170000.0,target_rolling_avg_hour_hour_day_4w
9,29259040000.0,target_rolling_allp_avg_hour_7d
136,26217710000.0,hour_cos
25,25258890000.0,direct_solar_radiation
11,24274150000.0,eic_count
12,23842120000.0,installed_capacity
3,23530020000.0,is_consumption
18,16412610000.0,cloudcover_total


Unnamed: 0,importance,name
187,0.0,euros_per_mwh_24h_average_price_is_na
188,0.0,lowest_price_7d_avg_is_na
189,0.0,highest_price_7d_avg_is_na
167,0.0,shortwave_radiation_hw_variances_hw_lagged_is_na
165,0.0,windspeed_10m_hw_variances_hw_lagged_is_na
141,0.0,cloudcover_low_is_na
152,0.0,winddirection_10m_hw_means_hw_lagged_is_na
142,0.0,cloudcover_mid_is_na
143,0.0,cloudcover_high_is_na
144,0.0,windspeed_10m_is_na




Fold 3
Train rows: 1653658
Val rows: 167820
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.208371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28557
[LightGBM] [Info] Number of data points in the train set: 1653658, number of used features: 162
[LightGBM] [Info] Start training from score 262.020470
###############   Target   #################
For fold 3: Train Mean Absolute Error: 19.4905897554783
For fold 3: Fold Val Mean Absolute Error: 58.56292069217227


Unnamed: 0,importance,name
6,6243234000000.0,target_rolling_avg_hour_7d
130,121830400000.0,is_weekend
7,48189870000.0,target_rolling_avg_hour_hour_day_4w
11,34645870000.0,eic_count
136,33542690000.0,hour_cos
5,31180490000.0,target_rolling_avg_24h
12,29540120000.0,installed_capacity
25,26314910000.0,direct_solar_radiation
9,22393810000.0,target_rolling_allp_avg_hour_7d
3,22004030000.0,is_consumption


Unnamed: 0,importance,name
187,0.0,euros_per_mwh_24h_average_price_is_na
188,0.0,lowest_price_7d_avg_is_na
189,0.0,highest_price_7d_avg_is_na
166,0.0,winddirection_10m_hw_variances_hw_lagged_is_na
165,0.0,windspeed_10m_hw_variances_hw_lagged_is_na
164,0.0,cloudcover_high_hw_variances_hw_lagged_is_na
151,0.0,windspeed_10m_hw_means_hw_lagged_is_na
141,0.0,cloudcover_low_is_na
142,0.0,cloudcover_mid_is_na
143,0.0,cloudcover_high_is_na




Fold 4
Train rows: 1824598
Val rows: 176496
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.229811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28576
[LightGBM] [Info] Number of data points in the train set: 1824598, number of used features: 162
[LightGBM] [Info] Start training from score 268.598179
###############   Target   #################
For fold 4: Train Mean Absolute Error: 20.590358148001354
For fold 4: Fold Val Mean Absolute Error: 83.22939458901376


Unnamed: 0,importance,name
6,7131308000000.0,target_rolling_avg_hour_7d
130,139656000000.0,is_weekend
5,124618500000.0,target_rolling_avg_24h
25,46103680000.0,direct_solar_radiation
12,45643950000.0,installed_capacity
136,38001400000.0,hour_cos
7,36846430000.0,target_rolling_avg_hour_hour_day_4w
11,34684730000.0,eic_count
3,22695640000.0,is_consumption
126,16929860000.0,hour


Unnamed: 0,importance,name
187,0.0,euros_per_mwh_24h_average_price_is_na
188,0.0,lowest_price_7d_avg_is_na
189,0.0,highest_price_7d_avg_is_na
167,0.0,shortwave_radiation_hw_variances_hw_lagged_is_na
164,0.0,cloudcover_high_hw_variances_hw_lagged_is_na
165,0.0,windspeed_10m_hw_variances_hw_lagged_is_na
152,0.0,winddirection_10m_hw_means_hw_lagged_is_na
141,0.0,cloudcover_low_is_na
142,0.0,cloudcover_mid_is_na
143,0.0,cloudcover_high_is_na






In [90]:
def inverse_tic(preds, train):
    """Convert capacity-normalised predictions back to the ``target`` scale.

    Args:
        preds: Predictions on the ``target_installed_capacity`` scale.
        train: Object exposing an ``installed_capacity`` attribute/column
            aligned with ``preds``.

    Returns:
        ``preds`` divided by 1000 and multiplied by ``installed_capacity``.
    """
    capacity = train.installed_capacity
    per_unit = preds / 1000
    return per_unit * capacity

def train_cv(df):
    """Run the 5-fold time-split CV on the capacity-normalised target (TIC).

    For fold ``i`` the training set is every row whose ``date_filter`` value
    is at or before ``datetime_cv_ranges[i][0]``; the validation set is every
    row after that bound, up to and including ``datetime_cv_ranges[i][1]``.
    An ``LGBMRegressor`` is fitted on ``target_installed_capacity``.  MAE is
    printed both on that scale and on the original ``target`` scale, after
    mapping predictions back with ``inverse_tic``.  Feature importances are
    not reported by this variant.

    Relies on notebook globals: ``date_filter``, ``datetime_cv_ranges``,
    ``LGBMRegressor`` and ``inverse_tic``.

    Args:
        df: Feature frame containing ``target``, ``target_installed_capacity``
            and ``installed_capacity``.

    Returns:
        None. Results are only printed.
    """
    # Loop-invariant: import and column lists are set up once, not per fold.
    from sklearn.metrics import mean_absolute_error

    # installed_capacity is kept with the targets (inverse_tic needs it) and
    # is removed from the feature matrix together with them.
    held_out_cols = ['target', 'target_installed_capacity', 'installed_capacity']

    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(5):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]

        train = df[date_filter <= train_end]
        val = df[(date_filter > train_end) & (date_filter <= val_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[held_out_cols]
        df_train_data = train.drop(held_out_cols, axis=1)

        df_val_target = val[held_out_cols]
        df_val_data = val.drop(held_out_cols, axis=1)

        is_na_cols = list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        # Keep only names that are real columns: LightGBM rejects unknown
        # names in `categorical_feature`, so a frame lacking one of the
        # calendar flags would otherwise fail at fit time.
        cat_features = [c for c in base_cat_features + is_na_cols if c in df_train_data.columns]

        clf = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2')
        clf.fit(df_train_data, df_train_target.target_installed_capacity, categorical_feature=cat_features)

        print("###############   TIC   #################")
        y_pred = clf.predict(df_train_data)
        mae = mean_absolute_error(df_train_target.target_installed_capacity, y_pred)
        print(f"For fold {i}: Train TIC Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_train_target.target, inverse_tic(y_pred, df_train_target))
        print(f"For fold {i}: Train Mean Absolute Error (TIC-INVERSED):", mae)

        y_pred_val = clf.predict(df_val_data)
        mae = mean_absolute_error(df_val_target.target_installed_capacity, y_pred_val)
        print(f"For fold {i}: Fold Val TIC Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_val_target.target, inverse_tic(y_pred_val, df_val_target))
        print(f"For fold {i}: Fold Val Mean Absolute Error (TIC-INVERSED):", mae)

        print()
        print()

In [91]:
# Run the TIC cross-validation defined in the previous cell on the prepared frame.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.115692 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28227
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 166
[LightGBM] [Info] Start training from score 189.021169
###############   TIC   #################
For fold 0: Train TIC Mean Absolute Error: 23.073826999151365
For fold 0: Train Mean Absolute Error (TIC-INVERSED): 25.438203231172626
For fold 0: Fold Val TIC Mean Absolute Error: 49.24246452014923
For fold 0: Fold Val Mean Absolute Error (TIC-INVERSED): 57.51970750929967


Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] To

### Test 17

Going back to remove the `*_is_na` indicator columns from the dataset. Will this improve the score?

In [34]:
def fill_drop_na(df):
    """Drop rows lacking a target or 24h rolling average, then mean-impute.

    No NA-indicator columns are added; missing values are simply replaced.

    Args:
        df: Frame with ``target`` and ``target_rolling_avg_24h`` columns.

    Returns:
        Tuple ``(filled, means)``: the filtered frame with remaining NAs
        replaced by column means, and the Series of column means (computed
        after the row filter) that was used for the fill.
    """
    has_target = df.target.notna()
    has_rolling_avg = df.target_rolling_avg_24h.notna()
    kept = df[has_target & has_rolling_avg]

    column_means = kept.mean()
    return kept.fillna(column_means), column_means

In [35]:
%%time
# Drop rows without a target / 24h rolling average and mean-fill the remaining
# NAs; `means` keeps the fill values that were used.
processed_df_no_na, means = fill_drop_na(data_processor.df)
# Sanity check: per-column count of NAs still present.
processed_df_no_na.isna().sum()

CPU times: total: 1.66 s
Wall time: 8.41 s


county             0
is_business        0
product_type       0
target             0
is_consumption     0
                  ..
hour_sin           0
hour_cos           0
day_of_year_sin    0
day_of_year_cos    0
is_ee_holiday      0
Length: 145, dtype: int64

In [36]:
# Capacity-normalised target: target divided by installed capacity, times 1000.
# inverse_tic() applies the reverse transform (pred / 1000 * installed_capacity).
processed_df_no_na['target_installed_capacity'] = processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000
processed_df_no_na

  processed_df_no_na['target_installed_capacity'] = processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000


Unnamed: 0,county,is_business,product_type,target,is_consumption,target_rt,target_rolling_avg_24h,target_rolling_avg_hour_7d,target_rolling_avg_hour_hour_day_4w,target_rolling_allp_avg_24h,...,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday,target_installed_capacity
11712,0,0,1,0.930,0,0.713,0.713000,0.713000,0.71300,0.713000,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.975978
11713,0,0,1,123.214,1,96.590,96.590000,96.590000,96.59000,96.590000,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,129.305586
11714,0,0,2,0.000,0,0.000,0.000000,0.000000,0.00000,0.356500,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.000000
11715,0,0,2,21.940,1,17.314,17.314000,17.314000,17.31400,56.952000,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,131.850962
11716,0,0,3,1.611,0,2.904,2.904000,2.904000,2.90400,1.205667,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.223505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018609,15,1,0,197.233,1,184.072,295.118417,278.497143,184.71275,90.640000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,318.117742
2018610,15,1,1,0.000,0,0.000,156.335208,0.000000,0.00000,170.148000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,0.000000
2018611,15,1,1,28.404,1,38.646,18.873583,34.405143,42.90750,92.029875,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,45.482786
2018612,15,1,3,0.000,0,0.000,403.044625,0.000000,0.00000,139.132958,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,0.000000


In [37]:
from lightgbm import LGBMRegressor

In [40]:
def inverse_tic(preds, train):
    """Convert capacity-normalised predictions back to the ``target`` scale.

    Args:
        preds: Predictions on the ``target_installed_capacity`` scale.
        train: Object exposing an ``installed_capacity`` attribute/column
            aligned with ``preds``.

    Returns:
        ``preds`` divided by 1000 and multiplied by ``installed_capacity``.
    """
    capacity = train.installed_capacity
    per_unit = preds / 1000
    return per_unit * capacity

def train_cv(df):
    """Run the 5-fold time-split CV on ``target`` using every feature column.

    For fold ``i`` the training set is every row whose ``date_filter`` value
    is at or before ``datetime_cv_ranges[i][0]``; the validation set is every
    row after that bound, up to and including ``datetime_cv_ranges[i][1]``.
    An ``LGBMRegressor`` is fitted on ``target`` and the train / validation
    MAE plus the 30 largest and 30 smallest gain importances are shown.

    Relies on notebook globals: ``date_filter``, ``datetime_cv_ranges``,
    ``LGBMRegressor``, ``pd`` and IPython's ``display``.

    Args:
        df: Feature frame containing ``target`` and
            ``target_installed_capacity``; all other columns are features.

    Returns:
        None. Results are only printed / displayed.
    """
    # Loop-invariant: import and column lists are set up once, not per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    # Only the two target columns are removed from the feature matrix here.
    drop_cols = list(target_cols)

    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(5):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]

        train = df[date_filter <= train_end]
        val = df[(date_filter > train_end) & (date_filter <= val_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)

        df_val_target = val[target_cols]
        df_val_data = val.drop(drop_cols, axis=1)

        # Any column whose name contains 'is_na' is treated as categorical as
        # well; names that are not columns of the frame are filtered out.
        is_na_cols = list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in base_cat_features + is_na_cols if c in df_train_data.columns]

        clf2 = LGBMRegressor(random_state=42, n_estimators=2500, verbose=1, n_jobs=32, objective='l2', importance_type='gain')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf2.predict(df_val_data)
        mae = mean_absolute_error(df_val_target.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)

        importance = pd.DataFrame({'importance': clf2.feature_importances_, 'name': clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(30))
        display(importance.tail(30))
        print()
        print()

In [41]:
# Run the cross-validation defined in the previous cell on the mean-filled frame
# (no NA-indicator columns in this experiment).
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140956 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28434
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 144
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 18.238744642671747
For fold 0: Fold Val Mean Absolute Error: 44.920871274791324


Unnamed: 0,importance,name
6,3710887000000.0,target_rolling_avg_hour_7d
7,76017760000.0,target_rolling_avg_hour_hour_day_4w
131,69441180000.0,is_weekend
25,20659010000.0,direct_solar_radiation
3,19502010000.0,is_consumption
140,17545970000.0,hour_cos
12,17056580000.0,installed_capacity
11,14683630000.0,eic_count
9,14224620000.0,target_rolling_allp_avg_hour_7d
18,11946520000.0,cloudcover_total


Unnamed: 0,importance,name
81,215307500.0,direct_solar_radiation_hw_means_hw_lagged
60,201315900.0,cloudcover_total_hw_lagged
52,191263400.0,shortwave_radiation_hw_variances
43,189437400.0,rain_hw_variances
103,188292600.0,cloudcover_total_fw
29,186611200.0,rain_hw_means
109,179734600.0,total_precipitation
62,171123400.0,cloudcover_mid_hw_lagged
139,167704200.0,hour_sin
85,157618700.0,rain_hw_variances_hw_lagged




Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28448
[LightGBM] [Info] Number of data points in the train set: 1304266, number of used features: 144
[LightGBM] [Info] Start training from score 250.930029
###############   Target   #################
For fold 1: Train Mean Absolute Error: 19.206930579788054
For fold 1: Fold Val Mean Absolute Error: 38.437862215954915


Unnamed: 0,importance,name
6,4347023000000.0,target_rolling_avg_hour_7d
131,86834650000.0,is_weekend
7,49158570000.0,target_rolling_avg_hour_hour_day_4w
25,27102490000.0,direct_solar_radiation
3,25971800000.0,is_consumption
140,19167260000.0,hour_cos
12,18553150000.0,installed_capacity
11,16422130000.0,eic_count
9,15902940000.0,target_rolling_allp_avg_hour_7d
18,14203730000.0,cloudcover_total


Unnamed: 0,importance,name
109,250615500.0,total_precipitation
61,239249100.0,cloudcover_low_hw_lagged
29,224935500.0,rain_hw_means
63,214301400.0,cloudcover_high_hw_lagged
103,200677000.0,cloudcover_total_fw
62,199336500.0,cloudcover_mid_hw_lagged
60,194025400.0,cloudcover_total_hw_lagged
71,189826300.0,rain_hw_means_hw_lagged
39,188947600.0,direct_solar_radiation_hw_means
53,169909600.0,direct_solar_radiation_hw_variances




Fold 2
Train rows: 1480810
Val rows: 169632
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066075 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28508
[LightGBM] [Info] Number of data points in the train set: 1480810, number of used features: 144
[LightGBM] [Info] Start training from score 255.678793
###############   Target   #################
For fold 2: Train Mean Absolute Error: 19.356561409923177
For fold 2: Fold Val Mean Absolute Error: 41.44417300077165


Unnamed: 0,importance,name
6,5279985000000.0,target_rolling_avg_hour_7d
131,104056500000.0,is_weekend
7,32369980000.0,target_rolling_avg_hour_hour_day_4w
9,29537350000.0,target_rolling_allp_avg_hour_7d
140,26002130000.0,hour_cos
25,25065890000.0,direct_solar_radiation
12,24787110000.0,installed_capacity
3,24182950000.0,is_consumption
11,23990650000.0,eic_count
18,15571650000.0,cloudcover_total


Unnamed: 0,importance,name
103,271796600.0,cloudcover_total_fw
63,270685200.0,cloudcover_high_hw_lagged
62,267401800.0,cloudcover_mid_hw_lagged
109,266578600.0,total_precipitation
43,253569100.0,rain_hw_variances
39,253524600.0,direct_solar_radiation_hw_means
85,241823100.0,rain_hw_variances_hw_lagged
61,236335400.0,cloudcover_low_hw_lagged
53,228989700.0,direct_solar_radiation_hw_variances
52,220771900.0,shortwave_radiation_hw_variances




Fold 3
Train rows: 1653658
Val rows: 167820
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28525
[LightGBM] [Info] Number of data points in the train set: 1653658, number of used features: 144
[LightGBM] [Info] Start training from score 262.020470
###############   Target   #################
For fold 3: Train Mean Absolute Error: 19.575447922360336
For fold 3: Fold Val Mean Absolute Error: 58.57134539803812


Unnamed: 0,importance,name
6,6241271000000.0,target_rolling_avg_hour_7d
131,122573800000.0,is_weekend
7,47496120000.0,target_rolling_avg_hour_hour_day_4w
11,34587360000.0,eic_count
140,33562140000.0,hour_cos
5,32688620000.0,target_rolling_avg_24h
12,28986750000.0,installed_capacity
25,25890200000.0,direct_solar_radiation
3,22663670000.0,is_consumption
9,22605240000.0,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
39,298225000.0,direct_solar_radiation_hw_means
100,274790300.0,cloudcover_high_fw
103,273395900.0,cloudcover_total_fw
102,257172000.0,cloudcover_mid_fw
63,253197700.0,cloudcover_high_hw_lagged
29,250513800.0,rain_hw_means
43,247851700.0,rain_hw_variances
62,245746100.0,cloudcover_mid_hw_lagged
101,242938200.0,cloudcover_low_fw
2,193269100.0,product_type




Fold 4
Train rows: 1824598
Val rows: 176496
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28544
[LightGBM] [Info] Number of data points in the train set: 1824598, number of used features: 144
[LightGBM] [Info] Start training from score 268.598179
###############   Target   #################
For fold 4: Train Mean Absolute Error: 20.667024424921664
For fold 4: Fold Val Mean Absolute Error: 81.5727799021615


Unnamed: 0,importance,name
6,7131486000000.0,target_rolling_avg_hour_7d
131,139501200000.0,is_weekend
5,124831100000.0,target_rolling_avg_24h
25,46209750000.0,direct_solar_radiation
12,45566600000.0,installed_capacity
140,38010410000.0,hour_cos
7,36831530000.0,target_rolling_avg_hour_hour_day_4w
11,34855030000.0,eic_count
3,22703110000.0,is_consumption
126,17131690000.0,hour


Unnamed: 0,importance,name
81,339348900.0,direct_solar_radiation_hw_means_hw_lagged
102,323683100.0,cloudcover_mid_fw
21,310382800.0,cloudcover_high
62,302586900.0,cloudcover_mid_hw_lagged
60,291416500.0,cloudcover_total_hw_lagged
16,287986600.0,snowfall
43,285743900.0,rain_hw_variances
103,280175800.0,cloudcover_total_fw
100,274364900.0,cloudcover_high_fw
53,259016600.0,direct_solar_radiation_hw_variances






### Train 18

In [42]:
def inverse_tic(preds, train):
    """Convert capacity-normalised predictions back to the ``target`` scale.

    Args:
        preds: Predictions on the ``target_installed_capacity`` scale.
        train: Object exposing an ``installed_capacity`` attribute/column
            aligned with ``preds``.

    Returns:
        ``preds`` divided by 1000 and multiplied by ``installed_capacity``.
    """
    capacity = train.installed_capacity
    per_unit = preds / 1000
    return per_unit * capacity

def train_cv(df):
    """Fit and evaluate an ``LGBMRegressor`` on five time-based folds.

    For fold ``i`` the training set is every row whose ``date_filter`` value
    is at or before ``datetime_cv_ranges[i][0]``; the validation set is every
    row after that bound and at or before ``datetime_cv_ranges[i][1]``.
    Each fold prints its row counts, the train and validation MAE on
    ``target``, and displays the 30 most and 30 least important features
    (importance measured by gain).

    Relies on the notebook-level names ``date_filter``,
    ``datetime_cv_ranges``, ``LGBMRegressor``, ``pd`` and ``display``.
    NOTE(review): ``date_filter`` is presumably a datetime Series aligned
    with ``df``'s index -- confirm against the cell that creates it.

    Args:
        df: Processed feature frame; must contain ``target`` and every
            column listed in ``drop_cols`` (``DataFrame.drop`` raises
            ``KeyError`` otherwise).

    Returns:
        None. All results are printed or displayed.
    """
    # Imported once per call rather than once per fold.
    from sklearn.metrics import mean_absolute_error

    # Both target columns plus the features excluded from this experiment.
    drop_cols = [
        'target', 'target_installed_capacity', 'quarter', 'season',
        'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end',
        'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged',
        'snowfall_hw_variances', 'snowfall_fw', 'snowfall_hw_means',
    ]
    # Candidate categorical columns; any that were dropped above are
    # filtered out per fold before fitting.
    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month',
        'hour', 'quarter', 'day_of_week', 'is_weekend', 'is_month_start',
        'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(5):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        x_train = train.drop(drop_cols, axis=1)
        y_train = train['target']
        x_val = val.drop(drop_cols, axis=1)
        y_val = val['target']

        # Missing-value indicator columns are treated as categorical too.
        na_flags = list(x_train.columns[x_train.columns.str.contains('is_na')])
        cat_features = [
            c for c in base_cat_features + na_flags if c in x_train.columns
        ]

        model = LGBMRegressor(random_state=42, n_estimators=2500, verbose=1,
                              n_jobs=32, objective='l2',
                              importance_type='gain')
        model.fit(x_train, y_train, categorical_feature=cat_features)

        print("###############   Target   #################")
        mae = mean_absolute_error(y_train, model.predict(x_train))
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        mae = mean_absolute_error(y_val, model.predict(x_val))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)

        importance = pd.DataFrame({'importance': model.feature_importances_,
                                   'name': model.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(30))
        display(importance.tail(30))
        print()
        print()

In [43]:
# Run the five-fold CV for this experiment; train_cv prints per-fold MAE and
# displays the feature-importance tables.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.115672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27769
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 132
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 18.245925963686606
For fold 0: Fold Val Mean Absolute Error: 44.487542829702996


Unnamed: 0,importance,name
6,3710840000000.0,target_rolling_avg_hour_7d
7,75981050000.0,target_rolling_avg_hour_hour_day_4w
126,69382180000.0,is_weekend
25,20595250000.0,direct_solar_radiation
3,19732920000.0,is_consumption
128,17698910000.0,hour_cos
12,17199550000.0,installed_capacity
11,14809310000.0,eic_count
9,14166070000.0,target_rolling_allp_avg_hour_7d
18,11952950000.0,cloudcover_total


Unnamed: 0,importance,name
97,280940100.0,cloudcover_high_fw
21,275530300.0,cloudcover_high
112,257185400.0,highest_price_per_mwh
91,244587800.0,shortwave_radiation_hw_variances_hw_lagged
99,235456700.0,cloudcover_mid_fw
92,235025300.0,direct_solar_radiation_hw_variances_hw_lagged
98,228326500.0,cloudcover_low_fw
100,223830000.0,cloudcover_total_fw
59,222277400.0,cloudcover_mid_hw_lagged
46,221680000.0,cloudcover_mid_hw_variances




Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.149712 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27783
[LightGBM] [Info] Number of data points in the train set: 1304266, number of used features: 132
[LightGBM] [Info] Start training from score 250.930029
###############   Target   #################
For fold 1: Train Mean Absolute Error: 19.274948791103693
For fold 1: Fold Val Mean Absolute Error: 37.964695759590896


Unnamed: 0,importance,name
6,4346879000000.0,target_rolling_avg_hour_7d
126,86870480000.0,is_weekend
7,49212820000.0,target_rolling_avg_hour_hour_day_4w
25,27105290000.0,direct_solar_radiation
3,26036590000.0,is_consumption
128,19250310000.0,hour_cos
12,18394480000.0,installed_capacity
11,16677960000.0,eic_count
9,15976410000.0,target_rolling_allp_avg_hour_7d
18,14281040000.0,cloudcover_total


Unnamed: 0,importance,name
45,351654500.0,cloudcover_low_hw_variances
52,328830700.0,diffuse_radiation_hw_variances
72,328477100.0,cloudcover_low_hw_means_hw_lagged
93,327303100.0,diffuse_radiation_hw_variances_hw_lagged
32,324056500.0,cloudcover_low_hw_means
71,323652500.0,cloudcover_total_hw_means_hw_lagged
127,311287400.0,hour_sin
78,309734400.0,direct_solar_radiation_hw_means_hw_lagged
98,301648200.0,cloudcover_low_fw
97,286071700.0,cloudcover_high_fw




Fold 2
Train rows: 1480810
Val rows: 169632
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27835
[LightGBM] [Info] Number of data points in the train set: 1480810, number of used features: 132
[LightGBM] [Info] Start training from score 255.678793
###############   Target   #################
For fold 2: Train Mean Absolute Error: 19.2140509573557
For fold 2: Fold Val Mean Absolute Error: 41.77216968209118


Unnamed: 0,importance,name
6,5279988000000.0,target_rolling_avg_hour_7d
126,104067600000.0,is_weekend
7,32355690000.0,target_rolling_avg_hour_hour_day_4w
9,29443520000.0,target_rolling_allp_avg_hour_7d
128,25912680000.0,hour_cos
25,25118050000.0,direct_solar_radiation
12,24675270000.0,installed_capacity
11,24023400000.0,eic_count
3,23826520000.0,is_consumption
18,15600440000.0,cloudcover_total


Unnamed: 0,importance,name
87,388715000.0,cloudcover_mid_hw_variances_hw_lagged
21,368188800.0,cloudcover_high
116,359112300.0,highest_price_7d_avg
71,349750900.0,cloudcover_total_hw_means_hw_lagged
34,348341200.0,cloudcover_high_hw_means
73,340462000.0,cloudcover_mid_hw_means_hw_lagged
58,323786700.0,cloudcover_low_hw_lagged
127,312962500.0,hour_sin
99,303292000.0,cloudcover_mid_fw
97,292675800.0,cloudcover_high_fw




Fold 3
Train rows: 1653658
Val rows: 167820
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.142757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27841
[LightGBM] [Info] Number of data points in the train set: 1653658, number of used features: 132
[LightGBM] [Info] Start training from score 262.020470
###############   Target   #################
For fold 3: Train Mean Absolute Error: 19.6459434628303
For fold 3: Fold Val Mean Absolute Error: 58.82153596039729


Unnamed: 0,importance,name
6,6241075000000.0,target_rolling_avg_hour_7d
126,122552900000.0,is_weekend
7,47312740000.0,target_rolling_avg_hour_hour_day_4w
11,34520990000.0,eic_count
128,33200880000.0,hour_cos
5,32586060000.0,target_rolling_avg_24h
12,29142980000.0,installed_capacity
25,25833520000.0,direct_solar_radiation
3,22818940000.0,is_consumption
9,22462970000.0,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
92,447400600.0,direct_solar_radiation_hw_variances_hw_lagged
56,428159000.0,surface_pressure_hw_lagged
47,423620900.0,cloudcover_high_hw_variances
87,414207400.0,cloudcover_mid_hw_variances_hw_lagged
33,409297800.0,cloudcover_mid_hw_means
52,409125900.0,diffuse_radiation_hw_variances
46,339706800.0,cloudcover_mid_hw_variances
21,334053900.0,cloudcover_high
105,326597700.0,total_precipitation
59,322742500.0,cloudcover_mid_hw_lagged




Fold 4
Train rows: 1824598
Val rows: 176496
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.155015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27858
[LightGBM] [Info] Number of data points in the train set: 1824598, number of used features: 132
[LightGBM] [Info] Start training from score 268.598179
###############   Target   #################
For fold 4: Train Mean Absolute Error: 20.59336477477411
For fold 4: Fold Val Mean Absolute Error: 80.3498704677256


Unnamed: 0,importance,name
6,7131366000000.0,target_rolling_avg_hour_7d
126,139384000000.0,is_weekend
5,124987400000.0,target_rolling_avg_24h
25,46171320000.0,direct_solar_radiation
12,45696060000.0,installed_capacity
128,37990390000.0,hour_cos
7,36840880000.0,target_rolling_avg_hour_hour_day_4w
11,34985850000.0,eic_count
3,23069540000.0,is_consumption
122,17080520000.0,hour


Unnamed: 0,importance,name
87,503818600.0,cloudcover_mid_hw_variances_hw_lagged
65,502225300.0,diffuse_radiation_hw_lagged
74,494977000.0,cloudcover_high_hw_means_hw_lagged
93,476962000.0,diffuse_radiation_hw_variances_hw_lagged
71,430410300.0,cloudcover_total_hw_means_hw_lagged
127,421962200.0,hour_sin
92,405573900.0,direct_solar_radiation_hw_variances_hw_lagged
58,389504400.0,cloudcover_low_hw_lagged
21,368456200.0,cloudcover_high
78,368366500.0,direct_solar_radiation_hw_means_hw_lagged






### Train 19

In [44]:
def inverse_tic(preds, train):
    """Rescale ``preds`` by ``train.installed_capacity``.

    Computes ``(preds / 1000) * train.installed_capacity``.
    NOTE(review): presumably undoes a target that was normalised as
    ``target / installed_capacity * 1000`` -- confirm against the cell that
    builds ``target_installed_capacity``.

    Args:
        preds: Predictions to rescale (scalar or array-like).
        train: Object exposing an ``installed_capacity`` attribute.

    Returns:
        The rescaled predictions.
    """
    per_unit = preds / 1000
    return per_unit * train.installed_capacity

def train_cv(df):
    """Fit and evaluate an ``LGBMRegressor`` on five time-based folds.

    Same procedure as the earlier experiments, with ``year`` and the lagged
    snowfall/rain aggregates added to the dropped columns.

    For fold ``i`` the training set is every row whose ``date_filter`` value
    is at or before ``datetime_cv_ranges[i][0]``; the validation set is every
    row after that bound and at or before ``datetime_cv_ranges[i][1]``.
    Each fold prints its row counts, the train and validation MAE on
    ``target``, and displays the 30 most and 30 least important features
    (importance measured by gain).

    Relies on the notebook-level names ``date_filter``,
    ``datetime_cv_ranges``, ``LGBMRegressor``, ``pd`` and ``display``.
    NOTE(review): ``date_filter`` is presumably a datetime Series aligned
    with ``df``'s index -- confirm against the cell that creates it.

    Args:
        df: Processed feature frame; must contain ``target`` and every
            column listed in ``drop_cols`` (``DataFrame.drop`` raises
            ``KeyError`` otherwise).

    Returns:
        None. All results are printed or displayed.
    """
    # Imported once per call rather than once per fold.
    from sklearn.metrics import mean_absolute_error

    # Both target columns plus the features excluded from this experiment.
    drop_cols = [
        'target', 'target_installed_capacity', 'quarter', 'season',
        'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end',
        'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged',
        'snowfall_hw_variances', 'snowfall_fw', 'snowfall_hw_means', 'year',
        'snowfall_hw_variances_hw_lagged', 'snowfall_hw_means_hw_lagged',
        'rain_hw_means_hw_lagged', 'rain_hw_variances_hw_lagged',
        'rain_hw_lagged',
    ]
    # Candidate categorical columns; any that were dropped above are
    # filtered out per fold before fitting.
    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month',
        'hour', 'quarter', 'day_of_week', 'is_weekend', 'is_month_start',
        'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(5):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        x_train = train.drop(drop_cols, axis=1)
        y_train = train['target']
        x_val = val.drop(drop_cols, axis=1)
        y_val = val['target']

        # Missing-value indicator columns are treated as categorical too.
        na_flags = list(x_train.columns[x_train.columns.str.contains('is_na')])
        cat_features = [
            c for c in base_cat_features + na_flags if c in x_train.columns
        ]

        model = LGBMRegressor(random_state=42, n_estimators=2500, verbose=1,
                              n_jobs=32, objective='l2',
                              importance_type='gain')
        model.fit(x_train, y_train, categorical_feature=cat_features)

        print("###############   Target   #################")
        mae = mean_absolute_error(y_train, model.predict(x_train))
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        mae = mean_absolute_error(y_val, model.predict(x_val))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)

        importance = pd.DataFrame({'importance': model.feature_importances_,
                                   'name': model.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(30))
        display(importance.tail(30))
        print()
        print()

In [45]:
# Re-run the five-fold CV with the extended drop list (year and the lagged
# snowfall/rain aggregates removed).
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033560 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26934
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 126
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 18.185584319174346
For fold 0: Fold Val Mean Absolute Error: 45.415119042714444


Unnamed: 0,importance,name
6,3710669000000.0,target_rolling_avg_hour_7d
7,76197950000.0,target_rolling_avg_hour_hour_day_4w
120,69322340000.0,is_weekend
25,20578910000.0,direct_solar_radiation
3,19769840000.0,is_consumption
122,17691880000.0,hour_cos
12,17165110000.0,installed_capacity
11,14480960000.0,eic_count
9,14276420000.0,target_rolling_allp_avg_hour_7d
18,12020780000.0,cloudcover_total


Unnamed: 0,importance,name
21,312156500.0,cloudcover_high
81,302340300.0,cloudcover_low_hw_variances_hw_lagged
34,296054600.0,cloudcover_high_hw_means
88,289741000.0,diffuse_radiation_hw_variances_hw_lagged
32,288495500.0,cloudcover_low_hw_means
46,283957500.0,cloudcover_mid_hw_variances
82,278576700.0,cloudcover_mid_hw_variances_hw_lagged
92,272022500.0,cloudcover_high_fw
47,270748500.0,cloudcover_high_hw_variances
93,266867900.0,cloudcover_low_fw




Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041073 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26941
[LightGBM] [Info] Number of data points in the train set: 1304266, number of used features: 126
[LightGBM] [Info] Start training from score 250.930029
###############   Target   #################
For fold 1: Train Mean Absolute Error: 19.20672390475781
For fold 1: Fold Val Mean Absolute Error: 38.06841326889926


Unnamed: 0,importance,name
6,4346938000000.0,target_rolling_avg_hour_7d
120,86871340000.0,is_weekend
7,49398680000.0,target_rolling_avg_hour_hour_day_4w
25,27106250000.0,direct_solar_radiation
3,25993330000.0,is_consumption
122,19296960000.0,hour_cos
12,18542690000.0,installed_capacity
11,16751960000.0,eic_count
9,15829090000.0,target_rolling_allp_avg_hour_7d
18,14166130000.0,cloudcover_total


Unnamed: 0,importance,name
45,418816100.0,cloudcover_low_hw_variances
21,417084600.0,cloudcover_high
68,415667400.0,cloudcover_total_hw_means_hw_lagged
78,399975400.0,dewpoint_hw_variances_hw_lagged
88,395333200.0,diffuse_radiation_hw_variances_hw_lagged
47,392031500.0,cloudcover_high_hw_variances
31,391221300.0,cloudcover_total_hw_means
32,372710700.0,cloudcover_low_hw_means
70,339920800.0,cloudcover_mid_hw_means_hw_lagged
121,337645400.0,hour_sin




Fold 2
Train rows: 1480810
Val rows: 169632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.158703 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26986
[LightGBM] [Info] Number of data points in the train set: 1480810, number of used features: 126
[LightGBM] [Info] Start training from score 255.678793
###############   Target   #################
For fold 2: Train Mean Absolute Error: 19.292432049961864
For fold 2: Fold Val Mean Absolute Error: 41.254720562185504


Unnamed: 0,importance,name
6,5280449000000.0,target_rolling_avg_hour_7d
120,103917600000.0,is_weekend
7,32385730000.0,target_rolling_avg_hour_hour_day_4w
9,29480060000.0,target_rolling_allp_avg_hour_7d
122,25967320000.0,hour_cos
25,25092040000.0,direct_solar_radiation
12,24760280000.0,installed_capacity
11,24201230000.0,eic_count
3,23741840000.0,is_consumption
18,15502140000.0,cloudcover_total


Unnamed: 0,importance,name
52,433362300.0,diffuse_radiation_hw_variances
71,413938300.0,cloudcover_high_hw_means_hw_lagged
83,412288600.0,cloudcover_high_hw_variances_hw_lagged
86,411363000.0,shortwave_radiation_hw_variances_hw_lagged
109,411116400.0,highest_price_3d_avg
68,394799400.0,cloudcover_total_hw_means_hw_lagged
32,386186200.0,cloudcover_low_hw_means
70,372377800.0,cloudcover_mid_hw_means_hw_lagged
69,363691700.0,cloudcover_low_hw_means_hw_lagged
106,347514900.0,lowest_price_per_mwh




Fold 3
Train rows: 1653658
Val rows: 167820
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.147830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26980
[LightGBM] [Info] Number of data points in the train set: 1653658, number of used features: 126
[LightGBM] [Info] Start training from score 262.020470
###############   Target   #################
For fold 3: Train Mean Absolute Error: 19.441677260864086
For fold 3: Fold Val Mean Absolute Error: 58.34307668910384


Unnamed: 0,importance,name
6,6241119000000.0,target_rolling_avg_hour_7d
120,122663700000.0,is_weekend
7,47244160000.0,target_rolling_avg_hour_hour_day_4w
11,34426080000.0,eic_count
122,33354880000.0,hour_cos
5,32660010000.0,target_rolling_avg_24h
12,29553710000.0,installed_capacity
25,26024910000.0,direct_solar_radiation
3,22712500000.0,is_consumption
9,22374350000.0,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
71,495369000.0,cloudcover_high_hw_means_hw_lagged
107,489849600.0,highest_price_per_mwh
87,467300900.0,direct_solar_radiation_hw_variances_hw_lagged
34,459126500.0,cloudcover_high_hw_means
50,454497400.0,shortwave_radiation_hw_variances
46,447512100.0,cloudcover_mid_hw_variances
33,436224200.0,cloudcover_mid_hw_means
83,435951200.0,cloudcover_high_hw_variances_hw_lagged
52,429190500.0,diffuse_radiation_hw_variances
47,428788000.0,cloudcover_high_hw_variances




Fold 4
Train rows: 1824598
Val rows: 176496
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26995
[LightGBM] [Info] Number of data points in the train set: 1824598, number of used features: 126
[LightGBM] [Info] Start training from score 268.598179
###############   Target   #################
For fold 4: Train Mean Absolute Error: 20.72527656574749
For fold 4: Fold Val Mean Absolute Error: 84.17213756625198


Unnamed: 0,importance,name
6,7131062000000.0,target_rolling_avg_hour_7d
120,139420100000.0,is_weekend
5,124892500000.0,target_rolling_avg_24h
25,46662920000.0,direct_solar_radiation
12,45192350000.0,installed_capacity
122,37911590000.0,hour_cos
7,36724140000.0,target_rolling_avg_hour_hour_day_4w
11,34434460000.0,eic_count
3,22982750000.0,is_consumption
116,17086270000.0,hour


Unnamed: 0,importance,name
91,538441400.0,dewpoint_fw
52,531718900.0,diffuse_radiation_hw_variances
71,519267800.0,cloudcover_high_hw_means_hw_lagged
83,517119300.0,cloudcover_high_hw_variances_hw_lagged
106,516760100.0,lowest_price_per_mwh
37,511747300.0,shortwave_radiation_hw_means
104,510212200.0,yesterdays_euros_per_mwh
68,501697400.0,cloudcover_total_hw_means_hw_lagged
98,490562100.0,direct_solar_radiation_fw
64,481471400.0,diffuse_radiation_hw_lagged






### Train 20

Param tuning - num_leaves


In [46]:
def inverse_tic(preds, train):
    """Rescale ``preds`` by ``train.installed_capacity``.

    Computes ``(preds / 1000) * train.installed_capacity``.
    NOTE(review): presumably undoes a target that was normalised as
    ``target / installed_capacity * 1000`` -- confirm against the cell that
    builds ``target_installed_capacity``.

    Args:
        preds: Predictions to rescale (scalar or array-like).
        train: Object exposing an ``installed_capacity`` attribute.

    Returns:
        The rescaled predictions.
    """
    per_unit = preds / 1000
    return per_unit * train.installed_capacity

def train_cv(df):
    """Fit and evaluate an ``LGBMRegressor`` on five time-based folds.

    Parameter-tuning variant: ``num_leaves`` is raised to 62 while
    ``max_depth`` stays at its default.

    For fold ``i`` the training set is every row whose ``date_filter`` value
    is at or before ``datetime_cv_ranges[i][0]``; the validation set is every
    row after that bound and at or before ``datetime_cv_ranges[i][1]``.
    Each fold prints its row counts, the train and validation MAE on
    ``target``, and displays the 30 most and 30 least important features
    (importance measured by gain).

    Relies on the notebook-level names ``date_filter``,
    ``datetime_cv_ranges``, ``LGBMRegressor``, ``pd`` and ``display``.
    NOTE(review): ``date_filter`` is presumably a datetime Series aligned
    with ``df``'s index -- confirm against the cell that creates it.

    Args:
        df: Processed feature frame; must contain ``target`` and every
            column listed in ``drop_cols`` (``DataFrame.drop`` raises
            ``KeyError`` otherwise).

    Returns:
        None. All results are printed or displayed.
    """
    # Imported once per call rather than once per fold.
    from sklearn.metrics import mean_absolute_error

    # Both target columns plus the features excluded from this experiment.
    drop_cols = [
        'target', 'target_installed_capacity', 'quarter', 'season',
        'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end',
        'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged',
        'snowfall_hw_variances', 'snowfall_fw', 'snowfall_hw_means',
    ]
    # Candidate categorical columns; any that were dropped above are
    # filtered out per fold before fitting.
    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month',
        'hour', 'quarter', 'day_of_week', 'is_weekend', 'is_month_start',
        'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(5):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        x_train = train.drop(drop_cols, axis=1)
        y_train = train['target']
        x_val = val.drop(drop_cols, axis=1)
        y_val = val['target']

        # Missing-value indicator columns are treated as categorical too.
        na_flags = list(x_train.columns[x_train.columns.str.contains('is_na')])
        cat_features = [
            c for c in base_cat_features + na_flags if c in x_train.columns
        ]

        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it
        model = LGBMRegressor(random_state=42, n_estimators=2500, verbose=1,
                              n_jobs=32, objective='l2',
                              importance_type='gain', num_leaves=62)
        model.fit(x_train, y_train, categorical_feature=cat_features)

        print("###############   Target   #################")
        mae = mean_absolute_error(y_train, model.predict(x_train))
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        mae = mean_absolute_error(y_val, model.predict(x_val))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)

        importance = pd.DataFrame({'importance': model.feature_importances_,
                                   'name': model.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(30))
        display(importance.tail(30))
        print()
        print()

In [47]:
# Run the five-fold CV with num_leaves=62 to compare against the default-leaves runs.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035766 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27769
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 132
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 13.619523141258638
For fold 0: Fold Val Mean Absolute Error: 44.09273758041939


Unnamed: 0,importance,name
6,3775629000000.0,target_rolling_avg_hour_7d
126,69044220000.0,is_weekend
3,19778710000.0,is_consumption
25,18345150000.0,direct_solar_radiation
128,17880900000.0,hour_cos
12,16573540000.0,installed_capacity
9,16511450000.0,target_rolling_allp_avg_hour_7d
24,13307940000.0,shortwave_radiation
7,13171820000.0,target_rolling_avg_hour_hour_day_4w
11,12863000000.0,eic_count


Unnamed: 0,importance,name
112,318684800.0,highest_price_per_mwh
38,316292300.0,direct_solar_radiation_hw_means
86,303195200.0,cloudcover_low_hw_variances_hw_lagged
46,303015500.0,cloudcover_mid_hw_variances
99,294344200.0,cloudcover_mid_fw
98,290092500.0,cloudcover_low_fw
100,278405700.0,cloudcover_total_fw
91,276473400.0,shortwave_radiation_hw_variances_hw_lagged
58,276001500.0,cloudcover_low_hw_lagged
97,274714400.0,cloudcover_high_fw




Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.108852 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27783
[LightGBM] [Info] Number of data points in the train set: 1304266, number of used features: 132
[LightGBM] [Info] Start training from score 250.930029
###############   Target   #################
For fold 1: Train Mean Absolute Error: 14.565705129174708
For fold 1: Fold Val Mean Absolute Error: 39.86457468400738


Unnamed: 0,importance,name
6,4375554000000.0,target_rolling_avg_hour_7d
126,85517430000.0,is_weekend
25,27006080000.0,direct_solar_radiation
3,25578560000.0,is_consumption
7,22898650000.0,target_rolling_avg_hour_hour_day_4w
128,19499880000.0,hour_cos
11,18223400000.0,eic_count
9,17778540000.0,target_rolling_allp_avg_hour_7d
12,16219400000.0,installed_capacity
24,16084510000.0,shortwave_radiation


Unnamed: 0,importance,name
31,393572700.0,cloudcover_total_hw_means
87,388572400.0,cloudcover_mid_hw_variances_hw_lagged
93,380494300.0,diffuse_radiation_hw_variances_hw_lagged
33,375016800.0,cloudcover_mid_hw_means
98,365408600.0,cloudcover_low_fw
34,363053100.0,cloudcover_high_hw_means
91,355336600.0,shortwave_radiation_hw_variances_hw_lagged
72,351521300.0,cloudcover_low_hw_means_hw_lagged
38,343197200.0,direct_solar_radiation_hw_means
50,325555700.0,shortwave_radiation_hw_variances




Fold 2
Train rows: 1480810
Val rows: 169632
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27835
[LightGBM] [Info] Number of data points in the train set: 1480810, number of used features: 132
[LightGBM] [Info] Start training from score 255.678793
###############   Target   #################
For fold 2: Train Mean Absolute Error: 14.626893897304953
For fold 2: Fold Val Mean Absolute Error: 42.94517378603924


Unnamed: 0,importance,name
6,5293358000000.0,target_rolling_avg_hour_7d
126,103506300000.0,is_weekend
9,30674160000.0,target_rolling_allp_avg_hour_7d
128,25432420000.0,hour_cos
7,25314860000.0,target_rolling_avg_hour_hour_day_4w
11,25100260000.0,eic_count
25,24368820000.0,direct_solar_radiation
3,22386450000.0,is_consumption
12,18173740000.0,installed_capacity
24,16338840000.0,shortwave_radiation


Unnamed: 0,importance,name
91,429034200.0,shortwave_radiation_hw_variances_hw_lagged
21,425742600.0,cloudcover_high
59,423924100.0,cloudcover_mid_hw_lagged
46,421937400.0,cloudcover_mid_hw_variances
52,412276800.0,diffuse_radiation_hw_variances
86,410948900.0,cloudcover_low_hw_variances_hw_lagged
34,401202800.0,cloudcover_high_hw_means
113,395960900.0,lowest_price_3d_avg
87,372000900.0,cloudcover_mid_hw_variances_hw_lagged
99,354888600.0,cloudcover_mid_fw




Fold 3
Train rows: 1653658
Val rows: 167820
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.158375 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27841
[LightGBM] [Info] Number of data points in the train set: 1653658, number of used features: 132
[LightGBM] [Info] Start training from score 262.020470
###############   Target   #################
For fold 3: Train Mean Absolute Error: 14.836592496087857
For fold 3: Fold Val Mean Absolute Error: 60.7988939440116


Unnamed: 0,importance,name
6,6306109000000.0,target_rolling_avg_hour_7d
126,119614900000.0,is_weekend
11,39959770000.0,eic_count
128,34575020000.0,hour_cos
25,24871410000.0,direct_solar_radiation
9,24523960000.0,target_rolling_allp_avg_hour_7d
12,20994910000.0,installed_capacity
3,19805140000.0,is_consumption
24,19667330000.0,shortwave_radiation
18,15512710000.0,cloudcover_total


Unnamed: 0,importance,name
21,452583700.0,cloudcover_high
86,437549900.0,cloudcover_low_hw_variances_hw_lagged
72,435303900.0,cloudcover_low_hw_means_hw_lagged
47,433875500.0,cloudcover_high_hw_variances
112,424322600.0,highest_price_per_mwh
78,422868900.0,direct_solar_radiation_hw_means_hw_lagged
71,410496900.0,cloudcover_total_hw_means_hw_lagged
88,405488300.0,cloudcover_high_hw_variances_hw_lagged
52,383632900.0,diffuse_radiation_hw_variances
60,376685100.0,cloudcover_high_hw_lagged




Fold 4
Train rows: 1824598
Val rows: 176496
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063981 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27858
[LightGBM] [Info] Number of data points in the train set: 1824598, number of used features: 132
[LightGBM] [Info] Start training from score 268.598179
###############   Target   #################
For fold 4: Train Mean Absolute Error: 15.82080296712325
For fold 4: Fold Val Mean Absolute Error: 83.94359982560738


Unnamed: 0,importance,name
6,7280554000000.0,target_rolling_avg_hour_7d
126,138515700000.0,is_weekend
25,43250000000.0,direct_solar_radiation
11,40480780000.0,eic_count
128,38530690000.0,hour_cos
12,35899220000.0,installed_capacity
3,21666290000.0,is_consumption
24,15530740000.0,shortwave_radiation
7,14476680000.0,target_rolling_avg_hour_hour_day_4w
18,13329640000.0,cloudcover_total


Unnamed: 0,importance,name
46,532680700.0,cloudcover_mid_hw_variances
87,526197700.0,cloudcover_mid_hw_variances_hw_lagged
127,504076800.0,hour_sin
44,495113900.0,cloudcover_total_hw_variances
119,484049600.0,year
91,466152900.0,shortwave_radiation_hw_variances_hw_lagged
16,451684600.0,snowfall
74,446953500.0,cloudcover_high_hw_means_hw_lagged
38,442316200.0,direct_solar_radiation_hw_means
98,429494700.0,cloudcover_low_fw






### Train 21

Param tuning - n_estimators



In [12]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    """Rescale ``preds`` by ``train.installed_capacity``.

    Computes ``(preds / 1000) * train.installed_capacity``.
    NOTE(review): presumably undoes a target that was normalised as
    ``target / installed_capacity * 1000`` -- confirm against the cell that
    builds ``target_installed_capacity``.

    Args:
        preds: Predictions to rescale (scalar or array-like).
        train: Object exposing an ``installed_capacity`` attribute.

    Returns:
        The rescaled predictions.
    """
    per_unit = preds / 1000
    return per_unit * train.installed_capacity

def train_cv(df, n_folds=5, show_importance=True):
    """Fit an LGBMRegressor per time-based fold and print train/validation MAE.

    For fold ``i`` the training rows are those where
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those where ``date_filter`` is after that bound and at or before
    ``datetime_cv_ranges[i][1]``.

    ``date_filter`` and ``datetime_cv_ranges`` are notebook-level names defined
    outside this cell. NOTE(review): ``date_filter`` is presumably aligned
    row-for-row with ``df`` -- confirm before passing a different frame.

    Args:
        df: Frame holding ``target``, ``target_installed_capacity`` and every
            column named in the drop list below.
        n_folds: Number of leading ``datetime_cv_ranges`` entries to evaluate.
        show_importance: When true, display the 30 highest and 30 lowest
            gain-importance features of each fold's model.

    Returns:
        None. Results are printed (and optionally displayed).
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    # Columns kept out of the model input: both targets plus the listed
    # calendar and snowfall columns.
    drop_cols = target_cols + [
        'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end',
        'is_quarter_end', 'is_quarter_start', 'is_month_start',
        'snowfall_hw_lagged', 'snowfall_hw_variances', 'snowfall_fw',
        'snowfall_hw_means',
    ]
    candidate_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month',
        'hour', 'quarter', 'day_of_week', 'is_weekend', 'is_month_start',
        'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(n_folds):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        train_y = train[target_cols]
        train_X = train.drop(drop_cols, axis=1)
        val_y = val[target_cols]
        val_X = val.drop(drop_cols, axis=1)

        # Candidate categoricals plus every column whose name contains
        # 'is_na', restricted to columns that survived the drop above.
        na_flags = list(train_X.columns[train_X.columns.str.contains('is_na')])
        cat_features = [c for c in candidate_cat_features + na_flags
                        if c in train_X.columns]

        # Tree shape stays at the LightGBM defaults (max_depth=-1,
        # num_leaves=31); only the number of boosting rounds is set here.
        model = LGBMRegressor(random_state=42, n_estimators=3500, verbose=1,
                              n_jobs=32, objective='l2',
                              importance_type='gain')
        model.fit(train_X, train_y.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        train_mae = mean_absolute_error(train_y.target, model.predict(train_X))
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        val_mae = mean_absolute_error(val_y.target, model.predict(val_X))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)

        if show_importance:
            importance = pd.DataFrame({'importance': model.feature_importances_,
                                       'name': model.feature_name_})
            importance = importance.sort_values('importance', ascending=False)
            display(importance.head(30))
            display(importance.tail(30))
        print()
        print()

In [13]:
# Run the CV defined in the previous cell (n_estimators=3500, default tree shape).
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037181 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27769
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 132
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 16.133794725829613
For fold 0: Fold Val Mean Absolute Error: 44.29328699955898


Unnamed: 0,importance,name
6,3710921000000.0,target_rolling_avg_hour_7d
7,76030150000.0,target_rolling_avg_hour_hour_day_4w
126,69389260000.0,is_weekend
25,20630460000.0,direct_solar_radiation
3,19758070000.0,is_consumption
128,17712330000.0,hour_cos
12,17252480000.0,installed_capacity
11,14850760000.0,eic_count
9,14209500000.0,target_rolling_allp_avg_hour_7d
18,11977880000.0,cloudcover_total


Unnamed: 0,importance,name
97,299720800.0,cloudcover_high_fw
21,296969700.0,cloudcover_high
112,264166900.0,highest_price_per_mwh
99,256542600.0,cloudcover_mid_fw
91,252725500.0,shortwave_radiation_hw_variances_hw_lagged
98,247126700.0,cloudcover_low_fw
100,244762400.0,cloudcover_total_fw
92,242499500.0,direct_solar_radiation_hw_variances_hw_lagged
46,235221500.0,cloudcover_mid_hw_variances
59,228417400.0,cloudcover_mid_hw_lagged




Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045513 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27783
[LightGBM] [Info] Number of data points in the train set: 1304266, number of used features: 132
[LightGBM] [Info] Start training from score 250.930029
###############   Target   #################
For fold 1: Train Mean Absolute Error: 17.16819541562717
For fold 1: Fold Val Mean Absolute Error: 38.06152214999036


Unnamed: 0,importance,name
6,4346977000000.0,target_rolling_avg_hour_7d
126,86879580000.0,is_weekend
7,49269330000.0,target_rolling_avg_hour_hour_day_4w
25,27153580000.0,direct_solar_radiation
3,26069420000.0,is_consumption
128,19266040000.0,hour_cos
12,18470530000.0,installed_capacity
11,16736990000.0,eic_count
9,16030010000.0,target_rolling_allp_avg_hour_7d
18,14314020000.0,cloudcover_total


Unnamed: 0,importance,name
45,363808600.0,cloudcover_low_hw_variances
52,343201900.0,diffuse_radiation_hw_variances
72,341572900.0,cloudcover_low_hw_means_hw_lagged
93,340298900.0,diffuse_radiation_hw_variances_hw_lagged
32,337036900.0,cloudcover_low_hw_means
71,336981400.0,cloudcover_total_hw_means_hw_lagged
98,327450800.0,cloudcover_low_fw
127,322545300.0,hour_sin
78,318404400.0,direct_solar_radiation_hw_means_hw_lagged
97,310339500.0,cloudcover_high_fw




Fold 2
Train rows: 1480810
Val rows: 169632
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27835
[LightGBM] [Info] Number of data points in the train set: 1480810, number of used features: 132
[LightGBM] [Info] Start training from score 255.678793
###############   Target   #################
For fold 2: Train Mean Absolute Error: 17.199067128091837
For fold 2: Fold Val Mean Absolute Error: 41.993909356541074


Unnamed: 0,importance,name
6,5280106000000.0,target_rolling_avg_hour_7d
126,104078800000.0,is_weekend
7,32418760000.0,target_rolling_avg_hour_hour_day_4w
9,29499890000.0,target_rolling_allp_avg_hour_7d
128,25926880000.0,hour_cos
25,25162450000.0,direct_solar_radiation
12,24767930000.0,installed_capacity
11,24102020000.0,eic_count
3,23860410000.0,is_consumption
18,15631790000.0,cloudcover_total


Unnamed: 0,importance,name
87,404492800.0,cloudcover_mid_hw_variances_hw_lagged
21,394440000.0,cloudcover_high
71,364832300.0,cloudcover_total_hw_means_hw_lagged
34,364110300.0,cloudcover_high_hw_means
116,363668100.0,highest_price_7d_avg
73,355024200.0,cloudcover_mid_hw_means_hw_lagged
58,336640700.0,cloudcover_low_hw_lagged
99,332530300.0,cloudcover_mid_fw
127,328217300.0,hour_sin
97,320233600.0,cloudcover_high_fw




Fold 3
Train rows: 1653658
Val rows: 167820
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.195657 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27841
[LightGBM] [Info] Number of data points in the train set: 1653658, number of used features: 132
[LightGBM] [Info] Start training from score 262.020470
###############   Target   #################
For fold 3: Train Mean Absolute Error: 17.50511710643044
For fold 3: Fold Val Mean Absolute Error: 58.653730524280355


Unnamed: 0,importance,name
6,6241214000000.0,target_rolling_avg_hour_7d
126,122574500000.0,is_weekend
7,47386520000.0,target_rolling_avg_hour_hour_day_4w
11,34642350000.0,eic_count
128,33226230000.0,hour_cos
5,32707120000.0,target_rolling_avg_24h
12,29262130000.0,installed_capacity
25,25890490000.0,direct_solar_radiation
3,22865070000.0,is_consumption
9,22534730000.0,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
92,458302600.0,direct_solar_radiation_hw_variances_hw_lagged
47,448098800.0,cloudcover_high_hw_variances
56,446983300.0,surface_pressure_hw_lagged
87,437069900.0,cloudcover_mid_hw_variances_hw_lagged
33,430067000.0,cloudcover_mid_hw_means
52,425895400.0,diffuse_radiation_hw_variances
46,365795500.0,cloudcover_mid_hw_variances
21,363585700.0,cloudcover_high
105,351088700.0,total_precipitation
59,339330300.0,cloudcover_mid_hw_lagged




Fold 4
Train rows: 1824598
Val rows: 176496
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.146874 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27858
[LightGBM] [Info] Number of data points in the train set: 1824598, number of used features: 132
[LightGBM] [Info] Start training from score 268.598179
###############   Target   #################
For fold 4: Train Mean Absolute Error: 18.513984178412134
For fold 4: Fold Val Mean Absolute Error: 80.51404842268971


Unnamed: 0,importance,name
6,7131515000000.0,target_rolling_avg_hour_7d
126,139408100000.0,is_weekend
5,125123500000.0,target_rolling_avg_24h
25,46239400000.0,direct_solar_radiation
12,45820860000.0,installed_capacity
128,38016560000.0,hour_cos
7,36923130000.0,target_rolling_avg_hour_hour_day_4w
11,35113230000.0,eic_count
3,23127590000.0,is_consumption
122,17259740000.0,hour


Unnamed: 0,importance,name
87,530064300.0,cloudcover_mid_hw_variances_hw_lagged
65,528526000.0,diffuse_radiation_hw_lagged
74,522385000.0,cloudcover_high_hw_means_hw_lagged
93,497390700.0,diffuse_radiation_hw_variances_hw_lagged
71,448355700.0,cloudcover_total_hw_means_hw_lagged
127,441956400.0,hour_sin
92,420956600.0,direct_solar_radiation_hw_variances_hw_lagged
58,406427100.0,cloudcover_low_hw_lagged
21,400855100.0,cloudcover_high
78,381634900.0,direct_solar_radiation_hw_means_hw_lagged






### Train 22

Param tuning - n_estimators (2500), max_depth (7), num_leaves (70)



In [21]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    """Rescale ``preds`` by installed capacity.

    Computes ``preds / 1000 * train.installed_capacity``.
    NOTE(review): presumably undoes a target-per-installed-capacity
    normalisation -- confirm against where that target is built.
    """
    per_unit = preds / 1000
    return per_unit * train.installed_capacity

def train_cv(df, n_folds=5, show_importance=False):
    """Fit an LGBMRegressor per time-based fold and print train/validation MAE.

    For fold ``i`` the training rows are those where
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those where ``date_filter`` is after that bound and at or before
    ``datetime_cv_ranges[i][1]``.

    ``date_filter`` and ``datetime_cv_ranges`` are notebook-level names defined
    outside this cell. NOTE(review): ``date_filter`` is presumably aligned
    row-for-row with ``df`` -- confirm before passing a different frame.

    Args:
        df: Frame holding ``target``, ``target_installed_capacity`` and every
            column named in the drop list below.
        n_folds: Number of leading ``datetime_cv_ranges`` entries to evaluate.
        show_importance: When true, display the 30 highest and 30 lowest
            gain-importance features of each fold's model (off by default
            in this run).

    Returns:
        None. Results are printed (and optionally displayed).
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    # Columns kept out of the model input: both targets plus the listed
    # calendar and snowfall columns.
    drop_cols = target_cols + [
        'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end',
        'is_quarter_end', 'is_quarter_start', 'is_month_start',
        'snowfall_hw_lagged', 'snowfall_hw_variances', 'snowfall_fw',
        'snowfall_hw_means',
    ]
    candidate_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month',
        'hour', 'quarter', 'day_of_week', 'is_weekend', 'is_month_start',
        'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(n_folds):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        train_y = train[target_cols]
        train_X = train.drop(drop_cols, axis=1)
        val_y = val[target_cols]
        val_X = val.drop(drop_cols, axis=1)

        # Candidate categoricals plus every column whose name contains
        # 'is_na', restricted to columns that survived the drop above.
        na_flags = list(train_X.columns[train_X.columns.str.contains('is_na')])
        cat_features = [c for c in candidate_cat_features + na_flags
                        if c in train_X.columns]

        # Depth-limited trees for this run: max_depth=7 with up to 70 leaves
        # and 2500 boosting rounds.
        model = LGBMRegressor(random_state=42, n_estimators=2500, verbose=0,
                              n_jobs=31, objective='l2',
                              importance_type='gain', max_depth=7,
                              num_leaves=70)
        model.fit(train_X, train_y.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        train_mae = mean_absolute_error(train_y.target, model.predict(train_X))
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        val_mae = mean_absolute_error(val_y.target, model.predict(val_X))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)

        if show_importance:
            importance = pd.DataFrame({'importance': model.feature_importances_,
                                       'name': model.feature_name_})
            importance = importance.sort_values('importance', ascending=False)
            display(importance.head(30))
            display(importance.tail(30))
        print()
        print()

In [22]:
# Run the CV defined in the previous cell (n_estimators=2500, max_depth=7, num_leaves=70).
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
###############   Target   #################
For fold 0: Train Mean Absolute Error: 14.6978036881538
For fold 0: Fold Val Mean Absolute Error: 45.585199939608366


Fold 1
Train rows: 1304266
Val rows: 173328
###############   Target   #################
For fold 1: Train Mean Absolute Error: 15.628265574727061
For fold 1: Fold Val Mean Absolute Error: 40.71029547481108


Fold 2
Train rows: 1480810
Val rows: 169632
###############   Target   #################
For fold 2: Train Mean Absolute Error: 15.758382567123938
For fold 2: Fold Val Mean Absolute Error: 45.05593023097321


Fold 3
Train rows: 1653658
Val rows: 167820
###############   Target   #################
For fold 3: Train Mean Absolute Error: 16.075784051605037
For fold 3: Fold Val Mean Absolute Error: 61.533667795180406


Fold 4
Train rows: 1824598
Val rows: 176496
###############   Target   #################
For fold 4: Train Mean Absolute Error: 17.057669882790556
For fold 4: Fold 

### Train 23

Param tuning - num_leaves (42) with unlimited max_depth; n_estimators stays at 2500



In [23]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    """Rescale ``preds`` by installed capacity.

    Computes ``preds / 1000 * train.installed_capacity``.
    NOTE(review): presumably undoes a target-per-installed-capacity
    normalisation -- confirm against where that target is built.
    """
    per_unit = preds / 1000
    return per_unit * train.installed_capacity

def train_cv(df, n_folds=5, show_importance=False):
    """Fit an LGBMRegressor per time-based fold and print train/validation MAE.

    For fold ``i`` the training rows are those where
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those where ``date_filter`` is after that bound and at or before
    ``datetime_cv_ranges[i][1]``.

    ``date_filter`` and ``datetime_cv_ranges`` are notebook-level names defined
    outside this cell. NOTE(review): ``date_filter`` is presumably aligned
    row-for-row with ``df`` -- confirm before passing a different frame.

    Args:
        df: Frame holding ``target``, ``target_installed_capacity`` and every
            column named in the drop list below.
        n_folds: Number of leading ``datetime_cv_ranges`` entries to evaluate.
        show_importance: When true, display the 30 highest and 30 lowest
            gain-importance features of each fold's model (off by default
            in this run).

    Returns:
        None. Results are printed (and optionally displayed).
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    # Columns kept out of the model input: both targets plus the listed
    # calendar and snowfall columns.
    drop_cols = target_cols + [
        'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end',
        'is_quarter_end', 'is_quarter_start', 'is_month_start',
        'snowfall_hw_lagged', 'snowfall_hw_variances', 'snowfall_fw',
        'snowfall_hw_means',
    ]
    candidate_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month',
        'hour', 'quarter', 'day_of_week', 'is_weekend', 'is_month_start',
        'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(n_folds):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        train_y = train[target_cols]
        train_X = train.drop(drop_cols, axis=1)
        val_y = val[target_cols]
        val_X = val.drop(drop_cols, axis=1)

        # Candidate categoricals plus every column whose name contains
        # 'is_na', restricted to columns that survived the drop above.
        na_flags = list(train_X.columns[train_X.columns.str.contains('is_na')])
        cat_features = [c for c in candidate_cat_features + na_flags
                        if c in train_X.columns]

        # Unlimited depth (max_depth=-1) with num_leaves raised to 42 from
        # the LightGBM default of 31.
        model = LGBMRegressor(random_state=42, n_estimators=2500, verbose=0,
                              n_jobs=31, objective='l2',
                              importance_type='gain', max_depth=-1,
                              num_leaves=42)
        model.fit(train_X, train_y.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        train_mae = mean_absolute_error(train_y.target, model.predict(train_X))
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        val_mae = mean_absolute_error(val_y.target, model.predict(val_X))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)

        if show_importance:
            importance = pd.DataFrame({'importance': model.feature_importances_,
                                       'name': model.feature_name_})
            importance = importance.sort_values('importance', ascending=False)
            display(importance.head(30))
            display(importance.tail(30))
        print()
        print()

In [24]:
# Run the CV defined in the previous cell (n_estimators=2500, num_leaves=42).
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
###############   Target   #################
For fold 0: Train Mean Absolute Error: 16.142637911751418
For fold 0: Fold Val Mean Absolute Error: 44.99688072904069


Fold 1
Train rows: 1304266
Val rows: 173328
###############   Target   #################
For fold 1: Train Mean Absolute Error: 17.13829277555766
For fold 1: Fold Val Mean Absolute Error: 39.44858514624981


Fold 2
Train rows: 1480810
Val rows: 169632
###############   Target   #################
For fold 2: Train Mean Absolute Error: 17.115075354360584
For fold 2: Fold Val Mean Absolute Error: 41.745626896507495


Fold 3
Train rows: 1653658
Val rows: 167820
###############   Target   #################
For fold 3: Train Mean Absolute Error: 17.40765843565076
For fold 3: Fold Val Mean Absolute Error: 58.65525785517471


Fold 4
Train rows: 1824598
Val rows: 176496
###############   Target   #################
For fold 4: Train Mean Absolute Error: 18.328747258043148
For fold 4: Fold V

### Train 24

Param tuning - reducing num_leaves (42 -> 25)? n_estimators stays at 2500


In [25]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    """Rescale ``preds`` by installed capacity.

    Computes ``preds / 1000 * train.installed_capacity``.
    NOTE(review): presumably undoes a target-per-installed-capacity
    normalisation -- confirm against where that target is built.
    """
    per_unit = preds / 1000
    return per_unit * train.installed_capacity

def train_cv(df, n_folds=5, show_importance=False):
    """Fit an LGBMRegressor per time-based fold and print train/validation MAE.

    For fold ``i`` the training rows are those where
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those where ``date_filter`` is after that bound and at or before
    ``datetime_cv_ranges[i][1]``.

    ``date_filter`` and ``datetime_cv_ranges`` are notebook-level names defined
    outside this cell. NOTE(review): ``date_filter`` is presumably aligned
    row-for-row with ``df`` -- confirm before passing a different frame.

    Args:
        df: Frame holding ``target``, ``target_installed_capacity`` and every
            column named in the drop list below.
        n_folds: Number of leading ``datetime_cv_ranges`` entries to evaluate.
        show_importance: When true, display the 30 highest and 30 lowest
            gain-importance features of each fold's model (off by default
            in this run).

    Returns:
        None. Results are printed (and optionally displayed).
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    # Columns kept out of the model input: both targets plus the listed
    # calendar and snowfall columns.
    drop_cols = target_cols + [
        'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end',
        'is_quarter_end', 'is_quarter_start', 'is_month_start',
        'snowfall_hw_lagged', 'snowfall_hw_variances', 'snowfall_fw',
        'snowfall_hw_means',
    ]
    candidate_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month',
        'hour', 'quarter', 'day_of_week', 'is_weekend', 'is_month_start',
        'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(n_folds):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        train_y = train[target_cols]
        train_X = train.drop(drop_cols, axis=1)
        val_y = val[target_cols]
        val_X = val.drop(drop_cols, axis=1)

        # Candidate categoricals plus every column whose name contains
        # 'is_na', restricted to columns that survived the drop above.
        na_flags = list(train_X.columns[train_X.columns.str.contains('is_na')])
        cat_features = [c for c in candidate_cat_features + na_flags
                        if c in train_X.columns]

        # Unlimited depth (max_depth=-1) with num_leaves lowered to 25,
        # below the LightGBM default of 31.
        model = LGBMRegressor(random_state=42, n_estimators=2500, verbose=0,
                              n_jobs=31, objective='l2',
                              importance_type='gain', max_depth=-1,
                              num_leaves=25)
        model.fit(train_X, train_y.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        train_mae = mean_absolute_error(train_y.target, model.predict(train_X))
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        val_mae = mean_absolute_error(val_y.target, model.predict(val_X))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)

        if show_importance:
            importance = pd.DataFrame({'importance': model.feature_importances_,
                                       'name': model.feature_name_})
            importance = importance.sort_values('importance', ascending=False)
            display(importance.head(30))
            display(importance.tail(30))
        print()
        print()

In [26]:
# Run the CV defined in the previous cell (n_estimators=2500, num_leaves=25).
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
###############   Target   #################
For fold 0: Train Mean Absolute Error: 19.912352598335694
For fold 0: Fold Val Mean Absolute Error: 44.8708060882521


Fold 1
Train rows: 1304266
Val rows: 173328
###############   Target   #################
For fold 1: Train Mean Absolute Error: 20.85706628422974
For fold 1: Fold Val Mean Absolute Error: 38.091581330957155


Fold 2
Train rows: 1480810
Val rows: 169632
###############   Target   #################
For fold 2: Train Mean Absolute Error: 21.02647192096505
For fold 2: Fold Val Mean Absolute Error: 40.109960571346555


Fold 3
Train rows: 1653658
Val rows: 167820
###############   Target   #################
For fold 3: Train Mean Absolute Error: 21.141380213375456
For fold 3: Fold Val Mean Absolute Error: 59.90987504073474


Fold 4
Train rows: 1824598
Val rows: 176496
###############   Target   #################
For fold 4: Train Mean Absolute Error: 22.202148396154136
For fold 4: Fold V

### Train 25

Param tuning - learning rate?


In [27]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    """Rescale ``preds`` by installed capacity.

    Computes ``preds / 1000 * train.installed_capacity``.
    NOTE(review): presumably undoes a target-per-installed-capacity
    normalisation -- confirm against where that target is built.
    """
    per_unit = preds / 1000
    return per_unit * train.installed_capacity

def train_cv(df, n_folds=5, show_importance=False):
    """Fit an LGBMRegressor per time-based fold and print train/validation MAE.

    For fold ``i`` the training rows are those where
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those where ``date_filter`` is after that bound and at or before
    ``datetime_cv_ranges[i][1]``.

    ``date_filter`` and ``datetime_cv_ranges`` are notebook-level names defined
    outside this cell. NOTE(review): ``date_filter`` is presumably aligned
    row-for-row with ``df`` -- confirm before passing a different frame.

    Args:
        df: Frame holding ``target``, ``target_installed_capacity`` and every
            column named in the drop list below.
        n_folds: Number of leading ``datetime_cv_ranges`` entries to evaluate.
        show_importance: When true, display the 30 highest and 30 lowest
            gain-importance features of each fold's model (off by default
            in this run).

    Returns:
        None. Results are printed (and optionally displayed).
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    # Columns kept out of the model input: both targets plus the listed
    # calendar and snowfall columns.
    drop_cols = target_cols + [
        'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end',
        'is_quarter_end', 'is_quarter_start', 'is_month_start',
        'snowfall_hw_lagged', 'snowfall_hw_variances', 'snowfall_fw',
        'snowfall_hw_means',
    ]
    candidate_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month',
        'hour', 'quarter', 'day_of_week', 'is_weekend', 'is_month_start',
        'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(n_folds):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        train_y = train[target_cols]
        train_X = train.drop(drop_cols, axis=1)
        val_y = val[target_cols]
        val_X = val.drop(drop_cols, axis=1)

        # Candidate categoricals plus every column whose name contains
        # 'is_na', restricted to columns that survived the drop above.
        na_flags = list(train_X.columns[train_X.columns.str.contains('is_na')])
        cat_features = [c for c in candidate_cat_features + na_flags
                        if c in train_X.columns]

        # num_leaves=25 with unlimited depth; learning_rate lowered to 0.075
        # (the LightGBM default is 0.1).
        model = LGBMRegressor(random_state=42, n_estimators=2500, verbose=0,
                              n_jobs=31, objective='l2',
                              importance_type='gain', max_depth=-1,
                              num_leaves=25, learning_rate=0.075)
        model.fit(train_X, train_y.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        train_mae = mean_absolute_error(train_y.target, model.predict(train_X))
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        val_mae = mean_absolute_error(val_y.target, model.predict(val_X))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)

        if show_importance:
            importance = pd.DataFrame({'importance': model.feature_importances_,
                                       'name': model.feature_name_})
            importance = importance.sort_values('importance', ascending=False)
            display(importance.head(30))
            display(importance.tail(30))
        print()
        print()

In [28]:
# Run the CV defined in the previous cell (num_leaves=25, learning_rate=0.075).
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
###############   Target   #################
For fold 0: Train Mean Absolute Error: 21.556418583419447
For fold 0: Fold Val Mean Absolute Error: 45.04079736519395


Fold 1
Train rows: 1304266
Val rows: 173328
###############   Target   #################
For fold 1: Train Mean Absolute Error: 22.487449733671884
For fold 1: Fold Val Mean Absolute Error: 38.5889506539956


Fold 2
Train rows: 1480810
Val rows: 169632
###############   Target   #################
For fold 2: Train Mean Absolute Error: 22.6080797457637
For fold 2: Fold Val Mean Absolute Error: 41.06841769876818


Fold 3
Train rows: 1653658
Val rows: 167820
###############   Target   #################
For fold 3: Train Mean Absolute Error: 22.808330503084477
For fold 3: Fold Val Mean Absolute Error: 59.52882376204059


Fold 4
Train rows: 1824598
Val rows: 176496
###############   Target   #################
For fold 4: Train Mean Absolute Error: 23.888184318547214
For fold 4: Fold Val

### Train 25 (second run)

Param tuning - learning rate? (raised from 0.075 to 0.15)


In [29]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    """Rescale ``preds`` by installed capacity.

    Computes ``preds / 1000 * train.installed_capacity``.
    NOTE(review): presumably undoes a target-per-installed-capacity
    normalisation -- confirm against where that target is built.
    """
    per_unit = preds / 1000
    return per_unit * train.installed_capacity

def train_cv(df, n_folds=5, show_importance=False):
    """Fit an LGBMRegressor per time-based fold and print train/validation MAE.

    For fold ``i`` the training rows are those where
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those where ``date_filter`` is after that bound and at or before
    ``datetime_cv_ranges[i][1]``.

    ``date_filter`` and ``datetime_cv_ranges`` are notebook-level names defined
    outside this cell. NOTE(review): ``date_filter`` is presumably aligned
    row-for-row with ``df`` -- confirm before passing a different frame.

    Args:
        df: Frame holding ``target``, ``target_installed_capacity`` and every
            column named in the drop list below.
        n_folds: Number of leading ``datetime_cv_ranges`` entries to evaluate.
        show_importance: When true, display the 30 highest and 30 lowest
            gain-importance features of each fold's model (off by default
            in this run).

    Returns:
        None. Results are printed (and optionally displayed).
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    # Columns kept out of the model input: both targets plus the listed
    # calendar and snowfall columns.
    drop_cols = target_cols + [
        'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end',
        'is_quarter_end', 'is_quarter_start', 'is_month_start',
        'snowfall_hw_lagged', 'snowfall_hw_variances', 'snowfall_fw',
        'snowfall_hw_means',
    ]
    candidate_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month',
        'hour', 'quarter', 'day_of_week', 'is_weekend', 'is_month_start',
        'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(n_folds):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        train_y = train[target_cols]
        train_X = train.drop(drop_cols, axis=1)
        val_y = val[target_cols]
        val_X = val.drop(drop_cols, axis=1)

        # Candidate categoricals plus every column whose name contains
        # 'is_na', restricted to columns that survived the drop above.
        na_flags = list(train_X.columns[train_X.columns.str.contains('is_na')])
        cat_features = [c for c in candidate_cat_features + na_flags
                        if c in train_X.columns]

        # num_leaves=25 with unlimited depth; learning_rate raised to 0.15
        # (the LightGBM default is 0.1).
        model = LGBMRegressor(random_state=42, n_estimators=2500, verbose=0,
                              n_jobs=31, objective='l2',
                              importance_type='gain', max_depth=-1,
                              num_leaves=25, learning_rate=0.15)
        model.fit(train_X, train_y.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        train_mae = mean_absolute_error(train_y.target, model.predict(train_X))
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        val_mae = mean_absolute_error(val_y.target, model.predict(val_X))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)

        if show_importance:
            importance = pd.DataFrame({'importance': model.feature_importances_,
                                       'name': model.feature_name_})
            importance = importance.sort_values('importance', ascending=False)
            display(importance.head(30))
            display(importance.tail(30))
        print()
        print()

In [30]:
# Run the CV defined in the previous cell (num_leaves=25, learning_rate=0.15).
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
###############   Target   #################
For fold 0: Train Mean Absolute Error: 17.838273867444126
For fold 0: Fold Val Mean Absolute Error: 46.2944429595387


Fold 1
Train rows: 1304266
Val rows: 173328
###############   Target   #################
For fold 1: Train Mean Absolute Error: 18.73064636999986
For fold 1: Fold Val Mean Absolute Error: 39.21292944329655


Fold 2
Train rows: 1480810
Val rows: 169632
###############   Target   #################
For fold 2: Train Mean Absolute Error: 18.76238779816518
For fold 2: Fold Val Mean Absolute Error: 42.49123730712652


Fold 3
Train rows: 1653658
Val rows: 167820
###############   Target   #################
For fold 3: Train Mean Absolute Error: 18.969531476693177
For fold 3: Fold Val Mean Absolute Error: 60.1132954217168


Fold 4
Train rows: 1824598
Val rows: 176496
###############   Target   #################
For fold 4: Train Mean Absolute Error: 20.153586620044987
For fold 4: Fold Val 

### Train 26

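Param tuning - full tuned parameter set (column sampling, L1/L2 regularisation, learning rate, max_bin, max_depth, min_data_in_leaf)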


In [32]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    """Rescale ``preds`` by installed capacity.

    Computes ``preds / 1000 * train.installed_capacity``.
    NOTE(review): presumably undoes a target-per-installed-capacity
    normalisation -- confirm against where that target is built.
    """
    per_unit = preds / 1000
    return per_unit * train.installed_capacity

def train_cv(df, n_folds=5, show_importance=False):
    """Fit an LGBMRegressor per time-based fold and print train/validation MAE.

    For fold ``i`` the training rows are those where
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those where ``date_filter`` is after that bound and at or before
    ``datetime_cv_ranges[i][1]``.

    ``date_filter`` and ``datetime_cv_ranges`` are notebook-level names defined
    outside this cell. NOTE(review): ``date_filter`` is presumably aligned
    row-for-row with ``df`` -- confirm before passing a different frame.

    Args:
        df: Frame holding ``target``, ``target_installed_capacity`` and every
            column named in the drop list below.
        n_folds: Number of leading ``datetime_cv_ranges`` entries to evaluate.
        show_importance: When true, display the 30 highest and 30 lowest
            gain-importance features of each fold's model (off by default
            in this run).

    Returns:
        None. Results are printed (and optionally displayed).
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    # Columns kept out of the model input: both targets plus the listed
    # calendar and snowfall columns.
    drop_cols = target_cols + [
        'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end',
        'is_quarter_end', 'is_quarter_start', 'is_month_start',
        'snowfall_hw_lagged', 'snowfall_hw_variances', 'snowfall_fw',
        'snowfall_hw_means',
    ]
    candidate_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month',
        'hour', 'quarter', 'day_of_week', 'is_weekend', 'is_month_start',
        'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(n_folds):
        train_end = datetime_cv_ranges[i][0]
        val_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        train_y = train[target_cols]
        train_X = train.drop(drop_cols, axis=1)
        val_y = val[target_cols]
        val_X = val.drop(drop_cols, axis=1)

        # Candidate categoricals plus every column whose name contains
        # 'is_na', restricted to columns that survived the drop above.
        na_flags = list(train_X.columns[train_X.columns.str.contains('is_na')])
        cat_features = [c for c in candidate_cat_features + na_flags
                        if c in train_X.columns]

        # Regularised configuration with column sampling and max_depth=9;
        # num_leaves is not set here.
        # NOTE(review): the un-rounded values look like the output of an
        # automated search -- confirm where they came from.
        model = LGBMRegressor(
            random_state=42, n_estimators=2500, verbose=0, n_jobs=31,
            objective='l2', importance_type='gain',
            colsample_bynode=0.2996737821583597,
            colsample_bytree=0.6142344384136116,
            lambda_l1=6.924145688620425,
            lambda_l2=1.4645041271999772,
            learning_rate=0.07075448519014384,
            max_bin=520, max_depth=9, min_data_in_leaf=176,
        )
        model.fit(train_X, train_y.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        train_mae = mean_absolute_error(train_y.target, model.predict(train_X))
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        val_mae = mean_absolute_error(val_y.target, model.predict(val_X))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)

        if show_importance:
            importance = pd.DataFrame({'importance': model.feature_importances_,
                                       'name': model.feature_name_})
            importance = importance.sort_values('importance', ascending=False)
            display(importance.head(30))
            display(importance.tail(30))
        print()
        print()

In [33]:
# Run the CV defined in the previous cell (regularised parameter set, max_depth=9).
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
###############   Target   #################
For fold 0: Train Mean Absolute Error: 24.14196773064277
For fold 0: Fold Val Mean Absolute Error: 46.578835013766515


Fold 1
Train rows: 1304266
Val rows: 173328
###############   Target   #################
For fold 1: Train Mean Absolute Error: 25.014742110873687
For fold 1: Fold Val Mean Absolute Error: 41.11527300029508


Fold 2
Train rows: 1480810
Val rows: 169632
###############   Target   #################
For fold 2: Train Mean Absolute Error: 25.153049934179595
For fold 2: Fold Val Mean Absolute Error: 43.92952260980048


Fold 3
Train rows: 1653658
Val rows: 167820
###############   Target   #################
For fold 3: Train Mean Absolute Error: 25.234349977586792
For fold 3: Fold Val Mean Absolute Error: 61.5234089755833


Fold 4
Train rows: 1824598
Val rows: 176496
###############   Target   #################
For fold 4: Train Mean Absolute Error: 26.30260390844555
For fold 4: Fold Va

## Train 27

RANDOMIZED SEARCH (RandomizedSearchCV, not an exhaustive grid)

In [35]:
import random

# Pick one of the two candidate boosting modes at random.
random.choice(('gbdt', 'dart'))

'dart'

In [41]:
%%time

# https://www.kaggle.com/code/chaozhuang/enefit-eda-w-fft-ssa-arima-lgbm?scriptVersionId=156414824#Predictive-Modelling

import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

def tune_lgbm_model(base_params, X_train, y_train, n_iter=10, cv=3):
    """
    Tune a LightGBM regressor with a randomized hyper-parameter search.

    The categorical feature list is derived from the columns of ``X_train``
    itself, so the function works on whatever frame it is given instead of
    depending on a notebook-level ``df_train_data`` variable.

    :param base_params: Dictionary of base parameters for the model
    :param X_train: Training features as a pandas DataFrame (its column names
        select the categorical features)
    :param y_train: Training target variable
    :param n_iter: Number of parameter settings sampled by RandomizedSearchCV
    :param cv: Number of cross-validation folds
    :return: Tuple ``(best_estimator, best_params)`` from the fitted search
    """
    # Candidate categorical columns: fixed identifier/calendar columns plus
    # every column whose name contains 'is_na'. Only those actually present
    # in X_train are passed to LightGBM.
    candidate_cats = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
                      'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
                      'is_year_start', 'is_year_end', 'season']
    na_flag_cols = list(X_train.columns[X_train.columns.str.contains('is_na')])
    cat_features = [c for c in candidate_cats + na_flag_cols if c in X_train.columns]

    # Parameter distributions for random search.
    # scipy's uniform(loc, scale) samples from [loc, loc + scale];
    # randint(low, high) samples integers from [low, high).
    param_dist = {
        'learning_rate': sp_uniform(0.05, 0.15),
        'lambda_l1': sp_uniform(0, 2),
        'lambda_l2': sp_uniform(0, 2),
        'max_bin': sp_randint(100, 500),
        'min_data_in_leaf': sp_randint(15, 150),
        'num_iterations': sp_randint(100, 1000),
        # NOTE(review): 'gbrt' is documented by LightGBM as an alias of 'gbdt'.
        'boosting': ['gbrt', 'dart']
    }

    # Create a LightGBM regressor object
    lgb_reg = lgb.LGBMRegressor(**base_params)

    # Create a RandomizedSearchCV object
    # NOTE(review): an integer `cv` presumably yields plain, unshuffled K-fold
    # splits; for time-ordered rows a time-aware splitter may be preferable - confirm.
    random_search = RandomizedSearchCV(estimator=lgb_reg, param_distributions=param_dist,
                                       n_iter=n_iter, scoring='neg_mean_absolute_error',
                                       cv=cv, verbose=0, random_state=42)

    # Fit the random search to the data; extra keyword arguments are forwarded
    # to the estimator's fit().
    random_search.fit(X_train, y_train, categorical_feature=cat_features)

    # Return the best estimator and best parameters
    return random_search.best_estimator_, random_search.best_params_

# Base LightGBM settings shared by every candidate in the search.
base_params_p1 = dict(verbose=-1, objective='l2', metric='mae', learning_rate=0.1)

# Tune on a single fold (index 4): train on everything up to the fold's first
# boundary, validate on the rows between the two boundaries.
i = 4
train = processed_df_no_na[date_filter <= datetime_cv_ranges[i][0]]
val = processed_df_no_na[(date_filter > datetime_cv_ranges[i][0]) & (date_filter <= datetime_cv_ranges[i][1])]
print(f"Fold {i}", f"Train rows: {len(train)}", f"Val rows: {len(val)}", sep="\n")

# Both target columns are removed from the features, together with the
# calendar-flag and snowfall columns excluded in this experiment.
target_cols = ['target', 'target_installed_capacity']
drop_cols = target_cols + [
    'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end',
    'is_quarter_end', 'is_quarter_start', 'is_month_start',
    'snowfall_hw_lagged', 'snowfall_hw_variances', 'snowfall_fw', 'snowfall_hw_means',
]

df_train_target = train[target_cols]
df_train_data = train.drop(columns=drop_cols)

df_val_target2 = val[target_cols]
df_val_data2 = val.drop(columns=drop_cols)

# Categorical columns that survived the drop above (fixed list plus every 'is_na' flag).
cat_features = [
    c
    for c in [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    if c in df_train_data.columns
]

# Fit the model
best_model, best_params = tune_lgbm_model(base_params_p1, df_train_data, df_train_target["target"])

print("Best parameters of p1:", best_params)

Fold 4
Train rows: 1824598
Val rows: 176496




Best parameters of p1: {'boosting': 'dart', 'lambda_l1': 1.7896547008552977, 'lambda_l2': 1.1957999576221703, 'learning_rate': 0.1882811352534675, 'max_bin': 314, 'min_data_in_leaf': 54, 'num_iterations': 824}
CPU times: total: 3h 20min 43s
Wall time: 11min 3s


In [42]:
from lightgbm import LGBMRegressor

def inverse_tic(preds, train):
    """Rescale predictions by installed capacity.

    Divides ``preds`` by 1000 and multiplies by ``train.installed_capacity``.
    Presumably this undoes a "target per installed capacity" (TIC) scaling
    applied upstream - confirm against the feature pipeline.

    :param preds: Predicted values (scalar or array-like).
    :param train: Object exposing an ``installed_capacity`` attribute/column.
    :return: ``preds / 1000 * train.installed_capacity``.
    """
    scaled = preds / 1000
    capacity = train.installed_capacity
    return scaled * capacity

def train_cv(df):
    """Report train/validation MAE of the tuned DART LightGBM model on five date-based folds.

    For fold ``i`` the training rows are those with
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those with ``date_filter`` in
    ``(datetime_cv_ranges[i][0], datetime_cv_ranges[i][1]]``.
    Row counts and the mean absolute error on ``target`` are printed per fold.

    Relies on the notebook globals ``date_filter`` and ``datetime_cv_ranges``
    (presumably a date Series aligned with ``df`` and a list of
    (train_end, val_end) boundaries - confirm where they are defined).

    :param df: Feature frame containing the target columns and the columns
        listed in ``drop_cols``.
    :return: None; results are printed.
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                 'snowfall_fw', 'snowfall_hw_means']
    candidate_cats = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
                      'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
                      'is_year_start', 'is_year_end', 'season']

    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)

        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)

        # Keep only categorical candidates that survived the column drop.
        na_flag_cols = list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in candidate_cats + na_flag_cols if c in df_train_data.columns]

        # Hyper-parameters come from the randomized search. max_depth is left at -1.
        # The original passed both n_estimators=2500 and its LightGBM alias
        # num_iterations=824; the alias is what LightGBM trains with, so the
        # number of boosting rounds is stated once, as n_estimators.
        clf2 = LGBMRegressor(random_state=42, boosting='dart', n_estimators=824, verbose=0, n_jobs=31, objective='l2', importance_type='gain',
                             lambda_l1=1.7896547008552977, lambda_l2=1.1957999576221703, learning_rate=0.1882811352534675, max_bin=314, min_data_in_leaf=54)
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf2.predict(df_val_data2)
        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)

        # Gain-based importance table, kept for ad-hoc inspection
        # (e.g. display(importance.head(30)) in the notebook).
        importance = pd.DataFrame({'importance': clf2.feature_importances_, 'name': clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        print()
        print()

In [43]:
# Execute the fold loop defined in the cell above on processed_df_no_na;
# it prints per-fold row counts and train/validation MAE.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264




###############   Target   #################
For fold 0: Train Mean Absolute Error: 29.207568796606022
For fold 0: Fold Val Mean Absolute Error: 46.207373454379066


Fold 1
Train rows: 1304266
Val rows: 173328




###############   Target   #################
For fold 1: Train Mean Absolute Error: 30.156043533053378
For fold 1: Fold Val Mean Absolute Error: 38.65626295222639


Fold 2
Train rows: 1480810
Val rows: 169632




###############   Target   #################
For fold 2: Train Mean Absolute Error: 29.592703332291993
For fold 2: Fold Val Mean Absolute Error: 40.07894725488798


Fold 3
Train rows: 1653658
Val rows: 167820




###############   Target   #################
For fold 3: Train Mean Absolute Error: 29.279209453132527
For fold 3: Fold Val Mean Absolute Error: 58.36057509833208


Fold 4
Train rows: 1824598
Val rows: 176496




###############   Target   #################
For fold 4: Train Mean Absolute Error: 30.76478968346665
For fold 4: Fold Val Mean Absolute Error: 78.30915872246612




### Train 6

Increasing n_estimators

In [104]:
def inverse_tic(preds, train):
    """Rescale predictions by installed capacity.

    Divides ``preds`` by 1000 and multiplies by ``train.installed_capacity``.
    Presumably this undoes a "target per installed capacity" (TIC) scaling
    applied upstream - confirm against the feature pipeline.

    :param preds: Predicted values (scalar or array-like).
    :param train: Object exposing an ``installed_capacity`` attribute/column.
    :return: ``preds / 1000 * train.installed_capacity``.
    """
    scaled = preds / 1000
    capacity = train.installed_capacity
    return scaled * capacity

def train_cv(df):
    """Report train/validation MAE of a 2500-tree LightGBM model on five date-based folds.

    For fold ``i`` the training rows are those with
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those with ``date_filter`` in
    ``(datetime_cv_ranges[i][0], datetime_cv_ranges[i][1]]``.
    Row counts and the mean absolute error on ``target`` are printed per fold;
    on the last fold the 30 most and 30 least important features are displayed.

    Relies on the notebook globals ``date_filter`` and ``datetime_cv_ranges``
    (presumably a date Series aligned with ``df`` and a list of
    (train_end, val_end) boundaries - confirm where they are defined).

    :param df: Feature frame containing 'target' and 'target_installed_capacity'.
    :return: None; results are printed/displayed.
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']

    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(target_cols, axis=1)

        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(target_cols, axis=1)

        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end',
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

        # The companion model fitted on target_installed_capacity (`clf`) and
        # its TIC-inversed MAE report are disabled in this experiment; only the
        # model fitted directly on `target` is trained.
        clf2 = LGBMRegressor(random_state=42, n_estimators=2500, verbose=1, n_jobs=32, objective='l2')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf2.predict(df_val_data2)
        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)

        # Feature names must come from the same fitted model as the
        # importances. The original read them from `clf`, which is not
        # defined in this function (NameError or a stale notebook global).
        importance = pd.DataFrame({'importance': clf2.feature_importances_, 'name': clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        if i == 4:
            display(importance.head(30))
            display(importance.tail(30))
        print()
        print()

In [105]:
# Execute the fold loop defined in the cell above on processed_df_no_na;
# it prints per-fold row counts and train/validation MAE.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098813 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28434
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 144
[LightGBM] [Info] Start training from score 250.526332


KeyboardInterrupt: 

### Train 7

Increasing n_estimators again (2500 → 3500)

In [102]:
def inverse_tic(preds, train):
    """Rescale predictions by installed capacity.

    Divides ``preds`` by 1000 and multiplies by ``train.installed_capacity``.
    Presumably this undoes a "target per installed capacity" (TIC) scaling
    applied upstream - confirm against the feature pipeline.

    :param preds: Predicted values (scalar or array-like).
    :param train: Object exposing an ``installed_capacity`` attribute/column.
    :return: ``preds / 1000 * train.installed_capacity``.
    """
    scaled = preds / 1000
    capacity = train.installed_capacity
    return scaled * capacity

def train_cv(df):
    """Report train/validation MAE of a 3500-tree LightGBM model on five date-based folds.

    For fold ``i`` the training rows are those with
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those with ``date_filter`` in
    ``(datetime_cv_ranges[i][0], datetime_cv_ranges[i][1]]``.
    Row counts and the mean absolute error on ``target`` are printed per fold.

    Relies on the notebook globals ``date_filter`` and ``datetime_cv_ranges``
    (presumably a date Series aligned with ``df`` and a list of
    (train_end, val_end) boundaries - confirm where they are defined).

    :param df: Feature frame containing 'target' and 'target_installed_capacity'.
    :return: None; results are printed.
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']

    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(target_cols, axis=1)

        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(target_cols, axis=1)

        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end',
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

        # The companion model fitted on target_installed_capacity (`clf`) and
        # its TIC-inversed MAE report are disabled in this experiment; only the
        # model fitted directly on `target` is trained.
        clf2 = LGBMRegressor(random_state=42, n_estimators=3500, verbose=1, n_jobs=32, objective='l2')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf2.predict(df_val_data2)
        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)

        # Feature names must come from the same fitted model as the
        # importances. The original read them from `clf`, which is not
        # defined in this function (NameError or a stale notebook global).
        # The table is kept for ad-hoc inspection, e.g. display(importance.head(30)).
        importance = pd.DataFrame({'importance': clf2.feature_importances_, 'name': clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        print()
        print()

In [103]:
# Execute the fold loop defined in the cell above on processed_df_no_na;
# it prints per-fold row counts and train/validation MAE.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28434
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 144
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 16.14935142512727
For fold 0: Fold Val Mean Absolute Error: 44.83093739937275


Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28448
[LightGBM] [Info] Number of data points in the train set: 1304266, number o

### Train 8

Increasing n_estimators further (3500 → 10000)

In [106]:
def inverse_tic(preds, train):
    """Rescale predictions by installed capacity.

    Divides ``preds`` by 1000 and multiplies by ``train.installed_capacity``.
    Presumably this undoes a "target per installed capacity" (TIC) scaling
    applied upstream - confirm against the feature pipeline.

    :param preds: Predicted values (scalar or array-like).
    :param train: Object exposing an ``installed_capacity`` attribute/column.
    :return: ``preds / 1000 * train.installed_capacity``.
    """
    scaled = preds / 1000
    capacity = train.installed_capacity
    return scaled * capacity

def train_cv(df):
    """Report train/validation MAE of a 10000-tree LightGBM model on five date-based folds.

    For fold ``i`` the training rows are those with
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those with ``date_filter`` in
    ``(datetime_cv_ranges[i][0], datetime_cv_ranges[i][1]]``.
    Row counts and the mean absolute error on ``target`` are printed per fold;
    on the last fold the 30 most and 30 least important features are displayed.

    Relies on the notebook globals ``date_filter`` and ``datetime_cv_ranges``
    (presumably a date Series aligned with ``df`` and a list of
    (train_end, val_end) boundaries - confirm where they are defined).

    :param df: Feature frame containing 'target' and 'target_installed_capacity'.
    :return: None; results are printed/displayed.
    """
    # Imported once per call instead of once per fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = ['target', 'target_installed_capacity']

    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)

        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)

        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end',
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

        # The companion model fitted on target_installed_capacity (`clf`) and
        # its TIC-inversed MAE report are disabled in this experiment; only the
        # model fitted directly on `target` is trained.
        clf2 = LGBMRegressor(random_state=42, n_estimators=10000, verbose=1, n_jobs=32, objective='l2')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        y_pred = clf2.predict(df_train_data)
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf2.predict(df_val_data2)
        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)

        # Feature names must come from the same fitted model as the
        # importances. The original read them from `clf`, which is not
        # defined in this function (NameError or a stale notebook global).
        importance = pd.DataFrame({'importance': clf2.feature_importances_, 'name': clf2.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        if i == 4:
            display(importance.head(30))
            display(importance.tail(30))
        print()
        print()

In [107]:
# Execute the fold loop defined in the cell above on processed_df_no_na;
# it prints per-fold row counts and train/validation MAE.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28434
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 144
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 9.94322586498788
For fold 0: Fold Val Mean Absolute Error: 44.51990026928266


Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28448
[LightGBM] [Info] Number of data points in the train set: 1304266, number of

Unnamed: 0,importance,name
6,13749,target_rolling_avg_hour_7d
126,11606,hour
5,9755,target_rolling_avg_24h
11,7999,eic_count
12,7772,installed_capacity
7,7446,target_rolling_avg_hour_hour_day_4w
4,7445,target_rt
8,7306,target_rolling_allp_avg_24h
0,6652,county
9,6347,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
123,104,year
58,93,snowfall_hw_lagged
133,17,is_month_end
134,14,is_quarter_start
135,9,is_quarter_end
132,6,is_month_start
136,4,is_year_start
127,3,quarter
138,2,season
137,0,is_year_end






### Train 9

Culling low split features

In [112]:
def inverse_tic(preds, train):
    """Rescale predictions by installed capacity.

    Divides ``preds`` by 1000 and multiplies by ``train.installed_capacity``.
    Presumably this undoes a "target per installed capacity" (TIC) scaling
    applied upstream - confirm against the feature pipeline.

    :param preds: Predicted values (scalar or array-like).
    :param train: Object exposing an ``installed_capacity`` attribute/column.
    :return: ``preds / 1000 * train.installed_capacity``.
    """
    scaled = preds / 1000
    capacity = train.installed_capacity
    return scaled * capacity

def train_cv(df):
    """Print train/validation MAE for a 3500-tree LightGBM model on five date-based folds.

    Fold ``i`` trains on rows where ``date_filter <= datetime_cv_ranges[i][0]``
    and validates on rows where ``date_filter`` lies in
    ``(datetime_cv_ranges[i][0], datetime_cv_ranges[i][1]]``. The calendar-flag
    columns listed in ``culled_cols`` are removed from the features. On the
    last fold the 30 most and 30 least important features are displayed.

    Uses the notebook globals ``date_filter`` and ``datetime_cv_ranges``.

    :param df: Feature frame including 'target' and 'target_installed_capacity'.
    :return: None; results are printed/displayed.
    """
    from sklearn.metrics import mean_absolute_error

    label_cols = ['target', 'target_installed_capacity']
    culled_cols = ['is_year_end', 'season', 'quarter', 'is_year_start', 'is_month_start',
                   'is_quarter_end', 'is_quarter_start', 'is_month_end']
    excluded = label_cols + culled_cols
    categorical_candidates = [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(5):
        lower = datetime_cv_ranges[i][0]
        upper = datetime_cv_ranges[i][1]
        train = df[date_filter <= lower]
        val = df[(date_filter > lower) & (date_filter <= upper)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        features_train = train.drop(columns=excluded)
        labels_train = train['target']
        features_val = val.drop(columns=excluded)
        labels_val = val['target']

        # Categorical columns: the fixed candidates plus every 'is_na' flag,
        # restricted to columns still present after the drop.
        na_flags = list(features_train.columns[features_train.columns.str.contains('is_na')])
        categoricals = [name for name in categorical_candidates + na_flags if name in features_train.columns]

        model = LGBMRegressor(random_state=42, n_estimators=3500, verbose=1, n_jobs=32, objective='l2')
        model.fit(features_train, labels_train, categorical_feature=categoricals)

        print("###############   Target   #################")
        train_mae = mean_absolute_error(labels_train, model.predict(features_train))
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        val_mae = mean_absolute_error(labels_val, model.predict(features_val))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)

        ranking = pd.DataFrame(
            {'importance': model.feature_importances_, 'name': model.feature_name_}
        ).sort_values('importance', ascending=False)
        if i == 4:
            display(ranking.head(30))
            display(ranking.tail(30))
        print()
        print()

In [113]:
# Execute the fold loop defined in the cell above on processed_df_no_na;
# it prints per-fold row counts and train/validation MAE.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28408
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 136
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 16.06946633179013
For fold 0: Fold Val Mean Absolute Error: 44.593979780140145


Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28422
[LightGBM] [Info] Number of data points in the train set: 1304266, number 

Unnamed: 0,importance,name
6,5275,target_rolling_avg_hour_7d
126,3509,hour
11,3251,eic_count
5,3212,target_rolling_avg_24h
12,3021,installed_capacity
0,2513,county
7,2453,target_rolling_avg_hour_hour_day_4w
8,2175,target_rolling_allp_avg_24h
4,2113,target_rt
9,1906,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
38,318,shortwave_radiation_hw_means
94,303,shortwave_radiation_hw_variances_hw_lagged
81,297,direct_solar_radiation_hw_means_hw_lagged
67,295,direct_solar_radiation_hw_lagged
120,293,highest_price_7d_avg
53,290,direct_solar_radiation_hw_variances
39,281,direct_solar_radiation_hw_means
2,276,product_type
29,275,rain_hw_means
95,273,direct_solar_radiation_hw_variances_hw_lagged






### Train 10

Culling low split features again

In [114]:
def inverse_tic(preds, train):
    """Rescale predictions by installed capacity.

    Divides ``preds`` by 1000 and multiplies by ``train.installed_capacity``.
    Presumably this undoes a "target per installed capacity" (TIC) scaling
    applied upstream - confirm against the feature pipeline.

    :param preds: Predicted values (scalar or array-like).
    :param train: Object exposing an ``installed_capacity`` attribute/column.
    :return: ``preds / 1000 * train.installed_capacity``.
    """
    scaled = preds / 1000
    capacity = train.installed_capacity
    return scaled * capacity

def train_cv(df):
    """Print train/validation MAE for a 3500-tree LightGBM model on five date-based folds.

    Fold ``i`` trains on rows where ``date_filter <= datetime_cv_ranges[i][0]``
    and validates on rows where ``date_filter`` lies in
    ``(datetime_cv_ranges[i][0], datetime_cv_ranges[i][1]]``. The columns
    listed in ``culled_cols`` (calendar flags plus snowfall, rain and other
    features) are removed from the features. On the last fold the 30 most and
    30 least important features are displayed.

    Uses the notebook globals ``date_filter`` and ``datetime_cv_ranges``.

    :param df: Feature frame including 'target' and 'target_installed_capacity'.
    :return: None; results are printed/displayed.
    """
    from sklearn.metrics import mean_absolute_error

    label_cols = ['target', 'target_installed_capacity']
    culled_cols = [
        'is_year_end', 'season', 'quarter', 'is_year_start', 'is_month_start',
        'is_quarter_end', 'is_quarter_start', 'is_month_end',
        'snowfall_hw_lagged', 'year', 'snowfall', 'snowfall_hw_variances', 'snowfall_hw_means',
        'snowfall_hw_variances_hw_lagged', 'rain_hw_lagged', 'week_of_year', 'snowfall_hw_means_hw_lagged',
        'rain_hw_variances', 'shortwave_radiation_hw_variances', 'rain_hw_variances_hw_lagged',
        'rain_hw_means_hw_lagged',
    ]
    excluded = label_cols + culled_cols
    categorical_candidates = [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(5):
        lower = datetime_cv_ranges[i][0]
        upper = datetime_cv_ranges[i][1]
        train = df[date_filter <= lower]
        val = df[(date_filter > lower) & (date_filter <= upper)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        features_train = train.drop(columns=excluded)
        labels_train = train['target']
        features_val = val.drop(columns=excluded)
        labels_val = val['target']

        # Categorical columns: the fixed candidates plus every 'is_na' flag,
        # restricted to columns still present after the drop.
        na_flags = list(features_train.columns[features_train.columns.str.contains('is_na')])
        categoricals = [name for name in categorical_candidates + na_flags if name in features_train.columns]

        model = LGBMRegressor(random_state=42, n_estimators=3500, verbose=1, n_jobs=32, objective='l2')
        model.fit(features_train, labels_train, categorical_feature=categoricals)

        print("###############   Target   #################")
        train_mae = mean_absolute_error(labels_train, model.predict(features_train))
        print(f"For fold {i}: Train Mean Absolute Error:", train_mae)

        val_mae = mean_absolute_error(labels_val, model.predict(features_val))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", val_mae)

        ranking = pd.DataFrame(
            {'importance': model.feature_importances_, 'name': model.feature_name_}
        ).sort_values('importance', ascending=False)
        if i == 4:
            display(ranking.head(30))
            display(ranking.tail(30))
        print()
        print()

In [115]:
# Execute the fold loop defined in the cell above on processed_df_no_na;
# it prints per-fold row counts and train/validation MAE.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027849 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26601
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 123
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 16.162003973920534
For fold 0: Fold Val Mean Absolute Error: 44.80166939617799


Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032672 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26609
[LightGBM] [Info] Number of data points in the train set: 1304266, number 

Unnamed: 0,importance,name
6,5206,target_rolling_avg_hour_7d
114,3585,hour
5,3353,target_rolling_avg_24h
11,3233,eic_count
12,3119,installed_capacity
0,2528,county
7,2404,target_rolling_avg_hour_hour_day_4w
4,2180,target_rt
8,2140,target_rolling_allp_avg_24h
9,1867,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
111,406,highest_price_14d_avg
110,403,lowest_price_14d_avg
118,402,hour_sin
54,391,cloudcover_low_hw_lagged
27,389,dewpoint_hw_means
104,386,lowest_price_per_mwh
28,382,rain_hw_means
55,380,cloudcover_mid_hw_lagged
59,374,shortwave_radiation_hw_lagged
36,373,shortwave_radiation_hw_means






### Train 11

Culling low split features again

In [116]:
def inverse_tic(preds, train):
    """Rescale predictions by installed capacity.

    Computes ``preds / 1000`` and multiplies the result by
    ``train.installed_capacity``.

    Args:
        preds: Scalar or array-like of predictions to rescale.
        train: Object exposing an ``installed_capacity`` attribute
            (e.g. the feature DataFrame the predictions were made on).

    Returns:
        The rescaled predictions, in whatever type the arithmetic yields.
    """
    per_thousand = preds / 1000
    return per_thousand * train.installed_capacity

def train_cv(df, n_folds=5):
    """Cross-validate a single LGBM regressor on ``target`` over time-based folds.

    For fold ``i`` the training rows are those where
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those where ``datetime_cv_ranges[i][0] < date_filter <= datetime_cv_ranges[i][1]``.
    Row counts and the train/validation mean absolute error are printed for
    every fold; on the last fold the 30 highest and 30 lowest entries of the
    model's ``feature_importances_`` are displayed.

    The target-over-installed-capacity ("TIC") model variant that used to live
    here as commented-out code is not trained; only ``target`` is modelled.

    Relies on notebook globals: ``date_filter``, ``datetime_cv_ranges``,
    ``LGBMRegressor``, ``pd`` and ``display``.

    Args:
        df: Processed feature frame containing the target columns and every
            column listed in ``drop_cols`` below.
        n_folds: Number of leading entries of ``datetime_cv_ranges`` to
            evaluate (defaults to the previously hard-coded 5).

    Returns:
        None. Results are reported through ``print`` / ``display``.
    """
    # Function-scope import keeps the cell self-contained without re-running
    # the import statement on every fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    # Targets plus the features culled in this experiment.
    drop_cols = target_cols + [
        'is_year_end', 'season', 'quarter', 'is_year_start', 'is_month_start', 'is_quarter_end', 'is_quarter_start', 'is_month_end',
        'snowfall_hw_lagged', 'year', 'snowfall', 'snowfall_hw_variances', 'snowfall_hw_means', 'snowfall_hw_variances_hw_lagged', 'rain_hw_lagged', 'week_of_year', 'snowfall_hw_means_hw_lagged',
        'rain_hw_variances', 'shortwave_radiation_hw_variances', 'rain_hw_variances_hw_lagged', 'rain_hw_means_hw_lagged', 'is_ee_holiday', 'snowfall_fw', 'lowest_price_3d_avg', 'rain',
        'highest_price_7d_avg', 'highest_price_3d_avg', 'shortwave_radiation_hw_variances_hw_lagged', 'cloudcover_total_hw_lagged', 'lowest_price_7d_avg', 'direct_solar_radiation_hw_lagged',
    ]
    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(n_folds):
        fold_start = datetime_cv_ranges[i][0]
        fold_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= fold_start]
        val = df[(date_filter <= fold_end) & (date_filter > fold_start)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        df_val_target = val[target_cols]
        df_val_data = val.drop(drop_cols, axis=1)

        # Only declare categoricals that survived the column cull above.
        is_na_cols = list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in base_cat_features + is_na_cols if c in df_train_data.columns]

        clf2 = LGBMRegressor(random_state=42, n_estimators=3500, verbose=1, n_jobs=32, objective='l2')
        clf2.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        print("###############   Target   #################")
        mae = mean_absolute_error(df_train_target.target, clf2.predict(df_train_data))
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        mae = mean_absolute_error(df_val_target.target, clf2.predict(df_val_data))
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)

        # Feature importances are only inspected for the final (largest) fold.
        if i == n_folds - 1:
            importance = pd.DataFrame({'importance': clf2.feature_importances_, 'name': clf2.feature_name_})
            importance = importance.sort_values('importance', ascending=False)
            display(importance.head(30))
            display(importance.tail(30))
        print()
        print()

In [117]:
# Run the Train 11 cross-validation defined in the preceding cell; per-fold
# metrics and the last fold's feature importances appear as cell output.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017964 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24700
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 113
[LightGBM] [Info] Start training from score 250.526332
###############   Target   #################
For fold 0: Train Mean Absolute Error: 16.27063006107027
For fold 0: Fold Val Mean Absolute Error: 44.82294092525837


Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24696
[LightGBM] [Info] Number of data points in the train set: 1304266, number o

Unnamed: 0,importance,name
6,5209,target_rolling_avg_hour_7d
105,3661,hour
5,3314,target_rolling_avg_24h
12,3126,installed_capacity
11,3114,eic_count
0,2536,county
7,2442,target_rolling_avg_hour_hour_day_4w
8,2148,target_rolling_allp_avg_24h
4,2067,target_rt
9,1914,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
60,517,dewpoint_hw_means_hw_lagged
75,514,cloudcover_low_hw_variances_hw_lagged
99,504,lowest_price_per_mwh
29,493,cloudcover_total_hw_means
59,493,temperature_hw_means_hw_lagged
57,491,shortwave_radiation_hw_lagged
30,491,cloudcover_low_hw_means
37,490,diffuse_radiation_hw_means
100,488,highest_price_per_mwh
70,485,diffuse_radiation_hw_means_hw_lagged






### Train 12

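Producer vs. non-producer: compare a single model trained on all rows against the same model with its production-row (`is_consumption == 0`) predictions replaced by a producer-only model.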

In [120]:
def inverse_tic(preds, train):
    """Rescale predictions by installed capacity.

    Computes ``preds / 1000`` and multiplies the result by
    ``train.installed_capacity``.

    Args:
        preds: Scalar or array-like of predictions to rescale.
        train: Object exposing an ``installed_capacity`` attribute
            (e.g. the feature DataFrame the predictions were made on).

    Returns:
        The rescaled predictions, in whatever type the arithmetic yields.
    """
    per_thousand = preds / 1000
    return per_thousand * train.installed_capacity

def _predict_with_producer_override(model_all, model_producer, data):
    """Predict with ``model_all`` and overwrite producer rows with ``model_producer``.

    Returns a ``(preds_all, preds_mixed)`` pair of numpy arrays, where
    ``preds_mixed`` equals ``preds_all`` except on rows with
    ``is_consumption == 0``, which carry ``model_producer``'s predictions.
    """
    # Explicit numpy mask: the predictions are plain arrays, so index them
    # positionally rather than relying on pandas-to-numpy coercion.
    producer_rows = (data.is_consumption == 0).to_numpy()
    preds_all = model_all.predict(data)
    preds_mixed = preds_all.copy()
    if producer_rows.any():  # a fold without producer rows has nothing to override
        preds_mixed[producer_rows] = model_producer.predict(data[producer_rows])
    return preds_all, preds_mixed


def train_cv(df, n_folds=5):
    """Cross-validate an all-rows LGBM model against a producer-specialised blend.

    For fold ``i`` the training rows are those where
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those where ``datetime_cv_ranges[i][0] < date_filter <= datetime_cv_ranges[i][1]``.
    Two regressors are fitted per fold: ``clf`` on every training row and
    ``clf_producer`` on the rows with ``is_consumption == 0``. MAE is printed
    for ``clf`` alone and for ``clf`` with its producer-row predictions
    replaced by ``clf_producer``'s, on both train and validation data. On the
    last fold the 30 highest and 30 lowest ``feature_importances_`` entries of
    each model are displayed.

    Relies on notebook globals: ``date_filter``, ``datetime_cv_ranges``,
    ``LGBMRegressor``, ``pd`` and ``display``.

    Args:
        df: Processed feature frame containing the target columns and every
            column listed in ``drop_cols`` below.
        n_folds: Number of leading entries of ``datetime_cv_ranges`` to
            evaluate (defaults to the previously hard-coded 5).

    Returns:
        None. Results are reported through ``print`` / ``display``.
    """
    # Function-scope import keeps the cell self-contained without re-running
    # the import statement on every fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = target_cols + [
        'is_year_end', 'season', 'quarter', 'is_year_start', 'is_month_start', 'is_quarter_end', 'is_quarter_start', 'is_month_end',
        'snowfall_hw_lagged', 'year', 'snowfall', 'snowfall_hw_variances', 'snowfall_hw_means', 'snowfall_hw_variances_hw_lagged', 'rain_hw_lagged', 'week_of_year', 'snowfall_hw_means_hw_lagged',
        'rain_hw_variances', 'shortwave_radiation_hw_variances', 'rain_hw_variances_hw_lagged', 'rain_hw_means_hw_lagged',
    ]
    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]

    for i in range(n_folds):
        fold_start = datetime_cv_ranges[i][0]
        fold_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= fold_start]
        val = df[(date_filter <= fold_end) & (date_filter > fold_start)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        df_val_target = val[target_cols]
        df_val_data = val.drop(drop_cols, axis=1)

        # Only declare categoricals that survived the column cull above.
        is_na_cols = list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in base_cat_features + is_na_cols if c in df_train_data.columns]

        clf = LGBMRegressor(random_state=42, n_estimators=3500, verbose=1, n_jobs=32, objective='l2')
        clf_producer = LGBMRegressor(random_state=42, n_estimators=3500, verbose=1, n_jobs=32, objective='l2')

        train_is_producer = df_train_data.is_consumption == 0
        clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)
        clf_producer.fit(
            df_train_data[train_is_producer],
            df_train_target[train_is_producer].target,
            categorical_feature=cat_features,
        )

        y_pred, y_pred2 = _predict_with_producer_override(clf, clf_producer, df_train_data)
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(" Train Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_train_target.target, y_pred2)
        print(" Train Mean w Producer Absolute Error:", mae)

        y_pred_val, y_pred_val2 = _predict_with_producer_override(clf, clf_producer, df_val_data)
        mae = mean_absolute_error(df_val_target.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_val_target.target, y_pred_val2)
        print("Val Mean w Producer Absolute Error:", mae)

        # Feature importances are only inspected for the final (largest) fold.
        if i == n_folds - 1:
            for model in (clf, clf_producer):
                importance = pd.DataFrame({'importance': model.feature_importances_, 'name': model.feature_name_})
                importance = importance.sort_values('importance', ascending=False)
                display(importance.head(30))
                display(importance.tail(30))
        print()
        print()

In [121]:
# Run the Train 12 cross-validation defined in the preceding cell; the
# all-rows vs. producer-override MAE for each fold is printed as cell output.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.114681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26601
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 123
[LightGBM] [Info] Start training from score 250.526332
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26600
[LightGBM] [Info] Number of data points in the train set: 564869, number of used features: 122
[LightGBM] [Info] Start training from score 87.581421
 Train Mean Absolute Error: 16.162003973928336
 Train Mean w Producer Absolute Error: 13.30019595000054
Val Mean Absolute Error: 44.80166939618034
Val Mean w Producer Absolute Error: 43.87361629685433


Fold 1
Train rows: 1304266
Val rows: 173328
[LightGBM] [Inf

Unnamed: 0,importance,name
6,5206,target_rolling_avg_hour_7d
114,3585,hour
5,3353,target_rolling_avg_24h
11,3233,eic_count
12,3119,installed_capacity
0,2528,county
7,2404,target_rolling_avg_hour_hour_day_4w
4,2180,target_rt
8,2140,target_rolling_allp_avg_24h
9,1867,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
111,406,highest_price_14d_avg
110,403,lowest_price_14d_avg
118,402,hour_sin
54,391,cloudcover_low_hw_lagged
27,389,dewpoint_hw_means
104,386,lowest_price_per_mwh
28,382,rain_hw_means
55,380,cloudcover_mid_hw_lagged
59,374,shortwave_radiation_hw_lagged
36,373,shortwave_radiation_hw_means


Unnamed: 0,importance,name
12,4839,installed_capacity
6,3980,target_rolling_avg_hour_7d
11,3355,eic_count
24,2891,direct_solar_radiation
0,2430,county
7,2334,target_rolling_avg_hour_hour_day_4w
114,2234,hour
25,2146,diffuse_radiation
5,2056,target_rolling_avg_24h
23,1929,shortwave_radiation


Unnamed: 0,importance,name
48,383,direct_solar_radiation_hw_variances
53,383,cloudcover_total_hw_lagged
55,373,cloudcover_mid_hw_lagged
36,370,shortwave_radiation_hw_means
121,368,day_of_year_cos
50,364,temperature_hw_lagged
62,363,temperature_hw_means_hw_lagged
83,355,shortwave_radiation_hw_variances_hw_lagged
119,351,hour_cos
71,351,shortwave_radiation_hw_means_hw_lagged






### Train 13

Voting regressor ensembles, with a separate ensemble fitted on production (`is_consumption == 0`) rows

In [19]:
from sklearn.ensemble import VotingRegressor

def inverse_tic(preds, train):
    """Rescale predictions by installed capacity.

    Computes ``preds / 1000`` and multiplies the result by
    ``train.installed_capacity``.

    Args:
        preds: Scalar or array-like of predictions to rescale.
        train: Object exposing an ``installed_capacity`` attribute
            (e.g. the feature DataFrame the predictions were made on).

    Returns:
        The rescaled predictions, in whatever type the arithmetic yields.
    """
    per_thousand = preds / 1000
    return per_thousand * train.installed_capacity

def _predict_with_producer_override(model_all, model_producer, data):
    """Predict with ``model_all`` and overwrite producer rows with ``model_producer``.

    Returns a ``(preds_all, preds_mixed)`` pair of numpy arrays, where
    ``preds_mixed`` equals ``preds_all`` except on rows with
    ``is_consumption == 0``, which carry ``model_producer``'s predictions.
    """
    # Explicit numpy mask: the predictions are plain arrays, so index them
    # positionally rather than relying on pandas-to-numpy coercion.
    producer_rows = (data.is_consumption == 0).to_numpy()
    preds_all = model_all.predict(data)
    preds_mixed = preds_all.copy()
    if producer_rows.any():  # a fold without producer rows has nothing to override
        preds_mixed[producer_rows] = model_producer.predict(data[producer_rows])
    return preds_all, preds_mixed


def train_cv(df, n_folds=5):
    """Cross-validate LGBM voting ensembles, with a producer-specialised blend.

    For fold ``i`` the training rows are those where
    ``date_filter <= datetime_cv_ranges[i][0]`` and the validation rows are
    those where ``datetime_cv_ranges[i][0] < date_filter <= datetime_cv_ranges[i][1]``.
    Two equally weighted five-member ``VotingRegressor`` ensembles are fitted
    per fold: ``clf`` on every training row and ``clf_producer`` on the rows
    with ``is_consumption == 0``. MAE is printed for ``clf`` alone and for
    ``clf`` with its producer-row predictions replaced by ``clf_producer``'s,
    on both train and validation data.

    Categorical columns are cast to pandas ``category`` dtype because the
    features are passed through ``VotingRegressor.fit``, which is called here
    without a ``categorical_feature`` argument.

    Relies on notebook globals: ``date_filter``, ``datetime_cv_ranges``,
    ``LGBMRegressor`` and ``VotingRegressor``.

    Args:
        df: Processed feature frame containing the target columns and every
            column listed in ``drop_cols`` below.
        n_folds: Number of leading entries of ``datetime_cv_ranges`` to
            evaluate (defaults to the previously hard-coded 5).

    Returns:
        None. Results are reported through ``print``.
    """
    # Function-scope import keeps the cell self-contained without re-running
    # the import statement on every fold.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = target_cols + [
        'is_year_end', 'season', 'quarter', 'is_year_start', 'is_month_start', 'is_quarter_end', 'is_quarter_start', 'is_month_end',
        'snowfall_hw_lagged', 'year', 'snowfall', 'snowfall_hw_variances', 'snowfall_hw_means', 'snowfall_hw_variances_hw_lagged', 'rain_hw_lagged', 'week_of_year', 'snowfall_hw_means_hw_lagged',
        'rain_hw_variances', 'shortwave_radiation_hw_variances', 'rain_hw_variances_hw_lagged', 'rain_hw_means_hw_lagged',
    ]
    base_cat_features = [
        "county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
        'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end', 'season',
    ]
    # Settings shared by every ensemble member, and the per-member overrides
    # (listed in estimator order).
    shared_kwargs = dict(random_state=42, verbose=-1, n_jobs=32, objective='l2')
    member_kwargs = [
        dict(n_estimators=3500, learning_rate=0.1),
        dict(n_estimators=2500),
        dict(n_estimators=4500),
        dict(n_estimators=3500, learning_rate=0.05),
        dict(n_estimators=3500, learning_rate=0.13),
    ]

    def build_ensemble(first_index):
        # Estimator names continue numbering from ``first_index`` (lgb_0.., lgb_5..).
        members = [
            (f'lgb_{first_index + offset}', LGBMRegressor(**shared_kwargs, **overrides))
            for offset, overrides in enumerate(member_kwargs)
        ]
        return VotingRegressor(members, weights=[0.2, 0.2, 0.2, 0.2, 0.2])

    for i in range(n_folds):
        fold_start = datetime_cv_ranges[i][0]
        fold_end = datetime_cv_ranges[i][1]
        train = df[date_filter <= fold_start]
        val = df[(date_filter <= fold_end) & (date_filter > fold_start)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        df_val_target = val[target_cols]
        df_val_data = val.drop(drop_cols, axis=1)

        # Only treat as categorical the columns that survived the cull above.
        is_na_cols = list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in base_cat_features + is_na_cols if c in df_train_data.columns]
        category_dtypes = {feature: 'category' for feature in cat_features}
        df_train_data = df_train_data.astype(category_dtypes)
        df_val_data = df_val_data.astype(category_dtypes)

        clf = build_ensemble(0)
        clf_producer = build_ensemble(5)

        train_is_producer = df_train_data.is_consumption == 0
        clf.fit(df_train_data, df_train_target.target)
        clf_producer.fit(df_train_data[train_is_producer], df_train_target[train_is_producer].target)

        y_pred, y_pred2 = _predict_with_producer_override(clf, clf_producer, df_train_data)
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(" Train Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_train_target.target, y_pred2)
        print(" Train Mean w Producer Absolute Error:", mae)

        y_pred_val, y_pred_val2 = _predict_with_producer_override(clf, clf_producer, df_val_data)
        mae = mean_absolute_error(df_val_target.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_val_target.target, y_pred_val2)
        print("Val Mean w Producer Absolute Error:", mae)

        print()
        print()

In [20]:
# Run the Train 13 (voting ensemble) cross-validation defined in the preceding
# cell; per-fold MAE values are printed as cell output.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
 Train Mean Absolute Error: 16.158473235272837
 Train Mean w Producer Absolute Error: 13.306883708162598
Val Mean Absolute Error: 44.068319916626166
Val Mean w Producer Absolute Error: 43.16498582524392


Fold 1
Train rows: 1304266
Val rows: 173328
 Train Mean Absolute Error: 17.156206312920148
 Train Mean w Producer Absolute Error: 14.240507558956622
Val Mean Absolute Error: 37.889459690665014
Val Mean w Producer Absolute Error: 37.567547227992016


Fold 2
Train rows: 1480810
Val rows: 169632
 Train Mean Absolute Error: 17.212468798639485
 Train Mean w Producer Absolute Error: 14.390832522700588
Val Mean Absolute Error: 39.94040763453027
Val Mean w Producer Absolute Error: 39.56369069899141


Fold 3
Train rows: 1653658
Val rows: 167820
 Train Mean Absolute Error: 17.49213599163509
 Train Mean w Producer Absolute Error: 14.72299546650522
Val Mean Absolute Error: 57.916021807495355
Val Mean w Producer Absolute Error: 58.77932335887339


Fold 4

In [None]:
# Both ensembles share one recipe: ten LightGBM parameter sets blended with
# fixed weights. The weights are listed in the same order as the parameter
# sets: p1, n2, mx, p2, p3, n3, n4, n5, p7, n6.
_ensemble_param_sets = [p1, n2, mx, p2, p3, n3, n4, n5, p7, n6]
_ensemble_weights = [0.14, 0.13, 0.08, 0.11, 0.09, 0.1, 0.09, 0.07, 0.12, 0.07]


def _build_voting_ensemble(first_index):
    """Build a weighted VotingRegressor whose members are named lgb_<first_index>.. onwards."""
    members = [
        (f'lgb_{first_index + offset}', lgb.LGBMRegressor(**param_set, random_state=42))
        for offset, param_set in enumerate(_ensemble_param_sets)
    ]
    # Each ensemble gets its own copy of the weight list.
    return VotingRegressor(members, weights=list(_ensemble_weights))


# Members lgb_0 .. lgb_9.
model_consumption = _build_voting_ensemble(0)

# Members lgb_10 .. lgb_19.
model_production = _build_voting_ensemble(10)

### Models

In [53]:
from sklearn.metrics import mean_absolute_error

# Hyper-parameters for a single LGBM model evaluated on the last CV range.
# ``n_iter`` is a LightGBM alias for the number of boosting iterations.
params = {'n_iter': 2500, 'verbose': 1, 'objective': 'l2', 'metric': 'mae', 'learning_rate': 0.05073909898961407, 'colsample_bytree': 0.726023996436955, 'colsample_bynode': 0.5803681307354022,
          'lambda_l1': 8.562963348932286, 'lambda_l2': 4.893256185259296, 'min_data_in_leaf': 115, 'max_depth': 23, 'num_leaves': 50, 'max_bin': 898}

# Train on everything up to the start of the last CV range; validate on that range.
last_range_start = datetime_cv_ranges[-1][0]
last_range_end = datetime_cv_ranges[-1][1]
train = processed_df_no_na[date_filter <= last_range_start]
val = processed_df_no_na[(date_filter <= last_range_end) & (date_filter > last_range_start)]

target_cols = ['target', 'target_installed_capacity']
df_train_target = train[target_cols]
df_train_data = train.drop(target_cols, axis=1)

df_val_target = val[target_cols]
df_val_data = val.drop(target_cols, axis=1)

clf = LGBMRegressor(**params, random_state=42)

# No columns are culled in this cell, so the full categorical list is used as-is.
cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
                'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

# Predict once on the training frame (the earlier duplicate predict call was redundant).
y_pred = clf.predict(df_train_data)
mae = mean_absolute_error(df_train_target.target, y_pred)
print(" Train Mean Absolute Error:", mae)

y_pred_val = clf.predict(df_val_data)
mae = mean_absolute_error(df_val_target.target, y_pred_val)
print("Val Mean Absolute Error:", mae)

# Show the ten highest and ten lowest feature importances of the fitted model.
importance = pd.DataFrame({'importance': clf.feature_importances_, 'name': clf.feature_name_})
importance = importance.sort_values('importance', ascending=False)
display(importance.head(10))
display(importance.tail(10))
print()
print()



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.219732 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84440
[LightGBM] [Info] Number of data points in the train set: 1824598, number of used features: 144
[LightGBM] [Info] Start training from score 268.598179
 Train Mean Absolute Error: 22.29970221631375
Val Mean Absolute Error: 79.12598942626


Unnamed: 0,importance,name
6,5715,target_rolling_avg_hour_7d
5,4274,target_rolling_avg_24h
126,4055,hour
12,4026,installed_capacity
7,3947,target_rolling_avg_hour_hour_day_4w
11,3284,eic_count
4,3144,target_rt
0,2969,county
8,2359,target_rolling_allp_avg_24h
9,2265,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
123,55,year
138,32,season
127,24,quarter
132,21,is_month_start
58,18,snowfall_hw_lagged
133,5,is_month_end
135,3,is_quarter_end
134,3,is_quarter_start
136,0,is_year_start
137,0,is_year_end






In [54]:
# Load the example submission file from the example test files and echo it as
# cell output to inspect its columns (row_id, data_block_id, target).
test_submission = pd.read_csv("data/example_test_files/sample_submission.csv")
test_submission

Unnamed: 0,row_id,data_block_id,target
0,2005872,634,0
1,2005873,634,0
2,2005874,634,0
3,2005875,634,0
4,2005876,634,0
...,...,...,...
12475,2018347,637,0
12476,2018348,637,0
12477,2018349,637,0
12478,2018350,637,0


In [55]:
# Echo the fitted model's predictions on the validation features as cell output.
clf.predict(df_val_data)



array([6.39107379e+00, 1.01617424e+03, 6.95058781e-02, ...,
       4.89672200e+01, 5.24024018e+00, 2.66437232e+02])

In [61]:
# Local inference dry-run: replay the example test files through the
# competition-style iterator and submit predictions from the fitted `clf`.
from data import public_timeseries_testing_util as enefit

# NOTE: pickle.load can execute arbitrary code; only load a pickle file that
# was produced by this project.
with open('data_processor_testing.pkl', 'rb') as f:
    data_processor = pickle.load(f)
# Bare expression; it is not the cell's last statement, so nothing is displayed.
data_processor.df

env = enefit.make_env()

# Each iteration yields one batch of test frames plus a sample submission.
for (test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices, sample_submission) in env.iter_test():
    # Let the processor fold the new batch in and return the feature frame.
    test_data = data_processor.process_test_data_timestep(test, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices)
    display(test_data)
    
    # Keep only rows whose `currently_scored` is False; rows where it is NaN
    # are treated as True by the fillna and are therefore excluded.
    test_data_filtered = test_data[~test_data.currently_scored.fillna(True)]
    test_data_filtered = test_data_filtered.drop('target', axis=1)
    # Set the bookkeeping columns aside so only the remaining columns reach the model.
    other_cols = test_data_filtered[['prediction_datetime', 'currently_scored', 'row_id']]
    test_data_filtered = test_data_filtered.drop(['prediction_datetime', 'currently_scored', 'row_id'], axis=1)
    preds = clf.predict(test_data_filtered)
    # NOTE(review): the displayed submissions below contain repeated row_id
    # values (e.g. 2005872 appears twice) -- presumably a join upstream
    # duplicates rows; verify before relying on these submissions.
    submission = other_cols[['row_id']].copy()
    submission['target'] = preds
    submission = submission.reset_index(drop=True)
    env.predict(submission)
    # Flag every row of the first stored original frame as scored before the
    # next batch. NOTE(review): presumably index 0 is the train/test frame --
    # confirm against get_test_orig_dfs.
    data_processor.test_orig_dfs[0]['currently_scored'] = True
    display(submission)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'] + dt.timedelta(days=1)


0


Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,prediction_datetime,currently_scored,target_rt,target_rolling_avg_24h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,2.977,0,1960760,,,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
1,0,0,1,601.482,1,1960761,,,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
2,0,0,2,0.000,0,1960762,,,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
3,0,0,2,9.943,1,1960763,,,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
4,0,0,3,50.278,0,1960764,,,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51347,15,1,1,,1,2008989,2023-05-28 23:00:00,False,32.809,26.492250,...,False,False,False,False,2,-0.258819,0.965926,0.575190,-0.818020,True
51348,15,1,3,,0,2008990,2023-05-28 23:00:00,False,0.000,362.492542,...,False,False,False,False,2,-0.258819,0.965926,0.575190,-0.818020,True
51349,15,1,3,,0,2008990,2023-05-28 23:00:00,False,0.000,375.690208,...,False,False,False,False,2,-0.258819,0.965926,0.575190,-0.818020,True
51350,15,1,3,,1,2008991,2023-05-28 23:00:00,False,195.707,299.014875,...,False,False,False,False,2,-0.258819,0.965926,0.575190,-0.818020,True




Unnamed: 0,row_id,target
0,2005872,18.606309
1,2005872,26.932363
2,2005873,574.233623
3,2005873,516.410356
4,2005874,-2.070909
...,...,...
6235,2008989,37.287530
6236,2008990,-8.324577
6237,2008990,-8.324577
6238,2008991,288.868177


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'] + dt.timedelta(days=1)


0


Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,prediction_datetime,currently_scored,target_rt,target_rolling_avg_24h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,2.977,0,1960760,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
1,0,0,1,601.482,1,1960761,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
2,0,0,2,0.000,0,1960762,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
3,0,0,2,9.943,1,1960763,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
4,0,0,3,50.278,0,1960764,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57587,15,1,1,,1,2012109,2023-05-29 23:00:00,False,38.646,18.873583,...,False,False,False,False,2,-0.258819,0.965926,0.561034,-0.827793,False
57588,15,1,3,,0,2012110,2023-05-29 23:00:00,False,0.000,304.133875,...,False,False,False,False,2,-0.258819,0.965926,0.561034,-0.827793,False
57589,15,1,3,,0,2012110,2023-05-29 23:00:00,False,0.000,403.044625,...,False,False,False,False,2,-0.258819,0.965926,0.561034,-0.827793,False
57590,15,1,3,,1,2012111,2023-05-29 23:00:00,False,188.689,267.524667,...,False,False,False,False,2,-0.258819,0.965926,0.561034,-0.827793,False




Unnamed: 0,row_id,target
0,2008992,2.334951
1,2008992,4.308818
2,2008993,543.003451
3,2008993,545.110744
4,2008994,-4.441849
...,...,...
6235,2012109,39.542990
6236,2012110,-1.994870
6237,2012110,3.090400
6238,2012111,290.262709


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'] + dt.timedelta(days=1)


0


Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,prediction_datetime,currently_scored,target_rt,target_rolling_avg_24h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,2.977,0,1960760,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
1,0,0,1,601.482,1,1960761,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
2,0,0,2,0.000,0,1960762,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
3,0,0,2,9.943,1,1960763,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
4,0,0,3,50.278,0,1960764,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63827,15,1,1,,1,2015229,2023-05-30 23:00:00,False,35.217,17.434458,...,False,False,False,False,2,-0.258819,0.965926,0.546711,-0.837321,False
63828,15,1,3,,0,2015230,2023-05-30 23:00:00,False,0.000,375.690208,...,False,False,False,False,2,-0.258819,0.965926,0.546711,-0.837321,False
63829,15,1,3,,0,2015230,2023-05-30 23:00:00,False,0.000,506.681000,...,False,False,False,False,2,-0.258819,0.965926,0.546711,-0.837321,False
63830,15,1,3,,1,2015231,2023-05-30 23:00:00,False,189.933,266.825583,...,False,False,False,False,2,-0.258819,0.965926,0.546711,-0.837321,False




Unnamed: 0,row_id,target
0,2012112,35.103232
1,2012112,18.181895
2,2012113,501.313356
3,2012113,514.270241
4,2012114,-4.603473
...,...,...
6235,2015229,37.730012
6236,2015230,-1.791179
6237,2015230,8.700442
6238,2015231,281.709053


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'] + dt.timedelta(days=1)


0


Unnamed: 0,county,is_business,product_type,target,is_consumption,row_id,prediction_datetime,currently_scored,target_rt,target_rolling_avg_24h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,2.977,0,1960760,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
1,0,0,1,601.482,1,1960761,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
2,0,0,2,0.000,0,1960762,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
3,0,0,2,9.943,1,1960763,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
4,0,0,3,50.278,0,1960764,,True,,,...,False,False,False,False,2,-0.258819,0.965926,0.764891,-0.644159,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70067,15,1,1,,1,2018349,2023-05-31 23:00:00,False,31.484,33.753875,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
70068,15,1,3,,0,2018350,2023-05-31 23:00:00,False,0.000,403.044625,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
70069,15,1,3,,0,2018350,2023-05-31 23:00:00,False,0.000,286.517708,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
70070,15,1,3,,1,2018351,2023-05-31 23:00:00,False,183.756,105.720042,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False




Unnamed: 0,row_id,target
0,2015232,1.443170
1,2015232,1.150358
2,2015233,526.388732
3,2015233,545.576868
4,2015234,-8.904476
...,...,...
6235,2018349,40.218498
6236,2018350,2.111896
6237,2018350,-4.018734
6238,2018351,266.094187


In [62]:
def inverse_tic(preds, train):
    """Rescale predictions by ``train.installed_capacity``.

    Computes ``preds / 1000 * train.installed_capacity``.
    NOTE(review): the name suggests this undoes a "target per installed
    capacity" (x1000) normalisation -- confirm against where
    ``target_installed_capacity`` is built.

    Args:
        preds: Scalar or array-like predictions on the normalised scale.
        train: Object exposing an ``installed_capacity`` attribute/column.

    Returns:
        ``preds`` divided by 1000 and multiplied by the installed capacity.
    """
    per_unit = preds / 1000
    capacity = train.installed_capacity
    return per_unit * capacity

def _predict_with_producer(clf, clf_producer, data):
    """Predict with the general model and overlay producer-model predictions.

    Returns a ``(base, blended)`` pair: ``base`` is ``clf.predict(data)`` and
    ``blended`` is a copy of it in which the rows with ``is_consumption == 0``
    are replaced by ``clf_producer``'s predictions for those rows.
    """
    producer_rows = data.is_consumption == 0
    base_pred = clf.predict(data)
    blended_pred = base_pred.copy()
    # Skip the producer model when the split has no producer rows, so it is
    # never asked to predict on an empty frame.
    if producer_rows.any():
        blended_pred[producer_rows.to_numpy()] = clf_producer.predict(data[producer_rows])
    return base_pred, blended_pred


def train_cv(df, n_folds=5, show_importance=False):
    """Fit and score LightGBM models on successive time-based folds.

    For fold ``i`` the training rows are those where the module-level
    ``date_filter`` is ``<= datetime_cv_ranges[i][0]`` and the validation rows
    are those in ``(datetime_cv_ranges[i][0], datetime_cv_ranges[i][1]]``.
    Two regressors are trained per fold: one on every training row and one on
    the ``is_consumption == 0`` rows only. Train and validation MAE are
    printed for the general model alone and for the general model with the
    ``is_consumption == 0`` rows overridden by the second model.

    Args:
        df: Frame holding the features plus the ``target`` and
            ``target_installed_capacity`` columns.
            NOTE(review): ``date_filter`` is a global used as a row mask, so
            it presumably has to be aligned with ``df`` -- confirm at the
            call site.
        n_folds: Number of folds to run. Defaults to 5, the value that used
            to be hard-coded. ``datetime_cv_ranges`` must provide at least
            this many entries, otherwise indexing it raises ``IndexError``.
        show_importance: When true, print the ten most and ten least
            important features of the general model after each fold.

    Returns:
        None. All results are reported via ``print``.
    """
    # Imported once per call instead of once per fold; kept function-local so
    # the block does not rely on a file-level sklearn import.
    from sklearn.metrics import mean_absolute_error

    target_cols = ['target', 'target_installed_capacity']

    # Neither target column contains 'is_na', so taking the indicator columns
    # from ``df`` gives the same list as taking them from the feature frame.
    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
                    'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start',
                    'is_quarter_end', 'is_year_start', 'is_year_end', 'season',
                    ] + list(df.columns[df.columns.str.contains('is_na')])

    model_params = dict(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2')

    for i in range(n_folds):
        train_end, val_end = datetime_cv_ranges[i][0], datetime_cv_ranges[i][1]
        train = df[date_filter <= train_end]
        val = df[(date_filter <= val_end) & (date_filter > train_end)]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")

        df_train_target = train[target_cols]
        df_train_data = train.drop(target_cols, axis=1)

        df_val_target = val[target_cols]
        df_val_data = val.drop(target_cols, axis=1)

        # General model sees every row; the producer model only the
        # is_consumption == 0 rows.
        clf = LGBMRegressor(**model_params)
        clf_producer = LGBMRegressor(**model_params)

        train_producer_rows = df_train_data.is_consumption == 0
        clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)
        clf_producer.fit(
            df_train_data[train_producer_rows],
            df_train_target[train_producer_rows].target,
            categorical_feature=cat_features,
        )

        y_pred, y_pred2 = _predict_with_producer(clf, clf_producer, df_train_data)
        print(" Train Mean Absolute Error:", mean_absolute_error(df_train_target.target, y_pred))
        print(" Train Mean w Producer Absolute Error:", mean_absolute_error(df_train_target.target, y_pred2))

        y_pred_val, y_pred_val2 = _predict_with_producer(clf, clf_producer, df_val_data)
        print("Val Mean Absolute Error:", mean_absolute_error(df_val_target.target, y_pred_val))
        print("Val Mean w Producer Absolute Error:", mean_absolute_error(df_val_target.target, y_pred_val2))

        if show_importance:
            importance = pd.DataFrame({'importance': clf.feature_importances_, 'name': clf.feature_name_})
            importance = importance.sort_values('importance', ascending=False)
            print(importance.head(10))
            print(importance.tail(10))

        # Four blank lines separate the folds in the log.
        print()
        print()
        print()
        print()

In [63]:
# Run the time-split cross-validation on processed_df_no_na; train_cv prints
# the row counts and train/validation MAE for each fold and returns nothing.
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28482
[LightGBM] [Info] Number of data points in the train set: 1129738, number of used features: 167
[LightGBM] [Info] Start training from score 250.526332
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28481
[LightGBM] [Info] Number of data points in the train set: 564869, number of used features: 166
[LightGBM] [Info] Start training from score 87.581421
 Train Mean Absolute Error: 21.623644374411725
 Train Mean w Producer Absolute Error: 18.244623606390444
Val Mean Absolute Error: 45.406346380878844
Val Mean w Producer Absolute Error: 44.0577406652