# Data Processing

I've heard rumours that the API output doesn't have data_block_id - so I'm going to have to look at other ways of joining the data in this notebook to make it API compatible,

In [3]:
import os

os.listdir()

['.git',
 '.gitattributes',
 '.gitignore',
 '.ipynb_checkpoints',
 '1. Data Loading and PreProcessing.ipynb',
 '1.1 Data Loading and PreProcessing LGBM v2.ipynb',
 '2. Training Experiments - LGBM.ipynb',
 '2.1 Training Experiments - LGBM cont.ipynb',
 '2.2 Training Experiments - LGBM cont.ipynb',
 '2.3 Training Experiments - LGBM new dataset from1.1.ipynb',
 '2.4 Training Experiments - LGBM old dataset.ipynb',
 '2.6 Rescued.ipynb',
 '3. Learning Time series.ipynb',
 '3.1 Learning Time series cont.ipynb',
 '3.2 Learning Time series cont.ipynb',
 '4. Data Processing.ipynb',
 '4.1 Training Experiments - statsforecast.ipynb',
 '5. EDA and Data Visualisation.ipynb',
 'data_processor.pkl',
 'data_processor_lgbm2.pkl',
 'Ideas.md',
 'README.md',
 'Untitled.ipynb',
 'WIP Data Exploration.ipynb',
 'WIP Data Loading.ipynb',
 'WIP Training and Sample Submission LGBM.ipynb']

In [1]:
import pandas as pd
import datetime as dt
import numpy as np

import pandas as pd
import datetime as dt
import numpy as np

class TrainDataProcessor:
    """I am rewriting this training data processor to process a few more variables differently."""

    def __init__(self, train, revealed_targets, client, historical_weather,
                 forecast_weather, electricity_prices, gas_prices, for_testing=False,
                add_log_cols=False):
        self.add_log_cols = add_log_cols
        self.test_orig_dfs = self.get_test_orig_dfs([train, revealed_targets, client, historical_weather,
                 forecast_weather, electricity_prices, gas_prices])
        
        self.weather_mapping = self.init_weather_mapping()
        
        if not for_testing:
            self.train = self.init_train(train)
            self.revealed_targets = self.init_revealed_targets(revealed_targets)
            self.client = self.init_client(client)
            
            self.historical_weather = self.init_historical_weather(historical_weather)
            self.forecast_weather = self.init_forecast_weather(forecast_weather)
            self.electricity_prices = self.init_electricity(electricity_prices)
            self.gas_prices = self.init_gas_prices(gas_prices)
            
            self.df_all_cols = self.join_data(self.train, self.revealed_targets, self.client, self.historical_weather, self.forecast_weather, self.electricity_prices, self.gas_prices)
            if self.add_log_cols:
                self.df_all_cols = self.create_log_cols(self.df_all_cols)
            self.df = self.remove_cols(self.df_all_cols)
            
        
    def get_test_orig_dfs(self, dfs):
        for i, df in enumerate(dfs):
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.datetime)
                col = 'datetime'
            if 'prediction_datetime' in df.columns:
                df['prediction_datetime'] = pd.to_datetime(df.prediction_datetime)
                col = 'prediction_datetime'
            if 'forecast_date' in df.columns:
                df['forecast_date'] = pd.to_datetime(df['forecast_date'])
                col = 'forecast_date'
            if 'forecast_datetime' in df.columns:
                df['forecast_datetime'] = pd.to_datetime(df['forecast_datetime'])
                col = 'forecast_datetime'
            if 'date' in df.columns:
                df['date'] = pd.to_datetime(df.date).dt.date
                col = 'date'

            test_date = df[col].iloc[-1]  # Assuming test is a DataFrame
            start_date = test_date - pd.Timedelta(days=14)
            historical_subset = df[df[col] >= start_date]
            dfs[i] = historical_subset.copy()
        return dfs
        
    def init_train(self, df):
        """Prepares the training data for model training."""
        try:
            df['datetime'] = pd.to_datetime(df.datetime)
        except Exception as e:
            df['datetime'] = pd.to_datetime(df.prediction_datetime)
        df['date'] = df.datetime.dt.date
            
        # df = self.get_data_block_id(df, 'datetime')
        return df
    
    def add_electricity_lag_features(self, df):
        ##### mean from entire last week
        df.set_index('datetime', inplace=True)
        # Use rolling to calculate mean price of the last week
        # The window is 7 days, min_periods can be set as per requirement
        # 'closed' determines which side of the interval is closed; it can be 'right' or 'left'
        df['mean_euros_per_mwh_last_week'] = df['euros_per_mwh'].rolling(window='7D', min_periods=1, closed='right').mean()
        # Shift the results to align with the requirement of lagging
        df['mean_euros_per_mwh_last_week'] = df['mean_euros_per_mwh_last_week'].shift()
        
        ##### mean from last week this hour only
        # Extract hour from datetime
        df['hour'] = df.index.hour

        # Group by hour and apply rolling mean for each group
        hourly_groups = df.groupby('hour')
        dff = hourly_groups['euros_per_mwh'].rolling(window='7D', min_periods=1, closed='right').mean()#.shift()#.reset_index(level=0, drop=True)
        dff = dff.reset_index().set_index('datetime').groupby('hour')['euros_per_mwh'].shift()
        dff = dff.rename('mean_euros_per_mwh_same_hour_last_week')
        df = df.join(dff)
        #### yesterday's power price
        df['yesterdays_euros_per_mwh'] = df['euros_per_mwh'].shift(24)
        
        ### 24h average
        # Calculate the 24-hour rolling average
        df['euros_per_mwh_24h_average_price'] = df['euros_per_mwh'].rolling(window=24, min_periods=1).mean()

        # Resetting the index if needed
        df.reset_index(inplace=True)
        df = df.drop(['forecast_date', 'origin_date', 'hour'], axis=1)
        return df

    def init_electricity(self, df):
        ## LAG = 1 Day
        ## Move forecast datetime ahead by 1 day
        ## change name to datetime
        df['datetime'] = pd.to_datetime(df['forecast_date'])
        df['datetime'] = df['datetime'] + dt.timedelta(days=1)
        # df = self.get_data_block_id(df, 'datetime')
        df = self.add_electricity_lag_features(df)
        return df
    
    def add_historical_weather_lag_features(self, df):
        ##### LATEST WEATHER
        def add_latest_weather(df):
            # Assuming df is your original DataFrame
            # Step 1: Convert datetime to a Datetime Object
            df['datetime'] = pd.to_datetime(df['datetime'])
            df.set_index('datetime', inplace=True)

            # Step 2: Sorting the Data
            df.sort_values(by=['datetime', 'latitude', 'longitude'], inplace=True)

            # Step 3: Creating a Unique Identifier for each location
            df['location_id'] = df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Step 4: Filtering for 10:00 AM Entries
            df.reset_index(inplace=True)
            df_10am = df[df['datetime'].dt.hour == 10]
            df_10am.set_index('datetime', inplace=True)

            # Step 5: Shifting the Features by 1 day
            lagged_features = df_10am.groupby('location_id').shift(periods=1, freq='D')
            
            # grouped = lagged_features.groupby('county')
            # lagged_features = grouped[weather_features].mean()
            
            
            # Renaming columns to indicate lag
            lagged_features = lagged_features.add_suffix('_hw_lagged')
            lagged_features['location_id'] = lagged_features['location_id_hw_lagged']
            lagged_features.reset_index(inplace=True)
            lagged_features['date'] = lagged_features.datetime.dt.date

            df['date'] = df.datetime.dt.date
            return lagged_features
            # Step 6: Merging Lagged Features with Original DataFrame
            df = df.merge(lagged_features, on=['date', 'location_id'], how='left', suffixes=('', '_hw_lagged'))
            return df
        
        ##### mean from last day
        def add_24h_mean_var(df, weather_features):
            # Calculate the start and end times for each row
            df['start_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=2) + pd.Timedelta(hours=11)
            df['end_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=1) + pd.Timedelta(hours=10)
            df['time_code'] = df['start_time'].astype(str) +'_' + df['end_time'].astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)
            # print(df.time_code)

            # Create a helper column for grouping
            # If the time is before 11:00 AM, subtract a day
            df['group'] = df['datetime'].apply(lambda dt: dt if dt.time() >= pd.to_datetime('11:00').time() else dt - pd.Timedelta(days=1))
            df['group'] = df['group'].dt.date  # Keep only the date part for grouping
            df['group'] = (pd.to_datetime(df['group']) + pd.Timedelta(hours=11)).astype(str) + '_' + (pd.to_datetime(df['group']) + pd.Timedelta(days=1, hours=10)).astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Now group by this new column
            grouped = df.groupby('group')
            means = grouped[weather_features].mean()
            variances = grouped[weather_features].var()

            # Merge means and variances into the original DataFrame
            my_df = df.merge(means, left_on='time_code', right_on='group', suffixes=('', '_hw_means'), how='left')
            my_df = my_df.merge(variances, left_on='time_code', right_on='group', how='left', suffixes=('', '_hw_variances'))

            return my_df
        
        ##### mean from last day all estonia
        def add_24h_mean_var_estonia(df, weather_features):
            # Calculate the start and end times for each row
            df['start_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=2) + pd.Timedelta(hours=11)
            df['end_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=1) + pd.Timedelta(hours=10)
            df['time_code'] = df['start_time'].astype(str) +'_' + df['end_time'].astype(str)
            # print(df.time_code)

            # Create a helper column for grouping
            # If the time is before 11:00 AM, subtract a day
            df['group'] = df['datetime'].apply(lambda dt: dt if dt.time() >= pd.to_datetime('11:00').time() else dt - pd.Timedelta(days=1))
            df['group'] = df['group'].dt.date  # Keep only the date part for grouping
            df['group'] = (pd.to_datetime(df['group']) + pd.Timedelta(hours=11)).astype(str) + '_' + (pd.to_datetime(df['group']) + pd.Timedelta(days=1, hours=10)).astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Now group by this new column
            grouped = df.groupby('group')
            means = grouped[weather_features].mean()
            variances = grouped[weather_features].var()

            # Merge means and variances into the original DataFrame
            my_df = df.merge(means, left_on='time_code', right_on='group', suffixes=('', '_hw_means_estonia'), how='left')
            my_df = my_df.merge(variances, left_on='time_code', right_on='group', how='left', suffixes=('', '_hw_variances'))

            return my_df

        df['datetime'] = pd.to_datetime(df['datetime'])
        weather_features = df.columns.drop(['datetime', 'latitude', 'longitude'])

        # Apply the function
        df = add_24h_mean_var(df, weather_features)       
        latest = add_latest_weather(df)
        df = df.merge(latest, on=['date', 'location_id'], how='left', suffixes=('', '_hw_lagged'))
        
        return df

    def init_historical_weather(self, df):
        ## LAG: From 11:00 AM 2 days ago to 10:00 AM 1 day ago
        ## What to do? Give most recent weather forecast? Give average over the last day?
        """
        Processes the historical weather data.
        """
        df['datetime'] = pd.to_datetime(df.datetime)
        
        
        
        df = self.add_historical_weather_lag_features(df)
        
        df = df.merge(self.weather_mapping, how='inner', on=('latitude', 'longitude'))
        
        return df

    def init_forecast_weather(self, df):
        ## LAG: DON't ADJUST
        ##      The forecast is from yesterday, but can forecast today, which is 22 hours ahead
        ## Drop any columns where:
        ##                        hours_ahead < 22 and hours_ahead > 45
        ## Then rename forecast_datetime to datetime and join on datetime
        """
        Processes the forecast weather data.
        """
        df['datetime'] = pd.to_datetime(df['forecast_datetime'])
        # keep only datetimes from our relevant period
        df = df[(df['hours_ahead'] < 46) & (df['hours_ahead'] > 21)]
        df['datetime'] = df['datetime'] + dt.timedelta(days=1)
        df = df.merge(self.weather_mapping, how='inner', on=('latitude', 'longitude'))
        return df
    
    def add_gas_prices_lag_features(self, df):
        df['date'] = pd.to_datetime(df['date'])
        df.set_index('date', inplace=True)

        # Sort the DataFrame by date, if it's not already sorted
        df.sort_index(inplace=True)

        # Calculate rolling averages for different time windows
        df['lowest_price_3d_avg'] = df['lowest_price_per_mwh'].rolling(window=3).mean()
        df['highest_price_3d_avg'] = df['highest_price_per_mwh'].rolling(window=3).mean()

        df['lowest_price_7d_avg'] = df['lowest_price_per_mwh'].rolling(window=7).mean()
        df['highest_price_7d_avg'] = df['highest_price_per_mwh'].rolling(window=7).mean()

        df['lowest_price_14d_avg'] = df['lowest_price_per_mwh'].rolling(window=14).mean()
        df['highest_price_14d_avg'] = df['highest_price_per_mwh'].rolling(window=14).mean()

        # Reset the index if you want the 'date' column back
        df.reset_index(inplace=True)
        return df

    def init_gas_prices(self, df):
        ## LAG: 1 DAY
        ## Predictions are made from 2 days ago and predict for yesterday
        ## add one day to forecast_date
        ## Rename forecast_date to date, join on date
        """
        Processes the gas prices data.
        Implement the logic to handle gas prices data processing here.
        """
        df['date'] = pd.to_datetime(df['forecast_date']).dt.date
        df['date'] = df['date'] + dt.timedelta(days=1)
        df = self.add_gas_prices_lag_features(df)
        return df
    
    def add_revealed_target_features(self, df):
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['hour'] = df.datetime.dt.hour
        df['day'] = df.datetime.dt.dayofweek
        df.set_index('datetime', inplace=True)
        
        # let me add some new features here too
        # Adding lag features
        # Step 2: Sorting the Data
        df.sort_values(by=['datetime'], inplace=True)

        # Step 3: Creating a Unique Identifier for each location
        df['id'] = df['county'].astype(str) + '_' + df['is_business'].astype(str) + '_' + df['product_type'].astype(str) + '_' + df['is_consumption'].astype(str)
        lagged_features = []
        lagged_hours = []
        ### Defining lagged target features

        for lag_hours in range(1, 24):
            lagged_feature = df.groupby('id').shift(periods=lag_hours, freq='H')
            lagged_features.append(lagged_feature)
            lagged_hours.append(lag_hours)

        for lag_hours in ([i*24 for i in range(1,8)] + [24*11, 24*12]):
            lagged_feature = df.groupby('id').shift(periods=lag_hours, freq='H')
            lagged_features.append(lagged_feature)
            lagged_hours.append(lag_hours)
            
        df.reset_index(inplace=True)
        for lagged_feature, lag_hours in zip(lagged_features, lagged_hours):
            lagged_feature.reset_index(inplace=True)
            lagged_feature.dropna(inplace=True)
            df = df.merge(lagged_feature[['datetime', 'target', 'id']], on=['id', 'datetime'], how='left', suffixes=('', f'_lag_{lag_hours}h'))

        df.set_index('datetime', inplace=True)
        
        

        window_size = 7
        # Group by the specified columns and then apply the rolling mean
        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])
        df['target_rolling_avg_24h'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption', 'hour'])
        df['target_rolling_avg_hour_7d'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        # grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption', 'hour', 'day'])
        # df['target_rolling_avg_hour_hour_day_4w'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption'])
        df['target_rolling_allp_avg_24h'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption', 'hour'])
        df['target_rolling_allp_avg_hour_7d'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption', 'hour', 'day'])
        df['target_rolling_allp_avg_hour_hour_day_4w'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())
        
        #All of estonia
        grouped = df.groupby(['is_business', 'product_type', 'is_consumption'])
        df['target_rolling_avg_24h_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['is_business', 'product_type', 'is_consumption', 'hour'])
        df['target_rolling_avg_hour_7d_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        # grouped = df.groupby(['is_business', 'product_type', 'is_consumption', 'hour', 'day'])
        # df['target_rolling_avg_hour_hour_day_4w_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())

        grouped = df.groupby(['is_business', 'is_consumption'])
        df['target_rolling_allp_avg_24h_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['is_business', 'is_consumption', 'hour'])
        df['target_rolling_allp_avg_hour_7d_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        # grouped = df.groupby(['is_business', 'is_consumption', 'hour', 'day'])
        # df['target_rolling_allp_avg_hour_hour_day_4w_estonia'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())
        
        df = df.drop(['hour', 'day'], axis=1)

        return df
    
    def init_revealed_targets(self, df):
        df['datetime'] = pd.to_datetime(df.datetime)
        df['datetime'] = df['datetime'] + dt.timedelta(days=2)
        df = self.add_revealed_target_features(df)
        return df
    
    def init_client(self, df):
        ## LAG: 2 days
        ## Add 2 days to date, join on date
        df['date'] = pd.to_datetime(df.date).dt.date
        df['date'] = df['date'] + dt.timedelta(days=2)
        # df = self.get_data_block_id(df, 'date')
        return df

    def init_weather_mapping(self):
        # https://www.kaggle.com/code/tsunotsuno/enefit-eda-baseline/notebook#Baseline
        county_point_map = {
            0: (59.4, 24.7), # "HARJUMAA"
            1 : (58.8, 22.7), # "HIIUMAA"
            2 : (59.1, 27.2), # "IDA-VIRUMAA"
            3 : (58.8, 25.7), # "JÄRVAMAA"
            4 : (58.8, 26.2), # "JÕGEVAMAA"
            5 : (59.1, 23.7), # "LÄÄNE-VIRUMAA"
            6 : (59.1, 23.7), # "LÄÄNEMAA"
            7 : (58.5, 24.7), # "PÄRNUMAA"
            8 : (58.2, 27.2), # "PÕLVAMAA"
            9 : (58.8, 24.7), # "RAPLAMAA"
            10 : (58.5, 22.7),# "SAAREMAA"
            11 : (58.5, 26.7),# "TARTUMAA"
            12 : (58.5, 25.2),# "UNKNOWNN" (center of the map)
            13 : (57.9, 26.2),# "VALGAMAA"
            14 : (58.2, 25.7),# "VILJANDIMAA"
            15 : (57.9, 27.2) # "VÕRUMAA"
        }
        # Convert the dictionary to a list of tuples
        data = [(county_code, lat, lon) for county_code, (lat, lon) in county_point_map.items()]

        # Create DataFrame
        df = pd.DataFrame(data, columns=['county', 'latitude', 'longitude'])
        
        return df
    
    def add_date_features(self, df):
        df['year'] = df['datetime'].dt.year
        df['month'] = df['datetime'].dt.month
        df['day'] = df['datetime'].dt.day
        df['hour'] = df['datetime'].dt.hour
        df['quarter'] = df['datetime'].dt.quarter
        df['day_of_week'] = df['datetime'].dt.day_of_week
        df['day_of_year'] = df['datetime'].dt.dayofyear
        df['week_of_year'] = df['datetime'].dt.isocalendar().week
        df['is_weekend'] = df['datetime'].dt.day_of_week >= 5
        df['is_month_start'] = df['datetime'].dt.is_month_start
        df['is_month_end'] = df['datetime'].dt.is_month_end
        df['is_quarter_start'] = df['datetime'].dt.is_quarter_start
        df['is_quarter_end'] = df['datetime'].dt.is_quarter_end
        df['is_year_start'] = df['datetime'].dt.is_year_start
        df['is_year_end'] = df['datetime'].dt.is_year_end
        df['season'] = df['datetime'].dt.month % 12 // 3 + 1
        df['hour_sin'] = np.sin(df['datetime'].dt.hour * (2. * np.pi / 24))
        df['hour_cos'] = np.cos(df['datetime'].dt.hour * (2. * np.pi / 24))
        # Calculate sin and cos for day of year
        days_in_year = 365.25  # accounts for leap year
        df['day_of_year_sin'] = np.sin((df['day_of_year'] - 1) * (2 * np.pi / days_in_year))
        df['day_of_year_cos'] = np.cos((df['day_of_year'] - 1) * (2 * np.pi / days_in_year))
        return df
    
    def add_ee_holidays(self, df):
        import holidays
        # Define Estonia public holidays
        ee_holidays = holidays.CountryHoliday('EE')
        
        print(df['date'].isna().sum())
        
        def find_problem(x):
            try:
                return x in ee_holidays
            except Exception as e:
                print(x)
                raise e

        # Function to check if the date is a holiday
        df['is_ee_holiday'] = df['date'].apply(lambda x: x in ee_holidays)

        return df
    
    def create_log_cols(self, df):
        log_cols = ['target_lag_1h', 'target_lag_2h', 'target_lag_3h', 'target_lag_4h',
       'target_lag_5h', 'target_lag_6h', 'target_lag_7h', 'target_lag_8h',
       'target_lag_9h', 'target_lag_10h', 'target_lag_11h', 'target_lag_12h',
       'target_lag_13h', 'target_lag_14h', 'target_lag_15h', 'target_lag_16h',
       'target_lag_17h', 'target_lag_18h', 'target_lag_19h', 'target_lag_20h',
       'target_lag_21h', 'target_lag_22h', 'target_lag_23h', 'target_lag_24h',
       'target_lag_48h', 'target_lag_72h', 'target_lag_96h', 'target_lag_120h',
       'target_lag_144h', 'target_lag_168h', 'target_lag_264h',
       'target_lag_288h', 'eic_count', 'installed_capacity', 'temperature', 'dewpoint', 'rain',
       'snowfall', 'surface_pressure', 'cloudcover_total', 'cloudcover_low',
       'cloudcover_mid', 'cloudcover_high', 'windspeed_10m',
       'winddirection_10m', 'shortwave_radiation', 'direct_solar_radiation',
       'diffuse_radiation', 'temperature_hw_means', 'dewpoint_hw_means',
       'rain_hw_means', 'snowfall_hw_means', 'surface_pressure_hw_means',
       'cloudcover_total_hw_means', 'cloudcover_low_hw_means',
       'cloudcover_mid_hw_means', 'cloudcover_high_hw_means',
       'windspeed_10m_hw_means', 'winddirection_10m_hw_means',
       'shortwave_radiation_hw_means', 'direct_solar_radiation_hw_means',
       'diffuse_radiation_hw_means', 'temperature_hw_variances',
       'dewpoint_hw_variances', 'rain_hw_variances', 'snowfall_hw_variances',
       'surface_pressure_hw_variances', 'cloudcover_total_hw_variances',
       'cloudcover_low_hw_variances', 'cloudcover_mid_hw_variances',
       'cloudcover_high_hw_variances', 'windspeed_10m_hw_variances',
       'winddirection_10m_hw_variances', 'shortwave_radiation_hw_variances',
       'direct_solar_radiation_hw_variances', 'diffuse_radiation_hw_variances',
       'temperature_hw_lagged', 'dewpoint_hw_lagged', 'rain_hw_lagged',
       'snowfall_hw_lagged', 'surface_pressure_hw_lagged',
       'cloudcover_total_hw_lagged', 'cloudcover_low_hw_lagged', 'cloudcover_mid_hw_lagged',
       'cloudcover_high_hw_lagged', 'windspeed_10m_hw_lagged',
       'winddirection_10m_hw_lagged', 'shortwave_radiation_hw_lagged',
       'direct_solar_radiation_hw_lagged', 'diffuse_radiation_hw_lagged',
       'temperature_hw_means_hw_lagged', 'dewpoint_hw_means_hw_lagged',
       'rain_hw_means_hw_lagged', 'snowfall_hw_means_hw_lagged',
       'surface_pressure_hw_means_hw_lagged',
       'cloudcover_total_hw_means_hw_lagged',
       'cloudcover_low_hw_means_hw_lagged',
       'cloudcover_mid_hw_means_hw_lagged',
       'cloudcover_high_hw_means_hw_lagged',
       'windspeed_10m_hw_means_hw_lagged',
       'winddirection_10m_hw_means_hw_lagged',
       'shortwave_radiation_hw_means_hw_lagged',
       'direct_solar_radiation_hw_means_hw_lagged',
       'diffuse_radiation_hw_means_hw_lagged',
       'temperature_hw_variances_hw_lagged', 'dewpoint_hw_variances_hw_lagged',
       'rain_hw_variances_hw_lagged', 'snowfall_hw_variances_hw_lagged',
       'surface_pressure_hw_variances_hw_lagged',
       'cloudcover_total_hw_variances_hw_lagged',
       'cloudcover_low_hw_variances_hw_lagged',
       'cloudcover_mid_hw_variances_hw_lagged',
       'cloudcover_high_hw_variances_hw_lagged',
       'windspeed_10m_hw_variances_hw_lagged',
       'winddirection_10m_hw_variances_hw_lagged',
       'shortwave_radiation_hw_variances_hw_lagged',
       'direct_solar_radiation_hw_variances_hw_lagged',
       'diffuse_radiation_hw_variances_hw_lagged', 'temperature_fw', 'dewpoint_fw', 'cloudcover_high_fw',
       'cloudcover_low_fw', 'cloudcover_mid_fw', 'cloudcover_total_fw',
       '10_metre_u_wind_component', '10_metre_v_wind_component',
       'direct_solar_radiation_fw', 'surface_solar_radiation_downwards',
       'snowfall_fw', 'total_precipitation', 'euros_per_mwh', 'mean_euros_per_mwh_last_week',
       'mean_euros_per_mwh_same_hour_last_week', 'yesterdays_euros_per_mwh',
       'euros_per_mwh_24h_average_price', 'lowest_price_per_mwh',
       'highest_price_per_mwh', 'lowest_price_3d_avg', 'highest_price_3d_avg',
       'lowest_price_7d_avg', 'highest_price_7d_avg', 'lowest_price_14d_avg',
       'highest_price_14d_avg']
        
        log_cols = [col for col in log_cols if col in df.columns]
        
        dff = np.log1p(df[log_cols] )
        dff.rename(columns={col: col + "_log" for col in log_cols}, inplace=True)
        return pd.concat([df, dff], axis=1)
        
    
    def remove_cols(self, df):
        col_list = ['datetime',
                   'row_id',
                   'prediction_unit_id',
                    'date_train',
                    'hour_part',
                   'date_client',
                    'forecast_date_elec_price',
                    'origin_date_elec_price',
                    'forecast_date_gas_price',
                    'origin_date_gas_price',
                    'datetime_hist_weath',
                   'hour_part_hist_weath_latest',
                    'datetime_hist_weath_latest',
                   'origin_datetime',
                   'hour_part_fore_weath',
                    'datetime',
                    'id',
                     'data_block_id',
                     'row_id',
                     'prediction_unit_id',
                     'date',
                    'data_block_id_rt',
                     'row_id_rt',
                     'prediction_unit_id_rt',
                    'data_block_id_client',
                    'latitude',
                     'longitude',
                     'data_block_id_hw',
                    'start_time',
                     'end_time',
                     'time_code',
                     'group',
                    'data_block_id_hw_means',
                    'data_block_id_hw_variances',
                     'location_id',
                     'date_hw',
                     'datetime_hw_lagged',
                    'latitude_hw_lagged',
                     'longitude_hw_lagged',
                     'data_block_id_hw_lagged',
                     'start_time_hw_lagged',
                     'end_time_hw_lagged',
                     'time_code_hw_lagged',
                     'group_hw_lagged',
                    'data_block_id_hw_means_hw_lagged',
                    'data_block_id_hw_variances_hw_lagged',
                    'location_id_hw_lagged',
                     'latitude_fw',
                     'longitude_fw',
                     'origin_datetime',
                    'data_block_id_fw',
                     'forecast_datetime',
                    'data_block_id_elec',
                    'forecast_date',
                    'origin_date',
                     'data_block_id_gasp',
                   ]
        columns_to_drop = [col for col in col_list if col in df.columns]
        df = df.drop(columns_to_drop, axis=1)
        return df
    
    def remove_test_cols(self, df):
        col_list = ['datetime',
                   'prediction_unit_id',
                    'date_train',
                    'hour_part',
                   'date_client',
                    'forecast_date_elec_price',
                    'origin_date_elec_price',
                    'forecast_date_gas_price',
                    'origin_date_gas_price',
                    'datetime_hist_weath',
                   'hour_part_hist_weath_latest',
                    'datetime_hist_weath_latest',
                   'origin_datetime',
                   'hour_part_fore_weath',
                    'datetime',
                     'data_block_id',
                     'row_id',
                     'prediction_unit_id',
                     'date',
                    'data_block_id_rt',
                     'row_id_rt',
                     'prediction_unit_id_rt',
                    'data_block_id_client',
                    'latitude',
                     'longitude',
                     'data_block_id_hw',
                    'start_time',
                     'end_time',
                     'time_code',
                     'group',
                    'data_block_id_hw_means',
                    'data_block_id_hw_variances',
                     'location_id',
                     'date_hw',
                     'datetime_hw_lagged',
                    'latitude_hw_lagged',
                     'longitude_hw_lagged',
                     'data_block_id_hw_lagged',
                     'start_time_hw_lagged',
                     'end_time_hw_lagged',
                     'time_code_hw_lagged',
                     'group_hw_lagged',
                    'data_block_id_hw_means_hw_lagged',
                    'data_block_id_hw_variances_hw_lagged',
                    'location_id_hw_lagged',
                     'latitude_fw',
                     'longitude_fw',
                     'origin_datetime',
                    'data_block_id_fw',
                     'forecast_datetime',
                    'data_block_id_elec',
                    'forecast_date',
                    'origin_date',
                     'data_block_id_gasp',
                    'id'
                   ]
        columns_to_drop = [col for col in col_list if col in df.columns]
        df = df.drop(columns_to_drop, axis=1)
        return df
    
    def join_data(self, train, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices):
        df = train
        df = df.merge(revealed_targets, how='left', on=('datetime', 'county', 'is_business', 'product_type', 'is_consumption'), suffixes=('', '_rt'))
        df = df.merge(client, how='left', on=('date', 'county', 'is_business', 'product_type'), suffixes=('', '_client'))
        df = df.merge(historical_weather, how='left', on=('datetime', 'county'), suffixes=('', '_hw'))
        df = df.merge(forecast_weather, how='left', on=('datetime', 'county'), suffixes=('', '_fw'))
        df = df.merge(electricity_prices, how='left', on='datetime', suffixes=('', '_elec'))
        df['date'] = pd.to_datetime(df['date'])
        df = df.merge(gas_prices, how='left', on='date', suffixes=('', '_gasp'))
        df = self.add_date_features(df)
        df = self.add_ee_holidays(df)
        return df
    
    def add_test_data(self, test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices):
        dfs = [test.copy(), revealed_targets, client, historical_weather,
                 forecast_weather, electricity_prices, gas_prices]
        for i, df in enumerate(dfs):
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.datetime)
                col = 'datetime'
            if 'prediction_datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.prediction_datetime)
                col = 'datetime'
            if 'forecast_date' in df.columns:
                df['forecast_date'] = pd.to_datetime(df['forecast_date'])
                col = 'forecast_date'
            if 'forecast_datetime' in df.columns:
                df['forecast_datetime'] = pd.to_datetime(df['forecast_datetime'])
                col = 'forecast_datetime'
                
            self.test_orig_dfs[i] = pd.concat([ self.test_orig_dfs[i], df ])          
        
        
    
    def process_test_data_timestep(self, test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices):
        #append test data to test data cache
        self.add_test_data(test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices)
        # process test data
        test = self.init_train(self.test_orig_dfs[0])
        revealed_targets = self.init_revealed_targets(self.test_orig_dfs[1])
        client = self.init_client(self.test_orig_dfs[2])
        historical_weather = self.init_historical_weather(self.test_orig_dfs[3])
        forecast_weather = self.init_forecast_weather(self.test_orig_dfs[4])
        electricity_prices = self.init_electricity(self.test_orig_dfs[5])
        gas_prices = self.init_gas_prices(self.test_orig_dfs[6])
        df_all_cols = self.join_data(test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices)
        if self.add_log_cols:
            df_all_cols = self.create_log_cols(df_all_cols)
        df = self.remove_test_cols(df_all_cols)
        return df
        

from data import public_timeseries_testing_util as enefit

env = enefit.make_env()
iter_test = env.iter_test()

for (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_submission) in iter_test:
    """I want a data reading class that saves all the test data and loads it then processes it"""
    data_p = TrainDataProcessor(test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_submission)
    break

In [2]:
%%time

train = pd.read_csv('data/train.csv')

revealed_targets = train.copy()
revealed_targets['datetime'] = pd.to_datetime(revealed_targets.datetime)
revealed_targets['datetime'] = revealed_targets['datetime'] + dt.timedelta(days=2)

client = pd.read_csv('data/client.csv')

historical_weather = pd.read_csv('data/historical_weather.csv')

forecast_weather = pd.read_csv('data/forecast_weather.csv')

electricity_prices = pd.read_csv('data/electricity_prices.csv')

gas_prices = pd.read_csv('data/gas_prices.csv')

data_processor = TrainDataProcessor(train, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices, add_log_cols=False)

data_processor.df

FileNotFoundError: [Errno 2] No such file or directory: 'data/train.csv'

In [3]:
data_processor.test_orig_dfs[0]

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
1973722,0,0,1,3.069,0,2023-05-17 23:00:00,623,1973722,0
1973723,0,0,1,621.033,1,2023-05-17 23:00:00,623,1973723,0
1973724,0,0,2,0.000,0,2023-05-17 23:00:00,623,1973724,1
1973725,0,0,2,8.329,1,2023-05-17 23:00:00,623,1973725,1
1973726,0,0,3,18.432,0,2023-05-17 23:00:00,623,1973726,2
...,...,...,...,...,...,...,...,...,...
2018347,15,1,0,197.233,1,2023-05-31 23:00:00,637,2018347,64
2018348,15,1,1,0.000,0,2023-05-31 23:00:00,637,2018348,59
2018349,15,1,1,28.404,1,2023-05-31 23:00:00,637,2018349,59
2018350,15,1,3,0.000,0,2023-05-31 23:00:00,637,2018350,60


In [4]:
data_processor.df.isna().sum()

county               0
is_business          0
product_type         0
target             528
is_consumption       0
                  ... 
hour_sin             0
hour_cos             0
day_of_year_sin      0
day_of_year_cos      0
is_ee_holiday        0
Length: 180, dtype: int64

In [5]:
data_processor.df.hour

0           0
1           0
2           0
3           0
4           0
           ..
2018609    23
2018610    23
2018611    23
2018612    23
2018613    23
Name: hour, Length: 2018614, dtype: int64

In [6]:
import pickle

with open('data_processor_lgbm2_new_pandas_v.pkl', 'wb') as f:
    pickle.dump(data_processor, f)

### Testing the test data loader

In [67]:
%%time

train = pd.read_csv('data/train.csv')
train = train[train.data_block_id < 634].copy()

revealed_targets = train.copy()
revealed_targets['datetime'] = pd.to_datetime(revealed_targets.datetime)
revealed_targets['datetime'] = revealed_targets['datetime'] + dt.timedelta(days=2)
revealed_targets = revealed_targets[revealed_targets.data_block_id < 634].copy()

client = pd.read_csv('data/client.csv')
client = client[client.data_block_id < 634].copy()

historical_weather = pd.read_csv('data/historical_weather.csv')
historical_weather = historical_weather[historical_weather.data_block_id < 634].copy()

forecast_weather = pd.read_csv('data/forecast_weather.csv')
forecast_weather = forecast_weather[forecast_weather.data_block_id < 634].copy()

electricity_prices = pd.read_csv('data/electricity_prices.csv')
electricity_prices = electricity_prices[electricity_prices.data_block_id < 634].copy()

gas_prices = pd.read_csv('data/gas_prices.csv')
gas_prices = gas_prices[gas_prices.data_block_id < 634].copy()

data_processor = TrainDataProcessor(train, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices, for_testing=True, add_log_cols=True)

CPU times: total: 11.6 s
Wall time: 12 s


In [68]:
import pickle

with open('data_processor_lgbm1_testing.pkl', 'wb') as f:
    pickle.dump(data_processor, f)

In [69]:
import pickle

with open('data_processor_lgbm1_testing.pkl', 'rb') as f:
    data_processor = pickle.load(f)

In [70]:
data_processor.test_orig_dfs[0]

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
1960760,0,0,1,2.977,0,2023-05-13 23:00:00,619,1960760,0
1960761,0,0,1,601.482,1,2023-05-13 23:00:00,619,1960761,0
1960762,0,0,2,0.000,0,2023-05-13 23:00:00,619,1960762,1
1960763,0,0,2,9.943,1,2023-05-13 23:00:00,619,1960763,1
1960764,0,0,3,50.278,0,2023-05-13 23:00:00,619,1960764,2
...,...,...,...,...,...,...,...,...,...
2005867,15,1,0,184.072,1,2023-05-27 23:00:00,633,2005867,64
2005868,15,1,1,0.000,0,2023-05-27 23:00:00,633,2005868,59
2005869,15,1,1,38.646,1,2023-05-27 23:00:00,633,2005869,59
2005870,15,1,3,0.000,0,2023-05-27 23:00:00,633,2005870,60


In [71]:
data_processor.df = None

In [72]:
from data import public_timeseries_testing_util as enefit

env = enefit.make_env()

for (test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices, sample_submission) in env.iter_test():
    test_data = data_processor.process_test_data_timestep(test, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices)
    display(test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'] + dt.timedelta(days=1)


0


  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


Unnamed: 0,county,is_business,product_type,target,is_consumption,prediction_datetime,currently_scored,target_rt,target_lag_1h,target_lag_2h,...,yesterdays_euros_per_mwh_log,euros_per_mwh_24h_average_price_log,lowest_price_per_mwh_log,highest_price_per_mwh_log,lowest_price_3d_avg_log,highest_price_3d_avg_log,lowest_price_7d_avg_log,highest_price_7d_avg_log,lowest_price_14d_avg_log,highest_price_14d_avg_log
0,0,0,1,2.977,0,,,,,,...,,4.462915,3.663562,3.827771,,,,,,
1,0,0,1,601.482,1,,,,,,...,,4.462915,3.663562,3.827771,,,,,,
2,0,0,2,0.000,0,,,,,,...,,4.462915,3.663562,3.827771,,,,,,
3,0,0,2,9.943,1,,,,,,...,,4.462915,3.663562,3.827771,,,,,,
4,0,0,3,50.278,0,,,,,,...,,4.462915,3.663562,3.827771,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51347,15,1,1,,1,2023-05-28 23:00:00,False,31.286,37.531,38.293,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
51348,15,1,3,,0,2023-05-28 23:00:00,False,0.000,0.000,3.309,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
51349,15,1,3,,0,2023-05-28 23:00:00,False,0.000,0.000,3.309,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
51350,15,1,3,,1,2023-05-28 23:00:00,False,189.933,254.907,271.112,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619


You must call `predict()` successfully before you can continue with `iter_test()`


TypeError: cannot unpack non-iterable NoneType object

In [55]:
test_data[~test_data.currently_scored.fillna(True)]

Unnamed: 0,county,is_business,product_type,target,is_consumption,prediction_datetime,currently_scored,target_rt,target_lag_1h,target_lag_2h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
45112,0,0,1,,0,2023-05-28 00:00:00,False,2.675,2.627,3.829,...,False,False,False,False,2,0.000000,1.000000,0.57519,-0.81802,True
45113,0,0,1,,0,2023-05-28 00:00:00,False,2.045,2.627,3.829,...,False,False,False,False,2,0.000000,1.000000,0.57519,-0.81802,True
45114,0,0,1,,1,2023-05-28 00:00:00,False,426.177,525.541,590.043,...,False,False,False,False,2,0.000000,1.000000,0.57519,-0.81802,True
45115,0,0,1,,1,2023-05-28 00:00:00,False,471.887,525.541,590.043,...,False,False,False,False,2,0.000000,1.000000,0.57519,-0.81802,True
45116,0,0,2,,0,2023-05-28 00:00:00,False,0.000,0.000,0.000,...,False,False,False,False,2,0.000000,1.000000,0.57519,-0.81802,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51347,15,1,1,,1,2023-05-28 23:00:00,False,31.286,37.531,38.293,...,False,False,False,False,2,-0.258819,0.965926,0.57519,-0.81802,True
51348,15,1,3,,0,2023-05-28 23:00:00,False,0.000,0.000,3.309,...,False,False,False,False,2,-0.258819,0.965926,0.57519,-0.81802,True
51349,15,1,3,,0,2023-05-28 23:00:00,False,0.000,0.000,3.309,...,False,False,False,False,2,-0.258819,0.965926,0.57519,-0.81802,True
51350,15,1,3,,1,2023-05-28 23:00:00,False,189.933,254.907,271.112,...,False,False,False,False,2,-0.258819,0.965926,0.57519,-0.81802,True


In [56]:
dg = test_data[~test_data.currently_scored.fillna(True)]
dg

Unnamed: 0,county,is_business,product_type,target,is_consumption,prediction_datetime,currently_scored,target_rt,target_lag_1h,target_lag_2h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
45112,0,0,1,,0,2023-05-28 00:00:00,False,2.675,2.627,3.829,...,False,False,False,False,2,0.000000,1.000000,0.57519,-0.81802,True
45113,0,0,1,,0,2023-05-28 00:00:00,False,2.045,2.627,3.829,...,False,False,False,False,2,0.000000,1.000000,0.57519,-0.81802,True
45114,0,0,1,,1,2023-05-28 00:00:00,False,426.177,525.541,590.043,...,False,False,False,False,2,0.000000,1.000000,0.57519,-0.81802,True
45115,0,0,1,,1,2023-05-28 00:00:00,False,471.887,525.541,590.043,...,False,False,False,False,2,0.000000,1.000000,0.57519,-0.81802,True
45116,0,0,2,,0,2023-05-28 00:00:00,False,0.000,0.000,0.000,...,False,False,False,False,2,0.000000,1.000000,0.57519,-0.81802,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51347,15,1,1,,1,2023-05-28 23:00:00,False,31.286,37.531,38.293,...,False,False,False,False,2,-0.258819,0.965926,0.57519,-0.81802,True
51348,15,1,3,,0,2023-05-28 23:00:00,False,0.000,0.000,3.309,...,False,False,False,False,2,-0.258819,0.965926,0.57519,-0.81802,True
51349,15,1,3,,0,2023-05-28 23:00:00,False,0.000,0.000,3.309,...,False,False,False,False,2,-0.258819,0.965926,0.57519,-0.81802,True
51350,15,1,3,,1,2023-05-28 23:00:00,False,189.933,254.907,271.112,...,False,False,False,False,2,-0.258819,0.965926,0.57519,-0.81802,True


In [36]:
dg.columns[150:]

Index(['mean_euros_per_mwh_last_week',
       'mean_euros_per_mwh_same_hour_last_week', 'yesterdays_euros_per_mwh',
       'euros_per_mwh_24h_average_price', 'lowest_price_per_mwh',
       'highest_price_per_mwh', 'lowest_price_3d_avg', 'highest_price_3d_avg',
       'lowest_price_7d_avg', 'highest_price_7d_avg', 'lowest_price_14d_avg',
       'highest_price_14d_avg', 'year', 'month', 'day', 'hour', 'quarter',
       'day_of_week', 'day_of_year', 'week_of_year', 'is_weekend',
       'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
       'is_year_start', 'is_year_end', 'season', 'hour_sin', 'hour_cos',
       'day_of_year_sin', 'day_of_year_cos', 'is_ee_holiday'],
      dtype='object')

In [37]:
log_cols = ['target_lag_1h', 'target_lag_2h', 'target_lag_3h', 'target_lag_4h',
       'target_lag_5h', 'target_lag_6h', 'target_lag_7h', 'target_lag_8h',
       'target_lag_9h', 'target_lag_10h', 'target_lag_11h', 'target_lag_12h',
       'target_lag_13h', 'target_lag_14h', 'target_lag_15h', 'target_lag_16h',
       'target_lag_17h', 'target_lag_18h', 'target_lag_19h', 'target_lag_20h',
       'target_lag_21h', 'target_lag_22h', 'target_lag_23h', 'target_lag_24h',
       'target_lag_48h', 'target_lag_72h', 'target_lag_96h', 'target_lag_120h',
       'target_lag_144h', 'target_lag_168h', 'target_lag_264h',
       'target_lag_288h', 'eic_count', 'installed_capacity', 'temperature', 'dewpoint', 'rain',
       'snowfall', 'surface_pressure', 'cloudcover_total', 'cloudcover_low',
       'cloudcover_mid', 'cloudcover_high', 'windspeed_10m',
       'winddirection_10m', 'shortwave_radiation', 'direct_solar_radiation',
       'diffuse_radiation', 'temperature_hw_means', 'dewpoint_hw_means',
       'rain_hw_means', 'snowfall_hw_means', 'surface_pressure_hw_means',
       'cloudcover_total_hw_means', 'cloudcover_low_hw_means',
       'cloudcover_mid_hw_means', 'cloudcover_high_hw_means',
       'windspeed_10m_hw_means', 'winddirection_10m_hw_means',
       'shortwave_radiation_hw_means', 'direct_solar_radiation_hw_means',
       'diffuse_radiation_hw_means', 'temperature_hw_variances',
       'dewpoint_hw_variances', 'rain_hw_variances', 'snowfall_hw_variances',
       'surface_pressure_hw_variances', 'cloudcover_total_hw_variances',
       'cloudcover_low_hw_variances', 'cloudcover_mid_hw_variances',
       'cloudcover_high_hw_variances', 'windspeed_10m_hw_variances',
       'winddirection_10m_hw_variances', 'shortwave_radiation_hw_variances',
       'direct_solar_radiation_hw_variances', 'diffuse_radiation_hw_variances',
       'temperature_hw_lagged', 'dewpoint_hw_lagged', 'rain_hw_lagged',
       'snowfall_hw_lagged', 'surface_pressure_hw_lagged',
       'cloudcover_total_hw_lagged', 'cloudcover_low_hw_lagged', 'cloudcover_mid_hw_lagged',
       'cloudcover_high_hw_lagged', 'windspeed_10m_hw_lagged',
       'winddirection_10m_hw_lagged', 'shortwave_radiation_hw_lagged',
       'direct_solar_radiation_hw_lagged', 'diffuse_radiation_hw_lagged',
       'temperature_hw_means_hw_lagged', 'dewpoint_hw_means_hw_lagged',
       'rain_hw_means_hw_lagged', 'snowfall_hw_means_hw_lagged',
       'surface_pressure_hw_means_hw_lagged',
       'cloudcover_total_hw_means_hw_lagged',
       'cloudcover_low_hw_means_hw_lagged',
       'cloudcover_mid_hw_means_hw_lagged',
       'cloudcover_high_hw_means_hw_lagged',
       'windspeed_10m_hw_means_hw_lagged',
       'winddirection_10m_hw_means_hw_lagged',
       'shortwave_radiation_hw_means_hw_lagged',
       'direct_solar_radiation_hw_means_hw_lagged',
       'diffuse_radiation_hw_means_hw_lagged',
       'temperature_hw_variances_hw_lagged', 'dewpoint_hw_variances_hw_lagged',
       'rain_hw_variances_hw_lagged', 'snowfall_hw_variances_hw_lagged',
       'surface_pressure_hw_variances_hw_lagged',
       'cloudcover_total_hw_variances_hw_lagged',
       'cloudcover_low_hw_variances_hw_lagged',
       'cloudcover_mid_hw_variances_hw_lagged',
       'cloudcover_high_hw_variances_hw_lagged',
       'windspeed_10m_hw_variances_hw_lagged',
       'winddirection_10m_hw_variances_hw_lagged',
       'shortwave_radiation_hw_variances_hw_lagged',
       'direct_solar_radiation_hw_variances_hw_lagged',
       'diffuse_radiation_hw_variances_hw_lagged', 'temperature_fw', 'dewpoint_fw', 'cloudcover_high_fw',
       'cloudcover_low_fw', 'cloudcover_mid_fw', 'cloudcover_total_fw',
       '10_metre_u_wind_component', '10_metre_v_wind_component',
       'direct_solar_radiation_fw', 'surface_solar_radiation_downwards',
       'snowfall_fw', 'total_precipitation', 'euros_per_mwh', 'mean_euros_per_mwh_last_week',
       'mean_euros_per_mwh_same_hour_last_week', 'yesterdays_euros_per_mwh',
       'euros_per_mwh_24h_average_price', 'lowest_price_per_mwh',
       'highest_price_per_mwh', 'lowest_price_3d_avg', 'highest_price_3d_avg',
       'lowest_price_7d_avg', 'highest_price_7d_avg', 'lowest_price_14d_avg',
       'highest_price_14d_avg']

In [46]:
mmm=np.log1p(dg[log_cols] )
mmm.rename(columns={col: col + "_log" for col in log_cols}, inplace=True)
pd.concat([dg, mmm], axis=1)
mmm

  result = func(self.values, **kwargs)


Unnamed: 0,target_lag_1h_log,target_lag_2h_log,target_lag_3h_log,target_lag_4h_log,target_lag_5h_log,target_lag_6h_log,target_lag_7h_log,target_lag_8h_log,target_lag_9h_log,target_lag_10h_log,...,yesterdays_euros_per_mwh_log,euros_per_mwh_24h_average_price_log,lowest_price_per_mwh_log,highest_price_per_mwh_log,lowest_price_3d_avg_log,highest_price_3d_avg_log,lowest_price_7d_avg_log,highest_price_7d_avg_log,lowest_price_14d_avg_log,highest_price_14d_avg_log
45112,1.288406,1.574639,2.949165,4.812184,5.972911,6.819044,7.344572,7.545696,7.457694,7.314597,...,4.157945,4.026906,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
45113,1.288406,1.574639,2.949165,4.812184,5.972911,6.819044,7.344572,7.545696,7.457694,7.314597,...,4.157945,4.026906,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
45114,6.266329,6.381889,6.288126,6.021298,5.842045,5.340313,4.905956,4.562482,4.723584,4.956088,...,4.157945,4.026906,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
45115,6.266329,6.381889,6.288126,6.021298,5.842045,5.340313,4.905956,4.562482,4.723584,4.956088,...,4.157945,4.026906,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
45116,0.000000,0.000000,0.028587,0.199670,0.971915,1.794755,2.391511,2.470470,2.419301,2.060514,...,4.157945,4.026906,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51347,3.651463,3.671046,3.193886,3.090861,2.989010,2.573146,3.220115,3.423057,2.031826,2.201549,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
51348,0.000000,1.460706,3.558002,4.627176,5.518817,6.135333,6.449740,6.661608,6.841931,6.800778,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
51349,0.000000,1.460706,3.558002,4.627176,5.518817,6.135333,6.449740,6.661608,6.841931,6.800778,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
51350,5.544814,5.606214,5.469477,5.284355,5.273528,4.908529,4.777719,4.936522,5.452312,6.172771,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619


In [47]:
mmm=np.log1p(dg[log_cols] )
mmm.rename(columns={col: col + "_log" for col in log_cols}, inplace=True)
mmm

pd.concat([dg, mmm], axis=1)

  result = func(self.values, **kwargs)


Unnamed: 0,county,is_business,product_type,target,is_consumption,prediction_datetime,currently_scored,target_rt,id,target_lag_1h,...,yesterdays_euros_per_mwh_log,euros_per_mwh_24h_average_price_log,lowest_price_per_mwh_log,highest_price_per_mwh_log,lowest_price_3d_avg_log,highest_price_3d_avg_log,lowest_price_7d_avg_log,highest_price_7d_avg_log,lowest_price_14d_avg_log,highest_price_14d_avg_log
45112,0,0,1,,0,2023-05-28 00:00:00,False,2.675,0_0_1_0,2.627,...,4.157945,4.026906,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
45113,0,0,1,,0,2023-05-28 00:00:00,False,2.045,0_0_1_0,2.627,...,4.157945,4.026906,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
45114,0,0,1,,1,2023-05-28 00:00:00,False,426.177,0_0_1_1,525.541,...,4.157945,4.026906,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
45115,0,0,1,,1,2023-05-28 00:00:00,False,471.887,0_0_1_1,525.541,...,4.157945,4.026906,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
45116,0,0,2,,0,2023-05-28 00:00:00,False,0.000,0_0_2_0,0.000,...,4.157945,4.026906,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51347,15,1,1,,1,2023-05-28 23:00:00,False,31.286,15_1_1_1,37.531,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
51348,15,1,3,,0,2023-05-28 23:00:00,False,0.000,15_1_3_0,0.000,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
51349,15,1,3,,0,2023-05-28 23:00:00,False,0.000,15_1_3_0,0.000,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619
51350,15,1,3,,1,2023-05-28 23:00:00,False,189.933,15_1_3_1,254.907,...,4.513932,4.152011,3.377588,3.558201,3.405632,3.566712,3.493473,3.613001,3.558649,3.667619


In [13]:
def fill_drop_na(df):
    df = df[~df.target.isna()]
    df = df[~df.target_rolling_avg_24h.isna()]
    means = df.mean()
    # For each column, add an indicator column for NA values
    for col in df.columns:
        if df[col].isna().any():
            df[f'{col}_is_na'] = df[col].isna()
    df = df.fillna(means)
    return df, means

In [None]:
processed_df_no_na, means = fill_drop_na(data_processor.df)
processed_df_no_na.isna().sum()

In [196]:
processed_df_no_na['target_installed_capacity'] = processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000
processed_df_no_na

  processed_df_no_na['target_installed_capacity'] = processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000


Unnamed: 0,county,is_business,product_type,target,is_consumption,target_rt,target_rolling_avg_24h,target_rolling_avg_hour_7d,target_rolling_avg_hour_hour_day_4w,target_rolling_allp_avg_24h,...,euros_per_mwh_is_na,mean_euros_per_mwh_last_week_is_na,mean_euros_per_mwh_same_hour_last_week_is_na,yesterdays_euros_per_mwh_is_na,euros_per_mwh_24h_average_price_is_na,lowest_price_7d_avg_is_na,highest_price_7d_avg_is_na,lowest_price_14d_avg_is_na,highest_price_14d_avg_is_na,target_installed_capacity
11712,0,0,1,0.930,0,0.713,0.713000,0.713000,0.71300,0.713000,...,False,False,False,False,False,True,True,True,True,0.975978
11713,0,0,1,123.214,1,96.590,96.590000,96.590000,96.59000,96.590000,...,False,False,False,False,False,True,True,True,True,129.305586
11714,0,0,2,0.000,0,0.000,0.000000,0.000000,0.00000,0.356500,...,False,False,False,False,False,True,True,True,True,0.000000
11715,0,0,2,21.940,1,17.314,17.314000,17.314000,17.31400,56.952000,...,False,False,False,False,False,True,True,True,True,131.850962
11716,0,0,3,1.611,0,2.904,2.904000,2.904000,2.90400,1.205667,...,False,False,False,False,False,True,True,True,True,0.223505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018609,15,1,0,197.233,1,184.072,295.118417,278.497143,184.71275,90.640000,...,False,False,False,False,False,False,False,False,False,318.117742
2018610,15,1,1,0.000,0,0.000,156.335208,0.000000,0.00000,170.148000,...,False,False,False,False,False,False,False,False,False,0.000000
2018611,15,1,1,28.404,1,38.646,18.873583,34.405143,42.90750,92.029875,...,False,False,False,False,False,False,False,False,False,45.482786
2018612,15,1,3,0.000,0,0.000,403.044625,0.000000,0.00000,139.132958,...,False,False,False,False,False,False,False,False,False,0.000000


In [197]:
test_start = 1900000
val_start = 1800000


df_test = processed_df_no_na.iloc[1900000:]
df_val = processed_df_no_na.iloc[1800000:1900000]
df_train = processed_df_no_na.iloc[:1800000]

In [198]:
df_test_target = df_test[['target', 'target_installed_capacity']]
df_test_data = df_test.drop(['target', 'target_installed_capacity'], axis=1)

df_val_target = df_val[['target', 'target_installed_capacity']]
df_val_data = df_val.drop(['target', 'target_installed_capacity'], axis=1)

df_train_target = df_train[['target', 'target_installed_capacity']]
df_train_data = df_train.drop(['target', 'target_installed_capacity'], axis=1)

### Models

In [166]:
from lightgbm import LGBMRegressor

In [199]:
def train_split(df, n_splits=5, train_length=1000000, val_block_length = 200000):
    valid_values = 1800000 - 1200000
    import random

    # Generate a random integer between 0 and 600000 (inclusive)
    start_list = random.choices(range(0, valid_values), k=n_splits)
    print(start_list)
    
    for i, s in enumerate(start_list):
        print(f"Train rows: {s}, {s+train_length}")
        print(f"Val blocks: {s+train_length}, {s+train_length+val_block_length}")
        
        df_train_filled = df.iloc[s:s+train_length]
        df_val_filled = df.iloc[s+train_length:s+train_length+val_block_length]
        
        df_train_target = df_train_filled[['target', 'target_installed_capacity']]
        df_train_data = df_train_filled.drop(['target', 'target_installed_capacity'], axis=1)
        
        df_val_target2 = df_val_filled[['target', 'target_installed_capacity']]
        df_val_data2 = df_val_filled.drop(['target', 'target_installed_capacity'], axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        
        clf = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2')

        clf.fit(df_train_data, df_train_target.target_installed_capacity, categorical_feature=cat_features)

        y_pred = clf.predict(df_train_data)
        y_pred

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target_installed_capacity, y_pred)
        print(f"For fold {i}: Train TIC Mean Absolute Error:", mae)

        y_pred_val = clf.predict(df_val_data2)
        y_pred_val

        mae = mean_absolute_error(df_val_target2.target_installed_capacity, y_pred_val)
        print(f"For fold {i}: Fold Val TIC Mean Absolute Error:", mae)
        
        y_pred_val = clf.predict(df_val_data)
        y_pred_val

        mae = mean_absolute_error(df_val_target.target_installed_capacity, y_pred_val)
        print("Val TIC Mean Absolute Error:", mae)
        
        
        # Assuming you have two pandas Series: y_true and y_pred
        y_pred = clf.predict(df_train_data)
        y_pred = y_pred * df_train_data.installed_capacity / 1000
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf.predict(df_val_data2)
        y_pred_val = y_pred_val * df_val_data2.installed_capacity / 1000
        y_pred_val

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)
        
        y_pred_val = clf.predict(df_val_data)
        y_pred_val = y_pred_val * df_val_data.installed_capacity / 1000
        y_pred_val

        mae = mean_absolute_error(df_val_target.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        
        importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(10))
        display(importance.tail(10))
        print()
        print()

In [200]:
train_split(df_train.reset_index(drop=True))

[400013, 87096, 480460, 58705, 223133]
Train rows: 400013, 1400013
Val blocks: 1400013, 1600013
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.138523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28312
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 160
[LightGBM] [Info] Start training from score 180.448367
For fold 0: Train TIC Mean Absolute Error: 22.78330264417057
For fold 0: Fold Val TIC Mean Absolute Error: 65.89921877218084
Val TIC Mean Absolute Error: 65.54868361546382
For fold 0: Train Mean Absolute Error: 27.231593546166195
For fold 0: Fold Val Mean Absolute Error: 71.06215266518653
Val Mean Absolute Error: 109.76376509381343


Unnamed: 0,importance,name
0,2354,county
12,2123,installed_capacity
6,2074,target_rolling_avg_hour_7d
9,1745,target_rolling_allp_avg_hour_7d
5,1553,target_rolling_avg_24h
11,1539,eic_count
127,1203,hour
3,994,is_consumption
7,992,target_rolling_avg_hour_hour_day_4w
8,944,target_rolling_allp_avg_24h


Unnamed: 0,importance,name
179,0,snowfall_hw_variances_is_na
178,0,rain_hw_variances_is_na
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 87096, 1087096
Val blocks: 1087096, 1287096
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28360
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 163
[LightGBM] [Info] Start training from score 192.875648
For fold 1: Train TIC Mean Absolute Error: 21.942971242595824
For fold 1: Fold Val TIC Mean Absolute Error: 45.76676584495904
Val TIC Mean Absolute Error: 65.34002324165543
For fold 1: Train Mean Absolute Error: 24.25759174837261
For fold 1: Fold Val Mean Absolute Error: 56.656699838266846
Val Mean Absolute Error: 100.4874039761114


Unnamed: 0,importance,name
0,2276,county
12,2148,installed_capacity
6,2139,target_rolling_avg_hour_7d
9,1698,target_rolling_allp_avg_hour_7d
5,1522,target_rolling_avg_24h
11,1440,eic_count
127,1184,hour
7,1092,target_rolling_avg_hour_hour_day_4w
10,999,target_rolling_allp_avg_hour_hour_day_4w
4,978,target_rt


Unnamed: 0,importance,name
181,0,cloudcover_total_hw_variances_is_na
180,0,surface_pressure_hw_variances_is_na
179,0,snowfall_hw_variances_is_na
178,0,rain_hw_variances_is_na
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 480460, 1480460
Val blocks: 1480460, 1680460
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28258
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 163
[LightGBM] [Info] Start training from score 178.697661
For fold 2: Train TIC Mean Absolute Error: 22.67087410736128
For fold 2: Fold Val TIC Mean Absolute Error: 65.27656611941417
Val TIC Mean Absolute Error: 74.20188537894364
For fold 2: Train Mean Absolute Error: 27.405482850272502
For fold 2: Fold Val Mean Absolute Error: 75.44887008302379
Val Mean Absolute Error: 127.03581181055684


Unnamed: 0,importance,name
0,2362,county
12,2168,installed_capacity
6,2080,target_rolling_avg_hour_7d
9,1712,target_rolling_allp_avg_hour_7d
11,1543,eic_count
5,1508,target_rolling_avg_24h
127,1332,hour
3,1008,is_consumption
7,989,target_rolling_avg_hour_hour_day_4w
8,906,target_rolling_allp_avg_24h


Unnamed: 0,importance,name
178,0,rain_hw_variances_is_na
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 58705, 1058705
Val blocks: 1058705, 1258705
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.105926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28374
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 163
[LightGBM] [Info] Start training from score 192.312810
For fold 3: Train TIC Mean Absolute Error: 21.944628635117795
For fold 3: Fold Val TIC Mean Absolute Error: 43.04080949976132
Val TIC Mean Absolute Error: 82.15955040624334
For fold 3: Train Mean Absolute Error: 23.86490244075675
For fold 3: Fold Val Mean Absolute Error: 55.66674939736568
Val Mean Absolute Error: 111.18599607470497


Unnamed: 0,importance,name
0,2350,county
12,2286,installed_capacity
6,2035,target_rolling_avg_hour_7d
9,1753,target_rolling_allp_avg_hour_7d
5,1538,target_rolling_avg_24h
11,1441,eic_count
127,1193,hour
7,1055,target_rolling_avg_hour_hour_day_4w
10,1019,target_rolling_allp_avg_hour_hour_day_4w
3,958,is_consumption


Unnamed: 0,importance,name
179,0,snowfall_hw_variances_is_na
178,0,rain_hw_variances_is_na
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 223133, 1223133
Val blocks: 1223133, 1423133
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.132261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28343
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 163
[LightGBM] [Info] Start training from score 191.078696
For fold 4: Train TIC Mean Absolute Error: 22.5849692029935
For fold 4: Fold Val TIC Mean Absolute Error: 39.585696449816986
Val TIC Mean Absolute Error: 79.06987925332393
For fold 4: Train Mean Absolute Error: 25.86176891730332
For fold 4: Fold Val Mean Absolute Error: 47.81302891603078
Val Mean Absolute Error: 127.45128402222471


Unnamed: 0,importance,name
0,2305,county
12,2169,installed_capacity
6,2137,target_rolling_avg_hour_7d
9,1726,target_rolling_allp_avg_hour_7d
5,1570,target_rolling_avg_24h
11,1455,eic_count
127,1218,hour
7,1054,target_rolling_avg_hour_hour_day_4w
3,1013,is_consumption
10,966,target_rolling_allp_avg_hour_hour_day_4w


Unnamed: 0,importance,name
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
253,0,highest_price_14d_avg_is_na






In [181]:
train_split(df_train.reset_index(drop=True))

[370866, 494083, 13500, 394317, 487447]
Train rows: 370866, 1370866
Val blocks: 1370866, 1570866
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27376
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 142
[LightGBM] [Info] Start training from score 181.766626
For fold 0: Train TIC Mean Absolute Error: 25.39071653106545
For fold 0: Fold Val TIC Mean Absolute Error: 75.61266089655118
Val TIC Mean Absolute Error: 54.874490999258505
For fold 0: Train Mean Absolute Error: 31.25548249738619
For fold 0: Fold Val Mean Absolute Error: 89.90459896746444
Val Mean Absolute Error: 106.43911809079378


Unnamed: 0,importance,name
12,2617,installed_capacity
6,2168,target_rolling_avg_hour_7d
5,1618,target_rolling_avg_24h
9,1482,target_rolling_allp_avg_hour_7d
11,1479,eic_count
7,1188,target_rolling_avg_hour_hour_day_4w
3,1058,is_consumption
10,1034,target_rolling_allp_avg_hour_hour_day_4w
4,1033,target_rt
8,934,target_rolling_allp_avg_24h


Unnamed: 0,importance,name
161,0,cloudcover_low_hw_variances_is_na
160,0,cloudcover_total_hw_variances_is_na
159,0,surface_pressure_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na




Train rows: 494083, 1494083
Val blocks: 1494083, 1694083
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27312
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 144
[LightGBM] [Info] Start training from score 178.548399
For fold 1: Train TIC Mean Absolute Error: 25.076673149541094
For fold 1: Fold Val TIC Mean Absolute Error: 56.17414676678932
Val TIC Mean Absolute Error: 51.64176585275168
For fold 1: Train Mean Absolute Error: 31.632315341376565
For fold 1: Fold Val Mean Absolute Error: 71.88400612462081
Val Mean Absolute Error: 105.62433802595633


Unnamed: 0,importance,name
12,2526,installed_capacity
6,2148,target_rolling_avg_hour_7d
5,1569,target_rolling_avg_24h
9,1564,target_rolling_allp_avg_hour_7d
11,1533,eic_count
7,1191,target_rolling_avg_hour_hour_day_4w
3,1078,is_consumption
4,1052,target_rt
10,1004,target_rolling_allp_avg_hour_hour_day_4w
8,943,target_rolling_allp_avg_24h


Unnamed: 0,importance,name
161,0,cloudcover_low_hw_variances_is_na
160,0,cloudcover_total_hw_variances_is_na
159,0,surface_pressure_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na




Train rows: 13500, 1013500
Val blocks: 1013500, 1213500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.126903 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27401
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 144
[LightGBM] [Info] Start training from score 192.088536
For fold 2: Train TIC Mean Absolute Error: 24.629459653872555
For fold 2: Fold Val TIC Mean Absolute Error: 45.21650001245498
Val TIC Mean Absolute Error: 59.192943106894184
For fold 2: Train Mean Absolute Error: 27.611452226730805
For fold 2: Fold Val Mean Absolute Error: 67.24355006414906
Val Mean Absolute Error: 107.67899260695317


Unnamed: 0,importance,name
12,2850,installed_capacity
6,2281,target_rolling_avg_hour_7d
9,1530,target_rolling_allp_avg_hour_7d
11,1519,eic_count
5,1508,target_rolling_avg_24h
7,1207,target_rolling_avg_hour_hour_day_4w
4,1194,target_rt
10,1131,target_rolling_allp_avg_hour_hour_day_4w
8,938,target_rolling_allp_avg_24h
3,887,is_consumption


Unnamed: 0,importance,name
161,0,cloudcover_low_hw_variances_is_na
160,0,cloudcover_total_hw_variances_is_na
159,0,surface_pressure_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na




Train rows: 394317, 1394317
Val blocks: 1394317, 1594317
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27365
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 142
[LightGBM] [Info] Start training from score 180.494432
For fold 3: Train TIC Mean Absolute Error: 25.27415427748119
For fold 3: Fold Val TIC Mean Absolute Error: 68.92250790106702
Val TIC Mean Absolute Error: 56.0999978408784
For fold 3: Train Mean Absolute Error: 31.248635837895357
For fold 3: Fold Val Mean Absolute Error: 82.30217265374418
Val Mean Absolute Error: 114.69272605295829


Unnamed: 0,importance,name
12,2544,installed_capacity
6,2248,target_rolling_avg_hour_7d
5,1658,target_rolling_avg_24h
11,1542,eic_count
9,1442,target_rolling_allp_avg_hour_7d
7,1201,target_rolling_avg_hour_hour_day_4w
4,1019,target_rt
3,999,is_consumption
10,967,target_rolling_allp_avg_hour_hour_day_4w
8,966,target_rolling_allp_avg_24h


Unnamed: 0,importance,name
161,0,cloudcover_low_hw_variances_is_na
160,0,cloudcover_total_hw_variances_is_na
159,0,surface_pressure_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na




Train rows: 487447, 1487447
Val blocks: 1487447, 1687447
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.119900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27313
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 144
[LightGBM] [Info] Start training from score 178.636280
For fold 4: Train TIC Mean Absolute Error: 25.144306133365234
For fold 4: Fold Val TIC Mean Absolute Error: 59.02132686510403
Val TIC Mean Absolute Error: 54.70876858889155
For fold 4: Train Mean Absolute Error: 31.36717851757248
For fold 4: Fold Val Mean Absolute Error: 71.28244487865365
Val Mean Absolute Error: 110.183934233452


Unnamed: 0,importance,name
12,2571,installed_capacity
6,2207,target_rolling_avg_hour_7d
5,1702,target_rolling_avg_24h
11,1579,eic_count
9,1541,target_rolling_allp_avg_hour_7d
7,1213,target_rolling_avg_hour_hour_day_4w
4,1082,target_rt
3,1043,is_consumption
10,1025,target_rolling_allp_avg_hour_hour_day_4w
8,980,target_rolling_allp_avg_24h


Unnamed: 0,importance,name
161,0,cloudcover_low_hw_variances_is_na
160,0,cloudcover_total_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
152,0,shortwave_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na






In [202]:
def train_split(df, n_splits=5, train_length=1000000, val_block_length = 200000):
    valid_values = 1800000 - 1200000
    import random

    # Generate a random integer between 0 and 600000 (inclusive)
    start_list = random.choices(range(0, valid_values), k=n_splits)
    print(start_list)
    
    
    for i, s in enumerate(start_list):
        print(f"Train rows: {s}, {s+train_length}")
        print(f"Val blocks: {s+train_length}, {s+train_length+val_block_length}")
        
        df_train_filled = df.iloc[s:s+train_length]
        df_val_filled = df.iloc[s+train_length:s+train_length+val_block_length]
        
        df_train_target = df_train_filled[['target', 'target_installed_capacity']]
        df_train_data = df_train_filled.drop(['target', 'target_installed_capacity'], axis=1)
        
        df_val_target2 = df_val_filled[['target', 'target_installed_capacity']]
        df_val_data2 = df_val_filled.drop(['target', 'target_installed_capacity'], axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        
        clf = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2')

        clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

        y_pred = clf.predict(df_train_data)
        y_pred

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        y_pred = clf.predict(df_train_data)
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f"For fold {i}: Train Mean Absolute Error:", mae)

        y_pred_val = clf.predict(df_val_data2)
        y_pred_val

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print(f"For fold {i}: Fold Val Mean Absolute Error:", mae)
        
        y_pred_val = clf.predict(df_val_data)
        y_pred_val

        mae = mean_absolute_error(df_val_target.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        
        importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(10))
        display(importance.tail(10))
        print()
        print()

In [203]:
train_split(df_train.reset_index(drop=True))

[458280, 410559, 453726, 505810, 79193]
Train rows: 458280, 1458280
Val blocks: 1458280, 1658280
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28292
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 162
[LightGBM] [Info] Start training from score 261.249764
For fold 0: Train Mean Absolute Error: 23.552704487159133
For fold 0: Fold Val Mean Absolute Error: 47.1241822966275
Val Mean Absolute Error: 73.16576249245972


Unnamed: 0,importance,name
6,2692,target_rolling_avg_hour_7d
5,1548,target_rolling_avg_24h
12,1484,installed_capacity
127,1410,hour
11,1253,eic_count
7,1080,target_rolling_avg_hour_hour_day_4w
0,1074,county
4,1008,target_rt
8,941,target_rolling_allp_avg_24h
25,913,direct_solar_radiation


Unnamed: 0,importance,name
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
168,0,cloudcover_low_hw_means_is_na
167,0,cloudcover_total_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 410559, 1410559
Val blocks: 1410559, 1610559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102853 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28310
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 160
[LightGBM] [Info] Start training from score 258.786387
For fold 1: Train Mean Absolute Error: 23.35751933713261
For fold 1: Fold Val Mean Absolute Error: 50.947730113128145
Val Mean Absolute Error: 76.28309438294383


Unnamed: 0,importance,name
6,2583,target_rolling_avg_hour_7d
5,1512,target_rolling_avg_24h
12,1471,installed_capacity
127,1394,hour
11,1305,eic_count
7,1118,target_rolling_avg_hour_hour_day_4w
0,1076,county
4,1023,target_rt
8,945,target_rolling_allp_avg_24h
3,875,is_consumption


Unnamed: 0,importance,name
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 453726, 1453726
Val blocks: 1453726, 1653726
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100716 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28257
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 162
[LightGBM] [Info] Start training from score 261.441985
For fold 2: Train Mean Absolute Error: 23.68745749249557
For fold 2: Fold Val Mean Absolute Error: 44.74578994354624
Val Mean Absolute Error: 79.5484886887238


Unnamed: 0,importance,name
6,2649,target_rolling_avg_hour_7d
5,1587,target_rolling_avg_24h
12,1506,installed_capacity
127,1389,hour
11,1280,eic_count
7,1118,target_rolling_avg_hour_hour_day_4w
4,1067,target_rt
0,994,county
8,940,target_rolling_allp_avg_24h
3,864,is_consumption


Unnamed: 0,importance,name
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
168,0,cloudcover_low_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 505810, 1505810
Val blocks: 1505810, 1705810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.112357 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28247
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 162
[LightGBM] [Info] Start training from score 262.772665
For fold 3: Train Mean Absolute Error: 23.7324738365309
For fold 3: Fold Val Mean Absolute Error: 48.32032199456138
Val Mean Absolute Error: 76.84891769821527


Unnamed: 0,importance,name
6,2651,target_rolling_avg_hour_7d
5,1607,target_rolling_avg_24h
12,1516,installed_capacity
127,1363,hour
11,1236,eic_count
7,1185,target_rolling_avg_hour_hour_day_4w
4,1086,target_rt
0,1046,county
8,1025,target_rolling_allp_avg_24h
3,913,is_consumption


Unnamed: 0,importance,name
178,0,rain_hw_variances_is_na
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 79193, 1079193
Val blocks: 1079193, 1279193
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28364
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 163
[LightGBM] [Info] Start training from score 252.256649
For fold 4: Train Mean Absolute Error: 20.59303156831859
For fold 4: Fold Val Mean Absolute Error: 50.12105457642243
Val Mean Absolute Error: 76.22370312467595


Unnamed: 0,importance,name
6,2505,target_rolling_avg_hour_7d
5,1586,target_rolling_avg_24h
12,1458,installed_capacity
127,1396,hour
11,1305,eic_count
7,1215,target_rolling_avg_hour_hour_day_4w
0,1039,county
4,1009,target_rt
8,980,target_rolling_allp_avg_24h
9,855,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
168,0,cloudcover_low_hw_means_is_na
253,0,highest_price_14d_avg_is_na






In [183]:
train_split(df_train.reset_index(drop=True))

[42191, 388461, 174334, 487668, 279677]
Train rows: 42191, 1042191
Val blocks: 1042191, 1242191
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.095617 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27424
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 142
[LightGBM] [Info] Start training from score 250.329657
For fold 0: Train Mean Absolute Error: 23.20279153217127
For fold 0: Fold Val Mean Absolute Error: 70.09303756814207
Val Mean Absolute Error: 89.68653990386342


Unnamed: 0,importance,name
6,2447,target_rolling_avg_hour_7d
12,1530,installed_capacity
5,1494,target_rolling_avg_24h
7,1409,target_rolling_avg_hour_hour_day_4w
11,1191,eic_count
97,1101,hours_ahead
4,1085,target_rt
8,1031,target_rolling_allp_avg_24h
9,844,target_rolling_allp_avg_hour_7d
10,759,target_rolling_allp_avg_hour_hour_day_4w


Unnamed: 0,importance,name
160,0,cloudcover_total_hw_variances_is_na
159,0,surface_pressure_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
152,0,shortwave_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na




Train rows: 388461, 1388461
Val blocks: 1388461, 1588461
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.096703 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27370
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 142
[LightGBM] [Info] Start training from score 257.950435
For fold 1: Train Mean Absolute Error: 26.348740804613882
For fold 1: Fold Val Mean Absolute Error: 64.19997031439515
Val Mean Absolute Error: 93.83951824144796


Unnamed: 0,importance,name
6,2453,target_rolling_avg_hour_7d
5,1613,target_rolling_avg_24h
12,1567,installed_capacity
7,1251,target_rolling_avg_hour_hour_day_4w
11,1234,eic_count
97,1044,hours_ahead
4,1044,target_rt
8,954,target_rolling_allp_avg_24h
9,813,target_rolling_allp_avg_hour_7d
3,806,is_consumption


Unnamed: 0,importance,name
160,0,cloudcover_total_hw_variances_is_na
159,0,surface_pressure_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
152,0,shortwave_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na




Train rows: 174334, 1174334
Val blocks: 1174334, 1374334
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.091876 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27411
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 142
[LightGBM] [Info] Start training from score 256.764046
For fold 2: Train Mean Absolute Error: 24.32116673388529
For fold 2: Fold Val Mean Absolute Error: 59.64550451925321
Val Mean Absolute Error: 93.24400930324347


Unnamed: 0,importance,name
6,2486,target_rolling_avg_hour_7d
12,1511,installed_capacity
5,1511,target_rolling_avg_24h
7,1223,target_rolling_avg_hour_hour_day_4w
11,1195,eic_count
97,1157,hours_ahead
4,1144,target_rt
8,1016,target_rolling_allp_avg_24h
9,851,target_rolling_allp_avg_hour_7d
10,794,target_rolling_allp_avg_hour_hour_day_4w


Unnamed: 0,importance,name
161,0,cloudcover_low_hw_variances_is_na
160,0,cloudcover_total_hw_variances_is_na
159,0,surface_pressure_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na




Train rows: 487668, 1487668
Val blocks: 1487668, 1687668
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.106933 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27321
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 144
[LightGBM] [Info] Start training from score 262.939421
For fold 3: Train Mean Absolute Error: 26.3319784085403
For fold 3: Fold Val Mean Absolute Error: 63.58532370202302
Val Mean Absolute Error: 96.72258550502887


Unnamed: 0,importance,name
6,2411,target_rolling_avg_hour_7d
5,1596,target_rolling_avg_24h
12,1571,installed_capacity
11,1236,eic_count
7,1229,target_rolling_avg_hour_hour_day_4w
97,1172,hours_ahead
4,1011,target_rt
8,956,target_rolling_allp_avg_24h
3,881,is_consumption
9,833,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
160,0,cloudcover_total_hw_variances_is_na
159,0,surface_pressure_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
152,0,shortwave_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na




Train rows: 279677, 1279677
Val blocks: 1279677, 1479677
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27376
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 142
[LightGBM] [Info] Start training from score 257.065150
For fold 4: Train Mean Absolute Error: 25.572616770414125
For fold 4: Fold Val Mean Absolute Error: 57.912941638901934
Val Mean Absolute Error: 95.51979280586224


Unnamed: 0,importance,name
6,2421,target_rolling_avg_hour_7d
12,1651,installed_capacity
5,1510,target_rolling_avg_24h
7,1297,target_rolling_avg_hour_hour_day_4w
11,1203,eic_count
97,1069,hours_ahead
4,1030,target_rt
8,936,target_rolling_allp_avg_24h
3,863,is_consumption
9,837,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
161,0,cloudcover_low_hw_variances_is_na
160,0,cloudcover_total_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
152,0,shortwave_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na






In [204]:
clf = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )


cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

y_pred = clf.predict(df_train_data)
y_pred

from sklearn.metrics import mean_absolute_error

# Assuming you have two pandas Series: y_true and y_pred
y_pred = clf.predict(df_train_data)
mae = mean_absolute_error(df_train_target.target, y_pred)
print(f" Train Mean Absolute Error:", mae)

y_pred_val = clf.predict(df_val_data)
y_pred_val

mae = mean_absolute_error(df_val_target.target, y_pred_val)
print("Val Mean Absolute Error:", mae)

y_pred_test = clf.predict(df_test_data)
y_pred_test

mae = mean_absolute_error(df_test_target.target, y_pred_test)
print("Test Mean Absolute Error:", mae)

importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
importance = importance.sort_values('importance', ascending=False)
display(importance.head(10))
display(importance.tail(10))
print()
print()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28589
[LightGBM] [Info] Number of data points in the train set: 1800000, number of used features: 170
[LightGBM] [Info] Start training from score 267.576362
 Train Mean Absolute Error: 23.48407142463921
Val Mean Absolute Error: 67.43060117742853
Test Mean Absolute Error: 85.95009577652112


Unnamed: 0,importance,name
6,2576,target_rolling_avg_hour_7d
11,1665,eic_count
12,1515,installed_capacity
127,1462,hour
5,1304,target_rolling_avg_24h
0,1139,county
7,1066,target_rolling_avg_hour_hour_day_4w
25,831,direct_solar_radiation
8,812,target_rolling_allp_avg_24h
4,811,target_rt


Unnamed: 0,importance,name
178,0,rain_hw_variances_is_na
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
253,0,highest_price_14d_avg_is_na






In [189]:
clf = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )


cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

y_pred = clf.predict(df_train_data)
y_pred

from sklearn.metrics import mean_absolute_error

# Assuming you have two pandas Series: y_true and y_pred
y_pred = clf.predict(df_train_data)
mae = mean_absolute_error(df_train_target.target, y_pred)
print(f" Train Mean Absolute Error:", mae)

y_pred_val = clf.predict(df_val_data)
y_pred_val

mae = mean_absolute_error(df_val_target.target, y_pred_val)
print("Val Mean Absolute Error:", mae)

importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
importance = importance.sort_values('importance', ascending=False)
display(importance.head(10))
display(importance.tail(10))
print()
print()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27628
[LightGBM] [Info] Number of data points in the train set: 1800000, number of used features: 149
[LightGBM] [Info] Start training from score 267.576362
 Train Mean Absolute Error: 27.266764937408325
Val Mean Absolute Error: 83.21674290798848


Unnamed: 0,importance,name
6,2482,target_rolling_avg_hour_7d
11,1435,eic_count
5,1401,target_rolling_avg_24h
12,1259,installed_capacity
97,1190,hours_ahead
7,1137,target_rolling_avg_hour_hour_day_4w
4,934,target_rt
0,873,county
9,787,target_rolling_allp_avg_hour_7d
107,777,surface_solar_radiation_downwards


Unnamed: 0,importance,name
161,0,cloudcover_low_hw_variances_is_na
160,0,cloudcover_total_hw_variances_is_na
159,0,surface_pressure_hw_variances_is_na
158,0,snowfall_hw_variances_is_na
157,0,rain_hw_variances_is_na
156,0,dewpoint_hw_variances_is_na
155,0,temperature_hw_variances_is_na
154,0,diffuse_radiation_hw_means_is_na
153,0,direct_solar_radiation_hw_means_is_na
232,0,highest_price_14d_avg_is_na






Params are worse, stick with my own



In [205]:
params={'n_iter': 2500,'verbose': 1,'objective': 'l2','metric': 'mae','learning_rate': 0.05073909898961407, 'colsample_bytree': 0.726023996436955, 'colsample_bynode': 0.5803681307354022, 
        'lambda_l1': 8.562963348932286, 'lambda_l2': 4.893256185259296, 'min_data_in_leaf': 115, 'max_depth': 23, 'max_bin': 898}
clf = LGBMRegressor(**params, random_state=42)


cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

y_pred = clf.predict(df_train_data)
y_pred

from sklearn.metrics import mean_absolute_error

# Assuming you have two pandas Series: y_true and y_pred
y_pred = clf.predict(df_train_data)
mae = mean_absolute_error(df_train_target.target, y_pred)
print(f" Train Mean Absolute Error:", mae)

y_pred_val = clf.predict(df_val_data)
y_pred_val

mae = mean_absolute_error(df_val_target.target, y_pred_val)
print("Val Mean Absolute Error:", mae)

# y_pred_test = clf.predict(df_test_data)
# y_pred_test

mae = mean_absolute_error(df_test_target.target, y_pred_test)
print("Test Mean Absolute Error:", mae)

importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
importance = importance.sort_values('importance', ascending=False)
display(importance.head(10))
display(importance.tail(10))
print()
print()



 Train Mean Absolute Error: 25.198749240364872
Val Mean Absolute Error: 67.47191294147015
Test Mean Absolute Error: 86.29457807405161


Unnamed: 0,importance,name
6,4054,target_rolling_avg_hour_7d
5,2747,target_rolling_avg_24h
12,2660,installed_capacity
7,2651,target_rolling_avg_hour_hour_day_4w
127,2435,hour
11,2191,eic_count
4,1969,target_rt
0,1839,county
129,1591,day_of_week
9,1411,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
180,0,surface_pressure_hw_variances_is_na
178,0,rain_hw_variances_is_na
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
253,0,highest_price_14d_avg_is_na






In [212]:
clf = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )
clf_producer = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )


cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)
clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target, categorical_feature=cat_features)

y_pred = clf.predict(df_train_data)
y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
y_pred2 = y_pred.copy()
y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

from sklearn.metrics import mean_absolute_error

# Assuming you have two pandas Series: y_true and y_pred
mae = mean_absolute_error(df_train_target.target, y_pred)
print(f" Train Mean Absolute Error:", mae)
mae = mean_absolute_error(df_train_target.target, y_pred2)
print(f" Train Mean w Producer Absolute Error:", mae)

y_pred_val = clf.predict(df_val_data)
y_pred_val_producer = clf_producer.predict(df_val_data[df_val_data.is_consumption==0])
y_pred_val2 = y_pred_val.copy()
y_pred_val2[df_val_data.is_consumption==0] = y_pred_val_producer 

mae = mean_absolute_error(df_val_target.target, y_pred_val)
print("Val Mean Absolute Error:", mae)
mae = mean_absolute_error(df_val_target.target, y_pred_val2)
print("Val Mean w Producer Absolute Error:", mae)

# y_pred_test = clf.predict(df_test_data)
# y_pred_test

# mae = mean_absolute_error(df_test_target.target, y_pred_test)
# print("Test Mean Absolute Error:", mae)

importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
importance = importance.sort_values('importance', ascending=False)
display(importance.head(10))
display(importance.tail(10))
print()
print()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060865 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28589
[LightGBM] [Info] Number of data points in the train set: 1800000, number of used features: 170
[LightGBM] [Info] Start training from score 267.576362
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28589
[LightGBM] [Info] Number of data points in the train set: 900000, number of used features: 168
[LightGBM] [Info] Start training from score 68.152649
 Train Mean Absolute Error: 23.48407142463921
 Train Mean w Producer Absolute Error: 20.335134985571287
Val Mean Absolute Error: 67.43060117742853
Val Mean w Producer Absolute Er

Unnamed: 0,importance,name
6,2576,target_rolling_avg_hour_7d
11,1665,eic_count
12,1515,installed_capacity
127,1462,hour
5,1304,target_rolling_avg_24h
0,1139,county
7,1066,target_rolling_avg_hour_hour_day_4w
25,831,direct_solar_radiation
8,812,target_rolling_allp_avg_24h
4,811,target_rt


Unnamed: 0,importance,name
178,0,rain_hw_variances_is_na
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
253,0,highest_price_14d_avg_is_na






In [213]:
clf = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )
clf_producer = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )


cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

clf.fit(df_train_data[df_train_data.is_consumption==1], df_train_target.target[df_train_data.is_consumption==1], categorical_feature=cat_features)
clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target, categorical_feature=cat_features)

y_pred = clf.predict(df_train_data)
y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
y_pred2 = y_pred.copy()
y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

from sklearn.metrics import mean_absolute_error

# Assuming you have two pandas Series: y_true and y_pred
mae = mean_absolute_error(df_train_target.target, y_pred)
print(f" Train Mean Absolute Error:", mae)
mae = mean_absolute_error(df_train_target.target, y_pred2)
print(f" Train Mean w Producer Absolute Error:", mae)

y_pred_val = clf.predict(df_val_data)
y_pred_val_producer = clf_producer.predict(df_val_data[df_val_data.is_consumption==0])
y_pred_val2 = y_pred_val.copy()
y_pred_val2[df_val_data.is_consumption==0] = y_pred_val_producer 

mae = mean_absolute_error(df_val_target.target, y_pred_val)
print("Val Mean Absolute Error:", mae)
mae = mean_absolute_error(df_val_target.target, y_pred_val2)
print("Val Mean w Producer Absolute Error:", mae)

# y_pred_test = clf.predict(df_test_data)
# y_pred_test

# mae = mean_absolute_error(df_test_target.target, y_pred_test)
# print("Test Mean Absolute Error:", mae)

importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
importance = importance.sort_values('importance', ascending=False)
display(importance.head(10))
display(importance.tail(10))
print()
print()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28589
[LightGBM] [Info] Number of data points in the train set: 900000, number of used features: 168
[LightGBM] [Info] Start training from score 467.000075
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28589
[LightGBM] [Info] Number of data points in the train set: 900000, number of used features: 168
[LightGBM] [Info] Start training from score 68.152649
 Train Mean Absolute Error: 63.83221236527128
 Train Mean w Producer Absolute Error: 18.533954492423582
Val Mean Absolute Error: 113.35204800819537
Val Mean w Producer Absolute Er

Unnamed: 0,importance,name
6,2742,target_rolling_avg_hour_7d
127,1727,hour
11,1548,eic_count
12,1485,installed_capacity
5,1472,target_rolling_avg_24h
0,1261,county
7,975,target_rolling_avg_hour_hour_day_4w
4,746,target_rt
9,735,target_rolling_allp_avg_hour_7d
17,669,surface_pressure


Unnamed: 0,importance,name
178,0,rain_hw_variances_is_na
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
253,0,highest_price_14d_avg_is_na






In [None]:
VotingRegressor

In [215]:
def train_split(df, n_splits=5, train_length=1000000, val_block_length = 200000):
    valid_values = 1800000 - 1200000
    import random

    # Generate a random integer between 0 and 600000 (inclusive)
    start_list = random.choices(range(0, valid_values), k=n_splits)
    print(start_list)
    
    
    for i, s in enumerate(start_list):
        print(f"Train rows: {s}, {s+train_length}")
        print(f"Val blocks: {s+train_length}, {s+train_length+val_block_length}")
        
        df_train_filled = df.iloc[s:s+train_length]
        df_val_filled = df.iloc[s+train_length:s+train_length+val_block_length]
        
        df_train_target = df_train_filled[['target', 'target_installed_capacity']]
        df_train_data = df_train_filled.drop(['target', 'target_installed_capacity'], axis=1)
        
        df_val_target2 = df_val_filled[['target', 'target_installed_capacity']]
        df_val_data2 = df_val_filled.drop(['target', 'target_installed_capacity'], axis=1)
        
        clf = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )
        clf_producer = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )


        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
                       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

        clf.fit(df_train_data[df_train_data.is_consumption==1], df_train_target.target[df_train_data.is_consumption==1], categorical_feature=cat_features)
        clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target, categorical_feature=cat_features)

        y_pred = clf.predict(df_train_data)
        y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
        y_pred2 = y_pred.copy()
        y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_train_target.target, y_pred2)
        print(f" Train Mean w Producer Absolute Error:", mae)
        
        
        y_pred_val = clf.predict(df_val_data2)
        y_pred_val_producer = clf_producer.predict(df_val_data2[df_val_data2.is_consumption==0])
        y_pred_val2 = y_pred_val.copy()
        y_pred_val2[df_val_data2.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print("Val2 Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_val_target2.target, y_pred_val2)
        print("Val2 Mean w Producer Absolute Error:", mae)

        y_pred_val = clf.predict(df_val_data)
        y_pred_val_producer = clf_producer.predict(df_val_data[df_val_data.is_consumption==0])
        y_pred_val2 = y_pred_val.copy()
        y_pred_val2[df_val_data.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_val_target.target, y_pred_val2)
        print("Val Mean w Producer Absolute Error:", mae)

        # y_pred_test = clf.predict(df_test_data)
        # y_pred_test

        # mae = mean_absolute_error(df_test_target.target, y_pred_test)
        # print("Test Mean Absolute Error:", mae)

        importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(10))
        display(importance.tail(10))
        print()
        print()

In [216]:
train_split(df_train.reset_index(drop=True))

[253796, 100414, 404724, 410712, 184016]
Train rows: 253796, 1253796
Val blocks: 1253796, 1453796
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28363
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 162
[LightGBM] [Info] Start training from score 413.336931
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.056129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28363
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 162
[LightGBM] [Info] Start training from score 102.250220
 Train Mean Absolute Error: 68.9050968905701
 Train Mean w Producer Absolute Error: 16.77568653037153
Val2 Mean Absolute Error: 101.26972137944523
Val2 Mean w Producer Absolute Error: 39.934188369074434
Val M

Unnamed: 0,importance,name
6,2612,target_rolling_avg_hour_7d
5,1700,target_rolling_avg_24h
127,1619,hour
12,1429,installed_capacity
11,1291,eic_count
0,1249,county
7,1125,target_rolling_avg_hour_hour_day_4w
4,915,target_rt
9,773,target_rolling_allp_avg_hour_7d
8,734,target_rolling_allp_avg_24h


Unnamed: 0,importance,name
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 100414, 1100414
Val blocks: 1100414, 1300414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051494 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28352
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 162
[LightGBM] [Info] Start training from score 417.028890
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051704 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28352
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 162
[LightGBM] [Info] Start training from score 89.643385
 Train Mean Absolute Error: 59.368990335309064
 Train Mean w Producer Absolute Error: 15.475081452720781
Val2 Mean Absolute Error: 81.518102491763
Val2 Mean w Producer Absolute Error: 46.163671775180994
Val Mean Absolute Error: 120.13309072029683
V

Unnamed: 0,importance,name
6,2637,target_rolling_avg_hour_7d
5,1676,target_rolling_avg_24h
127,1666,hour
12,1411,installed_capacity
0,1255,county
11,1247,eic_count
7,1131,target_rolling_avg_hour_hour_day_4w
4,958,target_rt
9,825,target_rolling_allp_avg_hour_7d
17,758,surface_pressure


Unnamed: 0,importance,name
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 404724, 1404724
Val blocks: 1404724, 1604724
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051867 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28312
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 159
[LightGBM] [Info] Start training from score 412.675981
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28312
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 159
[LightGBM] [Info] Start training from score 105.030144
 Train Mean Absolute Error: 67.57225484006167
 Train Mean w Producer Absolute Error: 17.357508595535354
Val2 Mean Absolute Error: 111.54423534202455
Val2 Mean w Producer Absolute Error: 48.298657678380835
Val Mean Absolute Error: 123.3086834604508

Unnamed: 0,importance,name
6,2588,target_rolling_avg_hour_7d
5,1745,target_rolling_avg_24h
127,1643,hour
12,1388,installed_capacity
11,1281,eic_count
0,1191,county
7,1124,target_rolling_avg_hour_hour_day_4w
4,970,target_rt
8,745,target_rolling_allp_avg_24h
9,742,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 410712, 1410712
Val blocks: 1410712, 1610712
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28324
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 159
[LightGBM] [Info] Start training from score 412.525062
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054675 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28324
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 159
[LightGBM] [Info] Start training from score 105.020044
 Train Mean Absolute Error: 63.924033238381334
 Train Mean w Producer Absolute Error: 17.315571715303733
Val2 Mean Absolute Error: 99.5139189973263
Val2 Mean w Producer Absolute Error: 49.5343511577973
Val Mean Absolute Error: 120.73683777257426
V

Unnamed: 0,importance,name
6,2656,target_rolling_avg_hour_7d
5,1736,target_rolling_avg_24h
127,1685,hour
12,1432,installed_capacity
11,1289,eic_count
0,1256,county
7,1094,target_rolling_avg_hour_hour_day_4w
4,920,target_rt
9,761,target_rolling_allp_avg_hour_7d
8,739,target_rolling_allp_avg_24h


Unnamed: 0,importance,name
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
168,0,cloudcover_low_hw_means_is_na
167,0,cloudcover_total_hw_means_is_na
166,0,surface_pressure_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 184016, 1184016
Val blocks: 1184016, 1384016
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28365
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 162
[LightGBM] [Info] Start training from score 415.654631
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052292 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28365
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 162
[LightGBM] [Info] Start training from score 97.475967
 Train Mean Absolute Error: 64.43221519441765
 Train Mean w Producer Absolute Error: 16.092110911711703
Val2 Mean Absolute Error: 78.95056262055941
Val2 Mean w Producer Absolute Error: 41.785773632552086
Val Mean Absolute Error: 128.08857397423535


Unnamed: 0,importance,name
6,2676,target_rolling_avg_hour_7d
5,1757,target_rolling_avg_24h
127,1549,hour
12,1405,installed_capacity
0,1293,county
11,1267,eic_count
7,1126,target_rolling_avg_hour_hour_day_4w
4,971,target_rt
9,829,target_rolling_allp_avg_hour_7d
8,757,target_rolling_allp_avg_24h


Unnamed: 0,importance,name
178,0,rain_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
253,0,highest_price_14d_avg_is_na






In [218]:
def train_split(df, n_splits=5, train_length=1000000, val_block_length = 200000):
    valid_values = 1800000 - 1200000
    import random

    # Generate a random integer between 0 and 600000 (inclusive)
    start_list = random.choices(range(0, valid_values), k=n_splits)
    print(start_list)
    
    
    for i, s in enumerate(start_list):
        print(f"Train rows: {s}, {s+train_length}")
        print(f"Val blocks: {s+train_length}, {s+train_length+val_block_length}")
        
        df_train_filled = df.iloc[s:s+train_length]
        df_val_filled = df.iloc[s+train_length:s+train_length+val_block_length]
        
        df_train_target = df_train_filled[['target', 'target_installed_capacity']]
        df_train_data = df_train_filled.drop(['target', 'target_installed_capacity'], axis=1)
        
        df_val_target2 = df_val_filled[['target', 'target_installed_capacity']]
        df_val_data2 = df_val_filled.drop(['target', 'target_installed_capacity'], axis=1)
        
        clf = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )
        clf_producer = LGBMRegressor(random_state=42, n_estimators=1500, verbose=1, n_jobs=32, objective='l2', )


        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
                       'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                        'is_year_start', 'is_year_end', 'season', ] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])

        clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)
        clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target, categorical_feature=cat_features)

        y_pred = clf.predict(df_train_data)
        y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
        y_pred2 = y_pred.copy()
        y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_train_target.target, y_pred2)
        print(f" Train Mean w Producer Absolute Error:", mae)
        
        
        y_pred_val = clf.predict(df_val_data2)
        y_pred_val_producer = clf_producer.predict(df_val_data2[df_val_data2.is_consumption==0])
        y_pred_val2 = y_pred_val.copy()
        y_pred_val2[df_val_data2.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print("Val2 Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_val_target2.target, y_pred_val2)
        print("Val2 Mean w Producer Absolute Error:", mae)

        y_pred_val = clf.predict(df_val_data)
        y_pred_val_producer = clf_producer.predict(df_val_data[df_val_data.is_consumption==0])
        y_pred_val2 = y_pred_val.copy()
        y_pred_val2[df_val_data.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        mae = mean_absolute_error(df_val_target.target, y_pred_val2)
        print("Val Mean w Producer Absolute Error:", mae)

        # y_pred_test = clf.predict(df_test_data)
        # y_pred_test

        # mae = mean_absolute_error(df_test_target.target, y_pred_test)
        # print("Test Mean Absolute Error:", mae)

        importance = pd.DataFrame({'importance':clf.feature_importances_, 'name':clf.feature_name_})
        importance = importance.sort_values('importance', ascending=False)
        display(importance.head(10))
        display(importance.tail(10))
        print()
        print()

In [219]:
train_split(df_train.reset_index(drop=True))

[574441, 518625, 196160, 92443, 521944]
Train rows: 574441, 1574441
Val blocks: 1574441, 1774441
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.145271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28260
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 165
[LightGBM] [Info] Start training from score 265.702694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050639 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28273
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 164
[LightGBM] [Info] Start training from score 101.550707
 Train Mean Absolute Error: 23.32915789978762
 Train Mean w Producer Absolute Error: 19.73829563416386
Val2 Mean Absolute Error: 51.9077453805939
Val2 Mean w Producer Absolute Error: 50.404046303722275
Val Me

Unnamed: 0,importance,name
6,2561,target_rolling_avg_hour_7d
5,1523,target_rolling_avg_24h
12,1486,installed_capacity
127,1423,hour
11,1282,eic_count
7,1133,target_rolling_avg_hour_hour_day_4w
0,1071,county
8,1034,target_rolling_allp_avg_24h
4,960,target_rt
9,852,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 518625, 1518625
Val blocks: 1518625, 1718625
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28232
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 162
[LightGBM] [Info] Start training from score 263.006751
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28238
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 161
[LightGBM] [Info] Start training from score 104.608366
 Train Mean Absolute Error: 23.606237647241283
 Train Mean w Producer Absolute Error: 19.906589476532616
Val2 Mean Absolute Error: 51.27584277870394
Val2 Mean w Producer Absolute Err

Unnamed: 0,importance,name
6,2599,target_rolling_avg_hour_7d
5,1555,target_rolling_avg_24h
12,1459,installed_capacity
127,1376,hour
11,1223,eic_count
7,1139,target_rolling_avg_hour_hour_day_4w
4,1080,target_rt
0,1033,county
8,975,target_rolling_allp_avg_24h
3,859,is_consumption


Unnamed: 0,importance,name
179,0,snowfall_hw_variances_is_na
178,0,rain_hw_variances_is_na
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 196160, 1196160
Val blocks: 1196160, 1396160
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28356
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 163
[LightGBM] [Info] Start training from score 257.098969
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059328 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28356
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 162
[LightGBM] [Info] Start training from score 98.235705
 Train Mean Absolute Error: 21.896641578308728
 Train Mean w Producer Absolute Error: 18.383924955961138
Val2 Mean Absolute Error: 43.55759642145541
Val2 Mean w Producer Absolute Erro

Unnamed: 0,importance,name
6,2536,target_rolling_avg_hour_7d
5,1607,target_rolling_avg_24h
12,1451,installed_capacity
127,1340,hour
11,1323,eic_count
7,1158,target_rolling_avg_hour_hour_day_4w
4,1088,target_rt
0,1015,county
8,1006,target_rolling_allp_avg_24h
3,841,is_consumption


Unnamed: 0,importance,name
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 92443, 1092443
Val blocks: 1092443, 1292443
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28359
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 163
[LightGBM] [Info] Start training from score 252.965747
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28357
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 162
[LightGBM] [Info] Start training from score 88.841109
 Train Mean Absolute Error: 20.65881557086205
 Train Mean w Producer Absolute Error: 17.501697198224793
Val2 Mean Absolute Error: 49.024293372860356
Val2 Mean w Producer Absolute Error

Unnamed: 0,importance,name
6,2592,target_rolling_avg_hour_7d
5,1539,target_rolling_avg_24h
12,1414,installed_capacity
127,1356,hour
11,1283,eic_count
7,1233,target_rolling_avg_hour_hour_day_4w
0,1031,county
4,998,target_rt
8,973,target_rolling_allp_avg_24h
9,880,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
253,0,highest_price_14d_avg_is_na




Train rows: 521944, 1521944
Val blocks: 1521944, 1721944
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28215
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 163
[LightGBM] [Info] Start training from score 262.802658
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28232
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 162
[LightGBM] [Info] Start training from score 104.571654
 Train Mean Absolute Error: 23.794170008308935
 Train Mean w Producer Absolute Error: 20.067881538352378
Val2 Mean Absolute Error: 54.956196462639625
Val2 Mean w Producer Absolute Er

Unnamed: 0,importance,name
6,2682,target_rolling_avg_hour_7d
5,1655,target_rolling_avg_24h
12,1540,installed_capacity
127,1402,hour
11,1243,eic_count
7,1243,target_rolling_avg_hour_hour_day_4w
0,1081,county
4,1077,target_rt
8,984,target_rolling_allp_avg_24h
9,858,target_rolling_allp_avg_hour_7d


Unnamed: 0,importance,name
177,0,dewpoint_hw_variances_is_na
176,0,temperature_hw_variances_is_na
175,0,diffuse_radiation_hw_means_is_na
174,0,direct_solar_radiation_hw_means_is_na
173,0,shortwave_radiation_hw_means_is_na
172,0,winddirection_10m_hw_means_is_na
171,0,windspeed_10m_hw_means_is_na
170,0,cloudcover_high_hw_means_is_na
169,0,cloudcover_mid_hw_means_is_na
253,0,highest_price_14d_avg_is_na




