# XGBoost

In [1]:
import pickle
import pandas as pd
import numpy as np

In [2]:
import pandas as pd
import datetime as dt
import numpy as np

import pandas as pd
import datetime as dt
import numpy as np

class TrainDataProcessor:
    """Processes Train data, using train data as a warm start, and prepares it for inference."""

    def __init__(self, train, revealed_targets, client, historical_weather,
                 forecast_weather, electricity_prices, gas_prices):
        self.test_orig_dfs = self.get_test_orig_dfs([train.copy(), revealed_targets.copy(), client.copy(), historical_weather.copy(),
                 forecast_weather.copy(), electricity_prices.copy(), gas_prices.copy()])
        self.train = self.init_train(train)
        self.revealed_targets = self.init_revealed_targets(revealed_targets)
        self.client = self.init_client(client)
        self.weather_mapping = self.init_weather_mapping()
        self.historical_weather = self.init_historical_weather(historical_weather)
        self.forecast_weather = self.init_forecast_weather(forecast_weather)
        self.electricity_prices = self.init_electricity(electricity_prices)
        self.gas_prices = self.init_gas_prices(gas_prices)
        
        self.df_all_cols = self.join_data(self.train, self.revealed_targets, self.client, self.historical_weather, self.forecast_weather, self.electricity_prices, self.gas_prices)
        self.df = self.remove_cols(self.df_all_cols)
        
    def get_test_orig_dfs(self, dfs):
        for i, df in enumerate(dfs):
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.datetime)
                col = 'datetime'
            if 'prediction_datetime' in df.columns:
                df['prediction_datetime'] = pd.to_datetime(df.prediction_datetime)
                col = 'prediction_datetime'
            if 'forecast_date' in df.columns:
                df['forecast_date'] = pd.to_datetime(df['forecast_date'])
                col = 'forecast_date'
            if 'forecast_datetime' in df.columns:
                df['forecast_datetime'] = pd.to_datetime(df['forecast_datetime'])
                col = 'forecast_datetime'
            if 'date' in df.columns:
                df['date'] = pd.to_datetime(df.date).dt.date
                col = 'date'

            test_date = df[col].iloc[-1]  # Assuming test is a DataFrame
            start_date = test_date - pd.Timedelta(days=14)
            historical_subset = df[df[col] >= start_date]
            dfs[i] = historical_subset
        return dfs
        
    def init_train(self, df):
        """Prepares the training data for model training."""
        try:
            df['datetime'] = pd.to_datetime(df.datetime)
        except Exception as e:
            df['datetime'] = pd.to_datetime(df.prediction_datetime)
        df['date'] = df.datetime.dt.date
            
        # df = self.get_data_block_id(df, 'datetime')
        return df
    
    def add_electricity_lag_features(self, df):
        """Chatgpt summary:
        Enhances a DataFrame with electricity price lag features:
        - Sets 'datetime' as Index for time series analysis.
        - Calculates rolling 7-day mean price, lagged by one day.
        - Computes rolling 7-day mean for same hour, lagged.
        - Adds column for yesterday's price, shifted by 24 hours.
        - Calculates 24-hour rolling average of electricity prices.
        - Resets index and drops 'forecast_date', 'origin_date', 'hour'.
        """
        ##### mean from entire last week
        df.set_index('datetime', inplace=True)
        # Use rolling to calculate mean price of the last week
        # The window is 7 days, min_periods can be set as per requirement
        # 'closed' determines which side of the interval is closed; it can be 'right' or 'left'
        df['mean_euros_per_mwh_last_week'] = df['euros_per_mwh'].rolling(window='7D', min_periods=1, closed='right').mean()
        # Shift the results to align with the requirement of lagging
        df['mean_euros_per_mwh_last_week'] = df['mean_euros_per_mwh_last_week'].shift()
        
        ##### mean from last week this hour only
        # Extract hour from datetime
        df['hour'] = df.index.hour

        # Group by hour and apply rolling mean for each group
        hourly_groups = df.groupby('hour')
        dff = hourly_groups['euros_per_mwh'].rolling(window='7D', min_periods=1, closed='right').mean()#.shift()#.reset_index(level=0, drop=True)
        dff = dff.reset_index().set_index('datetime').groupby('hour')['euros_per_mwh'].shift()
        dff = dff.rename('mean_euros_per_mwh_same_hour_last_week')
        df = df.join(dff)
        #### yesterday's power price
        df['yesterdays_euros_per_mwh'] = df['euros_per_mwh'].shift(24)
        
        ### 24h average
        # Calculate the 24-hour rolling average
        df['euros_per_mwh_24h_average_price'] = df['euros_per_mwh'].rolling(window=24, min_periods=1).mean()

        # Resetting the index if needed
        df.reset_index(inplace=True)
        df = df.drop(['forecast_date', 'origin_date', 'hour'], axis=1)
        return df

    def init_electricity(self, df):
        ## LAG = 1 Day
        ## Move forecast datetime ahead by 1 day
        ## change name to datetime
        df['datetime'] = pd.to_datetime(df['forecast_date'])
        df['datetime'] = df['datetime'] + dt.timedelta(days=1)
        # df = self.get_data_block_id(df, 'datetime')
        df = self.add_electricity_lag_features(df)
        return df
    
    def add_historical_weather_lag_features(self, df):
        """Chatgpt summary:
        Enhances a DataFrame with historical weather lag features:
        - Converts 'datetime' to Datetime object and sets as index.
        - Sorts data by 'datetime', 'latitude', 'longitude'.
        - Creates 'location_id' as a unique identifier for each location.
        - Filters for 10:00 AM entries and shifts features by 1 day.
        - Merges lagged features with original DataFrame.
        - Calculates mean and variance for weather features over the last 24 hours.
        - Merges these statistical summaries back into the original DataFrame.
        """
        ##### LATEST WEATHER
        def add_latest_weather(df):
            # Assuming df is your original DataFrame
            # Step 1: Convert datetime to a Datetime Object
            df['datetime'] = pd.to_datetime(df['datetime'])
            df.set_index('datetime', inplace=True)

            # Step 2: Sorting the Data
            df.sort_values(by=['datetime', 'latitude', 'longitude'], inplace=True)

            # Step 3: Creating a Unique Identifier for each location
            df['location_id'] = df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Step 4: Filtering for 10:00 AM Entries
            df.reset_index(inplace=True)
            df_10am = df[df['datetime'].dt.hour == 10]
            df_10am.set_index('datetime', inplace=True)

            # Step 5: Shifting the Features by 1 day
            lagged_features = df_10am.groupby('location_id').shift(periods=1, freq='D')

            # Renaming columns to indicate lag
            lagged_features = lagged_features.add_suffix('_hw_lagged')
            lagged_features['location_id'] = lagged_features['location_id_hw_lagged']
            lagged_features.reset_index(inplace=True)
            lagged_features['date'] = lagged_features.datetime.dt.date

            df['date'] = df.datetime.dt.date
            return lagged_features
            # Step 6: Merging Lagged Features with Original DataFrame
            df = df.merge(lagged_features, on=['date', 'location_id'], how='left', suffixes=('', '_hw_lagged'))
            return df
        
        ##### mean from last day
        def add_24h_mean_var(df, weather_features):
            # Calculate the start and end times for each row
            df['start_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=2) + pd.Timedelta(hours=11)
            df['end_time'] = pd.to_datetime(df['datetime'].dt.date) - pd.Timedelta(days=1) + pd.Timedelta(hours=10)
            df['time_code'] = df['start_time'].astype(str) +'_' + df['end_time'].astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)
            # print(df.time_code)

            # Create a helper column for grouping
            # If the time is before 10:00 AM, subtract a day
            df['group'] = df['datetime'].apply(lambda dt: dt if dt.time() >= pd.to_datetime('11:00').time() else dt - pd.Timedelta(days=1))
            df['group'] = df['group'].dt.date  # Keep only the date part for grouping
            df['group'] = (pd.to_datetime(df['group']) + pd.Timedelta(hours=11)).astype(str) + '_' + (pd.to_datetime(df['group']) + pd.Timedelta(days=1, hours=10)).astype(str) + '_' + df['latitude'].astype(str) + '_' + df['longitude'].astype(str)

            # Now group by this new column
            grouped = df.groupby('group')
            means = grouped[weather_features].mean()
            variances = grouped[weather_features].var()

            # Merge means and variances into the original DataFrame
            my_df = df.merge(means, left_on='time_code', right_on='group', suffixes=('', '_hw_means'), how='left')
            my_df = my_df.merge(variances, left_on='time_code', right_on='group', how='left', suffixes=('', '_hw_variances'))

            return my_df

        df['datetime'] = pd.to_datetime(df['datetime'])
        weather_features = df.columns.drop(['datetime', 'latitude', 'longitude'])

        # Apply the function
        df = add_24h_mean_var(df, weather_features)       
        latest = add_latest_weather(df)
        df = df.merge(latest, on=['date', 'location_id'], how='left', suffixes=('', '_hw_lagged'))
        
        return df

    def init_historical_weather(self, df):
        ## LAG: From 11:00 AM 2 days ago to 10:00 AM 1 day ago
        ## What to do? Give most recent weather forecast? Give average over the last day?
        """
        Processes the historical weather data.
        """
        df['datetime'] = pd.to_datetime(df.datetime)
        
        df = self.add_historical_weather_lag_features(df)
        
        df = df.merge(self.weather_mapping, how='inner', on=('latitude', 'longitude'))
        return df

    def init_forecast_weather(self, df):
        """Chatgpt summary:
        Processes forecast weather data:
        - Converts 'forecast_datetime' to 'datetime' and adjusts it forward by 1 day.
        - Filters data to keep records with 'hours_ahead' between 22 and 45.
        - Merges with a weather mapping based on 'latitude' and 'longitude'.
        """
        ## LAG: DON't ADJUST
        ##      The forecast is from yesterday, but can forecast today, which is 22 hours ahead
        ## Drop any columns where:
        ##                        hours_ahead < 22 and hours_ahead > 45
        ## Then rename forecast_datetime to datetime and join on datetime
        """
        Processes the forecast weather data.
        """
        df['datetime'] = pd.to_datetime(df['forecast_datetime'])
        # keep only datetimes from our relevant period
        df = df[(df['hours_ahead'] < 46) & (df['hours_ahead'] > 21)]
        df['datetime'] = df['datetime'] + dt.timedelta(days=1)
        df = df.merge(self.weather_mapping, how='inner', on=('latitude', 'longitude'))
        return df
    
    def add_gas_prices_lag_features(self, df):
        """Chatgpt summary
        Augments a DataFrame with rolling average lag features for gas prices:
        - Converts 'date' to Datetime object and sets as index.
        - Sorts DataFrame by date.
        - Calculates rolling averages for lowest and highest gas prices over 3, 7, and 14 days.
        - Resets the index to include 'date' as a column again.
        """
        df['date'] = pd.to_datetime(df['date'])
        df.set_index('date', inplace=True)

        # Sort the DataFrame by date, if it's not already sorted
        df.sort_index(inplace=True)

        # Calculate rolling averages for different time windows
        df['lowest_price_3d_avg'] = df['lowest_price_per_mwh'].rolling(window=3).mean()
        df['highest_price_3d_avg'] = df['highest_price_per_mwh'].rolling(window=3).mean()

        df['lowest_price_7d_avg'] = df['lowest_price_per_mwh'].rolling(window=7).mean()
        df['highest_price_7d_avg'] = df['highest_price_per_mwh'].rolling(window=7).mean()

        df['lowest_price_14d_avg'] = df['lowest_price_per_mwh'].rolling(window=14).mean()
        df['highest_price_14d_avg'] = df['highest_price_per_mwh'].rolling(window=14).mean()

        # Reset the index if you want the 'date' column back
        df.reset_index(inplace=True)
        return df

    def init_gas_prices(self, df):
        ## LAG: 1 DAY
        ## Predictions are made from 2 days ago and predict for yesterday
        ## add one day to forecast_date
        ## Rename forecast_date to date, join on date
        """
        Processes the gas prices data.
        Implement the logic to handle gas prices data processing here.
        """
        df['date'] = pd.to_datetime(df['forecast_date']).dt.date
        df['date'] = df['date'] + dt.timedelta(days=1)
        df = self.add_gas_prices_lag_features(df)
        return df
    
    def add_revealed_target_features(self, df):
        """Chatgpt summary:
        Enhances DataFrame with rolling average target features:
        - Converts 'datetime' to Datetime object, extracts 'hour' and 'day' of week.
        - Sets 'datetime' as index.
        - Calculates various rolling averages of 'target' based on different groupings:
          - 24-hour rolling average by county, business status, product type, and consumption status.
          - 7-day hourly rolling average by county, business status, product type, consumption status, and hour.
          - 4-week rolling average by county, business status, product type, consumption status, hour, and day.
          - Similar calculations considering all product types.
        - Drops 'hour' and 'day' columns after processing.
        """
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['hour'] = df.datetime.dt.hour
        df['day'] = df.datetime.dt.dayofweek
        df.set_index('datetime', inplace=True)

        window_size = 7
        # Group by the specified columns and then apply the rolling mean
        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption'])
        df['target_rolling_avg_24h'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption', 'hour'])
        df['target_rolling_avg_hour_7d'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'product_type', 'is_consumption', 'hour', 'day'])
        df['target_rolling_avg_hour_hour_day_4w'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption'])
        df['target_rolling_allp_avg_24h'] = grouped['target'].transform(lambda x: x.rolling(window=24, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption', 'hour'])
        df['target_rolling_allp_avg_hour_7d'] = grouped['target'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())

        grouped = df.groupby(['county', 'is_business', 'is_consumption', 'hour', 'day'])
        df['target_rolling_allp_avg_hour_hour_day_4w'] = grouped['target'].transform(lambda x: x.rolling(window=4, min_periods=1).mean())
        
        df = df.drop(['hour', 'day'], axis=1)

        return df
    
    def init_revealed_targets(self, df):
        df['datetime'] = pd.to_datetime(df.datetime)
        df['datetime'] = df['datetime'] + dt.timedelta(days=2)
        df = self.add_revealed_target_features(df)
        return df
    
    def init_client(self, df):
        ## LAG: 2 days
        ## Add 2 days to date, join on date
        df['date'] = pd.to_datetime(df.date).dt.date
        df['date'] = df['date'] + dt.timedelta(days=2)
        # df = self.get_data_block_id(df, 'date')
        return df

    def init_weather_mapping(self):
        # https://www.kaggle.com/code/tsunotsuno/enefit-eda-baseline/notebook#Baseline
        county_point_map = {
            0: (59.4, 24.7), # "HARJUMAA"
            1 : (58.8, 22.7), # "HIIUMAA"
            2 : (59.1, 27.2), # "IDA-VIRUMAA"
            3 : (58.8, 25.7), # "JÄRVAMAA"
            4 : (58.8, 26.2), # "JÕGEVAMAA"
            5 : (59.1, 23.7), # "LÄÄNE-VIRUMAA"
            6 : (59.1, 23.7), # "LÄÄNEMAA"
            7 : (58.5, 24.7), # "PÄRNUMAA"
            8 : (58.2, 27.2), # "PÕLVAMAA"
            9 : (58.8, 24.7), # "RAPLAMAA"
            10 : (58.5, 22.7),# "SAAREMAA"
            11 : (58.5, 26.7),# "TARTUMAA"
            12 : (58.5, 25.2),# "UNKNOWNN" (center of the map)
            13 : (57.9, 26.2),# "VALGAMAA"
            14 : (58.2, 25.7),# "VILJANDIMAA"
            15 : (57.9, 27.2) # "VÕRUMAA"
        }
        # Convert the dictionary to a list of tuples
        data = [(county_code, lat, lon) for county_code, (lat, lon) in county_point_map.items()]

        # Create DataFrame
        df = pd.DataFrame(data, columns=['county', 'latitude', 'longitude'])
        
        return df
    
    def add_date_features(self, df):
        df['year'] = df['datetime'].dt.year
        df['month'] = df['datetime'].dt.month
        df['day'] = df['datetime'].dt.day
        df['hour'] = df['datetime'].dt.hour
        df['quarter'] = df['datetime'].dt.quarter
        df['day_of_week'] = df['datetime'].dt.day_of_week
        df['day_of_year'] = df['datetime'].dt.dayofyear
        df['week_of_year'] = df['datetime'].dt.isocalendar().week
        df['is_weekend'] = df['datetime'].dt.day_of_week >= 5
        df['is_month_start'] = df['datetime'].dt.is_month_start
        df['is_month_end'] = df['datetime'].dt.is_month_end
        df['is_quarter_start'] = df['datetime'].dt.is_quarter_start
        df['is_quarter_end'] = df['datetime'].dt.is_quarter_end
        df['is_year_start'] = df['datetime'].dt.is_year_start
        df['is_year_end'] = df['datetime'].dt.is_year_end
        df['season'] = df['datetime'].dt.month % 12 // 3 + 1
        df['hour_sin'] = np.sin(df['datetime'].dt.hour * (2. * np.pi / 24))
        df['hour_cos'] = np.cos(df['datetime'].dt.hour * (2. * np.pi / 24))
        # Calculate sin and cos for day of year
        days_in_year = 365.25  # accounts for leap year
        df['day_of_year_sin'] = np.sin((df['day_of_year'] - 1) * (2 * np.pi / days_in_year))
        df['day_of_year_cos'] = np.cos((df['day_of_year'] - 1) * (2 * np.pi / days_in_year))
        return df
    
    def add_ee_holidays(self, df):
        import holidays
        # Define Estonia public holidays
        ee_holidays = holidays.CountryHoliday('EE')
        
        print(df['date'].isna().sum())
        
        def find_problem(x):
            try:
                return x in ee_holidays
            except Exception as e:
                print(x)
                raise e

        # Function to check if the date is a holiday
        df['is_ee_holiday'] = df['date'].apply(lambda x: x in ee_holidays)

        return df
    
    def remove_cols(self, df):
        col_list = ['datetime',
                   'row_id',
                   'prediction_unit_id',
                    'date_train',
                    'hour_part',
                   'date_client',
                    'forecast_date_elec_price',
                    'origin_date_elec_price',
                    'forecast_date_gas_price',
                    'origin_date_gas_price',
                    'datetime_hist_weath',
                   'hour_part_hist_weath_latest',
                    'datetime_hist_weath_latest',
                   'origin_datetime',
                   'hour_part_fore_weath',
                    'datetime',
                     'data_block_id',
                     'row_id',
                     'prediction_unit_id',
                     'date',
                    'data_block_id_rt',
                     'row_id_rt',
                     'prediction_unit_id_rt',
                    'data_block_id_client',
                    'latitude',
                     'longitude',
                     'data_block_id_hw',
                    'start_time',
                     'end_time',
                     'time_code',
                     'group',
                    'data_block_id_hw_means',
                    'data_block_id_hw_variances',
                     'location_id',
                     'date_hw',
                     'datetime_hw_lagged',
                    'latitude_hw_lagged',
                     'longitude_hw_lagged',
                     'data_block_id_hw_lagged',
                     'start_time_hw_lagged',
                     'end_time_hw_lagged',
                     'time_code_hw_lagged',
                     'group_hw_lagged',
                    'data_block_id_hw_means_hw_lagged',
                    'data_block_id_hw_variances_hw_lagged',
                    'location_id_hw_lagged',
                     'latitude_fw',
                     'longitude_fw',
                     'origin_datetime',
                    'data_block_id_fw',
                     'forecast_datetime',
                    'data_block_id_elec',
                    'forecast_date',
                    'origin_date',
                     'data_block_id_gasp',
                   ]
        columns_to_drop = [col for col in col_list if col in df.columns]
        df = df.drop(columns_to_drop, axis=1)
        return df
    
    def remove_test_cols(self, df):
        col_list = ['datetime',
                   'prediction_unit_id',
                    'date_train',
                    'hour_part',
                   'date_client',
                    'forecast_date_elec_price',
                    'origin_date_elec_price',
                    'forecast_date_gas_price',
                    'origin_date_gas_price',
                    'datetime_hist_weath',
                   'hour_part_hist_weath_latest',
                    'datetime_hist_weath_latest',
                   'origin_datetime',
                   'hour_part_fore_weath',
                    'datetime',
                     'data_block_id',
                     'prediction_unit_id',
                     'date',
                    'data_block_id_rt',
                     'row_id_rt',
                     'prediction_unit_id_rt',
                    'data_block_id_client',
                    'latitude',
                     'longitude',
                     'data_block_id_hw',
                    'start_time',
                     'end_time',
                     'time_code',
                     'group',
                    'data_block_id_hw_means',
                    'data_block_id_hw_variances',
                     'location_id',
                     'date_hw',
                     'datetime_hw_lagged',
                    'latitude_hw_lagged',
                     'longitude_hw_lagged',
                     'data_block_id_hw_lagged',
                     'start_time_hw_lagged',
                     'end_time_hw_lagged',
                     'time_code_hw_lagged',
                     'group_hw_lagged',
                    'data_block_id_hw_means_hw_lagged',
                    'data_block_id_hw_variances_hw_lagged',
                    'location_id_hw_lagged',
                     'latitude_fw',
                     'longitude_fw',
                     'origin_datetime',
                    'data_block_id_fw',
                     'forecast_datetime',
                    'data_block_id_elec',
                    'forecast_date',
                    'origin_date',
                     'data_block_id_gasp',
                   ]
        columns_to_drop = [col for col in col_list if col in df.columns]
        df = df.drop(columns_to_drop, axis=1)
        return df
    
    def join_data(self, train, revealed_targets, client, historical_weather, forecast_weather, electricity_prices, gas_prices):
        df = train
        df = df.merge(revealed_targets, how='left', on=('datetime', 'county', 'is_business', 'product_type', 'is_consumption'), suffixes=('', '_rt'))
        df = df.merge(client, how='left', on=('date', 'county', 'is_business', 'product_type'), suffixes=('', '_client'))
        df = df.merge(historical_weather, how='left', on=('datetime', 'county'), suffixes=('', '_hw'))
        df = df.merge(forecast_weather, how='left', on=('datetime', 'county'), suffixes=('', '_fw'))
        df = df.merge(electricity_prices, how='left', on='datetime', suffixes=('', '_elec'))
        df['date'] = pd.to_datetime(df['date'])
        df = df.merge(gas_prices, how='left', on='date', suffixes=('', '_gasp'))
        df = self.add_date_features(df)
        df = self.add_ee_holidays(df)
        return df
    
    def add_test_data(self, test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices):
        dfs = [test.copy(), revealed_targets.copy(), client.copy(), historical_weather.copy(),
                 forecast_weather.copy(), electricity_prices.copy(), gas_prices.copy()]
        for i, df in enumerate(dfs):
            if 'datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.datetime)
                col = 'datetime'
            if 'prediction_datetime' in df.columns:
                df['datetime'] = pd.to_datetime(df.prediction_datetime)
                col = 'datetime'
            if 'forecast_date' in df.columns:
                df['forecast_date'] = pd.to_datetime(df['forecast_date'])
                col = 'forecast_date'
            if 'forecast_datetime' in df.columns:
                df['forecast_datetime'] = pd.to_datetime(df['forecast_datetime'])
                col = 'forecast_datetime'
                
            self.test_orig_dfs[i] = pd.concat([ self.test_orig_dfs[i], df ])          
        
        
    
    def process_test_data_timestep(self, test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices):
        #append test data to test data cache
        self.add_test_data(test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices)
        # process test data
        test = self.init_train(self.test_orig_dfs[0].copy())
        revealed_targets = self.init_revealed_targets(self.test_orig_dfs[1].copy())
        client = self.init_client(self.test_orig_dfs[2].copy())
        historical_weather = self.init_historical_weather(self.test_orig_dfs[3].copy())
        forecast_weather = self.init_forecast_weather(self.test_orig_dfs[4].copy())
        electricity_prices = self.init_electricity(self.test_orig_dfs[5].copy())
        gas_prices = self.init_gas_prices(self.test_orig_dfs[6].copy())
        
        df_all_cols = self.join_data(test, revealed_targets, client, historical_weather,
            forecast_weather, electricity_prices, gas_prices)
        df = self.remove_test_cols(df_all_cols)
        return df
        


In [3]:
with open('data_processor_lgbm2_new_pandas.pkl', 'rb') as f:
    data_processor = pickle.load(f)
data_processor.df

Unnamed: 0,county,is_business,product_type,target,is_consumption,target_rt,target_lag_1h,target_lag_2h,target_lag_3h,target_lag_4h,...,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday
0,0,0,1,0.713,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
1,0,0,1,96.590,1,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
2,0,0,2,0.000,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
3,0,0,2,17.314,1,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
4,0,0,3,2.904,0,,,,,,...,False,False,False,False,4,0.000000,1.000000,-0.861693,-0.507430,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018609,15,1,0,197.233,1,184.072,171.092,168.933,174.920,170.068,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018610,15,1,1,0.000,0,0.000,0.000,2.501,25.884,83.535,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018611,15,1,1,28.404,1,38.646,47.690,34.806,29.202,21.654,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False
2018612,15,1,3,0.000,0,0.000,0.000,4.512,34.657,122.195,...,False,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False


# Testing

For my experimental CV, I want to take the approach of doing a stratified CV by time - splitting the year into 4 different parts, basically testing the model on each season, 3 months at a time. There was something in the kaggle forums that recommended something like this:

Key: 
= -> training data
+ -> CV data

4 splits in time:
1. =============+++
2. ================+++
3. ===================+++
4. ======================+++



The data starts on 2021-09-01 and ends on 2023-05-31

BUT we don't have enough data to do that properly. So, my CV will instead be:


(Thanks chatgpt)

Splitting the period from 2022-09-01 to 2023-05-31 into five equal parts, here are the date ranges for each segment:

#### First Segment:

From 2022-09-01 to 2022-10-24

#### Second Segment:

From 2022-10-25 to 2022-12-17

#### Third Segment:

From 2022-12-18 to 2023-02-09

#### Fourth Segment:

From 2023-02-10 to 2023-04-04

#### Fifth Segment:

From 2023-04-05 to 2023-05-29


In [4]:
def fill_drop_na(df):
    df = df[~df.target.isna()]
    df = df[~df.target_rolling_avg_24h.isna()]
    means = df.mean()
    # For each column, add an indicator column for NA values
    # for col in df.columns:
    #     if df[col].isna().any():
    #         df[f'{col}_is_na'] = df[col].isna()
    df = df.fillna(means)
    return df, means

In [5]:
%%time
processed_df_no_na, means = fill_drop_na(data_processor.df)
processed_df_no_na.isna().sum()

CPU times: total: 3.44 s
Wall time: 8.2 s


county             0
is_business        0
product_type       0
target             0
is_consumption     0
                  ..
hour_sin           0
hour_cos           0
day_of_year_sin    0
day_of_year_cos    0
is_ee_holiday      0
Length: 240, dtype: int64

In [6]:
processed_df_no_na['target_installed_capacity'] = processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000
processed_df_no_na

  processed_df_no_na['target_installed_capacity'] = processed_df_no_na['target'] / processed_df_no_na['installed_capacity'] * 1000


Unnamed: 0,county,is_business,product_type,target,is_consumption,target_rt,target_lag_1h,target_lag_2h,target_lag_3h,target_lag_4h,...,is_quarter_end,is_year_start,is_year_end,season,hour_sin,hour_cos,day_of_year_sin,day_of_year_cos,is_ee_holiday,target_installed_capacity
11712,0,0,1,0.930,0,0.713,274.689353,274.69907,274.708302,274.717501,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.975978
11713,0,0,1,123.214,1,96.590,274.689353,274.69907,274.708302,274.717501,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,129.305586
11714,0,0,2,0.000,0,0.000,274.689353,274.69907,274.708302,274.717501,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.000000
11715,0,0,2,21.940,1,17.314,274.689353,274.69907,274.708302,274.717501,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,131.850962
11716,0,0,3,1.611,0,2.904,274.689353,274.69907,274.708302,274.717501,...,False,False,False,4,0.000000,1.000000,-0.894542,-0.446983,False,0.223505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018609,15,1,0,197.233,1,184.072,171.092000,168.93300,174.920000,170.068000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,318.117742
2018610,15,1,1,0.000,0,0.000,0.000000,2.50100,25.884000,83.535000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,0.000000
2018611,15,1,1,28.404,1,38.646,47.690000,34.80600,29.202000,21.654000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,45.482786
2018612,15,1,3,0.000,0,0.000,0.000000,4.51200,34.657000,122.195000,...,False,False,False,2,-0.258819,0.965926,0.532227,-0.846602,False,0.000000


In [7]:
from datetime import datetime

cv_ranges_corrected = [
    ('2022-09-01', '2022-10-24'), 
    ('2022-10-25', '2022-12-17'), 
    ('2022-12-18', '2023-02-09'), 
    ('2023-02-10', '2023-04-04'), 
    ('2023-04-05', '2023-05-31')
]

# Function to convert a date string into a datetime object
def to_datetime(date_str):
    return datetime.strptime(date_str, '%Y-%m-%d')

# Converting the date strings in cv_ranges to datetime objects
datetime_cv_ranges = [(to_datetime(start), to_datetime(end)) for start, end in cv_ranges_corrected]
datetime_cv_ranges

date_filter = data_processor.df_all_cols.date[processed_df_no_na.index]
date_filter

cv1_train = processed_df_no_na[date_filter <= datetime_cv_ranges[0][0]]
cv1_test = processed_df_no_na[(date_filter <= datetime_cv_ranges[0][1]) & (date_filter > datetime_cv_ranges[0][0])]

In [8]:
import datetime as dt
print(to_datetime('2023-04-05') + dt.timedelta(days=14))
print(to_datetime('2023-04-05') + dt.timedelta(days=48))

2023-04-19 00:00:00
2023-05-23 00:00:00


In [9]:
cv1_train[['year' ,'month', 'day']]

Unnamed: 0,year,month,day
11712,2021,9,5
11713,2021,9,5
11714,2021,9,5
11715,2021,9,5
11716,2021,9,5
...,...,...,...
1144249,2022,9,1
1144250,2022,9,1
1144251,2022,9,1
1144252,2022,9,1


In [10]:
cv1_test[['year' ,'month', 'day']]

Unnamed: 0,year,month,day
1144254,2022,9,2
1144255,2022,9,2
1144256,2022,9,2
1144257,2022,9,2
1144258,2022,9,2
...,...,...,...
1315849,2022,10,24
1315850,2022,10,24
1315851,2022,10,24
1315852,2022,10,24


In [11]:
processed_df_no_na[['year', 'month', 'day']]

Unnamed: 0,year,month,day
11712,2021,9,5
11713,2021,9,5
11714,2021,9,5
11715,2021,9,5
11716,2021,9,5
...,...,...,...
2018609,2023,5,31
2018610,2023,5,31
2018611,2023,5,31
2018612,2023,5,31


## Train

In [12]:
from sklearn.model_selection import TimeSeriesSplit

In [13]:
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV

import xgboost as xgb


In [14]:
# xgboost.XGBRegressor(objective='reg:absoluteerror', n_estimators=1000)

In [15]:
# pd.get_dummies()

In [16]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse=False)

In [17]:
from sklearn.preprocessing import OneHotEncoder

# create a one hot encoder to create the dummies and fit it to the data
ohe= OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False)
ohe.fit(df[['x']])

# now let's simulate the two situations A and B
df.loc[1, 'x']= 1
df= df.append(dict(x=5, y=5), ignore_index=True)

# the actual feature generation is done in a separate step
tr=ohe.transform(df[['x']])

# if you need the columns in your existing data frame, you can glue them together
df2=pd.DataFrame(tr, columns=['oh1', 'oh2', 'oh3'], index=df.index)
result= pd.concat([df, df2], axis='columns')

NameError: name 'df' is not defined

In [54]:
# from sklearn.ensemble import VotingRegressor

def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

processed_df_no_na['week_of_year'] = processed_df_no_na['week_of_year'].astype('int32')

def train_cv(df):
    for i in [0]:
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        train = train.dropna()
        val = val.dropna()
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end',  'week_of_year',
                'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]  
        
        for feature in cat_features:
            df_train_data[feature] = df_train_data[feature].astype('category')
            df_val_data2[feature] = df_val_data2[feature].astype('category')

        #one hot encode because xgboost doesn't handle categorical variables well natively like lgbm does :(
        encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False)
        encoder.fit(df_train_data[cat_features])

        train_hot = encoder.transform(df_train_data[cat_features])
        df_train_data = df_train_data.drop(cat_features, axis=1)
        train_hot_df = pd.DataFrame(train_hot, columns=[f'cat_{o}' for o in range(train_hot.shape[1])], index=df_train_data.index)
        df_train_data = pd.concat([df_train_data, train_hot_df], axis=1)
        # display(df_train_data)

        val_hot = encoder.transform(df_val_data2[cat_features])
        df_val_data2 = df_val_data2.drop(cat_features, axis=1)
        val_hot_df = pd.DataFrame(val_hot, columns=[f'cat_{o}' for o in range(val_hot.shape[1])], index=df_val_data2.index)
        df_val_data2 = pd.concat([df_val_data2, val_hot_df], axis=1)

        # Xy = xgb.DMatrix(df_train_data, df_train_target['target'], enable_categorical=True)
        # Xy_val = xgb.DMatrix(df_val_data2, df_val_target2['target'], enable_categorical=True)

        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it       
        
        # params = {'lambda_l1': 0.7466999841658806, 'lambda_l2': 3.2140838539606458, 'learning_rate': 0.13753679743025782, 'max_bin': 250, 'min_data_in_leaf': 150, 'n_estimators': 5593,  
        #         'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie', 'device':'gpu'}
        
        clf_consumer = xgb.XGBRegressor(objective='reg:absoluteerror', n_estimators=1000, device="cuda"#, tree_method='gpu_hist'
                                        )

        # clf_consumer = xgb.train({'objective':'reg:absoluteerror', 'n_estimators':1000, 'enable_categorical':True, 'device':"cuda"}, Xy)
        
        # clf_consumer = VotingRegressor([
        #     ('lgb_0', LGBMRegressor(**params, random_state=42, verbose=1, )),
        #     ('lgb_1', LGBMRegressor(**params, random_state=69, verbose=1, )),
        #     ('lgb_2', LGBMRegressor(**params, random_state=1337, verbose=1, )), 
        #     ('lgb_3', LGBMRegressor(**params, random_state=124, verbose=1, )),
        #     ('lgb_4', LGBMRegressor(**params, random_state=12351, verbose=1, ))
        #     ], weights=[0.2,0.2,0.2,0.2,0.2])
        
        # clf_producer = VotingRegressor([
        #     ('lgb_0', LGBMRegressor(**params, random_state=142, verbose=1, )),
        #     ('lgb_1', LGBMRegressor(**params, random_state=169, verbose=1, )),
        #     ('lgb_2', LGBMRegressor(**params, random_state=11337, verbose=1, )), 
        #     ('lgb_3', LGBMRegressor(**params, random_state=1124, verbose=1, )),
        #     ('lgb_4', LGBMRegressor(**params, random_state=112351, verbose=1, ))
        #     ], weights=[0.2,0.2,0.2,0.2,0.2])

        clf_consumer.fit(df_train_data, df_train_target.target)

        # clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target)
        
        # clf_consumer = lgb.train(params_consumer, dtrain)
        # preds = gbm.predict(df_val_data2)
        # mae = mean_absolute_error(df_val_target2["target"], preds)

        y_pred = clf_consumer.predict(df_train_data)
        # y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
        # y_pred2 = y_pred.copy()
        # y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error_consumption:", mae)
        # mae = mean_absolute_error(df_train_target.target, y_pred2)
        # print(f" Train Mean w Producer Absolute Error:", mae)

        y_pred_val = clf_consumer.predict(df_val_data2)
        # y_pred_val_producer = clf_producer.predict(df_val_data2[df_val_data2.is_consumption==0])
        # y_pred_val2 = y_pred_val.copy()
        # y_pred_val2[df_val_data2.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        # mae = mean_absolute_error(df_val_target2.target, y_pred_val2)
        # print("Val Mean w Producer Absolute Error:", mae)
        
        # importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        # importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [55]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264




Parameters: { "device" } are not used.

 Train Mean Absolute Error_consumption: 170.22221717625746
Val Mean Absolute Error: 172.03259466718384




In [60]:
# from sklearn.ensemble import VotingRegressor

def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

processed_df_no_na['week_of_year'] = processed_df_no_na['week_of_year'].astype('int32')

def train_cv(df):
    for i in [0]:
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        train = train.dropna()
        val = val.dropna()
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end',  'week_of_year',
                'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]  
        
        for feature in cat_features:
            df_train_data[feature] = df_train_data[feature].astype('category')
            df_val_data2[feature] = df_val_data2[feature].astype('category')

        #one hot encode because xgboost doesn't handle categorical variables well natively like lgbm does :(
        encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False)
        encoder.fit(df_train_data[cat_features])

        train_hot = encoder.transform(df_train_data[cat_features])
        df_train_data = df_train_data.drop(cat_features, axis=1)
        train_hot_df = pd.DataFrame(train_hot, columns=[f'cat_{o}' for o in range(train_hot.shape[1])], index=df_train_data.index)
        df_train_data = pd.concat([df_train_data, train_hot_df], axis=1)
        # display(df_train_data)

        val_hot = encoder.transform(df_val_data2[cat_features])
        df_val_data2 = df_val_data2.drop(cat_features, axis=1)
        val_hot_df = pd.DataFrame(val_hot, columns=[f'cat_{o}' for o in range(val_hot.shape[1])], index=df_val_data2.index)
        df_val_data2 = pd.concat([df_val_data2, val_hot_df], axis=1)

        # Xy = xgb.DMatrix(df_train_data, df_train_target['target'], enable_categorical=True)
        # Xy_val = xgb.DMatrix(df_val_data2, df_val_target2['target'], enable_categorical=True)

        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it       
        
        # params = {'lambda_l1': 0.7466999841658806, 'lambda_l2': 3.2140838539606458, 'learning_rate': 0.13753679743025782, 'max_bin': 250, 'min_data_in_leaf': 150, 'n_estimators': 5593,  
        #         'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie', 'device':'gpu'}

        clf_consumer = xgb.XGBRegressor(objective='reg:absoluteerror', n_estimators=1000, device='cuda:0'
                                        )

        # clf_consumer = xgb.train({'objective':'reg:absoluteerror', 'n_estimators':1000, 'enable_categorical':True, 'device':"cuda"}, Xy)
        
        # clf_consumer = VotingRegressor([
        #     ('lgb_0', LGBMRegressor(**params, random_state=42, verbose=1, )),
        #     ('lgb_1', LGBMRegressor(**params, random_state=69, verbose=1, )),
        #     ('lgb_2', LGBMRegressor(**params, random_state=1337, verbose=1, )), 
        #     ('lgb_3', LGBMRegressor(**params, random_state=124, verbose=1, )),
        #     ('lgb_4', LGBMRegressor(**params, random_state=12351, verbose=1, ))
        #     ], weights=[0.2,0.2,0.2,0.2,0.2])
        
        # clf_producer = VotingRegressor([
        #     ('lgb_0', LGBMRegressor(**params, random_state=142, verbose=1, )),
        #     ('lgb_1', LGBMRegressor(**params, random_state=169, verbose=1, )),
        #     ('lgb_2', LGBMRegressor(**params, random_state=11337, verbose=1, )), 
        #     ('lgb_3', LGBMRegressor(**params, random_state=1124, verbose=1, )),
        #     ('lgb_4', LGBMRegressor(**params, random_state=112351, verbose=1, ))
        #     ], weights=[0.2,0.2,0.2,0.2,0.2])

        clf_consumer.fit(df_train_data, df_train_target.target)

        # clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target)
        
        # clf_consumer = lgb.train(params_consumer, dtrain)
        # preds = gbm.predict(df_val_data2)
        # mae = mean_absolute_error(df_val_target2["target"], preds)

        y_pred = clf_consumer.predict(df_train_data)
        # y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
        # y_pred2 = y_pred.copy()
        # y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error_consumption:", mae)
        # mae = mean_absolute_error(df_train_target.target, y_pred2)
        # print(f" Train Mean w Producer Absolute Error:", mae)

        y_pred_val = clf_consumer.predict(df_val_data2)
        # y_pred_val_producer = clf_producer.predict(df_val_data2[df_val_data2.is_consumption==0])
        # y_pred_val2 = y_pred_val.copy()
        # y_pred_val2[df_val_data2.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        # mae = mean_absolute_error(df_val_target2.target, y_pred_val2)
        # print("Val Mean w Producer Absolute Error:", mae)
        
        # importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        # importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [61]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264




Parameters: { "device" } are not used.



KeyboardInterrupt: 

## Hyper param tuning

In [22]:
# IMPORTANT - filter warnings for long running jobs or all the output text will crash the computer :(

import warnings
warnings.filterwarnings('ignore')

In [24]:
%%time

# https://www.kaggle.com/code/chaozhuang/enefit-eda-w-fft-ssa-arima-lgbm?scriptVersionId=156414824#Predictive-Modelling
import random
import xgboost as xgb
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

def tune_lgbm_model(base_params, df, i, n_iter=8, cv=3):
    """
    Tune a LightGBM model based on a base set of parameters.

    :param base_params: Dictionary of base parameters for the model
    :param X_train: Training features
    :param y_train: Training target variable
    :param n_iter: Number of iterations for RandomizedSearchCV
    :param cv: Number of cross-validation folds
    :return: Best estimator and best parameters
    """

    train = df[date_filter <= datetime_cv_ranges[i][0]]
    val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
    print(f"Fold {i}")
    print(f"Train rows: {len(train)}")
    print(f"Val rows: {len(val)}")

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                'snowfall_fw', 'snowfall_hw_means']
    
    train = train.dropna()
    val = val.dropna()
    
    df_train_target = train[target_cols]
    df_train_data = train.drop(drop_cols, axis=1)
    
    df_val_target2 = val[target_cols]
    df_val_data2 = val.drop(drop_cols, axis=1)
    
    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
            'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end',  'week_of_year',
            'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]  
    
    for feature in cat_features:
        df_train_data[feature] = df_train_data[feature].astype('category')
        df_val_data2[feature] = df_val_data2[feature].astype('category')

    #one hot encode because xgboost doesn't handle categorical variables well natively like lgbm does :(
    encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False)
    encoder.fit(df_train_data[cat_features])

    train_hot = encoder.transform(df_train_data[cat_features])
    df_train_data = df_train_data.drop(cat_features, axis=1)
    train_hot_df = pd.DataFrame(train_hot, columns=[f'cat_{o}' for o in range(train_hot.shape[1])], index=df_train_data.index)
    df_train_data = pd.concat([df_train_data, train_hot_df], axis=1)
    df_train_data = df_train_data.reset_index(drop=True)
    # display(df_train_data)

    val_hot = encoder.transform(df_val_data2[cat_features])
    df_val_data2 = df_val_data2.drop(cat_features, axis=1)
    val_hot_df = pd.DataFrame(val_hot, columns=[f'cat_{o}' for o in range(val_hot.shape[1])], index=df_val_data2.index)
    df_val_data2 = pd.concat([df_val_data2, val_hot_df], axis=1)
    df_val_data2 = df_val_data2.reset_index(drop=True)

    print(f"train data: {df_train_data.isna().sum().max()}")
    print(f"val data: {df_val_data2.isna().sum().max()}")

    param_dist = {
        'booster': ['gbtree', 'gblinear', 'dart'],
        'objective': ['reg:absoluteerror', 'reg:tweedie'],
        'gamma': sp_uniform(0,1),
        'max_depth': sp_randint(3, 50),
        'tweedie_variance_power': sp_uniform(1,2),
        'min_child_weight': sp_uniform(0,3),
        'learning_rate': sp_uniform(0.005, 0.5),

        'lambda': sp_uniform(0, 4), 
        'alpha': sp_uniform(0, 4), 
        'grow_policy': ['depthwise', 'lossguide'],
        'max_bin': sp_randint(100, 1000),

        'n_estimators': sp_randint(2000, 3500),

        # 'min_data_in_leaf': sp_randint(15, 300),
        
        # 'num_leaves': sp_randint(25, 150),
        
        # 'colsample_bytree' : sp_uniform(0.1, 1),
        # 'colsample_bynode' : sp_uniform(0.1, 1),
        # 'data_sample_strategy' : ['bagging', 'goss'],
        
        # 'drop_rate': sp_uniform(0, 1),
        # 'skip_drop': sp_uniform(0, 1),
        # 'min_data_per_group': sp_randint(10, 200),
        # 'max_cat_threshold': sp_randint(10, 100),
        # 'cat_l2': sp_randint(10, 100),
        # 'cat_smooth': sp_randint(10, 100),
    }

    # Create a LightGBM regressor object
    xgb_reg = xgb.XGBRegressor(**base_params)

    # Create a RandomizedSearchCV object
    random_search = HalvingRandomSearchCV(estimator=xgb_reg, param_distributions=param_dist,
                                       scoring='neg_mean_absolute_error',
                                       cv=TimeSeriesSplit(n_splits=cv), random_state=1337, verbose=0,
                                         aggressive_elimination= True,
                                         max_resources=1000, min_resources=5, )

    results_dict = {}
    # Fit the random search to the data
    random_search.fit(df_train_data, df_train_target.target)

    # Return the best estimator and best parameters
    results_dict['best_estimator'] = random_search.best_estimator_
    results_dict['best_params'] = random_search.best_params_
    
    
#     random_search = HalvingRandomSearchCV(estimator=lgb_reg, param_distributions=param_dist,
#                                        scoring='neg_mean_absolute_error',
#                                        cv=cv, random_state=2024, verbose=1,
#                                          aggressive_elimination= True,
#                                          max_resources=20000, min_resources=5)
#     # consumer
#     X_train_consumer = X_train[~producer_mask]
#     y_train_consumer = y_train[~producer_mask]
#     # Fit the random search to the data
#     random_search.fit(X_train_consumer, y_train_consumer, categorical_feature=cat_features)

#     # Return the best estimator and best parameters
#     results_dict['consumer_best_estimator'] = random_search.best_estimator_
#     results_dict['consumer_best_params'] = random_search.best_params_

    with open('experiments/xgb_hyperparam_search_object1.pkl', 'wb') as file:
        pickle.dump(random_search, file, protocol=pickle.HIGHEST_PROTOCOL)
    
    return results_dict

base_params_p1 = {
    'verbosity':0, 
    'device':'gpu',
    'n_jobs': 22,
    'eval_metric': 'mae'
}

# Fit the model
results_dict = tune_lgbm_model(base_params_p1, processed_df_no_na, 4)

results_params = {'best_params': results_dict['best_params']}

import pickle
# save dictionary to pickle file
with open('experiments/xgb_hyperparam_results1.pkl', 'wb') as file:
    pickle.dump(results_params, file, protocol=pickle.HIGHEST_PROTOCOL)

# print("Best parameters:", results_dict['best_params'])

Fold 4
Train rows: 1824598
Val rows: 176496
train data: 0
val data: 0


In [None]:
print("Best parameters:", results_dict['best_params'])

## The best model

In [None]:
# from sklearn.ensemble import VotingRegressor

def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

processed_df_no_na['week_of_year'] = processed_df_no_na['week_of_year'].astype('int32')

mae_results = []

def train_cv(df):
    for i in range(5):
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        train = train.dropna()
        val = val.dropna()
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end',  'week_of_year',
                'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]  
        
        for feature in cat_features:
            df_train_data[feature] = df_train_data[feature].astype('category')
            df_val_data2[feature] = df_val_data2[feature].astype('category')

        #one hot encode because xgboost doesn't handle categorical variables well natively like lgbm does :(
        encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False)
        encoder.fit(df_train_data[cat_features])

        train_hot = encoder.transform(df_train_data[cat_features])
        df_train_data = df_train_data.drop(cat_features, axis=1)
        train_hot_df = pd.DataFrame(train_hot, columns=[f'cat_{o}' for o in range(train_hot.shape[1])], index=df_train_data.index)
        df_train_data = pd.concat([df_train_data, train_hot_df], axis=1)
        # display(df_train_data)

        val_hot = encoder.transform(df_val_data2[cat_features])
        df_val_data2 = df_val_data2.drop(cat_features, axis=1)
        val_hot_df = pd.DataFrame(val_hot, columns=[f'cat_{o}' for o in range(val_hot.shape[1])], index=df_val_data2.index)
        df_val_data2 = pd.concat([df_val_data2, val_hot_df], axis=1)

        # Xy = xgb.DMatrix(df_train_data, df_train_target['target'], enable_categorical=True)
        # Xy_val = xgb.DMatrix(df_val_data2, df_val_target2['target'], enable_categorical=True)

        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it
        base_params = {
                # 'verbosity':1, 
                'device':'gpu',
                'n_jobs': 22,
                'eval_metric': 'mae'
            } 

        params = base_params | results_params['best_params']
        
        # params = {'lambda_l1': 0.7466999841658806, 'lambda_l2': 3.2140838539606458, 'learning_rate': 0.13753679743025782, 'max_bin': 250, 'min_data_in_leaf': 150, 'n_estimators': 5593,  
        #         'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie', 'device':'gpu'}

        # clf_consumer = xgb.XGBRegressor(objective='reg:absoluteerror', n_estimators=1000, device='cuda:0')
        clf_consumer = xgb.XGBRegressor(**params)

        # clf_consumer = xgb.train({'objective':'reg:absoluteerror', 'n_estimators':1000, 'enable_categorical':True, 'device':"cuda"}, Xy)
        
        # clf_consumer = VotingRegressor([
        #     ('lgb_0', LGBMRegressor(**params, random_state=42, verbose=1, )),
        #     ('lgb_1', LGBMRegressor(**params, random_state=69, verbose=1, )),
        #     ('lgb_2', LGBMRegressor(**params, random_state=1337, verbose=1, )), 
        #     ('lgb_3', LGBMRegressor(**params, random_state=124, verbose=1, )),
        #     ('lgb_4', LGBMRegressor(**params, random_state=12351, verbose=1, ))
        #     ], weights=[0.2,0.2,0.2,0.2,0.2])
        
        # clf_producer = VotingRegressor([
        #     ('lgb_0', LGBMRegressor(**params, random_state=142, verbose=1, )),
        #     ('lgb_1', LGBMRegressor(**params, random_state=169, verbose=1, )),
        #     ('lgb_2', LGBMRegressor(**params, random_state=11337, verbose=1, )), 
        #     ('lgb_3', LGBMRegressor(**params, random_state=1124, verbose=1, )),
        #     ('lgb_4', LGBMRegressor(**params, random_state=112351, verbose=1, ))
        #     ], weights=[0.2,0.2,0.2,0.2,0.2])

        clf_consumer.fit(df_train_data, df_train_target.target)

        # clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target)
        
        # clf_consumer = lgb.train(params_consumer, dtrain)
        # preds = gbm.predict(df_val_data2)
        # mae = mean_absolute_error(df_val_target2["target"], preds)

        y_pred = clf_consumer.predict(df_train_data)
        # y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
        # y_pred2 = y_pred.copy()
        # y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

        from sklearn.metrics import mean_absolute_error

        results = {}

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error_consumption:", mae)
        # mae = mean_absolute_error(df_train_target.target, y_pred2)
        # print(f" Train Mean w Producer Absolute Error:", mae)
        results[f'Fold {i}: Train Mean Absolute Error_consumption:'] = mae

        y_pred_val = clf_consumer.predict(df_val_data2)
        # y_pred_val_producer = clf_producer.predict(df_val_data2[df_val_data2.is_consumption==0])
        # y_pred_val2 = y_pred_val.copy()
        # y_pred_val2[df_val_data2.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        results[f'Fold {i}: Val Mean Absolute Error:'] = mae
        # mae = mean_absolute_error(df_val_target2.target, y_pred_val2)
        # print("Val Mean w Producer Absolute Error:", mae)
        
        # importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        # importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))

        mae_results.append(results)

        print()
        print()

    import pickle
    # save dictionary to pickle file
    with open('experiments/xgb_hyperparam_mae_cv_run1.pkl', 'wb') as file:
        pickle.dump(mae_results, file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1129738
Val rows: 171264




Parameters: { "device" } are not used.



KeyboardInterrupt: 

In [24]:
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb

def inverse_tic(preds, train):
    return preds/1000 * train.installed_capacity

def train_cv(df):
    for i in [0]:
        train = df[date_filter <= datetime_cv_ranges[i][0]]
        val = df[(date_filter <= datetime_cv_ranges[i][1]) & (date_filter > datetime_cv_ranges[i][0])]
        print(f"Fold {i}")
        print(f"Train rows: {len(train)}")
        print(f"Val rows: {len(val)}")
        
        target_cols = ['target', 'target_installed_capacity']
        drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                    'snowfall_fw', 'snowfall_hw_means']
        
        train = train.dropna()
        val = val.dropna()
        
        df_train_target = train[target_cols]
        df_train_data = train.drop(drop_cols, axis=1)
        
        df_val_target2 = val[target_cols]
        df_val_data2 = val.drop(drop_cols, axis=1)
        
        cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
               'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
                'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
        cat_features = [c for c in cat_features if c in df_train_data.columns]  
        
        for feature in cat_features:
            df_train_data[feature] = df_train_data[feature].astype('category')
            df_val_data2[feature] = df_val_data2[feature].astype('category')
        
        
        # We leave max_depth as -1
        # Tune num_leaves, default is 31, let's double it       
        
        params = {'lambda_l1': 0.7466999841658806, 'lambda_l2': 3.2140838539606458, 'learning_rate': 0.13753679743025782, 'max_bin': 250, 'min_data_in_leaf': 150, 'n_estimators': 5593,  
                'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie', 'device':'gpu'}
        
        # clf_consumer = LGBMRegressor(**params, random_state=42, verbose=1, )
        
        clf_consumer = VotingRegressor([
            ('lgb_0', LGBMRegressor(**params, random_state=42, verbose=1, )),
            ('lgb_1', LGBMRegressor(**params, random_state=69, verbose=1, )),
            ('lgb_2', LGBMRegressor(**params, random_state=1337, verbose=1, )), 
            ('lgb_3', LGBMRegressor(**params, random_state=124, verbose=1, )),
            ('lgb_4', LGBMRegressor(**params, random_state=12351, verbose=1, ))
            ], weights=[0.2,0.2,0.2,0.2,0.2])
        
        # clf_producer = VotingRegressor([
        #     ('lgb_0', LGBMRegressor(**params, random_state=142, verbose=1, )),
        #     ('lgb_1', LGBMRegressor(**params, random_state=169, verbose=1, )),
        #     ('lgb_2', LGBMRegressor(**params, random_state=11337, verbose=1, )), 
        #     ('lgb_3', LGBMRegressor(**params, random_state=1124, verbose=1, )),
        #     ('lgb_4', LGBMRegressor(**params, random_state=112351, verbose=1, ))
        #     ], weights=[0.2,0.2,0.2,0.2,0.2])

        clf_consumer.fit(df_train_data, df_train_target.target)
        # clf_producer.fit(df_train_data[df_train_data.is_consumption==0], df_train_target[df_train_data.is_consumption==0].target)
        
        # clf_consumer = lgb.train(params_consumer, dtrain)
        # preds = gbm.predict(df_val_data2)
        # mae = mean_absolute_error(df_val_target2["target"], preds)

        y_pred = clf_consumer.predict(df_train_data)
        # y_pred_producer = clf_producer.predict(df_train_data[df_train_data.is_consumption==0])
        # y_pred2 = y_pred.copy()
        # y_pred2[df_train_data.is_consumption==0] = y_pred_producer 

        from sklearn.metrics import mean_absolute_error

        # Assuming you have two pandas Series: y_true and y_pred
        mae = mean_absolute_error(df_train_target.target, y_pred)
        print(f" Train Mean Absolute Error_consumption:", mae)
        # mae = mean_absolute_error(df_train_target.target, y_pred2)
        # print(f" Train Mean w Producer Absolute Error:", mae)

        y_pred_val = clf_consumer.predict(df_val_data2)
        # y_pred_val_producer = clf_producer.predict(df_val_data2[df_val_data2.is_consumption==0])
        # y_pred_val2 = y_pred_val.copy()
        # y_pred_val2[df_val_data2.is_consumption==0] = y_pred_val_producer 

        mae = mean_absolute_error(df_val_target2.target, y_pred_val)
        print("Val Mean Absolute Error:", mae)
        # mae = mean_absolute_error(df_val_target2.target, y_pred_val2)
        # print("Val Mean w Producer Absolute Error:", mae)
        
        # importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
        # importance = importance.sort_values('importance', ascending=False)
        # display(importance.head(30))
        # display(importance.tail(30))
        print()
        print()

In [25]:
train_cv(processed_df_no_na)

Fold 0
Train rows: 1158538
Val rows: 842556
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 50105
[LightGBM] [Info] Number of data points in the train set: 1158538, number of used features: 225
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 214 dense feature groups (238.65 MB) transferred to GPU in 0.069642 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 5.525093
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 50105
[LightGBM] [Info] Number of data points in the train set: 1158538, number of used features: 225
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built


In [20]:
train_pred_list = []
train_mae_list = []
train_targets_list = []

pred_list = []
mae_list = []
val_targets_list = []

df = processed_df_no_na
i=0
for f in range(((datetime_cv_ranges[i][1] - datetime_cv_ranges[i][0]).days//14)):
    start = datetime_cv_ranges[i][0] + dt.timedelta(days=f*14)
    stop = datetime_cv_ranges[i][0] + dt.timedelta(days=(f+1)*14)
    train = processed_df_no_na[date_filter <= start]
    val = processed_df_no_na[(date_filter <= stop) & (date_filter > start)]
    
    print(f"Fold {i}, period {f}")
    print(f"Train rows: {len(train)}")
    print(f"Val rows: {len(val)}")

    target_cols = ['target', 'target_installed_capacity']
    drop_cols = ['target', 'target_installed_capacity', 'quarter', 'season', 'is_year_end', 'is_year_start', 'is_month_end', 'is_quarter_end', 'is_quarter_start', 'is_month_start', 'snowfall_hw_lagged', 'snowfall_hw_variances',
                'snowfall_fw', 'snowfall_hw_means']

    df_train_target = train[target_cols]
    df_train_data = train.drop(drop_cols, axis=1)

    df_val_target2 = val[target_cols]
    df_val_data2 = val.drop(drop_cols, axis=1)

    cat_features = ["county", "is_business", "product_type", "is_consumption", 'month', 'hour', 'quarter',
           'day_of_week', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start' ,'is_quarter_end', 
            'is_year_start', 'is_year_end', 'season'] + list(df_train_data.columns[df_train_data.columns.str.contains('is_na')])
    cat_features = [c for c in cat_features if c in df_train_data.columns]

    # We leave max_depth as -1
    # Tune num_leaves, default is 31, let's double it       

    params = {'lambda_l1': 0.7466999841658806, 'lambda_l2': 3.2140838539606458, 'learning_rate': 0.13753679743025782, 'max_bin': 250, 'min_data_in_leaf': 150, 'n_estimators': 5593,  
                'metric': 'mae', 'n_jobs': 22, 'boosting': 'dart', 'objective': 'tweedie', 'device':'gpu'}
    
    clf = LGBMRegressor(**params, random_state=42, verbose=0, importance_type='gain')

    clf.fit(df_train_data, df_train_target.target, categorical_feature=cat_features)

    y_pred = clf.predict(df_train_data)
    train_pred_list.append(y_pred)

    from sklearn.metrics import mean_absolute_error

    # Assuming you have two pandas Series: y_true and y_pred
    mae = mean_absolute_error(df_train_target.target, y_pred)
    train_mae_list.append(mae)
    train_targets_list.append(df_train_target.target)
    print(f" Train Mean Absolute Error_consumption:", mae)

    y_pred_val = clf.predict(df_val_data2)
    pred_list.append(y_pred_val)

    mae = mean_absolute_error(df_val_target2.target, y_pred_val)
    val_targets_list.append(df_val_target2.target)
    mae_list.append(mae)
    print("Val Mean Absolute Error:", mae)

# importance = pd.DataFrame({'importance':clf2.feature_importances_, 'name':clf2.feature_name_})
# importance = importance.sort_values('importance', ascending=False)
# display(importance.head(30))
# display(importance.tail(30))
print()
print()

Fold 0, period 0
Train rows: 1129738
Val rows: 44880


NameError: name 'LGBMRegressor' is not defined

In [28]:
np.mean(mae_list)

41.6798865951825