In [7]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from autogluon.tabular import TabularPredictor
import autogluon

## Define constans and functions

In [8]:
locations = ["A", "B", "C"]
features_order = []

LAGGED_COLUMNS_TO_KEEP = [
    # 'direct_rad:W_lag_1h', 
    'direct_rad:W_lag_forward_1h', 
    # 'clear_sky_rad:W_lag_1h', 
    'clear_sky_rad:W_lag_forward_1h', 
    # 'diffuse_rad:W_lag_1h', 
    'diffuse_rad:W_lag_forward_1h', 
    # 'direct_rad_1h:J_lag_1h', 
    'direct_rad_1h:J_lag_forward_1h', 
    # 'is_in_shadow:idx_lag_1h', 
    'is_in_shadow:idx_lag_forward_1h', 
    # 'clear_sky_energy_1h:J_lag_1h', 
    'clear_sky_energy_1h:J_lag_forward_1h', 
    # 'effective_cloud_cover:p_lag_1h', 
    'effective_cloud_cover:p_lag_forward_1h', 
    # 'visibility:m_lag_1h', 
    'visibility:m_lag_forward_1h', 
    # 'total_cloud_cover:p_lag_1h', 
    'total_cloud_cover:p_lag_forward_1h', 


    # 'direct_rad:W_lag_2h', 
    # 'direct_rad:W_lag_forward_2h', 
    # 'clear_sky_rad:W_lag_2h', 
    # 'clear_sky_rad:W_lag_forward_2h', 
    # 'diffuse_rad:W_lag_2h', 
    # 'diffuse_rad:W_lag_forward_2h', 
    # 'direct_rad_1h:J_lag_2h', 
    # 'direct_rad_1h:J_lag_forward_2h', 
    # 'is_in_shadow:idx_lag_2h', 
    # 'is_in_shadow:idx_lag_forward_2h', 
    # 'clear_sky_energy_1h:J_lag_2h', 
    # 'clear_sky_energy_1h:J_lag_forward_2h', 
    # 'effective_cloud_cover:p_lag_2h', 
    # 'effective_cloud_cover:p_lag_forward_2h', 
    # 'visibility:m_lag_2h', 
    # 'visibility:m_lag_forward_2h', 
    # 'total_cloud_cover:p_lag_2h', 
    # 'total_cloud_cover:p_lag_forward_2h', 

    # 'direct_rad:W_lag_3h', 
    # 'direct_rad:W_lag_forward_3h', 
    # 'clear_sky_rad:W_lag_3h', 
    # 'clear_sky_rad:W_lag_forward_3h', 
    # 'diffuse_rad:W_lag_3h', 
    # 'diffuse_rad:W_lag_forward_3h', 
    # 'direct_rad_1h:J_lag_3h', 
    # 'direct_rad_1h:J_lag_forward_3h', 
    # 'is_in_shadow:idx_lag_3h', 
    # 'is_in_shadow:idx_lag_forward_3h', 
    # 'clear_sky_energy_1h:J_lag_3h', 
    # 'clear_sky_energy_1h:J_lag_forward_3h', 
    # 'effective_cloud_cover:p_lag_3h', 
    # 'effective_cloud_cover:p_lag_forward_3h', 
    # 'visibility:m_lag_3h', 
    # 'visibility:m_lag_forward_3h', 
    # 'total_cloud_cover:p_lag_3h', 
    # 'total_cloud_cover:p_lag_forward_3h'
]

CUSTOM_COLUMNS_TO_KEEP = [
    "hour_cos",
    "hour_sin",
    "month_sin",
    "month_cos",
    "day-of-year",
    "hours_since_forecast"
]

WEATHER_FEATURES = [
    "direct_rad:W",
    "clear_sky_rad:W",
    "diffuse_rad:W",
    "direct_rad_1h:J",
    "is_in_shadow:idx",
    "clear_sky_energy_1h:J",
    "effective_cloud_cover:p",
    "visibility:m",
    "total_cloud_cover:p",
]


TEST_COLUMNS_TO_KEEP = [
    "direct_rad:W",
    "clear_sky_rad:W",
    "diffuse_rad:W",
    "direct_rad_1h:J",
    "is_in_shadow:idx",
    "clear_sky_energy_1h:J",
    "diffuse_rad_1h:J",
    "is_day:idx",
    "sun_elevation:d",
    "ceiling_height_agl:m",
    "effective_cloud_cover:p",
    "visibility:m",
    "total_cloud_cover:p",
    "air_density_2m:kgm3",
    "wind_speed_v_10m:ms",
    "dew_point_2m:K",
    "wind_speed_u_10m:ms",
    "t_1000hPa:K",
    "absolute_humidity_2m:gm3",
    "snow_water:kgm2",
    "relative_humidity_1000hPa:p",
    "fresh_snow_24h:cm",
    "cloud_base_agl:m",
    "fresh_snow_12h:cm",
    "snow_depth:cm",
    "dew_or_rime:idx",
    "fresh_snow_6h:cm",
    "super_cooled_liquid_water:kgm2",
    "fresh_snow_3h:cm",
    "rain_water:kgm2",
    "precip_type_5min:idx",
    "precip_5min:mm",
    "fresh_snow_1h:cm",
    "sun_azimuth:d",
    "msl_pressure:hPa",
    "pressure_100m:hPa",
    "pressure_50m:hPa",
    "sfc_pressure:hPa",
    "prob_rime:p",
    "wind_speed_10m:ms",
    # "elevation:m",
    # "snow_density:kgm3",
    # "snow_drift:idx",
    "snow_melt_10min:mm",
    "wind_speed_w_1000hPa:ms",
    "observed_or_estimated"
    # "location_A",
    # "location_B",
    # "location_C",
    # "date_calc",
] + CUSTOM_COLUMNS_TO_KEEP  +  LAGGED_COLUMNS_TO_KEEP

COLUMNS_TO_KEEP = TEST_COLUMNS_TO_KEEP + ["pv_measurement"]



def create_weather_lagged_features(df, weather_features):
    # Choose the weather features for which you want to create lagged versions
    for feature in weather_features:
        # Assuming hourly data, adjust the lags for your specific dataset
        # Creating lagged features for 1 hour, 1 day, and 1 week
        # df[f'{feature}_lag_1h'] = df[feature].shift(1)
        # df[f'{feature}_lag_2h'] = df[feature].shift(2)
        # df[f'{feature}_lag_3h'] = df[feature].shift(3)

        df[f'{feature}_lag_forward_1h'] = df[feature].shift(-1)
        # df[f'{feature}_lag_forward_2h'] = df[feature].shift(-2)
        # df[f'{feature}_lag_forward_3h'] = df[feature].shift(-3)
        # df[f'{feature}_lag_24h'] = df[feature].shift(24*4)
        # df[f'{feature}_lag_168h'] = df[feature].shift(24 * 7 * 4 * 365)
        # df[f'{feature}_front_lag_1h'] = df[feature].shift(-4)
        # df[f'{feature}_front_lag_24h'] = df[feature].shift(-24*4)


    # Handling edges by filling NaNs with appropriate values or dropping them
    # You may choose to fill with zeroes or interpolate, based on what makes more sense for your data
    # df.fillna(method='ffill', inplace=True)  # Forward fill  # Autogluon should handle this for us.
    # df.fillna(method='bfill', inplace=True)  # Backward fill  # Autogluon should handle this for us.
    
    return df


B_SCALE_VALUE = 6.3
C_SCALE_VALUE = 8.1

## Prepare data


In [9]:
def add_custom_fields(df):
     df['hour_sin'] = np.sin(2 * np.pi * df['date_forecast'].dt.hour / 24)
     df['hour_cos'] = np.cos(2 * np.pi * df['date_forecast'].dt.hour / 24)

     df['month_sin'] = np.sin(2 * np.pi * df['date_forecast'].dt.month / 12)
     df['month_cos'] = np.cos(2 * np.pi * df['date_forecast'].dt.month / 12)
     df['day-of-year'] = df['date_forecast'].dt.dayofyear
     return df

def add_calc_date(df_observed, df_estimated, df_test):
    # Function to calculate the difference in hours
    def calculate_hour_difference(row):
        diff = row['date_calc'] - row['date_forecast']
        return diff.total_seconds() / 3600  # Convert difference to hours

    # Apply the function to calculate the hour difference for df_estimated and df_test
    df_estimated['hours_since_forecast'] = df_estimated.apply(calculate_hour_difference, axis=1)
    df_test['hours_since_forecast'] = df_test.apply(calculate_hour_difference, axis=1)

    # Fill in zero for df_observed
    df_observed['hours_since_forecast'] = 0

    return df_observed, df_estimated, df_test


def remove_outliers(df):
    # Use a mask to filter out the rows where rolling std is zero but keep the rows where the value itself is zero
    # Because some places in the data, the pv-measurements are messed up and are repeating.
    mask = (df['pv_measurement'].rolling(2).std() == 0) & (df['pv_measurement'] != 0)
    df[mask] = np.NaN  # Put this to NaN and hope autoGluon Handles.
    return df

def resample_add_data(df, is_test_data):
    df = add_custom_fields(df)
    df.set_index('date_forecast', inplace=True)
    df = df.resample('1H').mean()
    
    # Remove empty dates if test data
    if is_test_data:
        non_nan_threshold = len(df.columns) // 2
        df.dropna(thresh=non_nan_threshold, inplace=True)

    # df.interpolate(method="linear", inplace=True)  # Autogluon should handle this for us.
    
    return df

def add_location_feature(X, location):
      # Treat location as a categorical feature by converting it to a category type
    X['location'] = location
    X['location'] = X['location'].astype(str)  # Convert to string if 'location' is not an int
    # X['dew_or_rime:idx'] = X['dew_or_rime:idx'].astype(str)
    # X['is_day:idx'] = X['is_day:idx'].astype(str)
    # X['is_in_shadow:idx'] = X['is_in_shadow:idx'].astype(str)
    # categorical_columns = ['location', 'dew_or_rime:idx', 'is_day:idx', 'is_in_shadow:idx']

    # # Before filling NaN values, add 'missing' as a category for each categorical column.
    # for column in categorical_columns:
    #     X[column] = X[column].astype('category')  # Ensure the column is of type 'category'.
    #     if 'missing' not in X[column].cat.categories:
    #         X[column] = X[column].cat.add_categories(['missing'])  # Add 'missing' as a new category.
    #     X[column] = X[column].fillna('missing')
    # X['location'] = X['location'].astype('category')
    # X['dew_or_rime:idx'] = X['dew_or_rime:idx'].astype('category')
    # X['is_day:idx'] = X['is_day:idx'].astype('category')
    # X['is_in_shadow:idx'] = X['is_in_shadow:idx'].astype('category')
    
    return X

def add_scaling(X_test, X_training, location):
    global scalers
    continuous_columns = X_training.select_dtypes(include=['float32', 'int32']).columns
    if location not in scalers:
        scalers[location] = MinMaxScaler()
    X_training[continuous_columns] = scalers[location].fit_transform(X_training[continuous_columns])
    
    X_test[continuous_columns] = scalers[location].transform(X_test[continuous_columns])

    return X_test, X_training

# Skip this as we have hours since forecast as a feature.
# Deprecated as the concat is moved to main function.
def make_observed_and_estimated_category(df_observed, df_estimated, df_test):
     # Hot encode in wether observed or estimated
    df_observed['observed_or_estimated'] = 'observed'
    df_estimated['observed_or_estimated'] = 'estimated'
    df_test['observed_or_estimated'] = 'estimated'
    # Concatenate observed and estimated
    df_training = pd.concat([df_observed, df_estimated], axis=0).sort_values(by="date_forecast")
    df_training['observed_or_estimated'] = df_training['observed_or_estimated'].astype('category')
    df_test['observed_or_estimated'] = df_test['observed_or_estimated'].astype('category')

    return df_training, df_test

    
    

In [10]:

# Initialize a dictionary to hold the scalers for each location

scalers = {}


def prepare_data(location):
    # Load data
    scaling = False  # Set scaling to True to enable individual scaling for each location
    global scalers
    global scale_target 
    scale_target = False

    # Load training data
    df_observed = pd.read_parquet(f"data/{location}/X_train_observed.parquet")
    df_estimated = pd.read_parquet(f"data/{location}/X_train_estimated.parquet")
    df_target = pd.read_parquet(f"data/{location}/train_targets.parquet")
    # drop nan values in target data, pv measurement
    df_target.dropna(inplace=True)

   
    # Load test data
    df_test = pd.read_parquet(f"data/{location}/X_test_estimated.parquet")

   
    # Add calculated date
    df_observed, df_estimated, df_test = add_calc_date(df_observed, df_estimated, df_test)
    

    # Resample and add custom fields

    df_observed = resample_add_data(df_observed, False)
    df_estimated = resample_add_data(df_estimated, False)
    df_test = resample_add_data(df_test, True)

    df_training, df_test = make_observed_and_estimated_category(df_observed, df_estimated, df_test)
    

    # Autogluon should scale for us.
    if scale_target:
        if location == "B":
            df_target["pv_measurement"] = df_target["pv_measurement"] * B_SCALE_VALUE
        elif location == "C":
            df_target["pv_measurement"] = df_target["pv_measurement"] * C_SCALE_VALUE
    
    # Merge training with target data
    df_training = pd.merge(df_training, df_target, left_on="date_forecast", right_on="time", how="inner")
    
    # Create lagged features and remove outliers training
    df_training = create_weather_lagged_features(df_training, WEATHER_FEATURES)
    df_training = df_training[COLUMNS_TO_KEEP]
    df_training = remove_outliers(df_training)

    df_test = create_weather_lagged_features(df_test, WEATHER_FEATURES)
    df_test = df_test[TEST_COLUMNS_TO_KEEP]


    # Add categories
    df_training = add_location_feature(df_training, location)
    X_test = add_location_feature(df_test, location)
    
    # Add scaling
    if scaling:
        X_test, X_training = add_scaling(X_test, X_training, location)
    df_test.reset_index(inplace=True)
    df_test.drop(columns=["date_forecast"], inplace=True)
    # y_training = np.log1p(y_training)
    return df_training, X_test



# Use prepare_data function

combined_df_train = []
combined_df_test = []
combined_df_validation = []

for location in locations:
    # Prepare the training data
    X_training, X_test = prepare_data(location)

    df_training, df_test = prepare_data(location)
    # split df training at 
    X_training, X_validation = train_test_split(df_training[df_training["observed_or_estimated"] == "estimated"], test_size=1440, shuffle=True)
    X_training = pd.concat([X_training, df_training[df_training["observed_or_estimated"] == "observed"]])
    X_training = shuffle(X_training, random_state=420)
    
    combined_df_train.append(X_training)
    combined_df_validation.append(X_validation)

    combined_df_test.append(X_test)


print()




# Define hyperparameters

In [11]:
lgbmXT = {'learning_rate': 0.05, 'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}
r51 = {'layers': [200, 100, 50],
     'emb_drop': 0.6046989241462619,
     'ps': 0.09244767444160731,
     'bs': 1024,
     'lr': 0.00775309042164966,
     'epochs': 48,
     'early.stopping.min_delta': 0.0001,
     'early.stopping.patience': 20,
     'smoothing': 0.0, 'ag_args': {'name_suffix': '_r51'}}

r118 = {'learning_rate': 0.021720607471727896,
     'extra_trees': True,
     'feature_fraction': 0.7832570544199176,
     'min_data_in_leaf': 3,
     'num_leaves': 21}

rf_r5 = {'n_estimators': 300,
     'max_leaf_nodes': 50000,
     'n_jobs': -1,
     'random_state': 0,
     'bootstrap': True,
     'min_samples_leaf': 5,
     'max_features': 0.5}

hyperparameters_a = {
    'NN_TORCH': {},
    'GBM': [lgbmXT, 'GBMLarge', r118],
    'FASTAI': [r51]
}

hyperparameters_b = {
    'NN_TORCH': {},
    'GBM': [lgbmXT, r118],
    'KNN': [{'weights': 'uniform'}],
    'FASTAI': [r51],
    'CAT': {}
}

hyperparameters_c = {
    'NN_TORCH': {},
    'GBM': [lgbmXT, r118],
    'KNN': [{'weights': 'uniform'}],
    'FASTAI': [r51],
    'CAT': {},
    'XGB': {},

}

level_2_hyperparameters = {
    'XT': [{}],
    'RF': [{}, rf_r5],
    'GBM': ['GBMLarge'],
    'NN_TORCH': {},
}



## Train

In [12]:
def train_model(dataset):
    # Define the path where the AutoGluon models will be saved
    # enumerate all the locations
    for index, location in enumerate(locations):
        save_path = f"autogluon_models/test_model{location}"

        if location == "A":
            hyperparameters = hyperparameters_a
        elif location == "B":
            hyperparameters = hyperparameters_b
        else: 
            hyperparameters = hyperparameters_c

        # Initialize the TabularPredictor object
        model = TabularPredictor(
            label="pv_measurement", path=save_path, eval_metric="mae"
        )
        model.fit(
            train_data=combined_df_train[index].dropna(subset=["pv_measurement"]),
            tuning_data=combined_df_validation[index].dropna(subset=["pv_measurement"]),
            presets="experimental_zeroshot_hpo_hybrid",
            use_bag_holdout=True,
            hyperparameters={0: hyperparameters, 1: level_2_hyperparameters},
            num_bag_sets=3,
            num_stack_levels=1,
        )


# Train the model using all available training data and the initial validation set for early stopping
train_model(combined_df_train)

# Evaluate the model using the same validation set
# evaluate_model(combined_X_val, combined_Y_val, location, model)

Presets specified: ['experimental_zeroshot_hpo_hybrid']
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=3
Beginning AutoGluon training ...
AutoGluon will save models to "autogluon_models/test_modelA"
AutoGluon Version:  0.8.3b20231109
Python Version:     3.11.6
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 21.6.0: Sat Jun 18 17:07:28 PDT 2022; root:xnu-8020.140.41~1/RELEASE_ARM64_T8110
Disk Space Avail:   22.17 GB / 245.11 GB (9.0%)
Train Data Rows:    32643
Train Data Columns: 59
Tuning Data Rows:    1440
Tuning Data Columns: 59
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Lab

# Feature importance

In [50]:
# model.leaderboard(combined_df_validation, silent=True)
# model.feature_importance(combined_df_validation, time_limit=120)

predictior = TabularPredictor.load(f"autogluon_models/test_modelC", require_version_match=False)


# Get the leaderboard
leaderboard = predictior.leaderboard(silent=True)

# Retrieve the best model's name
best_model = leaderboard.iloc[0]['model']
predictior.info()

# # Retrieve hyperparameters of the best model
# best_model_info = predictior.get_model_info(model=best_model)
# best_hyperparameters = best_model_info['hyperparameters']

# # Optionally, save or print these hyperparameters
# print("Best Model Hyperparameters:", best_hyperparameters)

{'path': 'autogluon_models/test_modelC',
 'label': 'pv_measurement',
 'random_state': 0,
 'version': '0.8.3b20231109',
 'features': ['direct_rad:W',
  'clear_sky_rad:W',
  'diffuse_rad:W',
  'direct_rad_1h:J',
  'is_in_shadow:idx',
  'clear_sky_energy_1h:J',
  'diffuse_rad_1h:J',
  'is_day:idx',
  'sun_elevation:d',
  'ceiling_height_agl:m',
  'effective_cloud_cover:p',
  'visibility:m',
  'total_cloud_cover:p',
  'air_density_2m:kgm3',
  'wind_speed_v_10m:ms',
  'dew_point_2m:K',
  'wind_speed_u_10m:ms',
  't_1000hPa:K',
  'absolute_humidity_2m:gm3',
  'snow_water:kgm2',
  'relative_humidity_1000hPa:p',
  'fresh_snow_24h:cm',
  'cloud_base_agl:m',
  'fresh_snow_12h:cm',
  'snow_depth:cm',
  'dew_or_rime:idx',
  'fresh_snow_6h:cm',
  'super_cooled_liquid_water:kgm2',
  'fresh_snow_3h:cm',
  'rain_water:kgm2',
  'precip_type_5min:idx',
  'precip_5min:mm',
  'fresh_snow_1h:cm',
  'sun_azimuth:d',
  'msl_pressure:hPa',
  'pressure_100m:hPa',
  'pressure_50m:hPa',
  'sfc_pressure:hPa',
  '

In [53]:
print(leaderboard)

                          model  score_val  pred_time_val     fit_time  \
0           WeightedEnsemble_L3  -9.308664    2890.739702  9454.294130   
1          ExtraTreesMSE_BAG_L2  -9.442430    2873.868619  9105.513262   
2         ExtraTrees_r19_BAG_L2  -9.455205    2873.836126  9102.713872   
3        RandomForest_r5_BAG_L2  -9.483109    2873.739279  9122.796443   
4          LightGBM_r111_BAG_L2  -9.544697    2883.366117  9181.099464   
5        RandomForestMSE_BAG_L2  -9.558274    2873.765136  9156.735636   
6          LightGBMLarge_BAG_L2  -9.604771    2876.342520  9137.670673   
7          LightGBM_r118_BAG_L2  -9.689695    2883.444918  9152.231380   
8                XGBoost_BAG_L2  -9.746289    2873.895038  9125.370814   
9          LightGBM_r158_BAG_L2  -9.764162    2875.470964  9119.913010   
10          LightGBM_r97_BAG_L2  -9.811798    2896.546383  9178.178513   
11       NeuralNetFastAI_BAG_L2  -9.889997    2874.502014  9161.677149   
12  NeuralNetFastAI_r145_BAG_L2  -9.89

# Make predictions

In [67]:

# Assuming you have defined WEATHER_FEATURES, TEST_COLUMNS_TO_KEEP, and other functions previously

def make_predictions(df_test_pred, location):
    eval_model = TabularPredictor.load(f"autogluon_models/test_model{location}", require_version_match=False)
    preds = eval_model.predict(df_test_pred)
    return preds





## Evaluate locally


In [None]:
import matplotlib.pyplot as plt


def evaluate_model_locally(location, scalers):
    # Load the test data
    target_df = pd.read_parquet(f"data/{location}/train_targets.parquet")
    
    # Make predictions
    # filter x_validate to only include values from location
    pred_dataset = combined_df_validation[combined_df_validation["location"] == location]
    pred_dataset.reset_index(inplace=True)

    preds = make_predictions(pred_dataset.drop("pv_measurement", axis=1), location)[-720:]
    target = target_df.tail(720)["pv_measurement"].to_numpy()
    
    differences = preds - target
    # Count predictions lower than the actual
    lower_predictions = (differences < 0) & (target != 0)
    # Count predictions higher than the actual
    higher_predictions = (differences > 0) & (target != 0)

    # Biggest misreads
    absolute_differences = abs(differences)
    max_diff_index = absolute_differences.argmax()  # Index of the biggest difference
    # max_diff_value = absolute_differences[max_diff_index]  # Value of the biggest difference
    print(f"Number of predictions that are a lower value than the actual, given that the actual is not 0: {lower_predictions.sum()}")
    print(f"Number of predictions that are larger than the target, given that the target is not 0: {higher_predictions.sum()}")
    # print(f"The biggest misread is at index {max_diff_index} with a difference of {max_diff_value}")
    
    index = target_df.index[-720:]
    print(f'location: {location}')
    # Plotting
    plt.figure(figsize=(60,6))
    plt.plot(index, target, label="Target")
    plt.plot(index, preds, label="Predictions")
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.title('Target vs Predictions')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
    plt.show()

for loc in locations:
    evaluate_model_locally(loc, scalers)


TypeError: list indices must be integers or slices, not str

## Submit to csv 1

In [68]:
df_submission = pd.read_csv("data/test.csv")

for index, location in enumerate(locations): 
    preds = make_predictions(combined_df_test[index], location)

    # Assign the predictions to df_submission for the current location
    mask = df_submission["location"] == location
    print(mask.head())
    # Add a check to make sure the lengths match
    if len(preds) != mask.sum():
        print(f"Length of predictions: {len(preds)}")
        print(f"Length of submission entries: {mask.sum()}")
        raise ValueError(f"Mismatch in length of predictions and submission entries for location {location}.")

    df_submission.loc[mask, "prediction"] = preds.to_numpy()


# Save the results to a new submission file
df_submission[["id", "prediction"]].to_csv("predictions/autogluon-2-stack-params.csv", index=False)

0    True
1    True
2    True
3    True
4    True
Name: location, dtype: bool


  3.03056335e+02  1.04668689e+03  1.98079504e+03  2.82339160e+03
  2.43028540e+03  2.57733765e+03  2.99167017e+03  3.06624048e+03
  2.53624023e+03  2.84018433e+03  2.30561938e+03  1.54139111e+03
  1.16434448e+03  5.84558777e+02  2.30821075e+02  2.39570255e+01
  7.34898001e-02  6.94085732e-02  6.97909147e-02  6.69148192e-02
  6.94147795e-02  7.02978522e-02  1.22660494e+00  1.56422516e+02
  5.69263611e+02  1.31115308e+03  2.48727881e+03  3.36796143e+03
  3.87339844e+03  3.97540552e+03  3.97760547e+03  4.08654053e+03
  4.07100464e+03  4.05018188e+03  3.41763379e+03  2.62558252e+03
  1.70471924e+03  8.58616516e+02  4.10139069e+02  9.09735870e+01
  5.68352342e-01  1.25227466e-01  1.08722307e-01  1.01177074e-01
  8.16048533e-02  4.14771065e-02  3.28996925e+01  2.49938766e+02
  7.46272217e+02  1.43808118e+03  2.60119897e+03  3.52220020e+03
  4.43930420e+03  4.80005615e+03  4.70224121e+03  4.29028760e+03
  4.22967871e+03  3.83884424e+03  2.78003027e+03  1.99739124e+03
  1.33111792e+03  8.60143

0    False
1    False
2    False
3    False
4    False
Name: location, dtype: bool
0    False
1    False
2    False
3    False
4    False
Name: location, dtype: bool
