# Imports

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h2o
from sklearn.model_selection import train_test_split
from sklego.preprocessing import RepeatingBasisFunction
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

In [23]:
train_a = pd.read_parquet('A/train_targets.parquet')
train_b = pd.read_parquet('B/train_targets.parquet')
train_c = pd.read_parquet('C/train_targets.parquet')

raw_b = train_b

X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('C/X_train_observed.parquet')

X_test_estimated_a = pd.read_parquet('A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('C/X_test_estimated.parquet')

# Preprocessing

In [24]:
def preprocessing_X(df):
    data = df.copy()
    if 'date_calc' in data.columns:
        data.drop('date_calc', axis=1, inplace=True)

    if 'date_forecast' in data.columns:
        data.set_index('date_forecast', inplace=True)

    
    data = data.groupby(pd.Grouper(freq='1H')).mean()
    
    
    data.dropna(how='all', inplace=True)
    
    data['hour'] = data.index.hour
    data['week'] = pd.to_numeric(data.index.isocalendar().week)
    data['week_hour'] = data['week'] * 100 + data['hour']
    data['month'] = data.index.month


    data['hour_sinus'] = np.sin(2*np.pi*data['hour']/ 23)
    data['hour_cosine'] = np.cos(2*np.pi*data['hour'] / 23)

    data['month_sinus'] = np.sin(2*np.pi*data['month'] / 12)
    data['month_cosine'] = np.cos(2*np.pi*data['month'] / 12)

    

    data.drop(columns=['snow_density:kgm3', 'month', 'hour', 'week','week_hour'], inplace=True)
    
    data.rename_axis('time', inplace=True)

    mapping = elevation_mapping = {
            6.0 : 'A',
            7.0 : 'B',
            24.0 : 'C'
        }
    
    data['location'] = data['elevation:m'].map(mapping)
    data['location'] = data['location'].astype(str)
    data.drop(columns=['elevation:m'], inplace=True)
    
    return data

def remove_constants_Y(df):
    data = df.copy()

    # Create a mask to identify sequences of 3 or more consecutive rows with the same 'pv_measurement' (non-zero)
    consecutive_mask = (data['pv_measurement'] == data['pv_measurement'].shift(1)) & (data['pv_measurement'] != 0)
    consecutive_count = consecutive_mask.groupby((~consecutive_mask).cumsum()).cumsum()
    mask_consecutive = (consecutive_count <= 3) | (~consecutive_mask)

    #create a mask to identify sequences of 24 or more consecutive 0-valued pv-measurement
    consecutive_mask_0 = (data['pv_measurement'] == data['pv_measurement'].shift(1)) & (data['pv_measurement'] == 0)
    consecutive_count_0 = consecutive_mask_0.groupby((~consecutive_mask_0).cumsum()).cumsum()
    mask_consecutive_0 = (consecutive_count_0 <= 24) | (~consecutive_mask_0)

    combined_mask = mask_consecutive & mask_consecutive_0

    filtered_data = data[combined_mask]

    return filtered_data


def preprocessing_Y(df):
    data = df.copy()
    if 'time' in data.columns:
        data.set_index('time', inplace=True)
    data.dropna(inplace=True)
    
    return data


def add_estimated_flag(df, estimated):
    data = df.copy()

    data['estimated'] = 'E' if estimated else 'O'

    return data




In [25]:
transformed_train_a = train_a.copy()
transformed_train_b = train_b.copy()
transformed_train_c = train_c.copy()

scaler_a = MinMaxScaler()
scaler_b = MinMaxScaler()
scaler_c = MinMaxScaler()

transformed_train_a['pv_measurement'] = scaler_a.fit_transform(train_a[['pv_measurement']])
transformed_train_b['pv_measurement'] = scaler_b.fit_transform(train_b[['pv_measurement']])
transformed_train_c['pv_measurement'] = scaler_c.fit_transform(train_c[['pv_measurement']])

#combining observed and estimated data
observed_a = preprocessing_X(X_train_observed_a)
observed_a = add_estimated_flag(observed_a, False)

estimated_a = preprocessing_X(X_train_estimated_a)
estimated_a = add_estimated_flag(estimated_a, True)


x_train_a = pd.concat([observed_a, estimated_a], axis=0)
y_train_a = preprocessing_Y(train_a)
m_train_a = x_train_a.merge(y_train_a, how='inner', on='time')

transformed_y_train_a = preprocessing_Y(transformed_train_a)
transformed_m_train_a = x_train_a.merge(transformed_y_train_a, how='inner', on='time')

observed_b = preprocessing_X(X_train_observed_b)
observed_b = add_estimated_flag(observed_b, False)

estimated_b = preprocessing_X(X_train_estimated_b)
estimated_b = add_estimated_flag(estimated_b, True)

x_train_b = pd.concat([observed_b, estimated_b], axis=0)
y_train_b = preprocessing_Y(train_b)
y_train_b = remove_constants_Y(y_train_b)
m_train_b = x_train_b.merge(y_train_b, how='inner', on='time')

transformed_y_train_b = preprocessing_Y(transformed_train_b)
transformed_y_train_b = remove_constants_Y(transformed_y_train_b)
transformed_m_train_b = x_train_b.merge(transformed_y_train_b, how='inner', on='time')

observed_c = preprocessing_X(X_train_observed_c)
observed_c = add_estimated_flag(observed_c, False)

estimated_c = preprocessing_X(X_train_estimated_c)
estimated_c = add_estimated_flag(estimated_c, True)


x_train_c = pd.concat([observed_c, estimated_c], axis=0)
y_train_c = preprocessing_Y(train_c)
y_train_c = remove_constants_Y(y_train_c)
m_train_c = x_train_c.merge(y_train_c, how='inner', on='time')

transformed_y_train_c = preprocessing_Y(transformed_train_c)
transformed_y_train_c = remove_constants_Y(transformed_y_train_c)
transformed_m_train_c = x_train_c.merge(transformed_y_train_c, how='inner', on='time')

x_train = pd.concat([m_train_a, m_train_b, m_train_c], axis=0, ignore_index=True)

transformed_x_train = pd.concat([transformed_m_train_a, transformed_m_train_b, transformed_m_train_c], axis=0, ignore_index=True)


y_train = x_train['pv_measurement']
transformed_y_train = transformed_x_train['pv_measurement']

x_train = x_train.drop(columns=['pv_measurement'], axis=1)
transformed_x_train = transformed_x_train.drop(columns=['pv_measurement'], axis=1)

X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.2, shuffle=True, random_state=10)
transformed_X_train, transformed_X_val, transformed_Y_train, transformed_Y_val = train_test_split(transformed_x_train, transformed_y_train, test_size=0.2, shuffle=True, random_state=10)

X_train_a = X_train[X_train['location'] == 'A']
X_train_b = X_train[X_train['location'] == 'B']
X_train_c = X_train[X_train['location'] == 'C']

transformed_X_train_a = transformed_X_train[transformed_X_train['location'] == 'A']
transformed_X_train_b = transformed_X_train[transformed_X_train['location'] == 'B']
transformed_X_train_c = transformed_X_train[transformed_X_train['location'] == 'C']

Y_train_a = Y_train[X_train['location'] == 'A']
Y_train_b = Y_train[X_train['location'] == 'B']
Y_train_c = Y_train[X_train['location'] == 'C']

transformed_Y_train_a = transformed_Y_train[transformed_X_train['location'] == 'A']
transformed_Y_train_b = transformed_Y_train[transformed_X_train['location'] == 'B']
transformed_Y_train_c = transformed_Y_train[transformed_X_train['location'] == 'C']

X_val_a = X_val[X_val['location'] == 'A']
X_val_b = X_val[X_val['location'] == 'B']
X_val_c = X_val[X_val['location'] == 'C']

transformed_X_val_a = transformed_X_val[transformed_X_val['location'] == 'A']
transformed_X_val_b = transformed_X_val[transformed_X_val['location'] == 'B']
transformed_X_val_c = transformed_X_val[transformed_X_val['location'] == 'C']

Y_val_a = Y_val[X_val['location'] == 'A']
Y_val_b = Y_val[X_val['location'] == 'B']
Y_val_c = Y_val[X_val['location'] == 'C']

transformed_Y_val_a = transformed_Y_val[transformed_X_val['location'] == 'A']
transformed_Y_val_b = transformed_Y_val[transformed_X_val['location'] == 'B']
transformed_Y_val_c = transformed_Y_val[transformed_X_val['location'] == 'C']


x_test_a = preprocessing_X(X_test_estimated_a)
x_test_a = add_estimated_flag(x_test_a, True)

x_test_b = preprocessing_X(X_test_estimated_b)
x_test_b = add_estimated_flag(x_test_b, True)

x_test_c = preprocessing_X(X_test_estimated_c)
x_test_c = add_estimated_flag(x_test_c, True)






x_test = pd.concat([x_test_a, x_test_b, x_test_c], axis=0, ignore_index=True)


### Catboost

In [26]:
from catboost import CatBoostRegressor
from catboost import Pool

params = {
    'depth' : 9,
    'iterations' : 1000,
    'loss_function' : 'MAE'
}

def catboost(X_train, Y_train, X_val, Y_val):
    
    model = CatBoostRegressor(**params)

    train_pool = Pool(data=X_train, label=Y_train, cat_features=['location', 'estimated'])

    eval_pool = Pool(data=X_val, label=Y_val, cat_features=['location', 'estimated'])

    model.fit(train_pool, use_best_model=True, eval_set=eval_pool)
    return model 



def train_predict(single=False):
    lenlist = []
    if not single:

        model = catboost(X_train, Y_train, X_val, Y_val)
        
        predictions = pd.DataFrame(model.predict(x_test), columns=['prediction'])

        predictions.index.name = 'id'

        return predictions

    else:
        model_a = catboost(X_train_a, Y_train_a, X_val_a, Y_val_a)
        predict_a = model_a.predict(x_test_a)
        predictions_a = pd.DataFrame(predict_a, columns=['prediction'])

        model_b = catboost(X_train_b, Y_train_b, X_val_b, Y_val_b)
        predict_b = model_b.predict(x_test_b)
        predictions_b = pd.DataFrame(predict_b, columns=['prediction'])
        
        model_c = catboost(X_train_c, Y_train_c, X_val_c, Y_val_c)
        predict_c = model_c.predict(x_test_c)
        predictions_c = pd.DataFrame(predict_c, columns=['prediction'])

        predictions = pd.concat([predictions_a, predictions_b, predictions_c], axis=0, ignore_index=True)

        predictions.index.name = 'id'

        return predictions



def transformed_train_predict(single=False):
    if not single:

        model = catboost(transformed_X_train, transformed_Y_train, transformed_X_val, transformed_Y_val)
        
        predictions = pd.DataFrame(model.predict(x_test), columns=['prediction'])

        predictions.index.name = 'id'

        return predictions

    else:
        model_a = catboost(transformed_X_train_a, transformed_Y_train_a, transformed_X_val_a, transformed_Y_val_a)
        predict_a = model_a.predict(x_test_a)
        predictions_a = pd.DataFrame(predict_a, columns=['prediction'])
        predictions_a.loc[:, 'prediction'] = scaler_a.inverse_transform(predictions_a['prediction'].values.reshape(-1, 1))
        

        model_b = catboost(transformed_X_train_b, transformed_Y_train_b, transformed_X_val_b, transformed_Y_val_b)
        predict_b = model_b.predict(x_test_b)
        predictions_b = pd.DataFrame(predict_b, columns=['prediction'])
        predictions_b.loc[:, 'prediction'] = scaler_b.inverse_transform(predictions_b['prediction'].values.reshape(-1, 1))

        
        model_c = catboost(transformed_X_train_c, transformed_Y_train_c, transformed_X_val_c, transformed_Y_val_c)
        predict_c = model_c.predict(x_test_c)
        predictions_c = pd.DataFrame(predict_c, columns=['prediction'])
        predictions_c.loc[:, 'prediction'] = scaler_c.inverse_transform(predictions_c['prediction'].values.reshape(-1, 1))


        predictions = pd.concat([predictions_a, predictions_b, predictions_c], axis=0, ignore_index=True)


        predictions.index.name = 'id'
        

        return predictions

In [27]:
catboost_predictions = train_predict(single=True)


0:	learn: 609.4517455	test: 637.6052708	best: 637.6052708 (0)	total: 14.5ms	remaining: 14.5s
1:	learn: 598.5068431	test: 626.7081969	best: 626.7081969 (1)	total: 33.6ms	remaining: 16.8s
2:	learn: 585.4067416	test: 613.3868237	best: 613.3868237 (2)	total: 44ms	remaining: 14.6s
3:	learn: 571.9478643	test: 599.7856574	best: 599.7856574 (3)	total: 53.8ms	remaining: 13.4s
4:	learn: 558.4536945	test: 586.2005389	best: 586.2005389 (4)	total: 63.4ms	remaining: 12.6s
5:	learn: 547.8445927	test: 575.6325521	best: 575.6325521 (5)	total: 72.9ms	remaining: 12.1s
6:	learn: 534.0477976	test: 561.4060082	best: 561.4060082 (6)	total: 82.6ms	remaining: 11.7s
7:	learn: 521.6373886	test: 548.6515287	best: 548.6515287 (7)	total: 92.3ms	remaining: 11.4s
8:	learn: 508.9158328	test: 535.5494145	best: 535.5494145 (8)	total: 102ms	remaining: 11.2s
9:	learn: 497.2895478	test: 523.7440918	best: 523.7440918 (9)	total: 112ms	remaining: 11.1s
10:	learn: 487.9145009	test: 514.4818512	best: 514.4818512 (10)	total: 121

In [28]:
transformed_catboost_predictions = transformed_train_predict(single=True)

0:	learn: 0.1062971	test: 0.1112076	best: 0.1112076 (0)	total: 13.2ms	remaining: 13.2s
1:	learn: 0.1043882	test: 0.1093070	best: 0.1093070 (1)	total: 23.8ms	remaining: 11.9s
2:	learn: 0.1021034	test: 0.1069836	best: 0.1069836 (2)	total: 33.9ms	remaining: 11.3s
3:	learn: 0.0997560	test: 0.1046113	best: 0.1046113 (3)	total: 43.8ms	remaining: 10.9s
4:	learn: 0.0974024	test: 0.1022419	best: 0.1022419 (4)	total: 53.5ms	remaining: 10.6s
5:	learn: 0.0955520	test: 0.1003987	best: 0.1003987 (5)	total: 73.4ms	remaining: 12.2s
6:	learn: 0.0931456	test: 0.0979174	best: 0.0979174 (6)	total: 83.6ms	remaining: 11.9s
7:	learn: 0.0909811	test: 0.0956928	best: 0.0956928 (7)	total: 95.2ms	remaining: 11.8s
8:	learn: 0.0887623	test: 0.0934076	best: 0.0934076 (8)	total: 106ms	remaining: 11.7s
9:	learn: 0.0867345	test: 0.0913486	best: 0.0913486 (9)	total: 117ms	remaining: 11.6s
10:	learn: 0.0850993	test: 0.0897331	best: 0.0897331 (10)	total: 128ms	remaining: 11.5s
11:	learn: 0.0831633	test: 0.0877163	best: 0

### AutoGluon

In [9]:
X_train_a = X_train_a.copy()
X_val_a = X_val_a.copy()

X_train_a['pv_measurement'] = Y_train_a
X_val_a['pv_measurement'] = Y_val_a
predictor_a = TabularPredictor(
                label='pv_measurement',
                eval_metric='mean_absolute_error').fit(
                    train_data=X_train_a,
                    tuning_data=X_val_a
                    )

No path specified. Models will be saved in: "AutogluonModels/ag-20231104_130322/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231104_130322/"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Jul  5 22:22:05 PDT 2023; root:xnu-8796.141.3~6/RELEASE_ARM64_T6000
Disk Space Avail:   675.11 GB / 994.66 GB (67.9%)
Train Data Rows:    27303
Train Data Columns: 49
Tuning Data Rows:    6758
Tuning Data Columns: 49
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 626.07979, 1164.11715)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary',

Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2833.44 MB
	Train Data (Original)  Memory Usage: 10.9 MB (0.4% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Useless Original Features (Count: 2): ['snow_drift:idx', 'location']
		These features carry no predictive signal and should be manually investigated.
		This is typically a feature which has the same value for all rows.
		These features do not need to be present at inference time.
	Types of features in original d

[1000]	valid_set's l1: 172.24
[2000]	valid_set's l1: 166.441
[3000]	valid_set's l1: 163.157
[4000]	valid_set's l1: 161.358
[5000]	valid_set's l1: 160.047
[6000]	valid_set's l1: 159.106
[7000]	valid_set's l1: 158.517
[8000]	valid_set's l1: 158.123
[9000]	valid_set's l1: 157.803
[10000]	valid_set's l1: 157.54


	-157.5348	 = Validation score   (-mean_absolute_error)
	31.9s	 = Training   runtime
	0.69s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 176.143
[2000]	valid_set's l1: 173.323
[3000]	valid_set's l1: 172.588
[4000]	valid_set's l1: 172.274
[5000]	valid_set's l1: 172.098
[6000]	valid_set's l1: 172.038
[7000]	valid_set's l1: 172.01
[8000]	valid_set's l1: 171.95
[9000]	valid_set's l1: 171.926
[10000]	valid_set's l1: 171.924


	-171.9232	 = Validation score   (-mean_absolute_error)
	35.85s	 = Training   runtime
	1.16s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-189.0231	 = Validation score   (-mean_absolute_error)
	15.24s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	-176.7087	 = Validation score   (-mean_absolute_error)
	100.96s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-190.1458	 = Validation score   (-mean_absolute_error)
	2.93s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-189.8382	 = Validation score   (-mean_absolute_error)
	18.12s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: XGBoost ...
	-183.0429	 = Validation score   (-mean_absolute_error)
	5.34s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-175.7177	 = Validation score   (-mean_absolute_error)
	38.17s	 = Training   runtime
	0.03s	 = Validation runtime
Fitt

[1000]	valid_set's l1: 168.97
[2000]	valid_set's l1: 167.558
[3000]	valid_set's l1: 167.273
[4000]	valid_set's l1: 167.168
[5000]	valid_set's l1: 167.133
[6000]	valid_set's l1: 167.122
[7000]	valid_set's l1: 167.118
[8000]	valid_set's l1: 167.116
[9000]	valid_set's l1: 167.115
[10000]	valid_set's l1: 167.115


	-167.1146	 = Validation score   (-mean_absolute_error)
	89.14s	 = Training   runtime
	2.2s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-156.0687	 = Validation score   (-mean_absolute_error)
	0.1s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 345.35s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231104_130322/")


In [10]:
X_train_b = X_train_b.copy()
X_val_b = X_val_b.copy()

X_train_b['pv_measurement'] = Y_train_b
X_val_b['pv_measurement'] = Y_val_b
predictor_b = TabularPredictor(
                label='pv_measurement',
                eval_metric='mean_absolute_error').fit(
                    train_data=X_train_b,
                    tuning_data=X_val_b
                    )

No path specified. Models will be saved in: "AutogluonModels/ag-20231104_130907/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231104_130907/"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Jul  5 22:22:05 PDT 2023; root:xnu-8796.141.3~6/RELEASE_ARM64_T6000
Disk Space Avail:   674.19 GB / 994.66 GB (67.8%)
Train Data Rows:    21507
Train Data Columns: 49
Tuning Data Rows:    5343
Tuning Data Columns: 49
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1146.2625, 0.0, 103.86418, 209.37976)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary'

[1000]	valid_set's l1: 24.2961
[2000]	valid_set's l1: 23.4697
[3000]	valid_set's l1: 23.0977
[4000]	valid_set's l1: 22.8576
[5000]	valid_set's l1: 22.6894
[6000]	valid_set's l1: 22.5849
[7000]	valid_set's l1: 22.5193
[8000]	valid_set's l1: 22.4572
[9000]	valid_set's l1: 22.4237
[10000]	valid_set's l1: 22.391


	-22.3902	 = Validation score   (-mean_absolute_error)
	26.1s	 = Training   runtime
	0.53s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 24.4739
[2000]	valid_set's l1: 24.1022
[3000]	valid_set's l1: 24.002
[4000]	valid_set's l1: 23.9506
[5000]	valid_set's l1: 23.9279
[6000]	valid_set's l1: 23.9135
[7000]	valid_set's l1: 23.9049
[8000]	valid_set's l1: 23.8999
[9000]	valid_set's l1: 23.898
[10000]	valid_set's l1: 23.8944


	-23.8943	 = Validation score   (-mean_absolute_error)
	19.69s	 = Training   runtime
	0.67s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-26.7455	 = Validation score   (-mean_absolute_error)
	11.84s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: CatBoost ...
	-24.5797	 = Validation score   (-mean_absolute_error)
	95.94s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-26.8162	 = Validation score   (-mean_absolute_error)
	2.3s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-26.9106	 = Validation score   (-mean_absolute_error)
	14.33s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoost ...
	-24.9303	 = Validation score   (-mean_absolute_error)
	62.71s	 = Training   runtime
	0.62s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-24.4589	 = Validation score   (-mean_absolute_error)
	29.04s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting mode

[1000]	valid_set's l1: 23.7229
[2000]	valid_set's l1: 23.6053
[3000]	valid_set's l1: 23.587
[4000]	valid_set's l1: 23.5814
[5000]	valid_set's l1: 23.5798
[6000]	valid_set's l1: 23.5794
[7000]	valid_set's l1: 23.5792
[8000]	valid_set's l1: 23.5791
[9000]	valid_set's l1: 23.5791
[10000]	valid_set's l1: 23.5791


	-23.5791	 = Validation score   (-mean_absolute_error)
	76.47s	 = Training   runtime
	1.64s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-22.0329	 = Validation score   (-mean_absolute_error)
	0.09s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 344.42s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231104_130907/")


In [11]:
X_train_c = X_train_c.copy()
X_val_c = X_val_c.copy()

X_train_c['pv_measurement'] = Y_train_c
X_val_c['pv_measurement'] = Y_val_c
predictor_c = TabularPredictor(
                label='pv_measurement',
                eval_metric='mean_absolute_error').fit(
                    train_data=X_train_c,
                    tuning_data=X_val_c
                    )

No path specified. Models will be saved in: "AutogluonModels/ag-20231104_131451/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231104_131451/"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.6.0: Wed Jul  5 22:22:05 PDT 2023; root:xnu-8796.141.3~6/RELEASE_ARM64_T6000
Disk Space Avail:   673.44 GB / 994.66 GB (67.7%)
Train Data Rows:    17486
Train Data Columns: 49
Tuning Data Rows:    4473
Tuning Data Columns: 49
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 91.61558, 176.63667)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary'

[1000]	valid_set's l1: 21.6336
[2000]	valid_set's l1: 20.9161
[3000]	valid_set's l1: 20.5681
[4000]	valid_set's l1: 20.3804
[5000]	valid_set's l1: 20.2987
[6000]	valid_set's l1: 20.2333
[7000]	valid_set's l1: 20.1841
[8000]	valid_set's l1: 20.1505
[9000]	valid_set's l1: 20.1234
[10000]	valid_set's l1: 20.1056


	-20.1042	 = Validation score   (-mean_absolute_error)
	24.27s	 = Training   runtime
	0.45s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 22.4284
[2000]	valid_set's l1: 22.0468
[3000]	valid_set's l1: 21.9573
[4000]	valid_set's l1: 21.9193
[5000]	valid_set's l1: 21.9052
[6000]	valid_set's l1: 21.8948
[7000]	valid_set's l1: 21.889
[8000]	valid_set's l1: 21.8831
[9000]	valid_set's l1: 21.8809
[10000]	valid_set's l1: 21.8784


	-21.8783	 = Validation score   (-mean_absolute_error)
	20.21s	 = Training   runtime
	0.54s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-24.6581	 = Validation score   (-mean_absolute_error)
	8.23s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: CatBoost ...
	-22.0781	 = Validation score   (-mean_absolute_error)
	94.91s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-24.4799	 = Validation score   (-mean_absolute_error)
	1.86s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-24.3772	 = Validation score   (-mean_absolute_error)
	10.55s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoost ...
	-22.9638	 = Validation score   (-mean_absolute_error)
	60.09s	 = Training   runtime
	0.58s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-23.0195	 = Validation score   (-mean_absolute_error)
	23.08s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting mode

[1000]	valid_set's l1: 21.9856
[2000]	valid_set's l1: 21.887
[3000]	valid_set's l1: 21.8733
[4000]	valid_set's l1: 21.8704
[5000]	valid_set's l1: 21.8697
[6000]	valid_set's l1: 21.8695
[7000]	valid_set's l1: 21.8694
[8000]	valid_set's l1: 21.8693
[9000]	valid_set's l1: 21.8693
[10000]	valid_set's l1: 21.8693


	-21.8693	 = Validation score   (-mean_absolute_error)
	109.87s	 = Training   runtime
	1.95s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-19.9835	 = Validation score   (-mean_absolute_error)
	0.1s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 359.03s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231104_131451/")


In [12]:
y_pred_a = predictor_a.predict(x_test_a)
y_pred_b = predictor_b.predict(x_test_b)
y_pred_c = predictor_c.predict(x_test_c)

y_pred_c.head()

time
2023-05-01 00:00:00   -0.101726
2023-05-01 01:00:00    0.171641
2023-05-01 02:00:00    0.732429
2023-05-01 03:00:00    4.126227
2023-05-01 04:00:00   -1.562568
Name: pv_measurement, dtype: float32

In [13]:
perf = predictor_a.evaluate(X_val_a, silent=True)
predictor_a.leaderboard(X_val_a, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-156.068659,-156.068659,3.601404,2.917223,159.31558,0.011306,0.000253,0.100719,2,True,10
1,LightGBMXT,-157.534796,-157.534796,0.829472,0.689478,31.895799,0.829472,0.689478,31.895799,1,True,1
2,LightGBMLarge,-167.114598,-167.114598,2.725775,2.199704,89.14478,2.725775,2.199704,89.14478,1,True,9
3,LightGBM,-171.923215,-171.923215,0.879118,1.163029,35.854316,0.879118,1.163029,35.854316,1,True,2
4,NeuralNetTorch,-175.717694,-175.717694,0.034851,0.027788,38.174281,0.034851,0.027788,38.174281,1,True,8
5,CatBoost,-176.708704,-176.708704,0.064731,0.018198,100.958586,0.064731,0.018198,100.958586,1,True,4
6,XGBoost,-183.042911,-183.042911,0.037094,0.024701,5.342685,0.037094,0.024701,5.342685,1,True,7
7,RandomForestMSE,-189.02309,-189.02309,0.350669,0.051471,15.243516,0.350669,0.051471,15.243516,1,True,3
8,NeuralNetFastAI,-189.838229,-189.838229,0.067592,0.047239,18.124583,0.067592,0.047239,18.124583,1,True,6
9,ExtraTreesMSE,-190.145828,-190.145828,0.221116,0.05824,2.932213,0.221116,0.05824,2.932213,1,True,5


In [14]:
autogluon_predictions = pd.concat([y_pred_a, y_pred_b, y_pred_c], axis=0, ignore_index=True).to_frame()
autogluon_predictions.index.name = 'id'
autogluon_predictions.rename(columns={'pv_measurement': 'prediction'}, inplace=True)

### Postprocessing

In [15]:
def postprocessing(features_df, preds_df):
    features = features_df.copy()
    preds = preds_df.copy()

    # Set the index of 'features' to match 'preds'
    features['is_day:idx'].index = preds.index

    # Setting all night-time predictions to zero
    preds.loc[features['is_day:idx'] == 0, 'prediction'] = 0

    # Setting all low values to zero
    preds['prediction'] = preds['prediction'].apply(lambda x: 0.0 if x < 1 else x)

    return preds




In [16]:
catboost_predictions_proc = postprocessing(x_test, catboost_predictions)
transformed_catboost_predictions_proc = postprocessing(x_test, transformed_catboost_predictions)
autogluon_predictions_proc = postprocessing(x_test, autogluon_predictions)

In [29]:
final = pd.DataFrame()

final['prediction'] =(
                0.25 * catboost_predictions_proc['prediction'] + 
                0.25 * transformed_catboost_predictions_proc['prediction'] + 
                0.50 * autogluon_predictions_proc['prediction']     
                )            

final.index.name = 'id'

now = datetime.now()
current_time = now.strftime("%H:%M:%S")

final.to_csv(f'pred_{current_time}.csv')