In [1]:
from sklearn.model_selection import train_test_split
%matplotlib inline
from utils import *

In [2]:
# Columns removed from both the training frame and the test features.
# Commented-out extension of this list, kept for reference:
# 'snow_drift:idx', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_24h:cm',
# 'snow_melt_10min:mm', 'snow_water:kgm2', 'wind_speed_w_1000hPa:ms',
# 'dew_point_2m:K', 'elevation:m', 'msl_pressure:hPa'
columns_to_drop = ['snow_density:kgm3', 'elevation:m']

# Read the target table and the three feature tables from ./data/B.
train_b = pd.read_parquet('./data/B/train_targets.parquet')
X_train_observed_b = pd.read_parquet('./data/B/X_train_observed.parquet')
X_train_estimated_b = pd.read_parquet('./data/B/X_train_estimated.parquet')
X_test_estimated_b = pd.read_parquet('./data/B/X_test_estimated.parquet')

# Stack observed rows on top of estimated rows, then run both the stacked
# training features and the test features through resample_to_hourly.
df = resample_to_hourly(pd.concat([X_train_observed_b, X_train_estimated_b]))
X_test_estimated_b = resample_to_hourly(X_test_estimated_b).drop(columns=columns_to_drop)

# Inner join: keep only rows whose 'date_forecast' matches a 'time' in the targets.
df = df.merge(train_b, left_on='date_forecast', right_on='time', how='inner').drop(columns=columns_to_drop)

  df_hourly = df.resample('H').mean()
  df_hourly = df.resample('H').mean()


In [3]:
# Fill missing values in the two cloud-height columns with the most frequent value.
# The imputer is fitted on the training frame only and then re-used (transform,
# not fit_transform) on the test features, so both frames are filled with the
# same statistic instead of the test frame being filled from its own values.
cloud_height_cols = ['ceiling_height_agl:m', 'cloud_base_agl:m']
imputer = SimpleImputer(strategy='most_frequent')
df[cloud_height_cols] = imputer.fit_transform(df[cloud_height_cols])
X_test_estimated_b[cloud_height_cols] = imputer.transform(X_test_estimated_b[cloud_height_cols])

# Rows without a target value cannot be used for supervised training.
df = df.dropna(subset=['pv_measurement'])

In [4]:
# Columns handed to lag_features_by_one_hour for both frames.
lagged_cols = ('diffuse_rad_1h:J', 'direct_rad_1h:J', 'clear_sky_energy_1h:J')

# Training frame: locate long constant stretches in the raw target series
# (threshold=5) and pass them to remove_constant_periods, then apply the
# remaining helpers in the same order as before.
segments = find_long_constant_periods(train_b['pv_measurement'], threshold=5)
df = remove_constant_periods(df, segments)
df = remove_unwanted_rows(df)
df = lag_features_by_one_hour(df, list(lagged_cols))
df = is_estimated(df)

# Test features: same lagging and is_estimated step, with 'date_forecast'
# passed as the extra argument (presumably the timestamp column to use;
# confirm against the helper definitions in utils).
X_test_estimated_b = lag_features_by_one_hour(X_test_estimated_b, list(lagged_cols), 'date_forecast')
X_test_estimated_b = is_estimated(X_test_estimated_b, 'date_forecast')

# feature_engineering_1 is currently disabled for both frames:
# df = feature_engineering_1(df)
# X_test_estimated_b = feature_engineering_1(X_test_estimated_b)

In [5]:
# Cutoff between the "training only" window and the later window that is
# shared between training and validation.
train_end_date = '2022-10-21'
df['time'] = pd.to_datetime(df['time'])

# Partition on the cutoff. The later window uses >= so that a row stamped
# exactly at the cutoff (midnight of train_end_date) lands in one of the two
# parts instead of being silently dropped by a strict < / > pair.
train_df = df[df['time'] < train_end_date]
remaining_data = df[df['time'] >= train_end_date]

# Randomly split the later window 50/50 with a fixed seed: one half is added
# back to the training rows, the other half becomes the validation set.
train_data, validation_df = train_test_split(remaining_data, test_size=0.5, random_state=42)
train_df = pd.concat([train_df, train_data], ignore_index=True)

# Separate features from the target; the target and the two timestamp columns
# are not used as model inputs.
non_feature_cols = ['pv_measurement', 'time', 'date_forecast']
X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df['pv_measurement']
X_val = validation_df.drop(columns=non_feature_cols)
y_val = validation_df['pv_measurement']

In [6]:
# Name of the target column the predictor is asked to learn.
label = 'pv_measurement'

# Re-attach the target to the feature columns: the predictor receives one
# frame per split and is told which column is the target via `label`.
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# Build the predictor (scored by mean absolute error) and fit it, using the
# validation frame as tuning data.
predictor = TabularPredictor(
    label=label,
    eval_metric="mean_absolute_error",
).fit(
    train_data=train_data,
    tuning_data=val_data,
    presets='best_quality',
    num_gpus=1,
    num_stack_levels=0,
    use_bag_holdout=True,
)

No path specified. Models will be saved in: "AutogluonModels\ag-20231029_161521\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231029_161521\"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   936.52 GB / 2047.46 GB (45.7%)
Train Data Rows:    27808
Train Data Columns: 44
Tuning Data Rows:    1801
Tuning Data Columns: 44
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, -0.0, 96.65726, 205.20717)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specif

In [7]:
# Print the fit summary (per-model validation score, prediction time and fit
# time) and keep the returned object in `results`.
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L2 -10.400978      31.403154  3288.543870                0.000000           0.190298            2       True         12
1    NeuralNetTorch_BAG_L1 -11.014719       0.177509   687.248432                0.177509         687.248432            1       True         10
2        LightGBMXT_BAG_L1 -11.469870       8.802445   415.683793                8.802445         415.683793            1       True          3
3     LightGBMLarge_BAG_L1 -11.639637      11.302814  1701.380554               11.302814        1701.380554            1       True         11
4          LightGBM_BAG_L1 -11.937835      10.795494   380.853071               10.795494         380.853071            1       True          4
5           XGBoost_BAG_L1 -12.287361       3.342581   471.391683         



In [8]:
# Predict on the test features, floor negative predictions at 0, and replace
# the index with a fresh 0..n-1 range named 'id'.
y_pred = (
    predictor.predict(X_test_estimated_b)
    .clip(lower=0)
    .reset_index(drop=True)
    .rename_axis('id')
)

In [9]:
# Turn the prediction Series into a one-column frame and write it to disk;
# the index (named 'id' by the prediction cell) becomes the first CSV column.
df = y_pred.to_frame()
df.to_csv('result_b.csv')