In [6]:
%matplotlib inline
from utils import *

In [7]:
# Targets for location A; the merge below joins on its 'time' column.
train_a = pd.read_parquet('./data/A/train_targets.parquet')

# Feature frames for location A (two training frames and one test frame).
X_train_estimated_a = pd.read_parquet('./data/A/X_train_estimated.parquet')
X_train_observed_a = pd.read_parquet('./data/A/X_train_observed.parquet')
X_test_estimated_a = pd.read_parquet('./data/A/X_test_estimated.parquet')

# Keep only the test rows that fall exactly on the hour.
X_test_estimated_a['date_forecast'] = pd.to_datetime(X_test_estimated_a['date_forecast'])
on_the_hour = X_test_estimated_a['date_forecast'].dt.minute == 0
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so later in-place column assignments on it do not raise SettingWithCopyWarning.
X_test_estimated_a = X_test_estimated_a[on_the_hour].copy()

# Stack both training feature frames, then keep only the rows whose
# 'date_forecast' has a matching 'time' in the targets (inner join).
df = pd.concat([X_train_observed_a, X_train_estimated_a])
df = pd.merge(df, train_a, left_on='date_forecast', right_on='time', how='inner')

# Drop the join key duplicated by 'time', the calculation timestamp, and the
# snow/wind columns that are excluded from the feature set.
df = df.drop(columns=[
    'date_forecast',
    'date_calc',
    'snow_density:kgm3',
    'snow_drift:idx',
    'fresh_snow_1h:cm',
    'fresh_snow_3h:cm',
    'fresh_snow_6h:cm',
    'fresh_snow_24h:cm',
    'snow_depth:cm',
    'snow_melt_10min:mm',
    'snow_water:kgm2',
    'wind_speed_w_1000hPa:ms',
])

In [8]:
# Fill missing values in the two cloud-height columns with the most frequent
# value of each column.
cloud_height_columns = ['ceiling_height_agl:m', 'cloud_base_agl:m']
imputer = SimpleImputer(strategy='most_frequent')

# Learn the fill values from the training frame only.
df[cloud_height_columns] = imputer.fit_transform(df[cloud_height_columns])

# Apply the already-fitted imputer to the test frame. Re-fitting on the test
# frame would fill train and test with different values, and fit_transform
# would drop a column that is entirely missing in the test frame, breaking
# this two-column assignment.
X_test_estimated_a[cloud_height_columns] = imputer.transform(X_test_estimated_a[cloud_height_columns])

# Use the same timestamp column name as the training frame.
X_test_estimated_a = X_test_estimated_a.rename(columns={'date_forecast': 'time'})

In [9]:
# NOTE(review): `threshold` is assigned here but never read in this cell; the
# call below passes its own threshold=5. Confirm whether it is still needed.
threshold = 0.98

# Columns passed to lag_features_by_one_hour for both frames.
lagged_columns = ('diffuse_rad_1h:J', 'direct_rad_1h:J', 'clear_sky_energy_1h:J')

# --- Training frame ---
# Segments are located on the raw target series and then removed from the
# merged frame.
# NOTE(review): presumably the segments index into train_a while df has been
# re-indexed by the merge; verify that remove_constant_periods aligns them.
segments = find_long_constant_periods(train_a['pv_measurement'], threshold=5)
df = remove_constant_periods(df, segments)
df = is_estimated(lag_features_by_one_hour(df, list(lagged_columns)))

# --- Test frame ---
X_test_estimated_a = is_estimated(
    lag_features_by_one_hour(X_test_estimated_a, list(lagged_columns))
)

# Restrict the test frame to the columns it shares with the training frame.
common_columns = df.columns.intersection(X_test_estimated_a.columns)
X_test_estimated_a = X_test_estimated_a.loc[:, common_columns]

In [10]:
# # Define the split date
# split_date = '2022-10-27'
# 
# # Convert the 'time' column to a datetime object
# df['time'] = pd.to_datetime(df['time'])
# 
# # Sorting the data by the 'time' column to maintain chronological order
# df.sort_values('time', inplace=True)
# 
# # Splitting the data into training and test sets based on the split date
# train_df = df[df['time'] < split_date]
# test_df = df[df['time'] >= split_date]
# 
# # Identifying the features and the target variable
# X_train = train_df.drop(columns=['pv_measurement', 'time'])
# y_train = train_df['pv_measurement']
# X_test = test_df.drop(columns=['pv_measurement', 'time'])
# y_test = test_df['pv_measurement']

In [11]:
# Boundaries of the chronological split: rows before train_end_date are used
# for training, rows in [train_end_date, validation_end_date) for validation,
# and rows from validation_end_date onwards for testing.
train_end_date = '2022-10-21'
# 2023-01-29
validation_end_date = '2023-01-29'
# 2023-03-16

# Convert 'time' column to datetime, if not already
df['time'] = pd.to_datetime(df['time'])

# Split the data into disjoint training, validation, and testing sets.
train_df = df[df['time'] < train_end_date]
validation_df = df[(df['time'] >= train_end_date) & (df['time'] < validation_end_date)]
# The test period starts where validation ends. Filtering on train_end_date
# here would put every validation row into the test set as well.
test_df = df[df['time'] >= validation_end_date]

# Randomly sample data within these periods (assuming you want to keep the same data structure)
# train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
# validation_df = validation_df.sample(frac=1, random_state=42).reset_index(drop=True)
# test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate the features from the target; the timestamp is not used as a feature.
X_train = train_df.drop(columns=['pv_measurement', 'time'])
y_train = train_df['pv_measurement']
X_val = validation_df.drop(columns=['pv_measurement', 'time'])
y_val = validation_df['pv_measurement']
X_test = test_df.drop(columns=['pv_measurement', 'time'])
y_test = test_df['pv_measurement']

In [12]:
# Combine training and validation data into a single dataset for AutoGluon
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# Specify the name of the target variable
label = 'pv_measurement'

# Create a TabularPredictor object
predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error").fit(train_data=train_data, tuning_data=val_data, presets='medium_quality', use_bag_holdout=True)

No path specified. Models will be saved in: "AutogluonModels\ag-20231024_062722\"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231024_062722\"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   951.46 GB / 2047.46 GB (46.5%)
Train Data Rows:    29666
Train Data Columns: 36
Tuning Data Rows:    2187
Tuning Data Columns: 36
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 674.16825, 1195.54547)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generat

In [13]:
# Print AutoGluon's summary of the fit (the per-model table shown in this
# cell's output) and keep the returned value in `results`.
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                  model   score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -46.426989       0.082468  61.864709                0.000000           0.204802            2       True         12
1        NeuralNetTorch  -47.387622       0.019557  50.397829                0.019557          50.397829            1       True         10
2         ExtraTreesMSE  -52.090404       0.096002   2.265758                0.096002           2.265758            1       True          7
3               XGBoost  -52.592244       0.004002   0.455328                0.004002           0.455328            1       True          9
4       RandomForestMSE  -52.613441       0.062910  11.262078                0.062910          11.262078            1       True          5
5         LightGBMLarge  -53.648352       0.004000   1.705070                0.004000           1.



In [14]:
# Predict on the prepared test frame, floor the predictions at zero, renumber
# the rows from 0, and name the resulting index 'id'.
y_pred = (
    predictor.predict(X_test_estimated_a)
    .clip(lower=0)
    .reset_index(drop=True)
    .rename_axis('id')
)

In [15]:
# Wrap the predictions in a one-column frame and write it, index included, to
# the result file.
# NOTE(review): this rebinds `df`, which earlier cells use for the training
# frame; re-running those cells after this one needs a fresh load first.
df = y_pred.to_frame()
df.to_csv('result_a.csv')