In [None]:
%matplotlib inline
from utils import *

In [None]:
# Load the location-B target series and the three weather-feature tables.
train_b = pd.read_parquet('./data/B/train_targets.parquet')

X_train_estimated_b = pd.read_parquet('./data/B/X_train_estimated.parquet')
X_train_observed_b = pd.read_parquet('./data/B/X_train_observed.parquet')
X_test_estimated_b = pd.read_parquet('./data/B/X_test_estimated.parquet')

# Keep only the on-the-hour test rows. The explicit .copy() makes the filtered
# frame independent of the original, so the column assignments further down do
# not raise SettingWithCopyWarning or write into a temporary.
# NOTE(review): the training features are NOT filtered to minute == 0 here —
# confirm the lag/aggregation helpers applied later treat both frames consistently.
X_test_estimated_b['date_forecast'] = pd.to_datetime(X_test_estimated_b['date_forecast'])
X_test_estimated_b = X_test_estimated_b[X_test_estimated_b['date_forecast'].dt.minute == 0].copy()

# Training frame: observed + estimated features joined to the targets on timestamp.
# The inner join keeps only feature rows whose timestamp has a matching target row.
df = pd.concat([X_train_observed_b, X_train_estimated_b])
df = pd.merge(df, train_b, left_on='date_forecast', right_on='time', how='inner')
df = df.drop(columns=['date_forecast', 'date_calc', 'snow_density:kgm3', 'snow_drift:idx', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_24h:cm'])

# Fill gaps in the test cloud columns with the most frequent values seen in the
# TRAINING features. Fitting the imputer on the test set itself (as before) makes
# the fill values depend on the data being predicted and lets them drift from the
# values the model was trained on. `df` itself is left untouched here.
cloud_columns = ['ceiling_height_agl:m', 'cloud_base_agl:m']
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(df[cloud_columns])
X_test_estimated_b[cloud_columns] = imputer.transform(X_test_estimated_b[cloud_columns])

In [None]:
# Fill gaps in the training frame with each column's most frequent value.
# The test-set cloud columns are already imputed in the data-loading cell, so the
# second fit/transform on them that used to live here was a no-op and is dropped.
# NOTE(review): 'pv_measurement' is the prediction target; filling missing targets
# with the mode fabricates labels — confirm this is intended rather than dropping
# those rows.
imputer = SimpleImputer(strategy='most_frequent')
df[['ceiling_height_agl:m', 'cloud_base_agl:m', 'pv_measurement']] = imputer.fit_transform(df[['ceiling_height_agl:m', 'cloud_base_agl:m', 'pv_measurement']])

# Give the test timestamp column the same name ('time') the training frame uses.
X_test_estimated_b = X_test_estimated_b.rename(columns={'date_forecast': 'time'})

In [None]:
# Cutoff for the currently disabled remove_highly_correlated_features step below.
# It is unrelated to the threshold=5 argument passed to find_long_constant_periods.
threshold = 0.99

# Columns handed to lag_features_by_one_hour for both the training and test frames.
lagged_columns = ('diffuse_rad_1h:J', 'direct_rad_1h:J', 'clear_sky_energy_1h:J')

# Training frame: drop the constant-output stretches found in the raw target series,
# then add the lagged features and the estimated/observed indicator.
# NOTE(review): segments are located on train_b but removed from the merged df —
# confirm remove_constant_periods matches rows in a way that survives the merge.
segments = find_long_constant_periods(train_b['pv_measurement'], threshold=5)
df = remove_constant_periods(df, segments)
df = is_estimated(lag_features_by_one_hour(df, list(lagged_columns)))
# df = remove_highly_correlated_features(df, threshold)

# Test frame: same feature steps, then keep only the columns it shares with df.
X_test_estimated_b = is_estimated(lag_features_by_one_hour(X_test_estimated_b, list(lagged_columns)))
common_columns = df.columns.intersection(X_test_estimated_b.columns)
X_test_estimated_b = X_test_estimated_b.loc[:, common_columns]

In [None]:
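# NOTE: disabled two-way split, kept for reference only; the next cell performs the train/validation/test split that is actually used.
# # Define the split date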
# split_date = '2022-10-27'
# 
# # Convert the 'time' column to a datetime object
# df['time'] = pd.to_datetime(df['time'])
# 
# # Sorting the data by the 'time' column to maintain chronological order
# df.sort_values('time', inplace=True)
# 
# # Splitting the data into training and test sets based on the split date
# train_df = df[df['time'] < split_date]
# test_df = df[df['time'] >= split_date]
# 
# # Identifying the features and the target variable
# X_train = train_df.drop(columns=['pv_measurement', 'time'])
# y_train = train_df['pv_measurement']
# X_test = test_df.drop(columns=['pv_measurement', 'time'])
# y_test = test_df['pv_measurement']

In [None]:
# Boundaries handed to split_df_on_date: the first separates training from the
# rest, the second separates validation from the final test period.
train_end_date = '2022-10-21'
# 2023-01-29
validation_end_date = '2023-01-29'
# 2023-03-16


def _shuffle_then_hourly(frame):
    """Shuffle rows with a fixed seed, reset the index, then apply mean_of_the_hour."""
    # NOTE(review): mean_of_the_hour is an opaque helper from utils; confirm the
    # preceding shuffle does not affect its result.
    shuffled = frame.sample(frac=1, random_state=42).reset_index(drop=True)
    return mean_of_the_hour(shuffled)


def _features_and_target(frame):
    """Return (features, target): everything except target/time, and the target column."""
    return frame.drop(columns=['pv_measurement', 'time']), frame['pv_measurement']


# Chronological three-way split of the training frame.
train_df, val_and_test_df = split_df_on_date(df, train_end_date)
validation_df, test_df = split_df_on_date(val_and_test_df, validation_end_date)

# Same shuffle + aggregation treatment for every split.
train_df = _shuffle_then_hourly(train_df)
validation_df = _shuffle_then_hourly(validation_df)
test_df = _shuffle_then_hourly(test_df)

# Separate model inputs from the target for each split.
X_train, y_train = _features_and_target(train_df)
X_val, y_val = _features_and_target(validation_df)
X_test, y_test = _features_and_target(test_df)

In [None]:
# Name of the target column the predictor learns.
label = 'pv_measurement'

# Re-attach the target to the features of each split, since fit() is given one
# frame per split; training and validation stay separate frames.
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# Build the predictor (optimising mean absolute error), then fit it on the
# training frame with the validation frame supplied as tuning data.
predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error")
predictor = predictor.fit(
    train_data=train_data,
    tuning_data=val_data,
    presets='medium_quality',
    use_bag_holdout=True,
)

In [None]:
# Hold-out check: mean absolute error of the fitted predictor on the test split.
from sklearn.metrics import mean_absolute_error

predictions = predictor.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
mae

In [None]:
# Ask the fitted predictor for its training summary (with plotting enabled) and keep the returned object.
results = predictor.fit_summary(show_plot=True)

In [None]:
# Apply the same mean_of_the_hour helper to the submission features that the
# training splits went through.
# NOTE(review): this reassigns X_test_estimated_b, so re-running the cell applies
# the helper a second time — confirm that is harmless.
X_test_estimated_b = mean_of_the_hour(X_test_estimated_b)

# Predict, clamp negative predictions to zero, and renumber the rows from 0 so
# the index can be written out as the 'id' column.
y_pred = (
    predictor.predict(X_test_estimated_b)
    .clip(lower=0)
    .reset_index(drop=True)
)
y_pred.index.name = 'id'

In [None]:
# Write the predictions to disk, with the index written as the first column.
# A dedicated name is used instead of rebinding `df`: `df` is the training frame
# used by earlier cells, and overwriting it with the submission meant re-running
# any of those cells after this one silently operated on the wrong data.
submission_b = pd.DataFrame(y_pred)
submission_b.to_csv('result_b.csv')