In [None]:
from sklearn.model_selection import train_test_split
%matplotlib inline
from utils import *
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Read the location-C parquet files: the target table plus the three
# weather-feature tables (train observed, train estimated, test estimated).
train_c = pd.read_parquet('./data/C/train_targets.parquet')
X_train_observed_c = pd.read_parquet('./data/C/X_train_observed.parquet')
X_train_estimated_c = pd.read_parquet('./data/C/X_train_estimated.parquet')
X_test_estimated_c = pd.read_parquet('./data/C/X_test_estimated.parquet')

# Stack the two training feature tables (observed rows first) and pass both
# the training and test features through the same resample_to_hourly helper.
df = resample_to_hourly(pd.concat([X_train_observed_c, X_train_estimated_c]))
X_test_estimated_c = resample_to_hourly(X_test_estimated_c)

# Inner-join features to targets on their timestamp columns, so only rows
# present in both tables survive, then drop the two unused columns from the
# training and test frames alike.
df = df.merge(
    train_c, left_on='date_forecast', right_on='time', how='inner'
).drop(columns=['snow_density:kgm3', 'elevation:m'])
X_test_estimated_c = X_test_estimated_c.drop(columns=['snow_density:kgm3', 'elevation:m'])

In [None]:
# Columns whose missing values are filled by model-based imputation.
cols_to_impute = ['ceiling_height_agl:m', 'cloud_base_agl:m']

# Fit the imputer ONCE, on the training features only, and reuse that fitted
# model for the test features. Fitting a separate imputer on the test set
# (as was done before) means train and test are filled in by two different
# models and lets test-set statistics shape the test inputs.
imputer = IterativeImputer(max_iter=10, random_state=42)
df[cols_to_impute] = imputer.fit_transform(df[cols_to_impute])
X_test_estimated_c[cols_to_impute] = imputer.transform(X_test_estimated_c[cols_to_impute])

# Rows without a target value cannot be used for supervised training.
df = df.dropna(subset=['pv_measurement'])

In [None]:
# Locate long constant runs in the raw target series and remove them from the
# training frame, then add the estimated-flag and solar features.
# All helpers here come from `utils` and are not visible in this notebook.
# NOTE(review): `segments` is computed on `train_c`, but applied to `df`, which
# has been merged and had rows dropped since -- confirm that
# remove_constant_periods matches rows by timestamp rather than by index/position.
segments = find_long_constant_periods(train_c['pv_measurement'], threshold=5)
df = remove_constant_periods(df, segments)
df = is_estimated(df)
df = generate_solar_features_3(df)

# Apply the same feature steps to the test features.
# NOTE(review): is_estimated is called with an explicit 'date_forecast' argument
# here but with its default above -- presumably the default refers to a different
# timestamp column (e.g. 'time'); verify both calls produce the same feature.
X_test_estimated_c = is_estimated(X_test_estimated_c, 'date_forecast')
X_test_estimated_c = generate_solar_features_3(X_test_estimated_c)

In [None]:
# Everything strictly before this date goes to training; rows from this date
# onward are split 50/50 at random between training and validation.
train_end_date = '2022-10-21'
df['time'] = pd.to_datetime(df['time'])

# Use complementary masks (< and >=) so the row stamped exactly at the cutoff
# (midnight of train_end_date) is not silently dropped from both partitions.
train_df = df[df['time'] < train_end_date]
remaining_data = df[df['time'] >= train_end_date]

# Fixed random_state keeps the split reproducible across kernel restarts.
train_data, validation_df = train_test_split(remaining_data, test_size=0.5, random_state=42)
train_df = pd.concat([train_df, train_data], ignore_index=True)

# Separate the features from the target; the timestamp columns are not model inputs.
non_feature_cols = ['pv_measurement', 'time', 'date_forecast']
X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df['pv_measurement']
X_val = validation_df.drop(columns=non_feature_cols)
y_val = validation_df['pv_measurement']

In [None]:
# Name of the target column the predictor is asked to learn.
label = 'pv_measurement'

# The predictor takes the label as a column of the input frame, so attach the
# target back onto the training and validation feature frames.
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# Build the predictor (scored by mean absolute error), then fit it using the
# validation frame as tuning data.
predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error")
predictor = predictor.fit(
    train_data=train_data,
    tuning_data=val_data,
    presets='medium_quality',
)
# Optional fit() arguments left disabled: num_gpus=1, num_stack_levels=0, use_bag_holdout=True

In [None]:
# Summarise the fit of the predictor trained on all features and keep the result.
results = predictor.fit_summary()

In [None]:
# Score each feature on the validation frame. The result is indexed by feature
# name and carries an 'importance' column, which the next cell thresholds.
# NOTE(review): the same validation data is later used as tuning data for the
# refit, so the selection is not independent of it.
feature_importance = predictor.feature_importance(val_data)

In [None]:
# Keep only the features whose importance score is above 0.2.
important_rows = feature_importance.loc[feature_importance['importance'] > 0.2]
best_features = important_rows.index.tolist()

label = 'pv_measurement'

# Narrow both feature frames to the selected columns, then re-attach the target.
X_train = X_train.loc[:, best_features]
X_val = X_val.loc[:, best_features]
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)

# Refit a fresh predictor on the reduced feature set with the same settings.
predictor = TabularPredictor(label=label, eval_metric="mean_absolute_error")
predictor = predictor.fit(
    train_data=train_data,
    tuning_data=val_data,
    presets='medium_quality',
)
# Optional fit() arguments left disabled: num_gpus=1, num_stack_levels=0, use_bag_holdout=True

In [None]:
# Summarise the refit on the selected features (overwrites the earlier `results`).
results = predictor.fit_summary()

In [None]:
# Restrict the test features to the columns the refitted predictor was trained on.
X_test_estimated_c = X_test_estimated_c.loc[:, best_features]

# Predict, floor negative outputs at zero, and renumber the rows from 0 under
# an index named 'id'.
y_pred = (
    predictor.predict(X_test_estimated_c)
    .clip(lower=0)
    .reset_index(drop=True)
    .rename_axis('id')
)

In [None]:
# Wrap the predictions in a single-column frame and write them to CSV; the
# index is written as the first column.
# Note: this rebinds `df`, which earlier cells use for the training frame.
df = pd.DataFrame(data=y_pred)
df.to_csv(path_or_buf='result_c.csv')