In [17]:
import optuna
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
%matplotlib inline
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE

In [18]:
train_a = pd.read_parquet('./data/A/train_targets.parquet')
train_b = pd.read_parquet('./data/B/train_targets.parquet')
train_c = pd.read_parquet('./data/C/train_targets.parquet')

X_train_estimated_a = pd.read_parquet('./data/A/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('./data/C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('./data/A/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('./data/C/X_train_observed.parquet')

X_test_estimated_a = pd.read_parquet('./data/A/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('./data/C/X_test_estimated.parquet')

X_test_estimated_c['date_forecast'] = pd.to_datetime(X_test_estimated_c['date_forecast'])
X_test_estimated_c = X_test_estimated_c[X_test_estimated_c['date_forecast'].dt.minute == 0]

imputer = SimpleImputer(strategy='most_frequent')
X_test_estimated_c[['ceiling_height_agl:m', 'cloud_base_agl:m']] = imputer.fit_transform(X_test_estimated_c[['ceiling_height_agl:m', 'cloud_base_agl:m']])

In [19]:
# fig, axs = plt.subplots(3, 1, figsize=(20, 10), sharex=True)
# train_a[['time', 'pv_measurement']].set_index('time').plot(ax=axs[0], title='Train/Test  A', color='red')
# train_b[['time', 'pv_measurement']].set_index('time').plot(ax=axs[1], title='Train/Test  B', color='red')
# train_c[['time', 'pv_measurement']].set_index('time').plot(ax=axs[2], title='Train/Test  C', color='red')

In [20]:
df = pd.concat([X_train_observed_c, X_train_estimated_c])

In [21]:
# def remove_highly_correlated_features(df, threshold):
#     """
#     Remove features from a DataFrame that have an absolute correlation higher than a given threshold.
# 
#     Parameters:
#     - df (Pandas DataFrame): The input DataFrame.
#     - threshold (float): The correlation threshold for feature removal.
# 
#     Returns:
#     - Pandas DataFrame: A new DataFrame with highly correlated features removed.
#     """
#     # Compute the Pearson correlation matrix
#     correlation_matrix = df.corr(method='pearson')
# 
#     # Initialize an empty list to hold features to be removed
#     features_to_remove = []
# 
#     # Traverse the correlation matrix to find highly correlated features
#     for i in range(len(correlation_matrix.columns)):
#         for j in range(i+1, len(correlation_matrix.columns)):
#             feature1 = correlation_matrix.columns[i]
#             feature2 = correlation_matrix.columns[j]
# 
#             # Check for high absolute correlation
#             if abs(correlation_matrix.iloc[i, j]) > threshold:
#                 # Add one of the features to the list if it's not already there
#                 if feature1 not in features_to_remove and feature2 not in features_to_remove:
#                     features_to_remove.append(feature1)
# 
#     # Drop the identified features from the DataFrame
#     filtered_df = df.drop(columns=features_to_remove)
# 
#     return filtered_df
# 
# # Example usage
# threshold = 0.99
# df = remove_highly_correlated_features(df, threshold)

In [22]:
# def remove_highly_correlated_features_spearman(df, threshold):
#     """
#     Remove features from a DataFrame that have an absolute Spearman correlation higher than a given threshold.
# 
#     Parameters:
#     - df (Pandas DataFrame): The input DataFrame.
#     - threshold (float): The correlation threshold for feature removal.
# 
#     Returns:
#     - Pandas DataFrame: A new DataFrame with highly correlated features removed.
#     """
#     # Compute the Spearman correlation matrix
#     correlation_matrix = df.corr(method='spearman')
# 
#     # Initialize an empty list to hold features to be removed
#     features_to_remove = []
# 
#     # Traverse the correlation matrix to find highly correlated features
#     for i in range(len(correlation_matrix.columns)):
#         for j in range(i+1, len(correlation_matrix.columns)):
#             feature1 = correlation_matrix.columns[i]
#             feature2 = correlation_matrix.columns[j]
# 
#             # Check for high absolute correlation
#             if abs(correlation_matrix.iloc[i, j]) > threshold:
#                 # Add one of the features to the list if it's not already there
#                 if feature1 not in features_to_remove and feature2 not in features_to_remove:
#                     features_to_remove.append(feature1)
# 
#     # Drop the identified features from the DataFrame
#     filtered_df = df.drop(columns=features_to_remove)
# 
#     return filtered_df
# 
# threshold = 0.95
# df = remove_highly_correlated_features_spearman(df, threshold)

In [23]:
df = pd.merge(df, train_c, left_on='date_forecast', right_on='time', how='inner')
df = df.drop(columns=['date_forecast', 'date_calc', 'snow_density:kgm3', 'snow_drift:idx', 'diffuse_rad_1h:J', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_24h:cm'])

imputer = SimpleImputer(strategy='most_frequent')
df[['ceiling_height_agl:m', 'cloud_base_agl:m', 'pv_measurement']] = imputer.fit_transform(df[['ceiling_height_agl:m', 'cloud_base_agl:m', 'pv_measurement']])

In [24]:
split_index = int(len(df) * 0.7)
train_data = df[:split_index]
test_data = df[split_index:]

# # Plotting the data
# plt.figure(figsize=(15, 5))
# 
# # Plot training data
# plt.plot(np.arange(0, split_index), train_data, label='Training Data', color='blue')
# 
# # Plot testing data
# plt.plot(np.arange(split_index, len(df)), test_data, label='Testing Data', color='red')
# 
# # Adding a vertical line to indicate the split point
# plt.axvline(x=split_index, color='gray', linestyle='--', label='Split Point')
# 
# plt.title('Training and Testing Data Split')
# plt.xlabel('Index')
# plt.ylabel('Value')
# plt.show()

In [25]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(train_data.drop(columns=['pv_measurement', 'time']), train_data['pv_measurement'])

# Make predictions on the test set
y_pred = xgb_model.predict(test_data.drop(columns=['pv_measurement', 'time']))
y_pred = np.maximum(y_pred, 0)
# Calculate MAE on the test set
mae = mean_absolute_error(test_data['pv_measurement'], y_pred)
mae

23.14241221518395

In [26]:
# def find_long_constant_periods(data, threshold):
#     start = None
#     segments = []
#     for i in range(1, len(data)):
#         if data[i] == data[i-1]:
#             if start is None:
#                 start = i-1
#         else:
#             if start is not None:
#                 if (i - start) > threshold:
#                     segments.append((start, i))
#                 start = None
#     return segments
# 
# # Load and prepare data
# train_b.columns = ['ds', 'y']
# 
# segments = find_long_constant_periods(train_b['y'], threshold=40)

In [27]:
# def remove_constant_periods(df, segments):
#     drop_indices = []
#     for start, end in segments:
#         drop_indices.extend(range(start, end))
#     return df.drop(drop_indices)
# 
# df = remove_constant_periods(df, segments)

In [28]:
split_date = '2022-10-27'

# Convert the 'time' column to a datetime object
df['time'] = pd.to_datetime(df['time'])

# Sorting the data by the 'time' column to maintain chronological order
df.sort_values('time', inplace=True)

# Splitting the data into training and test sets based on the split date
train_df = df[df['time'] < split_date]
test_df = df[df['time'] >= split_date]

# Identifying the features and the target variable
X_train = train_df.drop(columns=['pv_measurement', 'time'])
y_train = train_df['pv_measurement']
X_test = test_df.drop(columns=['pv_measurement', 'time'])
y_test = test_df['pv_measurement']

In [29]:
# xgb_model = xgb.XGBRegressor()
# # Create RFE model and select the top 10 features
# rfe = RFE(xgb_model, n_features_to_select=21)
# fit = rfe.fit(X_train, y_train)
# 
# selected_columns = X_train.columns[rfe.support_]
# 
# # Train and test using only selected features
# X_train_rfe = rfe.transform(X_train)
# X_test_rfe = rfe.transform(X_test)
# 
# xgb_model.fit(X_train_rfe, y_train)
# y_pred = xgb_model.predict(X_test_rfe)
# y_pred = np.maximum(y_pred, 0)
# mae = mean_absolute_error(y_test, y_pred)
# mae

20.71578408075066

In [30]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)
y_pred = np.maximum(y_pred, 0)
# Calculate MAE on the test set
mae = mean_absolute_error(y_test, y_pred)
mae

20.498028856135

In [123]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "n_estimators": 1000,
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train, verbose=False)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    return mae

In [71]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2023-10-19 18:40:30,446] A new study created in memory with name: no-name-0073e966-0450-41db-a7ac-7b431bbbe135
[I 2023-10-19 18:40:50,723] Trial 0 finished with value: 21.406244018910485 and parameters: {'learning_rate': 0.0011825862163086524, 'max_depth': 7, 'subsample': 0.44479566169738116, 'colsample_bytree': 0.972561228878958, 'min_child_weight': 13}. Best is trial 0 with value: 21.406244018910485.
[I 2023-10-19 18:40:55,305] Trial 1 finished with value: 25.039678252602656 and parameters: {'learning_rate': 0.010206535187711944, 'max_depth': 1, 'subsample': 0.5065468318015546, 'colsample_bytree': 0.188696173486114, 'min_child_weight': 5}. Best is trial 0 with value: 21.406244018910485.
[I 2023-10-19 18:41:06,035] Trial 2 finished with value: 20.48787642721524 and parameters: {'learning_rate': 0.01853290222495454, 'max_depth': 7, 'subsample': 0.2555982408344387, 'colsample_bytree': 0.20900624955281982, 'min_child_weight': 18}. Best is trial 2 with value: 20.48787642721524.
[I 2023

In [72]:
study.best_params

{'learning_rate': 0.004436874810837867,
 'max_depth': 5,
 'subsample': 0.5162088909470665,
 'colsample_bytree': 0.4570813439805998,
 'min_child_weight': 17}

In [31]:
X_test_estimated_c = X_test_estimated_c.drop(columns=['date_forecast', 'date_calc', 'snow_density:kgm3', 'snow_drift:idx', 'diffuse_rad_1h:J', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_24h:cm'])

In [32]:
df['time'] = pd.to_datetime(df['time'])
df.sort_values('time', inplace=True)
train_df = df
X_train = train_df
y_train = df['pv_measurement']

xgb_model = xgb.XGBRegressor(
    learning_rate = 0.0044368,
    max_depth = 5,
    subsample = 0.51621,
    colsample_bytree = 0.45708,
    min_child_weight = 17,
    n_estimators = 1000
)

xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test_estimated_c)
y_pred = np.maximum(y_pred, 0)

In [33]:
df = pd.DataFrame(y_pred)
df.to_csv('result_c.csv')