In [2]:
import optuna
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
%matplotlib inline
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE

In [3]:
train_a = pd.read_parquet('./data/A/train_targets.parquet')
train_b = pd.read_parquet('./data/B/train_targets.parquet')
train_c = pd.read_parquet('./data/C/train_targets.parquet')

X_train_estimated_a = pd.read_parquet('./data/A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('./data/B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('./data/C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('./data/A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('./data/B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('./data/C/X_train_observed.parquet')

X_test_estimated_a = pd.read_parquet('./data/A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('./data/B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('./data/C/X_test_estimated.parquet')

X_test_estimated_b['date_forecast'] = pd.to_datetime(X_test_estimated_b['date_forecast'])
X_test_estimated_b = X_test_estimated_b[X_test_estimated_b['date_forecast'].dt.minute == 0]

imputer = SimpleImputer(strategy='most_frequent')
X_test_estimated_b[['ceiling_height_agl:m', 'cloud_base_agl:m']] = imputer.fit_transform(X_test_estimated_b[['ceiling_height_agl:m', 'cloud_base_agl:m']])

In [4]:
# fig, axs = plt.subplots(3, 1, figsize=(40, 20), sharex=True)
# train_a[['time', 'pv_measurement']].set_index('time').plot(ax=axs[0], title='Train/Test  A', color='red')
# train_b[['time', 'pv_measurement']].set_index('time').plot(ax=axs[1], title='Train/Test  B', color='red')
# train_c[['time', 'pv_measurement']].set_index('time').plot(ax=axs[2], title='Train/Test  C', color='red')

In [5]:
df = pd.concat([X_train_observed_b, X_train_estimated_b])

In [6]:
# def remove_highly_correlated_features(df, threshold):
#     """
#     Remove features from a DataFrame that have an absolute correlation higher than a given threshold.
# 
#     Parameters:
#     - df (Pandas DataFrame): The input DataFrame.
#     - threshold (float): The correlation threshold for feature removal.
# 
#     Returns:
#     - Pandas DataFrame: A new DataFrame with highly correlated features removed.
#     """
#     # Compute the Pearson correlation matrix
#     correlation_matrix = df.corr(method='pearson')
# 
#     # Initialize an empty list to hold features to be removed
#     features_to_remove = []
# 
#     # Traverse the correlation matrix to find highly correlated features
#     for i in range(len(correlation_matrix.columns)):
#         for j in range(i+1, len(correlation_matrix.columns)):
#             feature1 = correlation_matrix.columns[i]
#             feature2 = correlation_matrix.columns[j]
# 
#             # Check for high absolute correlation
#             if abs(correlation_matrix.iloc[i, j]) > threshold:
#                 # Add one of the features to the list if it's not already there
#                 if feature1 not in features_to_remove and feature2 not in features_to_remove:
#                     features_to_remove.append(feature1)
# 
#     # Drop the identified features from the DataFrame
#     filtered_df = df.drop(columns=features_to_remove)
# 
#     return filtered_df
# 
# threshold = 0.99
# df = remove_highly_correlated_features(df, threshold)

In [7]:
def remove_highly_correlated_features_spearman(df, threshold):
    # Compute the Spearman correlation matrix
    correlation_matrix = df.corr(method='spearman')

    # Initialize an empty list to hold features to be removed
    features_to_remove = []

    # Traverse the correlation matrix to find highly correlated features
    for i in range(len(correlation_matrix.columns)):
        for j in range(i+1, len(correlation_matrix.columns)):
            feature1 = correlation_matrix.columns[i]
            feature2 = correlation_matrix.columns[j]

            # Check for high absolute correlation
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                # Add one of the features to the list if it's not already there
                if feature1 not in features_to_remove and feature2 not in features_to_remove:
                    features_to_remove.append(feature1)

    # Drop the identified features from the DataFrame
    filtered_df = df.drop(columns=features_to_remove)

    return filtered_df

threshold = 0.99
df = remove_highly_correlated_features_spearman(df, threshold)

In [8]:
df = pd.merge(df, train_b, left_on='date_forecast', right_on='time', how='inner')
df = df.drop(columns=['date_forecast', 'date_calc', 'snow_density:kgm3', 'snow_drift:idx', 'diffuse_rad_1h:J', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_24h:cm'])

imputer = SimpleImputer(strategy='most_frequent')
df[['ceiling_height_agl:m', 'cloud_base_agl:m', 'pv_measurement']] = imputer.fit_transform(df[['ceiling_height_agl:m', 'cloud_base_agl:m', 'pv_measurement']])

In [9]:
split_index = int(len(df) * 0.7)
train_data = df[:split_index]
test_data = df[split_index:]

# # Plotting the data
# plt.figure(figsize=(15, 5))
# 
# # Plot training data
# plt.plot(np.arange(0, split_index), train_data, label='Training Data', color='blue')
# 
# # Plot testing data
# plt.plot(np.arange(split_index, len(df)), test_data, label='Testing Data', color='red')
# 
# # Adding a vertical line to indicate the split point
# plt.axvline(x=split_index, color='gray', linestyle='--', label='Split Point')
# 
# plt.title('Training and Testing Data Split')
# plt.xlabel('Index')
# plt.ylabel('Value')
# plt.show()

In [10]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(train_data.drop(columns=['pv_measurement', 'time']), train_data['pv_measurement'])

# Make predictions on the test set
y_pred = xgb_model.predict(test_data.drop(columns=['pv_measurement', 'time']))
y_pred = np.maximum(y_pred, 0)
# Calculate MAE on the test set
mae = mean_absolute_error(test_data['pv_measurement'], y_pred)
mae

31.27941339498111

In [11]:
def find_long_constant_periods(data, threshold):
    start = None
    segments = []
    for i in range(1, len(data)):
        if data[i] == data[i-1]:
            if start is None:
                start = i-1
        else:
            if start is not None:
                if (i - start) > threshold:
                    segments.append((start, i))
                start = None
    return segments

segments = find_long_constant_periods(train_b['pv_measurement'], threshold=30)

In [12]:
def remove_constant_periods(df, segments):
    drop_indices = []
    for start, end in segments:
        drop_indices.extend(range(start, end))
    return df.drop(drop_indices)

df = remove_constant_periods(df, segments)

In [13]:
split_date = '2022-10-27'

# Convert the 'time' column to a datetime object
df['time'] = pd.to_datetime(df['time'])

# Sorting the data by the 'time' column to maintain chronological order
df.sort_values('time', inplace=True)

# Splitting the data into training and test sets based on the split date
train_df = df[df['time'] < split_date]
test_df = df[df['time'] >= split_date]

# Identifying the features and the target variable
X_train = train_df.drop(columns=['pv_measurement', 'time'])
y_train = train_df['pv_measurement']
X_test = test_df.drop(columns=['pv_measurement', 'time'])
y_test = test_df['pv_measurement']

In [14]:
# xgb_model = xgb.XGBRegressor()
# # Create RFE model and select the top 10 features
# rfe = RFE(xgb_model, n_features_to_select=21)
# fit = rfe.fit(X_train, y_train)
# 
# selected_columns = X_train.columns[rfe.support_]
# 
# # Train and test using only selected features
# X_train = rfe.transform(X_train)
# X_test = rfe.transform(X_test)
# 
# xgb_model.fit(X_train, y_train)
# y_pred = xgb_model.predict(X_test)
# y_pred = np.maximum(y_pred, 0)
# mae = mean_absolute_error(y_test, y_pred)
# mae

17.477934253617054

In [15]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)
y_pred = np.maximum(y_pred, 0)
# Calculate MAE on the test set
mae = mean_absolute_error(y_test, y_pred)
mae

17.477934253617054

In [173]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "n_estimators": 1000,
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train, verbose=False)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    return mae

In [174]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2023-10-19 18:30:25,941] A new study created in memory with name: no-name-ed0c9eb8-cd62-4e28-bd34-08e54f5b4244
[I 2023-10-19 18:30:34,133] Trial 0 finished with value: 47.67517272206098 and parameters: {'learning_rate': 0.0013814264759543838, 'max_depth': 10, 'subsample': 0.9395443358359329, 'colsample_bytree': 0.10099345137684806, 'min_child_weight': 8}. Best is trial 0 with value: 47.67517272206098.
[I 2023-10-19 18:30:40,319] Trial 1 finished with value: 16.371353639198503 and parameters: {'learning_rate': 0.01339705420216449, 'max_depth': 6, 'subsample': 0.5420367544073865, 'colsample_bytree': 0.545774847653938, 'min_child_weight': 3}. Best is trial 1 with value: 16.371353639198503.
[I 2023-10-19 18:30:43,533] Trial 2 finished with value: 20.15810486665535 and parameters: {'learning_rate': 0.002267665273282633, 'max_depth': 4, 'subsample': 0.13948096930288317, 'colsample_bytree': 0.6320119244308359, 'min_child_weight': 16}. Best is trial 1 with value: 16.371353639198503.
[I 2023

In [175]:
study.best_params

{'learning_rate': 0.005354911079208287,
 'max_depth': 10,
 'subsample': 0.6741937943069567,
 'colsample_bytree': 0.8902317796735592,
 'min_child_weight': 7}

In [17]:
X_test_estimated_b = X_test_estimated_b.drop(columns=['date_forecast', 'date_calc', 'snow_density:kgm3', 'snow_drift:idx', 'diffuse_rad_1h:J', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_24h:cm'])

df['time'] = pd.to_datetime(df['time'])
df.sort_values('time', inplace=True)
train_df = df
X_train = train_df
y_train = df['pv_measurement']

xgb_model = xgb.XGBRegressor(
    learning_rate = 0.005354,
    max_depth = 10,
    subsample = 0.67419,
    colsample_bytree = 0.89023,
    min_child_weight = 7,
    n_estimators = 1000
)

xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test_estimated_b)
y_pred = np.maximum(y_pred, 0)

In [18]:
df = pd.DataFrame(y_pred)
df.to_csv('result_b.csv')

In [19]:
df_a = pd.read_csv('result_a.csv')
df_b = pd.read_csv('result_b.csv')
df_c = pd.read_csv('result_c.csv')

In [20]:
concatenated_df = pd.concat([df_a, df_b, df_c])

In [22]:
concatenated_df.to_csv('result.csv', index=False)