In [14]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from scipy.spatial import distance
%matplotlib inline
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
import optuna
from sklearn.feature_selection import RFE

In [15]:
train_a = pd.read_parquet('./data/A/train_targets.parquet')
train_b = pd.read_parquet('./data/B/train_targets.parquet')
train_c = pd.read_parquet('./data/C/train_targets.parquet')

X_train_estimated_a = pd.read_parquet('./data/A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('./data/B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('./data/C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('./data/A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('./data/B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('./data/C/X_train_observed.parquet')

X_test_estimated_a = pd.read_parquet('./data/A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('./data/B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('./data/C/X_test_estimated.parquet')

X_test_estimated_a['date_forecast'] = pd.to_datetime(X_test_estimated_a['date_forecast'])
X_test_estimated_a = X_test_estimated_a[X_test_estimated_a['date_forecast'].dt.minute == 0]

imputer = SimpleImputer(strategy='most_frequent')
X_test_estimated_a[['ceiling_height_agl:m', 'cloud_base_agl:m']] = imputer.fit_transform(X_test_estimated_a[['ceiling_height_agl:m', 'cloud_base_agl:m']])

In [16]:
df = pd.concat([X_train_observed_a, X_train_estimated_b])

In [17]:
def remove_highly_correlated_features(df, threshold):
    # Compute the Pearson correlation matrix
    correlation_matrix = df.corr(method='pearson')

    # Initialize an empty list to hold features to be removed
    features_to_remove = []

    # Traverse the correlation matrix to find highly correlated features
    for i in range(len(correlation_matrix.columns)):
        for j in range(i+1, len(correlation_matrix.columns)):
            feature1 = correlation_matrix.columns[i]
            feature2 = correlation_matrix.columns[j]

            # Check for high absolute correlation
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                # Add one of the features to the list if it's not already there
                if feature1 not in features_to_remove and feature2 not in features_to_remove:
                    features_to_remove.append(feature1)

    # Drop the identified features from the DataFrame
    filtered_df = df.drop(columns=features_to_remove)

    return filtered_df

# Example usage
threshold = 0.98
df = remove_highly_correlated_features(df, threshold)

In [18]:
df = pd.merge(df, train_a, left_on='date_forecast', right_on='time', how='inner')
df = df.drop(columns=['date_forecast', 'date_calc', 'snow_density:kgm3', 'snow_drift:idx', 'diffuse_rad_1h:J', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_24h:cm'])

imputer = SimpleImputer(strategy='most_frequent')
df[['ceiling_height_agl:m', 'cloud_base_agl:m']] = imputer.fit_transform(df[['ceiling_height_agl:m', 'cloud_base_agl:m']])

In [19]:
# Define the split date
split_date = '2022-10-27'

# Convert the 'time' column to a datetime object
df['time'] = pd.to_datetime(df['time'])

# Sorting the data by the 'time' column to maintain chronological order
df.sort_values('time', inplace=True)

# Splitting the data into training and test sets based on the split date
train_df = df[df['time'] < split_date]
test_df = df[df['time'] >= split_date]

# Identifying the features and the target variable
X_train = train_df.drop(columns=['pv_measurement', 'time'])
y_train = train_df['pv_measurement']
X_test = test_df.drop(columns=['pv_measurement', 'time'])
y_test = test_df['pv_measurement']

In [20]:
# xgb_model = xgb.XGBRegressor()
# # Create RFE model and select the top 10 features
# rfe = RFE(xgb_model, n_features_to_select=20)
# fit = rfe.fit(X_train, y_train)
# 
# selected_columns = X_train.columns[rfe.support_]
# 
# # Train and test using only selected features
# X_train = rfe.transform(X_train)
# X_test = rfe.transform(X_test)
# 
# xgb_model.fit(X_train, y_train)
# y_pred = xgb_model.predict(X_test)
# y_pred = np.maximum(y_pred, 0)
# mae = mean_absolute_error(y_test, y_pred)
# mae

In [21]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)
y_pred = np.maximum(y_pred, 0)
# Calculate MAE on the test set
mae = mean_absolute_error(y_test, y_pred)
mae

128.25668432112474

In [22]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "n_estimators": 1000,
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train, verbose=False)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    return mae

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2023-10-19 20:37:24,181] A new study created in memory with name: no-name-2e3b5d1b-c9af-47a6-a32f-ef07cee11699
[I 2023-10-19 20:37:54,823] Trial 0 finished with value: 123.72761342679104 and parameters: {'learning_rate': 0.003861220114813485, 'max_depth': 10, 'subsample': 0.2290239271209744, 'colsample_bytree': 0.9187815102498904, 'min_child_weight': 2}. Best is trial 0 with value: 123.72761342679104.
[I 2023-10-19 20:38:06,260] Trial 1 finished with value: 147.40877115203085 and parameters: {'learning_rate': 0.0409679428065436, 'max_depth': 6, 'subsample': 0.14188982050809595, 'colsample_bytree': 0.2969461361195008, 'min_child_weight': 11}. Best is trial 0 with value: 123.72761342679104.
[I 2023-10-19 20:38:19,288] Trial 2 finished with value: 136.05279561456652 and parameters: {'learning_rate': 0.03751499260109972, 'max_depth': 4, 'subsample': 0.4208229550224095, 'colsample_bytree': 0.7525800042529615, 'min_child_weight': 9}. Best is trial 0 with value: 123.72761342679104.
[I 2023

In [None]:
study.best_params

In [None]:
X_test_estimated_a = X_test_estimated_a.drop(columns=['date_forecast', 'date_calc', 'snow_density:kgm3', 'snow_drift:idx', 'diffuse_rad_1h:J', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm', 'fresh_snow_24h:cm'])


df['time'] = pd.to_datetime(df['time'])
df.sort_values('time', inplace=True)
train_df = df
X_train = train_df
y_train = df['pv_measurement']

xgb_model = xgb.XGBRegressor(
    learning_rate = 0.006827,
    max_depth = 9,
    subsample = 0.72558,
    colsample_bytree = 0.9398,
    min_child_weight = 17,
    n_estimators = 1000
)

xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test_estimated_a)
y_pred = np.maximum(y_pred, 0)

In [None]:
df = pd.DataFrame(y_pred)
df.to_csv('result_a.csv')