In [1]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit 
import optuna
import json
from sklearn.metrics import mean_absolute_error
# Load the prepared datasets.
# Feature matrices:
X_train = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/X_train.parquet')
X_test = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/X_test.parquet')
# Targets: one combined frame plus one frame per suffix (a, b, c).
y_train_c = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_c.parquet')
y_train_b = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_b.parquet')
y_train_a = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train_a.parquet')
y_train = pd.read_parquet('data/prepared_datasets/avg/no_duplicates/Y_train.parquet')

In [2]:
def test_train_split(df):
    dates_2 = (df.index >= '2023-04-01') & (df.index <= '2023-04-15')
    dates_1 = (df.index >= '2021-05-01') & (df.index <= '2021-08-01')

    test_set = df[dates_1 | dates_2]

    training_set = df[~(dates_1 | dates_2)]

    X_train = training_set.drop("pv_measurement", axis=1)
    y_train = training_set['pv_measurement']

    X_test = test_set.drop("pv_measurement", axis=1)
    y_test = test_set['pv_measurement'] 

    
    
    return X_train, X_test, y_train, y_test

def _split_location(location, y_location, keep_location=False):
    """Join one location's rows of ``X_train`` with its targets and split them.

    Selects the rows of ``X_train`` whose "location" column equals
    ``location``, optionally drops that column, concatenates ``y_location``
    column-wise and passes the result to ``test_train_split``.
    """
    features = X_train[X_train["location"] == location]
    if not keep_location:
        features = features.drop("location", axis=1)
    return test_train_split(pd.concat([features, y_location], axis=1))


# Per-location splits WITHOUT the "location" column.
X_train_new_a, X_test_new_a, y_train_new_a, y_test_a = _split_location("A", y_train_a)
X_train_new_b, X_test_new_b, y_train_new_b, y_test_b = _split_location("B", y_train_b)
X_train_new_c, X_test_new_c, y_train_new_c, y_test_c = _split_location("C", y_train_c)

# Backward-compatible alias: this frame was originally bound to the misspelt
# name `X_trainnew_a`, inconsistent with `X_train_new_b` / `X_train_new_c`.
X_trainnew_a = X_train_new_a

# Per-location splits that KEEP the "location" column. These rebind
# y_test_a / y_test_b / y_test_c, exactly as the original statements did.
X_train_loc_a, X_test_loc_a, y_train_loc_a, y_test_a = _split_location("A", y_train_a, keep_location=True)
X_train_loc_b, X_test_loc_b, y_train_loc_b, y_test_b = _split_location("B", y_train_b, keep_location=True)
X_train_loc_c, X_test_loc_c, y_train_loc_c, y_test_c = _split_location("C", y_train_c, keep_location=True)

# Stack the location-aware splits into combined train / test sets.
X_train_new = pd.concat([X_train_loc_a, X_train_loc_b, X_train_loc_c])
X_test_new = pd.concat([X_test_loc_a, X_test_loc_b, X_test_loc_c])
y_train_new = pd.concat([y_train_loc_a, y_train_loc_b, y_train_loc_c])
y_test = pd.concat([y_test_a, y_test_b, y_test_c])

In [4]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import optuna

def objective(trial, X_train_new, y_train_new, X_valid=None, y_valid=None):
    """Optuna objective: fit an XGBoost regressor and return its MAE.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyper-parameters below.
    X_train_new, y_train_new
        Training features and targets the model is fitted on. (The previous
        implementation ignored these arguments and always fitted on the
        module-level ``X_train_new_b`` / ``y_train_new_b``.)
    X_valid, y_valid : optional
        Features and targets the fitted model is scored on. When omitted they
        fall back to the module-level ``X_test_new_b`` / ``y_test_b``, which
        is what the original code always used.

    Returns
    -------
    float
        Mean absolute error of the predictions on the evaluation set.
    """
    # NOTE(review): by default the hyper-parameters are selected on the same
    # hold-out set that is named "test" elsewhere in the notebook; confirm that
    # this set is not also used for the final performance estimate.
    if X_valid is None:
        X_valid = X_test_new_b
    if y_valid is None:
        y_valid = y_test_b

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 1000, 3000),
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        'eval_metric': 'mae'
    }

    model = xgb.XGBRegressor(**params)
    # Fit on the data passed by the caller rather than on hard-coded globals.
    model.fit(X_train_new, y_train_new, verbose=300)
    pred = model.predict(X_valid)
    return mean_absolute_error(y_valid, pred)
    
# Seed the sampler so the hyper-parameter search proposes the same trials on
# every Restart & Run All; an unseeded study explores different trials each run.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Tune on the training split bound to X_train_new_b / y_train_new_b.
study.optimize(lambda trial: objective(trial, X_train_new_b, y_train_new_b), n_trials=70)

[I 2023-11-11 17:31:05,240] A new study created in memory with name: no-name-c7872f7a-db40-4f80-9cf4-186f22aa6f33
[I 2023-11-11 17:31:09,657] Trial 0 finished with value: 74.88530299513819 and parameters: {'n_estimators': 1987, 'learning_rate': 0.058348716082063774, 'max_depth': 7, 'subsample': 0.7067041706619943, 'colsample_bytree': 0.06859857324926139, 'min_child_weight': 19}. Best is trial 0 with value: 74.88530299513819.
[I 2023-11-11 17:31:11,153] Trial 1 finished with value: 76.3613868674005 and parameters: {'n_estimators': 1591, 'learning_rate': 0.023562886804277247, 'max_depth': 1, 'subsample': 0.9689407953925651, 'colsample_bytree': 0.6021851605719214, 'min_child_weight': 8}. Best is trial 0 with value: 74.88530299513819.
[I 2023-11-11 17:31:17,646] Trial 2 finished with value: 66.9432723252984 and parameters: {'n_estimators': 2121, 'learning_rate': 0.012220792179720722, 'max_depth': 6, 'subsample': 0.3825012850937786, 'colsample_bytree': 0.8616898073385812, 'min_child_weight'