In [1]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit 
import optuna
import json
from sklearn.metrics import mean_absolute_error
# Load the prepared feature and target tables. The files are read in the
# order listed and bound to the names on the left in that same order.
X_test, X_train, y_train, y_train_a, y_train_b, y_train_c = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/prepared_datasets/avg/no_duplicates/X_test.parquet',
        'data/prepared_datasets/avg/no_duplicates/X_train.parquet',
        'data/prepared_datasets/avg/no_duplicates/Y_train.parquet',
        'data/prepared_datasets/avg/no_duplicates/Y_train_a.parquet',
        'data/prepared_datasets/avg/no_duplicates/Y_train_b.parquet',
        'data/prepared_datasets/avg/no_duplicates/Y_train_c.parquet',
    )
)

In [2]:
def test_train_split(df):
    dates_2 = (df.index >= '2023-04-01') & (df.index <= '2023-04-15')
    dates_1 = (df.index >= '2021-05-01') & (df.index <= '2021-08-01')

    test_set = df[dates_1 | dates_2]

    training_set = df[~(dates_1 | dates_2)]

    X_train = training_set.drop("pv_measurement", axis=1)
    y_train = training_set['pv_measurement']

    X_test = test_set.drop("pv_measurement", axis=1)
    y_test = test_set['pv_measurement'] 

    
    
    return X_train, X_test, y_train, y_test

def _split_location(code, target, keep_location):
    """Select one location's rows from X_train, attach its target, and split.

    The rows of the global ``X_train`` whose "location" equals ``code`` are
    concatenated column-wise with ``target`` and handed to
    ``test_train_split``. The "location" column is dropped first unless
    ``keep_location`` is true.
    """
    features = X_train[X_train["location"] == code]
    if not keep_location:
        features = features.drop("location", axis=1)
    return test_train_split(pd.concat([features, target], axis=1))


# Per-location splits with the "location" column removed.
X_train_new_a, X_test_new_a, y_train_new_a, y_test_a = _split_location("A", y_train_a, keep_location=False)
X_train_new_b, X_test_new_b, y_train_new_b, y_test_b = _split_location("B", y_train_b, keep_location=False)
X_train_new_c, X_test_new_c, y_train_new_c, y_test_c = _split_location("C", y_train_c, keep_location=False)

# Per-location splits that keep the "location" column. The y_test_* names
# are bound a second time here.
X_train_loc_a, X_test_loc_a, y_train_loc_a, y_test_a = _split_location("A", y_train_a, keep_location=True)
X_train_loc_b, X_test_loc_b, y_train_loc_b, y_test_b = _split_location("B", y_train_b, keep_location=True)
X_train_loc_c, X_test_loc_c, y_train_loc_c, y_test_c = _split_location("C", y_train_c, keep_location=True)


# Stack the location-aware pieces in A, B, C order into combined sets.
X_train_new = pd.concat([X_train_loc_a, X_train_loc_b, X_train_loc_c])
X_test_new = pd.concat([X_test_loc_a, X_test_loc_b, X_test_loc_c])
y_train_new = pd.concat([y_train_loc_a, y_train_loc_b, y_train_loc_c])
y_test = pd.concat([y_test_a, y_test_b, y_test_c])

In [4]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import optuna

def objective(trial, X_train_new, y_train_new, X_valid=None, y_valid=None):
    """Optuna objective: fit an XGBoost regressor and return its hold-out MAE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters below.
    X_train_new, y_train_new
        Training features and targets the model is fitted on.
    X_valid, y_valid : optional
        Hold-out features and targets used for scoring. When omitted they
        fall back to the notebook globals ``X_test_new_a`` and ``y_test_a``,
        so a three-argument call scores on the location A hold-out set.

    Returns
    -------
    float
        Mean absolute error between ``y_valid`` and the model's predictions.
    """
    # Resolve the hold-out data lazily so the globals are only required when
    # the caller does not supply validation data explicitly.
    if X_valid is None:
        X_valid = X_test_new_a
    if y_valid is None:
        y_valid = y_test_a

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 1000, 3000),
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        'eval_metric': 'mae'
    }

    model = xgb.XGBRegressor(**params)
    # Train on the data handed in by the caller, not on a notebook global,
    # so the same objective can be reused for other locations or datasets.
    model.fit(X_train_new, y_train_new, verbose=300)
    pred = model.predict(X_valid)
    return mean_absolute_error(y_valid, pred)
    
# Seed the sampler explicitly so the sequence of suggested hyperparameters is
# repeatable across kernel restarts. TPESampler is passed by name only to
# attach the seed. Objective values may still vary if model training itself
# is not deterministic.
SEARCH_SEED = 42
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED),
)
# Minimise hold-out MAE for location A over 100 trials.
study.optimize(lambda trial: objective(trial, X_train_new_a, y_train_new_a), n_trials=100)

[I 2023-11-11 18:04:59,537] A new study created in memory with name: no-name-04a57b40-6451-438c-8e24-c4a42713941b
[I 2023-11-11 18:05:11,117] Trial 0 finished with value: 330.4766048363645 and parameters: {'n_estimators': 1043, 'learning_rate': 0.024890512306631928, 'max_depth': 10, 'subsample': 0.807006709281653, 'colsample_bytree': 0.405480948435344, 'min_child_weight': 19}. Best is trial 0 with value: 330.4766048363645.
[I 2023-11-11 18:05:15,024] Trial 1 finished with value: 471.4937668960224 and parameters: {'n_estimators': 1410, 'learning_rate': 0.0014457415839290405, 'max_depth': 2, 'subsample': 0.25334115313753564, 'colsample_bytree': 0.9076842908234225, 'min_child_weight': 4}. Best is trial 0 with value: 330.4766048363645.
[I 2023-11-11 18:05:39,458] Trial 2 finished with value: 337.3198771475631 and parameters: {'n_estimators': 2020, 'learning_rate': 0.019615279921596136, 'max_depth': 9, 'subsample': 0.5729242078418975, 'colsample_bytree': 0.7807147757058316, 'min_child_weigh

KeyboardInterrupt: 