In [11]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit 
import optuna
import json
from sklearn.metrics import mean_absolute_error

In [7]:
#Load inn datasets
X_test  = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/X_test.parquet')
X_train = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/X_train.parquet')
y_train = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/Y_train.parquet')
y_train_a = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/Y_train_a.parquet')
y_train_b = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/Y_train_b.parquet')
y_train_c = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/Y_train_c.parquet')

In [8]:
def test_train_split(df):
    dates_2 = (df.index >= '2023-04-15') & (df.index <= '2023-04-30')
    dates_1 = (df.index >= '2021-05-01') & (df.index <= '2021-07-25')

    test_set = df[dates_1 | dates_2]

    training_set = df[~(dates_1 | dates_2)]

    X_train = training_set.drop("pv_measurement", axis=1)
    y_train = training_set['pv_measurement']

    X_test = test_set.drop("pv_measurement", axis=1)
    y_test = test_set['pv_measurement'] 

    
    
    return X_train, X_test, y_train, y_test

X_trainnew_a, X_test_new_a, y_train_new_a, y_test_a = test_train_split(pd.concat([X_train[X_train["location"] == "A"].drop("location", axis=1), y_train_a], axis=1))
X_train_new_b, X_test_new_b, y_train_new_b, y_test_b = test_train_split(pd.concat([X_train[X_train["location"] == "B"].drop("location", axis=1), y_train_b], axis=1))
X_train_new_c, X_test_new_c, y_train_new_c, y_test_c = test_train_split(pd.concat([X_train[X_train["location"] == "C"].drop("location", axis=1), y_train_c], axis=1))

X_train_loc_a, X_test_loc_a, y_train_loc_a, y_test_a = test_train_split(pd.concat([X_train[X_train["location"] == "A"], y_train_a], axis=1))
X_train_loc_b, X_test_loc_b, y_train_loc_b, y_test_b = test_train_split(pd.concat([X_train[X_train["location"] == "B"], y_train_b], axis=1))
X_train_loc_c, X_test_loc_c, y_train_loc_c, y_test_c = test_train_split(pd.concat([X_train[X_train["location"] == "C"], y_train_c], axis=1))


X_train_new = pd.concat([X_train_loc_a, X_train_loc_b, X_train_loc_c])
X_test_new = pd.concat([X_test_loc_a, X_test_loc_b, X_test_loc_c])
y_train_new = pd.concat([y_train_loc_a, y_train_loc_b, y_train_loc_c])
y_test = pd.concat([y_test_a, y_test_b, y_test_c])

In [9]:
#Create Pools
train_pool = Pool(X_train_new, y_train_new, cat_features=["location"])
test_pool = Pool(X_test_new, cat_features=["location"]) 

In [10]:
def objective(trial, X_train, y_train):
    params = {
        "iterations": trial.suggest_int("iterations", 300, 3000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 13),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.3, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.3, 1.0),
        "border_count": trial.suggest_int("border_count", 1, 1000),
        "rsm": trial.suggest_float("rsm", 0.05, 1),
        "loss_function": "LogCosh"
    }

    catboost_model_val = CatBoostRegressor(**params, verbose=0)
    catboost_model_val.fit(train_pool)
    pred = pd.DataFrame(catboost_model_val.predict(test_pool))
    MAE = mean_absolute_error(y_test, pred)

    return MAE
    
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, X_train_new, y_train_new), n_trials=50)

[I 2023-11-08 23:51:18,405] A new study created in memory with name: no-name-015f78c4-9234-4a91-ae8f-0398408bfe38
[W 2023-11-08 23:51:28,170] Trial 0 failed with parameters: {'iterations': 1692, 'learning_rate': 0.0025788987978416027, 'depth': 13, 'min_data_in_leaf': 88, 'l2_leaf_reg': 5, 'bagging_temperature': 0.9729232505425349, 'random_strength': 0.5228861529457045, 'border_count': 177, 'rsm': 0.6422374979503118} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\Shahl\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Shahl\AppData\Local\Temp\ipykernel_7180\406540543.py", line 23, in <lambda>
    study.optimize(lambda trial: objective(trial, X_train_new, y_train_new), n_trials=50)
  File "C:\Users\Shahl\AppData\Local\Temp\ipykernel_7180\406540543.py", line 16, in objective
    catboost_model_val.fit(train_pool)
  File 

KeyboardInterrupt: 

In [None]:
#to output the best paramaters
print(study.best_params)

#to output the best score returned from the trials
print(study.best_value)


with open("optuna-best-parameters_2020:5:00-2020:7:30.txt", "w") as file:
    file.write("Best paramaters: \n")
    file.write(json.dumps(study.best_params))  # Write the first string followed by a newline character
    file.write("\n")
    file.write("best score MAE: \n")
    file.write(json.dumps(study.best_value))  # Write the second string followed by a newline character