In [1]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit 
import optuna 
import json
from sklearn.metrics import mean_absolute_error

In [3]:
#Load inn datasets
X_test  = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/X_test.parquet')
X_train = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/X_train.parquet')
y_train = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/Y_train.parquet')
y_train_a = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/Y_train_a.parquet')
y_train_b = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/Y_train_b.parquet')
y_train_c = pd.read_parquet('../data/prepared_datasets/only_y_cleaned/Y_train_c.parquet')

In [4]:
def splitting_def(df):
    date_range_1 = (df.index >= '2020-05-01') & (df.index <= '2020-06-25')
    date_range_2 = (df.index >= '2023-05-01') & (df.index <= '2023-06-15')

    # Combine the date ranges to create the test set
    test_set = df[date_range_1 | date_range_2]

    # The rest of the data will be your training set
    training_set = df[~(date_range_1 | date_range_2)]
    
    # Splitting the test_set into X_test and y_test
    X_test = test_set.drop("pv_measurement", axis=1)
    y_test = test_set['pv_measurement']  # Assuming 'pv_measurement' is your target variable

    # Splitting the training_set into X_train and y_train
    X_train = training_set.drop("pv_measurement", axis=1)
    y_train = training_set['pv_measurement']
    
    return X_train, X_test, y_train, y_test

X_train_new_a, X_test_new_a, y_train_new_a, y_test_a = splitting_def(pd.concat([X_train[X_train["location"] == "A"].drop("location", axis=1), y_train_a], axis=1))
X_train_new_b, X_test_new_b, y_train_new_b, y_test_b = splitting_def(pd.concat([X_train[X_train["location"] == "B"].drop("location", axis=1), y_train_b], axis=1))
X_train_new_c, X_test_new_c, y_train_new_c, y_test_c = splitting_def(pd.concat([X_train[X_train["location"] == "C"].drop("location", axis=1), y_train_c], axis=1))

In [5]:
#Create a pool of data
train_pool_a = Pool(X_train_new_a, y_train_new_a)
train_pool_b = Pool(X_train_new_b, y_train_new_b)
train_pool_c = Pool(X_train_new_c, y_train_new_c)


test_pool_a = Pool(X_test_new_a) 
test_pool_b = Pool(X_test_new_b) 
test_pool_c = Pool(X_test_new_c) 

In [9]:
#For location A
def objective(trial, X_train, y_train):
    params = {
        "iterations": trial.suggest_int("iterations", 300, 3000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 13),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 10),
        "has-time": trial.suggest_categorical('has-time', [True, False]),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.3, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.3, 1.0),
        "random_state": 2,
        "border_count": trial.suggest_int("border_count", 1, 1000),
        "rsm": trial.suggest_float("rsm", 0.001, 1),
        "nan_mode": trial.suggest_categorical("nan_mode", ["min", "max"])
    }

    catboost_model_a = CatBoostRegressor(verbose=100)
    catboost_model_a.fit(train_pool_a)
    pred_a = pd.DataFrame(catboost_model_a.predict(test_pool_a))
    MAE_a = mean_absolute_error(y_test_a, pred_a)
    return MAE_a

study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=50)

[I 2023-11-06 18:51:03,690] A new study created in memory with name: no-name-c2340277-f919-4a80-b79f-b69c4b81ab37


Learning rate set to 0.07105
0:	learn: 1077.8600125	total: 8.02ms	remaining: 8.02s
100:	learn: 420.8398889	total: 619ms	remaining: 5.51s
200:	learn: 392.1335326	total: 1.2s	remaining: 4.75s
300:	learn: 371.7885673	total: 1.76s	remaining: 4.08s
400:	learn: 355.7069453	total: 2.32s	remaining: 3.46s
500:	learn: 340.6871261	total: 2.89s	remaining: 2.88s
600:	learn: 328.2901904	total: 3.45s	remaining: 2.29s
700:	learn: 316.7981316	total: 4.02s	remaining: 1.72s
800:	learn: 306.3068825	total: 4.61s	remaining: 1.15s
900:	learn: 296.7449297	total: 5.22s	remaining: 574ms


[I 2023-11-06 18:51:09,774] Trial 0 finished with value: 351.34628571238125 and parameters: {'iterations': 962, 'learning_rate': 0.005419834021739618, 'depth': 2, 'colsample_bylevel': 0.33005409202664165, 'min_data_in_leaf': 26, 'l2_leaf_reg': 5, 'has-time': False, 'bagging_temperature': 0.8574517687919452, 'random_strength': 0.6429523281135647, 'border_count': 252, 'rsm': 0.8802066806385332, 'nan_mode': 'max'}. Best is trial 0 with value: 351.34628571238125.


999:	learn: 288.1683232	total: 5.83s	remaining: 0us


In [7]:
#to output the best paramaters
print(study.best_params)

#to output the best score returned from the trials
print(study.best_value)

with open("optuna-best-parameters_a.txt", "w") as file:
    file.write("Best paramaters: \n")
    file.write(json.dumps(study.best_params))  # Write the first string followed by a newline character
    file.write("\n")
    file.write("best score MAE: \n")
    file.write(json.dumps(study.best_value))  # Write the second string followed by a newline character

{'iterations': 945, 'learning_rate': 0.0025703767511988562, 'depth': 2, 'colsample_bylevel': 0.201959065860163, 'min_data_in_leaf': 84, 'l2_leaf_reg': 10, 'has-time': False, 'bagging_temperature': 0.5455357133013922, 'random_strength': 0.30512559107040527, 'border_count': 138, 'rsm': 0.10140848708389534}
351.34628571238125
