In [1]:
import os
import pandas as pd
from typing import NamedTuple, List
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer

In [2]:
class Config(NamedTuple):
    """Immutable bundle of the settings used by this notebook.

    Every field has a default, so ``Config()`` gives the standard setup and
    individual values can be overridden by keyword.
    """

    # Version tag interpolated into the data path ("version_<data_version>").
    data_version: str = "2024-05-19"
    # NOTE(review): a `res_version` field ("2024-06-24") was previously
    # commented out here; re-add it if result versioning is needed again.
    # Not referenced in the cells visible here; presumably a model-file suffix.
    model_type: str = "_full"
    # Not referenced in the cells visible here; presumably selects the model.
    model_combination: str = "ngb"
    # Name of the subfolder that receives the normalised data and scalers.
    scaler_str: str = "yeo_johnson"


# Single shared instance with all defaults.
config = Config()

# Target column names and their display labels (one label per target,
# presumably index-aligned with `targets`).
targets = [
    "f_rocof",
    "f_ext",
    "f_msd",
    "f_integral",
]
target_names = [
    "RoCoF",
    "Nadir",
    "MSD",
    "Integral",
]

# Area identifiers (used as folder names in the data path), their display
# names, and the matplotlib colour-cycle codes assigned to them.
areas = [
    "CE",
    "Nordic",
]
area_names = [
    "Continental Europe",
    "Nordic",
]
area_colors = [
    "C0",
    "C1",
]

In [4]:
# Normalise the train/test splits with Yeo-Johnson power transforms and write
# the results to a scaler-specific subfolder next to the raw data.
# Only the "Nordic" area is processed by this cell.
for area in ["Nordic"]:
    print(
        "---------------------------- ", area, " ------------------------------------"
    )
    data_folder = f"../data/2020-2024/{area}/version_{config.data_version}/"
    norm_data_folder = data_folder + config.scaler_str + "/"
    # exist_ok=True keeps the cell re-runnable and avoids the check-then-create
    # race of a separate os.path.exists() guard.
    os.makedirs(norm_data_folder, exist_ok=True)
    print(norm_data_folder)

    # Load data
    y_train = pd.read_hdf(data_folder + "y_train.h5")
    y_test = pd.read_hdf(data_folder + "y_test.h5")
    y_pred = pd.read_hdf(data_folder + "y_pred.h5")
    X_train = pd.read_hdf(data_folder + "X_train_full.h5")
    X_test = pd.read_hdf(data_folder + "X_test_full.h5")

    # Separate transformers for features and targets. Each is fitted on the
    # training split only and then applied to the test split, so the test data
    # never influences the fitted transformation.
    scaler_X = PowerTransformer(method="yeo-johnson")
    scaler_y = PowerTransformer(method="yeo-johnson")
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

    y_train_scaled = scaler_y.fit_transform(y_train)
    y_test_scaled = scaler_y.transform(y_test)

    # Convert back to DataFrame, re-attaching the original columns and index
    X_train_scaled_df = pd.DataFrame(
        X_train_scaled, columns=X_train.columns, index=X_train.index
    )
    X_test_scaled_df = pd.DataFrame(
        X_test_scaled, columns=X_test.columns, index=X_test.index
    )
    y_train_scaled_df = pd.DataFrame(
        y_train_scaled, columns=y_train.columns, index=y_train.index
    )
    y_test_scaled_df = pd.DataFrame(
        y_test_scaled, columns=y_test.columns, index=y_test.index
    )

    # Save the scaled data back to HDF5 files (mode="w" overwrites old files)
    X_train_scaled_df.to_hdf(
        norm_data_folder + "X_train_full_scaled.h5", key="X_train_scaled", mode="w"
    )
    X_test_scaled_df.to_hdf(
        norm_data_folder + "X_test_full_scaled.h5", key="X_test_scaled", mode="w"
    )
    y_train_scaled_df.to_hdf(
        norm_data_folder + "y_train_scaled.h5", key="y_train_scaled", mode="w"
    )
    y_test_scaled_df.to_hdf(
        norm_data_folder + "y_test_scaled.h5", key="y_test_scaled", mode="w"
    )
    # y_pred is copied into the normalised folder as loaded; it is NOT scaled.
    y_pred.to_hdf(norm_data_folder + "y_pred.h5", key="y_pred", mode="w")
    

----------------------------  Nordic  ------------------------------------
../data/2020-2024/Nordic/version_2024-05-19/yeo_johnson/


In [11]:
import joblib
from sklearn.preprocessing import PowerTransformer

# Fit Yeo-Johnson power transformers on the training split of every area and
# persist them with joblib: one scaler for the full feature matrix and one
# scaler per target column.
for area in areas:
    data_folder = f"../data/2020-2024/{area}/version_{config.data_version}/"
    norm_data_folder = data_folder + config.scaler_str + "/"
    scaler_folder = norm_data_folder + "scalers" + "/"
    # makedirs creates missing parent directories too, so this single call
    # also creates norm_data_folder; exist_ok=True keeps the cell re-runnable.
    os.makedirs(scaler_folder, exist_ok=True)

    # Load data
    y_train = pd.read_hdf(data_folder + "y_train.h5")
    # NOTE(review): y_test and X_test are loaded but not used for fitting in
    # this cell; kept in case later cells rely on these notebook globals.
    y_test = pd.read_hdf(data_folder + "y_test.h5")

    X_train = pd.read_hdf(data_folder + "X_train_full.h5")
    X_test = pd.read_hdf(data_folder + "X_test_full.h5")

    # Feature scaler: fitted on all training features at once.
    scaler_X = PowerTransformer(method="yeo-johnson")
    scaler_X.fit(X_train)

    # Target scalers: one per target, fitted on a single-column DataFrame
    # (double brackets keep the input two-dimensional).
    for tar in targets:
        scaler_y = PowerTransformer(method="yeo-johnson")
        scaler_y.fit(y_train[[tar]])
        joblib.dump(scaler_y, scaler_folder + f"scaler_y_{tar}.pkl")

    joblib.dump(scaler_X, scaler_folder + "scaler_X.pkl")
