In [1]:
from datetime import datetime

# Record the wall-clock time of this run and report it as the access date
date_accessed = format(datetime.now(), "%Y-%m-%d %H:%M:%S")
print(f"Date accessed: {date_accessed}")

Date accessed: 2024-10-22 22:58:08


In [2]:
import xarray as xr
import dask
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import os, sys, glob, re, time, math, calendar, ast
import yaml

from pytorch_tabnet.tab_model import TabNetRegressor
import pickle
from pickle import dump, load
import joblib

import torch

from sklearn.utils import shuffle
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing

# import custom functions
sys.path.append('/')
from libraries import *
from plotters import *

# Random seed to use so that results are reproducible
randSeed = 42

In [None]:
# Select, for every ensemble member, the hyper-parameter trial with the lowest
# validation RMSE, copy that model next to the trial folders, and collect the
# winning hyper-parameters in the DataFrame `df` (indexed by `Ens`).

# NOTE(review): debugging override -- this unconditionally replaces whatever
# arguments the process was started with. Only entries 1-5 are read in this cell.
# Remove or guard this line before running the notebook as a script.
sys.argv = ['', 'PROF_QUEE','Averaged_over_55th_to_5th_min', 'segregated', 'not_transformed','Kho_loss_on_profile',0,  42, 0, 80, "1"]    # for debugging
station_id = sys.argv[1]
hourly_data_method = sys.argv[2]
train_dates_range = ('2021-01-01T00:00:00', '2023-12-31T23:00:00')

# Derive the year span of the training period; it is one component of the model path
start_date = datetime.fromisoformat(train_dates_range[0])
end_date = datetime.fromisoformat(train_dates_range[1])
start_year = start_date.year
end_year = end_date.year
# A single-year range is labelled "YYYY", a multi-year range "YYYY_to_YYYY"
if start_year == end_year:
    years_experiment = f"{start_year}"
else:
    years_experiment = f"{start_year}_to_{end_year}"

experiment = 'ERA5_to_profilers'

segregated = sys.argv[3]
transformed = sys.argv[4]
loss_function = sys.argv[5]

# Search-space size: ensemble members and hyper-parameter trials per member
N_ENSEMBLES = 10
N_TRIALS = 100

# Model attributes recorded for the best trial; this single tuple drives both
# the per-trial extraction and the DataFrame columns so they cannot drift apart
HYPERPARAM_NAMES = ('n_d', 'n_a', 'n_steps', 'n_independent', 'n_shared', 'gamma')

# One row per ensemble member: [Ens, <HYPERPARAM_NAMES values...>]
model_data = []

for Ens in range(N_ENSEMBLES):
    model_output_dir = f'trained_models/{experiment}/{station_id}/{hourly_data_method}/{years_experiment}/{segregated}/{transformed}/{loss_function}/Ens{Ens}'
    # Start from +inf so that any finite validation RMSE is accepted
    # (a fixed sentinel such as 1e8 would reject larger, still valid, scores)
    valMin = float("inf")
    best_model_params = {}
    for trial in range(N_TRIALS):
        trial_path = f'{model_output_dir}/trial{trial}/TabNet_HOLDOUT.pkl'
        # A missing trial file raises FileNotFoundError here (unchanged behavior).
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load model files produced by a trusted training run.
        with open(trial_path, "rb") as f:
            tabReg = pickle.load(f)
        rmseVal = tabReg.history['valid_rmse'][tabReg.best_epoch]
        # NaN scores compare False and are therefore never selected
        if rmseVal < valMin:
            valMin = rmseVal

            # Remember the hyper-parameters of the best model seen so far
            best_model_params = {name: getattr(tabReg, name) for name in HYPERPARAM_NAMES}

            # Keep the best-so-far model on disk at the ensemble level
            best_path = f'{model_output_dir}/TabNet_HOLDOUT.pkl'
            with open(best_path, "wb") as f:
                dump(tabReg, f, pickle.HIGHEST_PROTOCOL)
            print('dumped')
            print(Ens, trial, valMin)

    # Fail with a clear message instead of a KeyError on the empty dict when
    # no trial produced a usable (non-NaN, finite) validation RMSE
    if not best_model_params:
        raise RuntimeError(
            f"No trial with a valid 'valid_rmse' found for Ens{Ens} in {model_output_dir}"
        )

    # Append the best model's parameters for this Ens to the list
    model_data.append([Ens] + [best_model_params[name] for name in HYPERPARAM_NAMES])

# Build the summary table in one step, indexed by ensemble member
df = pd.DataFrame(model_data, columns=['Ens', *HYPERPARAM_NAMES]).set_index('Ens')

In [10]:
# Write the summary table to a CSV file in the current working directory
output_csv = 'best_model_params.csv'
df.to_csv(output_csv)