In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import random
import itertools

import matplotlib.pyplot as plt
plt.style.use('tableau-colorblind10')

import sys
sys.path.append('/data/Hydra_Work/Competition_Functions') 
from Processing_Functions import process_forecast_date, process_seasonal_forecasts

import ML_Functions
from ML_Functions import Hydra_LSTM_Block, initialize_models_optimizers, PinballLoss, SumPinballLoss, EarlyStopper, Model_Run, No_Body_Model_Run
from Data_Transforming import read_nested_csvs, generate_daily_flow, use_USGS_flow_data, USGS_to_daily_df_yearly


sys.path.append('/data/Hydra_Work/Pipeline_Functions')
from Folder_Work import filter_rows_by_year, csv_dictionary, add_day_of_year_column

from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim.lr_scheduler as lr_scheduler


In [None]:
# All the prep
monthly_basins = ['animas_r_at_durango', 'boise_r_nr_boise', 'boysen_reservoir_inflow', 'colville_r_at_kettle_falls', 'detroit_lake_inflow', 'dillon_reservoir_inflow',
    'fontenelle_reservoir_inflow', 'green_r_bl_howard_a_hanson_dam', 'hungry_horse_reservoir_inflow', 'libby_reservoir_inflow',
    'missouri_r_at_toston','owyhee_r_bl_owyhee_dam', 'pecos_r_nr_pecos', 'pueblo_reservoir_inflow',
    'ruedi_reservoir_inflow', 'skagit_ross_reservoir', 'snake_r_nr_heise', 'stehekin_r_at_stehekin', 'sweetwater_r_nr_alcova',
    'taylor_park_reservoir_inflow', 'virgin_r_at_virtin', 'weber_r_nr_oakley', 'yampa_r_nr_maybell',
]


USGS_basins = ['animas_r_at_durango', 'boise_r_nr_boise', 'boysen_reservoir_inflow', 'colville_r_at_kettle_falls', 'detroit_lake_inflow', 'dillon_reservoir_inflow',   
    'green_r_bl_howard_a_hanson_dam', 'hungry_horse_reservoir_inflow', 'libby_reservoir_inflow', 'merced_river_yosemite_at_pohono_bridge', 'missouri_r_at_toston',
    'owyhee_r_bl_owyhee_dam', 'pecos_r_nr_pecos', 'pueblo_reservoir_inflow',    'san_joaquin_river_millerton_reservoir', 'snake_r_nr_heise', 'stehekin_r_at_stehekin',
    'sweetwater_r_nr_alcova', 'taylor_park_reservoir_inflow', 'virgin_r_at_virtin', 'weber_r_nr_oakley', 'yampa_r_nr_maybell',
]

basins = list(set(monthly_basins + USGS_basins))


selected_years = range(2000,2024,2)

era5_folder = '/data/Hydra_Work/Rodeo_Data/era5'
era5 = csv_dictionary(era5_folder, basins, years=selected_years)
era5 = add_day_of_year_column(era5)

flow_folder = '/data/Hydra_Work/Rodeo_Data/train_monthly_naturalized_flow'
flow = csv_dictionary(flow_folder, monthly_basins)
flow = filter_rows_by_year(flow, 1998)

climatology_file_path = '/data/Hydra_Work/Rodeo_Data/climate_indices.csv'
climate_indices = pd.read_csv(climatology_file_path)
climate_indices['date'] = pd.to_datetime(climate_indices['date'])
climate_indices.set_index('date', inplace = True)
climate_indices.drop('Unnamed: 0', axis = 1, inplace = True)
climate_indices = climate_indices[~climate_indices.index.duplicated(keep='first')]

root_folder = '/data/Hydra_Work/Rodeo_Data/seasonal_forecasts'
seasonal_forecasts = read_nested_csvs(root_folder)

USGS_flow_folder = '/data/Hydra_Work/Rodeo_Data/USGS_streamflows'
USGS_flow = csv_dictionary(USGS_flow_folder, USGS_basins)

Static_variables = pd.read_csv('/data/Hydra_Work/Rodeo_Data/static_indices.csv', index_col= 'site_id')

# Convert monthly flow values to daily flow estimates
daily_flow = {}

# Iterate through the dictionary and apply generate_daily_flow to each DataFrame
for key, df in flow.items():
    daily_flow[key] = generate_daily_flow(df, persistence_factor=0.7)

# Replacing monhtly data for normalised USGS when available
daily_flow = use_USGS_flow_data(daily_flow, USGS_flow)

# Introducing the data from San_jaoqin and Merced, normalised by the yearly flow given
path = '/data/Hydra_Work/Rodeo_Data/USGS_streamflows/san_joaquin_river_millerton_reservoir.csv'
name = 'san_joaquin_river_millerton_reservoir'
normalising_path = '/data/Hydra_Work/Rodeo_Data/train_yearly/san_joaquin_river_millerton_reservoir.csv'

USGS_to_daily_df_yearly(daily_flow, path, name, normalising_path)

path = '/data/Hydra_Work/Rodeo_Data/USGS_streamflows/merced_river_yosemite_at_pohono_bridge.csv'
name = 'merced_river_yosemite_at_pohono_bridge'
normalising_path = '/data/Hydra_Work/Rodeo_Data/train_yearly/merced_river_yosemite_at_pohono_bridge.csv'

USGS_to_daily_df_yearly(daily_flow, path, name, normalising_path)

path = '/data/Hydra_Work/Rodeo_Data/USGS_streamflows/detroit_lake_inflow.csv'
name = 'detroit_lake_inflow'
normalising_path = '/data/Hydra_Work/Rodeo_Data/train_yearly/detroit_lake_inflow.csv'

USGS_to_daily_df_yearly(daily_flow, path, name, normalising_path)

climate_scaler_filename = '/data/Hydra_Work/Rodeo_Data/scalers/climate_normalization_scaler.save'
climate_scaler = joblib.load(climate_scaler_filename) 
climate_indices = pd.DataFrame(climate_scaler.transform(climate_indices), columns=climate_indices.columns, index=climate_indices.index)

era5_scaler_filename = '/data/Hydra_Work/Rodeo_Data/scalers/era5_scaler.save'
era5_scaler = joblib.load(era5_scaler_filename) 
era5 = {key: pd.DataFrame(era5_scaler.transform(df), columns=df.columns, index=df.index) for key, df in era5.items()}

for basin, df in daily_flow.items(): 
    flow_scaler_filename = f'/data/Hydra_Work/Rodeo_Data/scalers/flows/{basin}_flow_scaler.save'
    flow_scaler = joblib.load(flow_scaler_filename) 
    daily_flow[basin] = pd.DataFrame(flow_scaler.transform(df), columns=df.columns, index=df.index)

seasonal_scaler_filename = "/data/Hydra_Work/Rodeo_Data/scalers/seasonal_scaler.save"
seasonal_scaler = joblib.load(seasonal_scaler_filename)
seasonal_forecasts = {key: pd.DataFrame(seasonal_scaler.transform(df), columns=df.columns, index=df.index ) for key, df in seasonal_forecasts.items()}

static_scaler_filename = '/data/Hydra_Work/Rodeo_Data/scalers/static_scaler.save'
static_scaler = joblib.load(static_scaler_filename) 
Static_variables = pd.DataFrame(static_scaler.transform(Static_variables), columns=Static_variables.columns, index=Static_variables.index)

climatological_flows = {}

for basin, df in daily_flow.items():
    # Extract day of year and flow values
    df['day_of_year'] = df.index.dayofyear

    grouped = df.groupby('day_of_year')['daily_flow'].quantile([0.1, 0.5, 0.9]).unstack(level=1)

    climatological_flows[basin] = pd.DataFrame({
        'day_of_year': grouped.index,
        '10th_percentile_flow': grouped[0.1],
        '50th_percentile_flow': grouped[0.5],
        '90th_percentile_flow': grouped[0.9]
    })
    
    climatological_flows[basin].set_index('day_of_year', inplace=True)

    # Drop the temporary 'day_of_year' column from the original dataframe
    df.drop(columns='day_of_year', inplace=True)

criterion = SumPinballLoss(quantiles = [0.1, 0.5, 0.9])

basin = 'animas_r_at_durango' 
All_Dates = daily_flow[basin].index[
    ((daily_flow[basin].index.month < 6) | ((daily_flow[basin].index.month == 6) & (daily_flow[basin].index.day < 25))) &
    ((daily_flow[basin].index.year % 2 == 0) | ((daily_flow[basin].index.month > 10) | ((daily_flow[basin].index.month == 10) & (daily_flow[basin].index.day >= 1))))
]
All_Dates = All_Dates[All_Dates.year > 1998]


# Validation Year
Val_Dates = All_Dates[All_Dates.year == 2022]
All_Dates = All_Dates[All_Dates.year < 2022]


basin_to_remove = 'sweetwater_r_nr_alcova'

if basin_to_remove in basins:
    basins.remove(basin_to_remove)


seed = 42 ; torch.manual_seed(seed) ; random.seed(seed) ; np.random.seed(seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

days  = 90
hidden_variables_size = 17

LR = 1e-3
static_size = np.shape(Static_variables)[1]
forecast_size = np.shape(seasonal_forecasts['american_river_folsom_lake_2000_apr'])[1]
History_Fourier_in_forcings = 0 #2*3*(6 - 1)
Climate_guess = 3
History_Statistics_in_forcings = 5*2

head_input_size = forecast_size + static_size + History_Fourier_in_forcings + History_Statistics_in_forcings  + Climate_guess + 3
head_output_size = 3

# Be careful of this: Trying to unpickle estimator MinMaxScaler from version 1.3.0 when using version 1.4.1.post1

# Tuning individual basins

In [None]:
LR = 1e-3
static_size = np.shape(Static_variables)[1]
forecast_size = np.shape(seasonal_forecasts['american_river_folsom_lake_2000_apr'])[1]
History_Fourier_in_forcings = 0 #2*3*(6 - 1)
Climate_guess = 3
History_Statistics_in_forcings = 5*2

input_size = forecast_size + static_size + History_Fourier_in_forcings + History_Statistics_in_forcings  + Climate_guess + 3
output_size, head_hidden_size, head_num_layers =  3, 64, 3




In [None]:
def define_models(hidden_size, num_layers, dropout, bidirectional, learning_rate, copies = 3, output_size = 3, input_size = input_size, days = 90, hidden_variables_size = hidden_variables_size, device = device):
    models = {}
    params_to_optimize = {}
    optimizers = {}
    schedulers = {}
    for copy in range(copies):
        models[copy] = Hydra_LSTM_Block(input_size, hidden_size, num_layers, output_size, H0_sequences_size=days * hidden_variables_size, dropout= dropout, bidirectional= bidirectional)
        models[copy].to(device)
        params_to_optimize[copy] = list(models[copy].parameters())

        optimizers[copy] = torch.optim.Adam(params_to_optimize[copy], lr= learning_rate, weight_decay = 1e-3)
        schedulers[copy] = lr_scheduler.CosineAnnealingLR(optimizers[copy], T_max=1e4)

    return models, params_to_optimize, optimizers, schedulers

def update_final_parameters(Final_Parameters, basin, min_val_loss_parameters, min_val_loss):
    Final_Parameters['basin'].append(basin)
    Final_Parameters['hidden_size'].append(min_val_loss_parameters[0])
    Final_Parameters['num_layers'].append(min_val_loss_parameters[1])
    Final_Parameters['dropout'].append(min_val_loss_parameters[2])
    Final_Parameters['bidirectional'].append(min_val_loss_parameters[3])
    Final_Parameters['learning_rate'].append(min_val_loss_parameters[4])
    Final_Parameters['val_loss'].append(min_val_loss)


In [None]:
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.stopper import TrialPlateauStopper
# Fixed parameters
total_epochs = 30
n_epochs = 1  # Epochs between tests
group_lengths = np.arange(180)
batch_size = 1
copies = 3

# parameters to tune
hidden_sizes = [16, 64, 128]
num_layers =  [1,3]
dropout = [0.1, 0.4]
bidirectional = [False, True]
learning_rate = [1e-3, 1e-5]

# Set up configuration space
config_space = {
    "hidden_size": tune.grid_search(hidden_sizes),
    "num_layers": tune.grid_search(num_layers),
    "dropout": tune.grid_search(dropout),
    "bidirectional": tune.grid_search(bidirectional),
    "learning_rate": tune.grid_search(learning_rate)
}




In [None]:
def train_model(config):

    All_Dates = ray.get(All_Dates_id)  
    Val_Dates = ray.get(Val_Dates_id)  
    era5 = ray.get(era5_id)  
    daily_flow = ray.get(daily_flow_id)  
    climatological_flows = ray.get(climatological_flows_id)
    climate_indices = ray.get(climate_indices_id)
    seasonal_forecasts = ray.get(seasonal_forecasts_id)
    Static_variables = ray.get(Static_variables_id)



    copies = 3
    
    device = torch.device('cuda' if torch.cuda.
                    is_available() else 'cpu')
   
    models, params_to_optimize, optimizers, schedulers = define_models(
    config["hidden_size"], config["num_layers"], config["dropout"],
    config["bidirectional"], config["learning_rate"], copies=copies, device = device)


    losses, val_losses = [], []
    early_stopper = EarlyStopper(patience=4, min_delta=0.01)

    for epoch in range(total_epochs):

        train_losses = {}
        epoch_val_losses = {}

        for copy in range(copies):

             # Need to fix the outputs of No_Body_Model_Run
            train_losses[copy] = No_Body_Model_Run(All_Dates, [basin], models[copy], era5, daily_flow, climatological_flows, climate_indices, seasonal_forecasts,
                Static_variables, optimizers[copy], schedulers[copy], criterion, early_stopper=early_stopper, n_epochs=n_epochs,
                batch_size=batch_size, group_lengths=group_lengths, Train_Mode=True, device=device, specialised=False)
            epoch_val_losses[copy] = No_Body_Model_Run(Val_Dates, [basin], models[copy], era5, daily_flow, climatological_flows, climate_indices, seasonal_forecasts,
                Static_variables, optimizers[copy], schedulers[copy], criterion, early_stopper=early_stopper, n_epochs=n_epochs,
                batch_size=batch_size, group_lengths=group_lengths, Train_Mode=False, device=device, specialised=False)

        loss = np.mean(list(train_losses.values()))
        val_loss = np.mean(list(epoch_val_losses.values())).mean()

        ray.train.report({'val_loss' : val_loss})

        losses.append(loss)
        val_losses.append(val_loss)

        if early_stopper.early_stop(val_loss):
            break
    return val_loss

    


In [None]:
from ray import train, tune

ray.shutdown()
ray.init(runtime_env = { "env_vars":   {"PYTHONPATH": '/data/Hydra_Work/Competition_Functions/' } } )
         
All_Dates_id = ray.put(All_Dates)  
Val_Dates_id = ray.put(Val_Dates)  
era5_id = ray.put(era5)  
daily_flow_id = ray.put(daily_flow)  
climatological_flows_id = ray.put(climatological_flows)
climate_indices_id = ray.put(climate_indices)
seasonal_forecasts_id = ray.put(seasonal_forecasts)
Static_variables_id = ray.put(Static_variables)


In [None]:
def objective(config):  
    device = torch.device('cuda' if torch.cuda.
                      is_available() else 'cpu')
    
    print('Device available is', device)
    

    score = train_model(config) # Have training loop in here that outputs loss of model
    return {"val_loss": score}

basin = 'stehekin_r_at_stehekin'


# Can use fractions of GPU
tuner = tune.Tuner(tune.with_resources(tune.with_parameters(objective), resources={"cpu": 1, "gpu": 1/16}), param_space=config_space) 

results = tuner.fit()
print(results.get_best_result(metric="val_loss", mode="min").config)

In [None]:
# Loading models
Tuned_Models = {}
for basin in basins:
    Tuned_Models[basin] = torch.load(f'/data/Hydra_Work/Post_Rodeo_Work/Tuned_Single_Models/basin.pth')


# Tuning General Model

In [11]:
LR = 1e-3
static_size = np.shape(Static_variables)[1]
forecast_size = np.shape(seasonal_forecasts['american_river_folsom_lake_2000_apr'])[1]
History_Fourier_in_forcings = 0 #2*3*(6 - 1)
Climate_guess = 3
History_Statistics_in_forcings = 5*2

input_size = forecast_size + static_size + History_Fourier_in_forcings + History_Statistics_in_forcings  + Climate_guess + 3
output_size, head_hidden_size, head_num_layers =  3, 64, 3


In [12]:
def update_final_parameters_general(Final_Parameters, min_val_loss_parameters, min_val_loss):
    Final_Parameters['hidden_size'].append(min_val_loss_parameters[0])
    Final_Parameters['num_layers'].append(min_val_loss_parameters[1])
    Final_Parameters['dropout'].append(min_val_loss_parameters[2])
    Final_Parameters['bidirectional'].append(min_val_loss_parameters[3])
    Final_Parameters['learning_rate'].append(min_val_loss_parameters[4])
    Final_Parameters['val_loss'].append(min_val_loss)

In [14]:
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.stopper import TrialPlateauStopper

# Fixed parameters
total_epochs = 20
n_epochs = 1 # Epochs between tests
group_lengths = np.arange(180)
batch_size = 1
copies = 2

# parameters to tune
# I tuned to 128,2,0.1,False,1e-3 
hidden_sizes = [64, 128, 256]
num_layers = [1,3]
dropout = [0.1, 0.4]
bidirectional =  [False, True]
learning_rate = [1e-2, 1e-3, 1e-5]

config_space = {
    "hidden_size": tune.grid_search(hidden_sizes),
    "num_layers": tune.grid_search(num_layers),
    "dropout": tune.grid_search(dropout),
    "bidirectional": tune.grid_search(bidirectional),
    "learning_rate": tune.grid_search(learning_rate)
}


# Places to save info
model_dir = '/data/Hydra_Work/Post_Rodeo_Work/Tuned_General_Model/'

In [17]:
def train_model_general(config):

    All_Dates = ray.get(All_Dates_id)  
    Val_Dates = ray.get(Val_Dates_id)  
    era5 = ray.get(era5_id)  
    daily_flow = ray.get(daily_flow_id)  
    climatological_flows = ray.get(climatological_flows_id)
    climate_indices = ray.get(climate_indices_id)
    seasonal_forecasts = ray.get(seasonal_forecasts_id)
    Static_variables = ray.get(Static_variables_id)

    copies = 3
    
    device = torch.device('cuda' if torch.cuda.
                    is_available() else 'cpu')
   
    models, params_to_optimize, optimizers, schedulers = define_models(
    config["hidden_size"], config["num_layers"], config["dropout"],
    config["bidirectional"], config["learning_rate"], copies=copies, device = device)


    losses, val_losses = [], []
    early_stopper = EarlyStopper(patience=10, min_delta=0.01)

    for epoch in range(total_epochs):

        train_losses = {}
        epoch_val_losses = {}

        for copy in range(copies):

             # Need to fix the outputs of No_Body_Model_Run
            train_losses[copy] = No_Body_Model_Run(All_Dates, basins, models[copy], era5, daily_flow, climatological_flows, climate_indices, seasonal_forecasts,
                Static_variables, optimizers[copy], schedulers[copy], criterion, early_stopper=early_stopper, n_epochs=n_epochs,
                batch_size=batch_size, group_lengths=group_lengths, Train_Mode=True, device=device, specialised=False)
            epoch_val_losses[copy] = No_Body_Model_Run(Val_Dates, basins, models[copy], era5, daily_flow, climatological_flows, climate_indices, seasonal_forecasts,
                Static_variables, optimizers[copy], schedulers[copy], criterion, early_stopper=early_stopper, n_epochs=n_epochs,
                batch_size=batch_size, group_lengths=group_lengths, Train_Mode=False, device=device, specialised=False)

        loss = np.mean(list(train_losses.values()))
        val_loss = np.mean(list(epoch_val_losses.values())).mean()

        ray.train.report({'val_loss' : val_loss})

        losses.append(loss)
        val_losses.append(val_loss)

        if early_stopper.early_stop(val_loss):
            break
    return val_loss

In [18]:
def objective(config):  
    device = torch.device('cuda' if torch.cuda.
                      is_available() else 'cpu')
    
    print('Device available is', device)
    

    score = train_model_general(config) # Have training loop in here that outputs loss of model
    return {"val_loss": score}


# Can use fractions of GPU
tuner = tune.Tuner(tune.with_resources(tune.with_parameters(objective), resources={"cpu": 1, "gpu": 1/16}), param_space=config_space) 

results = tuner.fit()
print(results.get_best_result(metric="val_loss", mode="min").config)

0,1
Current time:,2024-03-19 10:34:31
Running for:,00:09:09.05
Memory:,69.4/125.9 GiB

Trial name,status,loc,bidirectional,dropout,hidden_size,learning_rate,num_layers,iter,total time (s),val_loss
objective_fddfe_00000,RUNNING,136.156.133.98:313403,False,0.1,64,0.01,1,1.0,323.604,22.2622
objective_fddfe_00001,RUNNING,136.156.133.98:313404,True,0.1,64,0.01,1,1.0,326.332,22.7039
objective_fddfe_00002,RUNNING,136.156.133.98:313405,False,0.4,64,0.01,1,1.0,316.371,20.7989
objective_fddfe_00003,RUNNING,136.156.133.98:313406,True,0.4,64,0.01,1,1.0,346.225,21.3366
objective_fddfe_00004,RUNNING,136.156.133.98:313416,False,0.1,128,0.01,1,1.0,315.955,21.2442
objective_fddfe_00005,RUNNING,136.156.133.98:313437,True,0.1,128,0.01,1,1.0,330.584,21.1446
objective_fddfe_00006,RUNNING,136.156.133.98:313446,False,0.4,128,0.01,1,1.0,315.921,21.9285
objective_fddfe_00007,RUNNING,136.156.133.98:313464,True,0.4,128,0.01,1,1.0,325.054,22.708
objective_fddfe_00008,RUNNING,136.156.133.98:313488,False,0.1,256,0.01,1,1.0,367.611,22.6019
objective_fddfe_00009,RUNNING,136.156.133.98:313531,True,0.1,256,0.01,1,1.0,349.316,21.0491


[36m(objective pid=313405)[0m Device available is cuda




[36m(objective pid=313405)[0m defaultdict(<class 'int'>, {})
[36m(objective pid=313437)[0m Device available is cuda[32m [repeated 15x across cluster][0m




[36m(objective pid=313405)[0m defaultdict(<class 'int'>, {})[32m [repeated 16x across cluster][0m
[36m(objective pid=313545)[0m defaultdict(<class 'int'>, {})[32m [repeated 4x across cluster][0m
[36m(objective pid=313416)[0m defaultdict(<class 'int'>, {})[32m [repeated 7x across cluster][0m
[36m(objective pid=313488)[0m defaultdict(<class 'int'>, {})[32m [repeated 12x across cluster][0m
[36m(objective pid=313635)[0m defaultdict(<class 'int'>, {})[32m [repeated 4x across cluster][0m
[36m(objective pid=313532)[0m defaultdict(<class 'int'>, {})[32m [repeated 2x across cluster][0m
[36m(objective pid=313532)[0m defaultdict(<class 'int'>, {})[32m [repeated 2x across cluster][0m
[36m(objective pid=313545)[0m defaultdict(<class 'int'>, {})
[36m(objective pid=313405)[0m defaultdict(<class 'int'>, {})
[36m(objective pid=313464)[0m defaultdict(<class 'int'>, {})[32m [repeated 4x across cluster][0m
[36m(objective pid=313464)[0m defaultdict(<class 'int'>, {})[

2024-03-19 10:34:41,179	INFO tune.py:1042 -- Total run time: 559.19 seconds (549.05 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/home/gbmc/ray_results/objective_2024-03-19_10-25-21", trainable=...)
- objective_fddfe_00016: FileNotFoundError('Could not fetch metrics for objective_fddfe_00016: both result.json and progress.csv were not found at /home/gbmc/ray_results/objective_2024-03-19_10-25-21/objective_fddfe_00016_16_bidirectional=False,dropout=0.1000,hidden_size=128,learning_rate=0.0010,num_layers=1_2024-03-19_10-25-22')
- objective_fddfe_00017: FileNotFoundError('Could not fetch metrics for objective_fddfe_00017: both result.json and progress.csv were not found at /home/gbmc/ray_results/objective_2024-03-19_10-25-21/objective_fddfe_00017_17_bidirectional=True,dropout=0.1000,hidden_size=128,learning_rate=0.0010,num_layers=1_2024-03-19_10-25-26')
- objective_fddfe_00018: FileNotFoundError('Could not fetch metrics for objective_fddfe_00018: both result.js

{'hidden_size': 64, 'num_layers': 1, 'dropout': 0.4, 'bidirectional': False, 'learning_rate': 0.01}


In [None]:
General_Model = torch.load('/data/Hydra_Work/Post_Rodeo_Work/Tuned_General_Model/General_model.pth')

# Tuning Hydra Model

In [19]:
def define_models_hydra(body_input_size, body_hidden_size, body_num_layers, body_output_size, body_dropout, body_bidirectional,
                                 head_input_size, head_hidden_size, head_num_layers, head_output_size, head_dropout, head_bidirectional,
                        learning_rate_body, learning_rate_head, learning_rate_general_head, LR, basins = basins,  hidden_variables_size = hidden_variables_size, days = 90, device = device, copies = 3):
    Hydra_Bodys = {}
    model_heads = {}
    General_Hydra_Heads = {}

    params_to_optimize = {}
    optimizers = {}
    schedulers = {}
    for copy in range(copies):
        Hydra_Bodys[copy], model_heads[copy], General_Hydra_Heads[copy], optimizers[copy], schedulers[copy] = initialize_models_optimizers(basins, body_input_size, body_hidden_size, body_num_layers, body_output_size, body_dropout, body_bidirectional,
                            head_input_size, head_hidden_size, head_num_layers, head_output_size, head_dropout, head_bidirectional,
                            days, hidden_variables_size, learning_rate_body, learning_rate_head, learning_rate_general_head, LR, device)

    return Hydra_Bodys, General_Hydra_Heads, model_heads, optimizers, schedulers 

def update_final_parameters_hydra(Final_Parameters, min_val_loss_parameters, min_val_loss):
    # Append body parameters
    Final_Parameters['body_hidden_size'].append(min_val_loss_parameters[0])
    Final_Parameters['body_num_layers'].append(min_val_loss_parameters[1])
    Final_Parameters['body_dropout'].append(min_val_loss_parameters[2])
    Final_Parameters['body_learning_rate'].append(min_val_loss_parameters[3])
    Final_Parameters['body_output'].append(min_val_loss_parameters[4])
    # Append head parameters
    Final_Parameters['head_hidden_size'].append(min_val_loss_parameters[5])
    Final_Parameters['head_num_layers'].append(min_val_loss_parameters[6])
    Final_Parameters['head_dropout'].append(min_val_loss_parameters[7])
    Final_Parameters['head_learning_rate'].append(min_val_loss_parameters[8])
    # Append validation loss
    Final_Parameters['val_loss'].append(min_val_loss)

In [27]:
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.stopper import TrialPlateauStopper

# Fixed parameters
total_epochs = 20
n_epochs = 1 # Epochs between tests
group_lengths = np.arange(180)
batch_size = 1
copies = 3
body_input_size = input_size
head_output_size = 3

# parameters to tune
# chose 128, 2, 0.1, 1e-3, 6, 32, 1, 0.4, 1e-3
body_hidden_sizes = [64, 128, 256]
body_num_layers =  [1, 3]
body_dropouts = [0.1, 0.4]
body_learning_rates = [1e-3, 1e-5]
body_outputs = [3, 6, 10]


head_hidden_sizes = [16, 32, 64]
head_num_layers = [1, 3]
head_dropouts = [0.1, 0.4, 0.7]
head_learning_rates = [1e-3, 1e-5]
LR = [1e-3]
bidirectionals = [False, True]

config_space = {
    "body_hidden_size": tune.grid_search(body_hidden_sizes),
    "body_num_layer": tune.grid_search(body_num_layers),
    "body_dropout": tune.grid_search(body_dropouts),
    "bidirectional": tune.grid_search(bidirectionals),
    "body_output": tune.grid_search(body_outputs),
    "body_learning_rate": tune.grid_search(body_learning_rates),
    "head_hidden_size": tune.grid_search(head_hidden_sizes),
    "head_num_layer": tune.grid_search(head_num_layers),
    "head_dropout": tune.grid_search(head_dropouts),
    "head_learning_rate": tune.grid_search(head_learning_rates),
    "general_head_learning_rate": tune.grid_search(head_learning_rates),
    "LR": tune.grid_search(LR)
}

# Places to save info
model_dir = '/data/Hydra_Work/Post_Rodeo_Work/Tuned_Hydra_Model/'



In [38]:
def train_model_hydra(config):

    All_Dates = ray.get(All_Dates_id)  
    Val_Dates = ray.get(Val_Dates_id)  
    era5 = ray.get(era5_id)  
    daily_flow = ray.get(daily_flow_id)  
    climatological_flows = ray.get(climatological_flows_id)
    climate_indices = ray.get(climate_indices_id)
    seasonal_forecasts = ray.get(seasonal_forecasts_id)
    Static_variables = ray.get(Static_variables_id)

    copies = 3
    
    device = torch.device('cuda' if torch.cuda.
                    is_available() else 'cpu')
   

    Hydra_Bodys, General_Hydra_Heads, model_heads, optimizers, schedulers  = define_models_hydra(body_input_size, config['body_hidden_size'],
                                config['body_num_layer'],  config['body_output'], config['body_dropout'], config['bidirectional'], config['body_output'],
                                config['head_hidden_size'], config['head_num_layer'], 3, config['head_dropout'], config['bidirectional'],
                                config['body_learning_rate'], config['head_learning_rate'], config['general_head_learning_rate'], config['LR']
                                )
     

    general_losses, specific_losses, general_val_losses, specific_val_losses = [], [], [], []

    early_stopper = EarlyStopper(patience=10, min_delta=0.01)

    for epoch in range(total_epochs):
        train_general_losses = {}
        train_specific_losses = {}
        epoch_val_general_losses = {}
        epoch_val_specific_losses = {}
        climate_losses = {}
        
        for copy in range(copies):

             # Need to fix the outputs of No_Body_Model_Run
            train_general_losses[copy], train_specific_losses[copy], climate_losses[copy] = Model_Run(All_Dates, basins, Hydra_Bodys[copy], General_Hydra_Heads[copy], model_heads[copy], era5, daily_flow, climatological_flows, climate_indices, seasonal_forecasts,
                Static_variables, optimizers[copy], schedulers[copy], criterion, early_stopper=early_stopper, n_epochs=n_epochs,
                batch_size=batch_size, group_lengths=group_lengths, Train_Mode=True, device=device, feed_forcing = False)
            epoch_val_general_losses[copy], epoch_val_specific_losses[copy], climate_losses[copy] = Model_Run(Val_Dates, basins, Hydra_Bodys[copy], General_Hydra_Heads[copy], model_heads[copy], era5, daily_flow, climatological_flows, climate_indices, seasonal_forecasts,
                Static_variables, optimizers[copy], schedulers[copy], criterion, early_stopper=early_stopper, n_epochs=n_epochs,
                batch_size=batch_size, group_lengths=group_lengths, Train_Mode=False, device=device, feed_forcing = False)

        general_loss = np.mean(list(train_general_losses.values()))
        specific_loss = np.mean(list(train_specific_losses.values()))
        
        epoch_val_general_loss = np.mean(list(epoch_val_general_losses.values())).mean()
        epoch_val_specific_loss = np.mean(list(epoch_val_specific_losses.values())).mean()
        
        
        general_losses.append(general_loss)
        specific_losses.append(specific_loss)
        specific_val_losses.append(epoch_val_specific_loss)
        specific_val_losses.append(epoch_val_specific_loss)

        ray.train.report({'val_loss' : val_loss})

        losses.append(loss)
        val_losses.append(val_loss)

        if early_stopper.early_stop(val_loss):
            break
    return val_loss

In [39]:
def objective(config):  
    device = torch.device('cuda' if torch.cuda.
                      is_available() else 'cpu')
    

    score = train_model_hydra(config) # Have training loop in here that outputs loss of model
    return {"val_loss": score}


# Can use fractions of GPU
tuner = tune.Tuner(tune.with_resources(tune.with_parameters(objective), resources={"cpu": 1, "gpu": 1/16}), param_space=config_space) 

results = tuner.fit()
print(results.get_best_result(metric="val_loss", mode="min").config)

0,1
Current time:,2024-03-19 12:08:54
Running for:,00:03:24.30
Memory:,71.8/125.9 GiB

Trial name,status,loc,LR,bidirectional,body_dropout,body_hidden_size,body_learning_rate,body_num_layer,body_output,general_head_learnin g_rate,head_dropout,head_hidden_size,head_learning_rate,head_num_layer
objective_fad02_00000,RUNNING,136.156.133.98:396242,0.001,False,0.1,64,0.001,1,3,0.001,0.1,16,0.001,1
objective_fad02_00001,RUNNING,136.156.133.98:396267,0.001,True,0.1,64,0.001,1,3,0.001,0.1,16,0.001,1
objective_fad02_00002,RUNNING,136.156.133.98:396301,0.001,False,0.4,64,0.001,1,3,0.001,0.1,16,0.001,1
objective_fad02_00003,RUNNING,136.156.133.98:396457,0.001,True,0.4,64,0.001,1,3,0.001,0.1,16,0.001,1
objective_fad02_00004,RUNNING,136.156.133.98:396459,0.001,False,0.1,128,0.001,1,3,0.001,0.1,16,0.001,1
objective_fad02_00005,RUNNING,136.156.133.98:396496,0.001,True,0.1,128,0.001,1,3,0.001,0.1,16,0.001,1
objective_fad02_00006,RUNNING,136.156.133.98:396528,0.001,False,0.4,128,0.001,1,3,0.001,0.1,16,0.001,1
objective_fad02_00007,RUNNING,136.156.133.98:396544,0.001,True,0.4,128,0.001,1,3,0.001,0.1,16,0.001,1
objective_fad02_00008,RUNNING,136.156.133.98:396577,0.001,False,0.1,256,0.001,1,3,0.001,0.1,16,0.001,1
objective_fad02_00009,RUNNING,136.156.133.98:396582,0.001,True,0.1,256,0.001,1,3,0.001,0.1,16,0.001,1


[36m(objective pid=396301)[0m Device available is cuda




[36m(objective pid=396601)[0m Epoch 1: Training Mode
[36m(objective pid=396601)[0m general difference : 2.6705935244298487 
[36m(objective pid=396601)[0m specific difference: 9.121204995386716
[36m(objective pid=396601)[0m Climatology loss: 29.276312078442377
[36m(objective pid=396595)[0m Device available is cuda[32m [repeated 15x across cluster][0m
[36m(objective pid=396301)[0m Epoch 1: Training Mode[32m [repeated 2x across cluster][0m
[36m(objective pid=396301)[0m general difference : 1.901206251964984 [32m [repeated 2x across cluster][0m
[36m(objective pid=396301)[0m specific difference: 7.5696723828779975[32m [repeated 2x across cluster][0m
[36m(objective pid=396301)[0m Climatology loss: 30.05716260423443[32m [repeated 2x across cluster][0m
[36m(objective pid=396457)[0m Epoch 1: Training Mode[32m [repeated 2x across cluster][0m
[36m(objective pid=396457)[0m general difference : 1.0403344970271724 [32m [repeated 2x across cluster][0m
[36m(objecti

2024-03-19 12:09:04,231	INFO tune.py:1042 -- Total run time: 214.42 seconds (204.09 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/home/gbmc/ray_results/objective_2024-03-19_12-05-29", trainable=...)
- objective_fad02_00016: FileNotFoundError('Could not fetch metrics for objective_fad02_00016: both result.json and progress.csv were not found at /home/gbmc/ray_results/objective_2024-03-19_12-05-29/objective_fad02_00016_16_LR=0.0010,bidirectional=False,body_dropout=0.1000,body_hidden_size=128,body_learning_rate=0.0000,body_num_2024-03-19_12-05-31')
- objective_fad02_00017: FileNotFoundError('Could not fetch metrics for objective_fad02_00017: both result.json and progress.csv were not found at /home/gbmc/ray_results/objective_2024-03-19_12-05-29/objective_fad02_00017_17_LR=0.0010,bidirectional=True,body_dropout=0.1000,body_hidden_size=128,body_learning_rate=0.0000,body_num__2024-03-19_12-05-36')
- objective_fad02_00018: FileNotFoundError('Could not fetch metric

RuntimeError: No best trial found for the given metric: val_loss. This means that no trial has reported this metric, or all values reported for this metric are NaN. To not ignore NaN values, you can set the `filter_nan_and_inf` arg to False.