In [2]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import category_encoders as ce
from matplotlib import pyplot as plt
import seaborn as sns
import tensorflow 
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt
from matplotlib.ticker import MaxNLocator


import os

from keras import backend as K



from keras.utils.vis_utils import plot_model
import pickle

2023-07-23 14:37:22.227518: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
#DATAFRAMES PER ORIGIN

#function creating a dictionary of dataframes 
#from an initial dataframe.
#Each dataframe corresponds to a specific value 
#of variable ORIGIN
#The dataframe is created only if it has 
#at least a desired number of entries 
#since there may be "origins" associated with too few entries

#it returns di_dataframes_per_origin where
#key = name of the origin airport
#value = the related dataframe, sorted by
#'FL_DATE' and then 'DEP_TIME'

def fct_create_di_dataframes_per_origin(
    val_dataframe,
    val_name_col_origin="ORIGIN",
    val_name_col_1_sort='FL_DATE',
    val_name_col_2_sort='DEP_TIME',
    val_admissible_min_nb_observatios=80):
    """Split a dataframe into one sorted dataframe per origin value.

    Parameters
    ----------
    val_dataframe : pandas.DataFrame
        Source dataframe holding every origin.
    val_name_col_origin : str
        Column whose distinct values define the groups.
    val_name_col_1_sort, val_name_col_2_sort : str
        Primary and secondary sort columns applied to each group.
    val_admissible_min_nb_observatios : int
        Groups with fewer rows than this are left out of the result.

    Returns
    -------
    dict
        key = origin value, value = the rows of that origin sorted by the
        two sort columns. Keys follow the order of first appearance of each
        origin in ``val_dataframe``.
    """
    origin_column = val_dataframe[val_name_col_origin]
    sort_columns = [val_name_col_1_sort, val_name_col_2_sort]

    di_dataframes_per_origin = {}

    for origin in origin_column.unique().tolist():
        rows_of_origin = val_dataframe[origin_column == origin]

        # origins with too few observations are ignored
        if rows_of_origin.shape[0] < val_admissible_min_nb_observatios:
            continue

        di_dataframes_per_origin[origin] = rows_of_origin.sort_values(sort_columns)

    return di_dataframes_per_origin

In [6]:

#read DataFrame from pickle file
#NOTE(review): unpickling can execute arbitrary code, so "my_df_2009_1.pkl"
#should only be loaded when it comes from a trusted source.
df_2009_1= pd.read_pickle("my_df_2009_1.pkl")

#split the full dataframe into one dataframe per ORIGIN value; with the
#defaults, an origin is kept only if it has at least 80 rows and each
#dataframe is sorted by FL_DATE and then DEP_TIME
di=fct_create_di_dataframes_per_origin(
val_dataframe=df_2009_1)

#display the origin codes that passed the minimum-row threshold
list(di.keys())

['IAD',
 'SEA',
 'LAX',
 'ORD',
 'TPA',
 'DCA',
 'LGA',
 'OAK',
 'BDL',
 'PDX',
 'BIL',
 'BOS',
 'PHL',
 'LAS',
 'DTW',
 'DEN',
 'OMA',
 'RDU',
 'SFO',
 'ONT',
 'SMF',
 'MSP',
 'GEG',
 'PHX',
 'MCO',
 'MSY',
 'RNO',
 'JFK',
 'BUF',
 'DSM',
 'SAN',
 'MCI',
 'MDT',
 'CLE',
 'OKC',
 'PVD',
 'FSD',
 'IAH',
 'EWR',
 'BWI',
 'BTV',
 'ROC',
 'ICT',
 'MIA',
 'RIC',
 'TUL',
 'HNL',
 'DFW',
 'JAC',
 'PIT',
 'AUS',
 'SNA',
 'SJC',
 'BOI',
 'ALB',
 'DAY',
 'GRR',
 'CMH',
 'HDN',
 'SLC',
 'SJU',
 'EGE',
 'ABQ',
 'SAT',
 'ATL',
 'CLT',
 'RSW',
 'MHT',
 'KOA',
 'OGG',
 'LIH',
 'PSP',
 'STL',
 'TUS',
 'STT',
 'IND',
 'JAX',
 'CVG',
 'BZN',
 'RAP']

In [3]:
#FUNCTION  creating the list targets for a single dataframe 

def fct_create_li_target_single_df(
        val_dataframe,
        val_target_column_name="ARR_DELAY"):
    """Return the values of the target column of a single dataframe.

    Parameters
    ----------
    val_dataframe : pandas.DataFrame
        Dataframe containing the target column.
    val_target_column_name : str
        Name of the target column.

    Returns
    -------
    The ``.values`` of the target column; the i-th element is the target
    of the i-th row of ``val_dataframe``.
    """
    target_column = val_dataframe[val_target_column_name]
    return target_column.values

In [4]:
# VECTORIZATION  -  CATEGORICAL VARIABLES

#function which vectorizes the object-type columns of a single dataframe
#IT RETURNS THE VECTORIZED DATAFRAME
#the encoder employed is
#LeaveOneOutEncoder, which is a target-based encoder
#the advantage of this encoder is that
#it doesn't increase the number of variables,
#as it doesn't create additional columns

#this function vectorizes the  dataframe 
#which is passed as value to the argument val_dataframe

def fct_vectorization_obj_cols_single_df(
    val_dataframe,
    val_li_name_cols_to_ignore=['FL_DATE','ORIGIN'],
    val_li_name_cols_to_copy=['DEST'],
    val_target_column_name='ARR_DELAY'):
    """Encode the non-numeric columns of a dataframe with LeaveOneOutEncoder.

    For every column listed in ``val_li_name_cols_to_copy`` an untouched copy
    named ``<column>_1`` is added first, so that the original labels stay
    available for verification. Every non-numeric column that is neither
    ignored nor one of those copies is then replaced by its encoding.

    Parameters
    ----------
    val_dataframe : pandas.DataFrame
        Dataframe to vectorize. It is modified IN PLACE (copy columns are
        added, encoded columns are overwritten) and also returned.
    val_li_name_cols_to_ignore : list of str
        Columns that must not be encoded. This list is never modified.
    val_li_name_cols_to_copy : list of str
        Columns duplicated as ``<column>_1`` before encoding.
    val_target_column_name : str
        Target column handed to the target-based encoder.

    Returns
    -------
    pandas.DataFrame
        The same ``val_dataframe`` object, vectorized.
    """
    # Work on a private copy: appending to the argument itself would modify
    # the caller's list and, when the default is used, the shared default
    # list, which would then grow on every call.
    cols_to_ignore = list(val_li_name_cols_to_ignore)

    for col_name in val_li_name_cols_to_copy:
        copy_name = col_name + '_1'
        val_dataframe[copy_name] = val_dataframe[col_name]
        cols_to_ignore.append(copy_name)

    # the object type columns to vectorize
    li_obj_cols = [
        col_name
        for col_name in val_dataframe.select_dtypes(exclude=np.number).columns.tolist()
        if col_name not in cols_to_ignore]

    # nothing to encode: return the dataframe as is instead of calling the
    # encoder on an empty column selection
    if not li_obj_cols:
        return val_dataframe

    # NOTE(review): the encoder is fitted on every row of the dataframe, so
    # rows later used for validation/test contribute their target to the
    # encoding - confirm this is acceptable for the evaluation protocol.
    encoder = ce.LeaveOneOutEncoder(return_df=True)

    val_dataframe[li_obj_cols] = encoder.fit_transform(
        val_dataframe[li_obj_cols],
        val_dataframe[val_target_column_name])

    return val_dataframe


In [5]:
def fct_measure_mean_observations_per_day_single_df(
    val_name_df,
    val_name_column_date='FL_DATE'):
    """Return the rounded mean number of rows per calendar day.

    Parameters
    ----------
    val_name_df : pandas.DataFrame
        Dataframe with one row per observation.
    val_name_column_date : str
        Column holding the date of each observation (anything accepted by
        ``pd.to_datetime``).

    Returns
    -------
    int
        Mean of the per-day row counts, rounded with the built-in ``round``.
    """
    # truncate every timestamp to its day, then count the rows of each day
    observation_days = pd.to_datetime(val_name_df[val_name_column_date]).dt.floor('d')
    rows_per_day = observation_days.value_counts()

    return round(rows_per_day.mean())
    

In [6]:
#function standardizing training samples of single dataframe 
#it returns 
#[standardized data array, number_train_samples,
#number_val_samples,number_test_samples,mean,std] 

def fct_stand_single_df(
    val_name_df,
    val_proportion_train_set=0.5,
    val_proportion_val_set=0.25):
    """Standardize the numerical columns of a dataframe with training statistics.

    The rows are split chronologically (in their current order): the first
    ``val_proportion_train_set`` share is the training part, the following
    ``val_proportion_val_set`` share the validation part and the remainder
    the test part. Mean and standard deviation are computed on the training
    rows only and applied to every row.

    Parameters
    ----------
    val_name_df : pandas.DataFrame
        Dataframe to standardize. Only its numerical columns are used (the
        target column is kept if it is numerical). It is not modified.
    val_proportion_train_set : float
        Share of rows used as training data.
    val_proportion_val_set : float
        Share of rows used as validation data.

    Returns
    -------
    list
        ``[standardized array, number of train samples, number of validation
        samples, number of test samples, mean, std]`` where ``mean`` and
        ``std`` are per-column training statistics.

    Raises
    ------
    ValueError
        If the training part contains no row, since no statistics can be
        computed from it.
    """
    # we extract the numerical columns
    li_num_cols = val_name_df.select_dtypes(include=np.number).columns.tolist()

    # Explicit copy: the in-place operations below must never write through
    # to the dataframe (to_numpy may otherwise hand back a view of its data).
    raw_data = val_name_df[li_num_cols].to_numpy(copy=True)

    # In-place float arithmetic is impossible on an integer array (all
    # numerical columns being integers), so convert to float in that case.
    if raw_data.dtype.kind != 'f':
        raw_data = raw_data.astype(np.float64)

    nb_rows = len(raw_data)
    num_train_samples = int(val_proportion_train_set * nb_rows)
    num_val_samples = int(val_proportion_val_set * nb_rows)
    num_test_samples = nb_rows - num_train_samples - num_val_samples

    if num_train_samples == 0:
        raise ValueError(
            "the training split is empty: cannot compute the mean and "
            "standard deviation used for standardization")

    mean = raw_data[:num_train_samples].mean(axis=0)
    raw_data -= mean

    std = raw_data[:num_train_samples].std(axis=0)
    # constant columns: replace a zero std by 0.0001 to avoid dividing by zero
    std[std == 0] = 0.0001
    raw_data /= std

    return [raw_data, num_train_samples, num_val_samples, num_test_samples, mean, std]
        
    
    

In [7]:
#function examining the existence of missing (null) values in a list of dataframes
#it returns 0 if no dataframe has missing values and 1 as soon as one dataframe has some
#and also prints a message

def fct_examine_existence_null_vals_dataframe(
    val_li_dataframes):
    """Report whether any dataframe of a list contains missing values.

    The scan stops at the first dataframe with at least one null value; a
    message and the head of that dataframe are printed. When no dataframe
    has null values, a confirmation message is printed instead.

    Parameters
    ----------
    val_li_dataframes : iterable of pandas.DataFrame
        Dataframes to inspect.

    Returns
    -------
    int
        1 if a dataframe with missing values was found, 0 otherwise.
    """
    for current_df in val_li_dataframes:
        nb_missing = current_df.isnull().sum().sum()

        if nb_missing != 0:
            print("Missing values for current dataframe")
            print(current_df.head())
            return 1

    print("No missing values in any dataframe of the considered list")
    print()

    return 0


In [8]:
def fct_plot_correlation(
    v_data,
    v_folder_figure,
    v_name_fig="fig_cor_vector_data_heatmap.png"):
    """Save a heatmap of the correlation matrix of the numerical columns.

    Parameters
    ----------
    v_data : pandas.DataFrame
        Data whose numerical columns are correlated.
    v_folder_figure : str
        Folder receiving the figure; it is created when it does not exist.
    v_name_fig : str
        File name of the saved figure.

    Notes
    -----
    ``sns.set(font_scale=1)`` is called, which changes the global seaborn /
    matplotlib style for the rest of the session. The figure is always
    closed, even when plotting or saving fails.
    """
    # computed once and reused for both the mask and the heatmap
    corr_matrix = v_data.corr(numeric_only=True)

    # saving into a missing folder would fail only after all the plotting work
    if v_folder_figure:
        os.makedirs(v_folder_figure, exist_ok=True)

    fig = plt.figure(figsize=(14, 7))
    try:
        # custom diverging palette
        cmap = sns.diverging_palette(145, 300, s=60, as_cmap=True)

        # mask hiding the strict upper triangle (the diagonal stays visible)
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        np.fill_diagonal(mask, False)

        # scale all fonts in the legend and on the axes
        sns.set(font_scale=1)

        heatmap = sns.heatmap(
            corr_matrix, vmin=-1, vmax=1, annot=True,
            annot_kws={'fontsize': 8},
            linewidths=0.5, linecolor='m',
            cmap=cmap, mask=mask)
        heatmap.set_title(
            'Correlation coeffcient matrix',
            fontdict={'fontsize':15}, color="darkmagenta", weight='bold', pad=12)
        plt.yticks(rotation=30, fontsize=8.5, fontweight="bold", color="darkmagenta")
        plt.xticks(rotation=20, fontsize=8.3, fontweight="bold", color="darkmagenta")
        plt.savefig(v_folder_figure+"/"+v_name_fig)
    finally:
        # release the figure even if plotting or saving raised
        plt.close(fig)
    #plt.show()

In [9]:


#OUTPUT OF THE FUNCTION

#it returns 

#[rep_stand_arrays,
#val_li_targets_df,
#val_mean_observations_per_day]
#(mean and std are the last two elements of rep_stand_arrays)



#rep_stand_arrays=[standardized data array, 
#number_train_samples ,
#number_val_samples,list_number_test_samples,
#mean,std]

#standardized data array= arrays, 

#number_train_samples=  the number of training samples

#number_val_samples=  the number of validation samples


#number_test_samples= the number of test samples

#Model 2= the model where we have a single dataframe with all the origins


def fct_creation_data_arrays_model_2(
    var_dataframe,
    var_folder_figure,
    var_name_col_origin="ORIGIN",
    var_li_name_col_to_copy=['DEST'],
    var_name_column_date='FL_DATE',
    var_name_col_2_sort='DEP_TIME',
    var_name_target_variable="ARR_DELAY",
    var_li_cols_to_ignore=['FL_DATE','ORIGIN'],
    var_proportion_train_set=0.5,
    var_proportion_val_set=0.25,
    var_name_fig="fig_cor_vector_data_heatmap.png"
):
    """Prepare the arrays of "model 2" (one dataframe holding every origin).

    Steps: null-value check, extraction of the targets, vectorization of the
    categorical columns, correlation heatmap, mean number of observations per
    day, and standardization with training-set statistics.

    Parameters
    ----------
    var_dataframe : pandas.DataFrame
        Input data. It is handed to the vectorization helper, which returns
        the dataframe it was given after modifying it.
    var_folder_figure : str
        Folder in which the correlation heatmap is saved.
    var_name_col_origin, var_name_col_2_sort : str
        Accepted for interface compatibility; not used in this function.
    var_li_name_col_to_copy : list of str
        Columns kept as an extra ``<column>_1`` copy before vectorization.
    var_name_column_date : str
        Date column used to count observations per day.
    var_name_target_variable : str
        Name of the target column.
    var_li_cols_to_ignore : list of str
        Columns excluded from vectorization.
    var_proportion_train_set, var_proportion_val_set : float
        Shares of rows used as training and validation data.
    var_name_fig : str
        File name of the correlation heatmap.

    Returns
    -------
    list
        ``[rep_stand_arrays, val_li_targets_df, val_mean_observations_per_day]``
        with ``rep_stand_arrays = [standardized data array, number of train
        samples, number of validation samples, number of test samples, mean,
        std]``.
    """
    # we examine null values: 0/1 = no missing / missing values
    val_existence_missing_values = fct_examine_existence_null_vals_dataframe(
        val_li_dataframes=[var_dataframe])

    # missing values are expected to have been handled before this call,
    # so they are only reported here
    if val_existence_missing_values == 1:
        print("IN FCT fct_creation_data_arrays_model_2,        EXISTENCE MISSING VALUES :",
              val_existence_missing_values == 1)

    # the targets are read before the vectorization step
    val_li_targets_df = fct_create_li_target_single_df(
        val_dataframe=var_dataframe,
        val_target_column_name=var_name_target_variable)

    # VECTORIZATION - CATEGORICAL VARIABLES
    # every object column except the ignored ones is vectorized; the columns
    # of var_li_name_col_to_copy are additionally kept as <column>_1 so the
    # original label can still be looked up by name.
    # Copies of the two lists are passed so that a helper appending to its
    # arguments cannot alter the caller's lists (or the shared defaults).
    val_vectorized_df = fct_vectorization_obj_cols_single_df(
        val_dataframe=var_dataframe,
        val_li_name_cols_to_ignore=list(var_li_cols_to_ignore),
        val_li_name_cols_to_copy=list(var_li_name_col_to_copy))

    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly instead of printing its return value
    print("information vectorized df:")
    val_vectorized_df.info()
    print()

    # PLOT CORRELATION
    fct_plot_correlation(
        v_data=val_vectorized_df,
        v_folder_figure=var_folder_figure,
        v_name_fig=var_name_fig)

    # DATA EXAMINATION
    # mean number of observations per day
    val_mean_observations_per_day = fct_measure_mean_observations_per_day_single_df(
        val_name_df=val_vectorized_df,
        val_name_column_date=var_name_column_date)

    print("mean number of observations for the vectorized dataframe: ",
          val_mean_observations_per_day)
    print()

    # PREPARING THE DATA
    # Problem formulation: given data covering the previous x(=1) days, can
    # we predict the arrival delay of the next x(=1) day?
    #
    # Standardization rescales each variable to mean 0 and standard
    # deviation 1; normalization rescales each variable to the range [0, 1].
    # Standardization suits roughly normal quantities, normalization the
    # others; when values are already small with a std near 1, scaling may
    # be unnecessary. If resources allow, compare raw, standardized and
    # normalized data.
    #
    # The first num_train_samples rows are the training data, so the
    # statistics are computed on that fraction only.
    rep_stand_arrays = fct_stand_single_df(
        val_name_df=val_vectorized_df,
        val_proportion_train_set=var_proportion_train_set,
        val_proportion_val_set=var_proportion_val_set)

    # rep_stand_arrays = [array, nb train, nb validation, nb test, mean, std]
    print("total number of rows in the standardized array: ",
          len(rep_stand_arrays[0]))
    print("number of training samples: ", rep_stand_arrays[1])
    print("number of validation samples: ", rep_stand_arrays[2])
    print("number of test samples: ", rep_stand_arrays[3])
    print("**************************")
    print()

    return [rep_stand_arrays,
            val_li_targets_df,
            val_mean_observations_per_day]

In [10]:
#create train, validation and test sets


#function returning the train, validation and test dataset

#we will use s_length_train timesteps to predict the next
#s_length_target timesteps

#data =data array,
#target= target array,
#num_train_samples= the number of training samples  (of the data),
#num_val_samples= the number of validation samples,
#num_test_samples = the number of test samples


def _fct_zip_window_pairs(
    data,
    target,
    s_length_train,
    s_length_target,
    s_stride,
    b_size,
    start_index,
    end_index,
    shuffle_inputs,
    shuffle_targets,
    split_name):
    """Build the zipped (input windows, target windows) dataset of one split.

    The split covers the rows from ``start_index`` up to ``end_index``
    (``None`` = up to the end of the arrays). Input windows hold
    ``s_length_train`` rows of ``data``; the paired target window holds the
    ``s_length_target`` values of ``target`` that follow it, so the last
    ``s_length_target`` rows of the split are never used as input rows.
    """
    # Inputs and targets are produced by two separate datasets and paired by
    # position: shuffling only one of them, or shuffling both independently,
    # would associate each window with the wrong target.
    if bool(shuffle_inputs) != bool(shuffle_targets):
        raise ValueError(
            "the shuffle flags of the " + split_name + " inputs and targets "
            "differ: the zipped (input, target) pairs would not match")

    shuffle_pairs = bool(shuffle_inputs)

    # One seed shared by both datasets so that they are shuffled in the same
    # order. NOTE(review): this relies on timeseries_dataset_from_array being
    # deterministic for a given seed - confirm with the installed version.
    shared_seed = int(np.random.randint(1000000)) if shuffle_pairs else None

    last_row = data.shape[0] if end_index is None else end_index

    input_dataset = keras.preprocessing.timeseries_dataset_from_array(
        data, None,
        sequence_length=s_length_train,
        sequence_stride=s_stride,
        start_index=start_index,
        end_index=last_row - s_length_target,
        batch_size=b_size,
        shuffle=shuffle_pairs,
        seed=shared_seed)

    target_dataset = keras.preprocessing.timeseries_dataset_from_array(
        target, None,
        sequence_length=s_length_target,
        sequence_stride=s_stride,
        start_index=start_index + s_length_train,
        end_index=end_index,
        batch_size=b_size,
        shuffle=shuffle_pairs,
        seed=shared_seed)

    return tensorflow.data.Dataset.zip((input_dataset, target_dataset))


def fct_create_train_val_test_datasets_from_arrays(
    s_length_train,
    s_length_target,
    s_stride,
    b_size,
    data,
    target,
    num_train_samples,
    num_val_samples,
    shuffle_tr_s,
    shuffle_tr_t,
    shuffle_v_s,
    shuffle_v_t,
    shuffle_t_s,
    shuffle_t_t):
    """Create the train, validation and test window datasets.

    ``s_length_train`` timesteps of ``data`` are used to predict the next
    ``s_length_target`` timesteps of ``target``. The rows are split in order:
    the first ``num_train_samples`` rows form the training part, the next
    ``num_val_samples`` rows the validation part and the remaining rows the
    test part. Windows never cross a split boundary.

    Parameters
    ----------
    s_length_train : int
        Number of timesteps in each input window.
    s_length_target : int
        Number of timesteps in each target window.
    s_stride : int
        Stride between the starts of two consecutive windows.
    b_size : int
        Batch size of the datasets.
    data : array
        Input array, one row per timestep.
    target : array
        Target array; assumed to have one entry per row of ``data``.
    num_train_samples, num_val_samples : int
        Number of rows of the training and validation parts.
    shuffle_tr_s, shuffle_tr_t : bool
        Shuffle flags of the training inputs / targets.
    shuffle_v_s, shuffle_v_t : bool
        Shuffle flags of the validation inputs / targets.
    shuffle_t_s, shuffle_t_t : bool
        Shuffle flags of the test inputs / targets.

    Returns
    -------
    tuple
        ``(train_dataset, val_dataset, test_dataset)``; each dataset pairs a
        batch of input windows with the batch of matching target windows.
        ``val_dataset`` is ``None`` when ``num_val_samples`` is not positive.

    Raises
    ------
    ValueError
        If the two shuffle flags of a split that is built differ.
    """
    train_end = num_train_samples
    val_end = num_train_samples + num_val_samples

    train_dataset = _fct_zip_window_pairs(
        data, target, s_length_train, s_length_target, s_stride, b_size,
        start_index=0,
        end_index=train_end,
        shuffle_inputs=shuffle_tr_s,
        shuffle_targets=shuffle_tr_t,
        split_name="train")

    # we create the validation set only if num_val_samples>0
    if num_val_samples > 0:
        val_dataset = _fct_zip_window_pairs(
            data, target, s_length_train, s_length_target, s_stride, b_size,
            start_index=train_end,
            end_index=val_end,
            shuffle_inputs=shuffle_v_s,
            shuffle_targets=shuffle_v_t,
            split_name="validation")
    else:
        val_dataset = None

    # the test part runs to the end of the arrays
    test_dataset = _fct_zip_window_pairs(
        data, target, s_length_train, s_length_target, s_stride, b_size,
        start_index=val_end,
        end_index=None,
        shuffle_inputs=shuffle_t_s,
        shuffle_targets=shuffle_t_t,
        split_name="test")

    return train_dataset, val_dataset, test_dataset


In [11]:
#fct returning
#train_dataset,
#val_dataset,
#test_dataset,
#v_s_length_train,
#stand_array.shape[-1] (nb feaures)
#mean value of train data
#std f train data
#num train data
#num validation data
#sequence length target 


#va_dataframe=df_2009_1
#va_folder_figure=the folder with the plots

#if val_s_length_train=None then

#val_s_length_train=
#val_nb_past_seq_lengths x mean nb observations per day

#if val_s_length_target=None then
#val_s_length_target=
#val_nb_future_seq_lengths x mean nb observations per day

#we  learn from val_s_length_train tsteps to predict
#val_s_length_target




def fct_create_train_val_test_datasets_from_dataframe(
    val_dataframe,
    val_s_stride,
    val_b_size,
    val_folder_figure,
    val_nb_past_seq_lengths=1,
    val_nb_future_seq_lengths=1,
    val_s_length_train=None,
    val_s_length_target=None,
    val_name_col_origin="ORIGIN",
    val_li_name_col_to_copy=['DEST'],
    val_name_column_date='FL_DATE',
    val_name_col_2_sort='DEP_TIME',
    val_name_target_variable="ARR_DELAY",
    val_li_cols_to_ignore=['FL_DATE','ORIGIN'],
    val_proportion_train_set=0.5,
    val_proportion_val_set=0.25,
    val_name_fig="fig_cor_vector_data_heatmap.png",
    val_shuffle_tr_s=False,
    val_shuffle_tr_t=False,
    val_shuffle_v_s=False,
    val_shuffle_v_t=False,
    val_shuffle_t_s=False,
    val_shuffle_t_t=False):
    """Create the train, validation and test datasets from a dataframe.

    The dataframe is turned into a standardized array and a target array by
    ``fct_creation_data_arrays_model_2``; windows are then built by
    ``fct_create_train_val_test_datasets_from_arrays``.

    Window lengths: when ``val_s_length_train`` is ``None`` the input window
    length is ``val_nb_past_seq_lengths`` x the mean number of observations
    per day (rounded); when ``val_s_length_target`` is ``None`` the target
    window length is ``val_nb_future_seq_lengths`` x that same mean (rounded).

    Parameters
    ----------
    val_dataframe : pandas.DataFrame
        Input data.
    val_s_stride : int
        Stride between consecutive windows.
    val_b_size : int
        Batch size.
    val_folder_figure : str
        Folder of the correlation plot.
    val_nb_past_seq_lengths, val_nb_future_seq_lengths : number
        Multipliers of the mean number of observations per day.
    val_s_length_train, val_s_length_target : int or None
        Explicit window lengths; ``None`` = derive them as described above.
    val_shuffle_* : bool
        Shuffle flags forwarded to the window-building function.
    (the remaining parameters are forwarded unchanged to
    ``fct_creation_data_arrays_model_2``)

    Returns
    -------
    tuple
        ``(train_dataset, val_dataset, test_dataset, input window length,
        number of columns of the standardized array, mean of the train data,
        std of the train data, number of train samples, number of validation
        samples, target window length)``.

    Raises
    ------
    ValueError
        If a resulting window length is smaller than 1.
    """
    # li_rep_stand_data = [rep_stand_arrays, targets, mean observations per day]
    # rep_stand_arrays = [standardized array, nb train, nb validation,
    #                     nb test, mean, std]
    # Copies of the list arguments are forwarded so that a helper appending
    # to them cannot modify the caller's lists or the shared defaults.
    li_rep_stand_data = fct_creation_data_arrays_model_2(
        var_dataframe=val_dataframe,
        var_folder_figure=val_folder_figure,
        var_name_col_origin=val_name_col_origin,
        var_li_name_col_to_copy=list(val_li_name_col_to_copy),
        var_name_column_date=val_name_column_date,
        var_name_col_2_sort=val_name_col_2_sort,
        var_name_target_variable=val_name_target_variable,
        var_li_cols_to_ignore=list(val_li_cols_to_ignore),
        var_proportion_train_set=val_proportion_train_set,
        var_proportion_val_set=val_proportion_val_set,
        var_name_fig=val_name_fig)

    rep_stand_arrays = li_rep_stand_data[0]
    li_targets = li_rep_stand_data[1]
    # already measured on the same date column by the call above, so it is
    # reused instead of scanning the dataframe again
    mean_observ_per_day = li_rep_stand_data[2]

    stand_array = rep_stand_arrays[0]

    # past timesteps used for learning
    if val_s_length_train is None:
        v_s_length_train = round(val_nb_past_seq_lengths * mean_observ_per_day)
    else:
        v_s_length_train = val_s_length_train

    # future timesteps to estimate
    if val_s_length_target is None:
        v_s_length_target = round(mean_observ_per_day * val_nb_future_seq_lengths)
    else:
        v_s_length_target = val_s_length_target

    if v_s_length_train < 1 or v_s_length_target < 1:
        raise ValueError(
            "window lengths must be at least 1, got "
            "s_length_train=" + str(v_s_length_train) +
            ", s_length_target=" + str(v_s_length_target))

    # we define the train, validation and test sets
    train_dataset, val_dataset, test_dataset = \
        fct_create_train_val_test_datasets_from_arrays(
            s_length_train=v_s_length_train,
            s_length_target=v_s_length_target,
            s_stride=val_s_stride,
            b_size=val_b_size,
            data=stand_array,
            target=li_targets,
            num_train_samples=rep_stand_arrays[1],
            num_val_samples=rep_stand_arrays[2],
            shuffle_tr_s=val_shuffle_tr_s,
            shuffle_tr_t=val_shuffle_tr_t,
            shuffle_v_s=val_shuffle_v_s,
            shuffle_v_t=val_shuffle_v_t,
            shuffle_t_s=val_shuffle_t_s,
            shuffle_t_t=val_shuffle_t_t)

    return (train_dataset, val_dataset, test_dataset,
            v_s_length_train, stand_array.shape[-1],
            rep_stand_arrays[4], rep_stand_arrays[5],
            rep_stand_arrays[1], rep_stand_arrays[2],
            v_s_length_target)
    
    
    
    

In [12]:

#function which builds a model for the hyperparameters hp
#(the best hp)
#and finds the best number of epochs to retrain the best model
#it returns the best epoch and the history object obtained when searching
#the best epoch of the best model

#this function takes the batch size as an argument but,
#since the data are already provided in batches,
#it is not used in the
#fit method. To examine whether we should use it


def fct_get_best_epoch(
    va_hp,
    va_tuner,
    va_train_dataset,
    va_val_dataset,
    va_batch_size=None,
    va_metric_to_monitor_best_epoch_callbacks="val_loss",
    va_mode_callbacks="min",
    va_patience_best_epoch_callbacks=10,
    va_epochs=2):
    """Train a model built from ``va_hp`` and find its best epoch.

    The model is built with ``va_tuner.hypermodel.build(va_hp)`` and fitted on
    the training dataset with an EarlyStopping callback on the monitored
    metric. The best epoch is the (1-based) epoch with the best value of that
    metric: the smallest value for mode "min", the largest for mode "max".

    Parameters
    ----------
    va_hp :
        Hyperparameters handed to ``va_tuner.hypermodel.build``.
    va_tuner :
        Tuner whose hypermodel builds the model.
    va_train_dataset, va_val_dataset :
        Training data and validation data handed to ``model.fit``.
    va_batch_size :
        Not used: the data are already provided in batches.
    va_metric_to_monitor_best_epoch_callbacks : str
        Key of ``history.history`` to monitor (a validation metric).
    va_mode_callbacks : str
        "min", "max" or "auto"; direction in which the metric improves.
    va_patience_best_epoch_callbacks : int
        Patience of the EarlyStopping callback.
    va_epochs : int
        Maximum number of epochs.

    Returns
    -------
    tuple
        ``(best_epoch, history)`` where ``history`` is the object returned by
        ``model.fit``.
    """
    print(" IN FCT BEST_EPOCH, WE START SEARCH BEST EPOCH FOR HP")

    model = va_tuner.hypermodel.build(va_hp)

    print("In fct best_epoch, va_metric_to_monitor_best_epoch_callbacks",
          va_metric_to_monitor_best_epoch_callbacks)
    print()

    # An aggressive patience saves time during the hyperparameter search but
    # may yield under-fit models, so the validation set is used here to find
    # the number of epochs to train for.
    callbacks = [
        keras.callbacks.EarlyStopping(
            monitor=va_metric_to_monitor_best_epoch_callbacks,
            mode=va_mode_callbacks,
            patience=va_patience_best_epoch_callbacks)
    ]

    # model.fit returns a History object; history.history maps each metric
    # name to the list of its values, one per epoch
    history = model.fit(
        va_train_dataset,
        validation_data=va_val_dataset,
        epochs=va_epochs,
        callbacks=callbacks)

    # values of the monitored metric, one per epoch
    val_loss_per_epoch = history.history[va_metric_to_monitor_best_epoch_callbacks]

    print("In fct best_epoch, val_loss_per_epoch-result history:", val_loss_per_epoch)
    print()

    # The direction of "best" has to follow the callback mode: taking the
    # minimum of a metric that must be maximised would select the worst epoch.
    # NOTE(review): for "auto", the direction is inferred from the metric
    # name suffix - confirm this matches the installed Keras behaviour.
    metric_is_maximised = (
        va_mode_callbacks == "max"
        or (va_mode_callbacks == "auto"
            and va_metric_to_monitor_best_epoch_callbacks.endswith(
                ("acc", "accuracy", "auc"))))

    if metric_is_maximised:
        best_value = max(val_loss_per_epoch)
    else:
        best_value = min(val_loss_per_epoch)

    best_epoch = val_loss_per_epoch.index(best_value) + 1

    print(f"Best epoch: {best_epoch}")
    print()
    print("END FCT BEST EPOCH")

    return best_epoch, history

In [13]:
#function which creates a model from a given set of hyperparameters
#finds the best number of epochs
#retraines the model
#returns the best trained model 

#va_train_dataset= the train dataset
#va_new_train=the train +the validation dataset

def fct_get_best_trained_model(
    va_hp,
    va_tuner,
    va_train_dataset,
    va_new_train,
    va_val_dataset,
    va_index_for_saving_best_model,
    va_to_multiply_epoch_for_train_dur,
    va_batch_size=None,
    va_metric_to_monitor_best_epoch_callbacks="val_loss",
    va_mode_callbacks="min",
    va_patience_best_epoch_callbacks=10,
    va_epochs=2,
    va_pkl_filename_best_model = "best_model",
    va_pkl_filename_best_retrained_model=\
    "history_obj_best_retrained_model"
    ):
    """Build a model from ``va_hp``, find its best epoch and retrain it.

    Steps:
    1. ``fct_get_best_epoch`` finds the best number of epochs using the train
       and validation datasets.
    2. A fresh model is built from ``va_hp`` and pickled to
       ``<va_pkl_filename_best_model>_<index>.pkl``.
    3. That model is fitted on ``va_new_train`` (train + validation data) for
       ``int(best_epoch * va_to_multiply_epoch_for_train_dur)`` epochs.
    4. The object returned by that fit is pickled to
       ``<va_pkl_filename_best_retrained_model>_<index>.pkl``.

    Returns
    -------
    tuple
        ``(model, history of the best-epoch search, history of the retraining)``.
    """
    print("WE START FCT BEST_TRAINED_MODEL BY SEARCHING BEST EPOCH")

    # step 1: best epoch for the hyperparameters hp
    best_epoch, history_when_search_best_epoch = fct_get_best_epoch(
        va_hp=va_hp,
        va_tuner=va_tuner,
        va_train_dataset=va_train_dataset,
        va_val_dataset=va_val_dataset,
        va_batch_size=va_batch_size,
        va_metric_to_monitor_best_epoch_callbacks=va_metric_to_monitor_best_epoch_callbacks,
        va_mode_callbacks=va_mode_callbacks,
        va_patience_best_epoch_callbacks=va_patience_best_epoch_callbacks,
        va_epochs=va_epochs)

    # step 2: fresh model, saved before any retraining
    model = va_tuner.hypermodel.build(va_hp)

    file_suffix = "_" + str(va_index_for_saving_best_model) + ".pkl"

    # NOTE(review): the model is pickled right after being built, i.e. before
    # the fit below - confirm that saving the untrained model is intended.
    with open(va_pkl_filename_best_model + file_suffix, 'wb') as model_file:
        pickle.dump(model, model_file)

    print("IN FCT BEST_TRAINED MODEL "
          "        WE CREATED MODEL WITH BEST HP - WE WILL START RETRAIN IT"
          "        FOR A LITTLE LONGER THAN THE BEST NUMBER OF EPOCHS "
          "        IN TRAIN+VAL SET")

    # step 3: retrain a little longer than the best number of epochs, since
    # there is more data (va_new_train = train + validation); no validation
    # set is used here
    nb_epochs_retrain = int(best_epoch * va_to_multiply_epoch_for_train_dur)
    history_retrained_model = model.fit(
        va_new_train,
        epochs=nb_epochs_retrain)

    # step 4: save the history object of the retrained model
    with open(va_pkl_filename_best_retrained_model + file_suffix, 'wb') as history_file:
        pickle.dump(history_retrained_model, history_file)

    print("IN FCT BEST_TRAINED_MODEL END RETRAINED MODEL-END FCT")

    return model, history_when_search_best_epoch, history_retrained_model
    

In [14]:
#method searching for the best model using a BO tuner

#1.we find the best hyperparameters using a BO tuner
#2.we find the best number of epochs 
#3.we  return the best trained model

#in tuners we should Always 
#specify validation metrics, 
#since the goal of the search process 
#is to find models that generalize
#so 
#val_objective_metric_for_tuner_to_optimize,
#val_metric_for_tuner_callback,
#val_monitor_callbacks
#are metrics for the validation dataset

#val_max_trials=Maximum number of different model configurations 
#(“trials”) to try before ending the search

#To reduce metrics variance, we can  train the
#same model multiple times and average the results. 
#val_executions_per_trial is how many 
#training rounds (executions) to run 
#for each model configuration (trial).

#val_directory=Where
#to store search logs

#val_overwrite=Whether to overwrite data in directory 
#to start a new search.

def fct_search_best_model_using_tuner(
    val_di_tuners,
    val_key_tuner_class,
    val_hypermodel,
    val_objective_metric_for_tuner_to_optimize,
    val_mode,
    val_max_trials,
    val_executions_per_trial,
    val_directory,
    val_metric_for_tuner_search_hp_callback,
    val_li_keys_tuners_optimizing_batch_size,
    val_train_dataset,
    val_val_dataset,
    val_test_dataset,
    val_epochs_tuner_search,
    val_top_best_models,
    val_batch_size,
    val_to_multiply_epoch_for_train_dur,
    val_metric_to_monitor_best_epoch_callbacks,
    val_epochs_best_trained_model_search,
    val_overwrite=True,
    val_patience_during_tuner_search=5,
    val_verbose=2,
    val_mode_callbacks="min",
    val_patience_best_epoch_callbacks=10,
    val_pkl_filename_best_model="best_model",
    val_pkl_filename_best_retrained_model="history_obj_best_retrained_model"):
    """Search hyperparameters with a tuner, then retrain and test the best ones.

    Steps performed:
      1. Instantiate ``val_di_tuners[val_key_tuner_class]`` with the
         hypermodel, a ``kt.Objective`` built from
         ``val_objective_metric_for_tuner_to_optimize`` / ``val_mode``, and the
         trial/directory/overwrite settings.
      2. Run ``tuner.search`` on ``val_train_dataset`` with
         ``val_val_dataset`` as validation data and an ``EarlyStopping``
         callback monitoring ``val_metric_for_tuner_search_hp_callback``.
         ``batch_size`` is forwarded only when ``val_key_tuner_class`` is not
         listed in ``val_li_keys_tuners_optimizing_batch_size``.
      3. For each of the ``val_top_best_models`` best hyperparameter sets,
         call ``fct_get_best_trained_model`` (which receives both the train
         dataset and train+validation concatenated) and evaluate the returned
         model on ``val_test_dataset``.
      4. Print the collected histories and test metrics.

    Returns:
        A 4-tuple of dictionaries, all keyed by the 1-based rank of the
        hyperparameter set:
        ``(di_best_trained_models, di_hist_when_search_best_epoch,
        di_hist_retrained_best_model, di_results_model_eval_test_set)``.
        The last one maps each rank to ``{metric name: test value}``.
    """

    print("IN FCT fct_search_best_model_using_tuner WE CREATE A TUNER\
    TO SEARCH THE BEST MODEL")

    #we define the tuner
    tuner_class = val_di_tuners[val_key_tuner_class]
    tuner = tuner_class(
        hypermodel=val_hypermodel,
        objective=kt.Objective(
            val_objective_metric_for_tuner_to_optimize,
            direction=val_mode),
        max_trials=val_max_trials,
        executions_per_trial=val_executions_per_trial,
        directory=val_directory,
        overwrite=val_overwrite)

    print("IN FCT fct_search_best_model_using_tuner \
    val_metric_for_tuner_search_hp_callback",
          val_metric_for_tuner_search_hp_callback)
    print()

    #summary of the hyperparameters in the search space.
    #The summary method prints by itself, so it is called as a statement
    #after the label; passing its return value to print() displayed the
    #label after the summary, followed by "None".
    print()
    print("IN fct_search_best_model_using_tuner \
    Summary of the hyperparameters in the search space")
    tuner.search_space_summary()
    print()

    #At each trial the tuner generates a new set of hyperparameter values,
    #builds the model, trains it and records its metric.

    #EarlyStopping ends a trial prematurely when the monitored metric
    #stops improving for val_patience_during_tuner_search epochs
    callbacks = [
        keras.callbacks.EarlyStopping(
            monitor=val_metric_for_tuner_search_hp_callback,
            patience=val_patience_during_tuner_search),
    ]

    search_kwargs = dict(
        epochs=val_epochs_tuner_search,
        validation_data=val_val_dataset,
        callbacks=callbacks,
        verbose=val_verbose)

    #batch_size is only given when the tuner does not optimize it itself
    #NOTE(review): Keras documents that batch_size should not be specified
    #when the input is a dataset - confirm the hypermodel handles it
    if val_key_tuner_class not in val_li_keys_tuners_optimizing_batch_size:
        search_kwargs["batch_size"] = val_batch_size

    #we perform the search for the best hyperparameter configurations
    tuner.search(val_train_dataset, **search_kwargs)

    #same remark as for the search space summary: call, do not print()
    print()
    print("Results Summary of the tuner")
    tuner.results_summary()
    print()

    print("IN FCT fct_search_best_model_using_tuner,\
    END TUNER SEARCH FOR FINDING BEST MODEL")
    print()

    print("In FCT fct_search_best_model_using_tuner,\
    val_top_best_models",val_top_best_models)
    print()

    #the best hyperparameters
    best_hps = tuner.get_best_hyperparameters(val_top_best_models)

    #when retraining, the validation data is added to the training data
    #since no further hyperparameter changes are made
    v_new_train=val_train_dataset.concatenate(val_val_dataset)

    #all dictionaries below: key=id best model (starting with 1)
    #value=best trained model
    di_best_trained_models = {}
    #value=history when searching the best epoch
    di_hist_when_search_best_epoch={}
    #value=history of the retrained best model
    di_hist_retrained_best_model={}
    #value=dict, key=metric name, value=metric value on the test set
    di_results_model_eval_test_set={}

    #for each best hyperparameter set:
    #i. we create a best model
    #ii. we find the best epoch
    #iii. we retrain the best model for the best epoch
    for indice, hp in enumerate(best_hps, start=1):

        print()
        print("IN FCT fct_search_best_model_using_tuner,\
        hyperparameter number:",indice)
        print("IN FCT fct_search_best_model_using_tunerr,\
        we  will search for the best trained model THAT IS THE BEST EPOCH\
        AND RETRAIN THE BEST  MODEL using function \
        get_best_trained_model")

        model,history_when_search_best_epoch,history_retrained_model=\
        fct_get_best_trained_model(
            va_hp=hp,
            va_tuner=tuner,
            va_train_dataset=val_train_dataset,
            va_new_train=v_new_train,
            va_val_dataset=val_val_dataset,
            va_index_for_saving_best_model=indice,
            va_to_multiply_epoch_for_train_dur=\
            val_to_multiply_epoch_for_train_dur,
            va_batch_size=val_batch_size,
            va_metric_to_monitor_best_epoch_callbacks=\
            val_metric_to_monitor_best_epoch_callbacks,
            va_mode_callbacks=val_mode_callbacks,
            va_patience_best_epoch_callbacks=\
            val_patience_best_epoch_callbacks,
            va_epochs=val_epochs_best_trained_model_search,
            va_pkl_filename_best_model=val_pkl_filename_best_model,
            va_pkl_filename_best_retrained_model=\
            val_pkl_filename_best_retrained_model)

        di_best_trained_models[indice]=model
        di_hist_when_search_best_epoch[indice]=history_when_search_best_epoch
        di_hist_retrained_best_model[indice]=history_retrained_model

        #model.evaluate returns the loss and metric values for the test set;
        #a bare scalar (loss only) is wrapped so that it can be indexed
        #like the list form
        results_model_eval_test_set=model.evaluate(val_test_dataset)
        if not isinstance(results_model_eval_test_set,(list,tuple)):
            results_model_eval_test_set=[results_model_eval_test_set]

        di_results_model_eval_test_set[indice]={
            metric_name: results_model_eval_test_set[j]
            for j, metric_name in enumerate(model.metrics_names)}

    for i in di_hist_when_search_best_epoch:
        print()
        print("Search Best Epoch For Best retrained model :", i)
        #for each metric
        for j in di_hist_when_search_best_epoch[i].history:
            print("metric",j)
            print("values per epoch:",di_hist_when_search_best_epoch[i].history[j])

    for i in di_hist_retrained_best_model:
        print()
        print("Evaluation Best retrained model :", i)
        #for each metric
        for j in di_hist_retrained_best_model[i].history:
            print("metric",j)
            print("values per epoch:",di_hist_retrained_best_model[i].history[j])

    for i in di_results_model_eval_test_set:
        print()
        print("Evaluation Test Set: Best retrained model :", i)
        #for each metric
        for j in di_results_model_eval_test_set[i]:
            print("Test metric",j)
            print("Test metric value:",di_results_model_eval_test_set[i][j])

    print("FIN")

    return di_best_trained_models,di_hist_when_search_best_epoch,\
    di_hist_retrained_best_model,di_results_model_eval_test_set
    

In [15]:
#function returning the best model(s) performance when
# a single study involving all origins is considered
#it returns
#di_best_trained_models,di_hist_when_search_best_epoch,\
#di_hist_retrained_best_model,di_results_model_eval_test_set,\
#va_test_dataset,va_mean_value_train_dataset,va_std_train_dataset,\
#va_id_first_future_observation,va_li_true_vals_test_set,\
#va_name_figure_folder_metric,va_name_figure_metric

def fct_best_approaches_1(
    v_dataframe,
    v_s_stride,
    v_b_size,
    v_folder_figures,
    v_nb_past_seq_lengths,
    v_nb_future_seq_lengths,
    v_s_length_train_model,
    v_s_length_target,
    v_name_col_origin,
    v_li_name_col_to_copy,
    v_name_column_date,
    v_name_col_2_sort,
    v_name_target_variable,
    v_li_cols_to_ignore,
    v_proportion_train_set,
    v_proportion_val_set,
    v_name_fig,
    v_shuffle_tr_s,
    v_shuffle_tr_t,
    v_shuffle_v_s,
    v_shuffle_v_t,
    v_shuffle_t_s,
    v_shuffle_t_t,
    v_di_hypermodels,
    v_key_hypermodel_class,
    v_min_nb_lay_model,
    v_max_nb_lay_model,
    v_min_nb_units_model,
    v_max_nb_units_model,
    v_min_value_dropout_rate_model,
    v_max_value_dropout_rate_model,
    v_min_value_recurrent_dropout_rate_model,
    v_max_value_recurrent_dropout_rate_model,
    v_min_nb_filters_conv1d,
    v_max_nb_filters_conv1d,
    v_min_nb_kernel_size_conv1d,
    v_max_nb_kernel_size_conv1d,
    v_step_nb_layers_model,
    v_step_nb_units_model,
    v_step_dropout_rate_model,
    v_step_recurrent_dropout_rate_model,
    v_step_nb_kernel_size_conv1d,
    v_min_pool_size,
    v_max_pool_size,
    v_step_pool_size,
    v_li_activ_fcts_model,
    v_li_optimizers_model,
    v_min_val_learning_rate_optimizer,
    v_max_val_learning_rate_optimizer,
    v_loss_fct_model,
    v_metrics_model,
    v_di_tuners,
    v_key_tuner_class,
    v_objective_metric_for_tuner_to_optimize,
    v_mode,
    v_max_trials,
    v_executions_per_trial,
    v_directory,
    v_metric_for_tuner_search_hp_callback,
    v_li_keys_tuners_optimizing_batch_size,
    v_epochs_tuner_search,
    v_top_best_models,
    v_batch_size,
    v_to_multiply_epoch_for_train_dur,
    v_metric_to_monitor_best_epoch_callbacks,
    v_epochs_best_trained_model_search,
    v_overwrite=True,
    v_patience_during_tuner_search=5,
    v_verbose=2,
    v_mode_callbacks="min",
    v_patience_best_epoch_callbacks=10,
    v_pkl_filename_best_model = "best_model",
    v_pkl_filename_best_retrained_model=\
    "history_obj_best_retrained_model",
    v_index_target_in_stats=7
):
    """Run one complete study (datasets, hypermodel, tuner search) on a dataframe.

    The function
      1. creates ``v_folder_figures`` if needed,
      2. builds the train/validation/test datasets with
         ``fct_create_train_val_test_datasets_from_dataframe``,
      3. instantiates ``v_di_hypermodels[v_key_hypermodel_class]`` using the
         sequence length, number of input features and target length returned
         by step 2,
      4. delegates the search/retrain/evaluation to
         ``fct_search_best_model_using_tuner``.

    Parameters are forwarded under the corresponding ``val_*`` names to the
    helpers above. The only parameter consumed locally besides the dataframe,
    target name and figure folder is:

    v_index_target_in_stats : position of the target variable inside the
        mean and std sequences returned by the dataset builder. Defaults to
        7, the value that used to be hard-coded here.

    Returns a list of 11 items:
        [di_best_trained_models, di_hist_when_search_best_epoch,
         di_hist_retrained_best_model, di_results_model_eval_test_set,
         test dataset, mean of the target on the train set,
         std of the target on the train set,
         index of the first observation after train+validation,
         target values of ``v_dataframe`` from that index on,
         v_folder_figures, "fig_inferences"]
    """

    #create the folder for all the figures (plots)
    os.makedirs(v_folder_figures,exist_ok = True)

    #create the datasets; v_s_length_target is rebound to the value
    #returned by the builder
    (train_dataset, val_dataset, test_dataset,
     v_s_length_train, nb_input_features,
     mean_val_train_data, std_val_train_data,
     number_train_data, number_validation_data,
     v_s_length_target) = fct_create_train_val_test_datasets_from_dataframe(
        val_dataframe=v_dataframe,
        val_s_stride=v_s_stride,
        val_b_size=v_b_size,
        val_folder_figure=v_folder_figures,
        val_nb_past_seq_lengths=v_nb_past_seq_lengths,
        val_nb_future_seq_lengths=v_nb_future_seq_lengths,
        val_s_length_train=v_s_length_train_model,
        val_s_length_target=v_s_length_target,
        val_name_col_origin=v_name_col_origin,
        val_li_name_col_to_copy=v_li_name_col_to_copy,
        val_name_column_date=v_name_column_date,
        val_name_col_2_sort=v_name_col_2_sort,
        val_name_target_variable=v_name_target_variable,
        val_li_cols_to_ignore=v_li_cols_to_ignore,
        val_proportion_train_set=v_proportion_train_set,
        val_proportion_val_set=v_proportion_val_set,
        val_name_fig=v_name_fig,
        val_shuffle_tr_s=v_shuffle_tr_s,
        val_shuffle_tr_t=v_shuffle_tr_t,
        val_shuffle_v_s=v_shuffle_v_s,
        val_shuffle_v_t=v_shuffle_v_t,
        val_shuffle_t_s=v_shuffle_t_s,
        val_shuffle_t_t=v_shuffle_t_t)

    #the hypermodel uses the sequence length returned by the dataset
    #builder (v_s_length_train), not the requested v_s_length_train_model.
    #The last layer outputs v_s_length_target values, i.e. the number of
    #time steps to predict.
    v_hypermodel=v_di_hypermodels[v_key_hypermodel_class](
        val_s_length_train_model=v_s_length_train,
        val_nb_initial_input_features_model=nb_input_features,
        val_min_nb_lay_model=v_min_nb_lay_model,
        val_max_nb_lay_model=v_max_nb_lay_model,
        val_min_nb_units_model=v_min_nb_units_model,
        val_max_nb_units_model=v_max_nb_units_model,
        val_min_value_dropout_rate_model=v_min_value_dropout_rate_model,
        val_max_value_dropout_rate_model=v_max_value_dropout_rate_model,
        val_min_nb_filters_conv1d=v_min_nb_filters_conv1d,
        val_max_nb_filters_conv1d=v_max_nb_filters_conv1d,
        val_min_nb_kernel_size_conv1d=v_min_nb_kernel_size_conv1d,
        val_max_nb_kernel_size_conv1d=v_max_nb_kernel_size_conv1d,
        val_min_value_recurrent_dropout_rate_model=\
        v_min_value_recurrent_dropout_rate_model,
        val_max_value_recurrent_dropout_rate_model=\
        v_max_value_recurrent_dropout_rate_model,
        val_step_nb_layers_model=v_step_nb_layers_model,
        val_step_nb_units_model=v_step_nb_units_model,
        val_step_dropout_rate_model=v_step_dropout_rate_model,
        val_step_recurrent_dropout_rate_model=\
        v_step_recurrent_dropout_rate_model,
        val_step_nb_kernel_size_conv1d=v_step_nb_kernel_size_conv1d,
        val_min_pool_size=v_min_pool_size,
        val_max_pool_size=v_max_pool_size,
        val_step_pool_size=v_step_pool_size,
        val_li_activ_fcts_model=v_li_activ_fcts_model,
        val_nb_last_output_classes_model=v_s_length_target,
        val_li_optimizers_model=v_li_optimizers_model,
        val_min_val_learning_rate_optimizer=\
        v_min_val_learning_rate_optimizer,
        val_max_val_learning_rate_optimizer=\
        v_max_val_learning_rate_optimizer,
        var_loss_fct_model=v_loss_fct_model,
        var_metrics_model=v_metrics_model)

    #tuner search, retraining of the best models and test evaluation
    (di_best_trained_models, di_hist_when_search_best_epoch,
     di_hist_retrained_best_model, di_results_model_eval_test_set) = \
    fct_search_best_model_using_tuner(
        val_di_tuners=v_di_tuners,
        val_key_tuner_class=v_key_tuner_class,
        val_hypermodel=v_hypermodel,
        val_objective_metric_for_tuner_to_optimize=\
        v_objective_metric_for_tuner_to_optimize,
        val_mode=v_mode,
        val_max_trials=v_max_trials,
        val_executions_per_trial=v_executions_per_trial,
        val_directory=v_directory,
        val_metric_for_tuner_search_hp_callback=\
        v_metric_for_tuner_search_hp_callback,
        val_li_keys_tuners_optimizing_batch_size=\
        v_li_keys_tuners_optimizing_batch_size,
        val_train_dataset=train_dataset,
        val_val_dataset=val_dataset,
        val_test_dataset=test_dataset,
        val_epochs_tuner_search=v_epochs_tuner_search,
        val_top_best_models=v_top_best_models,
        val_batch_size=v_batch_size,
        val_to_multiply_epoch_for_train_dur=\
        v_to_multiply_epoch_for_train_dur,
        val_metric_to_monitor_best_epoch_callbacks=\
        v_metric_to_monitor_best_epoch_callbacks,
        val_epochs_best_trained_model_search=\
        v_epochs_best_trained_model_search,
        val_overwrite=v_overwrite,
        val_patience_during_tuner_search=v_patience_during_tuner_search,
        val_verbose=v_verbose,
        val_mode_callbacks=v_mode_callbacks,
        val_patience_best_epoch_callbacks=\
        v_patience_best_epoch_callbacks,
        val_pkl_filename_best_model=v_pkl_filename_best_model,
        val_pkl_filename_best_retrained_model=\
        v_pkl_filename_best_retrained_model)

    va_test_dataset=test_dataset

    #mean and std of the target variable on the train set; the target is
    #located at position v_index_target_in_stats (0-based) in both sequences
    va_mean_value_train_dataset=mean_val_train_data[v_index_target_in_stats]
    va_std_train_dataset=std_val_train_data[v_index_target_in_stats]

    #index of the first observation that follows the train and
    #validation observations
    va_id_first_future_observation=int(number_train_data+number_validation_data)

    #true target values from that index to the end of the dataframe
    va_li_true_vals_test_set=\
    v_dataframe[v_name_target_variable].values[va_id_first_future_observation:]

    #a single folder is used for the figures, the one named v_folder_figures
    va_name_figure_folder_metric=v_folder_figures

    va_name_figure_metric="fig_inferences"

    li_rep=[di_best_trained_models,di_hist_when_search_best_epoch,
            di_hist_retrained_best_model,di_results_model_eval_test_set,
            va_test_dataset,va_mean_value_train_dataset,va_std_train_dataset,
            va_id_first_future_observation,va_li_true_vals_test_set,
            va_name_figure_folder_metric,va_name_figure_metric]

    return li_rep
    


    


In [16]:
#function returning the best model(s) performance
#when considering a distinct study for each origin
#it returns a dictionary
#di_rep
#v_key_hypermodel_class=the key of the desired model
#in the dictionary v_di_hypermodels, defined in the
#file with the Hyperparameter models
#v_di_hypermodels[v_key_hypermodel_class]= the hypermodel we want to create


def fct_best_approaches_2(
    v_di_dataframes,
    v_s_stride,
    v_b_size,
    v_folder_figures,
    v_nb_past_seq_lengths,
    v_nb_future_seq_lengths,
    v_s_length_train_model,
    v_s_length_target,
    v_name_col_origin,
    v_li_name_col_to_copy,
    v_name_column_date,
    v_name_col_2_sort,
    v_name_target_variable,
    v_li_cols_to_ignore,
    v_proportion_train_set,
    v_proportion_val_set,
    v_name_fig,
    v_shuffle_tr_s,
    v_shuffle_tr_t,
    v_shuffle_v_s,
    v_shuffle_v_t,
    v_shuffle_t_s,
    v_shuffle_t_t,
    v_di_hypermodels,
    v_key_hypermodel_class,
    v_min_nb_lay_model,
    v_max_nb_lay_model,
    v_min_nb_units_model,
    v_max_nb_units_model,
    v_min_value_dropout_rate_model,
    v_max_value_dropout_rate_model,
    v_min_value_recurrent_dropout_rate_model,
    v_max_value_recurrent_dropout_rate_model,
    v_min_nb_filters_conv1d,
    v_max_nb_filters_conv1d,
    v_min_nb_kernel_size_conv1d,
    v_max_nb_kernel_size_conv1d,
    v_step_nb_layers_model,
    v_step_nb_units_model,
    v_step_dropout_rate_model,
    v_step_recurrent_dropout_rate_model,
    v_step_nb_kernel_size_conv1d,
    v_min_pool_size,
    v_max_pool_size,
    v_step_pool_size,
    v_li_activ_fcts_model,
    v_li_optimizers_model,
    v_min_val_learning_rate_optimizer,
    v_max_val_learning_rate_optimizer,
    v_loss_fct_model,
    v_metrics_model,
    v_di_tuners,
    v_key_tuner_class,
    v_objective_metric_for_tuner_to_optimize,
    v_mode,
    v_max_trials,
    v_executions_per_trial,
    v_directory,
    v_metric_for_tuner_search_hp_callback,
    v_li_keys_tuners_optimizing_batch_size,
    v_epochs_tuner_search,
    v_top_best_models,
    v_batch_size,
    v_to_multiply_epoch_for_train_dur,
    v_metric_to_monitor_best_epoch_callbacks,
    v_epochs_best_trained_model_search,
    v_overwrite=True,
    v_patience_during_tuner_search=5,
    v_verbose=2,
    v_mode_callbacks="min",
    v_patience_best_epoch_callbacks=10,
    v_pkl_filename_best_model = "best_model",
    v_pkl_filename_best_retrained_model=\
    "history_obj_best_retrained_model"
):
    """Run ``fct_best_approaches_1`` once per entry of ``v_di_dataframes``.

    For every key ``i`` of ``v_di_dataframes`` a sub-folder
    ``<v_folder_figures>/<v_folder_figures>_<i>`` is created and passed as
    the figure folder of the study run on ``v_di_dataframes[i]``. All other
    arguments, including the optional ones (``v_overwrite``,
    ``v_patience_during_tuner_search``, ``v_verbose``, ``v_mode_callbacks``,
    ``v_patience_best_epoch_callbacks`` and the two pickle filename
    prefixes), are forwarded unchanged.

    Returns:
        di_rep: dict, key = key of ``v_di_dataframes`` (origin id),
        value = list of the 11 items returned by ``fct_best_approaches_1``
        for that dataframe.
    """

    #di_rep=dict, key=id origin, value=list returned by
    #fct_best_approaches_1 for the dataframe of that origin
    di_rep={}

    #for each dataframe
    for i in v_di_dataframes:

        #figure folder of this origin; the base folder name is reused
        #as the prefix of the sub-folder name
        val_name_folder_figures_given_origin=\
        v_folder_figures+"/"+v_folder_figures+"_"+str(i)
        os.makedirs(val_name_folder_figures_given_origin,exist_ok = True)

        #NOTE(review): every origin receives the same v_directory and the
        #same pickle filename prefixes - presumably later origins replace
        #the files of earlier ones; confirm this is intended
        li_rep_given_origin=fct_best_approaches_1(
            v_dataframe=v_di_dataframes[i],
            v_s_stride=v_s_stride,
            v_b_size=v_b_size,
            v_folder_figures=val_name_folder_figures_given_origin,
            v_nb_past_seq_lengths=v_nb_past_seq_lengths,
            v_nb_future_seq_lengths=v_nb_future_seq_lengths,
            v_s_length_train_model=v_s_length_train_model,
            v_s_length_target=v_s_length_target,
            v_name_col_origin=v_name_col_origin,
            v_li_name_col_to_copy=v_li_name_col_to_copy,
            v_name_column_date=v_name_column_date,
            v_name_col_2_sort=v_name_col_2_sort,
            v_name_target_variable=v_name_target_variable,
            v_li_cols_to_ignore=v_li_cols_to_ignore,
            v_proportion_train_set=v_proportion_train_set,
            v_proportion_val_set=v_proportion_val_set,
            v_name_fig=v_name_fig,
            v_shuffle_tr_s=v_shuffle_tr_s,
            v_shuffle_tr_t=v_shuffle_tr_t,
            v_shuffle_v_s=v_shuffle_v_s,
            v_shuffle_v_t=v_shuffle_v_t,
            v_shuffle_t_s=v_shuffle_t_s,
            v_shuffle_t_t=v_shuffle_t_t,
            v_di_hypermodels=v_di_hypermodels,
            v_key_hypermodel_class=v_key_hypermodel_class,
            v_min_nb_lay_model=v_min_nb_lay_model,
            v_max_nb_lay_model=v_max_nb_lay_model,
            v_min_nb_units_model=v_min_nb_units_model,
            v_max_nb_units_model=v_max_nb_units_model,
            v_min_value_dropout_rate_model=\
            v_min_value_dropout_rate_model,
            v_max_value_dropout_rate_model=\
            v_max_value_dropout_rate_model,
            v_min_value_recurrent_dropout_rate_model=\
            v_min_value_recurrent_dropout_rate_model,
            v_max_value_recurrent_dropout_rate_model=\
            v_max_value_recurrent_dropout_rate_model,
            v_min_nb_filters_conv1d=v_min_nb_filters_conv1d,
            v_max_nb_filters_conv1d=v_max_nb_filters_conv1d,
            v_min_nb_kernel_size_conv1d=v_min_nb_kernel_size_conv1d,
            v_max_nb_kernel_size_conv1d=v_max_nb_kernel_size_conv1d,
            v_step_nb_layers_model=v_step_nb_layers_model,
            v_step_nb_units_model=v_step_nb_units_model,
            v_step_dropout_rate_model=v_step_dropout_rate_model,
            v_step_recurrent_dropout_rate_model=\
            v_step_recurrent_dropout_rate_model,
            v_step_nb_kernel_size_conv1d=\
            v_step_nb_kernel_size_conv1d,
            v_min_pool_size=v_min_pool_size,
            v_max_pool_size=v_max_pool_size,
            v_step_pool_size=v_step_pool_size,
            v_li_activ_fcts_model=v_li_activ_fcts_model,
            v_li_optimizers_model=v_li_optimizers_model,
            v_min_val_learning_rate_optimizer=\
            v_min_val_learning_rate_optimizer,
            v_max_val_learning_rate_optimizer=\
            v_max_val_learning_rate_optimizer,
            v_loss_fct_model=v_loss_fct_model,
            v_metrics_model=v_metrics_model,
            v_di_tuners=v_di_tuners,
            v_key_tuner_class=v_key_tuner_class,
            v_objective_metric_for_tuner_to_optimize=\
            v_objective_metric_for_tuner_to_optimize,
            v_mode=v_mode,
            v_max_trials=v_max_trials,
            v_executions_per_trial=v_executions_per_trial,
            v_directory=v_directory,
            v_metric_for_tuner_search_hp_callback=\
            v_metric_for_tuner_search_hp_callback,
            v_li_keys_tuners_optimizing_batch_size=\
            v_li_keys_tuners_optimizing_batch_size,
            v_epochs_tuner_search=v_epochs_tuner_search,
            v_top_best_models=v_top_best_models,
            v_batch_size=v_batch_size,
            v_to_multiply_epoch_for_train_dur=\
            v_to_multiply_epoch_for_train_dur,
            v_metric_to_monitor_best_epoch_callbacks=\
            v_metric_to_monitor_best_epoch_callbacks,
            v_epochs_best_trained_model_search=\
            v_epochs_best_trained_model_search,
            #the optional settings are forwarded from the caller; they
            #were previously replaced by hard-coded literals, which
            #silently ignored the values given to this function
            v_overwrite=v_overwrite,
            v_patience_during_tuner_search=v_patience_during_tuner_search,
            v_verbose=v_verbose,
            v_mode_callbacks=v_mode_callbacks,
            v_patience_best_epoch_callbacks=\
            v_patience_best_epoch_callbacks,
            v_pkl_filename_best_model=v_pkl_filename_best_model,
            v_pkl_filename_best_retrained_model=\
            v_pkl_filename_best_retrained_model)

        #stored as a list: [di_best_trained_models,
        #di_hist_when_search_best_epoch,di_hist_retrained_best_model,
        #di_results_model_eval_test_set,va_test_dataset,
        #va_mean_value_train_dataset,va_std_train_dataset,
        #va_id_first_future_observation,va_li_true_vals_test_set,
        #va_name_figure_folder_metric,va_name_figure_metric]
        di_rep[i]=list(li_rep_given_origin)

    return di_rep

    

        
        

In [17]:
#function returning the best  model(s) approach

#if all origins are examined together it returns a list
#li_rep=[di_best_trained_models,di_hist_when_search_best_epoch,\
#di_hist_retrained_best_model,di_results_model_eval_test_set,\
#va_test_dataset,va_mean_value_train_dataset,va_std_train_dataset,\
#va_id_first_future_observation,va_li_true_vals_test_set,\
#va_name_figure_folder_metric,va_name_figure_metric]
#otherwise, if each origin is examined separately,
#it returns a dict, key=id origin
#value= li_rep
def fct_best_approaches(
    v_distinct_origins,
    v_dataframe,
    v_s_stride,
    v_b_size,
    v_folder_figures,
    v_nb_past_seq_lengths,
    v_nb_future_seq_lengths,
    v_s_length_train_model,
    v_s_length_target,
    v_name_col_origin,
    v_li_name_col_to_copy,
    v_name_column_date,
    v_name_col_2_sort,
    v_name_target_variable,
    v_li_cols_to_ignore,
    v_proportion_train_set,
    v_proportion_val_set,
    v_name_fig,
    v_shuffle_tr_s,
    v_shuffle_tr_t,
    v_shuffle_v_s,
    v_shuffle_v_t,
    v_shuffle_t_s,
    v_shuffle_t_t,
    v_di_hypermodels,
    v_key_hypermodel_class,
    v_min_nb_lay_model,
    v_max_nb_lay_model,
    v_min_nb_units_model,
    v_max_nb_units_model,
    v_min_value_dropout_rate_model,
    v_max_value_dropout_rate_model,
    v_min_value_recurrent_dropout_rate_model,
    v_max_value_recurrent_dropout_rate_model,
    v_min_nb_filters_conv1d,
    v_max_nb_filters_conv1d,
    v_min_nb_kernel_size_conv1d,
    v_max_nb_kernel_size_conv1d,
    v_step_nb_layers_model,
    v_step_nb_units_model,
    v_step_dropout_rate_model,
    v_step_recurrent_dropout_rate_model,
    v_step_nb_kernel_size_conv1d,
    v_min_pool_size,
    v_max_pool_size,
    v_step_pool_size,
    v_li_activ_fcts_model,
    v_li_optimizers_model,
    v_min_val_learning_rate_optimizer,
    v_max_val_learning_rate_optimizer,
    v_loss_fct_model,
    v_metrics_model,
    v_di_tuners,
    v_key_tuner_class,
    v_objective_metric_for_tuner_to_optimize,
    v_mode,
    v_max_trials,
    v_executions_per_trial,
    v_directory,
    v_metric_for_tuner_search_hp_callback,
    v_li_keys_tuners_optimizing_batch_size,
    v_epochs_tuner_search,
    v_top_best_models,
    v_batch_size,
    v_to_multiply_epoch_for_train_dur,
    v_metric_to_monitor_best_epoch_callbacks,
    v_epochs_best_trained_model_search,
    val_admissible_min_nb_observatios=80,\
    v_overwrite=True,
    v_patience_during_tuner_search=5,
    v_verbose=2,
    v_mode_callbacks="min",
    v_patience_best_epoch_callbacks=10,
    v_pkl_filename_best_model = "best_model",
    v_pkl_filename_best_retrained_model=\
    "history_obj_best_retrained_model"
):

    #if we want to examine all  origins together
    if v_distinct_origins==0:

        #create the folder for the figures (plots)
        #folder for the all the figure plots
        #os.makedirs(v_folder_figures,exist_ok = True)

        #li_rep=[di_best_trained_models,di_hist_when_search_best_epoch,\
        #di_hist_retrained_best_model,di_results_model_eval_test_set,\
        #va_test_dataset,va_mean_value_train_dataset,va_std_train_dataset,\
        #va_id_first_future_observation,va_li_true_vals_test_set,\
        #va_name_figure_folder_metric,va_name_figure_metric]

        li_rep=\
        fct_best_approaches_1(
        v_dataframe=v_dataframe,
        v_s_stride=v_s_stride,
        v_b_size=v_b_size,
        v_folder_figures=v_folder_figures,
        v_nb_past_seq_lengths=v_nb_past_seq_lengths,
        v_nb_future_seq_lengths=v_nb_future_seq_lengths,
        v_s_length_train_model=v_s_length_train_model,
        v_s_length_target=v_s_length_target,
        v_name_col_origin=v_name_col_origin,
        v_li_name_col_to_copy=v_li_name_col_to_copy,
        v_name_column_date=v_name_column_date,
        v_name_col_2_sort=v_name_col_2_sort,
        v_name_target_variable=v_name_target_variable,
        v_li_cols_to_ignore=v_li_cols_to_ignore,
        v_proportion_train_set=v_proportion_train_set,
        v_proportion_val_set=v_proportion_val_set,
        v_name_fig=v_name_fig,
        v_shuffle_tr_s=v_shuffle_tr_s,
        v_shuffle_tr_t=v_shuffle_tr_t,
        v_shuffle_v_s=v_shuffle_v_s,
        v_shuffle_v_t=v_shuffle_v_t,
        v_shuffle_t_s=v_shuffle_t_s,
        v_shuffle_t_t=v_shuffle_t_t,
        v_di_hypermodels=v_di_hypermodels,
        v_key_hypermodel_class=v_key_hypermodel_class,
        v_min_nb_lay_model=v_min_nb_lay_model,
        v_max_nb_lay_model=v_max_nb_lay_model,
        v_min_nb_units_model=v_min_nb_units_model,
        v_max_nb_units_model=v_max_nb_units_model,
        v_min_value_dropout_rate_model=\
        v_min_value_dropout_rate_model,
        v_max_value_dropout_rate_model=\
        v_max_value_dropout_rate_model,
        v_min_value_recurrent_dropout_rate_model=\
        v_min_value_recurrent_dropout_rate_model,
        v_max_value_recurrent_dropout_rate_model=\
        v_max_value_recurrent_dropout_rate_model,
        v_min_nb_filters_conv1d=v_min_nb_filters_conv1d,
        v_max_nb_filters_conv1d=v_max_nb_filters_conv1d,
        v_min_nb_kernel_size_conv1d=\
        v_min_nb_kernel_size_conv1d,
        v_max_nb_kernel_size_conv1d=\
        v_max_nb_kernel_size_conv1d,
        v_step_nb_layers_model=v_step_nb_layers_model,
        v_step_nb_units_model=v_step_nb_units_model,
        v_step_dropout_rate_model=v_step_dropout_rate_model,
        v_step_recurrent_dropout_rate_model=\
        v_step_recurrent_dropout_rate_model,
        v_step_nb_kernel_size_conv1d=\
        v_step_nb_kernel_size_conv1d,
        v_min_pool_size=v_min_pool_size,
        v_max_pool_size=v_max_pool_size,
        v_step_pool_size=v_step_pool_size,
        v_li_activ_fcts_model=v_li_activ_fcts_model,
        v_li_optimizers_model=v_li_optimizers_model,
        v_min_val_learning_rate_optimizer=\
        v_min_val_learning_rate_optimizer,
        v_max_val_learning_rate_optimizer=\
        v_max_val_learning_rate_optimizer,
        v_loss_fct_model=v_loss_fct_model,
        v_metrics_model=v_metrics_model,
        v_di_tuners=v_di_tuners,
        v_key_tuner_class=v_key_tuner_class,
        v_objective_metric_for_tuner_to_optimize=\
        v_objective_metric_for_tuner_to_optimize,
        v_mode=v_mode,
        v_max_trials=v_max_trials,
        v_executions_per_trial=v_executions_per_trial,
        v_directory=v_directory,
        v_metric_for_tuner_search_hp_callback=\
        v_metric_for_tuner_search_hp_callback,
        v_li_keys_tuners_optimizing_batch_size=\
        v_li_keys_tuners_optimizing_batch_size,
        v_epochs_tuner_search=v_epochs_tuner_search,
        v_top_best_models=v_top_best_models,
        v_batch_size=v_batch_size,
        v_to_multiply_epoch_for_train_dur=\
        v_to_multiply_epoch_for_train_dur,
        v_metric_to_monitor_best_epoch_callbacks=\
        v_metric_to_monitor_best_epoch_callbacks,
        v_epochs_best_trained_model_search=\
        v_epochs_best_trained_model_search,
        v_overwrite=True,
        v_patience_during_tuner_search=5,
        v_verbose=2,
        v_mode_callbacks="min",
        v_patience_best_epoch_callbacks=10,
        v_pkl_filename_best_model = "best_model",
        v_pkl_filename_best_retrained_model=\
        "history_obj_best_retrained_model")

        return li_rep
            
    #if we want to examine each origin separatly 
    else:
        #di_df_per_origin=dict, key=id  origin, value=df
        di_df_per_origin=fct_create_di_dataframes_per_origin(
        val_dataframe=v_dataframe,
        val_admissible_min_nb_observatios=\
        val_admissible_min_nb_observatios,
        val_name_col_origin="ORIGIN",
        val_name_col_1_sort='FL_DATE',
        val_name_col_2_sort='DEP_TIME')

        #di_rep=dict, key=id origin, 
        #value=[di_best_trained_models,di_hist_when_search_best_epoch,\
        #di_hist_retrained_best_model,di_results_model_eval_test_set,\
        #va_test_dataset,va_mean_value_train_dataset,va_std_train_dataset,\
        #va_id_first_future_observation,va_li_true_vals_test_set,\
        #va_name_figure_folder_metric,va_name_figure_metric]
        
        di_rep=fct_best_approaches_2(
        v_di_dataframes=di_df_per_origin,
        v_s_stride=v_s_stride,
        v_b_size=v_b_size,
        v_folder_figures=v_folder_figures,
        v_nb_past_seq_lengths=v_nb_past_seq_lengths,
        v_nb_future_seq_lengths=v_nb_future_seq_lengths,
        v_s_length_train_model=v_s_length_train_model,
        v_s_length_target=v_s_length_target,
        v_name_col_origin=v_name_col_origin,
        v_li_name_col_to_copy=v_li_name_col_to_copy,
        v_name_column_date=v_name_column_date,
        v_name_col_2_sort=v_name_col_2_sort,
        v_name_target_variable=v_name_target_variable,
        v_li_cols_to_ignore=v_li_cols_to_ignore,
        v_proportion_train_set=v_proportion_train_set,
        v_proportion_val_set=v_proportion_val_set,
        v_name_fig=v_name_fig,
        v_shuffle_tr_s=v_shuffle_tr_s,
        v_shuffle_tr_t=v_shuffle_tr_t,
        v_shuffle_v_s=v_shuffle_v_s,
        v_shuffle_v_t=v_shuffle_v_t,
        v_shuffle_t_s=v_shuffle_t_s,
        v_shuffle_t_t=v_shuffle_t_t,
        v_di_hypermodels=v_di_hypermodels,
        v_key_hypermodel_class=v_key_hypermodel_class,
        v_min_nb_lay_model=v_min_nb_lay_model,
        v_max_nb_lay_model=v_max_nb_lay_model,
        v_min_nb_units_model=v_min_nb_units_model,
        v_max_nb_units_model=v_max_nb_units_model,
        v_min_value_dropout_rate_model=v_min_value_dropout_rate_model,
        v_max_value_dropout_rate_model=v_max_value_dropout_rate_model,
        v_min_value_recurrent_dropout_rate_model=v_min_value_recurrent_dropout_rate_model,
        v_max_value_recurrent_dropout_rate_model=v_max_value_recurrent_dropout_rate_model,
        v_min_nb_filters_conv1d=v_min_nb_filters_conv1d,
        v_max_nb_filters_conv1d=v_max_nb_filters_conv1d,
        v_min_nb_kernel_size_conv1d=v_min_nb_kernel_size_conv1d,
        v_max_nb_kernel_size_conv1d=v_max_nb_kernel_size_conv1d,
        v_step_nb_layers_model=v_step_nb_layers_model,
        v_step_nb_units_model=v_step_nb_units_model,
        v_step_dropout_rate_model=v_step_dropout_rate_model,
        v_step_recurrent_dropout_rate_model=v_step_recurrent_dropout_rate_model,
        v_step_nb_kernel_size_conv1d=v_step_nb_kernel_size_conv1d,
        v_min_pool_size=v_min_pool_size,
        v_max_pool_size=v_max_pool_size,
        v_step_pool_size=v_step_pool_size,
        v_li_activ_fcts_model=v_li_activ_fcts_model,
        v_li_optimizers_model=v_li_optimizers_model,
        v_min_val_learning_rate_optimizer=v_min_val_learning_rate_optimizer,
        v_max_val_learning_rate_optimizer=v_max_val_learning_rate_optimizer,
        v_loss_fct_model=v_loss_fct_model,
        v_metrics_model=v_metrics_model,
        v_di_tuners=v_di_tuners,
        v_key_tuner_class=v_key_tuner_class,
        v_objective_metric_for_tuner_to_optimize=v_objective_metric_for_tuner_to_optimize,
        v_mode=v_mode,
        v_max_trials=v_max_trials,
        v_executions_per_trial=v_executions_per_trial,
        v_directory=v_directory,
        v_metric_for_tuner_search_hp_callback=v_metric_for_tuner_search_hp_callback,
        v_li_keys_tuners_optimizing_batch_size=v_li_keys_tuners_optimizing_batch_size,
        v_epochs_tuner_search=v_epochs_tuner_search,
        v_top_best_models=v_top_best_models,
        v_batch_size=v_batch_size,
        v_to_multiply_epoch_for_train_dur=v_to_multiply_epoch_for_train_dur,
        v_metric_to_monitor_best_epoch_callbacks=v_metric_to_monitor_best_epoch_callbacks,
        v_epochs_best_trained_model_search=v_epochs_best_trained_model_search,
        v_overwrite=True,
        v_patience_during_tuner_search=5,
        v_verbose=2,
        v_mode_callbacks="min",
        v_patience_best_epoch_callbacks=10,
        v_pkl_filename_best_model = "best_model",
        v_pkl_filename_best_retrained_model=\
        "history_obj_best_retrained_model")

        return di_rep
        

In [18]:
#fct making inferences on the test dataset with every best model
#registered in a dict of best models

#it returns a dict, 
#key=id best model, value=list of destandardized predictions 

def fct_create_di_destand_predictions(
    val_di_best_models,
    val_test_dataset,
    val_mean_value_train_dataset,
    val_std_train_dataset):
    """Predict with every best model and map the outputs back to the original scale.

    Parameters
    ----------
    val_di_best_models : dict
        key=id best model, value=object exposing ``predict``.
    val_test_dataset :
        Passed unchanged to ``predict`` of each model.
    val_mean_value_train_dataset, val_std_train_dataset :
        Values used to undo the standardization: ``x * std + mean``.

    Returns
    -------
    dict
        key=id best model, value=list holding one destandardized item per
        element yielded when iterating over the output of ``predict``.
    """

    def _destandardize(standardized_item):
        #inverse of (x - mean) / std
        return standardized_item * val_std_train_dataset+val_mean_value_train_dataset

    return {
        id_model: [
            _destandardize(item)
            for item in val_di_best_models[id_model].predict(val_test_dataset)
        ]
        for id_model in val_di_best_models
    }

In [19]:
def fct_plot_inferences_single_set(\
    val_past_values,\
    val_true_future_values,\
    val_prediction,\
    val_name_folder_plots,\
    val_name_figure_inferences,\
    val_title="Inferences versus True Future Values",\
    val_id_col_target_in_past_values=7):
    """Plot one past window, its true future values and the prediction; save as PNG.

    The past window is drawn on the negative time steps (-len(past) .. -1),
    the true and predicted future values on the time steps 0 .. len(true)-1.
    The figure is written to
    ``val_name_folder_plots/val_name_figure_inferences.png`` and then closed.

    Parameters
    ----------
    val_past_values :
        2-D array-like supporting ``[:, column]`` indexing (one row per past
        time step).
    val_true_future_values :
        Sequence of true future values.
    val_prediction :
        Sequence of predicted future values; it is plotted against the same
        time steps as ``val_true_future_values``.
    val_name_folder_plots : str
        Folder receiving the figure (must already exist).
    val_name_figure_inferences : str
        File name without the ".png" extension.
    val_title : str
        Title of the figure.
    val_id_col_target_in_past_values : int
        Column of ``val_past_values`` drawn as "Past Values". The default (7)
        is the index that used to be hard-coded
        (presumably the arrival-delay column - verify against the dataset).

    Returns
    -------
    None
    """

    #NOTE: both settings below are global (they outlive this call)
    sns.set(palette="hot")
    plt.rcParams['lines.markersize'] = 10

    plt.figure(figsize=(18, 6))

    past_t_steps=list(range(-len(val_past_values), 0))

    future_t_steps = np.arange(len(val_true_future_values))

    #plot the past values of the selected column
    plt.plot(past_t_steps,
             np.array(val_past_values[:, val_id_col_target_in_past_values]),
             label='Past Values',color="green")

    #plot the true future values
    plt.plot(future_t_steps, np.array(val_true_future_values),
             label='True Future Values',color="darkmagenta")

    #plot predictions
    plt.plot(future_t_steps, np.array(val_prediction),
             label='Predicted Future Values',color="gold")

    plt.title(val_title,c='mediumblue')

    plt.legend(loc='best')

    plt.grid(True)

    plt.savefig(val_name_folder_plots+"/"+\
                val_name_figure_inferences+str(".png"))
    #plt.show()
    plt.close()

In [20]:
def fct_plot_inferences(\
    val_nb_takes,\
    val_test_set,
    val_di_destandardized_predictions,\
    val_name_folder_with_plots,\
    val_name_file_plot_inferences,\
    val_title="Inferences versus True Future Values"):
    """Plot, for every best model, the first sample of the first batches of the test set.

    For each model and each of the first ``val_nb_takes`` batches one figure
    is produced by ``fct_plot_inferences_single_set``; it is named
    ``<val_name_file_plot_inferences>_best_model_<id model>_<id batch>``
    (id batch starts with 1).

    Parameters
    ----------
    val_nb_takes : int
        Number of batches taken from ``val_test_set``.
    val_test_set :
        Object exposing ``take(n)`` which yields (inputs, targets) batches.
    val_di_destandardized_predictions : dict
        key=id best model, value=list of destandardized predictions, one per
        sample of the test set.
    val_name_folder_with_plots : str
        Folder receiving the figures.
    val_name_file_plot_inferences : str
        Prefix of the name of each figure.
    val_title : str
        Title of each figure.

    Returns
    -------
    None
    """

    for m in val_di_destandardized_predictions:

        li_predictions_model=val_di_destandardized_predictions[m]

        #position, in the list of predictions, of the first sample of the
        #current batch. The prediction plotted must belong to the same sample
        #as the past/true values, i.e. the first sample of the batch
        #(previously index 0 was used for every batch, so every figure after
        #the first one showed the prediction of another sequence).
        #NOTE(review): assumes the predictions were computed by iterating the
        #test set in the same order as take() does (no reshuffling between
        #iterations) - verify against the creation of the dataset
        id_prediction=0

        for ind, (i, j) in enumerate(val_test_set.take(val_nb_takes), start=1):

            v_name_file_plot_inferences=\
            val_name_file_plot_inferences+"_"+"best_model_"+str(m)+"_"+str(ind)

            fct_plot_inferences_single_set(\
            val_past_values=i[0],\
            val_true_future_values=j[0],\
            val_prediction=li_predictions_model[id_prediction],\
            val_name_folder_plots=val_name_folder_with_plots,\
            val_name_figure_inferences=\
            v_name_file_plot_inferences,\
            val_title=val_title)

            #the next batch starts len(i) samples further
            #(a running sum also copes with a smaller last batch)
            id_prediction+=len(i)
            

In [21]:
#function which plots, for each retrained best model, the metrics
#registered per epoch in its history and, in a separate figure, its loss


def fct_plot_train_metrics_retrained_best_model(
    val_di_hist_retrained_best_model,
    val_li_colors,
    val_li_markers,
    val_name_figure_folder_metric,
    val_name_figure,
    val_name_figure_loss="best_retrained_model",
    val_title=\
    "Train Set Metrics- Retrained Best Model,",
    val_y_label="Metric Value",\
    val_loc="best"):
    """Plot per-epoch metrics and losses of each model history; save them as PNG.

    Two figures are written per model id ``i``:

    * ``<folder>/<val_name_figure>_<i>.png``: every entry of the history
      except "loss" and "val_loss";
    * ``<folder>/<val_name_figure>_<val_name_figure_loss>_<i>.png``: "loss"
      and, when present, "val_loss". This figure is skipped for a history
      without a "loss" entry.

    Every figure is drawn on its own freshly created and styled figure, which
    is closed once saved.

    Parameters
    ----------
    val_di_hist_retrained_best_model : dict
        key=id best model, value=object whose ``history`` attribute is a dict
        (key=name of metric, value=list with one value per epoch).
    val_li_colors, val_li_markers : list
        Color / marker per curve. The metrics use the indexes 0, 1, ...; the
        loss uses the index following the last metric and val_loss the one
        after. The lists must be long enough.
    val_name_figure_folder_metric : str
        Folder receiving the figures (must already exist).
    val_name_figure : str
        Prefix of the names of the files.
    val_name_figure_loss : str
        Extra name component of the loss figures.
    val_title : str
        Prefix of the titles; the id of the model is appended.
    val_y_label : str
        Label of the y axis.
    val_loc :
        Not used; kept for compatibility with existing callers.

    Returns
    -------
    None
    """

    #NOTE: both settings below are global (they outlive this call)
    sns.set(palette="hot")
    plt.rcParams['lines.markersize'] = 10

    def _new_styled_axes():
        #a figure is closed once saved, hence each plot needs its own
        #figure; otherwise only the very first plot gets the size and the
        #colors defined here
        plt.figure(figsize=(10,12))
        ax = plt.gca()
        for name_spine in ("bottom", "top", "left", "right"):
            ax.spines[name_spine].set_color("green")
        ax.tick_params(axis="x", colors="forestgreen")
        ax.tick_params(axis="y", colors="forestgreen")
        return ax

    def _decorate_save_close(ax, v_title, v_name_save_fig):
        #labels, title, integer ticks for the epochs, then save and close
        plt.xlabel("Epoch ID",c='mediumblue')
        plt.ylabel(val_y_label,c='mediumblue')
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        plt.title(v_title,c='mediumblue')
        plt.grid(True)
        plt.savefig(v_name_save_fig)
        #plt.show()
        plt.close()

    def _is_loss(name_metric):
        return name_metric=="loss" or name_metric=="val_loss"

    #first: the metrics other than the losses, one figure per model
    for i in val_di_hist_retrained_best_model:

        di_history=val_di_hist_retrained_best_model[i].history

        ax=_new_styled_axes()

        ind=0

        #for each metric
        for j in di_history:

            if not _is_loss(j):

                li_vals_metric_j=di_history[j]

                li_id_epochs=list(range(1,len(li_vals_metric_j)+1))

                sns.lineplot(x=li_id_epochs,\
                y = li_vals_metric_j,label=str(j),\
                color=val_li_colors[ind],marker=val_li_markers[ind],\
                ax=ax)

                ind+=1

        _decorate_save_close(
        ax,
        val_title+" "+str(i),
        val_name_figure_folder_metric+"/"+\
        val_name_figure+"_"+str(i)+str(".png"))

    #then: the loss (and val_loss), one figure per model
    for i in val_di_hist_retrained_best_model:

        di_history=val_di_hist_retrained_best_model[i].history

        #without a loss there is nothing to plot; no title or file name
        #could be built for this model either
        if "loss" not in di_history:
            continue

        #the losses use the colors/markers following those of the metrics
        ind=sum(1 for j in di_history if not _is_loss(j))

        ax=_new_styled_axes()

        li_vals_loss=di_history["loss"]

        li_id_epochs=list(range(1,len(li_vals_loss)+1))

        sns.lineplot(x=li_id_epochs,\
        y = li_vals_loss,\
        label="loss",\
        color=val_li_colors[ind],marker=val_li_markers[ind],\
        ax=ax)

        #if we have loss metrics on validation set
        if "val_loss" in di_history:

            sns.lineplot(x=li_id_epochs,\
            y = di_history["val_loss"],\
            label="val_loss",\
            color=val_li_colors[ind+1],marker=val_li_markers[ind+1],\
            ax=ax)

            v_title=val_title+" "+str(i)+" "+": Loss"+" "+": Loss, Val_Loss"
        else:
            v_title=val_title+" "+str(i)+" "+": Loss"

        #same name of file with or without val_loss
        v_name_save_fig=val_name_figure_folder_metric+"/"+\
        val_name_figure+"_"+\
        val_name_figure_loss+"_"+str(i)+str(".png")

        _decorate_save_close(ax, v_title, v_name_save_fig)
        
    

In [22]:
#val_di_results_model_eval_test_set=dictionary,
#key=id best model (starting with 1)
#value=dict, key=id metric, value=value metric
    

def fct_arrange_metrics_test_set_per_model(val_di_results_model_eval_test_set):
    """Regroup the evaluation results per metric instead of per model.

    Parameters
    ----------
    val_di_results_model_eval_test_set : dict
        key=id best model, value=dict (key=id metric, value=value metric).

    Returns
    -------
    dict
        key=id metric, value=list of the values of this metric, in the
        iteration order of the models which report it.
    """

    di_values_per_metric={}

    for id_model in val_di_results_model_eval_test_set:

        di_metrics_model=val_di_results_model_eval_test_set[id_model]

        for id_metric in di_metrics_model:
            #the list is created the first time the metric is met
            di_values_per_metric.setdefault(id_metric, []).append(
                di_metrics_model[id_metric])

    return di_values_per_metric
    

In [23]:
#val_di=dict, 
#key=id metric, value=[...,value metric ith. model,....]
def fct_plot_metrics_test_set_per_model(\
    val_di,
    val_name_figure_folder,\
    val_name_figure,\
    val_li_colors,\
    val_li_markers,\
    val_x_label="Best Model ID",\
    val_y_label="Value Metric",\
    val_title="Metrics Test Set Per Each Best Model"):
    """Plot each test-set metric against the id of the best model, one PNG per metric.

    The figure of the metric ``m`` is written to
    ``<val_name_figure_folder>/<val_name_figure>_metric_<m>.png``. Every
    metric is drawn on its own freshly created and styled figure, which is
    closed once saved. No figure is created when ``val_di`` is empty.

    Parameters
    ----------
    val_di : dict
        key=id metric, value=[..., value metric ith. model, ...].
    val_name_figure_folder : str
        Folder receiving the figures (must already exist).
    val_name_figure : str
        Prefix of the names of the files.
    val_li_colors, val_li_markers : list
        Color / marker of the k-th metric; at least one entry per metric.
    val_x_label, val_y_label, val_title : str
        Labels of the axes and title of every figure.

    Returns
    -------
    None
    """

    #NOTE: both settings below are global (they outlive this call)
    sns.set(palette="hot")
    plt.rcParams['lines.markersize'] = 10

    #we plot each metric in a different figure
    #for each metric
    for ind, i in enumerate(val_di):

        #a figure is closed once saved, hence each metric needs its own
        #figure; otherwise only the first metric gets the size and the
        #colors defined here
        plt.figure(figsize=(10,12))

        ax = plt.gca()
        for name_spine in ("bottom", "top", "left", "right"):
            ax.spines[name_spine].set_color("green")

        ax.tick_params(axis="x", colors="forestgreen")
        ax.tick_params(axis="y", colors="forestgreen")

        #ids of the models start with 1
        li_id_models=list(range(1,len(val_di[i])+1))

        sns.lineplot(x=li_id_models,\
        y = val_di[i],label=str(i),\
        color=val_li_colors[ind],marker=val_li_markers[ind],\
        ax=ax)

        #ids of models are integers
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))

        plt.xlabel(val_x_label,c='mediumblue')
        plt.ylabel(val_y_label,c='mediumblue')

        plt.title(val_title,c='mediumblue')

        plt.grid(True)
        plt.savefig(val_name_figure_folder+"/"+\
                    val_name_figure+"_metric_"+str(i)+str(".png"))
        #plt.show()
        plt.close()
        
    
    

In [24]:
#function which plots the graph for a dict of models
#val_di=dict,key=id  model, value=model
#val_name_file_plot_graph =the name of the file 
#without the extension for the type of the file
def fct_plot_graph_di_models(\
    val_di,\
    val_name_folder_figures,    
    val_name_file_plot_graph):
    """Draw the graph of every model of a dict with ``plot_model``.

    The graph of the model ``i`` is written to
    ``<val_name_folder_figures>/<val_name_file_plot_graph>_<i>.png``, with
    the shapes and the names of the layers displayed.

    Parameters
    ----------
    val_di : dict
        key=id model, value=model.
    val_name_folder_figures : str
        Folder receiving the files.
    val_name_file_plot_graph : str
        Name of the file without the id of the model and the extension.

    Returns
    -------
    None
    """

    for id_model in val_di:

        #full path of the png of this model
        path_graph=val_name_folder_figures+"/"+\
        val_name_file_plot_graph+"_"+str(id_model)+".png"

        plot_model(
            val_di[id_model],
            to_file=path_graph,
            show_shapes=True,
            show_layer_names=True)

In [25]:
#va_loc="best"

#va_li_responses=list
#[di_best_trained_models,di_hist_when_search_best_epoch,\
# di_hist_retrained_best_model,di_results_model_eval_test_set,\
 # va_test_dataset,va_mean_value_train_dataset,va_std_train_dataset,\
# va_id_first_future_observation,va_li_true_vals_test_set,\
# va_name_figure_folder_metric,va_name_figure_metric] 

def fct_analyze_results_1(
    va_li_responses,
    va_name_figure_metric_for_predicts,
    va_mae_test_set,
    va_rmse_test_set,
    va_li_colors,
    va_li_markers,
    va_name_figure_find_best_epoch,
    va_name_figure_loss_best_epoch,
    va_title_best_epoch,
    va_folder_figures,
    va_name_figure_best_traind_model,
    va_name_figure_loss_best_trained_model,\
    va_name_figure_plots_test_set,\
    va_name_file_plot_graph,
    va_nb_takes_plot_inferences,\
    val_x_label="Best Model ID",\
    val_y_label="Value Metric",\
    val_title="Metrics Test Set Per Each Best Model",
    val_title_best_trained_model=\
    "Train Set Metrics - Retrained Best Model",
    va_loc="best",\
    val_title_inferences_plot="Inferences versus True Future Values"
):
    """Produce every figure analyzing one list of results.

    In order: the graph of each best model, the inferences on the test set,
    the metrics recorded while searching the best number of epochs, the
    metrics of the retrained best model(s) and the metrics obtained on the
    test set. Every figure is written in the folder ``va_li_responses[9]``.

    Parameters
    ----------
    va_li_responses : list
        [di_best_trained_models, di_hist_when_search_best_epoch,
        di_hist_retrained_best_model, di_results_model_eval_test_set,
        va_test_dataset, va_mean_value_train_dataset, va_std_train_dataset,
        va_id_first_future_observation, va_li_true_vals_test_set,
        va_name_figure_folder_metric, va_name_figure_metric].
        The positions 0-6 and 9 are read here.
    va_name_figure_metric_for_predicts : str
        Prefix of the names of the figures with the inferences.
    va_mae_test_set, va_rmse_test_set :
        Not used; kept for compatibility with existing callers.
    va_li_colors, va_li_markers : list
        Colors / markers of the curves.
    va_name_figure_find_best_epoch, va_name_figure_loss_best_epoch : str
        Names of the figures for the search of the best number of epochs.
    va_title_best_epoch : str
        Title of the figures for the search of the best number of epochs.
    va_folder_figures :
        Not used (the folder is ``va_li_responses[9]``); kept for
        compatibility with existing callers.
    va_name_figure_best_traind_model, va_name_figure_loss_best_trained_model : str
        Names of the figures of the retrained best model(s).
    va_name_figure_plots_test_set : str
        Prefix of the names of the figures with the metrics of the test set.
    va_name_file_plot_graph : str
        Name (without extension) of the files with the graphs of the models.
    va_nb_takes_plot_inferences : int
        Number of batches of the test set used for plotting inferences.
    val_x_label, val_y_label, val_title : str
        Labels and title of the figures with the metrics of the test set.
    val_title_best_trained_model : str
        Title of the figures of the retrained best model(s).
    va_loc :
        Forwarded as ``val_loc`` to
        ``fct_plot_train_metrics_retrained_best_model``.
    val_title_inferences_plot : str
        Title of the figures with the inferences.

    Returns
    -------
    None
    """

    #folder receiving every figure
    v_name_folder_figures=va_li_responses[9]

    #we plot the graph for each best model
    fct_plot_graph_di_models(\
    val_di=va_li_responses[0],\
    val_name_folder_figures=v_name_folder_figures,
    val_name_file_plot_graph=va_name_file_plot_graph)

    #di_destand_predicts=dict
    #key=id best model (starting with 1)
    #value= list destandardized predictions
    di_destand_predicts=fct_create_di_destand_predictions(
    val_di_best_models=va_li_responses[0],
    val_test_dataset=va_li_responses[4],
    val_mean_value_train_dataset=va_li_responses[5],
    val_std_train_dataset=va_li_responses[6])

    fct_plot_inferences(\
    val_nb_takes=va_nb_takes_plot_inferences,\
    val_test_set=va_li_responses[4],
    val_di_destandardized_predictions=di_destand_predicts,\
    val_name_folder_with_plots=v_name_folder_figures,
    val_name_file_plot_inferences=va_name_figure_metric_for_predicts,\
    val_title=val_title_inferences_plot)

    #we plot the metrics for finding the best number of epochs
    #for the best models
    fct_plot_train_metrics_retrained_best_model(
    val_di_hist_retrained_best_model=va_li_responses[1],
    val_li_colors=va_li_colors,
    val_li_markers=va_li_markers,
    val_name_figure_folder_metric=v_name_folder_figures,
    val_name_figure=va_name_figure_find_best_epoch,
    val_name_figure_loss=va_name_figure_loss_best_epoch,
    val_title=va_title_best_epoch,
    val_y_label="Metric Value",\
    val_loc=va_loc)

    #we  plot the metrics for the best trained model(s)
    #trained for the best number of epochs
    #(the title given by the caller is used; it was previously replaced
    #by a literal equal to its default value)
    fct_plot_train_metrics_retrained_best_model(
    val_di_hist_retrained_best_model=va_li_responses[2],
    val_li_colors=va_li_colors,
    val_li_markers=va_li_markers,
    val_name_figure_folder_metric=v_name_folder_figures,
    val_name_figure=va_name_figure_best_traind_model,
    val_name_figure_loss=va_name_figure_loss_best_trained_model,\
    val_title=val_title_best_trained_model,
    val_y_label="Metric Value",\
    val_loc=va_loc)

    #we create the dictionary,
    #key=metric for evaluating best model(s) on the test set
    #value=[... metric test set  using the ith best model,....]
    di_metrics_test_set=fct_arrange_metrics_test_set_per_model(\
    val_di_results_model_eval_test_set=va_li_responses[3])

    #we plot the metrics when evaluation the best model(s) on the test set
    #(labels and title given by the caller are used; they were previously
    #replaced by literals equal to their default values)
    fct_plot_metrics_test_set_per_model(\
    val_di=di_metrics_test_set,
    val_name_figure_folder=v_name_folder_figures,
    val_name_figure=va_name_figure_plots_test_set,\
    val_li_colors=va_li_colors,\
    val_li_markers=va_li_markers,\
    val_x_label=val_x_label,\
    val_y_label=val_y_label,\
    val_title=val_title)

In [None]:
#va_loc="best"

#va_di_responses=dict, key=id origin
#value=va_li_responses=list
#[di_best_trained_models,di_hist_when_search_best_epoch,\
# di_hist_retrained_best_model,di_results_model_eval_test_set,\
#va_test_dataset,va_mean_value_train_dataset,va_std_train_dataset,\
# va_id_first_future_observation,va_li_true_vals_test_set,\
# va_name_figure_folder_metric,va_name_figure_metric] 

def fct_analyze_results_2(
    va_di_responses,
    va_folder_figures,
    va_mae_test_set,
    va_rmse_test_set,
    va_li_colors,
    va_li_markers,
    va_name_figure_metric_for_predicts,
    va_name_figure_find_best_epoch,
    va_name_figure_loss_best_epoch,
    va_title_best_epoch,
    va_name_figure_best_traind_model,
    va_name_figure_loss_best_trained_model,\
    va_name_figure_plots_test_set,\
    va_name_file_plot_graph,
    va_nb_takes_plot_inferences,\
    val_x_label="Best Model ID",\
    val_y_label="Value Metric",\
    val_title="Metrics Test Set Per Each Best Model",
    val_title_best_trained_model=\
    "Train Set Metrics - Retrained Best Model",
    va_loc="best",\
    val_title_inferences_plot="Inferences versus True Future Values"
):
    """Analyze the results of each origin with ``fct_analyze_results_1``.

    Parameters
    ----------
    va_di_responses : dict
        key=id origin, value=list
        [di_best_trained_models, di_hist_when_search_best_epoch,
        di_hist_retrained_best_model, di_results_model_eval_test_set,
        va_test_dataset, va_mean_value_train_dataset, va_std_train_dataset,
        va_id_first_future_observation, va_li_true_vals_test_set,
        va_name_figure_folder_metric, va_name_figure_metric].
    va_folder_figures :
        Not used: the folder given to ``fct_analyze_results_1`` is the
        position 9 of the list of the origin. Kept for compatibility with
        existing callers.
    val_x_label, val_y_label, val_title, val_title_best_trained_model,
    va_loc, val_title_inferences_plot :
        Labels / titles forwarded to ``fct_analyze_results_1``.

    The other parameters are forwarded unchanged to
    ``fct_analyze_results_1``.

    Returns
    -------
    None
    """

    #we analyze each origin
    for i in va_di_responses:

        print("analyze  origin: ", i)

        li_responses_origin=va_di_responses[i]

        fct_analyze_results_1(
        va_li_responses=li_responses_origin,
        va_name_figure_metric_for_predicts=\
        va_name_figure_metric_for_predicts,
        va_mae_test_set=va_mae_test_set,
        va_rmse_test_set=va_rmse_test_set,
        va_li_colors= va_li_colors,
        va_li_markers=va_li_markers,
        va_name_figure_find_best_epoch=\
        va_name_figure_find_best_epoch,
        va_name_figure_loss_best_epoch=\
        va_name_figure_loss_best_epoch,
        va_title_best_epoch=va_title_best_epoch,
        #folder of the figures of this origin
        va_folder_figures=li_responses_origin[9],
        va_name_figure_best_traind_model=\
        va_name_figure_best_traind_model,
        va_name_figure_loss_best_trained_model=\
        va_name_figure_loss_best_trained_model,
        va_name_figure_plots_test_set=\
        va_name_figure_plots_test_set,
        va_name_file_plot_graph=\
        va_name_file_plot_graph,
        va_nb_takes_plot_inferences=\
        va_nb_takes_plot_inferences,
        #the labels/titles given by the caller are forwarded (they were
        #previously replaced by literals equal to their default values)
        val_x_label=val_x_label,\
        val_y_label=val_y_label,\
        val_title=val_title,
        val_title_best_trained_model=\
        val_title_best_trained_model,
        va_loc=va_loc,\
        val_title_inferences_plot=val_title_inferences_plot)


In [1]:
#va_distinct_origins=0 if we wish to examine all the origins together,
#va_distinct_origins=1 if we wish to examine each origin separately

def fct_analyze_results(\
    va_distinct_origins,\
    va_li_or_di_responses,\
    va_name_figure_metric_for_predicts,
    va_mae_test_set,
    va_rmse_test_set,
    va_li_colors,
    va_li_markers,
    va_name_figure_find_best_epoch,
    va_name_figure_loss_best_epoch,
    va_title_best_epoch,
    va_folder_figures,
    va_name_figure_best_traind_model,
    va_name_figure_loss_best_trained_model,\
    va_name_figure_plots_test_set,\
    va_name_file_plot_graph,
    va_nb_takes_plot_inferences,\
    val_x_label="Best Model ID",\
    val_y_label="Value Metric",\
    val_title="Metrics Test Set Per Each Best Model",
    val_title_best_trained_model=\
    "Train Set Metrics - Retrained Best Model",
    va_loc="best",\
    val_title_inferences_plot="Inferences versus True Future Values"):
    """Analyze the results for all the origins together or per origin.

    Parameters
    ----------
    va_distinct_origins : int
        0: ``va_li_or_di_responses`` is a single list of results, analyzed
        with ``fct_analyze_results_1``;
        1: ``va_li_or_di_responses`` is a dict (key=id origin, value=list of
        results), analyzed with ``fct_analyze_results_2``.
    va_li_or_di_responses : list or dict
        The results, see ``va_distinct_origins``.
    val_x_label, val_y_label, val_title, val_title_best_trained_model,
    va_loc, val_title_inferences_plot :
        Labels / titles forwarded to the function doing the analysis.

    The other parameters are forwarded unchanged to the function doing the
    analysis.

    Returns
    -------
    None

    Raises
    ------
    SystemExit
        With status 1 when ``va_distinct_origins`` is neither 0 nor 1.
    """

    #the labels/titles given by the caller are forwarded in both cases (they
    #were previously replaced by literals equal to their default values)

    #if all origins will be  examined at the same  time
    if va_distinct_origins==0:

        fct_analyze_results_1(
        va_li_responses=va_li_or_di_responses,
        va_name_figure_metric_for_predicts=\
        va_name_figure_metric_for_predicts,
        va_mae_test_set=va_mae_test_set,
        va_rmse_test_set=va_rmse_test_set,
        va_li_colors= va_li_colors,
        va_li_markers=va_li_markers,
        va_name_figure_find_best_epoch=\
        va_name_figure_find_best_epoch,
        va_name_figure_loss_best_epoch=\
        va_name_figure_loss_best_epoch,
        va_title_best_epoch=va_title_best_epoch,\
        va_folder_figures=va_folder_figures,
        va_name_figure_best_traind_model=\
        va_name_figure_best_traind_model,
        va_name_figure_loss_best_trained_model=\
        va_name_figure_loss_best_trained_model,
        va_name_figure_plots_test_set=\
        va_name_figure_plots_test_set,
        va_name_file_plot_graph=va_name_file_plot_graph,
        va_nb_takes_plot_inferences=\
        va_nb_takes_plot_inferences,
        val_x_label=val_x_label,
        val_y_label=val_y_label,
        val_title=val_title,
        val_title_best_trained_model=\
        val_title_best_trained_model,
        va_loc=va_loc,\
        val_title_inferences_plot=val_title_inferences_plot)

    #if each origin is examined separately
    elif va_distinct_origins==1:
        fct_analyze_results_2(
        va_di_responses=va_li_or_di_responses,
        va_folder_figures=va_folder_figures,
        va_mae_test_set=va_mae_test_set,
        va_rmse_test_set=va_rmse_test_set,
        va_li_colors=va_li_colors,
        va_li_markers=va_li_markers,
        va_name_figure_metric_for_predicts=\
        va_name_figure_metric_for_predicts,
        va_name_figure_find_best_epoch=\
        va_name_figure_find_best_epoch,
        va_name_figure_loss_best_epoch=\
        va_name_figure_loss_best_epoch,
        va_title_best_epoch=va_title_best_epoch,
        va_name_figure_best_traind_model=\
        va_name_figure_best_traind_model,
        va_name_figure_loss_best_trained_model=\
        va_name_figure_loss_best_trained_model,
        va_name_figure_plots_test_set=\
        va_name_figure_plots_test_set,
        va_name_file_plot_graph=\
        va_name_file_plot_graph,
        va_nb_takes_plot_inferences=\
        va_nb_takes_plot_inferences,
        val_x_label=val_x_label,
        val_y_label=val_y_label,
        val_title=val_title,
        val_title_best_trained_model=\
        val_title_best_trained_model,
        va_loc=va_loc,\
        val_title_inferences_plot=val_title_inferences_plot)

    else:
        print("PROBLEM IN FCT fct_analyze_results, va_distinct_origins: ",va_distinct_origins)
        import sys
        #non-zero status: an unknown mode is an error
        #(sys.exit() without argument reports a success)
        sys.exit(1)
           