# Import all libraries

In [None]:
!pip install wandb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Conv3D,MaxPooling3D, Dense,Flatten, Concatenate, ConvLSTM2D, ConvLSTM3D
from tensorflow.keras.layers import Input
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
import wandb
from wandb.keras import WandbCallback
tf.test.gpu_device_name()

# Define function

In [None]:
def split_data_1step(all_sites_df, fh_step = 1, 
               extra_features :list = []):


    # morning model

    morn_cols = ['site','Datetime', f'I_lead_{fh_step}step', 'I', f'I_lead_{fh_step}step_back1D', f'hour_index_lead_{fh_step}step', f'iclr_lead_{fh_step}step', 'ci_center', f'ci_est(t+{fh_step})']

    _df_morn = all_sites_df[all_sites_df['Datetime'].dt.time.isin(pd.date_range('06:30:00', '08:30:00', freq='30min').time)][morn_cols]

    
    _df_morn = _df_morn.dropna()
    date_morn_index = _df_morn['Datetime'].dt.date

    train_date_morn_cond = date_morn_index.isin(train_date_list)
    end_date_morn_cond = date_morn_index.isin(test_date_list)

    _df_morn_train = _df_morn[train_date_morn_cond].set_index(['site', 'Datetime'])
    _df_morn_test = _df_morn[end_date_morn_cond].set_index(['site', 'Datetime'])


    X_train_morn, X_test_morn = _df_morn_train.drop(columns=[f'I_lead_{fh_step}step']), _df_morn_test.drop(columns=[f'I_lead_{fh_step}step'])
    y_train_morn, y_test_morn = _df_morn_train[f'I_lead_{fh_step}step'], _df_morn_test[f'I_lead_{fh_step}step']

    # noon model

    noon_cols = ['site','Datetime',  f'I_lead_{fh_step}step', 'I', f'I_lead_{fh_step}step_back1D', f'hour_index_lead_{fh_step}step', f'iclr_lead_{fh_step}step', 
                 'I_lag_1step', 'I_lag_2step', 'I_lag_3step', 'I_lag_4step','I_lag_5step', 'ci_center', f'ci_est(t+{fh_step})']

    _df_noon = all_sites_df[all_sites_df['Datetime'].dt.time.isin(pd.date_range('09:00:00', '14:30:00', freq='30min').time)][noon_cols]
    _df_noon = _df_noon.dropna()
    date_noon_index = _df_noon['Datetime'].dt.date

    train_date_noon_cond = date_noon_index.isin(train_date_list)
    end_date_noon_cond = date_noon_index.isin(test_date_list)

    _df_noon_train = _df_noon[train_date_noon_cond].set_index(['site', 'Datetime'])
    _df_noon_test = _df_noon[end_date_noon_cond].set_index(['site', 'Datetime'])


    X_train_noon, X_test_noon = _df_noon_train.drop(columns=[f'I_lead_{fh_step}step']), _df_noon_test.drop(columns=[f'I_lead_{fh_step}step'])
    y_train_noon, y_test_noon = _df_noon_train[f'I_lead_{fh_step}step'], _df_noon_test[f'I_lead_{fh_step}step']

    # evening model 
    even_cols = morn_cols
    _df_even = all_sites_df[all_sites_df['Datetime'].dt.time.isin(pd.date_range('15:00:00', '17:00:00', freq='30min').time)][even_cols]

    
    _df_even = _df_even.dropna()
    date_even_index = _df_even['Datetime'].dt.date

    train_date_even_cond = date_even_index.isin(train_date_list)
    end_date_even_cond = date_even_index.isin(test_date_list)

    _df_even_train = _df_even[train_date_even_cond].set_index(['site', 'Datetime'])
    _df_even_test = _df_even[end_date_even_cond].set_index(['site', 'Datetime'])


    X_train_even, X_test_even = _df_even_train.drop(columns=[f'I_lead_{fh_step}step']), _df_even_test.drop(columns=[f'I_lead_{fh_step}step'])
    y_train_even, y_test_even = _df_even_train[f'I_lead_{fh_step}step'], _df_even_test[f'I_lead_{fh_step}step']


    return X_train_morn, X_test_morn, y_train_morn, y_test_morn, X_train_noon, X_test_noon,y_train_noon, y_test_noon, X_train_even, X_test_even, y_train_even, y_test_even
def split_val_from_train(X_train, y_train) :
  _X = X_train.reset_index()
  _y = y_train.reset_index()

  date_index = _X['Datetime'].dt.date

  train_cond = date_index.isin(train_date_val_list)
  val_cond = date_index.isin(val_date_list)

  _X_train_val, _y_train_val = _X[train_cond].set_index(['site', 'Datetime']), _y[train_cond].set_index(['site', 'Datetime'])
  _X_val, _y_val = _X[val_cond].set_index(['site', 'Datetime']), _y[val_cond].set_index(['site', 'Datetime'])

  return _X_train_val, _y_train_val, _X_val, _y_val

from sklearn.preprocessing import MinMaxScaler,StandardScaler
def split_to_each_step(df, fh_step):
  use_cols = ['site','Datetime', f'I_lead_{fh_step}step', 
                     'I', f'I_lead_{fh_step}step_back1D', f'hour_index_lead_{fh_step}step', f'iclr_lead_{fh_step}step', 
                     'I_lag_1step', 'ci_center', f'ci_est(t+{fh_step})']


  _df = df[use_cols]
  _df = _df.dropna()
  _df['Datetime'] = pd.to_datetime(_df['Datetime'])

  date_index = _df['Datetime'].dt.date
  _df_train = _df[date_index.isin(train_date_val_list)]
  _df_val = _df[date_index.isin(val_date_list)]
  _df_test = _df[date_index.isin(test_date_list)]

  
  _df_train = _df_train.set_index(['site', 'Datetime'])
  _df_val = _df_val.set_index(['site', 'Datetime'])
  _df_test = _df_test.set_index(['site', 'Datetime'])


  scaler = StandardScaler()
  X_train = _df_train.drop(columns=[f'I_lead_{fh_step}step'])
  X_train = scaler.fit_transform(X_train)

  _df_train[list(set(_df_train.columns) - set([f'I_lead_{fh_step}step']))] = X_train
  _df_val[list(set(_df_val.columns) - set(['filename', f'I_lead_{fh_step}step']))] = scaler.transform(_df_val.drop(columns=['filename', f'I_lead_{fh_step}step']))
  _df_test[list(set(_df_test.columns) - set([f'I_lead_{fh_step}step']))] = scaler.transform(_df_test.drop(columns=[f'I_lead_{fh_step}step']))
  
  return _df_train, _df_val, _df_test, scaler

# Import data

In [None]:
all_sites_df = pd.read_csv('processed_all_sites_HS1e4_df_not_imputed_R_channel.csv', parse_dates = ['Datetime'])
all_sites_df = all_sites_df.iloc[:, 1:]
all_sites_df['site'] = all_sites_df['site'].astype('str')

In [1]:
import pickle
train_date_list, test_date_list, val_date_list,train_date_val_list = pickle.load(f)

In [None]:
X_train_morn, X_test_morn, y_train_morn, y_test_morn, X_train_noon, X_test_noon, y_train_noon, y_test_noon, X_train_even, X_test_even, y_train_even, y_test_even = split_data_1step(all_sites_df)
X_train_val_noon, y_train_val_noon, X_val_noon, y_val_noon = split_val_from_train(X_train_noon, y_train_noon)
scaler = StandardScaler()
X_train_val_noon_scaled = scaler.fit_transform(X_train_val_noon)
X_val_noon_scaled = scaler.transform(X_val_noon)

# Hyperparameters tuning
- We use WandB sweep feature to log hyperpameters tuning of each model

## SVR

In [2]:
# define hyperparameters
import wandb
key = ''
wandb.login(key=key)

sweep_config = {'method': 'grid'}
# choose hyperparameter choice 
parameters_dict = {
    'C' : {
        'values' : [10, 100, 200, 400]
        },
    'gamma' : {
        'values' : [0.01, 0.1]
    },
    'epsilon' : {
        'values' : [0.1, 1, 10]
        },
    'kernel' : {
        'values' : ['rbf']
        }

    }
    
sweep_config['parameters'] = parameters_dict

metric = {
    'name': 'validation_loss',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric
import pprint
pprint.pprint(sweep_config)

[34m[1mwandb[0m: Currently logged in as: [33mnatanon-t[0m ([33mduo-y4[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'method': 'grid',
 'metric': {'goal': 'minimize', 'name': 'validation_loss'},
 'parameters': {'C': {'values': [10, 100, 200, 400]},
                'epsilon': {'values': [0.1, 1, 10]},
                'gamma': {'values': [0.01, 0.1]},
                'kernel': {'values': ['rbf']}}}


In [None]:
# define all function to train model
def SVR_model(C = 1, gamma = 1, epsilon = 1, kernel = 'rbf'):

  model = SVR(C = C, gamma = gamma, epsilon = epsilon,kernel = kernel)
  return model

def train(model):  

    model.fit(X_train_val_noon_scaled, np.array(y_train_val_noon).ravel()) 

    y_pred_train = model.predict(X_train_val_noon_scaled)
    y_pred_train[y_pred_train < 0] = 0

    y_pred_val = model.predict(X_val_noon_scaled)
    y_pred_val[y_pred_val < 0] = 0

    train_rmse = (mean_squared_error(y_train_val_noon, y_pred_train))** 0.5
    train_mae = mean_absolute_error(y_train_val_noon, y_pred_train)

    val_rmse = (mean_squared_error(y_val_noon, y_pred_val))** 0.5
    val_mae = mean_absolute_error(y_val_noon, y_pred_val)


    wandb.log({
        'train_rmse': train_rmse,
        'train_mae': train_mae, 
        'val_rmse': val_rmse, 
        'val_mae': val_mae
      })

    
def sweep_train(config_defaults=None):
    # Initialize wandb with a sample project name
    with wandb.init(config=config_defaults):  # this gets over-written in the Sweep
        wandb.config.architecture_name = "SVR"
        wandb.config.dataset_name = "validation"
        # initialize model
        model = SVR_model(C = wandb.config.C, 
                   gamma = wandb.config.gamma, 
                   epsilon = wandb.config.epsilon,
                   kernel = wandb.config.kernel
                   )
        train(model)
sweep_id = wandb.sweep(sweep_config, project="HyperTune_SVR_noon_latest")
wandb.agent(sweep_id, function=sweep_train)
wandb.finish()

## RF

In [None]:
import wandb
key = ''
wandb.login(key=key)

sweep_config = {'method': 'grid'}

# choose hyperparameter choice 
parameters_dict = {
    'n_estimators' : {
        'values' : [100, 500, 1000, 1500, 2000]
        },
    'max_depth' : {
        'values' : [30, 35, 40, 45, 50]
    },
    'min_samples_leaf' : {
        'values' : [25]
        },
    'min_samples_split' : {
        'values' : [25]
    
    }
    }

sweep_config['parameters'] = parameters_dict

# 
metric = {
    'name': 'validation_loss',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric

import pprint
pprint.pprint(sweep_config)

In [None]:
# define all function to train model
def RF(max_depth=10, min_samples_leaf= 25, min_samples_split = 25, n_estimators=1000, random_state=42):

  model = RandomForestRegressor(max_depth=max_depth, min_samples_leaf= min_samples_leaf, min_samples_split = min_samples_split, n_estimators=n_estimators, random_state=42)
  return model

def train(model):  
    model.fit(X_train_val_noon, y_train_val_noon) 

    y_pred_train = model.predict(X_train_val_noon)
    y_pred_train[y_pred_train < 0] = 0

    y_pred_val = model.predict(X_val_noon)
    y_pred_val[y_pred_val < 0] = 0

    train_rmse = (mean_squared_error(y_train_val_noon, y_pred_train))** 0.5
    train_mae = mean_absolute_error(y_train_val_noon, y_pred_train)

    val_rmse = (mean_squared_error(y_val_noon, y_pred_val))** 0.5
    val_mae = mean_absolute_error(y_val_noon, y_pred_val)

    wandb.log({
        'train_rmse': train_rmse,
        'train_mae': train_mae, 
        'val_rmse': val_rmse, 
        'val_mae': val_mae
      })

    
def sweep_train(config_defaults=None):
    # Initialize wandb with a sample project name
    with wandb.init(config=config_defaults):  # this gets over-written in the Sweep

        wandb.config.architecture_name = "RF"
        wandb.config.dataset_name = "validationr"

        # initialize model
        model = RF(max_depth = wandb.config.max_depth, 
                   min_samples_leaf = wandb.config.min_samples_leaf, 
                   min_samples_split = wandb.config.min_samples_split, 
                   n_estimators = wandb.config.n_estimators
                   )
        train(model)

sweep_id = wandb.sweep(sweep_config, project="HyperTune_RF_noon_latest")
wandb.agent(sweep_id, function=sweep_train)
wandb.finish()

## LGBM

In [None]:
# choose hyperparameter choice 
sweep_config = {'method': 'grid'}
parameters_dict = {
    'boosting_type': {
        'values' : ['gbdt','goss'] },
    'n_estimators' : {
        'values' : [500, 1000, 1500,2000] },
    'max_depth' : {
        'values' : [5,10 ,15,20]},
    'learning_rate' : {
        'values' : [0.005, 0.01, 0.05, 0.1] },
    'max_bin' :{
        'values' : [256, 512] },
    'reg_alpha': {
        'values': [0,1e-1, 1] },

#     'reg_lambda': {
#         'values' : [0, 1e-1, 1] },        
    }
    

sweep_config['parameters'] = parameters_dict

# 
metric = {
    'name': 'validation_loss',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric

import pprint
pprint.pprint(sweep_config)

In [None]:
# define all function to train model
from lightgbm import LGBMRegressor
def lgbm(boosting_type = 'gbdt', learning_rate = 0.1, 
     max_bin=256, max_depth = 5, n_estimators=100, reg_alpha=0):

  model = LGBMRegressor(boosting_type = boosting_type, learning_rate = learning_rate, 
                             max_bin=max_bin, max_depth = max_depth, n_estimators=n_estimators, 
                            reg_alpha=reg_alpha)
  return model

def train(model):  

    
    model.fit(X_train_val_noon, y_train_val_noon) 

    y_pred_train = model.predict(X_train_val_noon)
    y_pred_train[y_pred_train < 0] = 0

    y_pred_val = model.predict(X_val_noon)
    y_pred_val[y_pred_val < 0] = 0

    train_rmse = (mean_squared_error(y_train_val_noon, y_pred_train))** 0.5
    train_mae = mean_absolute_error(y_train_val_noon, y_pred_train)

    val_rmse = (mean_squared_error(y_val_noon, y_pred_val))** 0.5
    val_mae = mean_absolute_error(y_val_noon, y_pred_val)

    wandb.log({
        'train_rmse': train_rmse,
        'train_mae': train_mae, 
        'val_rmse': val_rmse, 
        'val_mae': val_mae
      })

def sweep_train(config_defaults=None):
    # Initialize wandb with a sample project name
    with wandb.init(config=config_defaults):  # this gets over-written in the Sweep
        wandb.config.architecture_name = "LightGBM"

        # initialize model
        model = lgbm(boosting_type = wandb.config.boosting_type, learning_rate = wandb.config.learning_rate, 
                             max_bin=wandb.config.max_bin, max_depth = wandb.config.max_depth, 
                             n_estimators=wandb.config.n_estimators, 
                            reg_alpha=wandb.config.reg_alpha)
        train(model)
sweep_id = wandb.sweep(sweep_config, project="HypeTune_lgbm_noon")
wandb.agent(sweep_id, function=sweep_train)
wandb.finish()

## ANN

In [None]:
fh_step=1
df_train, df_val, df_test, scaler = split_to_each_step(all_sites_df, fh_step=fh_step)
print(f'model {fh_step} step, the data have {df_train.shape[0]} samples for training and {df_val.shape[0]} for validation')

In [None]:
X_train, y_train = df_train.drop(columns=[f'I_lead_{fh_step}step']),   df_train[f'I_lead_{fh_step}step']
X_val, y_val = df_val.drop(columns=[f'I_lead_{fh_step}step']),   df_test[f'I_lead_{fh_step}step']


In [None]:
import wandb
key = ''
wandb.login(key=key)

sweep_config = {'method': 'grid'}

# choose hyperparameter choice 
parameters_dict = {
    'learning_rate' : {
        'values' : [0.005, 0.01, 0.05]
    },
    'no_layers' : {
        'values' : [1, 2, 3, 4, 5]
        },
    'dense' : {
        'values' : [32, 64]
        },
    }

sweep_config['parameters'] = parameters_dict

# 
metric = {
    'name': 'val_loss',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric

import pprint
pprint.pprint(sweep_config)

In [None]:
# define all function to train model
from keras import backend as K
def MLP(nb_features=10, no_layers = 3, dense = 32):
  K.clear_session()
  model = Sequential()
  model.add(tf.keras.Input(shape=(nb_features,)))


  for i in range(no_layers):
    model.add(Dense(dense, activation='relu'))

  model.add(Dense(1,activation='relu'))
  model.compile(loss="mean_absolute_error", optimizer='adam',
                metrics=[RootMeanSquaredError(), MeanAbsoluteError()])
  return model


def train(model, batch_size=64, epochs = 30, lr=1e-3, optimizer='adam'):  
    
    # Compile model like you usually do.
    tf.keras.backend.clear_session()
    model.compile(loss="mean_absolute_error", optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                metrics=[RootMeanSquaredError(), MeanAbsoluteError()])

    # callback setup
    cbs = [WandbCallback()]

    model.fit(X_train, 
              y_train, 
              batch_size=batch_size, 
              epochs=epochs,
              validation_data=(X_val, y_val), 
              # learning_rate = lr,
              callbacks=cbs)
    
def sweep_train(config_defaults=None):
    # Initialize wandb with a sample project name
    with wandb.init(config=config_defaults):  # this gets over-written in the Sweep
        wandb.config.architecture_name = "ANN"
        wandb.config.dataset_name = "validation"
        # initialize model
        model = MLP(nb_features = X_train.shape[1], no_layers = wandb.config.no_layers, dense = wandb.config.dense)
        train(model, 
              # wandb.config.batch_size,
              epochs = 30,
              lr= wandb.config.learning_rate
              )