# Import all libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import math

import os
import datetime
from  datetime import timedelta, time

import warnings
warnings.filterwarnings("ignore")
import lightgbm as lgb
from lightgbm import LGBMRegressor

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression 
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Define relevant class 

In [None]:
class ModelSplitter:
    """Build per-period (morning / noon / evening) train and test sets.

    The wrapped frame must contain a datetime-typed ``Datetime`` column, a
    ``site`` column and the feature/target columns that ``split_train_test``
    selects for the requested forecast step.

    Period boundaries are stored as fractional hours of the day
    (e.g. ``6.5`` is 06:30) and are sampled on a 30-minute grid.
    """

    def __init__(self, df):
        self.df = df
        self.start_morn = 6.5
        self.end_morn = 9.0
        self.start_noon = 9.5
        self.end_noon = 15.0
        self.start_even = 15.5
        self.end_even = 17.0

    @staticmethod
    def _half_hour_times(start_hour, end_hour):
        """Return the 30-minute clock times from start_hour to end_hour, inclusive.

        Both arguments are fractional hours. They are converted through whole
        minutes (rounded) so that floating-point noise such as 29.999... does
        not get truncated to the wrong minute.
        """
        start_h, start_m = divmod(int(round(start_hour * 60)), 60)
        end_h, end_m = divmod(int(round(end_hour * 60)), 60)
        return pd.date_range(
            datetime.time(start_h, start_m).strftime('%H:%M:%S'),
            datetime.time(end_h, end_m).strftime('%H:%M:%S'),
            freq='30min',
        ).time

    def _split_period(self, start_hour, end_hour, cols, target,
                      train_date_list, test_date_list):
        """Return (X_train, X_test, y_train, y_test) for one time-of-day window.

        Rows are kept when their clock time lies on the 30-minute grid between
        start_hour and end_hour, restricted to ``cols``, and dropped if any of
        those columns is missing. Train/test membership is decided by the
        calendar date of ``Datetime``.
        """
        in_window = self.df['Datetime'].dt.time.isin(
            self._half_hour_times(start_hour, end_hour)
        )
        period = self.df[in_window][cols].dropna()
        dates = period['Datetime'].dt.date

        train = period[dates.isin(train_date_list)].set_index(['site', 'Datetime'])
        test = period[dates.isin(test_date_list)].set_index(['site', 'Datetime'])

        return (
            train.drop(columns=[target]),
            test.drop(columns=[target]),
            train[target],
            test[target],
        )

    def split_train_test(self, train_date_list, test_date_list, fh_step):
        """Split the data into train/test sets for each time-of-day model.

        Parameters
        ----------
        train_date_list, test_date_list :
            Collections of dates; a row belongs to a set when the date part of
            its ``Datetime`` is contained in the corresponding collection.
        fh_step :
            Forecast step. It selects the ``*_{fh_step}step`` columns and
            shifts the period windows earlier by ``0.5 * fh_step`` hours.

        Returns
        -------
        tuple
            For ``fh_step <= 5`` a 12-tuple ordered morning, noon, evening,
            each as ``X_train, X_test, y_train, y_test``. Otherwise an 8-tuple
            with only the noon and evening groups (no morning model).
            All frames/series are indexed by ``(site, Datetime)``.
        """
        target = f'I_lead_{fh_step}step'
        shift = 0.5 * fh_step

        # Noon model additionally uses the five lagged values of I.
        noon_cols = ['site', 'Datetime', target,
                     'I', f'I_lead_{fh_step}step_back1D',
                     f'hour_index_lead_{fh_step}step', f'iclr_lead_{fh_step}step',
                     'I_lag_1step', 'I_lag_2step', 'I_lag_3step',
                     'I_lag_4step', 'I_lag_5step',
                     'ci_center', f'ci_est(t+{fh_step})']

        # Evening and morning models share the same, lag-free feature set.
        even_cols = ['site', 'Datetime', target, 'I',
                     f'I_lead_{fh_step}step_back1D', f'hour_index_lead_{fh_step}step',
                     f'iclr_lead_{fh_step}step', 'ci_center', f'ci_est(t+{fh_step})']

        noon = self._split_period(self.start_noon - shift, self.end_noon - shift,
                                  noon_cols, target, train_date_list, test_date_list)
        even = self._split_period(self.start_even - shift, self.end_even - shift,
                                  even_cols, target, train_date_list, test_date_list)

        if fh_step <= 5:
            # The morning window keeps its start fixed; only its end is shifted.
            morn = self._split_period(self.start_morn, self.end_morn - shift,
                                      even_cols, target, train_date_list, test_date_list)
            return morn + noon + even

        return noon + even
        
        
        
        

# Import data

In [None]:
# Resolve the data folder relative to the current working directory.
main_path = os.path.abspath(os.getcwd())
csv_path = os.path.join(main_path, 'csv_data')

# Read the processed table (parsing 'Datetime' as timestamps) and discard
# its first column.
csv_file = os.path.join(csv_path, 'processed_all_sites_HS1e0_df_not_imputed_R_channel.csv')
all_sites_df = pd.read_csv(csv_file, parse_dates=['Datetime']).iloc[:, 1:]

In [None]:
import pickle

# Load the four pre-computed date lists. The file is opened in a `with`
# block so the handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code - only load a file you trust.
with open('train_test_date_listVar.pkl', 'rb') as f:
    train_date_list, test_date_list, val_date_list, train_date_val_list = pickle.load(f)

print(f'training set has {len(train_date_val_list)} dates\n'+
      f'validation set has {len(val_date_list)} dates \n'+
      f'test set has {len(test_date_list)}')

# Training 
- Note that the morning model only appears for forecast steps 1-5 (`fh_step <= 5`).

In [None]:
# Wrap the full multi-site table in the splitter used by every model below.
model_data = ModelSplitter(df=all_sites_df)

In [None]:
# Forecast step evaluated in this run; steps <= 5 also yield a morning split.
fh_step = 1

(
    X_train_morn, X_test_morn, y_train_morn, y_test_morn,
    X_train_noon, X_test_noon, y_train_noon, y_test_noon,
    X_train_even, X_test_even, y_train_even, y_test_even,
) = model_data.split_train_test(train_date_list, test_date_list, fh_step=fh_step)


## LR

In [None]:
# One linear regression per time-of-day period.
lr_morn = LinearRegression().fit(X_train_morn, y_train_morn)
lr_noon = LinearRegression().fit(X_train_noon, y_train_noon)
lr_even = LinearRegression().fit(X_train_even, y_train_even)

# Predict each period and replace negative predictions with zero.
raw_morn = lr_morn.predict(X_test_morn)
y_pred_morn = np.where(raw_morn < 0, 0, raw_morn)

raw_noon = lr_noon.predict(X_test_noon)
y_pred_noon = np.where(raw_noon < 0, 0, raw_noon)

raw_even = lr_even.predict(X_test_even)
y_pred_even = np.where(raw_even < 0, 0, raw_even)

# Stack predictions and targets in the same morning / noon / evening order.
y_pred_lr = np.concatenate([y_pred_morn, y_pred_noon, y_pred_even])
y_test = pd.concat([y_test_morn, y_test_noon, y_test_even], axis=0)

# Report RMSE, then MAE.
rmse_lr = mean_squared_error(y_test, y_pred_lr) ** 0.5
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print(rmse_lr)
print(mae_lr)

## RF

In [None]:
# Hyper-parameters shared by the three random forests.
rf_params = dict(max_depth=30, min_samples_leaf=25, min_samples_split=25,
                 n_estimators=1500, random_state=42)

# Fit one forest per period and replace negative predictions with zero.
rf_morn = RandomForestRegressor(**rf_params)
rf_morn.fit(X_train_morn, y_train_morn)
raw_morn = rf_morn.predict(X_test_morn)
y_pred_morn = np.where(raw_morn < 0, 0, raw_morn)

rf_noon = RandomForestRegressor(**rf_params)
rf_noon.fit(X_train_noon, y_train_noon)
raw_noon = rf_noon.predict(X_test_noon)
y_pred_noon = np.where(raw_noon < 0, 0, raw_noon)

rf_even = RandomForestRegressor(**rf_params)
rf_even.fit(X_train_even, y_train_even)
raw_even = rf_even.predict(X_test_even)
y_pred_even = np.where(raw_even < 0, 0, raw_even)

# Stack predictions and targets in the same morning / noon / evening order.
y_pred_rf = np.concatenate([y_pred_morn, y_pred_noon, y_pred_even])
y_test = pd.concat([y_test_morn, y_test_noon, y_test_even], axis=0)

# Report RMSE, then MAE.
rmse_rf = mean_squared_error(y_test, y_pred_rf) ** 0.5
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(rmse_rf)
print(mae_rf)

## SVR

In [None]:
# Standardise the features per period. Each scaler is fitted on the training
# split only and then applied to both the training and the test split.
morn_scaler = StandardScaler().fit(X_train_morn)
X_train_morn_scaled = morn_scaler.transform(X_train_morn)
X_test_morn_scaled = morn_scaler.transform(X_test_morn)

noon_scaler = StandardScaler().fit(X_train_noon)
X_train_noon_scaled = noon_scaler.transform(X_train_noon)
X_test_noon_scaled = noon_scaler.transform(X_test_noon)

even_scaler = StandardScaler().fit(X_train_even)
X_train_even_scaled = even_scaler.transform(X_train_even)
X_test_even_scaled = even_scaler.transform(X_test_even)

In [None]:
# Hyper-parameters shared by the three support-vector regressors.
svr_params = dict(kernel='rbf', C=100, epsilon=1, gamma=0.1)

# Fit one SVR per period on the scaled features; replace negative
# predictions with zero.
svr_morn = SVR(**svr_params)
svr_morn.fit(X_train_morn_scaled, y_train_morn)
raw_morn = svr_morn.predict(X_test_morn_scaled)
y_pred_morn = np.where(raw_morn < 0, 0, raw_morn)

svr_noon = SVR(**svr_params)
svr_noon.fit(X_train_noon_scaled, y_train_noon)
raw_noon = svr_noon.predict(X_test_noon_scaled)
y_pred_noon = np.where(raw_noon < 0, 0, raw_noon)

svr_even = SVR(**svr_params)
svr_even.fit(X_train_even_scaled, y_train_even)
raw_even = svr_even.predict(X_test_even_scaled)
y_pred_even = np.where(raw_even < 0, 0, raw_even)

# Stack predictions and targets in the same morning / noon / evening order.
y_pred_svr = np.concatenate([y_pred_morn, y_pred_noon, y_pred_even])
y_test = pd.concat([y_test_morn, y_test_noon, y_test_even], axis=0)

# Report RMSE, then MAE.
rmse_svr = mean_squared_error(y_test, y_pred_svr) ** 0.5
mae_svr = mean_absolute_error(y_test, y_pred_svr)
print(rmse_svr)
print(mae_svr)

## LGBM

In [None]:
# Hyper-parameters shared by the three LightGBM regressors.
lgbm_params = dict(boosting_type='goss', learning_rate=0.005, max_depth=10,
                   n_estimators=2000)

# Fit one booster per period and replace negative predictions with zero.
lgbm_morn = LGBMRegressor(**lgbm_params)
lgbm_morn.fit(X_train_morn, y_train_morn)
raw_morn = lgbm_morn.predict(X_test_morn)
y_pred_morn = np.where(raw_morn < 0, 0, raw_morn)

lgbm_noon = LGBMRegressor(**lgbm_params)
lgbm_noon.fit(X_train_noon, y_train_noon)
raw_noon = lgbm_noon.predict(X_test_noon)
y_pred_noon = np.where(raw_noon < 0, 0, raw_noon)

lgbm_even = LGBMRegressor(**lgbm_params)
lgbm_even.fit(X_train_even, y_train_even)
raw_even = lgbm_even.predict(X_test_even)
y_pred_even = np.where(raw_even < 0, 0, raw_even)

# Stack predictions and targets in the same morning / noon / evening order.
y_pred_lgbm = np.concatenate([y_pred_morn, y_pred_noon, y_pred_even])
y_test = pd.concat([y_test_morn, y_test_noon, y_test_even], axis=0)

# Report RMSE, then MAE.
rmse_lgbm = mean_squared_error(y_test, y_pred_lgbm) ** 0.5
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
print(rmse_lgbm)
print(mae_lgbm)


# Save to csv

In [None]:
# Collect the true target and every model's predictions side by side.
# The prediction arrays are positional, so they rely on having been
# concatenated in the same morning / noon / evening order as y_test.
df_test = pd.DataFrame(y_test)
prediction_columns = {
    'I_pred_lr': y_pred_lr,
    'I_pred_rf': y_pred_rf,
    'I_pred_svr': y_pred_svr,
    'I_pred_lgbm': y_pred_lgbm,
}
for column_name, predictions in prediction_columns.items():
    df_test[column_name] = predictions

# One output file per forecast step.
df_test.to_csv(f'df_test_{fh_step}step.csv')