# LGBM VMD Model development

In [1]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle

In [2]:
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna
from vmdpy import VMD

In [3]:
from Functions.helper_functions import * 

In [4]:
import warnings
# Suppress every warning of category UserWarning for the rest of the session.
warnings.filterwarnings(action='ignore', category=UserWarning)
# Turn off pandas' chained-assignment check (its default setting is 'warn').
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
# NOTE(review): this does not seed Optuna's sampler or LightGBM, which take their
# own seeds — confirm whether those need fixing for full reproducibility.
np.random.seed(42)

# Data

In [6]:
# Load the preprocessed training frame of each of the six WP datasets.
# One parameterized read replaces six copy-pasted lines, so the path pattern
# lives in a single place.
train_wp1, train_wp2, train_wp3, train_wp4, train_wp5, train_wp6 = (
    pd.read_csv(f'Data/Preprocessing/WP{wp_id}_train_preprocessed.csv', sep=',')
    for wp_id in range(1, 7)
)

In [7]:
# Load the preprocessed test frame of each of the six WP datasets.
# One parameterized read replaces six copy-pasted lines, so the path pattern
# lives in a single place.
test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6 = (
    pd.read_csv(f'Data/Preprocessing/WP{wp_id}_test_preprocessed.csv', sep=',')
    for wp_id in range(1, 7)
)
# Values of the `date` column of the initial test file, as a NumPy array.
# Explicit column lookup avoids any clash with DataFrame attribute names.
test_dates = pd.read_csv('Data/Initial/test.csv', sep=',')['date'].values

In [8]:
# Columns removed from every training frame before modelling; this list is
# extended with the excluded feature columns in the next cell.
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

In [9]:
# Feature columns excluded from the model inputs, grouped by name prefix.
# NOTE(review): the combined list below is called `feature_corr`, so these were
# presumably selected by a correlation analysis — confirm.

# Excluded `u_*` and `u2_*` columns.
u_to_drop = [
    'u_T_1', 'u_T_2', 'u_T_3', 'u_T_4', 'u_T_5', 'u_T_6', 
    'u_T_2_mean', 'u_T_3_mean', 'u_T_4_mean', 'u_T_5_mean', 'u_T_6_mean', 'u_T_7_mean',
    'u_T_8_mean', 'u_T_9_mean', 'u_T_10_mean', 'u_T_11_mean', 'u_T_12_mean','u_T_24_mean',
    'u_T_2_std', 'u_T_4_std', 'u_T_5_std', 'u_T_6_std',
    'u_T_2_median', 'u_T_3_median', 'u_T_4_median', 'u_T_5_median', 'u_T_6_median', 'u_T_12_median','u_T_24_median', 'u_T_36_median',
    'u_T_2_max', 'u_T_3_max', 'u_T_4_max', 'u_T_5_max', 'u_T_6_max', 'u_T_12_max',
    'u_T_2_min', 'u_T_3_min', 'u_T_4_min', 'u_T_5_min', 'u_T_6_min', 'u_T_12_min',
    'u2_T_1', 'u2_T_2', 'u2_T_3', 'u2_T_4', 'u2_T_5', 'u2_T_6', 
    'u2_T_2_mean', 'u2_T_3_mean', 'u2_T_4_mean', 'u2_T_5_mean', 'u2_T_6_mean', 'u2_T_7_mean',
    'u2_T_8_mean', 'u2_T_9_mean', 'u2_T_10_mean', 'u2_T_11_mean', 'u2_T_12_mean','u2_T_24_mean',
    'u2_T_2_std', 'u2_T_4_std', 'u2_T_5_std', 'u2_T_6_std', 'u2_T_24_std',
    'u2_T_2_median', 'u2_T_3_median', 'u2_T_4_median', 'u2_T_5_median', 'u2_T_6_median', 'u2_T_12_median',
    'u2_T_2_max','u2_T_3_max', 'u2_T_4_max','u2_T_5_max', 'u2_T_6_max', 'u2_T_12_max',
    'u2_T_2_min', 'u2_T_3_min', 'u2_T_4_min', 'u2_T_5_min', 'u2_T_6_min',
    'u2_T_12', 'u2_T_36_mean', 'u2_T_36_std', 'u2_T_24_median', 'u2_T_24_max',
    'u_T_36_mean','u_T_12','u_T_24_max','u2_T_36_median','u_T_24_min'
]
# Excluded `ws_*` columns.
ws_to_drop = [
    'ws_T_1', 'ws_T_2', 'ws_T_3', 'ws_T_4', 'ws_T_5', 'ws_T_6', 'ws_T_7', 'ws_T_8', 'ws_T_10','ws_T_11', 'ws_T_12',
    'ws_T_2_mean', 'ws_T_3_mean', 'ws_T_4_mean', 'ws_T_5_mean', 'ws_T_6_mean', 'ws_T_7_mean', 'ws_T_8_mean', 'ws_T_9_mean', 
    'ws_T_10_mean', 'ws_T_11_mean', 'ws_T_12_mean', 'ws_T_24_mean', 
    'ws_T_2_std', 'ws_T_3_std', 'ws_T_4_std', 'ws_T_5_std', 
    'ws_T_2_median', 'ws_T_3_median', 'ws_T_4_median', 'ws_T_5_median', 'ws_T_6_median',
    'ws_T_12_median', 'ws_T_24_median', 'ws_T_36_median',
    'ws_T_2_max', 'ws_T_3_max', 'ws_T_4_max', 'ws_T_5_max','ws_T_6_max', 'ws_T_12_max',
     'ws_T_2_min', 'ws_T_3_min', 'ws_T_4_min', 'ws_T_5_min', 'ws_T_6_min', 'ws_T_12_min','ws_T_24_max','ws_T_24_min'
]

# Excluded `v_*` columns.
v_to_drop = [
    'v_T_1', 'v_T_2', 'v_T_3', 'v_T_4', 'v_T_5', 'v_T_6', 
    'v_T_2_mean', 'v_T_3_mean', 'v_T_4_mean', 'v_T_5_mean', 'v_T_6_mean', 'v_T_7_mean',
    'v_T_8_mean', 'v_T_9_mean', 'v_T_10_mean', 'v_T_11_mean', 'v_T_12_mean', 'v_T_24_mean','v_T_36_mean',
    'v_T_3_std', 'v_T_4_std', 'v_T_5_std','v_T_6_std','v_T_24_std', 'v_T_36_median',
    'v_T_2_median', 'v_T_3_median', 'v_T_4_median', 'v_T_5_median', 'v_T_6_median', 
    'v_T_2_max', 'v_T_3_max', 'v_T_4_max', 'v_T_5_max', 'v_T_6_max', 'v_T_12_max', 
    'v_T_2_min', 'v_T_3_min', 'v_T_4_min', 'v_T_5_min', 'v_T_6_min', 'v_T_12_min', 
    'v_T_36_min', 'v_T_36', 'v_T_24_max',  'v_T_12_median', 'v_T_24_median',
]

# Excluded `coswd_*` columns; the last row also lists two `ws_T_36_*` columns.
wd_to_drop = [
    'coswd_1', 'coswd_2', 'coswd_3', 'coswd_4', 'coswd_5', 'coswd_6',
    'coswd_2_mean', 'coswd_3_mean', 'coswd_4_mean', 'coswd_5_mean', 'coswd_6_mean', 'coswd_7_mean', 
    'coswd_8_mean', 'coswd_9_mean', 'coswd_10_mean', 'coswd_11_mean', 'coswd_12_mean', 'coswd_24_mean', 
    'coswd_3_std', 'coswd_4_std','coswd_5_std','coswd_2_median', 'coswd_3_median','coswd_4_median', 
    'coswd_5_median', 'coswd_6_median', 'coswd_36_median', 'coswd_24_median', 'coswd_12_median',
    'coswd_2_max', 'coswd_3_max', 'coswd_4_max', 'coswd_5_max', 'coswd_6_max', 'coswd_12_max', 'coswd_24_max',
    'coswd_2_min', 'coswd_3_min', 'coswd_4_min', 'coswd_5_min', 'coswd_6_min', 'coswd_12_min', 'coswd_24_min',
    'ws_T_36_max', 'ws_T_36_min', 'coswd_12', 'coswd_24'
]

# Remaining excluded columns that do not belong to the groups above.
other_to_drop = [
    'cos_day', 'u', 'v'
]

# Every feature column excluded from the model inputs, in one list.
feature_corr = u_to_drop + ws_to_drop + v_to_drop + wd_to_drop + other_to_drop
# Prepend the excluded features to the drop list. Entries of `to_drop` that are
# already in `feature_corr` are filtered out first, so re-running this cell
# rebuilds the same list instead of growing it on every execution.
to_drop = feature_corr + [col for col in to_drop if col not in feature_corr]

# LGBM functions

In [10]:
from lightgbm import LGBMRegressor

In [11]:
def _stack_batches(parts):
    """Concatenate the selected batches in one call and return a DataFrame."""
    if not parts:
        # No batch was selected: hand back an empty frame.
        return pd.DataFrame()
    stacked = pd.concat(parts)
    if isinstance(stacked, pd.Series):
        # Keep the return type uniform: a Series input comes back as a
        # single-column DataFrame.
        stacked = stacked.to_frame(0)
    return stacked


def create_dataset(data,n,split,batch_size=84):
    """Split ``data`` into a training part and a validation part, batch by batch.

    The rows are cut, in their existing order, into ``len(data) // batch_size``
    consecutive batches of (nearly) equal size. Batch ``i`` is assigned to the
    validation part when ``(i + n) % split == 0`` and to the training part
    otherwise, so letting ``n`` run over ``range(split)`` rotates which batches
    are held out.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Rows to split; the original index is preserved in both outputs.
    n : int
        Offset selecting which batches are held out.
    split : int
        Every ``split``-th batch (after the offset) goes to validation.
    batch_size : int, default 84
        Approximate number of rows per batch.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val)``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``data`` holds fewer rows than one
        batch.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    n_batch = len(data) // batch_size
    if n_batch == 0:
        raise ValueError(
            f"data has {len(data)} rows, which is fewer than one batch of {batch_size}"
        )

    # Same cut points as np.array_split(data, n_batch), but computed on row
    # positions and applied with .iloc, so the pandas object is sliced by pandas
    # itself rather than through NumPy.
    positions = np.array_split(np.arange(len(data)), n_batch)

    train_parts, val_parts = [], []
    for i, pos in enumerate(positions):
        chunk = data.iloc[pos[0]:pos[-1] + 1]
        if (i + n) % split == 0:
            val_parts.append(chunk)
        else:
            train_parts.append(chunk)

    # Concatenate once per side instead of once per batch.
    return _stack_batches(train_parts), _stack_batches(val_parts)

In [12]:
def hyperparametrization_cv(trial, x,y,cv):
    """Score one Optuna trial of LightGBM hyper-parameters by cross-validation.

    Hyper-parameters are sampled from ``trial``, wrapped in a
    ``MinMaxScaler`` + ``LGBMRegressor`` pipeline, and evaluated on ``cv`` folds
    built with ``create_dataset`` (fold ``n`` holds out the batches selected by
    offset ``n``).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    x, y :
        Features and target, split with ``create_dataset``.
    cv : int
        Number of folds.

    Returns
    -------
    float
        Mean absolute error averaged over the folds (the value Optuna minimizes).
    """
    param = {
        'n_jobs': -1,
        'metric': 'rmse',
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        # NOTE(review): in LightGBM, `subsample` only takes effect when
        # `subsample_freq` > 0; it is left at its default here, so confirm
        # whether tuning this value changes anything.
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int("max_depth", 20, 100),
        # LightGBM requires num_leaves > 1, so the search starts at 2.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
    }

    # NOTE(review): the second step is named 'ridge' but wraps an LGBMRegressor.
    model = Pipeline([('scaler', MinMaxScaler()),('ridge', LGBMRegressor(**param))])

    mae = []
    for n in range(cv):
        train_x, test_x = create_dataset(x, n, cv)
        train_y, test_y = create_dataset(y, n, cv)
        # create_dataset returns the target as a frame; flatten it to 1-D for fit.
        model.fit(train_x, train_y.values.ravel())
        preds = model.predict(test_x)
        mae.append(mean_absolute_error(test_y, preds))

    return np.mean(mae)

In [21]:
def lgbm_scaled_cross_validation(X, y, params,cv):
    """Cross-validate a ``RobustScaler`` + ``LGBMRegressor`` pipeline and print the scores.

    Folds are built with ``create_dataset``: fold ``k`` (0-based) holds out the
    batches selected by offset ``k``. For every fold the per-fold evaluation is
    printed; at the end the RMSE and MAE scores of all folds are summarised with
    ``display_scores``.

    Parameters
    ----------
    X, y :
        Features and target, split with ``create_dataset``.
    params : dict or None
        Keyword arguments for ``LGBMRegressor``; ``None`` uses its defaults.
    cv : int
        Number of folds.

    Returns
    -------
    None
        Results are only printed.
    """
    regressor = LGBMRegressor() if params is None else LGBMRegressor(**params)
    # NOTE(review): the second step is named 'ridge' but wraps an LGBMRegressor.
    model = Pipeline([('scaler', RobustScaler()),('ridge', regressor)])

    print('-----------LGBM CROSS VALIDATION BEGINNING-----------')
    lgbm_rmse_scores = []
    lgbm_mae_scores = []
    for fold in range(cv):
        X_train, X_test = create_dataset(X, fold, cv)
        Y_train, Y_test = create_dataset(y, fold, cv)

        # create_dataset returns the target as a frame; flatten it to 1-D for fit.
        model.fit(X_train, Y_train.values.ravel())

        prediction = model.predict(X_test)
        # Root of the MSE, computed explicitly rather than via `squared=False`.
        lgbm_rmse_scores.append(np.sqrt(mean_squared_error(Y_test, prediction)))
        lgbm_mae_scores.append(mean_absolute_error(Y_test, prediction))

        # Banner first, so each fold's scores appear under their own heading.
        print(f'-------------------FOLD {fold + 1}-----------------')
        # show_evaluation prints the scores itself and returns None, so its
        # return value is not printed.
        show_evaluation(prediction, Y_test)

    print('---------------CROSS VALIDATION COMPLETE-------------')
    print('--------------------------RMSE-----------------------')
    display_scores(lgbm_rmse_scores)
    print('--------------------------MAE------------------------')
    display_scores(lgbm_mae_scores)

## WP1 

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10344875448880764 | 0.0019070131550065564 |  |
| No params - StandardScaler | RMSE | 0.10355855436437653 | 0.0019043784372002228 |  |
| After tuning 50trials| RMSE | 0.06830085723562579 | 0.0012998671387256361 | To keep, maybe redo optuna with warm start with it - first sub|
| After tuning 100trials| RMSE | 0.0695930431604128 | 0.0015123892627707553 | |
| After tuning W.S 60trials | RMSE | 0.0671868765178121 | 0.0019357892229500213 | second sub - BETTER |
| After tuning W.S 50trials - StandardScaler | RMSE | 0.06705406618054667 | 0.0013509038914922506 | with warm start 1 parameters |
| After tuning W.S 50trials - RobustScaler | RMSE | 0.06736753027260156 | 0.0009971394767292064 | |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.07306057409517844 | 0.0009857628815465762 |  |
| No params - StandardScaler | MAE | 0.07310137331348271 | 0.0010945145617042643 |  |
| After tuning 50trials| MAE | 0.04481696427654311 | 0.000727722171899004 |  |
| After tuning 100trials| MAE | 0.045696725242359994 | 0.0008508558100930331 |  |
| After tuning W.S 60trials | MAE | 0.04322865305311156 | 0.0009454253743042544 | |
| After tuning W.S 50trials - StandardScaler | MAE | 0.043173032898780175 | 0.0007696056550296055 | with warm start 1 parameters |
| After tuning W.S 50trials - RobustScaler | MAE | 0.043246198690701294 | 0.000596545751975941 | |

In [14]:
# Move the target column `wp` to the last position, then remove the excluded columns.
# `to_drop` already contains every entry of `feature_corr`, so it is passed on
# its own, exactly as for the other WP datasets.
wp1_X = train_wp1[[c for c in train_wp1 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
# Features and target for WP1.
X1 = wp1_X.drop('wp', axis=1)
y1 = wp1_X['wp']

In [15]:
def objective_wp1(trial, data=X1, target=y1, cv=5):
    """Optuna objective for WP1.

    Delegates to ``hyperparametrization_cv`` and returns its score. ``data`` and
    ``target`` default to ``X1`` and ``y1`` as they exist when this cell runs.
    """
    score = hyperparametrization_cv(trial, x=data, y=target, cv=cv)
    return score

In [16]:
# Seed the sampler explicitly so the search proposes the same sequence of
# trials on every run; seeding NumPy alone does not cover Optuna.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective_wp1, n_trials=50)
#write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp1', study.trials_dataframe())
# Best hyper-parameters found for WP1.
params_1 = study.best_params

[32m[I 2021-09-20 12:55:51,818][0m A new study created in memory with name: no-name-62a730ed-30f7-481b-8853-f215b023688c[0m
[32m[I 2021-09-20 12:56:44,498][0m Trial 0 finished with value: 0.07086629332091385 and parameters: {'reg_alpha': 1.914354919226198, 'reg_lambda': 0.048178000856428704, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.09386243461297668, 'max_depth': 20, 'num_leaves': 816, 'min_child_samples': 3}. Best is trial 0 with value: 0.07086629332091385.[0m
[32m[I 2021-09-20 12:57:10,431][0m Trial 1 finished with value: 0.1587432561996897 and parameters: {'reg_alpha': 0.3775020995328389, 'reg_lambda': 1.4996066336672706, 'colsample_bytree': 0.5, 'subsample': 0.6, 'learning_rate': 0.007961715331160727, 'max_depth': 71, 'num_leaves': 736, 'min_child_samples': 100}. Best is trial 0 with value: 0.07086629332091385.[0m
[32m[I 2021-09-20 12:57:39,538][0m Trial 2 finished with value: 0.08124137737544093 and parameters: {'reg_alpha': 3.0264211159547183, 'reg

In [17]:
#optuna.logging.set_verbosity(0)

In [18]:
optuna.logging.get_verbosity()

20

In [19]:
params_1

{'reg_alpha': 0.018078983057199488,
 'reg_lambda': 0.002174603964707146,
 'colsample_bytree': 1.0,
 'subsample': 1.0,
 'learning_rate': 0.11068373388876057,
 'max_depth': 49,
 'num_leaves': 552,
 'min_child_samples': 9}

In [22]:
lgbm_scaled_cross_validation(X=X1,y=y1, params=params_1,cv=5)

-----------LGBM CROSS VALIDATION BEGINNING-----------
RMSE score: 0.10052722451502887
MAE score: 0.06470481785395771
None
-------------------FOLD 1-----------------
RMSE score: 0.10168527605846683
MAE score: 0.06455614996095457
None
-------------------FOLD 2-----------------
RMSE score: 0.100693139684104
MAE score: 0.06581741750655307
None
-------------------FOLD 3-----------------
RMSE score: 0.10691986902185474
MAE score: 0.06808964934479528
None
-------------------FOLD 4-----------------
RMSE score: 0.10579764147593695
MAE score: 0.06807591151598474
None
-------------------FOLD 5-----------------
---------------CROSS VALIDATION COMPLETE-------------
--------------------------RMSE-----------------------
Scores: [0.10052722451502887, 0.10168527605846683, 0.100693139684104, 0.10691986902185474, 0.10579764147593695]
Mean: 0.10312463015107828
Std: 0.0026936677509359136
--------------------------MAE------------------------
Scores: [0.06470481785395771, 0.06455614996095457, 0.0658174175065

## WP2

| |  | Mean | Std | Sum up|
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10935335541057582 | 0.0014425096116734836 | |
| No params - StandardScaler | RMSE | 0.10938240918068962 | 0.002563571983412213 |  |
| After tuning - 50trials| RMSE | 0.0725081520968898 | 0.0016974702626377217 | |
| After tuning 100trials| RMSE | 0.0707064364904941 | 0.001396820290618349 | More stable, to keep |
| After tuning W.S. 50trials - StandardScaler | RMSE | 0.07072007438762447 | 0.0010396359382112443 | With 100trials parameters |
| After tuning W.S. 50trials - RobustScaler | RMSE | 0.07014346508497502 | 0.0013671602187583412 | |
| 50 trials per vmf - RobustScaler | RMSE | 0.06362570142793085 |0.001287144743120583 | |
| --- | --- | --- | --- |---|
| No params | MAE | 0.07681923856705511 | 0.0008670825615244791 | |
| No params - StandardScaler | MAE | 0.0768829843356125 | 0.0017568758903956435 |  |
| After tuning - 50trials| MAE | 0.04512164110351975 |  0.0006579433030966575 | |
| After tuning 100trials| MAE | 0.04457902842458915 | 0.0006807155447311589 | |
| After tuning W.S. 50trials - StandardScaler | MAE | 0.04456161763710905 | 0.0005342365137369251 | With 100trials parameters |
| After tuning W.S. 50trials - RobustScaler | MAE | 0.044471464183373825 | 0.0007098731360466324 | |
| 50 trials per vmf - RobustScaler | MAE | 0.04180182904890137 |0.000707244027895384 | |

In [23]:
# Put the target column `wp` last and remove the excluded columns,
# then separate the features from the target.
wp2_X = train_wp2[[col for col in train_wp2.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X2 = wp2_X.drop(columns='wp')
y2 = wp2_X['wp']

In [24]:
def objective_wp2(trial, data=X2, target=y2, cv=5):
    """Optuna objective for WP2.

    Delegates to ``hyperparametrization_cv`` and returns its score. ``data`` and
    ``target`` default to ``X2`` and ``y2`` as they exist when this cell runs.
    """
    score = hyperparametrization_cv(trial, x=data, y=target, cv=cv)
    return score

In [25]:
# Seed the sampler explicitly so the search proposes the same sequence of
# trials on every run; seeding NumPy alone does not cover Optuna.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective_wp2, n_trials=50)
#write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp2', study.trials_dataframe())
# Best hyper-parameters found for WP2.
params_2 = study.best_params

[32m[I 2021-09-20 13:51:57,623][0m A new study created in memory with name: no-name-46c9ef4c-25a6-426d-9cf8-5e9ea6f0667e[0m
[32m[I 2021-09-20 13:52:20,776][0m Trial 0 finished with value: 0.1213184581008508 and parameters: {'reg_alpha': 0.10570820429041153, 'reg_lambda': 2.0257743833060546, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.017639060775824406, 'max_depth': 72, 'num_leaves': 940, 'min_child_samples': 137}. Best is trial 0 with value: 0.1213184581008508.[0m
[32m[I 2021-09-20 13:53:03,511][0m Trial 1 finished with value: 0.20692303868201672 and parameters: {'reg_alpha': 0.012166727964453908, 'reg_lambda': 4.080894028014074, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.0012969444877625011, 'max_depth': 51, 'num_leaves': 811, 'min_child_samples': 46}. Best is trial 0 with value: 0.1213184581008508.[0m
[32m[I 2021-09-20 13:53:31,397][0m Trial 2 finished with value: 0.20759802200298197 and parameters: {'reg_alpha': 0.8760125296592365, 'r

In [26]:
params_2

{'reg_alpha': 0.029402734043432143,
 'reg_lambda': 0.45456212808233626,
 'colsample_bytree': 0.8,
 'subsample': 0.8,
 'learning_rate': 0.13658229494995808,
 'max_depth': 29,
 'num_leaves': 465,
 'min_child_samples': 13}

In [27]:
lgbm_scaled_cross_validation(X=X2,y=y2, params=params_2,cv=5)

-----------LGBM CROSS VALIDATION BEGINNING-----------
RMSE score: 0.10053975151190245
MAE score: 0.06502171920829278
None
-------------------FOLD 1-----------------
RMSE score: 0.1134078708101767
MAE score: 0.07468897852392722
None
-------------------FOLD 2-----------------
RMSE score: 0.11153925835927998
MAE score: 0.07232049986577652
None
-------------------FOLD 3-----------------
RMSE score: 0.10207558696622762
MAE score: 0.06666172738125264
None
-------------------FOLD 4-----------------
RMSE score: 0.1057548915869517
MAE score: 0.06703597850596066
None
-------------------FOLD 5-----------------
---------------CROSS VALIDATION COMPLETE-------------
--------------------------RMSE-----------------------
Scores: [0.10053975151190245, 0.1134078708101767, 0.11153925835927998, 0.10207558696622762, 0.1057548915869517]
Mean: 0.10666347184690769
Std: 0.005072166558749562
--------------------------MAE------------------------
Scores: [0.06502171920829278, 0.07468897852392722, 0.07232049986577

## WP3

| |  | Mean | Std ||
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10392558077951244 | 0.0019038044796542812 ||
| No params - StandardScaler | RMSE | 0.10354114984428979 | 0.0013629826554706927 ||
| After tuning - 50trials| RMSE | 0.058253804820626545 | 0.0009893279354834155 | More stable, to keep |
| After tuning - 100trials| RMSE | 0.058338944346627106 | 0.0017133930174837203 ||
| After tuning W.S. - 50trials - StandardScaler | RMSE | 0.05839355310487706 | 0.0009599824558874801 ||
| After tuning - 50trial - StandardScaler | RMSE | 0.05828701989178382 | 0.0013469437248627486 | with 50trials best|
| After tuning - 50trial W.S. - RobustScaler | RMSE | 0.05907568414783913 | 0.0010120672015320588 | |
| 50 trials per vmf - RobustScaler | RMSE | 0.055222505478173776 |0.001617631570145444 | |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.07550802464973318 | 0.0012006073434917633 ||
| No params - StandardScaler | MAE | 0.0753220326933334 | 0.0007353049410651828 | |
| After tuning - 50trials| MAE | 0.03787310900962521 | 0.000442034368456366 ||
| After tuning - 100trials| MAE | 0.03838030476025398 | 0.0007480100565996748 ||
| After tuning W.S. - 50trials -StandardScaler | MAE | 0.03838277636708219 | 0.0006195357541130345 ||
| After tuning - 50trials - StandardScaler | MAE | 0.03796602386426936 | 0.0006588818520419181 | with 50trials best|
| After tuning - 50trial W.S. - RobustScaler | MAE | 0.03824961618901729 | 0.0007043585347168141 | |
| 50 trials per vmf - RobustScaler | MAE | 0.03743940912757508 |0.0007763774066394942 | |


In [28]:
# Put the target column `wp` last and remove the excluded columns,
# then separate the features from the target.
wp3_X = train_wp3[[col for col in train_wp3.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X3 = wp3_X.drop(columns='wp')
y3 = wp3_X['wp']


In [29]:
def objective_wp3(trial, data=X3, target=y3, cv=5):
    """Optuna objective for WP3.

    Delegates to ``hyperparametrization_cv`` and returns its score. ``data`` and
    ``target`` default to ``X3`` and ``y3`` as they exist when this cell runs.
    """
    score = hyperparametrization_cv(trial, x=data, y=target, cv=cv)
    return score

In [30]:
# Seed the sampler explicitly so the search proposes the same sequence of
# trials on every run; seeding NumPy alone does not cover Optuna.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective_wp3, n_trials=50)
#write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp3', study.trials_dataframe())
# Best hyper-parameters found for WP3.
params_3 = study.best_params

[32m[I 2021-09-20 14:25:03,779][0m A new study created in memory with name: no-name-9d6e85e5-7902-48cd-a8b0-00bd9385402a[0m
[32m[I 2021-09-20 14:25:29,903][0m Trial 0 finished with value: 0.0737859413859929 and parameters: {'reg_alpha': 0.015223898666486832, 'reg_lambda': 0.5094496125761684, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.2863687132539241, 'max_depth': 77, 'num_leaves': 376, 'min_child_samples': 163}. Best is trial 0 with value: 0.0737859413859929.[0m
[32m[I 2021-09-20 14:25:56,720][0m Trial 1 finished with value: 0.2060964095648755 and parameters: {'reg_alpha': 0.0055760760574993335, 'reg_lambda': 4.29795782944928, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.0012803638594326876, 'max_depth': 47, 'num_leaves': 532, 'min_child_samples': 195}. Best is trial 0 with value: 0.0737859413859929.[0m
[32m[I 2021-09-20 14:26:27,662][0m Trial 2 finished with value: 0.17517731017632832 and parameters: {'reg_alpha': 0.0013644246518298884, 

In [31]:
params_3

{'reg_alpha': 0.20741912347765662,
 'reg_lambda': 4.95283355304233,
 'colsample_bytree': 0.7,
 'subsample': 0.7,
 'learning_rate': 0.1269064972898481,
 'max_depth': 44,
 'num_leaves': 744,
 'min_child_samples': 21}

In [32]:
lgbm_scaled_cross_validation(X=X3,y=y3,params=params_3,cv=5)

-----------LGBM CROSS VALIDATION BEGINNING-----------
RMSE score: 0.09797277424877915
MAE score: 0.06621343183643381
None
-------------------FOLD 1-----------------
RMSE score: 0.1012938129340793
MAE score: 0.06725475625542955
None
-------------------FOLD 2-----------------
RMSE score: 0.1010066841161253
MAE score: 0.06732280833256975
None
-------------------FOLD 3-----------------
RMSE score: 0.09759658075552409
MAE score: 0.06557784716606284
None
-------------------FOLD 4-----------------
RMSE score: 0.09204660621950396
MAE score: 0.0624905683273814
None
-------------------FOLD 5-----------------
---------------CROSS VALIDATION COMPLETE-------------
--------------------------RMSE-----------------------
Scores: [0.09797277424877915, 0.1012938129340793, 0.1010066841161253, 0.09759658075552409, 0.09204660621950396]
Mean: 0.09798329165480235
Std: 0.0033314964508119782
--------------------------MAE------------------------
Scores: [0.06621343183643381, 0.06725475625542955, 0.06732280833256

## WP4

| |  | Mean | Std | |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10486204816363351 | 0.0015105949978751166 ||
| No params  - Std | RMSE | 0.10480849342496516 | 0.0010370863436755212 ||
| After tuning - 50trials| RMSE | 0.06513233717204232 | 0.0015891617240032727 ||
| After tuning 100trials| RMSE | 0.06357594848470964 | 0.0013676749030776929 ||
| After tuning with W.S - 50trials - Std | RMSE | 0.06339401569270936 | 0.001228053306037005 ||
| After tuning with W.S. - 50trials - RobustScaler | RMSE | 0.0632324115841705 | 0.0010081050240456021 ||
| 50 trials per vmf - RobustScaler | RMSE |0.0593103048767063  |0.0011712884174825636 | |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.07564776733421566 | 0.00104638869825841 ||
| No params  - Std | MAE | 0.07570794104041156 | 0.0008419207475550308 ||
| After tuning - 50trials| MAE | 0.04219236028055372 | 0.0008190579419060266 ||
| After tuning 100trials| MAE |0.04172111697148837  | 0.0009349285385250968 ||
| After tuning with W.S - 50trials  - Std | MAE | 0.04150668920859586 | 0.0005729825500890684 ||
| After tuning with W.S. - 50trials - RobustScaler | MAE | 0.04170428506837879 | 0.0006101247158768171 ||
| 50 trials per vmf - RobustScaler | MAE | 0.04115807160847805 |0.0006471350512019214 | |

In [33]:
# Put the target column `wp` last and remove the excluded columns,
# then separate the features from the target.
wp4_X = train_wp4[[col for col in train_wp4.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X4 = wp4_X.drop(columns='wp')
y4 = wp4_X['wp']

In [34]:
def objective_wp4(trial, data=X4, target=y4, cv=5):
    """Optuna objective for WP4.

    Delegates to ``hyperparametrization_cv`` and returns its score. ``data`` and
    ``target`` default to ``X4`` and ``y4`` as they exist when this cell runs.
    """
    score = hyperparametrization_cv(trial, x=data, y=target, cv=cv)
    return score

In [35]:
# Seed the sampler explicitly so the search proposes the same sequence of
# trials on every run; seeding NumPy alone does not cover Optuna.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective_wp4, n_trials=50)
#write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp4', study.trials_dataframe())
# Best hyper-parameters found for WP4.
params_4 = study.best_params

[32m[I 2021-09-20 14:53:16,743][0m A new study created in memory with name: no-name-0b1418f0-a9b5-4aed-a81f-aa74ce1f640f[0m
[32m[I 2021-09-20 14:53:43,714][0m Trial 0 finished with value: 0.17466452207697153 and parameters: {'reg_alpha': 0.0187877992364005, 'reg_lambda': 3.076982239548737, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.005260856326282007, 'max_depth': 80, 'num_leaves': 804, 'min_child_samples': 120}. Best is trial 0 with value: 0.17466452207697153.[0m
[32m[I 2021-09-20 14:54:12,839][0m Trial 1 finished with value: 0.08429887083202078 and parameters: {'reg_alpha': 0.4456979404377384, 'reg_lambda': 0.21318654821028002, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.038366241120891785, 'max_depth': 82, 'num_leaves': 924, 'min_child_samples': 96}. Best is trial 1 with value: 0.08429887083202078.[0m
[32m[I 2021-09-20 14:54:35,661][0m Trial 2 finished with value: 0.07630409233042328 and parameters: {'reg_alpha': 0.032030908825470765, 

In [36]:
params_4

{'reg_alpha': 0.07849666964165539,
 'reg_lambda': 0.0017033350344194495,
 'colsample_bytree': 1.0,
 'subsample': 0.5,
 'learning_rate': 0.06675968989843854,
 'max_depth': 56,
 'num_leaves': 511,
 'min_child_samples': 2}

In [37]:
lgbm_scaled_cross_validation(X=X4,y=y4,params=params_4,cv=5)

-----------LGBM CROSS VALIDATION BEGINNING-----------
RMSE score: 0.0987960580644649
MAE score: 0.06573301889609909
None
-------------------FOLD 1-----------------
RMSE score: 0.10103360135680922
MAE score: 0.06854243728906631
None
-------------------FOLD 2-----------------
RMSE score: 0.10847525746985424
MAE score: 0.07207916862965734
None
-------------------FOLD 3-----------------
RMSE score: 0.10645820644997828
MAE score: 0.0692416591681621
None
-------------------FOLD 4-----------------
RMSE score: 0.10068858509979893
MAE score: 0.06716859395533498
None
-------------------FOLD 5-----------------
---------------CROSS VALIDATION COMPLETE-------------
--------------------------RMSE-----------------------
Scores: [0.0987960580644649, 0.10103360135680922, 0.10847525746985424, 0.10645820644997828, 0.10068858509979893]
Mean: 0.10309034168818113
Std: 0.0037088898169375067
--------------------------MAE------------------------
Scores: [0.06573301889609909, 0.06854243728906631, 0.072079168629

## WP5

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.11722129743692011 | 0.0017732599261516583 |
| No params - Std | RMSE | 0.11729790317307003 | 0.0014884067903823003 |
| After tuning - 50trials| RMSE | 0.07721413638593042 | 0.0011020420293213135 |
| After tuning - 100trials| RMSE | 0.07297648991888442 | 0.0014970317509404526 |
| After tuning - 100trials - warm start | RMSE | 0.07362803793800192 | 0.0013223501622953715 |
| After tuning 50trials - std - warm start | RMSE | 0.07379631452164911 | 0.0019249511778190373 |
| After tuning 50trials W.S - RobustScaler | RMSE | 0.07295312559507504 | 0.0017094978756410563 |
| 50 trials per vmf | RMSE | 0.06800612721907516 | 0.001364694216955839 |
| --- | --- | --- | --- |
| No params | MAE | 0.08497074568090211 | 0.0009101526501392155 |
| No params - Std | MAE | 0.0849648687365363 | 0.0011901882563545429 |
| After tuning - 50trials| MAE | 0.051677856581467195 | 0.0006374939894477714 |
| After tuning - 100trials| MAE | 0.04765271414503236 | 0.0006257356756510128 |
| After tuning - 100trials - warm start | MAE | 0.04785179154681675 | 0.0005795839605605526 |
| After tuning 50trials - std - warm start | MAE | 0.0480147138609328 | 0.0008230908433814974 |
| After tuning 50trials W.S - RobustScaler | MAE | 0.04783683731205745 | 0.0007485320718195094 |
| 50 trials per vmf | MAE | 0.04587332776201404 | 0.0008842958670279714 |

In [38]:
# Put the target column `wp` last and remove the excluded columns,
# then separate the features from the target.
wp5_X = train_wp5[[col for col in train_wp5.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X5 = wp5_X.drop(columns='wp')
y5 = wp5_X['wp']

In [39]:
def objective_wp5(trial, data=X5, target=y5, cv=5):
    """Optuna objective for WP5.

    Delegates to ``hyperparametrization_cv`` and returns its score. ``data`` and
    ``target`` default to ``X5`` and ``y5`` as they exist when this cell runs.
    """
    score = hyperparametrization_cv(trial, x=data, y=target, cv=cv)
    return score

In [40]:
# Seed the sampler explicitly so the search proposes the same sequence of
# trials on every run; seeding NumPy alone does not cover Optuna.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective_wp5, n_trials=50)
#write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp5', study.trials_dataframe())
# Best hyper-parameters found for WP5.
params_5 = study.best_params

[32m[I 2021-09-20 15:17:21,060][0m A new study created in memory with name: no-name-d41505d8-a92c-4a6d-80a9-26b42b0614dc[0m
[32m[I 2021-09-20 15:17:41,685][0m Trial 0 finished with value: 0.10294870386616226 and parameters: {'reg_alpha': 0.00664835472481104, 'reg_lambda': 2.693966081562147, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.043484964788240946, 'max_depth': 52, 'num_leaves': 128, 'min_child_samples': 124}. Best is trial 0 with value: 0.10294870386616226.[0m
[32m[I 2021-09-20 15:18:05,021][0m Trial 1 finished with value: 0.16198752089901264 and parameters: {'reg_alpha': 5.372719859102916, 'reg_lambda': 1.1800016503863906, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.009076457965320747, 'max_depth': 28, 'num_leaves': 381, 'min_child_samples': 56}. Best is trial 0 with value: 0.10294870386616226.[0m
[32m[I 2021-09-20 15:18:24,684][0m Trial 2 finished with value: 0.11484448554555231 and parameters: {'reg_alpha': 1.0851547438162845, 're

In [41]:
params_5

{'reg_alpha': 0.471286553222722,
 'reg_lambda': 0.10555787976936198,
 'colsample_bytree': 0.8,
 'subsample': 0.5,
 'learning_rate': 0.09625106303517775,
 'max_depth': 79,
 'num_leaves': 510,
 'min_child_samples': 14}

In [42]:
lgbm_scaled_cross_validation(X=X5,y=y5, params=params_5,cv=5)

-----------LGBM CROSS VALIDATION BEGINNING-----------
RMSE score: 0.10796566879558035
MAE score: 0.07265940752576183
None
-------------------FOLD 1-----------------
RMSE score: 0.10868536072861915
MAE score: 0.07265479769030136
None
-------------------FOLD 2-----------------
RMSE score: 0.11848871515910664
MAE score: 0.07780580895325959
None
-------------------FOLD 3-----------------
RMSE score: 0.11772448964398359
MAE score: 0.07827057024530713
None
-------------------FOLD 4-----------------
RMSE score: 0.1183575539940442
MAE score: 0.07916669158465629
None
-------------------FOLD 5-----------------
---------------CROSS VALIDATION COMPLETE-------------
--------------------------RMSE-----------------------
Scores: [0.10796566879558035, 0.10868536072861915, 0.11848871515910664, 0.11772448964398359, 0.1183575539940442]
Mean: 0.1142443576642668
Std: 0.004844969639282624
--------------------------MAE------------------------
Scores: [0.07265940752576183, 0.07265479769030136, 0.0778058089532

## WP6

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.0940394026188472 | 0.0010749562915831372 |
| No params - std | RMSE | 0.09409110695713666 | 0.0013476052174559326 |
| After tuning - 50trials| RMSE | 0.05404362835213171 | 0.0008595325139047733 |
| After tuning 100trials| RMSE | 0.054861488499908594 | 0.0007335378238383901 |
| After tuning 50trials std - W.S. | RMSE |  0.05446642457662869 | 0.0009850877593637966 |
| After tuning with W.S. - 50trials - RobustScaler | RMSE | 0.053430744425113176 | 0.0010865698281516414 ||
| 50 trials per vmf | RMSE | 0.05055279132726877 | 0.0006313072455945034 |
| --- | --- | --- | --- |
| No params | MAE | 0.070455643271004 | 0.0006641538274191148 |
| After tuning - 50trials| MAE | 0.03657758274248596 | 0.0005325521314198646 |
| After tuning 100trials| MAE | 0.03783933495157941 | 0.00045956939815828987 |
| After tuning 50trials std - warm start| MAE | 0.0367847898055025 | 0.00039001216012464674 |
| After tuning with W.S. - 50trials - RobustScaler | MAE | 0.03652663713189234 | 0.0006251342721346248 ||
| 50 trials per vmf | MAE | 0.0357185614133554 | 0.00028295806558658535 |

In [43]:
# Put the target column `wp` last and remove the excluded columns,
# then separate the features from the target.
wp6_X = train_wp6[[col for col in train_wp6.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X6 = wp6_X.drop(columns='wp')
y6 = wp6_X['wp']


In [44]:
def objective_wp6(trial, data=X6, target=y6, cv=5):
    """Optuna objective for WP6.

    Passes the trial, the features, the target and the number of folds
    straight to `hyperparametrization_cv` and returns its result, which is
    the value the study optimises.

    `data` and `target` default to the `X6` / `y6` objects that exist when
    this cell is executed.
    """
    score = hyperparametrization_cv(trial, data, target, cv)
    return score

In [45]:
# Hyperparameter search for WP6 (50 trials, minimising the objective).
# The sampler is seeded explicitly: Optuna's sampler keeps its own random
# state, which the notebook-level np.random.seed(42) does not control, so an
# unseeded study proposes different hyperparameters on every run. With a fixed
# seed the sequence of suggestions is repeatable (provided the objective
# itself is deterministic).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_wp6, n_trials=50)
# NOTE(review): if this export is re-enabled, the sheet name should presumably be 'wp6', not 'wp1'.
#write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp1', study.trials_dataframe())
params_6 = study.best_trial.params

[32m[I 2021-09-20 15:43:57,517][0m A new study created in memory with name: no-name-c56915a3-5af9-4e51-8947-06a4927c1961[0m
[32m[I 2021-09-20 15:44:22,008][0m Trial 0 finished with value: 0.11793019859215721 and parameters: {'reg_alpha': 2.56543763474529, 'reg_lambda': 0.949480120375089, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.012294051071346929, 'max_depth': 92, 'num_leaves': 872, 'min_child_samples': 102}. Best is trial 0 with value: 0.11793019859215721.[0m
[32m[I 2021-09-20 15:44:59,908][0m Trial 1 finished with value: 0.06813519552733607 and parameters: {'reg_alpha': 0.014027221206028947, 'reg_lambda': 0.3728568191434699, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.049850095083627984, 'max_depth': 33, 'num_leaves': 577, 'min_child_samples': 54}. Best is trial 1 with value: 0.06813519552733607.[0m
[32m[I 2021-09-20 15:45:34,499][0m Trial 2 finished with value: 0.11337652437052612 and parameters: {'reg_alpha': 0.0015734183378787443, 

In [46]:
# Display the parameters of the best trial of the WP6 study.
params_6

{'reg_alpha': 0.29989602232567175,
 'reg_lambda': 0.003317793295518409,
 'colsample_bytree': 0.7,
 'subsample': 0.5,
 'learning_rate': 0.1322139683565735,
 'max_depth': 35,
 'num_leaves': 325,
 'min_child_samples': 25}

In [47]:
# 5-fold cross-validation on the WP6 data with the selected parameters;
# the helper prints the RMSE and MAE of each fold and their mean / std.
lgbm_scaled_cross_validation(X=X6, y=y6,params=params_6,cv=5)

-----------LGBM CROSS VALIDATION BEGINNING-----------
RMSE score: 0.08507500129034921
MAE score: 0.059797503336323804
None
-------------------FOLD 1-----------------
RMSE score: 0.0853233286773399
MAE score: 0.06108715303888444
None
-------------------FOLD 2-----------------
RMSE score: 0.09072952725754627
MAE score: 0.06345330662848023
None
-------------------FOLD 3-----------------
RMSE score: 0.0881607428560229
MAE score: 0.06183888257358363
None
-------------------FOLD 4-----------------
RMSE score: 0.08863848966383489
MAE score: 0.06111321782873196
None
-------------------FOLD 5-----------------
---------------CROSS VALIDATION COMPLETE-------------
--------------------------RMSE-----------------------
Scores: [0.08507500129034921, 0.0853233286773399, 0.09072952725754627, 0.0881607428560229, 0.08863848966383489]
Mean: 0.08758541794901864
Std: 0.002132820752070113
--------------------------MAE------------------------
Scores: [0.059797503336323804, 0.06108715303888444, 0.063453306628

# Predictions

## Functions

In [48]:
# Columns removed before predicting: identifier / forecast bookkeeping columns,
# the target 'wp', and every column listed in `feature_corr`.
to_drop_test = ['date', 'wd', 'forecast_time', 'forecast', "forecast_dist", 'wp'] + feature_corr


def make_prediction_dataset(test, to_drop=to_drop_test):
    """Reduce a test frame to the rows and columns that are fed to the model.

    Steps, in order:
      1. discard rows where any of 'ws', 'u' or 'v' is missing;
      2. keep only rows whose 'wp' is missing (the ones to predict);
      3. for each 'date', keep the row with the largest 'forecast_time';
      4. drop the columns named in `to_drop`.
    """
    has_weather = test.dropna(subset=['ws', 'u', 'v'], how='any')
    missing_wp = has_weather[has_weather['wp'].isna()]
    latest_per_date = (
        missing_wp
        .sort_values(by=['date', 'forecast_time'], ascending=[True, False])
        .drop_duplicates(subset='date')
    )
    return latest_per_date.drop(to_drop, axis=1)

In [49]:
def make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, dates):
    """Fit one model per wind farm, predict its test set and build the submission table.

    Parameters
    ----------
    lst_X_trains, lst_y_trains : sequences
        Training features and targets, one entry per farm.
    lst_tests : sequence
        Test feature sets, one per farm; each must yield as many predictions
        as there are entries in `dates`.
    lst_models : sequence
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. They are fitted
        in place.
    dates : array-like
        Values written to the 'date' column of the result.

    Returns
    -------
    (df_predictions, lst_models_trained)
        `df_predictions` has a 'date' column followed by one column per model,
        named 'wp1', 'wp2', ... in input order; `lst_models_trained` holds the
        fitted models in the same order.
    """
    lst_prediction = []
    lst_models_trained = []
    farms = zip(lst_X_trains, lst_y_trains, lst_tests, lst_models)
    for i, (X, y, test, model) in enumerate(farms, start=1):
        print(f'--------------Model {i}--------------')
        model.fit(X, y)
        print(f'True:\n\tMin:{min(y)}\n\tMax:{max(y)}\n\tMean:{y.mean()}')
        predictions = model.predict(test)
        print(f'Prediction:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')
        # Clamp predictions to the [0, 1] interval.
        predictions = [0 if p < 0 else p for p in predictions]
        predictions = [1 if p > 1 else p for p in predictions]
        print(f'Prediction corrected:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')
        lst_prediction.append(predictions)
        lst_models_trained.append(model)

    # Use the `dates` argument (not a notebook-level global) and create one
    # 'wpN' column per model actually passed in, instead of assuming six.
    columns = {'date': dates}
    for k, farm_predictions in enumerate(lst_prediction, start=1):
        columns[f'wp{k}'] = farm_predictions
    df_predictions = pd.DataFrame(columns)
    return df_predictions, lst_models_trained

## Submission 

In [50]:
# One scaler + LightGBM pipeline per wind farm, each built from the matching
# params_N dict. The second step keeps the label 'ridge' although it holds an
# LGBMRegressor.
model_1, model_2, model_3, model_4, model_5, model_6 = (
    Pipeline([('scaler', MaxAbsScaler()), ('ridge', LGBMRegressor(**farm_params))])
    for farm_params in (params_1, params_2, params_3, params_4, params_5, params_6)
)

In [52]:
# Per-farm inputs for make_submission_file; the three lists are zipped
# together there, so position k in each list must refer to the same farm.
lst_models = [model_1, model_2, model_3, model_4, model_5, model_6]
lst_X_trains = [X1, X2, X3, X4, X5, X6]
lst_y_trains = [y1, y2, y3, y4, y5, y6]

In [53]:
# Build the prediction-ready test set of every farm, in farm order (WP1..WP6).
lst_tests = []
for test in [test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6]:
    # Rebinds the loop variable only; the test_wpN names are not reassigned.
    test = make_prediction_dataset(test)
    lst_tests.append(test)

In [54]:
# Fit every farm's model, predict its test set and collect the predictions
# in one table together with the fitted models.
df_predictions, lst_models_trained = make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, test_dates)

--------------Model 1--------------
True:
	Min:0.0
	Max:0.96
	Mean:0.2845981952075702
Prediction:
	Min:-0.012629501056262262
	Max:0.9362037364462557
	Mean:0.2986602846264825
Prediction corrected:
	Min:0
	Max:0.9362037364462557
	Mean:0.2986840138836199
--------------Model 2--------------
True:
	Min:0.0
	Max:0.966
	Mean:0.25890153769841273
Prediction:
	Min:-0.013789645120279206
	Max:0.9662203426405734
	Mean:0.25423219108420664
Prediction corrected:
	Min:0
	Max:0.9662203426405734
	Mean:0.2542494456135906
--------------Model 3--------------
True:
	Min:0.0
	Max:0.989
	Mean:0.2625247252747253
Prediction:
	Min:-0.011704978191967703
	Max:0.9598253715809746
	Mean:0.2856813969565523
Prediction corrected:
	Min:0
	Max:0.9598253715809746
	Mean:0.2856940249117191
--------------Model 4--------------
True:
	Min:0.0
	Max:0.992
	Mean:0.2763637820512821
Prediction:
	Min:-0.01178447285079433
	Max:0.8991395755662723
	Mean:0.280327535025914
Prediction corrected:
	Min:0
	Max:0.8991395755662723
	Mean:0.280346

## Saving models

In [58]:
# Metadata describing this submission; nb_sub and model are used to build the
# name of the exported CSV file.
# NOTE(review): `model` / `prepro` mention RobustScaler, but the pipelines
# defined in the Submission section use MaxAbsScaler — confirm which scaler
# actually produced this submission.
nb_sub = 41
model = "robustscaler-lgbm-cv"
prepro = 'RobustScaler'
postpro = "Prediction limited by 0-1"

In [59]:
# Export the submission as a ';'-separated CSV without the index column;
# the file name is derived from nb_sub and model.
# df_predictions.to_csv('Predictions/submission_nb_10_full_maxabs-lgbm-featselect.csv', index=False, sep=';')
df_predictions.to_csv(f'Predictions/submission_nb_{nb_sub}_{model}.csv', index=False, sep=';')

In [60]:
# Preview the first rows of the submission table.
df_predictions.head()

Unnamed: 0,date,wp1,wp2,wp3,wp4,wp5,wp6
0,2011010101,0.693398,0.588826,0.055611,0.501383,0.629562,0.563751
1,2011010102,0.609136,0.587803,0.052893,0.461121,0.647442,0.570213
2,2011010103,0.627847,0.487257,0.102975,0.457511,0.647029,0.568546
3,2011010104,0.760432,0.557049,0.138302,0.691063,0.635964,0.565386
4,2011010105,0.760352,0.409648,0.341826,0.685897,0.64446,0.576714
