# LGBM Model development

In [21]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle

In [22]:
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna

In [23]:
from Functions.helper_functions import * 

In [24]:
import warnings

# Suppress every warning of category UserWarning for the rest of the session.
warnings.filterwarnings('ignore', category=UserWarning)
# Disable the pandas chained-assignment check (its default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

In [25]:
# Seed NumPy's global random state with a fixed value.
np.random.seed(42)

# Data

In [27]:
# Load the preprocessed training CSV of each of the six WP datasets.
# A single generator replaces six copy-pasted read_csv calls; the variable
# names (train_wp1 .. train_wp6) and the file paths are unchanged.
(train_wp1, train_wp2, train_wp3,
 train_wp4, train_wp5, train_wp6) = (
    pd.read_csv(f'Data/Preprocessing/WP{wp_id}_train_preprocessed.csv', sep=',')
    for wp_id in range(1, 7)
)

In [28]:
# Load the preprocessed test CSV of each of the six WP datasets.
# A single generator replaces six copy-pasted read_csv calls; the variable
# names (test_wp1 .. test_wp6) and the file paths are unchanged.
(test_wp1, test_wp2, test_wp3,
 test_wp4, test_wp5, test_wp6) = (
    pd.read_csv(f'Data/Preprocessing/WP{wp_id}_test_preprocessed.csv', sep=',')
    for wp_id in range(1, 7)
)
# Values of the `date` column of the initial test file, as a NumPy array.
# Bracket access avoids any clash between the column name and a DataFrame attribute.
test_dates = pd.read_csv('Data/Initial/test.csv', sep=',')['date'].values

In [29]:
# Base list of columns removed from every WP frame before modelling.
# It is later extended with the `feature_corr` column names.
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

In [30]:
# Names of `u_*` / `u2_*` columns to exclude from the feature set.
# Presumably chosen for high correlation (the combined list is named
# `feature_corr`) -- TODO confirm the selection criterion.
u_to_drop = [
    'u_T_1', 'u_T_2', 'u_T_3', 'u_T_4', 'u_T_5', 'u_T_6', 
    'u_T_2_mean', 'u_T_3_mean', 'u_T_4_mean', 'u_T_5_mean', 'u_T_6_mean', 'u_T_7_mean',
    'u_T_8_mean', 'u_T_9_mean', 'u_T_10_mean', 'u_T_11_mean', 'u_T_12_mean','u_T_24_mean',
    'u_T_2_std', 'u_T_4_std', 'u_T_5_std', 'u_T_6_std',
    'u_T_2_median', 'u_T_3_median', 'u_T_4_median', 'u_T_5_median', 'u_T_6_median', 'u_T_12_median','u_T_24_median', 'u_T_36_median',
    'u_T_2_max', 'u_T_3_max', 'u_T_4_max', 'u_T_5_max', 'u_T_6_max', 'u_T_12_max',
    'u_T_2_min', 'u_T_3_min', 'u_T_4_min', 'u_T_5_min', 'u_T_6_min', 'u_T_12_min',
    'u2_T_1', 'u2_T_2', 'u2_T_3', 'u2_T_4', 'u2_T_5', 'u2_T_6', 
    'u2_T_2_mean', 'u2_T_3_mean', 'u2_T_4_mean', 'u2_T_5_mean', 'u2_T_6_mean', 'u2_T_7_mean',
    'u2_T_8_mean', 'u2_T_9_mean', 'u2_T_10_mean', 'u2_T_11_mean', 'u2_T_12_mean','u2_T_24_mean',
    'u2_T_2_std', 'u2_T_4_std', 'u2_T_5_std', 'u2_T_6_std', 'u2_T_24_std',
    'u2_T_2_median', 'u2_T_3_median', 'u2_T_4_median', 'u2_T_5_median', 'u2_T_6_median', 'u2_T_12_median',
    'u2_T_2_max','u2_T_3_max', 'u2_T_4_max','u2_T_5_max', 'u2_T_6_max', 'u2_T_12_max',
    'u2_T_2_min', 'u2_T_3_min', 'u2_T_4_min', 'u2_T_5_min', 'u2_T_6_min',
    'u2_T_12', 'u2_T_36_mean', 'u2_T_36_std', 'u2_T_24_median', 'u2_T_24_max',
    'u_T_36_mean','u_T_12','u_T_24_max','u2_T_36_median','u_T_24_min'
]
# Names of `ws_*` columns to exclude from the feature set.
# NOTE(review): 'ws_T_9' is absent from the first row while its neighbours
# are listed -- confirm this is intentional.
ws_to_drop = [
    'ws_T_1', 'ws_T_2', 'ws_T_3', 'ws_T_4', 'ws_T_5', 'ws_T_6', 'ws_T_7', 'ws_T_8', 'ws_T_10','ws_T_11', 'ws_T_12',
    'ws_T_2_mean', 'ws_T_3_mean', 'ws_T_4_mean', 'ws_T_5_mean', 'ws_T_6_mean', 'ws_T_7_mean', 'ws_T_8_mean', 'ws_T_9_mean', 
    'ws_T_10_mean', 'ws_T_11_mean', 'ws_T_12_mean', 'ws_T_24_mean', 
    'ws_T_2_std', 'ws_T_3_std', 'ws_T_4_std', 'ws_T_5_std', 
    'ws_T_2_median', 'ws_T_3_median', 'ws_T_4_median', 'ws_T_5_median', 'ws_T_6_median',
    'ws_T_12_median', 'ws_T_24_median', 'ws_T_36_median',
    'ws_T_2_max', 'ws_T_3_max', 'ws_T_4_max', 'ws_T_5_max','ws_T_6_max', 'ws_T_12_max',
     'ws_T_2_min', 'ws_T_3_min', 'ws_T_4_min', 'ws_T_5_min', 'ws_T_6_min', 'ws_T_12_min','ws_T_24_max','ws_T_24_min'
]

# Names of `v_*` columns to exclude from the feature set.
v_to_drop = [
    'v_T_1', 'v_T_2', 'v_T_3', 'v_T_4', 'v_T_5', 'v_T_6', 
    'v_T_2_mean', 'v_T_3_mean', 'v_T_4_mean', 'v_T_5_mean', 'v_T_6_mean', 'v_T_7_mean',
    'v_T_8_mean', 'v_T_9_mean', 'v_T_10_mean', 'v_T_11_mean', 'v_T_12_mean', 'v_T_24_mean','v_T_36_mean',
    'v_T_3_std', 'v_T_4_std', 'v_T_5_std','v_T_6_std','v_T_24_std', 'v_T_36_median',
    'v_T_2_median', 'v_T_3_median', 'v_T_4_median', 'v_T_5_median', 'v_T_6_median', 
    'v_T_2_max', 'v_T_3_max', 'v_T_4_max', 'v_T_5_max', 'v_T_6_max', 'v_T_12_max', 
    'v_T_2_min', 'v_T_3_min', 'v_T_4_min', 'v_T_5_min', 'v_T_6_min', 'v_T_12_min', 
    'v_T_36_min', 'v_T_36', 'v_T_24_max',  'v_T_12_median', 'v_T_24_median',
]

# Names of `coswd_*` columns to exclude from the feature set.
# The last row also carries two `ws_*` names ('ws_T_36_max', 'ws_T_36_min');
# they end up in the same combined drop list, so their placement here only
# affects readability.
wd_to_drop = [
    'coswd_1', 'coswd_2', 'coswd_3', 'coswd_4', 'coswd_5', 'coswd_6',
    'coswd_2_mean', 'coswd_3_mean', 'coswd_4_mean', 'coswd_5_mean', 'coswd_6_mean', 'coswd_7_mean', 
    'coswd_8_mean', 'coswd_9_mean', 'coswd_10_mean', 'coswd_11_mean', 'coswd_12_mean', 'coswd_24_mean', 
    'coswd_3_std', 'coswd_4_std','coswd_5_std','coswd_2_median', 'coswd_3_median','coswd_4_median', 
    'coswd_5_median', 'coswd_6_median', 'coswd_36_median', 'coswd_24_median', 'coswd_12_median',
    'coswd_2_max', 'coswd_3_max', 'coswd_4_max', 'coswd_5_max', 'coswd_6_max', 'coswd_12_max', 'coswd_24_max',
    'coswd_2_min', 'coswd_3_min', 'coswd_4_min', 'coswd_5_min', 'coswd_6_min', 'coswd_12_min', 'coswd_24_min',
    'ws_T_36_max', 'ws_T_36_min', 'coswd_12', 'coswd_24'
]

# Remaining individual columns to exclude from the feature set.
other_to_drop = [
    'cos_day', 'u', 'v'
]

# All feature columns excluded by the per-family lists, concatenated in order.
feature_corr = u_to_drop+ws_to_drop+v_to_drop+wd_to_drop+other_to_drop
# Prepend them to the base drop list. Names already present in feature_corr are
# filtered out of the existing `to_drop` first, so re-running this cell does not
# keep prepending another copy of feature_corr (the plain `feature_corr+to_drop`
# grew the list on every re-execution). On a first run the result is unchanged.
to_drop = feature_corr + [col for col in to_drop if col not in feature_corr]

# LGBM functions

In [31]:
from lightgbm import LGBMRegressor

In [32]:
# def lgbm_cross_validation(X, y, params):
#     if params == None:
#         model = LGBMRegressor()
#     else:
#         model = LGBMRegressor(**params)

#     print('-----------LGBM CROSS VALIDATION BEGINNING-----------')
#     split = 10
#     kf = KFold(n_splits=split, shuffle=True)       
#     lgbm_rmse_scores = []
#     lgbm_mae_scores = []
#     i = 1
#     for (train_index, test_index) in kf.split(pd.DataFrame(X), pd.DataFrame(y)):
#         X_train, X_test = pd.DataFrame(X).iloc[train_index], pd.DataFrame(X).iloc[test_index]
#         Y_train, Y_test = pd.DataFrame(y).iloc[train_index],pd.DataFrame(y).iloc[test_index]

#         model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=100)

#         prediction = model.predict(X_test)
#         lgbm_rmse_scores.append(mean_squared_error(Y_test, prediction,squared=False))
#         lgbm_mae_scores.append(mean_absolute_error(Y_test, prediction))
        
#         print(show_evaluation(prediction, Y_test))
#         print(f'-------------------FOLD {i}-----------------')
#         i+=1

#     print('---------------CROSS VALIDATION COMPLETE-------------')
#     print('--------------------------RMSE-----------------------')
#     display_scores(lgbm_rmse_scores)
#     print('--------------------------MAE------------------------')
#     display_scores(lgbm_mae_scores)

In [33]:
def lgbm_scaled_cross_validation(X, y, params, n_splits=10, random_state=None):
    """Cross-validate a MinMaxScaler + LGBMRegressor pipeline with shuffled K-fold.

    For every fold the pipeline is re-fitted on the training part, evaluated on
    the held-out part, and the fold RMSE / MAE are collected. Per-fold results
    are reported through `show_evaluation`, the aggregated scores through
    `display_scores` (neither is defined in this notebook; presumably both come
    from the star import of `Functions.helper_functions`).

    Parameters
    ----------
    X : features, anything accepted by `pd.DataFrame(X)`.
    y : target, anything accepted by `pd.DataFrame(y)`.
    params : dict or None
        Keyword arguments for `LGBMRegressor`; `None` uses its defaults.
    n_splits : int, default 10
        Number of folds (the previously hard-coded value).
    random_state : int or None, default None
        Seed for the fold shuffling. `None` keeps the previous behaviour, where
        the shuffle depends on the global NumPy random state.

    Returns
    -------
    None. All results are printed.
    """
    lgbm_params = {} if params is None else params
    model = Pipeline([
        ('scaler', MinMaxScaler()),
        ('lgbm', LGBMRegressor(**lgbm_params)),  # step was mislabelled 'ridge'
    ])

    print('-----------LGBM CROSS VALIDATION BEGINNING-----------')
    # Wrap once instead of on every fold.
    X_frame = pd.DataFrame(X)
    y_frame = pd.DataFrame(y)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    lgbm_rmse_scores = []
    lgbm_mae_scores = []
    for fold, (train_index, test_index) in enumerate(kf.split(X_frame, y_frame), start=1):
        X_train, X_test = X_frame.iloc[train_index], X_frame.iloc[test_index]
        Y_train, Y_test = y_frame.iloc[train_index], y_frame.iloc[test_index]

        model.fit(X_train, Y_train)

        prediction = model.predict(X_test)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases no longer accept.
        lgbm_rmse_scores.append(np.sqrt(mean_squared_error(Y_test, prediction)))
        lgbm_mae_scores.append(mean_absolute_error(Y_test, prediction))

        # show_evaluation prints the fold scores itself and returns None (the
        # recorded output shows a stray "None" per fold), so it is no longer
        # wrapped in print().
        show_evaluation(prediction, Y_test)
        print(f'-------------------FOLD {fold}-----------------')

    print('---------------CROSS VALIDATION COMPLETE-------------')
    print('--------------------------RMSE-----------------------')
    display_scores(lgbm_rmse_scores)
    print('--------------------------MAE------------------------')
    display_scores(lgbm_mae_scores)

In [34]:
def hyperparametrization(trial, train_x, test_x, train_y, test_y):
    """Optuna objective body: fit a scaled LGBM model for one trial, return hold-out RMSE.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.
    train_x, train_y : data the pipeline is fitted on.
    test_x, test_y : data the RMSE is computed on.

    Returns
    -------
    float
        Root mean squared error of the predictions on `test_x`.
    """
    param = {
        'metric': 'rmse',
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
        # and that is not set here, so this value is likely inert -- confirm.
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int("max_depth", 20, 100),
        # LightGBM requires num_leaves > 1; the former lower bound of 1 could
        # make a trial fail.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
    }

    model = Pipeline([
        ('scaler', MinMaxScaler()),
        ('lgbm', LGBMRegressor(**param)),  # step was mislabelled 'ridge'
    ])
    model.fit(train_x, train_y)

    preds = model.predict(test_x)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
    # no longer accept.
    rmse = np.sqrt(mean_squared_error(test_y, preds))

    return rmse

In [35]:
def hyperparametrization_cv(trial, X, y):
    """Optuna objective body: score one trial's LGBM parameters with 3-fold CV.

    Unlike `hyperparametrization`, the regressor is used without a scaler.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.
    X, y : features and target handed to `cross_val_score`.

    Returns
    -------
    float
        Mean RMSE over the 3 folds (sign-flipped from scikit-learn's
        'neg_root_mean_squared_error' scorer, so lower is better).
    """
    param = {
        'metric': 'rmse',
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
        # and that is not set here, so this value is likely inert -- confirm.
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int("max_depth", 20, 100),
        # LightGBM requires num_leaves > 1; the former lower bound of 1 could
        # make a trial fail.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
    }

    model = LGBMRegressor(**param)
    rmse = - cross_val_score(model, X, y, cv=3, scoring = 'neg_root_mean_squared_error').mean()

    return rmse

## WP1 

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10344875448880764 | 0.0019070131550065564 |  |
| No params - StandardScaler | RMSE | 0.10355855436437653 | 0.0019043784372002228 |  |
| After tuning 50trials| RMSE | 0.06830085723562579 | 0.0012998671387256361 | To keep, maybe redo optuna with warm start with it - first sub|
| After tuning 100trials| RMSE | 0.0695930431604128 | 0.0015123892627707553 | |
| After tuning warm start 60trials | RMSE | 0.0671868765178121 | 0.0019357892229500213 | second sub - BETTER |
| After tuning warm start 50trials - StandardScaler | RMSE | 0.06705406618054667 | 0.0013509038914922506 | with warm start 1 parameters |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.07306057409517844 | 0.0009857628815465762 |  |
| No params - StandardScaler | MAE | 0.07310137331348271 | 0.0010945145617042643 |  |
| After tuning 50trials| MAE | 0.04481696427654311 | 0.000727722171899004 |  |
| After tuning 100trials| MAE | 0.045696725242359994 | 0.0008508558100930331 |  |
| After tuning warm start 60trials | MAE | 0.04322865305311156 | 0.0009454253743042544 | |
| After tuning warm start 50trials - StandardScaler | MAE | 0.043173032898780175 | 0.0007696056550296055 | with warm start 1 parameters |

In [47]:
# WP1 modelling frame: move the target `wp` to the last position and drop the
# excluded columns. `to_drop` already contains every `feature_corr` name, so it
# is passed on its own (as in the WP2-WP6 cells) rather than with feature_corr
# appended a second time.
wp1_X = train_wp1[[c for c in train_wp1 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X1 = wp1_X.drop('wp', axis=1)
y1 = wp1_X['wp']

def objective_wp1(trial,data=X1,target=y1):
    """Optuna objective for WP1.

    Holds out 15% of (data, target) with a fixed split seed and returns the
    hold-out RMSE computed by `hyperparametrization`. The defaults are bound
    to X1 / y1 as they exist when this cell is executed.
    """
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
    return hyperparametrization(trial, train_x, test_x, train_y, test_y)

In [37]:
# lgbm_cross_validation(X1, y1, None)
# lgbm_scaled_cross_validation(X1, y1, None)

-----------LGBM CROSS VALIDATION BEGINNING-----------
RMSE score: 0.10732270371619028
MAE score: 0.07571007135576018
None
-------------------FOLD 1-----------------
RMSE score: 0.10886917727677274
MAE score: 0.07634181570136926
None
-------------------FOLD 2-----------------
RMSE score: 0.10736229882111793
MAE score: 0.07545585548085655
None
-------------------FOLD 3-----------------
RMSE score: 0.10911831651184314
MAE score: 0.07750026292976274
None
-------------------FOLD 4-----------------
RMSE score: 0.10704179620830895
MAE score: 0.07648866023010195
None
-------------------FOLD 5-----------------
RMSE score: 0.10710112334031804
MAE score: 0.07511977294091005
None
-------------------FOLD 6-----------------
RMSE score: 0.11025646953823802
MAE score: 0.07762902676974258
None
-------------------FOLD 7-----------------
RMSE score: 0.10765743908831053
MAE score: 0.07589383654776555
None
-------------------FOLD 8-----------------
RMSE score: 0.10977016770862105
MAE score: 0.0781972008262

In [38]:
# try_these_first = [{
#     'reg_alpha': 0.664265743859848,
#     'reg_lambda': 9.83047434398735,
#     'colsample_bytree': 1.0,
#     'subsample': 1.0,
#     'learning_rate': 0.24237997149103074,
#     'max_depth': 77,
#     'num_leaves': 389,
#     'min_child_samples': 2,
# },  {
#     'reg_alpha': 0.8314449043001416,
#     'reg_lambda': 9.093012403173608,
#     'colsample_bytree': 0.9,
#     'subsample': 0.4,
#     'learning_rate': 0.2033256175102991,
#     'max_depth': 55,
#     'num_leaves': 964,
#     'min_child_samples': 25,
# },  {
#     'reg_alpha': 0.06611820641937002,
#     'reg_lambda': 0.00402359814884553,
#     'colsample_bytree': 0.8,
#     'subsample': 0.5,
#     'learning_rate': 0.15622705931571296,
#     'max_depth': 72,
#     'num_leaves': 596,
#     'min_child_samples': 2,
# },  {
#     'reg_alpha': 0.25020407037516895,
#     'reg_lambda': 7.183180037262842,
#     'colsample_bytree': 1.0,
#     'subsample': 1.0,
#     'learning_rate': 0.11751089382716717,
#     'max_depth': 84,
#     'num_leaves': 596,
#     'min_child_samples': 15,
#     'min_data_per_groups': 35
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])
# study.enqueue_trial(try_these_first[1])

In [39]:
# Create the study here: every other `optuna.create_study` call before this
# cell is commented out, so on a fresh kernel `study` would be undefined and
# the optimize call would raise NameError.
# If the commented warm-start cell above is re-enabled, drop this line so the
# enqueued trials are not discarded.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp1, n_trials=5)
# write_results('Data/Hyperparametrization/lgbm_50trials_cv.xlsx', 'wp1', study.trials_dataframe())
best_trial = study.best_trial.params
best_trial

In [40]:
# # 100
# params_1 = {
#     'reg_alpha': 0.8314449043001416,
#     'reg_lambda': 9.093012403173608,
#     'colsample_bytree': 0.9,
#     'subsample': 0.4,
#     'learning_rate': 0.2033256175102991,
#     'max_depth': 55,
#     'num_leaves': 964,
#     'min_child_samples': 25,
#     'min_data_per_groups': 36
# }

# #50
# params_1 = {
#     'reg_alpha': 0.664265743859848,
#     'reg_lambda': 9.83047434398735,
#     'colsample_bytree': 1.0,
#     'subsample': 1.0,
#     'learning_rate': 0.24237997149103074,
#     'max_depth': 77,
#     'num_leaves': 389,
#     'min_child_samples': 2,
#     'min_data_per_groups': 75
# }

# warm start
# Selected LightGBM hyper-parameters for WP1.
# The former 'min_data_per_groups' entry was removed: LightGBM has no parameter
# of that name (its categorical-split setting is `min_data_per_group`) and the
# Optuna search space does not sample it, so it did not influence the model.
params_1 = {
    'reg_alpha': 0.25020407037516895,
    'reg_lambda': 7.183180037262842,
    'colsample_bytree': 1.0,
    'subsample': 1.0,
    'learning_rate': 0.11751089382716717,
    'max_depth': 84,
    'num_leaves': 596,
    'min_child_samples': 15,
}

# params_1 = best_trial

In [41]:
# lgbm_cross_validation(X1, y1, params_1)
# Cross-validate the scaled LGBM pipeline on WP1 with the selected parameters.
lgbm_scaled_cross_validation(X=X1, y=y1, params=params_1)

-----------LGBM CROSS VALIDATION BEGINNING-----------
RMSE score: 0.06787525916403352
MAE score: 0.042995212934237824
None
-------------------FOLD 1-----------------
RMSE score: 0.07190475286451645
MAE score: 0.045573344414612096
None
-------------------FOLD 2-----------------
RMSE score: 0.06905120117081694
MAE score: 0.0442099780490888
None
-------------------FOLD 3-----------------
RMSE score: 0.06743650527177474
MAE score: 0.04404054729505381
None
-------------------FOLD 4-----------------


KeyboardInterrupt: 

## WP2

| |  | Mean | Std | Sum up|
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10935335541057582 | 0.0014425096116734836 | |
| No params - StandardScaler | RMSE | 0.10938240918068962 | 0.002563571983412213 |  |
| After tuning - 50trials| RMSE | 0.0725081520968898 | 0.0016974702626377217 | |
| After tuning 100trials| RMSE | 0.0707064364904941 | 0.001396820290618349 | More stable, to keep |
| After tuning warm start 50trials - StandardScaler | RMSE | 0.07072007438762447 | 0.0010396359382112443 | With 100trials parameters |
| --- | --- | --- | --- |---|
| No params | MAE | 0.07681923856705511 | 0.0008670825615244791 | |
| No params - StandardScaler | MAE | 0.0768829843356125 | 0.0017568758903956435 |  |
| After tuning - 50trials| MAE | 0.04512164110351975 |  0.0006579433030966575 | |
| After tuning 100trials| MAE | 0.04457902842458915 | 0.0006807155447311589 | |
| After tuning warm start 50trials - StandardScaler | MAE | 0.04456161763710905 | 0.0005342365137369251 | With 100trials parameters |

In [48]:
# WP2 modelling frame: target `wp` moved to the last position, excluded columns
# removed, then split into feature matrix and target vector.
wp2_X = train_wp2[[col for col in train_wp2.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X2 = wp2_X.drop(columns='wp')
y2 = wp2_X['wp']

def objective_wp2(trial,data = X2,target = y2):
    """Optuna objective for WP2.

    Holds out 15% of (data, target) with a fixed split seed and returns the
    score computed by `hyperparametrization` on that hold-out part. The
    defaults are bound to X2 / y2 as they exist when this cell is executed.
    """
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )
    return hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)

In [None]:
# lgbm_cross_validation(X2, y2, None)
# Baseline: cross-validate the scaled LGBM pipeline on WP2 with default parameters.
lgbm_scaled_cross_validation(X=X2, y=y2, params=None)

In [None]:
# Warm-start candidates for the WP2 study: parameter sets that are evaluated
# before the sampler proposes new ones.
# The former 'min_data_per_groups' entries were removed: the objective never
# samples a parameter of that name and LightGBM does not define one, so they
# had no effect.
try_these_first = [{
    'reg_alpha': 0.18268883436586145,
    'reg_lambda': 0.15916821051528962,
    'colsample_bytree': 1.0,
    'subsample': 0.6,
    'learning_rate': 0.18007000714755378,
    'max_depth': 77,
    'num_leaves': 425,
    'min_child_samples': 10,
},  {
    'reg_alpha': 0.04439450895032273,
    'reg_lambda': 0.7790968728875318,
    'colsample_bytree': 0.4,
    'subsample': 1.0,
    'learning_rate': 0.09520041095092219,
    'max_depth': 31,
    'num_leaves': 883,
    'min_child_samples': 18,
}]

study = optuna.create_study(direction='minimize')
# Queue every candidate, in list order.
for warm_start_params in try_these_first:
    study.enqueue_trial(warm_start_params)

In [None]:
# # study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp2, n_trials=50)
# write_results('Data/Hyperparametrization/lgbm_100trials.xlsx', 'wp2', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [None]:
# # Std warm start 
# params_2 = {
#     'reg_alpha': 0.34026994469471555,
#     'reg_lambda': 1.1032197453137866,
#     'colsample_bytree': 0.9,
#     'subsample': 0.6,
#     'learning_rate': 0.13414826176962302,
#     'max_depth': 81,
#     'num_leaves': 987,
#     'min_child_samples': 39
# }

# 100
# Selected LightGBM hyper-parameters for WP2.
# The former 'min_data_per_groups' entry was removed: LightGBM has no parameter
# of that name (its categorical-split setting is `min_data_per_group`) and the
# Optuna search space does not sample it, so it did not influence the model.
params_2 = {
    'reg_alpha': 0.18268883436586145,
    'reg_lambda': 0.15916821051528962,
    'colsample_bytree': 1.0,
    'subsample': 0.6,
    'learning_rate': 0.18007000714755378,
    'max_depth': 77,
    'num_leaves': 425,
    'min_child_samples': 10,
}

# 50
# params_2 = {
#  'reg_alpha': 0.04439450895032273,
#  'reg_lambda': 0.7790968728875318,
#  'colsample_bytree': 0.4,
#  'subsample': 1.0,
#  'learning_rate': 0.09520041095092219,
#  'max_depth': 31,
#  'num_leaves': 883,
#  'min_child_samples': 18,
#  'min_data_per_groups': 56}

In [None]:
# lgbm_cross_validation(X2, y2, params_2)
# Cross-validate the scaled LGBM pipeline on WP2 with the selected parameters.
lgbm_scaled_cross_validation(X=X2, y=y2, params=params_2)

## WP3

| |  | Mean | Std ||
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10392558077951244 | 0.0019038044796542812 ||
| No params - StandardScaler | RMSE | 0.10354114984428979 | 0.0013629826554706927 ||
| After tuning - 50trials| RMSE | 0.058253804820626545 | 0.0009893279354834155 | More stable, to keep |
| After tuning 100trials| RMSE | 0.058338944346627106 | 0.0017133930174837203 ||
| After tuning warm start 50trials - StandardScaler | RMSE | 0.05839355310487706 | 0.0009599824558874801 ||
| After tuning - 50trial - StandardScaler | RMSE | 0.05828701989178382 | 0.0013469437248627486 | with 50trials best|
| --- | --- | --- | --- | --- |
| No params | MAE | 0.07550802464973318 | 0.0012006073434917633 ||
| No params - StandardScaler | MAE | 0.0753220326933334 | 0.0007353049410651828 | |
| After tuning - 50trials| MAE | 0.03787310900962521 | 0.000442034368456366 ||
| After tuning 100trials| MAE | 0.03838030476025398 | 0.0007480100565996748 ||
| After tuning warm start 50trials - StandardScaler | MAE | 0.03838277636708219 | 0.0006195357541130345 ||
| After tuning 50trials - StandardScaler | MAE | 0.03796602386426936 | 0.0006588818520419181 | with 50trials best|

In [49]:
# WP3 modelling frame: target `wp` moved to the last position, excluded columns
# removed, then split into feature matrix and target vector.
wp3_X = train_wp3[[col for col in train_wp3.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X3 = wp3_X.drop(columns='wp')
y3 = wp3_X['wp']

def objective_wp3(trial,data = X3,target = y3):
    """Optuna objective for WP3.

    Holds out 15% of (data, target) with a fixed split seed and returns the
    score computed by `hyperparametrization` on that hold-out part. The
    defaults are bound to X3 / y3 as they exist when this cell is executed.
    """
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )
    return hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)

In [None]:
# lgbm_cross_validation(X3, y3, None)
# Baseline: cross-validate the scaled LGBM pipeline on WP3 with default parameters.
lgbm_scaled_cross_validation(X=X3, y=y3, params=None)

In [None]:
# try_these_first = [{
#     'reg_alpha': 0.26013926149282945,
#     'reg_lambda': 0.002325658512162904,
#     'colsample_bytree': 1.0,
#     'subsample': 0.7,
#     'learning_rate': 0.10619054458258967,
#     'max_depth': 83,
#     'num_leaves': 647,
#     'min_child_samples': 3,
#     'min_data_per_groups': 24
# },  {
#     'reg_alpha': 0.002937356908910416,
#     'reg_lambda': 0.003822180117262245,
#     'colsample_bytree': 0.8,
#     'subsample': 1.0,
#     'learning_rate': 0.09489749817678472,
#     'max_depth': 41,
#     'num_leaves': 842,
#     'min_child_samples': 18,
#     'min_data_per_groups': 46
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])
# study.enqueue_trial(try_these_first[1])

In [None]:
# Start a fresh study for WP3. Without it this cell silently reuses whichever
# `study` object was created last -- in notebook order the WP2 study, whose
# enqueued WP2 warm-start trials would then be run against WP3.
# If the commented warm-start cell above is re-enabled, drop this line so the
# enqueued trials are not discarded.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp3, n_trials=50)
# write_results('Data/Hyperparametrization/lgbm_100trials.xlsx', 'wp3', study.trials_dataframe())
best_trial = study.best_trial.params
best_trial

In [None]:
# # 100
# params_3 = {
#     'reg_alpha': 0.26013926149282945,
#     'reg_lambda': 0.002325658512162904,
#     'colsample_bytree': 1.0,
#     'subsample': 0.7,
#     'learning_rate': 0.10619054458258967,
#     'max_depth': 83,
#     'num_leaves': 647,
#     'min_child_samples': 3,
#     'min_data_per_groups': 24
# }

# # # 50
# params_3 = {
#     'reg_alpha': 0.002937356908910416,
#     'reg_lambda': 0.003822180117262245,
#     'colsample_bytree': 0.8,
#     'subsample': 1.0,
#     'learning_rate': 0.09489749817678472,
#     'max_depth': 41,
#     'num_leaves': 842,
#     'min_child_samples': 18,
#     'min_data_per_groups': 46
# }

# 50 warm start std scaler
# Selected LightGBM hyper-parameters for WP3, passed on as keyword arguments
# to LGBMRegressor.
params_3 = dict(
    reg_alpha=0.2380367567801365,
    reg_lambda=0.005052844767806766,
    colsample_bytree=0.9,
    subsample=0.5,
    learning_rate=0.11958787026894079,
    max_depth=41,
    num_leaves=690,
    min_child_samples=16,
)

In [None]:
# lgbm_cross_validation(X3, y3, params_3)
# Cross-validate the scaled LGBM pipeline on WP3 with the selected parameters.
lgbm_scaled_cross_validation(X=X3, y=y3, params=params_3)

## WP4

| |  | Mean | Std | |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10486204816363351 | 0.0015105949978751166 ||
| No params  - Std | RMSE | 0.10480849342496516 | 0.0010370863436755212 ||
| After tuning - 50trials| RMSE | 0.06513233717204232 | 0.0015891617240032727 ||
| After tuning 100trials| RMSE | 0.06357594848470964 | 0.0013676749030776929 ||
| After tuning with warm start - 50trials - Std | RMSE | 0.06339401569270936 | 0.001228053306037005 ||
| No params | MAE | 0.07564776733421566 | 0.00104638869825841 ||
| No params  - Std | MAE | 0.07570794104041156 | 0.0008419207475550308 ||
| After tuning - 50trials| MAE | 0.04219236028055372 | 0.0008190579419060266 ||
| After tuning 100trials| MAE |0.04172111697148837  | 0.0009349285385250968 ||
| After tuning with warm start - 50trials  - Std | MAE | 0.04150668920859586 | 0.0005729825500890684 ||

In [50]:
# WP4 modelling frame: target `wp` moved to the last position, excluded columns
# removed, then split into feature matrix and target vector.
wp4_X = train_wp4[[col for col in train_wp4.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X4 = wp4_X.drop(columns='wp')
y4 = wp4_X['wp']

def objective_wp4(trial,data = X4,target = y4):
    """Optuna objective for WP4.

    Holds out 15% of (data, target) with a fixed split seed and returns the
    score computed by `hyperparametrization` on that hold-out part. The
    defaults are bound to X4 / y4 as they exist when this cell is executed.
    """
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )
    return hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)

In [None]:
# lgbm_cross_validation(X4, y4, None)
# Baseline: cross-validate the scaled LGBM pipeline on WP4 with default parameters.
lgbm_scaled_cross_validation(X=X4, y=y4, params=None)

In [None]:
# try_these_first = [{
#     'reg_alpha': 0.08714703614419553,
#     'reg_lambda': 9.983645262139024,
#     'colsample_bytree': 0.9,
#     'subsample': 0.8,
#     'learning_rate': 0.13413154768816146,
#     'max_depth': 41,
#     'num_leaves': 613,
#     'min_child_samples': 15,
#     'min_data_per_groups': 29
# },  {
#     'reg_alpha': 0.15331128149569725,
#     'reg_lambda': 0.28560184971009756,
#     'colsample_bytree': 0.7,
#     'subsample': 0.5,
#     'learning_rate': 0.11430869527789024,
#     'max_depth': 24,
#     'num_leaves': 856,
#     'min_child_samples': 14,
#     'min_data_per_groups': 33
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])
# study.enqueue_trial(try_these_first[1])

In [None]:
# # study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp4, n_trials=50)
# # write_results('Data/Hyperparametrization/scaled_lgbm_50trials.xlsx', 'wp4', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [None]:
# 100
# Selected LightGBM hyper-parameters for WP4.
# The former 'min_data_per_groups' entry was removed: LightGBM has no parameter
# of that name (its categorical-split setting is `min_data_per_group`) and the
# Optuna search space does not sample it, so it did not influence the model.
params_4 = {
    'reg_alpha': 0.08714703614419553,
    'reg_lambda': 9.983645262139024,
    'colsample_bytree': 0.9,
    'subsample': 0.8,
    'learning_rate': 0.13413154768816146,
    'max_depth': 41,
    'num_leaves': 613,
    'min_child_samples': 15,
}

# {
#     'reg_alpha': 0.15331128149569725,
#     'reg_lambda': 0.28560184971009756,
#     'colsample_bytree': 0.7,
#     'subsample': 0.5,
#     'learning_rate': 0.11430869527789024,
#     'max_depth': 24,
#     'num_leaves': 856,
#     'min_child_samples': 14,
#     'min_data_per_groups': 33
# }

In [None]:
# lgbm_cross_validation(X4, y4, params_4)
# Cross-validate the scaled LGBM pipeline on WP4 with the selected parameters.
lgbm_scaled_cross_validation(X=X4, y=y4, params=params_4)

## WP5

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.11722129743692011 | 0.0017732599261516583 |
| No params - Std | RMSE | 0.11729790317307003 | 0.0014884067903823003 |
| After tuning - 50trials| RMSE | 0.07721413638593042 | 0.0011020420293213135 |
| After tuning - 100trials| RMSE | 0.07297648991888442 | 0.0014970317509404526 |
| After tuning - 100trials - warm start | RMSE | 0.07362803793800192 | 0.0013223501622953715 |
| After tuning 50trials - std - warm start | RMSE | 0.07379631452164911 | 0.0019249511778190373 |
| --- | --- | --- | --- |
| No params | MAE | 0.08497074568090211 | 0.0009101526501392155 |
| No params - Std | MAE | 0.0849648687365363 | 0.0011901882563545429 |
| After tuning - 50trials| MAE | 0.051677856581467195 | 0.0006374939894477714 |
| After tuning - 100trials| MAE | 0.04765271414503236 | 0.0006257356756510128 |
| After tuning - 100trials - warm start | MAE | 0.04785179154681675 | 0.0005795839605605526 |
| After tuning 50trials - std - warm start | MAE | 0.0480147138609328 | 0.0008230908433814974 |

In [51]:
# WP5 modelling frame: target `wp` moved to the last position, excluded columns
# removed, then split into feature matrix and target vector.
wp5_X = train_wp5[[col for col in train_wp5.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X5 = wp5_X.drop(columns='wp')
y5 = wp5_X['wp']

def objective_wp5(trial, data = X5,target = y5):
    """Optuna objective for WP5.

    Holds out 15% of (data, target) with a fixed split seed and returns the
    score computed by `hyperparametrization` on that hold-out part. The
    defaults are bound to X5 / y5 as they exist when this cell is executed.
    """
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )
    return hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)

In [None]:
# lgbm_cross_validation(X5, y5, None)
# Baseline: cross-validate the scaled LGBM pipeline on WP5 with default parameters.
lgbm_scaled_cross_validation(X=X5, y=y5, params=None)

In [None]:
# try_these_first = [{
#     'reg_alpha': 0.04781362061382749,
#     'reg_lambda': 9.716980953182604,
#     'colsample_bytree': 0.9,
#     'subsample': 0.7,
#     'learning_rate': 0.14614317149730652,
#     'max_depth': 57,
#     'num_leaves': 532,
#     'min_child_samples': 7,
#     'min_data_per_groups': 84
# },  {
#     'reg_alpha': 0.0025641515787025067,
#     'reg_lambda': 0.024580995322705475,
#     'colsample_bytree': 0.8,
#     'subsample': 0.4,
#     'learning_rate': 0.11844862032615265,
#     'max_depth': 69,
#     'num_leaves': 328,
#     'min_child_samples': 62,
#     'min_data_per_groups': 34
# },  {
#     'reg_alpha': 0.1420112281892889,
#     'reg_lambda': 0.14745955581286027,
#     'colsample_bytree': 0.9,
#     'subsample': 0.7,
#     'learning_rate': 0.15576554024588912,
#     'max_depth': 61,
#     'num_leaves': 483,
#     'min_child_samples': 10,
#     'min_data_per_groups': 49
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])
# study.enqueue_trial(try_these_first[1])
# study.enqueue_trial(try_these_first[2])

In [None]:
# # study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp5, n_trials=50)
# write_results('Data/Hyperparametrization/scaled_lgbm_50trials.xlsx', 'wp5', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [None]:
# warm start 50 std
# Selected LightGBM hyper-parameters for WP5, passed on as keyword arguments
# to LGBMRegressor.
params_5 = dict(
    reg_alpha=0.1420112281892889,
    reg_lambda=0.14745955581286027,
    colsample_bytree=0.9,
    subsample=0.7,
    learning_rate=0.15576554024588912,
    max_depth=61,
    num_leaves=483,
    min_child_samples=10,
)

# warm start 
# params_5 = {
#     'reg_alpha': 0.1420112281892889,
#     'reg_lambda': 0.14745955581286027,
#     'colsample_bytree': 0.9,
#     'subsample': 0.7,
#     'learning_rate': 0.15576554024588912,
#     'max_depth': 61,
#     'num_leaves': 483,
#     'min_child_samples': 10,
#     'min_data_per_groups': 49
# }


# # 100
# params_5 = {
#     'reg_alpha': 0.04781362061382749,
#     'reg_lambda': 9.716980953182604,
#     'colsample_bytree': 0.9,
#     'subsample': 0.7,
#     'learning_rate': 0.14614317149730652,
#     'max_depth': 57,
#     'num_leaves': 532,
#     'min_child_samples': 7,
#     'min_data_per_groups': 84
# }

# # 50
# params_5 = {
#     'reg_alpha': 0.0025641515787025067,
#     'reg_lambda': 0.024580995322705475,
#     'colsample_bytree': 0.8,
#     'subsample': 0.4,
#     'learning_rate': 0.11844862032615265,
#     'max_depth': 69,
#     'num_leaves': 328,
#     'min_child_samples': 62,
#     'min_data_per_groups': 34
# }

In [None]:
# lgbm_cross_validation(X5, y5, params_5)
# Cross-validate the scaled LGBM pipeline on WP5 with the selected parameters.
lgbm_scaled_cross_validation(X=X5, y=y5, params=params_5)

## WP6

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.0940394026188472 | 0.0010749562915831372 |
| No params - std | RMSE | 0.09409110695713666 | 0.0013476052174559326 |
| After tuning - 50trials| RMSE | 0.05404362835213171 | 0.0008595325139047733 |
| After tuning 100trials| RMSE | 0.054861488499908594 | 0.0007335378238383901 |
| After tuning 50trials std - warm start| RMSE |  0.05446642457662869 | 0.0009850877593637966 |
| --- | --- | --- | --- |
| No params | MAE | 0.070455643271004 | 0.0006641538274191148 |
| After tuning - 50trials| MAE | 0.03657758274248596 | 0.0005325521314198646 |
| After tuning 100trials| MAE | 0.03783933495157941 | 0.00045956939815828987 |
| After tuning 50trials std - warm start| MAE | 0.0367847898055025 | 0.00039001216012464674 |

In [52]:
# WP6 modelling frame: target `wp` moved to the last position, excluded columns
# removed, then split into feature matrix and target vector.
wp6_X = train_wp6[[col for col in train_wp6.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X6 = wp6_X.drop(columns='wp')
y6 = wp6_X['wp']

def objective_wp6(trial,data = X6,target = y6):
    """Optuna objective for WP6.

    Holds out 15% of (data, target) with a fixed split seed and returns the
    score computed by `hyperparametrization` on that hold-out part. The
    defaults are bound to X6 / y6 as they exist when this cell is executed.
    """
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )
    return hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)

In [None]:
# Baseline cross-validation for WP6: `None` is passed instead of a parameter dict
# (presumably the helper then falls back to the estimator defaults — confirm in helper_functions).
# The commented-out call is the alternative helper, kept so the two can be swapped.
# lgbm_cross_validation(X6, y6, None)
lgbm_scaled_cross_validation(X6, y6, None)

In [None]:
# try_these_first = [{
#     'reg_alpha': 0.23451110075396234,
#     'reg_lambda': 0.796705483623135,
#     'colsample_bytree': 0.9,
#     'subsample': 0.4,
#     'learning_rate': 0.1561492653707781,
#     'max_depth': 67,
#     'num_leaves': 998,
#     'min_child_samples': 45,
#     'min_data_per_groups': 48
# },  {
#     'reg_alpha': 0.11420484028619322,
#     'reg_lambda': 2.6106462927544216,
#     'colsample_bytree': 0.5,
#     'subsample': 0.4,
#     'learning_rate': 0.13579539259861131,
#     'max_depth': 35,
#     'num_leaves': 765,
#     'min_child_samples': 3,
#     'min_data_per_groups': 29
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])
# study.enqueue_trial(try_these_first[1])

In [None]:
# # study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp6, n_trials=50)
# write_results('Data/Hyperparametrization/scaled_lgbm_50trials.xlsx', 'wp6', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [None]:
# warm start
# Active hyper-parameters for the WP6 model; they are unpacked into
# LGBMRegressor(**params_6) when the submission pipelines are built.
# Presumably the best trial of the study that was seeded with enqueued trials — TODO confirm.
# Unlike the commented-out alternatives in this cell, this dict has no 'min_data_per_groups' entry.
params_6 = {
    'reg_alpha': 0.19099691249064502,
    'reg_lambda': 0.3893771552082417,
    'colsample_bytree': 0.6,
    'subsample': 0.7,
    'learning_rate': 0.10214699989265669,
    'max_depth': 70,
    'num_leaves': 903,
    'min_child_samples': 1
} 

# # 100
# params_6 = {
#     'reg_alpha': 0.23451110075396234,
#     'reg_lambda': 0.796705483623135,
#     'colsample_bytree': 0.9,
#     'subsample': 0.4,
#     'learning_rate': 0.1561492653707781,
#     'max_depth': 67,
#     'num_leaves': 998,
#     'min_child_samples': 45,
#     'min_data_per_groups': 48
# }

# # 50
# params_6 = {
#     'reg_alpha': 0.11420484028619322,
#     'reg_lambda': 2.6106462927544216,
#     'colsample_bytree': 0.5,
#     'subsample': 0.4,
#     'learning_rate': 0.13579539259861131,
#     'max_depth': 35,
#     'num_leaves': 765,
#     'min_child_samples': 3,
#     'min_data_per_groups': 29
# }

In [None]:
# Cross-validate the WP6 model with the active `params_6` through the "scaled" helper.
# The commented-out call is the alternative helper, kept so the two can be swapped.
# lgbm_cross_validation(X6, y6, params_6)
lgbm_scaled_cross_validation(X6, y6, params_6)

# Predictions

## Functions

In [56]:
# Columns removed from a test frame before it is handed to a model: the fixed
# list below (which includes the target `wp`) plus every name in `feature_corr`.
to_drop_test = ['date', 'wd', 'forecast_time', 'forecast', 'forecast_dist', 'wp'] + feature_corr


def make_prediction_dataset(test, to_drop=to_drop_test):
    """Reduce a test frame to the rows that need a prediction, features only.

    Steps, in order:
      1. keep rows where `ws`, `u` and `v` are all present;
      2. keep rows whose `wp` is missing;
      3. keep a single row per `date` - the one with the largest `forecast_time`;
      4. drop the columns named in `to_drop`.

    Returns the resulting DataFrame; `test` itself is not modified.
    """
    has_wind = test.dropna(subset=['ws', 'u', 'v'], how='any')
    needs_wp = has_wind[has_wind['wp'].isna()]
    # Descending forecast_time puts the largest value first within each date,
    # and drop_duplicates keeps the first row it meets.
    ordered = needs_wp.sort_values(by=['date', 'forecast_time'], ascending=[True, False])
    one_per_date = ordered.drop_duplicates(subset='date')
    return one_per_date.drop(columns=to_drop)

In [57]:
def make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, dates):
    """Fit one model per wind farm, predict its test frame and build the submission table.

    Parameters
    ----------
    lst_X_trains, lst_y_trains : sequence
        Training features and targets, one entry per model.
    lst_tests : sequence
        Frames to predict on, aligned position-by-position with the training lists.
    lst_models : sequence
        Estimators exposing ``fit(X, y)`` and ``predict(X)``; they are fitted in place.
    dates : array-like
        Values of the ``date`` column. Must have the same length as every
        prediction vector, otherwise the DataFrame constructor raises.

    Returns
    -------
    (pandas.DataFrame, list)
        A frame with ``date`` followed by ``wp1`` ... ``wpN`` (one column per
        model, in input order) and the list of fitted models.
    """
    # Use the `dates` argument rather than a notebook-level global, so the
    # function works with whatever the caller passes.
    columns = {'date': dates}
    lst_models_trained = []

    for idx, (X, y, test, model) in enumerate(
        zip(lst_X_trains, lst_y_trains, lst_tests, lst_models), start=1
    ):
        print(f'--------------Model {idx}--------------')
        model.fit(X, y)

        # Computed once per model instead of once per predicted value.
        y_min, y_max = min(y), max(y)
        print(f'True:\n\tMin:{y_min}\n\tMax:{y_max}\n\tMean:{y.mean()}')

        predictions = np.asarray(model.predict(test))
        print(f'Prediction:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')

        # Clamp predictions to the range observed in the training target.
        predictions = np.clip(predictions, y_min, y_max)
        print(f'Prediction corrected:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')

        columns[f'wp{idx}'] = predictions
        lst_models_trained.append(model)

    df_predictions = pd.DataFrame(columns)
    return df_predictions, lst_models_trained

## Submission 

In [58]:
# One MaxAbsScaler -> LGBMRegressor pipeline per wind farm, each built from its own params_N dict.
# NOTE: the estimator step is named 'ridge' although it wraps an LGBMRegressor; the name is
# left unchanged so that lookups by step name keep working.
model_1, model_2, model_3, model_4, model_5, model_6 = [
    Pipeline([('scaler', MaxAbsScaler()), ('ridge', LGBMRegressor(**farm_params))])
    for farm_params in (params_1, params_2, params_3, params_4, params_5, params_6)
]

In [59]:
# model_1 = LGBMRegressor(**params_1)
# model_2 = LGBMRegressor(**params_2)
# model_3 = LGBMRegressor(**params_3)
# model_4 = LGBMRegressor(**params_4)
# model_5 = LGBMRegressor(**params_5)
# model_6 = LGBMRegressor(**params_6)

# Parallel lists: position k holds the model, the training features and the
# training target of the same wind farm (they are zipped together when fitting).
lst_models = [model_1, model_2, model_3, model_4, model_5, model_6]
lst_X_trains = [X1, X2, X3, X4, X5, X6]
lst_y_trains = [y1, y2, y3, y4, y5, y6]

In [60]:
# Prediction frame for each wind farm, in wp1..wp6 order.
# A comprehension avoids rebinding the loop variable and leaving a stray
# `test` name behind in the notebook namespace.
lst_tests = [
    make_prediction_dataset(raw_test)
    for raw_test in (test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6)
]

In [61]:
# Fit each model on its training data and predict its test frame; returns the
# submission table together with the fitted models.
df_predictions, lst_models_trained = make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, test_dates)

--------------Model 1--------------
True:
	Min:0.0
	Max:0.96
	Mean:0.2845981952075702
Prediction:
	Min:-0.06270481396737551
	Max:0.9959193151767103
	Mean:0.30001763028971734
Prediction corrected:
	Min:0.0
	Max:0.96
	Mean:0.3000638337471024
--------------Model 2--------------
True:
	Min:0.0
	Max:0.966
	Mean:0.25890153769841273
Prediction:
	Min:-0.04862701689611222
	Max:1.0752733750886052
	Mean:0.25445528695585135
Prediction corrected:
	Min:0.0
	Max:0.966
	Mean:0.2544183961408785
--------------Model 3--------------
True:
	Min:0.0
	Max:0.989
	Mean:0.2625247252747253
Prediction:
	Min:-0.07510566009564784
	Max:1.0322431842989495
	Mean:0.28966257371606147
Prediction corrected:
	Min:0.0
	Max:0.989
	Mean:0.2901427408259324
--------------Model 4--------------
True:
	Min:0.0
	Max:0.992
	Mean:0.2763637820512821
Prediction:
	Min:-0.057743049195960315
	Max:0.9709534568975586
	Mean:0.2819305898806336
Prediction corrected:
	Min:0.0
	Max:0.9709534568975586
	Mean:0.2823063044721959
--------------Model 

In [62]:
# Write the submission table as a semicolon-separated CSV, without the index column.
df_predictions.to_csv('Predictions/submission_nb_10_full_maxabs-lgbm-featselect.csv', index=False, sep=';')

## Saving models

In [63]:
# Destination file of each fitted model; entry k belongs to lst_models_trained[k].
pkl_paths = [
    "Models/LGBM/LGBM-maxabs-wp1-100trials_best_warm_start-featselect.pkl",
    "Models/LGBM/LGBM-maxabs-wp2-50trials_best_warm_start-featselect.pkl",
    "Models/LGBM/LGBM-maxabs-wp3-100trials_best_warm_start-featselect.pkl",
    "Models/LGBM/LGBM-maxabs-wp4-50trials_best_warm_start-featselect.pkl",
    "Models/LGBM/LGBM-maxabs-wp5-50trials_best_warm_start-featselect.pkl",
    "Models/LGBM/LGBM-maxabs-wp6-50trials_best_warm_start-featselect.pkl",
]

# Pickle every fitted model to its own file.
for model_idx, pkl_model in enumerate(pkl_paths):
    with open(pkl_model, 'wb') as file:
        pickle.dump(lst_models_trained[model_idx], file)