# Model development

Ici on fait nos modèles et prédictions. Le mieux c'est de faire des parties par modèles je pense ?

Il faut aussi qu'on trouve une nomenclature pour les modèles si on les enregistre, afin de garder en tête les différents résultats.

In [1]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle

In [2]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor
import optuna

In [3]:
from Functions.helper_functions import * 

In [4]:
import warnings
# Ignore every warning of category UserWarning for the rest of the session.
warnings.filterwarnings(action='ignore', category=UserWarning)
# Disable pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

In [6]:
from vmdpy import VMD

# Data

In [7]:
# Load the preprocessed training CSV of each dataset WP1..WP6 (one frame per WP).
train_wp1 = pd.read_csv('Data/Preprocessing/WP1_train_preprocessed.csv', sep=',')
train_wp2 = pd.read_csv('Data/Preprocessing/WP2_train_preprocessed.csv', sep=',')
train_wp3 = pd.read_csv('Data/Preprocessing/WP3_train_preprocessed.csv', sep=',')
train_wp4 = pd.read_csv('Data/Preprocessing/WP4_train_preprocessed.csv', sep=',')
train_wp5 = pd.read_csv('Data/Preprocessing/WP5_train_preprocessed.csv', sep=',')
train_wp6 = pd.read_csv('Data/Preprocessing/WP6_train_preprocessed.csv', sep=',')

In [8]:
# Load the preprocessed test CSV of each dataset WP1..WP6 (one frame per WP).
test_wp1 = pd.read_csv('Data/Preprocessing/WP1_test_preprocessed.csv', sep=',')
test_wp2 = pd.read_csv('Data/Preprocessing/WP2_test_preprocessed.csv', sep=',')
test_wp3 = pd.read_csv('Data/Preprocessing/WP3_test_preprocessed.csv', sep=',')
test_wp4 = pd.read_csv('Data/Preprocessing/WP4_test_preprocessed.csv', sep=',')
test_wp5 = pd.read_csv('Data/Preprocessing/WP5_test_preprocessed.csv', sep=',')
test_wp6 = pd.read_csv('Data/Preprocessing/WP6_test_preprocessed.csv', sep=',')
# Values of the 'date' column of the initial test file; used as the 'date'
# column of the submission frame.
test_dates = pd.read_csv('Data/Initial/test.csv', sep=',').date.values

In [9]:
# Columns removed from every training frame before modelling. 'wp' is not
# listed here because the per-WP cells extract it as the target afterwards.
# NOTE: the predictions section rebinds `to_drop` with 'wp' added; re-running
# the per-WP data cells after that point makes their later drop('wp') fail.
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

# XGBoost

In [10]:
# def xgboost_cross_validation(X, y, params):
#     if params == None:
#         model = XGBRegressor()
#     else:
#         model = XGBRegressor(**params)

#     print('-----------XGBOOST CROSS VALIDATION BEGINNING-----------')
#     split = 10
#     kf = KFold(n_splits=split, shuffle=True)       
#     xgboost_rmse_scores = []
#     xgboost_mae_scores = []
#     i = 1
#     for (train_index, test_index) in kf.split(pd.DataFrame(X), pd.DataFrame(y)):
#         X_train, X_test = pd.DataFrame(X).iloc[train_index], pd.DataFrame(X).iloc[test_index]
#         Y_train, Y_test = pd.DataFrame(y).iloc[train_index],pd.DataFrame(y).iloc[test_index]

#         model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=100)

#         prediction = model.predict(X_test)
#         xgboost_rmse_scores.append(mean_squared_error(Y_test, prediction,squared=False))
#         xgboost_mae_scores.append(mean_absolute_error(Y_test, prediction))
        
#         print(show_evaluation(prediction, Y_test))
#         print(f'-------------------FOLD {i}-----------------')
#         i+=1

#     print('---------------CROSS VALIDATION COMPLETE-------------')
#     print('--------------------------RMSE-----------------------')
#     display_scores(xgboost_rmse_scores)
#     print('--------------------------MAE------------------------')
#     display_scores(xgboost_mae_scores)

In [11]:
def xgboost_cross_validation(X, y, params):
    """Run a 10-fold cross-validation of a MaxAbsScaler + XGBRegressor pipeline.

    For every fold the pipeline is refitted on the training part, evaluated on
    the held-out part, and the fold metrics are printed. The RMSE and MAE
    scores of all folds are finally summarised with ``display_scores``.

    Args:
        X: Feature data; converted with ``pd.DataFrame``.
        y: Target data; converted with ``pd.DataFrame``.
        params: Keyword arguments forwarded to ``XGBRegressor``, or ``None``
            to use the estimator's defaults.
    """
    regressor = XGBRegressor() if params is None else XGBRegressor(**params)
    model = Pipeline([('scaler', MaxAbsScaler()), ('xgbr', regressor)])

    # Convert once instead of rebuilding the frames several times per fold.
    X_df = pd.DataFrame(X)
    y_df = pd.DataFrame(y)

    print('-----------XGBOOST CROSS VALIDATION BEGINNING-----------')
    split = 10
    # No random_state is given, so the shuffle is not pinned by this function.
    kf = KFold(n_splits=split, shuffle=True)
    xgboost_rmse_scores = []
    xgboost_mae_scores = []
    for fold, (train_index, test_index) in enumerate(kf.split(X_df, y_df), start=1):
        X_train, X_test = X_df.iloc[train_index], X_df.iloc[test_index]
        Y_train, Y_test = y_df.iloc[train_index], y_df.iloc[test_index]

        model.fit(X_train, Y_train)

        prediction = model.predict(X_test)
        # sqrt(MSE) rather than `squared=False`: that keyword is deprecated
        # and no longer accepted by recent scikit-learn releases.
        xgboost_rmse_scores.append(np.sqrt(mean_squared_error(Y_test, prediction)))
        xgboost_mae_scores.append(mean_absolute_error(Y_test, prediction))

        # show_evaluation prints its own report; wrapping it in print() only
        # added a stray "None" line after each fold.
        show_evaluation(prediction, Y_test)
        print(f'-------------------FOLD {fold}-----------------')

    print('---------------CROSS VALIDATION COMPLETE-------------')
    print('--------------------------RMSE-----------------------')
    display_scores(xgboost_rmse_scores)
    print('--------------------------MAE------------------------')
    display_scores(xgboost_mae_scores)

In [12]:
# def hyperparametrization(trial, train_x, test_x, train_y, test_y):
#     param = {
#         'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
#         'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
#         'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 1e-8, 1),
#         'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
#         'n_estimators': trial.suggest_int('n_estimators', 100, 700),
#         'max_depth': trial.suggest_int("max_depth", 20, 70),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
#         'eta' : trial.suggest_loguniform("eta", 1e-8, 1.0),
#         'gamma' : trial.suggest_loguniform("gamma", 1e-8, 1.0),
#         'grow_policy' : trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
#     }
#     model = XGBRegressor(**param)  
    
#     model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
#     preds = model.predict(test_x)
    
#     rmse =  mean_squared_error(test_y, preds,squared=False)
    
#     return rmse

In [13]:
def hyperparametrization(trial, train_x, test_x, train_y, test_y):
    """Fit one Optuna-sampled configuration and return its hold-out RMSE.

    Samples XGBRegressor hyper-parameters from ``trial``, fits a
    MaxAbsScaler + XGBRegressor pipeline on the training split and scores it
    on the test split.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        train_x: Training features.
        test_x: Hold-out features.
        train_y: Training targets.
        test_y: Hold-out targets.

    Returns:
        Root mean squared error of the predictions on the hold-out split.
    """
    # suggest_float(..., log=True) samples the same log-uniform distributions
    # as the deprecated suggest_loguniform; names and bounds are unchanged so
    # previously enqueued/stored trial parameters remain valid.
    param = {
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1e-8, 1, log=True),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 700),
        'max_depth': trial.suggest_int("max_depth", 20, 70),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        # NOTE(review): XGBoost documents 'eta' as an alias of 'learning_rate',
        # so two values are sampled for one setting — confirm which one the
        # estimator actually applies, then drop the other from the search.
        'eta': trial.suggest_float("eta", 1e-8, 1.0, log=True),
        'gamma': trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
    }
    model = Pipeline([('scaler', MaxAbsScaler()), ('xgbr', XGBRegressor(**param))])

    model.fit(train_x, train_y)

    preds = model.predict(test_x)

    # sqrt(MSE) rather than `squared=False`: that keyword is deprecated and no
    # longer accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(test_y, preds))

    return rmse

In [14]:
def vmd(y,k,alpha, tau,DC, init, tol):
    """Decompose ``y`` into ``k`` modes with ``VMD`` and return them as columns.

    Only ``y`` and ``k`` influence the result. ``alpha``, ``tau``, ``DC``,
    ``init`` and ``tol`` are accepted for signature compatibility, but the
    fixed settings below are always used in their place.

    Returns:
        A DataFrame with one column per mode, named ``IMFwp1``, ``IMFwp2``, ...
        in the order the modes are returned by ``VMD``.
    """
    # Fixed decomposition settings (the same-named arguments are ignored).
    bandwidth_alpha = 1       # moderate bandwidth constraint
    noise_tau = 0.            # noise-tolerance (no strict fidelity enforcement)
    impose_dc = 0             # no DC part imposed
    omega_init = 1            # initialize omegas uniformly
    convergence_tol = 1e-7

    modes, _, _ = VMD(y, bandwidth_alpha, noise_tau, k, impose_dc, omega_init, convergence_tol)

    # One column per decomposed mode, numbered from 1.
    mode_frame = pd.DataFrame()
    for mode_number, mode in enumerate(modes, start=1):
        mode_frame['IMFwp{0}'.format(mode_number)] = mode
    return mode_frame

## WP1 

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.08791418461185974 | 0.001582447940959254 |  |
| No params - MaxAbs | RMSE | 0.08777836846276121 | 0.0014715383851711332 |  |
| After tuning 50trials| RMSE | 0.06871560882190697 | 0.0009799918898982718 |  |
| After tuning 50trials - MaxAbs | RMSE | 0.06436201945957092 | 0.0016136864674464234 |  |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.0614505433779813 | 0.0010902461358058533 |  |
| No params - MaxAbs | MAE | 0.06135361126361555 | 0.0009797777856978493 |  |
| After tuning 50trials| MAE | 0.0447212685268329 | 0.0006115489443412801 |  |
| After tuning 50trials - MaxAbs | MAE | 0.04177557094032202 | 0.0008888330284581335 |  |

In [15]:
# Reorder the columns so the target 'wp' comes last, then remove the columns
# listed in to_drop.
wp1_X = train_wp1[[col for col in train_wp1.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X1 = wp1_X.drop(columns='wp')
y1 = wp1_X['wp']
# First of the 4 modes of the decomposed target; the trailing zeros are
# placeholders, vmd() replaces them with its own fixed settings.
y1_vmf = vmd(y1,4,0,0,0,0,0)['IMFwp1']

def objective_wp1(trial,data=X1,target=y1_vmf):
    """Optuna objective for WP1.

    Holds out 15% of ``data``/``target`` (fixed random_state=42) and returns
    the value computed by ``hyperparametrization`` for ``trial``.
    """
    splits = train_test_split(data, target, test_size=0.15, random_state=42)
    return hyperparametrization(trial, *splits)

In [16]:
#xgboost_cross_validation(X1, y1_vmf, None)

In [17]:
# try_these_first = [{
#     'lambda': 2.1359622347936646,
#     'alpha': 0.016202766042783825,
#     'colsample_bytree': 0.8075360516891219,
#     'subsample': 0.8,
#     'learning_rate': 0.06792370224097045,
#     'n_estimators': 320,
#     'max_depth': 58,
#     'min_child_weight': 102,
#     'eta': 6.934521001624072e-05,
#     'gamma': 4.369012735807193e-06,
#     'grow_policy': 'lossguide'
# },  {
#     'lambda': 0.3643806022565838,
#     'alpha': 0.003650309466012506,
#     'colsample_bytree': 0.9640425007241273,
#     'subsample': 0.8,
#     'learning_rate': 0.052762727588106954,
#     'n_estimators': 700,
#     'max_depth': 54,
#     'min_child_weight': 96,
#     'eta': 3.119364108002744e-05,
#     'gamma': 5.177778739056542e-05,
#     'grow_policy': 'lossguide'
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])

In [18]:
# # study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp1, n_trials=50)
# # write_results('Data/Hyperparametrization/xgboost_50trials.xlsx', 'wp1', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [21]:
# XGBRegressor parameters kept for WP1 (labelled: warm start, MaxAbs pipeline).
# NOTE(review): n_estimators is 100 here, whereas the otherwise identical
# warm-start candidate commented out earlier in this section uses 700 —
# confirm the lower value is intentional.
params_1 = {
    'lambda': 0.3643806022565838,
    'alpha': 0.003650309466012506,
    'colsample_bytree': 0.9640425007241273,
    'subsample': 0.8,
    'learning_rate': 0.052762727588106954,
    'n_estimators': 100,
    'max_depth': 54,
    'min_child_weight': 96,
    'eta': 3.119364108002744e-05,
    'gamma': 5.177778739056542e-05,
    'grow_policy': 'lossguide'
}

# 50 trials no scaler
# params_1 = {
#     'lambda': 2.1359622347936646,
#     'alpha': 0.016202766042783825,
#     'colsample_bytree': 0.8075360516891219,
#     'subsample': 0.8,
#     'learning_rate': 0.06792370224097045,
#     'n_estimators': 320,
#     'max_depth': 58,
#     'min_child_weight': 102,
#     'eta': 6.934521001624072e-05,
#     'gamma': 4.369012735807193e-06,
#     'grow_policy': 'lossguide'
# }

In [22]:
# Cross-validate the WP1 parameters against y1_vmf (first VMD mode of the
# target), not against the raw 'wp' series y1.
xgboost_cross_validation(X1, y1_vmf, params_1)

-----------XGBOOST CROSS VALIDATION BEGINNING-----------
RMSE score: 0.06027794375545382
MAE score: 0.039799655871442075
None
-------------------FOLD 1-----------------
RMSE score: 0.06211235661539879
MAE score: 0.04173993398324079
None
-------------------FOLD 2-----------------


KeyboardInterrupt: 

## WP2

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.09275649653219382 | 0.0018096321280782113 |  |
| No params - MaxAbs | RMSE | 0.09230889203333872 | 0.0020804503145906636 |  |
| After tuning 100trials| RMSE | 0.07182012947223423 | 0.0012496316182635523 |  |
| After tuning 50trials - MaxAbs | RMSE | 0.06859684788065064 | 0.0016932252507600254 |  |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.0639385458606043 | 0.0009679991365400539 |  |
| No params - MaxAbs | MAE | 0.06371111009968947 | 0.001194440893727917 |  |
| After tuning 100trials| MAE | 0.04649572151134333 | 0.0006489429134030307 |  |
| After tuning 50trials - MaxAbs | MAE | 0.044900768162719054 | 0.0006831432438047666 |  |

In [22]:
# Reorder the columns so the target 'wp' comes last, then remove the columns
# listed in to_drop.
wp2_X = train_wp2[[col for col in train_wp2.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X2 = wp2_X.drop(columns='wp')
y2 = wp2_X['wp']

def objective_wp2(trial,data = X2,target = y2):
    """Optuna objective for WP2.

    Holds out 15% of ``data``/``target`` (fixed random_state=42) and returns
    the value computed by ``hyperparametrization`` for ``trial``.
    """
    splits = train_test_split(data, target, test_size=0.15, random_state=42)
    return hyperparametrization(trial, *splits)

In [23]:
# xgboost_cross_validation(X2, y2, None)

In [24]:
# try_these_first = [{
#     'lambda': 2.1359622347936646,
#     'alpha': 0.016202766042783825,
#     'colsample_bytree': 0.8075360516891219,
#     'subsample': 0.8,
#     'learning_rate': 0.06792370224097045,
#     'n_estimators': 320,
#     'max_depth': 58,
#     'min_child_weight': 102,
#     'eta': 6.934521001624072e-05,
#     'gamma': 4.369012735807193e-06,
#     'grow_policy': 'lossguide'
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])

In [25]:
# # study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp2, n_trials=50)
# # write_results('Data/Hyperparametrization/xgboost_50trials.xlsx', 'wp2', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [26]:
# XGBRegressor parameters kept for WP2 (50 trials, warm start, MaxAbs pipeline).
params_2 = {
    'lambda': 0.005195058020286749,
    'alpha': 0.15427340616771562,
    'colsample_bytree': 0.4794118698886291,
    'subsample': 0.7,
    'learning_rate': 0.13969003989794868,
    'n_estimators': 583,
    'max_depth': 20,
    'min_child_weight': 81,
    'eta': 0.0006994052800675432,
    'gamma': 4.0927842177131904e-08,
    'grow_policy': 'depthwise'
}

# {
#     'lambda': 4.982427302967441,
#     'alpha': 0.023879453147379343,
#     'colsample_bytree': 0.29850970311481473,
#     'subsample': 0.7,
#     'learning_rate': 0.07986759823219342,
#     'n_estimators': 634,
#     'max_depth': 52,
#     'min_child_weight': 142,
#     'eta': 0.9698508070965183,
#     'gamma': 6.168834828494383e-06,
#     'grow_policy': 'depthwise'
# }

In [27]:
# xgboost_cross_validation(X2, y2, params_2)

## WP3

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.08531695654385577 | 0.0009512381902157176 |  |
| No params - MaxAbs | RMSE | 0.08509439114002723 | 0.0018013097694686512 |  |
| After tuning 100trials| RMSE | 0.0573197789387906 | 0.0008490715337388156 |  |
| After tuning 50trials - MaxAbs | RMSE | 0.057135478429648404 | 0.0010258941390154284 |  |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.06125227568636895 | 0.000538637024754324 |  |
| No params - MaxAbs | MAE | 0.06109215580462539 | 0.0011200741022377755 |  |
| After tuning 100trials| MAE | 0.04009718047732659 | 0.000599116601355594 |  |
| After tuning 50trials - MaxAbs | MAE | 0.040053630091772346 | 0.0006140572130488917 |  |

In [28]:
# Reorder the columns so the target 'wp' comes last, then remove the columns
# listed in to_drop.
wp3_X = train_wp3[[col for col in train_wp3.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X3 = wp3_X.drop(columns='wp')
y3 = wp3_X['wp']

def objective_wp3(trial,data = X3,target = y3):
    """Optuna objective for WP3.

    Holds out 15% of ``data``/``target`` (fixed random_state=42) and returns
    the value computed by ``hyperparametrization`` for ``trial``.
    """
    splits = train_test_split(data, target, test_size=0.15, random_state=42)
    return hyperparametrization(trial, *splits)

In [29]:
# xgboost_cross_validation(X3, y3, None)

In [30]:
# try_these_first = [{
#     'lambda': 0.018191871915246106,
#     'alpha': 0.2397827070234125,
#     'colsample_bytree': 0.4710946041352672,
#     'subsample': 0.8,
#     'learning_rate': 0.14812785561924302,
#     'n_estimators': 688,
#     'max_depth': 32,
#     'min_child_weight': 218,
#     'eta': 6.950960910550952e-08,
#     'gamma': 2.0149702062428016e-07,
#     'grow_policy': 'lossguide'
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])

In [31]:
# # study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp3, n_trials=50)
# # write_results('Data/Hyperparametrization/xgboost_50trials.xlsx', 'wp3', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [32]:
# XGBRegressor parameters kept for WP3 (50 trials, warm start, MaxAbs pipeline).
# These values are identical to the commented-out warm-start candidate of
# this section.
params_3 = {
    'lambda': 0.018191871915246106,
    'alpha': 0.2397827070234125,
    'colsample_bytree': 0.4710946041352672,
    'subsample': 0.8,
    'learning_rate': 0.14812785561924302,
    'n_estimators': 688,
    'max_depth': 32,
    'min_child_weight': 218,
    'eta': 6.950960910550952e-08,
    'gamma': 2.0149702062428016e-07,
    'grow_policy': 'lossguide'
}

# {
#     'lambda': 0.018191871915246106,
#     'alpha': 0.2397827070234125,
#     'colsample_bytree': 0.4710946041352672,
#     'subsample': 0.8,
#     'learning_rate': 0.14812785561924302,
#     'n_estimators': 688,
#     'max_depth': 32,
#     'min_child_weight': 218,
#     'eta': 6.950960910550952e-08,
#     'gamma': 2.0149702062428016e-07,
#     'grow_policy': 'lossguide'
# }

In [33]:
# xgboost_cross_validation(X3, y3, params_3)

## WP4

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.08705646834205574 | 0.0013432615911574354 |  |
| No params - MaxAbs | RMSE | 0.0870757500361273 | 0.0014362323840110152 |  |
| After tuning 100trials| RMSE | 0.06505388151301929 | 0.0009022910606251192 |  |
| After tuning 50trials - MaxAbs | RMSE | 0.0626312118373484 | 0.0010312894520400255 |  |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.06217383365682638 | 0.000883527934746644 |  |
| No params - MaxAbs | MAE | 0.062254149447791104 | 0.062254149447791104 |  |
| After tuning 100trials| MAE | 0.043762279489834674 | 0.0004905220832103441 |  |
| After tuning 50trials - MaxAbs | MAE | 0.041378801055099515 | 0.0007406645752025426 |  |

In [34]:
# Reorder the columns so the target 'wp' comes last, then remove the columns
# listed in to_drop.
wp4_X = train_wp4[[col for col in train_wp4.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X4 = wp4_X.drop(columns='wp')
y4 = wp4_X['wp']

def objective_wp4(trial,data = X4,target = y4):
    """Optuna objective for WP4.

    Holds out 15% of ``data``/``target`` (fixed random_state=42) and returns
    the value computed by ``hyperparametrization`` for ``trial``.
    """
    splits = train_test_split(data, target, test_size=0.15, random_state=42)
    return hyperparametrization(trial, *splits)

In [35]:
# xgboost_cross_validation(X4, y4, None)

In [36]:
# try_these_first = [{
#     'lambda': 0.001340947773207149,
#     'alpha': 0.002479638085657274,
#     'colsample_bytree': 0.3030181981060389,
#     'subsample': 0.7,
#     'learning_rate': 0.07696248319007938,
#     'n_estimators': 367,
#     'max_depth': 31,
#     'min_child_weight': 72,
#     'eta': 3.704957186572025e-08,
#     'gamma': 8.44315434172209e-05,
#     'grow_policy': 'depthwise'
# }, {
#     'lambda': 0.13763482520556616,
#     'alpha': 0.0010077676339636944,
#     'colsample_bytree': 0.954734556572597,
#     'subsample': 0.8,
#     'learning_rate': 0.05499114408834853,
#     'n_estimators': 546,
#     'max_depth': 43,
#     'min_child_weight': 94,
#     'eta': 1.2784286267654713e-06,
#     'gamma': 1.6935174502873177e-05,
#     'grow_policy': 'depthwise'
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])

In [37]:
# # study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp4, n_trials=50)
# # write_results('Data/Hyperparametrization/xgboost_50trials.xlsx', 'wp4', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [38]:
# XGBRegressor parameters kept for WP4 (50 trials, warm start, MaxAbs pipeline).
# These values are identical to the second commented-out warm-start candidate
# of this section.
params_4 = {
    'lambda': 0.13763482520556616,
    'alpha': 0.0010077676339636944,
    'colsample_bytree': 0.954734556572597,
    'subsample': 0.8,
    'learning_rate': 0.05499114408834853,
    'n_estimators': 546,
    'max_depth': 43,
    'min_child_weight': 94,
    'eta': 1.2784286267654713e-06,
    'gamma': 1.6935174502873177e-05,
    'grow_policy': 'depthwise'
}

# 50 trials
# params_4 = {
#     'lambda': 0.001340947773207149,
#     'alpha': 0.002479638085657274,
#     'colsample_bytree': 0.3030181981060389,
#     'subsample': 0.7,
#     'learning_rate': 0.07696248319007938,
#     'n_estimators': 367,
#     'max_depth': 31,
#     'min_child_weight': 72,
#     'eta': 3.704957186572025e-08,
#     'gamma': 8.44315434172209e-05,
#     'grow_policy': 'depthwise'
# }

In [39]:
# xgboost_cross_validation(X4, y4, params_4)

## WP5

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.09905505938942759 | 0.0010560057294061722 |  |
| No params - MaxAbs | RMSE | 0.09902019521905119 | 0.0013460300390687509 |  |
| After tuning 100trials| RMSE | 0.07467175992757297 | 0.0018566395951414925 |  |
| After tuning 50trials - MaxAbs | RMSE | 0.07315316873758233 | 0.0021847037668090933 |  |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.0703001590429472 | 0.0006221166408279179 |  |
| No params - MaxAbs | MAE | 0.07031017432456337 | 0.0008486582761726584 |  |
| After tuning 100trials| MAE | 0.05087662307993675 | 0.0010894679027767663 |  |
| After tuning 50trials - MaxAbs | MAE | 0.04890386336557044 | 0.0010155747369788642 |  |

In [40]:
# Reorder the columns so the target 'wp' comes last, then remove the columns
# listed in to_drop.
wp5_X = train_wp5[[col for col in train_wp5.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X5 = wp5_X.drop(columns='wp')
y5 = wp5_X['wp']

def objective_wp5(trial, data = X5,target = y5):
    """Optuna objective for WP5.

    Holds out 15% of ``data``/``target`` (fixed random_state=42) and returns
    the value computed by ``hyperparametrization`` for ``trial``.
    """
    splits = train_test_split(data, target, test_size=0.15, random_state=42)
    return hyperparametrization(trial, *splits)

In [41]:
# xgboost_cross_validation(X5, y5, None)

In [42]:
# try_these_first = [{
#     'lambda': 4.537995153532639,
#     'alpha': 0.15887083612902936,
#     'colsample_bytree': 0.35129085402309673,
#     'subsample': 0.8,
#     'learning_rate': 0.20146110291550628,
#     'n_estimators': 354,
#     'max_depth': 27,
#     'min_child_weight': 91,
#     'eta': 0.1963402390178624,
#     'gamma': 4.730295821405375e-07,
#     'grow_policy': 'lossguide'
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])

In [43]:
# # study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp5, n_trials=50)
# # write_results('Data/Hyperparametrization/xgboost_50trials.xlsx', 'wp5', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [44]:
# XGBRegressor parameters kept for WP5 (50 trials, warm start, MaxAbs pipeline).
params_5 = {
    'lambda': 4.7653031074423104,
    'alpha': 0.004963619239675007,
    'colsample_bytree': 0.8616303151950829,
    'subsample': 0.8,
    'learning_rate': 0.167247240657064,
    'n_estimators': 509,
    'max_depth': 31,
    'min_child_weight': 73,
    'eta': 0.1392993925005545,
    'gamma': 1.4909263616645174e-07,
    'grow_policy': 'depthwise'
}

# 50 trials
# params_5 = {
#     'lambda': 4.537995153532639,
#     'alpha': 0.15887083612902936,
#     'colsample_bytree': 0.35129085402309673,
#     'subsample': 0.8,
#     'learning_rate': 0.20146110291550628,
#     'n_estimators': 354,
#     'max_depth': 27,
#     'min_child_weight': 91,
#     'eta': 0.1963402390178624,
#     'gamma': 4.730295821405375e-07,
#     'grow_policy': 'lossguide'
# }

In [45]:
# xgboost_cross_validation(X5, y5, params_5)

## WP6

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE |  0.07704369819527615 | 0.0010801451601647142 |  |
| No params - MaxAbs | RMSE | 0.07724267929514025 | 0.0015012274298760973 |  |
| After tuning 100trials| RMSE | 0.052403202133489465 | 0.0010631888908178239 |  |
| After tuning 50trials - warm start - MaxAbs | RMSE | 0.051604419990347344 | 0.0008376347497504802 |  |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.056804680824990995 | 0.000879205156712233 |  |
| No params - MaxAbs | MAE | 0.0568038624920563 | 0.0007877324372786336 |  |
| After tuning 100trials| MAE | 0.03630430575055383 | 0.0007604647331428116 |  |
| After tuning 50trials - warm start - MaxAbs | MAE | 0.03479476729901705 | 0.0004812579905210939 |  |

In [46]:
# Reorder the columns so the target 'wp' comes last, then remove the columns
# listed in to_drop.
wp6_X = train_wp6[[col for col in train_wp6.columns if col != "wp"] + ["wp"]].drop(columns=to_drop)
X6 = wp6_X.drop(columns='wp')
y6 = wp6_X['wp']

def objective_wp6(trial,data = X6,target = y6):
    """Optuna objective for WP6.

    Holds out 15% of ``data``/``target`` (fixed random_state=42) and returns
    the value computed by ``hyperparametrization`` for ``trial``.
    """
    splits = train_test_split(data, target, test_size=0.15, random_state=42)
    return hyperparametrization(trial, *splits)

In [47]:
# xgboost_cross_validation(X6, y6, None)

In [48]:
# try_these_first = [{
#     'lambda': 0.5705269295320163,
#     'alpha': 0.06713843687958011,
#     'colsample_bytree': 0.8718486759988152,
#     'subsample': 0.8,
#     'learning_rate': 0.07668854905667996,
#     'n_estimators': 582,
#     'max_depth': 49,
#     'min_child_weight': 143,
#     'eta': 9.055710235537663e-07,
#     'gamma': 1.111486195598291e-06,
#     'grow_policy': 'depthwise'
# }]

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])

In [49]:
# # study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp6, n_trials=50)
# # write_results('Data/Hyperparametrization/xgboost_50trials.xlsx', 'wp6', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [50]:
# XGBRegressor parameters kept for WP6 (50 trials, warm start, MaxAbs pipeline).
params_6 = {
    'lambda': 6.198890709955999,
    'alpha': 0.009212761583335095,
    'colsample_bytree': 0.9364947872025757,
    'subsample': 0.6,
    'learning_rate': 0.0377294321765545,
    'n_estimators': 458,
    'max_depth': 50,
    'min_child_weight': 28,
    'eta': 1.0671149195024988e-08,
    'gamma': 1.4697758952551594e-05,
    'grow_policy': 'depthwise'
}


# TODO: re-run the tuning for the candidate below
# {
#     'lambda': 0.5705269295320163,
#     'alpha': 0.06713843687958011,
#     'colsample_bytree': 0.8718486759988152,
#     'subsample': 0.8,
#     'learning_rate': 0.07668854905667996,
#     'n_estimators': 582,
#     'max_depth': 49,
#     'min_child_weight': 143,
#     'eta': 9.055710235537663e-07,
#     'gamma': 1.111486195598291e-06,
#     'grow_policy': 'depthwise'
# }

In [51]:
# xgboost_cross_validation(X6, y6, params_6)

# XGBoost Predictions

## Functions

In [52]:
# Columns removed before prediction; unlike the training-time list this one
# also contains the target 'wp'.
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist", 'wp']
def make_prediction_dataset(test, to_drop=to_drop):
    """Select the rows to predict from a test frame and strip unused columns.

    Keeps the rows where 'ws', 'u' and 'v' are all present and 'wp' is
    missing, retains a single row per 'date' (the one with the highest
    'forecast_time'), and finally removes the ``to_drop`` columns.

    Args:
        test: Test frame containing at least 'ws', 'u', 'v', 'wp', 'date'
            and 'forecast_time'.
        to_drop: Column labels to remove from the result.

    Returns:
        The filtered frame, ordered by 'date', without the ``to_drop`` columns.
    """
    has_wind_values = test[['ws', 'u', 'v']].notna().all(axis=1)
    wp_is_missing = test['wp'].isna()
    candidates = test[has_wind_values & wp_is_missing]

    # Within each date the highest forecast_time comes first, so dropping
    # duplicates (which keeps the first occurrence) retains exactly that row.
    ordered = candidates.sort_values(by=['date', 'forecast_time'], ascending=[True, False])
    one_per_date = ordered.drop_duplicates(subset='date')

    return one_per_date.drop(to_drop, axis=1)

In [53]:
def make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, dates):
    """Fit every model on its training set and assemble the submission frame.

    Each model is fitted on its (X, y) pair and used to predict its test set.
    Negative predictions are replaced by the training minimum and predictions
    above the training maximum by that maximum. Summary statistics are printed
    for each model.

    Args:
        lst_X_trains: Training feature sets, one per model.
        lst_y_trains: Training targets, one per model (must provide ``.mean()``).
        lst_tests: Test feature sets, one per model.
        lst_models: Estimators exposing ``fit`` and ``predict``.
        dates: Values of the 'date' column of the submission.

    Returns:
        ``(df_predictions, lst_models_trained)``: a DataFrame with a 'date'
        column followed by 'wp1', 'wp2', ... (one column per model), and the
        list of fitted models in input order.

    Raises:
        ValueError: If the four input lists do not have the same length.
    """
    if not (len(lst_X_trains) == len(lst_y_trains) == len(lst_tests) == len(lst_models)):
        raise ValueError('lst_X_trains, lst_y_trains, lst_tests and lst_models must have the same length')

    lst_prediction = []
    lst_models_trained = []
    per_model_inputs = zip(lst_X_trains, lst_y_trains, lst_tests, lst_models)
    for model_number, (X, y, test, model) in enumerate(per_model_inputs, start=1):
        print(f'--------------Model {model_number}--------------')
        model.fit(X, y)
        # Computed once: min(y)/max(y) were previously re-evaluated for every
        # single prediction inside the clipping comprehensions.
        y_min, y_max = min(y), max(y)
        print(f'True:\n\tMin:{y_min}\n\tMax:{y_max}\n\tMean:{y.mean()}')
        predictions = model.predict(test)
        print(f'Prediction:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')
        # Negative values become the training minimum, values above the
        # training maximum become that maximum.
        predictions = [y_min if value < 0 else value for value in predictions]
        predictions = [y_max if value > y_max else value for value in predictions]
        print(f'Prediction corrected:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')
        lst_prediction.append(predictions)
        lst_models_trained.append(model)

    # Use the `dates` argument (not a notebook global) and emit one column per
    # model instead of assuming exactly six.
    columns = {'date': dates}
    for model_number, predictions in enumerate(lst_prediction, start=1):
        columns[f'wp{model_number}'] = predictions
    df_predictions = pd.DataFrame(columns)
    return df_predictions, lst_models_trained

## Submission

In [54]:
# One MaxAbsScaler + XGBRegressor pipeline per dataset, each configured with
# the matching params_N dictionary.
model_1 = Pipeline([('scaler', MaxAbsScaler()),('xgbr', XGBRegressor(**params_1))])
model_2 = Pipeline([('scaler', MaxAbsScaler()),('xgbr', XGBRegressor(**params_2))])
model_3 = Pipeline([('scaler', MaxAbsScaler()),('xgbr', XGBRegressor(**params_3))])
model_4 = Pipeline([('scaler', MaxAbsScaler()),('xgbr', XGBRegressor(**params_4))])
model_5 = Pipeline([('scaler', MaxAbsScaler()),('xgbr', XGBRegressor(**params_5))])
model_6 = Pipeline([('scaler', MaxAbsScaler()),('xgbr', XGBRegressor(**params_6))])

In [55]:
# model_1 = XGBRegressor(**params_1)
# model_2 = XGBRegressor(**params_2)
# model_3 = XGBRegressor(**params_3)
# model_4 = XGBRegressor(**params_4)
# model_5 = XGBRegressor(**params_5)
# model_6 = XGBRegressor(**params_6)

# Parallel lists consumed by make_submission_file (index k holds WP k+1).
lst_models = [model_1, model_2, model_3, model_4, model_5, model_6]
lst_X_trains = [X1, X2, X3, X4, X5, X6]
# NOTE(review): WP1 is fitted on the raw target y1 here, while its tuning and
# cross-validation used y1_vmf — confirm this is intended.
lst_y_trains = [y1, y2, y3, y4, y5, y6]

In [56]:
# Build the prediction-ready frame of every test set, in WP1..WP6 order.
lst_tests = []
for test in (test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6):
    test = make_prediction_dataset(test)
    lst_tests += [test]

In [57]:
# Fit each pipeline on its training data and build the submission frame.
df_predictions, lst_models_trained = make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, test_dates)

--------------Model 1--------------
True:
	Min:0.0
	Max:0.96
	Mean:0.2845981952075702
Prediction:
	Min:-0.040609680116176605
	Max:0.9584257006645203
	Mean:0.2994403839111328
Prediction corrected:
	Min:0.0
	Max:0.9584257006645203
	Mean:0.299561041407802
--------------Model 2--------------
True:
	Min:0.0
	Max:0.966
	Mean:0.25890153769841273
Prediction:
	Min:-0.03859954699873924
	Max:1.0072672367095947
	Mean:0.2552209198474884
Prediction corrected:
	Min:0.0
	Max:0.966
	Mean:0.2553686954124512
--------------Model 3--------------
True:
	Min:0.0
	Max:0.989
	Mean:0.2625247252747253
Prediction:
	Min:-0.07822347432374954
	Max:1.0252376794815063
	Mean:0.2915489375591278
Prediction corrected:
	Min:0.0
	Max:0.989
	Mean:0.29178245276974524
--------------Model 4--------------
True:
	Min:0.0
	Max:0.992
	Mean:0.2763637820512821
Prediction:
	Min:-0.019250165671110153
	Max:0.9634188413619995
	Mean:0.2832432985305786
Prediction corrected:
	Min:0.0
	Max:0.9634188413619995
	Mean:0.2833424258501441
--------

In [58]:
# Write the predictions as a semicolon-separated CSV without the index column.
df_predictions.to_csv('Predictions/submission_nb_7_full_absmax-xgboost.csv', index=False, sep=';')

In [59]:
# Persist every fitted pipeline to Models/XGBOOST, one pickle per dataset
# (wp1, wp2, ... following the order of lst_models_trained).
# The six copy-pasted dump stanzas are replaced by a single loop, and the
# target directory is created first so the dump does not fail when it is missing.
os.makedirs("Models/XGBOOST", exist_ok=True)
for wp_number, trained_model in enumerate(lst_models_trained, start=1):
    pkl_model = f"Models/XGBOOST/XGBoost-maxabs-wp{wp_number}-50trials_best.pkl"
    with open(pkl_model, 'wb') as file:
        pickle.dump(trained_model, file)