# Model development

Ici, on construit nos modèles et nos prédictions. Le mieux est sans doute de faire une partie par modèle.

Il faut aussi qu'on trouve une nomenclature pour les modèles si on les enregistre, afin de garder une trace des différents résultats.

In [None]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle

In [None]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor
import optuna

In [None]:
from Functions.helper_functions import * 

In [None]:
import warnings
# Hide every UserWarning raised while the notebook runs.
warnings.filterwarnings(action='ignore', category=UserWarning)
# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Fix the seed of NumPy's global random generator.
np.random.seed(42)

In [None]:
from vmdpy import VMD

# Data

In [None]:
# Preprocessed training sets, one per WP dataset (WP1..WP6).
train_wp1, train_wp2, train_wp3, train_wp4, train_wp5, train_wp6 = [
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'Data/Preprocessing/WP1_train_preprocessed.csv',
        'Data/Preprocessing/WP2_train_preprocessed.csv',
        'Data/Preprocessing/WP3_train_preprocessed.csv',
        'Data/Preprocessing/WP4_train_preprocessed.csv',
        'Data/Preprocessing/WP5_train_preprocessed.csv',
        'Data/Preprocessing/WP6_train_preprocessed.csv',
    )
]

In [None]:
# Preprocessed test sets, in the same WP1..WP6 order as the training sets.
test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6 = [
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'Data/Preprocessing/WP1_test_preprocessed.csv',
        'Data/Preprocessing/WP2_test_preprocessed.csv',
        'Data/Preprocessing/WP3_test_preprocessed.csv',
        'Data/Preprocessing/WP4_test_preprocessed.csv',
        'Data/Preprocessing/WP5_test_preprocessed.csv',
        'Data/Preprocessing/WP6_test_preprocessed.csv',
    )
]
# 'date' values of the initial test file.
test_dates = pd.read_csv('Data/Initial/test.csv', sep=',')['date'].values

In [None]:
# Columns removed from the training frames before modelling.
to_drop = [
    'date',
    'wd',
    'forecast_time',
    'forecast',
    "forecast_dist",
]

# XGBoost

## Functions

In [None]:
def xgboost_cross_validation_vmf(X, y1, y2, y3, y4, y_true, params):
    """Cross-validate a sum-of-modes XGBoost forecaster.

    One ``MaxAbsScaler`` + ``XGBRegressor`` pipeline is trained per mode
    target (``y1`` .. ``y4``). For each of the 10 shuffled folds, the four
    mode predictions are added together and scored against ``y_true``.
    The value returned by ``show_evaluation`` is printed for every fold,
    and the per-fold RMSE and MAE lists are passed to ``display_scores``.

    Args:
        X: Feature matrix (anything ``pd.DataFrame`` accepts).
        y1, y2, y3, y4: The four mode targets, row-aligned with ``X``.
        y_true: Target that the summed prediction is compared with.
        params: ``None`` to use default ``XGBRegressor`` settings, otherwise
            a sequence whose items 0..3 are the keyword dictionaries of the
            models trained on ``y1`` .. ``y4``.

    Returns:
        None. Results are only printed.
    """
    if params is None:
        mode_params = [{}, {}, {}, {}]
    else:
        mode_params = [params[k] for k in range(4)]
    models = [
        Pipeline([('scaler', MaxAbsScaler()), ('xgbr', XGBRegressor(**kwargs))])
        for kwargs in mode_params
    ]

    # Build the frames once instead of once per fold.
    features = pd.DataFrame(X)
    mode_targets = [pd.DataFrame(target) for target in (y1, y2, y3, y4)]
    truth = pd.DataFrame(y_true)

    print('-----------XGBOOST CROSS VALIDATION BEGINNING-----------')
    split = 10
    kf = KFold(n_splits=split, shuffle=True)
    xgboost_rmse_scores = []
    xgboost_mae_scores = []
    # KFold only needs the number of rows, so splitting on the features is
    # enough; the former second argument referenced an undefined name ``y``.
    for fold, (train_index, test_index) in enumerate(kf.split(features), start=1):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        Y_test = truth.iloc[test_index]

        for model, target in zip(models, mode_targets):
            model.fit(X_train, target.iloc[train_index])

        # The forecast is the sum of the four per-mode predictions.
        prediction = sum(model.predict(X_test) for model in models)

        # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
        # ``squared`` argument is deprecated in recent scikit-learn releases.
        xgboost_rmse_scores.append(np.sqrt(mean_squared_error(Y_test, prediction)))
        xgboost_mae_scores.append(mean_absolute_error(Y_test, prediction))

        print(show_evaluation(prediction, Y_test))
        print(f'-------------------FOLD {fold}-----------------')

    print('---------------CROSS VALIDATION COMPLETE-------------')
    print('--------------------------RMSE-----------------------')
    display_scores(xgboost_rmse_scores)
    print('--------------------------MAE------------------------')
    display_scores(xgboost_mae_scores)

In [None]:
def hyperparametrization_cv(trial, X, y):
    """Score one Optuna trial of the MaxAbsScaler + XGBRegressor pipeline.

    Hyper-parameters are sampled from ``trial``, the pipeline is evaluated
    with 3-fold cross-validation on ``(X, y)``, and the mean RMSE is returned
    so that the study can minimise it.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Training features.
        y: Training target.

    Returns:
        Mean root-mean-squared error over the 3 folds (lower is better).
    """
    # Imported locally: the notebook's import cells do not list this name.
    from sklearn.model_selection import cross_val_score

    # 'eta' is XGBoost's alias of 'learning_rate'. Sampling both would tune
    # the same setting twice with conflicting values, so only
    # 'learning_rate' is searched.
    param = {
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 1e-8, 1),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 700),
        'max_depth': trial.suggest_int("max_depth", 10, 70),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_loguniform("gamma", 1e-8, 1.0),
        'grow_policy': trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
    }

    model = Pipeline([('scaler', MaxAbsScaler()), ('xgbr', XGBRegressor(**param))])
    # The scorer returns the negated RMSE; flip the sign to get a loss.
    scores = cross_val_score(model, X, y, cv=3, scoring='neg_root_mean_squared_error')
    return -scores.mean()

In [None]:
def vmd(y, k, alpha=1, tau=0., DC=0, init=1, tol=1e-7):
    """Decompose ``y`` into ``k`` modes with Variational Mode Decomposition.

    Thin wrapper around ``vmdpy.VMD`` that returns the modes as the columns
    ``IMFwp1`` .. ``IMFwp<k>`` of a DataFrame. The defaults reproduce the
    values that were previously hard-coded in this function.

    Args:
        y: Signal to decompose.
        k: Number of modes to extract.
        alpha: Bandwidth constraint passed to ``VMD``.
        tau: Noise tolerance passed to ``VMD`` (0 = no strict fidelity
            enforcement).
        DC: Passed to ``VMD``; 0 means no DC part is imposed.
        init: Passed to ``VMD``; 1 initialises the omegas uniformly.
        tol: Convergence tolerance passed to ``VMD``.

    Returns:
        DataFrame with one column per mode.
    """
    # Only the modes are used; the spectra and centre frequencies that VMD
    # also returns are discarded.
    # NOTE(review): vmdpy appears to drop the last sample of odd-length
    # signals; confirm that the modes have as many rows as ``y``.
    modes, _modes_hat, _omega = VMD(y, alpha, tau, k, DC, init, tol)
    return pd.DataFrame({
        'IMFwp{0}'.format(num + 1): mode for num, mode in enumerate(modes)
    })

## WP1 

In [None]:
# WP1 training data: put the target 'wp' last, drop the columns listed in
# to_drop, then separate features and target.
wp1_X = train_wp1[[c for c in train_wp1 if c != "wp"] + ["wp"]].drop(columns=to_drop)
X1 = wp1_X.drop(columns='wp')
y1 = wp1_X['wp']
# Split the target into 4 modes (columns IMFwp1..IMFwp4).
vmf_1 = vmd(y1, 4)

#def objective_wp1(trial,data=X1,target=vmf['IMFwp1']):
 #   train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
  #  return hyperparametrization(trial, train_x, test_x, train_y, test_y)

In [None]:
# Baseline for WP1: cross-validate the four per-mode XGBoost models with default hyper-parameters (params=None).
xgboost_cross_validation_vmf(X1, vmf_1['IMFwp1'],vmf_1['IMFwp2'],vmf_1['IMFwp3'],vmf_1['IMFwp4'],y1, None)

### Optuna Optim

In [None]:
def objective_wp1(trial, data=X1, target=vmf_1['IMFwp1']):
    """Optuna objective for the XGBoost model of mode ``IMFwp1`` on WP1.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp1;
# the result is kept as the IMFwp1 hyper-parameters of WP1.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp1, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_1_1 = study.best_params  # parameters of the best trial
best_trial_1_1

In [None]:
def objective_wp1(trial, data=X1, target=vmf_1['IMFwp2']):
    """Optuna objective for the XGBoost model of mode ``IMFwp2`` on WP1.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp1;
# the result is kept as the IMFwp2 hyper-parameters of WP1.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp1, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_1_2 = study.best_params  # parameters of the best trial
best_trial_1_2

In [None]:
def objective_wp1(trial, data=X1, target=vmf_1['IMFwp3']):
    """Optuna objective for the XGBoost model of mode ``IMFwp3`` on WP1.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp1;
# the result is kept as the IMFwp3 hyper-parameters of WP1.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp1, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_1_3 = study.best_params  # parameters of the best trial
best_trial_1_3

In [None]:
def objective_wp1(trial, data=X1, target=vmf_1['IMFwp4']):
    """Optuna objective for the XGBoost model of mode ``IMFwp4`` on WP1.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp1;
# the result is kept as the IMFwp4 hyper-parameters of WP1.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp1, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_1_4 = study.best_params  # parameters of the best trial
best_trial_1_4

In [None]:
# Tuned hyper-parameters of WP1, ordered IMFwp1..IMFwp4 (read as params[0]..params[3] by xgboost_cross_validation_vmf).
params_vmf_1 = [best_trial_1_1,best_trial_1_2,best_trial_1_3,best_trial_1_4]


In [None]:
# Re-run the WP1 cross-validation with the tuned hyper-parameters of each mode.
xgboost_cross_validation_vmf(X1, vmf_1['IMFwp1'],vmf_1['IMFwp2'],vmf_1['IMFwp3'],vmf_1['IMFwp4'],y1, params_vmf_1)

## WP2

In [None]:
# WP2 training data: put the target 'wp' last, drop the columns listed in
# to_drop, then separate features and target.
wp2_X = train_wp2[[c for c in train_wp2 if c != "wp"] + ["wp"]].drop(columns=to_drop)
X2 = wp2_X.drop(columns='wp')
y2 = wp2_X['wp']
# Split the target into 4 modes (columns IMFwp1..IMFwp4).
vmf_2 = vmd(y2, 4)
#def objective_wp2(trial,data = X2,target = y2):
#    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
#    return hyperparametrization(trial, train_x, test_x, train_y, test_y)

In [None]:
# Baseline for WP2: cross-validate the four per-mode XGBoost models with default hyper-parameters (params=None).
xgboost_cross_validation_vmf(X2, vmf_2['IMFwp1'],vmf_2['IMFwp2'],vmf_2['IMFwp3'],vmf_2['IMFwp4'],y2, None)

### Optuna Optim

In [None]:
def objective_wp2(trial, data=X2, target=vmf_2['IMFwp1']):
    """Optuna objective for the XGBoost model of mode ``IMFwp1`` on WP2.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp2;
# the result is kept as the IMFwp1 hyper-parameters of WP2.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp2, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_2_1 = study.best_params  # parameters of the best trial
best_trial_2_1

In [None]:
def objective_wp2(trial, data=X2, target=vmf_2['IMFwp2']):
    """Optuna objective for the XGBoost model of mode ``IMFwp2`` on WP2.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp2;
# the result is kept as the IMFwp2 hyper-parameters of WP2.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp2, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_2_2 = study.best_params  # parameters of the best trial
best_trial_2_2

In [None]:
def objective_wp2(trial, data=X2, target=vmf_2['IMFwp3']):
    """Optuna objective for the XGBoost model of mode ``IMFwp3`` on WP2.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp2;
# the result is kept as the IMFwp3 hyper-parameters of WP2.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp2, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_2_3 = study.best_params  # parameters of the best trial
best_trial_2_3

In [None]:
def objective_wp2(trial, data=X2, target=vmf_2['IMFwp4']):
    """Optuna objective for the XGBoost model of mode ``IMFwp4`` on WP2.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp2;
# the result is kept as the IMFwp4 hyper-parameters of WP2.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp2, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_2_4 = study.best_params  # parameters of the best trial
best_trial_2_4

In [None]:
# Tuned hyper-parameters of WP2, ordered IMFwp1..IMFwp4 (read as params[0]..params[3] by xgboost_cross_validation_vmf).
params_vmf_2 = [best_trial_2_1,best_trial_2_2,best_trial_2_3,best_trial_2_4]

In [None]:
# Re-run the WP2 cross-validation with the tuned hyper-parameters of each mode.
xgboost_cross_validation_vmf(X2, vmf_2['IMFwp1'],vmf_2['IMFwp2'],vmf_2['IMFwp3'],vmf_2['IMFwp4'],y2, params_vmf_2)

## WP3

In [None]:
# WP3 training data: put the target 'wp' last, drop the columns listed in
# to_drop, then separate features and target.
wp3_X = train_wp3[[c for c in train_wp3 if c != "wp"] + ["wp"]].drop(columns=to_drop)
X3 = wp3_X.drop(columns='wp')
y3 = wp3_X['wp']
# Split the target into 4 modes (columns IMFwp1..IMFwp4).
vmf_3 = vmd(y3, 4)

In [None]:
# Baseline for WP3: cross-validate the four per-mode XGBoost models with default hyper-parameters (params=None).
xgboost_cross_validation_vmf(X3, vmf_3['IMFwp1'],vmf_3['IMFwp2'],vmf_3['IMFwp3'],vmf_3['IMFwp4'],y3, None)

### Optuna Optim

In [None]:
def objective_wp3(trial, data=X3, target=vmf_3['IMFwp1']):
    """Optuna objective for the XGBoost model of mode ``IMFwp1`` on WP3.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp3;
# the result is kept as the IMFwp1 hyper-parameters of WP3.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp3, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_3_1 = study.best_params  # parameters of the best trial
best_trial_3_1

In [None]:
def objective_wp3(trial, data=X3, target=vmf_3['IMFwp2']):
    """Optuna objective for the XGBoost model of mode ``IMFwp2`` on WP3.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp3;
# the result is kept as the IMFwp2 hyper-parameters of WP3.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp3, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_3_2 = study.best_params  # parameters of the best trial
best_trial_3_2

In [None]:
def objective_wp3(trial, data=X3, target=vmf_3['IMFwp3']):
    """Optuna objective for the XGBoost model of mode ``IMFwp3`` on WP3.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp3;
# the result is kept as the IMFwp3 hyper-parameters of WP3.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp3, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_3_3 = study.best_params  # parameters of the best trial
best_trial_3_3

In [None]:
def objective_wp3(trial, data=X3, target=vmf_3['IMFwp4']):
    """Optuna objective for the XGBoost model of mode ``IMFwp4`` on WP3.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp3,
# i.e. the IMFwp4 objective when the cells run in order.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp3, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
# Stored under its own name: writing to best_trial_3_3 would overwrite the
# IMFwp3 result and leave best_trial_3_4 undefined for params_vmf_3.
best_trial_3_4 = study.best_trial.params
best_trial_3_4

In [None]:
# Tuned hyper-parameters of WP3, ordered IMFwp1..IMFwp4 (read as params[0]..params[3] by xgboost_cross_validation_vmf).
# NOTE(review): requires best_trial_3_4 to have been assigned by the IMFwp4 study - verify.
params_vmf_3 = [best_trial_3_1,best_trial_3_2,best_trial_3_3,best_trial_3_4]


In [None]:
# Re-run the WP3 cross-validation with the tuned hyper-parameters of each mode.
xgboost_cross_validation_vmf(X3, vmf_3['IMFwp1'],vmf_3['IMFwp2'],vmf_3['IMFwp3'],vmf_3['IMFwp4'],y3, params_vmf_3)

## WP4

In [None]:
# WP4 training data: put the target 'wp' last, drop the columns listed in
# to_drop, then separate features and target.
wp4_X = train_wp4[[c for c in train_wp4 if c != "wp"] + ["wp"]].drop(columns=to_drop)
X4 = wp4_X.drop(columns='wp')
y4 = wp4_X['wp']
# Split the target into 4 modes (columns IMFwp1..IMFwp4).
vmf_4 = vmd(y4, 4)

In [None]:
# Baseline for WP4: cross-validate the four per-mode XGBoost models with default hyper-parameters (params=None).
xgboost_cross_validation_vmf(X4, vmf_4['IMFwp1'],vmf_4['IMFwp2'],vmf_4['IMFwp3'],vmf_4['IMFwp4'],y4, None)

### Optuna Optim

In [None]:
def objective_wp4(trial, data=X4, target=vmf_4['IMFwp1']):
    """Optuna objective for the XGBoost model of mode ``IMFwp1`` on WP4.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp4;
# the result is kept as the IMFwp1 hyper-parameters of WP4.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp4, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_4_1 = study.best_params  # parameters of the best trial
best_trial_4_1

In [None]:
def objective_wp4(trial, data=X4, target=vmf_4['IMFwp2']):
    """Optuna objective for the XGBoost model of mode ``IMFwp2`` on WP4.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp4;
# the result is kept as the IMFwp2 hyper-parameters of WP4.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp4, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_4_2 = study.best_trial.params
# Display the value just computed; best_trial_4_3 does not exist yet at
# this point when the cells run in order.
best_trial_4_2

In [None]:
def objective_wp4(trial, data=X4, target=vmf_4['IMFwp3']):
    """Optuna objective for the XGBoost model of mode ``IMFwp3`` on WP4.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp4;
# the result is kept as the IMFwp3 hyper-parameters of WP4.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp4, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_4_3 = study.best_params  # parameters of the best trial
best_trial_4_3

In [None]:
def objective_wp4(trial, data=X4, target=vmf_4['IMFwp4']):
    """Optuna objective for the XGBoost model of mode ``IMFwp4`` on WP4.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp4;
# the result is kept as the IMFwp4 hyper-parameters of WP4.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp4, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_4_4 = study.best_params  # parameters of the best trial
best_trial_4_4

In [None]:
# Tuned hyper-parameters of WP4, ordered IMFwp1..IMFwp4 (read as params[0]..params[3] by xgboost_cross_validation_vmf).
params_vmf_4 = [best_trial_4_1,best_trial_4_2,best_trial_4_3,best_trial_4_4]


In [None]:
# Re-run the WP4 cross-validation with the tuned hyper-parameters of each mode.
xgboost_cross_validation_vmf(X4, vmf_4['IMFwp1'],vmf_4['IMFwp2'],vmf_4['IMFwp3'],vmf_4['IMFwp4'],y4, params_vmf_4)

## WP5

In [None]:
# WP5 training data: put the target 'wp' last, drop the columns listed in
# to_drop, then separate features and target.
wp5_X = train_wp5[[c for c in train_wp5 if c != "wp"] + ["wp"]].drop(columns=to_drop)
X5 = wp5_X.drop(columns='wp')
y5 = wp5_X['wp']
# Split the target into 4 modes (columns IMFwp1..IMFwp4).
vmf_5 = vmd(y5, 4)

In [None]:
# Baseline for WP5: cross-validate the four per-mode XGBoost models with default hyper-parameters (params=None).
xgboost_cross_validation_vmf(X5, vmf_5['IMFwp1'],vmf_5['IMFwp2'],vmf_5['IMFwp3'],vmf_5['IMFwp4'],y5, None)

### Optuna Optim

In [None]:
def objective_wp5(trial, data=X5, target=vmf_5['IMFwp1']):
    """Optuna objective for the XGBoost model of mode ``IMFwp1`` on WP5.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp5;
# the result is kept as the IMFwp1 hyper-parameters of WP5.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp5, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_5_1 = study.best_params  # parameters of the best trial
best_trial_5_1

In [None]:
def objective_wp5(trial, data=X5, target=vmf_5['IMFwp2']):
    """Optuna objective for the XGBoost model of mode ``IMFwp2`` on WP5.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp5;
# the result is kept as the IMFwp2 hyper-parameters of WP5.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp5, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_5_2 = study.best_params  # parameters of the best trial
best_trial_5_2

In [None]:
def objective_wp5(trial, data=X5, target=vmf_5['IMFwp3']):
    """Optuna objective for the XGBoost model of mode ``IMFwp3`` on WP5.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp5,
# i.e. the IMFwp3 objective when the cells run in order.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp5, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
# Stored under its own name: writing to best_trial_5_2 would overwrite the
# IMFwp2 result and leave best_trial_5_3 undefined for params_vmf_5.
best_trial_5_3 = study.best_trial.params
best_trial_5_3

In [None]:
def objective_wp5(trial, data=X5, target=vmf_5['IMFwp4']):
    """Optuna objective for the XGBoost model of mode ``IMFwp4`` on WP5.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp5,
# i.e. the IMFwp4 objective when the cells run in order.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp5, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
# Stored under its own name: writing to best_trial_5_2 would overwrite the
# IMFwp2 result and leave best_trial_5_4 undefined for params_vmf_5.
best_trial_5_4 = study.best_trial.params
best_trial_5_4

In [None]:
# Tuned hyper-parameters of WP5, ordered IMFwp1..IMFwp4 (read as params[0]..params[3] by xgboost_cross_validation_vmf).
# NOTE(review): requires best_trial_5_3 and best_trial_5_4 to have been assigned by their studies - verify.
params_vmf_5 = [best_trial_5_1,best_trial_5_2,best_trial_5_3,best_trial_5_4]

In [None]:
# Re-run the WP5 cross-validation with the tuned hyper-parameters of each mode.
xgboost_cross_validation_vmf(X5, vmf_5['IMFwp1'],vmf_5['IMFwp2'],vmf_5['IMFwp3'],vmf_5['IMFwp4'],y5, params_vmf_5)

## WP6

In [None]:
# WP6 training data: put the target 'wp' last, drop the columns listed in
# to_drop, then separate features and target.
wp6_X = train_wp6[[c for c in train_wp6 if c != "wp"] + ["wp"]].drop(columns=to_drop)
X6 = wp6_X.drop(columns='wp')
y6 = wp6_X['wp']
# Split the target into 4 modes (columns IMFwp1..IMFwp4).
vmf_6 = vmd(y6, 4)

In [None]:
# Baseline for WP6: cross-validate the four per-mode XGBoost models with default hyper-parameters (params=None).
xgboost_cross_validation_vmf(X6, vmf_6['IMFwp1'],vmf_6['IMFwp2'],vmf_6['IMFwp3'],vmf_6['IMFwp4'],y6, None)

### Optuna Optim

In [None]:
def objective_wp6(trial, data=X6, target=vmf_6['IMFwp1']):
    """Optuna objective for the XGBoost model of mode ``IMFwp1`` on WP6.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp6;
# the result is kept as the IMFwp1 hyper-parameters of WP6.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp6, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_6_1 = study.best_params  # parameters of the best trial
best_trial_6_1

In [None]:
def objective_wp6(trial, data=X6, target=vmf_6['IMFwp2']):
    """Optuna objective for the XGBoost model of mode ``IMFwp2`` on WP6.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp6;
# the result is kept as the IMFwp2 hyper-parameters of WP6.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp6, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_6_2 = study.best_params  # parameters of the best trial
best_trial_6_2

In [None]:
def objective_wp6(trial, data=X6, target=vmf_6['IMFwp3']):
    """Optuna objective for the XGBoost model of mode ``IMFwp3`` on WP6.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp6;
# the result is kept as the IMFwp3 hyper-parameters of WP6.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp6, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_6_3 = study.best_params  # parameters of the best trial
best_trial_6_3

In [None]:
def objective_wp6(trial, data=X6, target=vmf_6['IMFwp4']):
    """Optuna objective for the XGBoost model of mode ``IMFwp4`` on WP6.

    Sets aside 15% of the rows (fixed seed 42) and returns the score that
    ``hyperparametrization_cv`` computes for ``trial`` on the remaining 85%.
    """
    train_x, _test_x, train_y, _test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)
    # hyperparametrization_cv takes (trial, X, y) and cross-validates on its
    # own; passing the held-out part as well does not match its signature.
    return hyperparametrization_cv(trial, train_x, train_y)

In [None]:
# 50-trial Optuna search (lower is better) with the current objective_wp6;
# the result is kept as the IMFwp4 hyper-parameters of WP6.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp6, n_trials=50)
#write_results('Data/Hyperparametrization/xgboost_vmd_50trials.xlsx', 'wp1', study.trials_dataframe())
best_trial_6_4 = study.best_params  # parameters of the best trial
best_trial_6_4

In [None]:
# Tuned hyper-parameters of WP6, ordered IMFwp1..IMFwp4 (read as
# params[0]..params[3] by xgboost_cross_validation_vmf). The second entry is
# the WP6 result (best_trial_6_2), not the WP5 one.
params_vmf_6 = [best_trial_6_1, best_trial_6_2, best_trial_6_3, best_trial_6_4]

In [None]:
# Re-run the WP6 cross-validation with the tuned hyper-parameters of each mode.
xgboost_cross_validation_vmf(X6, vmf_6['IMFwp1'],vmf_6['IMFwp2'],vmf_6['IMFwp3'],vmf_6['IMFwp4'],y6, params_vmf_6)

# XGBoost Predictions

## Functions

In [None]:
# Columns removed from the test frames before prediction.
# NOTE: this rebinds the notebook-level ``to_drop`` and adds 'wp' to it.
to_drop = ['date', 'wd', 'forecast_time', 'forecast', "forecast_dist", 'wp']


def make_prediction_dataset(test, to_drop=to_drop):
    """Build the feature frame of the rows that still need a prediction.

    Keeps the rows whose 'ws', 'u' and 'v' are all present and whose 'wp' is
    missing. For every 'date', only the row with the greatest
    'forecast_time' is retained. The ``to_drop`` columns are removed last.
    """
    has_weather = test[['ws', 'u', 'v']].notna().all(axis=1)
    needs_prediction = test['wp'].isna()
    candidates = test[has_weather & needs_prediction]
    ordered = candidates.sort_values(by=['date', 'forecast_time'], ascending=[True, False])
    # After the sort, the first row of each date is the one to keep.
    one_per_date = ordered.drop_duplicates(subset='date')
    return one_per_date.drop(columns=to_drop)

In [None]:
def make_submission_file_nmodels(lst_X_trains, lst_y_trains, lst_y_vmfs, lst_tests, lst_models, dates):
    """Fit every group of mode models and assemble the prediction table.

    For each group, model ``n`` is fitted on the group's features and on
    mode target ``n``. The predictions of all models of the group on the
    group's test features are summed and limited to the range [0, 1].

    Args:
        lst_X_trains: Training features, one entry per group.
        lst_y_trains: Original training targets, one per group; only used
            for the printed min / max / mean.
        lst_y_vmfs: Per group, the sequence of mode targets.
        lst_tests: Per group, the features to predict.
        lst_models: Per group, the sequence of estimators (``fit`` /
            ``predict``).
        dates: Values of the 'date' column of the returned frame.

    Returns:
        ``(df_predictions, lst_models_trained)``: a DataFrame with 'date'
        and one 'wp<k>' column per group, and the list of fitted groups.

    Raises:
        ValueError: If a group contains no estimator.
    """
    lst_prediction = []
    lst_models_trained = []
    groups = zip(lst_X_trains, lst_y_trains, lst_y_vmfs, lst_tests, lst_models)
    for i, (X, y_true, y_modes, test, model) in enumerate(groups, start=1):
        print(f'--------------Model {i}--------------')
        if not model:
            raise ValueError(f'No estimator supplied for model group {i}')
        for n, estimator in enumerate(model):
            estimator.fit(X, y_modes[n])

        print(f'True:\n\tMin:{min(y_true)}\n\tMax:{max(y_true)}\n\tMean:{y_true.mean()}')

        # The forecast of the group is the sum of its per-mode predictions.
        predictions = sum(estimator.predict(test) for estimator in model)
        print(f'Prediction:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')
        # Limit the summed forecast to [0, 1].
        predictions = [0 if value < 0 else value for value in predictions]
        predictions = [1 if value > 1 else value for value in predictions]
        print(f'Prediction corrected:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')
        lst_prediction.append(predictions)
        lst_models_trained.append(model)

    # Use the ``dates`` argument (not a notebook global) and emit one column
    # per group instead of assuming exactly six groups.
    columns = {'date': dates}
    for k, group_predictions in enumerate(lst_prediction, start=1):
        columns[f'wp{k}'] = group_predictions
    df_predictions = pd.DataFrame(columns)
    return df_predictions, lst_models_trained

### Optimized Algo

In [None]:
# One list of four pipelines per WP dataset, one pipeline per mode, each
# configured with the tuned hyper-parameters of that mode.
# The lists are built from whole Pipeline objects: ``list += Pipeline(...)``
# iterates over the pipeline and adds its individual steps instead.
model_1 = [Pipeline([('scaler', MaxAbsScaler()), ('xgbr', XGBRegressor(**params_vmf_1[i]))]) for i in range(4)]
model_2 = [Pipeline([('scaler', MaxAbsScaler()), ('xgbr', XGBRegressor(**params_vmf_2[i]))]) for i in range(4)]
model_3 = [Pipeline([('scaler', MaxAbsScaler()), ('xgbr', XGBRegressor(**params_vmf_3[i]))]) for i in range(4)]
model_4 = [Pipeline([('scaler', MaxAbsScaler()), ('xgbr', XGBRegressor(**params_vmf_4[i]))]) for i in range(4)]
model_5 = [Pipeline([('scaler', MaxAbsScaler()), ('xgbr', XGBRegressor(**params_vmf_5[i]))]) for i in range(4)]
model_6 = [Pipeline([('scaler', MaxAbsScaler()), ('xgbr', XGBRegressor(**params_vmf_6[i]))]) for i in range(4)]

### Non-Optimized Algo

In [None]:
# One list of four default pipelines per WP dataset, one pipeline per mode.
# Every entry is a separate Pipeline object: sharing one object between the
# six lists would make all of them end up holding the last fitted model.
# The step name 'ridge' is kept unchanged although it wraps an XGBRegressor.
model_1 = [Pipeline([('scaler', MaxAbsScaler()), ('ridge', XGBRegressor())]) for _ in range(4)]
model_2 = [Pipeline([('scaler', MaxAbsScaler()), ('ridge', XGBRegressor())]) for _ in range(4)]
model_3 = [Pipeline([('scaler', MaxAbsScaler()), ('ridge', XGBRegressor())]) for _ in range(4)]
model_4 = [Pipeline([('scaler', MaxAbsScaler()), ('ridge', XGBRegressor())]) for _ in range(4)]
model_5 = [Pipeline([('scaler', MaxAbsScaler()), ('ridge', XGBRegressor())]) for _ in range(4)]
model_6 = [Pipeline([('scaler', MaxAbsScaler()), ('ridge', XGBRegressor())]) for _ in range(4)]

## Submission

In [None]:
# Per WP dataset, the four mode targets in the order IMFwp1..IMFwp4.
y_vmf1 = [vmf_1['IMFwp' + str(i)] for i in range(1, 5)]
y_vmf2 = [vmf_2['IMFwp' + str(i)] for i in range(1, 5)]
y_vmf3 = [vmf_3['IMFwp' + str(i)] for i in range(1, 5)]
y_vmf4 = [vmf_4['IMFwp' + str(i)] for i in range(1, 5)]
y_vmf5 = [vmf_5['IMFwp' + str(i)] for i in range(1, 5)]
y_vmf6 = [vmf_6['IMFwp' + str(i)] for i in range(1, 5)]

In [None]:
# Inputs of make_submission_file_nmodels, all ordered WP1..WP6.
# Per-mode model lists.
lst_models = [model_1, model_2, model_3, model_4, model_5, model_6]
# Training features.
lst_X_trains = [X1, X2, X3, X4, X5, X6]
# Original 'wp' targets.
lst_y_trains = [y1, y2, y3, y4, y5, y6]
# Mode targets.
lst_y_vmf=[y_vmf1, y_vmf2, y_vmf3, y_vmf4, y_vmf5, y_vmf6]

In [None]:
# Feature frames to predict, one per WP dataset, in WP1..WP6 order.
lst_tests = [
    make_prediction_dataset(raw_test)
    for raw_test in (test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6)
]

In [None]:
# Fit all model groups and build the prediction table; the fitted groups are kept alongside it.
df_predictions, lst_models_trained = make_submission_file_nmodels(lst_X_trains, lst_y_trains, lst_y_vmf,lst_tests, lst_models, test_dates)

### Saving

In [None]:
# Labels describing this submission; nb_sub and model are used in the
# name of the saved file.
nb_sub = 39
model = "maxAbsScaler-xgb-vmf"
# Free-text notes about the pre- and post-processing.
prepro = 'RobustScaler'
postpro = "Prediction limited by 0-1"

In [None]:
# Write the prediction table as a ';'-separated CSV without the index column.
df_predictions.to_csv(f'Predictions/submission_nb_{nb_sub}_{model}.csv', index=False, sep=';')

In [None]:
# Show the first rows of the prediction table as the cell output.
df_predictions.head()