# Model development

Ici, on construit nos modèles et nos prédictions. Le mieux est de faire une partie par modèle, je pense ?

Il faut aussi qu'on trouve une nomenclature pour les modèles si on les enregistre, afin de garder en tête les différents résultats.

In [1]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
from sklearn.model_selection import KFold, train_test_split

from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna

In [3]:
from Functions.helper_functions import * 

In [4]:
import warnings
# Ignore every warning of category UserWarning for the rest of the session.
warnings.filterwarnings(action='ignore', category=UserWarning)
# Switch off pandas' chained-assignment check (the library default is 'warn').
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

# Data

In [6]:
# Preprocessed training tables, one comma-separated file per "WP" dataset.
_wp_train_paths = (
    'Data/Preprocessing/WP1_train_preprocessed.csv',
    'Data/Preprocessing/WP2_train_preprocessed.csv',
    'Data/Preprocessing/WP3_train_preprocessed.csv',
    'Data/Preprocessing/WP4_train_preprocessed.csv',
    'Data/Preprocessing/WP5_train_preprocessed.csv',
    'Data/Preprocessing/WP6_train_preprocessed.csv',
)
# Read the files in order and bind each frame to its own name.
train_wp1, train_wp2, train_wp3, train_wp4, train_wp5, train_wp6 = (
    pd.read_csv(csv_path, sep=',') for csv_path in _wp_train_paths
)

In [7]:
# Column names removed from each training frame before the feature/target split.
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

# LGBMs

In [8]:
from lightgbm import LGBMRegressor

In [9]:
def lgbm_cross_validation(X, y, params):
    """Cross-validate an LGBMRegressor over 10 shuffled folds and print the scores.

    Args:
        X: Feature table; wrapped in a ``pandas.DataFrame`` before splitting.
        y: Target values; wrapped in a ``pandas.DataFrame`` before splitting.
        params: Keyword arguments forwarded to ``LGBMRegressor``, or ``None``
            to build the estimator with its defaults.

    Returns:
        None. Per-fold metrics and the RMSE / MAE summaries are printed.
    """
    model = LGBMRegressor() if params is None else LGBMRegressor(**params)

    # Convert once up front instead of re-wrapping the inputs on every fold.
    features = pd.DataFrame(X)
    targets = pd.DataFrame(y)

    print('-----------LGBM CROSS VALIDATION BEGINNING-----------')
    split = 10
    # No random_state is passed, so this function does not pin fold membership itself.
    kf = KFold(n_splits=split, shuffle=True)
    lgbm_rmse_scores = []
    lgbm_mae_scores = []
    for fold, (train_index, test_index) in enumerate(kf.split(features, targets), start=1):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        Y_train, Y_test = targets.iloc[train_index], targets.iloc[test_index]

        # NOTE(review): newer LightGBM releases appear to have dropped the
        # `verbose` keyword of fit() in favour of callbacks - confirm against
        # the installed version before upgrading.
        model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=100)

        prediction = model.predict(X_test)
        # Take the square root explicitly so the RMSE does not depend on the
        # `squared=` keyword of mean_squared_error.
        lgbm_rmse_scores.append(np.sqrt(mean_squared_error(Y_test, prediction)))
        lgbm_mae_scores.append(mean_absolute_error(Y_test, prediction))

        # show_evaluation prints its own report; wrapping it in print() only
        # added a stray "None" line after each fold.
        show_evaluation(prediction, Y_test)
        print(f'-------------------FOLD {fold}-----------------')

    print('---------------CROSS VALIDATION COMPLETE-------------')
    print('--------------------------RMSE-----------------------')
    display_scores(lgbm_rmse_scores)
    print('--------------------------MAE------------------------')
    display_scores(lgbm_mae_scores)

In [24]:
def hyperparametrization(trial, train_x, test_x, train_y, test_y):
    """Fit one Optuna-sampled LGBMRegressor candidate and return its validation RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        train_x: Features the candidate is fitted on.
        test_x: Features used for early stopping and for scoring.
        train_y: Targets matching ``train_x``.
        test_y: Targets matching ``test_x``.

    Returns:
        Root mean squared error of the candidate's predictions on ``test_x``.
    """
    param = {
        'metric': 'rmse',
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
        'max_depth': trial.suggest_int("max_depth", 20, 100),
        # LightGBM requires num_leaves > 1, so the search range starts at 2.
        'num_leaves': trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # The trial name matches the model keyword so that
        # study.best_trial.params can be passed straight to LGBMRegressor.
        'cat_smooth': trial.suggest_int('cat_smooth', 1, 100),
    }

    model = LGBMRegressor(**param)

    # NOTE(review): newer LightGBM releases appear to have dropped the
    # `early_stopping_rounds` / `verbose` keywords of fit() in favour of
    # callbacks - confirm against the installed version before upgrading.
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)

    preds = model.predict(test_x)

    # Explicit square root keeps the RMSE independent of the `squared=`
    # keyword of mean_squared_error.
    rmse = np.sqrt(mean_squared_error(test_y, preds))

    return rmse

## WP1 

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.10335401115522524 | 0.0018269668363797113 |
| No params | MAE | 0.07294693686735004 | 0.000991859760708556 |
| After tuning 50trials| RMSE | 0.06830085723562579 | 0.0012998671387256361 |
| After tuning 50trials| MAE | 0.04481696427654311 | 0.000727722171899004 |

In [11]:
# Move the target column "wp" to the last position, then drop the unused columns.
train_wp1 = train_wp1[[c for c in train_wp1 if c not in ["wp"]] + ["wp"]]
wp1_X = train_wp1.drop(to_drop, axis = 1)

X_train1, X_test1, y_train1, y_test1 = train_test_split(wp1_X.drop('wp', axis = 1), wp1_X['wp'], test_size=0.20, random_state=42)
# Stack the WP1 train and test parts back together (train rows first).
# The original referenced X_train2 / y_train2 here, i.e. the WP2 split.
# pd.concat replaces .append, which recent pandas versions no longer provide.
X1 = pd.concat([X_train1, X_test1])
y1 = pd.concat([y_train1, y_test1])

In [12]:
def objective_wp1(trial, data=X1, target=y1):
    """Optuna objective for WP1: hold out 15% of the data and score one trial on it."""
    # Fixed random_state, so every trial is scored on the same hold-out rows.
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )
    score = hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)
    return score

In [13]:
# Baseline for WP1: params=None makes the function build LGBMRegressor with its defaults.
lgbm_cross_validation(X=X1, y=y1, params=None)

-----------LGBM CROSS VALIDATION BEGINNING-----------
[100]	valid_0's l2: 0.0105542
RMSE score: 0.1027337832371307
MAE score: 0.07285847370568564
None
-------------------FOLD 1-----------------
[100]	valid_0's l2: 0.0106684
RMSE score: 0.10328794524554154
MAE score: 0.07310300492057431
None
-------------------FOLD 2-----------------
[100]	valid_0's l2: 0.0106743
RMSE score: 0.1033164967104661
MAE score: 0.07308431358003745
None
-------------------FOLD 3-----------------
[100]	valid_0's l2: 0.0103395
RMSE score: 0.10168330262192399
MAE score: 0.07332568219013001
None
-------------------FOLD 4-----------------
[100]	valid_0's l2: 0.0099116
RMSE score: 0.09955704092593376
MAE score: 0.07082081105397789
None
-------------------FOLD 5-----------------
[100]	valid_0's l2: 0.0105422
RMSE score: 0.10267499969955379
MAE score: 0.07212390267283442
None
-------------------FOLD 6-----------------
[100]	valid_0's l2: 0.0107644
RMSE score: 0.10375140144915475
MAE score: 0.07210930288582018
None
----

In [14]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp1, n_trials=100)
# study.trials_dataframe().to_excel('Data/Hyperparametrization/lgbm_wp1_100trials.xlsx', index=False)
# best_trial = study.best_trial.params
# best_trial

In [15]:
# Tuned LGBMRegressor keyword arguments for WP1.
# The last key was recorded as 'min_data_per_groups' (the Optuna trial name),
# but the tuning code passed that value to the model as 'cat_smooth'.
params_1 = {
    'reg_alpha': 0.664265743859848,
    'reg_lambda': 9.83047434398735,
    'colsample_bytree': 1.0,
    'subsample': 1.0,
    'learning_rate': 0.24237997149103074,
    'max_depth': 77,
    'num_leaves': 389,
    'min_child_samples': 2,
    'cat_smooth': 75,
}

In [16]:
# Cross-validate WP1 again, this time with the tuned parameter dict.
lgbm_cross_validation(X=X1, y=y1, params=params_1)

-----------LGBM CROSS VALIDATION BEGINNING-----------
[100]	valid_0's l2: 0.00497808
RMSE score: 0.07055548799734954
MAE score: 0.04634581387801592
None
-------------------FOLD 1-----------------
[100]	valid_0's l2: 0.00475082
RMSE score: 0.06892615813699253
MAE score: 0.04441365688959473
None
-------------------FOLD 2-----------------
[100]	valid_0's l2: 0.00465164
RMSE score: 0.06820293783972789
MAE score: 0.044789601412267815
None
-------------------FOLD 3-----------------
[100]	valid_0's l2: 0.00470489
RMSE score: 0.06859221873541076
MAE score: 0.04484449341732311
None
-------------------FOLD 4-----------------
[100]	valid_0's l2: 0.00463465
RMSE score: 0.06807824945906427
MAE score: 0.044879790363565104
None
-------------------FOLD 5-----------------
[100]	valid_0's l2: 0.00452369
RMSE score: 0.06725837870970279
MAE score: 0.044616574221668696
None
-------------------FOLD 6-----------------
[100]	valid_0's l2: 0.00481751
RMSE score: 0.06940831934294442
MAE score: 0.045176067824875

## WP2

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.10935335541057582 | 0.0014425096116734836 |
| No params | MAE | 0.07681923856705511 | 0.0008670825615244791 |
| After tuning - 50trials| RMSE | 0.06970100668304041 | 0.001166583796670067 |
| After tuning - 50trials| MAE | 0.04421560437745413 | 0.0005536593190500475 |

In [18]:
# Move the target column "wp" to the last position, then drop the unused columns.
train_wp2 = train_wp2[[c for c in train_wp2 if c not in ["wp"]] + ["wp"]]
wp2_X = train_wp2.drop(to_drop, axis = 1)

X_train2, X_test2, y_train2, y_test2 = train_test_split(wp2_X.drop('wp', axis = 1), wp2_X['wp'], test_size=0.20, random_state=42)
# Stack the train and test parts back together (train rows first).
# pd.concat replaces .append, which recent pandas versions no longer provide.
X2 = pd.concat([X_train2, X_test2])
y2 = pd.concat([y_train2, y_test2])

In [19]:
def objective_wp2(trial, data=X2, target=y2):
    """Optuna objective for WP2: hold out 15% of the data and score one trial on it."""
    # Fixed random_state, so every trial is scored on the same hold-out rows.
    split_parts = train_test_split(data, target, test_size=0.15, random_state=42)
    fit_x, holdout_x, fit_y, holdout_y = split_parts
    return hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)

In [42]:
# Baseline for WP2: params=None makes the function build LGBMRegressor with its defaults.
lgbm_cross_validation(X=X2, y=y2, params=None)

-----------LGBM CROSS VALIDATION BEGINNING-----------
[100]	valid_0's l2: 0.012581
RMSE score: 0.11216513130764552
MAE score: 0.07809914701092714
None
-------------------FOLD 1-----------------
[100]	valid_0's l2: 0.011739
RMSE score: 0.1083466051043113
MAE score: 0.07635896828680372
None
-------------------FOLD 2-----------------
[100]	valid_0's l2: 0.0117763
RMSE score: 0.10851881061196905
MAE score: 0.07655464613519548
None
-------------------FOLD 3-----------------
[100]	valid_0's l2: 0.0116526
RMSE score: 0.10794712265054236
MAE score: 0.076153935787008
None
-------------------FOLD 4-----------------
[100]	valid_0's l2: 0.0115482
RMSE score: 0.10746244909528826
MAE score: 0.07586833563169988
None
-------------------FOLD 5-----------------
[100]	valid_0's l2: 0.0119061
RMSE score: 0.10911507746057371
MAE score: 0.07610085695869068
None
-------------------FOLD 6-----------------
[100]	valid_0's l2: 0.0121253
RMSE score: 0.11011478009743106
MAE score: 0.07714359630288058
None
-------

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp2, n_trials=50)
# write_results('Data/Hyperparametrization/lgbm_50trials.xlsx', 'wp2', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [30]:
# Tuned LGBMRegressor keyword arguments for WP2.
# The last key was recorded as 'min_data_per_groups' (the Optuna trial name),
# but the tuning code passed that value to the model as 'cat_smooth'.
params_2 = {
    'reg_alpha': 0.04439450895032273,
    'reg_lambda': 0.7790968728875318,
    'colsample_bytree': 0.4,
    'subsample': 1.0,
    'learning_rate': 0.09520041095092219,
    'max_depth': 31,
    'num_leaves': 883,
    'min_child_samples': 18,
    'cat_smooth': 56,
}

In [None]:
# Cross-validate WP2 again, this time with the tuned parameter dict.
lgbm_cross_validation(X=X2, y=y2, params=params_2)

-----------LGBM CROSS VALIDATION BEGINNING-----------


## WP3

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.10392558077951244 | 0.0019038044796542812 |
| No params | MAE | 0.07550802464973318 | 0.0012006073434917633 |
| After tuning - 50trials| RMSE | 0.058253804820626545 | 0.0009893279354834155 |
| After tuning - 50trials| MAE | 0.03787310900962521 | 0.000442034368456366 |

In [32]:
# Move the target column "wp" to the last position, then drop the unused columns.
train_wp3 = train_wp3[[c for c in train_wp3 if c not in ["wp"]] + ["wp"]]
wp3_X = train_wp3.drop(to_drop, axis = 1)

X_train3, X_test3, y_train3, y_test3 = train_test_split(wp3_X.drop('wp', axis = 1), wp3_X['wp'], test_size=0.20, random_state=42)
# Stack the train and test parts back together (train rows first).
# pd.concat replaces .append, which recent pandas versions no longer provide.
X3 = pd.concat([X_train3, X_test3])
y3 = pd.concat([y_train3, y_test3])

In [33]:
def objective_wp3(trial, data=X3, target=y3):
    """Optuna objective for WP3: hold out 15% of the data and score one trial on it."""
    # Fixed random_state, so every trial is scored on the same hold-out rows.
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )
    return hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)

In [39]:
# Baseline for WP3: params=None makes the function build LGBMRegressor with its defaults.
lgbm_cross_validation(X=X3, y=y3, params=None)

-----------LGBM CROSS VALIDATION BEGINNING-----------
[100]	valid_0's l2: 0.0101759
RMSE score: 0.10087541934048462
MAE score: 0.07457730117952695
None
-------------------FOLD 1-----------------
[100]	valid_0's l2: 0.0108271
RMSE score: 0.1040535900745228
MAE score: 0.0749764961707013
None
-------------------FOLD 2-----------------
[100]	valid_0's l2: 0.0113774
RMSE score: 0.10666484815226834
MAE score: 0.07663574251973151
None
-------------------FOLD 3-----------------
[100]	valid_0's l2: 0.010356
RMSE score: 0.10176422177215298
MAE score: 0.0737619231132376
None
-------------------FOLD 4-----------------
[100]	valid_0's l2: 0.0102758
RMSE score: 0.10136977412348909
MAE score: 0.07331261317956127
None
-------------------FOLD 5-----------------
[100]	valid_0's l2: 0.0111312
RMSE score: 0.10550435372712953
MAE score: 0.07671797145858211
None
-------------------FOLD 6-----------------
[100]	valid_0's l2: 0.0107609
RMSE score: 0.10373480008995818
MAE score: 0.07659805954759628
None
------

In [38]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp3, n_trials=50)
# write_results('Data/Hyperparametrization/lgbm_50trials.xlsx', 'wp3', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [40]:
# Tuned LGBMRegressor keyword arguments for WP3.
# The last key was recorded as 'min_data_per_groups' (the Optuna trial name),
# but the tuning code passed that value to the model as 'cat_smooth'.
params_3 = {
    'reg_alpha': 0.002937356908910416,
    'reg_lambda': 0.003822180117262245,
    'colsample_bytree': 0.8,
    'subsample': 1.0,
    'learning_rate': 0.09489749817678472,
    'max_depth': 41,
    'num_leaves': 842,
    'min_child_samples': 18,
    'cat_smooth': 46,
}

In [41]:
# Cross-validate WP3 again, this time with the tuned parameter dict.
lgbm_cross_validation(X=X3, y=y3, params=params_3)

-----------LGBM CROSS VALIDATION BEGINNING-----------
[100]	valid_0's l2: 0.00347678
RMSE score: 0.05896421336810876
MAE score: 0.038357187179331176
None
-------------------FOLD 1-----------------
[100]	valid_0's l2: 0.00335425
RMSE score: 0.0579158663790702
MAE score: 0.03773183429418428
None
-------------------FOLD 2-----------------
[100]	valid_0's l2: 0.00321283
RMSE score: 0.05668179832213056
MAE score: 0.0373643965891432
None
-------------------FOLD 3-----------------
[100]	valid_0's l2: 0.003323
RMSE score: 0.05764545793595014
MAE score: 0.03720880596082158
None
-------------------FOLD 4-----------------
[100]	valid_0's l2: 0.00329718
RMSE score: 0.057421086443220845
MAE score: 0.037892180335313
None
-------------------FOLD 5-----------------
[100]	valid_0's l2: 0.00353375
RMSE score: 0.059445374095947544
MAE score: 0.03831585330353137
None
-------------------FOLD 6-----------------
[100]	valid_0's l2: 0.00340596
RMSE score: 0.05836064259791145
MAE score: 0.03780532752623225
Non

## WP4

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE |  |  |
| No params | MAE |  |  |
| After tuning - 50trials| RMSE |  | |
| After tuning - 50trials| MAE |  |  |

In [None]:
# Move the target column "wp" to the last position, then drop the unused columns.
train_wp4 = train_wp4[[c for c in train_wp4 if c not in ["wp"]] + ["wp"]]
wp4_X = train_wp4.drop(to_drop, axis = 1)

X_train4, X_test4, y_train4, y_test4 = train_test_split(wp4_X.drop('wp', axis = 1), wp4_X['wp'], test_size=0.20, random_state=42)
# Stack the train and test parts back together (train rows first).
# pd.concat replaces .append, which recent pandas versions no longer provide.
X4 = pd.concat([X_train4, X_test4])
y4 = pd.concat([y_train4, y_test4])

In [None]:
def objective_wp4(trial, data=X4, target=y4):
    """Optuna objective for WP4: hold out 15% of the data and score one trial on it."""
    # Fixed random_state, so every trial is scored on the same hold-out rows.
    split_parts = train_test_split(data, target, test_size=0.15, random_state=42)
    fit_x, holdout_x, fit_y, holdout_y = split_parts
    return hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)

In [None]:
# Baseline for WP4: params=None makes the function build LGBMRegressor with its defaults.
lgbm_cross_validation(X=X4, y=y4, params=None)

In [None]:
# Create a study that minimises the value returned by the objective (an RMSE).
study = optuna.create_study(direction='minimize')
# Evaluate 50 sampled hyperparameter sets with the WP4 objective.
study.optimize(objective_wp4, n_trials=50)
# Pass the per-trial table to write_results with the workbook path and the 'wp4' label.
# NOTE(review): write_results is not defined in this notebook (presumably from
# Functions.helper_functions) - confirm how it uses the 'wp4' argument.
write_results('Data/Hyperparametrization/lgbm_50trials.xlsx', 'wp4', study.trials_dataframe())
# Keep the best trial's parameter dict and show it as the cell output.
best_trial = study.best_trial.params
best_trial

In [None]:
# The original cell was an unfinished assignment (a syntax error).
# Build the WP4 parameters from `best_trial`, the dict produced by the Optuna
# study in the preceding cell. Any 'min_data_per_groups' key is renamed to
# 'cat_smooth', the model keyword that value is tuned for.
params_4 = {
    ('cat_smooth' if name == 'min_data_per_groups' else name): value
    for name, value in best_trial.items()
}

In [None]:
# Cross-validate WP4 again, this time with the tuned parameter dict.
lgbm_cross_validation(X=X4, y=y4, params=params_4)

## WP5

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE |  |  |
| No params | MAE |  |  |
| After tuning - 50trials| RMSE |  | |
| After tuning - 50trials| MAE |  |  |

In [None]:
# Move the target column "wp" to the last position, then drop the unused columns.
train_wp5 = train_wp5[[c for c in train_wp5 if c not in ["wp"]] + ["wp"]]
wp5_X = train_wp5.drop(to_drop, axis = 1)

X_train5, X_test5, y_train5, y_test5 = train_test_split(wp5_X.drop('wp', axis = 1), wp5_X['wp'], test_size=0.20, random_state=42)
# Stack the train and test parts back together (train rows first).
# pd.concat replaces .append, which recent pandas versions no longer provide.
X5 = pd.concat([X_train5, X_test5])
y5 = pd.concat([y_train5, y_test5])

In [None]:
def objective_wp5(trial, data=X5, target=y5):
    """Optuna objective for WP5: hold out 15% of the data and score one trial on it."""
    # Fixed random_state, so every trial is scored on the same hold-out rows.
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )
    score = hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)
    return score

In [None]:
# Baseline for WP5: params=None makes the function build LGBMRegressor with its defaults.
lgbm_cross_validation(X=X5, y=y5, params=None)

In [None]:
# Create a study that minimises the value returned by the objective (an RMSE).
study = optuna.create_study(direction='minimize')
# Evaluate 50 sampled hyperparameter sets with the WP5 objective.
study.optimize(objective_wp5, n_trials=50)
# Pass the per-trial table to write_results with the workbook path and the 'wp5' label.
# NOTE(review): write_results is not defined in this notebook (presumably from
# Functions.helper_functions) - confirm how it uses the 'wp5' argument.
write_results('Data/Hyperparametrization/lgbm_50trials.xlsx', 'wp5', study.trials_dataframe())
# Keep the best trial's parameter dict and show it as the cell output.
best_trial = study.best_trial.params
best_trial

In [None]:
# The original cell was an unfinished assignment (a syntax error).
# Build the WP5 parameters from `best_trial`, the dict produced by the Optuna
# study in the preceding cell. Any 'min_data_per_groups' key is renamed to
# 'cat_smooth', the model keyword that value is tuned for.
params_5 = {
    ('cat_smooth' if name == 'min_data_per_groups' else name): value
    for name, value in best_trial.items()
}

In [None]:
# Cross-validate WP5 again, this time with the tuned parameter dict.
lgbm_cross_validation(X=X5, y=y5, params=params_5)

## WP6

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE |  |  |
| No params | MAE |  |  |
| After tuning - 50trials| RMSE |  | |
| After tuning - 50trials| MAE |  |  |

In [None]:
# Move the target column "wp" to the last position, then drop the unused columns.
train_wp6 = train_wp6[[c for c in train_wp6 if c not in ["wp"]] + ["wp"]]
wp6_X = train_wp6.drop(to_drop, axis = 1)

X_train6, X_test6, y_train6, y_test6 = train_test_split(wp6_X.drop('wp', axis = 1), wp6_X['wp'], test_size=0.20, random_state=42)
# Stack the train and test parts back together (train rows first).
# pd.concat replaces .append, which recent pandas versions no longer provide.
X6 = pd.concat([X_train6, X_test6])
y6 = pd.concat([y_train6, y_test6])

In [None]:
def objective_wp6(trial, data=X6, target=y6):
    """Optuna objective for WP6: hold out 15% of the data and score one trial on it."""
    # Fixed random_state, so every trial is scored on the same hold-out rows.
    split_parts = train_test_split(data, target, test_size=0.15, random_state=42)
    fit_x, holdout_x, fit_y, holdout_y = split_parts
    return hyperparametrization(trial, fit_x, holdout_x, fit_y, holdout_y)

In [None]:
# Baseline for WP6: params=None makes the function build LGBMRegressor with its defaults.
lgbm_cross_validation(X=X6, y=y6, params=None)

In [None]:
# Create a study that minimises the value returned by the objective (an RMSE).
study = optuna.create_study(direction='minimize')
# Evaluate 50 sampled hyperparameter sets with the WP6 objective.
study.optimize(objective_wp6, n_trials=50)
# Pass the per-trial table to write_results with the workbook path and the 'wp6' label.
# NOTE(review): write_results is not defined in this notebook (presumably from
# Functions.helper_functions) - confirm how it uses the 'wp6' argument.
write_results('Data/Hyperparametrization/lgbm_50trials.xlsx', 'wp6', study.trials_dataframe())
# Keep the best trial's parameter dict and show it as the cell output.
best_trial = study.best_trial.params
best_trial

In [None]:
# The original cell was an unfinished assignment (a syntax error).
# Build the WP6 parameters from `best_trial`, the dict produced by the Optuna
# study in the preceding cell. Any 'min_data_per_groups' key is renamed to
# 'cat_smooth', the model keyword that value is tuned for.
params_6 = {
    ('cat_smooth' if name == 'min_data_per_groups' else name): value
    for name, value in best_trial.items()
}

In [None]:
# Cross-validate WP6 again, this time with the tuned parameter dict.
lgbm_cross_validation(X=X6, y=y6, params=params_6)