# Model development

Ici on fait nos modèles et prédictions. Le mieux c'est de faire des parties par modèles je pense ?

Il faut aussi qu'on trouve une nomenclature pour les modèles si on les enregistre, afin de garder en tête les différents résultats.

In [1]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
from sklearn.model_selection import KFold, train_test_split

from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna

In [3]:
from Functions.helper_functions import * 

In [4]:
import warnings
# Silence every UserWarning and turn off pandas' chained-assignment warning for the notebook.
warnings.filterwarnings(action='ignore', category=UserWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

# Data

In [6]:
# Preprocessed training tables, one CSV per dataset WP1..WP6, read in that order.
_train_csv_paths = (
    'Data/Preprocessing/WP1_train_preprocessed.csv',
    'Data/Preprocessing/WP2_train_preprocessed.csv',
    'Data/Preprocessing/WP3_train_preprocessed.csv',
    'Data/Preprocessing/WP4_train_preprocessed.csv',
    'Data/Preprocessing/WP5_train_preprocessed.csv',
    'Data/Preprocessing/WP6_train_preprocessed.csv',
)
train_wp1, train_wp2, train_wp3, train_wp4, train_wp5, train_wp6 = (
    pd.read_csv(csv_path, sep=',') for csv_path in _train_csv_paths
)

In [None]:
# Preprocessed test tables, one CSV per dataset WP1..WP6, read in that order.
_test_csv_paths = (
    'Data/Preprocessing/WP1_test_preprocessed.csv',
    'Data/Preprocessing/WP2_test_preprocessed.csv',
    'Data/Preprocessing/WP3_test_preprocessed.csv',
    'Data/Preprocessing/WP4_test_preprocessed.csv',
    'Data/Preprocessing/WP5_test_preprocessed.csv',
    'Data/Preprocessing/WP6_test_preprocessed.csv',
)
test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6 = (
    pd.read_csv(csv_path, sep=',') for csv_path in _test_csv_paths
)

# Values of the 'date' column of the initial test file, as a NumPy array.
_initial_test = pd.read_csv('Data/Initial/test.csv', sep=',')
test_dates = _initial_test['date'].values

In [7]:
# Columns removed from each training frame before it is split into features and target.
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

# LGBMs

In [8]:
from lightgbm import LGBMRegressor

In [9]:
def lgbm_cross_validation(X, y, params):
    """Run a shuffled 10-fold cross-validation of an LGBMRegressor and print the scores.

    Args:
        X: Feature table; wrapped in a ``pandas.DataFrame`` before splitting.
        y: Target values; wrapped in a ``pandas.DataFrame`` before splitting.
        params: Keyword arguments for ``LGBMRegressor``, or ``None`` to use the
            library defaults.

    Returns:
        None. The per-fold evaluation and the RMSE / MAE summaries are printed.
    """
    # Identity test: `== None` goes through the object's __eq__, which is not a
    # reliable "no parameters given" check.
    model = LGBMRegressor() if params is None else LGBMRegressor(**params)

    # Wrap the inputs once instead of re-building the frames on every fold.
    features = pd.DataFrame(X)
    targets = pd.DataFrame(y)

    print('-----------LGBM CROSS VALIDATION BEGINNING-----------')
    split = 10
    # No random_state is passed, so the shuffle draws from NumPy's global RNG.
    kf = KFold(n_splits=split, shuffle=True)
    lgbm_rmse_scores = []
    lgbm_mae_scores = []
    for fold, (train_index, test_index) in enumerate(kf.split(features, targets), start=1):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        Y_train, Y_test = targets.iloc[train_index], targets.iloc[test_index]

        # The held-out fold is also handed to LightGBM as its evaluation set.
        model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=100)

        prediction = model.predict(X_test)
        lgbm_rmse_scores.append(mean_squared_error(Y_test, prediction, squared=False))
        lgbm_mae_scores.append(mean_absolute_error(Y_test, prediction))

        print(show_evaluation(prediction, Y_test))
        # The banner closes the fold: it is printed after that fold's evaluation.
        print(f'-------------------FOLD {fold}-----------------')

    print('---------------CROSS VALIDATION COMPLETE-------------')
    print('--------------------------RMSE-----------------------')
    display_scores(lgbm_rmse_scores)
    print('--------------------------MAE------------------------')
    display_scores(lgbm_mae_scores)

In [10]:
def hyperparametrization(trial, train_x, test_x, train_y, test_y):
    """Fit one LGBMRegressor with Optuna-sampled hyper-parameters and return its RMSE.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        train_x: Features the model is fitted on.
        test_x: Features used for early stopping and for the returned score.
        train_y: Targets matching ``train_x``.
        test_y: Targets matching ``test_x``.

    Returns:
        Root mean squared error of the model's predictions on ``test_x``.
    """
    param = {
        'metric': 'rmse',
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
        'max_depth': trial.suggest_int("max_depth", 20, 100),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # Sampled under its LightGBM name so that study.best_trial.params can be
        # passed unchanged to LGBMRegressor(**params). It used to be sampled as
        # 'min_data_per_groups', which is not a LightGBM parameter name.
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100),
    }

    model = LGBMRegressor(**param)

    # The hold-out set drives early stopping (100 rounds without improvement).
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)

    preds = model.predict(test_x)

    rmse = mean_squared_error(test_y, preds,squared=False)

    return rmse

## WP1 

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10344875448880764 | 0.0019070131550065564 |  |
| After tuning 50trials| RMSE | 0.06830085723562579 | 0.0012998671387256361 | To keep, maybe redo optuna with warm start with it |
| After tuning 100trials| RMSE | 0.0695930431604128 | 0.0015123892627707553 |  |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.07306057409517844 | 0.0009857628815465762 |  |
| After tuning 50trials| MAE | 0.04481696427654311 | 0.000727722171899004 |  |
| After tuning 100trials| MAE | 0.045696725242359994 | 0.0008508558100930331 |  |

In [11]:
# Move the target column 'wp' to the last position, then remove the columns listed in to_drop.
train_wp1 = train_wp1[[c for c in train_wp1 if c != "wp"] + ["wp"]]
wp1_X = train_wp1.drop(to_drop, axis = 1)

X_train1, X_test1, y_train1, y_test1 = train_test_split(wp1_X.drop('wp', axis = 1), wp1_X['wp'], test_size=0.20, random_state=42)
# Stack the two parts back together (train rows first). pd.concat replaces
# DataFrame.append / Series.append, which were removed in pandas 2.0.
X1 = pd.concat([X_train1, X_test1])
y1 = pd.concat([y_train1, y_test1])

In [12]:
def objective_wp1(trial,data=X1,target=y1):
    """Optuna objective for WP1: score one trial on a fixed 15% hold-out.

    ``data`` and ``target`` default to the ``X1`` / ``y1`` objects that exist
    when this function is defined. Returns the value produced by
    ``hyperparametrization`` for the trial.
    """
    holdout_split = train_test_split(data, target, test_size=0.15, random_state=42)
    # The split comes back as (train_x, test_x, train_y, test_y), which is the
    # positional order hyperparametrization takes after `trial`.
    return hyperparametrization(trial, *holdout_split)

In [13]:
# lgbm_cross_validation(X1, y1, None)

In [14]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp1, n_trials=100)
# write_results('Data/Hyperparametrization/lgbm_100trials.xlsx', 'wp1', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [15]:
# NOTE: the last entry is keyed 'cat_smooth', the LightGBM parameter this value
# was assigned to during tuning. The study reported it under the Optuna name
# 'min_data_per_groups', which is not a LightGBM parameter name.

# # 100
# params_1 = {
#     'reg_alpha': 0.8314449043001416,
#     'reg_lambda': 9.093012403173608,
#     'colsample_bytree': 0.9,
#     'subsample': 0.4,
#     'learning_rate': 0.2033256175102991,
#     'max_depth': 55,
#     'num_leaves': 964,
#     'min_child_samples': 25,
#     'cat_smooth': 36
# }

#50
params_1 = {
    'reg_alpha': 0.664265743859848,
    'reg_lambda': 9.83047434398735,
    'colsample_bytree': 1.0,
    'subsample': 1.0,
    'learning_rate': 0.24237997149103074,
    'max_depth': 77,
    'num_leaves': 389,
    'min_child_samples': 2,
    'cat_smooth': 75
}

In [16]:
# lgbm_cross_validation(X1, y1, params_1)

## WP2

| |  | Mean | Std | Sum up|
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10935335541057582 | 0.0014425096116734836 | |
| After tuning - 50trials| RMSE | 0.0725081520968898 | 0.0016974702626377217 | |
| After tuning 100trials| RMSE | 0.0707064364904941 | 0.001396820290618349 | More stable, to keep |
| --- | --- | --- | --- |---|
| No params | MAE | 0.07681923856705511 | 0.0008670825615244791 | |
| After tuning - 50trials| MAE | 0.04512164110351975 |  0.0006579433030966575 | |
| After tuning 100trials| MAE | 0.04457902842458915 | 0.0006807155447311589 | |

In [17]:
# Move the target column 'wp' to the last position, then remove the columns listed in to_drop.
train_wp2 = train_wp2[[c for c in train_wp2 if c != "wp"] + ["wp"]]
wp2_X = train_wp2.drop(to_drop, axis = 1)

X_train2, X_test2, y_train2, y_test2 = train_test_split(wp2_X.drop('wp', axis = 1), wp2_X['wp'], test_size=0.20, random_state=42)
# Stack the two parts back together (train rows first). pd.concat replaces
# DataFrame.append / Series.append, which were removed in pandas 2.0.
X2 = pd.concat([X_train2, X_test2])
y2 = pd.concat([y_train2, y_test2])

In [18]:
# def objective_wp2(trial,data = X2,target = y2):
#     train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
#     return hyperparametrization(trial, train_x, test_x, train_y, test_y)

In [19]:
# lgbm_cross_validation(X2, y2, None)

In [20]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp2, n_trials=100)
# write_results('Data/Hyperparametrization/lgbm_100trials.xlsx', 'wp2', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [21]:
# NOTE: the last entry is keyed 'cat_smooth', the LightGBM parameter this value
# was assigned to during tuning. The study reported it under the Optuna name
# 'min_data_per_groups', which is not a LightGBM parameter name.

# 100
params_2 = {
    'reg_alpha': 0.18268883436586145,
    'reg_lambda': 0.15916821051528962,
    'colsample_bytree': 1.0,
    'subsample': 0.6,
    'learning_rate': 0.18007000714755378,
    'max_depth': 77,
    'num_leaves': 425,
    'min_child_samples': 10,
    'cat_smooth': 19
}

# 50
# params_2 = {
#  'reg_alpha': 0.04439450895032273,
#  'reg_lambda': 0.7790968728875318,
#  'colsample_bytree': 0.4,
#  'subsample': 1.0,
#  'learning_rate': 0.09520041095092219,
#  'max_depth': 31,
#  'num_leaves': 883,
#  'min_child_samples': 18,
#  'cat_smooth': 56}

In [22]:
# lgbm_cross_validation(X2, y2, params_2)

## WP3

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10392558077951244 | 0.0019038044796542812 ||
| After tuning - 50trials| RMSE | 0.058253804820626545 | 0.0009893279354834155 | More stable, to keep |
| After tuning 100trials| RMSE | 0.058338944346627106 | 0.0017133930174837203 ||
| --- | --- | --- | --- | --- |
| No params | MAE | 0.07550802464973318 | 0.0012006073434917633 ||
| After tuning - 50trials| MAE | 0.03787310900962521 | 0.000442034368456366 ||
| After tuning 100trials| MAE | 0.03838030476025398 | 0.0007480100565996748 ||

In [23]:
# Move the target column 'wp' to the last position, then remove the columns listed in to_drop.
train_wp3 = train_wp3[[c for c in train_wp3 if c != "wp"] + ["wp"]]
wp3_X = train_wp3.drop(to_drop, axis = 1)

X_train3, X_test3, y_train3, y_test3 = train_test_split(wp3_X.drop('wp', axis = 1), wp3_X['wp'], test_size=0.20, random_state=42)
# Stack the two parts back together (train rows first). pd.concat replaces
# DataFrame.append / Series.append, which were removed in pandas 2.0.
X3 = pd.concat([X_train3, X_test3])
y3 = pd.concat([y_train3, y_test3])

In [24]:
# def objective_wp3(trial,data = X3,target = y3):
#     train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
#     return hyperparametrization(trial, train_x, test_x, train_y, test_y)

In [25]:
# lgbm_cross_validation(X3, y3, None)

In [26]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp3, n_trials=100)
# write_results('Data/Hyperparametrization/lgbm_100trials.xlsx', 'wp3', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [27]:
# NOTE: the last entry is keyed 'cat_smooth', the LightGBM parameter this value
# was assigned to during tuning. The study reported it under the Optuna name
# 'min_data_per_groups', which is not a LightGBM parameter name.

# # 100
# params_3 = {
#     'reg_alpha': 0.26013926149282945,
#     'reg_lambda': 0.002325658512162904,
#     'colsample_bytree': 1.0,
#     'subsample': 0.7,
#     'learning_rate': 0.10619054458258967,
#     'max_depth': 83,
#     'num_leaves': 647,
#     'min_child_samples': 3,
#     'cat_smooth': 24
# }

# 50
params_3 = {
    'reg_alpha': 0.002937356908910416,
    'reg_lambda': 0.003822180117262245,
    'colsample_bytree': 0.8,
    'subsample': 1.0,
    'learning_rate': 0.09489749817678472,
    'max_depth': 41,
    'num_leaves': 842,
    'min_child_samples': 18,
    'cat_smooth': 46
}

In [28]:
# lgbm_cross_validation(X3, y3, params_3)

## WP4

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.10486204816363351 | 0.0015105949978751166 |
| After tuning - 50trials| RMSE | 0.06513233717204232 | 0.0015891617240032727 |
| After tuning 100trials| RMSE | 0.06357594848470964 | 0.0013676749030776929 |
| No params | MAE | 0.07564776733421566 | 0.00104638869825841 |
| After tuning - 50trials| MAE | 0.04219236028055372 | 0.0008190579419060266 |
| After tuning 100trials| MAE |0.04172111697148837  | 0.0009349285385250968 |

In [29]:
# Move the target column 'wp' to the last position, then remove the columns listed in to_drop.
train_wp4 = train_wp4[[c for c in train_wp4 if c != "wp"] + ["wp"]]
wp4_X = train_wp4.drop(to_drop, axis = 1)

X_train4, X_test4, y_train4, y_test4 = train_test_split(wp4_X.drop('wp', axis = 1), wp4_X['wp'], test_size=0.20, random_state=42)
# Stack the two parts back together (train rows first). pd.concat replaces
# DataFrame.append / Series.append, which were removed in pandas 2.0.
X4 = pd.concat([X_train4, X_test4])
y4 = pd.concat([y_train4, y_test4])

In [30]:
def objective_wp4(trial,data = X4,target = y4):
    """Optuna objective for WP4: score one trial on a fixed 15% hold-out.

    ``data`` and ``target`` default to the ``X4`` / ``y4`` objects that exist
    when this function is defined. Returns the value produced by
    ``hyperparametrization`` for the trial.
    """
    holdout_split = train_test_split(data, target, test_size=0.15, random_state=42)
    # The split comes back as (train_x, test_x, train_y, test_y), which is the
    # positional order hyperparametrization takes after `trial`.
    return hyperparametrization(trial, *holdout_split)

In [31]:
# lgbm_cross_validation(X4, y4, None)

In [32]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp4, n_trials=100)
# write_results('Data/Hyperparametrization/lgbm_100trials.xlsx', 'wp4', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [33]:
# NOTE: the last entry is keyed 'cat_smooth', the LightGBM parameter this value
# was assigned to during tuning. The study reported it under the Optuna name
# 'min_data_per_groups', which is not a LightGBM parameter name.

# 100
params_4 = {
    'reg_alpha': 0.08714703614419553,
    'reg_lambda': 9.983645262139024,
    'colsample_bytree': 0.9,
    'subsample': 0.8,
    'learning_rate': 0.13413154768816146,
    'max_depth': 41,
    'num_leaves': 613,
    'min_child_samples': 15,
    'cat_smooth': 29
}

# {
#     'reg_alpha': 0.15331128149569725,
#     'reg_lambda': 0.28560184971009756,
#     'colsample_bytree': 0.7,
#     'subsample': 0.5,
#     'learning_rate': 0.11430869527789024,
#     'max_depth': 24,
#     'num_leaves': 856,
#     'min_child_samples': 14,
#     'cat_smooth': 33
# }

In [34]:
# lgbm_cross_validation(X4, y4, params_4)

## WP5

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.11722129743692011 | 0.0017732599261516583 |
| After tuning - 50trials| RMSE | 0.07721413638593042 | 0.0011020420293213135 |
| After tuning - 100trials| RMSE | 0.07297648991888442 | 0.0014970317509404526 |
| No params | MAE | 0.08497074568090211 | 0.0009101526501392155 |
| After tuning - 50trials| MAE | 0.051677856581467195 | 0.0006374939894477714 |
| After tuning - 100trials| MAE | 0.04765271414503236 | 0.0006257356756510128 |

In [35]:
# Move the target column 'wp' to the last position, then remove the columns listed in to_drop.
train_wp5 = train_wp5[[c for c in train_wp5 if c != "wp"] + ["wp"]]
wp5_X = train_wp5.drop(to_drop, axis = 1)

X_train5, X_test5, y_train5, y_test5 = train_test_split(wp5_X.drop('wp', axis = 1), wp5_X['wp'], test_size=0.20, random_state=42)
# Stack the two parts back together (train rows first). pd.concat replaces
# DataFrame.append / Series.append, which were removed in pandas 2.0.
X5 = pd.concat([X_train5, X_test5])
y5 = pd.concat([y_train5, y_test5])

In [36]:
# def objective_wp5(trial, data = X5,target = y5):
#     train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
#     return hyperparametrization(trial, train_x, test_x, train_y, test_y)

In [37]:
# lgbm_cross_validation(X5, y5, None)

In [38]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp5, n_trials=100)
# write_results('Data/Hyperparametrization/lgbm_100trials.xlsx', 'wp5', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [39]:
# NOTE: the last entry is keyed 'cat_smooth', the LightGBM parameter this value
# was assigned to during tuning. The study reported it under the Optuna name
# 'min_data_per_groups', which is not a LightGBM parameter name.

# 100
params_5 = {
    'reg_alpha': 0.04781362061382749,
    'reg_lambda': 9.716980953182604,
    'colsample_bytree': 0.9,
    'subsample': 0.7,
    'learning_rate': 0.14614317149730652,
    'max_depth': 57,
    'num_leaves': 532,
    'min_child_samples': 7,
    'cat_smooth': 84
}

# # 50
# params_5 = {
#     'reg_alpha': 0.0025641515787025067,
#     'reg_lambda': 0.024580995322705475,
#     'colsample_bytree': 0.8,
#     'subsample': 0.4,
#     'learning_rate': 0.11844862032615265,
#     'max_depth': 69,
#     'num_leaves': 328,
#     'min_child_samples': 62,
#     'cat_smooth': 34
# }

In [40]:
# lgbm_cross_validation(X5, y5, params_5)

## WP6

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.0940394026188472 | 0.0010749562915831372 |
| After tuning - 50trials| RMSE | 0.05404362835213171 | 0.0008595325139047733 |
| After tuning 100trials| RMSE | 0.054861488499908594 | 0.0007335378238383901 |
| No params | MAE | 0.070455643271004 | 0.0006641538274191148 |
| After tuning - 50trials| MAE | 0.03657758274248596 | 0.0005325521314198646 |
| After tuning 100trials| MAE | 0.03783933495157941 | 0.00045956939815828987 |

In [41]:
# Move the target column 'wp' to the last position, then remove the columns listed in to_drop.
train_wp6 = train_wp6[[c for c in train_wp6 if c != "wp"] + ["wp"]]
wp6_X = train_wp6.drop(to_drop, axis = 1)

X_train6, X_test6, y_train6, y_test6 = train_test_split(wp6_X.drop('wp', axis = 1), wp6_X['wp'], test_size=0.20, random_state=42)
# Stack the two parts back together (train rows first). pd.concat replaces
# DataFrame.append / Series.append, which were removed in pandas 2.0.
X6 = pd.concat([X_train6, X_test6])
y6 = pd.concat([y_train6, y_test6])

In [42]:
def objective_wp6(trial,data = X6,target = y6):
    """Optuna objective for WP6: score one trial on a fixed 15% hold-out.

    ``data`` and ``target`` default to the ``X6`` / ``y6`` objects that exist
    when this function is defined. Returns the value produced by
    ``hyperparametrization`` for the trial.
    """
    holdout_split = train_test_split(data, target, test_size=0.15, random_state=42)
    # The split comes back as (train_x, test_x, train_y, test_y), which is the
    # positional order hyperparametrization takes after `trial`.
    return hyperparametrization(trial, *holdout_split)

In [43]:
# lgbm_cross_validation(X6, y6, None)

In [44]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_wp6, n_trials=100)
# write_results('Data/Hyperparametrization/lgbm_100trials.xlsx', 'wp6', study.trials_dataframe())
# best_trial = study.best_trial.params
# best_trial

In [45]:
# NOTE: the last entry is keyed 'cat_smooth', the LightGBM parameter this value
# was assigned to during tuning. The study reported it under the Optuna name
# 'min_data_per_groups', which is not a LightGBM parameter name.

# 100
params_6 = {
    'reg_alpha': 0.23451110075396234,
    'reg_lambda': 0.796705483623135,
    'colsample_bytree': 0.9,
    'subsample': 0.4,
    'learning_rate': 0.1561492653707781,
    'max_depth': 67,
    'num_leaves': 998,
    'min_child_samples': 45,
    'cat_smooth': 48
}

# # 50
# params_6 = {
#     'reg_alpha': 0.11420484028619322,
#     'reg_lambda': 2.6106462927544216,
#     'colsample_bytree': 0.5,
#     'subsample': 0.4,
#     'learning_rate': 0.13579539259861131,
#     'max_depth': 35,
#     'num_leaves': 765,
#     'min_child_samples': 3,
#     'cat_smooth': 29
# }

In [46]:
# lgbm_cross_validation(X6, y6, params_6)

## LGBM Predictions

In [47]:
# Columns removed from a test frame before prediction; unlike the training list this one includes 'wp'.
# NOTE: this rebinds the notebook-wide name `to_drop`.
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist", 'wp']
def make_prediction_dataset(test, to_drop=to_drop):
    """Select the rows of a preprocessed test frame that have to be predicted.

    Keeps the rows where 'ws', 'u' and 'v' are all present and 'wp' is missing,
    retains for every 'date' the row with the largest 'forecast_time', and
    finally removes the columns listed in ``to_drop``.

    Args:
        test: Preprocessed test frame.
        to_drop: Column labels to remove from the result.

    Returns:
        A new DataFrame with one row per remaining date.
    """
    weather_known = test[['ws', 'u', 'v']].notna().all(axis=1)
    target_missing = test['wp'].isna()
    candidates = test[weather_known & target_missing]

    # Dates ascending, forecast_time descending: the first row of each date is
    # the one drop_duplicates keeps.
    ordered = candidates.sort_values(by=['date', 'forecast_time'], ascending=[True, False])
    one_per_date = ordered.drop_duplicates(subset='date')
    return one_per_date.drop(to_drop, axis=1)

In [48]:
def make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, dates):
    """Fit one model per dataset, predict its test set and build the submission table.

    Args:
        lst_X_trains: Training feature tables, one per model.
        lst_y_trains: Training targets, one per model (must provide ``.mean()``).
        lst_tests: Feature tables to predict, one per model.
        lst_models: Estimators exposing ``fit(X, y)`` and ``predict(X)``.
        dates: Values for the 'date' column; same length as each prediction.

    Returns:
        DataFrame with a 'date' column followed by one column per model, named
        'wp1', 'wp2', ... in input order.
    """
    lst_prediction = []
    groups = zip(lst_X_trains, lst_y_trains, lst_tests, lst_models)
    for model_number, (X, y, test, model) in enumerate(groups, start=1):
        print(f'--------------Model {model_number}--------------')
        model.fit(X, y)

        # Computed once per model rather than once per out-of-range prediction.
        y_min, y_max = min(y), max(y)
        print(f'True:\n\tMin:{y_min}\n\tMax:{y_max}\n\tMean:{y.mean()}')

        predictions = np.asarray(model.predict(test))
        print(f'Prediction:\n\tMin:{predictions.min()}\n\tMax:{predictions.max()}\n\tMean:{np.mean(predictions)}')

        # Negative predictions become the smallest training target and
        # predictions above 1 become the largest one; all others are kept.
        predictions = np.where(predictions < 0, y_min, predictions)
        predictions = np.where(predictions > 1, y_max, predictions)
        print(f'Prediction corrected:\n\tMin:{predictions.min()}\n\tMax:{predictions.max()}\n\tMean:{np.mean(predictions)}')
        lst_prediction.append(predictions)

    # Use the `dates` argument (the global `test_dates` was read here before)
    # and emit one column per model instead of exactly six.
    columns = {'date': dates}
    for model_number, predictions in enumerate(lst_prediction, start=1):
        columns[f'wp{model_number}'] = predictions
    return pd.DataFrame(columns)

In [50]:
# One LGBMRegressor per dataset, each configured with that dataset's parameter dict.
lst_models = [
    LGBMRegressor(**selected_params)
    for selected_params in (params_1, params_2, params_3, params_4, params_5, params_6)
]
model_1, model_2, model_3, model_4, model_5, model_6 = lst_models

# Training features and targets, in the same WP1..WP6 order as the models.
lst_X_trains = [X1, X2, X3, X4, X5, X6]
lst_y_trains = [y1, y2, y3, y4, y5, y6]

In [51]:
# Reduce each raw test frame with make_prediction_dataset, keeping WP1..WP6 order.
lst_tests = []
for test in [test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6]:
    test = make_prediction_dataset(test)
    lst_tests.append(test)

In [52]:
# Fit the six models and gather their test-set predictions into one table.
df_predictions = make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, test_dates)

--------------Model 1--------------
True:
	Min:0.0
	Max:0.96
	Mean:0.28459819520757024
Prediction:
	Min:-0.022194848783538994
	Max:0.958845943771023
	Mean:0.29647903856901203
Prediction corrected:
	Min:0.0
	Max:0.958845943771023
	Mean:0.2965935331440954
--------------Model 2--------------
True:
	Min:0.0
	Max:0.966
	Mean:0.25890153769841273
Prediction:
	Min:-0.01725407854145281
	Max:0.9883402029479273
	Mean:0.2519910450757791
Prediction corrected:
	Min:0.0
	Max:0.9883402029479273
	Mean:0.25201789111610734
--------------Model 3--------------
True:
	Min:0.0
	Max:0.989
	Mean:0.26252472527472526
Prediction:
	Min:-0.011775425483345798
	Max:0.9556402894892506
	Mean:0.2880111253562854
Prediction corrected:
	Min:0.0
	Max:0.9556402894892506
	Mean:0.28801720482185034
--------------Model 4--------------
True:
	Min:0.0
	Max:0.992
	Mean:0.2763637820512821
Prediction:
	Min:-0.018291852701813067
	Max:0.9194936750713967
	Mean:0.281148685671947
Prediction corrected:
	Min:0.0
	Max:0.9194936750713967
	Mea

In [53]:
# Write the predictions as a semicolon-separated CSV, without the index column.
df_predictions.to_csv('Predictions/submission_nb_1_full_lgbm.csv', index=False, sep=';')