# Model development

Ici on fait nos modèles et prédictions. Le mieux c'est de faire des parties par modèles je pense ?

Il faut aussi qu'on trouve un nomenclature pour les modèles si on les enregistre, afin de garder en tête les différents résultats

In [1]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle

In [2]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import PolynomialFeatures

import optuna

In [3]:
from Functions.helper_functions import * 

In [4]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
np.random.seed(42)

# Data

In [6]:
train_wp1 = pd.read_csv('Data/Preprocessing/WP1_train_preprocessed.csv', sep=',')
train_wp2 = pd.read_csv('Data/Preprocessing/WP2_train_preprocessed.csv', sep=',')
train_wp3 = pd.read_csv('Data/Preprocessing/WP3_train_preprocessed.csv', sep=',')
train_wp4 = pd.read_csv('Data/Preprocessing/WP4_train_preprocessed.csv', sep=',')
train_wp5 = pd.read_csv('Data/Preprocessing/WP5_train_preprocessed.csv', sep=',')
train_wp6 = pd.read_csv('Data/Preprocessing/WP6_train_preprocessed.csv', sep=',')

In [7]:
test_wp1 = pd.read_csv('Data/Preprocessing/WP1_test_preprocessed.csv', sep=',')
test_wp2 = pd.read_csv('Data/Preprocessing/WP2_test_preprocessed.csv', sep=',')
test_wp3 = pd.read_csv('Data/Preprocessing/WP3_test_preprocessed.csv', sep=',')
test_wp4 = pd.read_csv('Data/Preprocessing/WP4_test_preprocessed.csv', sep=',')
test_wp5 = pd.read_csv('Data/Preprocessing/WP5_test_preprocessed.csv', sep=',')
test_wp6 = pd.read_csv('Data/Preprocessing/WP6_test_preprocessed.csv', sep=',')
test_dates = pd.read_csv('Data/Initial/test.csv', sep=',').date.values

In [8]:
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

In [9]:
u_to_drop = [
    'u_T_1', 'u_T_2', 'u_T_3', 'u_T_4', 'u_T_5', 'u_T_6', 
    'u_T_2_mean', 'u_T_3_mean', 'u_T_4_mean', 'u_T_5_mean', 'u_T_6_mean', 'u_T_7_mean',
    'u_T_8_mean', 'u_T_9_mean', 'u_T_10_mean', 'u_T_11_mean', 'u_T_12_mean','u_T_24_mean',
    'u_T_2_std', 'u_T_4_std', 'u_T_5_std', 'u_T_6_std',
    'u_T_2_median', 'u_T_3_median', 'u_T_4_median', 'u_T_5_median', 'u_T_6_median', 'u_T_12_median','u_T_24_median', 'u_T_36_median',
    'u_T_2_max', 'u_T_3_max', 'u_T_4_max', 'u_T_5_max', 'u_T_6_max', 'u_T_12_max',
    'u_T_2_min', 'u_T_3_min', 'u_T_4_min', 'u_T_5_min', 'u_T_6_min', 'u_T_12_min',
    'u2_T_1', 'u2_T_2', 'u2_T_3', 'u2_T_4', 'u2_T_5', 'u2_T_6', 
    'u2_T_2_mean', 'u2_T_3_mean', 'u2_T_4_mean', 'u2_T_5_mean', 'u2_T_6_mean', 'u2_T_7_mean',
    'u2_T_8_mean', 'u2_T_9_mean', 'u2_T_10_mean', 'u2_T_11_mean', 'u2_T_12_mean','u2_T_24_mean',
    'u2_T_2_std', 'u2_T_4_std', 'u2_T_5_std', 'u2_T_6_std', 'u2_T_24_std',
    'u2_T_2_median', 'u2_T_3_median', 'u2_T_4_median', 'u2_T_5_median', 'u2_T_6_median', 'u2_T_12_median',
    'u2_T_2_max','u2_T_3_max', 'u2_T_4_max','u2_T_5_max', 'u2_T_6_max', 'u2_T_12_max',
    'u2_T_2_min', 'u2_T_3_min', 'u2_T_4_min', 'u2_T_5_min', 'u2_T_6_min',
    'u2_T_12', 'u2_T_36_mean', 'u2_T_36_std', 'u2_T_24_median', 'u2_T_24_max',
    'u_T_36_mean','u_T_12','u_T_24_max','u2_T_36_median','u_T_24_min'
]
ws_to_drop = [
    'ws_T_1', 'ws_T_2', 'ws_T_3', 'ws_T_4', 'ws_T_5', 'ws_T_6', 'ws_T_7', 'ws_T_8', 'ws_T_10','ws_T_11', 'ws_T_12',
    'ws_T_2_mean', 'ws_T_3_mean', 'ws_T_4_mean', 'ws_T_5_mean', 'ws_T_6_mean', 'ws_T_7_mean', 'ws_T_8_mean', 'ws_T_9_mean', 
    'ws_T_10_mean', 'ws_T_11_mean', 'ws_T_12_mean', 'ws_T_24_mean', 
    'ws_T_2_std', 'ws_T_3_std', 'ws_T_4_std', 'ws_T_5_std', 
    'ws_T_2_median', 'ws_T_3_median', 'ws_T_4_median', 'ws_T_5_median', 'ws_T_6_median',
    'ws_T_12_median', 'ws_T_24_median', 'ws_T_36_median',
    'ws_T_2_max', 'ws_T_3_max', 'ws_T_4_max', 'ws_T_5_max','ws_T_6_max', 'ws_T_12_max',
     'ws_T_2_min', 'ws_T_3_min', 'ws_T_4_min', 'ws_T_5_min', 'ws_T_6_min', 'ws_T_12_min','ws_T_24_max','ws_T_24_min'
]

v_to_drop = [
    'v_T_1', 'v_T_2', 'v_T_3', 'v_T_4', 'v_T_5', 'v_T_6', 
    'v_T_2_mean', 'v_T_3_mean', 'v_T_4_mean', 'v_T_5_mean', 'v_T_6_mean', 'v_T_7_mean',
    'v_T_8_mean', 'v_T_9_mean', 'v_T_10_mean', 'v_T_11_mean', 'v_T_12_mean', 'v_T_24_mean','v_T_36_mean',
    'v_T_3_std', 'v_T_4_std', 'v_T_5_std','v_T_6_std','v_T_24_std', 'v_T_36_median',
    'v_T_2_median', 'v_T_3_median', 'v_T_4_median', 'v_T_5_median', 'v_T_6_median', 
    'v_T_2_max', 'v_T_3_max', 'v_T_4_max', 'v_T_5_max', 'v_T_6_max', 'v_T_12_max', 
    'v_T_2_min', 'v_T_3_min', 'v_T_4_min', 'v_T_5_min', 'v_T_6_min', 'v_T_12_min', 
    'v_T_36_min', 'v_T_36', 'v_T_24_max',  'v_T_12_median', 'v_T_24_median',
]

wd_to_drop = [
    'coswd_1', 'coswd_2', 'coswd_3', 'coswd_4', 'coswd_5', 'coswd_6',
    'coswd_2_mean', 'coswd_3_mean', 'coswd_4_mean', 'coswd_5_mean', 'coswd_6_mean', 'coswd_7_mean', 
    'coswd_8_mean', 'coswd_9_mean', 'coswd_10_mean', 'coswd_11_mean', 'coswd_12_mean', 'coswd_24_mean', 
    'coswd_3_std', 'coswd_4_std','coswd_5_std','coswd_2_median', 'coswd_3_median','coswd_4_median', 
    'coswd_5_median', 'coswd_6_median', 'coswd_36_median', 'coswd_24_median', 'coswd_12_median',
    'coswd_2_max', 'coswd_3_max', 'coswd_4_max', 'coswd_5_max', 'coswd_6_max', 'coswd_12_max', 'coswd_24_max',
    'coswd_2_min', 'coswd_3_min', 'coswd_4_min', 'coswd_5_min', 'coswd_6_min', 'coswd_12_min', 'coswd_24_min',
    'ws_T_36_max', 'ws_T_36_min', 'coswd_12', 'coswd_24'
]

other_to_drop = [
    'cos_day', 'u', 'v'
]

feature_corr = u_to_drop+ws_to_drop+v_to_drop+wd_to_drop+other_to_drop
to_drop = feature_corr+to_drop

# RidgeCV

In [10]:
def ridge_cross_validation(X, y, scaler):
    if scaler == 'minmax':
        model = Pipeline([('scaler', MinMaxScaler()),('ridge', RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1]))])
    else:
        model = Pipeline([('scaler', StandardScaler()),('ridge', RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1]))])

    print('-----------RIDGECV CROSS VALIDATION BEGINNING-----------')
    split = 10
    kf = KFold(n_splits=split, shuffle=True)       
    ridge_rmse_scores = []
    ridge_mae_scores = []
    i = 1
    for (train_index, test_index) in kf.split(pd.DataFrame(X), pd.DataFrame(y)):
        X_train, X_test = pd.DataFrame(X).iloc[train_index], pd.DataFrame(X).iloc[test_index]
        Y_train, Y_test = pd.DataFrame(y).iloc[train_index],pd.DataFrame(y).iloc[test_index]

        model.fit(X_train, Y_train)
        
        prediction = model.predict(X_test)
        prediction = [0 if i < 0 else i for i in prediction]
        prediction = [Y_test.wp.max() if i > Y_test.wp.max() else i for i in prediction]
        
        ridge_rmse_scores.append(mean_squared_error(Y_test, prediction,squared=False))
        ridge_mae_scores.append(mean_absolute_error(Y_test, prediction))
        
        print(show_evaluation(prediction, Y_test))
        print(f'-------------------FOLD {i}-----------------')
        i+=1

    print('---------------CROSS VALIDATION COMPLETE-------------')
    print('--------------------------RMSE-----------------------')
    display_scores(ridge_rmse_scores)
    print('--------------------------MAE------------------------')
    display_scores(ridge_mae_scores)

## WP1 

In [11]:
wp1_X = train_wp1[[c for c in train_wp1 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X1 = wp1_X.drop('wp', axis=1)
y1 = wp1_X['wp']

In [12]:
ridge_cross_validation(X1, y1, 'minmax')

-----------RIDGECV CROSS VALIDATION BEGINNING-----------
RMSE score: 0.15714193061179552
MAE score: 0.11716140359583194
None
-------------------FOLD 1-----------------
RMSE score: 0.15639769397558537
MAE score: 0.11729927677637784
None
-------------------FOLD 2-----------------
RMSE score: 0.15578578239613716
MAE score: 0.1161580638565696
None
-------------------FOLD 3-----------------
RMSE score: 0.15824561727426106
MAE score: 0.11881950200698262
None
-------------------FOLD 4-----------------
RMSE score: 0.1575995842317239
MAE score: 0.11765013545543963
None
-------------------FOLD 5-----------------
RMSE score: 0.15539547474615323
MAE score: 0.11531249529876327
None
-------------------FOLD 6-----------------
RMSE score: 0.1591913852686654
MAE score: 0.11857716735661045
None
-------------------FOLD 7-----------------
RMSE score: 0.1564258281314072
MAE score: 0.11672662727844012
None
-------------------FOLD 8-----------------
RMSE score: 0.16177848210129075
MAE score: 0.12171528888361

## WP2

In [13]:
wp2_X = train_wp2[[c for c in train_wp2 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X2 = wp2_X.drop('wp', axis=1)
y2 = wp2_X['wp']

In [14]:
ridge_cross_validation(X2, y2, 'minmax')

-----------RIDGECV CROSS VALIDATION BEGINNING-----------
RMSE score: 0.1643467031691214
MAE score: 0.12053765508493974
None
-------------------FOLD 1-----------------
RMSE score: 0.1724295341867724
MAE score: 0.12741507459822363
None
-------------------FOLD 2-----------------
RMSE score: 0.16540086834693246
MAE score: 0.12230310462787282
None
-------------------FOLD 3-----------------
RMSE score: 0.1709641558665732
MAE score: 0.12686077354371794
None
-------------------FOLD 4-----------------
RMSE score: 0.16949514084412498
MAE score: 0.12515630992998844
None
-------------------FOLD 5-----------------
RMSE score: 0.16514875997275655
MAE score: 0.12272517869440792
None
-------------------FOLD 6-----------------
RMSE score: 0.17205582623058557
MAE score: 0.1285501716543369
None
-------------------FOLD 7-----------------
RMSE score: 0.17221329180192915
MAE score: 0.1267402019071862
None
-------------------FOLD 8-----------------
RMSE score: 0.17049331863553308
MAE score: 0.124186203620682

## WP3

In [15]:
wp3_X = train_wp3[[c for c in train_wp3 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X3 = wp3_X.drop('wp', axis = 1)
y3 = wp3_X['wp']

In [16]:
ridge_cross_validation(X3, y3, 'minmax')

-----------RIDGECV CROSS VALIDATION BEGINNING-----------
RMSE score: 0.16809392636339116
MAE score: 0.1272431980441684
None
-------------------FOLD 1-----------------
RMSE score: 0.17132685789652488
MAE score: 0.12886432399264128
None
-------------------FOLD 2-----------------
RMSE score: 0.17490474443834758
MAE score: 0.13197879805774906
None
-------------------FOLD 3-----------------
RMSE score: 0.16995819711093788
MAE score: 0.12873045193851276
None
-------------------FOLD 4-----------------
RMSE score: 0.17256352902564087
MAE score: 0.1302465830363438
None
-------------------FOLD 5-----------------
RMSE score: 0.1738219963346922
MAE score: 0.1315325870065274
None
-------------------FOLD 6-----------------
RMSE score: 0.17435116705992212
MAE score: 0.131595627530549
None
-------------------FOLD 7-----------------
RMSE score: 0.17179630195485998
MAE score: 0.13065157807040956
None
-------------------FOLD 8-----------------
RMSE score: 0.17289397688691976
MAE score: 0.1309203804072832

## WP4

In [17]:
wp4_X = train_wp4[[c for c in train_wp4 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X4 = wp4_X.drop('wp', axis = 1)
y4 = wp4_X['wp']

In [18]:
ridge_cross_validation(X4, y4, 'minmax')

-----------RIDGECV CROSS VALIDATION BEGINNING-----------
RMSE score: 0.15624125564212518
MAE score: 0.11793165609669976
None
-------------------FOLD 1-----------------
RMSE score: 0.1547731580815974
MAE score: 0.11684807821529153
None
-------------------FOLD 2-----------------
RMSE score: 0.15852850083865822
MAE score: 0.12055334685140717
None
-------------------FOLD 3-----------------
RMSE score: 0.15886802773783193
MAE score: 0.12031326293054939
None
-------------------FOLD 4-----------------
RMSE score: 0.15750395725028324
MAE score: 0.11863586576036554
None
-------------------FOLD 5-----------------
RMSE score: 0.15562014793657358
MAE score: 0.11849485470615498
None
-------------------FOLD 6-----------------
RMSE score: 0.1551631907304398
MAE score: 0.11851312514385284
None
-------------------FOLD 7-----------------
RMSE score: 0.1562716628780599
MAE score: 0.11797216790512659
None
-------------------FOLD 8-----------------
RMSE score: 0.1584149569741389
MAE score: 0.11930623487803

## WP5

In [19]:
wp5_X = train_wp5[[c for c in train_wp5 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X5 = wp5_X.drop('wp', axis = 1)
y5 = wp5_X['wp']

In [20]:
ridge_cross_validation(X5, y5, 'minmax')

-----------RIDGECV CROSS VALIDATION BEGINNING-----------
RMSE score: 0.17228265445529223
MAE score: 0.12975811632983286
None
-------------------FOLD 1-----------------
RMSE score: 0.17094715933454124
MAE score: 0.1289937894252428
None
-------------------FOLD 2-----------------
RMSE score: 0.1726040781211631
MAE score: 0.13125926206156402
None
-------------------FOLD 3-----------------
RMSE score: 0.17262042488533672
MAE score: 0.1312218382823617
None
-------------------FOLD 4-----------------
RMSE score: 0.1740750050882022
MAE score: 0.13174705864582933
None
-------------------FOLD 5-----------------
RMSE score: 0.16921134305462857
MAE score: 0.12770567062225607
None
-------------------FOLD 6-----------------
RMSE score: 0.1707356765107346
MAE score: 0.1291456408744331
None
-------------------FOLD 7-----------------
RMSE score: 0.1702382864773385
MAE score: 0.12910443878185449
None
-------------------FOLD 8-----------------
RMSE score: 0.16892929930681405
MAE score: 0.12704293954257861

## WP6

In [21]:
wp6_X = train_wp6[[c for c in train_wp6 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X6 = wp6_X.drop('wp', axis = 1)
y6 = wp6_X['wp']

In [22]:
ridge_cross_validation(X6, y6, 'minmax')

-----------RIDGECV CROSS VALIDATION BEGINNING-----------
RMSE score: 0.15248079245857016
MAE score: 0.11561917612969624
None
-------------------FOLD 1-----------------
RMSE score: 0.15384563186195266
MAE score: 0.11604214642893365
None
-------------------FOLD 2-----------------
RMSE score: 0.15397801649513237
MAE score: 0.11621178992729903
None
-------------------FOLD 3-----------------
RMSE score: 0.1509461069566942
MAE score: 0.11510833199438879
None
-------------------FOLD 4-----------------
RMSE score: 0.14921944196731626
MAE score: 0.11333970338988189
None
-------------------FOLD 5-----------------
RMSE score: 0.14881544075922662
MAE score: 0.11276965824671341
None
-------------------FOLD 6-----------------
RMSE score: 0.15250166991812256
MAE score: 0.11661239796146085
None
-------------------FOLD 7-----------------
RMSE score: 0.1518090789024589
MAE score: 0.11582385256762003
None
-------------------FOLD 8-----------------
RMSE score: 0.15127962199162026
MAE score: 0.115032610816

# RidgeCV Predictions

## Functions

In [28]:
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist", 'wp']+feature_corr
def make_prediction_dataset(test, to_drop=to_drop):
    test_to_predict = test.dropna(subset=['ws','u','v'], how = 'any') # keeps only lines with u,v,ws,wd
    test_to_predict = test_to_predict[test_to_predict['wp'].isna()] # keeps only lines with no wp
    test_to_predict = test_to_predict.sort_values(by=['date', 'forecast_time'], ascending = [True, False]).drop_duplicates(subset='date')
    test_to_predict = test_to_predict.drop(to_drop, axis = 1)
    return test_to_predict

In [31]:
def make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, dates):
    i = 1
    lst_prediction = []
    lst_models_trained = []
    for X, y, test, model in zip(lst_X_trains, lst_y_trains, lst_tests, lst_models):
        print(f'--------------Model {i}--------------')
        model.fit(X, y)
        print(f'True:\n\tMin:{min(y)}\n\tMax:{max(y)}\n\tMean:{y.mean()}')
        predictions = model.predict(test)
        print(f'Prediction:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')
        predictions = [min(y) if i < 0 else i for i in predictions]
        predictions = [max(y) if i > max(y) else i for i in predictions]
        print(f'Prediction corrected:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')
        lst_prediction.append(predictions)
        lst_models_trained.append(model)
        i+=1
    
    df_predictions = pd.DataFrame({
        'date': test_dates,
        'wp1': lst_prediction[0],
        'wp2': lst_prediction[1],
        'wp3': lst_prediction[2],
        'wp4': lst_prediction[3],
        'wp5': lst_prediction[4],
        'wp6': lst_prediction[5],        
    })
    return df_predictions, lst_models_trained

## Submission

In [30]:
model_1 = Pipeline([('scaler', MinMaxScaler()),('ridge', RidgeCV())])
model_2 = Pipeline([('scaler', MinMaxScaler()),('ridge', RidgeCV())])
model_3 = Pipeline([('scaler', MinMaxScaler()),('ridge', RidgeCV())])
model_4 = Pipeline([('scaler', MinMaxScaler()),('ridge', RidgeCV())])
model_5 = Pipeline([('scaler', MinMaxScaler()),('ridge', RidgeCV())])
model_6 = Pipeline([('scaler', MinMaxScaler()),('ridge', RidgeCV())])

lst_models = [model_1, model_2, model_3, model_4, model_5, model_6]
lst_X_trains = [X1, X2, X3, X4, X5, X6]
lst_y_trains = [y1, y2, y3, y4, y5, y6]

In [32]:
lst_tests = []
for test in [test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6]:
    test = make_prediction_dataset(test)
    lst_tests.append(test)

In [33]:
df_predictions, lst_models_trained = make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, test_dates)

--------------Model 1--------------
True:
	Min:0.0
	Max:0.96
	Mean:0.2845981952075702
Prediction:
	Min:-0.4766415909848688
	Max:1.3986940981181895
	Mean:0.30346395141349247
Prediction corrected:
	Min:0.0
	Max:0.96
	Mean:0.3100917494541774
--------------Model 2--------------
True:
	Min:0.0
	Max:0.966
	Mean:0.25890153769841273
Prediction:
	Min:-0.3098633004102123
	Max:1.3229435779243248
	Mean:0.26061850555633775
Prediction corrected:
	Min:0.0
	Max:0.966
	Mean:0.26714909157674804
--------------Model 3--------------
True:
	Min:0.0
	Max:0.989
	Mean:0.2625247252747253
Prediction:
	Min:-0.3652711407865974
	Max:1.562625016073661
	Mean:0.3008239822123913
Prediction corrected:
	Min:0.0
	Max:0.989
	Mean:0.3042360471467673
--------------Model 4--------------
True:
	Min:0.0
	Max:0.992
	Mean:0.2763637820512821
Prediction:
	Min:-0.42578879042305023
	Max:1.1675270554068433
	Mean:0.28890313534211337
Prediction corrected:
	Min:0.0
	Max:0.992
	Mean:0.2989969234663061
--------------Model 5--------------
T

In [34]:
df_predictions.to_csv('Predictions/submission_nb_11_full_ridgecv-featselect.csv', index=False, sep=';')

In [35]:
pkl_model = "Models/RidgeCV/RidgeCV-wp1-featselect.pkl"
with open(pkl_model, 'wb') as file:
    pickle.dump(lst_models_trained[0], file)

In [36]:
pkl_model = "Models/RidgeCV/RidgeCV-wp2-featselect.pkl"
with open(pkl_model, 'wb') as file:
    pickle.dump(lst_models_trained[1], file)

In [37]:
pkl_model = "Models/RidgeCV/RidgeCV-wp3-featselect.pkl"
with open(pkl_model, 'wb') as file:
    pickle.dump(lst_models_trained[2], file)

In [38]:
pkl_model = "Models/RidgeCV/RidgeCV-wp4-featselect.pkl"
with open(pkl_model, 'wb') as file:
    pickle.dump(lst_models_trained[3], file)

In [39]:
pkl_model = "Models/RidgeCV/RidgeCV-wp5-featselect.pkl"
with open(pkl_model, 'wb') as file:
    pickle.dump(lst_models_trained[4], file)

In [40]:
pkl_model = "Models/RidgeCV/RidgeCV-wp6-featselect.pkl"
with open(pkl_model, 'wb') as file:
    pickle.dump(lst_models_trained[5], file)