# LGBM Model development - CV one model per fold - optim NelderMead

In [1]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import time

In [2]:
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna
optuna.logging.set_verbosity(0)
from vmdpy import VMD

In [3]:
from scipy.optimize import minimize

In [4]:
from Functions.helper_functions import * 

In [5]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
np.random.seed(42)

# Data

In [7]:
train_wp1 = pd.read_csv('Data/Preprocessing/WP1_train_preprocessed.csv', sep=',')
train_wp2 = pd.read_csv('Data/Preprocessing/WP2_train_preprocessed.csv', sep=',')
train_wp3 = pd.read_csv('Data/Preprocessing/WP3_train_preprocessed.csv', sep=',')
train_wp4 = pd.read_csv('Data/Preprocessing/WP4_train_preprocessed.csv', sep=',')
train_wp5 = pd.read_csv('Data/Preprocessing/WP5_train_preprocessed.csv', sep=',')
train_wp6 = pd.read_csv('Data/Preprocessing/WP6_train_preprocessed.csv', sep=',')

In [8]:
test_wp1 = pd.read_csv('Data/Preprocessing/WP1_test_preprocessed.csv', sep=',')
test_wp2 = pd.read_csv('Data/Preprocessing/WP2_test_preprocessed.csv', sep=',')
test_wp3 = pd.read_csv('Data/Preprocessing/WP3_test_preprocessed.csv', sep=',')
test_wp4 = pd.read_csv('Data/Preprocessing/WP4_test_preprocessed.csv', sep=',')
test_wp5 = pd.read_csv('Data/Preprocessing/WP5_test_preprocessed.csv', sep=',')
test_wp6 = pd.read_csv('Data/Preprocessing/WP6_test_preprocessed.csv', sep=',')
test_dates = pd.read_csv('Data/Initial/test.csv', sep=',').date.values

In [9]:
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

In [10]:
u_to_drop = [
    'u_T_1', 'u_T_2', 'u_T_3', 'u_T_4', 'u_T_5', 'u_T_6', 
    'u_T_2_mean', 'u_T_3_mean', 'u_T_4_mean', 'u_T_5_mean', 'u_T_6_mean', 'u_T_7_mean',
    'u_T_8_mean', 'u_T_9_mean', 'u_T_10_mean', 'u_T_11_mean', 'u_T_12_mean','u_T_24_mean',
    'u_T_2_std', 'u_T_4_std', 'u_T_5_std', 'u_T_6_std',
    'u_T_2_median', 'u_T_3_median', 'u_T_4_median', 'u_T_5_median', 'u_T_6_median', 'u_T_12_median','u_T_24_median', 'u_T_36_median',
    'u_T_2_max', 'u_T_3_max', 'u_T_4_max', 'u_T_5_max', 'u_T_6_max', 'u_T_12_max',
    'u_T_2_min', 'u_T_3_min', 'u_T_4_min', 'u_T_5_min', 'u_T_6_min', 'u_T_12_min',
    'u2_T_1', 'u2_T_2', 'u2_T_3', 'u2_T_4', 'u2_T_5', 'u2_T_6', 
    'u2_T_2_mean', 'u2_T_3_mean', 'u2_T_4_mean', 'u2_T_5_mean', 'u2_T_6_mean', 'u2_T_7_mean',
    'u2_T_8_mean', 'u2_T_9_mean', 'u2_T_10_mean', 'u2_T_11_mean', 'u2_T_12_mean','u2_T_24_mean',
    'u2_T_2_std', 'u2_T_4_std', 'u2_T_5_std', 'u2_T_6_std', 'u2_T_24_std',
    'u2_T_2_median', 'u2_T_3_median', 'u2_T_4_median', 'u2_T_5_median', 'u2_T_6_median', 'u2_T_12_median',
    'u2_T_2_max','u2_T_3_max', 'u2_T_4_max','u2_T_5_max', 'u2_T_6_max', 'u2_T_12_max',
    'u2_T_2_min', 'u2_T_3_min', 'u2_T_4_min', 'u2_T_5_min', 'u2_T_6_min',
    'u2_T_12', 'u2_T_36_mean', 'u2_T_36_std', 'u2_T_24_median', 'u2_T_24_max',
    'u_T_36_mean','u_T_12','u_T_24_max','u2_T_36_median','u_T_24_min'
]
ws_to_drop = [
    'ws_T_1', 'ws_T_2', 'ws_T_3', 'ws_T_4', 'ws_T_5', 'ws_T_6', 'ws_T_7', 'ws_T_8', 'ws_T_10','ws_T_11', 'ws_T_12',
    'ws_T_2_mean', 'ws_T_3_mean', 'ws_T_4_mean', 'ws_T_5_mean', 'ws_T_6_mean', 'ws_T_7_mean', 'ws_T_8_mean', 'ws_T_9_mean', 
    'ws_T_10_mean', 'ws_T_11_mean', 'ws_T_12_mean', 'ws_T_24_mean', 
    'ws_T_2_std', 'ws_T_3_std', 'ws_T_4_std', 'ws_T_5_std', 
    'ws_T_2_median', 'ws_T_3_median', 'ws_T_4_median', 'ws_T_5_median', 'ws_T_6_median',
    'ws_T_12_median', 'ws_T_24_median', 'ws_T_36_median',
    'ws_T_2_max', 'ws_T_3_max', 'ws_T_4_max', 'ws_T_5_max','ws_T_6_max', 'ws_T_12_max',
     'ws_T_2_min', 'ws_T_3_min', 'ws_T_4_min', 'ws_T_5_min', 'ws_T_6_min', 'ws_T_12_min','ws_T_24_max','ws_T_24_min'
]

v_to_drop = [
    'v_T_1', 'v_T_2', 'v_T_3', 'v_T_4', 'v_T_5', 'v_T_6', 
    'v_T_2_mean', 'v_T_3_mean', 'v_T_4_mean', 'v_T_5_mean', 'v_T_6_mean', 'v_T_7_mean',
    'v_T_8_mean', 'v_T_9_mean', 'v_T_10_mean', 'v_T_11_mean', 'v_T_12_mean', 'v_T_24_mean','v_T_36_mean',
    'v_T_3_std', 'v_T_4_std', 'v_T_5_std','v_T_6_std','v_T_24_std', 'v_T_36_median',
    'v_T_2_median', 'v_T_3_median', 'v_T_4_median', 'v_T_5_median', 'v_T_6_median', 
    'v_T_2_max', 'v_T_3_max', 'v_T_4_max', 'v_T_5_max', 'v_T_6_max', 'v_T_12_max', 
    'v_T_2_min', 'v_T_3_min', 'v_T_4_min', 'v_T_5_min', 'v_T_6_min', 'v_T_12_min', 
    'v_T_36_min', 'v_T_36', 'v_T_24_max',  'v_T_12_median', 'v_T_24_median',
]

wd_to_drop = [
    'coswd_1', 'coswd_2', 'coswd_3', 'coswd_4', 'coswd_5', 'coswd_6',
    'coswd_2_mean', 'coswd_3_mean', 'coswd_4_mean', 'coswd_5_mean', 'coswd_6_mean', 'coswd_7_mean', 
    'coswd_8_mean', 'coswd_9_mean', 'coswd_10_mean', 'coswd_11_mean', 'coswd_12_mean', 'coswd_24_mean', 
    'coswd_3_std', 'coswd_4_std','coswd_5_std','coswd_2_median', 'coswd_3_median','coswd_4_median', 
    'coswd_5_median', 'coswd_6_median', 'coswd_36_median', 'coswd_24_median', 'coswd_12_median',
    'coswd_2_max', 'coswd_3_max', 'coswd_4_max', 'coswd_5_max', 'coswd_6_max', 'coswd_12_max', 'coswd_24_max',
    'coswd_2_min', 'coswd_3_min', 'coswd_4_min', 'coswd_5_min', 'coswd_6_min', 'coswd_12_min', 'coswd_24_min',
    'ws_T_36_max', 'ws_T_36_min', 'coswd_12', 'coswd_24'
]

other_to_drop = [
    'cos_day', 'u', 'v'
]

feature_corr = u_to_drop+ws_to_drop+v_to_drop+wd_to_drop+other_to_drop
#to_drop = to_drop+feature_corr

# LGBM functions

In [14]:
from lightgbm import LGBMRegressor

In [21]:
def create_dataset(data,n,split):
    n_batch=int(len(data)/84)
    new_data=np.array_split(data,n_batch)
    train = pd.DataFrame()
    val=pd.DataFrame()
    for i in range(n_batch):
        if (i+n)%split ==0:
            val = pd.concat([val,new_data[i]])
        else:
            train=pd.concat([train,new_data[i]])
    return train,val

In [22]:
def create_lst_dataset(x,y,cv):
    lst_X=[]
    lst_Y=[]
    split=cv
    for n in range(cv):
        print('-----Creating {0} Xs-----'.format(n+1))
        X_train,X_test=create_dataset(data=x,n=n,split=split)
        lst_X.append(X_train)
        print('-----Creating {0} Ys-----'.format(n+1))
        Y_train,Y_test=create_dataset(data=y,n=n,split=split)
        lst_Y.append(Y_train)
        try:
            len(X_train)==len(Y_train)
        except:
            print('/!\ lengh non identicale')
    return lst_X, lst_Y

In [23]:
def create_lst_lst_dataset(x,y,cv):
    lst_dataset_Y=[]
    lst_dataset_X=[]
    i=0
    for x,y in zip(x,y):
        start_time = time.time()
        

        print('----Start Creating {0} dataset list----'.format(i+1))
        X_train,Y_train=create_lst_dataset(x=x,y=y,cv=cv)
        print('--------Appending-----')
        lst_dataset_X.append(X_train)
        lst_dataset_Y.append(Y_train)
        print('-----completed round {0}'.format(i+1))
        i+=1
        print("--- %s seconds ---" % (time.time() - start_time))
    return lst_dataset_X,lst_dataset_Y

In [24]:
def create_lst_model(cv,n_estimators):
    model_1=[]
    model_2=[]
    model_3=[]
    model_4=[]
    model_5=[]
    model_6=[]
    for i in range(cv):
        model=Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(num_iterations=n_estimators,n_jobs=-1))])
        model_1+=[model]
        model_2+=[model]
        model_3+=[model]
        model_4+=[model]
        model_5+=[model]
        model_6+=[model]
    return [model_1,model_2,model_3,model_4,model_5,model_6]

In [61]:
def create_lst_model_params(cv,n_estimators):
    model_1=[]
    model_2=[]
    model_3=[]
    model_4=[]
    model_5=[]
    model_6=[]
    for i in range(cv):
        #model=Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(num_iterations=n_estimators,n_jobs=-1))])
        model_1+=[Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(**params_1[i]))])]
        model_2+=[Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(**params_2[i]))])]
        model_3+=[Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(**params_3[i]))])]
        model_4+=[Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(**params_4[i]))])]
        model_5+=[Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(**params_5[i]))])]
        model_6+=[Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(**params_6[i]))])]
    return [model_1,model_2,model_3,model_4,model_5,model_6]

In [41]:
def hyperparametrization(trial, train_x, test_x, train_y, test_y):
    param = {
        'n_jobs':-1,
        'num_iterations':trial.suggest_int('num_iterations',10,200),
        'metric': 'rmse', 
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
        'max_depth': trial.suggest_int("max_depth", 20, 100),
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
    }
    
#     model = LGBMRegressor(**param)  
    model = Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(**param))])
#     model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    model.fit(train_x,train_y)
    
    preds = model.predict(test_x)
    
    rmse =  mean_squared_error(test_y, preds,squared=False)
    
    return rmse

# DATA

In [31]:
wp1_X = train_wp1[[c for c in train_wp1 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X1 = wp1_X.drop('wp', axis=1)
y1 = wp1_X['wp']

In [55]:
wp2_X = train_wp2[[c for c in train_wp2 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X2 = wp2_X.drop('wp', axis=1)
y2 = wp2_X['wp']

In [56]:
wp3_X = train_wp3[[c for c in train_wp3 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X3 = wp3_X.drop('wp', axis = 1)
y3 = wp3_X['wp']


In [57]:
wp4_X = train_wp4[[c for c in train_wp4 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X4 = wp4_X.drop('wp', axis = 1)
y4 = wp4_X['wp']

In [58]:
wp5_X = train_wp5[[c for c in train_wp5 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X5 = wp5_X.drop('wp', axis = 1)
y5 = wp5_X['wp']

In [59]:
wp6_X = train_wp6[[c for c in train_wp6 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X6 = wp6_X.drop('wp', axis = 1)
y6 = wp6_X['wp']

# Creating the 10 dataset per WP

In [42]:
cv=10
lst_X = [X1, X2, X3, X4, X5, X6]
lst_Y = [y1, y2, y3, y4, y5, y6]
lst_X_trains_split, lst_Y_trains_split = create_lst_lst_dataset(x=lst_X,y=lst_Y,cv=cv)

----Start Creating 1 dataset list----
-----Creating 1 Xs-----
-----Creating 1 Ys-----
-----Creating 2 Xs-----
-----Creating 2 Ys-----
-----Creating 3 Xs-----
-----Creating 3 Ys-----
-----Creating 4 Xs-----
-----Creating 4 Ys-----
-----Creating 5 Xs-----
-----Creating 5 Ys-----
-----Creating 6 Xs-----
-----Creating 6 Ys-----
-----Creating 7 Xs-----
-----Creating 7 Ys-----
-----Creating 8 Xs-----
-----Creating 8 Ys-----
-----Creating 9 Xs-----
-----Creating 9 Ys-----
-----Creating 10 Xs-----
-----Creating 10 Ys-----
--------Appending-----
-----completed round 1
--- 182.14605903625488 seconds ---
----Start Creating 2 dataset list----
-----Creating 1 Xs-----
-----Creating 1 Ys-----
-----Creating 2 Xs-----
-----Creating 2 Ys-----
-----Creating 3 Xs-----
-----Creating 3 Ys-----
-----Creating 4 Xs-----
-----Creating 4 Ys-----
-----Creating 5 Xs-----
-----Creating 5 Ys-----
-----Creating 6 Xs-----
-----Creating 6 Ys-----
-----Creating 7 Xs-----
-----Creating 7 Ys-----
-----Creating 8 Xs-----
-

In [43]:
params_1=[]
for i in range(cv):
    def objective_wp1(trial,data=X1,target=y1):
        train_x, test_x = create_dataset(data,i,cv)
        train_y,test_y=create_dataset(target,i,cv)
        return hyperparametrization(trial, train_x, test_x, train_y, test_y)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective_wp1, n_trials=20)
    #write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp1', study.trials_dataframe())
    params_1.append(study.best_trial.params)
    print('------cv n°:{0}-------'.format(i))
    #best_trial

------cv n°:0-------
------cv n°:1-------
------cv n°:2-------
------cv n°:3-------
------cv n°:4-------
------cv n°:5-------
------cv n°:6-------
------cv n°:7-------
------cv n°:8-------
------cv n°:9-------


In [44]:
params_2=[]
for i in range(cv):
    def objective_wp2(trial,data=X2,target=y2):
        train_x, test_x = create_dataset(data,i,cv)
        train_y,test_y=create_dataset(target,i,cv)
        return hyperparametrization(trial, train_x, test_x, train_y, test_y)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective_wp2, n_trials=20)
    #write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp1', study.trials_dataframe())
    params_2.append(study.best_trial.params)
    print('------cv n°:{0}-------'.format(i))
    #best_trial

------cv n°:0-------
------cv n°:1-------
------cv n°:2-------
------cv n°:3-------
------cv n°:4-------
------cv n°:5-------
------cv n°:6-------
------cv n°:7-------
------cv n°:8-------
------cv n°:9-------


In [45]:
params_3=[]
for i in range(cv):
    def objective_wp3(trial,data=X3,target=y3):
        train_x, test_x = create_dataset(data,i,cv)
        train_y,test_y=create_dataset(target,i,cv)
        return hyperparametrization(trial, train_x, test_x, train_y, test_y)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective_wp3, n_trials=20)
    #write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp1', study.trials_dataframe())
    params_3.append(study.best_trial.params)
    print('------cv n°:{0}-------'.format(i))
    #best_trial

------cv n°:0-------
------cv n°:1-------
------cv n°:2-------
------cv n°:3-------
------cv n°:4-------
------cv n°:5-------
------cv n°:6-------
------cv n°:7-------
------cv n°:8-------
------cv n°:9-------


In [46]:
params_4=[]
for i in range(cv):
    def objective_wp4(trial,data=X4,target=y4):
        train_x, test_x = create_dataset(data,i,cv)
        train_y,test_y=create_dataset(target,i,cv)
        return hyperparametrization(trial, train_x, test_x, train_y, test_y)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective_wp4, n_trials=20)
    #write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp1', study.trials_dataframe())
    params_4.append(study.best_trial.params)
    print('------cv n°:{0}-------'.format(i))
    #best_trial

------cv n°:0-------
------cv n°:1-------
------cv n°:2-------
------cv n°:3-------
------cv n°:4-------
------cv n°:5-------
------cv n°:6-------
------cv n°:7-------
------cv n°:8-------
------cv n°:9-------


In [None]:
params_5=[]
for i in range(cv):
    def objective_wp5(trial,data=X5,target=y5):
        train_x, test_x = create_dataset(data,i,cv)
        train_y,test_y=create_dataset(target,i,cv)
        return hyperparametrization(trial, train_x, test_x, train_y, test_y)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective_wp5, n_trials=20)
    #write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp1', study.trials_dataframe())
    params_5.append(study.best_trial.params)
    print('------cv n°:{0}-------'.format(i))
    #best_trial

------cv n°:0-------
------cv n°:1-------
------cv n°:2-------
------cv n°:3-------
------cv n°:4-------
------cv n°:5-------
------cv n°:6-------
------cv n°:7-------
------cv n°:8-------
------cv n°:9-------


In [None]:
params_6=[]
for i in range(cv):
    def objective_wp6(trial,data=X6,target=y6):
        train_x, test_x = create_dataset(data,i,cv)
        train_y,test_y=create_dataset(target,i,cv)
        return hyperparametrization(trial, train_x, test_x, train_y, test_y)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective_wp6, n_trials=20)
    #write_results('Data/Hyperparametrization/lgbm_vmd_50trials_cv.xlsx', 'wp1', study.trials_dataframe())
    params_6.append(study.best_trial.params)
    print('------cv n°:{0}-------'.format(i))
    #best_trial

------cv n°:0-------
------cv n°:1-------
------cv n°:2-------
------cv n°:3-------
------cv n°:4-------
------cv n°:5-------
------cv n°:6-------
------cv n°:7-------
------cv n°:8-------
------cv n°:9-------


In [62]:
#lst_model = create_lst_model(cv=cv,n_estimators=50)
lst_model = create_lst_model_params(cv=cv,n_estimators=50)

In [60]:
lst_model[0][1]


Pipeline(steps=[('scaler', RobustScaler()),
                ('ridge',
                 LGBMRegressor(colsample_bytree=0.8,
                               learning_rate=0.06530909015442644, max_depth=100,
                               min_child_samples=81, num_iterations=188,
                               num_leaves=277, reg_alpha=0.23651797761143137,
                               reg_lambda=0.43480032497704474,
                               subsample=0.6))])

In [77]:
#for model,x,y in zip(lst_model,lst_X_trains_split,lst_Y_trains_split):
 #   for i in range(cv):
  #      model[i].fit(x[i],y[i])

In [63]:
lst_predictions_all=[]
for model,x_all,x,y in zip(lst_model,lst_X,lst_X_trains_split,lst_Y_trains_split):
    lst_predictions=[]
    for i in range(cv):
        clf=model[i]
        clf.fit(x[i],y[i])
        lst_predictions.append(clf.predict(x_all))
    lst_predictions_all.append(lst_predictions)

In [64]:
#lst_predictions_all=[]
#for model,x in zip(lst_model,lst_X):
#    lst_predictions=[]
#    for i in range(cv):
#        lst_predictions.append(model[i].predict(x))
#    lst_predictions_all.append(lst_predictions)
        

In [65]:
np.shape(lst_predictions_all)

(6, 10, 52416)

In [66]:
weights_all=[]
for pred,y in zip(lst_predictions_all,lst_Y):
    weights=[]
    for i in range(cv):
        weights.append(0)
        
    def mae_func(weights):
        #final_prediction=0
        for i in range(len(weights)):
            if i==0:
                final_prediction = weights[i]*pred[i]
            else:
                final_prediction += weights[i]*pred[i]
        return mean_absolute_error(y, final_prediction)
    res = minimize(mae_func, weights, method='Nelder-Mead')
    weights_all.append(res['x'])
        
    

In [67]:
np.shape(weights_all)

(6, 10)

In [68]:
for i in range(6):
    print(weights_all[i])

[ 0.31991014 -0.0238613  -0.10940174 -0.19349788  0.07031025  0.23880206
  0.00186802  0.37715865  0.09788769  0.22418375]
[-0.06244564  0.09586822  0.37371852 -0.22258204  0.06510213  0.21254207
  0.14458159  0.26435358  0.25844112 -0.12824824]
[ 0.23057355 -0.04131105 -0.02482222  0.27681608  0.14429401  0.36054735
  0.08135188 -0.10824421  0.11694401 -0.02939328]
[ 0.48898097  0.22743608  0.32609225  0.1101067   0.10906726 -0.02270059
 -0.04484524 -0.24096601  0.06755364 -0.01755197]
[-0.08591238 -0.0231156   0.24821044  0.42984388 -0.08387205 -0.06078966
  0.29066019  0.04894939  0.16601354  0.07244438]
[-0.02101669  0.40865147  0.35348165  0.21041043  0.01536501  0.05036303
 -0.1908781  -0.02696856  0.10860408  0.09457782]


In [45]:
to_drop_test = ['date','wd','forecast_time', 'forecast', "forecast_dist", 'wp']#+feature_corr
def make_prediction_dataset(test, to_drop=to_drop_test):
    test_to_predict = test.dropna(subset=['ws','u','v'], how = 'any') # keeps only lines with u,v,ws,wd
    test_to_predict = test_to_predict[test_to_predict['wp'].isna()] # keeps only lines with no wp
    test_to_predict = test_to_predict.sort_values(by=['date', 'forecast_time'], ascending = [True, False]).drop_duplicates(subset='date')
    test_to_predict = test_to_predict.drop(to_drop, axis = 1)
    return test_to_predict

In [46]:
lst_tests = []
for test in [test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6]:
    test = make_prediction_dataset(test)
    lst_tests.append(test)

In [71]:
#lst_prediction=[]
#for weights,model,test in zip(weights_all,lst_model,lst_tests):
#    for i in range(cv):
#        if i==0:
#            y_pred=model[i].predict(test)*weights[i]
#        else:
#            y_pred+=model[i].predict(test)*weights[i]
#    lst_prediction.append(y_pred)
        

In [72]:
lst_final_prediction=[]
for weights,model,test,x_train,y_train in zip(weights_all,lst_model,lst_tests,lst_X_trains_split,lst_Y_trains_split):
    for i in range(cv):
        clf=model[i]
        clf.fit(x_train[0],y_train[0])
        if i==0:
            y_pred=clf.predict(test)*weights[i]
        else:
            y_pred+=clf.predict(test)*weights[i]
    lst_final_prediction.append(y_pred)
        

In [73]:
df_predictions = pd.DataFrame({
        'date': test_dates,
        'wp1': lst_final_prediction[0],
        'wp2': lst_final_prediction[1],
        'wp3': lst_final_prediction[2],
        'wp4': lst_final_prediction[3],
        'wp5': lst_final_prediction[4],
        'wp6': lst_final_prediction[5],        
    })

In [74]:
nb_sub = 1
model = "xxx-lgm_10models"
prepro = 'RobustScaler'
postpro = "Prediction limited by 0-1"

In [75]:
df_predictions.head()

Unnamed: 0,date,wp1,wp2,wp3,wp4,wp5,wp6
0,2011010101,0.727103,0.386094,0.060713,0.498558,0.721578,0.537591
1,2011010102,0.645368,0.39174,0.032693,0.457985,0.696621,0.594817
2,2011010103,0.591685,0.421102,0.092002,0.478924,0.701911,0.610174
3,2011010104,0.575016,0.364737,0.193376,0.519291,0.673011,0.642027
4,2011010105,0.688413,0.236107,0.325698,0.61621,0.702053,0.667779


In [76]:
# df_predictions.to_csv('Predictions/submission_nb_10_full_maxabs-lgbm-featselect.csv', index=False, sep=';')
df_predictions.to_csv(f'Predictions/submission_nb_{nb_sub}_{model}.csv', index=False, sep=';')

# METHODE 2 - MV Train test split

In [15]:
from lightgbm import LGBMRegressor

In [17]:
from Functions.preprocessing import *

In [24]:
lst_X1_train, lst_y1_train,lst_X1_test,lst_y1_test =  splitting_train_test_forecast(df_wp=train_wp1)

In [68]:
for i in range(len(lst_X1_train)):
    lst_X1_train[i] = lst_X1_train[i].drop(to_drop,axis=1)
    lst_X1_test[i] = lst_X1_test[i].drop(to_drop,axis=1)

KeyError: "['date' 'wd' 'forecast_time' 'forecast' 'forecast_dist'] not found in axis"

In [60]:
lst_X = [X1, X2, X3, X4, X5, X6]
lst_Y = [y1, y2, y3, y4, y5, y6]

In [61]:
lst_X2_train, lst_y2_train,lst_X2_test,lst_y2_test =  splitting_train_test_forecast(df_wp=train_wp2)

In [69]:
for i in range(len(lst_X1_train)):
    lst_X2_train[i] = lst_X2_train[i].drop(to_drop,axis=1)
    lst_X2_test[i] = lst_X2_test[i].drop(to_drop,axis=1)

In [84]:
lst_X3_train, lst_y3_train,lst_X3_test,lst_y3_test =  splitting_train_test_forecast(df_wp=train_wp3)

In [85]:
for i in range(len(lst_X3_train)):
    lst_X3_train[i] = lst_X3_train[i].drop(to_drop,axis=1)
    lst_X3_test[i] = lst_X3_test[i].drop(to_drop,axis=1)

In [63]:
lst_X4_train, lst_y4_train,lst_X4_test,lst_y4_test =  splitting_train_test_forecast(df_wp=train_wp4)

In [73]:
for i in range(len(lst_X4_train)):
    lst_X4_train[i] = lst_X4_train[i].drop(to_drop,axis=1)
    lst_X4_test[i] = lst_X4_test[i].drop(to_drop,axis=1)

In [64]:
lst_X5_train, lst_y5_train,lst_X5_test,lst_y5_test =  splitting_train_test_forecast(df_wp=train_wp5)

In [74]:
for i in range(len(lst_X5_train)):
    lst_X5_train[i] = lst_X5_train[i].drop(to_drop,axis=1)
    lst_X5_test[i] = lst_X5_test[i].drop(to_drop,axis=1)

In [65]:
lst_X6_train, lst_y6_train,lst_X6_test,lst_y6_test =  splitting_train_test_forecast(df_wp=train_wp6)

In [75]:
for i in range(len(lst_X6_train)):
    lst_X6_train[i] = lst_X6_train[i].drop(to_drop,axis=1)
    lst_X6_test[i] = lst_X6_test[i].drop(to_drop,axis=1)

In [66]:
def create_lst_model(cv,n_estimators):
    model_1=[]
    model_2=[]
    model_3=[]
    model_4=[]
    model_5=[]
    model_6=[]
    for i in range(cv):
        model=Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(num_iterations=n_estimators,n_jobs=-1))])
        model_1+=[model]
        model_2+=[model]
        model_3+=[model]
        model_4+=[model]
        model_5+=[model]
        model_6+=[model]
    return [model_1,model_2,model_3,model_4,model_5,model_6]

In [95]:
lst_model = create_lst_model(cv=8,n_estimators=50)

In [96]:
lst_X_trains_split = [lst_X1_train,lst_X2_train,lst_X3_train,lst_X4_train,lst_X5_train,lst_X6_train]
lst_Y_trains_split = [lst_y1_train,lst_y2_train,lst_y3_train,lst_y4_train,lst_y5_train,lst_y6_train]

In [97]:
lst_predictions_all=[]
t=1
for model,x_all,x,y in zip(lst_model,lst_X,lst_X_trains_split,lst_Y_trains_split):
    lst_predictions=[]
    print(t)
    for i in range(8):
        #print(t)
        clf=model[i]
        clf.fit(x[i],y[i])
        lst_predictions.append(clf.predict(x_all))
    lst_predictions_all.append(lst_predictions)
    t+=1

1
2
3
4
5
6


In [98]:
weights_all=[]
for pred,y in zip(lst_predictions_all,lst_Y):
    weights=[]
    for i in range(8):
        weights.append(0)
        
    def mae_func(weights):
        #final_prediction=0
        for i in range(len(weights)):
            if i==0:
                final_prediction = weights[i]*pred[i]
            else:
                final_prediction += weights[i]*pred[i]
        return mean_absolute_error(y, final_prediction)
    res = minimize(mae_func, weights, method='Nelder-Mead')
    weights_all.append(res['x'])
        
    

In [99]:
lst_final_prediction=[]
for weights,model,test,x_train,y_train in zip(weights_all,lst_model,lst_tests,lst_X_trains_split,lst_Y_trains_split):
    for i in range(8):
        clf=model[i]
        clf.fit(x_train[i],y_train[i])
        if i==0:
            y_pred=clf.predict(test)*weights[i]
        else:
            y_pred+=clf.predict(test)*weights[i]
    lst_final_prediction.append(y_pred)
        

In [100]:
df_predictions = pd.DataFrame({
        'date': test_dates,
        'wp1': lst_final_prediction[0],
        'wp2': lst_final_prediction[1],
        'wp3': lst_final_prediction[2],
        'wp4': lst_final_prediction[3],
        'wp5': lst_final_prediction[4],
        'wp6': lst_final_prediction[5],        
    })

In [101]:
nb_sub = 3
model = "xxx-lgm_10models"
prepro = 'RobustScaler'
postpro = "Prediction limited by 0-1"

In [102]:
# df_predictions.to_csv('Predictions/submission_nb_10_full_maxabs-lgbm-featselect.csv', index=False, sep=';')
df_predictions.to_csv(f'Predictions/submission_nb_{nb_sub}_{model}.csv', index=False, sep=';')

# test on WP1

In [25]:
np.shape(lst_X1_train[0])

(22464, 292)

In [29]:
for i in range(len(lst_X1_train)):
    lst_X1_train[i] = lst_X1_train[i].drop(to_drop,axis=1)
    lst_X1_test[i] = lst_X1_test[i].drop(to_drop,axis=1)

In [30]:
np.shape(lst_X1_train[0])

(22464, 287)

In [39]:
lst_model=[]
for i in range(8):
    model=Pipeline([('scaler', RobustScaler()),('ridge', LGBMRegressor(num_iterations=50,n_jobs=-1))])
    lst_model.append(model)

In [40]:
for model in lst_model:
    print(model)

Pipeline(steps=[('scaler', RobustScaler()),
                ('ridge', LGBMRegressor(num_iterations=50))])
Pipeline(steps=[('scaler', RobustScaler()),
                ('ridge', LGBMRegressor(num_iterations=50))])
Pipeline(steps=[('scaler', RobustScaler()),
                ('ridge', LGBMRegressor(num_iterations=50))])
Pipeline(steps=[('scaler', RobustScaler()),
                ('ridge', LGBMRegressor(num_iterations=50))])
Pipeline(steps=[('scaler', RobustScaler()),
                ('ridge', LGBMRegressor(num_iterations=50))])
Pipeline(steps=[('scaler', RobustScaler()),
                ('ridge', LGBMRegressor(num_iterations=50))])
Pipeline(steps=[('scaler', RobustScaler()),
                ('ridge', LGBMRegressor(num_iterations=50))])
Pipeline(steps=[('scaler', RobustScaler()),
                ('ridge', LGBMRegressor(num_iterations=50))])


In [49]:
lst_predictions_train=[]
lst_predictions_test=[]
for model,x,y in zip(lst_model,lst_X1_train,lst_y1_train):
    model.fit(x,y)
    lst_predictions_train.append(model.predict(X1))
    lst_predictions_test.append(model.predict(X_test))

In [44]:
weights=[]
for i in range(8):
    weights.append(0)
        
def mae_func(weights):
    for i in range(len(weights)):
        if i==0:
            final_prediction = weights[i]*lst_predictions_vrai[i]
        else:
            final_prediction += weights[i]*lst_predictions_vrai[i]
    return mean_absolute_error(y1, final_prediction)
res = minimize(mae_func, weights, method='Nelder-Mead')
print(res['x'])
        
    

[ 0.15228283  0.28953147 -0.04358191  0.27913313  0.14975101 -0.00208192
  0.03742032  0.1615039 ]


In [47]:
X_test = lst_tests[0]
X_test.head()

Unnamed: 0,u,v,ws,cos_hour,sin_hour,cos_day,sin_day,cos_month,sin_month,cos_wd,...,v_T_24_max,v_T_36_max,v_T_2_min,v_T_3_min,v_T_4_min,v_T_5_min,v_T_6_min,v_T_12_min,v_T_24_min,v_T_36_min
216,2.27,7.04,2.720294,0.965926,0.258819,0.999852,0.017213,0.866025,0.5,0.951862,...,9.1,9.1,7.04,7.04,7.04,7.04,7.04,7.04,7.04,7.04
217,1.95,6.93,2.681418,0.866025,0.5,0.999852,0.017213,0.866025,0.5,0.962739,...,9.1,9.1,6.93,6.93,6.93,6.93,6.93,6.93,6.93,6.93
218,1.49,7.09,2.690725,0.707107,0.707107,0.999852,0.017213,0.866025,0.5,0.978617,...,9.1,9.1,6.93,6.93,6.93,6.93,6.93,6.93,6.93,6.93
219,0.95,7.43,2.738613,0.5,0.866025,0.999852,0.017213,0.866025,0.5,0.991961,...,9.1,9.1,7.09,6.93,6.93,6.93,6.93,6.93,6.93,6.93
220,0.37,7.84,2.801785,0.258819,0.965926,0.999852,0.017213,0.866025,0.5,0.998914,...,9.1,9.1,7.43,7.09,6.93,6.93,6.93,6.93,6.93,6.93


In [50]:
for i in range(8):
    if i==0:
        prediction = res['x'][i]*lst_predictions_test[i]
    else:
        prediction += res['x'][i]*lst_predictions_test[i]

In [51]:
df_predictions = pd.DataFrame({
        'date': test_dates,
        'wp1': prediction,        
    })

In [54]:
df_predictions.to_csv('blablabla.csv', index=False, sep=';')