# LGBM Model development - CV one model per fold - optim NelderMead

In [1]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import time

In [2]:
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna
from vmdpy import VMD

In [3]:
from scipy.optimize import minimize

In [4]:
from Functions.helper_functions import * 

In [5]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
np.random.seed(42)

# Data

In [7]:
train_wp1 = pd.read_csv('Data/Preprocessing/WP1_train_preprocessed.csv', sep=',')
train_wp2 = pd.read_csv('Data/Preprocessing/WP2_train_preprocessed.csv', sep=',')
train_wp3 = pd.read_csv('Data/Preprocessing/WP3_train_preprocessed.csv', sep=',')
train_wp4 = pd.read_csv('Data/Preprocessing/WP4_train_preprocessed.csv', sep=',')
train_wp5 = pd.read_csv('Data/Preprocessing/WP5_train_preprocessed.csv', sep=',')
train_wp6 = pd.read_csv('Data/Preprocessing/WP6_train_preprocessed.csv', sep=',')

In [8]:
test_wp1 = pd.read_csv('Data/Preprocessing/WP1_test_preprocessed.csv', sep=',')
test_wp2 = pd.read_csv('Data/Preprocessing/WP2_test_preprocessed.csv', sep=',')
test_wp3 = pd.read_csv('Data/Preprocessing/WP3_test_preprocessed.csv', sep=',')
test_wp4 = pd.read_csv('Data/Preprocessing/WP4_test_preprocessed.csv', sep=',')
test_wp5 = pd.read_csv('Data/Preprocessing/WP5_test_preprocessed.csv', sep=',')
test_wp6 = pd.read_csv('Data/Preprocessing/WP6_test_preprocessed.csv', sep=',')
test_dates = pd.read_csv('Data/Initial/test.csv', sep=',').date.values

In [166]:
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

In [167]:
u_to_drop = [
    'u_T_1', 'u_T_2', 'u_T_3', 'u_T_4', 'u_T_5', 'u_T_6', 
    'u_T_2_mean', 'u_T_3_mean', 'u_T_4_mean', 'u_T_5_mean', 'u_T_6_mean', 'u_T_7_mean',
    'u_T_8_mean', 'u_T_9_mean', 'u_T_10_mean', 'u_T_11_mean', 'u_T_12_mean','u_T_24_mean',
    'u_T_2_std', 'u_T_4_std', 'u_T_5_std', 'u_T_6_std',
    'u_T_2_median', 'u_T_3_median', 'u_T_4_median', 'u_T_5_median', 'u_T_6_median', 'u_T_12_median','u_T_24_median', 'u_T_36_median',
    'u_T_2_max', 'u_T_3_max', 'u_T_4_max', 'u_T_5_max', 'u_T_6_max', 'u_T_12_max',
    'u_T_2_min', 'u_T_3_min', 'u_T_4_min', 'u_T_5_min', 'u_T_6_min', 'u_T_12_min',
    'u2_T_1', 'u2_T_2', 'u2_T_3', 'u2_T_4', 'u2_T_5', 'u2_T_6', 
    'u2_T_2_mean', 'u2_T_3_mean', 'u2_T_4_mean', 'u2_T_5_mean', 'u2_T_6_mean', 'u2_T_7_mean',
    'u2_T_8_mean', 'u2_T_9_mean', 'u2_T_10_mean', 'u2_T_11_mean', 'u2_T_12_mean','u2_T_24_mean',
    'u2_T_2_std', 'u2_T_4_std', 'u2_T_5_std', 'u2_T_6_std', 'u2_T_24_std',
    'u2_T_2_median', 'u2_T_3_median', 'u2_T_4_median', 'u2_T_5_median', 'u2_T_6_median', 'u2_T_12_median',
    'u2_T_2_max','u2_T_3_max', 'u2_T_4_max','u2_T_5_max', 'u2_T_6_max', 'u2_T_12_max',
    'u2_T_2_min', 'u2_T_3_min', 'u2_T_4_min', 'u2_T_5_min', 'u2_T_6_min',
    'u2_T_12', 'u2_T_36_mean', 'u2_T_36_std', 'u2_T_24_median', 'u2_T_24_max',
    'u_T_36_mean','u_T_12','u_T_24_max','u2_T_36_median','u_T_24_min'
]
ws_to_drop = [
    'ws_T_1', 'ws_T_2', 'ws_T_3', 'ws_T_4', 'ws_T_5', 'ws_T_6', 'ws_T_7', 'ws_T_8', 'ws_T_10','ws_T_11', 'ws_T_12',
    'ws_T_2_mean', 'ws_T_3_mean', 'ws_T_4_mean', 'ws_T_5_mean', 'ws_T_6_mean', 'ws_T_7_mean', 'ws_T_8_mean', 'ws_T_9_mean', 
    'ws_T_10_mean', 'ws_T_11_mean', 'ws_T_12_mean', 'ws_T_24_mean', 
    'ws_T_2_std', 'ws_T_3_std', 'ws_T_4_std', 'ws_T_5_std', 
    'ws_T_2_median', 'ws_T_3_median', 'ws_T_4_median', 'ws_T_5_median', 'ws_T_6_median',
    'ws_T_12_median', 'ws_T_24_median', 'ws_T_36_median',
    'ws_T_2_max', 'ws_T_3_max', 'ws_T_4_max', 'ws_T_5_max','ws_T_6_max', 'ws_T_12_max',
     'ws_T_2_min', 'ws_T_3_min', 'ws_T_4_min', 'ws_T_5_min', 'ws_T_6_min', 'ws_T_12_min','ws_T_24_max','ws_T_24_min'
]

v_to_drop = [
    'v_T_1', 'v_T_2', 'v_T_3', 'v_T_4', 'v_T_5', 'v_T_6', 
    'v_T_2_mean', 'v_T_3_mean', 'v_T_4_mean', 'v_T_5_mean', 'v_T_6_mean', 'v_T_7_mean',
    'v_T_8_mean', 'v_T_9_mean', 'v_T_10_mean', 'v_T_11_mean', 'v_T_12_mean', 'v_T_24_mean','v_T_36_mean',
    'v_T_3_std', 'v_T_4_std', 'v_T_5_std','v_T_6_std','v_T_24_std', 'v_T_36_median',
    'v_T_2_median', 'v_T_3_median', 'v_T_4_median', 'v_T_5_median', 'v_T_6_median', 
    'v_T_2_max', 'v_T_3_max', 'v_T_4_max', 'v_T_5_max', 'v_T_6_max', 'v_T_12_max', 
    'v_T_2_min', 'v_T_3_min', 'v_T_4_min', 'v_T_5_min', 'v_T_6_min', 'v_T_12_min', 
    'v_T_36_min', 'v_T_36', 'v_T_24_max',  'v_T_12_median', 'v_T_24_median',
]

wd_to_drop = [
    'coswd_1', 'coswd_2', 'coswd_3', 'coswd_4', 'coswd_5', 'coswd_6',
    'coswd_2_mean', 'coswd_3_mean', 'coswd_4_mean', 'coswd_5_mean', 'coswd_6_mean', 'coswd_7_mean', 
    'coswd_8_mean', 'coswd_9_mean', 'coswd_10_mean', 'coswd_11_mean', 'coswd_12_mean', 'coswd_24_mean', 
    'coswd_3_std', 'coswd_4_std','coswd_5_std','coswd_2_median', 'coswd_3_median','coswd_4_median', 
    'coswd_5_median', 'coswd_6_median', 'coswd_36_median', 'coswd_24_median', 'coswd_12_median',
    'coswd_2_max', 'coswd_3_max', 'coswd_4_max', 'coswd_5_max', 'coswd_6_max', 'coswd_12_max', 'coswd_24_max',
    'coswd_2_min', 'coswd_3_min', 'coswd_4_min', 'coswd_5_min', 'coswd_6_min', 'coswd_12_min', 'coswd_24_min',
    'ws_T_36_max', 'ws_T_36_min', 'coswd_12', 'coswd_24'
]

other_to_drop = [
    'cos_day', 'u', 'v'
]

feature_corr = u_to_drop+ws_to_drop+v_to_drop+wd_to_drop+other_to_drop
#to_drop = to_drop+feature_corr

# LGBM functions

In [141]:
from lightgbm import LGBMRegressor

In [142]:
def create_dataset(data,n,split):
    n_batch=int(len(data)/84)
    new_data=np.array_split(data,n_batch)
    train = pd.DataFrame()
    val=pd.DataFrame()
    for i in range(n_batch):
        if (i+n)%split ==0:
            val = pd.concat([val,new_data[i]])
        else:
            train=pd.concat([train,new_data[i]])
    return train,val

In [143]:
def create_lst_dataset(x,y,cv):
    lst_X=[]
    lst_Y=[]
    split=cv
    for n in range(cv):
        print('-----Creating {0} Xs-----'.format(n+1))
        X_train,X_test=create_dataset(data=x,n=n,split=split)
        lst_X.append(X_train)
        print('-----Creating {0} Ys-----'.format(n+1))
        Y_train,Y_test=create_dataset(data=y,n=n,split=split)
        lst_Y.append(Y_train)
        try:
            len(X_train)==len(Y_train)
        except:
            print('/!\ lengh non identicale')
    return lst_X, lst_Y

In [144]:
def create_lst_lst_dataset(x,y,cv):
    lst_dataset_Y=[]
    lst_dataset_X=[]
    i=0
    for x,y in zip(x,y):
        start_time = time.time()
        

        print('----Start Creating {0} dataset list----'.format(i+1))
        X_train,Y_train=create_lst_dataset(x=x,y=y,cv=cv)
        print('--------Appending-----')
        lst_dataset_X.append(X_train)
        lst_dataset_Y.append(Y_train)
        print('-----completed round {0}'.format(i+1))
        i+=1
        print("--- %s seconds ---" % (time.time() - start_time))
    return lst_dataset_X,lst_dataset_Y

In [145]:
def create_lst_model(cv,n_estimators):
    model_1=[]
    model_2=[]
    model_3=[]
    model_4=[]
    model_5=[]
    model_6=[]
    for i in range(cv):
        model=Pipeline([('scaler', MaxAbsScaler()),('ridge', LGBMRegressor(num_iterations=n_estimators,n_jobs=-1))])
        model_1+=[model]
        model_2+=[model]
        model_3+=[model]
        model_4+=[model]
        model_5+=[model]
        model_6+=[model]
    return [model_1,model_2,model_3,model_4,model_5,model_6]

# DATA

In [168]:
wp1_X = train_wp1[[c for c in train_wp1 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X1 = wp1_X.drop('wp', axis=1)
y1 = wp1_X['wp']

In [169]:
wp2_X = train_wp2[[c for c in train_wp2 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X2 = wp2_X.drop('wp', axis=1)
y2 = wp2_X['wp']

In [170]:
wp3_X = train_wp3[[c for c in train_wp3 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X3 = wp3_X.drop('wp', axis = 1)
y3 = wp3_X['wp']


In [171]:
wp4_X = train_wp4[[c for c in train_wp4 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X4 = wp4_X.drop('wp', axis = 1)
y4 = wp4_X['wp']

In [172]:
wp5_X = train_wp5[[c for c in train_wp5 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X5 = wp5_X.drop('wp', axis = 1)
y5 = wp5_X['wp']

In [173]:
wp6_X = train_wp6[[c for c in train_wp6 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X6 = wp6_X.drop('wp', axis = 1)
y6 = wp6_X['wp']

# Creating the 10 dataset per WP

In [174]:
cv=10
lst_X = [X1, X2, X3, X4, X5, X6]
lst_Y = [y1, y2, y3, y4, y5, y6]

lst_X_trains_split, lst_Y_trains_split = create_lst_lst_dataset(x=lst_X,y=lst_Y,cv=cv)

----Start Creating 1 dataset list----
-----Creating 1 Xs-----
-----Creating 1 Ys-----
-----Creating 2 Xs-----
-----Creating 2 Ys-----
-----Creating 3 Xs-----
-----Creating 3 Ys-----
-----Creating 4 Xs-----
-----Creating 4 Ys-----
-----Creating 5 Xs-----
-----Creating 5 Ys-----
-----Creating 6 Xs-----
-----Creating 6 Ys-----
-----Creating 7 Xs-----
-----Creating 7 Ys-----
-----Creating 8 Xs-----
-----Creating 8 Ys-----
-----Creating 9 Xs-----
-----Creating 9 Ys-----
-----Creating 10 Xs-----
-----Creating 10 Ys-----
--------Appending-----
-----completed round 1
--- 209.99046516418457 seconds ---
----Start Creating 2 dataset list----
-----Creating 1 Xs-----
-----Creating 1 Ys-----
-----Creating 2 Xs-----
-----Creating 2 Ys-----
-----Creating 3 Xs-----
-----Creating 3 Ys-----
-----Creating 4 Xs-----
-----Creating 4 Ys-----
-----Creating 5 Xs-----
-----Creating 5 Ys-----
-----Creating 6 Xs-----
-----Creating 6 Ys-----
-----Creating 7 Xs-----
-----Creating 7 Ys-----
-----Creating 8 Xs-----
-

In [175]:
lst_model = create_lst_model(cv=cv,n_estimators=50)

In [176]:
np.shape(lst_X_trains_split[0][0])

(47124, 287)

In [177]:
#for model,x,y in zip(lst_model,lst_X_trains_split,lst_Y_trains_split):
 #   for i in range(cv):
  #      model[i].fit(x[i],y[i])

In [178]:
lst_predictions_all=[]
for model,x_all,x,y in zip(lst_model,lst_X,lst_X_trains_split,lst_Y_trains_split):
    lst_predictions=[]
    for i in range(cv):
        clf=model[i]
        clf.fit(x[i],y[i])
        lst_predictions.append(clf.predict(x_all))
    lst_predictions_all.append(lst_predictions)

In [179]:
#lst_predictions_all=[]
#for model,x in zip(lst_model,lst_X):
#    lst_predictions=[]
#    for i in range(cv):
#        lst_predictions.append(model[i].predict(x))
#    lst_predictions_all.append(lst_predictions)
        

In [180]:
np.shape(lst_predictions_all)

(6, 10, 52416)

In [181]:
weights_all=[]
for pred,y in zip(lst_predictions_all,lst_Y):
    weights=[]
    for i in range(cv):
        weights.append(0)
        
    def mae_func(weights):
        #final_prediction=0
        for i in range(len(weights)):
            if i==0:
                final_prediction = weights[i]*pred[i]
            else:
                final_prediction += weights[i]*pred[i]
        return mean_absolute_error(y, final_prediction)
    res = minimize(mae_func, weights, method='Nelder-Mead')
    weights_all.append(res['x'])
        
    

In [182]:
np.shape(weights_all)

(6, 10)

In [183]:
for i in range(6):
    print(weights_all[i])

[ 0.0276698   0.40474727  0.08374892  0.03634743  0.04754457  0.25196456
  0.17129103 -0.02859922  0.15363488 -0.12286536]
[ 0.40678342  0.02783575  0.15442511  0.08419277  0.03656284 -0.12358874
  0.25321849 -0.02872975  0.17209357  0.04777072]
[ 0.40827092  0.17262445 -0.12383225  0.02794989  0.15498584  0.25396152
  0.08449633  0.03662861  0.04799305 -0.02879317]
[ 0.40774193  0.17259683  0.08442932  0.25391487 -0.12389491  0.15484956
  0.04792071 -0.02880406  0.03658159  0.02787659]
[-0.12350761 -0.02882424  0.03653592  0.40723416  0.04785688  0.17236359
  0.25345559  0.08420649  0.15457112  0.02784825]
[ 0.02775796  0.17210608 -0.02867244  0.4071074   0.25321854  0.0365818
  0.08429822  0.15449387  0.04778374 -0.12350635]


In [184]:
to_drop_test = ['date','wd','forecast_time', 'forecast', "forecast_dist", 'wp']#+feature_corr
def make_prediction_dataset(test, to_drop=to_drop_test):
    test_to_predict = test.dropna(subset=['ws','u','v'], how = 'any') # keeps only lines with u,v,ws,wd
    test_to_predict = test_to_predict[test_to_predict['wp'].isna()] # keeps only lines with no wp
    test_to_predict = test_to_predict.sort_values(by=['date', 'forecast_time'], ascending = [True, False]).drop_duplicates(subset='date')
    test_to_predict = test_to_predict.drop(to_drop, axis = 1)
    return test_to_predict

In [185]:
lst_tests = []
for test in [test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6]:
    test = make_prediction_dataset(test)
    lst_tests.append(test)

In [186]:
#lst_prediction=[]
#for weights,model,test in zip(weights_all,lst_model,lst_tests):
#    for i in range(cv):
#        if i==0:
#            y_pred=model[i].predict(test)*weights[i]
#        else:
#            y_pred+=model[i].predict(test)*weights[i]
#    lst_prediction.append(y_pred)
        

In [187]:
lst_final_prediction=[]
for weights,model,test,x_train,y_train in zip(weights_all,lst_model,lst_tests,lst_X_trains_split,lst_Y_trains_split):
    for i in range(cv):
        clf=model[i]
        clf.fit(x_train[0],y_train[0])
        if i==0:
            y_pred=clf.predict(test)*weights[i]
        else:
            y_pred+=clf.predict(test)*weights[i]
    lst_final_prediction.append(y_pred)
        

In [188]:
df_predictions = pd.DataFrame({
        'date': test_dates,
        'wp1': lst_final_prediction[0],
        'wp2': lst_final_prediction[1],
        'wp3': lst_final_prediction[2],
        'wp4': lst_final_prediction[3],
        'wp5': lst_final_prediction[4],
        'wp6': lst_final_prediction[5],        
    })

In [189]:
nb_sub = 45
model = "lgm_10models"
prepro = 'MaxAbsScaler'
postpro = "Prediction limited by 0-1"

In [190]:
df_predictions.head()

Unnamed: 0,date,wp1,wp2,wp3,wp4,wp5,wp6
0,2011010101,0.675777,0.388518,0.110718,0.655784,0.738502,0.651295
1,2011010102,0.656002,0.398256,0.136802,0.569115,0.732579,0.675604
2,2011010103,0.654136,0.386581,0.126663,0.579,0.754484,0.69136
3,2011010104,0.652536,0.35469,0.175976,0.62547,0.772127,0.658792
4,2011010105,0.682563,0.340131,0.31266,0.636555,0.753277,0.645904


In [191]:
# df_predictions.to_csv('Predictions/submission_nb_10_full_maxabs-lgbm-featselect.csv', index=False, sep=';')
df_predictions.to_csv(f'Predictions/submission_nb_{nb_sub}_{model}.csv', index=False, sep=';')