# Exploratory Data Analysis

 - Wind forecast and wind power from __2009/07/01 to 2011/01/01__, the initial train phase
 - Wind forecast and wind power on 36 hours phases between each of the 157 test periods on which you can retrain you models

In [1]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os

pd.options.mode.chained_assignment = None  # default='warn'
np.random.seed(42)

In [2]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline, make_pipeline

In [3]:
from Functions.preprocessing import *
from Functions.helper_functions import * 

In [4]:
training_data_1 = pd.read_csv("Data/Initial/train.csv")
testing_data_1 = pd.read_csv("Data/Initial/test.csv")

wp_1_forecast = pd.read_csv("Data/Initial/wp1.csv")
wp_2_forecast = pd.read_csv("Data/Initial/wp2.csv")
wp_3_forecast = pd.read_csv("Data/Initial/wp3.csv")
wp_4_forecast = pd.read_csv("Data/Initial/wp4.csv")
wp_5_forecast = pd.read_csv("Data/Initial/wp5.csv")
wp_6_forecast = pd.read_csv("Data/Initial/wp6.csv")

In [5]:
# 36hrs before the real start date of the prediction
before_start_36h = '2010-12-30 13:00:00'
start_date = '2011-01-01 01:00:00'
start_forecastdate = '2011-01-01 00:00:00'
without_wp_date = '2010-12-30 12:00:00'
end_date = '2012-06-25 00:00:00'

In [6]:
# datetime(2011,1,1,1,0,0)+timedelta(hours=(36+48)*155)-timedelta(hours=36)

# Functions

In [7]:
def feature_importance(model, df, subset=None):
    importances = pd.DataFrame(model.feature_importances_, index=df.columns)
    importances.sort_values(by=0, inplace=True, ascending=False)
    if subset:
        importances = importances.reset_index()[importances.index.isin(subset)]
        importances = importances.set_index('index')
    a4_dims = (25,8)
    fig, ax = plt.subplots(figsize=a4_dims)
    importances.plot.bar(ax=ax)

    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()
    
    return importances

In [8]:
def show_evaluation(pred, true):
    print(f'RMSE score: {mean_squared_error(true.values, pred, squared=False)}')
    print(f'MAE score: {mean_absolute_error(true.values, pred)}')

# Date correction

In [9]:
training_data_1['date'] = training_data_1.date.apply(lambda x: integer2date(x))

In [10]:
testing_data_1['date'] = testing_data_1.date.apply(lambda x: integer2date(x))

# WP1 preprocessing

In [11]:
wp_preproc = FeaturesPreprocessing(training_data_1, without_wp_date, before_start_36h)
train, test = wp_preproc.transform(wp_1_forecast, 'wp1')
train.to_csv('Data/Preprocessing/WP1_train_preprocessed.csv', sep=',', index=False)
test.to_csv('Data/Preprocessing/WP1_test_preprocessed.csv', sep=',', index=False)

In [12]:
train = pd.read_csv('Data/Preprocessing/WP1_train_preprocessed.csv', sep=',')
test = pd.read_csv('Data/Preprocessing/WP1_test_preprocessed.csv', sep=',')

## Cleaning the technical maintenance period

In [13]:
# wp_to_predict = forecast_nb_to_predict(wp_1_forecast, testing_data_1, start_forecastdate)
# wp_1_forecast = wp_1_forecast[~((wp_1_forecast.wp <=0) & (wp_1_forecast.ws > 3.3)) & (~wp_1_forecast.forecast.isin(wp_to_predict))]

<!-- ## Separation into training and testing data -->

In [14]:
# wp_1_training = wp_1_forecast[(wp_1_forecast.forecast_time < without_wp_date)]
# wp_1_predict = wp_1_forecast[(wp_1_forecast.date >= before_start_36h)]

## Visualisation

In [15]:
# n_rows=5
# n_cols=4

# a4_dims = (20, 20)
# fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=a4_dims)

# for i, column in enumerate(wp_1_forecast.columns[:20]):
#     sns.histplot(wp_1_forecast[column],ax=axes[i//n_cols,i%n_cols])

In [16]:
# wp_1_test_data = wp_1_forecast[(wp_1_forecast.date >= start_date)].sort_values(by='date').head(4*48+4*36)
# wp_1_test_data.groupby([wp_1_test_data["date"].dt.year, wp_1_test_data["date"].dt.month, wp_1_test_data["date"].dt.day,  wp_1_test_data["date"].dt.hour]).count().plot(kind="bar", figsize=(40,5))

# Model testing

In [17]:
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]
train = train[[c for c in train if c not in ["wp"]] + ["wp"]]

In [18]:
wp_X = train.drop(to_drop, axis = 1)

## RF

In [19]:
# rf_reg = RandomForestRegressor()
# rf_reg.fit(X, y)
# wp_pred = rf_reg.predict(X)
# rf_rmse = np.sqrt(mean_squared_error(y, wp_pred))
# rf_rmse

In [20]:
# rf_reg_cv = make_pipeline(StandardScaler(), RandomForestRegressor())
# rf_nmse_scores = cross_val_score(rf_reg_cv, X, y, scoring='neg_mean_squared_error', cv=10)
# rf_rmse_scores = np.sqrt(-rf_nmse_scores)
# display_scores(rf_rmse_scores)

The score of the model on the training set is much lower than those of the validation sets == OVERFITTING ! Removing some of the feature thanks to the feature_importances_ will probably reduce the gap between the scores.

## LGBMs

BEFORE TUNING:</br>
--------------------------RMSE----------------------- </br>
Mean: 0.10335401115522524</br>
Std: 0.0018269668363797113</br>
--------------------------MAE------------------------</br>
Mean: 0.07294693686735004</br>
Std: 0.000991859760708556

AFTER TUNING:</br>
--------------------------RMSE------------------------</br>
Mean: 0.10335401115522524</br>
Std: 0.0018269668363797113</br>
--------------------------MAE------------------------</br>
Mean: 0.07294693686735004</br>
Std: 0.000991859760708556</br>

In [21]:
X_train, X_test, y_train, y_test = train_test_split(wp_X.drop('wp', axis = 1), wp_X['wp'], test_size=0.20, random_state=42)
X = X_train.append(X_test, sort=False)
y = y_train.append(y_test, sort=False)

In [22]:
def lgbm_cross_validation(X, y, params):
    if params == None:
        model = LGBMRegressor()
    else:
        model = LGBMRegressor(**params)

    print('-----------LGBM CROSS VALIDATION BEGINNING-----------')
    split = 10
    kf = KFold(n_splits=split, shuffle=True)       
    lgbm_rmse_scores = []
    lgbm_mae_scores = []
    i = 1
    for (train_index, test_index) in kf.split(pd.DataFrame(X), pd.DataFrame(y)):
        X_train, X_test = pd.DataFrame(X).iloc[train_index], pd.DataFrame(X).iloc[test_index]
        Y_train, Y_test = pd.DataFrame(y).iloc[train_index],pd.DataFrame(y).iloc[test_index]

        model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=100)

        prediction = model.predict(X_test)
        lgbm_rmse_scores.append(mean_squared_error(Y_test, prediction,squared=False))
        lgbm_mae_scores.append(mean_absolute_error(Y_test, prediction))
        
        print(show_evaluation(prediction, Y_test))
        print(f'-------------------FOLD {i}-----------------')
        i+=1

    print('---------------CROSS VALIDATION COMPLETE-------------')
    print('--------------------------RMSE-----------------------')
    display_scores(lgbm_rmse_scores)
    print('--------------------------MAE------------------------')
    display_scores(lgbm_mae_scores)

In [23]:
def objective(trial,data=X,target=y):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)

    param = {
        'metric': 'rmse', 
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
        'max_depth': trial.suggest_int("max_depth", 20, 100),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    
    model = LGBMRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    rmse =  mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [24]:
lgbm_cross_validation(X, y, None)

-----------LGBM CROSS VALIDATION BEGINNING-----------
[100]	valid_0's l2: 0.0105542
RMSE score: 0.1027337832371307
MAE score: 0.07285847370568564
None
-----------------------FOLD 1---------------------
[100]	valid_0's l2: 0.0106684
RMSE score: 0.10328794524554154
MAE score: 0.07310300492057431
None
-----------------------FOLD 2---------------------
[100]	valid_0's l2: 0.0106743
RMSE score: 0.1033164967104661
MAE score: 0.07308431358003745
None
-----------------------FOLD 3---------------------
[100]	valid_0's l2: 0.0103395
RMSE score: 0.10168330262192399
MAE score: 0.07332568219013001
None
-----------------------FOLD 4---------------------
[100]	valid_0's l2: 0.0099116
RMSE score: 0.09955704092593376
MAE score: 0.07082081105397789
None
-----------------------FOLD 5---------------------
[100]	valid_0's l2: 0.0105422
RMSE score: 0.10267499969955379
MAE score: 0.07212390267283442
None
-----------------------FOLD 6---------------------
[100]	valid_0's l2: 0.0107644
RMSE score: 0.1037514014

In [25]:
import optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
Best_trial = study.best_trial.params
study.trials_dataframe()

[32m[I 2021-08-23 09:39:05,368][0m A new study created in memory with name: no-name-5a748f63-19d8-454f-a85c-1b6dcf3666ec[0m
[32m[I 2021-08-23 09:39:11,521][0m Trial 0 finished with value: 0.09544425405424999 and parameters: {'reg_alpha': 0.9032192879755856, 'reg_lambda': 0.023264423731051656, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.048656732324904244, 'max_depth': 42, 'num_leaves': 469, 'min_child_samples': 154, 'min_data_per_groups': 23}. Best is trial 0 with value: 0.09544425405424999.[0m
[32m[I 2021-08-23 09:39:15,746][0m Trial 1 finished with value: 0.21062723301402927 and parameters: {'reg_alpha': 1.5185800455483844, 'reg_lambda': 2.563355372835644, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.005190374107590175, 'max_depth': 69, 'num_leaves': 182, 'min_child_samples': 263, 'min_data_per_groups': 60}. Best is trial 0 with value: 0.09544425405424999.[0m
[32m[I 2021-08-23 09:39:25,578][0m Trial 2 finished with value: 0.06929273068257

Number of finished trials: 50
Best trial: {'reg_alpha': 0.664265743859848, 'reg_lambda': 9.83047434398735, 'colsample_bytree': 1.0, 'subsample': 1.0, 'learning_rate': 0.24237997149103074, 'max_depth': 77, 'num_leaves': 389, 'min_child_samples': 2, 'min_data_per_groups': 75}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_samples,params_min_data_per_groups,params_num_leaves,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,0.095444,2021-08-23 09:39:05.368685,2021-08-23 09:39:11.521783,00:00:06.153098,0.8,0.048657,42,154,23,469,0.903219,0.023264,0.7,COMPLETE
1,1,0.210627,2021-08-23 09:39:11.521783,2021-08-23 09:39:15.746927,00:00:04.225144,0.9,0.00519,69,263,60,182,1.51858,2.563355,0.8,COMPLETE
2,2,0.069293,2021-08-23 09:39:15.746927,2021-08-23 09:39:25.577508,00:00:09.830581,0.3,0.11294,77,5,2,515,0.005075,0.025669,0.8,COMPLETE
3,3,0.068888,2021-08-23 09:39:25.578506,2021-08-23 09:39:37.361970,00:00:11.783464,0.6,0.16527,100,25,66,705,0.704785,8.501164,0.5,COMPLETE
4,4,0.124496,2021-08-23 09:39:37.363977,2021-08-23 09:39:41.588342,00:00:04.224365,0.3,0.018346,24,129,53,590,0.128317,0.001391,0.8,COMPLETE
5,5,0.118676,2021-08-23 09:39:41.588342,2021-08-23 09:39:44.694427,00:00:03.106085,0.5,0.026331,100,200,52,379,1.358036,1.069174,0.4,COMPLETE
6,6,0.122685,2021-08-23 09:39:44.694427,2021-08-23 09:39:48.548070,00:00:03.853643,0.5,0.022839,99,225,13,191,0.448742,2.007353,0.5,COMPLETE
7,7,0.263896,2021-08-23 09:39:48.549068,2021-08-23 09:39:52.458191,00:00:03.909123,0.5,0.001739,77,227,17,102,0.001126,0.002933,0.5,COMPLETE
8,8,0.156575,2021-08-23 09:39:52.458191,2021-08-23 09:39:57.608365,00:00:05.150174,0.7,0.011101,28,198,10,351,0.009366,6.128472,0.5,COMPLETE
9,9,0.073495,2021-08-23 09:39:57.608365,2021-08-23 09:40:12.874091,00:00:15.265726,0.4,0.056249,59,3,64,620,0.171244,1.877888,1.0,COMPLETE


In [28]:
# params_3 = {
#     'reg_alpha': 0.7047852908267428, 
#     'reg_lambda': 8.501163553334052, 
#     'colsample_bytree': 0.6, 
#     'subsample': 0.5, 
#     'learning_rate': 0.1652704612342063, 
#     'max_depth': 100, 'num_leaves': 705, 'min_child_samples': 25, 
#     'min_data_per_groups': 66
# }

In [27]:
params = {
    'reg_alpha': 0.664265743859848,
    'reg_lambda': 9.83047434398735,
    'colsample_bytree': 1.0,
    'subsample': 1.0,
    'learning_rate': 0.24237997149103074,
    'max_depth': 77,
    'num_leaves': 389,
    'min_child_samples': 2,
    'min_data_per_groups': 75
}

lgbm_cross_validation(X, y, params)

-----------LGBM CROSS VALIDATION BEGINNING-----------
[100]	valid_0's l2: 0.00497808
RMSE score: 0.07055548799734954
MAE score: 0.04634581387801592
None
-----------------------FOLD 1---------------------
[100]	valid_0's l2: 0.00475082
RMSE score: 0.06892615813699253
MAE score: 0.04441365688959473
None
-----------------------FOLD 2---------------------
[100]	valid_0's l2: 0.00465164
RMSE score: 0.06820293783972789
MAE score: 0.044789601412267815
None
-----------------------FOLD 3---------------------
[100]	valid_0's l2: 0.00470489
RMSE score: 0.06859221873541076
MAE score: 0.04484449341732311
None
-----------------------FOLD 4---------------------
[100]	valid_0's l2: 0.00463465
RMSE score: 0.06807824945906427
MAE score: 0.044879790363565104
None
-----------------------FOLD 5---------------------
[100]	valid_0's l2: 0.00452369
RMSE score: 0.06725837870970279
MAE score: 0.044616574221668696
None
-----------------------FOLD 6---------------------
[100]	valid_0's l2: 0.00481751
RMSE score: 

## XGBMs optimisation:

No hyperparametrization:
- With all features : 0.0795813815391506 / Mean: 0.16347992729285327 and Std: 0.028571836417637404
- With 50 features from importances : 0.08672582833757675 / Mean: 0.15735171469783743 Std: 0.02123509496831613
- With 60 features from importances : 0.09199791310830104 / Mean: 0.16637648056438986 Std: 0.058488702716156395
- Using the train_test_split I got way better results..

- Without parameters: Scores: Mean: 0.08902062886284395 / Std: 0.0016141651737562462

In [None]:
def xgboost_cross_validation(X, y, params):

    model = XGBRegressor(**params)
        
        print('---------------XGBOOST CROSS VALIDATION BEGINNING')
        split = 10
        kf = KFold(n_splits=split, shuffle=True)       
        xgbm_rmse_scores = []

        i = 1
        for (train_index, test_index) in kf.split(pd.DataFrame(X), pd.DataFrame(y)):
            X_train, X_test = pd.DataFrame(X).iloc[train_index], pd.DataFrame(X).iloc[test_index]
            Y_train, Y_test = pd.DataFrame(y).iloc[train_index],pd.DataFrame(y).iloc[test_index]

            model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=100)

            prediction = model.predict(X_test)
            xgbm_rmse_scores.append(mean_squared_error(Y_test, prediction,squared=False))

            print(show_evaluation(prediction, Y_test))
            print(f'-----------------------FOLD {i}---------------------')
            i+=1
            
        print('---------------CROSS VALIDATION COMPLETE')
        display_scores(xgbm_rmse_score)
        
        return test, train

In [None]:
def objective(trial,data=X,target=y):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
    param = {
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 1e-8, 1),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 700),
        'max_depth': trial.suggest_int("max_depth", 20, 70),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'eta' : trial.suggest_loguniform("eta", 1e-8, 1.0),
        'gamma' : trial.suggest_loguniform("gamma", 1e-8, 1.0),
        'grow_policy' : trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    }
    model = XGBRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    rmse =  mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [None]:
# to_drop = ['v_T_3_median', 'u2_T_2_median', 'sin_day', 'coswd_2_median','v_T_2_median', 'ws_T_2_median', 'u2_T_3_median']
# wp_X = wp_X.drop(to_drop, axis = 1)
X_train, X_test, y_train, y_test = train_test_split(wp_X.drop('wp', axis = 1), wp_X['wp'], test_size=0.20, random_state=42)
X = X_train.append(X_test, sort=False)
y = y_train.append(y_test)

In [None]:
# subset = [x for x in wp_X.columns if 'wd' in x]
# subset = [x for x in wp_X.columns if 'v' in x]
# subset = [x for x in wp_X.columns if 'ws' in x]
# subset = [x for x in wp_X.columns if ('u' not in x)&('v' not in x)&('ws' not in x)]
# subset = None
# importances_xgbm = feature_importance(xgbm_reg, wp_X.drop("wp", axis = 1), subset)

In [None]:
import optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
Best_trial = study.best_trial.params
study.trials_dataframe()

In [None]:
params = {'lambda': 0.07800673048957292,
         'alpha': 0.6748382106850264,
         'colsample_bytree': 0.2890454510041761,
         'subsample': 1.0,
         'learning_rate': 0.0948313824862934,
         'n_estimators': 480,
         'max_depth': 64,
         'min_child_weight': 42,
         'eta': 0.000934499310942119,
         'gamma': 0.0069730010099778115,
         'grow_policy': 'depthwise'}

In [None]:
xgboost_cross_validation(X, y, params)

# Feature importances

In [None]:
# importances_lgbm_sum = feature_importance(lgbm_reg, wp_X.drop("wp", axis = 1)).sum(0)

In [None]:
# subset = [x for x in wp_X.columns if 'wd' in x]
# subset = [x for x in wp_X.columns if 'v' in x]
# subset = [x for x in wp_X.columns if 'ws' in x]
# subset = [x for x in wp_X.columns if ('u' not in x)&('v' not in x)&('ws' not in x)]
# subset = None
# importances_rf = feature_importance(rf_reg, wp_X.drop("wp", axis = 1), subset)
# importances_lgbm = feature_importance(lgbm_reg, wp_X.drop("wp", axis = 1), subset)/importances_lgbm_sum
# importances_xgbm = feature_importance(xgbm_reg, wp_X.drop("wp", axis = 1), subset)

In [None]:
# importances_lgbm.columns = ['LGBM']
# importances_rf.columns = ['RF']
# importances_xgbm.columns = ['XGBM']

# all_models = pd.concat([importances_lgbm, importances_rf, importances_xgbm], axis = 1)

# write_results('Data/Feature_importances_test2.xlsx','all', all_models)

## Correlation : 

- cross plot wp, ws 

In [None]:
# to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]
# wp_1_training = wp_1_training[[c for c in wp_1_training if c not in ["wp"]] + ["wp"]]
# wp_X = wp_1_training.drop(to_drop, axis = 1)

In [None]:
# subset = [x for x in wp_X.columns if 'u' in x]
# corr = wp_X[subset].corr()
# mask = np.zeros_like(corr, dtype=bool)
# mask[np.triu_indices_from(mask)] = True
# corr[mask] = np.nan
# (corr
#  .style
#  .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
#  .highlight_null(null_color='#f1f1f1')  # Color NaNs grey
#  .set_precision(2))

In [None]:
# write_results('Data/Feature_importances_xgboost_correlation.xlsx','coswd', corr)