# Exploratory Data Analysis

 - Wind forecast and wind power from __2009/07/01 to 2011/01/01__, the initial train phase
 - Wind forecast and wind power for the 36-hour phases between each of the 157 test periods, on which you can retrain your models

In [2]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# Seed NumPy's global random generator so NumPy-driven randomness repeats between runs.
np.random.seed(42)

In [3]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline, make_pipeline

In [4]:
from Functions.preprocessing import *
from Functions.helper_functions import * 

In [5]:
# Read the initial train and test CSV files from Data/Initial.
training_data_1 = pd.read_csv("Data/Initial/train.csv")
testing_data_1 = pd.read_csv("Data/Initial/test.csv")

# wp_1_forecast = pd.read_csv("Data/Initial/wp1.csv")
# wp_2_forecast = pd.read_csv("Data/Initial/wp2.csv")
# wp_3_forecast = pd.read_csv("Data/Initial/wp3.csv")
# wp_4_forecast = pd.read_csv("Data/Initial/wp4.csv")
# wp_5_forecast = pd.read_csv("Data/Initial/wp5.csv")
# wp_6_forecast = pd.read_csv("Data/Initial/wp6.csv")

In [6]:
# Date boundaries (kept as strings); in this notebook they are only referenced
# by the commented-out preprocessing / separation cells.
# 36hrs before the real start date of the prediction
before_start_36h = '2010-12-30 13:00:00'
start_date = '2011-01-01 01:00:00'  # exactly 36 hours after before_start_36h
start_forecastdate = '2011-01-01 00:00:00'  # one hour before start_date
without_wp_date = '2010-12-30 12:00:00'  # one hour before before_start_36h
end_date = '2012-06-25 00:00:00'  # NOTE(review): presumably the end of the test horizon -- confirm

In [7]:
# datetime(2011,1,1,1,0,0)+timedelta(hours=(36+48)*155)-timedelta(hours=36)

In [8]:
# Read the preprocessed training CSV of each of the six WP1..WP6 series.
train_wp1 = pd.read_csv('Data/Preprocessing/WP1_train_preprocessed.csv', sep=',')
train_wp2 = pd.read_csv('Data/Preprocessing/WP2_train_preprocessed.csv', sep=',')
train_wp3 = pd.read_csv('Data/Preprocessing/WP3_train_preprocessed.csv', sep=',')
train_wp4 = pd.read_csv('Data/Preprocessing/WP4_train_preprocessed.csv', sep=',')
train_wp5 = pd.read_csv('Data/Preprocessing/WP5_train_preprocessed.csv', sep=',')
train_wp6 = pd.read_csv('Data/Preprocessing/WP6_train_preprocessed.csv', sep=',')

In [51]:
# Scratch check: number of complete 84-row batches in train_wp1 once the first 12 rows are skipped.
int((len(train_wp1)-12)/84)

624

In [52]:
# Scratch check: int() truncates the fractional part instead of rounding.
int(9.99)

9

In [59]:
def batch_train_test_forecast(df_wp, shift, nb_index=84, n_train=36, n_test=48):
    """Cut ``df_wp`` into consecutive batches and split each into a head and a tail.

    Starting ``shift`` rows into the frame, rows are grouped into back-to-back
    batches of ``nb_index`` rows. The first ``n_train`` rows of every batch go
    to the training frame and the last ``n_test`` rows to the test frame.
    Trailing rows that do not fill a complete batch are ignored.

    The number of batches is derived from ``df_wp`` itself (it previously read
    the length of the notebook-level ``train_wp1`` frame, which gave a wrong
    batch count for any other input).

    Parameters
    ----------
    df_wp : pandas.DataFrame or pandas.Series
        Rows to split, already in the order in which they should be batched.
    shift : int
        Number of leading rows skipped before the first batch.
    nb_index : int, default 84
        Number of rows per batch.
    n_train : int, default 36
        Rows taken from the start of each batch for the training frame.
    n_test : int, default 48
        Rows taken from the end of each batch for the test frame.

    Returns
    -------
    tuple
        ``(train_wp, test_wp)``; original index labels are preserved. When no
        complete batch fits, both are empty objects with the structure of
        ``df_wp``.
    """
    # Floor division can go negative when shift exceeds the frame length.
    nb_batch = max((len(df_wp) - shift) // nb_index, 0)

    train_parts, test_parts = [], []
    for i in range(nb_batch):
        start = shift + nb_index * i
        # Positional slicing: the batch count above is positional as well, so
        # this stays consistent even if the index is not 0..n-1.
        batch = df_wp.iloc[start:start + nb_index]
        train_parts.append(batch.head(n_train))
        test_parts.append(batch.tail(n_test))

    if not train_parts:
        empty = df_wp.iloc[0:0]
        return empty.copy(), empty.copy()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(train_parts), pd.concat(test_parts)

In [67]:
def splitting_train_test_forecast(df_wp):
    """Apply ``batch_train_test_forecast`` at eight row offsets (0, 12, ..., 84).

    Returns
    -------
    tuple of lists
        ``(train, test)``: two lists of eight frames each, ordered by
        increasing offset.
    """
    train, test = [], []
    # Offsets 0, 12, 24, 36, 48, 60, 72 and 84.
    for shift in range(0, 85, 12):
        train_part, test_part = batch_train_test_forecast(df_wp, shift)
        train.append(train_part)
        test.append(test_part)
    return train, test

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
from sklearn.model_selection import train_test_split

# Stacking of one model
class StackingSL_1Model(BaseEstimator, RegressorMixin, TransformerMixin):
    """Blend several fits of one base model with a meta-model.

    ``fit`` splits the data into "train" and "forecast" blocks with
    ``splitting_train_test_forecast``, fits ``n_folds`` clones of
    ``base_model`` on K-fold subsets of the train blocks, and trains a clone of
    ``meta_model`` on the clones' predictions for the forecast blocks.

    Parameters
    ----------
    base_model : estimator
        Regressor cloned once per fold.
    meta_model : estimator
        Regressor trained on the base-model predictions.
    n_folds : int, default 5
        Number of K-fold splits, i.e. number of fitted base models.
    verbose : bool, default True
        Print a header and the forecast-block RMSE for every fold.
    """

    def __init__(self, base_model, meta_model, n_folds=5, verbose=True):
        self.base_model = base_model
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.verbose = verbose

    @staticmethod
    def _stack(parts, like):
        """Concatenate per-offset frames and restore the column dtypes of ``like``."""
        return pd.concat(parts, ignore_index=True).astype(like.dtypes.to_dict())

    # We again fit the data on clones of the original models
    def fit(self, X, y):
        """Fit the base-model clones and the meta-model.

        ``X`` and ``y`` must be row-aligned and in chronological order; ``y``
        is expected to hold a single target column.
        """
        self.base_models_ = []
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Same positional index on both so the block splitter cuts X and y at
        # identical rows (it also needs DataFrames, not Series/arrays).
        X_frame = pd.DataFrame(X).reset_index(drop=True)
        y_frame = pd.DataFrame(y).reset_index(drop=True)

        # The splitter returns one frame per offset; pool them into single
        # train / forecast sets.
        # NOTE(review): blocks produced at different offsets overlap, so the
        # same row can land in both pooled sets -- confirm this is acceptable
        # or restrict the split to a single offset.
        X_train_parts, X_test_parts = splitting_train_test_forecast(X_frame)
        y_train_parts, y_test_parts = splitting_train_test_forecast(y_frame)
        X_train = self._stack(X_train_parts, X_frame)
        X_test = self._stack(X_test_parts, X_frame)
        y_train = self._stack(y_train_parts, y_frame).to_numpy().ravel()
        y_test = self._stack(y_test_parts, y_frame).to_numpy().ravel()

        # One column of forecast-block predictions per fold model; these are
        # the features the meta-model is trained on.
        meta_features = np.zeros((len(X_test), self.n_folds))
        for i, (train_index, _) in enumerate(kfold.split(X_train, y_train)):
            if self.verbose:
                print(f"--------------Model {i}--------------")
            instance = clone(self.base_model)
            instance.fit(X_train.iloc[train_index], y_train[train_index])
            self.base_models_.append(instance)

            # Predict once and reuse the result for both the log and the
            # meta-feature column.
            y_pred = instance.predict(X_test)
            if self.verbose:
                # sqrt(MSE) instead of squared=False, which newer scikit-learn
                # releases no longer accept.
                print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
            meta_features[:, i] = y_pred

        # Now train the cloned meta-model using the stacked predictions as features
        self.meta_model_.fit(meta_features, y_test)
        return self

    def predict(self, X):
        """Predict with every fitted base model and feed the stacked
        predictions (one column per model) to the meta-model."""
        meta_features = np.column_stack(
            [base_model.predict(X) for base_model in self.base_models_]
        )
        return self.meta_model_.predict(meta_features)

# Functions

In [101]:
def feature_importance(model, df, subset=None):
    """Plot a fitted model's feature importances as a bar chart and return them.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``.
    df : pandas.DataFrame
        Frame whose columns name the features, in the order the model saw them.
    subset : list of str, optional
        When given and non-empty, only these features are kept.

    Returns
    -------
    pandas.DataFrame
        One column (named ``0``) of importances, sorted in descending order.
        When ``subset`` is applied the index is named ``'index'``.
    """
    ranked = pd.DataFrame(model.feature_importances_, index=df.columns)
    importances = ranked.sort_values(by=0, ascending=False)

    if subset:
        # Boolean mask in sorted row order; applied after reset_index so the
        # feature names become an 'index' column that is then restored.
        keep = importances.index.isin(subset)
        importances = importances.reset_index().loc[keep].set_index('index')

    fig, ax = plt.subplots(figsize=(25, 8))
    importances.plot.bar(ax=ax)

    # The title and label are fixed text, whatever kind of importance the model reports.
    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()

    return importances

In [102]:
def show_evaluation(pred, true):
    """Print the RMSE and MAE of ``pred`` measured against ``true``.

    Parameters
    ----------
    pred : array-like
        Predicted values.
    true : array-like
        Ground-truth values (a pandas Series, NumPy array or list).

    Notes
    -----
    RMSE is computed as ``sqrt(MSE)`` rather than with ``squared=False``,
    which newer scikit-learn releases no longer accept. For a single target
    the two are identical; for 2-D targets this is the root of the averaged
    MSE rather than the average of per-output RMSEs.
    """
    y_true = np.asarray(true)
    rmse = np.sqrt(mean_squared_error(y_true, pred))
    print(f'RMSE score: {rmse}')
    print(f'MAE score: {mean_absolute_error(y_true, pred)}')

# Date correction

In [103]:
# Convert every value of the `date` column with integer2date (not defined in
# this notebook; presumably provided by the Functions star imports -- confirm).
# NOTE(review): not idempotent -- re-running this cell passes already-converted values to integer2date again.
training_data_1['date'] = training_data_1.date.apply(lambda x: integer2date(x))

In [104]:
# Same integer2date conversion for the test frame, then save the converted
# frame as the submission-dates CSV.
# NOTE(review): not idempotent -- re-running converts the dates a second time and overwrites the file.
testing_data_1['date'] = testing_data_1.date.apply(lambda x: integer2date(x))
testing_data_1.to_csv('Data/Preprocessing/submission_dates.csv', sep=',', index=False)

# WP1 preprocessing

In [105]:
# wp_preproc = FeaturesPreprocessing(training_data_1, without_wp_date, before_start_36h)
# train, test = wp_preproc.transform(wp_1_forecast, 'wp1')
# train.to_csv('Data/Preprocessing/WP1_train_preprocessed.csv', sep=',', index=False)
# test.to_csv('Data/Preprocessing/WP1_test_preprocessed.csv', sep=',', index=False)

In [106]:
# Read the preprocessed WP1 train and test CSVs; `train` is the frame the
# modelling cells below build their feature matrix from.
train = pd.read_csv('Data/Preprocessing/WP1_train_preprocessed.csv', sep=',')
test = pd.read_csv('Data/Preprocessing/WP1_test_preprocessed.csv', sep=',')

## Cleaning the technical maintenance period

In [107]:
# wp_to_predict = forecast_nb_to_predict(wp_1_forecast, testing_data_1, start_forecastdate)
# wp_1_forecast = wp_1_forecast[~((wp_1_forecast.wp <=0) & (wp_1_forecast.ws > 3.3)) & (~wp_1_forecast.forecast.isin(wp_to_predict))]

<!-- ## Separation into training and testing data -->

In [108]:
# wp_1_training = wp_1_forecast[(wp_1_forecast.forecast_time < without_wp_date)]
# wp_1_predict = wp_1_forecast[(wp_1_forecast.date >= before_start_36h)]

## Visualisation

In [109]:
# n_rows=5
# n_cols=4

# a4_dims = (20, 20)
# fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=a4_dims)

# for i, column in enumerate(wp_1_forecast.columns[:20]):
#     sns.histplot(wp_1_forecast[column],ax=axes[i//n_cols,i%n_cols])

In [110]:
# wp_1_test_data = wp_1_forecast[(wp_1_forecast.date >= start_date)].sort_values(by='date').head(4*48+4*36)
# wp_1_test_data.groupby([wp_1_test_data["date"].dt.year, wp_1_test_data["date"].dt.month, wp_1_test_data["date"].dt.day,  wp_1_test_data["date"].dt.hour]).count().plot(kind="bar", figsize=(40,5))

# Model testing

In [276]:
# u_to_drop = [
#     'u_T_1', 'u_T_2', 'u_T_3', 'u_T_4', 'u_T_5', 'u_T_6', 
#     'u_T_2_mean', 'u_T_3_mean', 'u_T_4_mean', 'u_T_5_mean', 'u_T_6_mean', 'u_T_7_mean',
#     'u_T_8_mean', 'u_T_9_mean', 'u_T_10_mean', 'u_T_11_mean', 'u_T_12_mean','u_T_24_mean',
#     'u_T_2_std', 'u_T_4_std', 'u_T_5_std', 'u_T_6_std',
#     'u_T_2_median', 'u_T_3_median', 'u_T_4_median', 'u_T_5_median', 'u_T_6_median', 'u_T_12_median','u_T_24_median', 'u_T_36_median',
#     'u_T_2_max', 'u_T_3_max', 'u_T_4_max', 'u_T_5_max', 'u_T_6_max', 'u_T_12_max',
#     'u_T_2_min', 'u_T_3_min', 'u_T_4_min', 'u_T_5_min', 'u_T_6_min', 'u_T_12_min',
#     'u2_T_1', 'u2_T_2', 'u2_T_3', 'u2_T_4', 'u2_T_5', 'u2_T_6', 
#     'u2_T_2_mean', 'u2_T_3_mean', 'u2_T_4_mean', 'u2_T_5_mean', 'u2_T_6_mean', 'u2_T_7_mean',
#     'u2_T_8_mean', 'u2_T_9_mean', 'u2_T_10_mean', 'u2_T_11_mean', 'u2_T_12_mean','u2_T_24_mean',
#     'u2_T_2_std', 'u2_T_4_std', 'u2_T_5_std', 'u2_T_6_std', 'u2_T_24_std',
#     'u2_T_2_median', 'u2_T_3_median', 'u2_T_4_median', 'u2_T_5_median', 'u2_T_6_median', 'u2_T_12_median',
#     'u2_T_2_max','u2_T_3_max', 'u2_T_4_max','u2_T_5_max', 'u2_T_6_max', 'u2_T_12_max',
#     'u2_T_2_min', 'u2_T_3_min', 'u2_T_4_min', 'u2_T_5_min', 'u2_T_6_min',
#     'u2_T_12', 'u2_T_36_mean', 'u2_T_36_std', 'u2_T_24_median', 'u2_T_24_max',
#     'u_T_36_mean','u_T_12','u_T_24_max','u2_T_36_median','u_T_24_min'
# ]
# ws_to_drop = [
#     'ws_T_1', 'ws_T_2', 'ws_T_3', 'ws_T_4', 'ws_T_5', 'ws_T_6', 'ws_T_7', 'ws_T_8', 'ws_T_10','ws_T_11', 'ws_T_12',
#     'ws_T_2_mean', 'ws_T_3_mean', 'ws_T_4_mean', 'ws_T_5_mean', 'ws_T_6_mean', 'ws_T_7_mean', 'ws_T_8_mean', 'ws_T_9_mean', 
#     'ws_T_10_mean', 'ws_T_11_mean', 'ws_T_12_mean', 'ws_T_24_mean', 
#     'ws_T_2_std', 'ws_T_3_std', 'ws_T_4_std', 'ws_T_5_std', 
#     'ws_T_2_median', 'ws_T_3_median', 'ws_T_4_median', 'ws_T_5_median', 'ws_T_6_median',
#     'ws_T_12_median', 'ws_T_24_median', 'ws_T_36_median',
#     'ws_T_2_max', 'ws_T_3_max', 'ws_T_4_max', 'ws_T_5_max','ws_T_6_max', 'ws_T_12_max',
#      'ws_T_2_min', 'ws_T_3_min', 'ws_T_4_min', 'ws_T_5_min', 'ws_T_6_min', 'ws_T_12_min','ws_T_24_max','ws_T_24_min'
# ]

# v_to_drop = [
#     'v_T_1', 'v_T_2', 'v_T_3', 'v_T_4', 'v_T_5', 'v_T_6', 
#     'v_T_2_mean', 'v_T_3_mean', 'v_T_4_mean', 'v_T_5_mean', 'v_T_6_mean', 'v_T_7_mean',
#     'v_T_8_mean', 'v_T_9_mean', 'v_T_10_mean', 'v_T_11_mean', 'v_T_12_mean', 'v_T_24_mean','v_T_36_mean',
#     'v_T_3_std', 'v_T_4_std', 'v_T_5_std','v_T_6_std','v_T_24_std', 'v_T_36_median',
#     'v_T_2_median', 'v_T_3_median', 'v_T_4_median', 'v_T_5_median', 'v_T_6_median', 
#     'v_T_2_max', 'v_T_3_max', 'v_T_4_max', 'v_T_5_max', 'v_T_6_max', 'v_T_12_max', 
#     'v_T_2_min', 'v_T_3_min', 'v_T_4_min', 'v_T_5_min', 'v_T_6_min', 'v_T_12_min', 
#     'v_T_36_min', 'v_T_36', 'v_T_24_max',  'v_T_12_median', 'v_T_24_median',
# ]

# wd_to_drop = [
#     'coswd_1', 'coswd_2', 'coswd_3', 'coswd_4', 'coswd_5', 'coswd_6',
#     'coswd_2_mean', 'coswd_3_mean', 'coswd_4_mean', 'coswd_5_mean', 'coswd_6_mean', 'coswd_7_mean', 
#     'coswd_8_mean', 'coswd_9_mean', 'coswd_10_mean', 'coswd_11_mean', 'coswd_12_mean', 'coswd_24_mean', 
#     'coswd_3_std', 'coswd_4_std','coswd_5_std','coswd_2_median', 'coswd_3_median','coswd_4_median', 
#     'coswd_5_median', 'coswd_6_median', 'coswd_36_median', 'coswd_24_median', 'coswd_12_median',
#     'coswd_2_max', 'coswd_3_max', 'coswd_4_max', 'coswd_5_max', 'coswd_6_max', 'coswd_12_max', 'coswd_24_max',
#     'coswd_2_min', 'coswd_3_min', 'coswd_4_min', 'coswd_5_min', 'coswd_6_min', 'coswd_12_min', 'coswd_24_min',
#     'ws_T_36_max', 'ws_T_36_min', 'coswd_12', 'coswd_24'
# ]

# other_to_drop = [
#     'cos_day', 'u', 'v'
# ]

# feature_corr = u_to_drop+ws_to_drop+v_to_drop+wd_to_drop+other_to_drop

In [280]:
# Columns left out of the model matrix.
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]
# Reorder so that the target column `wp` comes last.
train = train[[c for c in train if c not in ["wp"]] + ["wp"]]
# Drop the excluded columns exactly once: repeating the drop on wp_X raised
# KeyError because those labels were already gone.
wp_X = train.drop(to_drop, axis = 1)

In [283]:
# params_1 = {
#     'reg_alpha': 0.8314449043001416,
#     'reg_lambda': 9.093012403173608,
#     'colsample_bytree': 0.9,
#     'subsample': 0.4,
#     'learning_rate': 0.2033256175102991,
#     'max_depth': 55,
#     'num_leaves': 964,
#     'min_child_samples': 25,
#     'min_data_per_groups': 36
# }

# lgbm_reg = LGBMRegressor(**params_1)
# lgbm_reg.fit(wp_X.drop("wp", axis = 1), wp_X['wp'])
# importances_lgbm_sum = feature_importance(lgbm_reg, wp_X.drop("wp", axis = 1))
# importances_lgbm_sum = importances_lgbm_sum[0]/importances_lgbm_sum[0].sum()

In [284]:
# params_1 = {
#     'lambda': 0.3643806022565838,
#     'alpha': 0.003650309466012506,
#     'colsample_bytree': 0.9640425007241273,
#     'subsample': 0.8,
#     'learning_rate': 0.052762727588106954,
#     'n_estimators': 700,
#     'max_depth': 54,
#     'min_child_weight': 96,
#     'eta': 3.119364108002744e-05,
#     'gamma': 5.177778739056542e-05,
#     'grow_policy': 'lossguide'
# }

# xgb_reg = XGBRegressor(**params_1)
# xgb_reg.fit(wp_X.drop("wp", axis = 1), wp_X['wp'])
# importances_xgb_sum = feature_importance(xgb_reg, wp_X.drop("wp", axis = 1))
# importances_xgb_sum = importances_xgb_sum[0]/importances_xgb_sum[0].sum()

# Ridge

In [294]:
from sklearn.linear_model import RidgeCV, Ridge 
from sklearn.svm import LinearSVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures

In [20]:
# Columns left out of the model matrix (same list as in the other modelling cells).
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

In [21]:
# Model matrix: `train` with the target `wp` moved to the last column and the
# `to_drop` columns removed.
wp_X = train[[c for c in train if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
# Random 80/20 split of features vs. target.
# NOTE(review): the split is random, not chronological -- confirm that is intended for this forecasting data.
X_train, X_test, y_train, y_test = train_test_split(wp_X.drop('wp', axis = 1), wp_X['wp'], test_size=0.20, random_state=42)

In [22]:
# poly_ridge = Pipeline([
#     ('scaler', MinMaxScaler()),
#     ('ridge', RidgeCV(alphas = [1e-3, 1e-2, 1e-1,0.2, 0.5, 0.7, 1]))])
# poly_ridge.fit(X_train, y_train)

In [23]:
# poly_ridge  =RidgeCV(alphas = [1e-3, 1e-2, 1e-1,0.2, 0.5, 0.7, 1])
# poly_ridge.fit(X_train, y_train)

In [24]:
# poly_ridge.best_score_

In [25]:
# wp_pred = poly_ridge.predict(X_test)

# ridge_rmse = np.sqrt(mean_squared_error(y_test, wp_pred))
# print(ridge_rmse)
# wp_pred = [0 if i < 0 else i for i in wp_pred]
# wp_pred = [y_test.max() if i > y_test.max() else i for i in wp_pred]

# ridge_rmse = np.sqrt(mean_squared_error(y_test, wp_pred))
# ridge_rmse

# Stacking

In [297]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV, Ridge 
from sklearn.svm import LinearSVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split

In [298]:
# Hyper-parameters unpacked into LGBMRegressor by the stacking cells.
# NOTE(review): the keys are the same XGBoost-style names used in params_xbg
# below ('lambda', 'alpha', 'eta', 'gamma', 'grow_policy', 'min_child_weight').
# Check each against the LightGBM parameter list to confirm it is recognised
# and means what is intended, in particular 'eta' next to 'learning_rate'.
params_lgbm = {
    'lambda': 2.1359622347936646,
    'alpha': 0.016202766042783825,
    'colsample_bytree': 0.8075360516891219,
    'subsample': 0.8,
    'learning_rate': 0.06792370224097045,
    'n_estimators': 320,
    'max_depth': 58,
    'min_child_weight': 102,
    'eta': 6.934521001624072e-05,
    'gamma': 4.369012735807193e-06,
    'grow_policy': 'lossguide'
}

# Hyper-parameters unpacked into XGBRegressor by the stacking cells; the values
# match the commented-out XGBoost feature-importance cell earlier in the notebook.
# NOTE(review): the name looks like a typo for `params_xgb`; the stacking cells
# use this spelling, so any rename has to be applied everywhere at once.
# NOTE(review): both 'learning_rate' and 'eta' are set -- confirm which one takes effect.
params_xbg = {
    'lambda': 0.3643806022565838,
    'alpha': 0.003650309466012506,
    'colsample_bytree': 0.9640425007241273,
    'subsample': 0.8,
    'learning_rate': 0.052762727588106954,
    'n_estimators': 700,
    'max_depth': 54,
    'min_child_weight': 96,
    'eta': 3.119364108002744e-05,
    'gamma': 5.177778739056542e-05,
    'grow_policy': 'lossguide'
}

In [304]:
# Model matrix: `train` with the target `wp` last and the `to_drop` columns removed.
wp_X = train[[c for c in train if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)

# Four base learners; only the two boosted-tree models are wrapped in a
# MaxAbsScaler pipeline, the two linear models receive the raw features.
estimators = [
    ('lr', RidgeCV()),
    ('svr', LinearSVR(random_state=42)),
    ('lgbm', Pipeline([('scaler', MaxAbsScaler()),('lgbm', LGBMRegressor(**params_lgbm))])),
    ('xgb', Pipeline([('scaler', MaxAbsScaler()),('xgb', XGBRegressor(**params_xbg))]))
]

# The base-learner predictions are combined by a small random forest.
reg = StackingRegressor(
     estimators=estimators,
     final_estimator=RandomForestRegressor(n_estimators=10,
                                           random_state=42)
)

# Random split; no test_size is given, so scikit-learn's default ratio applies.
# NOTE(review): the split is random, not chronological -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(wp_X.drop('wp', axis = 1), wp_X['wp'], random_state=42)

# Fit on the training part and display the held-out score.
reg.fit(X_train, y_train).score(X_test, y_test)

0.9382945624150366

In [305]:
# Predictions of the fitted stacking regressor on the held-out split.
y_pred = reg.predict(X_test)

In [309]:
# Held-out RMSE, computed as sqrt(MSE): the `squared=False` keyword is no
# longer accepted by newer scikit-learn releases.
np.sqrt(mean_squared_error(y_test, y_pred))

0.07495841149535656

In [None]:
# Variant of the stacking cell above: a single MaxAbsScaler is applied in front
# of the whole StackingRegressor, so all four base learners see scaled features.
wp_X = train[[c for c in train if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)

estimators = [
    ('lr', RidgeCV()),
    ('svr', LinearSVR(random_state=42)),
    ('lgbm', LGBMRegressor(**params_lgbm)),
    ('xgb', XGBRegressor(**params_xbg)),
]
# Pipeline: scaler -> stacking (random-forest final estimator, as above).
reg = Pipeline([('scaler', MaxAbsScaler()),('stacking',StackingRegressor(estimators=estimators,final_estimator=RandomForestRegressor(n_estimators=10,random_state=42)))])

# Same random split as the previous stacking cell (same random_state).
X_train, X_test, y_train, y_test = train_test_split(wp_X.drop('wp', axis = 1), wp_X['wp'], random_state=42)

# Fit on the training part and display the held-out score.
reg.fit(X_train, y_train).score(X_test, y_test)

# Feature importances

In [None]:
# importances_lgbm_sum = feature_importance(lgbm_reg, wp_X.drop("wp", axis = 1)).sum(0)

In [None]:
# subset = [x for x in wp_X.columns if 'wd' in x]
# subset = [x for x in wp_X.columns if 'v' in x]
# subset = [x for x in wp_X.columns if 'ws' in x]
# subset = [x for x in wp_X.columns if ('u' not in x)&('v' not in x)&('ws' not in x)]
# subset = None
# importances_rf = feature_importance(rf_reg, wp_X.drop("wp", axis = 1), subset)
# importances_lgbm = feature_importance(lgbm_reg, wp_X.drop("wp", axis = 1), subset)/importances_lgbm_sum
# importances_xgbm = feature_importance(xgbm_reg, wp_X.drop("wp", axis = 1), subset)

In [None]:
# importances_lgbm.columns = ['LGBM']
# importances_rf.columns = ['RF']
# importances_xgbm.columns = ['XGBM']

# all_models = pd.concat([importances_lgbm, importances_rf, importances_xgbm], axis = 1)

# write_results('Data/Feature_importances_test2.xlsx','all', all_models)

## Correlation : 

- cross plot wp, ws 

In [273]:
# Rebuild the model matrix for the correlation study.
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]
# Reorder so that the target column `wp` comes last.
train = train[[c for c in train if c not in ["wp"]] + ["wp"]]
wp_X = train.drop(to_drop, axis = 1)

In [274]:
# Feature columns removed from wp_X before the correlation matrix is computed,
# grouped by variable family.
# NOTE(review): the selection criterion is not stated in this notebook;
# the name `feature_corr` suggests they were judged redundant/correlated -- confirm.
u_to_drop = [
    'u_T_1', 'u_T_2', 'u_T_3', 'u_T_4', 'u_T_5', 'u_T_6', 
    'u_T_2_mean', 'u_T_3_mean', 'u_T_4_mean', 'u_T_5_mean', 'u_T_6_mean', 'u_T_7_mean',
    'u_T_8_mean', 'u_T_9_mean', 'u_T_10_mean', 'u_T_11_mean', 'u_T_12_mean','u_T_24_mean',
    'u_T_2_std', 'u_T_4_std', 'u_T_5_std', 'u_T_6_std',
    'u_T_2_median', 'u_T_3_median', 'u_T_4_median', 'u_T_5_median', 'u_T_6_median', 'u_T_12_median','u_T_24_median', 'u_T_36_median',
    'u_T_2_max', 'u_T_3_max', 'u_T_4_max', 'u_T_5_max', 'u_T_6_max', 'u_T_12_max',
    'u_T_2_min', 'u_T_3_min', 'u_T_4_min', 'u_T_5_min', 'u_T_6_min', 'u_T_12_min',
    'u2_T_1', 'u2_T_2', 'u2_T_3', 'u2_T_4', 'u2_T_5', 'u2_T_6', 
    'u2_T_2_mean', 'u2_T_3_mean', 'u2_T_4_mean', 'u2_T_5_mean', 'u2_T_6_mean', 'u2_T_7_mean',
    'u2_T_8_mean', 'u2_T_9_mean', 'u2_T_10_mean', 'u2_T_11_mean', 'u2_T_12_mean','u2_T_24_mean',
    'u2_T_2_std', 'u2_T_4_std', 'u2_T_5_std', 'u2_T_6_std', 'u2_T_24_std',
    'u2_T_2_median', 'u2_T_3_median', 'u2_T_4_median', 'u2_T_5_median', 'u2_T_6_median', 'u2_T_12_median',
    'u2_T_2_max','u2_T_3_max', 'u2_T_4_max','u2_T_5_max', 'u2_T_6_max', 'u2_T_12_max',
    'u2_T_2_min', 'u2_T_3_min', 'u2_T_4_min', 'u2_T_5_min', 'u2_T_6_min',
    'u2_T_12', 'u2_T_36_mean', 'u2_T_36_std', 'u2_T_24_median', 'u2_T_24_max',
    'u_T_36_mean','u_T_12','u_T_24_max','u2_T_36_median','u_T_24_min'
]
ws_to_drop = [
    'ws_T_1', 'ws_T_2', 'ws_T_3', 'ws_T_4', 'ws_T_5', 'ws_T_6', 'ws_T_7', 'ws_T_8', 'ws_T_10','ws_T_11', 'ws_T_12',
    'ws_T_2_mean', 'ws_T_3_mean', 'ws_T_4_mean', 'ws_T_5_mean', 'ws_T_6_mean', 'ws_T_7_mean', 'ws_T_8_mean', 'ws_T_9_mean', 
    'ws_T_10_mean', 'ws_T_11_mean', 'ws_T_12_mean', 'ws_T_24_mean', 
    'ws_T_2_std', 'ws_T_3_std', 'ws_T_4_std', 'ws_T_5_std', 
    'ws_T_2_median', 'ws_T_3_median', 'ws_T_4_median', 'ws_T_5_median', 'ws_T_6_median',
    'ws_T_12_median', 'ws_T_24_median', 'ws_T_36_median',
    'ws_T_2_max', 'ws_T_3_max', 'ws_T_4_max', 'ws_T_5_max','ws_T_6_max', 'ws_T_12_max',
     'ws_T_2_min', 'ws_T_3_min', 'ws_T_4_min', 'ws_T_5_min', 'ws_T_6_min', 'ws_T_12_min','ws_T_24_max','ws_T_24_min'
]

v_to_drop = [
    'v_T_1', 'v_T_2', 'v_T_3', 'v_T_4', 'v_T_5', 'v_T_6', 
    'v_T_2_mean', 'v_T_3_mean', 'v_T_4_mean', 'v_T_5_mean', 'v_T_6_mean', 'v_T_7_mean',
    'v_T_8_mean', 'v_T_9_mean', 'v_T_10_mean', 'v_T_11_mean', 'v_T_12_mean', 'v_T_24_mean','v_T_36_mean',
    'v_T_3_std', 'v_T_4_std', 'v_T_5_std','v_T_6_std','v_T_24_std', 'v_T_36_median',
    'v_T_2_median', 'v_T_3_median', 'v_T_4_median', 'v_T_5_median', 'v_T_6_median', 
    'v_T_2_max', 'v_T_3_max', 'v_T_4_max', 'v_T_5_max', 'v_T_6_max', 'v_T_12_max', 
    'v_T_2_min', 'v_T_3_min', 'v_T_4_min', 'v_T_5_min', 'v_T_6_min', 'v_T_12_min', 
    'v_T_36_min', 'v_T_36', 'v_T_24_max',  'v_T_12_median', 'v_T_24_median',
]

# Note: besides the coswd_* columns this list also carries two ws_* columns
# ('ws_T_36_max', 'ws_T_36_min').
wd_to_drop = [
    'coswd_1', 'coswd_2', 'coswd_3', 'coswd_4', 'coswd_5', 'coswd_6',
    'coswd_2_mean', 'coswd_3_mean', 'coswd_4_mean', 'coswd_5_mean', 'coswd_6_mean', 'coswd_7_mean', 
    'coswd_8_mean', 'coswd_9_mean', 'coswd_10_mean', 'coswd_11_mean', 'coswd_12_mean', 'coswd_24_mean', 
    'coswd_3_std', 'coswd_4_std','coswd_5_std','coswd_2_median', 'coswd_3_median','coswd_4_median', 
    'coswd_5_median', 'coswd_6_median', 'coswd_36_median', 'coswd_24_median', 'coswd_12_median',
    'coswd_2_max', 'coswd_3_max', 'coswd_4_max', 'coswd_5_max', 'coswd_6_max', 'coswd_12_max', 'coswd_24_max',
    'coswd_2_min', 'coswd_3_min', 'coswd_4_min', 'coswd_5_min', 'coswd_6_min', 'coswd_12_min', 'coswd_24_min',
    'ws_T_36_max', 'ws_T_36_min', 'coswd_12', 'coswd_24'
]

other_to_drop = [
    'cos_day', 'u', 'v'
]

# Full list of columns to remove: all the families above, concatenated.
feature_corr = u_to_drop+ws_to_drop+v_to_drop+wd_to_drop+other_to_drop

# NOTE(review): not idempotent -- wp_X is overwritten with its reduced version,
# so re-running this cell without first rebuilding wp_X raises KeyError
# because the columns are already gone.
wp_X = wp_X.drop(feature_corr, axis = 1)

In [275]:
# Columns whose name contains 'wd'; defined here but not used by the heat-map below.
subset = [x for x in wp_X.columns if 'wd' in x]
# Pairwise correlation of the remaining features (and the target `wp`).
corr = wp_X.corr()
# Blank out the diagonal and the upper triangle so every pair is shown once.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
corr[mask] = np.nan
# The Styler calls are written to work on both old and current pandas: the
# colour is passed positionally because the `null_color` keyword was removed,
# and `.format` replaces the removed `set_precision`.
(corr
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 .highlight_null('#f1f1f1')  # Color NaNs grey
 .format("{:.2f}"))

Unnamed: 0,ws,cos_hour,sin_hour,sin_day,cos_month,sin_month,cos_wd,sin_wd,u2,ws_T_9,ws_T_24,ws_T_36,ws_T_36_mean,ws_T_6_std,ws_T_12_std,ws_T_24_std,ws_T_36_std,coswd_36,coswd_36_mean,coswd_2_std,coswd_6_std,coswd_12_std,coswd_24_std,coswd_36_std,coswd_36_max,coswd_36_min,u_T_24,u_T_36,u_T_3_std,u_T_12_std,u_T_24_std,u_T_36_std,u_T_36_max,u_T_36_min,u2_T_24,u2_T_36,u2_T_3_std,u2_T_12_std,u2_T_36_max,u2_T_12_min,u2_T_24_min,u2_T_36_min,v_T_12,v_T_24,v_T_2_std,v_T_12_std,v_T_36_std,v_T_36_max,v_T_24_min,wp
ws,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
cos_hour,-0.02,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
sin_hour,-0.08,-0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
sin_day,0.06,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
cos_month,0.34,0.0,0.0,-0.02,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
sin_month,0.08,0.0,0.0,-0.01,0.02,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
cos_wd,0.0,-0.08,0.02,0.06,-0.04,-0.13,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
sin_wd,0.19,0.02,-0.19,-0.04,0.02,-0.04,-0.03,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
u2,0.57,-0.04,-0.02,0.02,0.25,0.03,0.03,0.24,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
ws_T_9,0.6,0.07,0.04,0.04,0.34,0.09,-0.12,0.23,0.43,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
