# Bagging

### 1. Import necessary libraries:

In [16]:
import numpy as np # linear algebra
import pandas as pd  # data management (dataframes)
import matplotlib.pyplot as plt
import seaborn as sns  # plotting

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

from sklearn.inspection import permutation_importance

from sklearn.ensemble import BaggingClassifier
#from sklearn.ensemble import BaggingRegressor

# others
from mltools import classification_tools as CT
from mltools import model_tools as MT

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.pipeline import Pipeline

### 2. Load data as a Pandas DataFrame:

In [17]:
# Irradiance data: read the CSV and reduce FECHA to a plain calendar date.
path_irrad = "../data/G07A_DATOS_IRRAD.csv"
df_orig_irrad = pd.read_csv(path_irrad)

# FECHA is read as a 'YYYY-MM-DD' string. Parse it strictly with that format,
# then store `datetime.date` objects (no time component) back in the column.
s = pd.to_datetime(df_orig_irrad['FECHA'], format='%Y-%m-%d')
df_orig_irrad['FECHA'] = s.dt.date

In [18]:
# Utilisation data: read the CSV and reduce FECHA to a plain calendar date.
path_util = "../data/G07A_DATOS_UTIL.csv"
df_orig_util = pd.read_csv(path_util)

# FECHA is read as a 'YYYY-MM-DD' string. Parse it strictly with that format,
# then store `datetime.date` objects (no time component) back in the column.
s = pd.to_datetime(df_orig_util['FECHA'], format='%Y-%m-%d')
df_orig_util['FECHA'] = s.dt.date

In [19]:
# Pair each irradiance row with the utilisation row of the same day.
# All five calendar columns are used as join keys; an inner join keeps only
# the days that appear in both sources.
df = df_orig_irrad.merge(
    df_orig_util,
    how='inner',
    on=['FECHA', 'ANNO', 'MES', 'DIA', 'DIASEM'],
)
df.head()

Unnamed: 0,FECHA,IRRADH00,IRRADH03,IRRADH06,IRRADH09,IRRADH12,IRRADH15,IRRADH18,IRRADH21,ANNO,...,DIA,DIASEM,UTILH00,UTILH03,UTILH06,UTILH09,UTILH12,UTILH15,UTILH18,UTILH21
0,2015-01-01,0.0,0.0,0.0,414536.22,1193085.2,481816.38,0.0,0.0,2015,...,1,3,0.0,0.0,0.027848,0.235443,0.263291,0.039241,0.0,0.0
1,2015-01-02,0.0,0.0,0.0,319990.2,926238.1,182854.1,0.0,0.0,2015,...,2,4,0.0,0.0,0.027848,0.181857,0.205063,0.024051,0.0,0.0
2,2015-01-03,0.0,0.0,0.0,403464.62,1146347.2,360073.94,0.0,0.0,2015,...,3,5,0.0,0.0,0.029536,0.237975,0.235865,0.036287,0.0,0.0
3,2015-01-04,0.0,0.0,0.0,349597.03,1016177.06,315783.62,0.0,0.0,2015,...,4,6,0.0,0.0,0.029114,0.196203,0.21097,0.02616,0.0,0.0
4,2015-01-05,0.0,0.0,0.0,408705.38,1090662.9,532075.1,0.0,0.0,2015,...,5,0,0.0,0.0,0.02827,0.205485,0.22827,0.036287,0.0,0.0


### 3. Split the data into training and test sets:

In [20]:
# Predictors: the eight 3-hourly irradiance readings plus the calendar columns.
INPUTS = [
    'IRRADH00', 'IRRADH03', 'IRRADH06', 'IRRADH09',
    'IRRADH12', 'IRRADH15', 'IRRADH18', 'IRRADH21',
    'ANNO', 'MES', 'DIA', 'DIASEM',
]
# Targets: the eight 3-hourly utilisation values.
OUTPUTS = [
    'UTILH00', 'UTILH03', 'UTILH06', 'UTILH09',
    'UTILH12', 'UTILH15', 'UTILH18', 'UTILH21',
]

X, Y = df[INPUTS], df[OUTPUTS]

# Hold out 30% of the rows for testing; the fixed seed makes the shuffle
# (and therefore the split) reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)

In [21]:
# Evaluation frames: inputs and true outputs side by side (aligned on the row
# index), one frame per split, so model predictions can be appended later.
dfTR_eval = X_train.join(Y_train)
dfTS_eval = X_test.join(Y_test)

### 4. Train a Bagging Regressor for each hour:

In [27]:
hours = ['00', '03', '06', '09', '12', '15', '18', '21']

# Containers keyed by hour string. Only `hourly_models` is filled in this
# cell; the other three are created empty here.
hourly_models   = {}
CV_score_hourly = {}
summary_hourly  = {}
metrics_hourly  = {}

# Ensemble size shared by every hourly model (loop-invariant, so set once).
number_estimators = 50

# Train one bagged ensemble of decision trees for each hour:
for hour in hours:

    # Define input and output features for the current hour:
    # that hour's irradiance plus the calendar columns -> that hour's utilisation.
    inputs_hourly = ['IRRADH' + hour, 'ANNO', 'MES', 'DIA', 'DIASEM']
    output_hourly = 'UTILH' + hour

    # Create input and output matrices for the current hour:
    X_train_hourly = X_train[inputs_hourly].values  # Convert to numpy array
    y_train_hourly = Y_train[output_hourly]

    # Not used for fitting; left defined exactly as before.
    # NOTE(review): drop this if no later cell reads it.
    X_test_hourly = X_test[inputs_hourly].values  # Convert to numpy array

    # Base learner: an unpruned regression tree wrapped in a one-step Pipeline.
    base_estimator = Pipeline(steps=[
        ('DT', DecisionTreeRegressor(
            criterion='squared_error',  # Mean Squared Error as impurity measure
            min_samples_split=2,        # Minimum number of samples required to split an internal node
            min_samples_leaf=1,         # Minimum number of samples required to be at a leaf node
            random_state=999))          # For reproducibility
    ])

    # The base learner is passed positionally on purpose: the keyword was
    # renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
    # old name was removed in 1.4, while the first positional slot is the base
    # learner in both the old and the new signature.
    bag_reg_estim = BaggingRegressor(
        base_estimator,
        n_estimators=number_estimators,
        random_state=0
    )

    # Fit the bagged trees
    bag_reg_estim.fit(X_train_hourly, y_train_hourly)

    # Store the trained model in the dictionary:
    hourly_models[hour] = bag_reg_estim




In [28]:
# Show the dictionary of fitted BaggingRegressor models, keyed by hour string.
hourly_models

{'00': BaggingRegressor(base_estimator=Pipeline(steps=[('DT',
                                                  DecisionTreeRegressor(random_state=999))]),
                  n_estimators=50, random_state=0),
 '03': BaggingRegressor(base_estimator=Pipeline(steps=[('DT',
                                                  DecisionTreeRegressor(random_state=999))]),
                  n_estimators=50, random_state=0),
 '06': BaggingRegressor(base_estimator=Pipeline(steps=[('DT',
                                                  DecisionTreeRegressor(random_state=999))]),
                  n_estimators=50, random_state=0),
 '09': BaggingRegressor(base_estimator=Pipeline(steps=[('DT',
                                                  DecisionTreeRegressor(random_state=999))]),
                  n_estimators=50, random_state=0),
 '12': BaggingRegressor(base_estimator=Pipeline(steps=[('DT',
                                                  DecisionTreeRegressor(random_state=999))]),
            