# Bagging

### 1. Import necessary libraries:

In [1]:
import pandas as pd
import math
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_text
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.ensemble import BaggingRegressor

# others
# from mltools import classification_tools as CT
# from mltools import model_tools as MT

### 2. Load data as a Pandas DataFrame:

In [2]:
# Read the irradiation table from disk.
path_irrad = "../data/G07A_DATOS_IRRAD.csv"
df_orig_irrad = pd.read_csv(path_irrad)

# FECHA is parsed strictly as 'YYYY-MM-DD' text, then reduced to plain
# datetime.date objects so that no time-of-day component is kept.
s = pd.to_datetime(df_orig_irrad['FECHA'], format='%Y-%m-%d')
df_orig_irrad['FECHA'] = s.dt.date

In [3]:
# Read the utilisation table from disk.
path_util = "../data/G07A_DATOS_UTIL.csv"
df_orig_util = pd.read_csv(path_util)

# FECHA is parsed strictly as 'YYYY-MM-DD' text, then reduced to plain
# datetime.date objects so that no time-of-day component is kept.
s = pd.to_datetime(df_orig_util['FECHA'], format='%Y-%m-%d')
df_orig_util['FECHA'] = s.dt.date

In [4]:
# Join the irradiation and utilisation tables on their shared calendar
# columns; the inner join keeps only the dates present in both files.
df = pd.merge(df_orig_irrad, df_orig_util,
              on=['FECHA', 'ANNO', 'MES', 'DIA', 'DIASEM'], how='inner')

# Split the merged frame into the calendar columns and the two wide
# measurement blocks (columns whose names start with IRRAD / UTIL).
fecha_df = df[['ANNO', 'MES', 'DIA', 'DIASEM']]
irrad_df = df.filter(regex='^IRRAD')
util_df = df.filter(regex='^UTIL')

# Wide -> long: one row per (original row, time-slot column).
irrad_melted = irrad_df.melt(var_name='FranjaHoraria', value_name='Irradiacion')
util_melted = util_df.melt(var_name='FranjaHoraria', value_name='Utilizacion')

# Keep only the first run of digits of each original column name as the
# time-slot id. The pattern is a raw string so that "\d" reaches the regex
# engine unchanged instead of being treated as an (invalid) string escape,
# which makes Python emit a warning. expand=False returns a Series, which is
# what a single-column assignment expects.
irrad_melted['FranjaHoraria'] = irrad_melted['FranjaHoraria'].str.extract(r'(\d+)', expand=False)
util_melted['FranjaHoraria'] = util_melted['FranjaHoraria'].str.extract(r'(\d+)', expand=False)

# melt() stacks the columns one after another, so the calendar columns are
# repeated once per IRRAD column to line up row-for-row with the melted data.
fecha_expanded = pd.concat([fecha_df] * irrad_df.shape[1], ignore_index=True)

# Give every piece a fresh 0..n-1 index so the column-wise concat below
# aligns them by position.
df_repetido = fecha_expanded.reset_index(drop=True)
irrad_melted = irrad_melted.reset_index(drop=True)
util_melted = util_melted.reset_index(drop=True)

# Pull out the individual columns that make up the final long table
# (the time-slot id is taken from the irradiation side).
irrad_serie = irrad_melted['Irradiacion'].reset_index(drop=True)
franja_serie = irrad_melted['FranjaHoraria'].reset_index(drop=True)
util_serie = util_melted['Utilizacion'].reset_index(drop=True)

# Assemble the long-format table: calendar columns, time slot, irradiation, utilisation.
# NOTE(review): this positional join assumes the UTIL columns come in the same
# number and order as the IRRAD columns - confirm against the CSV headers.
df_final = pd.concat([df_repetido, franja_serie, irrad_serie, util_serie], axis=1)

  irrad_melted['FranjaHoraria'] = irrad_melted['FranjaHoraria'].str.extract('(\d+)')
  util_melted['FranjaHoraria'] = util_melted['FranjaHoraria'].str.extract('(\d+)')


In [5]:
# Cast the "FranjaHoraria" column (digit strings taken from the column names) to int
franja_as_int = df_final['FranjaHoraria'].astype(int)
df_final['FranjaHoraria'] = franja_as_int

In [6]:
# Sort the rows by year, month, day and then time slot; the row-based lag
# features computed afterwards depend on this ordering.
sort_keys = ['ANNO', 'MES', 'DIA', 'FranjaHoraria']
df_final = df_final.sort_values(by=sort_keys)

In [7]:
# Lagged copies of both series, obtained by shifting the column down a fixed
# number of rows (the first rows of each lag column are therefore NaN).
# NOTE(review): the "one day" / "one year" readings assume exactly 8 rows per
# day and 365 days per year with no missing rows - confirm against the data.
lag_periods = {
    'lag1': 1,
    'lag2': 2,
    'lag8': 8,           # one day earlier
    'lag365': 365 * 8,   # one year earlier
}

for series_name in ('Irradiacion', 'Utilizacion'):
    for lag_label, n_rows in lag_periods.items():
        df_final[f'{series_name}_{lag_label}'] = df_final[series_name].shift(n_rows)

In [8]:
# Discard every row that has at least one missing value in any column; this
# includes the leading rows whose lag columns were left as NaN by shift().
df_final = df_final.dropna(axis=0, how='any')

### 3. Split the data into training and test sets:

In [9]:
# Define input and output matrices
INPUTS = [
    'ANNO', 'MES', 'DIA', 'FranjaHoraria', 'Irradiacion',
    'Irradiacion_lag1', 'Irradiacion_lag2', 'Irradiacion_lag8', 'Irradiacion_lag365',
    'Utilizacion_lag1', 'Utilizacion_lag2', 'Utilizacion_lag8', 'Utilizacion_lag365',
]
OUTPUT = 'Utilizacion'

X = df_final[INPUTS]
y = df_final[OUTPUT]

## Divide the data into training and test sets SEQUENTIALLY ------------------------------------
# The first 80% of the rows (in their current order) form the training set and
# the remaining 20% the test set. No shuffling is done, so a shuffled
# train_test_split is intentionally not used here.
# Both sets are cut at the same index so that no row is left out between them.
n_train = round(0.8 * X.shape[0])
X_train = X.iloc[:n_train]
X_test = X.iloc[n_train:]
y_train = y.iloc[:n_train]
y_test = y.iloc[n_train:]

## Create datasets to store model predictions (inputs plus the true output)
dfTR_eval = X_train.copy()
dfTR_eval['Utilizacion'] = y_train
dfTS_eval = X_test.copy()
dfTS_eval['Utilizacion'] = y_test

### 4. Training of Bagging Regressor:

#### 4.1 Creation of bagging base estimator:
 - criterion='squared_error',
 - min_impurity_decrease= 0.001,
 - min_samples_leaf= 10,
 - min_samples_split= 2,
 - random_state = 999


In [10]:
# Single regression tree used as the base learner of the bagging ensemble.
tree_regressor = DecisionTreeRegressor(
    criterion='squared_error',    # impurity measure
    min_impurity_decrease=0.001,
    min_samples_split=2,          # minimum number of observations in a node to keep splitting
    min_samples_leaf=10,          # minimum number of observations in a terminal node
    random_state=999,             # for replication
)

# Wrap the tree in a one-step pipeline named 'DT'.
base_estimator = Pipeline(steps=[('DT', tree_regressor)])

#### 4.2 Crea el conjunto de árboles, seleccionando un nº de árboles a combinar de manera óptima
- Realiza un barrido entre 20 y 150 (de 5 en 5)
- Utiliza como scoring en GridSearchCV 'neg_mean_absolute_error'

In [11]:
# Find the optimal number of trees in the ensemble.
# Candidates are 20, 25, ..., 150. range() excludes its stop value, so the
# stop must be 151 for the sweep to actually reach 150.
param = {'BT__n_estimators': range(20, 151, 5)}  # Number of trees to aggregate

# Bagging ensemble built on the base estimator, wrapped in a one-step pipeline
# named 'BT' (hence the 'BT__' prefix in the parameter grid).
pipe = Pipeline(steps=[('BT',
                        BaggingRegressor(estimator=base_estimator,
                                         random_state=150))])

# Grid Search Cross Validation picks the best parameter value in the grid,
# scoring each candidate by negative mean absolute error.
# NOTE(review): an integer cv splits the training rows into plain folds; with
# time-ordered data and lag features a time-aware splitter may be more
# appropriate - confirm this is intended.
nFolds = 10
bt_fit = GridSearchCV(estimator=pipe,                      # Structure of the model to use
                      param_grid=param,                    # Defined grid to search in
                      n_jobs=-1,                           # Number of cores to use (parallelize)
                      scoring='neg_mean_absolute_error',
                      cv=nFolds)                           # Number of folds

# Fit the bagged trees
INPUTS_BT = INPUTS
bt_fit.fit(X_train, y_train)  # Search in grid

#### 4.3. Muestra la salida estimada del bagged tree
- Incluye en la gráfica los datos de TR, la estimación del modelo y el valor de "y_true" como referencia
- Indica cuántos árboles se están combinando