# Contenido 

1. **Carga de librerías y datos**
    - 1.a. Librerías
    - 1.b. Funciones
    - 1.c. Carga de datos
   
2. **Predicción**
    - 2.a. Creación de variables exógenas
    - 2.b. Cálculo predictivo

#  1. Carga de librerías y datos

## 1.a. Librerías

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Modelado y Forecasting
# ==============================================================================
import xgboost
import lightgbm
import catboost
import sklearn
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFECV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

import skforecast
from skforecast.ForecasterBaseline import ForecasterEquivalentDate
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import bayesian_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.model_selection import select_features
from skforecast.model_selection import backtesting_forecaster
import shap

import cloudpickle
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import yaml

In [2]:
from Creacion_exog import calculo_variables_exogenas

## 1.b. Funciones

In [3]:
def imputar_nulos_por_hora(datos):
    """Fill missing values with the mean of the same hour of the day.

    The index is converted to datetimes, the rows are grouped by the hour
    component of that index, and every NaN is replaced by the mean of its
    hour group (computed per column when ``datos`` is a DataFrame).

    Parameters
    ----------
    datos : pandas.DataFrame or pandas.Series
        Data whose index can be parsed by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        A new object with a datetime index and the NaN values imputed.
        The object passed in is left untouched (neither its index nor its
        values are modified).
    """
    # Work on a shallow copy: assigning `.index` directly on the argument
    # would silently replace the index of the caller's object.
    datos = datos.copy(deep=False)
    datos.index = pd.to_datetime(datos.index)
    horas = datos.index.hour

    # Per-hour mean, broadcast back to the original shape, used as fill value.
    # Hours for which every value is NaN stay NaN.
    media_por_hora = datos.groupby(horas).transform('mean')
    return datos.fillna(media_por_hora)

def auxiliar(variables_exogenas):
    """Select the exogenous columns that are fed to the forecasters.

    The selection is, in this order: columns whose name ends in ``_seno`` or
    ``_coseno``, columns whose name starts with ``festivo_``, and the
    ``FESTIVO`` column.

    Parameters
    ----------
    variables_exogenas : pandas.DataFrame
        Frame holding all the candidate exogenous features.

    Returns
    -------
    tuple
        ``(exog_cols, variables_exogenas)``: the list of selected column
        names and the frame restricted to those columns. ``'FESTIVO'`` is
        always appended to the list, even if the frame has no such column;
        the returned frame only contains the names that actually exist.
    """
    columnas_ciclicas = variables_exogenas.filter(regex='_seno$|_coseno$').columns.tolist()
    columnas_festivo = variables_exogenas.filter(regex='^festivo_.*').columns.tolist()

    exog_cols = [*columnas_ciclicas, *columnas_festivo, 'FESTIVO']

    seleccion = variables_exogenas.filter(items=exog_cols, axis=1)
    return exog_cols, seleccion

## 1.c. Carga de datos

In [None]:
# Restrict the notebook to the 498 stations for which a model has been trained.
# Their identifiers are stored in a YAML file in the models directory.
ruta_ids = "../../1-DATOS/3-DATOS DE RESULTADOS/PREDICCION/MODELOS/station_ids.yaml"
with open(ruta_ids, mode="r") as fichero_ids:
    data = yaml.safe_load(fichero_ids)
stations_ids = data["station_ids"]

In [None]:
# Load the Bicing table and keep only the columns (stations) that have a trained model.
ruta_datos = '../../1-DATOS/2-DATOS PROCESADOS/BICING/INFORMACION COMPLETA/BICICLETAS_HORARIO_2022_2023_FILTRADO.parquet'
df = pd.read_parquet(ruta_datos)
columnas_con_modelo = df.columns.isin(stations_ids)
df = df.loc[:, columnas_con_modelo]

# 2. Predicción

## 2.a. Creación de variables exógenas

El conjunto de variables exógenas es independiente de la estación, puesto que no se han estudiado sucesos que afecten únicamente a un grupo de estaciones concretas o una estación específica. Por lo tanto, se genera este conjunto considerando la estación 1 (pero podría haber sido cualquier otra estación).

In [6]:
# The exogenous features are described as station-independent, so a single
# station (station 1) is used as the carrier of the time index.
# Only that station is imputed: previously `df[1]` was assigned and then
# immediately overwritten by the imputation of the whole multi-station frame.
# It is kept as a one-column DataFrame so the helper still receives a DataFrame.
# NOTE(review): assumes calculo_variables_exogenas does not read the other
# station columns — confirm against Creacion_exog.
datos = imputar_nulos_por_hora(df[[1]])
variables_exogenas = calculo_variables_exogenas(datos)
exog_cols, variables_exogenas = auxiliar(variables_exogenas)

In [7]:
# Preview the first rows of the selected exogenous features.
variables_exogenas.head(3)

Unnamed: 0_level_0,mes_seno,mes_coseno,semana_anyo_seno,semana_anyo_coseno,dia_semana_seno,dia_semana_coseno,hora_dia_seno,hora_dia_coseno,hora_amanecer_seno,hora_amanecer_coseno,...,poly_hora_dia_coseno__hora_anochecer_coseno,poly_hora_amanecer_seno__hora_amanecer_coseno,poly_hora_amanecer_seno__hora_anochecer_seno,poly_hora_amanecer_seno__hora_anochecer_coseno,poly_hora_amanecer_coseno__hora_anochecer_seno,poly_hora_amanecer_coseno__hora_anochecer_coseno,poly_hora_anochecer_seno__hora_anochecer_coseno,festivo_dia_anterior,festivo_dia_siguiente,FESTIVO
FECHA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 00:00:00,0.5,0.866025,0.0,1.0,-0.781831,0.62349,0.258819,0.965926,0.866025,-0.5,...,,,,,,,,,0.0,1
2022-01-01 01:00:00,0.5,0.866025,0.0,1.0,-0.781831,0.62349,0.5,0.866025,0.866025,-0.5,...,,,,,,,,,0.0,1
2022-01-01 02:00:00,0.5,0.866025,0.0,1.0,-0.781831,0.62349,0.707107,0.707107,0.866025,-0.5,...,,,,,,,,,0.0,1


## 2.b. Cálculo predictivo

In [8]:
# Prediction window (both bounds inclusive).
fecha_inicial_pred = '2023-10-01 00:00:00'
fecha_final_pred = '2023-10-01 07:59:59'

# Keep only the exogenous rows whose timestamp falls inside the window.
indice_exog = variables_exogenas.index
dentro_de_ventana = (indice_exog >= fecha_inicial_pred) & (indice_exog <= fecha_final_pred)
variables_exogenas = variables_exogenas.loc[dentro_de_ventana]

In [9]:
def prepare_time_series(data, column_name='CANTIDAD', freq='H'):
    """Turn a Series into a single-column frame on a regular time grid.

    Parameters
    ----------
    data : pandas.Series
        Series whose index can be parsed by ``pd.to_datetime``.
    column_name : str, default 'CANTIDAD'
        Name given to the only column of the result.
    freq : str, default 'H'
        Frequency passed to ``asfreq``; timestamps missing from the input
        show up as rows filled with NaN.

    Returns
    -------
    pandas.DataFrame
        One-column frame with a datetime index named ``'FECHA'``.
    """
    frame = data.to_frame(name=column_name)
    frame.index = pd.to_datetime(frame.index)

    regular = frame.asfreq(freq)
    regular.index.name = 'FECHA'
    return regular

In [None]:
%%time

# Window of observed data handed to each forecaster as `last_window`
# (lower bound inclusive, upper bound exclusive).
fecha_inicial_serie = '2023-09-29 00:00:00'
fecha_final_serie = '2023-09-30 23:59:00'

# One entry per station: station column -> predictions returned by its forecaster.
predicciones_dict = {}

for column in df.columns:

    # NOTE(security): cloudpickle.load executes arbitrary code stored in the
    # file; only load model files that come from a trusted source.
    ruta_modelo = f'../../1-DATOS/3-DATOS DE RESULTADOS/PREDICCION/MODELOS/forecaster_{column}.pkl'
    with open(ruta_modelo, 'rb') as f:
        forecaster = cloudpickle.load(f)

    # Regular hourly frame for this station, cut to the last-window range.
    data = df[column]
    datos = prepare_time_series(data, column_name='CANTIDAD', freq='H')
    en_ventana = (datos.index >= fecha_inicial_serie) & (datos.index < fecha_final_serie)
    datos = datos.loc[en_ventana]

    # The 8 steps match the 8 hourly rows kept in `variables_exogenas`
    # (00:00 to 07:00 of the prediction day).
    predicciones = forecaster.predict(steps=8, last_window=datos, exog=variables_exogenas)
    predicciones_dict[column] = predicciones

predicciones_df = pd.DataFrame(predicciones_dict)


CPU times: total: 2min 14s
Wall time: 36.8 s


In [11]:
# Preview the first forecast rows (one column per station).
predicciones_df.head(5)

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,11.0,...,450.0,452.0,458.0,490.0,465.0,502.0,503.0,52.0,91.0,467.0
2023-10-01 00:00:00,8.630921,14.78816,2.388018,8.197733,3.648555,7.848613,11.685831,18.157154,15.997639,15.708567,...,5.100935,26.713561,8.914617,2.019992,8.850814,5.651359,5.711303,14.878897,26.599227,9.165166
2023-10-01 01:00:00,11.176678,14.669596,2.851128,8.172179,3.123234,7.341913,10.962891,16.145895,13.635933,15.397873,...,5.530996,28.26254,8.941457,2.440866,6.883305,6.038924,6.733168,14.971393,24.938137,9.109914
2023-10-01 02:00:00,13.458239,14.62311,2.920438,8.493033,2.952743,6.830744,9.551177,13.99738,12.232619,15.574784,...,5.673731,28.661475,8.833127,2.152383,6.977444,6.70322,8.505746,14.498003,23.909929,9.323876
2023-10-01 03:00:00,15.626274,14.706897,2.616994,8.731397,2.9405,6.872814,8.728795,12.482944,11.519564,15.680811,...,5.733866,27.865355,8.823678,1.548895,7.207196,7.034134,9.374709,14.009818,23.457263,9.106368
2023-10-01 04:00:00,15.463017,14.889944,2.571998,8.981865,2.997617,7.370409,8.415812,11.006299,11.411153,15.995831,...,5.415764,27.274631,8.792032,1.236793,7.130398,6.375783,9.859231,13.016184,24.317428,8.734769
