<a href="https://colab.research.google.com/github/MandbeZ/TFM_sequia/blob/main/notebooks/4_1_Modelo_Univariante_Exogenas_ARIMA_RF__SPI_SPEI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Instalación de SKtime

In [None]:
%pip install sktime
%pip install sktime[all_extras]
%pip install esig
%pip install utils



Importar librerías básicas

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sktime.utils.plotting import plot_series
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import temporal_train_test_split

In [None]:
from warnings import simplefilter
# Ignore RuntimeWarning and FutureWarning messages from here on.
simplefilter(action="ignore", category=RuntimeWarning)
simplefilter(action="ignore", category=FutureWarning)
# Filter for ModelFitWarning left disabled (that name is not imported in this file).
# simplefilter(action="ignore", category=ModelFitWarning)

# Una variable endógena con varias variables exógenas

In [None]:
#Definición de funciones
import matplotlib.pyplot as plt
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error,\
                                                   mean_absolute_error, mean_squared_error
def evaluar_modelo(test,pred):
    """Score a forecast against the observed values.

    Parameters
    ----------
    test
        Observed values of the evaluation window.
    pred
        Forecast values for the same window.

    Returns
    -------
    list
        ``[mape, mae, rmse, mse]`` in that order. The MAPE is computed with
        ``symmetric=False``.
    """
    return [
        mean_absolute_percentage_error(test, pred, symmetric=False),
        mean_absolute_error(test, pred),
        # square_root=True yields the RMSE, square_root=False the plain MSE.
        mean_squared_error(test, pred, square_root=True),
        mean_squared_error(test, pred, square_root=False),
    ]

def graficar_modelo(train,test,pred,titulo='Modelo',inicio_serie=200,etiqy=None):
    """Plot training data, test data and forecast on a single chart.

    Parameters
    ----------
    train
        Training series; only ``train[inicio_serie:]`` is drawn.
    test
        Held-out series.
    pred
        Forecast series.
    titulo : str
        Chart title.
    inicio_serie : int
        Start of the slice applied to ``train`` before plotting.
    etiqy : str, optional
        Label for the y axis. When ``None`` (default) the axis label chosen
        by ``plot_series`` is left untouched.
    """
    plot_series(train[inicio_serie:], test, pred, labels=["y_train", "y_test", "y_pred"])
    # Dashed red reference line at y = 0.5.
    plt.axhline(y=0.5, color='r', linestyle='dashed')
    plt.title(titulo)
    if etiqy is not None:
        plt.ylabel(etiqy)
    plt.show()

def minmax_norm(df_sn):
    """Rescale the data to the [0, 1] range with min-max normalisation.

    ``df_sn.min()`` and ``df_sn.max()`` are used as the bounds, so for a
    DataFrame every column is scaled independently.

    Parameters
    ----------
    df_sn
        pandas object exposing ``min()`` and ``max()``.

    Returns
    -------
    Same kind of object as ``df_sn`` with the rescaled values.
    """
    minimo = df_sn.min()
    rango = df_sn.max() - minimo
    return (df_sn - minimo) / rango

def cargar_series(ruta,nfilas):
    """Load a CSV from the project's GitHub data folder as a monthly series.

    The file must contain a ``fecha`` column. It is parsed as datetime, moved
    into the index (with the index name cleared) and converted to a monthly
    ``PeriodIndex``.

    Parameters
    ----------
    ruta : str
        Path of the CSV relative to the repository's ``datos/`` folder.
    nfilas : int
        The result starts at row position ``nfilas - 1``, i.e. the first
        ``nfilas - 1`` rows are discarded. Values below 1 discard nothing.
        NOTE(review): callers pass the index scale here, presumably to skip
        the leading rows where the index is undefined; confirm.

    Returns
    -------
    pandas.DataFrame
        The remaining columns indexed by monthly period.
    """
    url = 'https://raw.githubusercontent.com/MandbeZ/TFM_sequia/main/datos/' + ruta
    df = pd.read_csv(url, sep=',', parse_dates=True)
    df['fecha'] = pd.to_datetime(df['fecha'])
    # Clamp the start: a negative position would slice from the END of the
    # file and silently keep only its last rows.
    inicio = max(nfilas - 1, 0)
    df = df.iloc[inicio:].set_index('fecha')
    df.index.name = None
    df.index = df.index.to_period("M")
    return df

def crear_exogenas(var1,var2,var3,cluster,c):
    """Build the exogenous-variable frame ``X`` for one cluster.

    Parameters
    ----------
    var1
        Per-station data stacked into the ``tmed`` column via ``gen_cluster``.
    var2
        Per-station data stacked into the ``hmed`` column via ``gen_cluster``.
    var3
        Data repeated once per station of the cluster (via ``repetir``) to
        fill the ``oni`` column.
    cluster
        Table with at least a ``cluster`` column (and whatever
        ``gen_cluster`` needs).
    c
        Cluster id to select.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``tmed``, ``hmed`` and ``oni``.
    """
    # Number of stations assigned to the requested cluster.
    n_estaciones = cluster[cluster['cluster'] == c].shape[0]

    exogenas = pd.DataFrame()
    exogenas['tmed'] = gen_cluster(cluster, c, var1)
    exogenas['hmed'] = gen_cluster(cluster, c, var2)
    exogenas['oni'] = repetir(var3, n_estaciones)
    return exogenas

def gen_cluster(lista_clust, clust, normalizados):
    """Stack the columns of every station in a cluster into one long series.

    Parameters
    ----------
    lista_clust
        Table with ``id`` and ``cluster`` columns.
    clust
        Cluster id to select.
    normalizados
        Frame whose column names contain the station ids.

    Returns
    -------
    pandas.Series
        The selected columns melted one after another (column ``valor``),
        with a fresh 0..n-1 index.

    Notes
    -----
    A column is selected when ``str(id)`` is a substring of its name, so an
    id that is a prefix or fragment of another id also matches that one.
    """
    ids = lista_clust.loc[lista_clust['cluster'] == clust, 'id']

    columnas = []
    for est in ids:
        etiqueta = str(est)
        columnas.extend(col for col in normalizados.columns if etiqueta in col)

    apilado = normalizados[columnas].melt(value_name='valor')
    apilado = apilado.reset_index(drop=True)
    # Column 0 holds the source column name, column 1 the values.
    return apilado.iloc[:, 1]

def repetir(data, cant):
    """Stack ``cant`` copies of ``data`` vertically and renumber the rows.

    Parameters
    ----------
    data
        pandas object to repeat.
    cant : int
        Number of copies; zero or a negative value yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        The copies concatenated along axis 0 with a fresh 0..n-1 index.
    """
    # One concat call instead of growing the frame inside a loop, which would
    # recopy the accumulated rows on every pass. The empty seed frame keeps
    # the result a DataFrame and makes cant == 0 return an empty frame.
    piezas = [pd.DataFrame()] + [data] * cant
    return pd.concat(piezas, axis=0).reset_index(drop=True)

In [None]:
def dividir_datos(serie, tamanio):
    """Split a series into training and test parts, keeping temporal order.

    Parameters
    ----------
    serie
        Series to split.
    tamanio
        Forwarded as ``test_size`` to ``temporal_train_test_split``.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    particion = temporal_train_test_split(serie, test_size=tamanio)
    entrenamiento, prueba = particion
    return entrenamiento, prueba

def graficar_pred_est(data, modelo, clust=None, escala=None):
    """Plot a forecast against every station series of the requested clusters.

    Parameters
    ----------
    data
        Forecasts. A 2-D frame is expected to have columns named
        ``<indice><escala>_c<cluster>`` (e.g. ``spei3_c0``). A 1-D series is
        treated as one forecast for cluster ``clust[0]``.
    modelo : str
        Model name, used as the prefix of every chart title.
    clust : sequence of int, optional
        Cluster ids to plot. Defaults to ``[0, 1, 2, 3]``.
    escala : sequence of int, optional
        Index scales to plot (2-D input only). Defaults to ``[1, 3, 6, 12]``.

    Notes
    -----
    Reads these module-level names: ``cluster`` (both branches); ``spi``
    (1-D branch); ``cargar_datos``, ``procesa_datos``, ``normalizar_datos``
    and ``tam_ypred`` (2-D branch).
    NOTE(review): the four 2-D-branch names are not defined in the visible
    part of this file, so that branch raises ``NameError`` unless they are
    provided elsewhere; confirm.
    The caller's ``data`` is never modified: forecasts are copied before
    their index is aligned with the test window.
    """
    # None sentinels instead of mutable list defaults.
    clust = [0, 1, 2, 3] if clust is None else clust
    escala = [1, 3, 6, 12] if escala is None else escala

    if len(data.shape) > 1:
        s = 'spi' if 'spi' in data.columns[0] else 'spei'

        for e in escala:
            # Prefix match including "_c" so that scale 1 does not also
            # match the scale-12 columns ("spei1" is a substring of "spei12").
            prefijo = s + str(e) + '_c'
            indice = [ind for ind in data.columns if ind.startswith(prefijo)]
            print('Escala: '+str(e))
            if not indice:
                print(indice)
                continue

            datos = cargar_datos('indices_'+s+str(e)+'.csv')
            p_datos = procesa_datos(datos)
            datos_normalizados = normalizar_datos(p_datos)

            for c in clust:
                cad = s+str(e)+'_c'+str(c)
                if cad not in data.columns:
                    continue

                estaciones = cluster[cluster['cluster'] == c].reset_index(drop=True)
                for est in range(estaciones.shape[0]):
                    estacion = [col for col in datos_normalizados.columns if str(estaciones['id'][est]) in col]
                    serie = datos_normalizados.loc[:, estacion]
                    train, test = dividir_datos(serie, tamanio=tam_ypred)
                    # Use the forecast of THIS index/scale/cluster (not the
                    # first column) and really replace its index; assigning
                    # to ``set_index`` only overwrote the method.
                    pred = data[cad].copy()
                    pred.index = test.index
                    nom_est = 'Estacion '+str(estaciones.iloc[est,0])+' - '+str(estaciones.iloc[est,1]).capitalize()
                    nom_clu = 'Cluster ' + str(c) + ' - '

                    graficar_modelo(train, test, pred, titulo=modelo+' - '+nom_clu+nom_est, inicio_serie=0, etiqy=s+str(e)+' - Normalizado')
    else:
        estaciones = cluster[cluster['cluster'] == clust[0]].reset_index(drop=True)
        for est in range(estaciones.shape[0]):
            estacion = [col for col in spi.columns if str(estaciones['id'][est]) in col]
            serie = minmax_norm(spi).loc[:, estacion]
            train, test = dividir_datos(serie, 12)
            # Copy so the caller's forecast keeps its own index.
            pred = data.copy()
            pred.index = test.index
            nom_est = 'Estacion '+str(estaciones.iloc[est,0])+' - '+str(estaciones.iloc[est,1]).capitalize()
            nom_clu = 'Cluster ' + str(clust[0]) + ' - '

            graficar_modelo(train, test, pred, titulo=modelo+' - '+nom_clu+nom_est, inicio_serie=0)


In [None]:
# Station-to-cluster table; only the id, cluster and estacion columns are read.
cluster = pd.read_csv(
    'https://raw.githubusercontent.com/MandbeZ/TFM_sequia/main/datos/spi_spei/cluster_4.csv',
    sep=',',
    usecols={'id', 'cluster', 'estacion'},
)

# Index families (SPI / SPEI) and scales to model.
lista_indice = ['spei']
lista_escala = [3, 6, 12]

# Cluster ids to model.
lista_cluster = [0, 1, 2, 3]



In [None]:
# for e in lista_escala:  
#   # Cargar el dataset de variables exógenas
#   tmed=cargar_series('nasa/nasa_mensual_tmed_bc.csv',e)
#   hmed=cargar_series('nasa/nasa_mensual_hmed_bc.csv',e)
#   oni=cargar_series('noaa/noaa_mensual_oni.csv',e)

#   X=crear_exogenas(minmax_norm(tmed),minmax_norm(hmed),minmax_norm(oni),cluster,nclust)
#   # spi=cargar_series('spi_spei/indices_'+indice+str(6)+'.csv',6)
#   # y=gen_cluster(cluster, nclust, minmax_norm(spi))
#   plot_series(X['tmed'])
#   plt.axhline(y=0.5, color='r', linestyle='dashed')
#   plt.show()
#   plot_series(X['hmed'])
#   plt.axhline(y=0.5, color='r', linestyle='dashed')
#   plt.show()
#   plot_series(X['oni'])
#   plt.axhline(y=0.5, color='r', linestyle='dashed')
#   plt.show()

In [None]:
from sktime.forecasting.arima import AutoARIMA

# AutoARIMA with exogenous variables, one model per index / scale / cluster.
# aa_eval collects the metrics and aa_pred the 12-step forecasts, one column
# per "<indice><escala>_c<cluster>" combination.
aa_eval = pd.DataFrame(index=['mape','mae','rmse','mse'])
aa_pred = pd.DataFrame()
hpred = np.arange(12) + 1  # forecasting horizon: steps 1..12
for i in lista_indice:
    for e in lista_escala:
        # These inputs depend only on (i, e): download and normalise them
        # once per scale instead of once per cluster.
        tmed = cargar_series('nasa/nasa_mensual_tmed_bc.csv', e)
        hmed = cargar_series('nasa/nasa_mensual_hmed_bc.csv', e)
        oni = cargar_series('noaa/noaa_mensual_oni.csv', e)
        spi = cargar_series('spi_spei/indices_'+i+str(e)+'.csv', e)
        tmed_norm = minmax_norm(tmed)
        hmed_norm = minmax_norm(hmed)
        oni_norm = minmax_norm(oni)
        spi_norm = minmax_norm(spi)

        for c in lista_cluster:
            print(f'indice:{i}, escala{e}, cluster{c}')
            # Exogenous variables (X) and target series (y) of this cluster.
            X = crear_exogenas(tmed_norm, hmed_norm, oni_norm, cluster, c)
            y = gen_cluster(cluster, c, spi_norm)

            # Hold out the horizon, fit on the rest and forecast.
            y_train, y_test, X_train, X_pred = temporal_train_test_split(y, X, fh=hpred)
            predictor = AutoARIMA(suppress_warnings=True)
            predictor.fit(y_train, X_train)
            y_pred = predictor.predict(hpred, X=X_pred)
            datos_eval = evaluar_modelo(y_test, y_pred)

            # Store forecast and metrics.
            aa_pred[i+str(e)+'_c'+str(c)] = y_pred.reset_index(drop=True)
            aa_eval[i+str(e)+'_c'+str(c)] = datos_eval

            graficar_modelo(y_train, y_test, y_pred, 'Modelo AutoARIMA con variables exógenas', 100)
            graficar_modelo(y_train, y_test, y_pred, 'Modelo AutoARIMA con variables exógenas - Acercamiento', 400)
      





In [None]:
aa_eval

Unnamed: 0,spei3_c0,spei3_c1,spei3_c2,spei3_c3,spei6_c0,spei6_c1,spei6_c2,spei6_c3,spei12_c0,spei12_c1,spei12_c2,spei12_c3
mape,0.273429,0.141209,0.410886,0.463209,0.310308,0.083981,0.621776,0.393884,0.672047,0.05701,1.115275,0.293899
mae,0.109199,0.056213,0.115333,0.135317,0.116669,0.042971,0.154327,0.128755,0.224102,0.03208,0.197257,0.120773
rmse,0.135859,0.07039,0.141541,0.1602,0.13326,0.054298,0.173332,0.145009,0.227402,0.039735,0.203727,0.123157
mse,0.018458,0.004955,0.020034,0.025664,0.017758,0.002948,0.030044,0.021028,0.051712,0.001579,0.041505,0.015168


In [None]:
# Plot the forecast currently held in y_pred against the stations of cluster 6.
# NOTE(review): lista_cluster only contains 0-3; if the cluster table has no
# rows for cluster 6 nothing is drawn. Confirm the intended cluster id.
graficar_pred_est(y_pred, 'autoARIMA', [6], [3])
# len(X.shape)


In [None]:
#predictor.get_params(deep=True)

In [None]:
# RandomForestRegressor reduced to a recursive forecaster, one model per
# index / scale / cluster.
from sklearn.ensemble import RandomForestRegressor
from sktime.forecasting.compose import make_reduction

rf_eval = pd.DataFrame(index=['mape','mae','rmse','mse'])
rf_pred = pd.DataFrame()
hpred = np.arange(12) + 1  # forecasting horizon: steps 1..12
for i in lista_indice:
    for e in lista_escala:
        # These inputs depend only on (i, e): download and normalise them
        # once per scale instead of once per cluster.
        tmed = cargar_series('nasa/nasa_mensual_tmed_bc.csv', e)
        hmed = cargar_series('nasa/nasa_mensual_hmed_bc.csv', e)
        oni = cargar_series('noaa/noaa_mensual_oni.csv', e)
        spi = cargar_series('spi_spei/indices_'+i+str(e)+'.csv', e)
        tmed_norm = minmax_norm(tmed)
        hmed_norm = minmax_norm(hmed)
        oni_norm = minmax_norm(oni)
        spi_norm = minmax_norm(spi)

        for c in lista_cluster:
            print(f'indice:{i}, escala{e}, cluster{c}')
            # Exogenous variables (X) and target series (y) of this cluster.
            X = crear_exogenas(tmed_norm, hmed_norm, oni_norm, cluster, c)
            y = gen_cluster(cluster, c, spi_norm)

            # Hold out the horizon, fit on the rest and forecast.
            y_train, y_test, X_train, X_pred = temporal_train_test_split(y, X, fh=hpred)
            regresor = RandomForestRegressor(n_estimators=30)
            predictor = make_reduction(regresor, strategy="recursive", window_length=48)
            predictor.fit(y_train, X_train)
            y_pred = predictor.predict(hpred, X=X_pred)
            graficar_modelo(y_train, y_test, y_pred, 'RandomForestRegressor', 0)
            graficar_modelo(y_train, y_test, y_pred, 'RandomForestRegressor - Acercamiento', 400)
            datos_eval = evaluar_modelo(y_test, y_pred)

            # Store forecast and metrics.
            rf_pred[i+str(e)+'_c'+str(c)] = y_pred.reset_index(drop=True)
            rf_eval[i+str(e)+'_c'+str(c)] = datos_eval

In [None]:
rf_eval

Unnamed: 0,spei3_c0,spei3_c1,spei3_c2,spei3_c3,spei6_c0,spei6_c1,spei6_c2,spei6_c3,spei12_c0,spei12_c1,spei12_c2,spei12_c3
mape,0.080096,0.145786,0.508218,0.275802,0.091475,0.188194,0.398705,0.167197,0.347136,0.246151,0.630778,0.125279
mae,0.03484,0.068672,0.137191,0.080712,0.037071,0.100864,0.092112,0.058713,0.115106,0.14127,0.110254,0.051302
rmse,0.039226,0.073975,0.17764,0.099568,0.045634,0.109277,0.119455,0.083109,0.120594,0.144958,0.119559,0.055505
mse,0.001539,0.005472,0.031556,0.009914,0.002082,0.011941,0.01427,0.006907,0.014543,0.021013,0.014294,0.003081


In [None]:
# DecisionTreeRegressor reduced to a recursive forecaster, one model per
# index / scale / cluster.
from sklearn.tree import DecisionTreeRegressor
from sktime.forecasting.compose import make_reduction

dt_eval = pd.DataFrame(index=['mape','mae','rmse','mse'])
dt_pred = pd.DataFrame()
hpred = np.arange(12) + 1  # forecasting horizon: steps 1..12
for i in lista_indice:
    for e in lista_escala:
        # These inputs depend only on (i, e): download and normalise them
        # once per scale instead of once per cluster.
        tmed = cargar_series('nasa/nasa_mensual_tmed_bc.csv', e)
        hmed = cargar_series('nasa/nasa_mensual_hmed_bc.csv', e)
        oni = cargar_series('noaa/noaa_mensual_oni.csv', e)
        spi = cargar_series('spi_spei/indices_'+i+str(e)+'.csv', e)
        tmed_norm = minmax_norm(tmed)
        hmed_norm = minmax_norm(hmed)
        oni_norm = minmax_norm(oni)
        spi_norm = minmax_norm(spi)

        for c in lista_cluster:
            print(f'indice:{i}, escala{e}, cluster{c}')
            # Exogenous variables (X) and target series (y) of this cluster.
            X = crear_exogenas(tmed_norm, hmed_norm, oni_norm, cluster, c)
            y = gen_cluster(cluster, c, spi_norm)

            # Hold out the horizon, fit on the rest and forecast.
            y_train, y_test, X_train, X_pred = temporal_train_test_split(y, X, fh=hpred)
            regresor = DecisionTreeRegressor()
            predictor = make_reduction(regresor, strategy="recursive", window_length=48)
            predictor.fit(y_train, X_train)
            y_pred = predictor.predict(hpred, X=X_pred)
            graficar_modelo(y_train, y_test, y_pred, 'Modelo DecisionTreeRegressor', 0)
            graficar_modelo(y_train, y_test, y_pred, 'Modelo DecisionTreeRegressor - Acercamiento', 400)
            # Keep the metrics of THIS model; without the assignment dt_eval
            # was filled with whatever datos_eval an earlier cell left behind.
            datos_eval = evaluar_modelo(y_test, y_pred)

            # Store forecast and metrics.
            dt_pred[i+str(e)+'_c'+str(c)] = y_pred.reset_index(drop=True)
            dt_eval[i+str(e)+'_c'+str(c)] = datos_eval

In [None]:
# KNeighborsRegressor reduced to a recursive forecaster, one model per
# index / scale / cluster.
from sklearn.neighbors import KNeighborsRegressor
from sktime.forecasting.compose import make_reduction

kn_eval = pd.DataFrame(index=['mape','mae','rmse','mse'])
kn_pred = pd.DataFrame()
hpred = np.arange(12) + 1  # forecasting horizon: steps 1..12
for i in lista_indice:
    for e in lista_escala:
        # These inputs depend only on (i, e): download and normalise them
        # once per scale instead of once per cluster.
        tmed = cargar_series('nasa/nasa_mensual_tmed_bc.csv', e)
        hmed = cargar_series('nasa/nasa_mensual_hmed_bc.csv', e)
        oni = cargar_series('noaa/noaa_mensual_oni.csv', e)
        spi = cargar_series('spi_spei/indices_'+i+str(e)+'.csv', e)
        tmed_norm = minmax_norm(tmed)
        hmed_norm = minmax_norm(hmed)
        oni_norm = minmax_norm(oni)
        spi_norm = minmax_norm(spi)

        for c in lista_cluster:
            print(f'indice:{i}, escala{e}, cluster{c}')
            # Exogenous variables (X) and target series (y) of this cluster.
            X = crear_exogenas(tmed_norm, hmed_norm, oni_norm, cluster, c)
            y = gen_cluster(cluster, c, spi_norm)

            # Hold out the horizon, fit on the rest and forecast.
            y_train, y_test, X_train, X_pred = temporal_train_test_split(y, X, fh=hpred)
            regresor = KNeighborsRegressor(n_neighbors=11)
            predictor = make_reduction(regresor, strategy="recursive", window_length=48)
            predictor.fit(y_train, X_train)
            y_pred = predictor.predict(hpred, X=X_pred)
            graficar_modelo(y_train, y_test, y_pred, 'Modelo KNeighbors para Regresión', 0)
            graficar_modelo(y_train, y_test, y_pred, 'Modelo KNeighbors para Regresión - Acercamiento', 400)
            datos_eval = evaluar_modelo(y_test, y_pred)

            # Store forecast and metrics.
            kn_pred[i+str(e)+'_c'+str(c)] = y_pred.reset_index(drop=True)
            kn_eval[i+str(e)+'_c'+str(c)] = datos_eval