In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from xgboost import XGBRegressor

import sys

In [2]:
# Extend the module search path with the sibling '../scr' directory; the
# utils_* modules imported below presumably live there — TODO confirm.
import sys
sys.path.append('../scr')

# NOTE(review): the star import hides which names come from utils_clean.
# filtrar_ub_semanal, imputar_nulos_semanal and crear_features_temporales are
# used in later cells without an explicit import, so they presumably come from
# this module — confirm and consider importing them by name.
from utils_clean import *
from utils_data import downloadTable
from utils_model import buscar_hiperparametros_arima_sarima
from utils_metrics import errorMetrics

In [3]:
# SQL selecting every column and row of the global prices table.
queryPrices = """
SELECT * FROM `desarrollo-444913.globalPrices.prices`
"""

# Load the query result into `df`. With forceDownload=False the recorded run
# read a locally cached CSV; pass forceDownload=True to refresh the table.
df = downloadTable(
    queryPrices,
    "globalPricesTable",
    forceDownload=False,
)

Folder 'bigqueryDatabases' already exists.
Reading bigqueryDatabases\globalPricesTable.csv from local CSV.


In [4]:
# Preview the first rows of the downloaded price table.
# NOTE(review): the rendered output has an 'Unnamed: 0' column, presumably the
# index written into the cached CSV, and priceType shows both 'EXPORT' and
# 'EXPORTS' — confirm whether either needs cleaning.
df.head()

Unnamed: 0,date,price,priceId,priceName,currency,currencyId,priceType,source
0,2022-10-01,14.637188,4,Japan Frozen Atlantic TRIM-E,USD,0,EXPORT,CHILEAN EXPORTS
1,2023-03-01,12.26516,5,Japan Fresh Atlantic TRIM-A,USD,0,EXPORT,CHILEAN EXPORTS
2,2024-02-01,5.859417,1,Japan Frozen Coho HG,USD,0,EXPORT,CHILEAN EXPORTS
3,2012-11-08,6.724091,11,UB Atlantic TRIM-D 3-4 Lb FOB Miami,USD,0,EXPORTS,URNER BARRY
4,2007-12-26,7.495708,11,UB Atlantic TRIM-D 3-4 Lb FOB Miami,USD,0,EXPORTS,URNER BARRY


In [5]:
# Per the original author's note: filters the table down to a weekly series,
# leaving weeks without data as NaN. ("ub" presumably refers to the Urner Barry
# price source seen in the table — TODO confirm.)
df_filtrado = filtrar_ub_semanal(df)

In [6]:
# Per the original author's note: imputes the missing weeks with the weekly
# average. The exact imputation rule lives in the helper — TODO confirm.
df_semanal = imputar_nulos_semanal(df_filtrado)

In [7]:
# Display the weekly price series; the recorded output runs from January 2007
# through June 2025 (962 points, W-MON frequency).
df_semanal

date
2007-01-08     9.038942
2007-01-15     9.038942
2007-01-22     9.204289
2007-01-29     9.369635
2007-02-05     9.479866
                ...    
2025-05-12    14.054452
2025-05-19    13.393066
2025-05-26    13.007258
2025-06-02    12.566334
2025-06-09    12.235641
Freq: W-MON, Name: price, Length: 962, dtype: float64

In [8]:
# Make sure the series is indexed by datetimes.
# NOTE(review): errors='coerce' turns any unparseable label into NaT instead of
# raising; confirm that silently coercing bad dates is intended here.
df_semanal.index = pd.to_datetime(
    df_semanal.index,
    errors='coerce',
)

In [11]:
# Turn the weekly price Series into a one-column DataFrame named "price".
# The isinstance guard keeps this cell re-runnable: after the first run
# df_semanal is already a DataFrame, and DataFrame has no .to_frame(), so an
# unguarded second run would raise AttributeError.
if isinstance(df_semanal, pd.Series):
    df_semanal = df_semanal.to_frame(name="price")

# Build the feature table from the price frame. Judging by the displayed
# columns it holds calendar fields, lags, and rolling statistics.
df_features = crear_features_temporales(df_semanal)

In [12]:
# Display the feature table (pandas truncates the rendered rows and columns).
df_features

Unnamed: 0_level_0,price,year,month,quarter,weekofyear,dayofyear,lag_1wk,lag_2wk,lag_3wk,lag_4wk,...,ma_8wk,std_8wk,min_8wk,max_8wk,median_8wk,ma_12wk,std_12wk,min_12wk,max_12wk,median_12wk
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-01-08,9.038942,2007,1,1,2,8,,,,,...,,,,,,,,,,
2007-01-15,9.038942,2007,1,1,3,15,9.038942,,,,...,,,,,,,,,,
2007-01-22,9.204289,2007,1,1,4,22,9.038942,9.038942,,,...,,,,,,,,,,
2007-01-29,9.369635,2007,1,1,5,29,9.204289,9.038942,9.038942,,...,,,,,,,,,,
2007-02-05,9.479866,2007,2,1,6,36,9.369635,9.204289,9.038942,9.038942,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-12,14.054452,2025,5,2,20,132,14.605607,14.605607,14.440261,14.109568,...,14.082010,0.461129,13.337951,14.605607,14.082010,14.068231,0.370938,13.337951,14.605607,14.082010
2025-05-19,13.393066,2025,5,2,21,139,14.054452,14.605607,14.605607,14.440261,...,14.020005,0.520324,13.337951,14.605607,14.082010,14.022302,0.418732,13.337951,14.605607,14.082010
2025-05-26,13.007258,2025,5,2,22,146,13.393066,14.054452,14.605607,14.605607,...,13.944221,0.622167,13.007258,14.605607,14.082010,13.930443,0.509022,13.007258,14.605607,14.026895
2025-06-02,12.566334,2025,6,2,23,153,13.007258,13.393066,14.054452,14.605607,...,13.847769,0.771476,12.566334,14.605607,14.082010,13.801840,0.638206,12.566334,14.605607,13.944221


In [13]:
# List every feature column, since the table preview above elides the middle ones.
df_features.columns

Index(['price', 'year', 'month', 'quarter', 'weekofyear', 'dayofyear',
       'lag_1wk', 'lag_2wk', 'lag_3wk', 'lag_4wk', 'lag_8wk', 'lag_12wk',
       'diff_1wk', 'pct_change_1wk', 'ma_4wk', 'std_4wk', 'min_4wk', 'max_4wk',
       'median_4wk', 'ma_8wk', 'std_8wk', 'min_8wk', 'max_8wk', 'median_8wk',
       'ma_12wk', 'std_12wk', 'min_12wk', 'max_12wk', 'median_12wk'],
      dtype='object')

# División train test

In [14]:
# Cut-off dates for the chronological split (both bounds are inclusive).
fecha_entrenamiento_fin = '2023-12-31'
fecha_prueba_ini        = '2024-01-01'

# Leakage guard. The rolling statistics in df_features include the current
# week's price: in the displayed table min_8wk equals `price` on weeks that set
# a new low while lag_1wk is higher. Using them as-is would let the model see
# the target. The diff_/pct_change_ columns are presumably computed from the
# current price as well — TODO confirm. Every non-calendar, non-lag feature is
# therefore lagged by one row (one week on this regular weekly index) so X only
# carries information available before the week being predicted.
# NOTE(review): the cleaner long-term fix is inside crear_features_temporales.
prefijos_con_fuga = ('diff_', 'pct_change_', 'ma_', 'std_', 'min_', 'max_', 'median_')
columnas_con_fuga = [col for col in df_features.columns if col.startswith(prefijos_con_fuga)]

df_modelo = df_features.copy()
# Shift before splitting so the first test row inherits the last train row's stats.
df_modelo[columnas_con_fuga] = df_modelo[columnas_con_fuga].shift(1)

# Chronological split: no shuffling, train strictly precedes test.
train = df_modelo.loc[df_modelo.index <= fecha_entrenamiento_fin].copy()
test  = df_modelo.loc[df_modelo.index >= fecha_prueba_ini].copy()

# Every row must land in exactly one partition; a NaT index label would fall
# out of both masks and silently disappear.
assert len(train) + len(test) == len(df_modelo), "train/test split dropped or duplicated rows"

# Separate predictors (X) from the target (y).
X_train = train.drop(columns=['price'])
y_train = train['price']
X_test  = test.drop(columns=['price'])
y_test  = test['price']


## Entrenamiento

In [33]:
# TODO: fit an XGBoost regressor to predict the price.
# TODO: run a hyperparameter search for it.
