In [13]:
import pandas as pd
import statsmodels.api as sm
import plotly.graph_objects as go
from stargazer.stargazer import Stargazer
import re
import plotly.graph_objects as go

In [14]:
mes_max_prediccion = 6
max_lag = 12

In [15]:
df = pd.read_stata("../data/variables_productos_primarios.dta",index_col="fecha")
df = df.drop(["mes","anio"],axis=1)
df["pp"] = df.pp / 1_000_000
df

Unnamed: 0_level_0,itcr,ip,pre,pp
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-01-01,105.929359,86.866026,1423.333333,508.020737
2005-02-01,103.960244,87.907449,969.666667,500.539941
2005-03-01,102.549715,86.226525,1730.000000,656.181802
2005-04-01,101.267876,87.964450,620.000000,939.064907
2005-05-01,100.294069,89.117152,54.666667,1028.823820
...,...,...,...,...
2022-07-01,44.102487,208.916689,0.000000,2431.418408
2022-08-01,43.439754,200.965695,146.666667,1988.408046
2022-09-01,43.289343,205.021010,106.666667,2071.879245
2022-10-01,43.241229,205.366248,543.333333,2339.045364


In [16]:
def expand_df_max_lag(df_diff:pd.DataFrame, max_lag:int):
    last_date = df_diff.index.max()
    new_dates = pd.date_range(start=last_date + pd.DateOffset(days=1), 
                            periods=max_lag, 
                            freq='MS')
    new_df = pd.DataFrame(index=new_dates)
    df_diff = pd.concat([df_diff, new_df])
    return df_diff

def create_lag_columns(df:pd.DataFrame, variable_name:str, max_lag:int):
    for lag in range(1, max_lag + 1):
        new_column_name = f"{variable_name}_lag_{lag}"
        df[new_column_name] = df[variable_name].shift(lag)
    return df

In [17]:
df = expand_df_max_lag(df, max_lag)
df = create_lag_columns(df, "itcr", max_lag)
df = create_lag_columns(df, "ip", max_lag)
df = create_lag_columns(df, "pre", max_lag)
df = create_lag_columns(df, "pp", max_lag)

df_diff = df.diff()
df_diff

Unnamed: 0,itcr,ip,pre,pp,itcr_lag_1,itcr_lag_2,itcr_lag_3,itcr_lag_4,itcr_lag_5,itcr_lag_6,...,pp_lag_3,pp_lag_4,pp_lag_5,pp_lag_6,pp_lag_7,pp_lag_8,pp_lag_9,pp_lag_10,pp_lag_11,pp_lag_12
2005-01-01,,,,,,,,,,,...,,,,,,,,,,
2005-02-01,-1.969115,1.041423,-453.666667,-7.480796,,,,,,,...,,,,,,,,,,
2005-03-01,-1.410529,-1.680925,760.333333,155.641861,-1.969115,,,,,,...,,,,,,,,,,
2005-04-01,-1.281839,1.737925,-1110.000000,282.883105,-1.410529,-1.969115,,,,,...,,,,,,,,,,
2005-05-01,-0.973807,1.152702,-565.333333,89.758913,-1.281839,-1.410529,-1.969115,,,,...,-7.480796,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-01,,,,,,,,,,,...,,,,,,-917.160348,267.166119,83.471199,-443.010362,396.209023
2023-08-01,,,,,,,,,,,...,,,,,,,-917.160348,267.166119,83.471199,-443.010362
2023-09-01,,,,,,,,,,,...,,,,,,,,-917.160348,267.166119,83.471199
2023-10-01,,,,,,,,,,,...,,,,,,,,,-917.160348,267.166119


## Lag óptimo
Si es para el mes siguiente: quedarme como exógenas todo lo que tenga más de un lag

In [18]:
def obtener_exogenas_prediccion(df:pd.DataFrame, mes_a_predecir:int):
    columns_to_keep = []
    for column in df.columns:
        match = re.search(r'\d+$', column)
        if match:
            if int(match.group()) >= mes_a_predecir:
                columns_to_keep.append(column)
    df = df[columns_to_keep]

    return df

def limitar_rezagos_variable(df:pd.DataFrame, variable:str, rezago_max:int):
    mask = df.columns.str.startswith(variable)
    df = df.iloc[:, mask]
    columns_to_drop = []
    for column in df.columns:
        match = re.search(r'\d+$', column)
        if match:
            if int(match.group()) > rezago_max:
                columns_to_drop.append(column)
    df = df.drop(columns_to_drop, axis=1)

    return df

In [70]:
exogena = "ip"
modelos_pp = []
for rezago in range(6, 13):
    df_train = df_diff[:-6].copy().dropna()
    y_train = df_train[exogena]
    X_train = obtener_exogenas_prediccion(df_train, 6)
    X = limitar_rezagos_variable(X_train, exogena, rezago)
    X = sm.add_constant(X)
    model = sm.OLS(y_train, X).fit()
    modelos_pp.append(model)
    # print(model.summary())
aic_values = [model.aic for model in modelos_pp]
stargazer = Stargazer(modelos_pp)
aic_notes = [f'Model {i+1} AIC: {aic}' for i, aic in enumerate(aic_values)]
stargazer.add_custom_notes(aic_notes)
tex_file = open( f"../output/stargazer_calibracion_{exogena}.tex", "w" ) #This will overwrite an existing file
tex_file.write( stargazer.render_latex())

2424

In [68]:
#Calibracion de exogenas
for exogena in ["pp", "pre", "itcr",
                # "ip"
                ]:
    modelos_pp = []
    for rezago in range(6, 13):
        df_train = df_diff[:-6].copy().dropna()
        y_train = df_train[exogena]
        X_train = obtener_exogenas_prediccion(df_train, 6)
        X = limitar_rezagos_variable(X_train, exogena, rezago)
        X = sm.add_constant(X)
        model = sm.OLS(y_train, X).fit()
        modelos_pp.append(model)
        # print(model.summary())
    aic_values = [model.aic for model in modelos_pp]
    stargazer = Stargazer(modelos_pp)
    aic_notes = [f'Model {i+1} AIC: {aic}' for i, aic in enumerate(aic_values)]
    stargazer.add_custom_notes(aic_notes)
    tex_file = open( f"../output/stargazer_calibracion_{exogena}.tex", "w" ) #This will overwrite an existing file
    tex_file.write( stargazer.render_latex())

In [19]:
def add_lag_calibration_aic(X_df:pd.DataFrame, exog: list[str], y_df:pd.DataFrame, max_lag:int, mes_a_predecir:int):
    lag_calibration = pd.DataFrame(columns=exog)
    for variable in exog:
        for rezago in range(mes_a_predecir, max_lag + 1):
            X = limitar_rezagos_variable(X_df, variable, rezago)
            X = sm.add_constant(X)
            model = sm.OLS(y_df, X).fit()
            lag_calibration.loc[rezago,variable] = model.aic            
    return lag_calibration

def get_lowest_aic_lag(aic_results:pd.DataFrame):
    optimal_lag = {}
    for exog in aic_results.columns:
        optimal_lag[exog] = aic_results[exog].sort_values(ascending=True).index[0]
    return optimal_lag

def get_aic_results(df_diff:pd.DataFrame,mes_a_predecir:int):
    df_train = df_diff[:-mes_a_predecir].copy().dropna()
    y_train = df_train["pp"]
    X_train = obtener_exogenas_prediccion(df_train, mes_a_predecir)

    aic_results = add_lag_calibration_aic(X_train, ["pp", "itcr", "ip", "pre"], y_train ,max_lag, mes_a_predecir)
    # print(mes_a_predecir, get_lowest_aic_lag(aic_results))
    aic_results.index.name = "Rezagos"
    return aic_results 
    
def write_aic_lags(df_diff:pd.DataFrame, mes_max_prediccion:int):
    writer = pd.ExcelWriter("../data/calibration_rm/aic_lags.xlsx", engine="xlsxwriter")
    for mes_a_predecir in range(1,mes_max_prediccion+1):
        aic_results = get_aic_results(df_diff,mes_a_predecir)
        aic_results.to_excel(writer, sheet_name=f"mes_a_predecir_{mes_a_predecir}")
    writer.close()
    
def get_optimal_lag_dict(df_diff:pd.DataFrame,mes_max_prediccion:int):
    optimal_lag_dict = {}
    for mes_a_predecir in range(1,mes_max_prediccion+1):
        aic_results = get_aic_results(df_diff, mes_a_predecir)
        optimal_lag_dict[mes_a_predecir] = get_lowest_aic_lag(aic_results)
    return optimal_lag_dict

In [20]:
optimal_lag_dict = get_optimal_lag_dict(df_diff, mes_max_prediccion)
optimal_lag_dict

{1: {'pp': 12, 'itcr': 1, 'ip': 1, 'pre': 9},
 2: {'pp': 12, 'itcr': 2, 'ip': 2, 'pre': 9},
 3: {'pp': 12, 'itcr': 3, 'ip': 5, 'pre': 8},
 4: {'pp': 12, 'itcr': 4, 'ip': 5, 'pre': 7},
 5: {'pp': 12, 'itcr': 5, 'ip': 5, 'pre': 12},
 6: {'pp': 12, 'itcr': 6, 'ip': 6, 'pre': 12}}

## Modelo
1. Definir training y test. En test, se tiene que expandir la base de los lags
2. Correr la regresión con los lags óptimos y guardar el modelo

In [21]:
def get_lags_to_drop(columns:list[str], rezago_max:int):
    lags_to_drop = []
    for column in columns:
        match = re.search(r'\d+$', column)
        if match:
            if int(match.group()) > rezago_max:
                lags_to_drop.append(column)
    return lags_to_drop

In [22]:
def drop_lags_exog(exogenas:list[str], X:pd.DataFrame, mes_a_predecir:int):
    for exog in exogenas:
        lags_to_drop = X.columns[X.columns.str.startswith(exog)]
        # print("LAGS TO DROP",lags_to_drop)
        lag_max = optimal_lag_dict[mes_a_predecir][exog]
        lags_to_drop = get_lags_to_drop(lags_to_drop, lag_max)  
        X = X.drop(lags_to_drop,axis=1)
    return X

In [23]:
modelos_rm = {}
for mes_a_predecir in range(1, mes_max_prediccion + 1):
    df_train = df_diff.dropna()[:-mes_a_predecir]
    y_train = df_train.pp
    X_train = obtener_exogenas_prediccion(df_train, mes_a_predecir)
    X_train = drop_lags_exog(exogenas = ["pre","ip","pp","itcr"], X=X_train, mes_a_predecir = mes_a_predecir)
    # print(X_train)
    model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
    modelos_rm[mes_a_predecir] = model

In [24]:
tex_file = open( "../output/stargazer_rm_final_stargazer.tex", "w" ) #This will overwrite an existing file
stargazer = Stargazer([modelos_rm[6]])
tex_file.write( stargazer.render_latex())
# print(stargazer.title("Regresión múltiple: productos primarios"))

1225

In [25]:
tex_file = open("../output/stargazer_rm_final_py.tex","w")
tex_file.write(modelos_rm[6].summary().as_latex())

4071

In [26]:
# model = modelos_rm[6]
# model.resid.plot()

In [27]:
models_predictions = {}
for mes_a_predecir in range(1,mes_max_prediccion+1):
    y_test = df_diff[max_lag+1:-(max_lag - mes_a_predecir)]["pp"]
    X_test = obtener_exogenas_prediccion(df_diff, mes_a_predecir=mes_a_predecir)[max_lag+1:-(max_lag - mes_a_predecir)]
    X_test = drop_lags_exog(exogenas = ["pre","ip","pp","itcr"], X=X_test, mes_a_predecir=mes_a_predecir)
    model = modelos_rm[mes_a_predecir]
    prediction = model.predict(sm.add_constant(X_test))
    observ_vs_predict_df = pd.DataFrame({'observed': y_test, 
                                         'predicted': prediction,
                                         'pp_lag_1':df["pp_lag_1"]
                                         })
    observ_vs_predict_df["observed_reverted"] = observ_vs_predict_df["pp_lag_1"] + observ_vs_predict_df["observed"]
    observ_vs_predict_df["predicted_reverted"] = observ_vs_predict_df["pp_lag_1"] + observ_vs_predict_df["predicted"]
    observ_vs_predict_df = observ_vs_predict_df.dropna(subset=["predicted"])
    
    while observ_vs_predict_df['predicted_reverted'].isnull().any():
        observ_vs_predict_df['prev_predicted_reverted'] = observ_vs_predict_df['predicted_reverted'].shift(1)
        observ_vs_predict_df['sum'] = observ_vs_predict_df['prev_predicted_reverted'].add(observ_vs_predict_df['predicted'])
        observ_vs_predict_df['predicted_reverted'] = observ_vs_predict_df['predicted_reverted'].fillna(observ_vs_predict_df['sum'])
    try:
        observ_vs_predict_df = observ_vs_predict_df.drop(['prev_predicted_reverted', 'sum'],axis=1)
    except:
        continue
    models_predictions[mes_a_predecir] = observ_vs_predict_df

In [28]:
models_predictions[3][-20:]

Unnamed: 0,observed,predicted,pp_lag_1,observed_reverted,predicted_reverted
2021-07-01,340.268769,1.077307,2020.014984,2360.283753,2021.092292
2021-08-01,406.068545,-103.189206,2360.283753,2766.352298,2257.094546
2021-09-01,-476.440505,-153.398782,2766.352298,2289.911793,2612.953516
2021-10-01,-463.71806,-122.635106,2289.911793,1826.193733,2167.276687
2021-11-01,-507.902805,-244.761593,1826.193733,1318.290929,1581.432141
2021-12-01,434.133249,-44.020346,1318.290929,1752.424178,1274.270583
2022-01-01,129.630687,16.357573,1752.424178,1882.054864,1768.781751
2022-02-01,-50.345857,-35.779071,1882.054864,1831.709007,1846.275794
2022-03-01,251.076815,215.567583,1831.709007,2082.785822,2047.276591
2022-04-01,212.692357,298.610122,2082.785822,2295.478179,2381.395944


In [51]:
def plot_prediccion_rm(models_predictions:list, mes_a_predecir:int = mes_max_prediccion):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x = models_predictions[mes_a_predecir].index, y = models_predictions[mes_a_predecir].observed_reverted, name = "Observado"))
    fig.add_trace(go.Scatter(x = models_predictions[mes_a_predecir].index, y = models_predictions[mes_a_predecir].predicted_reverted, name = "Predicción"))
    fig.add_vline(x = models_predictions[mes_a_predecir].index[-mes_a_predecir], line_width=3, line_dash="dash", line_color="green")
    fig.update_layout(template = None, title_text = f"Predicción de las exportaciones de los siguientes {mes_a_predecir} meses en base a una regresión múltiple",
                      font_family = "georgia")
    return fig

plot_prediccion_rm(models_predictions, mes_a_predecir = 6)

In [52]:
plot_prediccion_rm(models_predictions, mes_a_predecir = 6).write_image("../output/Prediccion_rm_plot.pdf")