Modelo de lo general a lo particular

Factores de corto y largo plazo

In [1]:
import pandas as pd
import statsmodels.api as sm
import plotly.graph_objects as go
from stargazer.stargazer import Stargazer
import re
import plotly.graph_objects as go

In [2]:
# Forecast configuration shared by the cells below.
exogenous_list = ["pp", "pre", "er_cp", "gap", "pi"]  # variables whose lags are used as regressors
max_lag = 12  # largest lag generated for every variable
mes_max_prediccion = 6  # longest forecast horizon that is modelled

In [3]:
# Load the spreadsheet (its first column becomes the index) and keep the rows
# up to the "2022-11" label; assumes the index holds dates - TODO confirm.
df = pd.read_excel("../data/data.xlsx", index_col=0).loc[:"2022-11"]
df

Unnamed: 0,pp,pi,gap,er_cp,pre
2005-01-31,508.020737,86.866026,1.154717,827.759645,1423.333333
2005-02-28,500.539941,87.907449,1.160477,805.766198,969.666667
2005-03-31,656.181802,86.226525,1.170061,794.915295,1730.000000
2005-04-30,939.064907,87.964450,1.149008,797.011580,620.000000
2005-05-31,1028.823820,89.117152,1.121609,789.385468,54.666667
...,...,...,...,...,...
2022-07-31,2431.418408,208.867735,2.355276,346.523629,0.000000
2022-08-31,1988.408046,200.959296,2.149003,341.449828,146.666667
2022-09-30,2071.879245,204.883577,2.071033,340.484956,106.666667
2022-10-31,2339.045364,205.876822,2.022584,340.216828,543.333333


In [4]:
# df = pd.read_stata("../data/variables_productos_primarios.dta",index_col="fecha")
# df = df.drop(["mes","anio"],axis=1)
# df["pp"] = df.pp / 1_000_000
# df

In [5]:
def expand_df_max_lag(df_diff:pd.DataFrame, max_lag:int):
    """Append ``max_lag`` empty rows after the last date of ``df_diff``.

    The new index labels are month starts ('MS'), beginning with the first
    month start on or after the day following the current maximum index
    value. The appended rows carry no data, so every column is NaN there.

    Returns a new DataFrame; ``df_diff`` itself is not modified.
    """
    first_candidate = df_diff.index.max() + pd.DateOffset(days=1)
    future_index = pd.date_range(start=first_candidate, periods=max_lag, freq='MS')
    padding = pd.DataFrame(index=future_index)
    return pd.concat([df_diff, padding])

def create_lag_columns(df:pd.DataFrame, variable_name:str, max_lag:int):
    """Return a copy of ``df`` with lagged versions of one column appended.

    For every lag from 1 to ``max_lag`` a column named
    ``f"{variable_name}_lag_{lag}"`` is added, holding ``df[variable_name]``
    shifted down by ``lag`` rows. The new columns are appended in increasing
    lag order; an existing column with the same name is overwritten.

    The input frame is left untouched: ``DataFrame.assign`` builds a new
    object, so callers that keep a reference to ``df`` do not see it change.
    """
    source = df[variable_name]
    lagged = {
        f"{variable_name}_lag_{lag}": source.shift(lag)
        for lag in range(1, max_lag + 1)
    }
    return df.assign(**lagged)

In [6]:
# Pad the frame with empty future rows, then add lags 1..max_lag of every
# variable (the order below fixes the column order of the result).
df = expand_df_max_lag(df, max_lag)
for lagged_variable in ("er_cp", "pi", "pre", "pp", "gap"):
    df = create_lag_columns(df, lagged_variable, max_lag)

# Row-to-row first differences of every column, lag columns included.
df_diff = df.diff()
df_diff

Unnamed: 0,pp,pi,gap,er_cp,pre,er_cp_lag_1,er_cp_lag_2,er_cp_lag_3,er_cp_lag_4,er_cp_lag_5,...,gap_lag_3,gap_lag_4,gap_lag_5,gap_lag_6,gap_lag_7,gap_lag_8,gap_lag_9,gap_lag_10,gap_lag_11,gap_lag_12
2005-01-31,,,,,,,,,,,...,,,,,,,,,,
2005-02-28,-7.480796,1.041423,0.005760,-21.993447,-453.666667,,,,,,...,,,,,,,,,,
2005-03-31,155.641861,-1.680925,0.009585,-10.850903,760.333333,-21.993447,,,,,...,,,,,,,,,,
2005-04-30,282.883105,1.737925,-0.021054,2.096285,-1110.000000,-10.850903,-21.993447,,,,...,,,,,,,,,,
2005-05-31,89.758913,1.152702,-0.027399,-7.626112,-565.333333,2.096285,-10.850903,-21.993447,,,...,0.00576,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-01,,,,,,,,,,,...,,,,,,-0.064416,-0.048449,-0.077971,-0.206273,0.469452
2023-08-01,,,,,,,,,,,...,,,,,,,-0.064416,-0.048449,-0.077971,-0.206273
2023-09-01,,,,,,,,,,,...,,,,,,,,-0.064416,-0.048449,-0.077971
2023-10-01,,,,,,,,,,,...,,,,,,,,,-0.064416,-0.048449


## Lag óptimo
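Si la predicción es para el mes siguiente: quedarse como exógenas con todas las columnas que tengan al menos un lag (en general, solo se conservan los lags mayores o iguales al número de meses a predecir).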

In [7]:
def obtener_exogenas_prediccion(df:pd.DataFrame, mes_a_predecir:int):
    """Select the columns usable as regressors for a given horizon.

    A column is kept when its name ends in a run of digits whose integer
    value is greater than or equal to ``mes_a_predecir``. Columns whose name
    does not end in digits are discarded. Column order is preserved.
    """
    def _qualifies(name):
        trailing = re.search(r'\d+$', name)
        return trailing is not None and int(trailing.group()) >= mes_a_predecir

    return df[[name for name in df.columns if _qualifies(name)]]

def limitar_rezagos_variable(df:pd.DataFrame, variable:str, rezago_max:int):
    """Keep only the columns of one variable, up to a maximum lag.

    Columns are first restricted to those whose name starts with
    ``variable`` (a plain prefix match). Among them, every column whose name
    ends in a number greater than ``rezago_max`` is dropped; columns without
    a trailing number are kept.
    """
    own_columns = df.iloc[:, df.columns.str.startswith(variable)]

    def _exceeds_limit(name):
        trailing = re.search(r'\d+$', name)
        return trailing is not None and int(trailing.group()) > rezago_max

    too_long = [name for name in own_columns.columns if _exceeds_limit(name)]
    return own_columns.drop(too_long, axis=1)

In [8]:
# Calibration of the exogenous variables: each variable is regressed (OLS with
# a constant) on its own lags, once per candidate maximum lag, and the fitted
# models are exported side by side as a LaTeX table with their AIC values.

# The training sample and the candidate regressors do not depend on the
# variable or on the lag being tried, so they are built once.
# mes_max_prediccion and max_lag replace the literals 6 and 13 (= max_lag + 1).
df_train = df_diff[:-mes_max_prediccion].copy().dropna()
X_train = obtener_exogenas_prediccion(df_train, mes_max_prediccion)

for exogena in exogenous_list:
    y_train = df_train[exogena]
    modelos_pp = []  # one fitted model per candidate maximum lag of `exogena`
    for rezago in range(mes_max_prediccion, max_lag + 1):
        X = limitar_rezagos_variable(X_train, exogena, rezago)
        X = sm.add_constant(X)
        model = sm.OLS(y_train, X).fit()
        modelos_pp.append(model)
    aic_values = [fitted.aic for fitted in modelos_pp]
    stargazer = Stargazer(modelos_pp)
    aic_notes = [f'Model {i+1} AIC: {aic}' for i, aic in enumerate(aic_values)]
    stargazer.add_custom_notes(aic_notes)
    # Mode "w" overwrites an existing file; the context manager closes the
    # handle even if rendering or writing raises.
    with open(f"../output/calibration_mr/stargazer_calibracion_{exogena}.tex", "w") as tex_file:
        tex_file.write(stargazer.render_latex())

In [9]:
def add_lag_calibration_aic(X_df:pd.DataFrame, exog: list[str], y_df:pd.DataFrame, max_lag:int, mes_a_predecir:int):
    """Tabulate the AIC of one-variable OLS fits for every candidate lag.

    For each variable in ``exog`` and each maximum lag from
    ``mes_a_predecir`` to ``max_lag`` (inclusive), ``y_df`` is regressed on a
    constant plus that variable's columns of ``X_df`` limited to the lag.

    Returns a DataFrame with one column per variable and one row per
    candidate lag, holding the AIC of the corresponding fit.
    """
    aic_table = pd.DataFrame(columns=exog)
    candidate_lags = range(mes_a_predecir, max_lag + 1)
    for variable in exog:
        for rezago in candidate_lags:
            regressors = sm.add_constant(limitar_rezagos_variable(X_df, variable, rezago))
            fitted = sm.OLS(y_df, regressors).fit()
            # Label-based assignment creates the row the first time a lag is seen.
            aic_table.loc[rezago, variable] = fitted.aic
    return aic_table

def get_lowest_aic_lag(aic_results:pd.DataFrame):
    """Map each column of ``aic_results`` to the index label of its smallest value.

    Every column is sorted in ascending order and the label of the first row
    is taken as that column's optimal lag.
    """
    return {
        column: aic_results[column].sort_values(ascending=True).index[0]
        for column in aic_results.columns
    }

def get_aic_results(df_diff:pd.DataFrame,mes_a_predecir:int):
    """Compute the AIC table used to choose lags for one forecast horizon.

    The last ``mes_a_predecir`` rows of ``df_diff`` are removed, rows with
    any missing value are dropped, and "pp" is regressed on the lags of each
    variable in the module-level ``exogenous_list`` (lags from
    ``mes_a_predecir`` up to the module-level ``max_lag``).

    Returns the AIC table with its index named "Rezagos".

    NOTE(review): rows are trimmed before ``dropna`` here, whereas the
    model-fitting cell applies ``dropna`` first and trims afterwards -
    confirm which order is intended.
    """
    training = df_diff[:-mes_a_predecir].copy().dropna()
    target = training["pp"]
    candidates = obtener_exogenas_prediccion(training, mes_a_predecir)

    aic_table = add_lag_calibration_aic(candidates, exogenous_list, target, max_lag, mes_a_predecir)
    aic_table.index.name = "Rezagos"
    return aic_table
    
def write_aic_lags(df_diff:pd.DataFrame, mes_max_prediccion:int):
    """Write the AIC table of every horizon to one Excel workbook.

    One sheet named ``mes_a_predecir_<h>`` is written for each horizon ``h``
    from 1 to ``mes_max_prediccion`` into
    ``../data/calibration_rm/aic_lags.xlsx``.

    The writer is used as a context manager so the workbook is closed even
    when computing or writing one of the tables raises.
    """
    with pd.ExcelWriter("../data/calibration_rm/aic_lags.xlsx", engine="xlsxwriter") as writer:
        for mes_a_predecir in range(1, mes_max_prediccion + 1):
            aic_results = get_aic_results(df_diff, mes_a_predecir)
            aic_results.to_excel(writer, sheet_name=f"mes_a_predecir_{mes_a_predecir}")
    
def get_optimal_lag_dict(df_diff:pd.DataFrame,mes_max_prediccion:int):
    """Return the lowest-AIC lag of every variable for every horizon.

    The result maps each horizon from 1 to ``mes_max_prediccion`` to the
    dictionary produced by ``get_lowest_aic_lag`` for that horizon's AIC
    table.
    """
    return {
        horizon: get_lowest_aic_lag(get_aic_results(df_diff, horizon))
        for horizon in range(1, mes_max_prediccion + 1)
    }

In [10]:
# Optimal (lowest-AIC) lag per variable for every forecast horizon.
optimal_lag_dict = get_optimal_lag_dict(
    df_diff=df_diff,
    mes_max_prediccion=mes_max_prediccion,
)
optimal_lag_dict

{1: {'pp': 12, 'pre': 9, 'er_cp': 1, 'gap': 4, 'pi': 1},
 2: {'pp': 12, 'pre': 9, 'er_cp': 2, 'gap': 4, 'pi': 2},
 3: {'pp': 12, 'pre': 8, 'er_cp': 3, 'gap': 4, 'pi': 5},
 4: {'pp': 12, 'pre': 7, 'er_cp': 4, 'gap': 4, 'pi': 5},
 5: {'pp': 12, 'pre': 12, 'er_cp': 5, 'gap': 5, 'pi': 5},
 6: {'pp': 12, 'pre': 12, 'er_cp': 6, 'gap': 11, 'pi': 6}}

## Modelo
1. Definir training y test. En test, se tiene que expandir la base de los lags
2. Correr la regresión con los lags óptimos y guardar el modelo

In [11]:
def get_lags_to_drop(columns:list[str], rezago_max:int):
    """List the names in ``columns`` whose trailing number exceeds ``rezago_max``.

    Names that do not end in digits are never included. The input order is
    preserved.
    """
    with_trailing = ((name, re.search(r'\d+$', name)) for name in columns)
    return [
        name
        for name, trailing in with_trailing
        if trailing and int(trailing.group()) > rezago_max
    ]

In [12]:
def drop_lags_exog(exogenas:list[str], X:pd.DataFrame, mes_a_predecir:int):
    """Drop, for each variable, the lag columns beyond its optimal lag.

    The optimal lag of every variable is read from the module-level
    ``optimal_lag_dict`` entry for ``mes_a_predecir``. A variable's columns
    are found by a plain prefix match on the column names.

    Returns the reduced frame.
    """
    for exog in exogenas:
        own_columns = X.columns[X.columns.str.startswith(exog)]
        best_lag = optimal_lag_dict[mes_a_predecir][exog]
        excess = get_lags_to_drop(own_columns, best_lag)
        X = X.drop(columns=excess)
    return X

In [13]:
# Fit one OLS model of "pp" per forecast horizon, using only the lags that are
# at least as long as the horizon and no longer than each variable's optimal lag.
modelos_rm = {}
for mes_a_predecir in range(1, mes_max_prediccion + 1):
    # Complete rows only, leaving out the last `mes_a_predecir` of them.
    df_train = df_diff.dropna()[:-mes_a_predecir]
    y_train = df_train["pp"]
    X_train = drop_lags_exog(
        exogenas=exogenous_list,
        X=obtener_exogenas_prediccion(df_train, mes_a_predecir),
        mes_a_predecir=mes_a_predecir,
    )
    model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
    modelos_rm[mes_a_predecir] = model

In [14]:
# Export the horizon-6 model as a Stargazer LaTeX table.
stargazer = Stargazer([modelos_rm[6]])
# Mode "w" overwrites an existing file; the context manager flushes and closes
# the handle, which a bare open()/write() pair never did.
with open("../output/stargazer_rm_final_stargazer.tex", "w") as tex_file:
    tex_file.write(stargazer.render_latex())
# print(stargazer.title("Regresión múltiple: productos primarios"))

1494

In [15]:
# Export the statsmodels summary of the horizon-6 model as LaTeX. The context
# manager flushes and closes the file once the summary has been written.
with open("../output/stargazer_rm_final_py.tex", "w") as tex_file:
    tex_file.write(modelos_rm[6].summary().as_latex())

4917

In [16]:
# model = modelos_rm[6]
# model.resid.plot()

In [17]:
# For every horizon, build a table of observed vs. predicted differences of
# "pp" and convert both back to levels by adding the previous level.
models_predictions = {}
for mes_a_predecir in range(1, mes_max_prediccion + 1):
    # Skip the first max_lag + 1 rows and stop (max_lag - mes_a_predecir) rows
    # before the end. An explicit end position is used because a negative stop
    # of 0 would give an empty slice when mes_a_predecir == max_lag.
    test_end = len(df_diff) - (max_lag - mes_a_predecir)
    y_test = df_diff.iloc[max_lag + 1:test_end]["pp"]
    X_test = obtener_exogenas_prediccion(df_diff, mes_a_predecir=mes_a_predecir).iloc[max_lag + 1:test_end]
    X_test = drop_lags_exog(exogenas=exogenous_list, X=X_test, mes_a_predecir=mes_a_predecir)
    model = modelos_rm[mes_a_predecir]
    prediction = model.predict(sm.add_constant(X_test))
    # The three series are aligned on their index; "pp_lag_1" comes from the
    # level frame `df`, not from the differenced one.
    observ_vs_predict_df = pd.DataFrame({'observed': y_test,
                                         'predicted': prediction,
                                         'pp_lag_1': df["pp_lag_1"]
                                         })
    observ_vs_predict_df["observed_reverted"] = observ_vs_predict_df["pp_lag_1"] + observ_vs_predict_df["observed"]
    observ_vs_predict_df["predicted_reverted"] = observ_vs_predict_df["pp_lag_1"] + observ_vs_predict_df["predicted"]
    # Keep only the rows for which the model produced a prediction.
    observ_vs_predict_df = observ_vs_predict_df.dropna(subset=["predicted"])

    # Rows without a previous observed level (presumably the padded future
    # months) are filled recursively: previous predicted level + predicted
    # change. Each pass can fill the first gap after a known value.
    reverted = observ_vs_predict_df["predicted_reverted"]
    while reverted.isnull().any():
        filled = reverted.fillna(reverted.shift(1) + observ_vs_predict_df["predicted"])
        if filled.isnull().sum() == reverted.isnull().sum():
            # Nothing could be filled on this pass (e.g. the first row is
            # missing); stop instead of looping forever.
            break
        reverted = filled
    observ_vs_predict_df["predicted_reverted"] = reverted

    # No temporary columns are created, so every horizon is stored, including
    # those where nothing needed filling.
    models_predictions[mes_a_predecir] = observ_vs_predict_df

In [18]:
# Last 20 rows of the 3-month-horizon results.
models_predictions[3].tail(20)

Unnamed: 0,observed,predicted,pp_lag_1,observed_reverted,predicted_reverted
2021-07-31,340.268769,4.809357,2020.014984,2360.283753,2024.824341
2021-08-31,406.068545,-88.132923,2360.283753,2766.352298,2272.15083
2021-09-30,-476.440505,-167.338726,2766.352298,2289.911793,2599.013572
2021-10-31,-463.71806,-128.312002,2289.911793,1826.193733,2161.599791
2021-11-30,-507.902805,-261.1031,1826.193733,1318.290929,1565.090633
2021-12-31,434.133249,-32.951661,1318.290929,1752.424178,1285.339268
2022-01-31,129.630687,20.155217,1752.424178,1882.054864,1772.579395
2022-02-28,-50.345857,-48.532846,1882.054864,1831.709007,1833.522018
2022-03-31,251.076815,140.256683,1831.709007,2082.785822,1971.96569
2022-04-30,212.692357,335.560393,2082.785822,2295.478179,2418.346215


In [19]:
def plot_prediccion_rm(models_predictions: dict[int, pd.DataFrame], mes_a_predecir: int = mes_max_prediccion):
    """Plot observed against predicted levels for one forecast horizon.

    Parameters
    ----------
    models_predictions
        Mapping from horizon to a results frame that has the columns
        ``observed_reverted`` and ``predicted_reverted``.
    mes_a_predecir
        Horizon whose results are plotted; defaults to the module-level
        ``mes_max_prediccion``.

    Returns
    -------
    The plotly figure with both series and a dashed green vertical line at
    the row ``mes_a_predecir`` positions from the end of the results frame.
    """
    resultados = models_predictions[mes_a_predecir]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=resultados.index, y=resultados.observed_reverted, name="Observado"))
    fig.add_trace(go.Scatter(x=resultados.index, y=resultados.predicted_reverted, name="Predicción"))
    # Presumably marks where the out-of-sample months begin - TODO confirm.
    fig.add_vline(x=resultados.index[-mes_a_predecir], line_width=3, line_dash="dash", line_color="green")
    fig.update_layout(template=None,
                      title_text=f"Predicción de las exportaciones de los siguientes {mes_a_predecir} meses en base a una regresión múltiple",
                      font_family="georgia")
    return fig

plot_prediccion_rm(models_predictions, mes_a_predecir = 6)

In [20]:
# Render the 6-month-horizon figure and save it as a PDF.
figura_prediccion = plot_prediccion_rm(models_predictions, mes_a_predecir = 6)
figura_prediccion.write_image("../output/Prediccion_rm_plot.pdf")