**Table with the RMSE values for the DFM specification with 2 Hard+Surveys factors and lag order 3 using four forecast setups**:

- Hard + Surveys  
- Hard + Surveys + Topics  
- Hard + Surveys + Sentiment-adjusted Topics (BPW)
- Hard + Surveys + Sign-adjusted Topics (BCC)

**Evaluation subperiods:** 2008-2018, 2008-2010, and 2011-2018.

In [1]:
import os
import pandas as pd
import numpy as np

# ---------- Helpers ----------
def quarter_to_float(q_str: str) -> float:
    """Convert a quarter label into a fractional year.

    The first four characters of ``q_str`` are read as the year and
    everything from index 5 onwards as the quarter-end month number
    (3, 6, 9 or 12), which maps to an offset of 0.00, 0.25, 0.50 or 0.75.

    Raises:
        ValueError: if the month is not one of 3, 6, 9, 12 (or if either
            part cannot be parsed as an integer).
    """
    year, month = int(q_str[:4]), int(q_str[5:])
    # Quarter-end month -> fraction of the year added to the integer year.
    offsets = {3: 0.00, 6: 0.25, 9: 0.50, 12: 0.75}
    if month not in offsets:
        raise ValueError(f"Unexpected month in quarter string: {q_str}")
    return year + offsets[month]

def ensure_date_column(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with a string-typed 'date' column.

    If ``df`` already has a 'date' column, it is cast to ``str`` in place
    and the same object is returned. Otherwise the index is taken to hold
    the dates: it is moved into a new leading column named 'date',
    whatever the index was called before, and a new frame is returned.

    Args:
        df: Frame with either a 'date' column or dates in its index.

    Returns:
        A frame whose 'date' column contains strings.
    """
    if "date" not in df.columns:
        # rename_axis names the index 'date' even when it already carries
        # another name (e.g. a CSV header such as "quarter"); without it
        # reset_index() would create a column under that other name and
        # the lookup below would raise KeyError. It also returns a new
        # frame, so the caller's index name is left untouched.
        df = df.rename_axis("date").reset_index()
    df["date"] = df["date"].astype(str)
    return df

def compute_rmse(forecasts_df: pd.DataFrame, actual_df: pd.DataFrame,
                 period_filter, vintage_cols) -> dict:
    """Compute the RMSE of each forecast vintage against realised growth.

    Forecasts and actuals are inner-joined on 'date', ordered by the
    fractional year derived from 'date', and restricted to the rows
    selected by ``period_filter``.

    Args:
        forecasts_df: Forecasts with a 'date' column and one column per vintage.
        actual_df: Actuals with a 'date' column and a 'growth' column.
        period_filter: Callable that receives the merged frame (including
            its 'date_float' column) and returns a row selector for it.
        vintage_cols: Names of the vintage columns to evaluate.

    Returns:
        Dict mapping each available vintage name to its RMSE as a float.
        Vintages absent from the merged frame are reported via ``print``
        and left out of the result.
    """
    joined = pd.merge(forecasts_df, actual_df, on="date", how="inner")
    joined["date_float"] = joined["date"].apply(quarter_to_float)
    joined = joined.sort_values("date_float")
    window = joined[period_filter(joined)]

    # Split the requested vintages into those present and those missing.
    present, absent = [], []
    for vintage in vintage_cols:
        bucket = present if vintage in window.columns else absent
        bucket.append(vintage)
    if absent:
        print(f"Warning: missing vintages skipped: {absent}")

    scores = {}
    for vintage in present:
        squared_errors = (window[vintage] - window["growth"]) ** 2
        scores[vintage] = float(np.sqrt(squared_errors.mean()))
    return scores

def load_forecast_data(path: str, filename: str) -> pd.DataFrame:
    """Read the forecast CSV ``filename`` in ``path``, normalising its 'date' column.

    The first CSV column is read as the index and then handed to
    ``ensure_date_column``, which exposes it as a string 'date' column.
    """
    csv_path = os.path.join(path, filename)
    raw = pd.read_csv(csv_path, index_col=0)
    return ensure_date_column(raw)

# ---------- Configuration ----------
# Target variable; it is interpolated into every directory and file name below.
q_var = "GDP"
# Forecast vintage columns, in the order they appear in the output table.
vintage_cols = ["M1-01", "M1-16", "M2-01", "M2-16", "M3-01", "M3-16", "M4-01"]


def _year_window(start: int, end: int):
    """Return a row filter keeping ``start <= date_float < end``."""
    return lambda df: (df["date_float"] >= start) & (df["date_float"] < end)


# Evaluation subperiods. Both bounds are explicit so that each filter matches
# its label even if the forecast files contain quarters before 2008 or after
# 2018 (previously "2011–2018" had no upper bound and the "2008–…" windows had
# no lower bound).
periods = {
    "2008–2018": _year_window(2008, 2019),
    "2008–2010": _year_window(2008, 2011),
    "2011–2018": _year_window(2011, 2019),
}

# Model specs: number of Hard+Surveys factors and the lag order used by each
# setup; these values select the forecast file names below.
number_HS = 2
order_hard_surveys = 3
order_topics = 3
order_topics_bpw = 3
order_topics_bcc = 3

# Directories (relative to the notebook's working directory)
forecasts_dir_hard_surveys = f"../../forecasts/DFM_hard_surveys_global_factors_{q_var}"
forecasts_dir_topics       = f"../../forecasts/DFM_topics_2007_200_all_separate_factors_{q_var}"
forecasts_dir_topics_bpw   = f"../../forecasts/DFM_topics_BPW_2007_200_all_separate_factors_{q_var}"
forecasts_dir_topics_bcc   = f"../../forecasts/DFM_topics_BCC_2009_200_all_selected_separate_factors_{q_var}"

# Forecast file names inside the directories above
file_hard_surveys = f"forecasts_{q_var}_Global_{number_HS}_{order_hard_surveys}.csv"
file_topics       = f"forecasts_{q_var}_HS_{number_HS}_{order_topics}.csv"
file_topics_bpw   = f"forecasts_{q_var}_HS_{number_HS}_{order_topics_bpw}.csv"
file_topics_bcc   = f"forecasts_{q_var}_HS_{number_HS}_{order_topics_bcc}.csv"

# Actuals: realised growth series that the forecasts are scored against
# (compute_rmse reads its 'growth' column and joins on 'date').
actual_path = f"../../{q_var}_growth_actual.csv"
actual_growth = pd.read_csv(actual_path)
if "date" not in actual_growth.columns:
    # No explicit 'date' header: treat the first column as the date column.
    leading_column = actual_growth.columns[0]
    actual_growth = actual_growth.rename(columns={leading_column: "date"})
# Dates are compared as strings when merging with the forecasts.
actual_growth["date"] = actual_growth["date"].astype(str)

# ---------- Load forecasts ----------
# One frame per forecast setup, loaded in table order.
hard_surveys, topics, topics_bpw, topics_bcc = (
    load_forecast_data(folder, csv_name)
    for folder, csv_name in (
        (forecasts_dir_hard_surveys, file_hard_surveys),
        (forecasts_dir_topics, file_topics),
        (forecasts_dir_topics_bpw, file_topics_bpw),
        (forecasts_dir_topics_bcc, file_topics_bcc),
    )
)

# Display name -> forecast frame; the keys are used as row labels later on.
models = dict(
    zip(
        ("Hard+Surveys", "Topics", "Topics_BPW", "Topics_BCC"),
        (hard_surveys, topics, topics_bpw, topics_bcc),
    )
)

# ---------- Compute RMSEs ----------
# results[period label][model name] -> {vintage: RMSE}
results = {}
for period_label, filt in periods.items():
    per_model = results.setdefault(period_label, {})
    for model_name, df_model in models.items():
        per_model[model_name] = compute_rmse(
            df_model, actual_growth, filt, vintage_cols
        )

# ---------- Build LaTeX ----------
# The table uses \toprule / \midrule / \bottomrule, so the including document
# must load the booktabs package.
n_table_columns = len(vintage_cols) + 1  # model-name column + one per vintage

latex_lines = [
    "\\begin{table}[ht]",
    "\\centering",
    "\\footnotesize",
    "\\renewcommand{\\arraystretch}{1.2}",
    "\\caption{RMSE scores for GDP growth forecasts across subperiods}",
    "\\label{tab:rmse_gdp_all}",
    # Column spec follows the number of vintages instead of a fixed "lccccccc".
    "\\begin{tabular}{l" + "c" * len(vintage_cols) + "}",
    "\\toprule",
]

order_in_table = ["Hard+Surveys", "Topics", "Topics_BPW", "Topics_BCC"]

last_panel_index = len(periods) - 1
for panel_index, period_label in enumerate(periods):
    panel_letter = chr(ord("A") + panel_index)
    latex_lines.append(
        f"\\multicolumn{{{n_table_columns}}}{{c}}"
        f"{{\\textbf{{Panel {panel_letter}: {period_label}}}}} \\\\"
    )
    latex_lines.append("\\midrule")
    latex_lines.append("Model & " + " & ".join(vintage_cols) + " \\\\")
    latex_lines.append("\\midrule")
    for model_name in order_in_table:
        scores = results[period_label][model_name]
        cells = []
        for vintage in vintage_cols:
            value = scores.get(vintage, np.nan)
            # Vintages without a score are rendered as empty cells.
            cells.append("" if pd.isna(value) else f"{value:.2f}")
        # A bare "_" is a LaTeX error in text mode, so names such as
        # "Topics_BPW" must be escaped before being used as row labels.
        row_label = model_name.replace("_", "\\_")
        latex_lines.append(f"{row_label} & " + " & ".join(cells) + " \\\\")
    # Separate panels with a rule, but leave the closing rule to \bottomrule
    # so the table does not end with two stacked rules.
    if panel_index < last_panel_index:
        latex_lines.append("\\midrule")

latex_lines.append("\\bottomrule")
latex_lines.append("\\end{tabular}")
latex_lines.append("\\end{table}")

# ---------- Save .tex ----------
# Write the assembled table into ./tables, creating the folder if needed.
output_dir = "tables"
output_path = os.path.join(output_dir, "table_rmse_gdp_all.tex")
os.makedirs(output_dir, exist_ok=True)

table_text = "\n".join(latex_lines)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(table_text)

print(f"Saved: {output_path}")

Saved: tables\table_rmse_gdp_all.tex
