In [6]:
import pandas as pd

# === Step 1: Load original datasets ===
w_bias = pd.read_parquet('./data/Results/RF_with_lookahead_raw_005.parquet')            # Replication of Panel A
df_train = pd.read_csv('./data/Results/df_train_a1.csv')     # IBES-merged dataset with actuals

In [22]:
# Print all column names
print(df_train.columns.tolist())
print(w_bias.columns.tolist())

['Unnamed: 0', 'FPEDATS', 'EPS_ana_y1', 'EPS_true_y1', 'EPS_true_l1_y1', 'ANNDATS_y1', 'ANNDATS_l1_y1', 'permno', 'YearMonth', 'siccd', 'ret', 'prc', 'bh1m', 'shrout', 'ME', 'prc_l1', 'gvkey', 'adate', 'qdate', 'public_date', 'CAPEI', 'bm', 'evm', 'pe_op_basic', 'pe_op_dil', 'pe_exi', 'pe_inc', 'ps', 'pcf', 'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe', 'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity', 'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap', 'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt', 'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at', 'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct', 'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets', 'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio', 'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn', 'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc', 'rd_sale', 'adv_sale', 'staff

In [25]:
# Convert YearMonth to datetime
w_bias['YearMonth'] = pd.to_datetime(w_bias['YearMonth'])
df_train['YearMonth'] = pd.to_datetime(df_train['YearMonth'])

# Drop rows in w_bias with missing AE_y1 before merging
w_bias = w_bias.dropna(subset=['AE_y1'])

# Merge EPS_true_y1 into w_bias
w_bias_merged = w_bias.merge(
    df_train[['permno', 'YearMonth', 'EPS_true_y1']],
    on=['permno', 'YearMonth'],
    how='left'
)

# Create new variable: difference between actual and forecasted earnings
w_bias_merged['eps_diff_y1'] = w_bias_merged['EPS_true_y1'] - w_bias_merged['AE_y1']

# Preview result
print(w_bias_merged[['permno', 'YearMonth', 'AF_y1', 'EPS_true_y1', 'eps_diff_y1']].head())

    permno  YearMonth   AF_y1  EPS_true_y1  eps_diff_y1
0  10057.0 1986-01-31  1.2000        -0.34          0.0
1  10145.0 1986-01-31  3.9800         3.40          0.0
2  10154.0 1986-01-31  1.1500         1.01          0.0
3  10218.0 1986-01-31  0.1500         0.01          0.0
4  10364.0 1986-01-31 -1.8976        -2.87          0.0


In [27]:
# Drop rows with missing values in either actual or forecast
valid = w_bias_merged.dropna(subset=['EPS_true_y1', 'AE_y1'])

# Check if actual != forecast
diff = valid['EPS_true_y1'] != valid['AE_y1']

# Compute percentage
diff_rate = diff.mean()
print(f"Percentage of EPS_true_y1 ≠ AE_y1: {diff_rate:.2%} ({diff.sum()} out of {len(valid)} rows)")

Percentage of EPS_true_y1 ≠ AE_y1: 1.51% (18853 out of 1245902 rows)


In [28]:
# Save to Stata format
w_bias_merged.to_stata('data/w_bias_jz_y1.dta', write_index=False)