In [4]:
import pandas as pd

# === Step 1: Load original datasets ===
w_bias = pd.read_parquet('./data/Results/RF_with_lookahead_raw_005.parquet')            # Replication of Panel A
df_train = pd.read_csv('./data/Results/df_train_a1.csv')     # IBES-merged dataset with actuals

In [2]:
# Print all column names
print(df_train.columns.tolist())
print(w_bias.columns.tolist())

['Unnamed: 0', 'FPEDATS', 'EPS_ana_y1', 'EPS_true_y1', 'EPS_true_l1_y1', 'ANNDATS_y1', 'ANNDATS_l1_y1', 'permno', 'YearMonth', 'siccd', 'ret', 'prc', 'bh1m', 'shrout', 'ME', 'prc_l1', 'gvkey', 'adate', 'qdate', 'public_date', 'CAPEI', 'bm', 'evm', 'pe_op_basic', 'pe_op_dil', 'pe_exi', 'pe_inc', 'ps', 'pcf', 'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe', 'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity', 'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap', 'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt', 'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at', 'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct', 'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets', 'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio', 'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn', 'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc', 'rd_sale', 'adv_sale', 'staff

In [3]:
# Convert YearMonth to datetime
w_bias['YearMonth'] = pd.to_datetime(w_bias['YearMonth'])
df_train['YearMonth'] = pd.to_datetime(df_train['YearMonth'])

# Drop rows in w_bias with missing AE_y1 before merging
w_bias = w_bias.dropna(subset=['AE_y1'])

# Merge EPS_true_y1 into w_bias
w_bias_merged = w_bias.merge(
    df_train[['permno', 'YearMonth', 'EPS_true_y1', 'FPEDATS']],
    on=['permno', 'YearMonth'],
    how='left'
)

# Create new variable: difference between actual and forecasted earnings
w_bias_merged['eps_diff_y1'] = w_bias_merged['EPS_true_y1'] - w_bias_merged['AE_y1']

# Preview result
print(w_bias_merged[['permno', 'YearMonth', 'AF_y1', 'EPS_true_y1', 'eps_diff_y1']].head())

    permno  YearMonth   AF_y1  EPS_true_y1  eps_diff_y1
0  10057.0 1986-01-31  1.2000        -0.34          0.0
1  10145.0 1986-01-31  3.9800         3.40          0.0
2  10154.0 1986-01-31  1.1500         1.01          0.0
3  10218.0 1986-01-31  0.1500         0.01          0.0
4  10364.0 1986-01-31 -1.8976        -2.87          0.0


In [4]:
# Drop rows with missing values in either actual or forecast
valid = w_bias_merged.dropna(subset=['EPS_true_y1', 'AE_y1'])

# Check if actual != forecast
diff = valid['EPS_true_y1'] != valid['AE_y1']

# Compute percentage
diff_rate = diff.mean()
print(f"Percentage of EPS_true_y1 ≠ AE_y1: {diff_rate:.2%} ({diff.sum()} out of {len(valid)} rows)")

Percentage of EPS_true_y1 ≠ AE_y1: 1.51% (18853 out of 1245902 rows)


In [7]:
# Save to Stata format
w_bias_merged.to_stata('./data/w_bias_jz_y1_w_date.dta', write_index=False)

In [None]:
df_test = df_test[df_test['EPS_true_q1'].notna()]

# New Test

In [17]:
df_test = pd.read_parquet('./data/Results/df_train_new.parquet')

In [18]:
df_test = df_test[df_test['EPS_true_q1'].notna()]

In [19]:
print(df_test.shape[0])

1244903


In [20]:
# Read the Stata file
df_train_a1_U = pd.read_stata('./data/Results/df_train_a1_U.dta')

# Print number of rows
print(len(df_train_a1_U))

1348775


In [22]:
# Convert both yearmonth columns to consistent string format 'YYYY-MM'
df_test_renamed['yearmonth'] = pd.to_datetime(df_test_renamed['yearmonth']).dt.to_period('M').astype(str)
df_train_a1_U['yearmonth'] = pd.to_datetime(df_train_a1_U['yearmonth']).dt.to_period('M').astype(str)

# Now do the merge
merged = df_test_renamed.merge(
    df_train_a1_U,
    on=['permno', 'yearmonth'],
    how='outer',
    indicator=True
)

# Check match result
print(merged['_merge'].value_counts())

_merge
both          1207455
right_only     141320
left_only       37448
Name: count, dtype: int64


In [12]:
print(df_test.columns.tolist())

['EPS_ana_q1', 'EPS_true_q1', 'EPS_true_l1_q1', 'ANNDATS_q1', 'ANNDATS_l1_q1', 'permno', 'YearMonth', 'EPS_ana_q2', 'EPS_true_q2', 'EPS_true_l1_q2', 'ANNDATS_q2', 'ANNDATS_l1_x', 'EPS_ana_q3', 'EPS_true_q3', 'EPS_true_l1_q3', 'ANNDATS_q3', 'ANNDATS_l1_y', 'EPS_ana_y1', 'EPS_true_y1', 'EPS_true_l1_y1', 'ANNDATS_y1', 'ANNDATS_l1_y1', 'EPS_ana_y2', 'EPS_true_y2', 'EPS_true_l1_y2', 'ANNDATS_y2', 'ANNDATS_l1_y2', 'siccd', 'ret', 'prc', 'bh1m', 'shrout', 'ME', 'prc_l1', 'gvkey', 'adate', 'qdate', 'public_date', 'CAPEI', 'bm', 'evm', 'pe_op_basic', 'pe_op_dil', 'pe_exi', 'pe_inc', 'ps', 'pcf', 'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe', 'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity', 'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap', 'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt', 'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at', 'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct', 'ocf_lct', 'cash_debt',

In [14]:
print(df_train_a1_U.columns.tolist())

['v1', 'statpers', 'fpedats', 'eps_ana_y1', 'eps_true_y1', 'eps_true_l1_y1', 'anndats_y1', 'anndats_l1_y1', 'permno', 'yearmonth', 'siccd', 'ret', 'prc', 'bh1m', 'shrout', 'me', 'prc_l1', 'gvkey', 'adate', 'qdate', 'public_date', 'capei', 'bm', 'evm', 'pe_op_basic', 'pe_op_dil', 'pe_exi', 'pe_inc', 'ps', 'pcf', 'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe', 'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity', 'pretret_noa', 'pretret_earnat', 'gprof', 'equity_invcap', 'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt', 'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at', 'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct', 'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets', 'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio', 'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn', 'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc', 'rd_sale', 'adv_sale', 's

In [23]:
# Rows in df_test_renamed but not in df_train_a1_U
left_only = merged[merged['_merge'] == 'left_only']
print("Rows only in df_test_renamed (left_only):")
print(left_only[['permno', 'yearmonth']].head())  # show a preview
print(f"Total left_only: {len(left_only)}\n")

# Rows in df_train_a1_U but not in df_test_renamed
right_only = merged[merged['_merge'] == 'right_only']
print("Rows only in df_train_a1_U (right_only):")
print(right_only[['permno', 'yearmonth']].head())  # show a preview
print(f"Total right_only: {len(right_only)}")

Rows only in df_test_renamed (left_only):
      permno yearmonth
192  10001.0   2014-09
335  10008.0   1988-04
336  10008.0   1988-07
410  10010.0   1995-01
411  10010.0   1995-02
Total left_only: 37448

Rows only in df_train_a1_U (right_only):
    permno yearmonth
0  10001.0   1989-03
1  10001.0   1989-04
2  10001.0   1989-05
3  10001.0   1989-06
4  10001.0   1989-07
Total right_only: 141320


In [24]:
# Ensure yearmonth is datetime
df_test['yearmonth'] = pd.to_datetime(df_test['YearMonth'])  # or 'yearmonth' if already renamed
df_train_a1_U['yearmonth'] = pd.to_datetime(df_train_a1_U['yearmonth'])

# Print date ranges
print("df_test yearmonth range:")
print(df_test['yearmonth'].min(), "to", df_test['yearmonth'].max())

print("\ndf_train_a1_U yearmonth range:")
print(df_train_a1_U['yearmonth'].min(), "to", df_train_a1_U['yearmonth'].max())

df_test yearmonth range:
1984-01-31 00:00:00 to 2019-12-31 00:00:00

df_train_a1_U yearmonth range:
1984-01-01 00:00:00 to 2019-12-01 00:00:00


In [25]:
# 1. Convert to datetime if not already
df_test['yearmonth'] = pd.to_datetime(df_test['yearmonth'])
df_train_a1_U['yearmonth'] = pd.to_datetime(df_train_a1_U['yearmonth'])

# 2. Find the overlapping range
start = max(df_test['yearmonth'].min(), df_train_a1_U['yearmonth'].min())
end = min(df_test['yearmonth'].max(), df_train_a1_U['yearmonth'].max())

# 3. Subset both datasets to the common date range
df_test_sub = df_test[(df_test['yearmonth'] >= start) & (df_test['yearmonth'] <= end)]
df_train_a1_U_sub = df_train_a1_U[(df_train_a1_U['yearmonth'] >= start) & (df_train_a1_U['yearmonth'] <= end)]

# 4. Print row counts after subsetting
print("Row count after subsetting to common yearmonth range:")
print(f"df_test_sub: {len(df_test_sub)}")
print(f"df_train_a1_U_sub: {len(df_train_a1_U_sub)}")

Row count after subsetting to common yearmonth range:
df_test_sub: 1241992
df_train_a1_U_sub: 1347561
