In [1]:
try:
    !pip install tqdm
    !pip install pyarrow
except:
    pass

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tseries.offsets import *
from tqdm import tqdm
from functools import reduce
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)



In [2]:
# Load and prepare CRSP monthly data
crsp = pd.read_parquet('./data/WRDS/crsp_m.parquet')
crsp['prc'] = abs(crsp['prc'])
crsp['ME'] = crsp['prc'] * crsp['shrout']  # Market equity
crsp.sort_values(by=['permno', 'YearMonth'], inplace=True)
crsp['bh1m'] = crsp.groupby('permno')['retadj'].shift(-1)  # Buy-and-hold return next month
crsp['prc_l1'] = crsp.groupby('permno')['prc'].shift(1)    # Lagged price
print("crsp rows:", crsp.shape[0])

crsp rows: 2704599


In [3]:
# Load and prepare IBES actual EPS data
EPS_true = pd.read_stata('./data/WRDS/EPS_unadjusted_actual_full.dta')
EPS_true['YearMonth'] = EPS_true['ANNDATS'] + MonthEnd(0)
EPS_true['EPS_true'] = EPS_true['VALUE']

In [4]:
# Separate into quarterly and annual EPS
EPS_true_qtr = EPS_true[EPS_true['PDICITY'] == 'QTR'].sort_values(by=['TICKER','PENDS'])
EPS_true_ann = EPS_true[EPS_true['PDICITY'] == 'ANN'].sort_values(by=['TICKER','PENDS'])

In [5]:
# Lag EPS and announcement dates
EPS_true_qtr['EPS_true_l1'] = EPS_true_qtr.groupby('TICKER')['EPS_true'].shift(1)
EPS_true_qtr['ANNDATS_l1'] = EPS_true_qtr.groupby('TICKER')['ANNDATS'].shift(1)
EPS_true_ann['EPS_true_l1'] = EPS_true_ann.groupby('TICKER')['EPS_true'].shift(1)
EPS_true_ann['ANNDATS_l1'] = EPS_true_ann.groupby('TICKER')['ANNDATS'].shift(1)

In [6]:
# Load IBES consensus forecasts and merge with actuals
consensus = pd.read_parquet('./data/WRDS/EPS_summary.parquet')
consensus['YearMonth'] = consensus['STATPERS'] + MonthEnd(0)
consensus['EPS_ana'] = consensus['MEANEST']

In [7]:
# Split into quarterly and annual forecasts
consensus_quarter = consensus[consensus.FPI.isin(['6','7','8'])].copy()
consensus_annual = consensus[consensus.FPI.isin(['1','2'])].copy()

In [8]:
# Merge with actual EPS (quarterly)
consensus_quarter = consensus_quarter.merge(
    EPS_true_qtr[['TICKER','PENDS','EPS_true','ANNDATS','ANNDATS_l1','EPS_true_l1']], 
    left_on=['TICKER','FPEDATS'], 
    right_on=['TICKER','PENDS']
)
print("consensus_quarter after merge rows:", consensus_quarter.shape[0])

consensus_quarter after merge rows: 7035910


In [9]:
# Merge with actual EPS (annual)
consensus_annual = consensus_annual.merge(
    EPS_true_ann[['TICKER','PENDS','EPS_true','ANNDATS','ANNDATS_l1','EPS_true_l1']], 
    left_on=['TICKER','FPEDATS'], 
    right_on=['TICKER','PENDS']
)
print("consensus_annual after merge rows:", consensus_annual.shape[0])

consensus_annual after merge rows: 11758331


In [10]:
# Combine the two
consensus = pd.concat([consensus_quarter, consensus_annual], axis=0)
print("consensus combined rows:", consensus.shape[0])

consensus combined rows: 18794241


In [11]:
# Load and clean CRSP-IBES link table
iclink = pd.read_csv('./data/WRDS/iclink_WRDS.csv')
iclink.columns = ['ticker','permno','ncusip','sdate','edate','score']
iclink['sdate'] = pd.to_datetime(iclink['sdate'])
iclink['edate'] = pd.to_datetime(iclink['edate'])
iclink.dropna(subset=['permno'], inplace=True)
print("iclink rows:", iclink.shape[0])

iclink rows: 29751


In [12]:
# Process FPI = 1 (annual forecast)
a1 = consensus[consensus['FPI'] == '1'][['TICKER', 'STATPERS', 'FPEDATS', 'ANNDATS', 'CUSIP', 'EPS_ana',
                                         'EPS_true', 'EPS_true_l1', 'ANNDATS_l1']].drop_duplicates(subset=['TICKER', 'STATPERS']).copy()
print("a1 initial rows:", a1.shape[0])

a1 initial rows: 6261065


In [13]:
# Merge with CRSP-IBES link and filter by valid dates
a1 = a1.merge(iclink[['ticker', 'permno', 'sdate', 'edate', 'score']], 
              left_on='TICKER', right_on='ticker')
print("a1 after link merge rows:", a1.shape[0])

a1 after link merge rows: 3138358


In [14]:
a1 = a1[(a1['STATPERS'] >= a1['sdate']) & (a1['STATPERS'] <= a1['edate'])]
print("a1 after date filter rows:", a1.shape[0])

a1 after date filter rows: 1867422


In [15]:
# Adjust EPS using cfacshr at announcement date
a1['ANN_m'] = a1['ANNDATS'] + MonthEnd(0)
a1 = a1.merge(crsp[['permno', 'YearMonth', 'cfacshr']], 
              left_on=['permno', 'ANN_m'], right_on=['permno', 'YearMonth'])
print("a1 after EPS_true cfacshr merge rows:", a1.shape[0])

a1 after EPS_true cfacshr merge rows: 1491868


In [16]:
a1['EPS_true'] = a1['EPS_true'] / a1['cfacshr']
a1.drop(columns=['YearMonth', 'cfacshr'], inplace=True)

# Adjust lagged EPS
a1['ANN_m'] = a1['ANNDATS_l1'] + MonthEnd(0)
a1 = a1.merge(crsp[['permno', 'YearMonth', 'cfacshr']], 
              left_on=['permno', 'ANN_m'], right_on=['permno', 'YearMonth'],
              how='left')
print("a1 after EPS_true_l1 cfacshr merge rows:", a1.shape[0])

a1 after EPS_true_l1 cfacshr merge rows: 1491868


In [17]:
a1['cfacshr'] = pd.to_numeric(a1['cfacshr'], errors='coerce')
a1['EPS_true_l1'] = pd.to_numeric(a1['EPS_true_l1'], errors='coerce')
a1['EPS_true_l1'] = a1['EPS_true_l1'] / a1['cfacshr']
a1.drop(columns=['YearMonth', 'cfacshr', 'sdate', 'edate', 'ANN_m'], inplace=True)

# Adjust both EPS to current cfacshr at STATPERS date
a1['YearMonth'] = a1['STATPERS'] + MonthEnd(0)
a1 = a1.merge(crsp[['permno', 'YearMonth', 'cfacshr', 'ncusip']], on=['permno', 'YearMonth'])
print("a1 after final cfacshr merge rows:", a1.shape[0])

a1 after final cfacshr merge rows: 1490170


In [18]:
a1['EPS_true'] = a1['EPS_true'] * a1['cfacshr']
a1['EPS_true_l1'] = a1['EPS_true_l1'] * a1['cfacshr']

# Keep observations where CUSIP matches
a1 = a1[a1['CUSIP'] == a1['ncusip']].copy()
print("a1 after CUSIP match rows:", a1.shape[0])

a1.drop(columns=['cfacshr', 'ncusip'], inplace=True)

a1 after CUSIP match rows: 1486269


In [19]:
a1.head

<bound method NDFrame.head of         TICKER   STATPERS    FPEDATS    ANNDATS     CUSIP  EPS_ana  EPS_true  \
0         0000 2014-04-17 2014-12-31 2015-01-30  87482X10     0.52      1.21   
1         0000 2014-05-15 2014-12-31 2015-01-30  87482X10     0.56      1.21   
2         0000 2014-06-19 2014-12-31 2015-01-30  87482X10     0.56      1.21   
3         0000 2014-07-17 2014-12-31 2015-01-30  87482X10     0.56      1.21   
4         0000 2014-08-14 2014-12-31 2015-01-30  87482X10     1.18      1.21   
...        ...        ...        ...        ...       ...      ...       ...   
1490165   ZYNX 2022-10-20 2022-12-31 2023-03-13  98986M10     0.43      0.44   
1490166   ZYNX 2022-11-17 2022-12-31 2023-03-13  98986M10     0.43      0.44   
1490167   ZYNX 2022-12-15 2022-12-31 2023-03-13  98986M10     0.43      0.44   
1490168   ZYNX 2023-01-19 2022-12-31 2023-03-13  98986M10     0.43      0.44   
1490169   ZYNX 2023-02-16 2022-12-31 2023-03-13  98986M10     0.43      0.44   

         