In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tseries.offsets import *
from tqdm import tqdm
from functools import reduce
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Load and prepare CRSP monthly data
crsp = pd.read_parquet('./data/WRDS/crsp_m.parquet')
crsp['prc'] = abs(crsp['prc'])
crsp['ME'] = crsp['prc'] * crsp['shrout']  # Market equity
crsp.sort_values(by=['permno', 'YearMonth'], inplace=True)
crsp['bh1m'] = crsp.groupby('permno')['retadj'].shift(-1)  # Buy-and-hold return next month
crsp['prc_l1'] = crsp.groupby('permno')['prc'].shift(1)    # Lagged price
crsp.duplicated(subset=['permno','YearMonth']).sum()
print("crsp rows:", crsp.shape[0])

crsp rows: 2704599


In [3]:
# Load and prepare IBES actual EPS data
EPS_true = pd.read_stata('./data/WRDS/EPS_unadjusted_actual_full.dta')
EPS_true['YearMonth'] = EPS_true['ANNDATS'] + MonthEnd(0)
EPS_true['EPS_true'] = EPS_true['VALUE']

In [4]:
# Separate into quarterly and annual EPS
EPS_true_qtr = EPS_true[EPS_true['PDICITY'] == 'QTR'].sort_values(by=['TICKER','PENDS'])
EPS_true_ann = EPS_true[EPS_true['PDICITY'] == 'ANN'].sort_values(by=['TICKER','PENDS'])

In [5]:
# Lag EPS and announcement dates
EPS_true_qtr['EPS_true_l1'] = EPS_true_qtr.groupby('TICKER')['EPS_true'].shift(1)
EPS_true_qtr['ANNDATS_l1'] = EPS_true_qtr.groupby('TICKER')['ANNDATS'].shift(1)
EPS_true_ann['EPS_true_l1'] = EPS_true_ann.groupby('TICKER')['EPS_true'].shift(1)
EPS_true_ann['ANNDATS_l1'] = EPS_true_ann.groupby('TICKER')['ANNDATS'].shift(1)

In [6]:
# Load IBES consensus forecasts and merge with actuals
consensus = pd.read_parquet('./data/WRDS/EPS_summary.parquet')
consensus['YearMonth'] = consensus['STATPERS'] + MonthEnd(0)
consensus['EPS_ana'] = consensus['MEANEST']

In [7]:
# Split into quarterly and annual forecasts
consensus_quarter = consensus[consensus.FPI.isin(['6','7','8'])].copy()
consensus_annual = consensus[consensus.FPI.isin(['1','2'])].copy()

In [8]:
# Merge with actual EPS (quarterly)
consensus_quarter = consensus_quarter.merge(
    EPS_true_qtr[['TICKER','PENDS','EPS_true','ANNDATS','ANNDATS_l1','EPS_true_l1']], 
    left_on=['TICKER','FPEDATS'], 
    right_on=['TICKER','PENDS']
)
print("consensus_quarter after merge rows:", consensus_quarter.shape[0])

consensus_quarter after merge rows: 7035910


In [9]:
# Merge with actual EPS (annual)
consensus_annual = consensus_annual.merge(
    EPS_true_ann[['TICKER','PENDS','EPS_true','ANNDATS','ANNDATS_l1','EPS_true_l1']], 
    left_on=['TICKER','FPEDATS'], 
    right_on=['TICKER','PENDS']
)
print("consensus_annual after merge rows:", consensus_annual.shape[0])

consensus_annual after merge rows: 11758331


In [10]:
# Combine the two
consensus = pd.concat([consensus_quarter, consensus_annual], axis=0)
print("consensus combined rows:", consensus.shape[0])

consensus combined rows: 18794241


In [11]:
# Load and clean CRSP-IBES link table
iclink = pd.read_csv('./data/WRDS/iclink_WRDS.csv')
iclink.columns = ['ticker','permno','ncusip','sdate','edate','score']
iclink['sdate'] = pd.to_datetime(iclink['sdate'])
iclink['edate'] = pd.to_datetime(iclink['edate'])
iclink.dropna(subset=['permno'], inplace=True)
print("iclink rows:", iclink.shape[0])

iclink rows: 29751


In [12]:
# Process FPI = 1 (annual forecast)
a1 = consensus[consensus['FPI'] == '1'][['TICKER', 'STATPERS', 'FPEDATS', 'ANNDATS', 'CUSIP', 'EPS_ana',
                                         'EPS_true', 'EPS_true_l1', 'ANNDATS_l1']].drop_duplicates(subset=['TICKER', 'STATPERS']).copy()
print("a1 initial rows:", a1.shape[0])

a1 initial rows: 6261065


In [13]:
# Merge with CRSP-IBES link and filter by valid dates
a1 = a1.merge(iclink[['ticker', 'permno', 'sdate', 'edate', 'score']], 
              left_on='TICKER', right_on='ticker')
print("a1 after link merge rows:", a1.shape[0])

a1 after link merge rows: 3138358


In [14]:
a1 = a1[(a1['STATPERS'] >= a1['sdate']) & (a1['STATPERS'] <= a1['edate'])]
print("a1 after date filter rows:", a1.shape[0])

a1 after date filter rows: 1867422


In [15]:
# Adjust EPS using cfacshr at announcement date
a1['ANN_m'] = a1['ANNDATS'] + MonthEnd(0)
a1 = a1.merge(crsp[['permno', 'YearMonth', 'cfacshr']], 
              left_on=['permno', 'ANN_m'], right_on=['permno', 'YearMonth'])
print("a1 after EPS_true cfacshr merge rows:", a1.shape[0])

a1 after EPS_true cfacshr merge rows: 1491868


In [16]:
a1['EPS_true'] = a1['EPS_true'] / a1['cfacshr']
a1.drop(columns=['YearMonth', 'cfacshr'], inplace=True)

# Adjust lagged EPS
a1['ANN_m'] = a1['ANNDATS_l1'] + MonthEnd(0)
a1 = a1.merge(crsp[['permno', 'YearMonth', 'cfacshr']], 
              left_on=['permno', 'ANN_m'], right_on=['permno', 'YearMonth'],
              how='left')
print("a1 after EPS_true_l1 cfacshr merge rows:", a1.shape[0])

a1 after EPS_true_l1 cfacshr merge rows: 1491868


In [17]:
a1['cfacshr'] = pd.to_numeric(a1['cfacshr'], errors='coerce')
a1['EPS_true_l1'] = pd.to_numeric(a1['EPS_true_l1'], errors='coerce')
a1['EPS_true_l1'] = a1['EPS_true_l1'] / a1['cfacshr']
a1.drop(columns=['YearMonth', 'cfacshr', 'sdate', 'edate', 'ANN_m'], inplace=True)

# Adjust both EPS to current cfacshr at STATPERS date
a1['YearMonth'] = a1['STATPERS'] + MonthEnd(0)
a1 = a1.merge(crsp[['permno', 'YearMonth', 'cfacshr', 'ncusip']], on=['permno', 'YearMonth'])
print("a1 after final cfacshr merge rows:", a1.shape[0])

a1 after final cfacshr merge rows: 1490170


In [18]:
a1['EPS_true'] = a1['EPS_true'] * a1['cfacshr']
a1['EPS_true_l1'] = a1['EPS_true_l1'] * a1['cfacshr']

# Keep observations where CUSIP matches
a1 = a1[a1['CUSIP'] == a1['ncusip']].copy()
print("a1 after CUSIP match rows:", a1.shape[0])

a1.rename(columns={'EPS_true_l1': 'EPS_true_l1_y1', 
                   'EPS_true': 'EPS_true_y1', 
                   'EPS_ana': 'EPS_ana_y1', 
                   'ANNDATS': 'ANNDATS_y1', 
                   'ANNDATS_l1': 'ANNDATS_l1_y1'}, inplace=True)

a1.drop(columns=['cfacshr', 'ncusip'], inplace=True)

a1 after CUSIP match rows: 1486269


In [19]:
a1.head

<bound method NDFrame.head of         TICKER   STATPERS    FPEDATS ANNDATS_y1     CUSIP  EPS_ana_y1  \
0         0000 2014-04-17 2014-12-31 2015-01-30  87482X10        0.52   
1         0000 2014-05-15 2014-12-31 2015-01-30  87482X10        0.56   
2         0000 2014-06-19 2014-12-31 2015-01-30  87482X10        0.56   
3         0000 2014-07-17 2014-12-31 2015-01-30  87482X10        0.56   
4         0000 2014-08-14 2014-12-31 2015-01-30  87482X10        1.18   
...        ...        ...        ...        ...       ...         ...   
1490165   ZYNX 2022-10-20 2022-12-31 2023-03-13  98986M10        0.43   
1490166   ZYNX 2022-11-17 2022-12-31 2023-03-13  98986M10        0.43   
1490167   ZYNX 2022-12-15 2022-12-31 2023-03-13  98986M10        0.43   
1490168   ZYNX 2023-01-19 2022-12-31 2023-03-13  98986M10        0.43   
1490169   ZYNX 2023-02-16 2022-12-31 2023-03-13  98986M10        0.43   

         EPS_true_y1  EPS_true_l1_y1 ANNDATS_l1_y1 ticker   permno  score  \
0               

In [20]:
# Winsorize EPS forecast and actual values (monthly basis)
a1['EPS_ana'] = a1.groupby('YearMonth', group_keys=False)['EPS_ana']\
    .transform(lambda x: x.clip(x.quantile(0.01), x.quantile(0.99)))
a1['EPS_true'] = a1.groupby('YearMonth', group_keys=False)['EPS_true']\
    .transform(lambda x: x.clip(x.quantile(0.01), x.quantile(0.99)))

# Rename columns to indicate FPI = 1 forecast
a1.rename(columns={
    'EPS_true_l1': 'EPS_true_l1_y1',
    'EPS_true': 'EPS_true_y1',
    'EPS_ana': 'EPS_ana_y1',
    'ANNDATS': 'ANNDATS_y1',
    'ANNDATS_l1': 'ANNDATS_l1_y1'
}, inplace=True)

KeyError: 'Column not found: EPS_ana'

In [None]:
a1_cleaned = a1.drop(columns=['TICKER', 'STATPERS', 'CUSIP', 'score', 'ticker'])

In [None]:
ana_all = reduce(lambda x, y: pd.merge(x, y,
                                       on=['permno', 'YearMonth'],
                                       how='outer'),
                 [a1_cleaned])

df = ana_all.merge(crsp[['permno','YearMonth','siccd',
                         'ret', 'prc', 'bh1m', 'shrout', 'ME','prc_l1'
                        ]],
                   on=['permno','YearMonth'], 
                   )

# ------------------------------------------------------
# Load and prepare WRDS financial ratio data
# ------------------------------------------------------
ratios = pd.read_stata('./data/WRDS/financial_ratio.dta')

# Align the public date to the end of the month
ratios['public_date'] = ratios['public_date'] + MonthEnd(0)

# Ensure gvkey is float for consistency
ratios['gvkey'] = ratios['gvkey'].astype(float)

# ------------------------------------------------------
# Load and prepare Compustat data (used for SIC codes)
# ------------------------------------------------------
compa = pd.read_parquet('./data/WRDS/compa.parquet')
compa['gvkey'] = compa['gvkey'].astype(float)

# Merge SIC code from Compustat into financial ratios
ratios = ratios.merge(
    compa[['gvkey', 'datadate', 'sich']],
    left_on=['gvkey', 'adate'],
    right_on=['gvkey', 'datadate'],
    how='left'
)

# ------------------------------------------------------
# Merge financial ratios into main dataframe (df)
# ------------------------------------------------------
df = df.merge(
    ratios,
    left_on=['permno', 'YearMonth'],
    right_on=['permno', 'public_date'],
    how='left'
)

# ------------------------------------------------------
# Assign a SIC code using either Compustat (sich) or CRSP (siccd)
# ------------------------------------------------------
df['sic'] = np.where(df['sich'].isna(), df['siccd'], df['sich'])
df['sic'] = df['sic'].astype(int)

# ------------------------------------------------------
# Load Fama-French 49 industry classification
# ------------------------------------------------------
fama49 = pd.read_csv('./data/Other/Siccodes49.csv')

# Helper function: convert SIC ranges (start, end) to list of individual SICs
def zip_2_list(pairs):
    result = []
    for start, end in pairs:
        result.extend(range(start, end))
    return result

# Group SIC ranges by FF49 industry and convert to full SIC lists
fama49 = fama49.groupby('ff49').apply(lambda x: zip_2_list(zip(x.sic1, x.sic2 + 1)))

# Function to assign FF49 industry based on SIC
def fama_industry(sic, fama_dict):
    for ff49, sics in fama_dict.items():
        if sic in sics:
            return ff49
    return 49  # Default to "Other"

# ------------------------------------------------------
# Map each SIC in df to an FF49 industry
# ------------------------------------------------------
unique_sics = df['sic'].unique()
sic_to_ff49 = pd.DataFrame({'sic': unique_sics})
sic_to_ff49['fama49'] = sic_to_ff49['sic'].apply(lambda x: fama_industry(x, fama49))

# Merge FF49 codes back into the main dataframe
df = df.merge(sic_to_ff49, how='left', on='sic')

In [None]:
print("df pre-merge:", df.shape[0])

In [None]:
## Fill NA with Industry Median
## preprocess 
ratio_chars = ['CAPEI', 'bm',
       'evm', 'pe_exi', 'pe_inc', 'ps', 'pcf',
       'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe',
       'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',
       'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',
       'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt',
       'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at',
       'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct',
       'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',
       'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio',
       'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',
       'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc',
       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'PEG_trailing',
       'divyield']

## XX per share characteristics: IN Online Appendix A.2, BHL states that they "consider another twenty-six 
# fundamental values per share derived from these financial ratios"
# We recover these features from their persudo-data shared in RFS code & data
# See the data they shared: "/Earnings Forecasts/SampleFigure1.csv". Columns 'BU' to 'CR', totaling 24
# I add "sales_p" & "invcap_p" to make it 26
per_share_chars = ['dividend_p','BE_p','Liability_p','cur_liability_p','LT_debt_p',
                   'cash_p', 'total_asset_p', 'tot_debt_p', 'accrual_p', 'EBIT_p', 
                   'cur_asset_p', 'pbda_p', 'ocf_p', 'inventory_p', 'receivables_p',
                   'Cur_debt_p', 'interest_p', 'fcf_ocf_p', 'evm_p',
                   'sales_p', 'invcap_p', 'c_equity_p', 'rd_p', 'opmad_p', 'gpm_p','ptpm_p'
                  ]

df['dividend_p'] = df['divyield'] * df['prc']
df['BE_p'] = df['bm'] * df['prc'] # book-equity
df['Liability_p'] = df['de_ratio'] * df['BE_p'] # Total Debt
df['cur_liability_p'] = df['curr_debt'] * df['Liability_p']
df['LT_debt_p'] = df['lt_debt'] * df['Liability_p']
df['cash_p'] = df['cash_lt'] * df['Liability_p']
df['total_asset_p'] = df['Liability_p'] / df['debt_at']
df['tot_debt_p'] = df['debt_assets'] * df['total_asset_p']
df['accrual_p'] = df['accrual'] * df['total_asset_p']
df['EBIT_p'] = df['debt_ebitda'] / df['tot_debt_p']
df['cur_asset_p'] = df['curr_ratio']*df['cur_liability_p']
df['pbda_p'] = df['profit_lct'] * df['cur_liability_p'] # Operating Income before D&A
df['ocf_p'] = df['ocf_lct'] * df['cur_liability_p'] # Operating Cash Flow
df['inventory_p'] = df['invt_act'] * df['cur_asset_p']
df['receivables_p'] = df['rect_act'] * df['cur_asset_p']
df['Cur_debt_p'] = df['short_debt'] * df['total_asset_p'] # Short-term Debt
df['interest_p'] = df['int_totdebt'] * df['tot_debt_p']
df['fcf_ocf_p'] = df['fcf_ocf'] * df['ocf_p'] # Free Cash Flow
df['evm_p'] = df['evm'] * df['EBIT_p'] # Multiple of Enterprise Value

## ADD by YANDI ##
df['sales_p'] = df['sale_equity'] * df['BE_p'] # Sales
df['invcap_p'] = df['debt_invcap'] / df['LT_debt_p'] # Invested Capital

## Recover theirs
df['c_equity_p'] = df['equity_invcap'] * df['invcap_p'] # Common Equity
df['rd_p'] = df['rd_sale'] * df['sales_p'] # R&D
df['opmad_p'] = df['opmad'] * df['sales_p'] # Operating Income After Depreciation
df['gpm_p'] = df['gpm']  * df['sales_p'] # Gross Profit
df['ptpm_p'] = df['ptpm']  * df['sales_p'] # Pretax Income

df.replace([-np.inf, np.inf], np.nan, inplace=True)

In [None]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# Your existing code here
for v in tqdm(ratio_chars+per_share_chars):
    df[v] = df.groupby(['YearMonth','fama49'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))

for v in tqdm(ratio_chars+per_share_chars):
    df[v] = df.groupby(['YearMonth'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))

In [None]:
### Macro Data
RGDP = pd.read_excel('./data/Macro/RGDP.xlsx').set_index('DATE')
RGDP = RGDP.apply(lambda x: np.log(x.dropna()).diff().iloc[-1], axis=0)
RGDP.index = pd.date_range(start='1965-11', end='2024-04', freq='M')

RCON = pd.read_excel('./data/Macro/RCON.xlsx').set_index('DATE')
RCON = RCON.apply(lambda x: np.log(x.dropna()).diff().iloc[-1], axis=0)
RCON.index = pd.date_range(start='1965-11', end='2024-04', freq='M')

INDPROD = pd.read_excel('./data/Macro/INDPROD.xlsx').set_index('DATE')
INDPROD = INDPROD.apply(lambda x: np.log(x.dropna()).diff().iloc[-1], axis=0)
INDPROD.index = pd.date_range(start='1962-11', end='2024-03', freq='M')

UNEMP = pd.read_excel('./data/Macro/UNEMP.xlsx').set_index('DATE')
UNEMP = UNEMP['RUC24Q1'].dropna()
UNEMP.index = pd.date_range(start='1948-01', end='2024-02', freq='M')
## LAG one month, we can only observe last month UNEMP
UNEMP = UNEMP.shift(1)

macro = pd.DataFrame({'RGDP':RGDP,'RCON':RCON,'INDPROD':INDPROD,'UNEMP':UNEMP})

In [None]:
df = df.merge(macro, left_on='YearMonth', right_index=True)
print("df macro-merge:", df.shape[0])

In [None]:
column_names = df.columns.tolist()
print(column_names)

In [None]:
ratio_chars = ['CAPEI', 'bm',
       'evm', 'pe_exi', 'pe_inc', 'ps', 'pcf',
       'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe',
       'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',
       'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',
       'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt',
       'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at',
       'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct',
       'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',
       'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio',
       'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',
       'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc',
       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'PEG_trailing',
       'divyield']

per_share_chars = ['dividend_p','BE_p','Liability_p','cur_liability_p','LT_debt_p',
                  'cash_p', 'total_asset_p', 'tot_debt_p', 'accrual_p', 'EBIT_p', 
                   'cur_asset_p', 'pbda_p', 'ocf_p', 'inventory_p', 'receivables_p',
                   'Cur_debt_p', 'interest_p', 'fcf_ocf_p', 'evm_p',
                   'sales_p', 'invcap_p', 'c_equity_p', 'rd_p', 'opmad_p', 'gpm_p','ptpm_p'
                  ]

macro_chars = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']

fundamental_chars = ['ret', 'prc',
                    'EPS_true_l1_q1','EPS_true_l1_q2','EPS_true_l1_q3',
                    'EPS_true_l1_y1','EPS_true_l1_y2',
                    ]

analyst_chars = ['EPS_ana_q1','EPS_ana_q2','EPS_ana_q3','EPS_ana_y1','EPS_ana_y2']

targets = ['EPS_true_q1', 'EPS_true_q2', 'EPS_true_q3', 'EPS_true_y1', 'EPS_true_y2']

In [None]:
### Lag one month information ###
### Except for analyst forecasts
df.sort_values(by=['permno', 'YearMonth'], inplace=True)
# Filter vars_lag to only those columns that exist in df
vars_lag = [v for v in ratio_chars + per_share_chars + macro_chars + fundamental_chars if v in df.columns]

# Then apply the shift
df[vars_lag] = df.groupby('permno')[vars_lag].shift(1)

In [None]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# ## FillNA with Industry Median
fillNA = ratio_chars + per_share_chars + fundamental_chars
for v in tqdm(fillNA):
    df[v] = df.groupby(['YearMonth','fama49'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))
## In case some characteristics are all NA in some industry
for v in tqdm(fillNA + macro_chars):
    df[v] = df.groupby(['YearMonth'], group_keys=False)[v].apply(lambda x: x.fillna(x.median()))

In [None]:
df_tmp = df[(df['YearMonth'] >= '1984-01-01') & (df['YearMonth'] <= '2019-12-31')].reset_index(drop=True).copy()

In [None]:
# winsorization period-by-period
cols = ratio_chars + per_share_chars + fundamental_chars
df_tmp[cols] = df_tmp.groupby('YearMonth',group_keys=False)[cols]\
                             .transform(lambda x: x.clip(x.quantile(0.01),x.quantile(0.99)))

In [None]:
df_tmp.to_parquet('./data/Results/df_train_new.parquet')

In [None]:
df_tmp.to_cvs('./data/Results/df_train.csv')

In [3]:
import pandas as pd

# Load the dataset
df = pd.read_stata('./data/Results/df_train_a1_U.dta')

# Convert date columns to datetime
df['fpedats'] = pd.to_datetime(df['fpedats'])
df['statpers'] = pd.to_datetime(df['statpers'])

# Keep only statpers strictly after fpedats and within 45 days
df_filtered = df[(df['statpers'] > df['fpedats']) & 
                 (df['statpers'] <= df['fpedats'] + pd.Timedelta(days=45))]

# Sort by statpers to identify the first one after fpedats
df_first = df_filtered.sort_values(['permno', 'fpedats', 'statpers']).groupby(
    ['permno', 'fpedats'], as_index=False).first()

# Print number of rows
print(f"Number of rows: {len(df_first)}")

# Save the result
df_first.to_stata('./data/Results/df_train_a1_U_45d.dta', write_index=False)

Number of rows: 110212


In [4]:
# Check for duplicates based on (permno, fpedats)
duplicates = df_first[df_first.duplicated(subset=['permno', 'fpedats'], keep=False)]

# Print duplicates
print("Duplicates based on (permno, fpedats):")
print(duplicates)

Duplicates based on (permno, fpedats):
Empty DataFrame
Columns: [permno, fpedats, v1, statpers, eps_ana_y1, eps_true_y1, eps_true_l1_y1, anndats_y1, anndats_l1_y1, yearmonth, siccd, ret, prc, bh1m, shrout, me, prc_l1, gvkey, adate, qdate, public_date, capei, bm, evm, pe_op_basic, pe_op_dil, pe_exi, pe_inc, ps, pcf, dpr, npm, opmbd, opmad, gpm, ptpm, cfm, roa, roe, roce, efftax, aftret_eq, aftret_invcapx, aftret_equity, pretret_noa, pretret_earnat, gprof, equity_invcap, debt_invcap, totdebt_invcap, capital_ratio, int_debt, int_totdebt, cash_lt, invt_act, rect_act, debt_at, debt_ebitda, short_debt, curr_debt, lt_debt, profit_lct, ocf_lct, cash_debt, fcf_ocf, lt_ppent, dltt_be, debt_assets, debt_capital, de_ratio, intcov, intcov_ratio, cash_ratio, quick_ratio, curr_ratio, cash_conversion, inv_turn, at_turn, rect_turn, pay_turn, sale_invcap, sale_equity, sale_nwc, rd_sale, adv_sale, staff_sale, accrual, ptb, peg_trailing, divyield, peg_1yrforward, peg_ltgforward, ticker, cusip, datadate, s

In [5]:
print(f"Number of duplicate rows: {len(duplicates)}")

Number of duplicate rows: 0


# 45 Days

In [12]:
import pandas as pd

# Load the dataset
df = pd.read_stata('./data/Results/df_train_a1_U.dta')

# Convert datetime columns and strip time to keep only date
df['anndats_l1_y1'] = pd.to_datetime(df['anndats_l1_y1']).dt.normalize()
df['statpers'] = pd.to_datetime(df['statpers']).dt.normalize()
df['fpedats'] = pd.to_datetime(df['fpedats']).dt.normalize()

df_filtered = df[
    (df['statpers'] > df['anndats_l1_y1']) &
    (df['statpers'] <= df['anndats_l1_y1'] + pd.Timedelta(days=45))
].copy()

# Now safely add the new column
df_filtered['diff_days'] = (df_filtered['statpers'] - df_filtered['anndats_l1_y1']).dt.days

# Keep the first statpers within that window for each (permno, anndats_l1_y1)
df_first = df_filtered.sort_values(['permno', 'anndats_l1_y1', 'statpers']).groupby(
    ['permno', 'anndats_l1_y1'], as_index=False).first()

# Print number of rows
print(f"Number of rows: {len(df_first)}")

# Save the output
df_first.to_stata('./data/Results/df_train_a1_U_45d_after_anndats_y1_l1.dta', write_index=False)

Number of rows: 111779


In [3]:
print(df.columns)

Index(['v1', 'statpers', 'fpedats', 'eps_ana_y1', 'eps_true_y1',
       'eps_true_l1_y1', 'anndats_y1', 'anndats_l1_y1', 'permno', 'yearmonth',
       ...
       'invcap_p', 'c_equity_p', 'rd_p', 'opmad_p', 'gpm_p', 'ptpm_p', 'rgdp',
       'rcon', 'indprod', 'unemp'],
      dtype='object', length=128)


In [14]:
print(df_filtered['diff_days'].describe())
print(df_filtered['diff_days'].value_counts().sort_index())

count    164944.000000
mean         23.476574
std          12.880020
min           1.000000
25%          14.000000
50%          23.000000
75%          35.000000
max          45.000000
Name: diff_days, dtype: float64
diff_days
1     5450
2     5282
3     2828
4       23
5      227
6     2494
7     5969
8     5400
9     5145
10    3168
11       8
12     196
13    2487
14    5957
15    5830
16    6275
17    4334
18      15
19     249
20    3548
21    8899
22    7532
23    7087
24    3437
25      58
26     744
27    2872
28    6972
29    6257
30    5863
31    2960
32      59
33     464
34    2619
35    6277
36    5648
37    5300
38    3275
39       2
40     185
41    2563
42    5929
43    5567
44    5553
45    3937
Name: count, dtype: int64
