In [1]:
import pandas as pd
import re
from datetime import datetime, timedelta
import wrds

In [2]:
# === Connect to WRDS ===
conn = wrds.Connection()

Loading library list...
Done


In [16]:
# === Load TF-IDF output ===
df = pd.read_csv('tfidf_output.csv')
len(df)

9966

In [19]:
# === Extract CIK and Filing Date from filename ===
def extract_metadata(fname):
    match = re.search(r'(\d{8})_10-(K|Q)_edgar_data_(\d+)_', fname)
    if match:
        fdate = datetime.strptime(match.group(1), '%Y%m%d')
        cik = int(match.group(3))
        return fdate, cik
    return None, None

df['filing_date'], df['cik'] = zip(*df['file'].map(extract_metadata))
df.sample(5)

Unnamed: 0,file,avg_tfidf_finneg,avg_tfidf_h4n,doc_length,filing_date,cik
5567,20221027_10-Q_edgar_data_105770_0000105770-22-...,4.683584e-08,8.588503e-08,1409725,2022-10-27,105770
5284,20220804_10-Q_edgar_data_1326160_0001326160-22...,8.523765e-08,8.056598e-08,5728233,2022-08-04,1326160
9408,20240815_10-K_edgar_data_1116132_0001116132-24...,2.109844e-07,1.451725e-07,2478467,2024-08-15,1116132
9839,20241107_10-Q_edgar_data_1539838_0001539838-24...,1.116344e-07,4.968792e-07,1880723,2024-11-07,1539838
7700,20231101_10-Q_edgar_data_827052_0000827052-23-...,2.694136e-07,1.625819e-07,3175810,2023-11-01,827052


In [24]:
# === Step 1: Map CIK → GVKEY → PERMNO ===
# Load WRDS CIK → GVKEY
cik_gvkey = conn.raw_sql("""
    SELECT cik, conml, gvkey
    FROM comp.company
    WHERE cik IS NOT NULL
""")
# Drop missing, convert CIK to int
cik_gvkey = cik_gvkey.dropna().drop_duplicates()
cik_gvkey['cik'] = cik_gvkey['cik'].astype(int)

cik_gvkey.head(5)

Unnamed: 0,cik,conml,gvkey
0,723576,A & M Food Services Inc,1001
1,1306124,AAI Corp,1002
2,730052,A.A. Importing Co Inc,1003
3,1750,AAR Corp,1004
4,1882,ABKCO Industries Inc,1007


In [25]:
# Merge GVKEY into our TF-IDF dataframe
df_gvkey = pd.merge(df, cik_gvkey, on='cik', how='left')
df_gvkey.head(5)

Unnamed: 0,file,avg_tfidf_finneg,avg_tfidf_h4n,doc_length,filing_date,cik,conml,gvkey
0,20200102_10-Q_edgar_data_23217_0001564590-20-0...,9.409495e-08,7.404681e-08,3592889,2020-01-02,23217,Conagra Brands Inc,3362
1,20200102_10-Q_edgar_data_940944_0000940944-20-...,8.685097e-08,8.00328e-08,1508779,2020-01-02,940944,Darden Restaurants Inc,31846
2,20200103_10-Q_edgar_data_1679273_0001558370-20...,3.806863e-08,5.446093e-08,1437953,2020-01-03,1679273,Lamb Weston Holdings Inc,28790
3,20200107_10-Q_edgar_data_1170010_0001170010-20...,9.956749e-08,6.202268e-08,1497866,2020-01-07,1170010,CarMax Inc,64410
4,20200107_10-Q_edgar_data_320187_0000320187-20-...,3.130926e-08,3.793531e-08,1793056,2020-01-07,320187,NIKE Inc,7906


In [31]:
# === Step 2: Get PERMNO for each GVKEY ===
gvkey_permno = conn.raw_sql("""
    SELECT gvkey, lpermno as permno, linkdt, linkenddt
    FROM crsp.ccmxpf_linktable
    WHERE lpermno IS NOT NULL
""", date_cols=['linkdt', 'linkenddt'])
# if linkenddt is missing then set to today date
gvkey_permno['linkenddt']=gvkey_permno['linkenddt'].fillna(pd.to_datetime('today'))
gvkey_permno.head(5)

Unnamed: 0,gvkey,permno,linkdt,linkenddt
0,1000,25881.0,1970-11-13,1978-06-30 00:00:00.000000
1,1001,10015.0,1983-09-20,1986-07-31 00:00:00.000000
2,1002,10023.0,1972-12-14,1973-06-05 00:00:00.000000
3,1003,10031.0,1983-12-07,1989-08-16 00:00:00.000000
4,1004,54594.0,1972-04-24,2025-04-11 23:28:49.535230


In [33]:
# First just link by matching PERMNO
df_permno = pd.merge(df_gvkey, gvkey_permno, how='left', on=['gvkey'])

# Then set link date bounds
df_permno = df_permno.loc[(df_permno['filing_date']>=df_permno['linkdt'])\
                        &(df_permno['filing_date']<=df_permno['linkenddt'])]
df_permno.head(5)

Unnamed: 0,file,avg_tfidf_finneg,avg_tfidf_h4n,doc_length,filing_date,cik,conml,gvkey,permno,linkdt,linkenddt
0,20200102_10-Q_edgar_data_23217_0001564590-20-0...,9.409495e-08,7.404681e-08,3592889,2020-01-02,23217,Conagra Brands Inc,3362,56274.0,1972-12-14,2025-04-11 23:28:49.535230
1,20200102_10-Q_edgar_data_940944_0000940944-20-...,8.685097e-08,8.00328e-08,1508779,2020-01-02,940944,Darden Restaurants Inc,31846,81655.0,1995-05-30,2025-04-11 23:28:49.535230
2,20200103_10-Q_edgar_data_1679273_0001558370-20...,3.806863e-08,5.446093e-08,1437953,2020-01-03,1679273,Lamb Weston Holdings Inc,28790,16431.0,2016-11-10,2025-04-11 23:28:49.535230
4,20200107_10-Q_edgar_data_1170010_0001170010-20...,9.956749e-08,6.202268e-08,1497866,2020-01-07,1170010,CarMax Inc,64410,89508.0,2002-10-01,2025-04-11 23:28:49.535230
5,20200107_10-Q_edgar_data_320187_0000320187-20-...,3.130926e-08,3.793531e-08,1793056,2020-01-07,320187,NIKE Inc,7906,57665.0,1980-12-02,2025-04-11 23:28:49.535230


In [34]:
len(df_permno)

10367

In [35]:
# Filter to active linkages at filing date
def match_permno(row):
    matches = gvkey_permno[(gvkey_permno['gvkey'] == row['gvkey']) &
                           (row['filing_date'] >= gvkey_permno['linkdt']) &
                           (row['filing_date'] <= gvkey_permno['linkenddt'])]
    return matches['permno'].iloc[0] if not matches.empty else None

df_permno = df_gvkey.copy(deep=True)
df_permno['permno'] = df_permno.apply(match_permno, axis=1)
df_permno.head(5)

Unnamed: 0,file,avg_tfidf_finneg,avg_tfidf_h4n,doc_length,filing_date,cik,conml,gvkey,permno
0,20200102_10-Q_edgar_data_23217_0001564590-20-0...,9.409495e-08,7.404681e-08,3592889,2020-01-02,23217,Conagra Brands Inc,3362,56274.0
1,20200102_10-Q_edgar_data_940944_0000940944-20-...,8.685097e-08,8.00328e-08,1508779,2020-01-02,940944,Darden Restaurants Inc,31846,81655.0
2,20200103_10-Q_edgar_data_1679273_0001558370-20...,3.806863e-08,5.446093e-08,1437953,2020-01-03,1679273,Lamb Weston Holdings Inc,28790,16431.0
3,20200107_10-Q_edgar_data_1170010_0001170010-20...,9.956749e-08,6.202268e-08,1497866,2020-01-07,1170010,CarMax Inc,64410,89508.0
4,20200107_10-Q_edgar_data_320187_0000320187-20-...,3.130926e-08,3.793531e-08,1793056,2020-01-07,320187,NIKE Inc,7906,57665.0


In [36]:
len(df_permno)

9966

In [37]:
# === Step 3: Get 3-day stock returns ===
permnos = tuple(df_permno['permno'].dropna().unique())
start_date = df_permno['filing_date'].min() - timedelta(days=5)
end_date = df_permno['filing_date'].max() + timedelta(days=5)

stock_returns = conn.raw_sql(f"""
    SELECT permno, date, ret
    FROM crsp.dsf
    WHERE date BETWEEN '{start_date}' AND '{end_date}'
    AND permno IN {permnos}
""", date_cols=['date'])

stock_returns.head(5)

Unnamed: 0,permno,date,ret
0,10104,2019-12-30,-0.014216
1,10107,2019-12-30,-0.008619
2,10138,2019-12-30,-0.008223
3,10145,2019-12-30,-0.00034
4,10516,2019-12-30,-0.003888


In [38]:
# === Step 4: Get market returns ===
market_returns = conn.raw_sql(f"""
    SELECT date, vwretd
    FROM crsp.dsi
    WHERE date BETWEEN '{start_date}' AND '{end_date}'
""", date_cols=['date'])
market_returns.head(5)

Unnamed: 0,date,vwretd
0,2019-12-30,-0.005105
1,2019-12-31,0.002964
2,2020-01-02,0.007412
3,2020-01-03,-0.005828
4,2020-01-06,0.003269


In [39]:
# === Step 5: Compute 3-day excess return ===
def calc_excess_return(row):
    filing = row['filing_date']
    pid = row['permno']
    stock_sub = stock_returns[(stock_returns['permno'] == pid) &
                              (stock_returns['date'] >= filing) &
                              (stock_returns['date'] <= filing + timedelta(days=6))]
    market_sub = market_returns[(market_returns['date'] >= filing) &
                                (market_returns['date'] <= filing + timedelta(days=6))]

    if len(stock_sub) < 3 or len(market_sub) < 3:
        return None

    # Buy-and-hold return (1+r1)(1+r2)(1+r3) - 1
    sr = (1 + stock_sub['ret'].fillna(0)).prod() - 1
    mr = (1 + market_sub['vwretd'].fillna(0)).prod() - 1
    return sr - mr

df_return = df_permno.copy(deep=True)
df_return['excess_return_3d'] = df_return.apply(calc_excess_return, axis=1)
df_return.head(5)

Unnamed: 0,file,avg_tfidf_finneg,avg_tfidf_h4n,doc_length,filing_date,cik,conml,gvkey,permno,excess_return_3d
0,20200102_10-Q_edgar_data_23217_0001564590-20-0...,9.409495e-08,7.404681e-08,3592889,2020-01-02,23217,Conagra Brands Inc,3362,56274.0,-0.075913
1,20200102_10-Q_edgar_data_940944_0000940944-20-...,8.685097e-08,8.00328e-08,1508779,2020-01-02,940944,Darden Restaurants Inc,31846,81655.0,0.037704
2,20200103_10-Q_edgar_data_1679273_0001558370-20...,3.806863e-08,5.446093e-08,1437953,2020-01-03,1679273,Lamb Weston Holdings Inc,28790,16431.0,0.074559
3,20200107_10-Q_edgar_data_1170010_0001170010-20...,9.956749e-08,6.202268e-08,1497866,2020-01-07,1170010,CarMax Inc,64410,89508.0,0.033491
4,20200107_10-Q_edgar_data_320187_0000320187-20-...,3.130926e-08,3.793531e-08,1793056,2020-01-07,320187,NIKE Inc,7906,57665.0,-0.008988


In [40]:
# === Save merged file ===
df_return.to_csv('tfidf_with_excess_returns.csv', index=False)