In [1]:
import os, re
import numpy as np
import pandas as pd

In [2]:
# Step up one directory so the '3_pipeline' and '1_data' folders used by the
# cells below resolve against os.getcwd() (presumably the project root -- confirm).
# The guard makes the cell idempotent: an unconditional chdir('..') climbs one
# more level every time the cell is re-run, breaking all later paths.
if not os.path.isdir(os.path.join(os.getcwd(), '3_pipeline')):
    os.chdir('..')

---

In [3]:
# Read the tab-separated Compustat extract into the working frame `df`.
compustat_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'compustat.tsv')
df = pd.read_csv(compustat_path, sep='\t')

add historic header variables

In [4]:
# Read the prepared historical-header file (tab-separated).
comphist_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'comphist_prepared.tsv')
comphist = pd.read_csv(comphist_path, sep='\t')

In [5]:
# Left join keeps every row of df and attaches the historical header columns.
# NOTE(review): rows are duplicated if comphist repeats a (gvkey, fyear) pair -- confirm uniqueness.
df = pd.merge(df, comphist, how='left', on=['gvkey', 'fyear'])

add header variables, fill missing values in the historic variables with the corresponding header values, then drop the header variables

In [6]:
# Read the Compustat header file and discard its 'loc' column right away.
comp_header_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'compustat_header.tsv')
comp_header = pd.read_csv(comp_header_path, sep='\t').drop(columns='loc')

In [7]:
# Attach the header columns to every firm-year via gvkey (left join keeps all df rows).
# NOTE(review): assumes one header row per gvkey; repeated gvkeys would duplicate df rows.
df = pd.merge(df, comp_header, how='left', on='gvkey')

In [8]:
# For each header column, fill gaps in its 'h'-prefixed historical counterpart
# with the header value from the same row.
header_vars = ('fic', 'sic', 'cik')
for var in header_vars:
    hist_col = f'h{var}'
    df[hist_col] = df[hist_col].fillna(df[var])

In [9]:
# The header columns are no longer needed once the historical ones are filled.
df = df.drop(['fic', 'sic', 'cik'], axis='columns')

add CRSP PERMNO (based on CCM)

In [10]:
# Read the Compustat-CRSP link table (tab-separated) from the raw data folder.
ccm_path = os.path.join(os.getcwd(), '1_data', 'compustat_crsp_link.tsv')
ccm = pd.read_csv(ccm_path, sep='\t')

In [11]:
# Left join the link table on firm-year; unmatched rows keep NaN in the link columns.
# NOTE(review): several links for one (gvkey, fyear) would duplicate df rows -- confirm.
df = pd.merge(df, ccm, how='left', on=['gvkey', 'fyear'])

In [12]:
# Use the plain 'permno' name so the column can serve as a key in the IBES merge below.
df = df.rename({'lpermno': 'permno'}, axis='columns')

add IBES ticker (based on CRSP-IBES link)

In [13]:
# Read the final CRSP-IBES link file (tab-separated).
ibes_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'crsp_ibes_link_final.tsv')
ibes = pd.read_csv(ibes_path, sep='\t')

In [14]:
# Give the ticker column a source-specific name before it is merged into df.
ibes = ibes.rename({'ticker': 'ibes_ticker'}, axis='columns')

In [15]:
# Left join the IBES link on (permno, fyear); rows without a match get NaN.
# NOTE(review): assumes (permno, fyear) is unique in ibes -- confirm.
df = pd.merge(df, ibes, how='left', on=['permno', 'fyear'])

load VA data and merge it into the Compustat records, using cik and fyear as the merge keys

In [16]:
# Read the VA info file (tab-separated).
va_info_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'va_data_info.tsv')
va_info = pd.read_csv(va_info_path, sep='\t')

In [17]:
# Keep one row per (cik, fyear): the first occurrence in file order is retained.
va_info = va_info.drop_duplicates(['cik', 'fyear'], keep='first')

In [18]:
# Align the key name with df ('hcik') and relabel 'sic' as 'sic_from_sec'.
va_info = va_info.rename({'cik': 'hcik', 'sic': 'sic_from_sec'}, axis='columns')

In [19]:
# Left join the VA info on (hcik, fyear); rows without a match get NaN.
df = pd.merge(df, va_info, how='left', on=['hcik', 'fyear'])

add valuation allowance data

In [20]:
# Read the cleaned valuation-allowance file (tab-separated).
va_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'va_data_clean.tsv')
va = pd.read_csv(va_path, sep='\t')

In [21]:
# Bring in only the needed VA columns, joined on 'adsh'.
# NOTE(review): pandas matches NaN keys to each other, so confirm va has no
# missing or repeated 'adsh' values, otherwise df rows multiply.
va_cols = ['adsh', 'tag', 'va', 'zero_inferred']
df = pd.merge(df, va[va_cols], how='left', on='adsh')

add DTA (gross)

In [22]:
# Read the gross DTA file (tab-separated).
dta_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'dta_gross.tsv')
dta = pd.read_csv(dta_path, sep='\t')

In [23]:
# Left join all dta columns on 'adsh'.
# NOTE(review): column names shared with df (other than 'adsh') would get _x/_y suffixes -- confirm none overlap.
df = pd.merge(df, dta, how='left', on='adsh')

add imputed TLCFs

In [24]:
# Read the imputation upload file (tab-separated) from the raw data folder.
imp_path = os.path.join(os.getcwd(), '1_data', 'imputation_upload_2024.txt')
imp = pd.read_csv(imp_path, sep='\t')

In [25]:
# Left join the imputation file on firm-year.
df = pd.merge(df, imp, how='left', on=['gvkey', 'fyear'])

add additional info from imputation paper

In [26]:
# Read the supplementary imputation file (tab-separated) from the raw data folder.
imp_add_path = os.path.join(os.getcwd(), '1_data', 'imputation_upload_2024_specifics_footnote_paper.txt')
imp_add = pd.read_csv(imp_add_path, sep='\t')

In [27]:
# Only the two TLCF columns (plus the join keys) are taken from the supplementary file.
imp_add_cols = ['gvkey', 'fyear', 'tlcf_pre_2018', 'tlcf_post_2018']
df = pd.merge(df, imp_add[imp_add_cols], how='left', on=['gvkey', 'fyear'])

save to disk

In [28]:
# Write the merged frame as a tab-separated file, without the row index.
sample_raw_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'sample_raw.tsv')
df.to_csv(sample_raw_path, sep='\t', index=False)