In [1]:
import os, re
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

In [2]:
# Step up one directory so that the working directory is the one containing
# '3_pipeline', which the read/write paths in this notebook are built from.
# The guard makes the cell idempotent: a bare os.chdir('..') climbs one level
# higher every time the cell is re-run, which silently breaks those paths.
if not os.path.isdir('3_pipeline'):
    os.chdir('..')

---

In [3]:
# Load the raw sample (tab-separated) from the intermediate pipeline folder.
raw_sample_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'sample_raw.tsv')
df = pd.read_csv(raw_sample_path, sep='\t')

In [1444]:
# Order rows by gvkey, then fyear, so the group-wise shift(1) calls further
# down read the previous row of the same gvkey.
sort_keys = ['gvkey', 'fyear']
df = df.sort_values(sort_keys)

Replace missing values with zero

In [1445]:
# Treat a missing value in these items as zero.
zero_fill_cols = ['spi', 'xrd', 'glp', 'sppiv']
df[zero_fill_cols] = df[zero_fill_cols].fillna(0)

In [1446]:
# Dhaliwal state: "We define a loss firm as a firm that reports negative
# income before extraordinary items (Compustat data item IB) and does not
# report positive U.S. pretax income (Compustat data item PIDOM)".
#
# The wording "does not report positive" suggests the condition is not simply
# PIDOM < 0, and it leaves unclear whether a missing PIDOM is replaced by zero.
# It is replaced by zero here, because many observations are lost otherwise.
pidom_missing = df['pidom'].isna()
df['pidom'] = df['pidom'].mask(pidom_missing, 0)

create variables that need full Compustat series

In [1447]:
# Flag rows whose previous row (same gvkey) is exactly one fiscal year
# earlier; the flag is 0 for the first row of a gvkey and after any gap.
df['L1_fyear'] = df.groupby('gvkey')['fyear'].shift(1)
year_gap = df['fyear'] - df['L1_fyear']
df['is_year_cons'] = year_gap.eq(1).astype(int)

In [1448]:
# One-row lags of the listed items within each gvkey, kept only when the
# previous row is the immediately preceding fiscal year (NaN otherwise).
lag_items = ['at', 'dvc', 'va', 'ib', 'dta_gross', 'sale']
has_prior_year = df['is_year_cons'] == 1
for item in lag_items:
    prior_value = df.groupby('gvkey')[item].shift(1)
    df[f'L1_{item}'] = prior_value.where(has_prior_year)

In [1449]:
# p. 139: "we scale accounting variables by beginning total assets and
# winsorize small values of beginning total assets to $10 million in order to
# mitigate scaling issues associated with extremely small firms".
# Lagged total assets below 10 are raised to 10; missing values stay missing.
df['L1_at'] = df['L1_at'].clip(lower=10)

In [1450]:
# Earnings: ib scaled by lagged total assets. Lagged earnings: the previous
# row's value within the gvkey, blanked when that row is not the prior year.
df['earnings'] = df['ib'].div(df['L1_at'])
prior_earnings = df.groupby('gvkey')['earnings'].shift(1)
df['L1_earnings'] = prior_earnings.mask(df['is_year_cons'] == 0)

In [1451]:
# Age: fiscal years elapsed since the earliest fyear observed for the gvkey.
first_fyear = df.groupby('gvkey')['fyear'].transform('min')
df['age'] = df['fyear'].sub(first_fyear)

In [1452]:
# # loss sequence (for LOSSEQ variable)
# for i, r in df.iterrows()

sample selection

In [1453]:
# Keep fiscal years 2012 through 2023 (both inclusive).
sample_years = list(range(2012, 2023 + 1))
in_sample_period = df['fyear'].isin(sample_years)
df = df.loc[in_sample_period]

In [1454]:
# Keep observations whose hfic code is 'USA'.
is_usa = df['hfic'].eq('USA')
df = df.loc[is_usa]

In [1455]:
# Two-digit SIC group: the first two digits of hsic after zero-padding the
# code to four digits.
sic_code_4 = df['hsic'].astype(int).astype(str).str.zfill(4)
df['sic_group'] = sic_code_4.str.slice(0, 2).astype(int)

In [1456]:
# Drop the excluded two-digit SIC groups: 49 and 60 through 69.
sic_exclude = [49, *range(60, 70)]
in_excluded_sic = df['sic_group'].isin(sic_exclude)
df = df.loc[~in_excluded_sic]

In [1457]:
# Keep observations with non-negative ceq; rows with a missing ceq fail the
# comparison and are dropped as well.
df = df.loc[df['ceq'].ge(0)]

In [1458]:
# Keep only rows where every variable needed for the initial analysis is
# present.
required_cols = ['txdfed', 'txfed', 'va', 'L1_va', 'dta_gross']
df = df.dropna(subset=required_cols)

In [1459]:
# dropping the variables below is for the full Dhaliwal analysis;
# for now, I only drop if the variables necessary to compare the
# Dhaliwal classification with actual VA data is missing

# df = df.loc[df[['txfed', 'txdfed', 'ib', 'pidom', 'oancf', 'xidoc',
#                 'spi', 'nopi', 'glp', 'L1_at', 'sppiv', 'sale',
#                 'xrd', 'prcc_f', 'csho', 'dvc', 'L1_dvc']].notna().all(axis = 1)]

In [1460]:
# Loss indicator: 1 when ib is negative and pidom is not positive, else 0.
is_loss = df['ib'].lt(0) & df['pidom'].le(0)
df['loss'] = is_loss.astype(int)

variables

In [1461]:
# bn: 1 when txdfed is zero or positive, else 0.
df['bn'] = df['txdfed'].ge(0).astype(int)

In [1462]:
# gn_va: 1 when txdfed is negative and txfed is zero or negative, else 0.
negative_deferred = df['txdfed'].lt(0)
df['gn_va'] = (df['txfed'].le(0) & negative_deferred).astype(int)

In [1463]:
# gn_ti: 1 when txdfed is negative and txfed is positive, else 0.
negative_deferred = df['txdfed'].lt(0)
df['gn_ti'] = (df['txfed'].gt(0) & negative_deferred).astype(int)

In [1464]:
# Three-way category: 0 when bn == 1, 1 when gn_va == 1, 2 otherwise.
# np.select takes the first matching condition, like the nested np.where.
df['taxcatg'] = np.select([df['bn'] == 1, df['gn_va'] == 1], [0, 1], default=2)

In [1492]:
# Sanity check: by construction every gn_ti == 1 row should have taxcatg == 2.
is_gn_ti = df['gn_ti'].eq(1)
df.loc[is_gn_ti, 'taxcatg'].value_counts()

taxcatg
2    3883
Name: count, dtype: int64

In [1465]:
# df['current_tax'] = np.where(df['txfed'] == 0, 'zero', np.where(df['txfed'] > 0, 'positive', 'negative'))

In [1466]:
# df['deferred_tax'] = np.where(df['txdfed'] == 0, 'zero', np.where(df['txdfed'] > 0, 'positive', 'negative'))

In [1467]:
# # rank-transform earnings and lagged earnings
# df['earnings'] = df['earnings'].rank()
# df['L1_earnings'] = df['L1_earnings'].rank()

In [1468]:
# # cashflow
# df['cashflow'] = (df['oancf'] - df['xidoc']) / df['L1_at']
# df['cashflow'] = np.where(df['cashflow'] >= df.groupby('fyear')['cashflow'].transform('median'),
#                           1, 0)

In [1469]:
# # d_earnings
# df = df.loc[df['L1_earnings'].notna()]
# df['diff_earnings'] = (df['earnings'] - df['L1_earnings']).abs()
# df['d_earnings'] = np.where(df['diff_earnings'] >= df.groupby('fyear')['diff_earnings'].transform('median'),
#                              1, 0)
# df = df.drop(columns = 'diff_earnings')

In [1470]:
# # negspiw
# df['negspiw'] = np.where(df['spi'] < 0, 1, 0)

In [1471]:
# # negnop
# df['negnop'] = np.where(df['nopi'] < 0, 1, 0)

In [1472]:
# # negglis
# df['negglis'] = np.where(df['glp'] < 0, 1, 0)

In [1473]:
# # negglcf
# df['negglcf'] = np.where(df['sppiv'] > 0, 1, 0)

In [1474]:
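# # salesgrowth
# # NOTE(review): no cell shown above creates df['salesgrowth']; it presumably
# # has to be derived first (e.g. from sale and L1_sale) before enabling this.
# df['salesgrowth'] = np.where(df['salesgrowth'] >= df.groupby('fyear')['salesgrowth'].transform('median'),
#                              1, 0)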

In [1475]:
# # age
# # check: is never == 1?
# df['age'] = np.where((df['age'] == 1) | (df['age'] == 2), 1, 0)

In [1476]:
# # R&D
# df['rd'] = np.where(df['xrd'] > 0, 1, 0)

In [1477]:
# # firstloss
# df['firstloss'] = np.where((df['ib'] < 0) & (df['L1_ib'] > 0), 1, 0)

In [1478]:
# losseq (loss sequence)
# to be added...

In [1479]:
# # bigloss
# df['bigloss'] = np.where((df['ib'] / df['L1_at']) < - 0.8, 1, 0)

In [1480]:
# # size
# df['mve'] = df['prcc_f'] * df['csho']
# df['size'] = np.log(df['mve'])
# df['size'] = np.where(df['size'] == -np.inf, np.nan, df['size'])

In [1481]:
# # divdum
# df['divdum'] = np.where(df['dvc'] > 0, 1, 0)

In [1482]:
# # divstop
# df['divstop'] = np.where((df['L1_dvc'] > 0) & (df['dvc'] == 0), 1, 0)

In [1483]:
# df = df.drop(columns = ['adsh', 'name', 'period', 'instance', 'url_instance'])

In [1484]:
# # rescale earnings to range [0, 1]
# df['earnings'] = (df['earnings'] - df['earnings'].min()) / (df['earnings'].max() - df['earnings'].min())

In [1487]:
# Save the sample as a Stata file in the intermediate pipeline folder,
# without writing the row index.
dhaliwal_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'sample_dhaliwal.dta')
df.to_stata(dhaliwal_path, write_index=False)

---

In [314]:
# Restrict to loss observations. Work on a copy so that the label columns
# added here neither touch the parent frame `df` nor trigger pandas'
# SettingWithCopyWarning.
table1 = df.loc[df['loss'] == 1].copy()

# The tabulations that follow read `current_tax` and `deferred_tax`, but no
# active cell creates these columns (their definitions are commented out
# further up), so a fresh Restart & Run All stops with a KeyError. Build the
# labels here with the same rule as those commented-out definitions: the sign
# of txfed and txdfed respectively.
for label_col, amount_col in [('current_tax', 'txfed'), ('deferred_tax', 'txdfed')]:
    table1[label_col] = np.where(table1[amount_col] == 0, 'zero',
                                 np.where(table1[amount_col] > 0, 'positive', 'negative'))

In [315]:
# Share of loss observations in each current_tax category.
current_tax_shares = table1['current_tax'].value_counts(normalize=True)
current_tax_shares

current_tax
zero        0.720586
negative    0.167200
positive    0.112214
Name: proportion, dtype: float64

In [316]:
# Share of loss observations in each deferred_tax category.
deferred_tax_shares = table1['deferred_tax'].value_counts(normalize=True)
deferred_tax_shares

deferred_tax
zero        0.687238
negative    0.208148
positive    0.104614
Name: proportion, dtype: float64