In [2]:
import os, re
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

In [3]:
# Step up one directory so that the relative '3_pipeline/...' paths used for
# reading and writing data resolve (presumably the project root -- confirm).
# Guarded: re-running this cell must not keep climbing the directory tree, so
# we only move when the pipeline folder is not already visible from here.
if not os.path.isdir('3_pipeline'):
    os.chdir('..')

---

In [150]:
# Load the tab-separated raw sample from the intermediate pipeline folder.
raw_sample_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'sample_raw.tsv')
df = pd.read_csv(raw_sample_path, sep='\t')

In [151]:
# Order the panel by gvkey and, within gvkey, by fiscal year; the lag and the
# running loss counter computed further down depend on this row order.
df = df.sort_values(['gvkey', 'fyear'], ascending=True)

create loss sequence variable

In [152]:
# Consecutive-year check: compare each fiscal year with the previous fiscal
# year of the same gvkey. The first row of a gvkey has no previous year (NaN),
# so it is flagged as not consecutive (0).
prev_fyear = df.groupby('gvkey')['fyear'].shift()
df['L1_fyear'] = prev_fyear
year_gap = df['fyear'] - prev_fyear
df['is_year_cons'] = np.where(year_gap == 1, 1, 0)

In [153]:
# Running count of consecutive loss years (ib < 0), walking the rows in their
# current order (the frame is expected to be sorted by gvkey and fyear).
out = []
loss_count = 0

# Iterate over the two needed columns directly; this is much cheaper than
# df.iterrows(), which builds a Series for every row.
for is_cons, ib in zip(df['is_year_cons'], df['ib']):
    # A break in fiscal years -- which includes the first row of every gvkey,
    # where the lagged year is missing -- ends whatever sequence was running.
    if is_cons == 0:
        loss_count = 0

    # The current row is then evaluated on its own, so a loss in the first
    # (or first-after-a-gap) year still opens a new sequence instead of being
    # swallowed by the reset above. Previously such a loss was not counted,
    # which left every later count in that sequence one too low.
    if pd.notnull(ib) and ib < 0:
        loss_count += 1
    else:
        # missing or non-negative income breaks the sequence
        loss_count = 0

    # only report from the second loss onwards (otherwise 'sequential' doesn't
    # really make sense); the first loss is captured by the separate firstloss
    # dummy in the data
    out.append(0 if loss_count == 1 else loss_count)

In [154]:
# Attach the per-row loss-sequence counts (one entry per row, in row order).
df['lossseq'] = pd.Series(out, index=df.index)

In [155]:
# Where ib itself is missing, the loss sequence is undefined: store NaN rather
# than 0. The column is cast to float first so it is float whether or not any
# value ends up masked.
df['lossseq'] = df['lossseq'].astype('float64').mask(df['ib'].isna())

identify observations that have a zero TLCF in fiscal years 2018–2023 (i.e., after 2017)

In [156]:
# Row-level flags, restricted to fiscal years 2018-2023:
#   tlcf_zero    -> TLCF is exactly zero
#   tlcf_nonzero -> TLCF is strictly positive
in_window = df['fyear'].isin(range(2018, 2024))
df['tlcf_zero'] = np.where(in_window & (df['tlcf'] == 0), 1, 0)
df['tlcf_nonzero'] = np.where(in_window & (df['tlcf'] > 0), 1, 0)

In [157]:
# Broadcast each row-level flag to the gvkey level (1 if the flag is set in any
# row of that gvkey), then blank the result out for fiscal years before 2018.
before_window = df['fyear'] < 2018
for flag_col, max_col in (('tlcf_zero', 'tlcf_zero_max'),
                          ('tlcf_nonzero', 'tlcf_nonzero_max')):
    ever_flagged = df.groupby('gvkey')[flag_col].transform('max')
    df[max_col] = np.where(before_window, np.nan, ever_flagged)

In [158]:
# Keep the rows of gvkeys that show both a zero and a positive TLCF within
# fiscal years 2018-2023 (rows before 2018 carry NaN in the *_max columns and
# therefore drop out). These rows are the basis for finding the first
# zero-TLCF year; a positive TLCF after *that* year should be an indefinite TLCF.
has_zero_year = df['tlcf_zero_max'] == 1
has_nonzero_year = df['tlcf_nonzero_max'] == 1
firstyear_cols = ['gvkey', 'fyear', 'tlcf_zero',
                  'tlcf', 'tlcf_zero_max', 'tlcf_nonzero_max']
tlcf_zero_firstyear = df.loc[has_nonzero_year & has_zero_year, firstyear_cols]

In [159]:
# Earliest fiscal year within each (gvkey, tlcf_zero) group; on the
# tlcf_zero == 1 rows this is the first year with a zero TLCF for that gvkey.
earliest_fyear = tlcf_zero_firstyear.groupby(['gvkey', 'tlcf_zero'])['fyear'].transform('min')
tlcf_zero_firstyear = tlcf_zero_firstyear.assign(first_year=earliest_fyear)

In [160]:
# Only the zero-TLCF rows carry the first_year of interest; keep just the
# gvkey and that year.
is_zero_row = tlcf_zero_firstyear['tlcf_zero'] == 1
tlcf_zero_firstyear = tlcf_zero_firstyear.loc[is_zero_row, ['gvkey', 'first_year']]

In [161]:
# Collapse to one row per gvkey (the first occurrence is kept).
tlcf_zero_firstyear = tlcf_zero_firstyear.drop_duplicates(subset=['gvkey'])

In [162]:
# Bring first_year back onto the full panel; gvkeys without a match in
# tlcf_zero_firstyear get NaN.
df = df.merge(tlcf_zero_firstyear, on='gvkey', how='left')

In [163]:
# Treatment flag: 1 for a positive TLCF in a fiscal year after 2018 that also
# lies after the gvkey's first zero-TLCF year (first_year), 0 otherwise.
# NOTE: the 0 is also assigned to observations outside the sample window and
# to gvkeys without a first_year, rather than a missing value; check this.
is_post_2018 = df['fyear'] > 2018
is_after_first_zero = df['fyear'] > df['first_year']
has_positive_tlcf = df['tlcf'] > 0
df['treat'] = np.where(is_post_2018 & is_after_first_zero & has_positive_tlcf, 1, 0)

In [164]:
# Remove the helper flags that were only needed to derive first_year / treat.
helper_cols = ['tlcf_zero', 'tlcf_nonzero', 'tlcf_zero_max', 'tlcf_nonzero_max']
df = df.drop(helper_cols, axis='columns')

In [167]:
# Write the finished sample to the intermediate pipeline folder as a Stata
# file, without the pandas index.
sample_path = os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'sample.dta')
df.to_stata(sample_path, write_index=False)