In [1]:
import os, re
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

In [2]:
os.chdir('..')

---

In [18]:
df = pd.read_stata(os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'sample_for_cohort_formation.dta'))

In [25]:
stack = []
already_treated = []
tlcf_treated_threshold = 0.75
tlcf_control_threshold = 0.0

for year in range(2018, 2022 + 1):
    temp = df.loc[df['fyear'].isin(range(year - 3, year + 3))]
    gvkey_treated = temp.loc[(temp['tlcf_indefin'] >= tlcf_treated_threshold) &
                             (temp['fyear'] == year)]
    # focus on *first-time* treatment - i.e., if firm was treated in earlier cohort, drop
    gvkey_treated = gvkey_treated.loc[~temp['gvkey'].isin(already_treated), 'gvkey'].unique()
    # add newly treated to the list of already treated
    already_treated.extend(gvkey_treated)
    treated = temp.loc[temp['gvkey'].isin(gvkey_treated)]
    treated['treated'] = 1
    
    # inclusion in control only if:
    # 1) no tlcf_indefin above treshold across entire cohort
    # 2) not already in treatment group of prior cohort
    gvkey_control = temp.loc[(temp.groupby('gvkey')['tlcf_indefin'].transform('max') <= tlcf_control_threshold) &
                              (~temp['gvkey'].isin(already_treated)), 'gvkey'].unique()
    
    control = temp.loc[temp['gvkey'].isin(gvkey_control)]
    control['treated'] = 0
    
    cohort = pd.concat([treated, control], ignore_index = True)
                    
    cohort['cohort'] = year
    cohort['post'] = np.where(cohort['fyear'] >= year, 1, 0)
    
    cohort['post_min3'] = np.where((cohort['fyear'] - year) == -3, 1, 0)
    cohort['post_min2'] = np.where((cohort['fyear'] - year) == -2, 1, 0)
    cohort['post_min1'] = np.where((cohort['fyear'] - year) == -1, 1, 0)
    cohort['post_0'] = np.where((cohort['fyear'] - year) == 0, 1, 0)
    cohort['post_plus1'] = np.where((cohort['fyear'] - year) == 1, 1, 0)
    cohort['post_plus2'] = np.where((cohort['fyear'] - year) == 2, 1, 0)
    
    ## to think about: what is the average 'tlcf_indefin' for the treated observations in the pre-period
    ## because: we look at first-time treatment, but these firms might have a tlcf_indefin of >0 in the
    ## pre-period already, which reduces the change expected in the post-period 
#     drop_treatment = cohort.loc[(cohort['treated'] == 1) &
#                                 (cohort['fyear'] < cohort['cohort']) &
#                                 (cohort['tlcf_indefin'] > 0), 'gvkey'].unique().tolist()
#     cohort = cohort.loc[~cohort['gvkey'].isin(drop_treatment)]
    
    stack.append(cohort)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated['treated'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control['treated'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated['treated'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

In [26]:
stacked = pd.concat(stack, ignore_index = True)

In [27]:
(stacked.groupby(['cohort', 'treated', 'post'])['gvkey'].count().reset_index()
 .rename(columns = {'gvkey' : 'N'}))

Unnamed: 0,cohort,treated,post,N
0,2018,0,0,1227
1,2018,0,1,439
2,2018,1,0,86
3,2018,1,1,183
4,2019,0,0,791
5,2019,0,1,283
6,2019,1,0,146
7,2019,1,1,319
8,2020,0,0,456
9,2020,0,1,182


In [28]:
len(stacked)

7611

In [29]:
os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'sample_cohorts.dta')

'/Users/maltemax/ownCloud/SBE_ACC_Replacement_Surfdrive (Projectfolder)/SBE_ACC_Replacement_Surfdrive (Projectfolder)/projects/footnote/3_pipeline/1_intermediate/sample_cohorts.dta'

In [30]:
stacked.to_stata(os.path.join(os.getcwd(), '3_pipeline', '1_intermediate', 'sample_cohorts.dta'), write_index = False)