In [157]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [326]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load datasets
integrate4 = pd.read_csv('integrate4.csv')
tass5 = pd.read_csv('tass5.csv')
diag3 = pd.read_csv('diag3.csv')
intheset = pd.read_csv('intheset.csv')
car36 = pd.read_csv('car36.csv')
all_close_treat = pd.read_csv('all_close_treat.csv')

# Filter and process integrate4 data
df = integrate4[['id', 'mydate', 'crisis_treat24', 'crisis_treat24_90', 'fund_closedxcrisis', 'div_corr', 'corr_eps']]
df = df.sort_values(by=['id', 'mydate']).drop_duplicates(subset=['id', 'mydate'])

# Handle div_corr > 0.985
df.loc[df['div_corr'] > 0.985, 'div_corr'] = np.nan

# Calculate mindate
df['mindate'] = df.groupby('id')['mydate'].transform('min')

# Merge with tass5
df = df.merge(tass5, on=['id', 'mydate'], how='left')

# Merge with diag3 to get pre_treat and other variables
df = df.merge(diag3, on='companyid', how='left')

# Create TREATED variable
df['TREATED'] = np.where((df['mydate'] > 593) & (df['mydate'] <= 617) & (df['pre_treat'] == 1), 1, 0)

# Merge with intheset
df = df.merge(intheset, on=['id', 'mydate'], how='left')

# Calculate additional variables
df['fundcounter_i'] = 1
df['missing_aum_firm'] = df['aum'].isna().astype(int)

# Aggregate firm-level data
firm_level_agg = df.groupby(['companyid', 'mydate']).agg({
    'aum': 'sum',
    'fundcounter_i': 'sum',
    'missing_aum_firm': 'max'
}).reset_index().rename(columns={
    'aum': 'aum_firm',
    'fundcounter_i': 'fundcounter_j'
})
tempo = pd.DataFrame(firm_level_agg)
tempo.describe()

Unnamed: 0,companyid,mydate,aum_firm,fundcounter_j,missing_aum_firm
count,107854.0,107854.0,107854.0,107854.0,107854.0
mean,13316.232815,560.78546,962379300.0,1.494455,0.0
std,15204.115541,45.978855,2840067000.0,1.312269,0.0
min,29.0,408.0,1000000.0,1.0,0.0
25%,3039.0,536.0,49366000.0,1.0,0.0
50%,7711.0,572.0,205009300.0,1.0,0.0
75%,19719.0,597.0,767713200.0,2.0,0.0
max,96963.0,617.0,101134400000.0,29.0,0.0


In [327]:
firm_level_agg['log_firmscope'] = np.log(firm_level_agg['fundcounter_j'])

# Merge the firm-level data back to the original dataframe
df = df.merge(firm_level_agg, on=['companyid', 'mydate'], how='left')

df['age'] = df['mydate'] - df['mindate'] + 1
df['log_age'] = np.log(df['age'])


# Create deciles for aum and age
df['sz_dec'] = pd.qcut(df['aum'], 10, labels=False, duplicates='drop')
df['age_dec'] = pd.qcut(df['age'], 10, labels=False, duplicates='drop')

# Create size decile dummies
sz_q_dummies = pd.get_dummies(df['sz_dec'], prefix='sz_q')
df = pd.concat([df, sz_q_dummies], axis=1)

# Create post variable (assuming crisis ends at mydate 593)
df['post'] = np.where(df['mydate'] > 593, 1, 0)

# Handle missing aum
df['missing_aum'] = df['aum'].isna().astype(int)

# Calculate standardized variables and IR
df['stdv'] = df['stdv'].clip(df['stdv'].quantile(0.01), df['stdv'].quantile(0.99))
df['ir'] = df['excess_ret'] / df['stdv']
df['ir'] = df['ir'].clip(df['ir'].quantile(0.01), df['ir'].quantile(0.99))

df['excess_ret'] = df['excess_ret'].clip(df['excess_ret'].quantile(0.01), df['excess_ret'].quantile(0.99))

# Calculate scope quartiles
# df['firmscope'] = np.exp(df['log_firmscope'])
df['firmscope'] = df['fundcounter_j']
df['scope_q1'] = np.where(df['firmscope'] <= df['firmscope'].quantile(0.25), 1, 0)
df['scope_q2'] = np.where(df['firmscope'].between(df['firmscope'].quantile(0.25), df['firmscope'].quantile(0.50)), 1, 0)
df['scope_q3'] = np.where(df['firmscope'].between(df['firmscope'].quantile(0.50), df['firmscope'].quantile(0.75)), 1, 0)
df['scope_q4'] = np.where(df['firmscope'] > df['firmscope'].quantile(0.75), 1, 0)

# Ensure mydate is a string and pad it to the correct length
df['mydate'] = df['mydate'].astype(str).str.pad(width=8, fillchar='0')

df['date'] = pd.to_datetime(df['date'])
# Create year variable
df['year'] = pd.to_datetime(df['date'], format='%Y%m%d').dt.year
# Ensure year_dum variables are correctly created
year_dummies = pd.get_dummies(df['year'], prefix='year_dum')
df['year1'] = df['year']
df = pd.concat([df, year_dummies], axis=1)

df['int1c'] = np.where(df['div_corr'].notna(), df['div_corr'] * df['TREATED'], 0)
df['ever_treated'] = df.groupby('id')['TREATED'].transform('max').astype(bool)
df['first30'] = np.where(df['ever_treated'] & df['mydate'].astype(int).between(594, 623), 1, 0)
df['second30'] = np.where(df['ever_treated'] & df['mydate'].astype(int).between(624, 653), 1, 0)

df['firmsz_dec'] = pd.qcut(df['aum_firm'], 10, labels=False)
firmsz_q_dummies = pd.get_dummies(df['firmsz_dec'], prefix='firmsz_q')

df = pd.concat([df, firmsz_q_dummies], axis=1)

df['age_dec'] = pd.qcut(df['age'], 10, labels=False)
age_q_dummies = pd.get_dummies(df['age_dec'], prefix='age_q')




df = pd.concat([df, age_q_dummies], axis=1)

tass7 = df.copy()

# Create firm-level data for fund-level analysis
df_firm_level = df[df['intheset'] == 1]
df_firm_level = df_firm_level[df_firm_level['firmscope'] < 50]
df_firm_level = df_firm_level.groupby(['companyid', 'mydate']).agg({
    'excess_ret': 'mean',
    'ret': 'mean',
    'post': 'mean',
    'aum': 'mean',
    'fundcounter_j': 'max',
    'TREATED': 'max',
    'age': 'max',
    **{col: 'mean' for col in year_dummies.columns},
    **{col: 'mean' for col in sz_q_dummies.columns}
}).reset_index()
df_firm_level['log_firmscope'] = np.log(df_firm_level['fundcounter_j'])
df_firm_level['log_firmage'] = np.log(df_firm_level['age'] + 1)
df_firm_level['age_dec'] = pd.qcut(df_firm_level['age'], 10, labels=False)
df_firm_level['stdv_firm'] = df_firm_level.groupby(['companyid', 'TREATED'])['excess_ret'].transform('std')
df_firm_level['firm_ir'] = df_firm_level['excess_ret'] / df_firm_level['stdv_firm']
df_firm_level = df_firm_level.rename(columns={'fundcounter_j': 'firm_scope', 'age': 'firm_age'})

tass8 = df_firm_level.copy()

In [276]:
tass8

Unnamed: 0,companyid,mydate,excess_ret,ret,post,aum,firm_scope,TREATED,firm_age,year_dum_1994,...,sz_q_5,sz_q_6,sz_q_7,sz_q_8,sz_q_9,log_firmscope,log_firmage,age_dec,stdv_firm,firm_ir
0,29.0,00000526,-0.007494,-0.016699,0.0,1.478643e+07,1.0,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.693147,0,0.094323,-0.079445
1,29.0,00000527,0.074482,-0.005199,0.0,3.372292e+07,1.0,0,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,1.098612,0,0.094323,0.789643
2,29.0,00000528,0.002050,-0.017199,0.0,2.153770e+07,2.0,0,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.693147,1.386294,0,0.094323,0.021735
3,29.0,00000529,0.020911,-0.002550,0.0,2.147236e+07,2.0,0,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.693147,1.609438,0,0.094323,0.221699
4,29.0,00000530,-0.000848,0.019399,0.0,2.198978e+07,2.0,0,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.693147,1.791759,0,0.094323,-0.008987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68761,96683.0,00000613,0.041830,0.014399,1.0,1.269940e+09,1.0,0,67,0.0,...,0.0,0.0,0.0,1.0,0.0,0.000000,4.219508,6,0.069957,0.597940
68762,96683.0,00000614,0.066779,0.001499,1.0,1.309984e+09,1.0,0,68,0.0,...,0.0,0.0,0.0,1.0,0.0,0.000000,4.234107,6,0.069957,0.954577
68763,96683.0,00000615,0.065613,0.017199,1.0,1.338020e+09,1.0,0,69,0.0,...,0.0,0.0,0.0,1.0,0.0,0.000000,4.248495,6,0.069957,0.937915
68764,96683.0,00000616,0.041814,-0.004399,1.0,1.336691e+09,1.0,0,70,0.0,...,0.0,0.0,0.0,1.0,0.0,0.000000,4.262680,6,0.069957,0.597712


In [328]:
# Fund-level analysis: Merge datasets
columns_to_merge_from_tass7 = ['companyid', 'mydate', 'TREATED','missing_aum', 'int1c', 'ever_treated', 'first30', 'second30', 'id', 'div_corr', 'fund_closedxcrisis','firmscope', 'intheset', 'scope_q1', 'scope_q2', 'scope_q3', 'scope_q4', 'aum', 'log_age', 'age','aum_firm',
                               'ir', 'year1']
columns_to_merge_from_tass7.extend(firmsz_q_dummies.columns)  # Add the firmsize dummies
# columns_to_merge_from_tass7.extend(firmage_q_dummies.columns) # Add the firmage dummies
columns_to_merge_from_tass7.extend(age_q_dummies.columns) # Add the firmage dummies

df_fund_level = tass8.merge(
    tass7[columns_to_merge_from_tass7], 
    on=['companyid', 'mydate'], 
    how='left'
)
# Ensure 'id' is present in car36
car36['id'] = car36['id'].astype(str)
df_fund_level['id'] = df_fund_level['id'].astype(str)

# Convert mydate in car36 to string for merging
car36['mydate'] = car36['mydate'].astype(str).str.pad(width=8, fillchar='0')
all_close_treat['mydate'] = all_close_treat['mydate'].astype(str).str.pad(width=8, fillchar='0')

df_fund_level = df_fund_level.merge(car36[['id', 'mydate']], on=['id', 'mydate'], how='left')
df_fund_level = df_fund_level.merge(all_close_treat[['companyid', 'mydate', 'all_close_treat']], on=['companyid', 'mydate'], how='left')
df_fund_level = df_fund_level.loc[:, ~df_fund_level.columns.duplicated()]

# Check if TREATED variable exists after merging steps
df_fund_level['TREATED'] = df_fund_level['TREATED_x']
df_fund_level['ALL_TREAT'] = df_fund_level['all_close_treat'] - df_fund_level['TREATED']
# Create dummies for firm age deciles
df_fund_level['firmage_q'] = pd.qcut(df_fund_level['firm_age'], 10, labels=False)
firmage_q_dummies = pd.get_dummies(df_fund_level['firmage_q'], prefix='firmage_q')

In [316]:
print(firmage_q_dummies)
df_fund_level = pd.concat([df_fund_level, firmage_q_dummies], axis=1)
df_fund_level

        firmage_q_0  firmage_q_1  firmage_q_2  firmage_q_3  firmage_q_4  \
0              True        False        False        False        False   
1              True        False        False        False        False   
2              True        False        False        False        False   
3              True        False        False        False        False   
4              True        False        False        False        False   
...             ...          ...          ...          ...          ...   
105614        False        False        False        False        False   
105615        False        False        False        False        False   
105616        False        False        False        False        False   
105617        False        False        False        False        False   
105618        False        False        False        False        False   

        firmage_q_5  firmage_q_6  firmage_q_7  firmage_q_8  firmage_q_9  
0             False      

Unnamed: 0,companyid,mydate,excess_ret,ret,post,aum_x,firm_scope,TREATED_x,firm_age,year_dum_1994,...,firmage_q_0,firmage_q_1,firmage_q_2,firmage_q_3,firmage_q_4,firmage_q_5,firmage_q_6,firmage_q_7,firmage_q_8,firmage_q_9
0,29.0,00000526,-0.007494,-0.016699,0.0,1.478643e+07,1.0,0,1,0.0,...,True,False,False,False,False,False,False,False,False,False
1,29.0,00000527,0.074482,-0.005199,0.0,3.372292e+07,1.0,0,2,0.0,...,True,False,False,False,False,False,False,False,False,False
2,29.0,00000528,0.002050,-0.017199,0.0,2.153770e+07,2.0,0,3,0.0,...,True,False,False,False,False,False,False,False,False,False
3,29.0,00000528,0.002050,-0.017199,0.0,2.153770e+07,2.0,0,3,0.0,...,True,False,False,False,False,False,False,False,False,False
4,29.0,00000529,0.020911,-0.002550,0.0,2.147236e+07,2.0,0,4,0.0,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105614,96683.0,00000613,0.041830,0.014399,1.0,1.269940e+09,1.0,0,67,0.0,...,False,False,False,False,False,True,False,False,False,False
105615,96683.0,00000614,0.066779,0.001499,1.0,1.309984e+09,1.0,0,68,0.0,...,False,False,False,False,False,True,False,False,False,False
105616,96683.0,00000615,0.065613,0.017199,1.0,1.338020e+09,1.0,0,69,0.0,...,False,False,False,False,False,True,False,False,False,False
105617,96683.0,00000616,0.041814,-0.004399,1.0,1.336691e+09,1.0,0,70,0.0,...,False,False,False,False,False,True,False,False,False,False


REG1 part 2

In [329]:
df_fund_level['ALL_TREAT'] = df_fund_level['all_close_treat'] - df_fund_level['TREATED']
df_fund_level['ALL_TREAT'] = df_fund_level['ALL_TREAT'].clip(lower=0)

df_fund_level['time_treat'] = df_fund_level['mydate'].astype(int) - 594
df_fund_level['int2'] = df_fund_level['TREATED'] * df_fund_level['time_treat']
df_fund_level['int3'] = df_fund_level['ALL_TREAT'] * df_fund_level['time_treat']
df_fund_level['int3'] = df_fund_level['int3'].fillna(0)

df_fund_level['max_treated'] = df_fund_level.groupby('id')['TREATED'].transform('max')
df_fund_level['max_treated'] = np.where(df_fund_level['mydate'].astype(int) < 594, 0, df_fund_level['max_treated'])

df_fund_level['first24'] = np.where(df_fund_level['mydate'].astype(int).between(594, 617), 1, 0)
df_fund_level['next36'] = np.where(df_fund_level['mydate'].astype(int) > 617, 1, 0)

df_fund_level['Tx24'] = df_fund_level['max_treated'] * df_fund_level['first24']
df_fund_level['Txn36'] = df_fund_level['max_treated'] * df_fund_level['next36']

df_fund_level['max_int1c'] = df_fund_level.groupby('id')['int1c'].transform('max')
df_fund_level['max_int1c'] = np.where(df_fund_level['mydate'].astype(int) < 594, 0, df_fund_level['max_int1c'])

df_fund_level['max_int2'] = df_fund_level.groupby('id')['int2'].transform('max')
df_fund_level['max_int2'] = np.where(df_fund_level['mydate'].astype(int) < 594, 0, df_fund_level['max_int2'])

df_fund_level['Tx24xrelated'] = df_fund_level['Tx24'] * df_fund_level['max_int1c']
df_fund_level['Txn36xrelated'] = df_fund_level['Txn36'] * df_fund_level['max_int1c']

df_fund_level['triple_int'] = np.where(df_fund_level['div_corr'].notna(), df_fund_level['TREATED'] * df_fund_level['time_treat'] * df_fund_level['div_corr'], 0)
df_fund_level['int1c_max'] = np.where(df_fund_level['div_corr'].notna(), df_fund_level['max_treated'] * df_fund_level['div_corr'], 0)
df_fund_level['int2_max'] = df_fund_level['max_treated'] * df_fund_level['time_treat']
df_fund_level['triple_int_max'] = np.where(df_fund_level['div_corr'].notna(), df_fund_level['max_treated'] * df_fund_level['time_treat'] * df_fund_level['div_corr'], 0)

df_fund_level['max_firmscope'] = df_fund_level.groupby('companyid')['firmscope'].transform('max')
median_max_firmscope = df_fund_level.loc[(df_fund_level['max_treated'] == 1) & (df_fund_level['intheset'] == 1) & (df_fund_level['firmscope'] < 50), 'max_firmscope'].median()
# Create the interaction term
df_fund_level['TREATED_related'] = df_fund_level['TREATED'] * df_fund_level['int1c']

df_fund_level.to_csv('tass12.csv', index=False)

# Summary statistics
summary_stats_file = df_fund_level[(df_fund_level['intheset'] == 1) & (df_fund_level['mydate'].astype(int) <= 617) & (df_fund_level['firmscope'] < 50)]

summary_stats_file.to_csv('summary_stats_file.csv')

summary_stats = summary_stats_file.describe()
summary_stats.to_csv('summary_stats_real.csv')

# Filter the dataframe for the regression
regression_df = df_fund_level[(df_fund_level['intheset'] == 1) & (df_fund_level['mydate'].astype(int) <= 617) & (df_fund_level['firmscope'] < 50)]
regression_df['year'] = regression_df['year1']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regression_df['year'] = regression_df['year1']


In [321]:

regression_df


Unnamed: 0,companyid,mydate,excess_ret,ret,post,aum_x,firm_scope,TREATED_x,firm_age,year_dum_1994,...,firmage_q_0,firmage_q_1,firmage_q_2,firmage_q_3,firmage_q_4,firmage_q_5,firmage_q_6,firmage_q_7,firmage_q_8,firmage_q_9
0,29.0,00000526,-0.007494,-0.016699,0.0,1.478643e+07,1.0,0,1,0.0,...,True,False,False,False,False,False,False,False,False,False
1,29.0,00000527,0.074482,-0.005199,0.0,3.372292e+07,1.0,0,2,0.0,...,True,False,False,False,False,False,False,False,False,False
2,29.0,00000528,0.002050,-0.017199,0.0,2.153770e+07,2.0,0,3,0.0,...,True,False,False,False,False,False,False,False,False,False
3,29.0,00000528,0.002050,-0.017199,0.0,2.153770e+07,2.0,0,3,0.0,...,True,False,False,False,False,False,False,False,False,False
4,29.0,00000529,0.020911,-0.002550,0.0,2.147236e+07,2.0,0,4,0.0,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105614,96683.0,00000613,0.041830,0.014399,1.0,1.269940e+09,1.0,0,67,0.0,...,False,False,False,False,False,True,False,False,False,False
105615,96683.0,00000614,0.066779,0.001499,1.0,1.309984e+09,1.0,0,68,0.0,...,False,False,False,False,False,True,False,False,False,False
105616,96683.0,00000615,0.065613,0.017199,1.0,1.338020e+09,1.0,0,69,0.0,...,False,False,False,False,False,True,False,False,False,False
105617,96683.0,00000616,0.041814,-0.004399,1.0,1.336691e+09,1.0,0,70,0.0,...,False,False,False,False,False,True,False,False,False,False


In [330]:


# for i in range(10):
#     regression_df[f'sz_q{i}'] = regression_df[f'sz_q_{i}']
#     regression_df[f'age_q{i}'] = (regression_df[f'age_q_{i}']) 
#     regression_df[f'firmsz_q{i}'] = (regression_df[f'firmsz_q_{i}.0'])
#     regression_df[f'firmage_q{i}'] = (regression_df[f'firmage_q_{i}'])
# 

# 2. Handling Missing Values
# Impute or drop missing values in the relevant columns
regression_df.dropna(subset=['ir', 'TREATED'] + [col for col in regression_df.columns if col.startswith(('sz_q', 'scope_q', 'age_q', 'firmsz_q', 'firmage_q', 'year_dum_'))], inplace=True)

# 3. Filtering for clustering
# Ensure groups have at least two observations
id_counts = regression_df['id'].value_counts()
regression_df = regression_df[regression_df['id'].isin(id_counts[id_counts > 1].index)]

# Double-check and convert 'id' to numeric if needed
regression_df['id'] = pd.to_numeric(regression_df['id']) 

# Convert boolean columns to numeric (0 or 1)
boolean_columns = regression_df.select_dtypes(include='bool').columns
regression_df[boolean_columns] = regression_df[boolean_columns].astype(int)

# 4. Reset the index after filtering
regression_df = regression_df.reset_index(drop=True)
regression_df.to_csv('regression_df.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regression_df.dropna(subset=['ir', 'TREATED'] + [col for col in regression_df.columns if col.startswith(('sz_q', 'scope_q', 'age_q', 'firmsz_q', 'firmage_q', 'year_dum_'))], inplace=True)


In [307]:
regression_df['id']

array([  4561.,   5221.,   5223., ..., 593916.,   5358.,  25306.])

REGRESSION: ONE DECILE IS OMITTED FOR EVERY CATEGORY TO AVOID MULTICOLINNEARITY

In [293]:
# Try a simple model with only TREATED and a few controls
y = regression_df['ir']

simple_X = sm.add_constant(regression_df[['TREATED', 'sz_q0', 'sz_q1', 'sz_q2','sz_q3','sz_q4','sz_q5','sz_q6','sz_q7','sz_q8',
                                          # 'sz_q9',
                                          'scope_q1','scope_q2','scope_q3',
                                          # 'scope_q4',
                                          'age_q0', 'age_q1', 'age_q2', 'age_q3', 'age_q4', 'age_q5', 'age_q6', 'age_q7', 'age_q8', 
                                          # age_q9
                                          # 'firmsz_q0',
                                          'firmsz_q1', 'firmsz_q2', 'firmsz_q3', 'firmsz_q4', 'firmsz_q5', 'firmsz_q6', 'firmsz_q7', 'firmsz_q8',
                                          'firmsz_q9',
                                         'year_dum_1994', 'year_dum_1995', 'year_dum_1996', 'year_dum_1997', 'year_dum_1998', 'year_dum_1999', 'year_dum_2000','year_dum_2001','year_dum_2002','year_dum_2003', 'year_dum_2004', 'year_dum_2005', 'year_dum_2006',
                     'year_dum_2007', 'year_dum_2008', 'year_dum_2009', 'year_dum_2010', 
                                          # 'year_dum_2011',
                                          'firmage_q1','firmage_q2', 'firmage_q3', 'firmage_q4', 'firmage_q5',
                                          'firmage_q6', 'firmage_q7', 'firmage_q8', 'firmage_q9'
                                          # ,'firmage_q0'
                                          ]])
simple_model = sm.OLS(y, simple_X).fit(cov_type='cluster', cov_kwds={'groups': regression_df['id']})
print(simple_model.summary())


                            OLS Regression Results                            
Dep. Variable:                     ir   R-squared:                       0.044
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                     6.033
Date:                Thu, 08 Aug 2024   Prob (F-statistic):           1.39e-35
Time:                        11:59:06   Log-Likelihood:            -1.4649e+05
No. Observations:               94311   AIC:                         2.931e+05
Df Residuals:                   94254   BIC:                         2.936e+05
Df Model:                          56                                         
Covariance Type:              cluster                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.6707      0.320     -2.094



Summary statistics + correlation table

In [302]:
# List of selected variables for pairwise correlation
selected_variables = [
    'ir', 'TREATED', 'ALL_TREAT', 'TREATED_related',
    'aum_y', 'firm_scope',
    'firm_age', 'age', 'aum_firm', 'int1c', 'firmscope'
    # 'Firm AUM ($)', idiosyncratic risk
]

# Calculate the pairwise correlation table
summary_stats_corr = summary_stats_file[selected_variables].corr()

# Save the correlation table
summary_stats_corr.to_csv('summary_stats_correlation.csv')

In [308]:
regression_df


KeyError: 'year'

In [325]:
# List of columns to keep
columns_to_keep = ['ir', 'TREATED', 'firmscope', 'intheset', 'int1c','id','mydate', 'year', 'firm_age', 'age', 'aum_firm', 'aum_y', ] + \
                  [f'sz_q{i}' for i in range(10)] + \
                  [f'scope_q{i}' for i in range(1, 5)] + \
                  [f'age_q{i}' for i in range(10)] + \
                  [f'firmsz_q{i}' for i in range(10)] + \
                  [f'firmage_q{i}' for i in range(10)] + \
    [f'year_dum_{i}' for i in range(1994, 2012)]
                

# Select the specified columns from the dataframe
selected_data = regression_df[columns_to_keep]

# Save the new dataset to a CSV file
selected_data.to_csv('pure_regression.csv', index=False)

In [304]:
selected_data

Unnamed: 0,ir,TREATED,firmscope,intheset,int1c,id,mydate,sz_q0,sz_q1,sz_q2,...,year_dum_2002,year_dum_2003,year_dum_2004,year_dum_2005,year_dum_2006,year_dum_2007,year_dum_2008,year_dum_2009,year_dum_2010,year_dum_2011
0,-0.010104,0,1.0,1.0,0.0,4561.0,00000526,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.100427,0,1.0,1.0,0.0,4561.0,00000527,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.015879,0,2.0,1.0,0.0,4561.0,00000528,0.5,0.0,0.5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.010346,0,2.0,1.0,0.0,5221.0,00000528,0.5,0.0,0.5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.029113,0,2.0,1.0,0.0,4561.0,00000529,0.5,0.0,0.5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94306,0.312140,0,1.0,1.0,0.0,25306.0,00000613,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
94307,0.498313,0,1.0,1.0,0.0,25306.0,00000614,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
94308,0.489615,0,1.0,1.0,0.0,25306.0,00000615,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
94309,0.312020,0,1.0,1.0,0.0,25306.0,00000616,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [167]:
sumstats = regression_df.describe()
sumstats.to_csv('sumstats.csv')

In [299]:


# Additional regressions as needed
# Regression model with absorb(id)
model_absorb = smf.ols('ir ~ TREATED + ' +
                       ' + '.join([f'sz_q{i}' for i in range(1, 10)]) + ' + ' +
                       ' + '.join([f'scope_q{i}' for i in range(1, 5)]) + ' + ' +
                       ' + '.join([f'age_q{i}' for i in range(1, 10)]) + ' + ' +
                       ' + '.join([f'firmsz_q{i}' for i in range(10)]) + ' + ' +
                       # ' + '.join([f'firmage_q{i}' for i in range(10)]),
                       ' + '.join([f'year_dum_{i}' for i in range(1994, 2012)]),
                       data=regression_df[(regression_df['intheset'] == 1) & (regression_df['mydate'].astype(int) <= 617) & (regression_df['firmscope'] < 50)]).fit(cov_type='cluster', cov_kwds={'groups': regression_df['id']})

print(model_absorb.summary())

                            OLS Regression Results                            
Dep. Variable:                     ir   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                 2.017e-15
Date:                Thu, 08 Aug 2024   Prob (F-statistic):               1.00
Time:                        12:14:48   Log-Likelihood:            -1.4659e+05
No. Observations:               94311   AIC:                         2.933e+05
Df Residuals:                   94263   BIC:                         2.937e+05
Df Model:                          47                                         
Covariance Type:              cluster                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      1.062e+11        nan        nan

  return np.sqrt(np.diag(self.cov_params()))


In [297]:

# Save model results
with open('model_absorb_results.txt', 'w') as f:
    f.write(model_absorb.summary().as_text())

# Additional analyses and summary statistics
regression_df['median_rel_int'] = 0
regression_df['median_rel_int'] = regression_df['int1c'].where(regression_df['int1c'] >= regression_df['int1c'].median(), 0)
regression_df['median_rel_int'] = regression_df['median_rel_int'].where(regression_df['int1c'] < regression_df['int1c'].median(), 1)

regression_df['intheset2'] = regression_df['intheset'].where(regression_df['median_rel_int'] == 1, 0)
regression_df['intheset3'] = regression_df['intheset'].where(regression_df['median_rel_int'] == 0, 0)

model_rel_int = smf.ols('ir ~ TREATED + int1c + ' +
                        ' + '.join([f'sz_q{i}' for i in range(1, 10)]) + ' + ' +
                        ' + '.join([f'scope_q{i}' for i in range(1, 5)]) + ' + ' +
                        ' + '.join([f'age_q{i}' for i in range(1, 10)]) + ' + ' +
                        ' + '.join([f'firmsz_q{i}' for i in range(10)]) + ' + ' +
                        ' + '.join([f'firmage_q{i}' for i in range(10)]) + ' + ' +
                        ' + '.join([f'year_dum_{i}' for i in range(1994, 2012)]),
                        data=regression_df[(regression_df['intheset2'] == 1) & (regression_df['firmscope'] < 50)]).fit(cov_type='cluster', cov_kwds={'groups': regression_df['id']})

print(model_rel_int.summary())
with open('model_rel_int_results.txt', 'w') as f:
    f.write(model_rel_int.summary().as_text())
##########################################################################################################################################################################
# Create grouped object for clustering BEFORE filtering
grouped = regression_df.groupby('id')

# Regressions for additional analysis
# Model for "related" firms with firmscope < 50
filtered_related = grouped.filter(lambda x: len(x) >= 2 and (x['intheset2'] == 1).any() and (x['firmscope'] < 50).all())  # Filter within groups
model_rel_int = smf.ols('ir ~ max_treated + time_treat + ' +
                        ' + '.join([f'sz_q{i}' for i in range(1, 10)]) + ' + ' +
                        ' + '.join([f'scope_q{i}' for i in range(1, 5)]) + ' + ' +
                        ' + '.join([f'age_q{i}' for i in range(1, 10)]) + ' + ' +
                        ' + '.join([f'firmsz_q{i}' for i in range(1, 10)]) + ' + ' +
                        ' + '.join([f'firmage_q{i}' for i in range(1, 10)]) + ' + ' +
                        ' + '.join([f'year_dum_{i}' for i in range(1994, 2012)]),
                        data=filtered_related).fit(cov_type='cluster', cov_kwds={'groups': filtered_related['id']})
print(model_rel_int.summary())
with open('model_rel_int_results.txt', 'w') as f:
    f.write(model_rel_int.summary().as_text())

# Subsample regressions
median_max_firmscope = regression_df[(regression_df['max_treated'] == 1) & (regression_df['intheset'] == 1) & (regression_df['firmscope'] < 50)]['max_firmscope'].median()

# Model: firm_ir ~ max_treated + int1c_max + int2_max + triple_int_max + controls (firmscope < median)
filtered_low_scope = grouped.filter(lambda x: len(x) >= 2 and (x['max_firmscope'] < median_max_firmscope).all())
model_subsample_review_1 = smf.ols('firm_ir ~ max_treated + int1c_max + int2_max + triple_int_max + ' +
                                   ' + '.join([f'sz_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'scope_q{i}' for i in range(1, 5)]) + ' + ' +
                                   ' + '.join([f'age_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'firmsz_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'firmage_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'year_dum_{i}' for i in range(1994, 2012)]),
                                   data=filtered_low_scope).fit(cov_type='cluster', cov_kwds={'groups': filtered_low_scope['id']})
print(model_subsample_review_1.summary())
with open('model_subsample_review_1_results.txt', 'w') as f:
    f.write(model_subsample_review_1.summary().as_text())

# Model: firm_ir ~ max_treated + int1c_max + int2_max + triple_int_max + controls (firmscope >= median)
filtered_high_scope = grouped.filter(lambda x: len(x) >= 2 and (x['max_firmscope'] >= median_max_firmscope).all())
model_subsample_review_2 = smf.ols('firm_ir ~ max_treated + int1c_max + int2_max + triple_int_max + ' +
                                   ' + '.join([f'sz_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'scope_q{i}' for i in range(1, 5)]) + ' + ' +
                                   ' + '.join([f'age_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'firmsz_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'firmage_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'year_dum_{i}' for i in range(1994, 2012)]),
                                   data=filtered_high_scope).fit(cov_type='cluster', cov_kwds={'groups': filtered_high_scope['id']})
print(model_subsample_review_2.summary())
with open('model_subsample_review_2_results.txt', 'w') as f:
    f.write(model_subsample_review_2.summary().as_text())
# Save model results

########################################################################################################################################

# Additional preparations for the regressions and final steps of reg1.do
# Subsample regressions as called for by reviewer
model_subsample_review_1 = smf.ols('firm_ir ~ max_treated + int1c_max + int2_max + triple_int_max + ' +
                                   ' + '.join([f'sz_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'scope_q{i}' for i in range(1, 5)]) + ' + ' +
                                   ' + '.join([f'age_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'firmsz_q{i}' for i in range(10)]) + ' + ' +
                                   ' + '.join([f'firmage_q{i}' for i in range(10)]) + ' + ' +
                                   ' + '.join([f'year_dum_{i}' for i in range(1994, 2012)]),
                                   data=regression_df[regression_df['max_firmscope'] < median_max_firmscope]).fit(cov_type='cluster', cov_kwds={'groups': regression_df['id']})
print(model_subsample_review_1.summary())
with open('model_subsample_review_1_results.txt', 'w') as f:
    f.write(model_subsample_review_1.summary().as_text())

model_subsample_review_2 = smf.ols('firm_ir ~ max_treated + int1c_max + int2_max + triple_int_max + ' +
                                   ' + '.join([f'sz_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'scope_q{i}' for i in range(1, 5)]) + ' + ' +
                                   ' + '.join([f'age_q{i}' for i in range(1, 10)]) + ' + ' +
                                   ' + '.join([f'firmsz_q{i}' for i in range(10)]) + ' + ' +
                                   ' + '.join([f'firmage_q{i}' for i in range(10)]) + ' + ' +
                                   ' + '.join([f'year_dum_{i}' for i in range(1994, 2012)]),
                                   data=regression_df[regression_df['max_firmscope'] >= median_max_firmscope]).fit(cov_type='cluster', cov_kwds={'groups': regression_df['id']})
print(model_subsample_review_2.summary())
with open('model_subsample_review_2_results.txt', 'w') as f:
    f.write(model_subsample_review_2.summary().as_text())


  return np.sqrt(np.diag(self.cov_params()))


                            OLS Regression Results                            
Dep. Variable:                     ir   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                -3.451e-17
Date:                Thu, 08 Aug 2024   Prob (F-statistic):               1.00
Time:                        12:14:05   Log-Likelihood:            -1.4710e+05
No. Observations:               94311   AIC:                         2.943e+05
Df Residuals:                   94270   BIC:                         2.947e+05
Df Model:                          40                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -3.29e+10   1.72e+19  -1.91e-09      1.0

  return np.sqrt(np.diag(self.cov_params()))


                            OLS Regression Results                            
Dep. Variable:                     ir   R-squared:                       0.044
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                -9.325e-17
Date:                Thu, 08 Aug 2024   Prob (F-statistic):               1.00
Time:                        12:14:06   Log-Likelihood:            -1.4649e+05
No. Observations:               94311   AIC:                         2.931e+05
Df Residuals:                   94253   BIC:                         2.937e+05
Df Model:                          57                                         
Covariance Type:              cluster                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        1.1e+11        nan        nan



                            OLS Regression Results                            
Dep. Variable:                     ir   R-squared:                       0.044
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                     5.900
Date:                Thu, 08 Aug 2024   Prob (F-statistic):           1.68e-35
Time:                        12:14:10   Log-Likelihood:            -1.4649e+05
No. Observations:               94311   AIC:                         2.931e+05
Df Residuals:                   94253   BIC:                         2.936e+05
Df Model:                          57                                         
Covariance Type:              cluster                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.3405      0.096      3.565

  return np.sum(weights * (model.endog - mean)**2)


ValueError: r_matrix performs f_test for using dimensions that are asymptotically non-normal

**Adjusting the regression: SIMPLIFICATION OF VARIABLES -> Seems to have good effect on the regression**

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
regression_df = pd.read_csv('regression_df.csv')

# Ensure continuous variables are ready
# Assuming 'aum', 'age', 'firmscope', 'firm_age', and 'year' are the original continuous variables

# Prepare the simplified dataset
regression_df['log_aum'] = np.log(regression_df['aum_x'] + 1)  # Log transformation for aum to handle skewness
regression_df['log_age'] = np.log(regression_df['age'] + 1)  # Log transformation for age
regression_df['log_firmscope'] = np.log(regression_df['firmscope'] + 1)  # Log transformation for firm scope
regression_df['log_firm_age'] = np.log(regression_df['firm_age'] + 1)  # Log transformation for firm age

# Select the continuous variables and other relevant variables
simpler_columns = ['TREATED', 'log_aum', 'log_age', 'log_firmscope', 'log_firm_age']

# Create a simplified regression model
model_simple = (smf.ols('ir ~ ' + ' + '.join(simpler_columns), data=regression_df))
                
result = model_simple.fit(cov_type='cluster', cov_kwds={'groups': regression_df['id']})
print(result.summary())
# Save model results
with open('model_simple_results.txt', 'w') as f:
    f.write(result.summary().as_text())

In [None]:
# Select the continuous variables, treatment, and year dummies
year_dummies = [f'year_dum_{i}' for i in range(1994, 2008)]
simpler_columns_with_year = simpler_columns + year_dummies

# Create a regression model including year dummies
model_simple_with_year = smf.ols('ir ~ ' + ' + '.join(simpler_columns_with_year), data=regression_df).fit(cov_type='cluster', cov_kwds={'groups': regression_df['id']})

print(model_simple_with_year.summary())


Simplified other models


In [None]:
print(regression_df.head())

ABSORB MODEL (MODEL2)

In [None]:
from linearmodels import PanelOLS
import pandas as pd

# Ensure 'id' is treated as a categorical variable and 'mydate' is date-like
regression_df['id'] = regression_df['id'].astype('category')

# Convert 'mydate' to a datetime format or ensure it is numeric
if not pd.api.types.is_numeric_dtype(regression_df['mydate']):
    regression_df['mydate'] = pd.to_datetime(regression_df['mydate'], format='%Y%m%d')

# Set the index to be a MultiIndex with 'id' and 'mydate'
regression_df = regression_df.set_index(['id', 'mydate'])

# Define the model formula, dropping one year dummy to avoid multicollinearity
# year_dummies = ' + '.join([f'year_dum_{i}' for i in range(1994, 2011)])  # Dropping 2012 to avoid perfect collinearity

formula = f'ir ~ TREATED + log_aum + log_age + log_firmscope + log_firm_age'

# Fit the fixed effects model
model_absorb = PanelOLS.from_formula(formula, data=regression_df, drop_absorbed=True)
results = model_absorb.fit(cov_type='clustered', cluster_entity=True)

print(results.summary)

# Save model results
with open('model_absorb_results.txt', 'w') as f:
    f.write(results.summary.as_text())


MODEL 3 (Median-related interaction model)

In [None]:
from linearmodels import PanelOLS
import pandas as pd

# Ensure 'id' is treated as a categorical variable and 'mydate' is date-like
regression_df['id'] = regression_df['id'].astype('category')

# Convert 'mydate' to a datetime format or ensure it is numeric
if not pd.api.types.is_numeric_dtype(regression_df['mydate']):
    regression_df['mydate'] = pd.to_datetime(regression_df['mydate'], format='%Y%m%d')

# Set the index to be a MultiIndex with 'id' and 'mydate'
regression_df = regression_df.set_index(['id', 'mydate'])

# Create the interaction term
regression_df['TREATED_related'] = regression_df['TREATED'] * regression_df['int1c']

# Define the model formula, dropping one year dummy to avoid multicollinearity
year_dummies = ' + '.join([f'year_dum_{i}' for i in range(1994, 2011)])  # Dropping 2012 to avoid perfect collinearity

formula = f'ir ~ TREATED + TREATED_related + log_aum + log_age + log_firmscope + log_firm_age'

# Fit the fixed effects model
model_rel_int_simple = PanelOLS.from_formula(formula, data=regression_df, drop_absorbed=True)
results = model_rel_int_simple.fit(cov_type='clustered', cluster_entity=True)

print(results.summary)

# Save model results
with open('model_rel_int_simple_results.txt', 'w') as f:
    f.write(results.summary.as_text())


In [ ]:
# Calculate median max_firmscope
median_max_firmscope = regression_df[(regression_df['max_treated'] == 1) & (regression_df['intheset'] == 1) & (regression_df['firmscope'] < 50)]['max_firmscope'].median()

# Simplified regression model for firmscope < median
filtered_low_scope = regression_df[(regression_df['max_firmscope'] < median_max_firmscope) & (regression_df['firmscope'] < 50)]
model_subsample_review_1_simple = smf.ols('firm_ir ~ max_treated + int1c_max + int2_max + triple_int_max + log_aum + log_age + log_firmscope + log_firm_age + ' +
                                          ' + '.join([f'year_dum_{i}' for i in range(1994, 2012)]),
                                          data=filtered_low_scope).fit(cov_type='cluster', cov_kwds={'groups': filtered_low_scope['id']})

print(model_subsample_review_1_simple.summary())
with open('model_subsample_review_1_simple_results.txt', 'w') as f:
    f.write(model_subsample_review_1_simple.summary().as_text())

# Simplified regression model for firmscope >= median
filtered_high_scope = regression_df[(regression_df['max_firmscope'] >= median_max_firmscope) & (regression_df['firmscope'] < 50)]
model_subsample_review_2_simple = smf.ols('firm_ir ~ max_treated + int1c_max + int2_max + triple_int_max + log_aum + log_age + log_firmscope + log_firm_age + ' +
                                          ' + '.join([f'year_dum_{i}' for i in range(1994, 2012)]),
                                          data=filtered_high_scope).fit(cov_type='cluster', cov_kwds={'groups': filtered_high_scope['id']})

print(model_subsample_review_2_simple.summary())
with open('model_subsample_review_2_simple_results.txt', 'w') as f:
    f.write(model_subsample_review_2_simple.summary().as_text())
