# Overview

- This notebook intends to walk through the steps of replicating Table 2.
- `df_combo` is the main sample as shown in Table 1.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import wrds

from load_CRSP_fund import load_CRSP_combined_file
from load_mflink import load_mflink1

import config
# Locate the project output directory and read the main sample
# (`df_combo`) that was saved there as a parquet file.
OUTPUT_DIR = Path(config.OUTPUT_DIR)
path = OUTPUT_DIR / "main_sample.parquet"
df_combo = pd.read_parquet(path)

# Raw inputs for the CRSP step: the combined CRSP fund file and the
# MFLINKS table, both produced by the project's loader helpers.
df_crsp = load_CRSP_combined_file()
df_mflink1 = load_mflink1()

# CRSP Mutual Fund Data

- CRSP data and mflink1 are merged on `crsp_fundno` to obtain the appropriate `wficn`.
- Calculate `mret` and `mtna` for each `wficn`.
- The new CRSP data is then merged with `df_combo` by `year` and `wficn` to get the main sample's monthly returns. 

In [None]:
# Attach `wficn` to every CRSP row via the MFLINKS table.
df_crsp = pd.merge(df_crsp, df_mflink1, on="crsp_fundno", how="inner").reset_index(drop=True)
df_crsp = df_crsp.sort_values(["caldt", "wficn"])

# Missing monthly returns become 0; missing Lipper classes become the string 'None'.
df_crsp = df_crsp.assign(
    mret=df_crsp['mret'].fillna(0),
    lipper_class_name=df_crsp['lipper_class_name'].fillna('None'),
)

# Drop rows whose Lipper class mentions any of the excluded categories.
excluded = df_crsp['lipper_class_name'].astype(str).str.contains(
    'International|Fixed Income|Precious Metal', case=False, regex=True
)
df_crsp = df_crsp[~excluded]

# Collapse to one row per (month, fund, Lipper class): mean return, summed TNA.
fund_keys = ["caldt", "wficn", 'lipper_class_name']
grouped = df_crsp.groupby(fund_keys)
ret = grouped["mret"].mean().rename("crsp_ret").reset_index()
tna = grouped["mtna"].sum().rename("crsp_tna").reset_index()
fund_month = ret.merge(tna, on=fund_keys, how="inner")

# Bring the index-fund flag back in, then de-duplicate the resulting rows.
flags = df_crsp[fund_keys + ['index_fund_flag']]
df_crsp = (
    fund_month.merge(flags, on=fund_keys, how="inner")
    .sort_values(["caldt", "wficn"])
    .drop_duplicates()
    .rename(columns={"caldt": "date"})
)

# Keep only the fund-years that appear in the main sample.
df_crsp['year'] = df_crsp['date'].dt.year.astype('int')
df_crsp = df_crsp.merge(df_combo[['year', 'wficn']], on=["year", "wficn"], how="inner")

# Encode the month as an integer YYYYMM so it can be joined to the factor files.
df_crsp['date'] = df_crsp['date'].dt.strftime('%Y%m').astype('int')
df_crsp

In [None]:
# Two-decimal formatting for on-screen frames and for the LaTeX exports below.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
float_format_func = lambda x: '{:.2f}'.format(x)

# Pretty column labels are applied to a temporary copy only, so `df_crsp`
# keeps its original column names. The earlier rename / rename-back pair did
# not round-trip ('lipper class' vs 'lipper class name'), which left
# `lipper_class_name` renamed for every later cell.
latex_column_labels = {
    'lipper_class_name': 'lipper class',
    'crsp_ret': '$crsp_{ret}$',
    'crsp_tna': '$crsp_{TNA}$',
    'index_fund_flag': 'index fund flag',
}
latexTS_crsp_t2 = (
    df_crsp.tail(10)
    .rename(columns=latex_column_labels)
    .to_latex(float_format=float_format_func)
)

path_to_save = '../output/table_crsp_t2.tex'

with open(path_to_save, 'w') as f:
    f.write(latexTS_crsp_t2)

# Fama French Factors

- Factor returns, `df_ff`, are pulled from Kenneth R. French's website.

In [None]:
# Read the five-factor file (without the risk-free rate) and the momentum
# factor from the manual data folder, then join them on the YYYYMM `date`.
manual_dir = Path(config.DATA_DIR) / 'manual'
five_factors = pd.read_csv(manual_dir / 'F-F_Research_Data_5_Factors_2x3.csv').drop(columns=['RF'])
df_mom = pd.read_csv(manual_dir / 'F-F_Momentum_Factor.csv')
df_ff = pd.merge(five_factors, df_mom, on=['date'], how='inner')

# Restrict to January 1980 - December 2019 (both ends inclusive).
df_ff = df_ff[df_ff['date'].between(198001, 201912)]
df_ff

- CRSP data and factor returns are merged, and for each fund i in month t, $flow_{i,t}$ is calculated using the formula:

$$
\text{flow}_{i,t} = \frac{\text{TNA}_{i,t}}{\text{TNA}_{i,t-1}} - (1 + \text{ret}_{i,t})
$$

In [None]:
# Join each fund-month (through December 2019) to that month's factor returns.
# A left merge keeps every CRSP row and adds no factor-only rows; rows without
# a fund id are dropped explicitly because flows are computed per fund.
df_reg = (
    pd.merge(df_crsp[df_crsp['date'] <= 201912], df_ff, on=['date'], how="left")
    .dropna(subset=['wficn'])
    .sort_values(['wficn', 'date'])
)

# flow_t = TNA_t / TNA_{t-1} - (1 + ret_t), where the lag is taken within each
# fund. `groupby().shift()` stays aligned with `df_reg`'s index, so no
# reshaping or re-merging of the result is needed.
prev_tna = df_reg.groupby('wficn')['crsp_tna'].shift(1)
df_reg['flow'] = df_reg['crsp_tna'] / prev_tna - (1 + df_reg['crsp_ret'])

# Express returns and flows in percent, matching the scale of the factor files.
df_reg[['crsp_ret', 'flow']] *= 100

# Undefined flows (first month of a fund, zero lagged TNA) and any other
# missing values are set to 0.
df_reg = df_reg.replace([np.inf, -np.inf], np.nan).fillna(0)
df_reg



In [None]:
 latexTS_df_reg = df_reg.tail(10).to_latex(float_format = float_format_func)

path_to_save = f'../output/table_fama_french_t2.tex'

with open(path_to_save, 'w') as f: 
    f.write(latexTS_df_reg) 

# Reporting the mean, std, and percentiles of factor betas across all funds

- To replicate Panel A of Table 2, for each fund i in month t, we run the following rolling time-series regression:
$$
\text{ret}_{i,t+1-k} = \alpha_{i,t} + \beta_{\text{MKT} i,t} \times \text{MKT}_{t+1-k} + \beta_{\text{HML} i,t} \times \text{HML}_{t+1-k} + \beta_{\text{SMB} i,t} \times \text{SMB}_{t+1-k} + \beta_{\text{MOM} i,t} \times \text{MOM}_{t+1-k} + \beta_{\text{CMA} i,t} \times \text{CMA}_{t+1-k} + \beta_{\text{RMW} i,t} \times \text{RMW}_{t+1-k} + \beta_{\text{flow} i,t} \times \text{flow}_{i,t+1-k} + \epsilon_{i,t,t+1-k}
$$
where k = 1,2,...,60.

- We require each fund to have at least 60 months of return data, and each rolling window to contain the 24 monthly observations needed to run the regression.

In [None]:
from sklearn.linear_model import LinearRegression

def regression(df, sample_len=60, window_len=24):
    """Estimate rolling-window factor betas for every fund in ``df``.

    For each ``wficn`` group with at least ``sample_len`` rows, every run of
    ``sample_len`` consecutive rows is taken as a sample, and within each
    sample an OLS regression of ``crsp_ret`` on the seven regressors is fitted
    on every run of ``window_len`` consecutive rows. One row of slope
    coefficients is emitted per (sample, window) pair, in that nesting order,
    so a window that belongs to several overlapping samples appears once for
    each of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``wficn``, ``crsp_ret`` and the regressor columns
        ``Mkt-RF``, ``SMB``, ``HML``, ``MOM``, ``CMA``, ``RMW``, ``flow``.
        Rows are used in the order in which they appear inside each fund.
    sample_len : int, default 60
        Minimum number of rows a fund needs, and the length of each sample.
    window_len : int, default 24
        Number of rows in each regression window.

    Returns
    -------
    pandas.DataFrame
        Float frame with one column per regressor and one row per fitted
        (sample, window) pair; empty (with the same columns) when no fund
        qualifies. Rows carry a default integer index.
    """
    factor_cols = ['Mkt-RF', 'SMB', 'HML', 'MOM', 'CMA', 'RMW', 'flow']
    windows_per_sample = sample_len - window_len + 1
    coef_rows = []

    for _, data in df.groupby('wficn'):
        n_obs = len(data)
        if n_obs < sample_len:
            continue

        # The same window is reached from several overlapping samples; fit it
        # once per starting row and reuse the coefficients afterwards.
        fitted = {}
        for sample_start in range(n_obs - sample_len + 1):
            for offset in range(windows_per_sample):
                start = sample_start + offset
                if start not in fitted:
                    window = data.iloc[start:start + window_len]
                    model = LinearRegression().fit(window[factor_cols], window[['crsp_ret']])
                    fitted[start] = model.coef_.reshape(-1)
                coef_rows.append(fitted[start])

    # Build the frame once instead of concatenating inside the loop, which is
    # quadratic and relies on deprecated empty-frame concat behaviour.
    return pd.DataFrame(coef_rows, columns=factor_cols, dtype=float)

In [None]:
# Work on a reproducible 30% subsample to keep the rolling regressions
# tractable. Whole funds are sampled rather than individual rows: sampling
# rows shuffles and thins each fund's history, so a "rolling window" would no
# longer be a run of consecutive months.
sampled_funds = pd.Series(df_reg['wficn'].unique()).sample(frac=0.3, random_state=42)
df_reg = df_reg[df_reg['wficn'].isin(sampled_funds)].sort_values(['wficn', 'date'])

all_funds = regression(df_reg)

# Panel A: mean, standard deviation and the 5/25/50/75/95th percentiles of
# each beta. `describe(percentiles=...)` yields every statistic in one pass
# and avoids `DataFrame.append`, which was removed in pandas 2.0.
panelA = (
    all_funds.astype(float)
    .describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
    .loc[['mean', 'std', '5%', '25%', '50%', '75%', '95%']]
    .rename(index={'5%': 'P5', '25%': 'P25', '50%': 'P50', '75%': 'P75', '95%': 'P95'})
)
panelA

In [None]:
# Write Panel A to disk as a LaTeX table with two-decimal formatting.
path_to_save = f'../output/table_panelA.tex'
latexTS_panelA = panelA.to_latex(float_format=float_format_func)

with open(path_to_save, 'w') as f:
    f.write(latexTS_panelA)

# Reporting the mean factor betas by Lipper mutual fund classifications

- To replicate Panel B of Table 2, we classify funds according to `lipper_class_name`, and then run the regressions again. 

In [None]:
# Case-insensitive keyword match on the Lipper class name; one subset per keyword.
lipper_names = df_reg['lipper_class_name'].astype(str)

df_growth = df_reg[lipper_names.str.contains('Growth', case=False, regex=True)]
df_value = df_reg[lipper_names.str.contains('Value', case=False, regex=True)]
df_base = df_reg[lipper_names.str.contains('Base', case=False, regex=True)]
df_large_cap = df_reg[lipper_names.str.contains('Large-Cap', case=False, regex=True)]
df_mid_cap = df_reg[lipper_names.str.contains('Mid-Cap', case=False, regex=True)]
df_small_cap = df_reg[lipper_names.str.contains('Small-Cap', case=False, regex=True)]

In [None]:
# Re-run the rolling regressions on every Lipper subset, in the same order as before.
growth, value, base, large_cap, mid_cap, small_cap = (
    regression(subset)
    for subset in (df_growth, df_value, df_base, df_large_cap, df_mid_cap, df_small_cap)
)

# Panel B: one row of mean betas per group (the `base` subset is not reported).
mean_betas_by_class = {
    'All': all_funds.mean(),
    'Growth': growth.mean(),
    'Value': value.mean(),
    'Large cap': large_cap.mean(),
    'Medium cap': mid_cap.mean(),
    'Small cap': small_cap.mean(),
}
panelB = pd.DataFrame(mean_betas_by_class).T
panelB

In [None]:
# Write Panel B to disk as a LaTeX table with two-decimal formatting.
path_to_save = f'../output/table_panelB.tex'
latexTS_panelB = panelB.to_latex(float_format=float_format_func)

with open(path_to_save, 'w') as f:
    f.write(latexTS_panelB)

# Reporting the mean factor betas by index fund status
- To replicate Panel C of Table 2, we classify funds according to index fund status.
- `index_fund_flag` identifies if a fund is an index fund:
- B = index-based fund
- D = pure index fund
- E = index fund enhanced

In [None]:
# Case-insensitive match on the index-fund flag (B, D or E, as listed above).
flag_text = df_reg['index_fund_flag'].astype(str)
is_any_index = flag_text.str.contains('D|B|E', case=False, regex=True)

df_index = df_reg[is_any_index]
df_enhanced = df_reg[flag_text.str.contains('E', case=False, regex=True)]
df_base = df_reg[flag_text.str.contains('B', case=False, regex=True)]
df_pure = df_reg[flag_text.str.contains('D', case=False, regex=True)]
df_non_index = df_reg[~is_any_index]

In [None]:
# Rolling regressions for each index-fund category, in the same order as before.
index, enhanced, base, pure, non_index = (
    regression(subset)
    for subset in (df_index, df_enhanced, df_base, df_pure, df_non_index)
)

# Panel C: one row of mean betas per index-fund category.
mean_betas_by_index_status = {
    'All index funds': index.mean(),
    'Enhanced': enhanced.mean(),
    'Base': base.mean(),
    'Pure': pure.mean(),
    'All non-index funds': non_index.mean(),
}
panelC = pd.DataFrame(mean_betas_by_index_status).T
panelC

# Recalculation using data from January 2020 to the present

In [12]:
# Reload the factor files and keep January 2020 onwards.
df_ff = pd.read_csv(Path(config.DATA_DIR)/'manual'/'F-F_Research_Data_5_Factors_2x3.csv').drop(['RF'], axis=1)
df_mom = pd.read_csv(Path(config.DATA_DIR)/'manual'/'F-F_Momentum_Factor.csv')
df_ff = df_ff.merge(df_mom, how='inner', on=['date'])
df_ff = df_ff[df_ff['date'] >= 202001]

# Join each fund-month from January 2020 onwards to that month's factor
# returns. A left merge keeps every CRSP row and adds no factor-only rows;
# rows without a fund id are dropped explicitly because flows are per fund.
df_reg = (
    pd.merge(df_crsp[df_crsp['date'] >= 202001], df_ff, on=['date'], how="left")
    .dropna(subset=['wficn'])
    .sort_values(['wficn', 'date'])
)

# flow_t = TNA_t / TNA_{t-1} - (1 + ret_t), where the lag is taken within each
# fund. `groupby().shift()` stays aligned with `df_reg`'s index, so no
# reshaping or re-merging of the result is needed.
prev_tna = df_reg.groupby('wficn')['crsp_tna'].shift(1)
df_reg['flow'] = df_reg['crsp_tna'] / prev_tna - (1 + df_reg['crsp_ret'])

# Express returns and flows in percent, matching the scale of the factor files.
df_reg[['crsp_ret', 'flow']] *= 100

# Undefined flows (first month of a fund, zero lagged TNA) and any other
# missing values are set to 0.
df_reg = df_reg.replace([np.inf, -np.inf], np.nan).fillna(0)

# NOTE(review): `regression` only uses funds with at least 60 rows. If this
# window spans fewer than 60 calendar months, only funds with several rows per
# month can qualify - confirm that this is intended.
df_reg

Unnamed: 0,date,wficn,lipper_class_name,crsp_ret,crsp_tna,index_fund_flag,year,Mkt-RF,SMB,HML,RMW,CMA,MOM,flow
0,202001,100003.0,Mid-Cap Core Funds,-2.142260,2345.1,0,2020,-0.11,-4.38,-6.25,-1.17,-2.32,5.98,0.000000
3901,202002,100003.0,Mid-Cap Core Funds,-8.212024,2148.4,0,2020,-8.13,0.04,-3.81,-1.47,-2.51,-0.40,-0.175678
7822,202003,100003.0,Mid-Cap Core Funds,-20.235685,1692.2,0,2020,-13.39,-8.28,-13.87,-1.57,1.24,7.97,-0.998722
11748,202004,100003.0,Mid-Cap Core Funds,13.859613,1923.3,0,2020,13.65,2.56,-1.33,2.72,-1.00,-5.23,-0.202834
15677,202005,100003.0,Mid-Cap Core Funds,8.487728,2063.6,0,2020,5.58,1.97,-4.88,0.95,-3.26,0.43,-1.192975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175147,202308,612137.0,Multi-Cap Growth Funds,-6.721006,10.8,0,2023,-2.39,-3.65,-1.06,3.43,-2.37,3.77,-0.175546
178965,202309,612137.0,Multi-Cap Growth Funds,-8.267921,9.9,0,2023,-5.24,-1.80,1.52,1.86,-0.83,0.26,-0.065413
182784,202310,612137.0,Multi-Cap Growth Funds,-9.767911,8.9,0,2023,-3.19,-4.04,0.19,2.46,-0.66,1.73,-0.333099
186603,202311,612137.0,Multi-Cap Growth Funds,14.954149,10.3,0,2023,8.84,-0.12,1.64,-3.91,-1.00,2.75,0.776188


In [13]:
all_funds_pre = regression(df_reg)

# Same layout as Panel A: mean, standard deviation and the 5/25/50/75/95th
# percentiles of each beta. `describe(percentiles=...)` replaces the chain of
# `DataFrame.append` calls, which was removed in pandas 2.0.
panelA_pre = (
    all_funds_pre.astype(float)
    .describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
    .loc[['mean', 'std', '5%', '25%', '50%', '75%', '95%']]
    .rename(index={'5%': 'P5', '25%': 'P25', '50%': 'P50', '75%': 'P75', '95%': 'P95'})
)
panelA_pre

  panelA_pre = all_funds_pre.describe().loc[['mean', 'std']].append(all_funds_pre.quantile(0.05)).append(all_funds_pre.describe().loc[['25%', '50%', '75%']]).append(all_funds_pre.quantile(0.95))
  panelA_pre = all_funds_pre.describe().loc[['mean', 'std']].append(all_funds_pre.quantile(0.05)).append(all_funds_pre.describe().loc[['25%', '50%', '75%']]).append(all_funds_pre.quantile(0.95))
  panelA_pre = all_funds_pre.describe().loc[['mean', 'std']].append(all_funds_pre.quantile(0.05)).append(all_funds_pre.describe().loc[['25%', '50%', '75%']]).append(all_funds_pre.quantile(0.95))


Unnamed: 0,Mkt-RF,SMB,HML,MOM,CMA,RMW,flow
mean,0.838474,0.261014,0.00759,-0.093572,0.009176,0.060307,0.000462
std,0.713365,1.236849,1.121301,0.796588,1.370855,0.94174,0.009823
P5,0.050474,-0.487254,-0.496928,-0.597806,-0.653491,-0.566958,-0.00642
P25,0.763041,-0.038289,-0.088519,-0.212292,-0.161232,-0.098061,-0.0006
P50,0.918895,0.19919,0.047543,-0.071579,0.021339,0.049314,3e-06
P75,1.017338,0.468853,0.179727,0.042161,0.2166,0.181903,0.000958
P95,1.201752,0.979487,0.440373,0.353255,0.736844,0.603689,0.007302


In [14]:
# Case-insensitive keyword match on the Lipper class name; one subset per keyword.
lipper_names = df_reg['lipper_class_name'].astype(str)

df_growth = df_reg[lipper_names.str.contains('Growth', case=False, regex=True)]
df_value = df_reg[lipper_names.str.contains('Value', case=False, regex=True)]
df_base = df_reg[lipper_names.str.contains('Base', case=False, regex=True)]
df_large_cap = df_reg[lipper_names.str.contains('Large-Cap', case=False, regex=True)]
df_mid_cap = df_reg[lipper_names.str.contains('Mid-Cap', case=False, regex=True)]
df_small_cap = df_reg[lipper_names.str.contains('Small-Cap', case=False, regex=True)]

# Rolling regressions for every subset, in the same order as before.
growth_pre, value_pre, base_pre, large_cap_pre, mid_cap_pre, small_cap_pre = (
    regression(subset)
    for subset in (df_growth, df_value, df_base, df_large_cap, df_mid_cap, df_small_cap)
)

# One row of mean betas per group (the `base_pre` subset is not reported).
mean_betas_by_class_pre = {
    'All': all_funds_pre.mean(),
    'Growth': growth_pre.mean(),
    'Value': value_pre.mean(),
    'Large cap': large_cap_pre.mean(),
    'Medium cap': mid_cap_pre.mean(),
    'Small cap': small_cap_pre.mean(),
}
panelB_pre = pd.DataFrame(mean_betas_by_class_pre).T
panelB_pre

Unnamed: 0,Mkt-RF,SMB,HML,MOM,CMA,RMW,flow
All,0.838474,0.261014,0.00759,-0.093572,0.009176,0.060307,0.000462
Growth,1.047901,0.373294,-0.202189,-0.004929,-0.064706,-0.18359,-0.000438
Value,,,,,,,
Large cap,0.960899,-0.072918,0.040981,-0.039026,0.055797,0.053435,0.002944
Medium cap,,,,,,,
Small cap,0.982935,0.657577,0.042552,-0.049937,-0.102132,0.026979,-0.002335


In [15]:
# Case-insensitive match on the index-fund flag (B, D or E).
flag_text = df_reg['index_fund_flag'].astype(str)
is_any_index = flag_text.str.contains('D|B|E', case=False, regex=True)

df_index = df_reg[is_any_index]
df_enhanced = df_reg[flag_text.str.contains('E', case=False, regex=True)]
df_base = df_reg[flag_text.str.contains('B', case=False, regex=True)]
df_pure = df_reg[flag_text.str.contains('D', case=False, regex=True)]
df_non_index = df_reg[~is_any_index]

# Rolling regressions for each index-fund category, in the same order as before.
index_pre, enhanced_pre, base_pre, pure_pre, non_index_pre = (
    regression(subset)
    for subset in (df_index, df_enhanced, df_base, df_pure, df_non_index)
)

# One row of mean betas per index-fund category.
mean_betas_by_index_status_pre = {
    'All index funds': index_pre.mean(),
    'Enhanced': enhanced_pre.mean(),
    'Base': base_pre.mean(),
    'Pure': pure_pre.mean(),
    'All non-index funds': non_index_pre.mean(),
}
panelC_pre = pd.DataFrame(mean_betas_by_index_status_pre).T
panelC_pre

Unnamed: 0,Mkt-RF,SMB,HML,MOM,CMA,RMW,flow
All index funds,0.67792,0.206927,-0.019068,-0.069861,0.243898,0.044582,0.002805
Enhanced,-0.055148,0.261547,-0.286657,0.020898,0.206939,-0.015011,0.006991
Base,0.983586,0.038352,0.051423,-0.022033,0.305585,0.165105,-0.004597
Pure,1.031657,0.274066,0.082868,-0.046547,0.276652,0.005352,0.006511
All non-index funds,0.889793,0.217524,0.052294,-0.078463,0.014563,0.047907,0.000457


In [None]:
# Write Panel C (the table stored in `panelC`) to disk as a LaTeX table.
path_to_save = f'../output/table_panelC.tex'
latexTS_panelC = panelC.to_latex(float_format=float_format_func)

with open(path_to_save, 'w') as f:
    f.write(latexTS_panelC)