# AFP Project

WRDS data

IMPORTANT: change user name to your own wrds username. Remeber to do DUO 2FA on WRDS: https://wrds-www.wharton.upenn.edu/pages/about/log-in-to-wrds-using-two-factor-authentication/

In [2]:
import wrds
import datetime as dt
import pandas as pd
import numpy as np
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [38]:
if 'conn' in locals():
    print("WRDS connection already open!")
else:
    conn = wrds.Connection(wrds_username='chunnijiang')

# set sample date range
begdate = '01/01/2015'
enddate = '12/30/2022'

WRDS connection already open!


In [40]:
#####################################################
# Calculate Trades, Flows and Turnover Ratios       #
# November 2018                                     #
# Input  - Thomson-Reuters 13F Data (TR-13F)        #
#          S34TYPE3 Holdings data                   #
#          S34TYPE1 data                            #
#          CRSP Stock File data                     #
# Output - TRADES data with detailed Buys & Sales   #
#          by each institution                      #
#          AGGREGATES data with Total Buys & Sales  #
#          Assets, Flows and Turnover Measures      #
#####################################################
# sql to extract data from CRSP
price = conn.raw_sql(f"""
                      select permno, date, cfacpr, cfacshr, shrout, prc, ret
                      from crsp.msf 
                      where date between '{begdate}' and '{enddate}'
                      """, date_cols=['date']) 

# change variable format to int
price[['permno']]= price[['permno']].astype(int)

# get month and quarter-end dates
price['mdate'] = price['date'] + pd.offsets.MonthEnd(0)
price['qdate'] = price['date'] + pd.offsets.QuarterEnd(0)

# calculate adjusted price, total shares and market cap
price['p'] = price['prc'].abs() / price['cfacpr'] # price adjusted
price['tso'] = price['shrout'] * price['cfacshr'] * 1e3 # total shares out adjusted

# Keep only the records with shrout>0
price = price[price['shrout'] > 0]

# keep only relevant columns
price = price[['permno','mdate','qdate','date','cfacshr', 'p', 'tso','ret']]

# create log return for computing quarterly compounded return
price['ret'] = price['ret'].fillna(0)
price['logret'] = np.log(1+price['ret'])

qret = price.groupby(['permno','qdate'])['logret'].sum().reset_index()
qret['qret']=np.exp(qret['logret'])-1

# shift qdate by one quarter to make qret next quarter return
qret['qdate']=qret['qdate']+pd.offsets.QuarterEnd(-1)
qret = qret.drop(columns=['logret'], axis=1)

# Finalize CRSP Subset - Keep Quarterly Observations 
# And Add Forward Quartely Returns
price = price[price['qdate']==price['mdate']]
price = price[['qdate','permno','cfacshr','p','tso']]

price = pd.merge(price, qret, how='left', on=['permno','qdate'])

In [41]:
price.head()

Unnamed: 0,qdate,permno,cfacshr,p,tso,qret
0,2015-03-31,10001,1.0,9.96,10452000.0,0.034137
1,2015-03-31,10025,1.0,55.040001,5083000.0,0.002907
2,2015-03-31,10026,1.0,106.699997,18689000.0,0.040581
3,2015-03-31,10028,1.0,1.49,12254000.0,-0.700201
4,2015-03-31,10032,1.0,40.77,33657000.0,0.076282


In [6]:
######################################
# Step 1                             #
# Merge TR13F S34Type1 & 4Type3 sets #
######################################

# First keep first vintage with holdings data for each rdate-mgrno combination
fst_vint = conn.raw_sql("""
                      select rdate, fdate, mgrno, mgrname
                      from tfn.s34type1 
                      """, date_cols=['rdate','fdate'])

# Keep first vintage with holding data for each mgrno-rdate combo
min_fdate = fst_vint.groupby(['mgrno','rdate'])['fdate'].min().reset_index()

# Merge back with the fst_vint data to keep only the first vintage records
fst_vint = pd.merge(fst_vint, min_fdate, how='inner', on=['mgrno','rdate','fdate'])

# Sort by mgrno and rdate and create lag_rdate to calculate gap
fst_vint = fst_vint.sort_values(['mgrno', 'rdate'])
fst_vint['lag_rdate']=fst_vint.groupby(['mgrno'])['rdate'].shift(1)

fst_vint.dropna(inplace=True)
def diff_q(a: pd.Period, b: pd.Period) -> int:
    return (a - b).n

fst_vint['qtr'] = fst_vint.apply(lambda row: diff_q(row['rdate'].to_period('Q'), row['lag_rdate'].to_period('Q')), axis=1)
fst_vint['qtr'] = fst_vint['qtr'].fillna(-1)

fst_vint['first_report'] = ((fst_vint.qtr.isnull()) | (fst_vint.qtr>=2))
fst_vint = fst_vint.drop(['qtr'],axis=1)

# Last report by manager or missing 13F reports in the next quarter(s)
fst_vint = fst_vint.sort_values(['mgrno','rdate'], ascending=[True, False])
fst_vint['lead_rdate'] = fst_vint.groupby(['mgrno'])['rdate'].shift(1)

# Number of quarters gap between lead_rdate and rdate
fst_vint.dropna(inplace=True)
fst_vint['qtr'] = fst_vint.apply(lambda row: diff_q(row['lead_rdate'].to_period('Q'), row['rdate'].to_period('Q')), axis=1)
fst_vint['qtr'] = fst_vint['qtr'].fillna(-1)

# label last_report flag
fst_vint['last_report'] = ((fst_vint.qtr.isnull()) | (fst_vint.qtr>=2))
fst_vint = fst_vint.drop(['qtr'],axis=1)

fst_vint = fst_vint[(fst_vint['rdate']<=enddate) & (fst_vint['rdate']>=begdate)]\
                    .drop(['lag_rdate','lead_rdate'], axis=1)

In [7]:
fst_vint
# rdate: holdings report date
# fdate: vintage date
# mgrno: institution code

Unnamed: 0,rdate,fdate,mgrno,mgrname,first_report,last_report
387366,2022-09-30,2022-09-30,110.0,"AR ASSET MANAGEMENT, INC.",False,False
380611,2022-06-30,2022-06-30,110.0,"AR ASSET MANAGEMENT, INC.",False,False
373861,2022-03-31,2022-03-31,110.0,"AR ASSET MANAGEMENT, INC.",False,False
367016,2021-12-31,2021-12-31,110.0,"AR ASSET MANAGEMENT, INC.",False,False
361004,2021-09-30,2021-09-30,110.0,"AR ASSET MANAGEMENT, INC.",False,False
...,...,...,...,...,...,...
256417,2016-03-31,2016-03-31,95110.0,PHOENIX/ZWEIG ADVISERS LLC,False,False
252272,2015-12-31,2015-12-31,95110.0,PHOENIX/ZWEIG ADVISERS LLC,False,False
248065,2015-09-30,2015-09-30,95110.0,PHOENIX/ZWEIG ADVISERS LLC,False,False
244059,2015-06-30,2015-06-30,95110.0,PHOENIX/ZWEIG ADVISERS LLC,False,False


In [8]:
######################################
# Step 2                             #
# Extract Holdings and Adjust Shares #
######################################

# Fdate - Vintage Date - is used in shares adjustment

s34type3 = conn.raw_sql(f"""
                      select fdate, mgrno, cusip, shares
                      from tfn.s34type3
                      where fdate between '{begdate}' and '{enddate}'
                      """, date_cols=['fdate']) 


In [9]:
s34type3.head()

Unnamed: 0,fdate,mgrno,cusip,shares
0,2015-03-31,110.0,00206R10,59438.0
1,2015-03-31,110.0,00282410,8700.0
2,2015-03-31,110.0,00287Y10,12200.0
3,2015-03-31,110.0,01717510,1594.0
4,2015-03-31,110.0,02209S10,103906.0


In [10]:
holdings_v1 = pd.merge(fst_vint, s34type3, how='inner', on=['fdate','mgrno'] )
# Map 13F's historical cusip to CRSP's permno information
crsp = conn.raw_sql("""
                    select distinct permno, ncusip
                    from crsp.msenames
                    where ncusip != ''
                    """)

holdings_v2 = pd.merge(holdings_v1, crsp, how='inner', left_on='cusip', right_on='ncusip')
holdings_v2 = holdings_v2.drop(['mgrname','cusip','ncusip'], axis=1)

In [11]:
holdings_v2.head()

Unnamed: 0,rdate,fdate,mgrno,first_report,last_report,shares,permno
0,2022-09-30,2022-09-30,110.0,False,False,8700.0,20482.0
1,2022-06-30,2022-06-30,110.0,False,False,8700.0,20482.0
2,2022-03-31,2022-03-31,110.0,False,False,8700.0,20482.0
3,2021-12-31,2021-12-31,110.0,False,False,8700.0,20482.0
4,2021-09-30,2021-09-30,110.0,False,False,8700.0,20482.0


In [12]:
######################################
# Step 3                             #
# Adjust Shares Using CRSP CFACSHR   #
# Align at Vintage Dates             #
######################################

holdings = pd.merge(holdings_v2, price[['qdate','permno','cfacshr']], \
                    how='inner', left_on=['permno','fdate'], right_on=['permno','qdate'])

# Calculate Adjusted Shares
holdings['shares_adj']=holdings['shares']*holdings['cfacshr']
holdings=holdings.drop(['qdate','cfacshr','fdate'], axis=1)

# Sanity Checks for Duplicates - Ultimately, Should be 0 Duplicates
holdings = holdings.drop_duplicates(subset=['mgrno','permno','rdate'])

# Keep only observations with shares_adj>0
holdings = holdings[holdings['shares_adj']>0]

In [13]:
holdings.head()
# need 2

Unnamed: 0,rdate,mgrno,first_report,last_report,shares,permno,shares_adj
0,2022-09-30,110.0,False,False,8700.0,20482.0,8700.0
1,2022-09-30,185.0,False,False,1119016.0,20482.0,1119016.0
2,2022-09-30,195.0,False,False,243598.0,20482.0,243598.0
3,2022-09-30,205.0,False,False,172542.0,20482.0,172542.0
4,2022-09-30,220.0,False,False,133411.0,20482.0,133411.0


In [14]:
######################################
# Step 4                             #
# Calculate Institutional Trades     #
# Security-by-Security               #
# trade>0 -> Buy vs trade<0 -> Sale  #
# buysale variable for trade types:  #
#     1 = Initiating Buys            #
#     2 = Incremental (Regular) Buys #
#    -1 = Terminating Sales          #
#    -2 = Regular Sales              #
######################################

t1 = holdings.sort_values(['mgrno','permno','rdate'])

# create  phrdate pshares_adj trade and lead lag permno information

# previous holding quarter
t1['phrdate'] = t1.groupby(['mgrno','permno'])['rdate'].shift(1)
# previous quarter shares
t1['pshares_adj'] = t1.groupby(['mgrno','permno'])['shares_adj'].shift(1)
# trade as difference in current and previous quarter shares
t1['trade']=t1['shares_adj'] - t1['pshares_adj']

In [15]:
t1.head()
# pshares_adj: previous adjusted shares
# shares_adj
# trade: change in adjusted shares

Unnamed: 0,rdate,mgrno,first_report,last_report,shares,permno,shares_adj,phrdate,pshares_adj,trade
3892276,2021-03-31,110.0,False,False,3000.0,10104.0,3000.0,NaT,,
3890200,2021-06-30,110.0,False,False,3000.0,10104.0,3000.0,2021-03-31,3000.0,0.0
3888122,2021-09-30,110.0,False,False,3000.0,10104.0,3000.0,2021-06-30,3000.0,0.0
3886028,2021-12-31,110.0,False,False,3000.0,10104.0,3000.0,2021-09-30,3000.0,0.0
3883846,2022-03-31,110.0,False,False,3000.0,10104.0,3000.0,2021-12-31,3000.0,0.0


In [16]:
t1.isnull().sum()

rdate                 0
mgrno                 0
first_report          0
last_report           0
shares                0
permno                0
shares_adj            0
phrdate         4360061
pshares_adj     4360061
trade           4360061
dtype: int64

In [17]:
t1.dropna(inplace=True)

In [18]:
# quarter gap
t1['qtrgap'] = t1.apply(lambda row: diff_q(row['rdate'].to_period('Q'), row['phrdate'].to_period('Q')), axis=1)

# lag permno for determining first permno
t1['lpermno'] = t1['permno'].shift(1)

# lead permno for determining last permno
t1['npermno'] = t1['permno'].shift(-1)

In [33]:
t1.to_parquet('data/holdings.parquet.gzip', compression='gzip')

In [34]:
t1.head()

Unnamed: 0,rdate,mgrno,first_report,last_report,shares,permno,shares_adj,phrdate,pshares_adj,trade,qtrgap,lpermno,npermno,modtrade,buysale
3890200,2021-06-30,110.0,False,False,3000.0,10104.0,3000.0,2021-03-31,3000.0,0.0,1,,10104.0,3000.0,1.0
3888122,2021-09-30,110.0,False,False,3000.0,10104.0,3000.0,2021-06-30,3000.0,0.0,1,10104.0,10104.0,0.0,0.0
3886028,2021-12-31,110.0,False,False,3000.0,10104.0,3000.0,2021-09-30,3000.0,0.0,1,10104.0,10104.0,0.0,0.0
3883846,2022-03-31,110.0,False,False,3000.0,10104.0,3000.0,2021-12-31,3000.0,0.0,1,10104.0,10104.0,0.0,0.0
3881706,2022-06-30,110.0,False,False,3000.0,10104.0,3000.0,2022-03-31,3000.0,0.0,1,10104.0,10107.0,0.0,0.0


In [29]:
import pyarrow.parquet as pq

# Specify the path to your parquet.gzip file
parquet_file_path = 't1.parquet.gzip'

# Read the Parquet file using pyarrow
table = pq.read_table(parquet_file_path)

# Convert the table to a Pandas DataFrame if needed
df = table.to_pandas()

# Now, you can work with the DataFrame (df)
print(df.head())

             rdate  mgrno  first_report  last_report  shares   permno  \
3890200 2021-06-30  110.0         False        False  3000.0  10104.0   
3888122 2021-09-30  110.0         False        False  3000.0  10104.0   
3886028 2021-12-31  110.0         False        False  3000.0  10104.0   
3883846 2022-03-31  110.0         False        False  3000.0  10104.0   
3881706 2022-06-30  110.0         False        False  3000.0  10104.0   

         shares_adj    phrdate  pshares_adj  trade  qtrgap  lpermno  npermno  
3890200      3000.0 2021-03-31       3000.0    0.0       1      NaN  10104.0  
3888122      3000.0 2021-06-30       3000.0    0.0       1  10104.0  10104.0  
3886028      3000.0 2021-09-30       3000.0    0.0       1  10104.0  10104.0  
3883846      3000.0 2021-12-31       3000.0    0.0       1  10104.0  10104.0  
3881706      3000.0 2022-03-31       3000.0    0.0       1  10104.0  10107.0  


In [22]:
# List conditions 
cond1 = (t1.permno!=t1.lpermno)
cond1_1 = (t1.permno!=t1.lpermno) & (~t1.first_report)
cond2_1 = (t1.permno==t1.lpermno) & (~t1.first_report) & (t1.qtrgap==1)
cond2_2 = (t1.permno==t1.lpermno) & (~t1.first_report) & (t1.qtrgap!=1)

# Assign modtrade value based on the conditions listed above
t1['modtrade'] = t1['trade']
t1.loc[cond1, 'modtrade'] = np.nan
t1.loc[cond1_1, 'modtrade'] = t1.loc[cond1_1, 'shares_adj']
t1.loc[cond2_1, 'modtrade'] = t1.loc[cond2_1, 'trade']
t1.loc[cond2_2, 'modtrade'] = t1.loc[cond2_2, 'shares_adj']

# Assign buysale value based on the conditions
t1.loc[cond1_1, 'buysale'] = 1
t1.loc[cond2_1, 'buysale'] = 2 * np.sign(t1.loc[cond2_1, 'trade'])
t1.loc[cond2_2, 'buysale'] = 1.5

# Adjusting rdate for terminating sales records
t1['rdate'] = np.where(t1['buysale']==-1, t1['rdate'] + pd.offsets.QuarterEnd(1), t1['rdate'])

# Focusing on cases of intermediate sales - with gaps > 1 qtr 
# Then need to split transaction into 2
t2=t1[(t1.buysale==1.5)]
t2['rdate'] = t2['phrdate']+pd.offsets.QuarterEnd(1)

t2['buysale'] = -1
t2['modtrade'] = -t2['pshares_adj']

# Go back to modify the t1 dataset with buysale variable labeled as 1.5
t1['buysale'] = np.where(t1['buysale']==1.5, 1, t1['buysale'])

# handle terminating sales
t3 = t1[(t1.permno != t1.npermno) & (t1.last_report == False)]

t3['rdate'] = t3['rdate']+pd.offsets.QuarterEnd(1)
t3['modtrade'] = -t3['shares_adj']
t3['buysale'] = -1

# Append t1 t2 and t3 to create the complete trades output
trades = pd.concat([t1, t2, t3])
trades = trades[(trades.modtrade != 0) & (trades.modtrade.notna()) & (trades.buysale.notna())]
trades = trades[['rdate','mgrno','permno','modtrade','buysale']]\
.rename(columns={'modtrade':'trade'})

In [23]:
######################################
# Step 6                             #
# Calculate Assets, Total Buys, and  #
# Total Sales per Institution Each   #
# Quarter                            #
######################################

# Get Total Assets and Portfolio Returns from Holdings  
# Assuming Buys and Sales Are Executed at Calendar Quarter Ends

_holdings = holdings[['mgrno','permno','rdate','shares_adj']]
_price = price[['permno','qdate','p','qret']]
_assets = pd.merge(_holdings, _price, \
                  how='inner', left_on=['permno','rdate'], right_on=['permno','qdate'])

# create intermediate variables before summing by manager and quarter

# dollar held in each stock per quarter per manager
_assets['hold_per_stock'] = _assets['shares_adj']*_assets['p']/1000000
# dollar held projected to next quarter assuming same shares held
_assets['next_value'] = _assets['shares_adj']*_assets['p']*_assets['qret']
# dollar held curent quarter
_assets['curr_value'] = _assets['shares_adj']*_assets['p']

_assets = _assets.sort_values(['mgrno','rdate'])

# sum values across different stocks by a manager in a quarter
hold_per_stock = _assets.groupby(['mgrno','rdate'])['hold_per_stock'].sum().reset_index()
next_qtr = _assets.groupby(['mgrno','rdate'])['next_value'].sum().reset_index()
curr_qtr = _assets.groupby(['mgrno','rdate'])['curr_value'].sum().reset_index()

# Forward portfolio return
pret = pd.merge(next_qtr, curr_qtr, how='inner', on=['mgrno','rdate'])
pret['pret'] = pret['next_value']/pret['curr_value']
pret = pret[['mgrno','rdate','pret']]

assets = pd.merge(hold_per_stock, pret, how='inner',on=['mgrno','rdate'])

# Total portfolio Assets
assets = assets.rename(columns={'hold_per_stock':'assets'})

# Aggregate Total Buys and Sales per Institution Every Quarter 

# create intermediate variables first
_flows = pd.merge(trades, _price, how='inner', \
                  left_on=['permno','rdate'], right_on=['permno','qdate'])

# dollar amount buys per stock by a manager in a quarter
_flows['tbuys']  = _flows['trade']*(_flows['trade']>0) * _flows['p']/1000000
# dollar amount sells per stock by a manager in a quarter
_flows['tsales'] = (-1)*_flows['trade']*(_flows['trade']<0) * _flows['p']/1000000
# net gain from trades per stock 
_flows['tgain']  = _flows['trade']*_flows['p']*_flows['qret']/1000000

# sum values across different stocks by a manager in a quarter
tbuys  = _flows.groupby(['mgrno','rdate'])['tbuys'].sum().reset_index()
tsales = _flows.groupby(['mgrno','rdate'])['tsales'].sum().reset_index()
tgain  = _flows.groupby(['mgrno','rdate'])['tgain'].sum().reset_index()

ttran = pd.merge(tbuys, tsales, how='inner', on =['mgrno','rdate'])

# Flows dataframe
flows = pd.merge(ttran, tgain, how='inner',on=['mgrno','rdate'])

In [24]:
######################################
# Step 7                             #
# Calculate Net Flows and Turnover   #
######################################

fst_vint = fst_vint.sort_values(['mgrno','rdate'])
fst_vint = fst_vint.drop_duplicates(subset=['mgrno','rdate'])

# inner join fst_vint and assets dataframes then left join with flows
_agg1 = pd.merge(fst_vint, assets, how='inner', on =['mgrno','rdate'])
_agg1 = _agg1.drop(['fdate'], axis=1)
aggregates = pd.merge(_agg1, flows, how='left', on=['mgrno','rdate'])

# assets compound value
aggregates['assets_comp'] = aggregates['assets']*(1+aggregates['pret'])
aggregates = aggregates.sort_values(['mgrno','rdate'])
# lag asset compound value
aggregates['lassets_comp'] = aggregates.groupby(['mgrno'])['assets_comp'].shift(1)
# lag asset value
aggregates['lassets'] = aggregates.groupby(['mgrno'])['assets'].shift(1)

# Trade Returns = Returns on Purchases - Forgone Returns on Sales
aggregates['tgainret'] = aggregates['tgain']/(aggregates['tbuys'] + aggregates['tsales'])
aggregates['netflows'] = aggregates['assets'] - aggregates['lassets_comp']

# Three Types of Turnover Measures

# Carhart (1997) Turnover Definition
aggregates['turnover1'] = \
(aggregates[['tbuys', 'tsales']].min(axis=1)) / (aggregates[['assets', 'lassets']].mean(axis=1))

# Adding Back Net Flows and Redemptions
aggregates['turnover2'] = \
(aggregates[['tbuys', 'tsales']].min(axis=1) + aggregates['netflows'].abs().fillna(0)) \
/ aggregates['lassets']

# or, Alternatively
aggregates['turnover3'] = \
(aggregates['tbuys'].fillna(0)+aggregates['tsales'].fillna(0)\
-aggregates['netflows'].abs().fillna(0))\
/ aggregates['lassets']

# Assign missing values for first_report records
aggregates['netflows']=np.where(aggregates['first_report'], np.nan, aggregates['netflows'])
aggregates['tgainret']=np.where(aggregates['first_report'], np.nan, aggregates['tgainret'])
aggregates['turnover1']=np.where(aggregates['first_report'], np.nan, aggregates['turnover1'])
aggregates['turnover2']=np.where(aggregates['first_report'], np.nan, aggregates['turnover2'])
aggregates['turnover3']=np.where(aggregates['first_report'], np.nan, aggregates['turnover3'])

aggregates = aggregates.drop(['assets_comp', 'lassets_comp', 'lassets'], axis=1)
aggregates.to_parquet('aggregate_df.parquet.gzip', compression='gzip')
####################
# Close connection #
####################
conn.close()

######################################
##########  End of Program  ##########
######################################

In [25]:
aggregates.head()
# need3

Unnamed: 0,rdate,mgrno,mgrname,first_report,last_report,assets,pret,tbuys,tsales,tgain,tgainret,netflows,turnover1,turnover2,turnover3
0,2015-03-31,110.0,"AR ASSET MANAGEMENT, INC.",False,False,230.559108,-0.00058,,,,,,,,
1,2015-06-30,110.0,"AR ASSET MANAGEMENT, INC.",False,False,226.166607,-0.077177,220.872751,0.0,-17.77829,-0.080491,-4.258772,0.0,0.018471,0.939516
2,2015-09-30,110.0,"AR ASSET MANAGEMENT, INC.",False,False,206.563771,0.031058,8.114555,5.910309,-0.986499,-0.070339,-2.148021,0.027316,0.03563,0.052514
3,2015-12-31,110.0,"AR ASSET MANAGEMENT, INC.",False,False,203.341691,0.03711,5.266832,8.467398,-0.931106,-0.067795,-9.637487,0.025698,0.072154,0.019833
4,2016-03-31,110.0,"AR ASSET MANAGEMENT, INC.",False,False,214.631618,0.042093,7.935235,2.598843,0.324678,0.030822,3.743941,0.012435,0.031193,0.033393


In [31]:
aggregates.to_parquet('data/hf_netflow.parquet.gzip', compression='gzip')