Variables:

1. Monthly returns including dividend distributions

ret

2. Share Price 

prc

3. Common Shares Outstanding 

shrout

4. Share code  

shrcd

5. CRSP permanent identifiers (PERMNO and/or PERMCO)

permco 

6. Delisting return 

dlret

7. Date 

date

8. CRSP value‐weighted index returns

vwretd

In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import regex as re
import statsmodels.formula.api as smf

# Read every column as text first; each field is converted explicitly below.
crsp = pd.read_csv('/Users/kailiao/Downloads/crsp.csv', dtype=object)

# Identifier columns become integers.
crsp = crsp.astype({'PERMNO': int, 'PERMCO': int})

# Price, return and share-count columns become numeric; anything that
# cannot be parsed is coerced to NaN.
crsp = crsp.assign(
    **{
        name: pd.to_numeric(crsp[name], errors='coerce')
        for name in ('PRC', 'RET', 'vwretd', 'SHROUT')
    }
)

# Prices can be negative in the raw data; keep only the magnitude.
crsp['PRC'] = crsp['PRC'].abs()

# `date` is a YYYYMMDD string: characters 0-3 are the year, 4-5 the month.
date_text = crsp['date'].str
crsp['year'] = date_text[:4].astype(int)
crsp['month'] = date_text[4:6].astype(int)

# Market capitalisation = shares outstanding x price.
crsp['mrkcap'] = crsp['SHROUT'] * crsp['PRC']

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
# Nine starting years, five years apart: 1973, 1978, ..., 2013.
periods = list(1973 + 5 * np.arange(9))
periods

[1973, 1978, 1983, 1988, 1993, 1998, 2003, 2008, 2013]

In [3]:
# Turn off pandas' chained-assignment warning for the cells that follow.
pd.options.mode.chained_assignment = None

# Keep the three calendar years period, period + 1 and period + 2.
period = 1973
in_window = crsp['year'].between(period, period + 2)
crsp_1 = crsp[in_window]

def kill_nan_ret(df):
    """Zero out ``mrkcap`` when any ``RET`` value in *df* is missing.

    Intended as a ``groupby(...).apply`` callback. The input frame is never
    modified: pandas does not support callbacks that mutate the group they
    are handed, so a modified copy is returned instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``RET`` and ``mrkcap``.

    Returns
    -------
    pandas.DataFrame
        *df* itself when ``RET`` has no missing values; otherwise a copy of
        *df* whose ``mrkcap`` column is 0 on every row.
    """
    if df['RET'].isna().any():
        # assign() builds a new frame, leaving the caller's data untouched.
        return df.assign(mrkcap=0)
    return df

# Firms with any missing return in the window get a market cap of 0, which
# pushes them to the bottom of the size ranking below.
# group_keys=False keeps the original row index on every pandas version;
# otherwise newer releases add PERMNO as an index level next to the PERMNO
# column, and the later groupby('PERMNO') becomes ambiguous.
crsp_1 = crsp_1.groupby('PERMNO', group_keys=False).apply(kill_nan_ret)

# select the largest 500 firms, ranked on each firm's first non-missing mrkcap
largest = list(crsp_1.groupby('PERMNO').first().nlargest(n=500, columns=['mrkcap']).reset_index().PERMNO)

# Regress each firm's return on the value-weighted index return and store the
# slope as that firm's beta. The per-firm frames are collected in a list and
# concatenated once; concatenating inside the loop re-copies all earlier rows
# on every iteration.
firm_frames = []
for firm in largest:
    # .copy() so that adding the beta column does not write into a slice of crsp_1
    crsp_temp = crsp_1[crsp_1['PERMNO'] == firm].copy()
    model = smf.ols("RET ~ vwretd", data=crsp_temp)
    result = model.fit()
    crsp_temp['beta'] = result.params['vwretd']
    firm_frames.append(crsp_temp)
crsp_2 = pd.concat(firm_frames)

# Rank firms by beta (one value per firm) and attach the rank to every row.
beta_rank = crsp_2.groupby('PERMNO')['beta'].first().rank().rename('beta_rank').reset_index()
crsp_2 = crsp_2.merge(beta_rank, on='PERMNO')


In [11]:
# generate portfolios
# Ten buckets of 50 ranks each: beta_rank in (0, 50] -> 1.0,
# (50, 100] -> 2.0, ..., (450, 500] -> 10.0.

grids = list(range(0, 500, 50))
for bucket, grid in enumerate(grids, start=1):
    above_lower = crsp_2["beta_rank"].gt(grid)
    within_upper = crsp_2["beta_rank"].le(grid + 50)
    crsp_2.loc[above_lower & within_upper, 'port'] = float(bucket)

crsp_2

Unnamed: 0,PERMNO,date,SHRCD,TICKER,PERMCO,DLRET,PRC,RET,SHROUT,vwretd,year,month,mrkcap,beta,beta_rank,port
0,12490,19730131,11,IBM,20990,,435.500,0.083333,116171.0,-0.027454,1973,1,5.059247e+07,0.814789,124.0,3.0
1,12490,19730228,11,IBM,20990,,431.500,-0.005970,116171.0,-0.044370,1973,2,5.012779e+07,0.814789,124.0,3.0
2,12490,19730330,11,IBM,20990,,431.500,0.000000,116171.0,-0.007857,1973,3,5.012779e+07,0.814789,124.0,3.0
3,12490,19730430,11,IBM,20990,,408.000,-0.054461,116171.0,-0.051769,1973,4,4.739777e+07,0.814789,124.0,3.0
4,12490,19730531,11,IBM,20990,,314.750,-0.032261,145214.0,-0.024508,1973,5,4.570611e+07,0.814789,124.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,25137,19750829,11,CCLR,800,,15.375,-0.075188,11007.0,-0.023558,1975,8,1.692326e+05,0.879222,153.0,4.0
17996,25137,19750930,11,CCLR,800,,14.375,-0.059837,11007.0,-0.038010,1975,9,1.582256e+05,0.879222,153.0,4.0
17997,25137,19751031,11,CCLR,800,,16.375,0.139130,11007.0,0.055857,1975,10,1.802396e+05,0.879222,153.0,4.0
17998,25137,19751128,11,CCLR,800,,16.375,0.000000,11007.0,0.031222,1975,11,1.802396e+05,0.879222,153.0,4.0


Take the 10 beta‐sub‐samples you formed each period based on the firm-level beta estimates and form 10 value‐weighted monthly portfolios for each three‐year window. Remember, the weights for each monthly return should be based on the previous month's market capitalization. Be sure the weights sum to one each period! You should now have 10 time‐series of monthly beta‐portfolio returns in each period, for a total of $10 \text{ portfolios} \times 36 \text{ months} \times 8 \text{ periods} = 2{,}880$ observations.

In [20]:
def port_weight(df):
    """Return *df* with a ``weight`` column: each row's share of total ``mrkcap``.

    Intended as a ``groupby(...).apply`` callback. The input frame is left
    untouched: pandas does not support callbacks that mutate the group they
    receive, so the weights are attached to a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``mrkcap`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of *df* with ``weight = mrkcap / mrkcap.sum()``. An existing
        ``weight`` column is overwritten in place; otherwise the column is
        appended. If the ``mrkcap`` total is 0 the weights are NaN.
    """
    return df.assign(weight=df['mrkcap'] / df['mrkcap'].sum())

ports = list(np.arange(1, 11))
port = 1

# Weight of each stock within its (portfolio, month) cell: its market cap
# divided by the cell's total market cap, so weights sum to one per cell.
# transform('sum') broadcasts the cell total back onto the original rows and
# leaves the index unchanged. groupby.apply can add `port` and `date` as index
# levels on newer pandas, which makes later groupby(['port', 'date']) calls
# ambiguous.
# NOTE(review): these weights use the same-month market cap, while the task
# text asks for the previous month's market cap - TODO confirm, and if so lag
# `mrkcap` by one month per PERMNO before computing the weights.
cell_total = crsp_2.groupby(['port', 'date'])['mrkcap'].transform('sum')
crsp_2['weight'] = crsp_2['mrkcap'] / cell_total
# weighted return for the stock in a portfolio
crsp_2['portwret'] = crsp_2['RET'] * crsp_2['weight']

Unnamed: 0,PERMNO,date,SHRCD,TICKER,PERMCO,DLRET,PRC,RET,SHROUT,vwretd,year,month,mrkcap,beta,beta_rank,port,weight,portwret
0,12490,19730131,11,IBM,20990,,435.500,0.083333,116171.0,-0.027454,1973,1,5.059247e+07,0.814789,124.0,3.0,0.331231,0.027602
1,12490,19730228,11,IBM,20990,,431.500,-0.005970,116171.0,-0.044370,1973,2,5.012779e+07,0.814789,124.0,3.0,0.337777,-0.002017
2,12490,19730330,11,IBM,20990,,431.500,0.000000,116171.0,-0.007857,1973,3,5.012779e+07,0.814789,124.0,3.0,0.339351,0.000000
3,12490,19730430,11,IBM,20990,,408.000,-0.054461,116171.0,-0.051769,1973,4,4.739777e+07,0.814789,124.0,3.0,0.333336,-0.018154
4,12490,19730531,11,IBM,20990,,314.750,-0.032261,145214.0,-0.024508,1973,5,4.570611e+07,0.814789,124.0,3.0,0.334070,-0.010777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,25137,19750829,11,CCLR,800,,15.375,-0.075188,11007.0,-0.023558,1975,8,1.692326e+05,0.879222,153.0,4.0,0.002696,-0.000203
17996,25137,19750930,11,CCLR,800,,14.375,-0.059837,11007.0,-0.038010,1975,9,1.582256e+05,0.879222,153.0,4.0,0.002683,-0.000161
17997,25137,19751031,11,CCLR,800,,16.375,0.139130,11007.0,0.055857,1975,10,1.802396e+05,0.879222,153.0,4.0,0.002919,0.000406
17998,25137,19751128,11,CCLR,800,,16.375,0.000000,11007.0,0.031222,1975,11,1.802396e+05,0.879222,153.0,4.0,0.002872,0.000000


In [50]:
# Portfolio return per (portfolio, month): the sum of the weighted stock returns.
portret = crsp_2.groupby(['port', 'date'])['portwret'].sum().rename('portret')
portfolios = portret.reset_index()

# Attach the value-weighted index return for each month.
market = crsp_2[['date', 'vwretd']].drop_duplicates()
portfolios = portfolios.merge(market, on='date')
portfolios

Unnamed: 0,port,date,portret,vwretd
0,1.0,19730131,0.005835,-0.027454
1,2.0,19730131,0.006360,-0.027454
2,3.0,19730131,0.014912,-0.027454
3,4.0,19730131,0.004928,-0.027454
4,5.0,19730131,-0.029178,-0.027454
...,...,...,...,...
355,6.0,19751231,-0.007657,-0.010998
356,7.0,19751231,-0.015676,-0.010998
357,8.0,19751231,-0.017735,-0.010998
358,9.0,19751231,-0.013070,-0.010998


In [58]:
def beta_port(df):
    """Return the OLS slope of ``portret`` on ``vwretd`` for the rows of *df*."""
    fitted = smf.ols("portret ~ vwretd", data=df).fit()
    return fitted.params["vwretd"]

# One beta per portfolio, as a two-column frame: port, beta_port.
betas_by_port = portfolios.groupby('port').apply(beta_port)
port_beta = betas_by_port.rename('beta_port').reset_index()

Unnamed: 0,port,beta_port
0,1.0,0.482903
1,2.0,0.714655
2,3.0,0.829689
3,4.0,0.937254
4,5.0,1.029643
5,6.0,1.138358
6,7.0,1.213818
7,8.0,1.330404
8,9.0,1.508009
9,10.0,1.912798


In [94]:
period = 1973
# The five calendar years period + 3 through period + 7.
crsp_t = crsp[(crsp['year'] >= period + 3) & (crsp['year'] < period + 8)]
# select the largest firm as in the training samples
crsp_t = crsp_t[crsp_t['PERMNO'].isin(largest)]
# crsp_2 has one row per firm-month, so merging it directly would replicate
# every row of crsp_t once per estimation month. Reduce it to one
# (port, PERMNO) row per firm first; the final result is unchanged.
firm_port = crsp_2[['port', 'PERMNO']].drop_duplicates()
crsp_t = crsp_t.merge(firm_port, on='PERMNO')
# One row per firm-month, ordered by PERMNO then date.
crsp_t = crsp_t.groupby(['PERMNO', 'date']).first().reset_index()

crsp_t

Unnamed: 0,PERMNO,date,SHRCD,TICKER,PERMCO,DLRET,PRC,RET,SHROUT,vwretd,year,month,mrkcap,port
0,10137,19760130,11,AYP,20045,,19.375,0.047297,27292.0,0.126012,1976,1,528782.50,5.0
1,10137,19760227,11,AYP,20045,,18.000,-0.070968,27292.0,0.007310,1976,2,491256.00,5.0
2,10137,19760331,11,AYP,20045,,17.625,0.001389,27292.0,0.026740,1976,3,481021.50,5.0
3,10137,19760430,11,AYP,20045,,17.750,0.007092,27292.0,-0.010402,1976,4,484433.00,5.0
4,10137,19760528,11,AYP,20045,,18.000,0.014085,27292.0,-0.009326,1976,5,491256.00,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29268,82086,19800829,30,WDRIY,4884,,91.125,0.058055,14082.0,0.023554,1980,8,1283222.25,1.0
29269,82086,19800930,30,WDRIY,4884,,113.375,0.244170,14082.0,0.029498,1980,9,1596546.75,1.0
29270,82086,19801031,30,WDRIY,4884,,107.000,-0.056229,14082.0,0.020095,1980,10,1506774.00,1.0
29271,82086,19801128,30,WDRIY,4884,,112.750,0.053738,14082.0,0.104952,1980,11,1587745.50,1.0


Unnamed: 0,PERMNO,date,SHRCD,TICKER,PERMCO,DLRET,PRC,RET,SHROUT,vwretd,year,month,mrkcap,port
29124,78204,19760130,31,UNONY,4694,,5.0,-0.047619,58100.0,0.126012,1976,1,290500.0,1.0
29125,78204,19760227,31,UNONY,4694,,3.75,-0.25,58100.0,0.00731,1976,2,217875.0,1.0
29126,78204,19760331,31,UNONY,4694,,3.625,0.05484,58100.0,0.02674,1976,3,210612.5,1.0
29127,78204,19760430,31,UNONY,4694,,4.25,0.172414,58100.0,-0.010402,1976,4,246925.0,1.0
29128,78204,19760528,31,UNONY,4694,,3.625,-0.147059,58100.0,-0.009326,1976,5,210612.5,1.0
29129,78204,19760630,31,UNONY,4694,,3.5,-0.034483,58100.0,0.044521,1976,6,203350.0,1.0
29130,78204,19760730,31,UNONY,4694,,3.0,-0.142857,58100.0,-0.006136,1976,7,174300.0,1.0
29131,78204,19760831,31,UNONY,4694,,2.375,-0.208333,58100.0,-0.001398,1976,8,137987.5,1.0
29132,78204,19760930,31,UNONY,4694,,2.5,0.105112,58100.0,0.024513,1976,9,145250.0,1.0
29133,78204,19761029,31,UNONY,4694,,2.875,0.15,58100.0,-0.020372,1976,10,167037.5,1.0


In [82]:
# presumably months x firms x years, the row count expected for the five-year window - TODO confirm
12*500*5

30000

In [70]:
# Row-wise flag: True where the row's PERMNO appears in `largest`.
crsp_t['PERMNO'].isin(largest)

987        False
988        False
989        False
990        False
991        False
           ...  
4228137    False
4228138    False
4228139    False
4228140    False
4228141    False
Name: PERMNO, Length: 310556, dtype: bool

In [71]:
# Display the PERMNO list held in `largest`.
largest

[12490,
 10401,
 11754,
 12079,
 11850,
 14322,
 12060,
 27983,
 14736,
 22592,
 18163,
 11308,
 46309,
 11703,
 15966,
 14541,
 40416,
 22111,
 22752,
 15667,
 19553,
 33099,
 25785,
 12749,
 21768,
 18403,
 50876,
 12570,
 20626,
 47079,
 10890,
 24678,
 37867,
 14357,
 10604,
 18542,
 25013,
 15368,
 26438,
 14277,
 39917,
 13928,
 21004,
 59176,
 13901,
 58827,
 26403,
 21936,
 15659,
 11631,
 43449,
 13696,
 23819,
 12546,
 14090,
 62616,
 59184,
 45604,
 27828,
 18550,
 18374,
 14218,
 11471,
 19393,
 22103,
 16432,
 15579,
 11260,
 22293,
 19414,
 14592,
 13856,
 23369,
 18729,
 26542,
 48071,
 16424,
 41718,
 17953,
 17209,
 32491,
 42892,
 46850,
 26681,
 23915,
 13688,
 21573,
 25267,
 14656,
 22779,
 24109,
 30365,
 15069,
 47061,
 27190,
 20853,
 24563,
 18382,
 47300,
 27887,
 40272,
 25769,
 57681,
 19916,
 40010,
 45081,
 14525,
 59221,
 19845,
 33814,
 66800,
 28353,
 18569,
 18016,
 27051,
 20220,
 48725,
 23341,
 11404,
 28425,
 18411,
 16109,
 20730,
 66333,
 62252,
