In [74]:
import numpy as np
import pandas as pd
import datetime as dt
import os
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import statsmodels.api as sm
from scipy.stats.mstats import winsorize

In [75]:
os.chdir(r'/Users/mac/Desktop/Asset Pricing/A shares market/beta/data')
ret = pd.read_csv(os.path.join('TRD_Month.csv'))
daily_data = pd.read_csv(os.path.join('TRD_daily.csv'))
#code = pd.read_csv(os.path.join('Acode.csv'))
ff5 = pd.read_csv(os.path.join('fivefactor_monthly.csv')) #数据来源：央财因子数据库
mktcap = pd.read_csv(os.path.join('fivefactor_daily.csv'))

In [76]:
#定义获取收益和因子数据年份与月份的函数，便于后续处理

def get_month(table,key):
    table[key] = pd.to_datetime(table[key])
    table['year'] = table[key].dt.year
    table['month'] = table[key].dt.month
    return table

def get_month2(table,key):
    table[key] = pd.to_datetime(table[key],format = '%Y%m')
    table['year'] = table[key].dt.year
    table['month'] = table[key].dt.month
    return table

In [77]:
ret.head(10)

Unnamed: 0,Stkcd,Trdmnt,Opndt,Mopnprc,Clsdt,Mclsprc,Mnshrtrd,Mnvaltrd,Msmvosd,Msmvttl,Ndaytrd,Mretwd,Mretnd,Markettype,Capchgdt,Ahshrtrd_M,Ahvaltrd_M
0,2,1991-01,29,14.58,30,14.51,10000,145000.0,406280.0,598489.62,2,,,4,1991-01-29,,
1,2,1991-02,4,14.66,28,15.09,878500,11065000.0,422520.0,622412.7,13,0.039972,0.039972,4,1991-01-29,,
2,2,1991-03,1,15.01,29,13.18,174500,2996000.0,369040.0,543631.51,21,-0.126574,-0.126574,4,1991-01-29,,
3,2,1991-04,2,13.11,30,11.65,44500,560000.0,326200.0,480524.06,18,-0.116085,-0.116085,4,1991-01-29,,
4,2,1991-05,2,11.59,27,11.29,1550500,17972000.0,316120.0,465675.24,17,-0.030901,-0.030901,4,1991-01-29,,
5,2,1991-06,10,7.5,28,6.55,1626500,10941000.0,370194.03,504156.83,15,-0.303809,-0.303809,4,1991-06-08,,
6,2,1991-07,1,6.6,31,6.15,4955000,24981050.0,347586.76,473368.63,27,-0.061069,-0.061069,4,1991-06-08,,
7,2,1991-08,1,6.15,31,6.3,2725000,15601200.0,356064.49,484914.21,26,0.02439,0.02439,4,1991-06-08,,
8,2,1991-09,2,6.3,30,4.7,4362600,14893175.0,265635.41,361761.39,26,-0.253968,-0.253968,4,1991-06-08,,
9,2,1991-10,3,6.4,31,16.9,10678500,129485325.0,955157.12,1300801.6,25,2.595745,2.595745,4,1991-06-08,,


In [78]:
#计算月度超额收益
'''
**1997-2023月度数据，97年开始的原因是因为，我国A股市场96年实行涨跌停板机制，为了避免影响故从97年开始
**全A股收益率（包含上证A，深A，科创板，创业板）

Trdmnt:股票收益时间，CSMAR字段
trdmn:无风险收益时间，CSMAR字段
'''
ret1 = get_month(ret,'Trdmnt') #获取股票月度收益时间
ff5 = get_month2(ff5,'trdmn') #从因子数据库 ff5 中获取无风险收益的时间

#筛选1997-2023的股票
def time(df,t1,t2):
    df = df[(df['year']>=t1)&(df['year']<=t2)]
    return df
ret1 = time(ret1,1997,2023)
ff5 = time(ff5,1997,2023)
ff = ff5['mkt_rf']
ff.index = ff5['trdmn']

#筛选A股，因为原数据中包含了除主板、创业板和科创板以外的股票，因此需要筛选剔除
def filt(df,x1,x2,x3,x4):
    df = df[(df['Markettype'] == x1)|(df['Markettype'] == x2)|(df['Markettype'] == x3)|(df['Markettype'] == x4)]
    return df

ret2 = filt(ret1,1,4,16,32)

#合并收益率数据计算超额收益

def data(inx,col,value): 
    temp = ff5[['year','month','mkt_rf','rf']]
    df = pd.merge(ret2,temp,on = ['year','month'])   #两个表因为交易时间问题，时间并不一致所以通过年份和月份进行合并 
    df['rt'] = df['Mretnd'] - df['rf']
    month_data = pd.pivot(df,index=inx,columns=col,values=value)
    month_data['month_num'] = (month_data.index.year-1997)*12+month_data.index.month   
    return month_data

month_data = data('Trdmnt','Stkcd','rt')
# data1 = month_data.shift(-1) #这里是为FM回归做的数据准备，可以之后再进行修改
# reg = data1.T  #rt+1
# reg.columns = beta_1m.columns

In [79]:
month_data.head(10)

Stkcd,2,4,6,7,8,9,10,11,12,14,...,688788,688789,688793,688798,688799,688800,688819,688981,689009,month_num
Trdmnt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1997-01-01,0.061791,0.067557,0.091844,0.33626,0.170058,0.050418,0.069247,0.081369,0.092707,0.151873,...,,,,,,,,,,1
1997-02-01,-0.006022,0.146626,-0.047577,0.123978,0.106972,-0.008762,0.168978,0.024977,-0.01047,0.143126,...,,,,,,,,,,2
1997-03-01,0.278414,0.161546,0.25476,0.435718,0.41276,0.195901,0.500361,0.223377,0.178965,0.252322,...,,,,,,,,,,3
1997-04-01,0.406234,-0.225929,0.313102,0.323901,-0.113357,-0.127165,0.04765,-0.134645,0.124446,-0.201503,...,,,,,,,,,,4
1997-05-01,-0.119927,0.032554,-0.072398,-0.125637,-0.191393,-0.112654,-0.225861,-0.1765,-0.166129,0.260156,...,,,,,,,,,,5
1997-06-01,0.349209,0.125407,0.167037,0.149504,0.333461,0.007078,0.095353,0.070419,0.013041,0.052802,...,,,,,,,,,,6
1997-07-01,-0.10939,-0.176477,-0.08847,-0.18916,-0.018878,-0.215792,-0.035663,-0.222553,-0.142744,-0.189993,...,,,,,,,,,,7
1997-08-01,-0.212124,0.141619,0.166726,-0.104397,-0.152069,0.055796,-0.141091,-0.071401,-0.141185,-0.114281,...,,,,,,,,,,8
1997-09-01,-0.175434,-0.190372,-0.142143,-0.192208,-0.223887,-0.119424,-0.088807,-0.155466,-0.162309,-0.22004,...,,,,,,,,,,9
1997-10-01,0.400493,0.138483,0.177161,0.269687,0.159739,0.084541,0.529753,0.367357,0.29623,0.412591,...,,,,,,,,,,10


In [80]:
daily_data.head(10)

Unnamed: 0.1,Unnamed: 0,Stkcd,Trddt,Opnprc,Hiprc,Loprc,Clsprc,Dnshrtrd,Dnvaltrd,Dsmvosd,...,Markettype,Capchgdt,Trdsta,Ahshrtrd_D,Ahvaltrd_D,PreClosePrice,ChangeRatio,LimitDown,LimitUp,LimitStatus
0,0,2,1997-01-02,10.41,10.78,10.3,10.45,2919324,30568143.2,1808900.53,...,4,1996-08-06,1,,,10.47,-0.00191,9.42,11.52,0.0
1,1,2,1997-01-03,10.6,11.2,10.38,10.5,6157916,66558878.15,1817555.55,...,4,1996-08-06,1,,,10.45,0.004785,9.41,11.5,0.0
2,2,2,1997-01-06,10.4,10.5,9.9,9.9,2973721,30129517.64,1713695.24,...,4,1996-08-06,1,,,10.5,-0.057143,9.45,11.55,0.0
3,3,2,1997-01-07,9.7,10.44,9.36,10.24,3794509,38103638.09,1772549.42,...,4,1996-08-06,1,,,9.9,0.034343,8.91,10.89,0.0
4,4,2,1997-01-08,10.24,10.25,9.95,10.04,1845936,18558994.5,1737929.31,...,4,1996-08-06,1,,,10.24,-0.019531,9.22,11.26,0.0
5,5,2,1997-01-09,10.1,10.45,10.1,10.29,2338071,24052645.76,1781204.44,...,4,1996-08-06,1,,,10.04,0.0249,9.04,11.04,0.0
6,6,2,1997-01-10,10.4,10.41,10.1,10.29,1889624,19433797.19,1781204.44,...,4,1996-08-06,1,,,10.29,0.0,9.26,11.32,0.0
7,7,2,1997-01-13,10.45,10.5,10.3,10.41,2148719,22386630.89,1801976.51,...,4,1996-08-06,1,,,10.29,0.011662,9.26,11.32,0.0
8,8,2,1997-01-14,10.35,10.68,10.2,10.4,2490413,26081140.1,1800245.5,...,4,1996-08-06,1,,,10.41,-0.000961,9.37,11.45,0.0
9,9,2,1997-01-15,10.35,10.45,10.25,10.36,1410712,14605277.9,1793321.48,...,4,1996-08-06,1,,,10.4,-0.003846,9.36,11.44,0.0


In [81]:
#修改daily_data中收益列的名称，Dretwd改为rt
daily_data = daily_data.rename(columns={'Dretwd': 'rt'})

In [82]:
#修改daily_data中的日期列的名称，Trddt改为date
daily_data = daily_data.rename(columns={'Trddt': 'date'})

In [83]:
mktcap.head(10)

Unnamed: 0,trddy,mkt_rf,smb,hml,umd,rmw,cma,rf,smb_equal,hml_equal,umd_equal,rmw_equal,cma_equal
0,1994-01-04,-0.00395,0.000329,0.003827,0.010374,-0.004755,-0.014706,0.000285,0.002883,0.00537,0.008679,-0.007093,-0.019481
1,1994-01-05,0.007166,0.01185,0.012679,0.009254,-0.02033,-0.006645,0.000285,0.01178,0.013751,0.012208,-0.017225,-4.5e-05
2,1994-01-06,0.028537,0.012046,0.005722,-0.000346,-0.002358,0.008642,0.000285,0.008664,0.00176,-0.004893,-0.004767,-0.000838
3,1994-01-07,-0.004087,0.009666,0.005175,-0.027683,0.003846,0.00441,0.000285,0.011543,0.007985,-0.028359,0.000227,0.001732
4,1994-01-10,0.002382,0.010774,0.018843,-0.000999,-0.031611,-0.031871,0.000285,0.009054,0.014788,0.001511,-0.02559,-0.01048
5,1994-01-11,-0.012748,-0.003695,-0.005612,0.012066,0.007814,0.00395,0.000285,-0.004199,-0.002248,0.008111,0.00217,-0.002964
6,1994-01-12,-0.005325,0.003344,0.003427,0.002793,0.002738,-0.002812,0.000285,0.007745,0.004416,-0.00207,-0.003284,-0.012985
7,1994-01-13,-0.000292,0.003335,0.010129,-0.00876,-0.00799,-0.008683,0.000285,0.0024,0.010147,-0.004906,-0.005779,0.000705
8,1994-01-14,-0.0337,-0.023248,-0.026024,0.007709,0.037823,0.014779,0.000285,-0.021369,-0.018212,0.007946,0.031944,0.004768
9,1994-01-17,-0.002632,0.015485,0.021263,0.001471,-0.025348,-0.019621,0.000285,0.018612,0.018992,0.000477,-0.023375,-0.006185


In [84]:
#修改mktcap的时间列名，trddy修改为date
mktcap = mktcap.rename(columns={'trddy': 'date'})

In [85]:
#修改daily_data代码列名，Stkcd改为code
daily_data = daily_data.rename(columns={'Stkcd': 'code'})

In [86]:
##筛选日度数据
daily_data = get_month(daily_data,'date')
daily_data = time(daily_data,1997,2023)
daily_data = filt(daily_data,1,4,16,32)
daily_data['month_num'] = (daily_data['year']-1997)*12 + daily_data['month']

##合并收益率数据与无风险利率
mktcap['date'] =  pd.to_datetime(mktcap['date'])
mktcap = mktcap[['date','mkt_rf','rf']]
daily_data = pd.merge(daily_data,mktcap,on='date')
daily_data.head(10)
#daily_data['rt'] = daily_data['rt']-daily_data['rf']  #获得超额收益

# full_data = pd.pivot(daily_data,index='date',columns='code',values='rt')
# full_data['month_num'] = (full_data.index.year-1997)*12+full_data.index.month

# # mktcap2 = mktcap['mkt'] #修改mktcap只剩下mkt数据，但是原数据中只有mkt_rf和rf，可以两个相加得到mkt
# # mktcap2.index = mktcap['date']

Unnamed: 0.1,Unnamed: 0,code,date,Opnprc,Hiprc,Loprc,Clsprc,Dnshrtrd,Dnvaltrd,Dsmvosd,...,PreClosePrice,ChangeRatio,LimitDown,LimitUp,LimitStatus,year,month,month_num,mkt_rf,rf
0,0,2,1997-01-02,10.41,10.78,10.3,10.45,2919324,30568143.2,1808900.53,...,10.47,-0.00191,9.42,11.52,0.0,1997,1,1,0.015487,0.000197
1,1196,4,1997-01-02,5.8,6.26,5.71,6.22,1730784,10613037.48,259106.49,...,5.98,0.040134,5.38,6.58,0.0,1997,1,1,0.015487,0.000197
2,2390,6,1997-01-02,13.51,14.01,13.51,13.78,1610052,22096412.0,1162398.73,...,13.59,0.013981,12.23,14.95,0.0,1997,1,1,0.015487,0.000197
3,3591,7,1997-01-02,8.7,9.18,8.52,8.99,887998,7941758.04,366796.44,...,8.94,0.005593,8.05,9.83,0.0,1997,1,1,0.015487,0.000197
4,4787,8,1997-01-02,5.9,6.3,5.9,6.18,1471001,9015815.06,218095.74,...,6.02,0.026578,5.42,6.62,0.0,1997,1,1,0.015487,0.000197
5,5986,9,1997-01-02,6.81,7.0,6.78,6.94,6418291,44121443.37,4019736.83,...,6.91,0.004342,6.22,7.6,0.0,1997,1,1,0.015487,0.000197
6,7185,10,1997-01-02,7.34,7.8,7.3,7.7,1232552,9311546.52,146598.75,...,7.44,0.034946,6.7,8.18,0.0,1997,1,1,0.015487,0.000197
7,8378,11,1997-01-02,7.9,8.2,7.8,8.03,1918575,15338949.28,733872.14,...,8.01,0.002497,7.21,8.81,0.0,1997,1,1,0.015487,0.000197
8,9577,12,1997-01-02,10.1,10.5,10.1,10.21,1842926,18859728.31,819642.31,...,10.23,-0.001955,9.21,11.25,0.0,1997,1,1,0.015487,0.000197
9,10780,14,1997-01-02,5.9,6.35,5.89,6.3,1759039,10869897.64,284888.21,...,6.08,0.036184,5.47,6.69,0.0,1997,1,1,0.015487,0.000197


In [87]:
daily_data['rt'] = daily_data['rt']-daily_data['rf'] 

In [88]:
daily_data.head(5)

Unnamed: 0.1,Unnamed: 0,code,date,Opnprc,Hiprc,Loprc,Clsprc,Dnshrtrd,Dnvaltrd,Dsmvosd,...,PreClosePrice,ChangeRatio,LimitDown,LimitUp,LimitStatus,year,month,month_num,mkt_rf,rf
0,0,2,1997-01-02,10.41,10.78,10.3,10.45,2919324,30568143.2,1808900.53,...,10.47,-0.00191,9.42,11.52,0.0,1997,1,1,0.015487,0.000197
1,1196,4,1997-01-02,5.8,6.26,5.71,6.22,1730784,10613037.48,259106.49,...,5.98,0.040134,5.38,6.58,0.0,1997,1,1,0.015487,0.000197
2,2390,6,1997-01-02,13.51,14.01,13.51,13.78,1610052,22096412.0,1162398.73,...,13.59,0.013981,12.23,14.95,0.0,1997,1,1,0.015487,0.000197
3,3591,7,1997-01-02,8.7,9.18,8.52,8.99,887998,7941758.04,366796.44,...,8.94,0.005593,8.05,9.83,0.0,1997,1,1,0.015487,0.000197
4,4787,8,1997-01-02,5.9,6.3,5.9,6.18,1471001,9015815.06,218095.74,...,6.02,0.026578,5.42,6.62,0.0,1997,1,1,0.015487,0.000197


In [89]:
full_data = pd.pivot(daily_data,index='date',columns='code',values='rt')

In [90]:
full_data.head(5)

code,2,4,6,7,8,9,10,11,12,14,...,688787,688788,688789,688793,688798,688799,688800,688819,688981,689009
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1997-01-02,-0.002107,0.039937,0.013784,0.005396,0.026381,0.004145,0.034749,0.0023,-0.002152,0.035987,...,,,,,,,,,,
1997-01-03,0.004588,-0.035567,-0.025596,-0.003534,0.087182,-0.011724,-0.052145,0.002294,-0.013909,-0.031943,...,,,,,,,,,,
1997-01-06,-0.05734,-0.085197,-0.047107,-0.085018,-0.099899,-0.067252,-0.100197,-0.067278,-0.05978,-0.078886,...,,,,,,,,,,
1997-01-07,0.034146,0.052626,0.026366,0.054681,0.04939,0.045116,0.060686,0.041081,0.025146,0.058522,...,,,,,,,,,,
1997-01-08,-0.019728,-0.010578,-0.01009,-0.029099,-0.009646,-0.012155,-0.01024,-0.005312,-0.008436,0.033416,...,,,,,,,,,,


In [91]:
full_data['month_num'] = (full_data.index.year-1997)*12+full_data.index.month

In [92]:
full_data.head(5)

code,2,4,6,7,8,9,10,11,12,14,...,688788,688789,688793,688798,688799,688800,688819,688981,689009,month_num
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1997-01-02,-0.002107,0.039937,0.013784,0.005396,0.026381,0.004145,0.034749,0.0023,-0.002152,0.035987,...,,,,,,,,,,1
1997-01-03,0.004588,-0.035567,-0.025596,-0.003534,0.087182,-0.011724,-0.052145,0.002294,-0.013909,-0.031943,...,,,,,,,,,,1
1997-01-06,-0.05734,-0.085197,-0.047107,-0.085018,-0.099899,-0.067252,-0.100197,-0.067278,-0.05978,-0.078886,...,,,,,,,,,,1
1997-01-07,0.034146,0.052626,0.026366,0.054681,0.04939,0.045116,0.060686,0.041081,0.025146,0.058522,...,,,,,,,,,,1
1997-01-08,-0.019728,-0.010578,-0.01009,-0.029099,-0.009646,-0.012155,-0.01024,-0.005312,-0.008436,0.033416,...,,,,,,,,,,1


In [93]:
mktcap['mkt'] = mktcap['mkt_rf'] + mktcap['rf']
mktcap.index = mktcap['date']

In [94]:
mktcap.head(5)

Unnamed: 0_level_0,date,mkt_rf,rf,mkt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1994-01-04,1994-01-04,-0.00395,0.000285,-0.003665
1994-01-05,1994-01-05,0.007166,0.000285,0.007451
1994-01-06,1994-01-06,0.028537,0.000285,0.028822
1994-01-07,1994-01-07,-0.004087,0.000285,-0.003802
1994-01-10,1994-01-10,0.002382,0.000285,0.002667


In [24]:
def beta_calculator(data,factor,span,low_limit):
    '''
    用来计算beta的表格函数，输出是某一种计算方式的beta的表格。
    
    输入参数
    ----------
    data是以month_num为columns，code为index，rt为value
    span是每次回归跨度月份数，一年为12
    low_limit是计算beta的最低样本数（天数），一个月为10，三个月为50等
    输出
    -------
    index为股票代码，columns为月份编号，value为对应规则算出beta 的df
    '''
    X = pd.DataFrame()
    for i in range(max(data['month_num'])-span+1):
        same_time_data = data[(data['month_num']>i)&(data['month_num']<=i+span)]
        same_time = []
        code_list = list(same_time_data.columns[:-1])
        for code in code_list:
            temp_data = same_time_data[code]
            temp_data.name = 'rt'
            reg_data = pd.concat([temp_data,factor],axis=1,join='inner')
            if reg_data['rt'].notna().sum() >= low_limit:
                model = smf.ols('rt~mkt_rf',reg_data,missing='drop').fit()
                beta = model.params[1]
            else:
                beta = np.nan
            same_time.append(beta)
        same_time = pd.Series(same_time,index = code_list,name = i+span)
        X = pd.concat([X,same_time],axis=1)
    return X

In [27]:
# 计算不同时间窗口的 beta
beta_1m = beta_calculator(full_data,mktcap,1,10)
beta_3m = beta_calculator(full_data,mktcap,3,50)
beta_6m = beta_calculator(full_data,mktcap,6,100)
beta_12m = beta_calculator(full_data,mktcap,12,200)
beta_24m = beta_calculator(full_data,mktcap,24,450)
beta_1y = beta_calculator(month_data,ff,12,10)
beta_2y = beta_calculator(month_data,ff,24,20)
beta_3y = beta_calculator(month_data,ff,36,24)
beta_5y = beta_calculator(month_data,ff,60,24)

# 输出beta为csv，方便之后读取（因为算力限制无法直接进行后续计算）
beta_24m.to_csv('beta_24m.csv')
beta_1y.to_csv('beta_1y.csv')
beta_2y.to_csv('beta_2y.csv')
beta_3y.to_csv('beta_3y.csv')
beta_5y.to_csv('beta_5y.csv')

In [97]:
print(daily_data.index.name)
daily_data.head(5)

None


Unnamed: 0.1,Unnamed: 0,code,date,Opnprc,Hiprc,Loprc,Clsprc,Dnshrtrd,Dnvaltrd,Dsmvosd,...,PreClosePrice,ChangeRatio,LimitDown,LimitUp,LimitStatus,year,month,month_num,mkt_rf,rf
0,0,2,1997-01-02,10.41,10.78,10.3,10.45,2919324,30568143.2,1808900.53,...,10.47,-0.00191,9.42,11.52,0.0,1997,1,1,0.015487,0.000197
1,1196,4,1997-01-02,5.8,6.26,5.71,6.22,1730784,10613037.48,259106.49,...,5.98,0.040134,5.38,6.58,0.0,1997,1,1,0.015487,0.000197
2,2390,6,1997-01-02,13.51,14.01,13.51,13.78,1610052,22096412.0,1162398.73,...,13.59,0.013981,12.23,14.95,0.0,1997,1,1,0.015487,0.000197
3,3591,7,1997-01-02,8.7,9.18,8.52,8.99,887998,7941758.04,366796.44,...,8.94,0.005593,8.05,9.83,0.0,1997,1,1,0.015487,0.000197
4,4787,8,1997-01-02,5.9,6.3,5.9,6.18,1471001,9015815.06,218095.74,...,6.02,0.026578,5.42,6.62,0.0,1997,1,1,0.015487,0.000197


In [98]:
print(mktcap.index.name)
mktcap.head(5)

date


Unnamed: 0_level_0,date,mkt_rf,rf,mkt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1994-01-04,1994-01-04,-0.00395,0.000285,-0.003665
1994-01-05,1994-01-05,0.007166,0.000285,0.007451
1994-01-06,1994-01-06,0.028537,0.000285,0.028822
1994-01-07,1994-01-07,-0.004087,0.000285,-0.003802
1994-01-10,1994-01-10,0.002382,0.000285,0.002667


In [99]:
# 修改索引名称，否则会出现index与column混淆的情况
mktcap.index.name = 'date1'

In [108]:
def beta_calculator_sw(data, factor):
    '''
    用来计算beta_sw的表格函数，输出是beta_sw计算方式的beta的表格。
    
    输入参数
    ----------
    data是有每只股票每月日度收益和市场超额收益信息的表格，变量命名month_num,code,rt,mkt
    span是每次回归跨度月份数，一年为12
    low_limit是计算beta的最低样本数（天数），一个月为15，三个月为40等
    
    输出
    -------
    index为股票代码，columns为月份编号，value为对应规则算出beta 的df
    '''
    # 创建一个空的DataFrame来存储beta值
    X = pd.DataFrame()

    # 计算市场因子的自相关系数
    rou = pearsonr(factor['mkt'][1:], factor['mkt'][:-1])[0]

    # 为市场超额收益创建不同的时间偏移量
    mktcap1 = mktcap2 = mktcap3 = factor[['date', 'mkt']]
    mktcap1['date'] = mktcap1['date'] + dt.timedelta(days=1)
    mktcap3['date'] = mktcap3['date'] + dt.timedelta(days=-1)

    # 对每个时间窗口进行循环
    for i in range(1, max(data['month_num']) - 12):
        same_time_data = data[(data['month_num'] > i) & (data['month_num'] <= i + 12)]
        same_time = []
        code_list = list(set(same_time_data['code']))

        # 对每只股票分别计算beta值
        for code in code_list:
            temp_data = same_time_data[same_time_data['code'] == code]
            reg_data1 = pd.merge(temp_data, mktcap1, on='date')
            reg_data2 = pd.merge(temp_data, mktcap2, on='date')
            reg_data3 = pd.merge(temp_data, mktcap3, on='date')

            reg_data1 = reg_data1[['rt', 'mkt']]
            reg_data2 = reg_data2[['rt', 'mkt']]
            reg_data3 = reg_data3[['rt', 'mkt']]

            # 检查数据是否足够用来回归
            if reg_data2['rt'].notna().sum() >= 200:
                model1 = smf.ols('rt~mkt', reg_data1, missing='drop').fit()
                model2 = smf.ols('rt~mkt', reg_data2, missing='drop').fit()
                model3 = smf.ols('rt~mkt', reg_data3, missing='drop').fit()

                beta1 = model1.params[1]
                beta2 = model2.params[1]
                beta3 = model3.params[1]

                beta = (beta1 + beta2 + beta3) / (1 + 2 * rou)
            else:
                beta = np.nan

            same_time.append(beta)

        # 将beta值按股票代码索引并加入结果
        same_time = pd.Series(same_time, index=code_list, name=i + 12)
        X = pd.concat([X, same_time], axis=1)

    return X


In [101]:
full_data1 = daily_data[['code','rt','date','month_num']]
beta_sw = beta_calculator_sw(full_data1,mktcap)
# beta_sw.columns = beta_sw.columns+1
beta_sw.to_csv('beta_sw.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mktcap1['date'] = mktcap1['date'] + dt.timedelta(days=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mktcap3['date'] = mktcap3['date'] + dt.timedelta(days=-1)


In [123]:
full_data.head(5)

code,2,4,6,7,8,9,10,11,12,14,...,688788,688789,688793,688798,688799,688800,688819,688981,689009,month_num
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1997-01-02,-0.002107,0.039937,0.013784,0.005396,0.026381,0.004145,0.034749,0.0023,-0.002152,0.035987,...,,,,,,,,,,1
1997-01-03,0.004588,-0.035567,-0.025596,-0.003534,0.087182,-0.011724,-0.052145,0.002294,-0.013909,-0.031943,...,,,,,,,,,,1
1997-01-06,-0.05734,-0.085197,-0.047107,-0.085018,-0.099899,-0.067252,-0.100197,-0.067278,-0.05978,-0.078886,...,,,,,,,,,,1
1997-01-07,0.034146,0.052626,0.026366,0.054681,0.04939,0.045116,0.060686,0.041081,0.025146,0.058522,...,,,,,,,,,,1
1997-01-08,-0.019728,-0.010578,-0.01009,-0.029099,-0.009646,-0.012155,-0.01024,-0.005312,-0.008436,0.033416,...,,,,,,,,,,1


In [131]:
def beta_calculator_d(data,factor):
            '''
    用来计算beta_d的表格函数，输出是beta_d计算方式的beta的表格。
    
    输入参数
    ----------
    data是以month_num为columns，code为index，rt为value
    span是每次回归跨度月份数，一年为12
    low_limit是计算beta的最低样本数（天数），一个月为15，三个月为40等
    
    输出
    -------
    index为股票代码，columns为月份编号，value为对应规则算出beta 的df
    '''
    X = pd.DataFrame()
    mkt = pd.DataFrame()
    for  k in range(6):
        x1 = factor.shift(k)
        x2 = factor.shift(-k)
        if k==0:
            mkt = pd.concat([mkt,x1],axis=1)
        else:
            mkt = pd.concat([mkt,x1],axis=1)
            mkt = pd.concat([mkt,x2],axis=1)
    mkt.columns = ['mkt6','mkt5','mkt7','mkt4','mkt8','mkt3','mkt9','mkt2','mkt10','mkt1','mkt11']
    mkt = mkt.dropna()
    for i in range(1,max(data['month_num'])-12):
        same_time_data = data[(data['month_num']>i)&(data['month_num']<=i+12)]
        same_time = []
        code_list = same_time_data.columns[:-1]
        for code in code_list:
            temp_data = same_time_data[code]
            temp_data.name = 'rt'
            reg_data = pd.concat([temp_data,mkt],axis=1,join='inner')
            if reg_data['rt'].notna().sum()>=200:
                model = smf.ols('rt~mkt6+mkt5+mkt7+mkt4+mkt8+mkt3+mkt9+mkt2+mkt10+mkt1+mkt11',reg_data,missing='drop').fit()
                beta = sum(model.params[1:])
            else:
                beta = np.nan
            same_time.append(beta)
        same_time = pd.Series(same_time,index = code_list,name = i+12)
        X = pd.concat([X,same_time],axis=1)
        print(i)
    return X


beta_d = beta_calculator_d(full_data,mktcap)

ValueError: 列数不匹配，生成的mkt有 44 列，而不是预期的 11 列

# 描述性统计

In [None]:
beta_1m = pd.read_csv(os.path.join('中国市场','beta_1m.csv'),index_col=0)
beta_3m = pd.read_csv(os.path.join('中国市场','beta_3m.csv'),index_col=0)
beta_6m = pd.read_csv(os.path.join('中国市场','beta_6m.csv'),index_col=0)
beta_12m = pd.read_csv(os.path.join('中国市场','beta_12m.csv'),index_col=0)
beta_24m = pd.read_csv(os.path.join('中国市场','beta_24m.csv'),index_col=0)
beta_1y = pd.read_csv(os.path.join('中国市场','beta_1y.csv'),index_col=0)
beta_2y = pd.read_csv(os.path.join('中国市场','beta_2y.csv'),index_col=0)
beta_3y = pd.read_csv(os.path.join('中国市场','beta_3y.csv'),index_col=0)
beta_5y = pd.read_csv(os.path.join('中国市场','beta_5y.csv'),index_col=0)
beta_sw = pd.read_csv(os.path.join('中国市场','beta_sw.csv'),index_col=0)
beta_d = pd.read_csv(os.path.join('中国市场','beta_d.csv'),index_col=0)

In [None]:
def beta_statistic(list_of_beta,name_of_beta):
    X = pd.DataFrame()
    for i in range(len(list_of_beta)):
        x = list_of_beta[i]
        new = pd.Series([x.mean().mean(),x.std().mean(),x.skew().mean(),x.kurt().mean(),x.min().mean(),x.quantile(.05).mean(),x.quantile(.25).mean(),x.median().mean(),x.quantile(.75).mean(),x.quantile(.95).mean(),x.max().mean(),x.count().mean()],
                         index = ['Mean','SD','Skew','Kurt','Min','5%','25%','Median','75%','95%','Max','n'],name = name_of_beta[i])
        X = pd.concat([X,new],axis=1)
    X = X.T
    X = X.applymap(lambda x:round(x, 2))
    return X

df_list = [beta_1m,beta_3m,beta_6m,beta_12m,beta_24m,beta_1y,beta_2y,beta_3y,beta_5y,beta_sw,beta_d]

def drop():
    beta = []
    for i in df_list:
        if i.columns[-1] == '276':
            i = i.drop('276',axis = 1)
        beta.append(i)
    return beta

beta_list = drop()

beta_name_list = ['beta_1m','beta_3m','beta_6m','beta_12m','beta_24m','beta_1y','beta_2y','beta_3y','beta_5y','beta_sw','beta_d']
table1 = beta_statistic(beta_list,beta_name_list)
table1