## 1、获取每个月的最后一个交易日的列表

In [18]:
import pandas as pd
import numpy as np

In [8]:
# 获取2012~2014一整年的日期
date = pd.date_range("20120101", "20140201")

In [9]:
date

DatetimeIndex(['2012-01-01', '2012-01-02', '2012-01-03', '2012-01-04',
               '2012-01-05', '2012-01-06', '2012-01-07', '2012-01-08',
               '2012-01-09', '2012-01-10',
               ...
               '2014-01-23', '2014-01-24', '2014-01-25', '2014-01-26',
               '2014-01-27', '2014-01-28', '2014-01-29', '2014-01-30',
               '2014-01-31', '2014-02-01'],
              dtype='datetime64[ns]', length=763, freq='D')

In [11]:
# 获取月初的日期
date = date[date.is_month_start]

# 获取月末交易日日期
month_trading_end = []

for i in range(len(date)):
    month_trading_end.append(get_previous_trading_date(date[i]))

In [22]:
# 月末交易日日期列表
month_trading_end[1:]

[datetime.date(2012, 1, 31),
 datetime.date(2012, 2, 29),
 datetime.date(2012, 3, 30),
 datetime.date(2012, 4, 27),
 datetime.date(2012, 5, 31),
 datetime.date(2012, 6, 29),
 datetime.date(2012, 7, 31),
 datetime.date(2012, 8, 31),
 datetime.date(2012, 9, 28),
 datetime.date(2012, 10, 31),
 datetime.date(2012, 11, 30),
 datetime.date(2012, 12, 31),
 datetime.date(2013, 1, 31),
 datetime.date(2013, 2, 28),
 datetime.date(2013, 3, 29),
 datetime.date(2013, 4, 26),
 datetime.date(2013, 5, 31),
 datetime.date(2013, 6, 28),
 datetime.date(2013, 7, 31),
 datetime.date(2013, 8, 30),
 datetime.date(2013, 9, 30),
 datetime.date(2013, 10, 31),
 datetime.date(2013, 11, 29),
 datetime.date(2013, 12, 31),
 datetime.date(2014, 1, 30)]

## 2、准备特征值，获取9个因子的横截面数据

In [15]:
# 获取沪深300股票列表
stocks = index_components("000300.XSHG")

# 建立空的dataframe方便数据合并
all_data = pd.DataFrame()

for date in month_trading_end[1:-1]:
    # 实例化一个query对象
    q = query(fundamentals.eod_derivative_indicator.pe_ratio,
              fundamentals.eod_derivative_indicator.pb_ratio,
              fundamentals.eod_derivative_indicator.market_cap,
              fundamentals.financial_indicator.ev,
              fundamentals.financial_indicator.return_on_asset_net_profit,
              fundamentals.financial_indicator.du_return_on_equity,
              fundamentals.financial_indicator.earnings_per_share,
              fundamentals.income_statement.revenue,
              fundamentals.income_statement.total_expense).filter(
    fundamentals.stockcode.in_(stocks))
    
    fund = get_fundamentals(q, entry_date=date)[:, 0, :]
    
    # 添加日期信息
    fund["date"] = date
    
    # 合并
    all_data = pd.concat([all_data, fund])

In [17]:
# 缺失值处理
all_data = all_data.dropna()

In [19]:
# 添加月收益率字段
all_data["month_returns"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [21]:
all_data.head()

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense,date,month_returns
000001.XSHE,8.2942,1.1818,85252600000.0,1217700000000.0,0.8006,14.9403,2.01,20701400000.0,10882500000.0,2012-01-31,
000002.XSHE,8.7392,1.588,84113400000.0,129427000000.0,1.6463,7.8656,0.326,29308400000.0,23783500000.0,2012-01-31,
000060.XSHE,20.7,3.7146,19680500000.0,25722900000.0,5.6036,14.617,0.35,9189390000.0,7935540000.0,2012-01-31,
000063.XSHE,24.7131,2.1011,50913200000.0,102604000000.0,1.2814,4.6063,0.31,57838600000.0,58633400000.0,2012-01-31,
000069.XSHE,13.1335,2.5616,41727200000.0,61211800000.0,2.8729,10.9097,0.271,8951450000.0,7091400000.0,2012-01-31,


## 3、准备月收益率

In [23]:
# 准备空DataFrame，方便进行价格数据的合并
all_prices = pd.DataFrame()

for date in month_trading_end[1:]:
    
    # 获取当月月末的收盘价
    price = get_price(stocks, start_date=date, end_date=date, fields="close")
    
    # 合并
    all_prices = pd.concat([all_prices, price])

In [26]:
# 缺失值处理
all_prices = all_prices.T.dropna()

In [27]:
all_prices.head()

Unnamed: 0,2012-01-31 00:00:00,2012-02-29 00:00:00,2012-03-30 00:00:00,2012-04-27 00:00:00,2012-05-31 00:00:00,2012-06-29 00:00:00,2012-07-31 00:00:00,2012-08-31 00:00:00,2012-09-28 00:00:00,2012-10-31 00:00:00,...,2013-04-26 00:00:00,2013-05-31 00:00:00,2013-06-28 00:00:00,2013-07-31 00:00:00,2013-08-30 00:00:00,2013-09-30 00:00:00,2013-10-31 00:00:00,2013-11-29 00:00:00,2013-12-31 00:00:00,2014-01-30 00:00:00
600009.XSHG,10.9057,11.4466,11.2721,11.307,11.3942,11.1151,10.9406,10.9714,10.8067,10.4681,...,11.557,12.4812,10.9073,11.2734,15.0162,14.2442,13.9335,14.1971,13.4816,12.5684
601360.XSHG,5.3869,5.7978,5.2631,5.3573,5.5933,5.3975,4.4435,5.3021,4.9406,4.81,...,6.9881,7.8119,6.2754,7.0714,7.1547,7.8767,8.0803,7.923,7.5527,7.7841
002241.XSHE,6.1211,7.0194,7.0005,7.041,8.2037,9.8439,9.2234,9.9944,10.1138,10.1816,...,13.3711,17.1425,17.754,17.0691,20.0582,20.6844,17.0691,18.8939,17.162,14.1288
601766.XSHG,3.9535,4.2371,3.816,4.3145,4.2371,3.9191,3.7372,3.5758,3.6834,4.006,...,3.5669,3.9433,3.2263,3.687,3.6778,3.595,4.4777,5.0202,4.6064,3.8984
601788.XSHG,9.6466,10.9497,10.832,11.9722,13.1125,12.1425,12.4006,10.1418,11.5985,10.7042,...,12.4376,13.2822,9.4674,10.1635,9.3467,8.8362,8.4371,8.4743,8.0659,7.4625


In [29]:
# 计算月收益率
for i in range(len(all_prices.columns) - 1):
    
    all_prices.iloc[:, i] = all_prices.iloc[:, i+1] / all_prices.iloc[:, i] - 1

In [31]:
# 收益率计算完毕
all_prices.head()

Unnamed: 0,2012-01-31 00:00:00,2012-02-29 00:00:00,2012-03-30 00:00:00,2012-04-27 00:00:00,2012-05-31 00:00:00,2012-06-29 00:00:00,2012-07-31 00:00:00,2012-08-31 00:00:00,2012-09-28 00:00:00,2012-10-31 00:00:00,...,2013-04-26 00:00:00,2013-05-31 00:00:00,2013-06-28 00:00:00,2013-07-31 00:00:00,2013-08-30 00:00:00,2013-09-30 00:00:00,2013-10-31 00:00:00,2013-11-29 00:00:00,2013-12-31 00:00:00,2014-01-30 00:00:00
600009.XSHG,0.049598,-0.015245,0.003096,0.007712,-0.024495,-0.015699,0.002815,-0.015012,-0.031332,-0.016603,...,0.079969,-0.126102,0.033565,0.332003,-0.051411,-0.021812,0.018918,-0.050398,-0.067737,12.5684
601360.XSHG,0.076278,-0.092225,0.017898,0.044052,-0.035006,-0.176748,0.193226,-0.068181,-0.026434,-0.102287,...,0.117886,-0.196687,0.126845,0.01178,0.100913,0.025848,-0.019467,-0.046737,0.030638,7.7841
002241.XSHE,0.146755,-0.002693,0.005785,0.165133,0.199934,-0.063034,0.083592,0.011947,0.006704,-0.046613,...,0.282056,0.035672,-0.038577,0.175118,0.031219,-0.174784,0.106907,-0.091665,-0.176739,14.1288
601766.XSHG,0.071734,-0.099384,0.130634,-0.01794,-0.075051,-0.046414,-0.043187,0.030091,0.087582,0.069346,...,0.105526,-0.181827,0.142795,-0.002495,-0.022513,0.245535,0.121156,-0.082427,-0.153699,3.8984
601788.XSHG,0.135084,-0.010749,0.105262,0.095246,-0.073975,0.021256,-0.182152,0.143633,-0.077105,-0.121448,...,0.067907,-0.287211,0.073526,-0.080366,-0.054618,-0.045166,0.004409,-0.048193,-0.074809,7.4625


## 4、将因子值和对应的月收益率放在一起

In [32]:
all_data.head()

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense,date,month_returns
000001.XSHE,8.2942,1.1818,85252600000.0,1217700000000.0,0.8006,14.9403,2.01,20701400000.0,10882500000.0,2012-01-31,
000002.XSHE,8.7392,1.588,84113400000.0,129427000000.0,1.6463,7.8656,0.326,29308400000.0,23783500000.0,2012-01-31,
000060.XSHE,20.7,3.7146,19680500000.0,25722900000.0,5.6036,14.617,0.35,9189390000.0,7935540000.0,2012-01-31,
000063.XSHE,24.7131,2.1011,50913200000.0,102604000000.0,1.2814,4.6063,0.31,57838600000.0,58633400000.0,2012-01-31,
000069.XSHE,13.1335,2.5616,41727200000.0,61211800000.0,2.8729,10.9097,0.271,8951450000.0,7091400000.0,2012-01-31,


In [33]:
all_prices.head()

Unnamed: 0,2012-01-31 00:00:00,2012-02-29 00:00:00,2012-03-30 00:00:00,2012-04-27 00:00:00,2012-05-31 00:00:00,2012-06-29 00:00:00,2012-07-31 00:00:00,2012-08-31 00:00:00,2012-09-28 00:00:00,2012-10-31 00:00:00,...,2013-04-26 00:00:00,2013-05-31 00:00:00,2013-06-28 00:00:00,2013-07-31 00:00:00,2013-08-30 00:00:00,2013-09-30 00:00:00,2013-10-31 00:00:00,2013-11-29 00:00:00,2013-12-31 00:00:00,2014-01-30 00:00:00
600009.XSHG,0.049598,-0.015245,0.003096,0.007712,-0.024495,-0.015699,0.002815,-0.015012,-0.031332,-0.016603,...,0.079969,-0.126102,0.033565,0.332003,-0.051411,-0.021812,0.018918,-0.050398,-0.067737,12.5684
601360.XSHG,0.076278,-0.092225,0.017898,0.044052,-0.035006,-0.176748,0.193226,-0.068181,-0.026434,-0.102287,...,0.117886,-0.196687,0.126845,0.01178,0.100913,0.025848,-0.019467,-0.046737,0.030638,7.7841
002241.XSHE,0.146755,-0.002693,0.005785,0.165133,0.199934,-0.063034,0.083592,0.011947,0.006704,-0.046613,...,0.282056,0.035672,-0.038577,0.175118,0.031219,-0.174784,0.106907,-0.091665,-0.176739,14.1288
601766.XSHG,0.071734,-0.099384,0.130634,-0.01794,-0.075051,-0.046414,-0.043187,0.030091,0.087582,0.069346,...,0.105526,-0.181827,0.142795,-0.002495,-0.022513,0.245535,0.121156,-0.082427,-0.153699,3.8984
601788.XSHG,0.135084,-0.010749,0.105262,0.095246,-0.073975,0.021256,-0.182152,0.143633,-0.077105,-0.121448,...,0.067907,-0.287211,0.073526,-0.080366,-0.054618,-0.045166,0.004409,-0.048193,-0.074809,7.4625


In [35]:
# 通过便利查找，给all_data填上月收益率
for i in range(len(all_data)):
    # 获取股票代码
    stock = all_data.index[i]
    
    # 获取相对应的日期
    date = all_data.ix[i, "date"]
    
    # 在月收益率表中找到对应的月收益率填充到all_data中
    # 判断相对应的月收益率是否存在
    if stock in all_prices.index and date in all_prices.columns:
        
        all_data.ix[i, "month_returns"] = all_prices.loc[stock, date]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [38]:
# 缺失值处理
all_data = all_data.dropna()

## 5、处理好特征值和目标值

In [39]:
all_data.head()

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense,date,month_returns
000001.XSHE,8.2942,1.1818,85252600000.0,1217700000000.0,0.8006,14.9403,2.01,20701400000.0,10882500000.0,2012-01-31,0.027659
000002.XSHE,8.7392,1.588,84113400000.0,129427000000.0,1.6463,7.8656,0.326,29308400000.0,23783500000.0,2012-01-31,0.082352
000060.XSHE,20.7,3.7146,19680500000.0,25722900000.0,5.6036,14.617,0.35,9189390000.0,7935540000.0,2012-01-31,0.121609
000063.XSHE,24.7131,2.1011,50913200000.0,102604000000.0,1.2814,4.6063,0.31,57838600000.0,58633400000.0,2012-01-31,0.158786
000069.XSHE,13.1335,2.5616,41727200000.0,61211800000.0,2.8729,10.9097,0.271,8951450000.0,7091400000.0,2012-01-31,-0.00267


In [43]:
# 先把特征值和目标值筛选出来
x = all_data.drop(["date", "month_returns"], axis=1)

In [45]:
x.head()

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense
000001.XSHE,8.2942,1.1818,85252600000.0,1217700000000.0,0.8006,14.9403,2.01,20701400000.0,10882500000.0
000002.XSHE,8.7392,1.588,84113400000.0,129427000000.0,1.6463,7.8656,0.326,29308400000.0,23783500000.0
000060.XSHE,20.7,3.7146,19680500000.0,25722900000.0,5.6036,14.617,0.35,9189390000.0,7935540000.0
000063.XSHE,24.7131,2.1011,50913200000.0,102604000000.0,1.2814,4.6063,0.31,57838600000.0,58633400000.0
000069.XSHE,13.1335,2.5616,41727200000.0,61211800000.0,2.8729,10.9097,0.271,8951450000.0,7091400000.0


In [46]:
y = all_data["month_returns"]

In [47]:
y.head()

000001.XSHE    0.027659
000002.XSHE    0.082352
000060.XSHE    0.121609
000063.XSHE    0.158786
000069.XSHE   -0.002670
Name: month_returns, dtype: float64

In [48]:
# 处理特征值：去极值、标准化、市值中性化
def med_method(factor):
    # 1、找到MAD值
    med = np.median(factor)
    distance = abs(factor - med)
    MAD = np.median(distance)
    # 2、求出MAD_e
    MAD_e = 1.4826 * MAD
    # 3、求出正常值范围的边界
    up_scale = med + 3 * MAD_e
    down_scale = med - 3 * MAD_e
    # 4、替换
    factor = np.where(factor > up_scale, up_scale, factor)
    factor = np.where(factor < down_scale, down_scale, factor)
    return factor

# 自实现标准化
# (x - mean) / std
def stand_method(factor):
    mean = np.mean(factor)
    std = np.std(factor)
    factor = (factor - mean) / std
    return factor

In [49]:
from sklearn.linear_model import LinearRegression

In [50]:
# 对特征值进行因子处理：去极值、标准化、市值中性化
# 方便后续进行市值中性化处理，需要保留原始市值
market_cap = x["market_cap"]

# 批量对因子进行去极值、标准化和市值中性化
for factor in x.columns:
    
    # 去极值
    x[factor] = med_method(x[factor])
    
    # 标准化
    x[factor] = stand_method(x[factor])
    
    # 市值中性化
    # 市值本身不需要进行市值中性化，跳过循环
    if factor == "market_cap":
        continue
    
    # 市值中性化的特征值
    x_market_cap = market_cap.reshape((-1, 1))
    y_factor = x[factor]
    
    # 线性回归预估器流程
    estimator = LinearRegression()
    estimator.fit(x_market_cap, y_factor)
    y_predict = estimator.predict(x_market_cap)
    
    x[factor] = y_factor - y_predict



In [52]:
x.head()

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense
000001.XSHE,-0.780256,-0.920602,1.87291,0.0315102,-0.752906,0.623487,1.48327,-0.134731,-0.778307
000002.XSHE,-0.760964,-0.700663,1.83277,0.0679137,-0.484579,-0.41295,-0.66514,0.254929,0.348508
000060.XSHE,-0.276782,0.392316,-0.437099,-0.101393,0.784474,1.19697,0.176723,0.271231,0.304203
000063.XSHE,-0.0606299,-0.454609,0.663182,0.610782,-0.592678,-0.580813,-0.318181,1.06687,1.08789
000069.XSHE,-0.602094,-0.213165,0.339575,-0.0164567,-0.0861168,0.439288,-0.328008,-0.292695,-0.295008


In [53]:
y.head()

000001.XSHE    0.027659
000002.XSHE    0.082352
000060.XSHE    0.121609
000063.XSHE    0.158786
000069.XSHE   -0.002670
Name: month_returns, dtype: float64

## 6、建立特征值（9个因子）和目标值（月收益率）的线性回归/岭回归

In [54]:
# 用线性回归建立特征值和目标值之间的关系
estimator1 = LinearRegression()

estimator1.fit(x, y)

# 得出回归系数
estimator1.coef_

array([ 0.00237505, -0.00208239, -0.01355057,  0.00164993,  0.00270257,
        0.0077811 ,  0.00036861,  0.01311638, -0.01319032])

In [56]:
# 用岭回归建立特征值和目标值之间的关系
from sklearn.linear_model import Ridge

estimator2 = Ridge()

estimator2.fit(x, y)

# 得出回归系数
estimator2.coef_

array([ 0.00237299, -0.00208294, -0.0135488 ,  0.00166421,  0.00270095,
        0.00778129,  0.00037851,  0.01290236, -0.01299596])