In [72]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

## 1、回归训练区间

### 2014-01-01  ~  2016-12-31

## 2、回归股票选择

### - 股票选择（HS300指数）
### - 如果是以A股选择回归，那么回测就以A股去进行选股

## 3、回归因子数据准备、收益率计算

### - 因子数据：横截面数据拼接，添加日期数据、去除空值
### - 收益率计算: 所有样本的收益率计算、取出价格为空值的计算
### - 这个回归就是每个月用股票的回报率对股票上一期末的因子值做一个横截 面的回归。 

## 4、目标值特征值提取进行回归估计

### 数据处理：去除收益为0(价格数据不存在)、去极值、标准化处理、市值中性化处理

### 一、 准备好这些因子对应日期的数据
* 每个月月末的因子数据

In [5]:
# 确定每月日期 2014-01-01~2016-01-01
dates = get_trading_dates(start_date="2014-01-01", end_date="2016-01-01")

In [6]:
# 每天日期---->每月月末
# 每月最后一个交易日, 按月计算收益率
month_date = []
for i in range(len(dates) -1):
  if dates[i].year != dates[i+1].year:
    month_date.append(dates[i])
  elif dates[i].month != dates[i+1].month:
    month_date.append(dates[i])

#把最后一个交易日加入
month_date.append(dates[-1])

In [10]:
# 每月日期列表获取
month_date

[datetime.date(2014, 1, 30),
 datetime.date(2014, 2, 28),
 datetime.date(2014, 3, 31),
 datetime.date(2014, 4, 30),
 datetime.date(2014, 5, 30),
 datetime.date(2014, 6, 30),
 datetime.date(2014, 7, 31),
 datetime.date(2014, 8, 29),
 datetime.date(2014, 9, 30),
 datetime.date(2014, 10, 31),
 datetime.date(2014, 11, 28),
 datetime.date(2014, 12, 31),
 datetime.date(2015, 1, 30),
 datetime.date(2015, 2, 27),
 datetime.date(2015, 3, 31),
 datetime.date(2015, 4, 30),
 datetime.date(2015, 5, 29),
 datetime.date(2015, 6, 30),
 datetime.date(2015, 7, 31),
 datetime.date(2015, 8, 31),
 datetime.date(2015, 9, 30),
 datetime.date(2015, 10, 30),
 datetime.date(2015, 11, 30),
 datetime.date(2015, 12, 31)]

### 二、准备因子数据根据训练每月最后交易日日期列表（特征值）
* 特征值都是该月的因子数据（避免下个月在日期列表当中不存在）
* 因子的数据处理：直接删除缺失值
* month_date当中datetime.date(2015, 12, 31)号因子数据不用去
* [datetime.date(2014, 1, 30),
 datetime.date(2014, 2, 28),
 datetime.date(2014, 3, 31),
 datetime.date(2014, 4, 30),
 datetime.date(2014, 5, 30),
 datetime.date(2014, 6, 30),
 datetime.date(2014, 7, 31),
 datetime.date(2014, 8, 29),
 datetime.date(2014, 9, 30),
 datetime.date(2014, 10, 31),
 datetime.date(2014, 11, 28),
 datetime.date(2014, 12, 31),
 datetime.date(2015, 1, 30),
 datetime.date(2015, 2, 27),
 datetime.date(2015, 3, 31),
 datetime.date(2015, 4, 30),
 datetime.date(2015, 5, 29),
 datetime.date(2015, 6, 30),
 datetime.date(2015, 7, 31),
 datetime.date(2015, 8, 31),
 datetime.date(2015, 9, 30),
 datetime.date(2015, 10, 30),
 datetime.date(2015, 11, 30),

In [11]:
# 获取沪深300 的股票列表
stocks = index_components("000300.XSHG")

In [48]:
all_data = pd.DataFrame()
for date in month_date[:-1]:
  
  q = query(fundamentals.eod_derivative_indicator.pe_ratio, 
            fundamentals.eod_derivative_indicator.pb_ratio, 
            fundamentals.eod_derivative_indicator.market_cap, 
            fundamentals.financial_indicator.ev, 
            fundamentals.financial_indicator.return_on_asset_net_profit, 
            fundamentals.financial_indicator.du_return_on_equity, 
            fundamentals.financial_indicator.earnings_per_share, 
            fundamentals.income_statement.revenue, 
            fundamentals.income_statement.total_expense).filter(fundamentals.stockcode.in_(stocks))

  # 查询因子数据
  fund = get_fundamentals(q, entry_date=date)[:, 0, :]
  
  fund['date'] = date
  
  # 进行每月因子数据拼接
  all_data = pd.concat([all_data, fund])

In [49]:
# 删除缺失值
all_data = all_data.dropna()

In [50]:
# 建立每个股票样本，每个月的因子值对应下个月的收益率
all_data['next_month_return'] = np.nan

In [54]:
all_data.head()

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense,date,next_month_return
000001.XSHE,7.126,0.9684,108537000000.0,1858090000000.0,0.6756,13.0164,1.43,37345000000.0,22038000000.0,2014-01-30,
000002.XSHE,5.3769,1.0571,81290500000.0,191646000000.0,1.7804,9.3272,0.56,63415300000.0,53960000000.0,2014-01-30,
000008.XSHE,114.328,4.6166,2601870000.0,2917610000.0,2.7598,3.0992,0.0561,228189000.0,206793000.0,2014-01-30,
000060.XSHE,28.8896,1.9053,11366800000.0,19182800000.0,0.9316,3.068,0.08,9758750000.0,9642690000.0,2014-01-30,
000063.XSHE,8.2098,2.0244,45616200000.0,96363700000.0,0.5568,2.561,0.16,54557500000.0,57006200000.0,2014-01-30,


### 三、获取价格数据计算对应的收益率

In [41]:
# 1、获取每月月末价格数据
all_price= pd.DataFrame()
for date in month_date:
  price = get_price(stocks, start_date=date, end_date=date, fields='close')
  all_price = pd.concat([all_price, price], axis=0)

In [42]:
# 转置方便后面计算处理
all_price = all_price.T

In [44]:
# 按照股票进行去除空值
all_price = all_price.dropna()

In [46]:
# 2、计算收益率
for i in range(len(all_price.columns)-1):
  # 利用每次后一个月的收盘价-循环的这个月的收盘价/循环的这个月的收盘价
  all_price.iloc[:, i] = all_price.iloc[:, i + 1]/ all_price.iloc[:, i] - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [53]:
all_price.head()

Unnamed: 0,2014-01-30 00:00:00,2014-02-28 00:00:00,2014-03-31 00:00:00,2014-04-30 00:00:00,2014-05-30 00:00:00,2014-06-30 00:00:00,2014-07-31 00:00:00,2014-08-29 00:00:00,2014-09-30 00:00:00,2014-10-31 00:00:00,...,2015-03-31 00:00:00,2015-04-30 00:00:00,2015-05-29 00:00:00,2015-06-30 00:00:00,2015-07-31 00:00:00,2015-08-31 00:00:00,2015-09-30 00:00:00,2015-10-30 00:00:00,2015-11-30 00:00:00,2015-12-31 00:00:00
600018.XSHG,-0.048302,0.004241,-0.010535,-0.044675,-0.008911,0.053749,0.019745,0.139778,0.0,0.084899,...,0.300119,-0.062147,-0.146711,-0.092291,0.091926,-0.110972,0.024396,-0.058822,-0.03572,6.1469
600061.XSHG,-0.032409,0.093801,-0.035219,-0.003182,-0.020693,0.047157,0.0,0.0,0.0,1.358673,...,0.086844,0.016461,0.275977,-0.454785,-0.263339,-0.028961,0.216947,0.379384,0.047256,25.4813
600008.XSHG,0.080235,-0.019302,-0.024249,-0.030913,0.014803,0.027496,0.051955,0.152696,-0.01168,0.02233,...,0.045944,-0.05411,-0.02448,-0.236227,-0.127752,-0.078447,0.233831,-0.030353,-0.033211,4.8644
601877.XSHG,0.083473,-0.061325,-0.03009,-0.09063,0.039862,0.13556,-0.023716,0.062491,-0.009088,0.135165,...,-0.05619,0.033766,0.0,0.0,0.0,0.0,0.004727,-0.094979,-0.055311,24.8049
600482.XSHG,0.150253,-0.013242,-0.103758,0.014852,0.059397,0.095328,0.137375,0.187547,-0.068224,-0.006099,...,0.168191,0.846886,0.0,0.0,0.0,0.0,-0.097903,0.160608,0.060402,46.6912


### 四、将收益率填充到因子对应的下个月收益率列当中
* all_data, all_price

In [55]:
for i in range(len(all_data)):
  
  # 每个样本填充对应收益率
  stock = all_data.index[i]
  
  date = all_data.ix[i, 'date']
  
  # 在all_price里面寻找收益率
  if stock in all_price.index and date in all_price.columns:
    
    all_data.ix[i, 'next_month_return'] = all_price.loc[stock, date]

In [57]:
# 把收益率为空删除
all_data = all_data.dropna()

In [58]:
all_data

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense,date,next_month_return
000001.XSHE,7.126,0.9684,1.08537e+11,1.85809e+12,0.6756,13.0164,1.43,3.7345e+10,2.2038e+10,2014-01-30,-0.023689
000002.XSHE,5.3769,1.0571,8.12905e+10,1.91646e+11,1.7804,9.3272,0.56,6.34153e+10,5.396e+10,2014-01-30,-0.089430
000008.XSHE,114.328,4.6166,2.60187e+09,2.91761e+09,2.7598,3.0992,0.0561,2.28189e+08,2.06793e+08,2014-01-30,0.121334
000060.XSHE,28.8896,1.9053,1.13668e+10,1.91828e+10,0.9316,3.068,0.08,9.75875e+09,9.64269e+09,2014-01-30,0.079866
000063.XSHE,8.2098,2.0244,4.56162e+10,9.63637e+10,0.5568,2.561,0.16,5.45575e+10,5.70062e+10,2014-01-30,-0.009043
000069.XSHE,7.1736,1.4932,3.56303e+10,6.55072e+10,4.0417,13.7533,0.399,1.72365e+10,1.32602e+10,2014-01-30,-0.077550
000100.XSHE,6.2327,1.5054,2.13287e+10,5.31028e+10,2.1248,10.2132,0.149,6.12241e+10,6.05763e+10,2014-01-30,0.112002
000157.XSHE,111.858,0.935,3.89151e+10,7.0563e+10,4.387,9.2131,0.49,2.88971e+10,2.42468e+10,2014-01-30,-0.009907
000333.XSHE,10.7138,2.4057,7.90211e+10,9.53968e+10,7.2895,17.3695,3.73,9.39206e+10,8.7225e+10,2014-01-30,-0.169011
000338.XSHE,8.4296,1.262,3.49879e+10,5.65886e+10,4.0573,10.4072,1.35,4.33327e+10,4.00661e+10,2014-01-30,-0.002849


### 五、特征值和目标值处理

In [62]:
def mad(factor):
  """3倍中位数去极值
  """
  # 求出因子值的中位数
  med = np.median(factor)

  # 求出因子值与中位数的差值，进行绝对值
  mad = np.median(np.abs(factor - med))

  # 定义几倍的中位数上下限
  high = med + (3 * 1.4826 * mad)
  low = med - (3 * 1.4826 * mad)

  # 替换上下限以外的值
  factor = np.where(factor > high, high, factor)
  factor = np.where(factor < low, low, factor)
  return factor

def stand(factor):
    """标准化
    """
    mean = np.mean(factor)
    std = np.std(factor)
    return (factor - mean)/std

In [76]:
y = all_data['next_month_return']
x = all_data.drop(['next_month_return', 'date'], axis=1)
x_market_cap = x['market_cap']

In [77]:
x.head()

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense
000001.XSHE,7.126,0.9684,108537000000.0,1858090000000.0,0.6756,13.0164,1.43,37345000000.0,22038000000.0
000002.XSHE,5.3769,1.0571,81290500000.0,191646000000.0,1.7804,9.3272,0.56,63415300000.0,53960000000.0
000008.XSHE,114.328,4.6166,2601870000.0,2917610000.0,2.7598,3.0992,0.0561,228189000.0,206793000.0
000060.XSHE,28.8896,1.9053,11366800000.0,19182800000.0,0.9316,3.068,0.08,9758750000.0,9642690000.0
000063.XSHE,8.2098,2.0244,45616200000.0,96363700000.0,0.5568,2.561,0.16,54557500000.0,57006200000.0


In [78]:
# 1、特征值处理
# 去极值、标准化、中性化
for name in x.columns:
  x[name] = mad(x[name])
  x[name] = stand(x[name])

In [79]:
x.head()

Unnamed: 0,pe_ratio,pb_ratio,market_cap,ev,return_on_asset_net_profit,du_return_on_equity,earnings_per_share,revenue,total_expense
000001.XSHE,-0.770255,-1.04225,1.5755,1.77523,-0.7284,1.01763,2.18913,1.61114,1.22293
000002.XSHE,-0.824801,-1.00937,0.855849,1.77523,-0.319151,0.427776,0.670299,1.61114,1.55947
000008.XSHE,1.89736,0.310218,-1.22255,-1.13321,0.0436467,-0.56799,-0.850114,-0.96684,-0.975081
000060.XSHE,-0.0915574,-0.694924,-0.991041,-0.854831,-0.63357,-0.572979,-0.778001,-0.200865,-0.0250552
000063.XSHE,-0.736457,-0.650771,-0.0864143,0.466137,-0.772406,-0.654041,-0.536617,1.61114,1.55947


In [80]:
for name in x.columns:
  
    if name == "market_cap":
      continue

    # 准备特征值、目标值
    # x_market_cap
    y_factor = x[name]

    # 线性回归方程建立
    lr = LinearRegression()

    lr.fit(x_market_cap.values.reshape(-1, 1), y_factor)

    y_predict = lr.predict(x_market_cap.values.reshape(-1, 1))

    # 得出真实值与预测之间的误差当做新的因子值
    x[name] = y_factor - y_predict

In [81]:
# 收益率目标值y 
y = stand(y)

### 六、建立特征值因子数据（处理过的） 与目标值（标准化） 下期收益率 之间的回归方程

In [83]:
lr = LinearRegression()

In [84]:
lr.fit(x, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [85]:
lr.coef_

array([ 0.02953221, -0.04920124, -0.10791485,  0.00801783, -0.03613599,
        0.1310877 , -0.03030564,  0.40286239, -0.30166898])