# 导入模块并改变工作目录

In [1]:
import pandas as pd
import numpy  as np
from sklearn import svm
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.decomposition import PCA
from prettytable import PrettyTable 
import talib
import time
% matplotlib inline

In [2]:
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *
init_notebook_mode() 

In [3]:
os.chdir('J:\Lifelong Learning\QUANT\SVM')

# 一、导入数据

定义数据导入并格式化的函数

In [4]:
def import_data(file):
    data = pd.read_csv(file, index_col=0, squeeze=True)
    data.index = pd.to_datetime(data.index)
    return data

#### 沪深300基本交易数据

In [5]:
asset = 'HS300'

In [6]:
data = import_data('%s.csv' % asset)  # 数据来源于wind

In [7]:
data.head()

Unnamed: 0_level_0,open,high,low,close,volume,value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2005-01-04,994.769,994.769,980.658,982.794,741286894,4431977000.0
2005-01-05,981.577,997.323,979.877,992.564,711910898,4529208000.0
2005-01-06,993.331,993.788,980.33,983.174,628802905,3921015000.0
2005-01-07,983.045,995.711,979.812,983.958,729869409,4737469000.0
2005-01-10,983.76,993.959,979.789,993.879,579169799,3762933000.0


In [8]:
x = data.index
y = data['close']
iplot({'data':[Scatter(x=x, y=y, mode='line', name=asset)],'layout':Layout(title=asset)},link_text='')

#### 标普指数、道琼指数、人民币兑美元汇率中间价、美元指数

In [9]:
international_data = import_data('international_data.csv')  # 数据来源于wind

In [None]:
international_data.head()

#### cpi与pmi月度数据

In [10]:
cpi = import_data('cpi.csv')
pmi = import_data('pmi.csv')  # 数据来源于国家统计局

注：cpi与pmi均是以去年同月为100计算的

In [None]:
cpi.head()

#### 货币供应量：M0，M1，M2

In [11]:
money_supply =import_data('money_supply.csv')  # 数据来源于国家统计局

数据中包含M0，M1，M2的数据以及同比增长率

In [None]:
money_supply.head()

#### 固定资产投资同比增长率，月度

In [12]:
fixed_asset_inv = import_data('fixed_asset_inv.csv')  # 数据来源于国家统计局

这里，固定资产投资为累计量，并且为了避免1月份春节给数据准确度造成影响，1/2月一起统计，所以1月份数据是缺失的。为了填补1月分数据的缺失，我们假定1/2月投资量是一样的。然后计算出每月新增固定资产投资，最后计算出每月的同比增长率。

In [13]:
Jan = fixed_asset_inv.isnull()
Feb = fixed_asset_inv.isnull().shift().fillna(False)
fixed_asset_inv[Jan] = list(fixed_asset_inv[Feb] / 2)
new_inv = fixed_asset_inv.diff()
new_inv[Jan] = list(fixed_asset_inv[Feb] / 2)
new_inv_pct = new_inv.pct_change(12).dropna()

In [None]:
new_inv_pct.head()

#### 工业增加值同比增长率，月度

这里，工业增加值也没有1月份的数据，1/2月一起统计的。由于数据本身已是同比增长率了，所以我们直接用2月份的同比增长率填充1月份的数据。

In [14]:
value_added = import_data('ind_value_added_change.csv')  # 数据来源于国家统计局
value_added.fillna(method='bfill', inplace=True)

In [None]:
value_added.head()

# 二、SVM训练并预测

#### 关于数据标准化

这里有一点需要注意，关于标准化的相关问题。在《SVM商品期货择时策略.pdf》文章中，作者提到了同一资产最高价与最低价标准化后，最低价反而比最高价大的情况（如下图）。这种情况的存在不足为奇，标准化后这两个数据谁高谁低并没有一个统一的答案，但这却反映了另外一个问题：就是各个特征标准化后，他们之间具有的大小关系（或者其他关系）被忽略掉了。就拿最高价和最低价为例，就每一个时点而言，最高价肯定不小于最低价，这种关系的变化可能是未来的价格走势的某种信号：最高价与最低价相等可能以为着未来价格震荡（瞎说的），而最高价远大于最低价，则可能意味着未来价格会上涨（也是瞎说的）。标准化后，最高价与最低价共同传达出的信号被剔除了，相当于我们舍弃掉了一个有用的信号。  
如何解决这个问题呢？ 
1. 特征之间具有关系且存在有用信号的，不对其做标准化处理。但是，如果存在特征没有进行标准化，其他特征的标准化就没有任何意义；另外，如何判断两个特征之间是否具有关系，该关系是否存在有用的信号？
2. 另一种更为可行的方法是，加入具有有用关系的特征的组合特征。比如最高价与最低价的比值，最高价与最低价的差值。但这个方法也有一个问题，就是如何构建特征之间的组合特征？

In [15]:
temp = (data - data.mean()) / data.std()
iplot({'data':[Scatter(x=temp.index, y=temp['high'], mode='line', name='high'), 
               Scatter(x=temp.index, y=temp['low'], mode='line', name='low')],
       'layout':Layout(title=asset)},link_text='')

#### 定义分析所用函数

- 滚动训练svm模型，并预测下一期涨跌

In [16]:
def SvmPredict(char_values, pol, window, z, decom, kernel_function, C, output, mode='rolling'):
    print('========'+ kernel_function + '=========', file=output)
    print('========'+ kernel_function + '=========')
    prediction = []
    for t in char_values.index[window:]:
        clf = svm.SVC(kernel=kernel_function, C=C)
        if mode == 'rolling':
            x = char_values[:t].ix[-(window+1):-1]
            y = pol[:t].ix[-(window+1):-1]
        elif mode == 'all':  # 此模式下用之前所有数据来训练模型
            x = char_values[:t].ix[:-1]
            y = pol[:t].ix[:-1]
        new_feature = char_values.ix[t]
        if z:
            x = (x - x.mean()) / x.std()
            new_feature = (new_feature - x.mean()) / x.std()
            if decom:
                pca = PCA(n_components=0.95)
                x = pca.fit_transform(x)
                new_feature = pca.transform([list(new_feature)])
        clf.fit(x, y)
        prediction.append(clf.predict(new_feature))
    prediction = pd.DataFrame(prediction, index=char_values.index[window:], columns=['prediction'])
    return prediction

SVM的代码做法参考：  
[scikit-learn官方文档](http://scikit-learn.org/stable/modules/svm.html)  
[sklearn.svm.SVC](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)  
PCA的代码做法参考：  
[scikit-learn中PCA的使用方法](http://doc.okbase.net/u012162613/archive/120946.html)  
[sklearn.decomposition.PCA](http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)

- 结果输出函数

In [17]:
def table_print(df, output):
    x = PrettyTable([df.index.name] + list(df.columns))
    x.align[df.index.name] = "l"
    for i in df.index:
        x.add_row([i] + list(df.ix[i]))
    print(x, file=output)
    print(x)

- 预测胜率统计函数

In [18]:
def analysis(pol, output, prediction, kernel_function):
    prediction['real'] = pol[window:]
    bingo = prediction['prediction'] == prediction['real']
    wrong = prediction['prediction'] != prediction['real']
    
    print('预测的准确率为：' + str(bingo.value_counts()[1] / prediction.shape[0]), file=output)
    print('预测的准确率为：' + str(bingo.value_counts()[1] / prediction.shape[0]))
    print('上涨次数占比为：' + str(prediction['real'].value_counts()[1] / prediction.shape[0]), file=output)
    print('上涨次数占比为：' + str(prediction['real'].value_counts()[1] / prediction.shape[0]))
    
    pre_rise = prediction['prediction'] == 1
    pre_fall = prediction['prediction'] == -1
    rea_rise = prediction['real'] == 1
    rea_fall = prediction['real'] == -1
    
    a = np.logical_and(rea_rise, pre_rise).sum()
    b = np.logical_and(rea_rise, pre_fall).sum()
    c = np.logical_and(rea_fall, pre_rise).sum()
    d = np.logical_and(rea_fall, pre_fall).sum()

    summary = pd.DataFrame([[a,c],[b,d]], columns=['real_rise', 'real_fall'], index=['predict_rise', 'predict_fall'])
    summary['sum'] = summary.sum(axis=1)

    temp = pd.DataFrame(summary.sum(), columns=['sum'])
    summary = pd.concat([summary, temp.T], axis=0)
    summary.index.name = kernel_function
    ratio = summary/summary.ix['sum']
    ratio.columns = ['rise', 'fall', 'total']
    summary = pd.concat([summary, ratio], axis=1)
    table_print(summary, output)
    return summary

- 绘图函数

In [19]:
def combine_plot(close, prediction, kernel, window):
    signals = prediction['prediction'].shift(1)
    close = close[signals.index]
    close_rise = close[signals==1]
    close_rise.name = 'close_rise'
    close_fall = close[signals==-1]
    close_fall.name = 'close_fall'
    close = pd.concat([close, close_rise, close_fall], axis=1)
    x = close.index
    iplot({'data':[Scatter(x=x, y=close['close'], mode='line', name='HS300'), 
                   Scatter(x=x, y=close['close_rise'], mode='markers', name='predict to rise'), 
                   Scatter(x=x, y=close['close_fall'], mode='markers', name='predict to fall')],
           'layout':Layout(title=kernel + '  ' + str(window))},link_text='')

- 回测函数

In [20]:
def backTest(signals, open_close, start_value, fee_rate, margin):
    date = [i.strftime('%Y-%m-%d') for i in signals.index]
    position = dict()
    position[date[0]] = 0
    deal = dict()
    Volume = dict()
    Volume[date[0]] = 0
    net_value = dict()
    net_value[date[0]] = start_value
    stop = False
    for t in date[:-1]:
        next_day = date[date.index(t) + 1]
        signal = signals.ix[t]
        if net_value[t] <= 0:
            stop = True
            stop_date = t
            break
        if (position[t] == 0) and (signal == -1):
            position[next_day] = -1
            deal[next_day] = -1
            volume = int(net_value[t] / (open_close['open'][next_day] * (margin + fee_rate)) / 100.) * 100
            Volume[next_day] = volume
            net_value[next_day] = net_value[t] - volume * open_close['open'][next_day] * fee_rate
        elif (position[t] == 0) and (signal == 1):
            position[next_day] = 1
            deal[next_day] = 1
            volume = int(net_value[t] / (open_close['open'][next_day] * (margin + fee_rate)) / 100.) * 100
            Volume[next_day] = volume
            net_value[next_day] = net_value[t] - volume * open_close['open'][next_day] * fee_rate
        elif (position[t] == -1) and (signal == -1):
            position[next_day] = -1
            net_value[next_day] = net_value[t] + Volume[t] * position[t] * (open_close['open'][next_day] - open_close['open'][t])
            Volume[next_day] = Volume[t]
        elif (position[t] == -1) and (signal == 1):
            position[next_day] = 1
            deal[next_day] = 1
            value = (net_value[t] + 
                     Volume[t] * position[t] * (open_close['open'][next_day] - open_close['open'][t]) - 
                     Volume[t] * open_close['open'][next_day] * fee_rate)
            volume = int(value / (open_close['open'][next_day] * (margin + fee_rate)) / 100.) * 100
            Volume[next_day] = volume
            net_value[next_day] = value - volume * open_close['open'][next_day] * fee_rate
        elif (position[t] == 1) and (signal == -1):
            position[next_day] = -1
            deal[next_day] = -1
            value = (net_value[t] + 
                     Volume[t] * position[t] * (open_close['open'][next_day] - open_close['open'][t]) - 
                     Volume[t] * open_close['open'][next_day] * fee_rate)
            volume = int(value / (open_close['open'][next_day] * (margin + fee_rate)) / 100) * 100
            Volume[next_day] = volume
            net_value[next_day] = value - volume * open_close['open'][next_day] * fee_rate
        elif (position[t] == 1) and (signal == 1):
            position[next_day] =1
            net_value[next_day] = net_value[t] + Volume[t] * position[t] * (open_close['open'][next_day] - open_close['open'][t])
            Volume[next_day] = Volume[t]
    if stop:
        del position[stop_date], Volume[stop_date], net_value[stop_date]
        if stop_date in deal.keys():
            del deal[stop_date]
    return pd.Series(position), pd.Series(deal), pd.Series(Volume), pd.Series(net_value)

- 回撤函数

In [21]:
def draw_back(cum_gain, mode):
    if mode == 'A':  # 回撤数值，绝对量
        cum_max = cum_gain.cummax()
        draw_back_amount = cum_max - cum_gain
        return draw_back_amount
    elif mode == 'R':  # 回撤比例，相对值
        cum_max = cum_gain.cummax()
        draw_back_portion = (cum_max - cum_gain) / cum_max
        return draw_back_portion

- 回测结果汇总函数

In [22]:
def analysis_2(cum_gain, output):
    day_num = len(cum_gain)  # 回测时长（天）
    
    net_gain = int(cum_gain.ix[-1] - cum_gain.ix[0])  # 净利润
    annualized_return = (cum_gain.ix[-1] / cum_gain.ix[0] - 1) * 250. / float(day_num) * 100  # 年化收益率
    compound_annualized_return = ((cum_gain[-1] / cum_gain[0]) ** (250 / day_num) - 1) * 100  # 复合年化收益率
    max_draw_back = draw_back(cum_gain, mode='R').max() * 100  # 最大回撤
    annualized_std = cum_gain.pct_change().std() * 250 ** 0.5 * 100  # 年化波动率
    sharpe_ratio = float(compound_annualized_return / annualized_std)  # 夏普比率
    start_value = int(cum_gain.ix[0])  # 期初权益
    end_value = int(cum_gain.ix[-1])  # 期末权益
    start_time = cum_gain.index[0]  # 起始时间
    end_time = cum_gain.index[-1]  # 终止时间
    return_analysis = pd.DataFrame([net_gain,'%d%s' % (int(compound_annualized_return), '%'),'%d%s' %(int(max_draw_back),'%'),
                                    '%d%s'%(int(annualized_std),'%'),round(sharpe_ratio,2),start_value,end_value,
                                    start_time,end_time,'%d天'%day_num])
    return_analysis.index = ['净利润','复合年化收益率','最大回撤','波动率','夏普比率','期初权益','期末权益',
                             '起始时间','终止时间','回测时长']
    return_analysis.columns = ['收益率分析']
    table_print(return_analysis.T, output)
    return return_analysis.T

# 尝试一

### 1. 预测日度收益

#### 计算相关特征值

In [85]:
try_id = '尝试一'
cycle = 'D'

In [87]:
feature_set = 'group3'
predict_window = 'W'

In [89]:
if (feature_set == 'group1') or (feature_set == 'group2') or (feature_set == 'group3'):
    feature = data.copy(deep=True)
    # 基本指标

if (feature_set == 'group2') or (feature_set == 'group3'):
    feature['returns'] = data['close'].pct_change()
    feature['ma1'] = data['close'].rolling(window=5).mean()
    feature['ma2'] = data['close'].rolling(window=20).mean()
    feature['var'] = data['close'].rolling(window=20).std()
    feature['max_high'] = data['high'].rolling(window=5).max()
    feature['min_low'] = data['low'].rolling(window=5).min()
    feature['ma_volume'] = data['volume'].rolling(window=5).mean()
    feature['ma_value'] = data['value'].rolling(window=5).mean()
    # 衍生指标
if feature_set == 'group3':
    a, b, c = talib.MACD(np.array(data['close']))
    feature['MACD'] = pd.Series(a, index=data.index)
    feature['MACDsignal'] = pd.Series(b, index=data.index)
    feature['MACDhist'] = pd.Series(c, index=data.index)

    sar = talib.SAR(np.array(data['high']), np.array(data['low']))
    feature['SAR'] = pd.Series(sar, index=data.index)

    mom = talib.MOM(np.array(data['close']))
    feature['MOM'] = pd.Series(mom, index=data.index)

    rsi = talib.MOM(np.array(data['close']))
    feature['RSI'] = pd.Series(rsi, index=data.index)

    x, y, z = talib.BBANDS(np.array(data['close']))
    feature['Upper'] = pd.Series(x, index=data.index)
    feature['Middl'] = pd.Series(y, index=data.index)
    feature['Lower'] = pd.Series(z, index=data.index)
    # 技术指标
if predict_window == 'D':
    feature['follow_return'] = data['close'].pct_change().shift(-1)
elif predict_window == 'W':
    feature['follow_return'] = data['close'].pct_change(5).shift(-5)

feature.dropna(axis=0, how='any', inplace=True)
pol = feature['follow_return'].copy(deep=True)
pol[pol > 0] = 1
pol[pol <= 0] = -1
del feature['follow_return']

In [90]:
feature.head()

Unnamed: 0_level_0,open,high,low,close,volume,value,returns,ma1,ma2,var,...,ma_value,MACD,MACDsignal,MACDhist,SAR,MOM,RSI,Upper,Middl,Lower
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-03-01,1039.351,1042.737,1031.168,1035.934,996209016,6288791000.0,-0.003899,1042.4176,1009.6101,30.037597,...,9238909000.0,15.990289,11.630691,4.359598,1023.260732,19.076,19.076,1050.336508,1042.4176,1034.498692
2005-03-02,1036.368,1045.76,1021.004,1021.32,1422513553,8971308000.0,-0.014107,1037.8928,1010.76945,30.018769,...,8786783000.0,14.409325,12.186417,2.222908,1059.483,-2.264,-2.264,1056.196814,1037.8928,1019.588786
2005-03-03,1019.876,1028.402,1014.752,1027.71,920308564,5582812000.0,0.006257,1034.3428,1012.2662,30.082977,...,8104595000.0,13.516213,12.452376,1.063836,1058.71342,7.104,7.104,1052.280705,1034.3428,1016.404895
2005-03-04,1027.927,1031.847,1022.176,1023.667,817364167,5383868000.0,-0.003934,1029.724,1013.95315,29.708046,...,6883376000.0,12.339932,12.429888,-0.089956,1056.954963,17.612,17.612,1044.015396,1029.724,1015.432604
2005-03-07,1024.48,1031.344,1024.48,1029.871,793294979,5316473000.0,0.006061,1027.7004,1016.71515,28.398819,...,6308650000.0,11.772624,12.298435,-0.525811,1055.266845,4.238,4.238,1037.878085,1027.7004,1017.522715


In [91]:
close = data['close']

#### 滚动训练预测

In [92]:
start_value = 1000000
fee_rate = 0.001
margin = 1

In [93]:
z = True  # 是否标准化特征数据
decom = True  # 是否进行pca降维
window_list = [60, 120]  # 设定训练数据时间长度（周或者日）
kernel_list = ['rbf', 'linear', 'sigmoid']  # 核函数
C = 10  # [1, 5, 10]

In [None]:
output = open('%s_数据频率%s_预测长度%s_标准化%s_PCA%s.txt' % (try_id, cycle,predict_window, str(z), str(decom)), 'w+')
for window in window_list:
    print('\n训练数据长度：%d' % window, file=output)
    print('\n训练数据长度：%d' % window)
    for f in kernel_list:
        prediction = SvmPredict(feature, pol, window, z, decom, f, C, output)
        signals = prediction['prediction'].copy(deep=True)
        _ = analysis(pol, output, prediction, f)
        combine_plot(close, prediction, f, window)
        open_close = data.ix[signals.index][['open', 'close']]
        position, deal, Volume, net_value = backTest(signals, open_close, start_value, fee_rate, margin)
        _ = analysis_2(net_value, output)
        iplot({'data':[Scatter(x=net_value.index, y=net_value, mode='line', name='cum_gain')],
               'layout':Layout(title='累积净值 \n' + f + ' ' + str(window))},link_text='')
output.close()


训练数据长度：60
预测的准确率为：0.560097833683
上涨次数占比为：0.556953179595
+--------------+-----------+-----------+--------+----------------+----------------+----------------+
| rbf          | real_rise | real_fall |  sum   |      rise      |      fall      |     total      |
+--------------+-----------+-----------+--------+----------------+----------------+----------------+
| predict_rise |   1020.0  |   685.0   | 1705.0 | 0.639899623588 | 0.540220820189 | 0.595737246681 |
| predict_fall |   574.0   |   583.0   | 1157.0 | 0.360100376412 | 0.459779179811 | 0.404262753319 |
| sum          |   1594.0  |   1268.0  | 2862.0 |      1.0       |      1.0       |      1.0       |
+--------------+-----------+-----------+--------+----------------+----------------+----------------+


+------------+----------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
|    None    |  净利润  | 复合年化收益率 | 最大回撤 | 波动率 | 夏普比率 | 期初权益 | 期末权益 |  起始时间  |  终止时间  | 回测时长 |
+------------+----------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
| 收益率分析 | 98019346 |      49%       |   32%    |  27%   |   1.8    | 1000000  | 99019346 | 2005-05-31 | 2017-03-06 |  2862天  |
+------------+----------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+


预测的准确率为：0.505590496157
上涨次数占比为：0.556953179595
+--------------+-----------+-----------+--------+----------------+----------------+----------------+
| linear       | real_rise | real_fall |  sum   |      rise      |      fall      |     total      |
+--------------+-----------+-----------+--------+----------------+----------------+----------------+
| predict_rise |   709.0   |   530.0   | 1239.0 | 0.444792973651 | 0.417981072555 | 0.432914046122 |
| predict_fall |   885.0   |   738.0   | 1623.0 | 0.555207026349 | 0.582018927445 | 0.567085953878 |
| sum          |   1594.0  |   1268.0  | 2862.0 |      1.0       |      1.0       |      1.0       |
+--------------+-----------+-----------+--------+----------------+----------------+----------------+


+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
|    None    |  净利润 | 复合年化收益率 | 最大回撤 | 波动率 | 夏普比率 | 期初权益 | 期末权益 |  起始时间  |  终止时间  | 回测时长 |
+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
| 收益率分析 | -632109 |      -8%       |   79%    |  21%   |  -0.39   | 1000000  |  367890  | 2005-05-31 | 2017-03-06 |  2862天  |
+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+


预测的准确率为：0.566736547869
上涨次数占比为：0.556953179595
+--------------+-----------+-----------+--------+----------------+----------------+----------------+
| sigmoid      | real_rise | real_fall |  sum   |      rise      |      fall      |     total      |
+--------------+-----------+-----------+--------+----------------+----------------+----------------+
| predict_rise |   1041.0  |   687.0   | 1728.0 | 0.653074027604 | 0.541798107256 | 0.603773584906 |
| predict_fall |   553.0   |   581.0   | 1134.0 | 0.346925972396 | 0.458201892744 | 0.396226415094 |
| sum          |   1594.0  |   1268.0  | 2862.0 |      1.0       |      1.0       |      1.0       |
+--------------+-----------+-----------+--------+----------------+----------------+----------------+


+------------+----------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
|    None    |  净利润  | 复合年化收益率 | 最大回撤 | 波动率 | 夏普比率 | 期初权益 | 期末权益 |  起始时间  |  终止时间  | 回测时长 |
+------------+----------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
| 收益率分析 | 68004329 |      44%       |   32%    |  27%   |   1.62   | 1000000  | 69004329 | 2005-05-31 | 2017-03-06 |  2862天  |
+------------+----------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+



训练数据长度：120


## 尝试二

在尝试一上加入基本面数据，数据频率使用月度数据

In [146]:
try_id = '尝试二'
cycle = 'M'

#### 计算特征向量

In [147]:
resample_data = data.resample(cycle)
close = resample_data['close'].last()

resample_high = resample_data['high'].max().dropna()  # 周最高价
resample_low = resample_data['low'].min().dropna()  # 周最低价
resample_return = resample_data['close'].last().dropna().pct_change()  # 周收益率
resample_value = resample_data['value'].sum().dropna()  # 周成交额

char_values = pd.DataFrame()
char_values['high'] = resample_high
char_values['low'] = resample_low
char_values['return'] = resample_return
char_values['value'] = resample_value
char_values['p_return'] = resample_return.shift(1)  # 上周收益率
char_values['pp_return'] = resample_return.shift(2)  # 上上周收益率
char_values['p_value'] = resample_value.shift(1)  # 上周成交额 
char_values['m_value'] = resample_value.rolling(window=4).mean()  # 前四周平均成交额
char_values['follow_return'] = resample_return.shift(-1)

In [148]:
char_values.head()

Unnamed: 0_level_0,high,low,return,value,p_return,pp_return,p_value,m_value,follow_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-01-31,1006.463,943.439,,91007760000.0,,,,,0.089132
2005-02-28,1059.483,952.741,0.089132,108735700000.0,,,91007760000.0,,-0.094026
2005-03-31,1054.64,928.335,-0.094026,153976900000.0,0.089132,,108735700000.0,,-0.01041
2005-04-30,1008.735,914.829,-0.01041,161126300000.0,-0.094026,0.089132,153976900000.0,128711600000.0,-0.081992
2005-05-31,937.39,842.102,-0.081992,77564120000.0,-0.01041,-0.094026,161126300000.0,125350700000.0,0.026567


In [157]:
char_values = pd.concat([char_values, money_supply.shift(1), international_data.resample(cycle).last()], axis=1)  
# 货币供应量滞后一期是因为在每月月底还不知道当月的货币供给量，在下月月中才知道；下面的同理
char_values['CPI'] = cpi.shift(1)
char_values['PMI'] = pmi.shift(1)
char_values['value_added'] = value_added.shift(1)
char_values['inv'] = fixed_asset_inv.shift(1)

In [158]:
char_values.head()

Unnamed: 0_level_0,high,low,return,value,p_return,pp_return,p_value,m_value,follow_return,M2,...,M0,M0_Change,SPX,DJI,USDCNY,USDX,CPI,PMI,value_added,inv
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-31,1006.463,943.439,,91007760000.0,,,,,0.089132,,...,,,1181.27,10489.94,8.2765,83.6,,,,58620.28
2005-02-28,1059.483,952.741,0.089132,108735700000.0,,,91007760000.0,,-0.094026,257708.47,...,24015.41,7.8,1203.6,10766.23,8.2765,82.51,101.9,110.7,7.6,2110.89
2005-03-31,1054.64,928.335,-0.094026,153976900000.0,0.089132,,108735700000.0,,-0.01041,259357.29,...,22667.97,14.0,1180.59,10503.76,8.2765,84.06,103.9,109.8,7.6,4221.78
2005-04-30,1008.735,914.829,-0.01041,161126300000.0,-0.094026,0.089132,153976900000.0,128711600000.0,-0.081992,264588.94,...,21238.95,10.1,1156.85,10192.51,8.2765,84.43,102.7,109.7,15.1,9036.68
2005-05-31,937.39,842.102,-0.081992,77564120000.0,-0.01041,-0.094026,161126300000.0,125350700000.0,0.026567,266992.66,...,21666.56,9.0,1191.5,10467.48,8.2765,87.76,101.8,109.9,16.0,14024.67


#### 下一期涨跌标记

In [159]:
pol = np.sign(char_values['follow_return'])

#### 对齐数据

In [160]:
char_values['pol'] = pol
char_values.dropna(axis=0, how='any', inplace=True)
pol = char_values['pol']
del char_values['pol']

In [161]:
char_values.head()

Unnamed: 0_level_0,high,low,return,value,p_return,pp_return,p_value,m_value,follow_return,M2,...,M0,M0_Change,SPX,DJI,USDCNY,USDX,CPI,PMI,value_added,inv
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-04-30,1008.735,914.829,-0.01041,161126300000.0,-0.094026,0.089132,153976900000.0,128711600000.0,-0.081992,264588.94,...,21238.95,10.1,1156.85,10192.51,8.2765,84.43,102.7,109.7,15.1,9036.68
2005-05-31,937.39,842.102,-0.081992,77564120000.0,-0.01041,-0.094026,161126300000.0,125350700000.0,0.026567,266992.66,...,21666.56,9.0,1191.5,10467.48,8.2765,87.76,101.8,109.9,16.0,14024.67
2005-06-30,925.365,807.784,0.026567,168157000000.0,-0.081992,-0.01041,77564120000.0,140206100000.0,0.010787,269240.49,...,20811.59,9.3,1191.33,10274.97,8.2765,89.09,101.8,110.0,16.6,19719.32
2005-07-31,900.997,818.863,0.010787,117223900000.0,0.026567,-0.081992,168157000000.0,131017800000.0,0.044757,275785.53,...,20848.76,9.6,1234.18,10640.91,8.1056,89.35,101.6,109.0,16.8,27967.0
2005-08-31,959.343,885.85,0.044757,218259100000.0,0.010787,0.026567,117223900000.0,145301000000.0,-0.011342,276966.28,...,21171.2,9.1,1220.33,10481.6,8.0998,87.53,101.8,108.5,16.1,34637.16


In [162]:
z = True  # 是否标准化特征数据
decom = True  # 是否进行pca降维
window_list = [12, 24]  # 设定训练数据时间长度（周或者日）
kernel_list = ['rbf', 'linear', 'sigmoid', 'poly']  # 核函数
C = 10  # [1, 5, 10]

In [163]:
output = open('%s_数据频率%s_标准化%s_PCA%s.txt' % (try_id, cycle, str(z), str(decom)), 'w+')
for window in window_list:
    print('\n训练数据长度：%d' % window, file=output)
    print('\n训练数据长度：%d' % window)
    for f in kernel_list:
        prediction = SvmPredict(char_values, pol, window, z, decom, f, C, output)
        signals = prediction['prediction'].copy(deep=True)
        _ = analysis(pol, output, prediction, f)
        combine_plot(close, prediction, f, window)
        open_close = data.ix[signals.index][['open', 'close']]
        position, deal, Volume, net_value = backTest(signals, open_close, start_value, fee_rate, margin)
        _ = analysis_2(net_value, output)
        iplot({'data':[Scatter(x=net_value.index, y=net_value, mode='line', name='cum_gain')],
               'layout':Layout(title='累积净值 \n' + f + ' ' + str(window))},link_text='')
output.close()


训练数据长度：12
预测的准确率为：0.527131782946
上涨次数占比为：0.565891472868
+--------------+-----------+-----------+-------+----------------+----------------+----------------+
| rbf          | real_rise | real_fall |  sum  |      rise      |      fall      |     total      |
+--------------+-----------+-----------+-------+----------------+----------------+----------------+
| predict_rise |    44.0   |    32.0   |  76.0 | 0.602739726027 | 0.571428571429 | 0.589147286822 |
| predict_fall |    29.0   |    24.0   |  53.0 | 0.397260273973 | 0.428571428571 | 0.410852713178 |
| sum          |    73.0   |    56.0   | 129.0 |      1.0       |      1.0       |      1.0       |
+--------------+-----------+-----------+-------+----------------+----------------+----------------+


ValueError: cannot convert float NaN to integer

In [167]:
data

Unnamed: 0_level_0,pre_close,open,high,low,close,volume,value,change,pct_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-01-04,1000.0000,994.7690,994.7690,980.6580,982.7940,741286894,4.431977e+09,-17.2060,-1.7206
2005-01-05,982.7940,981.5770,997.3230,979.8770,992.5640,711910898,4.529208e+09,9.7700,0.9941
2005-01-06,992.5640,993.3310,993.7880,980.3300,983.1740,628802905,3.921015e+09,-9.3900,-0.9460
2005-01-07,983.1740,983.0450,995.7110,979.8120,983.9580,729869409,4.737469e+09,0.7840,0.0797
2005-01-10,983.9580,983.7600,993.9590,979.7890,993.8790,579169799,3.762933e+09,9.9210,1.0083
2005-01-11,993.8790,994.1890,999.5540,991.0920,997.1350,584907998,3.704077e+09,3.2560,0.3276
2005-01-12,997.1350,996.6510,996.9760,989.2570,996.7480,501452509,3.093300e+09,-0.3870,-0.0388
2005-01-13,996.7480,996.0780,999.4730,992.6950,996.8770,604406584,3.842173e+09,0.1290,0.0129
2005-01-14,996.8770,996.6170,1006.4630,987.2330,988.3060,729784238,4.162921e+09,-8.5710,-0.8598
2005-01-17,988.3060,979.1110,981.5250,965.0780,967.4520,728818876,4.249808e+09,-20.8540,-2.1101


## 尝试三

In [None]:
try_id = '尝试三'
cycle = 'D'

In [None]:
window = 15
mean_close = data['close'].rolling(window=window).mean()  # 前15日均价
mean_vol = data['volume'].rolling(window=window).mean()  # 前15日均量
Return = data['close'].pct_change(window)  # 前15日收益率
S = data['close'].rolling(window=window).std()  # 前15日标准差
close = data['close']

char_values = pd.DataFrame()
char_values['close/mean'] = data['close'] / mean_close
char_values['high/mean'] = data['high'] / mean_close
char_values['low/mean'] = data['low'] / mean_close
char_values['vol/mean'] = data['volume'] / mean_vol
char_values['return'] = Return
char_values['price'] = data['close']
char_values['std'] = S
char_values['vol'] = data['volume']
char_values['follow_return'] = data['close'].pct_change(5).shift(-5)

In [None]:
pol = np.sign(char_values['follow_return'])

In [None]:
char_values['pol'] = pol
char_values.dropna(axis=0, how='any', inplace=True)
pol = char_values['pol']
del char_values['pol']

In [None]:
char_values.head()

In [None]:
z = True  # 是否标准化特征数据
decom = True  # 是否进行pca降维
window_list = [60,120]  # 设定训练数据时间长度（周或者日）
kernel_list = ['rbf', 'sigmoid', 'poly']  # 核函数
C = 1  # [1, 5, 10]

In [None]:
output = open('%s_数据频率%s_标准化%s_PCA%s.txt' % (try_id, cycle, str(z), str(decom)), 'w+')
for window in window_list:
    print('\n训练数据长度：%d' % window, file=output)
    print('\n训练数据长度：%d' % window)
    for f in kernel_list:
        prediction = SvmPredict(char_values, pol, window, z, decom, f, C, output)
        _ = analysis(pol, output, *prediction)
        combine_plot(close, prediction, window)
output.close()

## 尝试四

In [78]:
try_id = '尝试四'
cycle = 'D'

In [79]:
n1 = 10
n2 = 20

In [80]:
Close_mean1 = data['close'].rolling(window=n1).mean()
Close_mean1.name = 'MA1'
Close_mean2 = data['close'].rolling(window=n2).mean()
Close_mean2.name = 'MA2'
Var = data['close'].pct_change().rolling(window=n2).std()
Distance = (data['close'] - Close_mean2) / Var
Distance.name = 'Distance'

daily_change = data['close'].diff()
UP = []
DN = []
for i in range(len(daily_change)-n1):
	temp = daily_change[i:n1+i]
	up = temp[temp > 0].sum() / n1
	dn = temp[temp < 0].sum() / n1
	UP.append(up)
	DN.append(dn)
UP = pd.Series(UP, index=daily_change.index[n1:], name='UP')
DN = pd.Series(DN, index=daily_change.index[n1:], name='DOWN')
RSI = 100 * UP / (UP - DN)
RSI.name = 'RSI'

Max_high = data['high'].rolling(window=n1).max()
Max_high.name = 'Max_high'
Min_low = data['low'].rolling(window=n1).min()
Min_low.name = 'Min_low'

Vol_mean = data['volume'].rolling(window=n1).mean()
Relative_Vol = data['volume'] / Vol_mean
Relative_Vol.name = 'rVol'

MMC = np.array(data[['low', 'high', 'close']])
CM = np.array(Close_mean1).reshape((len(Vol_mean),1))
Relative_price = MMC / CM
T = pd.DataFrame(Relative_price, index=data.index, columns=['rLow', 'rHigh', 'rClose'])

Return = data['close'].pct_change(periods=15)
Return.name = 'Return'

Daily_return = data['close'].pct_change().shift(-1)
Daily_return[Daily_return>0] = 1
Daily_return[Daily_return<=0] = -1
PoL_D = Daily_return
PoL_D.name = 'PoL_D' # profit or loss

Weekly_return = data['close'].pct_change(periods=5).shift(-5)
Weekly_return[Weekly_return>0] = 1
Weekly_return[Weekly_return<=0] = -1
PoL_W = Weekly_return
PoL_W.name = 'PoL_W'

temp = pd.concat([Distance, data['close'],Close_mean1,Close_mean2,
                        RSI,Max_high,Min_low,Relative_Vol,Return,PoL_D,PoL_W],
                        axis =1)
clean_data  = temp.dropna(how='any')
char_values = clean_data[['Distance','close','MA1','MA2','RSI','Max_high','Min_low','rVol','Return']]

close = char_values['close']

pol = clean_data['PoL_D']

In [81]:
char_values.head()

Unnamed: 0_level_0,Distance,close,MA1,MA2,RSI,Max_high,Min_low,rVol,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-02-01,-2151.040556,955.951,974.6565,981.06735,44.327441,1001.854,943.439,0.851279,-0.041302
2005-02-02,1496.928497,1006.913,978.6268,981.7848,41.047985,1006.932,943.439,1.73991,0.010198
2005-02-03,644.908723,993.215,982.3238,982.28685,63.400409,1014.187,943.439,1.582218,-0.003673
2005-02-04,1858.1764,1016.858,985.7492,983.93185,62.251945,1021.025,952.741,1.449416,0.02889
2005-02-16,2162.303182,1023.584,988.2943,985.4171,61.559956,1033.248,952.741,1.14079,0.05802


In [82]:
start_value = 1000000
fee_rate = 0.001
margin = 1

In [83]:
z = True  # 是否标准化特征数据
decom = True  # 是否进行pca降维
window_list = [60]  # 设定训练数据时间长度（周或者日）
kernel_list = ['rbf', 'sigmoid', 'poly']  # 核函数
C = 1  # [1, 5, 10]

In [84]:
output = open('%s_数据频率%s_标准化%s_PCA%s.txt' % (try_id, cycle, str(z), str(decom)), 'w+')
for window in window_list:
    print('\n训练数据长度：%d' % window, file=output)
    print('\n训练数据长度：%d' % window)
    for f in kernel_list:
        prediction = SvmPredict(char_values, pol, window, z, decom, f, C, output)
        signals = prediction['prediction'].copy(deep=True)
        _ = analysis(pol, output, prediction, f)
        combine_plot(close, prediction, f, window)
        open_close = data.ix[signals.index][['open', 'close']]
        position, deal, Volume, net_value = backTest(signals, open_close, start_value, fee_rate, margin)
        _ = analysis_2(net_value, output)
        iplot({'data':[Scatter(x=net_value.index, y=net_value, mode='line', name='cum_gain')],
               'layout':Layout(title='累积净值 \n' + f + ' ' + str(window))},link_text='')
output.close()


训练数据长度：60
预测的准确率为：0.539826086957
上涨次数占比为：0.541913043478
+--------------+-----------+-----------+--------+----------------+----------------+----------------+
| rbf          | real_rise | real_fall |  sum   |      rise      |      fall      |     total      |
+--------------+-----------+-----------+--------+----------------+----------------+----------------+
| predict_rise |   1010.0  |   775.0   | 1785.0 | 0.648267008986 | 0.588458618071 | 0.620869565217 |
| predict_fall |   548.0   |   542.0   | 1090.0 | 0.351732991014 | 0.411541381929 | 0.379130434783 |
| sum          |   1558.0  |   1317.0  | 2875.0 |      1.0       |      1.0       |      1.0       |
+--------------+-----------+-----------+--------+----------------+----------------+----------------+


+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
|    None    |  净利润 | 复合年化收益率 | 最大回撤 | 波动率 | 夏普比率 | 期初权益 | 期末权益 |  起始时间  |  终止时间  | 回测时长 |
+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
| 收益率分析 | 2943214 |      12%       |   56%    |  28%   |   0.44   | 1000000  | 3943214  | 2005-05-12 | 2017-03-06 |  2875天  |
+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+


预测的准确率为：0.546782608696
上涨次数占比为：0.541913043478
+--------------+-----------+-----------+--------+----------------+----------------+---------------+
| sigmoid      | real_rise | real_fall |  sum   |      rise      |      fall      |     total     |
+--------------+-----------+-----------+--------+----------------+----------------+---------------+
| predict_rise |   1077.0  |   822.0   | 1899.0 | 0.691270860077 | 0.624145785877 | 0.66052173913 |
| predict_fall |   481.0   |   495.0   | 976.0  | 0.308729139923 | 0.375854214123 | 0.33947826087 |
| sum          |   1558.0  |   1317.0  | 2875.0 |      1.0       |      1.0       |      1.0      |
+--------------+-----------+-----------+--------+----------------+----------------+---------------+


+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
|    None    |  净利润 | 复合年化收益率 | 最大回撤 | 波动率 | 夏普比率 | 期初权益 | 期末权益 |  起始时间  |  终止时间  | 回测时长 |
+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
| 收益率分析 | 6753602 |      19%       |   47%    |  28%   |   0.68   | 1000000  | 7753602  | 2005-05-12 | 2017-03-06 |  2875天  |
+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+


预测的准确率为：0.47547826087
上涨次数占比为：0.541913043478
+--------------+-----------+-----------+--------+----------------+----------------+----------------+
| poly         | real_rise | real_fall |  sum   |      rise      |      fall      |     total      |
+--------------+-----------+-----------+--------+----------------+----------------+----------------+
| predict_rise |   442.0   |   392.0   | 834.0  | 0.283697047497 | 0.297646165528 | 0.290086956522 |
| predict_fall |   1116.0  |   925.0   | 2041.0 | 0.716302952503 | 0.702353834472 | 0.709913043478 |
| sum          |   1558.0  |   1317.0  | 2875.0 |      1.0       |      1.0       |      1.0       |
+--------------+-----------+-----------+--------+----------------+----------------+----------------+


+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
|    None    |  净利润 | 复合年化收益率 | 最大回撤 | 波动率 | 夏普比率 | 期初权益 | 期末权益 |  起始时间  |  终止时间  | 回测时长 |
+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
| 收益率分析 | -709187 |      -10%      |   74%    |  22%   |  -0.46   | 1000000  |  290812  | 2005-05-12 | 2017-03-06 |  2875天  |
+------------+---------+----------------+----------+--------+----------+----------+----------+------------+------------+----------+
