In [3]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score # K折交叉验证模块
from sklearn.preprocessing import MinMaxScaler

import QUANTAXIS as QA
import pandas as pd
import numpy as np
import pyecharts
import talib

# Set the default figure size for matplotlib plots.
import matplotlib
matplotlib.rcParams["figure.figsize"]=[16,5]

# Use a sans-serif family and list CJK-named fonts first — presumably so that
# Chinese labels render; confirm the fonts are installed on the target machine.
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = ['Noto Sans CJK SC','SimHei']
matplotlib.rcParams['axes.unicode_minus']=False # so that minus signs are displayed correctly

# Load seaborn and make its styling the default, with the same font list as above.
import seaborn as sns
sns.set(font=['Noto Sans CJK SC','SimHei'])

# Ask IPython to display the value of every top-level expression in a cell
# (the "all" setting) instead of only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor

In [1]:
# Run parameters for this notebook.
stock_code='601398'  # security code passed to the QUANTAXIS daily-bar fetch
benchmark_code='399300'  # benchmark code; not referenced by any cell visible in this notebook section
start_time='2005-01-01'  # start of the requested date range
end_time='2018-12-31'  # end of the requested date range

In [4]:
# Fetch daily bars for `stock_code` over the requested date range through QUANTAXIS,
# apply `to_qfq()` (presumably forward price adjustment — confirm against the
# QUANTAXIS docs), then flatten the index and re-index the frame by `date` alone.
data_raw=QA.QA_fetch_stock_day_adv(stock_code, start_time, end_time).to_qfq().data.reset_index().set_index('date')
# Show the first row as a quick sanity check of the columns and the first available date.
data_raw.head(1)

Unnamed: 0_level_0,code,open,high,low,close,volume,amount,preclose,adj
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006-10-27,601398,1.992133,2.01557,1.910104,1.921823,44076540.0,8725310000.0,,0.585922


# 方法定义

In [16]:
def normalize_data(df):
    """Rebase ``df`` on its first row.

    Every row is divided by the first row (``df.iloc[0]``), so each column is
    expressed relative to its own first value.

    Args:
        df: Data to rebase; must support ``.iloc`` and have at least one row.

    Returns:
        A new object of the same shape as ``df`` holding the ratios.
    """
    baseline = df.iloc[0]
    return df / baseline


def min_max_sclar(df):
    """Rescale the columns of ``df`` with a freshly fitted ``MinMaxScaler``.

    The scaler is created with its default settings (per scikit-learn's
    documentation that maps each column to the 0-1 range — confirm if a
    different range is expected).

    Args:
        df: DataFrame whose columns are all passed to the scaler.

    Returns:
        A new DataFrame with the scaled values, keeping the column labels and
        the index of ``df``.
    """
    names = list(df.columns)
    scaled = MinMaxScaler().fit_transform(df[names])
    return pd.DataFrame(scaled, index=df.index, columns=names)

def create_valid_data(df,days)->pd.DataFrame:
    """Build a one-column frame holding the ``close`` value ``days`` rows ahead.

    For every row of ``df`` the result contains the ``close`` found ``days``
    rows later. Rows for which no such value exists (the last ``days`` rows,
    or rows whose shifted ``close`` is missing) are dropped.

    Args:
        df: DataFrame that must contain a ``close`` column. It is not modified.
        days: Number of rows to look ahead; also used as the label of the
            single output column.

    Returns:
        A DataFrame indexed like the surviving rows of ``df`` with one column
        named ``days``.

    Raises:
        ValueError: If ``df`` has no ``close`` column.
    """
    if 'close' not in df.columns:
        raise ValueError('数据中不包含 close 列。')
    # Shift only the column that is needed instead of copying and shifting the
    # whole frame; a negative shift pulls future values back onto the current row.
    future_close = df['close'].shift(-days)
    return pd.DataFrame({days: future_close}).dropna()

In [20]:
# Work on a copy so that `data_raw` itself is not modified by the cells below.
data=data_raw.copy()

以 5 日后收盘价相对当日收盘价的变化率（百分比）作为结果集 y。

结果集的列名即向后取的天数，下文中为 5。

将X，y数据合并后dropna()，之后再拆分为X，y。

X 为 **ASI** 指标的计算结果（包含 ASI 与 ASIT 两列）。

In [54]:
# Features: the ASI indicator output for `data`, with incomplete rows dropped.
X=QA.QA_indicator_ASI(data).dropna()
# Close price 5 rows ahead, computed only over the dates that survived in X.
y=create_valid_data(data[data.index.isin(X.index)],5)
# Target: percentage change from the current close to the close 5 rows ahead.
# The division aligns both series on the date index.
y=((y[5]/data.close-1)*100).to_frame()
# to_frame() leaves the column labelled 0 here (hence the rename); label it 5 again.
y=y.rename(columns={0:5})
# Join features and target on the date index and drop incomplete rows so that
# X and y end up with exactly the same rows, then split them apart again.
d=X.join(y).dropna()
X=d.drop(columns=5)
y=d[[5]]

In [57]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0_level_0,ASI,ASIT
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-12-15,10.461545,8.28348
2006-12-18,12.527926,8.887444
2006-12-19,13.61011,9.5348
2006-12-20,13.419587,10.251801
2006-12-21,12.784311,10.936613


In [58]:
# Preview the first rows of the target frame (column 5 = 5-row-ahead percentage change).
y.head()

Unnamed: 0_level_0,5
date,Unnamed: 1_level_1
2006-12-15,9.722222
2006-12-18,14.254386
2006-12-19,16.490486
2006-12-20,21.108742
2006-12-21,25.806452


In [64]:
from sklearn.model_selection import cross_val_score # K折交叉验证模块
warnings.filterwarnings('ignore')

def _default_regressors():
    """Return a fresh ``{name: estimator}`` mapping of the regressors compared by default.

    New, unfitted estimators are built on every call so that fitted state never
    leaks from one ``cross_validate`` run into the next.
    """
    return {
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'LinearRegression': LinearRegression(),
        'SVR': SVR(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'RandomForestRegressor': RandomForestRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'BaggingRegressor': BaggingRegressor(),
        'ExtraTreeRegressor': ExtraTreeRegressor(),
    }


def cross_validate(X,y,test_size=0.2,random_state=10,funcs=None,cv=30):
    """Compare several regressors on ``X``/``y`` with a hold-out score and a K-fold score.

    Each estimator is fitted on a train split and scored on the matching test
    split (``model_score``); it is then scored again with ``cross_val_score``
    over the full data, and the mean over the folds is reported (``cross_score``).

    NOTE(review): ``train_test_split`` is called with its remaining defaults,
    which per scikit-learn's documentation shuffles the rows. For time-ordered
    data that mixes past and future samples — confirm this is intended.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Share of the data held out for ``model_score``.
        random_state: Seed forwarded to ``train_test_split``.
        funcs: Mapping of display name to estimator. When ``None`` a fresh set
            of default regressors is created for this call. Estimators passed
            in are fitted in place.
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (30 folds by default, matching the previous hard-coded value).

    Returns:
        A DataFrame with the columns ``name``, ``model_score`` and
        ``cross_score``, sorted by ``cross_score`` in descending order.
    """
    # Build the defaults per call: a dict of estimator instances in the
    # signature would be created once and shared (and re-fitted) by every call.
    if funcs is None:
        funcs = _default_regressors()

    # Hold-out split used for the single-split score.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=random_state)
    cross_validation_report=[]

    for name, model in funcs.items():
        model.fit(X_train, y_train)
        model_score = model.score(X_test, y_test)
        # K-fold cross-validation over the full data set; keep the mean fold score.
        cross_score = cross_val_score(model, X, y, cv=cv).mean()
        cross_validation_report.append([name, model_score, cross_score])

    report = pd.DataFrame(cross_validation_report,
                          columns=['name','model_score','cross_score'])
    return report.sort_values('cross_score',ascending=False)

In [65]:
cross_validate(X,y)

Unnamed: 0,name,model_score,cross_score
4,LinearRegression,-0.004525,-0.04977
0,SVR,0.028348,-0.102438
7,GradientBoostingRegressor,-0.008583,-0.14401
8,KNeighborsRegressor,0.012409,-0.555606
5,AdaBoostRegressor,-0.036944,-0.606012
6,BaggingRegressor,-0.144779,-0.687044
3,RandomForestRegressor,-0.18004,-0.706975
2,ExtraTreeRegressor,-0.50409,-2.256453
1,DecisionTreeRegressor,-0.773751,-2.381113
