In [19]:
import warnings
warnings.filterwarnings('ignore')
import jupyter_helper
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score # K折交叉验证模块
from sklearn.preprocessing import MinMaxScaler

import QUANTAXIS as QA
import pandas as pd
import numpy as np
import pyecharts

# Matplotlib defaults: figure size, font lookup order and minus-sign handling.
import matplotlib
matplotlib.rcParams.update({
    'figure.figsize': [16, 5],
    'font.family': 'sans-serif',
    'font.sans-serif': ['Noto Sans CJK SC', 'SimHei'],
    'axes.unicode_minus': False,  # so that minus signs are displayed properly
})

# Load seaborn and make it the default style, using the same font list.
import seaborn as sns
sns.set(font=['Noto Sans CJK SC', 'SimHei'])

# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
def create_validate_df_close(df: pd.DataFrame,
                             days: int,
                             column: str = 'close') -> pd.DataFrame:
    """Build the validation (target) frame for a look-ahead horizon.

    The target of each row is the value of ``column`` found ``days`` rows
    later, i.e. ``df[column].shift(-days)``. Rows whose shifted value is
    missing (the trailing ``days`` rows at least) are dropped.

    Args:
        df: Source data.
        days: Number of rows to look ahead; also used as the label of the
            single output column.
        column: Column whose future value is used.

    Returns:
        A single-column DataFrame labelled ``days`` whose index is the part
        of ``df.index`` that has a non-missing future value.

    Raises:
        ValueError: If ``column`` is not one of ``df.columns``.
    """
    if column not in df.columns:
        raise ValueError('数据中不包含 {} 列。'.format(column))
    future_values = df[column].shift(-days).values
    target = pd.DataFrame(data=future_values,
                          index=df.index,
                          columns=[days])
    return target.dropna()


def get_calc_data(stock_code, s, e, fq='qfq',
                  drop_columns=['code', 'preclose', 'adj'],
                  scaler=['amount', 'volume'],
                  scaler_func=MinMaxScaler):
    """Fetch daily bars for one stock and shape them into model input.

    Bars are fetched with ``QA.QA_fetch_stock_day_adv``, optionally
    price-adjusted, re-indexed by ``date``, stripped of ``drop_columns`` and
    finally rescaled on the ``scaler`` columns.

    Args:
        stock_code: Stock code passed to the fetch call.
        s: Start of the range passed to the fetch call.
        e: End of the range passed to the fetch call.
        fq: Price adjustment mode. ``'qfq'`` (default) uses the
            forward-adjusted data from ``to_qfq()``, ``'hfq'`` uses the
            backward-adjusted data from ``to_hfq()``; any other value
            (e.g. ``''``) keeps the unadjusted data.
        drop_columns: Columns to discard. Names that are not present in the
            selected data are skipped, so the default list also works for
            unadjusted data. Falsy means "drop nothing".
        scaler: Columns to rescale. Falsy means "no rescaling".
        scaler_func: Zero-argument factory returning an object with
            ``fit_transform`` (``MinMaxScaler`` by default).

    Returns:
        A new DataFrame indexed by ``date``.
    """
    raw_data = QA.QA_fetch_stock_day_adv(stock_code, s, e)

    # Pick the price series once instead of building the unadjusted frame
    # and then throwing it away.
    if fq == 'qfq':
        source = raw_data.to_qfq()
    elif fq == 'hfq':
        # Previously this branch called to_qfq(), so 'hfq' silently returned
        # forward-adjusted prices.
        source = raw_data.to_hfq()
    else:
        source = raw_data
    calc_data = source.data.reset_index().set_index('date').copy()

    if drop_columns:
        # errors='ignore': the default list names columns that may be absent
        # from the selected data (e.g. 'preclose'/'adj' when fq='').
        calc_data = calc_data.drop(columns=drop_columns, errors='ignore')
    if scaler:
        # NOTE(review): the scaler is fit on the whole fetched range, so any
        # later train/test split sees values scaled with test-period
        # information - confirm this is acceptable.
        calc_data[scaler] = scaler_func().fit_transform(calc_data[scaler])
    return calc_data

def packing_calc_data(df: pd.DataFrame, stock_code) -> pd.DataFrame:
    """Placeholder packing step: hand ``df`` back untouched.

    Args:
        df: Data to pack.
        stock_code: Accepted but not used.

    Returns:
        The very same ``df`` object that was passed in (no copy is made).
    """
    return df


def get_fit_report(X, y, func, days, column: str = 'close') -> pd.DataFrame:
    """Line up current value, future value and prediction for each row of X.

    Args:
        X: Feature frame to predict on; must contain ``column``.
        y: Target frame containing a column labelled ``days``.
        func: Fitted estimator exposing ``predict``.
        days: Look-ahead horizon; selects the column of ``y`` and is used in
            the label of the future-value column.
        column: Column of ``X`` reported as the current value.

    Returns:
        A DataFrame indexed like ``X`` with three columns: the current value,
        the value ``days`` later (missing where ``y`` has no matching index
        entry, since the frames are left-joined) and the prediction.
    """
    current = X[[column]].rename(columns={column: '当日收盘价'})
    future = y[[days]].rename(columns={days: '{}日后收盘价'.format(days)})
    observed = current.join(future)
    # predict() output gets the default column label 0, renamed here.
    predicted = pd.DataFrame(func.predict(X), index=X.index)
    predicted = predicted.rename(columns={0: '预测值'})
    return observed.join(predicted)


def plot_report(days, func, X, y, column: str = 'close', title='', subtitle=''):
    """Draw every column of the fit report as a line series.

    Args:
        days: Look-ahead horizon forwarded to ``get_fit_report``; also used
            for the default chart title.
        func: Fitted estimator exposing ``predict``.
        X: Feature frame forwarded to ``get_fit_report``.
        y: Target frame forwarded to ``get_fit_report``.
        column: Column of ``X`` reported as the current value.
        title: Chart title; when empty, ``'<days> Days'`` is used.
        subtitle: Chart subtitle.

    Returns:
        The ``pyecharts.Line`` chart, one series per report column.
    """
    # NOTE(review): Line(title, subtitle=...) / add(name, x, y, ...) looks
    # like the pyecharts 0.5.x API - confirm the pinned version.
    report = get_fit_report(X, y, func, days, column)
    chart_title = title or '{} Days'.format(days)
    line = pyecharts.Line(chart_title, subtitle=subtitle)
    zoom_options = {
        'datazoom_extra_type': 'both',
        'is_datazoom_extra_show': True,
        'datazoom_extra_orient': 'horizontal',
    }
    for series_name in report.columns:
        line.add(series_name, report.index.date, report[series_name],
                 **zoom_options)
    return line

In [3]:
# Date range and benchmark code come from the shared notebook helper.
s, e = jupyter_helper.get_start_end_date()
benchmark_code = jupyter_helper.get_benchmark_code()
stock_code = '601398'
DAYS = 5  # look-ahead horizon used for the target column

In [4]:
# Fetch and prepare the input frame using get_calc_data's defaults.
data = get_calc_data(stock_code, s, e)

In [5]:
# Features: everything except the last 100 rows.
# NOTE(review): the excluded data[-100:] tail is not used by any cell visible
# here - confirm whether it was meant to be the evaluation set.
X = packing_calc_data(data[:-100], stock_code)
# Targets: the close DAYS rows ahead; rows without a target are dropped.
y = create_validate_df_close(X, DAYS)
# Keep only the feature rows that have a target.
X = X[X.index.isin(y.index)]

test_size = 0.2
random_state = 10

# Split the data into train and test parts.
# NOTE(review): train_test_split shuffles rows by default, so the test rows
# are interleaved in time with the training rows - confirm this is intended
# for time-ordered data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=test_size,
    random_state=random_state,
)

lr = LinearRegression()

lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [6]:
# Coefficient of determination (R^2) of the fitted regression on the test split.
lr.score(X_test, y_test)

0.9767319033541031

In [28]:
# Report on the last 100 rows of X.
# NOTE(review): X is the frame that was split into train/test above, so most
# of these rows were probably seen during fitting; the data[-100:] tail held
# back earlier is not what is evaluated here - confirm which was intended.
fit_report = get_fit_report(
    X[-100:],
    create_validate_df_close(X[-100:], DAYS),
    lr,
    DAYS,
)

In [29]:
# Display the report ordered by its index; sort_index() returns a sorted copy.
fit_report.sort_index()

Unnamed: 0_level_0,当日收盘价,5日后收盘价,预测值
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-03-27,5.725586,5.629840,5.709484
2018-03-28,5.725586,5.620266,5.739159
2018-03-29,5.830906,5.610691,5.834885
2018-03-30,5.830906,5.859630,5.826911
2018-04-02,5.706437,5.974524,5.707961
2018-04-03,5.629840,5.821331,5.623483
2018-04-04,5.620266,5.773459,5.623076
2018-04-09,5.610691,5.572393,5.616231
2018-04-10,5.859630,5.543669,5.853763
2018-04-11,5.974524,5.629840,5.972044
