In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score # K折交叉验证模块
from sklearn.preprocessing import MinMaxScaler

import QUANTAXIS as QA
import pandas as pd
import numpy as np
import pyecharts
import talib

#设定绘图的默认大小
import matplotlib
matplotlib.rcParams["figure.figsize"]=[16,5]

matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = ['Noto Sans CJK SC','SimHei']
matplotlib.rcParams['axes.unicode_minus']=False #用来正常显示负号

#加载 seaborn，并且设置默认使用 seaborn
import seaborn as sns
sns.set(font=['Noto Sans CJK SC','SimHei'])

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, LSTM, TimeDistributed, RepeatVector
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
stock_code='601398'
benchmark_code='399300'
start_time='2005-01-01'
end_time='2018-12-31'

data_raw=QA.QA_fetch_stock_day_adv(stock_code, start_time, end_time).to_qfq().data.reset_index().set_index('date')
data_raw.head(1)

Unnamed: 0_level_0,code,open,high,low,close,volume,amount,preclose,adj
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006-10-27,601398,1.992133,2.01557,1.910104,1.921823,44076540.0,8725310000.0,,0.585922


In [3]:
data=data_raw.drop(columns=['code','amount','preclose','adj'])
data.head(1)

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-10-27,1.992133,2.01557,1.910104,1.921823,44076540.0


## 对数据进行包装

### 增加特性数据

In [4]:
def augFeatures(train):
    df=train.copy()
    df["year"] = df.index.year
    df["month"] = df.index.month
    df["date"] = df.index.day
    df["day"] = df.index.dayofweek
    return df

### 正则化

In [5]:
def normalize(train):
    return train.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))

### 其他方法

In [34]:
def buildData(df,pastDays=30,futureDays=5):
    """取 `pastDays` 的数据作为计算数据，取 `futureDays` 的 `close` %save据作为测算数据。
    返回的X集合中包含类型为 `DataFrame` ，Y集合中包含为 `Series`。"""
    X,Y=[],[]
    for i in range(df.shape[0]-futureDays-futureDays):
        X.append(df.iloc[i:i+pastDays])
        Y.append(df.iloc[i+pastDays:i+pastDays+futureDays]['close'])
    return X,Y

def splitBuildData(X,Y,test_size=0.2,random_state=10,shuffle=True):
    """调用 `sklearn.model_selection.train_test_split` callable分训练集和测试集。
    Args:
        shuffle: 是否打乱数据。默认为 `True`。
        
    Returns:
        返回内容为：X_train, X_test, y_train, y_test。
        其中 X_train和y_train配对，X_test和y_test配对。"""
    return train_test_split(X,Y,test_size=test_size,random_state=random_state,shuffle=shuffle)

def toNpArray(d):
    r=[]
    for i in d:
        r.append(i.values)
    return np.array(r)

## 数据演示

In [12]:
data = data_raw.drop(columns=['code','amount','preclose','adj'])

data_Aug = augFeatures(data)

data_norm = normalize(data_Aug)

X,Y=buildData(data_norm)

X_train, X_test, y_train, y_test = splitBuildData(X, Y)

In [84]:
y_test[0]

date
2009-10-26   -0.053285
2009-10-27   -0.068068
2009-10-28   -0.071480
2009-10-29   -0.083989
2009-10-30   -0.071480
Name: close, dtype: float64

In [158]:
# r=[]
# for x in range(len(X)):
#     r.append(X_train[x].as_matrix())
#     if x==642:
#         break
# np.array(r).shape,np.array(r).size

r=[]
r.append(X_train[642].values)
r.append(X_train[643].values)
np.array(r).shape

(2,)

In [150]:
def buildManyToOneModel(shape):
    model = Sequential()
    model.add(LSTM(10, input_length=shape[0], input_dim=shape[1]))
    # output shape: (1, 1)
    model.add(Dense(1))
    model.compile(loss="mse", optimizer="adam")
    model.summary()
    return model

In [152]:
data = data_raw.drop(columns=['code','amount','preclose','adj'])
data_Aug = augFeatures(data)
data_norm = normalize(data_Aug)
X,Y=buildData(data_norm)
X_train, X_test, y_train, y_test = splitBuildData(X, Y)

model = buildManyToOneModel(X_train[0].shape)
callback = EarlyStopping(monitor="loss", patience=10, verbose=1, mode="auto")
model.fit(X_train, y_train, epochs=1000, batch_size=128, validation_data=(X_test, y_test), callbacks=[callback])

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 10)                800       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 811
Trainable params: 811
Non-trainable params: 0
_________________________________________________________________


ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 2346 arrays: [array([[-8.98332003e-02, -9.22247393e-02, -8.90403623e-02,
        -9.00312548e-02, -2.68170943e-02,  1.31526787e-01,
         3.94217403e-01,  9.53562125e-02, -2.71831464e-03],
       [-9.29290116e-...