In [236]:
import pandas as pd
import numpy as np
import sys 
import csv
import talib as ta
from talib import abstract
import stockstats

In [6]:
ETF=pd.read_csv('taetfp.csv',encoding='CP950',usecols=['代碼','日期', '中文簡稱', '開盤價(元)', '最高價(元)', '最低價(元)', '收盤價(元)', '成交張數(張)'])

In [7]:
ETF.columns=['Code','Date','che','Op','Hp','Lp','Cp','Vol']
ETF=ETF.drop(columns=['che'])

In [8]:
#去除逗號
ETF[['Op','Hp','Lp','Cp','Vol']]=ETF[['Op','Hp','Lp','Cp','Vol']].replace(',','',regex=True)  #regex若為false，則表示是以格子取代，而非找字串

In [9]:
#開收盤價差
ETF = ETF.apply(pd.to_numeric, errors='ignore')
ETF['Dif']=ETF.Cp-ETF.Op

In [10]:
#價格均線
ETF['5ma']=ETF.groupby(['Code'], group_keys=False).apply(lambda a: a['Cp'].rolling(window=5,center=False).mean())
ETF['10ma']=ETF.groupby(['Code'], group_keys=False).apply(lambda a: a['Cp'].rolling(window=10,center=False).mean())

In [11]:
#量差
ETF['Dif_V']=ETF.Vol-ETF.Vol.shift(1)

In [12]:
#報酬率
ETF['Return']=ETF.groupby(['Code'], group_keys=False).apply(lambda a: (a.Cp-a.Cp.shift(1))/a.Cp.shift(1))

In [13]:
#計算RSI
#首先建立漲的虛擬變數
f=lambda a: 1 if a.Return>0 else 0
ETF['SP']=ETF.apply(f,axis=1) 
ETF['RSI_5']=ETF.groupby(['Code'], group_keys=False).apply(lambda a: (a.SP+a.SP.shift(1)+a.SP.shift(2)+a.SP.shift(3)+a.SP.shift(4))/5)

In [14]:
#量價上揚
f=lambda a: 1 if a.Return>0 and a.Dif_V>0 else 0
ETF['VP_up']=ETF.apply(f,axis=1)
#量價下跌
f=lambda a: 1 if a.Return<0 and a.Dif_V<0 else 0
ETF['VP_down']=ETF.apply(f,axis=1)

In [26]:
#價格大於均線
x=lambda a: 1 if a.Cp>a['5ma'] else 0
y=lambda a: 1 if a.Cp>a['10ma'] else 0
z=lambda a: 1 if a.Cp>a['5ma'] and a.Cp>a['10ma'] else 0
ETF['H_5ma']=ETF.apply(x,axis=1)
ETF['H_10ma']=ETF.apply(x,axis=1)
ETF['H_5&10ma']=ETF.apply(x,axis=1)

In [253]:
#用這個去確認下述計算各個技術指標的input和output
from talib import abstract
print(talib.abstract.BBANDS)

BBANDS([input_arrays], [timeperiod=5], [nbdevup=2], [nbdevdn=2], [matype=0])

Bollinger Bands (Overlap Studies)

Inputs:
    price: (any ndarray)
Parameters:
    timeperiod: 5
    nbdevup: 2
    nbdevdn: 2
    matype: 0 (Simple Moving Average)
Outputs:
    upperband
    middleband
    lowerband


In [265]:
#技術指標
#MACD
ETF['MACD']=ETF.groupby(['Code'],group_keys=False).apply(lambda x:ta.MACD(x.Cp)[0])  #ouput的第一個才是MACD
#KD
ETF['K']=ETF.groupby(['Code'],group_keys=False).apply(lambda x:ta.STOCH(x.Hp,x.Lp,x.Cp)[0]) 
ETF['D']=ETF.groupby(['Code'],group_keys=False).apply(lambda x:ta.STOCH(x.Hp,x.Lp,x.Cp)[1]) 
#ATR
ETF['ATR']=ETF.groupby(['Code'],group_keys=False).apply(lambda x:ta.ATR(x.Hp,x.Lp,x.Cp)) 
#威廉
ETF['WILLR']=ETF.groupby(['Code'],group_keys=False).apply(lambda x:ta.WILLR(x.Hp,x.Lp,x.Cp)) 
#布林通道
ETF['UBand']=ETF.groupby(['Code'],group_keys=False).apply(lambda x:ta.BBANDS(x.Cp,timeperiod=20)[0])
ETF['MBand']=ETF.groupby(['Code'],group_keys=False).apply(lambda x:ta.BBANDS(x.Cp,timeperiod=20)[1])
ETF['LBand']=ETF.groupby(['Code'],group_keys=False).apply(lambda x:ta.BBANDS(x.Cp,timeperiod=20)[2])
#布林通道的常用指標
ETF['PB']=(ETF.Cp-ETF.LBand)/(ETF.UBand-ETF.LBand)

In [266]:
pd.set_option('display.max_row',100)
pd.set_option('display.max_column',30)
ETF

Unnamed: 0,Code,Date,Op,Hp,Lp,Cp,Vol,Dif,5ma,10ma,Dif_V,Return,SP,RSI_5,VP_up,VP_down,H_5ma,H_10ma,H_5&10ma,MACD,K,D,ATR,UBand,MBand,LBand,WILLR,PB
0,50,20130102,46.57,47.13,46.49,46.92,16487,0.35,,,,,0,,0,0,0,0,0,,,,,,,,,
1,50,20130103,47.35,47.48,47.13,47.31,29020,-0.04,,,12533.0,0.008312,1,,1,0,0,0,0,,,,,,,,,
2,50,20130104,47.31,47.31,46.92,47.00,9837,-0.31,,,-19183.0,-0.006553,0,,0,1,0,0,0,,,,,,,,,
3,50,20130107,47.05,47.05,46.49,46.79,8910,-0.26,,,-927.0,-0.004468,0,,0,1,0,0,0,,,,,,,,,
4,50,20130108,46.57,46.75,46.27,46.49,12507,-0.08,46.902,,3597.0,-0.006412,0,0.2,0,0,0,0,0,,,,,,,,,
5,50,20130109,46.36,46.83,46.36,46.66,7529,0.30,46.850,,-4978.0,0.003657,1,0.4,0,0,0,0,0,,,,,,,,,
6,50,20130110,46.83,47.13,46.70,47.00,13953,0.17,46.788,,6424.0,0.007287,1,0.4,1,0,1,1,1,,,,,,,,,
7,50,20130111,47.18,47.26,46.87,46.96,11837,-0.22,46.780,,-2116.0,-0.000851,0,0.4,0,1,1,1,1,,,,,,,,,
8,50,20130114,46.57,47.00,46.40,47.00,7282,0.43,46.822,,-4555.0,0.000852,1,0.6,0,0,1,1,1,,71.208884,56.261429,,,,,,
9,50,20130115,46.75,46.96,46.49,46.57,6609,-0.18,46.838,46.870,-673.0,-0.009149,0,0.6,0,1,0,0,0,,55.589226,61.390557,,,,,,


# 建立模型

In [274]:
import sklearn

In [267]:
ETF.columns

Index(['Code', 'Date', 'Op', 'Hp', 'Lp', 'Cp', 'Vol', 'Dif', '5ma', '10ma', 'Dif_V', 'Return', 'SP', 'RSI_5', 'VP_up', 'VP_down', 'H_5ma', 'H_10ma', 'H_5&10ma', 'MACD', 'K', 'D', 'ATR', 'UBand', 'MBand', 'LBand', 'WILLR', 'PB'], dtype='object')

In [272]:
#建立特徵值
Feature=['Dif','Dif_V','RSI_5','VP_up', 'VP_down', 'H_5ma', 'H_10ma', 'H_5&10ma', 'MACD', 'K', 'D', 'ATR', 'WILLR', 'PB']
ETF[Feature]=ETF.groupby(['Code'],group_keys=False).apply(lambda x: x[Feature].shift(1))
ETF=ETF.dropna()

In [336]:
#建立模型
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
x, y=ETF[Feature], ETF['Return']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [338]:
x_train=np.array(x_train)
x_test=np.array(x_test)
y_train=np.array(y_train)
y_test=np.array(y_test)

In [339]:
x_train

TypeError: '>' not supported between instances of 'int' and 'str'

In [325]:
model= RandomForestRegressor(n_estimators= 100 )
model.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [329]:
model.predict(x_test)

TypeError: '>' not supported between instances of 'int' and 'str'