In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import svm

In [2]:
names = ['stock_number','date','stock_name','open','high','low','close','volume']
dtype = {'stock_number':np.str, 'date':np.int32, 'open':np.float16,'high':np.float16,'low':np.float16,'close':np.float16,'volume':np.int32}

#dataset
tetfp = pd.read_csv('C:/Users/USER/Desktop/tbrain/dataset/tetfp.txt', encoding='big5hkscs',  sep='\s+', names=names, dtype=dtype, low_memory=False)
tsharep = pd.read_csv('C:/Users/USER/Desktop/tbrain/dataset/tsharep.txt', encoding='big5hkscs', sep='\s+', names=names, dtype=dtype, low_memory=False)

In [3]:
tetfp_0050 = tetfp[tetfp.stock_number=='0050']
tetfp_0050_18b = tetfp_0050[tetfp_0050.date < 20180000]
tetfp_0050_17 = tetfp_0050[(tetfp_0050.date > 20170000)&(tetfp_0050.date < 20180000)]
tetfp_0050_18 = tetfp_0050[tetfp_0050.date > 20180000]

In [4]:
tsharep_2317 = tsharep[tsharep.stock_number=='2317']
tsharep_2317_18b = tsharep_2317[tsharep_2317.date < 20180000]
tsharep_2317_18 = tsharep_2317[tsharep_2317.date > 20180000]
tsharep_2330 = tsharep[tsharep.stock_number=='2330']
tsharep_2330_18b = tsharep_2330[tsharep_2330.date < 20180000]
tsharep_2330_18 = tsharep_2330[tsharep_2330.date > 20180000]

In [30]:
def diffRateGen(data):
    diffRate = [0.0]
    for i in range(len(data)):
        if(i != 0):
            if(data[i-1]==0): 
                diffRate.append(0)
            else: 
                diffRate.append((data[i]-data[i-1])/data[i-1]-1)
    return diffRate

In [31]:
def trend(data):
    trend = []
    for i in range(len(data)):
        if(i != len(data)-1):
            trend.append(np.sign(data[i+1]-data[i]))
    trend.append(0)
    return trend

In [32]:
trend([0,1,2,5,3,2,1])

[1, 1, 1, -1, -1, -1, 0]

In [33]:
def ma(days, data):
    ma = []
    for i in range(len(data)):
        if(i >= days-1):
            value = 0.0
            for j in range(days):
                value += data[i-j]
            ma.append(value/days)
        else: 
            ma.append(data[i])
    return ma

In [34]:
def closeGen(stock_number, dataset, data_len=321):
    close = dataset[dataset.stock_number==stock_number].close.values
    #possiblely shorter close_array
    if(len(close)!=data_len):
            close = np.concatenate([close, np.zeros(data_len-len(close))])
    return close

In [35]:
def x_gen(data):
    open_ = data.open.values
    close_ = data.close.values
    low = data.low.values
    high = data.high.values

    volume = data.volume.values
    ma5 = ma(5, close_)
    #ma10 = ma(10, close_)
    #ma20 = ma(20, close_)
    return_rate = diffRateGen(close_)
    red_black = high-low
    candle = np.sign(close_ - open_)
    x = np.array([volume, ma5, return_rate, red_black, candle])
    return x

In [40]:
train_x = np.concatenate((x_gen(tetfp_0050_18b), x_gen(tsharep_2317_18b), x_gen(tsharep_2330_18b)), axis=0).T
train_y = trend(tetfp_0050_18b.close.values)

In [41]:
svm = sklearn.svm.SVC(C=0.8)
clf = svm.fit(train_x,train_y)

predict_y = svm.predict(train_x)
E_in = np.count_nonzero(train_y != predict_y)
print('acc =', 1 - E_in/len(train_x))

acc = 0.9275244299674267


In [42]:
test_x = np.concatenate((x_gen(tetfp_0050_18), x_gen(tsharep_2317_18), x_gen(tsharep_2330_18)), axis=0).T[:5]
test_y = trend(tetfp_0050_18.close.values)[:5]

predict_y = svm.predict(test_x)
E_val = np.count_nonzero(test_y != predict_y)
print('acc =', 1 - E_val/len(test_x))

acc = 0.8
