In [37]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from pymongo import MongoClient
from Tbrain_service import DataPreproecss
from Tbrain_service import DataVisualization
from Tbrain_service import Util
from Tbrain_service import Evaluation
from Tbrain_service import FeatureExtraction
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima_model import ARIMA
from xgboost import XGBRegressor
from keras.layers import LSTM, GRU, Dense, TimeDistributed
from keras.models import Sequential
from stockstats import StockDataFrame
from keras.callbacks import EarlyStopping
from fbprophet import Prophet

In [2]:
def perform_linear_regression(etf_df):
    """Fit a linear regression on a supervised view of ``etf_df`` and predict from its last five rows.

    The frame is converted with ``DataPreproecss.trans_time_series_to_supervised``
    (called with 5 and ``'adj close'``); the resulting column ``'y'`` is the
    target and every other column is a feature. Features and target are
    standardized separately, and predictions are mapped back to the original
    scale with the target scaler.

    Parameters
    ----------
    etf_df : pandas.DataFrame
        Rows for one ETF. Its last five rows are passed through the feature
        scaler, so its columns must match the feature columns of the
        supervised frame.

    Returns
    -------
    numpy.ndarray
        Flattened predictions, one per row of ``etf_df.iloc[-5:]``.
    """
    preprocessor = DataPreproecss()
    # presumably shifts 'adj close' by 5 steps to build 'y' — TODO confirm in Tbrain_service
    supervised = preprocessor.trans_time_series_to_supervised(etf_df, 5, 'adj close')
    print("linear_regression train df shape: {}".format(supervised.shape))

    target_mask = supervised.columns == 'y'
    feature_frame = supervised.loc[:, ~target_mask]
    target_frame = supervised.loc[:, target_mask]

    features_scaled, scaler_X = preprocessor.standardize(feature_frame)
    target_scaled, scaler_y = preprocessor.standardize(target_frame)

    # The five most recent rows are the inputs for the forecast.
    latest_scaled = scaler_X.transform(etf_df.iloc[-5:])

    regressor = LinearRegression()
    regressor.fit(features_scaled, target_scaled)
    scaled_pred = regressor.predict(latest_scaled)
    return scaler_y.inverse_transform(scaled_pred).flatten()

In [3]:
def perform_xgboost(etf_df):
    """Fit an ``XGBRegressor`` on a supervised view of ``etf_df`` and predict from its last five rows.

    Uses the same preparation as the linear-regression variant: the supervised
    frame's ``'y'`` column is the target, all other columns are features, and
    both are standardized before fitting.

    Parameters
    ----------
    etf_df : pandas.DataFrame
        Rows for one ETF; the last five rows are used as prediction inputs and
        must have the same columns as the training features.

    Returns
    -------
    numpy.ndarray
        Flattened predictions in the original target scale.
    """
    helper = DataPreproecss()
    lagged = helper.trans_time_series_to_supervised(etf_df, 5, 'adj close')

    is_label = lagged.columns == 'y'
    X_raw = lagged.loc[:, ~is_label]
    y_raw = lagged.loc[:, is_label]
    X_scaled, scaler_X = helper.standardize(X_raw)
    y_scaled, scaler_y = helper.standardize(y_raw)

    # Inputs for the forecast: the five most recent rows, scaled like the training features.
    recent_scaled = scaler_X.transform(etf_df.iloc[-5:])

    booster = XGBRegressor()
    booster.fit(X_scaled, y_scaled)
    scaled_forecast = booster.predict(recent_scaled)
    return scaler_y.inverse_transform(scaled_forecast).flatten()

In [4]:
def perform_ARIMA(etf_df):
    """Forecast the five steps that follow ``etf_df`` with an ARIMA(5, 0, 0) model.

    Parameters
    ----------
    etf_df : pandas.DataFrame
        Rows for one ETF; only the ``'adj close'`` column is used.

    Returns
    -------
    numpy.ndarray
        Five out-of-sample predictions, the first one for the step immediately
        after the last training row.

    Notes
    -----
    NOTE(review): ``statsmodels.tsa.arima_model`` is the legacy ARIMA API and
    is not available in recent statsmodels releases; the import is left as is.
    """
    horizon = 5
    train_series = etf_df.loc[:, ['adj close']]
    # order=(p, d, q): five autoregressive lags, no differencing, no MA terms.
    fitted = ARIMA(train_series, order=(5, 0, 0)).fit()
    # Positions are zero-based, so len(train_series) is the first step after
    # the training data. Starting at len - 1 returned the value for the last
    # observed row as the first "forecast" and shifted every prediction by one
    # day relative to the five held-out days it is scored against.
    start_index = len(train_series)
    end_index = start_index + horizon - 1
    pred = fitted.predict(start_index, end_index, dynamic=True)
    return np.array(pred)

In [103]:
def perform_prophet(etf_df):
    """Forecast the next five business days of ``'adj close'`` with Prophet.

    Parameters
    ----------
    etf_df : pandas.DataFrame
        Rows for one ETF. The index supplies Prophet's ``ds`` column and
        ``'adj close'`` supplies ``y``.
        (assumes the index holds dates Prophet can parse — TODO confirm)

    Returns
    -------
    numpy.ndarray
        The ``yhat`` values of the five future rows.
    """
    horizon = 5
    history = pd.DataFrame({
        'ds': etf_df.index,
        'y': etf_df['adj close'].values,
    })
    m = Prophet()
    m.fit(history)
    # The default frequency is calendar days, which can put weekend dates in
    # the horizon; predictions are consumed as Mon-Fri values, so step in
    # business days instead. Exchange holidays are not accounted for.
    future = m.make_future_dataframe(periods=horizon, freq='B')
    pred = m.predict(future).tail(horizon)['yhat'].values
    return pred

In [5]:
def perform_RNN_many_to_many(etf_df):
    """Train a two-layer GRU mapping ``time_step`` 'adj close' values to the next ``time_step`` values.

    Sliding windows of ``2 * time_step`` values are built from ``'adj close'``;
    the first half of each window is the input sequence and the second half is
    the target sequence. After training, the second half of the final window
    (the most recent values) is fed to the model to forecast the next
    ``time_step`` steps.

    Parameters
    ----------
    etf_df : pandas.DataFrame
        Rows for one ETF; only ``'adj close'`` is used.

    Returns
    -------
    numpy.ndarray
        ``time_step`` (5) predictions in the original price scale.
    """
    dp = DataPreproecss()
    time_step = 5
    etf_df = etf_df.loc[:,['adj close']]
    # assumes make_slide_windows returns a DataFrame with one window per row —
    # the column slicing below relies on that; TODO confirm in Tbrain_service
    etf_df_window = dp.make_slide_windows(etf_df, time_step * 2)
    train_X = etf_df_window.iloc[:,:time_step]
    train_X_scale, scaler_X = dp.standardize(train_X)
    train_y = etf_df_window.iloc[:,-time_step:]
    train_y_scale, scaler_y = dp.standardize(train_y)
    # Most recent week: the last time_step values of the final window become
    # the input for the forecast.
    test_X_scale = scaler_X.transform(etf_df_window.iloc[-1:,-time_step:])
    # Reshape to (samples, time_step, 1) as expected by the recurrent layers.
    train_X_scale = train_X_scale.reshape(len(train_X_scale), time_step, 1)
    train_y_scale = train_y_scale.reshape(len(train_X_scale), time_step, 1)
    test_X_scale = test_X_scale.reshape(len(test_X_scale), time_step, 1)
    print("train_X_scale: {}, train_y_scale: {}, test_X_scale: {}".format(train_X_scale.shape, train_y_scale.shape, test_X_scale.shape))
    # train model
    model = Sequential()
    model.add(GRU(20, input_shape=(time_step, 1), return_sequences=True))
    model.add(GRU(20, return_sequences=True))
    model.add(TimeDistributed(Dense(1)))
    # Classification accuracy carries no information for an MSE regression, so
    # only the loss is tracked.
    model.compile(loss="mse", optimizer="adam")
    model.summary()
    model.fit(
        train_X_scale, train_y_scale,
        epochs=50, batch_size=32, verbose=1,
        validation_split=0.2)
    # predict
    pred = model.predict(test_X_scale)
    # One row of time_step values, matching the layout scaler_y was fitted on.
    pred = pred.reshape(1, time_step)
    pred = scaler_y.inverse_transform(pred).flatten()
    return pred

In [6]:
def perform_RNN_many_to_one(etf_df):
    """Train a single-layer GRU that maps a 20-row window of all features to one target value.

    The supervised frame's ``'y'`` column is the target; the remaining columns
    are windowed into sequences of ``time_step`` rows. The last
    ``time_step - 1 + shift_range`` rows of ``etf_df`` are windowed the same
    way to produce the prediction inputs.

    Parameters
    ----------
    etf_df : pandas.DataFrame
        Rows for one ETF. Every column is used as a feature, so the column
        count must equal the feature count of the supervised frame.

    Returns
    -------
    numpy.ndarray
        Predictions in the original target scale, one row per test window
        (callers flatten the result).
    """
    dp = DataPreproecss()
    feature_num = etf_df.shape[1]
    shift_range = 5
    time_step = 20
    etf_df_supervised = dp.trans_time_series_to_supervised(etf_df, shift_range, 'adj close')

    train_X = etf_df_supervised.loc[:, etf_df_supervised.columns!='y']
    train_y = etf_df_supervised.loc[:, etf_df_supervised.columns=='y']

    train_X = dp.make_slide_windows(train_X, time_step)
    # Drop the first time_step - 1 targets so each window lines up with the
    # target of its last row.
    # (assumes make_slide_windows yields one row per complete window — TODO confirm)
    train_y = train_y[time_step-1:]

    train_X_scale, scaler_X = dp.standardize(train_X)
    train_y_scale, scaler_y = dp.standardize(train_y)
    train_X_scale = train_X_scale.reshape(-1, time_step, feature_num)

    test_X = dp.make_slide_windows(etf_df.iloc[-(time_step-1+shift_range):], time_step)
    # Scale while the windows still have the layout scaler_X was fitted on,
    # then reshape to (samples, time_step, feature_num).
    test_X_scale = scaler_X.transform(test_X)
    test_X_scale = test_X_scale.reshape(-1, time_step, feature_num)
    print("train_X_scale: {}, train_y_scale: {}, test_X_scale: {}".format(train_X_scale.shape, train_y_scale.shape, test_X_scale.shape))
    # train model
    model = Sequential()
    model.add(GRU(40, input_shape=(time_step, feature_num)))
    model.add(Dense(1))
    # Classification accuracy carries no information for an MSE regression, so
    # only the loss is tracked.
    model.compile(loss="mse", optimizer="adam")
    model.summary()
    model.fit(
        train_X_scale, train_y_scale, epochs=40,
        batch_size=32, verbose=1, validation_split=0.2)
    # predict
    pred = scaler_y.inverse_transform(model.predict(test_X_scale))
    return pred

In [116]:
def make_submission(df, method, add_stock_feature=False):
    """Build the submission table by running one forecasting method on every ETF in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for all ETFs with at least the columns ``code``, ``date``,
        ``open``, ``high``, ``low``, ``close``, ``volume`` and ``adj close``.
    method : str
        One of ``'linear regression'``, ``'xgboost'``, ``'ARIMA'``,
        ``'RNN many to many'``, ``'RNN many to one'`` or ``'prophet'``.
    add_stock_feature : bool, optional
        When true, each ETF frame is passed through
        ``FeatureExtraction.perform_stock_stat`` before forecasting.

    Returns
    -------
    pandas.DataFrame
        One row per ETF code with the up/down flag and price for Mon-Fri.

    Raises
    ------
    ValueError
        If ``method`` is not a known method name.
    """
    # Built once per call: the table does not depend on the ETF being processed.
    # 'prophet' is included so the same names work here and in the back-test.
    method_dict = {
        'linear regression': perform_linear_regression,
        'xgboost': perform_xgboost,
        'ARIMA': perform_ARIMA,
        'RNN many to many': perform_RNN_many_to_many,
        'RNN many to one': perform_RNN_many_to_one,
        'prophet': perform_prophet,
    }
    if method not in method_dict:
        # Fail with a readable message instead of "'NoneType' object is not callable".
        raise ValueError(
            "unknown method {!r}; expected one of {}".format(method, sorted(method_dict)))
    predict = method_dict[method]

    select_columns = ['open', 'high', 'low', 'close','volume', 'adj close']
    submission_columns = [
        'ETFid','Mon_ud', 'Mon_cprice', 'Tue_ud', 'Tue_cprice', 'Wed_ud', 'Wed_cprice',
        'Thu_ud', 'Thu_cprice', 'Fri_ud', 'Fri_cprice'
    ]

    submission_list = []
    fe = FeatureExtraction()
    util = Util()
    etf_code_list = list(df.groupby(df['code']).size().index)
    for etf_code in etf_code_list:
        etf_df = df.loc[df['code']==etf_code,:]
        # Keep the price/volume columns and index the rows by date.
        etf_df = etf_df.loc[:,select_columns].set_index(etf_df['date'])
        # extract stock features
        if add_stock_feature:
            etf_df = fe.perform_stock_stat(etf_df)
        pred = predict(etf_df).flatten()
        # presumably derives the up/down flags from etf_df's latest price and
        # pred — TODO confirm in Tbrain_service.Util
        submission_record = util.make_submission_record(etf_code, etf_df, pred)
        submission_list.append(submission_record)
    submission_df = pd.DataFrame(data=submission_list, columns=submission_columns)
    return submission_df

In [8]:
def point_formula(t_ud, s_ud, t_p, s_p):
    """Score one day's prediction on a 0-1 scale.

    Parameters
    ----------
    t_ud, s_ud
        True and submitted up/down flags; a match is worth 0.5.
    t_p, s_p
        True and submitted prices; the price part is
        ``((t_p - |s_p - t_p|) / t_p) * 0.5``, i.e. 0.5 for an exact price and
        less as the absolute error grows.

    Returns
    -------
    float
        Direction part plus price part.
    """
    direction_part = 0.5 if t_ud == s_ud else 0
    abs_error = abs(s_p - t_p)
    price_part = ((t_p - abs_error) / t_p) * 0.5
    return direction_part + price_part

In [9]:
def evaluate_point(sub_df, true_df):
    """
    Total score of a submission table against the ground-truth table.

    Direction: a correct up/down prediction earns 0.5.
    Price: ((actual - |predicted - actual|) / actual) * 0.5, so the more
    accurate the price, the closer this part gets to 0.5.
    Day weights:
    Mon: 10%
    Tue: 15%
    Wed: 20%
    Thu: 25%
    Fri: 30%

    Columns whose name contains 'ud' are read as the five direction columns and
    columns whose name contains 'price' as the five price columns. Rows of the
    two frames are paired by position.
    """
    day_weights = (0.1, 0.15, 0.2, 0.25, 0.30)

    def columns_matching(frame, token):
        # Values of every column whose name contains ``token``.
        return frame.loc[:, frame.columns.str.contains(token)].values

    true_ud_rows = columns_matching(true_df, 'ud')
    sub_ud_rows = columns_matching(sub_df, 'ud')
    true_price_rows = columns_matching(true_df, 'price')
    sub_price_rows = columns_matching(sub_df, 'price')

    total_value = 0
    for t_ud, s_ud, t_p, s_p in zip(true_ud_rows, sub_ud_rows, true_price_rows, sub_price_rows):
        # Accumulate Mon..Fri in order, then add the week's score to the total.
        week_points = 0
        for day, weight in enumerate(day_weights):
            week_points += point_formula(t_ud[day], s_ud[day], t_p[day], s_p[day]) * weight
        total_value += week_points
    return total_value

In [104]:
# collect predict result
def eval_points_by_method(df, method, train_count=None, add_stock_feature=False):
    """Back-test one forecasting method on the last five rows of every ETF and return the total score.

    For each ETF code the last five rows are held out, the method is run on the
    remaining rows, and predicted and actual ``'adj close'`` values are turned
    into submission-style records which are scored with ``evaluate_point``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for all ETFs with at least the columns ``code``, ``date``,
        ``open``, ``high``, ``low``, ``close``, ``volume`` and ``adj close``.
    method : str
        One of ``'linear regression'``, ``'ARIMA'``, ``'xgboost'``,
        ``'RNN many to many'``, ``'RNN many to one'`` or ``'prophet'``.
    train_count : int, optional
        When given (and non-zero), only the last ``train_count`` rows of each
        ETF are kept before the five-row hold-out split.
    add_stock_feature : bool, optional
        When true, each ETF frame is passed through
        ``FeatureExtraction.perform_stock_stat`` first.

    Returns
    -------
    The value returned by ``evaluate_point`` for the collected records.

    Raises
    ------
    ValueError
        If ``method`` is not a known method name.
    """
    # Built once per call: the table does not depend on the ETF being processed.
    method_dict = {
        'linear regression': perform_linear_regression,
        'ARIMA': perform_ARIMA,
        'xgboost': perform_xgboost,
        'RNN many to many': perform_RNN_many_to_many,
        'RNN many to one': perform_RNN_many_to_one,
        'prophet': perform_prophet
    }
    if method not in method_dict:
        # Fail with a readable message instead of "'NoneType' object is not callable".
        raise ValueError(
            "unknown method {!r}; expected one of {}".format(method, sorted(method_dict)))
    predict = method_dict[method]

    # Own helper instance: the function previously read a notebook-level `util`
    # that is only created in a later cell, so it failed on a fresh kernel.
    util = Util()
    fe = FeatureExtraction()
    select_columns = ['open', 'high', 'low', 'close','volume', 'adj close']
    sub_col = [
        'ETFid','Mon_ud', 'Mon_cprice', 'Tue_ud', 'Tue_cprice', 'Wed_ud', 'Wed_cprice',
        'Thu_ud', 'Thu_cprice', 'Fri_ud', 'Fri_cprice'
    ]

    submission_list = []
    ground_true_list = []
    etf_code_list = list(df.groupby(df['code']).size().index)
    for etf_code in etf_code_list:
        etf_df = df.loc[df['code']==etf_code,:]
        # Keep the price/volume columns and index the rows by date.
        etf_df = etf_df.loc[:,select_columns].set_index(etf_df['date'])
        # stock feature
        if add_stock_feature:
            etf_df = fe.perform_stock_stat(etf_df)
        if train_count:
            etf_df = etf_df.tail(train_count)
        # Hold out the last five rows as ground truth.
        train_df = etf_df.iloc[:-5,]
        test_df = etf_df.iloc[-5:,]
        test_y = test_df['adj close'].values

        pred = predict(train_df).flatten()
        # Both records are built against the same training frame so that
        # prediction and ground truth are encoded the same way.
        submission_record = util.make_submission_record(etf_code, train_df, pred)
        ground_true_record = util.make_submission_record(etf_code, train_df, test_y)
        submission_list.append(submission_record)
        ground_true_list.append(ground_true_record)
    submission_df = pd.DataFrame(data=submission_list, columns=sub_col)
    ground_true_df = pd.DataFrame(data=ground_true_list, columns=sub_col)
    return evaluate_point(submission_df, ground_true_df)

In [123]:
# Directory that holds the dataset CSV files (relative to the working directory).
csv_path = 'csv/TBrain_Round2_DataSet_20180601/'
# Names of the entries in that directory; os.listdir returns them in arbitrary order.
csv_list = os.listdir(csv_path)

In [124]:
# Show the file names found in the dataset directory.
csv_list

['tasharep.csv', 'tetfp.csv', 'taetfp.csv', 'tsharep.csv']

In [9]:
# Write the CSV files to MongoDB (disabled; kept for reference only)
# mongo = MongoBase('mongodb://220.133.208.31:27017/', 'test-database')
# for csv_name in csv_list:
#     df = pd.read_csv('csv/' + csv_name, encoding='cp950', dtype='str')
#     collection_name = csv_name.split('.')[0]
#     mongo.insert_document(collection_name, etf_data_preprocess(df))

In [125]:
# tetfp: the 18 ETFs
# taetfp: the 18 ETFs, adjusted
# tsharep: individual stocks
# tasharep: individual stocks, adjusted

util = Util()
df = util.etf_data_preprocess(
    pd.read_csv(csv_path + 'tetfp.csv', encoding='cp950', dtype='str')
)

# Closing prices from the adjusted-ETF file, as a float array.
adj_closed = (
    pd.read_csv(csv_path + 'taetfp.csv', encoding='cp950', dtype='str')['收盤價(元)']
    .astype('float64')
    .values
)

# Original (Chinese) column headers -> English names used by the rest of the notebook.
colname_mapping = {
    '代碼': 'code',
    '日期': 'date',
    '中文簡稱': 'chinese',
    '開盤價(元)': 'open',
    '最高價(元)': 'high',
    '最低價(元)': 'low',
    '收盤價(元)': 'close',
    '成交張數(張)': 'volume',
}
df = df.rename(index=str, columns=colname_mapping)
# Attached by position: assumes both files list the same rows in the same
# order after preprocessing — TODO confirm.
df['adj close'] = adj_closed

In [126]:
# Number of rows available for each ETF code
df.groupby('code').size()

code
0050      1327
0051      1327
0052      1327
0053      1327
0054      1327
0055      1327
0056      1327
0057      1327
0058      1327
0059      1327
006201    1327
006203    1327
006204    1327
006208    1327
00690      289
00692      259
00701      194
00713      165
dtype: int64

In [150]:
# Method names accepted by eval_points_by_method / make_submission.
method_list = ['linear regression', 'ARIMA', 'RNN many to many', 'RNN many to one', 'xgboost']

# Back-test on the held-out last five rows of every ETF. The score is kept in
# a variable (it was previously discarded) and shown as the cell's output.
eval_method = 'linear regression'
eval_score = eval_points_by_method(df, eval_method)

# Build the submission with the chosen model, named explicitly rather than
# picked by list position.
submission_method = 'xgboost'
submission_df = make_submission(df, submission_method)

eval_score

linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (1317, 7)
linear_regression train df shape: (279, 7)
linear_regression train df shape: (249, 7)
linear_regression train df shape: (184, 7)
linear_regression train df shape: (155, 7)


14.386108039023279

In [146]:
# Display the generated submission table.
submission_df

Unnamed: 0,ETFid,Mon_ud,Mon_cprice,Tue_ud,Tue_cprice,Wed_ud,Wed_cprice,Thu_ud,Thu_cprice,Fri_ud,Fri_cprice
0,50,1,81.910004,-1,81.699997,-1,80.870003,-1,80.699997,1,81.32
1,51,-1,33.709999,-1,33.709999,-1,33.700001,1,33.75,-1,33.52
2,52,1,52.040001,-1,51.950001,1,51.950001,1,51.950001,1,51.950001
3,53,-1,35.700001,1,35.799999,-1,35.459999,1,35.73,-1,35.560001
4,54,-1,24.02,1,24.030001,-1,23.99,1,24.0,1,24.040001
5,55,1,17.389999,-1,17.389999,1,17.450001,-1,17.41,1,17.48
6,56,-1,26.139999,1,26.17,1,26.219999,1,26.34,-1,25.91
7,57,1,50.080002,1,50.509998,-1,49.25,-1,48.869999,1,50.27
8,58,-1,46.900002,1,47.16,-1,46.52,1,47.130001,-1,46.959999
9,59,1,42.5,-1,42.400002,1,42.400002,-1,41.439999,1,41.77


In [147]:
# Write the submission to result.csv without the DataFrame index column.
submission_df.to_csv('result.csv', index=False)