In [None]:
import pandas as pd
import datetime
from sklearn import preprocessing
import numpy as np

# 資料準備

In [None]:
# Columns kept for the rest of the notebook.
cols = ['time', 'open', 'high', 'low', 'close', 'volume']

# Load the 1-minute bars, keep only rows flagged isDay == True and the
# columns listed above, in a single selection.
df = (
    pd.read_csv("220301_221031_1min.csv")
    .loc[lambda frame: frame["isDay"] == True, cols]
)


In [None]:
# Parse the 'time' column into pandas datetimes.
df["time"] = pd.to_datetime(df["time"])

In [None]:
# Aggregate the 1-minute bars into 3-minute OHLCV candles.
ohlcv_rules = {
    "open": "first",
    "close": "last",
    "low": "min",
    "high": "max",
    "volume": "sum",
}

t = (
    df.set_index("time")
    .groupby(pd.Grouper(freq='3Min'))
    .agg(ohlcv_rules)
    .dropna()        # discard 3-minute bins that contain missing values
    .reset_index()
)

# Calendar date of each candle, used to group candles by day.
t['date'] = t['time'].dt.date

In [None]:
def count_function(_df, start_time, ths=5):
    """Collect runs of consecutive same-direction candles from one group of bars.

    A candle with close > open is a red (up) candle, close < open is a black
    (down) candle. A doji (close == open) after the first row extends whatever
    run is currently in progress. The first row is treated as up only when
    close > open; otherwise (including a doji) it starts a down run.

    Parameters
    ----------
    _df : pandas.DataFrame
        Bars in chronological order with 'time', 'open' and 'close' columns.
    start_time : list
        Output list, mutated in place. Each qualifying run is appended as
        ``[time of the run's first candle, run length, direction]`` where
        direction is 1 for a red/up run and 0 for a black/down run.
    ths : int, default 5
        Minimum run length (in candles) for a run to be recorded.

    Returns
    -------
    None
    """
    # Nothing to scan; also avoids an IndexError on iloc[0].
    if len(_df) == 0:
        return

    first = _df.iloc[0]
    # Compare within the group itself: the opening candle's direction must
    # come from _df, not from any module-level frame.
    run_is_up = bool(first['close'] > first['open'])
    run_len = 1
    run_start = first['time']

    def _record_run():
        # Keep the run only when it is long enough.
        if run_len >= ths:
            start_time.append([run_start, run_len, 1 if run_is_up else 0])

    for _, row in _df.iloc[1:].iterrows():
        if row['close'] == row['open']:
            # Doji: counts towards the run in progress.
            run_len += 1
            continue

        candle_is_up = bool(row['close'] > row['open'])
        if candle_is_up == run_is_up:
            # Same colour as the previous candle: the run continues.
            run_len += 1
        else:
            # Colour flipped: close the finished run and start a new one here.
            _record_run()
            run_start = row['time']
            run_len = 1
            run_is_up = candle_is_up

    # A run that is still open on the last candle has no reversal candle to
    # close it, so record it here; otherwise it would be silently dropped.
    _record_run()


In [None]:
# Collected runs: [run start time, run length, 1 = red/up | 0 = black/down].
time_list = []

# count_function is called purely for its side effect on time_list, so loop
# over the daily groups explicitly rather than going through groupby.apply:
# each day is then visited exactly once and the cell shows no junk output.
for _, day_bars in t.groupby("date"):
    count_function(day_bars, time_list)

In [None]:
# Number of qualifying runs collected across all days.
print(len(time_list))

In [None]:
# Peek at the first five entries: [run start time, run length, 1 = red/up run | 0 = black/down run].
time_list[:5]

## 製作新特徵

In [None]:
import talib
from talib import abstract

In [None]:
### ref. from https://medium.com/ai%E8%82%A1%E4%BB%94/%E7%94%A8-python-%E5%BF%AB%E9%80%9F%E8%A8%88%E7%AE%97-158-%E7%A8%AE%E6%8A%80%E8%A1%93%E6%8C%87%E6%A8%99-26f9579b8f3a

# Names of every indicator function TA-Lib reports.
ta_list = talib.get_functions()

for indicator in ta_list:
    try:
        # Look the indicator up by name on talib.abstract and apply it to df
        # (attribute lookup instead of eval on a concatenated string).
        output = getattr(abstract, indicator)(df)
        # A one-dimensional result becomes a column named after the indicator;
        # a multi-column result is appended with the names it already has.
        if isinstance(output, pd.Series):
            df[indicator.lower()] = output
        else:
            df = pd.concat([df, output], axis=1)

    except Exception as err:
        # Best effort: skip indicators that cannot be computed from this
        # frame, but report why instead of hiding the error.
        print(f"{indicator}: skipped ({err})")



### 製作 y label

In [None]:
# Index the frame by timestamp and append the label column y (initially 0) as the last column.
df = df.set_index("time").assign(y=0)

In [None]:
# Length in minutes of one aggregated candle.
Freq = 3

# Label every 1-minute row from 10 minutes before a run starts until the run's
# nominal end (run length * Freq minutes after its start).
for run_start, run_length, direction in time_list:
    start_time = run_start - datetime.timedelta(minutes=10)
    end_time = run_start + datetime.timedelta(minutes=run_length * Freq)

    # 1 marks a consecutive red-candle run, -1 a consecutive black-candle run.
    df.loc[start_time:end_time, "y"] = 1 if direction == 1 else -1

In [None]:
# Label distribution. Uses the Series method because the top-level
# pd.value_counts function is deprecated in recent pandas releases.
df["y"].value_counts()

In [None]:
# NOTE(review): bare literal cell; evaluating it only displays 50700.
# Possibly the sample count behind steps_per_epoch=396 in the training cell
# (50700 // 128 == 396). Confirm, then name it as a constant or delete the cell.
50700

## 標準化

In [None]:
# Turn +/-inf into NaN, then fill every NaN with 0, in one chained expression.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
from sklearn.preprocessing import StandardScaler

# Every column except the last one (the label y) is a model feature.
features = df.iloc[:, :-1]

# Fit the scaler on the leading 85% of rows only. This is the same fraction the
# train/test split further down uses, so the test rows' mean and variance no
# longer leak into the standardisation. Keep the two fractions in sync.
n_fit = int(len(features) * 0.85)

scaler = StandardScaler()
scaler.fit(features.iloc[:n_fit])

# Apply the train-fitted statistics to all rows; df_scale keeps its full shape.
df_scale = scaler.transform(features)

In [None]:
# Import through the public tensorflow.keras.utils path: the private
# keras.utils.np_utils module is not available in newer Keras releases.
from tensorflow.keras.utils import to_categorical

# Shift the label column up by 29 rows so that row i carries the label of row
# i + 29, i.e. the last row of a 30-row window starting at i.
y = df.iloc[:,-1:].shift(-29)

# The final 29 rows have no shifted label and are dropped before encoding.
# Labels are -1 / 0 / 1; see the class-layout notes below for the column of each.
y_one_hot = to_categorical(y.dropna().astype(int), num_classes=3)

In [None]:
# Distribution of the (unshifted) labels.
df["y"].value_counts()

In [None]:
# Number of samples in each of the three one-hot columns, in column order.
for class_idx in range(3):
    print(y_one_hot[:, class_idx].sum())

In [None]:
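# One-hot layout of the labels (label : vector):
#  0 (no run)             : [1,0,0]
#  1 (red-candle run)     : [0,1,0]
# -1 (black-candle run)   : [0,0,1]   <- the label -1 ends up in the last column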

In [None]:
import tensorflow as tf

input_data = df_scale
targets = y_one_hot

# Chronological split: the first 85% of rows train, the remainder tests.
train_size = int(df_scale.shape[0] * 0.85)

# Both datasets use 30-row windows, batched 128 at a time.
window_kwargs = dict(sequence_length=30, batch_size=128)

train_dataset = tf.keras.utils.timeseries_dataset_from_array(
    input_data[:train_size], targets[:train_size], **window_kwargs)

test_dataset = tf.keras.utils.timeseries_dataset_from_array(
    input_data[train_size:], targets[train_size:], **window_kwargs)

In [None]:
# (rows, feature columns) of the scaled matrix; the column count has to match
# the last dimension of the model's Input shape.
df_scale.shape

## TEST

In [None]:
import keras_nlp
from tensorflow import keras

# Take the feature width from the scaled matrix rather than hard-coding 179,
# so the model keeps matching the data if the indicator set changes.
inputs = keras.Input(shape=(30, df_scale.shape[1]))

# Project each time step to 256 dimensions, then add a position embedding
# over the 30 steps of the window.
embeddings = keras.layers.Dense(256, activation="linear")(inputs)
position_embeddings = keras_nlp.layers.PositionEmbedding(sequence_length=30)(embeddings)

x = embeddings + position_embeddings
x = keras_nlp.layers.TransformerEncoder(intermediate_dim=64, num_heads=8)(x)

# Average over the time axis and classify into the three label classes.
x = keras.layers.GlobalAveragePooling1D()(x)
outputs = keras.layers.Dense(3, activation="softmax")(x)
model = keras.Model(inputs, outputs)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=["accuracy"],
              jit_compile=True)

# Pass the finite dataset directly so each epoch is exactly one full pass; no
# .repeat() and no hand-computed steps_per_epoch that can drift from the data.
# fit(shuffle=...) is ignored for tf.data.Dataset input, so it is not passed.
# If shuffled windows are wanted, request them where train_dataset is built.
model.fit(train_dataset, epochs=20)

In [None]:
# Score the whole training dataset. Evaluating a repeated dataset for 10 steps
# would only cover its first 10 batches (the earliest windows).
loss, accuracy = model.evaluate(train_dataset)
print("Loss :", loss)
print("Accuracy :", accuracy)


In [None]:
# Score the whole held-out dataset. Evaluating a repeated dataset for 10 steps
# would only cover its first 10 batches, which is not a representative
# test metric.
loss, accuracy = model.evaluate(test_dataset)
print("Loss :", loss)
print("Accuracy :", accuracy)


In [None]:
# Model output for every test window: one row per window, one softmax
# probability per class (three columns).
pred = model.predict(test_dataset)

pred # predict prob.

In [None]:
# Most probable class index per test window (a column position in the one-hot
# layout, not the raw -1/0/1 label).
pred_class = np.argmax(pred, axis=1)

# Count predictions per class. Wrap in a Series because the top-level
# pd.value_counts function is deprecated in recent pandas releases.
pd.Series(pred_class).value_counts()

In [None]:
# Predicted class index for each test window (argmax over the softmax columns).
pred_class