## **Modules**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import datetime, os

from pandas import DataFrame

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, Input, BatchNormalization, GRU
from tensorflow.keras.callbacks import EarlyStopping

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Turn on memory growth for every GPU that TensorFlow reports.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)
# NOTE(review): this line only binds a Python variable named
# TF_ENABLE_ONEDNN_OPTS; it does not touch the process environment. If the
# intent was to switch off oneDNN optimisations, that presumably needs
# os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0" before tensorflow is imported -- confirm.
TF_ENABLE_ONEDNN_OPTS=0

In [None]:
# Register the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [None]:
# Open TensorBoard on the "logs" directory, the same directory the TensorBoard callbacks below write to.
%tensorboard --logdir=logs

## **Data**

In [None]:
# Location of the pre-computed feature table; file_path is reused by later read_csv calls.
file_path = "../input/data-with-features-ver01/data_with_features_ver01.csv"
df = pd.read_csv(file_path)
# Bare expression: the notebook renders the frame as this cell's output.
df

#### Test naive method:

In [None]:
# Naive baseline: find rows where volume_SMA4 is >= volume_SMA8 while on the
# previous row it was still below, then measure how often the next close is
# not lower than the close on that row.
# The "+ 1" shift relies on df having the default 0..n-1 integer index from read_csv.
close_prices = df["close"].to_numpy()
fast_above = df[(df["volume_SMA4"] >= df["volume_SMA8"])].index
fast_below_prev = df[(df["volume_SMA4"] < df["volume_SMA8"])].index + 1

# dtype=int keeps an empty result usable as an index array (an empty list
# would otherwise become a float array and fail when used for indexing).
crossover_rows = np.array(sorted(set(fast_above).intersection(fast_below_prev)), dtype=int)
# A crossover on the very last row has no "next close" to compare with.
crossover_rows = crossover_rows[crossover_rows + 1 < len(close_prices)]

if len(crossover_rows):
    next_not_lower = close_prices[crossover_rows + 1] - close_prices[crossover_rows] >= 0
    print(next_not_lower.sum() / len(crossover_rows))
else:
    print(float("nan"))
print(len(crossover_rows))

### **Preprocessing**

In [None]:
# Window lengths used as the numeric suffix of the per-window indicator columns
# ("std4", "RSI8", "price_SMA12", ...) in the preprocessing loops below.
lst = [4,8,12,16,20,24,48,72]

In [None]:
# List every column name on one line, each followed by two tabs.
print("".join(str(column_name) + "\t\t" for column_name in df.columns), end="")

In [None]:
# Re-read the same CSV into df; the preprocessing cells below overwrite and drop columns of this frame.
df = pd.read_csv(file_path)

#### 0. label

In [None]:
def relabel_label(a):
    """Map a price change to a direction flag.

    Returns -1 when ``a`` is strictly negative and 1 otherwise. Zero and
    NaN both land in the "otherwise" branch because ``a < 0`` is false for them.
    """
    return -1 if a < 0 else 1
    
# Direction of the close relative to the previous row: -1 for a drop, 1 otherwise.
# The first row has no predecessor, so its change is NaN and relabel_label gives 1.
price_change = df["close"] - df["close"].shift(1)
df["label"] = np.vectorize(relabel_label)(price_change.to_numpy())

# One-hot companions of "label"; "up" is created before "down", so "up" sits
# at the lower column position.
for flag_col, flag_value in (("up", 1), ("down", -1)):
    df[flag_col] = 0
    df.loc[df["label"] == flag_value, flag_col] = 1

#### 1. price vs ma

In [None]:
def relabel_ma(x, ma, std):
    """Rescale a price against a moving average and its standard deviation.

    Args:
        x: price value.
        ma: moving-average value for the same row.
        std: standard-deviation value for the same row.

    Returns:
        ``x / (ma + std)`` when ``x > ma``; ``(std - ma) / x`` when ``x <= ma``;
        NaN when neither comparison holds (i.e. ``x`` or ``ma`` is NaN).

    The NaN branch used to fall off the end of the function and return None.
    np.vectorize infers its output dtype from the first element it evaluates,
    so a leading None turned the whole result into an object array. Returning
    a float NaN keeps the output numeric.
    """
    if x > ma:
        return x / (ma + std)
    if x <= ma:
        # NOTE(review): this branch is (std - ma) / x, which is not the mirror
        # image of the branch above -- confirm the sign and operands are intended.
        return (std - ma) / x
    return float("nan")

# Overwrite each price_<MA><window> column with its relabel_ma rescaling,
# using the close price and the std<window> column of the same row.
ma_rescaler = np.vectorize(relabel_ma)
for window in lst:
    window_std = df["std" + str(window)]
    for ma_kind in ("SMA", "EMA", "WMA"):
        col_name = "price_" + ma_kind + str(window)
        df[col_name] = ma_rescaler(df["close"].to_numpy(), df[col_name].to_numpy(), window_std)

#### 2. std

In [None]:
def relabel_std(x, std):
    """Express ``x`` in units of ``std``.

    Args:
        x: value to scale (the caller passes the price difference).
        std: standard deviation to divide by.

    Returns:
        ``x / std`` when ``std`` is non-zero, otherwise ``0.0``.

    The fallback is the float ``0.0`` rather than the int ``0``. np.vectorize
    infers its output dtype from the first element it evaluates, so an int
    fallback on the first row would make the whole result integer and
    truncate every later ratio.
    """
    if std != 0:
        return x / std
    return 0.0

# Replace each std<window> column by price_diff divided by that std (0 where the std is 0).
std_rescaler = np.vectorize(relabel_std)
for window in lst:
    std_col = "std" + str(window)
    df[std_col] = std_rescaler(df["price_diff"].to_numpy(), df[std_col].to_numpy())

#### 3. rsi

In [None]:
def relabel_rsi(rsi):
    """Re-centre an RSI value around 50 with a 0.7-power curve.

    The distance from 50 is raised to the power 0.7 and divided by 50**0.7;
    the result is positive above 50 and negative at or below it.
    """
    scale = 1 / (50 ** 0.7)
    if rsi > 50:
        return scale * ((rsi - 50) ** 0.7)
    return -scale * ((50 - rsi) ** 0.7)

# Rescale every RSI<window> column in place with relabel_rsi.
rsi_rescaler = np.vectorize(relabel_rsi)
for window in lst:
    rsi_col = "RSI" + str(window)
    df[rsi_col] = rsi_rescaler(df[rsi_col].to_numpy())

#### 4. cci

In [None]:
def relabel_cci(cci):
    """Flip the sign of a CCI value and divide it by 100."""
    flipped = -cci
    return flipped / 100

# Rescale every CCI<window> column in place with relabel_cci.
cci_rescaler = np.vectorize(relabel_cci)
for window in lst:
    cci_col = "CCI" + str(window)
    df[cci_col] = cci_rescaler(df[cci_col].to_numpy())

#### 5. bop

In [None]:
# Keep this indicator unchanged: its range is already -1 to 1.

#### 6. volume

In [None]:
# Relative change of volume versus the previous row.
previous_volume = df["volume"].shift(1)
volume_change = (df["volume"] - previous_volume) / previous_volume
# A previous volume of 0 gives +/-inf, which dropna() does not remove.
# Mark those rows as NaN so the later dropna() discards them with the other
# incomplete rows instead of passing infinities into training.
df["volume_diff"] = volume_change.replace([np.inf, -np.inf], np.nan)

#### 7. mfi

In [None]:
def relabel_mfi(mfi):
    """Re-centre an MFI value around 50: the result is (mfi - 50) scaled by 1/50."""
    distance_below_mid = 50 - mfi
    return distance_below_mid * (-1 / 50)

# Rescale every MFI<window> column in place with relabel_mfi.
mfi_rescaler = np.vectorize(relabel_mfi)
for window in lst:
    mfi_col = "MFI" + str(window)
    df[mfi_col] = mfi_rescaler(df[mfi_col].to_numpy())

#### 10. drop

In [None]:
# Columns removed once, regardless of window length.
single_columns = [
    "Unnamed: 0", "TR", "plus_DM", "minus_DM", "price_diff", "PPO12-26",
    "open", "high", "low", "close", "volume",
    "quote_asset_volume", "number_of_trades",
    "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume",
    "raw_money_flow", "typical_price",
]
# Column-name prefixes removed for every window length in lst.
per_window_prefixes = (
    "RS", "ADX", "minus_DI", "plus_DI", "ATR", "DX", "mad",
    "smoothed_minus_DM", "smoothed_plus_DM",
    "volume_SMA", "volume_EMA", "volume_WMA",
)

df.drop(columns=single_columns, inplace=True)
for window in lst:
    df.drop(columns=[prefix + "%d" % window for prefix in per_window_prefixes], inplace=True)

In [None]:
# Display the frame as it stands after the column drops above.
df

In [None]:
# df["day"] = pd.to_datetime(df["Time_UTC_Start"]).dt.day/31
# df["month"] = pd.to_datetime(df["Time_UTC_Start"]).dt.month/12
# df["year"] = pd.to_datetime(df["Time_UTC_Start"]).dt.year/2018
# df["hour"] = pd.to_datetime(df["Time_UTC_Start"]).dt.hour/24

# if diff_percentage == True:
#     df["day"] /= 100
#     df["month"] /= 100
#     df["year"] /= 100
#     df["hour"] /= 100

In [None]:
# Remove the timestamp column and the -1/1 "label" column in place; the
# "up"/"down" one-hot columns derived from "label" stay in the frame.
df.drop(columns=["Time_UTC_Start", "label"], inplace=True)

In [None]:
# Drop, in place, every row that still has a missing value in any column.
df.dropna(inplace=True)

In [None]:
# Positional 70% / 10% / 20% split of df (rows kept in their existing order),
# with a NumPy copy of each part and its shape.
total_rows = len(df)
train_end = int(total_rows * 0.7)
val_end = int(total_rows * 0.8)

train_df = df.iloc[:train_end, :].copy()
val_df = df.iloc[train_end:val_end, :].copy()
test_df = df.iloc[val_end:].copy()

train_data = train_df.to_numpy()
val_data = val_df.to_numpy()
test_data = test_df.to_numpy()

num_rows_train, num_cols_train = train_data.shape
num_rows_val, num_cols_val = val_data.shape
num_rows_test, num_cols_test = test_data.shape

In [None]:
# Number of consecutive rows per input window; passed to split_sequences and to the model constructor.
steps = 8

In [None]:
def split_sequences(data, window_size, down_label_col_idx, up_label_col_idx):
    """Cut a 2-D array into sliding windows and next-row targets.

    For every row position ``t`` from ``window_size`` to the last row, the
    input sample is the ``window_size`` rows before ``t`` (all columns) and
    the target is row ``t`` restricted to columns
    ``down_label_col_idx .. up_label_col_idx`` inclusive.

    NOTE(review): the callers in this notebook pass the "up" column position
    first and the "down" position second, so the parameter names read the
    other way round from how they are used. The slice is only non-empty when
    the first index is not greater than the second.

    Returns:
        Tuple ``(x, y)`` of NumPy arrays built from the windows and targets.
    """
    target_positions = range(window_size, len(data))
    windows = [data[t - window_size:t, :] for t in target_positions]
    targets = [data[t, down_label_col_idx:up_label_col_idx + 1] for t in target_positions]
    return np.array(windows), np.array(targets)

In [None]:
# Cast each split to float32 before windowing.
train_data, val_data, test_data = (
    np.asarray(split).astype('float32') for split in (train_data, val_data, test_data)
)

# Target columns: positions of the "up" and "down" one-hot flags in df.
up_col_idx = df.columns.get_loc("up")
down_col_idx = df.columns.get_loc("down")

x_train, y_train = split_sequences(train_data, steps, up_col_idx, down_col_idx)
x_val, y_val = split_sequences(val_data, steps, up_col_idx, down_col_idx)
x_test, y_test = split_sequences(test_data, steps, up_col_idx, down_col_idx)

In [None]:
# Unit count of each LSTM layer in LSTM4_with_t2v, which reads this as a module-level global.
num_of_outputs = 32

## **Original model**

In [None]:
print('Build model...')

class LSTM4_with_t2v(Model):
    """Two parallel Dense projections feeding three stacked LSTMs and a 2-way softmax.

    The input is projected by a 1-unit linear Dense layer and by a 255-unit
    Dense layer passed through ``sin``; the two results are concatenated on
    the last axis (256 features) and fed through LSTM1 -> LSTM2 -> LSTM3.
    The output layer is ``Dense(2, softmax)``.

    NOTE(review): the stacking cells later in the notebook address the
    layers by position (``saved_model.layers[0]`` .. ``layers[5]``).
    Presumably that order follows the attribute-assignment order below;
    verify before reordering, adding, or removing any layer attribute.
    """

    def __init__(self, steps:int, df:DataFrame):
        """Create the layers.

        Args:
            steps: window length; used for the ``input_shape`` hint of the Dense layers.
            df: feature frame; only ``df.shape[-1]`` is used, for the same hint.
        """
        super().__init__()
        
        # Kept as attributes; call() does not read either of them.
        self.steps = steps
        self.df = df
        
        # Linear (1 unit) and periodic (255 units, sin applied in call()) projections.
        # NOTE(review): confirm whether input_shape has any effect in a subclassed model.
        self.t2v_first_col = Dense(1, input_shape=(steps,df.shape[-1]), activation=None)
        self.t2v_others_col = Dense(255, input_shape=(steps,df.shape[-1]), activation=None)

        # num_of_outputs is a module-level global defined in an earlier cell.
        # Only the last LSTM collapses the sequence (return_sequences=False).
        self.LSTM1 = LSTM(num_of_outputs, return_sequences=True, recurrent_dropout=0.1)
        self.LSTM2 = LSTM(num_of_outputs, return_sequences=True, recurrent_dropout=0.1)
        self.LSTM3 = LSTM(num_of_outputs, return_sequences=False, recurrent_dropout=0.1)
        
        # A single BatchNormalization instance; call() applies it twice, so both
        # uses share the same weights and statistics.
        self.batch_norm = BatchNormalization()
        self.out = Dense(2, activation='softmax')
        
#         self.dense1 = Dense(128, activation='relu')
#         self.dense2 = Dense(32, activation='relu')


    def call(self, inputs):
        """Forward pass: returns the 2-way softmax output for ``inputs``."""
        t2v_x1 = self.t2v_first_col(inputs)
        t2v_x2 = tf.sin(self.t2v_others_col(inputs))
        # 1 linear + 255 periodic features, concatenated on the last axis.
        t2v = tf.concat([t2v_x1,t2v_x2],-1)
        x1 = self.LSTM1(t2v)
        x1 = self.batch_norm(x1)
        x2 = self.LSTM2(x1)
        # Same batch_norm layer object as above (shared, not a second layer).
        x2 = self.batch_norm(x2)
        x3 = self.LSTM3(x2)
#         x3 = self.dense1(x2)
#         x4 = self.dense2(x3)

        return self.out(x3) 

# Instantiate the network for the configured window length and feature frame.
model = LSTM4_with_t2v(steps=steps, df=df)

opt = tf.keras.optimizers.Adam(learning_rate=0.001)

# Stop once validation accuracy has not improved by at least 1e-3 for
# 5 epochs, and roll back to the best weights seen.
monitor = EarlyStopping(
    monitor='val_accuracy',
    mode='auto',
    min_delta=1e-3,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Event files go to "logs", the directory the %tensorboard magic reads.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

### *Train*

In [None]:
print('Train...')

# A fresh Adam optimizer is created for this compile call.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

training_callbacks = [monitor, tensorboard_callback]
# range(1, 2) yields a single pass; the loop only numbers the "Load" banner.
for i in range(1, 2):
    print("Load %i:" % i)
    model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        epochs=100,
        batch_size=8,
        callbacks=training_callbacks,
    )

In [None]:
# Report loss and accuracy for each split on its own labelled line.
# Previously the three evaluate() calls were unlabelled and only the last
# return value was shown as the cell output.
# evaluate() returns [loss, accuracy] because the model was compiled with metrics=['accuracy'].
for split_name, features, targets in (
    ("train", x_train, y_train),
    ("val", x_val, y_val),
    ("test", x_test, y_test),
):
    loss, accuracy = model.evaluate(features, targets, verbose=0)
    print("%s: loss=%.4f accuracy=%.4f" % (split_name, loss, accuracy))

In [None]:
# print('Train...')

# opt = tf.keras.optimizers.Adam(
#     learning_rate=0.0001
# )

# model.compile(loss='mean_absolute_error', optimizer=opt)

# for i in range(1,2):
#     print("Load %i:"%i)
#     model.fit(x_train, y_train, batch_size=64, epochs=100, validation_data=(x_val, y_val), callbacks=[monitor, tensorboard_callback])

In [None]:
# Print the layer/parameter table of the trained model.
model.summary()

In [None]:
# Save the trained model with save_format="tf"; the stacking section loads it back from this same path.
# NOTE(review): the directory name mentions "dense-128relu-32relu", but those Dense
# layers are commented out in LSTM4_with_t2v -- the name may be stale.
model.save("t2v-256_2-lstm-out32-recdrop0.1_3-dense-128relu-32relu-2softmax", save_format="tf")

## **Stacking**

In [None]:
# Load the model written by the save cell above.
saved_model = tf.keras.models.load_model("./t2v-256_2-lstm-out32-recdrop0.1_3-dense-128relu-32relu-2softmax")
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
saved_model.summary()

In [None]:
# Rebuild the raw windows from the float32 split arrays; the transform cells
# below overwrite x_train / x_val / x_test with LSTM features.
up_col_idx = df.columns.get_loc("up")
down_col_idx = df.columns.get_loc("down")

x_train, y_train = split_sequences(train_data, steps, up_col_idx, down_col_idx)
x_val, y_val = split_sequences(val_data, steps, up_col_idx, down_col_idx)
x_test, y_test = split_sequences(test_data, steps, up_col_idx, down_col_idx)

#### Transform x_train

In [None]:
# Push x_train through the loaded model's first six layers, addressed by
# position, and keep the output of layers[4] as the new x_train.
# NOTE(review): presumably layers[0]/[1] are the two Dense projections,
# [2]..[4] the three LSTMs and [5] the shared BatchNormalization, matching the
# attribute order in LSTM4_with_t2v -- verify against saved_model.summary().
linear_proj, periodic_proj, lstm_first, lstm_second, lstm_last, shared_bn = saved_model.layers[:6]

t2v = tf.concat([linear_proj(x_train), tf.sin(periodic_proj(x_train))], -1)
x1 = shared_bn(lstm_first(t2v))
x2 = shared_bn(lstm_second(x1))
x_train = lstm_last(x2)

#### Transform x_val

In [None]:
# Push x_val through the loaded model's first six layers, addressed by
# position, and keep the output of layers[4] as the new x_val.
# NOTE(review): presumably layers[0]/[1] are the two Dense projections,
# [2]..[4] the three LSTMs and [5] the shared BatchNormalization, matching the
# attribute order in LSTM4_with_t2v -- verify against saved_model.summary().
linear_proj, periodic_proj, lstm_first, lstm_second, lstm_last, shared_bn = saved_model.layers[:6]

t2v = tf.concat([linear_proj(x_val), tf.sin(periodic_proj(x_val))], -1)
x1 = shared_bn(lstm_first(t2v))
x2 = shared_bn(lstm_second(x1))
x_val = lstm_last(x2)

#### Transform x_test

In [None]:
# Push x_test through the loaded model's first six layers, addressed by
# position, and keep the output of layers[4] as the new x_test.
# NOTE(review): presumably layers[0]/[1] are the two Dense projections,
# [2]..[4] the three LSTMs and [5] the shared BatchNormalization, matching the
# attribute order in LSTM4_with_t2v -- verify against saved_model.summary().
linear_proj, periodic_proj, lstm_first, lstm_second, lstm_last, shared_bn = saved_model.layers[:6]

t2v = tf.concat([linear_proj(x_test), tf.sin(periodic_proj(x_test))], -1)
x1 = shared_bn(lstm_first(t2v))
x2 = shared_bn(lstm_second(x1))
x_test = lstm_last(x2)

### **Stacked model**

In [None]:
print('Build model...')

# Fully connected head: six ReLU layers of the widths below, then a 2-way softmax.
hidden_widths = (1024, 512, 128, 128, 128, 32)
stacked_model = Sequential(
    [Dense(width, activation='relu') for width in hidden_widths]
    + [Dense(2, activation='softmax')]
)

opt = tf.keras.optimizers.Adam(learning_rate=0.001)

# Stop once validation accuracy has not improved by at least 1e-3 for
# 5 epochs, and roll back to the best weights seen.
monitor = EarlyStopping(
    monitor='val_accuracy',
    mode='auto',
    min_delta=1e-3,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

### *Train*

In [None]:
print('Train...')

# A fresh Adam optimizer is created for this compile call.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
stacked_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

training_callbacks = [monitor, tensorboard_callback]
# range(1, 2) yields a single pass; the loop only numbers the "Load" banner.
for i in range(1, 2):
    print("Load %i:" % i)
    stacked_model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        epochs=100,
        batch_size=64,
        callbacks=training_callbacks,
    )

In [None]:
# Report loss and accuracy for each split on its own labelled line.
# Previously the three evaluate() calls were unlabelled and only the last
# return value was shown as the cell output.
# evaluate() returns [loss, accuracy] because the model was compiled with metrics=['accuracy'].
for split_name, features, targets in (
    ("train", x_train, y_train),
    ("val", x_val, y_val),
    ("test", x_test, y_test),
):
    loss, accuracy = stacked_model.evaluate(features, targets, verbose=0)
    print("%s: loss=%.4f accuracy=%.4f" % (split_name, loss, accuracy))

### **Simple gradient-boosting model**

In [None]:
# Point saved_model at the in-memory trained model instead of the copy loaded from disk.
# NOTE(review): nothing below this line in the notebook reads saved_model, and
# x_train/x_val/x_test were already replaced by LSTM features above -- confirm this is still needed.
saved_model = model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.multioutput import MultiOutputClassifier

# Sweep the number of boosting stages from 50 to 950 in steps of 50. Each
# setting fits one depth-1 gradient-boosting classifier per target column
# and prints the validation score, then the test score.
# NOTE(review): the test score is printed for every setting; choosing n from
# these numbers would leak test information -- confirm how n is selected.
for n in range(50, 1000, 50):
    print("For n = " + str(n))
    booster = GBC(n_estimators=n, learning_rate=1.0, max_depth=1, random_state=0)
    clf = MultiOutputClassifier(booster)
    clf.fit(x_train, y_train)
    for features, targets in ((x_val, y_val), (x_test, y_test)):
        print(clf.score(features, targets))

In [None]:
tensorboard --logdir logs --load_fast true

***TEST TECHNICAL ANALYSIS SKILLS***

In [None]:
# For each volume moving average, find rows where volume is >= the average
# while on the previous row it was still below, and print the share of those
# rows whose next close is not lower, plus the row count.
liist = [4,8,12,16,20,24,48,72]

# Load the raw feature table here so this cell runs on a fresh kernel; it
# previously used dff before the cell that defines it.
dff = pd.read_csv(file_path)
close_prices = dff["close"].to_numpy()

# Hard-coded row cut-off kept from the original.
# NOTE(review): presumably chosen so that "row + 1" stays inside the data -- confirm.
MAX_ROW = 33190

for name in ["volume"]:
    for ma in ["SMA", "EMA", "WMA"]:
        for window in liist:
            name1 = "volume"
            name2 = name+"_"+ma+str(window)
            above = dff[(dff[name1] >= dff[name2])].index
            # "+1" relies on dff having the default 0..n-1 integer index.
            below_prev = dff[(dff[name1] < dff[name2])].index+1
            # dtype=int keeps an empty result usable as an index array. A
            # separate name is used so the window-size list `lst` from the
            # preprocessing cells is not overwritten.
            rows = np.array(sorted(set(above).intersection(below_prev)), dtype=int)
            rows = rows[rows < MAX_ROW]
            # Never look past the last available close.
            rows = rows[rows + 1 < len(close_prices)]
            if len(rows):
                share = (close_prices[rows+1] - close_prices[rows] >= 0).sum()/len(rows)
            else:
                share = float("nan")
            print(name1 + " - " + name2, share, len(rows))

In [None]:
# Read the feature CSV into a second frame, dff, for the indicator checks;
# unlike df it keeps every original column.
file_path = "../input/data-with-features-ver01/data_with_features_ver01.csv"
dff = pd.read_csv(file_path)

In [None]:
liist = [4,8,12,16,20,24,48,72]
for n in ["CCI","RSI","RS"]:
    for i in liist:
        name = n+str(i)
        lst = dff[dff[name] > 70].index
        lst = lst[lst<33190]
        print(name,(dff["close"].to_numpy()[np.array(lst)+1] - dff["close"].to_numpy()[np.array(lst)] < 0).sum()/len(lst), len(lst))

In [None]:
# List the columns of df, one name per line.
for column_name in df.columns.tolist():
    print(column_name)