## **Modules**

In [52]:
import datetime, os
import random

# Disable oneDNN custom ops. The switch is an *environment variable*, so it has
# to be exported before TensorFlow is imported below; assigning a Python
# variable of the same name has no effect on TensorFlow.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from pandas import DataFrame

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, LeakyReLU, Dense, Embedding, Dropout, LSTM, BatchNormalization, Flatten, Bidirectional, Reshape, Concatenate
from tensorflow.keras.callbacks import EarlyStopping

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Seed every RNG the notebook draws from (index sampling, weight initialisation,
# dropout) so that Restart Kernel -> Run All reproduces the same numbers.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [53]:
# Register the TensorBoard notebook extension; this is what makes the `%tensorboard` line magic available.
%load_ext tensorboard

In [1]:
# Open TensorBoard on the `logs` directory. The magic only exists after `%load_ext tensorboard`
# has run in the current kernel; running this cell first raises the UsageError shown below.
%tensorboard --logdir=logs

UsageError: Line magic function `%tensorboard` not found.


## **Data**

In [55]:
# Load the feature table and show only a short preview of it.
file_path = "../input/data-with-features-ver01/data_with_features_ver01.csv"
# Alternative when the CSV sits next to the notebook: file_path = "data_with_features_ver01.csv"
df = pd.read_csv(file_path)
df.head()

#### Test naive method:

In [56]:
# Naive baseline: after volume_SMA4 crosses above volume_SMA8, how often is the
# next close greater than or equal to the current close?
close = df["close"].to_numpy()
sma_fast = df["volume_SMA4"].to_numpy()
sma_slow = df["volume_SMA8"].to_numpy()

# Row i is a crossover when fast >= slow at i while fast < slow at i-1.
# Comparisons against NaN are False, so rows without a valid SMA never qualify.
is_cross = np.zeros(len(df), dtype=bool)
is_cross[1:] = (sma_fast[1:] >= sma_slow[1:]) & (sma_fast[:-1] < sma_slow[:-1])
# The final row has no following close to compare with; keeping it would index
# one past the end of `close`.
is_cross[-1:] = False

cross_idx = np.flatnonzero(is_cross)
lst = cross_idx.tolist()

up_moves = int((close[cross_idx + 1] - close[cross_idx] >= 0).sum())
# Guard against a data set without any crossover instead of dividing by zero.
print(up_moves / len(lst) if lst else float("nan"))
print(len(lst))

### **Preprocessing**

In [69]:
# Reload the raw table and build the modelling frame:
#   * `price_diff` is a copy of `close` and is the column the models predict,
#   * bookkeeping columns and the original `close` are removed,
#   * rows containing NaN are dropped. `dropna()` returns a new frame, so its
#     result has to be kept; the index is rebuilt so positions and labels agree.
file_path = "../input/data-with-features-ver01/data_with_features_ver01.csv"
original_df = (
    pd.read_csv(file_path)
    .assign(price_diff=lambda frame: frame["close"])
    .drop(columns=["Unnamed: 0", "Time_UTC_Start", "close"])
    .dropna()
    .reset_index(drop=True)
)
df = original_df.copy()

In [68]:
# df["day"] = pd.to_datetime(df["Time_UTC_Start"]).dt.day/31
# df["month"] = pd.to_datetime(df["Time_UTC_Start"]).dt.month/12
# df["year"] = pd.to_datetime(df["Time_UTC_Start"]).dt.year/2018
# df["hour"] = pd.to_datetime(df["Time_UTC_Start"]).dt.hour/24

# if diff_percentage == True:
#     df["day"] /= 100
#     df["month"] /= 100
#     df["year"] /= 100
#     df["hour"] /= 100

### Data normalization

In [70]:
def data_normalization(df: DataFrame, days_range: int=20, include_cur_row: bool=False):
    '''
    Z-score every column of `df` against a rolling window and return the result.

    + df: the dataframe to be normalized (left untouched)
    + days_range: number of rows in the rolling window
    + include_cur_row: when True the window is the current row plus the
    (days_range-1) rows before it; when False the window is the days_range rows
    strictly before the current row

    Infinite values (non-zero deviation divided by a zero std) become 0, rows
    that still contain NaN (e.g. the warm-up rows without a full window) are
    removed, and the index is rebuilt from 0.
    '''
    if include_cur_row:
        window = df.rolling(days_range)
    else:
        # closed='left' leaves the current row out of its own statistics
        window = df.rolling(days_range, closed='left')

    centered = df - window.mean()
    scaled = centered / window.std()
    scaled = scaled.replace([np.inf, -np.inf], 0)
    return scaled.dropna().reset_index(drop=True)


def data_denormalization(predictions, original_df: DataFrame, steps: int, col_idx: int, days_range: int=20, include_cur_row: bool=False):
    '''
    Map normalized predictions for one column back to the original scale.

    + predictions: normalized predictions, any shape; flattened to 1-D
    + original_df: un-normalized dataframe whose rolling mean/std were used to
    normalize the data (left untouched)
    + steps: length of the input window that precedes every prediction
    + col_idx: positional index of the predicted column in original_df
    + days_range: number of rows in the rolling window
    + include_cur_row: must match the flag used for normalization (True: the
    window ends at the current row, False: it ends at the previous row)

    Returns a Series carrying original_df's index labels and column name for
    the predicted rows. The first prediction belongs to row
    `warmup + steps`, where warmup is the number of leading rows without a full
    rolling window (days_range, or days_range-1 when the current row is part of
    the window).

    Raises ValueError when the number of predictions does not match the number
    of rows from that position to the end of original_df.
    '''
    warmup = days_range - 1 if include_cur_row else days_range
    first_row = warmup + steps

    # Only the predicted column is needed; cast to float so integer columns
    # can be rescaled as well.
    column = original_df.iloc[:, col_idx].astype('float64')
    if include_cur_row:
        col_roll = column.rolling(days_range)
    else:
        col_roll = column.rolling(days_range, closed='left')

    flat_predictions = np.asarray(predictions, dtype='float64').reshape(-1)
    n_targets = max(len(column) - first_row, 0)
    if flat_predictions.shape[0] != n_targets:
        raise ValueError(
            "expected %d predictions for rows %d.. of original_df, got %d"
            % (n_targets, first_row, flat_predictions.shape[0])
        )

    roll_std = col_roll.std().to_numpy()[first_row:]
    roll_mean = col_roll.mean().to_numpy()[first_row:]
    rescaled = flat_predictions * roll_std + roll_mean
    return pd.Series(rescaled, index=original_df.index[first_row:], name=column.name)

In [71]:
# Always normalize from the untouched frame so that re-running this cell does
# not normalize already-normalized data.
df = data_normalization(original_df)

In [72]:
# Chronological split of the normalized frame: first 70% train, next 10%
# validation, remaining 20% test.
train_end = int(len(df)*0.7)
val_end = int(len(df)*0.8)

norm_train_df = df.iloc[0:train_end,:]
norm_val_df = df.iloc[train_end:val_end,:]
norm_test_df = df.iloc[val_end:]

# Matching slices of the un-normalized frame, each extended by 20 rows; they
# are later passed to data_denormalization as `original_df`.
# NOTE(review): the 20 presumably mirrors the default days_range of
# data_normalization - confirm if that default ever changes.
train_df = original_df.iloc[:train_end+20,:]
val_df = original_df.iloc[train_end:val_end+20,:]
test_df = original_df.iloc[val_end:,:]

# Numpy views of the normalized splits and their shapes
train_data = norm_train_df.to_numpy()
val_data = norm_val_df.to_numpy()
test_data = norm_test_df.to_numpy()

num_rows_train, num_cols_train = train_data.shape
num_rows_val, num_cols_val = val_data.shape
num_rows_test, num_cols_test = test_data.shape

In [73]:
# Display the normalized training split for a visual sanity check.
norm_train_df

In [74]:
# Re-materialise the three normalized splits as float32 arrays.
train_data, val_data, test_data = (
    np.asarray(split_frame).astype('float32')
    for split_frame in (norm_train_df, norm_val_df, norm_test_df)
)

In [75]:
# Window length: number of consecutive rows that form one model input sample.
steps = 8

### Split sequence by window size

In [76]:
def split_sequences(data, window_size, price_diff_col_idx):
    """Cut a 2-D array into sliding input windows and next-row targets.

    For every start row i, the input is rows i .. i+window_size-1 (all columns)
    and the target is the value of column `price_diff_col_idx` in row
    i+window_size.

    Returns (x, y): x stacks the windows, y holds the targets as a column
    vector of shape (n_windows, 1).
    """
    n_windows = len(data) - window_size
    windows = [data[start:start+window_size,:] for start in range(n_windows)]
    targets = [data[start+window_size,price_diff_col_idx] for start in range(n_windows)]
    return np.array(windows), np.array(targets).reshape((-1,1))

### Split sequence for GAN training

In [77]:
def split_sequences_gan(data, window_size, price_diff_col_idx):
    """Build sliding windows of a single column for the discriminator.

    Each window holds window_size+1 consecutive values of column
    `price_diff_col_idx`: the window_size history values followed by the value
    that comes right after them.
    """
    n_windows = len(data) - window_size
    return np.array([
        data[start:start+window_size+1,price_diff_col_idx]
        for start in range(n_windows)
    ])

In [78]:
# Position of the target column, looked up once and shared by every split.
price_diff_col_idx = df.columns.get_loc("price_diff")

# Real (history + next value) sequences shown to the discriminator
x_train_gan = split_sequences_gan(train_data, steps, price_diff_col_idx)

# Supervised (window, next value) pairs for each split
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    split_sequences(split_data, steps, price_diff_col_idx)
    for split_data in (train_data, val_data, test_data)
)

In [79]:
# Passed to the model constructors as `num_of_outputs`; GeneratorModel uses it as the unit count of its LSTM layers.
num_of_outputs = 64

## **Original model**

In [80]:
# Shared loss objects used by discriminator_loss / generator_loss below.
# Binary cross-entropy scores the discriminator's real/fake outputs.
cross_entropy = tf.keras.losses.BinaryCrossentropy()
# Mean absolute error scores the generator's predictions against the targets.
mae = tf.keras.losses.MeanAbsoluteError()

In [81]:
def discriminator_loss(real_output, fake_output):
    """Binary cross-entropy of the discriminator on one real and one fake batch.

    Real outputs are scored against a target of 1, fake outputs against a
    target of 0; the two losses are added.
    """
    loss_on_real = cross_entropy(tf.ones_like(real_output), real_output)
    loss_on_fake = cross_entropy(tf.zeros_like(fake_output), fake_output)
    return loss_on_real + loss_on_fake

def generator_loss(fake_output, y_true=None, y_pred=None, mae_weight=3):
    """Generator objective: weighted MAE plus the adversarial cross-entropy.

    + fake_output: discriminator output for generated samples; the adversarial
    term rewards the generator when these are close to 1
    + y_true / y_pred: targets and generator predictions for the MAE term.
    Both are optional; when neither is given the function falls back to the
    notebook globals and compares `generator(x_train)` with `y_train`
    + mae_weight: factor applied to the MAE term (default 3)

    Raises ValueError when only one of y_true / y_pred is supplied, because the
    other one would silently come from the full training set.
    """
    if (y_true is None) != (y_pred is None):
        raise ValueError("y_true and y_pred must be provided together")
    if y_true is None:
        y_true = y_train.reshape((-1,1))
        y_pred = generator(x_train)

    adversarial_term = cross_entropy(tf.ones_like(fake_output), fake_output)
    return mae_weight*mae(y_true, y_pred) + adversarial_term

In [82]:
# Base Adam optimizer configuration (learning rate 1e-4) for the GAN training below.
adam = tf.keras.optimizers.Adam(
    learning_rate=0.0001
)

### **1. Generator**

In [83]:
class GeneratorModel(Model):
    """Sequence-to-one regressor used as the GAN generator.

    Each time step is first embedded into 256 features: one linear projection
    concatenated with 255 sine-activated projections (the `t2v_*` names suggest
    a Time2Vec-style embedding). Three stacked LSTMs follow, the last one
    returning only its final state, then a 32-unit dense layer and a single
    output unit.

    + num_of_outputs: number of units of every LSTM layer
    """

    def __init__(self, num_of_outputs:int):
        super().__init__()
        # The input shape is inferred on the first call, so the layers do not
        # need to know the window length or the number of feature columns.
        self.t2v_first_col = Dense(1, activation=None)
        self.t2v_others_col = Dense(255, activation=None)
        self.LSTM1 = LSTM(num_of_outputs, return_sequences=True, recurrent_dropout=0.3)
        self.LSTM2 = LSTM(num_of_outputs, return_sequences=True, recurrent_dropout=0.3)
        self.LSTM3 = LSTM(num_of_outputs, return_sequences=False, recurrent_dropout=0.3)
        self.batch_norm1 = BatchNormalization()
        self.batch_norm2 = BatchNormalization()
        self.Dense1 = Dense(32)
        self.Dense2 = Dense(1)
        # Dedicated normalization for the second LSTM's output, so that each
        # depth keeps its own scale/offset and moving statistics.
        self.batch_norm_lstm2 = BatchNormalization()

    def call(self, inputs, training=None):
        """Return one prediction per input window.

        `training` is forwarded to the LSTM (recurrent dropout) and batch
        normalization layers.
        """
        t2v_x1 = self.t2v_first_col(inputs)
        t2v_x2 = tf.sin(self.t2v_others_col(inputs))
        t2v = tf.concat([t2v_x1,t2v_x2],-1)
        x1 = self.LSTM1(t2v, training=training)
        x1 = self.batch_norm1(x1, training=training)
        x2 = self.LSTM2(x1, training=training)
        x2 = self.batch_norm_lstm2(x2, training=training)
        x3 = self.LSTM3(x2, training=training)
        x4 = self.Dense1(x3)
        x5 = self.batch_norm2(x4, training=training)
        return self.Dense2(x5)


generator = GeneratorModel(num_of_outputs=num_of_outputs)

### **2. Discriminator**

In [84]:
class DiscriminatorModel(Model):
    """Dense real/fake classifier over a whole price window.

    The input is a batch of sequences (history values followed by the next
    value); the output is one sigmoid probability per sequence, shape
    (batch, 1).

    + num_of_outputs: accepted for signature compatibility; not used by the
    dense architecture
    """

    def __init__(self, num_of_outputs:int):
        super().__init__()
        # Dense layers act on the last axis only. The sequence is therefore
        # kept flat as (batch, window) so that every unit sees the complete
        # window; a trailing axis of size 1 would make the network score each
        # value in isolation.
        self.Flatten = Flatten()
        self.batch_norm3 = BatchNormalization()
        self.batch_norm4 = BatchNormalization()
        self.Dense2 = Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.01))
        self.Dense3 = Dense(32, activation=tf.keras.layers.LeakyReLU(alpha=0.01))
        self.Dense4 = Dense(1, activation='sigmoid')

    def call(self, inputs, training=None):
        """Return the probability that each input sequence is real.

        `training` is forwarded to the batch normalization layers.
        """
        x0 = self.Flatten(inputs)
        x3 = self.Dense2(x0)
        x3 = self.batch_norm3(x3, training=training)
        x4 = self.Dense3(x3)
        x4 = self.batch_norm4(x4, training=training)
        return self.Dense4(x4)

discriminator = DiscriminatorModel(num_of_outputs=num_of_outputs)

In [85]:
def generate_fake_samples(x, generator, price_diff_col_idx):
    """Build discriminator inputs from generator predictions.

    + x: batch of input windows, indexed as (sample, time step, feature)
    + generator: callable returning one prediction per window
    + price_diff_col_idx: feature index of the predicted column inside `x`

    Returns the history of the predicted column for every window with the
    generator's prediction appended along axis 1.
    """
    pred = generator(x)
    # Use the index handed in by the caller rather than looking it up in a
    # global frame, so the function works for any feature layout.
    history = x[:,:,price_diff_col_idx]
    return Concatenate(axis=1)([history,pred])

In [86]:
def generate_idx_data(x_train, rate=0.5):
    """Randomly split sample positions into a "fake" and a "real" group.

    + x_train: array whose first dimension is the number of samples
    + rate: fraction of the sample count to draw for the fake group

    Returns (faked_idx, real_idx). faked_idx holds int(n * rate) positions
    drawn with replacement, so it may contain duplicates. real_idx holds every
    position that was not drawn, as a sorted integer array; it stays a valid
    (empty) index array even when every position was drawn.
    """
    n_samples = x_train.shape[0]
    faked_idx = np.random.randint(0, n_samples, int(n_samples*rate))
    real_idx = np.setdiff1d(np.arange(n_samples), faked_idx)
    return faked_idx, real_idx

In [87]:
# Each network needs its own optimizer instance: an optimizer keeps per-variable
# state (Adam moments, step counter), which must not be shared between the
# generator and the discriminator. Both are cloned from the base `adam`
# configuration so the hyper-parameters stay defined in one place.
generator_optimizer = tf.keras.optimizers.Adam.from_config(adam.get_config())
discriminator_optimizer = tf.keras.optimizers.Adam.from_config(adam.get_config())

In [123]:
# compile() is only needed so that generator.evaluate() can report MAE below;
# the weight updates themselves are applied manually.
generator.compile(loss="mae")
price_diff_col_idx = df.columns.get_loc("price_diff")

for i in range(5):
    print("i =", i, ":")

    # Draw this iteration's fake / real sample positions
    faked_idx, real_idx = generate_idx_data(x_train)
    x_real = x_train_gan[real_idx]

    # One tape per network; both record the same forward pass
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        x_faked = generate_fake_samples(x_train[faked_idx], generator, price_diff_col_idx)

        real_output = discriminator(x_real, training=True)
        fake_output = discriminator(x_faked, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    generator_vars = generator.trainable_variables
    discriminator_vars = discriminator.trainable_variables
    gradients_of_generator = gen_tape.gradient(gen_loss, generator_vars)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator_vars)

    # Report before the update, i.e. for the weights that produced the losses
    print("Generator loss %s | Discriminator loss %s" %(float(gen_loss), float(disc_loss)))
    print("Val loss evaluation", generator.evaluate(x_val,y_val))

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator_vars))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator_vars))

In [124]:
# Directional accuracy on the training split: a prediction is a hit when it
# lies on the same side of the previous real value as the real value itself.
# The first row has no previous value, so it never counts as a hit.
predictions_train = data_denormalization(
    generator.predict(x_train),
    original_df=train_df,
    steps=steps,
    col_idx=df.columns.get_loc("price_diff"),
)
y_real_train = train_df["price_diff"][20+steps:].reset_index(drop=True)

prev_real_train = y_real_train.shift(1)
real_move_train = y_real_train - prev_real_train
pred_move_train = predictions_train.to_numpy() - prev_real_train.to_numpy()
hits_train = (real_move_train*pred_move_train > 0).sum()
print("Train Accuracy:", hits_train/predictions_train.shape[0])

In [125]:
# Directional accuracy on the validation split: a prediction is a hit when it
# lies on the same side of the previous real value as the real value itself.
# The first row has no previous value, so it never counts as a hit.
predictions_val = data_denormalization(
    generator.predict(x_val),
    original_df=val_df,
    steps=steps,
    col_idx=df.columns.get_loc("price_diff"),
)
y_real_val = val_df["price_diff"][20+steps:].reset_index(drop=True)

prev_real_val = y_real_val.shift(1)
real_move_val = y_real_val - prev_real_val
pred_move_val = predictions_val.to_numpy() - prev_real_val.to_numpy()
hits_val = (real_move_val*pred_move_val > 0).sum()
print("Validation Accuracy:", hits_val/predictions_val.shape[0])

In [126]:
# Directional accuracy on the test split: a prediction is a hit when it lies on
# the same side of the previous real value as the real value itself.
# The first row has no previous value, so it never counts as a hit.
predictions_test = data_denormalization(
    generator.predict(x_test),
    original_df=test_df,
    steps=steps,
    col_idx=df.columns.get_loc("price_diff"),
)
y_real_test = test_df["price_diff"][20+steps:].reset_index(drop=True)

prev_real_test = y_real_test.shift(1)
real_move_test = y_real_test - prev_real_test
pred_move_test = predictions_test.to_numpy() - prev_real_test.to_numpy()
hits_test = (real_move_test*pred_move_test > 0).sum()
print("Test Accuracy:", hits_test/predictions_test.shape[0])

In [127]:
# Overlay the first 100 de-normalized test predictions on the real values.
fig, ax = plt.subplots(figsize=(16,9), dpi=90)
sample_axis = np.arange(100)
ax.plot(sample_axis, predictions_test[0:100], label="predictions")
ax.plot(sample_axis, y_real_test[0:100], label="real")
ax.legend()
plt.show()