In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

from scipy import optimize

import tensorflow as tf
import torch
import os
# Move up to the project root so the relative 'data/daily_data/' path used by
# train_df_test_df resolves. Guarded so that re-running this cell does not
# climb one more directory each time.
if not os.path.isdir(os.path.join('data', 'daily_data')):
    os.chdir('..')

In [2]:
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/numpy ones (e.g. chained-assignment or invalid-value
# warnings) raised by the cells below.
import warnings
warnings.filterwarnings('ignore')

In [3]:
import datetime
import matplotlib as mpl
from matplotlib import cm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [4]:
def train_df_test_df(ticker):
    '''
    Load every daily CSV of one ticker and split the files 80/20 into
    a training frame and a test frame.

    Files are looked up in 'data/daily_data/' relative to the current working
    directory. A file belongs to ``ticker`` when ``ticker`` is one of the
    underscore-separated parts of its file name. Matching file names are
    sorted; the first 80% of the files are concatenated into the training
    frame and the remaining files into the test frame.

    :param ticker: string, matched against the '_'-separated file name parts
    :return: (train_df, test_df); row labels are kept as read from each file
             (they are NOT renumbered here)
    :raises ValueError: if either side of the split contains no file
    '''
    data_path = 'data/daily_data/'

    def read_and_concat(file_names):
        # Read everything first and concatenate once; concatenating inside the
        # loop re-copies the accumulated frame for every additional file.
        return pd.concat([pd.read_csv(data_path + name) for name in file_names])

    ticker_files = sorted(item for item in os.listdir(data_path)
                          if ticker in item.split('_'))

    split_idx = int(len(ticker_files) * 0.8)
    train_ticker_files = ticker_files[:split_idx]
    test_ticker_files = ticker_files[split_idx:]

    if not train_ticker_files or not test_ticker_files:
        raise ValueError(
            'Need at least two files for ticker {!r} in {!r} to build a '
            'train/test split, found {}'.format(ticker, data_path, len(ticker_files)))

    return read_and_concat(train_ticker_files), read_and_concat(test_ticker_files)

In [5]:
def get_processed_minute_data(df):
    '''
    Strip unused columns and rows from a raw minute-bar frame and add
    calendar features. ``df`` is modified in place and also returned.

    Steps, in order:
      1. drop the first four columns plus a fixed list of unused columns,
      2. renumber the rows 0..n-1,
      3. drop rows whose ``marketNotional`` is exactly 0.0 and renumber again,
      4. parse ``date`` ('%Y%m%d') and ``minute`` ('%H:%M') into datetimes,
      5. add string columns ``weekday``, ``month`` and ``hour``.

    :param df: DataFrame with at least the dropped columns, ``date``,
               ``minute`` and ``marketNotional``
    :return: the same DataFrame object, processed
    '''
    leading_cols = df.columns.tolist()[:4]
    unused_cols = ['label', 'changeOverTime', 'close', 'high', 'low',
                   'marketAverage', 'marketClose', 'marketOpen', 'volume',
                   'numberOfTrades', 'notional', 'open', 'marketChangeOverTime']
    df.drop(columns=leading_cols + unused_cols, inplace=True)
    # Row labels must be unique before rows are dropped by label below.
    df.reset_index(drop=True, inplace=True)

    no_notional_rows = df.index[df.marketNotional == 0.0]
    df.drop(index=no_notional_rows, inplace=True)
    df.reset_index(drop=True, inplace=True)

    def parse_date(raw):
        return datetime.datetime.strptime(str(raw), '%Y%m%d')

    def parse_minute(raw):
        return datetime.datetime.strptime(raw, '%H:%M')

    df['date'] = df['date'].map(parse_date)
    df['weekday'] = df['date'].map(lambda day: str(day.weekday()))
    df['month'] = df['date'].map(lambda day: str(day.month))

    df['minute'] = df['minute'].map(parse_minute)
    df['hour'] = df['minute'].map(lambda moment: str(moment.hour))

    return df

In [6]:
def get_numeric_categoric(df):
    '''
    Partition the column names of ``df`` by dtype.

    :param df: DataFrame
    :return: (numeric_cols, categorical_cols) - columns whose dtype is a
             numpy numeric dtype, and every other column, each in the
             frame's column order
    '''
    is_numeric = {name: np.issubdtype(df[name].dtype, np.number) for name in df}

    numeric_cols = [name for name in df if is_numeric[name]]
    categorical_cols = [name for name in df if not is_numeric[name]]

    return numeric_cols, categorical_cols

In [7]:
def delta_dataframe(df, numeric_columns):
    '''
    Replace the numeric columns by log-differences and add look-ahead /
    look-back log-difference columns. ``df`` is modified in place and returned.

    For every shift k in [-20, -10, -5, 3, 5, 10, 20] and every numeric column
    ``col`` a column named ``col_k`` is added:
      * k < 0 (look-ahead): log(value |k| rows later) - log(current value)
      * k > 0 (look-back):  log(current value) - log(value k rows earlier)
    The original numeric columns become log(current) - log(previous row).
    Finally the first 20 and the last 20 rows, which lack the history or the
    future needed for the largest shifts, are removed.

    :param df: DataFrame whose index is 0..len(df)-1
    :param numeric_columns: list of column names to transform; names may
                            contain underscores
    :return: the same DataFrame object with rows labelled 0..n-1
    '''
    MAX_SHIFT_BACKWARD, MAX_SHIFT_FORWARD = -20, 20
    shifts = [MAX_SHIFT_BACKWARD, -10, -5, 3, 5, 10, MAX_SHIFT_FORWARD]
    numeric_columns = list(numeric_columns)

    # Remember (new column, source column, shift) so that neither the source
    # column nor the sign of the shift has to be parsed back out of the
    # generated name (which breaks for source names containing '_').
    added = []
    for shift in shifts:
        for col in numeric_columns:
            new_col_name = col + '_' + str(shift)
            df[new_col_name] = df[col].shift(shift)
            added.append((new_col_name, col, shift))
    added_columns = [new_col for new_col, _, _ in added]

    log_columns = numeric_columns + added_columns
    df[log_columns] = df[log_columns].apply(np.log)

    # Shifted columns: difference against the (still un-differenced) source.
    for new_col, original_col, shift in added:
        if shift < 0:
            df[new_col] = df[new_col] - df[original_col]
        else:
            df[new_col] = df[original_col] - df[new_col]

    # Source columns last: once they are differenced they can no longer serve
    # as the reference level for the shifted columns above.
    df[numeric_columns] = df[numeric_columns] - df[numeric_columns].shift(1)

    # The positional trimming below relies on a 0..n-1 index.
    assert (df.index == np.arange(len(df))).all()
    # Leading rows have no value MAX_SHIFT_FORWARD rows back.
    df.drop(index=df.index[:MAX_SHIFT_FORWARD], inplace=True)
    df.reset_index(drop=True, inplace=True)
    # Trailing rows have no value |MAX_SHIFT_BACKWARD| rows ahead.
    df.drop(index=df.index[MAX_SHIFT_BACKWARD:], inplace=True)

    return df

In [8]:
def load_dataframes(ticker):
    '''
    Build the train and test frames for one ticker: load the CSVs, clean them,
    and convert the numeric columns to log-deltas.

    :param ticker: string passed to train_df_test_df
    :return: (train_df, test_df, numeric_cols, categoric_cols) where
             numeric_cols are the numeric columns of the final training frame
             and categoric_cols is the fixed list ['weekday', 'month', 'hour']
    '''
    # Clean the training frame first, then the test frame.
    train_df, test_df = [get_processed_minute_data(raw_df)
                         for raw_df in train_df_test_df(ticker)]

    # The columns to difference are decided from the training frame only and
    # then applied to both frames.
    base_numeric_cols, _ = get_numeric_categoric(train_df)
    train_df = delta_dataframe(train_df, base_numeric_cols)
    test_df = delta_dataframe(test_df, base_numeric_cols)

    # delta_dataframe added shifted columns, so list the numeric columns again.
    numeric_cols, _ = get_numeric_categoric(train_df)

    # Fixed for the time being rather than inferred from the dtypes.
    categoric_cols = ['weekday', 'month', 'hour']

    return train_df, test_df, numeric_cols, categoric_cols

def get_y_cols(numeric_cols):
    '''
    Split the look-ahead columns into prediction targets and the rest.

    A column counts as look-ahead when its name contains '-' (the negative
    shift suffix). Look-ahead columns whose name contains 'High' or 'Low'
    are the targets.

    :param numeric_cols: list of column names
    :return: (interested_cols, not_interested_cols), both in the order the
             names appear in ``numeric_cols`` and without duplicates in
             not_interested_cols
    '''
    price_cols = [item for item in numeric_cols if '-' in item]
    interested_cols = [item for item in price_cols if 'High' in item or 'Low' in item]

    # Ordered filter instead of a set difference, so the result does not
    # depend on string hash ordering and is identical on every run.
    seen = set(interested_cols)
    not_interested_cols = []
    for item in price_cols:
        if item not in seen:
            seen.add(item)
            not_interested_cols.append(item)

    return interested_cols, not_interested_cols

# Load the 'cmg' frames, then separate the numeric columns into targets
# (look-ahead High/Low deltas) and model inputs (every numeric column that is
# not a look-ahead column).
train_df_original, test_df_original, numeric_cols, categoric_cols = load_dataframes('cmg')
y_cols, not_interested = get_y_cols(numeric_cols)
# Ordered filter instead of set arithmetic: the feature column order (and so
# the layout of the model input) is then the same on every run.
excluded_cols = set(y_cols) | set(not_interested)
numeric_cols = [col for col in numeric_cols if col not in excluded_cols]

In [9]:
# Model inputs: the feature columns of each split.
train_df = train_df_original[numeric_cols]
test_df = test_df_original[numeric_cols]

# Targets: all target columns except the last two. Selecting by position
# returns a new frame, instead of dropping in place on a frame that was itself
# sliced out of train_df_original / test_df_original.
y_train = train_df_original[y_cols].iloc[:, :-2]
y_test = test_df_original[y_cols].iloc[:, :-2]

# Label is 1 where the look-ahead log-delta exceeds 0.002, else 0.
# Builtin int is what the np.int alias pointed to; the alias itself was
# removed in NumPy 1.24.
binary_y_train = (y_train > 0.002).astype(int)
binary_y_test = (y_test > 0.002).astype(int)

In [10]:
# Scalers kept because they worked well in the autoencoder experiments.
# FeatureUnion concatenates the output of every transformer column-wise.
transfomer = [
    ('Data after min-max scaling', MinMaxScaler()),
    ('Data after max-abs scaling', MaxAbsScaler()),
    ('Data after quantile transformation (uniform pdf)',
     QuantileTransformer(output_distribution='uniform')),
    ('Data after sample-wise L2 normalizing', Normalizer()),
]

combined = FeatureUnion(transformer_list=transfomer)
# Fitted on the training features only; fit() returns the union itself.
combined_fit = combined.fit(train_df)

In [11]:
# Apply the FeatureUnion (fitted on the training features) to both splits.
x_train_transformed = combined.transform(train_df)
x_test_transformed = combined.transform(test_df)

In [12]:
# Sanity check: shapes of the transformed train / test feature matrices.
x_train_transformed.shape, x_test_transformed.shape

((8849, 100), (2272, 100))

In [13]:
class LogisticRegressor(nn.Module):
    '''
    Small feed-forward classifier:
    input -> Linear(32) + ReLU -> Linear(16) + tanh -> Linear(out) + sigmoid.

    :param input_size: number of input features
    :param final_output_size: number of outputs, each squashed to (0, 1)
    '''
    def __init__(self, input_size, final_output_size):
        super(LogisticRegressor, self).__init__()

        # Attribute names l1 / l2 / l4 are the state_dict keys.
        self.l1 = nn.Linear(in_features=input_size, out_features=32)
        self.l2 = nn.Linear(in_features=32, out_features=16)
        self.l4 = nn.Linear(in_features=16, out_features=final_output_size)

    def forward(self, x):
        hidden = torch.relu(self.l1(x))
        hidden = torch.tanh(self.l2(hidden))
        logits = self.l4(hidden)
        return torch.sigmoid(logits)

In [14]:
class TickerDataSimple(Dataset):
    '''
    In-memory dataset of (x, y) rows for one ticker; both arrays are stored
    as float tensors.
    '''
    def __init__(self, ticker, x, y):
        '''
        :param ticker: string, kept as a label only
        :param x: np.array of x
        :param y: np.array of y
        '''
        self.ticker = ticker
        self.x, self.y = torch.FloatTensor(x), torch.FloatTensor(y)

    def __len__(self):
        # Number of rows is taken from the targets.
        return len(self.y)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [15]:
# Recomputed here so this cell can be re-run on its own after `combined` changes.
x_train_transformed = combined.transform(train_df)
x_test_transformed = combined.transform(test_df)

# Single batch size shared by the training and the evaluation loader.
BATCH_SIZE = 64

# The frames above were loaded for 'cmg', so the datasets are labelled with
# that ticker. The spy_* variable names are kept unchanged in case other
# cells refer to them.
spy_dataset = TickerDataSimple('cmg', x_train_transformed,
                               torch.from_numpy(binary_y_train.values).float())
# No shuffle argument is passed, so batches follow the row order of the frame.
train_dl = DataLoader(spy_dataset,
                      num_workers=1,
                      batch_size=BATCH_SIZE)

spy_testset = TickerDataSimple('cmg', x_test_transformed,
                               torch.from_numpy(binary_y_test.values).float())
test_dl = DataLoader(spy_testset,
                     num_workers=1,
                     batch_size=BATCH_SIZE)

In [16]:
# Iterator over the training DataLoader, used below to pull a single batch.
iter_train_dl = iter(train_dl)

In [17]:
# Pull one batch and display the shapes of its feature and target tensors.
x, y = next(iter_train_dl)
x.shape, y.shape

(torch.Size([64, 100]), torch.Size([64, 4]))

In [18]:
class CustomLoss(torch.nn.Module):
    '''
    Binary focal loss on probabilities, using base-2 logarithms.

    Per element:
        -(1 - p)**gamma * log2(p)     * t
        - p**gamma      * log2(1 - p) * (1 - t)
    averaged over all elements of the flattened inputs.

    :param gamma: focusing exponent; the default 1.0 reproduces the original
                  hard-coded weighting
    :param eps: predictions are clamped to [eps, 1 - eps] before the log so
                that a saturated prediction (exactly 0 or 1) cannot produce
                0 * log2(0) = NaN
    '''
    def __init__(self, gamma=1.0, eps=1e-7):
        super(CustomLoss, self).__init__()
        self.gamma = gamma
        self.eps = eps

    def forward(self, y_pred, y_target):
        '''
        :param y_pred: tensor of probabilities in [0, 1], any shape
        :param y_target: tensor of 0/1 targets with the same number of elements
        :return: scalar tensor, mean loss over all elements
        '''
        y_pred = y_pred.flatten().clamp(min=self.eps, max=1 - self.eps)
        y_target = y_target.flatten()

        def log_p(pred, target):
            # Down-weight confident predictions by (1 - pred)**gamma.
            return -(((1 - pred) ** self.gamma) * torch.log2(pred) * target)

        # First term covers the positive targets, second term the negatives.
        return (log_p(y_pred, y_target) + log_p(1 - y_pred, 1 - y_target)).mean()

In [19]:
from torch.nn.utils import clip_grad_norm_
import torch.optim as optim

# Input width is the number of columns produced by the FeatureUnion (every
# feature once per transformer); output width is the number of binary target
# columns.
# NOTE(review): no random seed is set before the model is created, so the
# initial weights differ between runs - confirm whether that is intended.
model = LogisticRegressor(input_size=x_train_transformed.shape[1],
                          final_output_size=y_train.shape[1])

# Trained with the focal-style CustomLoss; swap in the line below for plain BCE.
criterion = CustomLoss()
# criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=1e-3,
                             weight_decay=1e-6)

In [20]:
import ignite
from ignite.metrics import BinaryAccuracy, Loss, Precision, Recall
from ignite.engine import Events, \
                          create_supervised_trainer, \
                          create_supervised_evaluator

In [21]:
import sklearn.metrics as sk_metrics
import torch.nn.functional as F

In [22]:
# iter_train_dl = iter(train_dl)

In [23]:
# x, y = next(iter_train_dl)
# _out = model(x)
# _out = _out.flatten()
# y    = y.flatten()
# _zero_one = _out > 0.5
# print('f1_score: {}'.format(sk_metrics.f1_score(_zero_one.detach().numpy(), y)))
# print('accuracy_score: {}'.format(sk_metrics.accuracy_score(_zero_one, y)))
# print('roc_auc_score: {}'.format(sk_metrics.roc_auc_score(y, _out.detach().numpy())))

f1_score: 0.35151515151515145
accuracy_score: 0.58203125
roc_auc_score: 0.5517087192590228


In [24]:
from ignite.metrics import Accuracy
from functools import partial
from sklearn.metrics import roc_auc_score
from ignite.metrics import EpochMetric


def sk_metric_fn(y_preds, y_targets, sk_metrics, activation=None):
    '''
    Evaluate a scikit-learn style metric on flattened torch tensors.

    :param y_preds: tensor of predictions
    :param y_targets: tensor of targets
    :param sk_metrics: callable ``metric(y_true, y_pred)`` taking numpy arrays
    :param activation: optional callable applied to ``y_preds`` first
                       (e.g. thresholding to hard labels)
    :return: whatever ``sk_metrics`` returns
    '''
    if activation is not None:
        y_preds = activation(y_preds)

    # detach().cpu() so the conversion also works for tensors that require
    # grad or live on a GPU; .numpy() raises on both otherwise.
    y_true = y_targets.detach().cpu().flatten().numpy()
    y_pred = y_preds.detach().cpu().flatten().numpy()
    return sk_metrics(y_true, y_pred)

class _SklearnEpochMetric(EpochMetric):
    '''
    Shared base for metrics that hand a scikit-learn scoring function to
    EpochMetric through sk_metric_fn.

    Subclasses only set ``_score_fn``.

    :param activation: optional callable applied to the predictions before
                       scoring (forwarded to sk_metric_fn)
    :param output_transform: forwarded unchanged to EpochMetric
    '''
    # Set by subclasses. Wrapped in staticmethod there so that attribute
    # access returns the plain function instead of binding it as a method.
    _score_fn = None

    def __init__(self, activation=None, output_transform=lambda x: x):
        super(_SklearnEpochMetric, self).__init__(
            partial(sk_metric_fn,
                    sk_metrics=self._score_fn,
                    activation=activation),
            output_transform=output_transform)


class ROC_AUC(_SklearnEpochMetric):
    '''sklearn roc_auc_score computed through EpochMetric.'''
    _score_fn = staticmethod(sk_metrics.roc_auc_score)


class F1_Score(_SklearnEpochMetric):
    '''sklearn f1_score computed through EpochMetric.'''
    _score_fn = staticmethod(sk_metrics.f1_score)


class BinaryAccuracy(_SklearnEpochMetric):
    '''
    sklearn accuracy_score computed through EpochMetric.

    NOTE: this definition shadows the BinaryAccuracy name imported from
    ignite.metrics earlier in the notebook.
    '''
    _score_fn = staticmethod(sk_metrics.accuracy_score)

In [25]:
def zero_one(y_preds, threshold=0.5):
    '''
    Turn probabilities into hard decisions.

    :param y_preds: tensor of probabilities
    :param threshold: decision boundary; values strictly above it are positive
    :return: boolean mask of the same shape as ``y_preds``
    '''
    return y_preds > threshold

def zero_one_transform(output):
    '''
    Convert an ``(y_pred, y)`` pair into integer tensors.

    :param output: indexable whose first item is the predicted probabilities
                   and second item the targets
    :return: (thresholded predictions as long, targets as long)
    '''
    y_pred, y_true = output[0], output[1]
    return zero_one(y_pred).long(), y_true.long()

In [26]:
# Plain binary cross-entropy, reported as an evaluation metric; training
# itself optimises `criterion`.
bce_loss = nn.BCELoss()

trainer = create_supervised_trainer(model, optimizer, criterion)
# BinaryAccuracy here is the EpochMetric subclass defined in this notebook,
# which shadows the class of the same name imported from ignite.metrics.
evaluator = create_supervised_evaluator(
    model,
    metrics=dict(
        accuracy=BinaryAccuracy(activation=zero_one),
        bce=Loss(bce_loss),
        f1_score=F1_Score(activation=zero_one),
        roc_auc=ROC_AUC(),
    ))

In [27]:
def _print_epoch_metrics(prefix, data_loader, epoch):
    '''
    Run the evaluator over ``data_loader`` and print one summary line.

    :param prefix: text printed before the epoch number
    :param data_loader: loader to evaluate on
    :param epoch: epoch number to report
    '''
    evaluator.run(data_loader)
    metrics = evaluator.state.metrics
    print("{} Epoch: {} Avg accuracy: {:.5f}, Avg BCE: {:.5f}, F1 Score: {:.5f}, ROC_AUC: {:.5f}".format(
        prefix,
        epoch,
        metrics['accuracy'],
        metrics['bce'],
        metrics['f1_score'],
        metrics['roc_auc'],
    ))

@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    '''After every epoch, report the metrics on the training loader.'''
    _print_epoch_metrics("Training Results  -", train_dl, trainer.state.epoch)

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(trainer):
    '''After every epoch, report the metrics on the test loader.'''
    _print_epoch_metrics("Validation Results-", test_dl, trainer.state.epoch)

In [28]:
# Train for 20 epochs over train_dl; the EPOCH_COMPLETED handlers registered
# on `trainer` print the train / validation metrics after each epoch.
trainer.run(train_dl, max_epochs=20)

Training Results  - Epoch: 1 Avg accuracy: 0.73932, Avg BCE: 0.58886, F1 Score: 0.00000, ROC_AUC: 0.55372
Validation Results- Epoch: 1 Avg accuracy: 0.76585, Avg BCE: 0.57387, F1 Score: 0.00000, ROC_AUC: 0.54495
Training Results  - Epoch: 2 Avg accuracy: 0.73932, Avg BCE: 0.58893, F1 Score: 0.00000, ROC_AUC: 0.56028
Validation Results- Epoch: 2 Avg accuracy: 0.76585, Avg BCE: 0.57427, F1 Score: 0.00000, ROC_AUC: 0.54984
Training Results  - Epoch: 3 Avg accuracy: 0.73932, Avg BCE: 0.58918, F1 Score: 0.00000, ROC_AUC: 0.56419
Validation Results- Epoch: 3 Avg accuracy: 0.76585, Avg BCE: 0.57479, F1 Score: 0.00000, ROC_AUC: 0.55174
Training Results  - Epoch: 4 Avg accuracy: 0.73932, Avg BCE: 0.58927, F1 Score: 0.00000, ROC_AUC: 0.56749
Validation Results- Epoch: 4 Avg accuracy: 0.76585, Avg BCE: 0.57506, F1 Score: 0.00000, ROC_AUC: 0.55407
Training Results  - Epoch: 5 Avg accuracy: 0.73932, Avg BCE: 0.58927, F1 Score: 0.00000, ROC_AUC: 0.57016
Validation Results- Epoch: 5 Avg accuracy: 0.7

<ignite.engine.engine.State at 0x7f650f13c8d0>