In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

from scipy import optimize

import tensorflow as tf
import torch
import os
# Move the working directory up one level; the loaders below read from paths
# relative to it (e.g. 'data/daily_data/').
# NOTE(review): not idempotent - every re-run of this cell climbs one more level.
os.chdir('..')

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')

In [3]:
import datetime
import matplotlib as mpl
from matplotlib import cm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [4]:
def train_df_test_df(ticker):
    """Load the daily CSV files of one ticker and split them into train / test.

    Files are looked up in ``data/daily_data/`` (relative to the current
    working directory). A file belongs to ``ticker`` when ``ticker`` is one of
    the ``_``-separated parts of its file name. The matching files are sorted
    by name; the first 80% form the training set and the remainder the test
    set.

    :param ticker: string, ticker symbol as it appears in the file names
    :return: ``(train_df, test_df)``, each the row-wise concatenation of its
             files (the per-file row indices are kept, not renumbered)
    :raises ValueError: if too few files match to populate both splits
    """
    data_path = 'data/daily_data/'
    train_fraction = 0.8

    ticker_files = sorted(item for item in os.listdir(data_path)
                          if ticker in item.split('_'))

    split_idx = int(len(ticker_files) * train_fraction)
    train_ticker_files = ticker_files[:split_idx]
    test_ticker_files = ticker_files[split_idx:]
    if not train_ticker_files or not test_ticker_files:
        raise ValueError(
            "need at least 2 files for ticker {!r} in {!r}, found {}".format(
                ticker, data_path, len(ticker_files)))

    def read_and_concat(file_names):
        # Read everything first and concatenate once, instead of re-copying a
        # growing frame for every additional file.
        frames = [pd.read_csv(os.path.join(data_path, name)) for name in file_names]
        return pd.concat(frames)

    return read_and_concat(train_ticker_files), read_and_concat(test_ticker_files)

In [5]:
def get_processed_minute_data(df):
    """Strip unused columns / empty minutes and derive calendar features.

    Works in place on ``df`` and returns the same object. Steps:

    * drop the first four columns plus a fixed list of raw columns,
    * drop rows whose ``marketNotional`` is exactly 0.0,
    * parse ``date`` (``%Y%m%d``) and ``minute`` (``%H:%M``),
    * add ``weekday``, ``month`` and ``hour`` as string columns.

    :param df: DataFrame holding the raw minute data
    :return: the same DataFrame, modified
    """
    unwanted = df.columns.tolist()[:4]
    unwanted = unwanted + ['label', 'changeOverTime', 'close', 'high',
                           'low', 'marketAverage', 'marketClose',
                           'marketOpen', 'volume', 'numberOfTrades',
                           'notional', 'open', 'marketChangeOverTime']
    df.drop(columns=unwanted, inplace=True)
    # The index must be renumbered before rows are selected by label below.
    df.reset_index(drop=True, inplace=True)

    empty_minutes = df.index[df['marketNotional'] == 0.0]
    df.drop(index=empty_minutes, inplace=True)
    df.reset_index(drop=True, inplace=True)

    def parse_day(raw):
        return datetime.datetime.strptime(str(raw), '%Y%m%d')

    def parse_minute(raw):
        return datetime.datetime.strptime(raw, '%H:%M')

    df['date'] = df['date'].map(parse_day)
    df['weekday'] = df['date'].map(lambda day: str(day.weekday()))
    df['month'] = df['date'].map(lambda day: str(day.month))

    df['minute'] = df['minute'].map(parse_minute)
    df['hour'] = df['minute'].map(lambda stamp: str(stamp.hour))

    return df

In [6]:
def get_numeric_categoric(df):
    """Partition the column names of ``df`` by dtype.

    A column counts as numeric when its dtype is a sub-dtype of ``np.number``;
    every other column is reported as categorical.

    :param df: DataFrame to inspect
    :return: ``(numeric_cols, categorical_cols)``, both in column order
    """
    buckets = {True: [], False: []}
    for name in df:
        is_number = bool(np.issubdtype(df[name].dtype, np.number))
        buckets[is_number].append(name)
    return buckets[True], buckets[False]

In [7]:
def delta_dataframe(df, numeric_columns):
    '''
    Turn the given columns into log-deltas and add shifted log-delta columns.

    For every column ``c`` in ``numeric_columns`` and every shift ``s`` in
    (-20, -10, -5, 3, 5, 10, 20) a column named ``c_s`` is added:

    * negative ``s`` (value |s| rows further down): ``log(later) - log(now)``
    * positive ``s`` (value s rows further up):     ``log(now) - log(earlier)``

    ``c`` itself becomes ``log(now) - log(previous row)``. Finally the first 20
    and the last 20 rows, whose shifted values fall outside the frame, are
    dropped.

    The frame is modified in place and also returned.

    :param df: DataFrame with a 0..n-1 index (asserted)
    :param numeric_columns: list of column names to transform; the names may
                            themselves contain '_'
    :return: the same DataFrame, trimmed; its index runs from 0 again
    '''

    MAX_SHIFT_BACKWARD, MAX_SHIFT_FORWARD = -20, 20

    # Remember (new column, source column, shift) explicitly, so the source
    # never has to be recovered by parsing the generated column name.
    added = []
    for shift in [MAX_SHIFT_BACKWARD, -10, -5, 3, 5, 10, MAX_SHIFT_FORWARD]:
        for col in numeric_columns:
            new_col_name = col + '_' + str(shift)
            df[new_col_name] = df[col].shift(shift)
            added.append((new_col_name, col, shift))

    log_columns = list(numeric_columns) + [name for name, _, _ in added]
    df[log_columns] = df[log_columns].apply(np.log)

    # Shifted columns: difference against the (logged) current value.
    for new_col, original_col, shift in added:
        if shift < 0:
            df[new_col] = df[new_col] - df[original_col]
        else:
            df[new_col] = df[original_col] - df[new_col]

    # Current columns: difference against the previous row. Computed into a
    # temporary first so the right-hand side only sees the logged values.
    temp = df[numeric_columns] - df[numeric_columns].shift(1)
    df[numeric_columns] = temp

    assert (df.index == np.arange(len(df))).all()
    # Slices (rather than explicit position lists) also cope with frames that
    # are shorter than the trimming window.
    df.drop(df.index[:MAX_SHIFT_FORWARD], axis=0, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.drop(index=df.index[MAX_SHIFT_BACKWARD:], inplace=True)

    return df

In [8]:
def load_dataframes(ticker):
    """Load, clean and delta-transform the train / test frames of ``ticker``.

    :param ticker: string passed on to ``train_df_test_df``
    :return: ``(train_df, test_df, numeric_cols, categoric_cols)`` where
             ``numeric_cols`` are the numeric columns of the transformed
             training frame and ``categoric_cols`` is a fixed list
    """
    raw_train, raw_test = train_df_test_df(ticker)

    train_df = get_processed_minute_data(raw_train)
    test_df = get_processed_minute_data(raw_test)

    # The numeric columns are determined on the training frame only and the
    # same list is then applied to both splits.
    base_numeric_cols, _ = get_numeric_categoric(train_df)
    # Hard-coded for the time being.
    categoric_cols = ['weekday', 'month', 'hour']

    train_df = delta_dataframe(train_df, base_numeric_cols)
    test_df = delta_dataframe(test_df, base_numeric_cols)

    # The delta step added columns, so collect the numeric names again.
    numeric_cols, _ = get_numeric_categoric(train_df)

    return train_df, test_df, numeric_cols, categoric_cols

def get_y_cols(numeric_cols):
    """Split the negative-shift columns into target candidates and the rest.

    A column is treated as a negative-shift column when its name contains
    '-' (e.g. ``marketHigh_-20``). Of those, names containing 'High' or 'Low'
    are the columns of interest.

    :param numeric_cols: iterable of column names
    :return: ``(interested_cols, not_interested_cols)``; both lists keep the
             order in which the names appear in ``numeric_cols``
    """
    price_cols = [item for item in numeric_cols if '-' in item]
    interested_cols = [item for item in price_cols if 'High' in item or 'Low' in item]
    interested = set(interested_cols)
    # dict.fromkeys de-duplicates like a set would, but keeps a stable order
    # instead of one that depends on string hashing.
    not_interested_cols = list(dict.fromkeys(
        item for item in price_cols if item not in interested))
    return interested_cols, not_interested_cols

# Load the 'cmg' frames, then separate the target columns from the model inputs.
train_df_original, test_df_original, numeric_cols, categoric_cols = load_dataframes('cmg')
y_cols, not_interested = get_y_cols(numeric_cols)
# Keep every numeric column that is not a negative-shift delta. Filtering the
# list (instead of converting to sets and back) keeps the column order stable
# across kernel restarts, so the feature layout fed to the model is repeatable.
_excluded_cols = set(y_cols) | set(not_interested)
numeric_cols = [col for col in numeric_cols if col not in _excluded_cols]

In [9]:
# A delta above this threshold is labelled as a positive (1), otherwise 0.
RETURN_THRESHOLD = 0.002
# Only the first two target columns are kept.
N_TARGETS = 2

train_df = train_df_original[numeric_cols]
test_df = test_df_original[numeric_cols]

# Positional slicing returns new frames, so nothing is dropped in place on a
# selection of the original frame.
y_train = train_df_original[y_cols].iloc[:, :N_TARGETS]
y_test = test_df_original[y_cols].iloc[:, :N_TARGETS]

# `np.int` was only an alias of the builtin `int` and no longer exists in
# current NumPy releases; the builtin gives the same integer dtype.
binary_y_train = (y_train > RETURN_THRESHOLD).astype(int)
binary_y_test = (y_test > RETURN_THRESHOLD).astype(int)

In [10]:
# Only the scalers that worked well in the autoencoder are used.
# Every step is a (name, transformer) pair; the union concatenates the output
# of all steps column-wise.
_step_names = (
    'Data after min-max scaling',
    'Data after max-abs scaling',
    'Data after quantile transformation (uniform pdf)',
    'Data after sample-wise L2 normalizing',
)
_step_transformers = (
    MinMaxScaler(),
    MaxAbsScaler(),
    QuantileTransformer(output_distribution='uniform'),
    Normalizer(),
)
transfomer = list(zip(_step_names, _step_transformers))

combined = FeatureUnion(transfomer)
# Fitted on the training frame only.
combined_fit = combined.fit(train_df)

In [11]:
# Apply the fitted union to both splits (train first, then test).
x_train_transformed, x_test_transformed = (
    combined.transform(frame) for frame in (train_df, test_df)
)

In [12]:
# Sanity check: both splits should have the same number of feature columns.
x_train_transformed.shape, x_test_transformed.shape

((9229, 100), (2274, 100))

In [13]:
class LogisticRegressor(nn.Module):
    """Small feed-forward classifier with sigmoid outputs.

    Despite the name this is a three-layer network:
    ``input -> 32 (ReLU) -> 16 (tanh) -> final_output_size (sigmoid)``.
    """

    def __init__(self, input_size, final_output_size):
        """
        :param input_size: number of input features
        :param final_output_size: number of sigmoid outputs
        """
        super().__init__()

        # Attribute names l1 / l2 / l4 are kept; they are the state_dict keys.
        self.l1 = nn.Linear(input_size, 32)
        self.l2 = nn.Linear(32, 16)
        self.l4 = nn.Linear(16, final_output_size)

    def forward(self, x):
        """Return one value in (0, 1) per output unit for each row of ``x``."""
        hidden = self.l1(x).relu()
        hidden = self.l2(hidden).tanh()
        logits = self.l4(hidden)
        return logits.sigmoid()

In [14]:
class TickerDataSimple(Dataset):
    """In-memory dataset that pairs feature rows with their label rows."""

    def __init__(self, ticker, x, y):
        '''
        :param ticker: string, stored unchanged on the instance
        :param x: features; converted with torch.FloatTensor
        :param y: labels; converted with torch.FloatTensor
        '''
        self.ticker = ticker
        self.x = torch.FloatTensor(x)
        self.y = torch.FloatTensor(y)

    def __len__(self):
        # The number of samples is taken from the labels.
        return len(self.y)

    def __getitem__(self, index):
        # Returns the (features, labels) pair stored at ``index``.
        return self.x[index], self.y[index]

In [15]:
# Recompute the transformed matrices, then wrap each split in a Dataset and a
# DataLoader that share the same settings.
x_train_transformed = combined.transform(train_df)
x_test_transformed = combined.transform(test_df)

BATCH_SIZE = 64
_loader_options = dict(num_workers=1, batch_size=BATCH_SIZE)

# NOTE(review): the datasets are labelled 'spy' although the frames were
# loaded with load_dataframes('cmg') - confirm which ticker is intended.
_train_labels = torch.from_numpy(binary_y_train.values).float()
spy_dataset = TickerDataSimple('spy', x_train_transformed, _train_labels)
train_dl = DataLoader(spy_dataset, **_loader_options)

_test_labels = torch.from_numpy(binary_y_test.values).float()
spy_testset = TickerDataSimple('spy', x_test_transformed, _test_labels)
test_dl = DataLoader(spy_testset, **_loader_options)

In [16]:
# Iterator over the training loader, used in the next cell to peek at one batch.
iter_train_dl = iter(train_dl)

In [17]:
# Pull a single batch and display the shapes of its features and labels.
x, y = next(iter_train_dl)
x.shape, y.shape

(torch.Size([64, 100]), torch.Size([64, 2]))

In [18]:
class CustomLoss(torch.nn.Module):
    '''
    Focal-style binary loss on probabilities.

    Per element, with prediction ``p`` and target ``t``:

        -(1 - p)**gamma * log2(p) * t  -  p**gamma * log2(1 - p) * (1 - t)

    and the result is the mean over all elements. Note that the logarithm is
    base 2. With the default ``gamma=1.0`` the value is the same as before for
    every prediction strictly inside (eps, 1 - eps).
    '''
    def __init__(self, gamma=1.0, eps=1e-7):
        '''
        :param gamma: exponent of the down-weighting factor
        :param eps: predictions are clamped to [eps, 1 - eps] before the log
        '''
        super(CustomLoss, self).__init__()
        self.gamma = gamma
        self.eps = eps

    def forward(self, y_pred, y_target):
        '''
        :param y_pred: tensor of probabilities, any shape
        :param y_target: tensor of targets with the same number of elements
        :return: scalar tensor, mean loss over all elements
        '''
        # A prediction of exactly 0 or 1 would give log2(0) = -inf, and
        # -inf * 0 is NaN; clamping keeps every term finite.
        y_pred = y_pred.flatten().clamp(self.eps, 1.0 - self.eps)
        y_target = y_target.flatten()

        def log_p(pred, target):
            return -((1 - pred) ** self.gamma * torch.log2(pred) * target)

        # First term covers the positive targets, second the negative ones.
        return (log_p(y_pred, y_target) + log_p(1 - y_pred, 1 - y_target)).mean()

In [19]:
from torch.nn.utils import clip_grad_norm_
import torch.optim as optim

# The input width is the column count of the transformed matrix: the
# FeatureUnion concatenates the output of its four transformers. The model has
# one output per target column.
# Seed the global RNG first so the random weight initialisation is repeatable
# from one kernel restart to the next.
torch.manual_seed(0)
model = LogisticRegressor(x_train_transformed.shape[1], y_train.shape[1])

# criterion = CustomLoss()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=5e-3, weight_decay=1e-6)

In [20]:
import ignite
from ignite.metrics import BinaryAccuracy, Loss, Precision, Recall
from ignite.engine import Events, \
                          create_supervised_trainer, \
                          create_supervised_evaluator

In [21]:
import sklearn.metrics as sk_metrics
import torch.nn.functional as F

In [22]:
# iter_train_dl = iter(train_dl)

In [23]:
# x, y = next(iter_train_dl)
# _out = model(x)
# _out = _out.flatten()
# y    = y.flatten()
# _zero_one = _out > 0.5
# print('f1_score: {}'.format(sk_metrics.f1_score(_zero_one.detach().numpy(), y)))
# print('accuracy_score: {}'.format(sk_metrics.accuracy_score(_zero_one, y)))
# print('roc_auc_score: {}'.format(sk_metrics.roc_auc_score(y, _out.detach().numpy())))

In [24]:
from ignite.metrics import Accuracy
from functools import partial
from sklearn.metrics import roc_auc_score
from ignite.metrics import EpochMetric


def sk_metric_fn(y_preds, y_targets, sk_metrics, activation=None):
    """Evaluate a ``metric(y_true, y_pred)`` callable on torch tensors.

    :param y_preds: tensor of predictions
    :param y_targets: tensor of targets
    :param sk_metrics: callable taking ``(y_true, y_pred)`` as flat numpy arrays
    :param activation: optional callable applied to ``y_preds`` before they
                       are flattened and handed to the metric
    :return: whatever ``sk_metrics`` returns
    """
    # The activation has to run before the numpy conversion; applied
    # afterwards it would not influence the values the metric sees.
    if activation is not None:
        y_preds = activation(y_preds)

    # detach()/cpu() make the conversion work for tensors that still carry
    # gradient history or live on another device.
    y_true = y_targets.detach().cpu().flatten().numpy()
    y_pred = y_preds.detach().cpu().flatten().numpy()

    return sk_metrics(y_true, y_pred)

class _SklearnEpochMetric(EpochMetric):
    """Shared constructor for epoch metrics backed by a scoring function.

    Subclasses only set ``score_fn``; it is called through ``sk_metric_fn`` as
    ``score_fn(y_true, y_pred)``.
    """

    # Wrapped in staticmethod by the subclasses so that looking it up on an
    # instance yields the plain function, not a bound method.
    score_fn = None

    def __init__(self, activation=None, output_transform=lambda x: x):
        super().__init__(
            partial(sk_metric_fn,
                    sk_metrics=self.score_fn,
                    activation=activation),
            output_transform=output_transform)


class ROC_AUC(_SklearnEpochMetric):
    """Epoch metric computed with ``sk_metrics.roc_auc_score``."""
    score_fn = staticmethod(sk_metrics.roc_auc_score)


class F1_Score(_SklearnEpochMetric):
    """Epoch metric computed with ``sk_metrics.f1_score``."""
    score_fn = staticmethod(sk_metrics.f1_score)


class BinaryAccuracy(_SklearnEpochMetric):
    """Epoch metric computed with ``sk_metrics.accuracy_score``."""
    score_fn = staticmethod(sk_metrics.accuracy_score)


class Precision(_SklearnEpochMetric):
    """Epoch metric computed with ``sk_metrics.precision_score``."""
    score_fn = staticmethod(sk_metrics.precision_score)


class Recall(_SklearnEpochMetric):
    """Epoch metric computed with ``sk_metrics.recall_score``."""
    score_fn = staticmethod(sk_metrics.recall_score)

In [25]:
def zero_one(y_preds, threshold=0.5):
    """Element-wise decision: True where a prediction exceeds ``threshold``.

    :param y_preds: predictions supporting the ``>`` comparison
    :param threshold: decision boundary; values equal to it count as False
    :return: the result of ``y_preds > threshold``
    """
    return y_preds > threshold
    
def zero_one_transform(output):
    """Convert an ``(y_pred, y)`` output into hard 0/1 integer tensors.

    The first element is thresholded with ``zero_one``; both elements are
    returned as long tensors.
    """
    hard_predictions = zero_one(output[0]).long()
    integer_targets = output[1].long()
    return hard_predictions, integer_targets

In [26]:
# Loss reported by the evaluator under the 'bce' key.
bce_loss = nn.BCELoss()

trainer = create_supervised_trainer(model, optimizer, criterion)

# Threshold-based metrics receive hard 0/1 predictions through
# zero_one_transform; the loss and ROC AUC use the raw model outputs.
_evaluation_metrics = {
    'accuracy' : BinaryAccuracy(output_transform=zero_one_transform),
    'bce':       Loss(bce_loss),
    'f1_score' : F1_Score(output_transform=zero_one_transform),
    'roc_auc'  : ROC_AUC(),
    'precision': Precision(output_transform=zero_one_transform),
    'recall'   : Recall(output_transform=zero_one_transform),
}
evaluator = create_supervised_evaluator(model, metrics=_evaluation_metrics)

In [27]:
@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    """Run the evaluator on the training loader and print its metrics."""
    evaluator.run(train_dl)
    scores = evaluator.state.metrics
    epoch = trainer.state.epoch

    headline = "Training Results  - Epoch: {} Avg accuracy: {:.5f}, Avg BCE: {:.5f}, F1 Score: {:.5f}, ROC_AUC: {:.5f}"
    print(headline.format(epoch,
                          scores['accuracy'],
                          scores['bce'],
                          scores['f1_score'],
                          scores['roc_auc']))

    detail = "Training Results  - Epoch: {} Precision: {:.5f}, Recall: {:.5f}"
    print(detail.format(epoch, scores['precision'], scores['recall']))

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(trainer):
    """Run the evaluator on the test loader and print its metrics."""
    evaluator.run(test_dl)
    scores = evaluator.state.metrics
    epoch = trainer.state.epoch

    headline = "Validation Results- Epoch: {} Avg accuracy: {:.5f}, Avg BCE: {:.5f}, F1 Score: {:.5f}, ROC_AUC: {:.5f}"
    print(headline.format(epoch,
                          scores['accuracy'],
                          scores['bce'],
                          scores['f1_score'],
                          scores['roc_auc']))

    detail = "Validation Results- Epoch: {} Precision: {:.5f}, Recall: {:.5f}"
    print(detail.format(epoch, scores['precision'], scores['recall']))
        

In [28]:
# Train for 100 epochs; the two EPOCH_COMPLETED handlers registered on the
# trainer print the train and validation metrics.
trainer.run(train_dl, max_epochs=100)

Training Results  - Epoch: 1 Avg accuracy: 0.70143, Avg BCE: 0.62274, F1 Score: 0.00000, ROC_AUC: 0.50290
Training Results  - Epoch: 1 Precision: 0.00000, Recall: 0.00000
Validation Results- Epoch: 1 Avg accuracy: 0.73835, Avg BCE: 0.57927, F1 Score: 0.00000, ROC_AUC: 0.49333
Validation Results- Epoch: 1 Precision: 0.00000, Recall: 0.00000
Training Results  - Epoch: 2 Avg accuracy: 0.70143, Avg BCE: 0.61737, F1 Score: 0.00000, ROC_AUC: 0.50519
Training Results  - Epoch: 2 Precision: 0.00000, Recall: 0.00000
Validation Results- Epoch: 2 Avg accuracy: 0.73835, Avg BCE: 0.57713, F1 Score: 0.00000, ROC_AUC: 0.49640
Validation Results- Epoch: 2 Precision: 0.00000, Recall: 0.00000
Training Results  - Epoch: 3 Avg accuracy: 0.70143, Avg BCE: 0.61642, F1 Score: 0.00000, ROC_AUC: 0.50963
Training Results  - Epoch: 3 Precision: 0.00000, Recall: 0.00000
Validation Results- Epoch: 3 Avg accuracy: 0.73835, Avg BCE: 0.57668, F1 Score: 0.00000, ROC_AUC: 0.49983
Validation Results- Epoch: 3 Precision:

Training Results  - Epoch: 25 Avg accuracy: 0.70398, Avg BCE: 0.61108, F1 Score: 0.02394, ROC_AUC: 0.54851
Training Results  - Epoch: 25 Precision: 0.77011, Recall: 0.01216
Validation Results- Epoch: 25 Avg accuracy: 0.73835, Avg BCE: 0.57612, F1 Score: 0.01653, ROC_AUC: 0.51662
Validation Results- Epoch: 25 Precision: 0.50000, Recall: 0.00840
Training Results  - Epoch: 26 Avg accuracy: 0.70414, Avg BCE: 0.60978, F1 Score: 0.02430, ROC_AUC: 0.54598
Training Results  - Epoch: 26 Precision: 0.79070, Recall: 0.01234
Validation Results- Epoch: 26 Avg accuracy: 0.73857, Avg BCE: 0.57557, F1 Score: 0.01654, ROC_AUC: 0.51618
Validation Results- Epoch: 26 Precision: 0.52632, Recall: 0.00840
Training Results  - Epoch: 27 Avg accuracy: 0.70436, Avg BCE: 0.61097, F1 Score: 0.02571, ROC_AUC: 0.54563
Training Results  - Epoch: 27 Precision: 0.80000, Recall: 0.01306
Validation Results- Epoch: 27 Avg accuracy: 0.73901, Avg BCE: 0.57598, F1 Score: 0.01982, ROC_AUC: 0.51531
Validation Results- Epoch: 2

Training Results  - Epoch: 49 Avg accuracy: 0.70501, Avg BCE: 0.60250, F1 Score: 0.05255, ROC_AUC: 0.57596
Training Results  - Epoch: 49 Precision: 0.63983, Recall: 0.02740
Validation Results- Epoch: 49 Avg accuracy: 0.73769, Avg BCE: 0.58068, F1 Score: 0.03087, ROC_AUC: 0.51513
Validation Results- Epoch: 49 Precision: 0.46341, Recall: 0.01597
Training Results  - Epoch: 50 Avg accuracy: 0.70517, Avg BCE: 0.60160, F1 Score: 0.04893, ROC_AUC: 0.57916
Training Results  - Epoch: 50 Precision: 0.66351, Recall: 0.02540
Validation Results- Epoch: 50 Avg accuracy: 0.73835, Avg BCE: 0.58053, F1 Score: 0.03252, ROC_AUC: 0.51551
Validation Results- Epoch: 50 Precision: 0.50000, Recall: 0.01681
Training Results  - Epoch: 51 Avg accuracy: 0.70555, Avg BCE: 0.60203, F1 Score: 0.04498, ROC_AUC: 0.57790
Training Results  - Epoch: 51 Precision: 0.71111, Recall: 0.02323
Validation Results- Epoch: 51 Avg accuracy: 0.73725, Avg BCE: 0.57956, F1 Score: 0.02129, ROC_AUC: 0.51981
Validation Results- Epoch: 5

Training Results  - Epoch: 73 Avg accuracy: 0.70793, Avg BCE: 0.59670, F1 Score: 0.08050, ROC_AUC: 0.59669
Training Results  - Epoch: 73 Precision: 0.67045, Recall: 0.04282
Validation Results- Epoch: 73 Avg accuracy: 0.73263, Avg BCE: 0.58534, F1 Score: 0.04702, ROC_AUC: 0.51758
Validation Results- Epoch: 73 Precision: 0.34884, Recall: 0.02521
Training Results  - Epoch: 74 Avg accuracy: 0.70853, Avg BCE: 0.59637, F1 Score: 0.08875, ROC_AUC: 0.59818
Training Results  - Epoch: 74 Precision: 0.66667, Recall: 0.04754
Validation Results- Epoch: 74 Avg accuracy: 0.73197, Avg BCE: 0.58643, F1 Score: 0.04840, ROC_AUC: 0.51723
Validation Results- Epoch: 74 Precision: 0.34066, Recall: 0.02605
Training Results  - Epoch: 75 Avg accuracy: 0.70836, Avg BCE: 0.59588, F1 Score: 0.08530, ROC_AUC: 0.59879
Training Results  - Epoch: 75 Precision: 0.67112, Recall: 0.04555
Validation Results- Epoch: 75 Avg accuracy: 0.73087, Avg BCE: 0.58637, F1 Score: 0.04225, ROC_AUC: 0.51714
Validation Results- Epoch: 7

Training Results  - Epoch: 97 Avg accuracy: 0.71075, Avg BCE: 0.58790, F1 Score: 0.18922, ROC_AUC: 0.61336
Training Results  - Epoch: 97 Precision: 0.58007, Recall: 0.11305
Validation Results- Epoch: 97 Avg accuracy: 0.71526, Avg BCE: 0.59105, F1 Score: 0.10874, ROC_AUC: 0.52308
Validation Results- Epoch: 97 Precision: 0.30038, Recall: 0.06639
Training Results  - Epoch: 98 Avg accuracy: 0.71069, Avg BCE: 0.58708, F1 Score: 0.20677, ROC_AUC: 0.61469
Training Results  - Epoch: 98 Precision: 0.57002, Recall: 0.12629
Validation Results- Epoch: 98 Avg accuracy: 0.71020, Avg BCE: 0.59279, F1 Score: 0.11066, ROC_AUC: 0.51978
Validation Results- Epoch: 98 Precision: 0.28082, Recall: 0.06891
Training Results  - Epoch: 99 Avg accuracy: 0.70983, Avg BCE: 0.58711, F1 Score: 0.21143, ROC_AUC: 0.61575
Training Results  - Epoch: 99 Precision: 0.56050, Recall: 0.13028
Validation Results- Epoch: 99 Avg accuracy: 0.70910, Avg BCE: 0.59287, F1 Score: 0.11386, ROC_AUC: 0.52194
Validation Results- Epoch: 9

<ignite.engine.engine.State at 0x7fe787fc7898>

In [29]:
# Metrics left over from the evaluator's most recent run.
# NOTE(review): presumably the test-loader pass, since log_validation_results
# is registered after log_training_results - confirm the handler order.
evaluator.state.metrics

{'accuracy': 0.7104221635883905,
 'bce': 0.5935904205946306,
 'f1_score': 0.12723658051689862,
 'roc_auc': 0.5224025405278252,
 'precision': 0.30094043887147337,
 'recall': 0.08067226890756303}

In [30]:
# Share of positive labels per target column in the test split. Predicting 0
# everywhere would score (1 - this share) in accuracy for that column, which is
# the baseline the accuracy figures should be read against.
binary_y_test.mean()

marketHigh_-20    0.254617
marketLow_-20     0.268690
dtype: float64