In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import gc
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, LSTM, GRU, Activation
from keras.losses import mse

In [2]:
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()

Loading the data... This could take a minute.
Done!


In [3]:
(market_data, news_data) = env.get_training_data()

In [4]:
start = datetime(2009, 1, 1, 0, 0, 0).date()
market_data = market_data.loc[market_data['time'].dt.date >= start].reset_index(drop=True)

del news_data
gc.collect()

market_data.head()

Unnamed: 0,time,assetCode,assetName,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe
0,2009-01-02 22:00:00+00:00,A.N,Agilent Technologies Inc,3030118.0,16.24,15.6,0.039028,0.045576,0.029112,0.042122,-0.005511,-0.037037,-0.026992,-0.033293,0.179633,1.0
1,2009-01-02 22:00:00+00:00,AAI.N,AirTran Holdings Inc,1551494.0,4.51,4.36,0.015766,-0.035398,-0.018756,-0.047927,0.1275,0.141361,0.110937,0.144485,0.048476,0.0
2,2009-01-02 22:00:00+00:00,AAP.N,Advance Auto Parts Inc,795900.0,34.14,33.86,0.014562,0.022652,-0.010692,0.009156,0.035283,0.047398,-0.00526,0.054363,0.029782,1.0
3,2009-01-02 22:00:00+00:00,AAPL.O,Apple Inc,26964210.0,90.75,85.58,0.063269,-0.004884,0.033274,-0.015174,0.017833,-0.05956,-0.029117,-0.05191,-0.026166,1.0
4,2009-01-02 22:00:00+00:00,AB.N,AllianceBernstein Holding LP,661549.0,21.25,20.98,0.022126,0.095561,-0.054235,0.055434,0.074317,0.08536,-0.042303,0.107932,0.046983,1.0


In [5]:
def market_data_preprocessing(market_data, cols):
    
    market_data['time'] = market_data['time'].dt.floor('1D')
    
    market_data.loc[:, cols] = market_data.loc[:, cols].fillna(0)
    
    for i in range(len(cols)):
        market_data = market_data[np.abs(market_data[cols[i]]-market_data[cols[i]].mean()) <= (3 * market_data[cols[i]].std())]
        
    market_data.returnsOpenNextMktres10 = market_data.returnsOpenNextMktres10.clip(-1, 1)
    
    market_data['label'] = market_data.returnsOpenNextMktres10.map(lambda x: 0 if x < 0 else 1)
    
    market_data['assetCodeSplit'] = market_data['assetCode']
    
    map_split = {}
    for i in market_data['assetCode'].unique():
        a, splits = i.split('.')
        map_split[i] = splits
    market_data['assetCodeSplit'] = market_data['assetCodeSplit'].map(map_split)
    
    one_hot_df = pd.get_dummies(market_data['assetCodeSplit'].astype(str))
    market_data.drop(columns = ['assetCodeSplit'], inplace=True)
    market_data = pd.concat([market_data, one_hot_df], axis=1)
    
    return market_data

In [7]:
market_data = market_data_preprocessing(market_data, [col for col in market_data.columns if col not in ['universe', 'time', 'assetCode', 'assetName']])
market_data.sort_values(by=['time'], inplace=True)

In [8]:
cols = [col for col in market_data.columns if col not in ['time','assetCode', 'universe', 'label', 'assetName',
                                                               'assetCode_exchange_A', 'assetCode_exchange_N', 'assetCode_exchange_O',
                                                               'assetCode_exchange_OB', 'assetCode_exchange_UNKNOWN', 'returnsOpenNextMktres10']]
std_scaler = StandardScaler(copy=False)
market_data[cols] = market_data[cols].fillna(0)
market_data.loc[:, cols] = std_scaler.fit_transform(market_data.loc[:, cols])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [9]:
class SequenceGenerator:
    def __init__(self, market_data, cols, train=True):
        self.market_data = market_data
        self.cols = cols
        self.train = train
        
        self.batch_size = 100000
        self.window = 10

    def generate(self):
        
        while True:
            
            X, y, returns, date, universe = [], [], [], [], []
            
            for a, data in self.market_data.groupby(['assetCode'], sort=False):
                
                data = data.sort_values(by=['time'])
                num_sequences = data.shape[0] - self.window 
                
                for seq in range(num_sequences):
    
                    returns.append(data.returnsOpenNextMktres10.iloc[seq+self.window-1])
                    date.append(data.time.iloc[seq+self.window-1])
                    universe.append(data.universe.iloc[seq+self.window-1])
                    X.append(data[self.cols].iloc[seq:seq+self.window].values)
                    y.append(data.label.iloc[seq+self.window-1])
                    
                    if len(X) == self.batch_size: 
                        X_, y_,returns_, date_, universe_ = np.array(X), np.array(y), np.array(returns),np.array(date), np.array(universe)
                        X, y, returns, date, universe = [], [], [], [], []
                        
                        if self.train:
                            yield X_, y_
                        else:
                            yield X_, y_, returns_, date_, universe_
                            
    def steps(self):
        # get number of steps per epoch
        steps = 0
        for _, data in self.market_data.groupby(['assetCode'], sort=False):
            num_sequences = data.shape[0] - self.window 
            steps += num_sequences // self.batch_size
        return steps

In [10]:
def split(*market_data, test_size=0.25):
    splits = []
    for i in range(len(market_data)):
        limit = int(len(market_data[i]) * (1 - test_size))
        splits.append(data[limit+1000:].copy())
        splits.append(data[:limit].copy())
    return sets

train_df, val_df = split(market_data)

In [11]:
cols = [col for col in market_data.columns if col not in ['time','assetCode', 'universe', 'label', 'assetName', 'returnsOpenNextMktres10']]
train_gen = SequenceGenerator(train_df, cols)
test_gen = SequenceGenerator(val_df, cols)
train_steps = train_gen.steps()
test_steps = test_gen.steps()

In [12]:
model = Sequential()
model.add(LSTM(128, input_shape=(10, len(cols))))
model.add(Activation('relu'))
model.add(Dense(64))
model.add(Activation('tanh'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam',loss='mse', metrics=['accuracy'])

Using TensorFlow backend.


In [13]:
check_point = ModelCheckpoint('model.hdf5',verbose=True, save_best_only=True)
early_stop = EarlyStopping(patience=5,verbose=True)
model.fit_generator(train_gen.generate(), validation_data=test_gen.generate(),epochs=1,steps_per_epoch=train_steps, validation_steps=test_steps,callbacks=[early_stop,check_point]) 

Epoch 1/1

Epoch 00001: val_loss improved from inf to 0.24698, saving model to model.hdf5


<keras.callbacks.History at 0x7f9de0cdf0b8>