In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this cell (by clicking Run or pressing Shift+Enter) lists the files in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import gc
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, LSTM, GRU, Activation
from keras.losses import mse

In [None]:
# Build the competition environment object; the training data is pulled from it
# in the next cell via env.get_training_data().
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()

In [None]:
# Fetch both training frames from the environment; only market_data is kept,
# news_data is deleted in the following cell.
market_data, news_data = env.get_training_data()

In [None]:
# Keep only market rows dated 2009-01-01 or later and renumber the index from 0.
start = datetime(2009, 1, 1).date()
market_data = market_data[market_data['time'].dt.date >= start].reset_index(drop=True)

# The news frame is not used by any later cell shown here; drop it and reclaim memory.
del news_data
gc.collect()

market_data.head()

In [None]:
def _exchange_suffix(asset_code):
    """Return the text after the last '.' of an asset code, or 'UNKNOWN' if it has no '.'."""
    _, dot, suffix = str(asset_code).rpartition('.')
    return suffix if dot else 'UNKNOWN'


def market_data_preprocessing(market_data, cols):
    """Clean the market frame, derive a binary label and one-hot encode the exchange suffix.

    Steps, in order:
      1. Floor ``time`` to whole days.
      2. Replace missing values in ``cols`` with 0.
      3. For each column of ``cols`` in turn, keep only the rows that lie within
         three standard deviations of that column's mean. Mean and std are
         recomputed on the rows that survived the previous columns.
      4. Clip ``returnsOpenNextMktres10`` to [-1, 1] and add ``label``
         (0 when the clipped return is negative, otherwise 1).
      5. Append one indicator column per exchange suffix, i.e. the text after the
         last '.' in ``assetCode``. Codes without a '.' fall into 'UNKNOWN'
         instead of raising.

    Parameters
    ----------
    market_data : pandas.DataFrame
        Must contain ``time`` (datetime-like), ``assetCode``,
        ``returnsOpenNextMktres10`` and every column named in ``cols``.
    cols : list of str
        Numeric columns to zero-fill and to use for outlier filtering.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    # Work on a private copy so the caller's frame is never modified.
    market_data = market_data.copy()

    market_data['time'] = market_data['time'].dt.floor('1D')
    market_data.loc[:, cols] = market_data.loc[:, cols].fillna(0)

    # Sequential 3-sigma filter: each column sees only rows kept by earlier columns.
    for col in cols:
        values = market_data[col]
        within_band = (values - values.mean()).abs() <= 3 * values.std()
        market_data = market_data[within_band]

    clipped = market_data['returnsOpenNextMktres10'].clip(-1, 1)
    # assign() builds a fresh frame, so no write lands on a filtered slice.
    # ~(x < 0) rather than (x >= 0) so that a NaN return still maps to 1,
    # exactly like the row-wise rule "0 if x < 0 else 1".
    market_data = market_data.assign(
        returnsOpenNextMktres10=clipped,
        label=(~(clipped < 0)).astype('int64'),
    )

    # Resolve the suffix once per distinct code, then broadcast through map().
    suffix_by_code = {
        code: _exchange_suffix(code) for code in market_data['assetCode'].unique()
    }
    exchange = market_data['assetCode'].map(suffix_by_code)
    one_hot_df = pd.get_dummies(exchange.astype(str))

    return pd.concat([market_data, one_hot_df], axis=1)

In [None]:
# Preprocess using every column except the identifier/flag columns as numeric
# inputs, then order the rows chronologically.
market_data = market_data_preprocessing(
    market_data,
    [
        name for name in market_data.columns
        if name not in ('universe', 'time', 'assetCode', 'assetName')
    ],
)
market_data = market_data.sort_values(by=['time'])

In [None]:
# Columns to standardise: everything except identifiers, the universe flag,
# the label/target and the listed exchange indicator names.
# NOTE(review): the indicator columns are created by pd.get_dummies without a
# prefix, so the 'assetCode_exchange_*' names below probably match nothing and
# the indicator columns end up scaled too - confirm.
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/validation split performed further down - confirm this is intended.
cols = [
    name for name in market_data.columns
    if name not in (
        'time', 'assetCode', 'universe', 'label', 'assetName',
        'assetCode_exchange_A', 'assetCode_exchange_N', 'assetCode_exchange_O',
        'assetCode_exchange_OB', 'assetCode_exchange_UNKNOWN',
        'returnsOpenNextMktres10',
    )
]
std_scaler = StandardScaler(copy=False)
# Zero-fill first so the scaler never sees missing values.
market_data[cols] = market_data[cols].fillna(0)
market_data.loc[:, cols] = std_scaler.fit_transform(market_data.loc[:, cols])

In [None]:
class SequenceGenerator:
    """Endless batch generator of per-asset sliding windows over a market frame.

    Rows are grouped by ``assetCode`` and sorted by ``time`` inside each group.
    Every run of ``window`` consecutive rows of one asset forms a sample: the
    features are the ``cols`` values of those rows, and the target fields
    (``label``, ``returnsOpenNextMktres10``, ``time``, ``universe``) are read
    from the last row of the window. Samples are accumulated across assets and
    emitted whenever ``batch_size`` of them have been collected; a partial
    batch left at the end of a pass over the data is discarded.

    Parameters
    ----------
    market_data : pandas.DataFrame
        Must contain ``assetCode``, ``time``, ``universe``, ``label``,
        ``returnsOpenNextMktres10`` and every column in ``cols``.
    cols : list of str
        Feature columns placed in ``X``.
    train : bool, default True
        True  -> batches are ``(X, y)``.
        False -> batches are ``(X, y, returns, date, universe)``.
    batch_size : int, default 100000
        Number of windows per batch.
    window : int, default 10
        Number of consecutive rows per window.
    """

    def __init__(self, market_data, cols, train=True, batch_size=100000, window=10):
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if window < 1:
            raise ValueError("window must be a positive integer")

        self.market_data = market_data
        self.cols = cols
        self.train = train

        self.batch_size = batch_size
        self.window = window

    def _num_sequences(self, num_rows):
        """Number of full windows that fit into ``num_rows`` rows (never negative)."""
        # n rows hold n - window + 1 windows; the "+ 1" keeps the window that
        # ends on the asset's last row.
        return max(0, num_rows - self.window + 1)

    def generate(self):
        """Yield batches forever, restarting from the first asset after each pass.

        Raises
        ------
        ValueError
            On the first ``next()`` if the data cannot fill even one batch;
            without this check the loop below would spin forever without yielding.
        """
        if self.steps() == 0:
            raise ValueError(
                "market_data holds fewer than batch_size windows; "
                "no batch can ever be produced"
            )

        while True:

            # Fresh buffers on every pass: a partial batch left over from the
            # previous pass is dropped, which matches the floor in steps().
            X, y, returns, date, universe = [], [], [], [], []

            for _, data in self.market_data.groupby('assetCode', sort=False):

                data = data.sort_values(by=['time'])
                # Pull the feature matrix out once per asset instead of
                # re-selecting the columns for every window.
                features = data[self.cols].values

                for seq in range(self._num_sequences(data.shape[0])):
                    last = seq + self.window - 1  # row that supplies the targets

                    returns.append(data.returnsOpenNextMktres10.iloc[last])
                    date.append(data.time.iloc[last])
                    universe.append(data.universe.iloc[last])
                    X.append(features[seq:seq + self.window])
                    y.append(data.label.iloc[last])

                    if len(X) == self.batch_size:
                        X_, y_, returns_, date_, universe_ = (
                            np.array(X), np.array(y), np.array(returns),
                            np.array(date), np.array(universe),
                        )
                        X, y, returns, date, universe = [], [], [], [], []

                        if self.train:
                            yield X_, y_
                        else:
                            yield X_, y_, returns_, date_, universe_

    def steps(self):
        """Return how many full batches one pass over the data produces."""
        # Batches are filled across asset boundaries, so the windows of all
        # assets are summed first and floor-divided once.
        group_sizes = self.market_data.groupby('assetCode', sort=False).size()
        total_sequences = sum(self._num_sequences(int(rows)) for rows in group_sizes)
        return total_sequences // self.batch_size

In [None]:
def split(*market_data, test_size=0.25, gap=1000):
    """Split each input positionally into a leading train part and a trailing test part.

    For every input the first ``int(len(x) * (1 - test_size))`` rows form the
    train part. The test part starts ``gap`` rows after that boundary, so the
    ``gap`` rows in between are dropped. Inputs are sliced by position, so they
    should already be in the desired (e.g. chronological) order.

    Parameters
    ----------
    *market_data : sliceable objects with ``len()`` and ``.copy()``
        Typically pandas DataFrames.
    test_size : float, default 0.25
        Fraction of rows after the train boundary; must lie strictly between 0 and 1.
    gap : int, default 1000
        Number of rows skipped between the train and test parts.

    Returns
    -------
    list
        ``[train_0, test_0, train_1, test_1, ...]`` - one pair per input, so a
        single input unpacks as ``train, test = split(frame)``.

    Raises
    ------
    ValueError
        If ``test_size`` is not in (0, 1) or ``gap`` is negative.
    """
    if not 0 < test_size < 1:
        raise ValueError("test_size must be strictly between 0 and 1")
    if gap < 0:
        raise ValueError("gap must be non-negative")

    splits = []
    for frame in market_data:
        limit = int(len(frame) * (1 - test_size))
        # Train part first, then the held-out tail, so callers can unpack
        # the result as (train, test).
        splits.append(frame[:limit].copy())
        splits.append(frame[limit + gap:].copy())
    return splits

# Divide the time-sorted market frame into a training and a validation frame,
# using split()'s default test_size of 0.25.
train_df, val_df = split(market_data)

In [None]:
# Model inputs: every column except identifiers, the universe flag and the
# label / raw target.
cols = [
    name for name in market_data.columns
    if name not in ('time', 'assetCode', 'universe', 'label', 'assetName',
                    'returnsOpenNextMktres10')
]

# One generator per split, both in the default train mode so they yield (X, y).
train_gen, test_gen = SequenceGenerator(train_df, cols), SequenceGenerator(val_df, cols)

# Batches per pass, passed to Keras as steps_per_epoch / validation_steps.
train_steps, test_steps = train_gen.steps(), test_gen.steps()

In [None]:
# Sequence classifier: LSTM(128) -> relu -> Dense(64) -> tanh -> Dense(1) -> sigmoid.
# The input is 10 timesteps of len(cols) features; 10 equals the default
# window of SequenceGenerator.
model = Sequential([
    LSTM(128, input_shape=(10, len(cols))),
    Activation('relu'),
    Dense(64),
    Activation('tanh'),
    Dense(1),
    Activation('sigmoid'),
])

model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [None]:
# Save the best model seen so far to model.hdf5, and stop early after 5 epochs
# without improvement.
# NOTE(review): with epochs=1 the early-stopping patience of 5 can never be
# reached - confirm the intended epoch count.
check_point = ModelCheckpoint('model.hdf5', verbose=True, save_best_only=True)
early_stop = EarlyStopping(patience=5, verbose=True)
model.fit_generator(
    train_gen.generate(),
    steps_per_epoch=train_steps,
    epochs=1,
    validation_data=test_gen.generate(),
    validation_steps=test_steps,
    callbacks=[early_stop, check_point],
)