In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/janestreet/data.parquet
/kaggle/input/nn-result/nn_result.csv


This notebook takes a time-series approach: it models the data with an LSTM.

In [2]:
import gc
import warnings

# The filter is installed before TensorFlow is imported, so warnings raised
# while importing it are silenced as well.
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# Never truncate rows or columns when a frame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# One seed shared by NumPy and TensorFlow.
SEED = 1111
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# NOTE(review): according to the original author, rows with day <= 85 have
# already been removed from this parquet file; that filtering is not visible
# in this notebook.

data = pd.read_parquet('../input/janestreet/data.parquet')

In [4]:
# Keep only the trades whose weight is non-zero.
data = data.loc[data['weight'] != 0]

# Reduce memory use: every float64 column is stored as float32 instead.
float64_columns = data.select_dtypes(include='float64').columns
data = data.astype(dict.fromkeys(float64_columns, np.float32))

# Binary target: 1 when resp is strictly positive, 0 otherwise.
data['action'] = data['resp'].gt(0).astype(int)

# Replace missing values by the mean of their column.
# NOTE(review): the means are taken over the whole frame, i.e. before the
# train/valid/test split performed in the next cell.
data = data.fillna(data.mean())

# Names of the model inputs: every column whose name contains 'feature'.
features = [name for name in data.columns if 'feature' in name]

In [5]:
# 80 / 20 positional split: the first 80% of the rows (in their existing order,
# no shuffling) form the train+validation pool, the remaining 20% the test set.
# Plain .iloc slices are used instead of np.split: np.split on a DataFrame
# relies on DataFrame.swapaxes, which is deprecated in recent pandas releases.
n_trainvalid = int(.8*len(data))
df_trainvalid, df_test = data.iloc[:n_trainvalid], data.iloc[n_trainvalid:]

# 90 / 10 positional split of the pool into training and validation rows.
n_train = int(.9*len(df_trainvalid))
df_train, df_valid = df_trainvalid.iloc[:n_train], df_trainvalid.iloc[n_train:]

# Model inputs (feature columns) and binary target for each split.
X_train, y_train = df_train[features], df_train['action']
X_valid, y_valid = df_valid[features], df_valid['action']
X_test, y_test = df_test[features], df_test['action']


In [6]:
# Selector values for the global NORMALIZE_TYPE that normalize_data() checks.
NORMALIZE_NONE = 0      # leave the data unchanged
NORMALIZE_MIN_MAX = 1   # (x - min) / (max - min), per column
NORMALIZE_MEAN = 2      # (x - mean) / std, per column


In [7]:
def normalize_data(df, reference=None):
    """Scale ``df`` column-wise according to the global ``NORMALIZE_TYPE``.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to scale.
    reference : pandas.DataFrame or pandas.Series, optional
        Object whose statistics (min/max or mean/std) define the scaling.
        Defaults to ``df`` itself, which is the original behaviour. Pass the
        training split here to scale validation/test data with the same
        transformation as the training data.

    Returns
    -------
    The scaled data. When ``NORMALIZE_TYPE`` is neither ``NORMALIZE_MIN_MAX``
    nor ``NORMALIZE_MEAN`` the input object is returned unchanged.
    """
    stats = df if reference is None else reference

    if NORMALIZE_TYPE == NORMALIZE_MIN_MAX:
        offset = stats.min()
        scale = stats.max() - offset
    elif NORMALIZE_TYPE == NORMALIZE_MEAN:
        offset = stats.mean()
        scale = stats.std()
    else:
        return df

    # A constant column has zero range / zero std; dividing by it would fill
    # the column with NaN or inf. Dividing by 1 instead maps it to zeros.
    if isinstance(scale, pd.Series):
        scale = scale.mask(scale == 0, 1)
    elif scale == 0:
        scale = 1

    return (df - offset) / scale


In [8]:
# Standardise the inputs: (x - mean) / std per column.
NORMALIZE_TYPE = NORMALIZE_MEAN

# NOTE(review): each split is passed to normalize_data on its own, so it is
# scaled with statistics computed from that same split.
X_train = X_train.pipe(normalize_data)
X_valid = X_valid.pipe(normalize_data)
X_test = X_test.pipe(normalize_data)


In [9]:
def create_windows(data, window_shape, step = 1, start_id = None, end_id = None):
    """Cut a 2-D array into overlapping row windows using stride tricks.

    ``data`` is converted with ``np.asarray``. An input with at most one axis
    longer than 1 (e.g. a 1-D vector) is first reshaped to a single column.
    Rows ``start_id:end_id`` are kept (defaults: all rows), then windows of
    ``window_shape`` consecutive rows spanning every column are taken every
    ``step`` rows.

    Returns an array of shape ``(n_windows, window_shape, n_columns)``. It is
    produced by ``as_strided`` and therefore shares memory with the input.
    """
    arr = np.asarray(data)
    # Vector-like input becomes an (n, 1) column.
    if np.prod(arr.shape) == max(arr.shape):
        arr = arr.reshape(-1, 1)

    first_row = 0 if start_id is None else start_id
    last_row = arr.shape[0] if end_id is None else end_id
    arr = arr[int(first_row):int(last_row), :]

    # Each window covers `window_shape` rows and all columns.
    win = (int(window_shape), arr.shape[-1])
    hop = (int(step),) * arr.ndim

    # Strides needed to move from one window position to the next.
    hop_strides = arr[tuple(slice(None, None, h) for h in hop)].strides
    # Number of window positions along each axis (1 along the column axis).
    n_positions = ((np.array(arr.shape) - win) // hop) + 1

    windows = np.lib.stride_tricks.as_strided(
        arr,
        shape=(*n_positions, *win),
        strides=(*hop_strides, *arr.strides),
    )

    # Drop the singleton "column position" axis.
    return np.squeeze(windows, 1)

In [10]:
look_back = 2
look_ahead = 1


def _windowed_pair(inputs, targets):
    """Return (X, y) windows for one split.

    X has the 3-D layout the LSTM expects (examples, timestep, features).
    The target windows start at row ``look_back - 1``, so window i of y holds
    the label of the last row of window i of X.
    """
    x_windows = create_windows(inputs, window_shape=look_back).astype(np.float32)
    y_windows = create_windows(targets, window_shape=look_ahead, start_id=look_back-1)
    return x_windows, y_windows


X_train_reshaped, y_train_reshaped = _windowed_pair(X_train, y_train)
X_valid_reshaped, y_valid_reshaped = _windowed_pair(X_valid, y_valid)
X_test_reshaped, y_test_reshaped = _windowed_pair(X_test, y_test)

print(X_train_reshaped.shape, y_train_reshaped.shape)

(1131417, 2, 130) (1131417, 1, 1)


In [11]:
# Drop the names of the frames and per-split inputs/targets that no later cell
# reads. df_test is deliberately not in this list: it is read again further
# down when the result table is built.
del data, df_train, df_valid, df_trainvalid, X_train, X_valid, y_train, y_valid, X_test, y_test
gc.collect()

62

In [12]:
batch_size = 4096

# Number of complete batches in each split (integer division, so a trailing
# partial batch is not counted).
step_train, step_valid, step_test = (
    len(windows) // batch_size
    for windows in (X_train_reshaped, X_valid_reshaped, X_test_reshaped)
)


def _batched_forever(inputs, targets):
    """Wrap (inputs, targets) in a batched tf.data.Dataset that repeats endlessly."""
    pairs = tf.data.Dataset.from_tensor_slices((inputs, targets))
    return pairs.batch(batch_size).repeat()


# Train, validation and test datasets, all using the same batch size.
train = _batched_forever(X_train_reshaped, y_train_reshaped)
val = _batched_forever(X_valid_reshaped, y_valid_reshaped)
test = _batched_forever(X_test_reshaped, y_test_reshaped)


In [13]:
# The tf.data datasets were built from these arrays in the previous cell, so
# the array names can be dropped. X_test_reshaped is kept: it is passed to the
# model again for prediction further down.
del X_train_reshaped, y_train_reshaped, X_valid_reshaped, y_valid_reshaped, y_test_reshaped
gc.collect()

20

In [14]:
l_r = 0.0001

# Two stacked LSTM layers followed by a sigmoid unit for the binary target.
# The width of the input is taken from the feature list instead of being
# hard-coded, so the model stays in sync with the columns selected earlier.
n_features = len(features)

model = tf.keras.Sequential([
    # First LSTM returns the whole sequence so the second LSTM can consume it.
    tf.keras.layers.LSTM(64, input_shape=(look_back, n_features), return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=l_r),
    metrics=["AUC"],
)

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 2, 64)             49920     
_________________________________________________________________
dropout (Dropout)            (None, 2, 64)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
batch_normalization (BatchNo (None, 64)                256       
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 83,265
Trainable params: 83,137
Non-trainable params: 128
__________________________________________________

In [15]:
# Extra garbage-collection pass before training; the value shown as the cell
# output is what gc.collect() returns.
gc.collect()

7218

In [16]:
%%time

model.fit(train, validation_data=val, epochs=35, steps_per_epoch = step_train, validation_steps = step_valid, verbose = 1,
          callbacks = [EarlyStopping(monitor='val_loss', verbose=1, patience=10)])

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 00025: early stopping
CPU times: user 2min 3s, sys: 9.14 s, total: 2min 12s
Wall time: 1min 55s


<tensorflow.python.keras.callbacks.History at 0x7fdbc8101f90>

In [17]:
# Evaluate the model on the test data using `evaluate`.
# `results` is [loss, metric]; the only metric the model was compiled with is
# AUC, so the second value is an AUC, not an accuracy.
print("Evaluate on test data")
results = model.evaluate(test, steps = step_test)

print("test loss, test AUC:", results)

Evaluate on test data
test loss, test acc: [0.6914218068122864, 0.5313917994499207]


In [18]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output its replacement is to threshold the predicted probability
# at 0.5, which gives the same 0/1 labels on every TensorFlow 2.x version.
prediction = (model.predict(X_test_reshaped) > 0.5).astype("int32").reshape(1,-1)

# Utility score

In [19]:

# The first look_back-1 test rows have no prediction (no complete window ends
# on them), so they are skipped to line the rows up with `prediction`.
result_df = (
    df_test.iloc[look_back-1:][['date', 'weight', 'resp']]
    .rename(columns={'date': 'Date', 'weight': 'Weight', 'resp': 'Resp'})
    .assign(Action=prediction[0])
    # Per-trade contribution: weight * resp when the trade is taken, else 0.
    .assign(P=lambda frame: frame['Weight']*frame['Resp']*frame['Action'])
)
result_df.head()

Unnamed: 0,Date,Weight,Resp,Action,P
1493257,430,11.982266,-0.004582,0,-0.0
1493258,430,1.107787,-0.004491,1,-0.004975
1493259,430,1.312454,0.000542,1,0.000711
1493260,430,0.422074,-0.004623,1,-0.001951
1493261,430,2.849713,-0.01191,0,-0.0


In [20]:
# Sum the per-trade contributions P within each date; Date stays a regular column.
result_groupby_days = result_df.groupby('Date', as_index=False)['P'].sum()
print(result_groupby_days.shape)
result_groupby_days.head()

(70, 2)


Unnamed: 0,Date,P
0,430,-3.44087
1,431,-13.89181
2,432,-3.72103
3,433,-12.663742
4,434,5.711499


In [21]:
# Daily totals p_i.
p = result_groupby_days['P'].values

total_p = np.sum(p)
norm_p = np.sqrt(np.sum(p**2))

# t = sum(p) / sqrt(sum(p^2)) * sqrt(250 / number_of_days).
# When every daily total is zero (or there are no days at all) the ratio is
# undefined; t is set to 0 so the score is 0 rather than NaN or a
# ZeroDivisionError.
t = (total_p/norm_p)*np.sqrt(250/len(p)) if norm_p > 0 else 0.0

# t is clipped to the range [0, 6] before scaling the total.
u = min(max(t, 0), 6) * total_p

print(f"Utility score is: {u:.3f}")

Utility score is: 2861.313


In [22]:
# Save the date and predicted action of every test row; the file name records
# the look_back value used. The frame's index is written as the first column.
result_df[['Date', 'Action']].to_csv(f'LSTM_result_timestep{look_back}.csv')