#  Ubiquant Market Prediction with DNN
## Import Packages

In [1]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from scipy import stats
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K

## Import dataset

In [2]:
%%time
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
train.head()

CPU times: user 423 ms, sys: 1.6 s, total: 2.03 s
Wall time: 14.7 s


Unnamed: 0,investment_id,time_id,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,...,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299,target
0,1,0,0.932617,0.113708,-0.4021,0.378418,-0.203979,-0.413574,0.96582,1.230469,...,-1.095703,0.200073,0.819336,0.941406,-0.086792,-1.086914,-1.044922,-0.287598,0.321533,-0.300781
1,2,0,0.811035,-0.51416,0.742188,-0.616699,-0.194214,1.771484,1.427734,1.133789,...,0.912598,-0.734375,0.819336,0.941406,-0.387695,-1.086914,-0.929688,-0.974121,-0.343506,-0.231079
2,6,0,0.394043,0.615723,0.567871,-0.60791,0.068909,-1.083008,0.979492,-1.125977,...,0.912598,-0.551758,-1.220703,-1.060547,-0.219116,-1.086914,-0.612305,-0.113953,0.243652,0.568848
3,7,0,-2.34375,-0.011871,1.875,-0.606445,-0.586914,-0.815918,0.77832,0.299072,...,0.912598,-0.266357,-1.220703,0.941406,-0.608887,0.104919,-0.783203,1.151367,-0.773438,-1.064453
4,8,0,0.842285,-0.262939,2.330078,-0.583496,-0.618164,-0.742676,-0.946777,1.230469,...,0.912598,-0.741211,-1.220703,0.941406,-0.588379,0.104919,0.753418,1.345703,-0.737793,-0.531738


In [3]:
investment_id = train.pop("investment_id")
investment_id.head()

0    1
1    2
2    6
3    7
4    8
Name: investment_id, dtype: uint16

In [4]:
_ = train.pop("time_id")

In [5]:
y = train.pop("target")
y.head()

0   -0.300781
1   -0.231079
2    0.568848
3   -1.064453
4   -0.531738
Name: target, dtype: float16

## Create a IntegerLookup layer for investment_id input

In [6]:
%%time
investment_ids = list(investment_id.unique())
investment_id_size = len(investment_ids) + 1
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
investment_id_lookup_layer.adapt(pd.DataFrame({"investment_ids":investment_ids}))

2022-03-26 07:37:28.314093: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-26 07:37:28.457133: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-26 07:37:28.457916: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-26 07:37:28.459080: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

CPU times: user 894 ms, sys: 621 ms, total: 1.51 s
Wall time: 3.07 s


## Make Tensorflow dataset

In [7]:
import tensorflow as tf
def preprocess(X, y):
    return X, y
def make_dataset(feature, investment_id, y, batch_size=1024, mode="train"):
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    ds = ds.map(preprocess)
    if mode == "train":
        ds = ds.shuffle(2048)
    ds = ds.batch(batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
    return ds

## Modeling

In [8]:
def get_model():
    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((300, ), dtype=tf.float16)
    
    investment_id_x = investment_id_lookup_layer(investment_id_inputs)
    investment_id_x = layers.Embedding(investment_id_size, 32, input_length=1)(investment_id_x)
    investment_id_x = layers.Reshape((-1, ))(investment_id_x)
    investment_id_x = layers.Dense(64, activation='swish')(investment_id_x)
    investment_id_x = layers.Dense(64, activation='swish')(investment_id_x)
    investment_id_x = layers.Dense(64, activation='swish')(investment_id_x)
    
    feature_x = layers.Dense(256, activation='swish')(features_inputs)
    feature_x = layers.Dense(256, activation='swish')(feature_x)
    feature_x = layers.Dense(256, activation='swish')(feature_x)
    
    x = layers.Concatenate(axis=1)([investment_id_x, feature_x])
    x = layers.Dense(320, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dense(128, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dense(32, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dense(16, activation='swish', kernel_regularizer="l2")(x)
    output = layers.Dense(1)(x)
    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    model = tf.keras.Model(inputs=[investment_id_inputs, features_inputs], outputs=[output])
    model.compile(optimizer=tf.optimizers.Adam(0.00001), loss='mse', metrics=['mse', "mae", "mape", rmse])
    return model

def correlation(x, y, axis=-2):
    """Metric returning the Pearson correlation coefficient of two tensors over some axis, default -2."""
    x = tf.convert_to_tensor(x)
    y = math_ops.cast(y, x.dtype)
    n = tf.cast(tf.shape(x)[axis], x.dtype)
    xsum = tf.reduce_sum(x, axis=axis)
    ysum = tf.reduce_sum(y, axis=axis)
    xmean = xsum / n
    ymean = ysum / n
    xvar = tf.reduce_sum( tf.math.squared_difference(x, xmean), axis=axis)
    yvar = tf.reduce_sum( tf.math.squared_difference(y, ymean), axis=axis)
    cov = tf.reduce_sum( (x - xmean) * (y - ymean), axis=axis)
    corr = cov / tf.sqrt(xvar * yvar)
    return corr

def get_model2():
    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((300, ), dtype=tf.float16)
    
    investment_id_x = investment_id_lookup_layer(investment_id_inputs)
    investment_id_x = layers.Embedding(investment_id_size, 32, input_length=1)(investment_id_x)
    investment_id_x = layers.Reshape((-1, ))(investment_id_x)
#     investment_id_x = layers.Dense(64, activation='swish')(investment_id_x)
#     investment_id_x = layers.Dropout(0.1)(investment_id_x)
#     investment_id_x = layers.Dense(64, activation='swish')(investment_id_x)
#     investment_id_x = layers.Dropout(0.1)(investment_id_x)
    investment_id_x = layers.Dense(1, activation='swish')(investment_id_x)
#     investment_id_x = layers.Dropout(1)(investment_id_x)
    
    feature_x = layers.Dense(256, activation='swish')(features_inputs)
    feature_x = layers.Dropout(0.1)(feature_x)
    feature_x = layers.Dense(256, activation='swish')(feature_x)
    feature_x = layers.Dropout(0.1)(feature_x)
    feature_x = layers.Dense(256, activation='swish')(feature_x)
    feature_x = layers.Dropout(0.1)(feature_x)
    feature_x = layers.Dense(256, activation='swish')(feature_x)
    feature_x = layers.Dropout(0.1)(feature_x)
    
    x = layers.Concatenate(axis=1)([investment_id_x, feature_x])
    x = layers.Dense(512, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(128, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(32, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dropout(0.1)(x)
    output = layers.Dense(1)(x)
#     rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    model = tf.keras.Model(inputs=[investment_id_inputs, features_inputs], outputs=[output])
#     model.compile(optimizer=tf.optimizers.Adam(0.001), loss='mse', metrics=['mse', "mae", "mape", rmse, correlation])
    return model

In [9]:
# student = get_model2()
# student.summary()
# keras.utils.plot_model(student, show_shapes=True)

In [10]:
gc.collect()

798

## teacher

In [11]:
teacher = []
teacher.append(keras.models.load_model(f"../input/simple-model-caves/model_43"))

teacher.append(keras.models.load_model(f"../input/simple-model-caves/model_51"))

teacher.append(keras.models.load_model(f"../input/simple-model-caves/model_48"))

teacher.append(keras.models.load_model(f"../input/simple-model-caves/model_42"))

teacher_weights = [0.1, 0.1, 0.35, 0.45]

In [12]:
class Distiller(keras.Model):

  # Needs both the student and teacher models to create an instance of this class
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.teacher = teacher
        self.student = student

    # Will be used when calling model.compile()
    def compile(self, optimizer, metrics, student_loss_fn,
              distillation_loss_fn, alpha, temperature):

        # Compile using the optimizer and metrics
        
        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)

        # Add the other params to the instance
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    # Will be used when calling model.fit()
    def train_step(self, data):
        # Data is expected to be a tuple of (features, labels)
        x, y = data

        # Vanilla forward pass of the teacher
        # Note that the teacher is NOT trained
        teacher_predictions = self.teacher[0](x, training=False)*teacher_weights[0]
        for t in range(1, len(self.teacher)):
#             teacher_predictions = t(x, training=False)
            teacher_predictions = teacher_predictions + self.teacher[t](x, training=False)*teacher_weights[t]
            
#         tf.math.reduce_mean()

        # Use GradientTape to save gradients
        with tf.GradientTape() as tape:
            # Vanilla forward pass of the student
            student_predictions = self.student(x, training=True)

            # Compute vanilla student loss
            student_loss = self.student_loss_fn(y, student_predictions)

            # Compute distillation loss
            # Should be KL divergence between logits softened by a temperature factor
#             distillation_loss = self.distillation_loss_fn(
#                 tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
#                 tf.nn.softmax(student_predictions / self.temperature, axis=1))
            distillation_loss = self.distillation_loss_fn(teacher_predictions / self.temperature, 
                                                          student_predictions/self.temperature)

            # Compute loss by weighting the two previous losses using the alpha param
            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Use tape to calculate gradients for student
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update student weights 
        # Note that this done ONLY for the student
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update the metrics
        self.compiled_metrics.update_state(y, student_predictions)
        # Return a performance dictionary
        # You will see this being outputted during training
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss, "distillation_loss": distillation_loss})
        return results


    # Will be used when calling model.evaluate()
    def test_step(self, data):
        # Data is expected to be a tuple of (features, labels)
        x, y = data
        # Use student to make predictions
        # Notice that the training param is set to False
        y_prediction = self.student(x, training=False)
        # Calculate student's vanilla loss
        student_loss = self.student_loss_fn(y, y_prediction)
        # Update the metrics
        self.compiled_metrics.update_state(y, y_prediction)
        # Return a performance dictionary
        # You will see this being outputted during inference
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [13]:
# Create Distiller instance
# teacher = keras.models.load_model(f"../input/simple-model-caves/model_42")


# Distill knowledge from teacher to student (will take around 3 mins with GPU enabled)
# distiller_history = distiller.fit(train_batches, epochs=5, validation_data=validation_batches)

In [14]:
# keras.utils.plot_model(teacher, show_shapes=True)


In [15]:
# keras.utils.plot_model(student, show_shapes=True)

In [16]:
%%time
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(3, shuffle=True, random_state=1997)
models = []
weights = []

def lr_scheduler(epoch, lr):
    decay_rate = 0.5
    decay_step = 8
    if epoch % decay_step == 0 and epoch:
        return lr * decay_rate
    return lr

for index, (train_indices, valid_indices) in enumerate(kfold.split(train, investment_id)):
#     break
    valid_indices = valid_indices[:800700]
#     print(len(train_indices))
    X_train, X_val = train.iloc[train_indices], train.iloc[valid_indices]
    investment_id_train = investment_id[train_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[valid_indices]
    investment_id_val = investment_id[valid_indices]
    train_ds = make_dataset(X_train, investment_id_train, y_train)
    valid_ds = make_dataset(X_val, investment_id_val, y_val, mode="valid")
    
    student = get_model2()
    distiller = Distiller(student=student, teacher=teacher)
    # Compile Distiller model
    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    distiller.compile(
        student_loss_fn=keras.losses.MeanSquaredError(),

        optimizer=tf.optimizers.Adam(0.001),
        metrics=[rmse, correlation],
        distillation_loss_fn=keras.losses.MeanSquaredError(reduction="auto", name="mse"),
        alpha=0.05,
        temperature=1)

    checkpoint = keras.callbacks.ModelCheckpoint(f"dess_{index}", save_best_only=True)
    early_stop = keras.callbacks.EarlyStopping(patience=8)
    shedualer = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
    
    history = distiller.fit(train_ds, epochs=140, validation_data=valid_ds, callbacks=[checkpoint, early_stop, shedualer])
    
#     models.append(keras.models.load_model(f"model_{index}"))

    model = distiller.student
    
    pearson_score = stats.pearsonr(model.predict(valid_ds).ravel(), y_val.values)[0]
    
    model.save(f"dess_{index}")
    models.append(model)
    weights.append(pearson_score)
    
    print('Pearson:', pearson_score)
#     pd.DataFrame(history.history, columns=["mse", "val_mse"]).plot()
#     plt.title("MSE")
#     plt.show()
# #     pd.DataFrame(history.history, columns=["mae", "val_mae"]).plot()
# #     plt.title("MAE")
# #     plt.show()
#     pd.DataFrame(history.history, columns=["correlation", "val_correlation"]).plot()
#     plt.title("correlation")
#     plt.show()
    del investment_id_train
    del investment_id_val
    del X_train
    del X_val
    del y_train
    del y_val
    del train_ds
    del valid_ds
    gc.collect()
#     break

2022-03-26 07:37:49.963426: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1256563800 exceeds 10% of free system memory.
2022-03-26 07:37:51.252766: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1256563800 exceeds 10% of free system memory.
2022-03-26 07:37:53.896759: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1256563800 exceeds 10% of free system memory.
2022-03-26 07:37:54.668702: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1256563800 exceeds 10% of free system memory.


Epoch 1/140


2022-03-26 07:38:24.361345: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1256563800 exceeds 10% of free system memory.


Epoch 2/140
Epoch 3/140
Epoch 4/140
Epoch 5/140
Epoch 6/140
Epoch 7/140
Epoch 8/140
Epoch 9/140
Epoch 10/140
Epoch 11/140
Epoch 12/140
Epoch 13/140
Epoch 14/140
Epoch 15/140
Epoch 16/140
Epoch 17/140
Epoch 18/140
Epoch 19/140
Epoch 20/140
Epoch 21/140
Epoch 22/140
Epoch 23/140
Epoch 24/140
Epoch 25/140
Epoch 26/140
Epoch 27/140
Epoch 28/140
Epoch 29/140
Epoch 30/140
Epoch 31/140
Epoch 32/140
Epoch 33/140
Epoch 34/140
Epoch 35/140
Epoch 36/140
Epoch 37/140
Epoch 38/140
Epoch 39/140
Epoch 40/140
Epoch 41/140
Epoch 42/140
Epoch 43/140
Epoch 44/140
Epoch 45/140
Epoch 46/140
Epoch 47/140
Epoch 48/140
Epoch 49/140
Epoch 50/140
Epoch 51/140
Epoch 52/140
Epoch 53/140
Epoch 54/140
Epoch 55/140
Epoch 56/140
Epoch 57/140
Epoch 58/140
Epoch 59/140
Epoch 60/140
Epoch 61/140
Epoch 62/140
Epoch 63/140
Epoch 64/140
Epoch 65/140
Epoch 66/140
Epoch 67/140
Epoch 68/140
Epoch 69/140
Epoch 70/140
Epoch 71/140
Epoch 72/140
Epoch 73/140
Epoch 74/140
Epoch 75/140
Epoch 76/140
Epoch 77/140
Epoch 78/140
Epoch 7

2022-03-26 08:45:37.741899: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


Pearson: 0.14935313909460374
Epoch 1/140
Epoch 2/140
Epoch 3/140
Epoch 4/140
Epoch 5/140
Epoch 6/140
Epoch 7/140
Epoch 8/140
Epoch 9/140
Epoch 10/140
Epoch 11/140
Epoch 12/140
Epoch 13/140
Epoch 14/140
Epoch 15/140
Epoch 16/140
Epoch 17/140
Epoch 18/140
Epoch 19/140
Epoch 20/140
Epoch 21/140
Epoch 22/140
Epoch 23/140
Epoch 24/140
Epoch 25/140
Epoch 26/140
Epoch 27/140
Epoch 28/140
Epoch 29/140
Epoch 30/140
Epoch 31/140
Epoch 32/140
Epoch 33/140
Epoch 34/140
Epoch 35/140
Epoch 36/140
Epoch 37/140
Epoch 38/140
Epoch 39/140
Epoch 40/140
Epoch 41/140
Epoch 42/140
Epoch 43/140
Epoch 44/140
Epoch 45/140
Epoch 46/140
Epoch 47/140
Epoch 48/140
Epoch 49/140
Epoch 50/140
Epoch 51/140
Epoch 52/140
Epoch 53/140
Epoch 54/140
Epoch 55/140
Epoch 56/140
Epoch 57/140
Epoch 58/140
Epoch 59/140
Epoch 60/140
Epoch 61/140
Epoch 62/140
Epoch 63/140
Epoch 64/140
Epoch 65/140
Epoch 66/140
Epoch 67/140
Epoch 68/140
Epoch 69/140
Epoch 70/140
Epoch 71/140
Epoch 72/140
Epoch 73/140
Epoch 74/140
Epoch 75/140
Epoch

In [17]:
weights = list(np.array(weights)/sum(weights))
weights

[0.3338101835693534, 0.334735387510482, 0.3314544289201647]

## Submission

In [18]:
def preprocess_test(investment_id, feature):
    return (investment_id, feature), 0
def make_test_dataset(feature, investment_id, batch_size=1024):
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature)))
    ds = ds.map(preprocess_test)
    ds = ds.batch(batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
    return ds
def inference(models, ds):
    y_preds = []
    for model, w in zip(models, weights):
        y_pred = model.predict(ds)
        y_preds.append(y_pred*w)
    return np.sum(y_preds, axis=0)

In [19]:
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test() 
for (test_df, sample_prediction_df) in iter_test:
    ds = make_test_dataset(test_df[features], test_df["investment_id"])
    sample_prediction_df['target'] = inference(models, ds)
    env.predict(sample_prediction_df) 

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
