# Imports

In [None]:
import numpy as np
import pandas as pd
import tempfile
import os
import polars as pl
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical

# First Model

In [13]:
def get_model_data_polars_memmap(filepath, sample_size=100, feature_num=40, target_num=5):
    """
    Utilise memmap pour garder les données sur disque au lieu de RAM.
    Un peu plus lent mais utilise <1GB de RAM.
    """
    import polars as pl
    import numpy as np
    import tempfile
    import os

    print("Reading CSV...")
    df = pl.read_csv(filepath)
    total_rows = len(df)
    total_samples = total_rows - sample_size

    print(f"Creating memory-mapped arrays for {total_samples:,} samples...")

    # Créer des fichiers temporaires pour X et Y
    temp_dir = tempfile.gettempdir()
    X_file = os.path.join(temp_dir, 'X_temp.dat')
    Y_file = os.path.join(temp_dir, 'Y_temp.dat')

    # Créer memmap arrays
    X = np.memmap(X_file, dtype='float32', mode='w+',
                  shape=(total_samples, sample_size, feature_num, 1))
    Y = np.memmap(Y_file, dtype='float32', mode='w+',
                  shape=(total_samples, target_num))

    # Traiter par chunks de 10k pour limiter la RAM
    data = df.to_numpy()
    del df

    chunk_size = 10000
    for start in range(0, total_samples, chunk_size):
        end = min(start + chunk_size, total_samples)

        for j, i in enumerate(range(start, end)):
            X[i, :, :, 0] = data[i:i+sample_size, 0:feature_num]
            Y[i] = data[i+sample_size-1, -target_num:]

        if (end % 50000) == 0:
            print(f"Processed {end:,}/{total_samples:,}")

    Y[:] = Y[:] - 1

    print("Converting to regular arrays...")
    X_array = np.array(X, dtype='float32')
    Y_array = np.array(Y, dtype='int32')

    # Nettoyer les fichiers temporaires
    del X, Y
    os.remove(X_file)
    os.remove(Y_file)

    return X_array, Y_array

In [7]:
train_X, train_Y = get_model_data_polars_memmap("Data/FI2010_train.csv")
train_Y = train_Y.astype(int)

Reading CSV...
Creating memory-mapped arrays for 362,300 samples...
Processed 50,000/362,300
Processed 100,000/362,300
Processed 150,000/362,300
Processed 200,000/362,300
Processed 250,000/362,300
Processed 300,000/362,300
Processed 350,000/362,300
Converting to regular arrays...


In [8]:
train_X.shape

(362300, 100, 40, 1)

In [9]:
# the size of a single input is (100,40)
input_tensor = Input(shape=(100,40,1))

# convolutional filter is (1,2) with stride of (1,2)
layer_x = layers.Conv2D(16, (1,2), strides=(1,2))(input_tensor)
layer_x = layers.LeakyReLU(alpha=0.01)(layer_x)
layer_x = layers.Conv2D(16, (4,1), padding='same')(layer_x)
layer_x = layers.LeakyReLU(alpha=0.01)(layer_x)
layer_x = layers.Conv2D(16, (4,1), padding='same')(layer_x)
layer_x = layers.LeakyReLU(alpha=0.01)(layer_x)

layer_x = layers.Conv2D(16, (1,2), strides=(1,2))(layer_x)
layer_x = layers.LeakyReLU(alpha=0.01)(layer_x)
layer_x = layers.Conv2D(16, (4,1), padding='same')(layer_x)
layer_x = layers.LeakyReLU(alpha=0.01)(layer_x)
layer_x = layers.Conv2D(16, (4,1), padding='same')(layer_x)
layer_x = layers.LeakyReLU(alpha=0.01)(layer_x)

layer_x = layers.Conv2D(16, (1,10))(layer_x)
layer_x = layers.LeakyReLU(alpha=0.01)(layer_x)
layer_x = layers.Conv2D(16, (4,1), padding='same')(layer_x)
layer_x = layers.LeakyReLU(alpha=0.01)(layer_x)
layer_x = layers.Conv2D(16, (4,1), padding='same')(layer_x)
layer_x = layers.LeakyReLU(alpha=0.01)(layer_x)

# Inception Module
tower_1 = layers.Conv2D(32, (1,1), padding='same')(layer_x)
tower_1 = layers.LeakyReLU(alpha=0.01)(tower_1)
tower_1 = layers.Conv2D(32, (3,1), padding='same')(tower_1)
tower_1 = layers.LeakyReLU(alpha=0.01)(tower_1)

tower_2 = layers.Conv2D(32, (1,1), padding='same')(layer_x)
tower_2 = layers.LeakyReLU(alpha=0.01)(tower_2)
tower_2 = layers.Conv2D(32, (5,1), padding='same')(tower_2)
tower_2 = layers.LeakyReLU(alpha=0.01)(tower_2)  

tower_3 = layers.MaxPooling2D((3,1), padding='same', strides=(1,1))(layer_x)
tower_3 = layers.Conv2D(32, (1,1), padding='same')(tower_3)
tower_3 = layers.LeakyReLU(alpha=0.01)(tower_3)

layer_x = layers.concatenate([tower_1, tower_2, tower_3], axis=-1)

# concatenate features of tower_1, tower_2, tower_3
layer_x = layers.Reshape((100,96))(layer_x)

# 64 LSTM units
layer_x = LSTM(64)(layer_x)
# The last output layer uses a softmax activation function
output = layers.Dense(3, activation='softmax')(layer_x)
model = Model(input_tensor, output)

model.summary()

2025-11-04 10:00:06.130370: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-11-04 10:00:06.130531: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2025-11-04 10:00:06.130540: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.88 GB
2025-11-04 10:00:06.130746: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-11-04 10:00:06.130774: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [10]:
physical_devices = tf.config.list_physical_devices('GPU')
print("Physical devices:", physical_devices)

Physical devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [12]:
opt = tf.keras.optimizers.Adam(learning_rate=0.01, epsilon=1)# learning rate and epsilon are the same as paper DeepLOB
y = to_categorical(train_Y[:,0])# y is the next event's mid price (k=1)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

model.fit(train_X, y, epochs=10, batch_size=32)

Epoch 1/10
[1m11322/11322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m393s[0m 34ms/step - accuracy: 0.6387 - loss: 0.8934
Epoch 2/10
[1m11322/11322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 35ms/step - accuracy: 0.6386 - loss: 0.8925
Epoch 3/10
[1m11322/11322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 35ms/step - accuracy: 0.6389 - loss: 0.8924
Epoch 4/10
[1m11322/11322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m393s[0m 35ms/step - accuracy: 0.6388 - loss: 0.8924
Epoch 5/10
[1m11322/11322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m395s[0m 35ms/step - accuracy: 0.6387 - loss: 0.8923
Epoch 6/10
[1m11322/11322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 35ms/step - accuracy: 0.6388 - loss: 0.8922
Epoch 7/10
[1m11322/11322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m393s[0m 35ms/step - accuracy: 0.6389 - loss: 0.8922
Epoch 8/10
[1m11322/11322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m688s[0m 61ms/step - accuracy: 0.6388

<keras.src.callbacks.history.History at 0x3791f0970>

In [16]:
def read_data(path):
    data_list = []
    with open(path, 'r') as f:
        while True:
            line = f.readline()
            if not line:
                break
            d_str = line.split()
            d_tem = [float(d) for d in d_str]
            data_list.append(d_tem)
    data = pd.DataFrame(data_list)
    return data.T

In [21]:
test_X, test_Y = get_model_data_polars_memmap("Data/FI2010_test.csv")
test_Y = test_Y.astype(int)
test_y = to_categorical(test_Y[:,0])

model.evaluate(test_X, test_y)

Reading CSV...
Creating memory-mapped arrays for 31,837 samples...
Converting to regular arrays...
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.5755 - loss: 1.0321


[1.0321112871170044, 0.5754939317703247]

In [22]:
from sklearn.metrics import classification_report, confusion_matrix

predictions = model.predict(test_X)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(test_y, axis=1)

print(classification_report(true_classes, predicted_classes))
print("\nConfusion Matrix:")
print(confusion_matrix(true_classes, predicted_classes))

[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step
              precision    recall  f1-score   support

           0       0.14      0.10      0.12      5510
           1       0.66      0.82      0.73     21310
           2       0.16      0.03      0.06      5017

    accuracy                           0.58     31837
   macro avg       0.32      0.32      0.30     31837
weighted avg       0.49      0.58      0.52     31837


Confusion Matrix:
[[  577  4731   202]
 [ 3031 17577   702]
 [  505  4344   168]]


In [23]:
print("Class distribution:", np.bincount(test_Y[:,0]))

Class distribution: [ 5510 21310  5017]


# Way of improvements :

## Class weights

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Calculate class weights
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_Y[:,0]),
    y=train_Y[:,0]
)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

print("Class weights:", class_weight_dict)

# Train with class weights
model.fit(train_X, train_y, 
          class_weight=class_weight_dict,
          epochs=50, 
          batch_size=32,
          validation_split=0.2)

## Resampling

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

# Option A: SMOTE (Synthetic Minority Oversampling)
smote = SMOTE(random_state=42)
train_X_resampled, train_Y_resampled = smote.fit_resample(
    train_X.reshape(train_X.shape[0], -1), 
    train_Y[:,0]
)
train_X_resampled = train_X_resampled.reshape(-1, train_X.shape[1], train_X.shape[2])
train_y_resampled = to_categorical(train_Y_resampled)

# Option B: Random undersampling
rus = RandomUnderSampler(random_state=42)
train_X_resampled, train_Y_resampled = rus.fit_resample(
    train_X.reshape(train_X.shape[0], -1),
    train_Y[:,0]
)
train_X_resampled = train_X_resampled.reshape(-1, train_X.shape[1], train_X.shape[2])
train_y_resampled = to_categorical(train_Y_resampled)

## Focal Loss

In [None]:
import tensorflow as tf

def focal_loss(gamma=2., alpha=0.25):
    def focal_loss_fixed(y_true, y_pred):
        epsilon = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
        cross_entropy = -y_true * tf.math.log(y_pred)
        weight = alpha * y_true * tf.pow((1 - y_pred), gamma)
        loss = weight * cross_entropy
        return tf.reduce_sum(loss, axis=1)
    return focal_loss_fixed

# Compile model with focal loss
model.compile(optimizer='adam', 
              loss=focal_loss(gamma=2., alpha=0.25),
              metrics=['accuracy'])

## Adjuste Decision Threshold

In [None]:
# Instead of argmax, use custom thresholds
predictions = model.predict(test_X)

# Adjust thresholds to favor minority classes
# This is a simple example - you'd tune these values
adjusted_predictions = predictions.copy()
adjusted_predictions[:,0] *= 1.5  # Boost class 0
adjusted_predictions[:,2] *= 1.5  # Boost class 2
predicted_classes = np.argmax(adjusted_predictions, axis=1)