In [1]:
import os
import random
import numpy as np
from matplotlib import pyplot as plt

# force GPU device
# os.environ["CUDA_VISIBLE_DEVICES"]='0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf

from keras.src.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.src.layers import SimpleRNN, LSTM, Dense, Dropout
from keras import Sequential, Input

load data from csv

In [2]:
data_directory = '/home/jrosendahl/datasets/cadets/sequences/'

data = []

labels = None
no_labels = None

distinct_features = [
'EVENT_ACCEPT', 'EVENT_BIND', 'EVENT_CHANGE_PRINCIPAL', 'EVENT_CLOSE', 'EVENT_CONNECT', 'EVENT_CREATE_OBJECT', 'EVENT_EXECUTE', 'EVENT_EXIT', 'EVENT_FCNTL', 'EVENT_FORK', 'EVENT_LINK', 'EVENT_LOGIN', 'EVENT_LSEEK', 'EVENT_MMAP', 'EVENT_MODIFY_FILE_ATTRIBUTES', 'EVENT_MODIFY_PROCESS', 'EVENT_MPROTECT', 'EVENT_OPEN', 'EVENT_OTHER', 'EVENT_READ', 'EVENT_RECVFROM', 'EVENT_RECVMSG', 'EVENT_RENAME', 'EVENT_SENDMSG', 'EVENT_SENDTO', 'EVENT_SIGNAL', 'EVENT_TRUNCATE', 'EVENT_UNLINK', 'EVENT_WRITE', 
]
distinct_features = [ x[6:] for x in distinct_features ]
no_features = len(distinct_features)

count = 0
count_long_sequences_splitted = 0
count_long_sequences_splitted_result = 0
for file_name in os.listdir(data_directory):
    count += 1
    if count % 50000 == 0:
        print(f'Loaded {count} sequences')
    with open(data_directory + file_name, 'r') as f:
        label = file_name.split('_')[0]
        # build dictionary with counts of events
        lines = f.readlines()
        # if sequence is longer than 1000, split into multiple sequences
        if len(lines) > 1000:
            count_long_sequences_splitted += 1
            for i in range(0, len(lines), 1000):
                count_long_sequences_splitted_result += 1
                datapoints = []
                for event in lines[i:i+1000]:
                    event = event.strip()
                    datapoints.append(distinct_features.index(event))
                data.append((label, datapoints))
        else:
            datapoints = []
            for event in lines:
                event = event.strip()
                datapoints.append(distinct_features.index(event))
            data.append((label, datapoints))

        """
        datapoints = []
        for event in f:
            event = event.strip()
            datapoints.append(distinct_features.index(event))
        if not len(datapoints) > 1000:
            data.append((label, datapoints))
        """

print(f'Loaded {len(data)} sequences')
print(f'Found {count_long_sequences_splitted} sequences longer than 1000')
print(f'Split them into {count_long_sequences_splitted_result} sequences')

print(f'Filtered out datapoints with less than 10 occurrences')
print(f'Left with {len(data)} datapoints')

random.shuffle(data)
labels = [ x[0] for x in data ]
data = [ x[1] for x in data ]

Loaded 50000 sequences
Loaded 100000 sequences
Loaded 150000 sequences
Loaded 200000 sequences
Loaded 250000 sequences
Loaded 300000 sequences
Loaded 350000 sequences
Loaded 400000 sequences
Loaded 447415 sequences
Found 1988 sequences longer than 1000
Split them into 18540 sequences
Filtered out datapoints with less than 10 occurrences
Left with 447415 datapoints


In [3]:
distinct_labels = np.unique(np.array(labels))
no_labels = len(distinct_labels)
print(f'Found {no_labels} distinct labels')

# get distinct features
print(f'Found {no_features} distinct features')

# encode labels from strings to integers to one-hot
# labels = np.eye(no_labels)[np.vectorize(distinct_labels.tolist().index)(labels)]
# encode labels to integers
labels = np.vectorize(distinct_labels.tolist().index)(labels)
print(f'Encoded labels to integers')
# encode labels to one-hot
labels = np.eye(no_labels)[labels]
print(f'Encoded labels to one-hot')

# encode sequences from list of integers to list of one-hot
data = [ np.eye(no_features)[x] for x in data ]
print(f'Encoded sequences to one-hot')

# get pad event to pad sequences with when batches are built
pad_event = np.zeros(no_features)

Found 135 distinct labels
Found 29 distinct features
Encoded labels to integers
Encoded labels to one-hot
Encoded sequences to one-hot


In [4]:
print(f'data length: {len(data)}')
print(f'labels shape: {labels.shape}')

data length: 447415
labels shape: (447415, 135)


prepare data + generator

In [4]:
# split data into training and validation
split = int(len(data) * 0.8)

X_train = data[:split]
y_train = np.array(labels[:split])

X_val = data[split:]
y_val = np.array(labels[split:])

print(f'Training on {len(X_train)} samples')
print(f'Validating on {len(X_val)} samples')

Training on 357932 samples
Validating on 89483 samples


In [5]:
class Generator(tf.keras.utils.Sequence):
    def __init__(self, X, y, batch_size, fixed_length, **kwargs):
        # valid **kwargs: workers, use_multiprocessing, max_queue_size
        super().__init__()
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.fixed_length = fixed_length
        self.no_samples = len(X)
        self.no_batches = int(np.ceil(self.no_samples / self.batch_size))

    def __len__(self):
        return self.no_batches

    def __getitem__(self, index):
        start = index * self.batch_size
        end = min(start + self.batch_size, self.no_samples)

        # Get the batch data
        X_batch = self.X[start:end]
        y_batch = self.y[start:end]

        # Pad or truncate each sequence in X_batch to the fixed length
        X_batch_fixed = [self._pad_or_truncate(x, self.fixed_length) for x in X_batch]

        # Convert to numpy arrays
        X_batch_fixed = np.array(X_batch_fixed)
        y_batch = np.array(y_batch)

        return X_batch_fixed, y_batch

    def _pad_or_truncate(self, sequence, length):
        if len(sequence) < length:
            # Pad sequence with zeros to the fixed length
            return np.pad(sequence, ((0, length - len(sequence)), (0, 0)), mode='constant', constant_values=0)
        else:
            # Truncate sequence to the fixed length
            return sequence[:length]

    def on_epoch_end(self):
        pass

build model

In [6]:
model = Sequential(layers=[
    Input(shape=(None, no_features)),
    SimpleRNN(64, return_sequences=True),
    SimpleRNN(64, return_sequences=True),
    SimpleRNN(64, return_sequences=False),
    Dense(no_labels, activation='softmax')
])

model.summary()

In [7]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    mode='auto'
)

checkpoint_path = ("/home/jrosendahl/sync/models/checkpoints")
model_checkpoint = ModelCheckpoint(
    filepath=f'{checkpoint_path}/rnn_simple.keras',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

lr_schedule = ReduceLROnPlateau(
    monitor='loss', 
    factor=0.1, 
    patience=3, 
    min_lr=1e-6
)

csv_logger = CSVLogger(
    filename=f'{checkpoint_path}/rnn.log',
    append=True
)


# 'categorical_focal_crossentropy'
# 'categorical_crossentropy'
model.compile(optimizer='adam', loss='categorical_focal_crossentropy', metrics=['accuracy'])

train model

In [8]:
history = model.fit(
    x=Generator(X_train, y_train, 64, 1000),
    validation_data=Generator(X_val, y_val, 32, 1000),
    epochs=50,
    callbacks=[early_stop, model_checkpoint, lr_schedule, csv_logger],
)

# save history to file
with open(f'{checkpoint_path}/rnn_history.npy', 'wb') as f:
    np.save(f, history.history)

Epoch 1/50


I0000 00:00:1718258609.110637  487352 service.cc:145] XLA service 0x867b760 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1718258609.110685  487352 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 2080 Ti, Compute Capability 7.5
I0000 00:00:1718258609.110690  487352 service.cc:153]   StreamExecutor device (1): NVIDIA GeForce RTX 2080 Ti, Compute Capability 7.5


[1m   1/5593[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6:28:56[0m 4s/step - accuracy: 0.0000e+00 - loss: 1.2245

I0000 00:00:1718258610.536306  487352 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m5593/5593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step - accuracy: 0.3588 - loss: 0.5451
Epoch 1: val_loss improved from inf to 0.53439, saving model to /home/jrosendahl/sync/models/checkpoints/rnn_simple.keras
[1m5593/5593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1232s[0m 220ms/step - accuracy: 0.3588 - loss: 0.5451 - val_accuracy: 0.3557 - val_loss: 0.5344 - learning_rate: 0.0010
Epoch 2/50
[1m5593/5593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step - accuracy: 0.3603 - loss: 0.5284
Epoch 2: val_loss improved from 0.53439 to 0.53325, saving model to /home/jrosendahl/sync/models/checkpoints/rnn_simple.keras
[1m5593/5593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1229s[0m 220ms/step - accuracy: 0.3603 - loss: 0.5284 - val_accuracy: 0.3557 - val_loss: 0.5333 - learning_rate: 0.0010
Epoch 3/50
[1m5593/5593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step - accuracy: 0.3648 - loss: 0.5217
Epoch 3: val_loss improved 