In [3]:
import os
import random
import numpy as np
from matplotlib import pyplot as plt

os.environ["CUDA_VISIBLE_DEVICES"]='0'
import tensorflow as tf

from keras.src.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.src.layers import SimpleRNN, LSTM, Dense, Dropout
from keras import Sequential, Input

load data from csv

In [23]:
data_directory = '/home/jrosendahl/datasets/cadets/sequences/'

data = []

labels = None
no_labels = None

distinct_features = [
'EVENT_ACCEPT', 'EVENT_BIND', 'EVENT_CHANGE_PRINCIPAL', 'EVENT_CLOSE', 'EVENT_CONNECT', 'EVENT_CREATE_OBJECT', 'EVENT_EXECUTE', 'EVENT_EXIT', 'EVENT_FCNTL', 'EVENT_FORK', 'EVENT_LINK', 'EVENT_LOGIN', 'EVENT_LSEEK', 'EVENT_MMAP', 'EVENT_MODIFY_FILE_ATTRIBUTES', 'EVENT_MODIFY_PROCESS', 'EVENT_MPROTECT', 'EVENT_OPEN', 'EVENT_OTHER', 'EVENT_READ', 'EVENT_RECVFROM', 'EVENT_RECVMSG', 'EVENT_RENAME', 'EVENT_SENDMSG', 'EVENT_SENDTO', 'EVENT_SIGNAL', 'EVENT_TRUNCATE', 'EVENT_UNLINK', 'EVENT_WRITE', 
]
distinct_features = [ x[6:] for x in distinct_features ]
no_features = len(distinct_features)

count = 0
count_long_sequences_splitted = 0
count_long_sequences_splitted_result = 0
for file_name in os.listdir(data_directory):
    count += 1
    if count % 50000 == 0:
        print(f'Loaded {count} sequences')
    with open(data_directory + file_name, 'r') as f:
        label = file_name.split('_')[0]
        # build dictionary with counts of events
        lines = f.readlines()
        # if sequence is longer than 1000, split into multiple sequences
        if len(lines) > 1000:
            count_long_sequences_splitted += 1
            for i in range(0, len(lines), 1000):
                count_long_sequences_splitted_result += 1
                datapoints = []
                for event in lines[i:i+1000]:
                    event = event.strip()
                    datapoints.append(distinct_features.index(event))
                data.append((label, datapoints))
        else:
            datapoints = []
            for event in lines:
                event = event.strip()
                datapoints.append(distinct_features.index(event))
            data.append((label, datapoints))

        """
        datapoints = []
        for event in f:
            event = event.strip()
            datapoints.append(distinct_features.index(event))
        if not len(datapoints) > 1000:
            data.append((label, datapoints))
        """

print(f'Loaded {len(data)} sequences')
print(f'Found {count_long_sequences_splitted} sequences longer than 1000')
print(f'Split them into {count_long_sequences_splitted_result} sequences')

print(f'Filtered out datapoints with less than 10 occurrences')
print(f'Left with {len(data)} datapoints')

random.shuffle(data)
labels = [ x[0] for x in data ]
data = [ x[1] for x in data ]

distinct_labels = np.unique(np.array(labels))
no_labels = len(distinct_labels)
print(f'Found {no_labels} distinct labels')

# get distinct features
print(f'Found {no_features} distinct features')

# encode labels from strings to integers to one-hot
labels = np.eye(no_labels)[np.vectorize(distinct_labels.tolist().index)(labels)]
print(f'Encoded labels to one-hot')

# encode sequences from list of integers to list of one-hot
data = [ np.eye(no_features)[x] for x in data ]

# get pad event to pad sequences with when batches are built
pad_event = np.zeros(no_features)

Loaded 50000 sequences
Loaded 100000 sequences
Loaded 150000 sequences
Loaded 200000 sequences
Loaded 250000 sequences
Loaded 300000 sequences
Loaded 350000 sequences
Loaded 400000 sequences
Loaded 447415 sequences
Found 1988 sequences longer than 1000
Split them into 18540 sequences
Filtered out datapoints with less than 10 occurrences
Left with 447415 datapoints
Found 135 distinct labels
Found 29 distinct features
Encoded labels to one-hot


In [25]:
print(f'data length: {len(data)}')
print(f'labels shape: {labels.shape}')

data length: 447415
labels shape: (447415, 135)


prepare data + generator

In [26]:
# split data into training and validation
split = int(len(data) * 0.8)

X_train = data[:split]
y_train = np.array(labels[:split])

X_val = data[split:]
y_val = np.array(labels[split:])

print(f'Training on {len(X_train)} samples')
print(f'Validating on {len(X_val)} samples')

Training on 357932 samples
Validating on 89483 samples


In [27]:
class Generator(tf.keras.utils.Sequence):
    def __init__(self, X, y, batch_size):
        # valid **kwargs: workers, use_multiprocessing, max_queue_size
        super().__init__()
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.no_samples = len(X)
        self.no_batches = int(np.ceil(self.no_samples / self.batch_size))

    def __len__(self):
        return self.no_batches

    def __getitem__(self, index):
        start = index * self.batch_size
        end = min(start + self.batch_size, self.no_samples)
        X_batch = self.X[start:end]
        y_batch = self.y[start:end]
        max_len = max([ len(x) for x in X_batch ])
        X_batch = [ np.pad(x, ((0, max_len - len(x)), (0, 0)), 'constant', constant_values=0) for x in X_batch ]
        return np.array(X_batch), np.array(y_batch)

    def on_epoch_end(self):
        pass

build model

In [28]:
model = Sequential(layers=[
    Input(shape=(None, no_features)),
    SimpleRNN(64, return_sequences=True),
    SimpleRNN(64, return_sequences=True),
    SimpleRNN(64, return_sequences=False),
    Dense(no_labels, activation='softmax')
])

model.summary()

In [29]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    mode='auto'
)

checkpoint_path = ("/home/jrosendahl/sync/models/checkpoints")
model_checkpoint = ModelCheckpoint(
    filepath=f'{checkpoint_path}/rnn_simple.keras',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

lr_schedule = ReduceLROnPlateau(
    monitor='loss', 
    factor=0.1, 
    patience=3, 
    min_lr=1e-6
)

csv_logger = CSVLogger(
    filename=f'{checkpoint_path}/rnn.log',
    append=True
)


# 'categorical_focal_crossentropy'
# 'categorical_crossentropy'
model.compile(optimizer='adam', loss='categorical_focal_crossentropy', metrics=['accuracy'])

train model

In [30]:
history = model.fit(
    x=Generator(X_train, y_train, 64),
    validation_data=Generator(X_val, y_val, 32),
    epochs=50,
    callbacks=[early_stop, model_checkpoint, lr_schedule, csv_logger],
)

# save history to file
with open(f'{checkpoint_path}/rnn_history.npy', 'wb') as f:
    np.save(f, history.history)

Epoch 1/50


  self._warn_if_super_not_called()


[1m1064/5593[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m20:06[0m 266ms/step - accuracy: 0.3385 - loss: 0.6038