In [1]:
import os
import re
from collections import Counter
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
cm = 1/2.54

# force GPU device
os.environ["CUDA_VISIBLE_DEVICES"]='0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf

from keras.src.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.src.layers import SimpleRNN, LSTM, Dense, Dropout, Bidirectional, Embedding, Input, RepeatVector, TimeDistributed, Reshape
from keras import Sequential, Model
from keras.losses import CategoricalCrossentropy, CategoricalFocalCrossentropy
from keras.optimizers import Adam
from keras.models import load_model
from keras.metrics import CategoricalAccuracy, TopKCategoricalAccuracy

from tensorflow.keras.callbacks import Callback
from tensorflow.keras.utils import Sequence

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from utils import *

In [2]:
data_directory = '/home/jrosendahl/datasets/cadets/sequences_export_benign_filetypes_path_ts/'
experiment_name = 'path_autoencoder'

checkpoint_path = f'saves/{experiment_name}'
log_path = f'{checkpoint_path}/log.csv'
history_path = f'{checkpoint_path}/history.npy'

# ensure directory exists
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

In [34]:
# list of limitations for paths:
# allowed charaters: alphanumeric chars + '/.'
# all chars are transformed to lowercase
def preprocess_path(path: str) -> str:
    path = path.lower()
    path = re.sub(r'[^a-z0-9/._]', '', path)
    return path

In [35]:
print(preprocess_path('C:/Users/JohnDoe/Documents/important.docx'))
print(preprocess_path('~/.ssh/id_rsa'))
print(preprocess_path('ascrebeGRWGHWR124235+ü,.--<>>~~'))


c/users/johndoe/documents/important.docx
/.ssh/id_rsa
ascrebegrwghwr124235.


In [36]:
# load data, build vocabulary

vocab = set()
X = []
longest_path = 0
files_loaded = 0

for filename in os.listdir(data_directory):
    with open(os.path.join(data_directory, filename), 'r') as f:
        files_loaded += 1
        if files_loaded % 50000 == 0:
            print(f"Files loaded: {files_loaded}")

        for line in f:
            line = line.split(',')
            path1 = line[4]
            path2 = line[5]


            for path in [path1, path2]:
                if path in ['', 'None']:
                    continue
                path = preprocess_path(path1)
                vocab.update(path)
                X.append(path)


mean_length = np.mean([len(path) for path in X])
vocab_size = len(vocab)

char_to_idx = {char: idx+1 for idx, char in enumerate(vocab)}
# add padding character
char_to_idx[''] = 0
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

print(f"Files loaded: {files_loaded}")
print(f'{len(X)=}')
print(f'{vocab_size=}')
print(f'{longest_path=}')
print(f'{mean_length=}')

Files loaded: 50000
Files loaded: 100000
Files loaded: 150000
Files loaded: 200000
Files loaded: 215150
len(X)=8161337
vocab_size=39
longest_path=0
mean_length=16.37407400282576


In [37]:
print(vocab)

{'g', 'd', 'n', 'l', '6', '4', 'm', 'x', 's', 'c', 'e', 'p', '.', 'o', 'w', '_', '2', 'y', 'b', 'u', 'r', 'z', 'f', 'v', '5', '9', '1', '3', 'j', 'k', '/', 'q', '8', '0', 't', 'a', '7', 'h', 'i'}


In [38]:
fixed_length = 50

In [5]:
def encode_paths_optimized(X, char_to_idx, fixed_length):
    # Initialize the array with zeros (for padding)
    encoded_array = np.zeros((len(X), fixed_length), dtype=int)

    # Iterate over each path and fill the appropriate positions in the array
    for i, path in enumerate(X):
        # Convert path to indices and fill in the array up to the fixed length
        path_indices = [char_to_idx[char] for char in path[:fixed_length]]  # Truncate to fixed_length
        assert path_indices is not None
        encoded_array[i, :len(path_indices)] = path_indices  # Place indices in the array

    return encoded_array

In [39]:
X_vectorized = encode_paths_optimized(X, char_to_idx, fixed_length)

In [40]:
print(f'{X_vectorized.shape=}')
print(f'{X_vectorized[0]=}')

X_vectorized.shape=(8161337, 50)
X_vectorized[0]=array([31, 20,  9, 21, 31, 19, 39,  3, 31, 35, 14, 12,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])


In [54]:
latent_dim = 32
expand_dim = 64
recurrent_dim = 128


encoder_input = Input(shape=(fixed_length,))
x = Embedding(input_dim=vocab_size+1, output_dim=expand_dim)(encoder_input)
x = LSTM(recurrent_dim)(x)
encoder_output = Dense(latent_dim, activation='relu')(x)

# Create the encoder model
encoder = Model(encoder_input, encoder_output, name='encoder')

# Decoder definition
decoder_input = Input(shape=(latent_dim,))
x = Dense(expand_dim, activation='relu')(decoder_input)
x = RepeatVector(fixed_length)(x)
x = LSTM(recurrent_dim, return_sequences=True)(x)
decoder_output = TimeDistributed(Dense(vocab_size+1, activation='softmax'))(x)

# Create the decoder model
decoder = Model(decoder_input, decoder_output, name='decoder')

# Autoencoder definition
autoencoder_input = encoder_input
encoded_sequence = encoder(autoencoder_input)
decoded_sequence = decoder(encoded_sequence)

# Create the autoencoder model by combining encoder and decoder
autoencoder = Model(autoencoder_input, decoded_sequence, name='autoencoder')

# Compile the autoencoder model
autoencoder.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=[
        'accuracy',
    ],
)

# Print model summaries
encoder.summary()
decoder.summary()
autoencoder.summary()

In [42]:
class SaveBestModels(Callback):
    def __init__(self, checkpoint_path, encoder, decoder, monitor='val_loss', mode='min', save_best_only=True):
        super(SaveBestModels, self).__init__()
        self.checkpoint_path = checkpoint_path
        self.encoder = encoder
        self.decoder = decoder
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.best = None

        if self.mode == 'min':
            self.best = float('inf')
        elif self.mode == 'max':
            self.best = float('-inf')

    def on_epoch_end(self, epoch, logs=None):
        current = logs.get(self.monitor)
        if current is None:
            return

        if self.mode == 'min' and current < self.best:
            print(f"\nEpoch {epoch + 1}: {self.monitor} improved from {self.best} to {current}. Saving models.")
            self.best = current
            self.encoder.save(os.path.join(self.checkpoint_path, 'encoder.keras'))
            self.decoder.save(os.path.join(self.checkpoint_path, 'decoder.keras'))

        elif self.mode == 'max' and current > self.best:
            print(f"\nEpoch {epoch + 1}: {self.monitor} improved from {self.best} to {current}. Saving models.")
            self.best = current
            self.encoder.save(os.path.join(self.checkpoint_path, 'encoder.keras'))
            self.decoder.save(os.path.join(self.checkpoint_path, 'decoder.keras'))


In [43]:
callbacks = [
    SaveBestModels(
        checkpoint_path=checkpoint_path,
        encoder=encoder,
        decoder=decoder,
        monitor='val_loss',
        mode='min'
    ),
    EarlyStopping(
        patience=9,
        restore_best_weights=True,
        monitor='val_loss',
        verbose=1,
    ),
    ReduceLROnPlateau(
        patience=3,
        factor=0.5,
        verbose=1
    ),
    CSVLogger(log_path),
]

In [51]:
class DataGenerator(Sequence):
    def __init__(self, data, batch_size, fixed_length, vocab_size, shuffle=True):
        # call super
        super(DataGenerator, self).__init__()
        self.data = data
        self.batch_size = batch_size
        self.fixed_length = fixed_length
        self.vocab_size = vocab_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.data))
        self.on_epoch_end()

    def __len__(self):
        # Number of batches per epoch
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        # Generate indices for the batch
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Generate data for the batch
        X_batch = self.__data_generation(batch_indices)

        # Since it's an autoencoder, the target data is the same as the input data
        return X_batch, X_batch

    def on_epoch_end(self):
        # Shuffle indices after each epoch if shuffle is True
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __data_generation(self, batch_indices):
        # Generate data for a batch of given indices
        # This is where you can customize your data loading and processing
        batch_data = [self.data[i] for i in batch_indices]

        # Convert the batch data to a numpy array (or whatever format is needed)
        X_batch = np.array(batch_data)

        # Ensure the data is padded or truncated to fixed_length
        X_batch = np.array([np.pad(x, (0, max(0, self.fixed_length - len(x))), constant_values=0)[:self.fixed_length] for x in X_batch])

        return X_batch

In [52]:
# split data 85, 10, 5
train_split = 0.85
val_split = 0.1
test_split = 0.05

train_size = int(len(X_vectorized) * train_split)
val_size = int(len(X_vectorized) * val_split)
test_size = int(len(X_vectorized) * test_split)

X_train_split = X_vectorized[:train_size]
X_val_split = X_vectorized[train_size:train_size+val_size]
X_test_split = X_vectorized[train_size+val_size:]

batch_size = 256

train_generator = DataGenerator(X_train_split, batch_size, fixed_length, vocab_size)
val_generator = DataGenerator(X_val_split, batch_size, fixed_length, vocab_size)
test_generator = DataGenerator(X_test_split, batch_size, fixed_length, vocab_size)

In [None]:
# train autoencoder
history = autoencoder.fit(
    train_generator,
    validation_data=val_generator,
    epochs=200, 
    callbacks=callbacks
)

# save history
with open(history_path, 'wb') as f:
    np.save(f, history.history)

In [18]:
try:
    encoder
    decoder
    autoencoder
except NameError:
    # load encoder, decoder models
    encoder = load_model(f'{checkpoint_path}/encoder.keras')
    decoder = load_model(f'{checkpoint_path}/decoder.keras')

    # build autoencoder model
    autoencoder_input = encoder.input
    encoded_sequence = encoder(autoencoder_input)
    decoded_sequence = decoder(encoded_sequence)
    autoencoder = Model(autoencoder_input, decoded_sequence, name='autoencoder')
    autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [11]:
def encode_path(path: str, char_to_idx: dict, fixed_length: int, encoder: Model):
    "take a path as a string, return the result from the encoder"
    path = preprocess_path(path)
    path_indices = [char_to_idx[char] for char in path[:fixed_length]]
    path_indices = np.array(path_indices)
    path_indices = np.pad(path_indices, (0, max(0, fixed_length - len(path_indices))), constant_values=0)[:fixed_length]
    path_indices = np.array([path_indices])
    return encoder.predict(path_indices)

def decode_path(encoded_path: np.ndarray, decoder: Model, idx_to_char: dict):
    "take the result from the encoder, return the decoded path"
    decoded_path = decoder.predict(encoded_path)
    decoded_path = np.argmax(decoded_path, axis=2)
    decoded_path = [''.join([idx_to_char[idx] for idx in path]) for path in decoded_path]
    return decoded_path

In [14]:
test = encode_path('/dev/null', char_to_idx, fixed_length, encoder)
test

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


array([[0.19452104, 0.269752  , 0.        , 0.        , 0.        ,
        0.        , 2.180835  , 0.        , 0.        , 0.        ,
        0.        , 0.01902695, 0.        , 0.        , 0.        ,
        0.24062942, 2.0384183 , 0.06973484, 1.1193109 , 0.        ,
        1.5873064 , 0.83516556, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.0387359 ,
        0.        , 1.2860181 ]], dtype=float32)

In [15]:
test = decode_path(test, decoder, idx_to_char)
test

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420ms/step


['h5/-hh<<FxQ']

In [20]:
# evaluate autoencoder on X_test
test_generator = DataGenerator(X_test_split, batch_size, fixed_length, vocab_size, shuffle=False)

# Evaluate the model on the test data
test_result = autoencoder.evaluate(test_generator)

print(f'Test logg: {test_result[0]}\nTest accuracy: {test_result[1]}')

[1m  12/1594[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m23s[0m 15ms/step - accuracy: 0.7017 - loss: 4.4755

  self._warn_if_super_not_called()


[1m1594/1594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 13ms/step - accuracy: 0.6386 - loss: 5.1317
Test result: [6.136971950531006, 0.5515469312667847]
