In [1]:
import os
import re
from collections import Counter
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
cm = 1/2.54

# force GPU device
os.environ["CUDA_VISIBLE_DEVICES"]='0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf

from keras.src.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.src.layers import SimpleRNN, LSTM, Dense, Dropout, Bidirectional, Embedding, Input, RepeatVector, TimeDistributed, Reshape
from keras import Sequential, Model
from keras.losses import CategoricalCrossentropy, CategoricalFocalCrossentropy
from keras.optimizers import Adam
from keras.models import load_model
from keras.metrics import CategoricalAccuracy, TopKCategoricalAccuracy


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from utils import *

In [2]:
data_directory = '/home/jrosendahl/datasets/cadets/sequences_export_benign_filetypes_path_ts/'

In [3]:
# load data, build vocabulary

vocab = set()
X_train = []
longest_path = 0
mean_length = 0

files_loaded = 0

for filename in os.listdir(data_directory):
    with open(os.path.join(data_directory, filename), 'r') as f:
        files_loaded += 1
        if files_loaded % 50000 == 0:
            print(f"Files loaded: {files_loaded}")

        for line in f:
            line = line.split(',')
            path1 = line[4]
            path2 = line[5]
            if path1 == 'None':
                path1 = ''
            if path2 == 'None':
                path2 = ''

            # add all characters to the vocabulary
            vocab.update(path1)
            vocab.update(path2)

            longest_path = max(longest_path, len(path1))
            longest_path = max(longest_path, len(path2))

            if path1 != '':
                X_train.append(path1)
                mean_length += len(path1)
            if path2 != '':
                X_train.append(path2)
                mean_length += len(path2)

mean_length /= len(X_train)
vocab_size = len(vocab)

char_to_idx = {char: idx+1 for idx, char in enumerate(vocab)}
# add padding character
char_to_idx[''] = 0
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

print(f"Files loaded: {files_loaded}")
print(f'{len(X_train)=}')
print(f'{vocab_size=}')
print(f'{longest_path=}')
print(f'{mean_length=}')

Files loaded: 50000
Files loaded: 100000
Files loaded: 150000
Files loaded: 200000
Files loaded: 215150
len(X_train)=8161337
vocab_size=79
longest_path=167
mean_length=16.771043273914557


In [6]:
fixed_length = 50

In [5]:
def encode_paths_optimized(X_train, char_to_idx, fixed_length):
    # Initialize the array with zeros (for padding)
    encoded_array = np.zeros((len(X_train), fixed_length), dtype=int)

    # Iterate over each path and fill the appropriate positions in the array
    for i, path in enumerate(X_train):
        # Convert path to indices and fill in the array up to the fixed length
        path_indices = [char_to_idx[char] for char in path[:fixed_length]]  # Truncate to fixed_length
        assert path_indices is not None
        encoded_array[i, :len(path_indices)] = path_indices  # Place indices in the array

    return encoded_array

In [7]:
X_train_vectorized = encode_paths_optimized(X_train, char_to_idx, fixed_length)

In [9]:
print(X_train_vectorized.shape)
print(X_train_vectorized[0])


(8161337, 50)
[76  2 63  8 76 18 21 43 76 68 73 33  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]


In [11]:
# autoencoder model, encoder and decoder
# save encoder 

# encoder
encoder = Sequential([
    Embedding(vocab_size+1, 32, input_length=fixed_length),
    LSTM(32),
    Dense(16, activation='relu')
])

# decoder
decoder = Sequential([
    Dense(32, activation='relu', input_shape=(16,)),
    RepeatVector(fixed_length),
    LSTM(32, return_sequences=True),
    TimeDistributed(Dense(vocab_size+1, activation='softmax'))
])

# autoencoder
autoencoder = Sequential([encoder, decoder])

autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
autoencoder.summary()

# train autoencoder
autoencoder.fit(X_train_vectorized, X_train_vectorized, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m   974/204034[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:06:55[0m 20ms/step - accuracy: 0.7059 - loss: 1.5587