In [1]:
import os
import re
from collections import Counter
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Inches per centimetre (1 in = 2.54 cm). Presumably a helper for giving
# matplotlib figure sizes in cm; it is not used in the cells visible here.
cm = 1/2.54

# force GPU device
# Both environment variables are set *before* `import tensorflow` below;
# keep this ordering so they are already in place when TensorFlow loads.
# Expose only the CUDA device with index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"]='1'
# Raise TensorFlow's C++ log threshold (level '2' is documented to hide
# INFO and WARNING messages, leaving errors visible).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf

from keras.src.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.src.layers import SimpleRNN, LSTM, Dense, Dropout, Bidirectional, Embedding, Flatten, Concatenate
from keras import Sequential, Input
from keras.losses import CategoricalCrossentropy, CategoricalFocalCrossentropy
from keras.optimizers import Adam
from keras.models import load_model
from keras.models import Model
from keras.metrics import CategoricalAccuracy, TopKCategoricalAccuracy


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from utils import *

In [3]:
# --- Configuration -----------------------------------------------------------
# Directory holding the exported sequence files that are scanned for paths.
data_directory = '/home/jrosendahl/datasets/cadets/sequences_export_benign_filetypes_path_ts/'
# Directory the path autoencoder was saved to (model file + character map).
autoencoder_path = '/home/jrosendahl/sync/models/path_autoencoder/saves/path_autoencoder'
# Destination of the pickled {path: encoding} dictionary written at the end.
export_file = 'path_encoding_map.pkl'

# --- Restore the saved encoder model -----------------------------------------
encoder_model_file = os.path.join(autoencoder_path, 'encoder.keras')
encoder = load_model(encoder_model_file)

# --- Restore the character -> index vocabulary stored next to the model ------
char_map_file = os.path.join(autoencoder_path, 'char_to_idx.json')
with open(char_map_file, 'r') as char_map_handle:
    encoder_char_to_idx = json.load(char_map_handle)

In [4]:
# Echo the settings that drive vectorisation so they are recorded with the run.
# NOTE(review): `fixed_length` is not assigned in any cell visible here; it
# presumably arrives through `from utils import *` -- confirm, otherwise this
# cell depends on hidden kernel state.
print(f'{fixed_length=}', f'{encoder_char_to_idx=}', sep='\n')

fixed_length=50
encoder_char_to_idx={'': 0, '.': 1, '/': 2, '0': 3, '1': 4, '2': 5, '3': 6, '4': 7, '5': 8, '6': 9, '7': 10, '8': 11, '9': 12, '_': 13, 'a': 14, 'b': 15, 'c': 16, 'd': 17, 'e': 18, 'f': 19, 'g': 20, 'h': 21, 'i': 22, 'j': 23, 'k': 24, 'l': 25, 'm': 26, 'n': 27, 'o': 28, 'p': 29, 'q': 30, 'r': 31, 's': 32, 't': 33, 'u': 34, 'v': 35, 'w': 36, 'x': 37, 'y': 38, 'z': 39}


In [5]:
# Scan every file in `data_directory` and collect the distinct preprocessed
# paths found in comma-separated fields 4 and 5 of each line.
#
# Paths are de-duplicated while streaming (set) instead of being appended to a
# list and de-duplicated afterwards: the data holds tens of millions of path
# occurrences but only a few hundred thousand distinct values, so this keeps
# peak memory proportional to the number of *unique* paths.

# Filled by the batch-inference cell: preprocessed path -> encoder output.
path_encoding_map = {}

# List the directory once so the progress denominator and the iteration are
# guaranteed to refer to the same set of files.
filenames = os.listdir(data_directory)
max_file_count = len(filenames)

unique_paths = set()
total_path_count = 0  # number of path occurrences seen, duplicates included

for file_count, filename in enumerate(filenames, start=1):
    if file_count % 10000 == 0:
        print(f'processed {file_count} files ({file_count/max_file_count*100:.2f}%)')
    with open(os.path.join(data_directory, filename), 'r') as file:
        for line in file:
            fields = line.strip('\n ').split(',')

            # Lines with fewer than six fields simply contribute fewer
            # (or zero) paths; slicing never raises here.
            for path in fields[4:6]:
                unique_paths.add(preprocess_path(path))
                total_path_count += 1

print(f'data contains {total_path_count} paths')
# Downstream cells slice `paths` by index, so materialise the set as a list.
paths = list(unique_paths)
print(f'unique paths: {len(paths)}')

processed 10000 files (4.65%)
processed 20000 files (9.30%)
processed 30000 files (13.94%)
processed 40000 files (18.59%)
processed 50000 files (23.24%)
processed 60000 files (27.89%)
processed 70000 files (32.54%)
processed 80000 files (37.18%)
processed 90000 files (41.83%)
processed 100000 files (46.48%)
processed 110000 files (51.13%)
processed 120000 files (55.78%)
processed 130000 files (60.42%)
processed 140000 files (65.07%)
processed 150000 files (69.72%)
processed 160000 files (74.37%)
processed 170000 files (79.01%)
processed 180000 files (83.66%)
processed 190000 files (88.31%)
processed 200000 files (92.96%)
processed 210000 files (97.61%)
data contains 35434964 paths
unique paths: 242602


In [7]:
# Encode all unique paths with the saved encoder and store the result in
# `path_encoding_map` (preprocessed path -> encoder output row).
#
# The paths are processed in chunks so that only one chunk of vectorised
# inputs has to exist in memory at a time.
batch_size = 4096
# Ceiling division: a final, shorter chunk is processed when the number of
# paths is not a multiple of `batch_size`.
batches = (len(paths) + batch_size - 1) // batch_size

for i in range(batches):
    print(f'processing batch {i+1}/{batches}')
    batch_paths = paths[i*batch_size:(i+1)*batch_size]
    batch_input = np.array([
        vectorize_datapoint(path, encoder_char_to_idx, fixed_length)
        for path in batch_paths
    ])
    # Pass `batch_size` explicitly: without it Keras falls back to its default
    # of 32 and would split each chunk into many small prediction steps.
    batch_encodings = encoder.predict(batch_input, batch_size=batch_size, verbose=0)
    # Row k of the prediction belongs to the k-th path of this chunk.
    path_encoding_map.update(zip(batch_paths, batch_encodings))

print(f'encoded {len(path_encoding_map)} paths')

processing batch 1/60
processing batch 2/60
processing batch 3/60
processing batch 4/60
processing batch 5/60
processing batch 6/60
processing batch 7/60
processing batch 8/60
processing batch 9/60
processing batch 10/60
processing batch 11/60
processing batch 12/60
processing batch 13/60
processing batch 14/60
processing batch 15/60
processing batch 16/60
processing batch 17/60
processing batch 18/60
processing batch 19/60
processing batch 20/60
processing batch 21/60
processing batch 22/60
processing batch 23/60
processing batch 24/60
processing batch 25/60
processing batch 26/60
processing batch 27/60
processing batch 28/60
processing batch 29/60
processing batch 30/60
processing batch 31/60
processing batch 32/60
processing batch 33/60
processing batch 34/60
processing batch 35/60
processing batch 36/60
processing batch 37/60
processing batch 38/60
processing batch 39/60
processing batch 40/60
processing batch 41/60
processing batch 42/60
processing batch 43/60
processing batch 44/

In [8]:
# Persist the path -> encoding dictionary as a pickle at `export_file`
# (opened in binary mode; an existing file at that location is overwritten).
with open(export_file, 'wb') as pickle_out:
    pickle.dump(path_encoding_map, pickle_out)