# Michał Gromadzki

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
SEED = 1337
# Seed Python's `random`, NumPy and TensorFlow together so that every source
# of randomness in the notebook is pinned, not only TensorFlow ops.
tf.keras.utils.set_random_seed(SEED)

2023-06-26 16:57:20.305574: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load Data

In [2]:
# Labelled test alerts; columns n1..n10 and `score` are discarded right after loading.
df_test = pd.read_csv("data/cybersecurity_test.csv", sep="|").drop(
    columns=["n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10", "score"]
)
df_test.head()

Unnamed: 0,alert_ids,client_code,categoryname,ip,ipcategory_name,ipcategory_scope,parent_category,grandparent_category,overallseverity,timestamp_dist,...,thrcnt_week,thrcnt_day,p6,p9,p5m,p5w,p5d,p8m,p8w,p8d
0,Slg,RLJ,Exploit,MW.YB.50.64,INTERNET,Internet,7,A,3,0,...,298,42,1,0,1,1,1,1,1,1
1,WKM,UZT,Exploit,IJ.NW.77.74,INTERNET,Internet,7,A,5,0,...,11,3,1,0,1,1,1,1,1,1
2,dkm,ZZW,Attack,YT.LB.36.21,INTERNET,Internet,7,A,3,0,...,3601,602,1,0,3,1,1,1,1,1
3,RIX,QXG,Attack,172.BW.LB.105,PRIV-172,Private network,1,A,3,0,...,12,4,1,0,3,1,1,2,1,1
4,qFU,PDU,Exploit,YT.LB.32.110,INTERNET,Internet,7,A,3,258273,...,131,20,1,0,1,1,1,1,1,1


In [3]:
# Labelled training alerts; columns n1..n10 and `score` are discarded right after loading.
df_train = pd.read_csv("data/cybersecurity_training.csv", sep="|").drop(
    columns=["n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10", "score"]
)
df_train.head()

Unnamed: 0,alert_ids,client_code,notified,categoryname,ip,ipcategory_name,ipcategory_scope,parent_category,grandparent_category,overallseverity,...,thrcnt_week,thrcnt_day,p6,p9,p5m,p5w,p5d,p8m,p8w,p8d
0,Nhq,DPM,0,Attack,YT.LB.32.21,INTERNET,Internet,7,A,3,...,4160,675,1,0,2,1,1,1,1,1
1,XZt,FIN,0,Exploit,192.SL.UK.94,PRIV-192,Private network,1,A,5,...,9,2,4,12,3,2,2,2,1,1
2,bBz,CHP,0,Attack,YT.LB.38.21,INTERNET,Internet,7,A,4,...,3788,628,1,0,2,2,1,2,2,1
3,ZNr,HPS,0,Attack,JX.NY.13.20,INTERNET,Internet,7,A,4,...,565,96,0,0,2,2,2,2,2,2
4,poV,OSC,0,Attack,YT.LB.32.21,INTERNET,Internet,7,A,4,...,2790,632,1,0,1,1,1,1,1,1


In [4]:
# Per-event alert log. Device and raw address/port/protocol columns are dropped,
# leaving alert_ids plus the columns used as model features further down.
main_df = pd.read_csv(
    "data/localized_alerts_data.csv", sep="|", low_memory=False
).drop(
    columns=[
        'devicetype', 'reportingdevice_code', 'devicevendor_code',
        'srcip', 'dstip', 'srcport', 'dstport', 'protocol',
    ]
)
main_df

Unnamed: 0,alert_ids,alerttype,srcipcategory,dstipcategory,srcportcategory,dstportcategory,direction,alerttime,severity,count,domain,username,signature
0,AAB,NAC: Asset Visibility,PRIV-10,INTERNET,1,1,5,0,1,1,0,1,1
1,AAC,ThreatWatch Outbound,PRIV-192,INTERNET,4,2,3,0,5,19,0,1,1
2,AAE,ThreatWatch Outbound,PRIV-10,INTERNET,4,3,3,0,3,10,0,0,1
3,AAL,ThreatWatch Outbound,PRIV-172,INTERNET,4,2,3,0,5,1,0,0,1
4,AAL,ThreatWatch Inbound,INTERNET,INTERNET,2,4,2,1311,5,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8690699,zzu,Suspicious Port Activity,PRIV-192,INTERNET,3,2,3,222527,1,20,0,0,1
8690700,zzu,Suspicious Port Activity,PRIV-192,INTERNET,3,2,3,222580,1,83,0,0,1
8690701,zzx,IDPS Alert,PRIV-10,INTERNET,3,2,3,0,5,2,0,0,1
8690702,zzx,IDPS Alert,PRIV-10,INTERNET,3,2,3,8,5,2,0,1,1


## Preprocess

In [5]:
# Split the alert log into train/test rows according to which labelled set
# each row's alert_ids belongs to.
train_df = main_df.loc[main_df["alert_ids"].isin(df_train["alert_ids"].values)]
test_df = main_df.loc[main_df["alert_ids"].isin(df_test["alert_ids"].values)]

# Keep the ids aside (they are re-attached to the embeddings at the end),
# then remove them from the feature frames.
ai_train = train_df["alert_ids"].values
ai_test = test_df["alert_ids"].values
train_df = train_df.drop(columns=["alert_ids"])
test_df = test_df.drop(columns=["alert_ids"])

In [6]:
# Categorical columns: each one is integer-encoded and fed through its own
# embedding branch of the encoder.
tc2 = [
    'alerttype',
    'srcipcategory',
    'dstipcategory',
    'srcportcategory',
    'dstportcategory',
    'direction',
    'severity',
    'domain',
    'username',
    'signature',
]

In [7]:
def _encode_column(train_values, test_values):
    """Integer-encode one categorical column.

    The vocabulary is built from the training values only. Every distinct
    training value gets the id of its position in ``np.unique(train_values)``;
    any value that is not in the vocabulary (including NaN) gets the extra
    out-of-vocabulary id ``len(vocabulary)``.

    Returns:
        (stoi, train_ids, test_ids) where ``stoi`` maps value -> id and the
        two id arrays are 1-D integer NumPy arrays aligned with the inputs.
    """
    vocab = np.unique(train_values)
    stoi = {ch: i for i, ch in enumerate(vocab)}
    oov_id = len(stoi)
    vocab_index = pd.Index(vocab)

    def to_ids(values):
        # Hash-based lookup of the whole column at once instead of a
        # per-element Python loop; get_indexer returns -1 for unknown values.
        codes = vocab_index.get_indexer(values)
        # NaN never matched a dictionary key in the element-wise lookup,
        # so it is routed to the OOV id as well.
        unseen = (codes < 0) | pd.isna(values)
        return np.where(unseen, oov_id, codes)

    return stoi, to_ids(train_values), to_ids(test_values)


dicts = []          # value -> id mapping, one per column in tc2
results_train = []  # encoded training ids, one 1-D array per column
results_test = []   # encoded test ids, one 1-D array per column
for col in tc2:
    stoi, train_ids, test_ids = _encode_column(train_df[col].values, test_df[col].values)
    dicts.append(stoi)
    results_train.append(train_ids)
    results_test.append(test_ids)

In [8]:
# From here on `dicts` holds the vocabulary size of each categorical column
# instead of the mapping itself.
# NOTE: this overwrites `dicts`, so the cell cannot be re-run without
# re-running the tokenization cell first.
dicts = [len(mapping) for mapping in dicts]

# Turn every encoded column into a 2-D column vector with one row per sample.
results_train = [np.array(codes).reshape((len(codes), -1)) for codes in results_train]
results_test = [np.array(codes).reshape((len(codes), -1)) for codes in results_test]

In [9]:
# Sanity check: there should be one encoded array per column listed in tc2.
len(results_train)

10

In [10]:
# Everything that is not a tokenized column is treated as a continuous feature.
x_train = train_df.loc[:, ~train_df.columns.isin(tc2)].to_numpy()
x_test = test_df.loc[:, ~test_df.columns.isin(tc2)].to_numpy()

In [11]:
# Reconstruction targets: the continuous features followed by the encoded
# categorical columns, in tc2 order.
y_train = np.hstack([x_train, *results_train])
y_test = np.hstack([x_test, *results_test])

In [12]:
# Scale the targets the network has to reproduce. The scaler is fitted on the
# training targets only; the test targets reuse the training statistics.
scaler = MinMaxScaler()
scaler.fit(y_train)
y_train = scaler.transform(y_train)
y_test = scaler.transform(y_test)

## Model

In [13]:
class emb_layer(tf.keras.layers.Layer):
    """Embedding branch for a single tokenized column.

    Looks up an ``n_embed``-dimensional vector for each token id, flattens it
    and projects it through a ReLU dense layer with ``units`` outputs.

    Args:
        units: width of the dense projection.
        n_embed: size of the embedding vectors.
        dict: vocabulary size of the column (number of distinct ids seen in
            training). The name shadows the builtin but is kept so existing
            positional/keyword callers continue to work.
    """

    def __init__(self, units, n_embed, dict):
        super().__init__()
        self.dict = dict
        # One extra row: id == vocabulary size is used for unseen values.
        # `input_length` is intentionally not passed: it is a deprecated
        # argument and has no effect on the lookup performed in `call`.
        self.embedding = tf.keras.layers.Embedding(self.dict + 1, n_embed)
        self.flat = tf.keras.layers.Flatten()
        self.mlp = tf.keras.layers.Dense(units, activation="relu")

    def call(self, x):
        """Map a batch of token ids to the projected embedding features."""
        x = self.embedding(x)
        x = self.flat(x)
        x = self.mlp(x)
        return x

In [14]:
class Encoder(tf.keras.layers.Layer):
    """Compress one alert row into a 4-dimensional code.

    The continuous features go through a small 2 -> 1 dense path, each
    tokenized column goes through its own ``emb_layer`` branch, and the
    concatenation of all of them is reduced by two dense layers.

    Args:
        units: output width of every embedding branch.
        n_embed: embedding vector size used by every branch.
        dicts: one entry per tokenized column, forwarded to ``emb_layer``
            as its ``dict`` argument.
    """

    def __init__(self, units, n_embed, dicts):
        super().__init__()
        # One branch per tokenized column (attribute name kept as-is).
        self.emebd_layers = [emb_layer(units, n_embed, vocab_size) for vocab_size in dicts]
        self.conc = tf.keras.layers.Concatenate()
        # Dense path for the continuous features.
        self.mlp1 = tf.keras.layers.Dense(2, activation="relu")
        self.mlp2 = tf.keras.layers.Dense(1, activation="relu")
        # Width of the concatenation: `units` per branch plus the single
        # output of the continuous path.
        self.main_mlp1 = tf.keras.layers.Dense(len(dicts) * units + 1, activation="relu")
        self.main_mlp2 = tf.keras.layers.Dense(4, activation="relu")

    def call(self, inputs):
        """Encode ``(main_inputs, emb_inputs)``.

        ``emb_inputs`` is indexed by position: element ``i`` is fed to the
        ``i``-th embedding branch.
        """
        main_inputs, emb_inputs = inputs

        branch_outputs = []
        for idx, branch in enumerate(self.emebd_layers):
            branch_outputs.append(branch(emb_inputs[idx]))

        dense_path = self.mlp2(self.mlp1(main_inputs))

        merged = self.conc([dense_path, *branch_outputs])
        return self.main_mlp2(self.main_mlp1(merged))

In [15]:
class Decoder(tf.keras.layers.Layer):
    """Reconstruct the scaled feature vector from the encoder's code.

    Args:
        hidden_units: width of the intermediate ReLU layer (default 8).
        output_dim: number of reconstructed features (default 12). It must
            equal the number of columns of the target array the model is
            trained against.
    """

    def __init__(self, hidden_units=8, output_dim=12):
        super().__init__()
        self.mlp1 = tf.keras.layers.Dense(hidden_units, activation="relu")
        # Sigmoid keeps every reconstructed value inside (0, 1).
        self.mlp2 = tf.keras.layers.Dense(output_dim, activation="sigmoid")

    def call(self, x):
        """Decode a batch of codes into reconstructed features."""
        x = self.mlp1(x)
        x = self.mlp2(x)
        return x

In [16]:
class MyModel(tf.keras.Model):
    """Autoencoder: an ``Encoder`` followed by a ``Decoder``.

    The constructor arguments are passed unchanged to ``Encoder``; the
    decoder is created with its default configuration.
    """

    def __init__(self, units, n_embed, dicts):
        super().__init__()
        self.encoder = Encoder(units, n_embed, dicts)
        self.decoder = Decoder()

    def call(self, inputs):
        """Run the inputs through the encoder, then the decoder."""
        return self.decoder(self.encoder(inputs))

In [17]:
# 16-unit embedding branches built on 4-dimensional embeddings,
# one branch per entry of `dicts`.
model = MyModel(units=16, n_embed=4, dicts=dicts)

2023-06-26 16:58:52.475170: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-26 16:58:52.741444: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-26 16:58:52.741554: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-26 16:58:52.753501: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-26 16:58:52.753681: I tensorflow/compile

In [18]:
# Train as a plain reconstruction task: mean squared error, Adam optimizer.
model.compile(loss='mse', optimizer='adam')

In [19]:
# Fit the autoencoder: inputs are the continuous features plus the list of
# encoded categorical columns, targets are the scaled feature matrix.
# NOTE(review): `validation_split` presumably holds out the trailing 1% of
# rows rather than a random sample - confirm against the Keras documentation.
model.fit(
    x=[x_train, results_train],
    y=y_train,
    epochs=5,
    batch_size=4096,
    validation_split=0.01,
)

Epoch 1/5


2023-06-26 16:59:11.538113: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f6fa80389b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-06-26 16:59:11.538229: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 2060 with Max-Q Design, Compute Capability 7.5
2023-06-26 16:59:11.657317: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-06-26 16:59:13.812937: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8901
2023-06-26 16:59:15.732734: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f714cf66b00>

In [20]:
# Show the per-layer parameter counts of the fitted model.
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           multiple                  27947     
                                                                 
 decoder (Decoder)           multiple                  148       
                                                                 
Total params: 28,095
Trainable params: 28,095
Non-trainable params: 0
_________________________________________________________________


## Save

In [21]:
# Create embeddings for the test data with the trained encoder.
# The data is pushed through the encoder in slices of `batch_size` rows; the
# per-slice outputs are collected and stacked once at the end, which avoids
# re-copying the whole accumulated array on every iteration.
batch_size = 8192
chunks = []
for i in range(0, len(x_test), batch_size):
    tmp = [item[i:i+batch_size] for item in results_test]
    chunks.append(model.encoder([x_test[i:i+batch_size], tmp]).numpy())
# With no rows at all, fall back to the same empty array used before.
result = np.vstack(chunks) if chunks else np.array([])

In [22]:
# Put the alert id in front of each embedding row.
# NOTE: this overwrites `result`, so re-running the cell would prepend the
# id column a second time.
result = np.hstack([ai_test[:, np.newaxis], result])
result

array([['AAN', 172.01123046875, 91.82072448730469, 0.0,
        33.66152572631836],
       ['AAN', 167.68844604492188, 89.56840515136719, 0.0,
        33.00950241088867],
       ['AAN', 208.74884033203125, 105.85759735107422, 0.0,
        51.52685546875],
       ...,
       ['zzu', 41.152740478515625, 12.852371215820312, 0.0,
        34.546714782714844],
       ['zzx', 172.01060485839844, 91.82027435302734, 0.0,
        33.6612548828125],
       ['zzx', 167.68841552734375, 89.56839752197266, 0.0,
        33.00950622558594]], dtype=object)

In [23]:
# Persist the (alert id, embedding) rows for the test set.
# NOTE(review): `result` is displayed above with dtype=object, so the file
# presumably has to be read back with `allow_pickle=True` - confirm in the consumer.
np.save("data/result_test.npy", result)

In [24]:
# Create embeddings for the training data with the trained encoder.
# The data is pushed through the encoder in slices of `batch_size` rows; the
# per-slice outputs are collected and stacked once at the end, which avoids
# re-copying the whole accumulated array on every iteration.
batch_size = 8192
chunks = []
for i in range(0, len(x_train), batch_size):
    tmp = [item[i:i+batch_size] for item in results_train]
    chunks.append(model.encoder([x_train[i:i+batch_size], tmp]).numpy())
# With no rows at all, fall back to the same empty array used before.
result = np.vstack(chunks) if chunks else np.array([])

In [25]:
# Put the alert id in front of each embedding row.
# NOTE: this overwrites `result`, so re-running the cell would prepend the
# id column a second time.
result = np.hstack([ai_train[:, np.newaxis], result])
result

array([['AAB', 0.0, 0.0, 0.0, 24.944808959960938],
       ['AAC', 186.409423828125, 90.85149383544922, 0.0,
        67.08934020996094],
       ['AAE', 76.779296875, 60.41259765625, 0.0, 30.33623504638672],
       ...,
       ['zzs', 168.84225463867188, 21.373615264892578, 0.0,
        54.669708251953125],
       ['zzs', 50.104827880859375, 17.268516540527344, 0.0,
        13.768631935119629],
       ['zzz', 189.653564453125, 92.14424133300781, 0.0,
        67.52375030517578]], dtype=object)

In [26]:
# Persist the (alert id, embedding) rows for the training set.
# NOTE(review): `result` is displayed above with dtype=object, so the file
# presumably has to be read back with `allow_pickle=True` - confirm in the consumer.
np.save("data/result_train.npy", result)