<h4>TextCNN</h4>

<h5>Imports</h5>

In [57]:
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, unquote_plus
import tensorflow as tf
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Input, Dropout
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Model
import jsonlines
import os
import json

In [7]:
# Reduce every raw capture to the fields used downstream and write the result
# to dataset/<name>_clean.txt, one JSON object per line.
file_names = ['malicious', 'normal']
for name in file_names:
    data = []
    with jsonlines.open('dataset/' + name + '.txt') as reader:
        for line in reader:
            request = line['request']
            is_post = request['method'] == 'POST'
            # Any method other than POST is recorded as GET.
            cleaned = {'method': 'POST' if is_post else 'GET', 'uri': request['uri']}
            if is_post:
                # Only POST records keep their body.
                cleaned['body'] = request['body']
            cleaned['headers'] = request['headers']
            # NOTE(review): the input key is spelled 'metdata' - presumably this
            # matches the raw files; confirm before "fixing" the spelling.
            data.append({'request': cleaned, 'metadata': line['metdata']})
    with jsonlines.open('dataset/' + name + '_clean.txt', mode='w') as writer:
        writer.write_all(data)

<h4>Load and create datasets</h4>

In [62]:
# Flatten every cleaned request into a single string:
#   "<METHOD> <uri> [<body>] <headers as JSON>"
file_names = ['dataset/malicious_clean.txt', 'dataset/normal_clean.txt']
data = {}
for path in file_names:
    samples = data[path] = []
    with jsonlines.open(path) as reader:
        for record in reader:
            request = record['request']
            if request['method'] == 'POST':
                fields = ['POST', request['uri'], request['body'], json.dumps(request['headers'])]
            else:
                fields = ['GET', request['uri'], json.dumps(request['headers'])]
            samples.append(' '.join(fields))

malicious = data[file_names[0]]
normal = data[file_names[1]]

# The first 180000 normal requests are the training set; the remaining normal
# requests plus every malicious request form the test set.
normal_part2 = normal[:180000]
normal_part1 = normal[180000:]

train_examples = normal_part2
test_examples = normal_part1 + malicious

# Label 0 = normal, 1 = malicious (the training set contains normal traffic only).
train_labels = [0] * len(train_examples)
test_labels = [0] * len(normal_part1) + [1] * len(malicious)

# URL-decode each flattened request before handing it to tf.data.
dataset_train = tf.data.Dataset.from_tensor_slices(
    ([unquote_plus(example) for example in train_examples], train_labels))
dataset_test = tf.data.Dataset.from_tensor_slices(
    ([unquote_plus(example) for example in test_examples], test_labels))


def preprocess_text(text, label):
    """Lower-case a request string and split it into whitespace-separated tokens.

    Every occurrence of one of the characters ``= ? / ( ) { } [ ] < >`` is
    padded with a space on each side before splitting, so each of them ends up
    as a token of its own.

    Args:
        text: string tensor holding the request text.
        label: value paired with ``text``; passed through unchanged.

    Returns:
        A ``(tokens, label)`` tuple where ``tokens`` is the output of
        ``tf.strings.split`` applied to the padded text.
    """
    text = tf.strings.lower(text)
    # A single character-class pass pads all delimiters at once. This produces
    # the same string as padding them one after another (the inserted spaces
    # never match the class) but needs one regex op per element instead of 11.
    text = tf.strings.regex_replace(text, r"([=?/(){}\[\]<>])", r" \1 ")
    text = tf.strings.split(text)
    return text, label

def preprocess_text_substitution(text, label):
    """Abstract a request string into character-class placeholders and tokenize it.

    After lower-casing, every run of letters becomes ``a`` and every run of
    digits becomes ``n``; any run of two or more adjacent ``a``/``n``
    placeholders is then collapsed to ``x``. Finally each of the characters
    ``= ? / ( ) { } [ ] < >`` is padded with spaces and the text is split on
    whitespace.

    Args:
        text: string tensor holding the request text.
        label: value paired with ``text``; passed through unchanged.

    Returns:
        A ``(tokens, label)`` tuple where ``tokens`` is the output of
        ``tf.strings.split`` applied to the abstracted text.
    """
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, "[a-zA-Z]+", "a")   # letter run  -> "a"
    text = tf.strings.regex_replace(text, "[0-9]+", "n")      # digit run   -> "n"
    text = tf.strings.regex_replace(text, "(a|n){2,}", "x")   # mixed alnum -> "x"
    # A single character-class pass pads all delimiters at once. This produces
    # the same string as padding them one after another (the inserted spaces
    # never match the class) but needs one regex op per element instead of 11.
    text = tf.strings.regex_replace(text, r"([=?/(){}\[\]<>])", r" \1 ")
    text = tf.strings.split(text)
    return text, label


# Tokenize both splits twice: once keeping the raw tokens and once with the
# letter/digit substitution applied.
train, test = (split.map(preprocess_text) for split in (dataset_train, dataset_test))
train_sub, test_sub = (
    split.map(preprocess_text_substitution) for split in (dataset_train, dataset_test)
)

# Show the first training example of each variant (substituted first, then raw).
for preview in (train_sub, train):
    for text, label in preview.take(1):
        print(text.numpy(), label.numpy())

[b'a' b'/' b'a' b'/' b'a' b'/' b'a' b'{' b'"a"' b'=' b'"a"' b'}' b'{'
 b'"a-a":' b'"a",' b'"a":' b'"a' b'/' b'a,*' b'/' b'*",' b'"a-a":' b'"a,'
 b'a,' b'a",' b'"a-a":' b'"a' b'/' b'n.n' b'(' b'a' b'a' b'n.n;' b'x;'
 b'x' b')' b'a' b'/' b'n.n' b'(' b'a,' b'a' b'a' b')' b'a' b'/' b'n.n.n.n'
 b'a' b'/' b'n.n",' b'"a":' b'"a' b'=' b'x;' b'a' b'=' b'x"' b'}'] 0
[b'post' b'/' b'rest' b'/' b'products' b'/' b'reviews' b'{' b'"id"' b'='
 b'"ajgoztlb"' b'}' b'{' b'"accept-language":' b'"de",' b'"accept":'
 b'"text' b'/' b'css,*' b'/' b'*",' b'"accept-encoding":' b'"gzip,'
 b'deflate,' b'br",' b'"user-agent":' b'"mozilla' b'/' b'5.0' b'('
 b'windows' b'nt' b'10.0;' b'win64;' b'x64' b')' b'applewebkit' b'/'
 b'537.36' b'(' b'khtml,' b'like' b'gecko' b')' b'chrome' b'/'
 b'72.0.3626.121' b'safari' b'/' b'537.36",' b'"cookie":' b'"phpsessid'
 b'=' b'33ptq3esqvyehncdmubiygychvpzr7gw;' b'continuecode' b'='
 b'8dywbdjgirzm76ncpx2hf8kbrclgrd76"' b'}'] 0


2023-04-22 12:54:15.094414: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [180000]
	 [[{{node Placeholder/_1}}]]
2023-04-22 12:54:15.156692: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [180000]
	 [[{{node Placeholder/_1}}]]


<h4>Vocabularies</h4>

In [49]:
# Vectorizer for the raw-token stream: at most 10000 vocabulary entries, each
# output padded or truncated to 256 integer ids.
# NOTE(review): `standardize` is left at its default, which strips punctuation;
# that presumably erases the delimiter tokens isolated during preprocessing -
# confirm whether `standardize=None` is wanted here.
vectorization = TextVectorization(
    max_tokens=10000,
    output_mode="int",
    output_sequence_length=256,
    pad_to_max_tokens=True
)

# Vectorizer for the substituted stream; a 1000-entry vocabulary is used there.
vectorization_sub = TextVectorization(
    max_tokens=1000,
    output_mode="int",
    output_sequence_length=256,
    pad_to_max_tokens=True
)

# Fit the vocabularies on the TRAINING split only. `adapt()` resets the layer
# state on every call, so the former follow-up `adapt()` on the test split
# discarded the training vocabulary and rebuilt it from test data alone (which
# also leaks test information into the model's input pipeline).
vectorization.adapt(train.map(lambda x, y: x))
vectorization_sub.adapt(train_sub.map(lambda x, y: x))

2023-04-22 09:56:42.309485: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [180000]
	 [[{{node Placeholder/_1}}]]
2023-04-22 10:00:34.952810: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [180000]
	 [[{{node Placeholder/_1}}]]
2023-04-22 10:04:22.175285: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [117

<h4>Autoencoder</h4>

Epoch 1/10


2023-04-22 13:05:22.147547: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [180000]
	 [[{{node Placeholder/_1}}]]


ValueError: in user code:

    File "/home/konpr/anaconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/home/konpr/anaconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/konpr/anaconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/home/konpr/anaconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "/home/konpr/anaconda3/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/konpr/anaconda3/envs/tf/lib/python3.9/site-packages/keras/engine/input_spec.py", line 235, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_4' (type Sequential).
    
    Input 0 of layer "global_average_pooling1d_3" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 64)
    
    Call arguments received by layer 'sequential_4' (type Sequential):
      • inputs=tf.Tensor(shape=(None,), dtype=float32)
      • training=True
      • mask=None


[b'a' b'/' b'a' b'/' b'a' b'/' b'a' b'{' b'"a"' b'=' b'"a"' b'}' b'{'
 b'"a-a":' b'"a",' b'"a":' b'"a' b'/' b'a,*' b'/' b'*",' b'"a-a":' b'"a,'
 b'a,' b'a",' b'"a-a":' b'"a' b'/' b'n.n' b'(' b'a' b'a' b'n.n;' b'x;'
 b'x' b')' b'a' b'/' b'n.n' b'(' b'a,' b'a' b'a' b')' b'a' b'/' b'n.n.n.n'
 b'a' b'/' b'n.n",' b'"a":' b'"a' b'=' b'x;' b'a' b'=' b'x"' b'}']


2023-04-22 12:59:34.394929: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [180000]
	 [[{{node Placeholder/_1}}]]


In [12]:
# Vocabulary size cap and fixed output length for the vectorizer below.
max_features = 10000
sequence_length = 250

# `custom_standardization` and `custom_split` are defined outside this excerpt;
# they replace the layer's built-in standardization and splitting steps.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    split=custom_split,  # the missing trailing comma here was a SyntaxError
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [23]:
# Spot-check preprocess_data (defined outside this excerpt) on a percent-encoded
# path-traversal string; the recorded cell output is the decoded, lower-cased path.
preprocess_data('%2e%2e%2f%2e%2e%2f%2e%2e%2f%2e%2e%2fvar/lib/MAlocate/mlocate.db')

'../../../../var/lib/malocate/mlocate.db'