In [1]:
import os
import re
import pandas as pd
import numpy as np

In [2]:
# Root directory that holds the request captures and the generated datasets
# NOTE(review): absolute, machine-specific path
DATA_PATH = "/home/mathuis/Development/cyber_wolf/data"

# Building blocks of the feature layout: three location flags followed by one
# counter column per tracked character (letters, digits, punctuation)
_LOCATION_COLS = ["path", "header", "body"]
_CHAR_COLS = list(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "[\\]^_`{|}~!\"#$%&'()*+,-./:;<>=?@"
)

# Full column order of the feature frame
FEATUE_DEF = _LOCATION_COLS + ["length"] + _CHAR_COLS

# Columns rescaled by normalize(): everything except the location flags
NORM_COLS = ["length"] + _CHAR_COLS

# Share of the rows that ends up in the test split
TEST_RATIO = 0.2

In [3]:
def read_file_content(data_path):
    """Read a text file and return its lines.

    Args:
        data_path: Path of the file to read.

    Returns:
        List with one string per line; line terminators are kept.
    """
    with open(data_path, "r") as handle:
        lines = list(handle)
    return lines


def extract_features(request):
    """Build the feature frame for one request.

    Runs get_values() on every line of the request and stacks the resulting
    rows in line order.

    Args:
        request: List of the request's lines (as returned by
            read_file_content()).

    Returns:
        DataFrame with the FEATUE_DEF columns and a fresh 0..n-1 index; empty
        (but with the same columns) when no line produced a row.
    """
    # Collect first and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every iteration.
    frames = [get_values(request, i) for i in range(len(request))]
    frames = [frame for frame in frames if not frame.empty]

    if not frames:
        return pd.DataFrame(columns=FEATUE_DEF)

    return pd.concat(frames, ignore_index=True)


def get_values(request, i):
    """Return the feature rows for line ``i`` of a request.

    The line is classified as exactly one of:

    * request line (``i == 0``): query-string values, flagged "path";
    * body line (an empty ``"\\n"`` line occurs before it): form values,
      flagged "body"; blank body lines are skipped;
    * header line (contains ``": "``): the header value, flagged "header".

    Previously the checks were independent, so a request line or body line
    that happened to contain ``": "`` was also counted as a header, and the
    blank-line test compared the already stripped line with ``"\\n"`` and
    therefore never skipped anything.

    Args:
        request: List of the request's lines, terminators included.
        i: Index of the line to process.

    Returns:
        DataFrame with the FEATUE_DEF columns, one row per extracted value
        (possibly empty).
    """
    line = request[i].strip()

    # Request line
    if i == 0:
        # NOTE(review): echoes raw request data (may contain tokens) to the output
        print(line)
        return histogram(split_query(line), "path")

    # Body: everything after the first empty line
    if "\n" in request[0:i]:
        if not line:
            return pd.DataFrame(columns=FEATUE_DEF)
        print(line)
        return histogram(split_body(line), "body")

    # Header
    if ": " in line:
        return histogram(split_header(line), "header")

    return pd.DataFrame(columns=FEATUE_DEF)


def split_query(line):
    """Extract the query-string parameter values from an HTTP request line.

    Args:
        line: Request line such as ``"GET /path?a=1&b=2 HTTP/1.1"``.

    Returns:
        List with the raw (not URL-decoded) value of every ``key=value`` pair
        of the query string, in order. Empty when the line has no request
        target, the target has no query string, or no parameter contains "=".
    """
    parts = line.split(" ")
    if len(parts) < 2:
        # Malformed request line without a target
        return []

    # Only the first "?" starts the query string; later ones belong to values
    _, _, querystring = parts[1].partition("?")

    values = []
    for section in querystring.split("&"):
        # Split on the first "=" only so values containing "=" stay intact;
        # parameters without "=" carry no value and are skipped
        _, separator, value = section.partition("=")
        if separator:
            values.append(value)

    return values


def split_header(line):
    """Extract the value of a ``Name: value`` header line.

    Args:
        line: Header line containing the ``": "`` separator.

    Returns:
        One-element list with everything after the first ``": "``.

    Raises:
        IndexError: If ``line`` does not contain ``": "``.
    """
    # maxsplit=1 keeps values that themselves contain ": " in one piece
    return [line.split(": ", 1)[1]]


def split_body(line):
    """Extract the parameter values from a form-encoded body line.

    Args:
        line: Body line such as ``"a=1&b=2"``.

    Returns:
        List with the raw value of every ``key=value`` pair, in order. A line
        without "&" and without "=" yields ``[""]``; in a line with "&",
        parameters without "=" are skipped.
    """
    if "&" not in line:
        # Split on the first "=" only so values containing "=" stay intact
        _, separator, value = line.partition("=")
        return [value] if separator else [""]

    values = []
    for param in line.split("&"):
        _, separator, value = param.partition("=")
        if separator:
            values.append(value)

    return values


def histogram(values, location):
    """Build one feature row per value: location flag, length, character counts.

    Args:
        values: Iterable of strings to describe.
        location: Name of the flag column to set to 1 ("path", "header" or
            "body" in the calls within this notebook).

    Returns:
        DataFrame with the FEATUE_DEF columns and one row per value. In each
        row ``length`` is ``len(value)``, the ``location`` column is 1 and each
        single-character column holds how often that character occurs in the
        value. Characters without a column (spaces, but also tabs or non-ASCII
        characters, which used to raise KeyError) are not counted.
    """
    # Single-character column names are the histogram bins
    counted = {col for col in FEATUE_DEF if len(col) == 1}

    # Plain dict records instead of writing through ``df.loc[0]``: that Series
    # is not guaranteed to be a view, so the writes could be lost silently.
    records = []
    for value in values:
        record = dict.fromkeys(FEATUE_DEF, 0)
        record["length"] = len(value)
        record[location] = 1

        for char in value:
            if char in counted:
                record[char] += 1

        records.append(record)

    # Built in one go; DataFrame.append was removed in pandas 2.0
    return pd.DataFrame(records, columns=FEATUE_DEF)


def normalize(df):
    """Rescale every NORM_COLS column so that its values sum to one.

    Normalized histogram: each count is divided by the total of its column, so
    a cell holds the proportion of that column's total contributed by the row.
    Columns whose total is 0 are left unchanged.

    Args:
        df: Feature frame containing the NORM_COLS columns. It is modified in
            place.

    Returns:
        The same ``df`` object, with the normalized columns.
    """
    for col in NORM_COLS:
        print(f"Normalizing: {col}")

        # Compute the column total once; recomputing it inside a row-wise
        # apply made this quadratic in the number of rows
        total = df[col].sum()
        if total != 0:
            df[col] = df[col].astype("float64") / total

        print(f"Total: {df[col].sum()}")

    return df


def preprocess(request):
    """Turn a request into a float32 feature matrix.

    Args:
        request: List of the request's lines, passed on to extract_features().

    Returns:
        numpy array of dtype float32 holding the extracted feature rows.
    """
    print("Preprocessing request")
    feature_frame = extract_features(request)
    return np.asarray(feature_frame).astype("float32")

In [4]:
# Create base data frame
df = pd.DataFrame(columns=FEATUE_DEF)

# os.listdir returns the entries in arbitrary order; sort them so every run
# processes the request files in the same sequence
files = sorted(os.listdir(f"{DATA_PATH}/requests"))

print(f"Preprocessing {len(files)} requests")

Preprocessing 253 requests


In [5]:
# Extract the feature rows of every request file, then combine them in one
# step. DataFrame.append was removed in pandas 2.0 and re-copied the whole
# frame on each iteration; rebuilding df from scratch also means re-running
# this cell no longer duplicates rows.
request_frames = []
for file in files:
    # Read request contents
    file_contents = read_file_content(f"{DATA_PATH}/requests/{file}")

    # Extract features from contents
    features = extract_features(file_contents)

    if not features.empty:
        request_frames.append(features)

if request_frames:
    df = pd.concat(request_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=FEATUE_DEF)

T /Clients?clientId=1&handler=DirectToClient HTTP/1.1
CreateClientName=&__RequestVerificationToken=CfDJ8OIrtYvy4zBHhhHh0UxuXICpXiLl2xyptoyvAnpKJCm_v8pckOPbzeQuUXiH_HHLOdoQjmz49_YM1cbmGYJW-opsuYeDuHJ9dMYI-B-EW3gvSOt-r3k0xP5hThUNvroxiMcuTqBiPAAaeWrZy5Po0Vc
POST /Funds HTTP/1.1
FundTag=SRI&__RequestVerificationToken=CfDJ8OIrtYvy4zBHhhHh0UxuXIDEfcO44JVqjCXp7S_fK1YZmB9E3nbgJn27cABnEdnhssG2nrCGNTc9xqxWeHJokX-vbZmIGlsoOAQgQ8L-k8TECxcUSRccx9zz9bd03MmtL_fjXI11JMzed01MMyGtos8
POST /EditPosition?handler=Buy HTTP/1.1
ClientId=2&FundTag=SRI&Ticker=XDWM&Amount=1.2496729&Price=43.89&__RequestVerificationToken=CfDJ8OIrtYvy4zBHhhHh0UxuXIBF-fAzLkh7TEIpHNrYw9rkoqISv8qKr0P8gQyipqrLveTtsyxSUOR6wNH9LmraKZeoIwNxPkSIxH_GxNUMKmcskJuymcC4lWHDXe0nDvbPoFf-4UAPCjGzUw_4geX_HvM
GET /favicon.ico HTTP/1.1
GET /ClientPositions?id=3 HTTP/1.1
GET /favicon.ico HTTP/1.1
GET /css/site.css HTTP/1.1
GET /ClientPositions?id=2 HTTP/1.1
GET /ClientPositions?id=2 HTTP/1.1
POST /ClientPositions?ticker=XDWI&fund=SRI&handler=EditPos

In [6]:
# Shuffle the rows; the fixed seed keeps the order, and therefore the
# train/test split derived from it, reproducible between runs
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [7]:
# Normalization step, currently disabled (the call is kept commented out):
# df = normalize(df)
# Print the frame for a quick visual check of the extracted features
print(df)

     path header body length   a  b  c  d  e  f  ...  -  .  /  :  ;  <  >  =  \
0       0      1    0     43   2  0  1  1  0  0  ...  0  0  3  2  0  0  0  1   
1       0      1    0    135  12  2  4  1  8  1  ...  1  3  8  0  4  0  0  4   
2       0      1    0      2   0  0  0  0  0  0  ...  0  0  0  0  0  0  0  0   
3       1      0    0      1   0  0  0  0  0  0  ...  0  0  0  0  0  0  0  0   
4       0      1    0      1   0  0  0  0  0  0  ...  0  0  0  0  0  0  0  0   
...   ...    ...  ...    ...  .. .. .. .. .. ..  ... .. .. .. .. .. .. .. ..   
4735    0      1    0      2   0  0  0  0  0  0  ...  0  0  0  0  0  0  0  0   
4736    0      1    0      1   0  0  0  0  0  0  ...  0  0  0  0  0  0  0  0   
4737    0      1    0      1   0  0  0  0  0  0  ...  0  0  0  0  0  0  0  0   
4738    0      1    0     10   1  0  0  0  3  0  ...  1  0  0  0  0  0  0  0   
4739    0      1    0      1   0  0  0  0  0  0  ...  0  0  0  0  0  0  0  0   

      ?  @  
0     1  0  
1     0  0  


In [8]:
# Number of leading rows that form the training split; the remainder is the test split
row_total = len(df)
train_count = int(row_total - row_total * TEST_RATIO)

# Slice the frame and convert both parts to float32 matrices
x_train = df[:train_count].to_numpy()
x_test = df[train_count:].to_numpy()

x_train = x_train.astype("float32")
x_test = x_test.astype("float32")

# Persist the full frame as csv and the two splits as .npy files
df.to_csv(f"{DATA_PATH}/datasets/notnorm_dataset.csv")
np.save(f"{DATA_PATH}/datasets/notnorm_x_train", x_train)
np.save(f"{DATA_PATH}/datasets/notnorm_x_test", x_test)