### Parser
Parser functions that extract the individual HTTP requests, together with their URL and body parameters, from the dataset.

In [8]:

import re
import string
import urllib.parse as urlparse
from setuptools import Feature

# Per-method regexes that cut a raw traffic dump into individual requests.
# Each pattern starts at the method keyword and lazily consumes any characters
# (newlines included) up to the next "GET"/"POST" or the end of the input.
# Raw strings are used so that `\Z` reaches the regex engine without being an
# invalid Python string escape.
# NOTE: the lookahead matches "GET"/"POST" anywhere in the text, not only at the
# start of a request line.
HTTP_METHODS = {
    "GET": r"GET(.|\n)+?(?=GET|POST|\Z)",
    "POST": r"POST(.|\n)+?(?=GET|POST|\Z)"
}

# Shortest run starting at "http" and ending right before the next space.
URL_REGEX = r"http.+?(?= )"
# A run of non-newline characters enclosed by a blank line ("\n\n") on both sides.
BODY_REGEX = r"(?<=\n\n).+(?=\n\n)"



def parse(path, request_reg: str):
    """Read the file at ``path`` and return every substring matched by ``request_reg``.

    :param path: path of the text file holding the raw traffic dump.
    :param request_reg: regular expression describing one request; it is applied
        with ``re.MULTILINE``.
    :return: list with the full text of each match, in file order.
    """
    with open(path) as file:
        data = file.read()

    # Build the result directly instead of appending from a throw-away comprehension.
    return [match.group(0) for match in re.finditer(request_reg, data, re.MULTILINE)]

def parseParamsFromUrl(request):
    """Return the query-string parameters of the request's URL.

    The URL is taken from ``parseUrl(request)``; the result is the dict produced
    by ``urllib.parse.parse_qs`` (parameter name -> list of values).
    """
    query_string = urlparse.urlparse(parseUrl(request)).query
    return urlparse.parse_qs(query_string)

def parseUrl(request):
    """Return the first URL found in ``request``, or ``""`` when there is none.

    The URL is whatever ``URL_REGEX`` matches first. A missing URL is reported as
    an empty string (mirroring the ``None`` guard in ``parseParamsFromBody``) so a
    request fragment without a URL yields empty/zero values in the callers
    instead of an ``AttributeError``.
    """
    match = re.search(URL_REGEX, request)
    if match is None:
        return ""
    return match.group(0)

def parseParamsFromBody(request):
    """Return the parameters encoded in the request body.

    The body is the first match of ``BODY_REGEX``; it is decoded with
    ``urllib.parse.parse_qs``. An empty dict is returned when no body is found.
    """
    match = re.search(BODY_REGEX, request)
    if match is None:
        return {}
    return urlparse.parse_qs(match.group(0))


### Features
Calculator classes extract a single numeric feature from a given string.
To keep the feature set easy to extend, every feature has its own class that takes care of its calculation.

In [9]:
import string
from abc import ABCMeta, abstractmethod


class FeatureCalculator(metaclass=ABCMeta):
    """Abstract base of all feature calculators; each subclass computes one feature."""

    def __init__(self):
        """Nothing to set up; present so subclasses can chain ``super().__init__()``."""
        return None

    @abstractmethod
    def transform(self, s: string, request_type=False):
        """Compute the feature value for ``s``.

        :param s: text to analyse.
        :param request_type: callers in this notebook pass True when ``s`` is a
            whole raw request and leave the default when it is a parameter name.
        """
        return None


In [10]:
class LengthFeatureCalculator(FeatureCalculator):
    """Feature: total number of characters in the string."""

    def __init__(self):
        super().__init__()

    def transform(self, s: string, request_type=False):
        """Return ``len(s)``; ``request_type`` is ignored."""
        character_count = len(s)
        return character_count


In [11]:
class LettersFeatureCalculator(FeatureCalculator):
    """Feature: number of alphabetic characters in the string."""

    def __init__(self):
        super().__init__()

    def transform(self, s: string, request_type=False):
        """Count the characters for which ``str.isalpha`` is true; ``request_type`` is ignored."""
        return sum(1 for char in s if char.isalpha())
    

In [12]:
class NonAlphaFeatureCalculator(FeatureCalculator):
    """Feature: number of non-alphabetic characters in the string."""

    def __init__(self):
        super().__init__()

    def transform(self, s: string, request_type=False):
        """Count the characters for which ``str.isalpha`` is false; ``request_type`` is ignored."""
        return sum(1 for char in s if not char.isalpha())
    

In [13]:
class PathLengthFeatureCalculator(FeatureCalculator):
    """Feature: length of the URL found in a request."""

    def __init__(self):
        super().__init__()

    def transform(self, s: string, request_type=False):
        """Return the length of ``parseUrl(s)``, or 0 when ``request_type`` is falsy."""
        if request_type:
            url = parseUrl(s)
            return len(url)

        return 0
    

In [14]:
class PathNonAlphaFeatureCalculator(FeatureCalculator):
    """Feature: number of non-alphabetic characters in the URL found in a request."""

    def __init__(self):
        super().__init__()

    def transform(self, s: string, request_type=False):
        """Count non-alphabetic characters of ``parseUrl(s)``; 0 when ``request_type`` is falsy."""
        if request_type:
            url = parseUrl(s)
            return sum(1 for char in url if not char.isalpha())

        return 0



In [15]:
import collections
import math

class EntropyFeatureCalculator(FeatureCalculator):
    """Feature: Shannon entropy (base 2) of the character frequencies of the string."""

    def __init__(self):
        super().__init__()

    def transform(self, s: string, request_type=False):
        """Return ``-sum(p * log2(p))`` over the relative frequency ``p`` of each character.

        An empty string produces 0 because there are no characters to sum over;
        ``request_type`` is ignored.
        """
        total = len(s)
        weighted_logs = 0
        for occurrences in collections.Counter(s).values():
            weighted_logs += occurrences / total * math.log2(occurrences / total)
        return (-1) * weighted_logs
    

In [21]:
class DigitsFeatureCalculator(FeatureCalculator):
    """Feature: number of digit characters in the string."""

    def __init__(self):
        super().__init__()

    def transform(self, s: string, request_type=False):
        """Count the characters for which ``str.isdigit`` is true; ``request_type`` is ignored."""
        return sum(1 for char in s if char.isdigit())

In [17]:
class ArgumentsLengthFeatureCalculator(FeatureCalculator):
    """Feature: combined length of all distinct parameter names of a request."""

    def __init__(self):
        super().__init__()

    def transform(self, s: string, request_type=False):
        """Sum the lengths of the parameter names found in the URL and the body.

        Names present in both places are counted once because the two dicts are
        merged. Returns 0 when ``request_type`` is falsy.
        """
        if request_type:
            merged = dict(parseParamsFromUrl(s))
            merged.update(parseParamsFromBody(s))
            return sum(len(name) for name in merged)

        return 0
    

In [18]:
class ArgumentsNumberFeatureCalculator(FeatureCalculator):
    """Feature: number of distinct parameter names of a request."""

    def __init__(self):
        super().__init__()

    def transform(self, s: string, request_type=False):
        """Count the parameter names found in the URL and the body.

        Names present in both places are counted once because the two dicts are
        merged. Returns 0 when ``request_type`` is falsy.
        """
        if request_type:
            merged = dict(parseParamsFromUrl(s))
            merged.update(parseParamsFromBody(s))
            return len(merged)

        return 0


### Classifier
Classifier class that trains one One-Class SVM per HTTP method and uses it to classify the requests of the given datasets.


In [19]:
import numpy as np
from sklearn.svm import OneClassSVM


class AnomalyClassifier():
    """Per-HTTP-method anomaly detector built on One-Class SVMs.

    For every method in ``HTTP_METHODS`` a separate ``OneClassSVM`` is fitted on
    feature vectors computed from the requests of the training file.
    ``classify`` then predicts a label for every request of another file.
    """

    def __init__(self, training_data_path):
        """
        :param training_data_path: path of the traffic file used for training.
        """
        self.training_data_path = training_data_path
        # Switches for the per-parameter-name features added in calculate_features.
        self.param_features_allowed = True
        self.bodyparams_features_allowed = True

        self.feature_calculators = [
            LengthFeatureCalculator(),
            DigitsFeatureCalculator(),
            LettersFeatureCalculator(),
            NonAlphaFeatureCalculator(),
            EntropyFeatureCalculator(),
            ArgumentsLengthFeatureCalculator(),
            ArgumentsNumberFeatureCalculator(),
            PathLengthFeatureCalculator(),
            PathNonAlphaFeatureCalculator()
        ]

        # method name -> fitted OneClassSVM
        self.classifiers = {}
        # method name -> column count of the matrix that classifier was fitted on
        self.feature_widths = {}

    def train_all_methods(self):
        """Train one classifier for every entry of ``HTTP_METHODS``."""
        for method, regex in HTTP_METHODS.items():
            self.train(method, regex)

    def train_get_method(self):
        """Train only the classifier for GET requests."""
        # The method name is the dictionary key; calling str.index() on the
        # regex (as before) raised TypeError.
        self.train("GET", HTTP_METHODS["GET"])

    def train_post_method(self):
        """Train only the classifier for POST requests."""
        self.train("POST", HTTP_METHODS["POST"])

    def train(self, method: str, regex: str):
        """Fit and store the classifier for one HTTP method.

        :param method: key under which the classifier is stored.
        :param regex: pattern handed to ``parse`` to extract the requests.
        :raises ValueError: when the training file contains no matching request.
        """
        requests = parse(self.training_data_path, regex)

        X = []

        print("training for " + method + " method")

        if not requests:
            raise ValueError("no " + method + " requests found in " + str(self.training_data_path))

        for request in requests:
            X.append(self.calculate_features(request, param_features_allowed=self.param_features_allowed, bodyparam_features_allowed=self.bodyparams_features_allowed))
            if len(X) % 1000 == 0:
                print(str(len(X)) + " examples processed")

        X = self.balance_X(X)

        classif = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)

        classif.fit(X)
        self.classifiers[method] = classif
        # Remember the training width so classify() can build matrices of the
        # same shape.
        self.feature_widths[method] = X.shape[1]

    def classify(self, path):
        """Predict a label for every request in the file at ``path``.

        Requests are processed method by method, in ``HTTP_METHODS`` order, and
        the per-method predictions are concatenated in that order. Methods with
        no request in the file are skipped.

        :return: 1-D numpy array of the classifiers' predictions (empty when the
            file holds no request at all).
        :raises KeyError: when requests of a method exist but no classifier was
            trained for it.
        """
        classification = []

        for method, regex in HTTP_METHODS.items():
            requests = parse(path, regex)

            print("test evaluation with " + method + " method for " + str(len(requests)) + " requests")

            if not requests:
                continue

            classifier = self.classifiers[method]

            X = [
                self.calculate_features(request, param_features_allowed=self.param_features_allowed, bodyparam_features_allowed=self.bodyparams_features_allowed)
                for request in requests
            ]

            # Pad/trim to the training width: the SVM only accepts matrices with
            # the column count it was fitted on.
            X = self.balance_X(X, width=self.feature_widths.get(method))

            classification.append(classifier.predict(X))

        if not classification:
            return np.array([], dtype=int)

        return np.concatenate(classification)

    def calculate_features(self, request: str, param_features_allowed: bool, bodyparam_features_allowed: bool):
        """Build the (variable-length) feature vector of one request.

        The vector starts with every calculator applied to the whole request and
        is followed, when enabled, by every calculator applied to each URL
        parameter name and then to each body parameter name.
        """
        features = [calculator.transform(request, True) for calculator in self.feature_calculators]

        if param_features_allowed:
            features.extend(self._parameter_name_features(parseParamsFromUrl(request)))

        if bodyparam_features_allowed:
            features.extend(self._parameter_name_features(parseParamsFromBody(request)))

        return features

    def _parameter_name_features(self, params):
        """Apply every calculator to each parameter name of ``params``, in order."""
        return [
            calculator.transform(name)
            for name in params
            for calculator in self.feature_calculators
        ]

    def balance_X(self, X, width=None):
        """Turn ragged feature lists into a rectangular, zero-padded matrix.

        :param X: list of feature lists of possibly different lengths.
        :param width: number of columns of the result. Defaults to the length of
            the longest row. Longer rows are truncated to ``width``, which drops
            their trailing features.
        :return: numpy array of shape ``(len(X), width)``.
        """
        if width is None:
            width = max((len(row) for row in X), default=0)

        X_processed = np.zeros([len(X), width])
        for i, row in enumerate(X):
            used = min(len(row), width)
            X_processed[i, :used] = row[:used]

        return X_processed


### Run the evaluation

In [24]:

# Locations of the training file and the two test files.
TRAIN_DATA_PATH = "data/normalTrafficTraining.txt"
TEST_DATA_NORMAL_PATH = "data/normalTrafficTest.txt"
TEST_DATA_ANOMALY_PATH = "data/anomalousTrafficTest.txt"

# Train one classifier per HTTP method on the training file.
classifier = AnomalyClassifier(TRAIN_DATA_PATH)
classifier.train_all_methods()

# Label every request of the anomalous test file.
Y = classifier.classify(TEST_DATA_ANOMALY_PATH)

# Predictions equal to -1 are the ones counted as anomalies.
anomalies = int((Y == -1).sum())

print("anomalies ratio: " + str(anomalies) + " / " + str(len(Y)))
print("anomalies percentage cover: " + str((anomalies / len(Y)) * 100) + "%")

training for GET method
1000 examples processed
2000 examples processed
3000 examples processed
4000 examples processed
5000 examples processed
6000 examples processed
7000 examples processed
8000 examples processed
9000 examples processed
10000 examples processed
11000 examples processed
12000 examples processed
13000 examples processed
14000 examples processed
15000 examples processed
16000 examples processed
17000 examples processed
18000 examples processed
19000 examples processed
20000 examples processed
21000 examples processed
22000 examples processed
23000 examples processed
24000 examples processed
25000 examples processed
26000 examples processed
27000 examples processed
28000 examples processed
training for POST method
1000 examples processed
2000 examples processed
3000 examples processed
4000 examples processed
5000 examples processed
6000 examples processed
7000 examples processed
8000 examples processed
test evaluation with GET method for 15088 requests
test evaluation w