### Parser
Parser functions that extract individual HTTP requests, together with their URL and body parameters, from the dataset.

In [53]:

import re
import string
import urllib.parse as urlparse
from setuptools import Feature

# Regular expressions that cut individual requests out of the raw dataset text,
# keyed by HTTP method name. Each pattern starts at the method name and lazily
# consumes characters (newlines included) until the next "GET"/"POST" token or
# the end of the input.
# Raw strings are used so that "\Z" is not treated as an (invalid) Python
# string escape; the regex engine itself interprets "\n" and "\Z".
HTTP_METHODS = {
    "GET": r"GET(.|\n)+?(?=GET|POST|\Z)",
    "POST": r"POST(.|\n)+?(?=GET|POST|\Z)"
}

# Matches from "http" up to (but not including) the next space character.
URL_REGEX = r"http.+?(?= )"
# Matches text on a single line that is directly preceded and directly followed
# by an empty line (two consecutive newlines on each side).
BODY_REGEX = r"(?<=\n\n).+(?=\n\n)"



def parse(path, request_reg: str):
    """Return every match of ``request_reg`` found in the file at ``path``.

    The file is opened in text mode with the default encoding and read fully
    into memory before matching.

    Args:
        path: Path of the dataset file to read.
        request_reg: Regular expression describing one request; it is applied
            with ``re.MULTILINE``.

    Returns:
        A list with the full text of each match, in order of appearance.
        The list is empty when nothing matches.
    """
    with open(path) as file:
        data = file.read()

    # Build the result directly instead of appending from inside a throwaway
    # list comprehension.
    return [match.group(0) for match in re.finditer(request_reg, data, re.MULTILINE)]

def parseParamsFromUrl(request):
    """Return the query-string parameters of the URL contained in ``request``.

    The URL is located with ``parseUrl``; its query component is decoded with
    ``urllib.parse.parse_qs``, so the result maps each parameter name to the
    list of its values.
    """
    query = urlparse.urlparse(parseUrl(request)).query
    return urlparse.parse_qs(query)

def parseUrl(request):
    """Return the first substring of ``request`` that matches ``URL_REGEX``.

    Raises:
        AttributeError: if ``request`` contains no match (``re.search``
            returns ``None`` in that case).
    """
    url_match = re.search(URL_REGEX, request)
    return url_match.group(0)

def parseParamsFromBody(request):
    """Return the parameters found in the body of ``request``.

    The body is the first match of ``BODY_REGEX``; it is decoded with
    ``urllib.parse.parse_qs``, so the result maps each parameter name to the
    list of its values. An empty dict is returned when no body is found.
    """
    body_match = re.search(BODY_REGEX, request)

    # Guard clause: a request without a matching body has no body parameters.
    if body_match is None:
        return {}

    return urlparse.parse_qs(body_match.group(0))


### Features
Calculator classes extract a single numeric feature from a given string.
To keep the feature set easy to extend, every feature has its own class that takes care of its calculation.

In [54]:
import string
from abc import ABCMeta, abstractmethod


class FeatureCalculator(metaclass=ABCMeta):
    """Abstract base class for objects that compute one feature of a string.

    Subclasses must implement ``transform``; the class itself cannot be
    instantiated because ``transform`` is abstract.
    """

    def __init__(self):
        pass

    @abstractmethod
    def transform(self, s: str, request_type: bool = False):
        """Compute the feature value for ``s``.

        Args:
            s: The text to compute the feature for.
            request_type: ``True`` when ``s`` is a complete request rather
                than a fragment of one (such as a parameter name).

        Returns:
            The feature value computed by the concrete subclass.
        """
        pass


In [55]:
class LengthFeatureCalculator(FeatureCalculator):
    """Feature: total number of characters in the string."""

    # The inherited constructor is sufficient; no extra state is needed.

    def transform(self, s: str, request_type: bool = False):
        """Return ``len(s)``; ``request_type`` is ignored."""
        return len(s)


In [56]:
class LettersFeatureCalculator(FeatureCalculator):
    """Feature: number of alphabetic characters in the string."""

    # The inherited constructor is sufficient; no extra state is needed.

    def transform(self, s: str, request_type: bool = False):
        """Return how many characters of ``s`` satisfy ``str.isalpha()``.

        ``request_type`` is ignored.
        """
        return sum(c.isalpha() for c in s)
    

In [58]:
class NonAlphaFeatureCalculator(FeatureCalculator):
    """Feature: number of non-alphabetic characters in the string."""

    # The inherited constructor is sufficient; no extra state is needed.

    def transform(self, s: str, request_type: bool = False):
        """Return how many characters of ``s`` do not satisfy ``str.isalpha()``.

        Digits, punctuation and whitespace all count. ``request_type`` is
        ignored.
        """
        return sum((not c.isalpha()) for c in s)
    

In [57]:
class PathLengthFeatureCalculator(FeatureCalculator):
    """Feature: length of the URL contained in a request."""

    # The inherited constructor is sufficient; no extra state is needed.

    def transform(self, s: str, request_type: bool = False):
        """Return the length of the URL that ``parseUrl`` extracts from ``s``.

        The feature is only defined for complete requests: when
        ``request_type`` is falsy, 0 is returned without inspecting ``s``.
        """
        if not request_type:
            return 0

        return len(parseUrl(s))
    

In [59]:
class PathNonAlphaFeatureCalculator(FeatureCalculator):
    """Feature: number of non-alphabetic characters in a request's URL."""

    # The inherited constructor is sufficient; no extra state is needed.

    def transform(self, s: str, request_type: bool = False):
        """Count the characters of the URL in ``s`` that are not alphabetic.

        The URL is extracted with ``parseUrl``. The feature is only defined
        for complete requests: when ``request_type`` is falsy, 0 is returned
        without inspecting ``s``.
        """
        if not request_type:
            return 0

        return sum((not c.isalpha()) for c in parseUrl(s))



In [60]:
import collections
import math

class EntropyFeatureCalculator(FeatureCalculator):
    """Feature: Shannon entropy (in bits) of the string's character distribution."""

    # The inherited constructor is sufficient; no extra state is needed.

    def transform(self, s: str, request_type: bool = False):
        """Return ``-sum(p * log2(p))`` over the relative frequency ``p`` of each character.

        An empty string has no characters to sum over and yields 0.
        ``request_type`` is ignored.
        """
        # The length is constant across the sum, so compute it once.
        length = len(s)
        return (-1) * sum(
            count / length * math.log2(count / length)
            for count in collections.Counter(s).values())
    

In [61]:
class DigitsFeatureCalculator(FeatureCalculator):
    """Feature: number of digit characters in the string."""

    # The inherited constructor is sufficient; no extra state is needed.

    def transform(self, s: str, request_type: bool = False):
        """Return how many characters of ``s`` satisfy ``str.isdigit()``.

        ``request_type`` is ignored.
        """
        return sum(c.isdigit() for c in s)

In [62]:
class ArgumentsLengthFeatureCalculator(FeatureCalculator):
    """Feature: combined length of the parameter names of a request."""

    # The inherited constructor is sufficient; no extra state is needed.

    def transform(self, s: str, request_type: bool = False):
        """Return the summed length of all distinct parameter names in ``s``.

        Parameters are collected from the URL query string and from the body.
        A name that appears in both places is counted once. Only the names
        contribute to the total; parameter values are not measured.
        The feature is only defined for complete requests: when
        ``request_type`` is falsy, 0 is returned without inspecting ``s``.
        """
        if not request_type:
            return 0

        urlParams = parseParamsFromUrl(s)
        bodyParams = parseParamsFromBody(s)

        # Merging the dicts de-duplicates names shared by URL and body.
        params = {**urlParams, **bodyParams}

        return sum(len(name) for name in params)
    

In [63]:
class ArgumentsNumberFeatureCalculator(FeatureCalculator):
    """Feature: number of distinct parameters of a request."""

    # The inherited constructor is sufficient; no extra state is needed.

    def transform(self, s: str, request_type: bool = False):
        """Return how many distinct parameter names ``s`` contains.

        Parameters are collected from the URL query string and from the body.
        A name that appears in both places is counted once.
        The feature is only defined for complete requests: when
        ``request_type`` is falsy, 0 is returned without inspecting ``s``.
        """
        if not request_type:
            return 0

        urlParams = parseParamsFromUrl(s)
        bodyParams = parseParamsFromBody(s)

        # Merging the dicts de-duplicates names shared by URL and body.
        params = {**urlParams, **bodyParams}

        return len(params)


### Classifier
Classifier class that trains one One-Class SVM per HTTP method on the training dataset and uses it to classify the requests of a given dataset.


In [76]:
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import normalize


class AnomalyClassifier():
    """One-Class SVM anomaly detector that keeps one fitted model per HTTP method.

    Every request is turned into a feature vector: the features of the whole
    request first, followed by one group of features per URL parameter and per
    body parameter. Requests therefore produce vectors of different lengths;
    ``balance_X`` zero-pads them into a rectangular matrix.
    """

    def __init__(self, training_data_path):
        """Store the training file path and set up the feature calculators.

        Args:
            training_data_path: Path of the file the models are trained on.
        """
        self.training_data_path = training_data_path
        self.param_features_allowed = True
        self.bodyparams_features_allowed = True

        self.feature_calculators = [
            LengthFeatureCalculator(),
            DigitsFeatureCalculator(),
            LettersFeatureCalculator(),
            NonAlphaFeatureCalculator(),
            EntropyFeatureCalculator(),
            ArgumentsLengthFeatureCalculator(),
            ArgumentsNumberFeatureCalculator(),
            PathLengthFeatureCalculator(),
            PathNonAlphaFeatureCalculator()
        ]

        # HTTP method name -> fitted classifier.
        self.classifiers = {}
        # HTTP method name -> number of feature columns that classifier was
        # fitted on; classify() pads/truncates to this width.
        self.feature_widths = {}

    def train_all_methods(self, nu=0.1, kernel="rbf", gamma=0.1):
        """Train one classifier for every entry of ``HTTP_METHODS``."""
        for method, regex in HTTP_METHODS.items():
            self.train(method, regex, nu=nu, kernel=kernel, gamma=gamma)

    def train_get_method(self):
        """Train only the GET classifier, using the default SVM parameters."""
        # The method name is the dictionary key itself.
        self.train("GET", HTTP_METHODS["GET"])

    def train_post_method(self):
        """Train only the POST classifier, using the default SVM parameters."""
        self.train("POST", HTTP_METHODS["POST"])

    def train(self, method: str, regex: str, nu=0.1, kernel="rbf", gamma=0.1):
        """Fit a One-Class SVM on the training requests selected by ``regex``.

        Args:
            method: Name under which the fitted classifier is stored.
            regex: Regular expression that extracts the requests of this
                method from the training file.
            nu, kernel, gamma: Passed through to ``OneClassSVM``.

        Raises:
            ValueError: if the training file contains no matching request.
        """
        requests = parse(self.training_data_path, regex)

        X = []

        print("training for " + method + " method")

        for request in requests:
            X.append(self._request_features(request))
            if len(X) % 5000 == 0:
                print(str(len(X)) + " examples processed")

        if not X:
            raise ValueError("no " + method + " requests found in " + str(self.training_data_path))

        X = self.balance_X(X)

        classif = OneClassSVM(nu=nu, kernel=kernel, gamma=gamma, cache_size=500)

        print("fitting the classifier")
        classif.fit(X)
        self.classifiers[method] = classif
        # Remember the column count so that test matrices can be aligned to it.
        self.feature_widths[method] = X.shape[1]

    def classify(self, path):
        """Classify every request found in the file at ``path``.

        Requests are grouped by HTTP method (in ``HTTP_METHODS`` order) and
        each group is passed to the classifier trained for that method.

        Returns:
            A one-dimensional array with the concatenated predictions of all
            methods; empty when the file contains no request at all.

        Raises:
            KeyError: if requests of a method are present but no classifier
                has been trained for it.
        """
        classification = []

        for method, regex in HTTP_METHODS.items():
            requests = parse(path, regex)

            print("test evaluation with " + method + " method for " + str(len(requests)) + " requests")

            if not requests:
                # Nothing to predict for this method.
                continue

            X = [self._request_features(request) for request in requests]

            # Align to the width seen during training; padding to this batch's
            # own maximum could give a different column count than the model.
            X = self.balance_X(X, width=self.feature_widths.get(method))

            classification.append(self.classifiers[method].predict(X))

        if not classification:
            return np.array([], dtype=int)

        return np.concatenate(classification)

    def calculate_features(self, request: str, param_features_allowed: bool, bodyparam_features_allowed: bool):
        """Build the flat feature list of one request.

        The list starts with every calculator applied to the whole request.
        When enabled, it continues with every calculator applied to each URL
        parameter, then to each body parameter.

        NOTE: the per-parameter features are computed from the parameter
        *names*; parameter values are not inspected.
        """
        features = [calculator.transform(request, True) for calculator in self.feature_calculators]

        param_names = []
        if param_features_allowed:
            param_names.extend(parseParamsFromUrl(request))
        if bodyparam_features_allowed:
            param_names.extend(parseParamsFromBody(request))

        for name in param_names:
            features.extend(calculator.transform(name) for calculator in self.feature_calculators)

        return features

    def balance_X(self, X, width=None):
        """Pack variable-length feature lists into a zero-padded 2-D array.

        Args:
            X: Sequence of feature sequences.
            width: Number of columns of the result. Defaults to the length of
                the longest row (0 when ``X`` is empty). Rows longer than
                ``width`` are truncated.

        Returns:
            A float array of shape ``(len(X), width)``.
        """
        if width is None:
            width = max((len(row) for row in X), default=0)

        X_processed = np.zeros([len(X), width])
        for i, row in enumerate(X):
            kept = row[:width]
            X_processed[i, :len(kept)] = kept

        return X_processed

    def _request_features(self, request):
        """Compute the features of ``request`` using this instance's feature switches."""
        return self.calculate_features(
            request,
            param_features_allowed=self.param_features_allowed,
            bodyparam_features_allowed=self.bodyparams_features_allowed)


### Evaluation class

In [77]:

# Dataset files used by Evaluator: the classifier is trained on
# TRAIN_DATA_PATH and then evaluated separately on the two test files.
# The paths are relative, so they resolve against the current working directory.
TRAIN_DATA_PATH = "data/normalTrafficTraining.txt"
TEST_DATA_NORMAL_PATH = "data/normalTrafficTest.txt"
TEST_DATA_ANOMALY_PATH = "data/anomalousTrafficTest.txt"
# Prediction values that Evaluator counts as "anomalous" and "normal".
# NOTE(review): presumably these mirror the -1 (outlier) / +1 (inlier)
# convention of OneClassSVM.predict — confirm against the scikit-learn docs.
ANOMALIES_LABEL = -1
NORMAL_LABEL = 1

class Evaluator():
    """Trains an AnomalyClassifier and prints its results on both test datasets."""

    def __init__(self):
        pass

    def evaluate_performance(self, nu, kernel, gamma):
        """Train with the given SVM parameters and print a two-part report.

        The first part classifies the anomalous test file and reports how many
        requests were flagged as anomalies. The second part classifies the
        normal test file and reports how many requests were accepted as
        normal. Nothing is returned; all results go to standard output.
        """
        separator = "-----------------"

        print("[RUNNING EVALUATION FOR {nu: " + str(nu) + ", kernel: " + str(kernel) + ", gamma: " + str(gamma) + "}]")
        print("[Training phase]")
        classifier = AnomalyClassifier(TRAIN_DATA_PATH)
        classifier.train_all_methods(nu=nu, kernel=kernel, gamma=gamma)
        print("- classifier trained")
        print(separator)

        # --- anomalous traffic: every request should be flagged ---
        print("[Testing phase for ANOMALIES dataset]")
        predictions = classifier.classify(TEST_DATA_ANOMALY_PATH)
        total = len(predictions)
        flagged = predictions[predictions == ANOMALIES_LABEL].size
        accepted = predictions[predictions == NORMAL_LABEL].size

        print("- results: ")
        print("  - anomalies ratio: " + str(flagged) + " / " + str(total))
        print("  - anomalies percentage cover: " + str((flagged / total) * 100) + "%")
        print("  - error rate: " + str((accepted / total) * 100) + "%")
        print(separator)

        # --- normal traffic: every request should be accepted ---
        print("[Testing phase for NORMAL dataset]")
        predictions = classifier.classify(TEST_DATA_NORMAL_PATH)
        total = len(predictions)
        flagged = predictions[predictions == ANOMALIES_LABEL].size
        accepted = predictions[predictions == NORMAL_LABEL].size

        print("- results: ")
        print("  - normal ratio: " + str(accepted) + " / " + str(total))
        print("  - normal percentage cover: " + str((accepted / total) * 100) + "%")
        print("  - error rate: " + str((flagged / total) * 100) + "%")
        print(separator)
        print(separator)



## Run the evaluation on different configuration parameters

In [78]:
# svm values
NU = [0.1] # [0.1, 0.01, 0.001, 0.0001]
GAMMA = [0.1] # [0.1, 0.01, 0.001, 0.0001]

evaluator = Evaluator()

# Evaluate every (nu, gamma) combination with the RBF kernel; nu varies slowest.
for nu, gamma in [(nu_value, gamma_value) for nu_value in NU for gamma_value in GAMMA]:
    evaluator.evaluate_performance(nu, "rbf", gamma)



[RUNNING EVALUATION FOR {nu: 0.1, kernel: rbf, gamma: 0.1}]
[Training phase]
training for GET method
5000 examples processed
10000 examples processed
15000 examples processed
20000 examples processed
25000 examples processed
training for POST method
5000 examples processed
- classifier trained
-----------------
[Testing phase for ANOMALIES dataset]
test evaluation with GET method for 15088 requests
test evaluation with POST method for 9580 requests
- results: 
  - anomalies ratio: 23299 / 24668
  - anomalies percentage cover: 94.45029998378466%
  - error rate: 5.54970001621534%
-----------------
[Testing phase for NORMAL dataset]
test evaluation with GET method for 28000 requests
test evaluation with POST method for 8000 requests
- results: 
  - normal ratio: 29777 / 36000
  - normal percentage cover: 82.71388888888889%
  - error rate: 17.28611111111111%
-----------------
-----------------
