In [1]:
import pandas as pd
import numpy as np
import glob
import os
from sys import platform

# THIS LABELLING SCRIPT IS USED TO LABEL THE CORRECTED VERSION OF CSE-CIC-IDS-2018.
# FOR DETAILS CONSULT OUR WEBSITE:
# https://intrusion-detection.distrinet-research.be/CNS2022/index.html


# Let pandas print up to 100 rows before truncating (used by the value_counts() summaries printed while labelling).
pd.set_option('display.max_rows', 100)

# Enter the path that contains the CSV files that were generated by the CICFlowMeter tool. The directory structure should
# be the following:
# The dataset path should contain separate subdirectories for each day (e.g. "Wednesday-14-02-2018"). In each
# of these directories, there should be a directory called "csv" which contains the CSV files as generated by the
# CICFlowMeter tool.
# Leave empty to work relative to the current working directory. When set, ending the path with a path separator
# (e.g. "/data/cse-cic-ids-2018/") is always safe. The labelled CSV of each day is written next to the day directories.
DATASET_PATH = ""

# If set to True, the written CSV gets a leading 'id' column holding 1-based row numbers
print_index = True

In [2]:
# Basic preprocessing before getting started on labelling.
def format_csv_for_labelling(df):
    """Return a cleaned copy of ``df`` that is ready to be labelled.

    Steps, in order:
      1. every cell equal to the string 'Infinity' is replaced by NaN;
      2. 'Timestamp' is parsed into pandas datetimes;
      3. every column except 'Flow ID', 'Timestamp', 'Src IP', 'Dst IP' and 'Label' is converted to a numeric
         dtype, with unparsable values turned into NaN;
      4. every row that contains at least one NaN is dropped.

    The dataframe passed in is not modified; the original index labels of the surviving rows are kept.

    NOTE(review): only the literal string 'Infinity' is replaced. Values that were already parsed as float inf
    when the CSV was read are presumably left untouched - confirm whether that is intended.
    """
    text_columns = {'Flow ID', 'Timestamp', 'Src IP', 'Dst IP', 'Label'}

    cleaned = df.replace('Infinity', np.nan)
    cleaned['Timestamp'] = pd.to_datetime(cleaned['Timestamp'])

    numeric_columns = [name for name in cleaned.columns if name not in text_columns]
    for name in numeric_columns:
        # errors='coerce' turns anything that is not a number into NaN so that dropna() removes the row below
        cleaned[name] = pd.to_numeric(cleaned[name], errors='coerce')

    return cleaned.dropna()

# Reads all csvs of one day and concatenates them into one dataframe
def read_csvs_from_path_and_reformat(path):
    """Load every ``*.csv`` file directly inside ``path`` into one pre-processed dataframe.

    The files are concatenated (in sorted file-name order), cleaned with ``format_csv_for_labelling``, extended
    with an "Attempted Category" column initialised to -1, and the integer-valued feature columns are cast to
    compact integer dtypes. The label distribution is printed before and after pre-processing.

    Parameters
    ----------
    path : str
        Directory that holds the CSV files of a single day.

    Returns
    -------
    pandas.DataFrame
        The concatenated, cleaned and typed flows of that day.

    Raises
    ------
    ValueError
        If ``path`` contains no CSV file.
    KeyError
        If one of the expected columns is missing from the CSV files.
    """
    # glob returns matches in arbitrary (filesystem-dependent) order. Sorting makes the row order - and therefore
    # the optional 'id' column written by label_rest_as_benign_and_write_csv - reproducible across machines.
    all_files = sorted(glob.glob(path + "/*.csv"))
    if not all_files:
        # pd.concat would fail with an opaque "No objects to concatenate" ValueError; keep the exception type but
        # tell the user what is actually wrong.
        raise ValueError("No CSV files found in '" + path + "' - check DATASET_PATH and the directory structure")

    df = pd.concat([pd.read_csv(file) for file in all_files], ignore_index=True)

    print("labels before pre-processing:", df["Label"].value_counts())
    df = format_csv_for_labelling(df)
    print("labels after pre-processing:", df["Label"].value_counts())

    df["Attempted Category"] = -1

    int64_columns = ["Total TCP Flow Time"]

    int32_columns = ["Src Port", "Dst Port", "Flow Duration", "Total Fwd Packet", "Total Bwd packets", "Total Length of Fwd Packet", "Total Length of Bwd Packet", "Fwd Packet Length Max",
        "Fwd Packet Length Min", "Bwd Packet Length Max", "Bwd Packet Length Min", "Flow IAT Max", "Flow IAT Min", "Fwd IAT Total", "Fwd IAT Max", "Fwd IAT Min", "Bwd IAT Total",
        "Bwd IAT Max", "Bwd IAT Min", "Fwd PSH Flags", "Bwd PSH Flags", "Fwd URG Flags", "Bwd URG Flags", "Packet Length Min", "Packet Length Max", "FIN Flag Count", "SYN Flag Count", "RST Flag Count", "PSH Flag Count",
        "ACK Flag Count", "URG Flag Count", "CWR Flag Count", "ECE Flag Count", "Subflow Fwd Packets", "Subflow Fwd Bytes",
        "Subflow Bwd Packets", "Subflow Bwd Bytes", "FWD Init Win Bytes", "Bwd Init Win Bytes", "Fwd Act Data Pkts", "Fwd Seg Size Min", "Active Max",
        "Active Min", "Idle Max", "Idle Min"]

    int16_columns = ["Fwd Header Length", "Bwd Header Length", "ICMP Code", "ICMP Type"]

    # Cast all integer-valued columns in a single pass. Pre-processing dropped every NaN, so the casts cannot fail
    # on missing values.
    dtype_by_column = {}
    dtype_by_column.update(dict.fromkeys(int64_columns, 'int64'))
    dtype_by_column.update(dict.fromkeys(int32_columns, 'int32'))
    dtype_by_column.update(dict.fromkeys(int16_columns, 'int16'))

    return df.astype(dtype_by_column)


# Main labelling function. Only used for labelling Malicious and Malicious - Attempted flows.
def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,
                dst_ip_list=None, dst_port_list=None, attempted_category=-1, additional_filters=None, payload_filter = False):
    """Label, in place, every flow of ``df`` that matches all the given criteria.

    Matching rows get ``label`` written to their "Label" column and ``attempted_category`` written to their
    "Attempted Category" column. Rows that were labelled by an earlier call are overwritten when they match again,
    so the order of the calls matters. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Flows to label. Must contain the columns "Timestamp", "Label" and "Attempted Category", plus "Src IP",
        "Dst IP", "Dst Port" and "Total Length of Fwd Packet" when the corresponding filter is used.
    label : str
        Label given to the flows matching the criteria.
    attack_start_time_nanoseconds, attack_end_time_nanoseconds : int
        Inclusive bounds of the attack window as Unix time in NANOSECONDS (!). Note that the CSV files are in the
        UTC timezone.
    src_ip_list, dst_ip_list, dst_port_list : list, optional
        When given, only flows whose "Src IP" / "Dst IP" / "Dst Port" is in the list are labelled.
    attempted_category : int, optional
        Please consult our website (https://intrusion-detection.distrinet-research.be/CNS2022/Tools_Documentation.html)
        for details on how the "Attempted" categories are defined. Defaults to -1.
    additional_filters : list of boolean pandas.Series, optional
        Any additional constraints that cannot be covered by the other arguments; all of them must hold. Each
        filter must be indexed like ``df``. See the labelling cells for examples of the syntax.
    payload_filter : bool, optional
        When set to True, this automatically adds the constraint ["Total Length of Fwd Packet"] == 0. Note that
        the Attempted label and category still need to be specified manually.
    """
    attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns')
    attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')

    # The time window is always applied, so it doubles as the initial (row-aligned, boolean) mask.
    custom_mask = (df["Timestamp"] >= attack_start_datetime) & (df["Timestamp"] <= attack_end_datetime)

    if src_ip_list is not None:
        custom_mask &= (df["Src IP"].isin(src_ip_list))
    if dst_ip_list is not None:
        custom_mask &= (df["Dst IP"].isin(dst_ip_list))

    if dst_port_list is not None:
        custom_mask &= (df["Dst Port"].isin(dst_port_list))

    if payload_filter:
        custom_mask &= (df["Total Length of Fwd Packet"] == 0)

    if additional_filters is not None:
        for extra_filter in additional_filters:
            custom_mask &= extra_filter

    # Assign through df.loc instead of df[col].mask(..., inplace=True): the latter is chained assignment through an
    # inplace method, which silently stops modifying df once pandas Copy-on-Write is active.
    df.loc[custom_mask, "Label"] = label
    df.loc[custom_mask, "Attempted Category"] = attempted_category

# This function is called when all labelling of malicious flows is completed. Anything that has not yet received a label
# so far is labelled as Benign.
def label_rest_as_benign_and_write_csv(df, file_to_write):
    """Label all remaining flows as "BENIGN", print the final label statistics and write ``df`` to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Flows of one day after all malicious flows were labelled. Modified in place: the "Label" column is
        updated and, when the module-level ``print_index`` is true, the index is replaced by a 1-based index
        named 'id'.
    file_to_write : str
        Path of the CSV file to create.
    """
    # Assign through df.loc instead of df["Label"].mask(..., inplace=True): chained assignment through an inplace
    # method silently stops modifying df once pandas Copy-on-Write is active.
    df.loc[df["Label"] == "NeedManualLabel", "Label"] = "BENIGN"

    # Relabel artefact flows with [Flow ID] = '8.0.6.4-8.6.0.1-0-0-0' as "BENIGN"
    df.loc[df["Flow ID"] == '8.0.6.4-8.6.0.1-0-0-0', "Label"] = "BENIGN"

    print("label count after labelling:\r\n", df["Label"].value_counts())
    print("Attempted Category count after labelling:\r\n", df["Attempted Category"].value_counts())

    if print_index:
        # Rows were dropped during pre-processing, so renumber them 1..N before writing the index as 'id' column
        df.reset_index(inplace=True, drop=True)
        df.index += 1
        df.index.name = 'id'
        df.to_csv(file_to_write)
    else:
        df.to_csv(file_to_write, index=False)

In [3]:
#----------------------+
# WEDNESDAY 14-02-2018 |
#----------------------+

dir_name = "Wednesday-14-02-2018"
# Build the paths with os.path.join so that DATASET_PATH works with or without a trailing path separator.
wednesday_14022018_df = read_csvs_from_path_and_reformat(os.path.join(DATASET_PATH, dir_name, "csv"))

victim_ips = ["172.31.69.25"]

#-- FTP-BruteForce (every matching flow is labelled as Attempted, category 1)
label_flows(wednesday_14022018_df, "FTP-BruteForce - Attempted", 1518618806*(10**9),
            1518624631*(10**9), ["18.221.219.4"], victim_ips, attempted_category=1)

# FTP-BruteForce - Attempted (tool accidentally got launched in FTP bruteforce mode instead of SSH bruteforce mode)
# Note that, in order to avoid float imprecisions at the micro- and nanosecond level, UNIX timestamps such as
# 1518631281.199541000, which is in seconds, are written in nanoseconds, so that the number is an exact integer
# instead of a float.
label_flows(wednesday_14022018_df, "FTP-BruteForce - Attempted", 1518631281199541000,
            1518631281502585000, ["13.58.98.64"], victim_ips, [21], attempted_category=4)

#-- SSH-BruteForce
ssh_attacker_ips = ["13.58.98.64"]
ssh_start, ssh_end = 1518631310*(10**9), 1518636750*(10**9)

label_flows(wednesday_14022018_df, "SSH-BruteForce", ssh_start, ssh_end, ssh_attacker_ips, victim_ips, [22])

# Payload filter
label_flows(wednesday_14022018_df, "SSH-BruteForce - Attempted", ssh_start, ssh_end, ssh_attacker_ips, victim_ips,
            [22], attempted_category=0, payload_filter=True)

label_rest_as_benign_and_write_csv(wednesday_14022018_df, os.path.join(DATASET_PATH, dir_name + ".csv"))

labels before pre-processing: NeedManualLabel    6268692
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    5898350
Name: Label, dtype: int64
label count after labelling:
 BENIGN                        5610799
FTP-BruteForce - Attempted     193354
SSH-BruteForce                  94197
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    5704996
 1     193324
 4         30
Name: Attempted Category, dtype: int64


In [9]:
#---------------------+
# THURSDAY 15-02-2018 |
#---------------------+

dir_name = "Thursday-15-02-2018"
# Build the paths with os.path.join so that DATASET_PATH works with or without a trailing path separator.
thursday_15022018_df = read_csvs_from_path_and_reformat(os.path.join(DATASET_PATH, dir_name, "csv"))

victim_ips = ["172.31.69.25"]

#-- DoS GoldenEye
goldeneye_attacker_ips = ["18.219.211.138"]
goldeneye_start, goldeneye_end = 1518701262*(10**9), 1518703905*(10**9)

label_flows(thursday_15022018_df, "DoS GoldenEye", goldeneye_start, goldeneye_end, goldeneye_attacker_ips,
            victim_ips, additional_filters=
            [(thursday_15022018_df["Fwd RST Flags"] == 0) |
             (thursday_15022018_df["Flow Duration"] >= 5050000)])

#-- DoS GoldenEye - Attempted (category 4): exactly the flows excluded by the filter above
label_flows(thursday_15022018_df, "DoS GoldenEye - Attempted", goldeneye_start, goldeneye_end, goldeneye_attacker_ips,
            victim_ips, attempted_category=4, additional_filters=
            [thursday_15022018_df["Fwd RST Flags"] > 0,
             thursday_15022018_df["Flow Duration"] < 5050000])

#-- DoS GoldenEye - Attempted (category 6)
label_flows(thursday_15022018_df, "DoS GoldenEye - Attempted", goldeneye_start, goldeneye_end, goldeneye_attacker_ips,
            victim_ips, attempted_category=6, additional_filters=
            [thursday_15022018_df["Bwd RST Flags"] == 1,
             thursday_15022018_df["Total Length of Bwd Packet"] == 0,
             thursday_15022018_df["Flow Duration"] > 100000000])

# Payload filter
label_flows(thursday_15022018_df, "DoS GoldenEye - Attempted", goldeneye_start, goldeneye_end, goldeneye_attacker_ips,
            victim_ips, attempted_category=0, payload_filter=True)

#-- DoS Slowloris
slowloris_attacker_ips = ["18.217.165.70"]
slowloris_start, slowloris_end = 1518706812*(10**9), 1518709321*(10**9)

label_flows(thursday_15022018_df, "DoS Slowloris", slowloris_start, slowloris_end, slowloris_attacker_ips,
            victim_ips)

# Payload filter
label_flows(thursday_15022018_df, "DoS Slowloris - Attempted", slowloris_start, slowloris_end, slowloris_attacker_ips,
            victim_ips, attempted_category=0, payload_filter=True)

label_rest_as_benign_and_write_csv(thursday_15022018_df, os.path.join(DATASET_PATH, dir_name + ".csv"))

labels before pre-processing: NeedManualLabel    5762777
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    5410102
Name: Label, dtype: int64
label count after labelling:
 BENIGN                       5372471
DoS GoldenEye                  22560
DoS Slowloris                   8490
DoS GoldenEye - Attempted       4301
DoS Slowloris - Attempted       2280
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    5403521
 4       4248
 0       2280
 6         53
Name: Attempted Category, dtype: int64


In [5]:
#-------------------+
# FRIDAY 16-02-2018 |
#-------------------+

dir_name = "Friday-16-02-2018"
# Build the paths with os.path.join so that DATASET_PATH works with or without a trailing path separator.
friday_16022018_df = read_csvs_from_path_and_reformat(os.path.join(DATASET_PATH, dir_name, "csv"))

victim_ips = ["172.31.69.25"]

#-- FTP-BruteForce - Attempted
label_flows(friday_16022018_df, "FTP-BruteForce - Attempted", 1518790334*(10**9), 1518793513*(10**9), ["13.59.126.31"],
            victim_ips, attempted_category=1)

#-- DoS Hulk
hulk_attacker_ips = ["18.219.193.20"]
hulk_start, hulk_end = 1518803127*(10**9), 1518803903*(10**9)

label_flows(friday_16022018_df, "DoS Hulk", hulk_start, hulk_end, hulk_attacker_ips, victim_ips)

# Payload filter
label_flows(friday_16022018_df, "DoS Hulk - Attempted", hulk_start, hulk_end, hulk_attacker_ips,
            victim_ips, attempted_category=0, payload_filter=True)

#-- DoS Slowhttptest: No actual DoS Slowhttptest flows are present on this day in this dataset!
#   Instead we only find failed FTP brute-force traffic, which is exactly what is covered earlier in this cell

label_rest_as_benign_and_write_csv(friday_16022018_df, os.path.join(DATASET_PATH, dir_name + ".csv"))

labels before pre-processing: NeedManualLabel    7719001
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    7390266
Name: Label, dtype: int64
label count after labelling:
 BENIGN                        5481500
DoS Hulk                      1803160
FTP-BruteForce - Attempted     105520
DoS Hulk - Attempted               86
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    7284660
 1     105520
 0         86
Name: Attempted Category, dtype: int64


In [6]:
#--------------------+
# TUESDAY 20-02-2018 |
#--------------------+

dir_name = "Tuesday-20-02-2018"
# Build the paths with os.path.join so that DATASET_PATH works with or without a trailing path separator.
tuesday_20022018_df = read_csvs_from_path_and_reformat(os.path.join(DATASET_PATH, dir_name, "csv"))

# The same ten attacker machines and the same victim are used by every attack of this day
attacker_ips = ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
                "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"]
victim_ips = ["172.31.69.25"]

#-- DDoS LOIC HTTP (protocol 6 = TCP)
loic_http_start, loic_http_end = 1519136034*(10**9), 1519139809*(10**9)

label_flows(tuesday_20022018_df, "DDoS-LOIC-HTTP", loic_http_start, loic_http_end,
            attacker_ips, victim_ips, additional_filters=[
        tuesday_20022018_df["Protocol"] == 6
    ])

# Payload filter
label_flows(tuesday_20022018_df, "DDoS-LOIC-HTTP - Attempted", loic_http_start, loic_http_end,
            attacker_ips, victim_ips, attempted_category=0, payload_filter=True,
            additional_filters=[tuesday_20022018_df["Protocol"] == 6])

#-- DDoS LOIC UDP (protocol 17 = UDP)
loic_udp_start, loic_udp_end = 1519146857*(10**9), 1519147756*(10**9)

label_flows(tuesday_20022018_df, "DDoS-LOIC-UDP", loic_udp_start, loic_udp_end,
            attacker_ips, victim_ips, additional_filters=[
        tuesday_20022018_df["Protocol"] == 17])

# Payload filter
label_flows(tuesday_20022018_df, "DDoS-LOIC-UDP - Attempted", loic_udp_start, loic_udp_end,
            attacker_ips, victim_ips, attempted_category=0, payload_filter=True,
            additional_filters=[tuesday_20022018_df["Protocol"] == 17])

# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol = 1 for ICMP)
# Source and destination are swapped on purpose: these flows go from the victim back to the attackers.
label_flows(tuesday_20022018_df, "DDoS-LOIC-UDP - Attempted", loic_udp_start, loic_udp_end,
            victim_ips, attacker_ips,
            attempted_category=6, additional_filters=[(tuesday_20022018_df["Protocol"] == 1)])

label_rest_as_benign_and_write_csv(tuesday_20022018_df, os.path.join(DATASET_PATH, dir_name + ".csv"))

labels before pre-processing: NeedManualLabel    6411771
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    6054702
Name: Label, dtype: int64
label count after labelling:
 BENIGN                       5764497
DDoS-LOIC-HTTP                289328
DDoS-LOIC-UDP                    797
DDoS-LOIC-UDP - Attempted         80
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    6054622
 6         80
Name: Attempted Category, dtype: int64


In [7]:
#----------------------+
# WEDNESDAY 21-02-2018 |
#----------------------+

dir_name = "Wednesday-21-02-2018"
# Build the paths with os.path.join so that DATASET_PATH works with or without a trailing path separator.
wednesday_21022018_df = read_csvs_from_path_and_reformat(os.path.join(DATASET_PATH, dir_name, "csv"))

# The same ten attacker machines and the same victim are used by every attack of this day
attacker_ips = ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
                "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"]
victim_ips = ["172.31.69.28"]

#-- DDoS LOIC UDP (protocol 17 = UDP)
loic_udp_start, loic_udp_end = 1519222131*(10**9), 1519224219*(10**9)

label_flows(wednesday_21022018_df, "DDoS-LOIC-UDP", loic_udp_start, loic_udp_end,
            attacker_ips, victim_ips, additional_filters=[
        wednesday_21022018_df["Protocol"] == 17
    ])

# Payload filter
label_flows(wednesday_21022018_df, "DDoS-LOIC-UDP - Attempted", loic_udp_start, loic_udp_end,
            attacker_ips, victim_ips, attempted_category=0, payload_filter=True,
            additional_filters=[wednesday_21022018_df["Protocol"] == 17])

# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol = 1 for ICMP)
# Source and destination are swapped on purpose: these flows go from the victim back to the attackers.
label_flows(wednesday_21022018_df, "DDoS-LOIC-UDP - Attempted", loic_udp_start, loic_udp_end,
            victim_ips, attacker_ips,
            attempted_category=6, additional_filters=[(wednesday_21022018_df["Protocol"] == 1)])

#-- DDoS HOIC (protocol 6 = TCP)
hoic_start, hoic_end = 1519236668*(10**9), 1519239955*(10**9)

label_flows(wednesday_21022018_df, "DDoS-HOIC", hoic_start, hoic_end,
            attacker_ips, victim_ips, additional_filters=[
        wednesday_21022018_df["Protocol"] == 6
    ])

# Payload filter
label_flows(wednesday_21022018_df, "DDoS-HOIC - Attempted", hoic_start, hoic_end,
            attacker_ips, victim_ips, attempted_category=0, payload_filter=True,
            additional_filters=[wednesday_21022018_df["Protocol"] == 6])

label_rest_as_benign_and_write_csv(wednesday_21022018_df, os.path.join(DATASET_PATH, dir_name + ".csv"))

labels before pre-processing: NeedManualLabel    7295839
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    6962593
Name: Label, dtype: int64
label count after labelling:
 BENIGN                       5878399
DDoS-HOIC                    1082293
DDoS-LOIC-UDP                   1730
DDoS-LOIC-UDP - Attempted        171
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    6962422
 6        171
Name: Attempted Category, dtype: int64


In [8]:
#---------------------+
# THURSDAY 22-02-2018 |
#---------------------+

dir_name = "Thursday-22-02-2018"
# Build the paths with os.path.join so that DATASET_PATH works with or without a trailing path separator.
thursday_22022018_df = read_csvs_from_path_and_reformat(os.path.join(DATASET_PATH, dir_name, "csv"))

# Every web attack of this day uses the same attacker and victim
attacker_ips = ["18.218.115.60"]
victim_ips = ["172.31.69.28"]

#-- Web Attack SQL
sql_start, sql_end = 1519330590418906000, 1519331276022793000

label_flows(thursday_22022018_df, "Web Attack - SQL", sql_start, sql_end, attacker_ips,
            victim_ips, additional_filters=
            [thursday_22022018_df["Total Length of Fwd Packet"] > 0,
             thursday_22022018_df["Total Length of Bwd Packet"] > 0])

# Attack startup artefact
label_flows(thursday_22022018_df, "Web Attack - SQL - Attempted", 1519330470169342000, 1519330498599986000, attacker_ips,
            victim_ips, attempted_category=2)

# Payload filter
label_flows(thursday_22022018_df, "Web Attack - SQL - Attempted", sql_start, sql_end, attacker_ips,
            victim_ips, attempted_category=0, payload_filter=True)

#-- Web Attack XSS
# Port 63782 is attack setup (navigating to website)
xss_start, xss_end = 1519321899783923000, 1519324181827037000

label_flows(thursday_22022018_df, "Web Attack - XSS", xss_start, xss_end, attacker_ips,
            victim_ips, additional_filters=
            [~(thursday_22022018_df["Src Port"].isin([63782, 64144]))])

# Attempted attack setup
label_flows(thursday_22022018_df, "Web Attack - XSS - Attempted", xss_start, xss_end, attacker_ips,
            victim_ips, attempted_category=2, additional_filters=
            [thursday_22022018_df["Src Port"] == 63782])

label_flows(thursday_22022018_df, "Web Attack - XSS - Attempted", xss_start, xss_end, attacker_ips,
            victim_ips, attempted_category=3, additional_filters=
            [thursday_22022018_df["Src Port"] == 64144])

# Payload filter (the two special source ports keep the categories assigned above)
label_flows(thursday_22022018_df, "Web Attack - XSS - Attempted", xss_start, xss_end, attacker_ips,
            victim_ips, attempted_category=0, payload_filter=True, additional_filters=
            [~(thursday_22022018_df["Src Port"].isin([63782, 64144]))])

#-- Web Attack Brute Force & Attempted
brute_force_start, brute_force_end = 1519309071336902000, 1519313039858533000

label_flows(thursday_22022018_df, "Web Attack - Brute Force", brute_force_start, brute_force_end, attacker_ips,
            victim_ips, additional_filters=
            [thursday_22022018_df["Total Fwd Packet"] > 20])

label_flows(thursday_22022018_df, "Web Attack - Brute Force - Attempted", brute_force_start, brute_force_end,
            attacker_ips, victim_ips, attempted_category=5, additional_filters=
            [(thursday_22022018_df["Total Fwd Packet"] <= 20) & (thursday_22022018_df["Total Length of Fwd Packet"] > 0)])

# Separate, earlier time window
label_flows(thursday_22022018_df, "Web Attack - Brute Force - Attempted", 1519308824965705000, 1519308947920399000, attacker_ips,
            victim_ips, attempted_category=2)

# Payload filter
label_flows(thursday_22022018_df, "Web Attack - Brute Force - Attempted", brute_force_start, brute_force_end,
            attacker_ips, victim_ips, attempted_category=0, payload_filter=True)

label_rest_as_benign_and_write_csv(thursday_22022018_df, os.path.join(DATASET_PATH, dir_name + ".csv"))


labels before pre-processing: NeedManualLabel    6483351
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    6071153
Name: Label, dtype: int64
label count after labelling:
 BENIGN                                  6070945
Web Attack - Brute Force - Attempted         76
Web Attack - Brute Force                     69
Web Attack - XSS                             40
Web Attack - SQL                             16
Web Attack - SQL - Attempted                  4
Web Attack - XSS - Attempted                  3
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    6071070
 5         66
 2         12
 0          4
 3          1
Name: Attempted Category, dtype: int64


In [9]:
#-------------------+
# FRIDAY 23-02-2018 |
#-------------------+

dir_name = "Friday-23-02-2018"
# Build the paths with os.path.join so that DATASET_PATH works with or without a trailing path separator.
friday_23022018_df = read_csvs_from_path_and_reformat(os.path.join(DATASET_PATH, dir_name, "csv"))

# Every web attack of this day uses the same attacker and victim
attacker_ips = ["18.218.115.60"]
victim_ips = ["172.31.69.28"]

#-- Web Attack SQL
sql_start, sql_end = 1519412792126122000, 1519413444947957000

label_flows(friday_23022018_df, "Web Attack - SQL", sql_start, sql_end, attacker_ips,
            victim_ips, additional_filters=
            [friday_23022018_df["Total Length of Fwd Packet"] > 0,
             friday_23022018_df["Total Length of Bwd Packet"] > 0])

# Attack startup artefact
label_flows(friday_23022018_df, "Web Attack - SQL - Attempted", 1519412722675686000, 1519412787879296000, attacker_ips,
            victim_ips, attempted_category=2)

# Payload filter
label_flows(friday_23022018_df, "Web Attack - SQL - Attempted", sql_start, sql_end, attacker_ips,
            victim_ips, attempted_category=0, payload_filter=True)

#-- Web Attack XSS
xss_start, xss_end = 1519405264559707000, 1519409428237472000

label_flows(friday_23022018_df, "Web Attack - XSS", xss_start, xss_end, attacker_ips,
            victim_ips, additional_filters=
            [~(friday_23022018_df["Src Port"].isin([59173]))])

label_flows(friday_23022018_df, "Web Attack - XSS - Attempted", xss_start, xss_end, attacker_ips,
            victim_ips, attempted_category=2, additional_filters=
            [(friday_23022018_df["Src Port"].isin([59173]))])

# Payload filter
# NOTE(review): unlike the XSS payload filter of the Thursday 22-02 cell, this call does not exclude the special
# source port (59173), so a flow from that port without forward payload ends up in category 0 instead of 2 -
# confirm that this is intended.
label_flows(friday_23022018_df, "Web Attack - XSS - Attempted", xss_start, xss_end, attacker_ips,
            victim_ips, attempted_category=0, payload_filter=True)

#-- Web Attack Brute Force & Attempted
brute_force_start, brute_force_end = 1519394670193975000, 1519398186406294000

label_flows(friday_23022018_df, "Web Attack - Brute Force", brute_force_start, brute_force_end, attacker_ips,
            victim_ips, additional_filters=
            [friday_23022018_df["Total Fwd Packet"] > 20])

label_flows(friday_23022018_df, "Web Attack - Brute Force - Attempted", brute_force_start, brute_force_end,
            attacker_ips, victim_ips, attempted_category=5, additional_filters=
            [(friday_23022018_df["Total Fwd Packet"] <= 20) & (friday_23022018_df["Total Length of Fwd Packet"] > 0)])

# Payload filter:
label_flows(friday_23022018_df, "Web Attack - Brute Force - Attempted", brute_force_start, brute_force_end,
            attacker_ips, victim_ips, attempted_category=0, payload_filter=True)

label_rest_as_benign_and_write_csv(friday_23022018_df, os.path.join(DATASET_PATH, dir_name + ".csv"))

labels before pre-processing: NeedManualLabel    6313169
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    5976481
Name: Label, dtype: int64
label count after labelling:
 BENIGN                                  5976251
Web Attack - XSS                             73
Web Attack - Brute Force                     62
Web Attack - Brute Force - Attempted         61
Web Attack - SQL                             23
Web Attack - SQL - Attempted                 10
Web Attack - XSS - Attempted                  1
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    5976409
 5         60
 0          6
 2          6
Name: Attempted Category, dtype: int64


In [10]:
#----------------------+
# WEDNESDAY 28-02-2018 |
#----------------------+

dir_name = "Wednesday-28-02-2018"
# Build the paths with os.path.join so that DATASET_PATH works with or without a trailing path separator.
wednesday_28022018_df = read_csvs_from_path_and_reformat(os.path.join(DATASET_PATH, dir_name, "csv"))

# All malicious flows of this day originate from the same internal machine
victim_ips = ["172.31.69.24"]

#-- Infiltration - Dropbox Download
dropbox_ips = ["162.125.3.1", "162.125.3.5", "162.125.3.6", "162.125.248.1", "162.125.18.133"]
dropbox_windows = [(1519828404*(10**9), 1519829172*(10**9)),
                   (1519839771*(10**9), 1519839824*(10**9))]

for window_start, window_end in dropbox_windows:
    label_flows(wednesday_28022018_df, "Infiltration - Dropbox Download", window_start, window_end,
                victim_ips, dropbox_ips)

# Payload filter
for window_start, window_end in dropbox_windows:
    label_flows(wednesday_28022018_df, "Infiltration - Dropbox Download - Attempted", window_start, window_end,
                victim_ips, dropbox_ips, attempted_category=0, payload_filter=True)

# Attempted - Attack artefact
dropbox_artefact_ips = ["104.16.100.29", "104.16.99.29", "52.84.128.3", "52.85.101.236", "52.85.131.81",
                        "52.85.95.206"]

for window_start, window_end in dropbox_windows:
    label_flows(wednesday_28022018_df, "Infiltration - Dropbox Download - Attempted", window_start, window_end,
                victim_ips, dropbox_artefact_ips, attempted_category=4)

#-- Infiltration - Communication Victim Attacker
attacker_ips = ["13.58.225.34"]
communication_windows = [(1519829140*(10**9), 1519834135*(10**9)),
                         (1519839839*(10**9), 1519843200*(10**9))]

for window_start, window_end in communication_windows:
    label_flows(wednesday_28022018_df, "Infiltration - Communication Victim Attacker", window_start, window_end,
                victim_ips, attacker_ips)

# Payload filter
for window_start, window_end in communication_windows:
    label_flows(wednesday_28022018_df, "Infiltration - Communication Victim Attacker - Attempted", window_start,
                window_end, victim_ips, attacker_ips, attempted_category=0, payload_filter=True)

#-- Infiltration - NMAP Portscan (flows with source port 68 are excluded)
label_flows(wednesday_28022018_df, "Infiltration - NMAP Portscan", 1519829182*(10**9), 1519843140746247000,
            victim_ips,
            ["172.31.69.1", "172.31.69.10", "172.31.69.11", "172.31.69.12", "172.31.69.13", "172.31.69.14",
             "172.31.69.16", "172.31.69.17", "172.31.69.19", "172.31.69.20", "172.31.69.23", "172.31.69.4",
             "172.31.69.5", "172.31.69.6", "172.31.69.8", "172.31.69.9", "172.31.69.7", "172.31.69.22",
             "172.31.69.15", "172.31.69.21", "172.31.69.18",], additional_filters=
            [wednesday_28022018_df["Src Port"] != 68])

label_rest_as_benign_and_write_csv(wednesday_28022018_df, os.path.join(DATASET_PATH, dir_name + ".csv"))

labels before pre-processing: NeedManualLabel    7173690
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    6568726
Name: Label, dtype: int64
label count after labelling:
 BENIGN                                          6518882
Infiltration - NMAP Portscan                      49740
Infiltration - Dropbox Download                      46
Infiltration - Communication Victim Attacker         43
Infiltration - Dropbox Download - Attempted          15
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    6568711
 4         15
Name: Attempted Category, dtype: int64


In [11]:
#---------------------+
# THURSDAY 01-03-2018 |
#---------------------+

dir_name = "Thursday-01-03-2018"
thursday_01032018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + "/csv")

#-- Infiltration - Dropbox Download
label_flows(thursday_01032018_df, "Infiltration - Dropbox Download", 1519912390*(10**9), 1519912760*(10**9),
            ["172.31.69.13"], ["162.125.3.1", "162.125.3.6", "162.125.248.1", "162.125.18.133"])

label_flows(thursday_01032018_df, "Infiltration - Dropbox Download", 1519913032*(10**9), 1519918454*(10**9),
            ["172.31.69.13"], ["162.125.3.1", "162.125.3.6", "162.125.248.1", "162.125.18.133"])

# Payload filter
label_flows(thursday_01032018_df, "Infiltration - Dropbox Download - Attempted", 1519912390*(10**9), 1519912760*(10**9),
            ["172.31.69.13"],
            ["162.125.3.1", "162.125.3.6", "162.125.248.1", "162.125.18.133"], attempted_category=0, payload_filter=True)

label_flows(thursday_01032018_df, "Infiltration - Dropbox Download - Attempted", 1519913032*(10**9), 1519918454*(10**9),
            ["172.31.69.13"],
            ["162.125.3.1", "162.125.3.6", "162.125.248.1", "162.125.18.133"], attempted_category=0, payload_filter=True)

# Attempted - Attack artefact
label_flows(thursday_01032018_df, "Infiltration - Dropbox Download - Attempted", 1519912390*(10**9), 1519912760*(10**9),
            ["172.31.69.13"], ["104.16.100.29", "13.32.168.125", "52.85.112.72"], attempted_category=4)

label_flows(thursday_01032018_df, "Infiltration - Dropbox Download - Attempted", 1519913032*(10**9), 1519918454*(10**9),
            ["172.31.69.13"], ["104.16.100.29", "13.32.168.125", "52.85.112.72"], attempted_category=4)

#-- Infiltration - Communication Victim Attacker
# Three labelling windows, written as (start, end) in whole seconds and scaled by 10**9 when they are passed
# to label_flows (presumably epoch nanoseconds - confirm against label_flows).
communication_windows = (
    (1519912674, 1519912745),
    (1519913075, 1519928245),
    (1519928295, 1519933041),
)

# Source 172.31.69.13, destination 13.58.225.34; fresh list literals are built for every call.
for window_start, window_end in communication_windows:
    label_flows(thursday_01032018_df, "Infiltration - Communication Victim Attacker",
                window_start * 10**9, window_end * 10**9,
                ["172.31.69.13"], ["13.58.225.34"])

# Payload filter: same windows and endpoints, labelled as attempted (category 0)
for window_start, window_end in communication_windows:
    label_flows(thursday_01032018_df, "Infiltration - Communication Victim Attacker - Attempted",
                window_start * 10**9, window_end * 10**9,
                ["172.31.69.13"], ["13.58.225.34"],
                attempted_category=0, payload_filter=True)

#-- Infiltration - NMAP Portscan
# TODO: do we not need to filter out DHCP background traffic on port 68 in the NMAP labelling of the previous
#       day as well?
# Last octets of the scanned 172.31.69.x hosts; the order of the destination list is kept as given.
nmap_target_hosts = (1, 11, 12, 16, 8, 9, 10, 14, 4, 5, 6, 17, 20, 23, 24, 19, 7, 15, 18, 22, 21)
nmap_target_ips = ["172.31.69." + str(host) for host in nmap_target_hosts]

# Flows whose source port is 68 are excluded from the portscan label (DHCP background traffic, see TODO).
not_from_port_68 = thursday_01032018_df["Src Port"] != 68

label_flows(thursday_01032018_df, "Infiltration - NMAP Portscan", 1519913388354333000, 1519933092182726000,
            ["172.31.69.13"], nmap_target_ips, additional_filters=[not_from_port_68])

# Everything still unlabelled becomes benign, then the labelled day is written next to its directory.
label_rest_as_benign_and_write_csv(thursday_01032018_df, DATASET_PATH + dir_name + ".csv")

labels before pre-processing: NeedManualLabel    7252549
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    6551401
Name: Label, dtype: int64
label count after labelling:
 BENIGN                                          6511554
Infiltration - NMAP Portscan                      39634
Infiltration - Communication Victim Attacker        161
Infiltration - Dropbox Download                      39
Infiltration - Dropbox Download - Attempted          13
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    6551388
 4         13
Name: Attempted Category, dtype: int64


In [3]:
#-------------------+
# FRIDAY 02-03-2018 |
#-------------------+

dir_name = "Friday-02-03-2018"
friday_02032018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + "/csv")

# Address of the botnet master and the bounds of the labelling window. The bounds are whole seconds scaled
# by 10**9 (presumably epoch nanoseconds - confirm against label_flows).
ares_master_ip = "18.219.211.138"
ares_start = 1520000008 * 10**9
ares_end = 1520020492 * 10**9


def flows_involving_ares_master(flows):
    """Return a boolean mask of the rows whose source or destination IP is the Ares master."""
    return (flows["Src IP"] == ares_master_ip) | (flows["Dst IP"] == ares_master_ip)


#-- Botnet Ares
label_flows(friday_02032018_df, "Botnet Ares", ares_start, ares_end,
            additional_filters=[flows_involving_ares_master(friday_02032018_df)])

#-- Botnet Ares - Attempted: Tear-down artefact. The botnet slave has an ongoing TCP connection to the master,
#   which the master terminates prematurely by sending an RST packet.
teardown_artefact = (
    (friday_02032018_df["Dst IP"] == ares_master_ip)
    & (friday_02032018_df["Total Length of Fwd Packet"] > 0)
    & (friday_02032018_df["Bwd RST Flags"] > 0)
)
label_flows(friday_02032018_df, "Botnet Ares - Attempted", 1520020424 * 10**9, ares_end, attempted_category=2,
            additional_filters=[teardown_artefact])

# Payload filter: flows to or from the master that carry no payload bytes in either direction
without_payload = (
    flows_involving_ares_master(friday_02032018_df)
    & (friday_02032018_df["Total Length of Fwd Packet"] == 0)
    & (friday_02032018_df["Total Length of Bwd Packet"] == 0)
)
label_flows(friday_02032018_df, "Botnet Ares - Attempted", ares_start, ares_end, attempted_category=0,
            additional_filters=[without_payload])

# Everything still unlabelled becomes benign, then the labelled day is written next to its directory.
label_rest_as_benign_and_write_csv(friday_02032018_df, DATASET_PATH + dir_name + ".csv")



labels before pre-processing: NeedManualLabel    6637636
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    6311371
Name: Label, dtype: int64
label count after labelling:
 BENIGN                     6168188
Botnet Ares                 142921
Botnet Ares - Attempted        262
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    6311109
 0        258
 2          4
Name: Attempted Category, dtype: int64
