In [1]:
import pandas as pd
import numpy as np
import glob
import os
from sys import platform

# THIS LABELLING SCRIPT IS USED TO LABEL THE OLD VERSION OF CSE-CIC-IDS-2017. THIS VERSION SHOULD ONLY BE USED IF YOU
# WISH TO RECREATE OUR RESULTS AS REPORTED IN OUR PAPER: https://intrusion-detection.distrinet-research.be/CNS2022/index.html

# THIS SCRIPT ACCEPTS AS INPUT THE ORIGINAL CSVs AS RELEASED BY THE DATASET AUTHORS: https://www.unb.ca/cic/datasets/ids-2018.html


pd.set_option('display.max_rows', 100)


DATASET_PATH = ""

# unset to remove line index (to refer to line numbers when writing final csv)
print_index = True

In [2]:
def format_csv_for_labelling(df):
    # strip leading whitespaces in column names
    df.columns = df.columns.str.lstrip(" ")

    print("labels before pre-processing:", df["Label"].value_counts())

    # Keep track of header rows that occur in the middle of the flow traces. Drop them
    # temporarily for ease of labeling and dataframe manipulation and then merge them
    # back in at the very end. The intention is to preserve the original published files
    # exactly except with the corrected labelling. This makes lining up mismatches between
    # the original and corrected version easier, using line number as the reference.
    # This is for 2018 version only, as the 2017 version does not contain header rows in
    # the middle of flow traces.
    header_rows = df[(df["Timestamp"] == "Timestamp") & (df.index > 0)]
    df = df.drop(header_rows.index)

    # Since CICIDS 2018 authors used 12-hour format but removed AM/PM, we need to reconstruct it
    # We do this based on the knowledge they collected traffic from roughly 9:00 AM to 5:00 PM.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S')
    #for i, item in enumerate(df['Timestamp']):
    #    try:
    #        new_item = pd.to_datetime(item, format='%d/%m/%Y %H:%M:%S')
    #    except ValueError:
    #        print('ERROR at index {}: {}'.format(i, item))

    df['Timestamp'] = df['Timestamp'].apply(lambda x: x + pd.DateOffset(hours=12) if x.hour < 7 else x)

    # Convert to UTC from New Brunswick winter timezone (UTC-4)
    df['Timestamp'] = df['Timestamp'] + pd.DateOffset(hours=4)

    for column in df.columns:
        if column not in ['Flow ID' , 'Timestamp', 'Src IP', 'Dst IP', 'Label']:
            df[column] = pd.to_numeric(df[column])

    # Add attempted category column and initialise to -1
    df["Attempted Category"] = -1

    # CICIDS 2018 author-released version comes prelabelled. This makes sure previous labels don't interfere
    df["Label"] = "NeedManualLabel"

    print("labels after pre-processing:", df["Label"].value_counts())

    return df, header_rows

def read_csvs_from_path_and_reformat(path):
    df = pd.read_csv(path + "/merged.csv")

    df, header_rows = format_csv_for_labelling(df)

    return df, header_rows

# Important note: you should not use the also_flip_flow_direction if you set the additional_filters with a "Fwd" or "Bwd"
# column filtering
def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,
                dst_ip_list= None, src_port_list=None, dst_port_list=None, attempted_category = -1, additional_filters=[],
                also_flip_flow_direction=False, payload_filter=False):
    # Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything

    # Create initial mask with all values set to True. Squeeze is necessary to remove second axis (with value 1)
    # The reason is that a df of shape (X,) gets converted to (1,X) if you '&' it with a df of shape (X,1)
    custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()

    # Need to round the start time down to the nearest second to prevent edge-case issues with flows being mislabelled as benign
    attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns').floor(freq='S')
    attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')

    custom_mask &= (df["Timestamp"] >= attack_start_datetime)
    custom_mask &= (df["Timestamp"] <= attack_end_datetime)

    if src_ip_list is not None:
        custom_mask &= (df["Src IP"].isin(src_ip_list))
    if dst_ip_list is not None:
        custom_mask &= (df["Dst IP"].isin(dst_ip_list))

    if src_port_list is not None:
        custom_mask &= (df["Src Port"].isin(src_port_list))
    if dst_port_list is not None:
        custom_mask &= (df["Dst Port"].isin(dst_port_list))

    # IMPORTANT NOTE: If you decide to add TotLen Fwd Pkt == 6 for catching RST packets, you still have to manually alter some additional_filters for flipped flows where
    # you couldn't use payload_filter boolean function input value
    if payload_filter:
        custom_mask &= (df["TotLen Fwd Pkts"] == 0)

    for filter in additional_filters:
        custom_mask &= filter

    df["Label"].mask(custom_mask, label, inplace=True)
    df["Attempted Category"].mask(custom_mask, attempted_category, inplace=True)

    if also_flip_flow_direction:

        custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()

        custom_mask &= (df["Timestamp"] >= attack_start_datetime)
        custom_mask &= (df["Timestamp"] <= attack_end_datetime)

        if src_ip_list is not None:
            custom_mask &= (df["Dst IP"].isin(src_ip_list))
        if dst_ip_list is not None:
            custom_mask &= (df["Src IP"].isin(dst_ip_list))

        if src_port_list is not None:
            custom_mask &= (df["Dst Port"].isin(src_port_list))
        if dst_port_list is not None:
            custom_mask &= (df["Src Port"].isin(dst_port_list))

        if payload_filter:
            custom_mask &= (df["TotLen Bwd Pkts"] == 0)

        for filter in additional_filters:
            custom_mask &= filter

        df["Label"].mask(custom_mask, label, inplace=True)
        df["Attempted Category"].mask(custom_mask, attempted_category, inplace=True)



def label_rest_as_benign_and_write_csv(df, header_rows, file_to_write):
    df["Label"].mask(df["Label"] == "NeedManualLabel", "BENIGN", inplace=True)

    # Relabel artefact flows with [Flow Id] = '8.0.6.4-8.6.0.1-0-0-0' to label = 0
    df["Label"].mask(df["Flow ID"] == '8.0.6.4-8.6.0.1-0-0-0', "BENIGN", inplace=True)

    print("label count after labelling:\r\n", df["Label"].value_counts())
    print("Attempted Category count after labelling:\r\n", df["Attempted Category"].value_counts())

    full_df = pd.concat([df, header_rows], sort=False).sort_index()

    if print_index:
        full_df.reset_index(inplace=True, drop=True)
        full_df.index += 1
        full_df.index.name = 'id'
        full_df.to_csv(file_to_write)
    else:
        full_df.to_csv(file_to_write, index=False)

In [3]:
#----------------------+
# WEDNESDAY 14-02-2018 |
#----------------------+

dir_name = "Wednesday-14-02-2018"
wednesday_14022018_df, wednesday_14022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)

#-- FTP-BruteForce
label_flows(wednesday_14022018_df, "FTP-BruteForce - Attempted", 1518618806*(10**9),
                                    1518624631*(10**9), ["18.221.219.4"], ["172.31.69.25"], attempted_category=1, also_flip_flow_direction=True)

# FTP-BruteForce - Attempted (tool accidentally got launched in FTP bruteforce mode instead of SSH bruteforce mode)
# Note that, in order to avoid float imprecisions at the micro- and nanosecond level, the UNIX timestamps such as
# 1518631281.199541000, which is in seconds, needs to be converted to nanoseconds, so that the number is stored
# in int64 instead of float.
label_flows(wednesday_14022018_df, "FTP-BruteForce - Attempted", 1518631281,
                                    1518631281, ["13.58.98.64"], ["172.31.69.25"], dst_port_list=[21], attempted_category=4)

#-- SSH-BruteForce
label_flows(wednesday_14022018_df, "SSH-BruteForce", 1518631310*(10**9),
                                    1518636750*(10**9), ["13.58.98.64"], ["172.31.69.25"], dst_port_list=[22], also_flip_flow_direction=True)
# Payload filter
label_flows(wednesday_14022018_df, "SSH-BruteForce - Attempted", 1518631310*(10**9),
                                    1518636750*(10**9), ["13.58.98.64"], ["172.31.69.25"], dst_port_list=[22],
                                    attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

label_rest_as_benign_and_write_csv(wednesday_14022018_df, wednesday_14022018_df_header_rows, DATASET_PATH + dir_name + ".csv")

wednesday_14022018_df = None

labels before pre-processing: Benign            6702133
FTP-BruteForce     193360
SSH-Bruteforce     187589
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    7083082
Name: Label, dtype: int64
label count after labelling:
 BENIGN                        6701304
FTP-BruteForce - Attempted     193360
SSH-BruteForce - Attempted      94211
SSH-BruteForce                  94207
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    6795511
 1     193360
 0      94211
Name: Attempted Category, dtype: int64


In [4]:
#---------------------+
# THURSDAY 15-02-2018 |
#---------------------+

dir_name="Thursday-15-02-2018"
thursday_15022018_df, thursday_15022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)

#-- DoS GoldenEye
label_flows(thursday_15022018_df, "DoS GoldenEye", 1518701262*(10**9), 1518703905*(10**9), ["18.219.211.138"],
            ["172.31.69.25"], also_flip_flow_direction=True)

# Payload filter
label_flows(thursday_15022018_df, "DoS GoldenEye - Attempted", 1518701262*(10**9), 1518703905*(10**9), ["18.219.211.138"],
            ["172.31.69.25"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

# Target system unresponsive
label_flows(thursday_15022018_df, "DoS GoldenEye - Attempted", 1518701262*(10**9), 1518703905*(10**9), ["18.219.211.138"],
            ["172.31.69.25"], attempted_category=6, additional_filters=[
        (thursday_15022018_df["TotLen Bwd Pkts"] == 0) & (thursday_15022018_df["TotLen Fwd Pkts"] > 0) &
        (thursday_15022018_df["Tot Fwd Pkts"] > 2) & (thursday_15022018_df["Flow Duration"] > 100000000)
    ])

#-- DoS Slowloris
label_flows(thursday_15022018_df, "DoS Slowloris", 1518706812*(10**9), 1518709321*(10**9), ["18.217.165.70"],
            ["172.31.69.25"], also_flip_flow_direction=True)

# Payload filter
label_flows(thursday_15022018_df, "DoS Slowloris - Attempted", 1518706812*(10**9), 1518709321*(10**9), ["18.217.165.70"],
            ["172.31.69.25"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

# Attack artefact (exclusively for original version
label_flows(thursday_15022018_df, "DoS Slowloris - Attempted", 1518706812*(10**9), 1518709321*(10**9), ["172.31.69.25"], ["18.217.165.70"],
           attempted_category=4, additional_filters=[
        (thursday_15022018_df["Tot Fwd Pkts"] == 1) & (thursday_15022018_df["Tot Bwd Pkts"] == 2) & (thursday_15022018_df["TotLen Fwd Pkts"] == 0) &
        (thursday_15022018_df["TotLen Bwd Pkts"] == 238)
    ])

label_rest_as_benign_and_write_csv(thursday_15022018_df, thursday_15022018_df_header_rows, DATASET_PATH + dir_name + ".csv")

thursday_15022018_df = None

labels before pre-processing: Benign                   6565262
DoS attacks-GoldenEye      41508
DoS attacks-Slowloris      10990
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    6617760
Name: Label, dtype: int64
label count after labelling:
 BENIGN                       6564757
DoS GoldenEye                  27719
DoS GoldenEye - Attempted      13789
DoS Slowloris                   8585
DoS Slowloris - Attempted       2910
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    6601061
 0      16638
 6         53
 4          8
Name: Attempted Category, dtype: int64


In [5]:
#-------------------+
# FRIDAY 16-02-2018 |
#-------------------+

dir_name="Friday-16-02-2018"
friday_16022018_df, friday_16022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)

#-- FTP-Patator - Attempted
label_flows(friday_16022018_df, "FTP-BruteForce - Attempted", 1518790334*(10**9), 1518793513*(10**9), ["13.59.126.31"],
            ["172.31.69.25"], attempted_category=1, also_flip_flow_direction=True)

#-- DoS Hulk
label_flows(friday_16022018_df, "DoS Hulk", 1518803127*(10**9), 1518803903*(10**9), ["18.219.193.20"], ["172.31.69.25"],
            also_flip_flow_direction=True)

label_flows(friday_16022018_df, "DoS Hulk - Attempted", 1518803127*(10**9), 1518803903*(10**9), ["18.219.193.20"],
            ["172.31.69.25"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

#-- Dos Slowhttptest: No actual DoS Slowloris flows are present on this day in this dataset!
#   Instead we only find failed FTP-Patator traffic, which is exactly what is covered earlier in this cell

label_rest_as_benign_and_write_csv(friday_16022018_df, friday_16022018_df_header_rows, DATASET_PATH + dir_name + ".csv")

friday_16022018_df = None

  friday_16022018_df, friday_16022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)


labels before pre-processing: Benign                      7413958
DoS attacks-Hulk             923824
DoS attacks-SlowHTTPTest     182868
Label                             1
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    8520650
Name: Label, dtype: int64
label count after labelling:
 BENIGN                        6521192
DoS Hulk                       935504
DoS Hulk - Attempted           881086
FTP-BruteForce - Attempted     182868
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    7456696
 0     881086
 1     182868
Name: Attempted Category, dtype: int64


In [6]:
#--------------------+
# TUESDAY 20-02-2018 |
#--------------------+

dir_name="Tuesday-20-02-2018"
tuesday_20022018_df, tuesday_20022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)

#-- DDoS LOIC HTTP
label_flows(tuesday_20022018_df, "DDoS-LOIC-HTTP", 1519136034*(10**9), 1519139808*(10**9),
            ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
             "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"],
            ["172.31.69.25"], additional_filters=[
        tuesday_20022018_df["Protocol"] == 6
    ])

# Payload filter
label_flows(tuesday_20022018_df, "DDoS-LOIC-HTTP - Attempted", 1519136034*(10**9), 1519139808*(10**9),
            ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
             "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"],
            ["172.31.69.25"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True,
            additional_filters=[tuesday_20022018_df["Protocol"] == 6])

#-- DDoS LOIC UDP
label_flows(tuesday_20022018_df, "DDoS-LOIC-UDP", 1519146857*(10**9), 1519147756*(10**9),
            ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
             "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"],
            ["172.31.69.25"], additional_filters=[
        tuesday_20022018_df["Protocol"] == 17])

# Payload filter
label_flows(tuesday_20022018_df, "DDoS-LOIC-UDP - Attempted", 1519146857*(10**9), 1519147756*(10**9),
            ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
             "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"],
            ["172.31.69.25"], attempted_category=0, payload_filter=True,
            additional_filters=[tuesday_20022018_df["Protocol"] == 17])

# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol == 17 (UDP) because original CICFlowMeter does not recognise ICMP)
label_flows(tuesday_20022018_df, "DDoS-LOIC-UDP - Attempted", 1519146857*(10**9), 1519147756*(10**9),
            ["172.31.69.25"], ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
             "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"],
            attempted_category=6, additional_filters=[tuesday_20022018_df["Protocol"] == 17])


label_rest_as_benign_and_write_csv(tuesday_20022018_df, tuesday_20022018_df_header_rows, DATASET_PATH + dir_name + ".csv")

tuesday_20022018_df = None

labels before pre-processing: Benign                    7372557
DDoS attacks-LOIC-HTTP     576191
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    7948748
Name: Label, dtype: int64
label count after labelling:
 BENIGN                        7371764
DDoS-LOIC-HTTP                 289289
DDoS-LOIC-HTTP - Attempted     286196
DDoS-LOIC-UDP                     797
DDoS-LOIC-UDP - Attempted         702
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    7661850
 0     286196
 6        702
Name: Attempted Category, dtype: int64


In [7]:
#----------------------+
# WEDNESDAY 21-02-2018 |
#----------------------+

dir_name = "Wednesday-21-02-2018"
wednesday_21022018_df, wednesday_21022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)

#-- DDoS LOIC UDP
label_flows(wednesday_21022018_df, "DDoS-LOIC-UDP", 1519222131*(10**9), 1519224219*(10**9),
            ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
             "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"],
            ["172.31.69.28"], additional_filters=[
        wednesday_21022018_df["Protocol"] == 17
    ])

# Payload filter
label_flows(wednesday_21022018_df, "DDoS-LOIC-UDP - Attempted", 1519222131*(10**9), 1519224219*(10**9),
            ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
             "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"],
            ["172.31.69.28"], attempted_category=0, payload_filter=True,
            additional_filters=[wednesday_21022018_df["Protocol"] == 17])

# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol == 17 (UDP) because original CICFlowMeter does not recognise ICMP)
label_flows(wednesday_21022018_df, "DDoS-LOIC-UDP - Attempted", 1519222131*(10**9), 1519224219*(10**9),
            ["172.31.69.28"], ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
            "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"],
            attempted_category=6,
            additional_filters=[wednesday_21022018_df["Protocol"] == 17])

#-- DDoS HOIC
label_flows(wednesday_21022018_df, "DDoS-HOIC", 1519236668*(10**9), 1519239954*(10**9),
            ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
             "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"],
            ["172.31.69.28"], also_flip_flow_direction=True, additional_filters=[
        wednesday_21022018_df["Protocol"] == 6
    ])

# Payload filter
label_flows(wednesday_21022018_df, "DDoS-HOIC - Attempted", 1519236668*(10**9), 1519239954*(10**9),
            ["18.218.115.60", "18.219.9.1", "18.219.32.43", "18.218.55.126", "52.14.136.135",
             "18.219.5.43", "18.216.200.189", "18.218.229.235", "18.218.11.51", "18.216.24.42"],
            ["172.31.69.28"], payload_filter=True, also_flip_flow_direction=True,
            attempted_category=0, additional_filters=[wednesday_21022018_df["Protocol"] == 6])


label_rest_as_benign_and_write_csv(wednesday_21022018_df, wednesday_21022018_df_header_rows, DATASET_PATH + dir_name + ".csv")

wednesday_21022018_df = None

labels before pre-processing: Benign                  8355458
DDOS attack-HOIC        1246034
DDOS attack-LOIC-UDP       1730
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    9603222
Name: Label, dtype: int64
label count after labelling:
 BENIGN                       7435307
DDoS-HOIC - Attempted        1082294
DDoS-HOIC                    1082293
DDoS-LOIC-UDP                   1730
DDoS-LOIC-UDP - Attempted       1598
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    8519330
 0    1082294
 6       1598
Name: Attempted Category, dtype: int64


In [8]:
#---------------------+
# THURSDAY 22-02-2018 |
#---------------------+

dir_name = "Thursday-22-02-2018"
thursday_22022018_df, thursday_22022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)

#-- Web Attack SQL
label_flows(thursday_22022018_df, "Web Attack - SQL", 1519330590418906000, 1519331276022793000, ["18.218.115.60"],
            ["172.31.69.28"], also_flip_flow_direction=True, additional_filters=
            [thursday_22022018_df["TotLen Fwd Pkts"] > 0,
             thursday_22022018_df["TotLen Bwd Pkts"] > 0])

# Attack startup artefact
label_flows(thursday_22022018_df, "Web Attack - SQL - Attempted", 1519330470*(10**9), 1519330498*(10**9), ["18.218.115.60"],
            ["172.31.69.28"], attempted_category=2, also_flip_flow_direction=True)

# Payload filter
label_flows(thursday_22022018_df, "Web Attack - SQL - Attempted", 1519330590418906000, 1519331276022793000, ["18.218.115.60"],
            ["172.31.69.28"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

#-- Web Attack XSS
# Port 63782 is attack setup (navigating to website)
label_flows(thursday_22022018_df, "Web Attack - XSS", 1519321899783923000, 1519324181827037000, ["18.218.115.60"],
            ["172.31.69.28"], additional_filters=
            [~(thursday_22022018_df["Src Port"].isin([63782, 64144]))])

#Flip
label_flows(thursday_22022018_df, "Web Attack - XSS", 1519321899783923000, 1519324181827037000,
            ["172.31.69.28"], ["18.218.115.60"], additional_filters=
            [~(thursday_22022018_df["Dst Port"].isin([63782, 64144]))])

# Attempted attack setup
label_flows(thursday_22022018_df, "Web Attack - XSS - Attempted", 1519321899783923000, 1519324181827037000, ["18.218.115.60"],
            ["172.31.69.28"], attempted_category=2, additional_filters=
            [thursday_22022018_df["Src Port"] == 63782])

label_flows(thursday_22022018_df, "Web Attack - XSS - Attempted", 1519321899783923000, 1519324181827037000, ["18.218.115.60"],
            ["172.31.69.28"], attempted_category=3, additional_filters=
            [thursday_22022018_df["Src Port"] == 64144])

# Payload filter
label_flows(thursday_22022018_df, "Web Attack - XSS - Attempted", 1519321899783923000, 1519324181827037000, ["18.218.115.60"],
            ["172.31.69.28"], attempted_category=0, payload_filter=True, additional_filters=
            [~(thursday_22022018_df["Src Port"].isin([63782, 64144]))])

#Flip
label_flows(thursday_22022018_df, "Web Attack - XSS - Attempted", 1519321899783923000, 1519324181827037000,
            ["172.31.69.28"], ["18.218.115.60"], attempted_category=0, additional_filters=
            [~(thursday_22022018_df["Dst Port"].isin([63782, 64144])) &
             (thursday_22022018_df["TotLen Bwd Pkts"] == 0)])

#-- Web Attack Brute Force & Attempted
label_flows(thursday_22022018_df, "Web Attack - Brute Force", 1519309071336902000, 1519313039858533000, ["18.218.115.60"],
            ["172.31.69.28"], additional_filters=
            [thursday_22022018_df["Tot Fwd Pkts"] > 20])

#Flip
label_flows(thursday_22022018_df, "Web Attack - Brute Force", 1519309071336902000, 1519313039858533000,
            ["172.31.69.28"], ["18.218.115.60"], additional_filters=
            [thursday_22022018_df["Tot Bwd Pkts"] > 20])

label_flows(thursday_22022018_df, "Web Attack - Brute Force - Attempted", 1519309071336902000, 1519313039858533000,
            ["18.218.115.60"], ["172.31.69.28"], attempted_category=5, additional_filters=
            [(thursday_22022018_df["Tot Fwd Pkts"] <= 20) & (thursday_22022018_df["TotLen Fwd Pkts"] > 0)])

#Flip
label_flows(thursday_22022018_df, "Web Attack - Brute Force - Attempted", 1519309071336902000, 1519313039858533000,
            ["172.31.69.28"],  ["18.218.115.60"], attempted_category=5, additional_filters=
            [(thursday_22022018_df["Tot Bwd Pkts"] <= 20) & (thursday_22022018_df["TotLen Bwd Pkts"] > 0)])

label_flows(thursday_22022018_df, "Web Attack - Brute Force - Attempted", 1519308824965705000, 1519308947920399000, ["18.218.115.60"],
            ["172.31.69.28"], attempted_category=2, also_flip_flow_direction=True)

# Payload filter
label_flows(thursday_22022018_df, "Web Attack - Brute Force - Attempted", 1519309071336902000, 1519313039858533000,
            ["18.218.115.60"], ["172.31.69.28"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

label_rest_as_benign_and_write_csv(thursday_22022018_df, thursday_22022018_df_header_rows, DATASET_PATH + dir_name + ".csv")

thursday_22022018_df = None


labels before pre-processing: Benign              8179253
Brute Force -Web        249
Brute Force -XSS         79
SQL Injection            34
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    8179615
Name: Label, dtype: int64
label count after labelling:
 BENIGN                                  8179201
Web Attack - Brute Force - Attempted        221
Web Attack - Brute Force                     69
Web Attack - XSS - Attempted                 44
Web Attack - XSS                             40
Web Attack - SQL - Attempted                 24
Web Attack - SQL                             16
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    8179326
 0        197
 5         66
 2         24
 3          2
Name: Attempted Category, dtype: int64


In [9]:
#-------------------+
# FRIDAY 23-02-2018 |
#-------------------+

dir_name = "Friday-23-02-2018"
friday_23022018_df, friday_23022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)

#-- Web Attack SQL
label_flows(friday_23022018_df, "Web Attack - SQL", 1519412792126122000, 1519413444947957000 , ["18.218.115.60"],
            ["172.31.69.28"], also_flip_flow_direction=True, additional_filters=
            [friday_23022018_df["TotLen Fwd Pkts"] > 0,
             friday_23022018_df["TotLen Bwd Pkts"] > 0])

# Attack startup artefact
label_flows(friday_23022018_df, "Web Attack - SQL - Attempted", 1519412722*(10**9), 1519412787*(10**9) , ["18.218.115.60"],
            ["172.31.69.28"], attempted_category=2, also_flip_flow_direction=True)

# Payload filter
label_flows(friday_23022018_df, "Web Attack - SQL - Attempted", 1519412792126122000, 1519413444947957000 , ["18.218.115.60"],
            ["172.31.69.28"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

#-- Web Attack XSS
label_flows(friday_23022018_df, "Web Attack - XSS", 1519405264559707000, 1519409428237472000, ["18.218.115.60"],
            ["172.31.69.28"], additional_filters=
            [~(friday_23022018_df["Src Port"].isin([59173]))])

#Flip
label_flows(friday_23022018_df, "Web Attack - XSS", 1519405264559707000, 1519409428237472000,
            ["172.31.69.28"], ["18.218.115.60"], additional_filters=
            [~(friday_23022018_df["Dst Port"].isin([59173]))])

label_flows(friday_23022018_df, "Web Attack - XSS - Attempted", 1519405264559707000, 1519409428237472000, ["18.218.115.60"],
            ["172.31.69.28"], attempted_category=2, src_port_list=[59173], also_flip_flow_direction=True)

# Payload filter
label_flows(friday_23022018_df, "Web Attack - XSS - Attempted", 1519405264559707000, 1519409428237472000, ["18.218.115.60"],
            ["172.31.69.28"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

#-- Web Attack Brute Force & Attempted
label_flows(friday_23022018_df, "Web Attack - Brute Force", 1519394670193975000, 1519398186406294000, ["18.218.115.60"],
            ["172.31.69.28"], additional_filters=
            [friday_23022018_df["Tot Fwd Pkts"] > 20])

#Flip
label_flows(friday_23022018_df, "Web Attack - Brute Force", 1519394670193975000, 1519398186406294000,
            ["172.31.69.28"], ["18.218.115.60"], additional_filters=
            [friday_23022018_df["Tot Bwd Pkts"] > 20])

label_flows(friday_23022018_df, "Web Attack - Brute Force - Attempted", 1519394670193975000, 1519398186406294000,
            ["18.218.115.60"], ["172.31.69.28"], attempted_category=5, additional_filters=
            [(friday_23022018_df["Tot Fwd Pkts"] <= 20) & (friday_23022018_df["TotLen Fwd Pkts"] > 0)])

#Flip
label_flows(friday_23022018_df, "Web Attack - Brute Force - Attempted", 1519394670193975000, 1519398186406294000,
            ["172.31.69.28"], ["18.218.115.60"], attempted_category=5, additional_filters=
            [(friday_23022018_df["Tot Bwd Pkts"] <= 20) & (friday_23022018_df["TotLen Bwd Pkts"] > 0)])

# Payload filter:
label_flows(friday_23022018_df, "Web Attack - Brute Force - Attempted", 1519394670193975000, 1519398186406294000,
            ["18.218.115.60"], ["172.31.69.28"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

label_rest_as_benign_and_write_csv(friday_23022018_df, friday_23022018_df_header_rows, DATASET_PATH + dir_name + ".csv")

friday_23022018_df = None

labels before pre-processing: Benign              7927630
Brute Force -Web        362
Brute Force -XSS        151
SQL Injection            53
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    7928196
Name: Label, dtype: int64
label count after labelling:
 BENIGN                                  7927736
Web Attack - Brute Force - Attempted        184
Web Attack - XSS - Attempted                 75
Web Attack - XSS                             73
Web Attack - Brute Force                     62
Web Attack - SQL - Attempted                 43
Web Attack - SQL                             23
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    7927894
 0        231
 5         60
 2         11
Name: Attempted Category, dtype: int64


In [10]:
#----------------------+
# WEDNESDAY 28-02-2018 |
#----------------------+

dir_name = "Wednesday-28-02-2018"
wednesday_28022018_df, wednesday_28022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)

#-- Infiltration - Dropbox Download
label_flows(wednesday_28022018_df, "Infiltration - Dropbox Download", 1519828404*(10**9), 1519829172*(10**9),
            ["172.31.69.24"],
            ["162.125.3.1", "162.125.3.5", "162.125.3.6", "162.125.248.1", "162.125.18.133"],
            also_flip_flow_direction=True)

label_flows(wednesday_28022018_df, "Infiltration - Dropbox Download", 1519839771*(10**9), 1519839824*(10**9),
            ["172.31.69.24"],
            ["162.125.3.1", "162.125.3.5", "162.125.3.6", "162.125.248.1", "162.125.18.133"],
            also_flip_flow_direction=True)

# Payload filter
label_flows(wednesday_28022018_df, "Infiltration - Dropbox Download - Attempted", 1519828404*(10**9), 1519829172*(10**9),
            ["172.31.69.24"],
            ["162.125.3.1", "162.125.3.5", "162.125.3.6", "162.125.248.1", "162.125.18.133"],
            attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

label_flows(wednesday_28022018_df, "Infiltration - Dropbox Download - Attempted", 1519839771*(10**9), 1519839824*(10**9),
            ["172.31.69.24"],
            ["162.125.3.1", "162.125.3.5", "162.125.3.6", "162.125.248.1", "162.125.18.133"],
            attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

# Attempted - Attack artefact
label_flows(wednesday_28022018_df, "Infiltration - Dropbox Download - Attempted", 1519828404*(10**9), 1519829172*(10**9),
            ["172.31.69.24"],
            ["104.16.100.29", "104.16.99.29", "52.84.128.3", "52.85.101.236", "52.85.131.81", "52.85.95.206"], attempted_category=4, also_flip_flow_direction=True)

label_flows(wednesday_28022018_df, "Infiltration - Dropbox Download - Attempted", 1519839771*(10**9), 1519839824*(10**9),
            ["172.31.69.24"],
            ["104.16.100.29", "104.16.99.29", "52.84.128.3", "52.85.101.236", "52.85.131.81", "52.85.95.206"], attempted_category=4, also_flip_flow_direction=True)

#-- Infiltration - Communication Victim Attacker
label_flows(wednesday_28022018_df, "Infiltration - Communication Victim Attacker", 1519829140*(10**9),
            1519834135*(10**9), ["172.31.69.24"], ["13.58.225.34"], also_flip_flow_direction=True)

label_flows(wednesday_28022018_df, "Infiltration - Communication Victim Attacker", 1519839839*(10**9),
            1519843199*(10**9), ["172.31.69.24"], ["13.58.225.34"], also_flip_flow_direction=True)

# Payload filter

label_flows(wednesday_28022018_df, "Infiltration - Communication Victim Attacker - Attempted", 1519829140*(10**9),
            1519834135*(10**9), ["172.31.69.24"], ["13.58.225.34"], payload_filter=True, also_flip_flow_direction=True)

label_flows(wednesday_28022018_df, "Infiltration - Communication Victim Attacker - Attempted", 1519839839*(10**9),
            1519843199*(10**9), ["172.31.69.24"], ["13.58.225.34"], payload_filter=True, also_flip_flow_direction=True)

#-- Infiltration - NMAP Portscan
label_flows(wednesday_28022018_df, "Infiltration - NMAP Portscan", 1519829182*(10**9), 1519843140746247000,
            ["172.31.69.24"],
            ["172.31.69.1", "172.31.69.10", "172.31.69.11", "172.31.69.12", "172.31.69.13", "172.31.69.14",
             "172.31.69.16", "172.31.69.17", "172.31.69.19", "172.31.69.20", "172.31.69.23", "172.31.69.4",
             "172.31.69.5", "172.31.69.6", "172.31.69.8", "172.31.69.9", "172.31.69.7", "172.31.69.22",
             "172.31.69.15", "172.31.69.21", "172.31.69.18",], additional_filters=
            [~(wednesday_28022018_df["Src Port"] == 68)])

label_rest_as_benign_and_write_csv(wednesday_28022018_df, wednesday_28022018_df_header_rows, DATASET_PATH + dir_name + ".csv")

wednesday_28022018_df = None

  wednesday_28022018_df, wednesday_28022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)


labels before pre-processing: Benign           544200
Infilteration     68871
Label                33
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    613072
Name: Label, dtype: int64
label count after labelling:
 BENIGN                                          553425
Infiltration - NMAP Portscan                     59494
Infiltration - Dropbox Download - Attempted         63
Infiltration - Dropbox Download                     46
Infiltration - Communication Victim Attacker        44
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    613009
 0        39
 4        24
Name: Attempted Category, dtype: int64


In [11]:
#---------------------+
# THURSDAY 01-03-2018 |
#---------------------+

dir_name = "Thursday-01-03-2018"
thursday_01032018_df, thursday_01032018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)

#-- Infiltration - Dropbox Download
label_flows(thursday_01032018_df, "Infiltration - Dropbox Download", 1519912390*(10**9), 1519916360*(10**9),
            ["172.31.69.13"], ["162.125.3.1", "162.125.3.6", "162.125.248.1", "162.125.18.133"], also_flip_flow_direction=True)

label_flows(thursday_01032018_df, "Infiltration - Dropbox Download", 1519913032*(10**9), 1519918454*(10**9),
            ["172.31.69.13"], ["162.125.3.1", "162.125.3.6", "162.125.248.1", "162.125.18.133"], also_flip_flow_direction=True)

# Payload filter
label_flows(thursday_01032018_df, "Infiltration - Dropbox Download - Attempted", 1519912390*(10**9), 1519916360*(10**9),
            ["172.31.69.13"],
            ["162.125.3.1", "162.125.3.6", "162.125.248.1", "162.125.18.133"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

label_flows(thursday_01032018_df, "Infiltration - Dropbox Download - Attempted", 1519913032*(10**9), 1519918454*(10**9),
            ["172.31.69.13"],
            ["162.125.3.1", "162.125.3.6", "162.125.248.1", "162.125.18.133", "104.16.100.29"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

# Attempted - Attack artefact
label_flows(thursday_01032018_df, "Infiltration - Dropbox Download - Attempted", 1519912390*(10**9), 1519916360*(10**9),
            ["172.31.69.13"], ["104.16.100.29", "13.32.168.125", "52.85.112.72"], attempted_category=4, also_flip_flow_direction=True)

label_flows(thursday_01032018_df, "Infiltration - Dropbox Download - Attempted", 1519913032*(10**9), 1519918454*(10**9),
            ["172.31.69.13"], ["104.16.100.29", "13.32.168.125", "52.85.112.72"], attempted_category=4, also_flip_flow_direction=True)

#-- Infiltration - Communication Victim Attacker
label_flows(thursday_01032018_df, "Infiltration - Communication Victim Attacker", 1519912674*(10**9),
            1519912745*(10**9), ["172.31.69.13"], ["13.58.225.34"], also_flip_flow_direction=True)

label_flows(thursday_01032018_df, "Infiltration - Communication Victim Attacker", 1519913075*(10**9),
            1519928245*(10**9), ["172.31.69.13"], ["13.58.225.34"], also_flip_flow_direction=True)

label_flows(thursday_01032018_df, "Infiltration - Communication Victim Attacker", 1519928295*(10**9),
            1519933041*(10**9), ["172.31.69.13"], ["13.58.225.34"], also_flip_flow_direction=True)

# Payload filter
label_flows(thursday_01032018_df, "Infiltration - Communication Victim Attacker - Attempted", 1519912674*(10**9),
            1519912745*(10**9), ["172.31.69.13"], ["13.58.225.34"], attempted_category=0,
            payload_filter=True, also_flip_flow_direction=True)

label_flows(thursday_01032018_df, "Infiltration - Communication Victim Attacker - Attempted", 1519913075*(10**9),
            1519928245*(10**9), ["172.31.69.13"], ["13.58.225.34"], attempted_category=0,
            payload_filter=True, also_flip_flow_direction=True)

label_flows(thursday_01032018_df, "Infiltration - Communication Victim Attacker - Attempted", 1519928295*(10**9),
            1519933041*(10**9), ["172.31.69.13"], ["13.58.225.34"], attempted_category=0,
            payload_filter=True, also_flip_flow_direction=True)

#-- Infiltration - NMAP Portscan
label_flows(thursday_01032018_df, "Infiltration - NMAP Portscan", 1519913388*(10**9), 1519933092182726000,
            ["172.31.69.13"],
            ["172.31.69.1", "172.31.69.11", "172.31.69.12", "172.31.69.16", "172.31.69.8", "172.31.69.9",
             "172.31.69.10", "172.31.69.14", "172.31.69.4", "172.31.69.5", "172.31.69.6", "172.31.69.17",
             "172.31.69.20", "172.31.69.23", "172.31.69.24", "172.31.69.19", "172.31.69.7", "172.31.69.15",
             "172.31.69.18", "172.31.69.22", "172.31.69.21"], additional_filters=
            [thursday_01032018_df["Src Port"] != 68])

label_rest_as_benign_and_write_csv(thursday_01032018_df, thursday_01032018_df_header_rows, DATASET_PATH + dir_name + ".csv")

thursday_01032018_df = None

  thursday_01032018_df, thursday_01032018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)


labels before pre-processing: Benign           238037
Infilteration     93063
Label                25
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    331101
Name: Label, dtype: int64
label count after labelling:
 BENIGN                                                      290058
Infiltration - NMAP Portscan                                 40804
Infiltration - Communication Victim Attacker                   162
Infiltration - Dropbox Download                                 39
Infiltration - Dropbox Download - Attempted                     37
Infiltration - Communication Victim Attacker - Attempted         1
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    331063
 4        21
 0        17
Name: Attempted Category, dtype: int64


In [12]:
#-------------------+
# FRIDAY 02-03-2018 |
#-------------------+

dir_name = "Friday-02-03-2018"
friday_02032018_df, friday_02032018_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)

#-- Botnet Ares
label_flows(friday_02032018_df, "Botnet Ares", 1520000008*(10**9), 1520020492*(10**9), also_flip_flow_direction=True,
            additional_filters=
            [(friday_02032018_df["Src IP"] == "18.219.211.138") | (friday_02032018_df["Dst IP"] == "18.219.211.138")])

# Payload filter
label_flows(friday_02032018_df, "Botnet Ares - Attempted", 1520000008*(10**9), 1520020492*(10**9), attempted_category=0, additional_filters=
            [((friday_02032018_df["Src IP"] == "18.219.211.138") | (friday_02032018_df["Dst IP"] == "18.219.211.138")) &
             (friday_02032018_df["TotLen Fwd Pkts"] == 0) & (friday_02032018_df["TotLen Bwd Pkts"] == 0)])

label_rest_as_benign_and_write_csv(friday_02032018_df, friday_02032018_header_rows, DATASET_PATH + dir_name + ".csv")

friday_02032018_df = None

labels before pre-processing: Benign    7931011
Bot        286191
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    8217202
Name: Label, dtype: int64
label count after labelling:
 BENIGN                     7931011
Botnet Ares - Attempted     143263
Botnet Ares                 142928
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    8073939
 0     143263
Name: Attempted Category, dtype: int64
