In [1]:
import pandas as pd
import numpy as np
import glob
import os
from sys import platform
import datetime

# THIS LABELLING SCRIPT IS USED TO LABEL THE OLD VERSION OF CIC-IDS-2017. THIS VERSION SHOULD ONLY BE USED IF YOU
# WISH TO RECREATE OUR RESULTS AS REPORTED IN OUR PAPER: https://intrusion-detection.distrinet-research.be/CNS2022/index.html

# THIS SCRIPT ACCEPTS AS INPUT THE ORIGINAL CSVs AS RELEASED BY THE DATASET AUTHORS: https://www.unb.ca/cic/datasets/ids-2017.html

pd.set_option('display.max_rows', 100)


DATASET_PATH = ""
OUTPUT_PATH = ""

# unset to remove line index (to refer to line numbers when writing final csv)
print_index = True

In [2]:
def format_csv_for_labelling(df):
    # strip leading whitespaces in column names
    df.columns = df.columns.str.lstrip(" ")

    print("labels before pre-processing:", df["Label"].value_counts())

    # Since CICIDS 2017 authors used 12-hour format but removed AM/PM, we need to reconstruct it
    # We do this based on the knowledge they collected traffic from 9:00 AM to 5:00 PM.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M')
    df['Timestamp'] = df['Timestamp'].apply(lambda x: x + pd.DateOffset(hours=12) if x.hour < 7 else x)

    # Convert to UTC from New Brunswick summer timezone (UTC-3)
    df['Timestamp'] = df['Timestamp'] + pd.DateOffset(hours=3)

    for column in df.columns:
        if column not in ['Flow ID' , 'Timestamp', 'Source IP', 'Destination IP', 'Label']:
            df[column] = pd.to_numeric(df[column])

    # Add attempted category column and initialise to -1
    df["Attempted Category"] = -1

    # CICIDS 2017 author-released version comes prelabelled. This makes sure previous labels don't interfere
    df["Label"] = "NeedManualLabel"

    print("labels after pre-processing:", df["Label"].value_counts())

    return df

def read_csvs_from_path_and_reformat(path):
    df= pd.read_csv(path, encoding='cp1252')

    df = format_csv_for_labelling(df)

    return df

def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,
                dst_ip_list= None, src_port_list=None, dst_port_list=None, attempted_category = -1, additional_filters=[],
                also_flip_flow_direction=False, payload_filter=False):
    # Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything

    # Create initial mask with all values set to True. Squeeze is necessary to remove second axis (with value 1)
    # The reason is that a df of shape (X,) gets converted to (1,X) if you '&' it with a df of shape (X,1)
    custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()

    # Need to round the start time down to the nearest minute because otherwise some flows at the start of the attack
    # are labelled as benign
    attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns').floor(freq='T')
    attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')

    custom_mask &= (df["Timestamp"] >= attack_start_datetime)
    custom_mask &= (df["Timestamp"] <= attack_end_datetime)

    if src_ip_list is not None:
        custom_mask &= (df["Source IP"].isin(src_ip_list))
    if dst_ip_list is not None:
        custom_mask &= (df["Destination IP"].isin(dst_ip_list))

    if src_port_list is not None:
        custom_mask &= (df["Source Port"].isin(src_port_list))
    if dst_port_list is not None:
        custom_mask &= (df["Destination Port"].isin(dst_port_list))

    # IMPORTANT NOTE: If you decide to add TotLen Fwd Pkt == 6 for catching RST packets, you still have to manually alter some additional_filters for flipped flows where
    # you couldn't use payload_filter boolean function input value
    if payload_filter:
        custom_mask &= (df["Total Length of Fwd Packets"] == 0)

    for filter in additional_filters:
        custom_mask &= filter

    df["Label"].mask(custom_mask, label, inplace=True)
    df["Attempted Category"].mask(custom_mask, attempted_category, inplace=True)

    if also_flip_flow_direction:
        if additional_filters:
            raise AttributeError("Cannot set also_flip_flow_direction to True when additional_filters is not empty")

        custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()

        custom_mask &= (df["Timestamp"] >= attack_start_datetime)
        custom_mask &= (df["Timestamp"] <= attack_end_datetime)

        if src_ip_list is not None:
            custom_mask &= (df["Destination IP"].isin(src_ip_list))
        if dst_ip_list is not None:
            custom_mask &= (df["Source IP"].isin(dst_ip_list))

        if src_port_list is not None:
            custom_mask &= (df["Destination Port"].isin(src_port_list))
        if dst_port_list is not None:
            custom_mask &= (df["Source Port"].isin(dst_port_list))

        if payload_filter:
            custom_mask &= (df["Total Length of Bwd Packets"] == 0)

        for filter in additional_filters:
            custom_mask &= filter

        df["Label"].mask(custom_mask, label, inplace=True)
        df["Attempted Category"].mask(custom_mask, attempted_category, inplace=True)

def label_rest_as_benign_and_write_csv(df, file_to_write):
    df["Label"].mask(df["Label"] == "NeedManualLabel", "BENIGN", inplace=True)

    # Relabel artefact flows with [Flow Id] = '8.0.6.4-8.6.0.1-0-0-0' to label = 0
    df["Label"].mask(df["Flow ID"] == '8.0.6.4-8.6.0.1-0-0-0', "BENIGN", inplace=True)

    print("label count after labelling:\r\n", df["Label"].value_counts())
    print("Attempted Category count after labelling:\r\n", df["Attempted Category"].value_counts())

    if print_index:
        df.reset_index(inplace=True, drop=True)
        df.index += 1
        df.index.name = 'id'
        df.to_csv(file_to_write)
    else:
        df.to_csv(file_to_write, index=False)


In [3]:
#--------------------+
# TUESDAY 04-07-2017 |
#--------------------+

tuesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "tuesday/Tuesday-WorkingHours.pcap_ISCX.csv")

# FTP-PATATOR
# -----------

label_flows(tuesday_df, "FTP-Patator", 1499170672838272000, 1499174416931403000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[21], also_flip_flow_direction=True)

# Default payload filter
label_flows(tuesday_df, "FTP-Patator - Attempted", 1499170672838272000, 1499174416931403000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[21], payload_filter=True, also_flip_flow_direction=True, attempted_category=0)

label_flows(tuesday_df, "FTP-Patator - Attempted", 1499170672838272000, 1499174416931403000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[21], src_port_list=[52108], attempted_category=2)

# Flows with RSTs that are technically TCP appendices, but not picked up by payload filter because of non-zero payload
label_flows(tuesday_df, "FTP-Patator - Attempted", 1499170672838272000, 1499174416931403000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[21], additional_filters=
            [
                (tuesday_df["Source Port"] != 52108) & (tuesday_df["Total Length of Bwd Packets"] == 0) &
                (tuesday_df["Total Length of Fwd Packets"] > 0)
            ], attempted_category=3)

label_flows(tuesday_df, "FTP-Patator - Attempted", 1499170672838272000, 1499174416931403000, ["192.168.10.50"],
            ["172.16.0.1"], src_port_list=[21], additional_filters=
            [
                (tuesday_df["Destination Port"] != 52108) & (tuesday_df["Total Length of Fwd Packets"] == 0) &
                (tuesday_df["Total Length of Bwd Packets"] > 0)
            ], attempted_category=3)


# SSH-Patator
# -----------

label_flows(tuesday_df, "SSH-Patator", 1499188141049616000, 1499195059018486000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[22], also_flip_flow_direction=True)

#Payload filter
label_flows(tuesday_df, "SSH-Patator - Attempted", 1499188141049616000, 1499195059018486000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[22], payload_filter=True, also_flip_flow_direction=True, attempted_category=0)

label_flows(tuesday_df, "SSH-Patator - Attempted", 1499188141049616000, 1499195059018486000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[22], additional_filters=
            [
                (tuesday_df["Total Length of Fwd Packets"] <= 32) & (tuesday_df["Total Length of Bwd Packets"] == 0)
            ], attempted_category=3)

label_rest_as_benign_and_write_csv(tuesday_df, OUTPUT_PATH + "Tuesday-WorkingHours.pcap_ISCX.csv")

tuesday_df = None

labels before pre-processing: BENIGN         432074
FTP-Patator      7938
SSH-Patator      5897
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    445909
Name: Label, dtype: int64
label count after labelling:
 BENIGN                     430465
FTP-Patator - Attempted      5489
FTP-Patator                  3991
SSH-Patator - Attempted      3003
SSH-Patator                  2961
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    437417
 3      6918
 0      1571
 2         3
Name: Attempted Category, dtype: int64


In [3]:
#----------------------+
# WEDNESDAY 05-07-2017 |
#----------------------+

wednesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "wednesday/Wednesday-workingHours.pcap_ISCX.csv")

# DoS Slowloris
# -------------

# Accidental early launch of the tool with wrong parameters
label_flows(wednesday_df, "DoS Slowloris - Attempted", 1499258926211817000, 1499258927000000000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], attempted_category=5)

label_flows(wednesday_df, "DoS Slowloris", 1499258934539220000, 1499260278500956000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], additional_filters=[
        ~(wednesday_df["Source Port"].isin([33358, 33360, 33362, 54114]))
    ])

label_flows(wednesday_df, "DoS Slowloris", 1499258934539220000, 1499260278500956000,
            ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], additional_filters=[
        ~(wednesday_df["Destination Port"].isin([33358, 33360, 33362, 54114]))
    ])

# port 33358, 33360 and 33362 contain attack teardown flows
label_flows(wednesday_df, "DoS Slowloris - Attempted", 1499258934539220000, 1499260278500956000, ["172.16.0.1"],
            ["192.168.10.50"], src_port_list=[33358, 33360, 33362], dst_port_list=[80], attempted_category=2)

#Payload filter (order is important, this part needs to come before Attempted category 6) (can't flip with boolean function input because of additional filters)
label_flows(wednesday_df, "DoS Slowloris - Attempted", 1499258934539220000, 1499260278500956000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[
        ~(wednesday_df["Source Port"].isin([33358, 33360, 33362, 54114]))])

label_flows(wednesday_df, "DoS Slowloris - Attempted", 1499258934539220000, 1499260278500956000,
            ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], attempted_category=0, additional_filters=[
        ~(wednesday_df["Destination Port"].isin([33358, 33360, 33362, 54114])) & (wednesday_df["Total Length of Bwd Packets"] == 0)
    ])

#Target unresponsive because of DoS, no payloads in these flows
label_flows(wednesday_df, "DoS Slowloris - Attempted", 1499258934539220000, 1499260278500956000,
            ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], attempted_category=6, additional_filters=[
        ~(wednesday_df["Destination Port"].isin([33358, 33360, 33362, 54114])) & (wednesday_df["Total Length of Bwd Packets"] == 0)
        & (wednesday_df["Flow Duration"] >= 199800)
    ])

# Artefact likely from authors checking the webserver
label_flows(wednesday_df, "DoS Slowloris - Attempted", 1499258934539220000, 1499260278500956000, ["172.16.0.1"],
            ["192.168.10.50"], src_port_list=[54114], dst_port_list=[80], attempted_category=4)

# DoS Slowhttptest
# ----------------

label_flows(wednesday_df, "DoS Slowhttptest", 1499260537936810000, 1499261869331517000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], additional_filters=[
        ~(wednesday_df["Source Port"].isin([33372]))
    ]
    )

label_flows(wednesday_df, "DoS Slowhttptest", 1499260537936810000, 1499261869331517000,
            ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], additional_filters=[
        ~(wednesday_df["Destination Port"].isin([33372]))
    ]
    )

# Attack startup artefact
label_flows(wednesday_df, "DoS Slowhttptest - Attempted", 1499260537936810000, 1499261869331517000, ["172.16.0.1"],
            ["192.168.10.50"], src_port_list=[33372], dst_port_list=[80], attempted_category=2)

#Payload filter (order of this is important, before attempted category 6) (can't flip with boolean function input because of additional filters)
label_flows(wednesday_df, "DoS Slowhttptest - Attempted", 1499260537936810000, 1499261869331517000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[
        ~(wednesday_df["Source Port"].isin([33372, 37670]))])

label_flows(wednesday_df, "DoS Slowhttptest - Attempted", 1499260537936810000, 1499261869331517000,
            ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], attempted_category=0, additional_filters=[
        ~(wednesday_df["Destination Port"].isin([33372, 37670])) & (wednesday_df["Total Length of Bwd Packets"] == 0)
    ]
    )

# Retransmissions because target web server is brought down (No need to flip direction, I double-checked)
label_flows(wednesday_df, "DoS Slowhttptest - Attempted", 1499260537936810000, 1499261869331517000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], attempted_category=6, additional_filters=[
        ~(wednesday_df["Source Port"].isin([33372])) & (wednesday_df["Total Length of Fwd Packets"] == 0) &
        (wednesday_df["Flow Duration"] >= 199984) & (wednesday_df["Total Backward Packets"] == 0)
    ]
    )

# Artefact from authors likely checking the webserver
label_flows(wednesday_df, "DoS Slowhttptest - Attempted", 1499260537936810000, 1499261869331517000, ["172.16.0.1"],
            ["192.168.10.50"], src_port_list=[37670], dst_port_list=[80], attempted_category=4)


# DoS Hulk
# --------

# Note that ports 48678 and 43664 have a benign flow launched by attacker IP while attack is already ongoing,
# containing benign HTTP request. This will be labelled as Attack artefact
label_flows(wednesday_df, "DoS Hulk", 1499262203194704000, 1499262299999999999, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], additional_filters=[
        ~(wednesday_df["Source Port"].isin([48678 , 43664]))
    ])

label_flows(wednesday_df, "DoS Hulk", 1499262203194704000, 1499262299999999999,
            ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], additional_filters=[
        ~(wednesday_df["Destination Port"].isin([48678 , 43664]))
    ])

label_flows(wednesday_df, "DoS Hulk", 1499262300000000000, 1499263641326171000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], also_flip_flow_direction=True)

#Attack artefact - likely authors checking webserver mid-attack.
label_flows(wednesday_df, "DoS Hulk - Attempted", 1499262203194704000, 1499262299999999999, ["172.16.0.1"],
            ["192.168.10.50"], src_port_list=[48678 , 43664], dst_port_list=[80], attempted_category=4)

#Payload filter (can't flip with boolean function input because of additional filters)
label_flows(wednesday_df, "DoS Hulk - Attempted", 1499262203194704000, 1499263641326171000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[
        ~(wednesday_df["Source Port"].isin([48678 , 43664]))])

label_flows(wednesday_df, "DoS Hulk - Attempted", 1499262203194704000, 1499263641326171000,
            ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], attempted_category=0, additional_filters=[
        ~(wednesday_df["Destination Port"].isin([48678 , 43664])) & (wednesday_df["Total Length of Bwd Packets"] == 0)
    ])

# Artefacts caused by either attack tool or non-empty TCP appendices. Reasoning is that 282 is minimum size of malicious payload
label_flows(wednesday_df, "DoS Hulk - Attempted", 1499262203194704000, 1499263641326171000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], attempted_category=3, additional_filters=[
        ~(wednesday_df["Source Port"].isin([48678 , 43664])) & (wednesday_df["Total Length of Fwd Packets"] > 0)
        & (wednesday_df["Total Length of Fwd Packets"] < 282)
    ])

label_flows(wednesday_df, "DoS Hulk - Attempted", 1499262203194704000, 1499263641326171000,
            ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], attempted_category=3, additional_filters=[
        ~(wednesday_df["Destination Port"].isin([48678 , 43664])) & (wednesday_df["Total Length of Bwd Packets"] > 0)
        & (wednesday_df["Total Length of Bwd Packets"] <282)
    ])

# DoS GoldenEye
# -------------

label_flows(wednesday_df, "DoS GoldenEye", 1499263803231753000, 1499264408915718000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], also_flip_flow_direction=True)

#Payload filter
label_flows(wednesday_df, "DoS GoldenEye - Attempted", 1499263803231753000, 1499264408915718000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

# Heartbleed
# ----------

label_flows(wednesday_df, "Heartbleed", 1499278335650811000, 1499279563294455000, ["172.16.0.1"],
            ["192.168.10.51"], dst_port_list=[444], src_port_list=[45022], also_flip_flow_direction=True)

#Payload filter
label_flows(wednesday_df, "Heartbleed - Attempted", 1499278335650811000, 1499279563294455000, ["172.16.0.1"],
            ["192.168.10.51"], dst_port_list=[444], src_port_list=[45022], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

label_rest_as_benign_and_write_csv(wednesday_df, OUTPUT_PATH + "Wednesday-workingHours.pcap_ISCX.csv")

wednesday_df = None


FileNotFoundError: [Errno 2] No such file or directory: '/media/farodin/AEAA59A1AA59673D/CICIDS2017/CSV_newest_CICFlowMeter_20220728/Unlabelled/wednesday/Wednesday-workingHours.pcap_ISCX.csv'

In [5]:
#---------------------+
# THURSDAY 06-07-2017 |
#---------------------+

thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "thursday/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")

# Web Attack - Brute Force
# ------------------------

label_flows(thursday_df, "Web Attack - Brute Force - Attempted", 1499343354880049000, 1499343531179279000,
            ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], attempted_category=2, also_flip_flow_direction=True)

label_flows(thursday_df, "Web Attack - Brute Force", 1499343567660566000, 1499346011622209000,
            ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], additional_filters=
            [
                (thursday_df["Total Fwd Packets"] > 20) | (thursday_df["Source Port"] == 44464)
            ])
#Flip
label_flows(thursday_df, "Web Attack - Brute Force", 1499343567660566000, 1499346011622209000,
             ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], additional_filters=
            [
                (thursday_df["Total Backward Packets"] > 20) | (thursday_df["Destination Port"] == 44464)
            ])

#Payload filter (can't use switch_flow_direction because there are additional_filters)
label_flows(thursday_df, "Web Attack - Brute Force - Attempted", 1499343567660566000, 1499346011622209000,
            ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], payload_filter=True, attempted_category=0,
            additional_filters=
            [~((thursday_df["Total Fwd Packets"] > 20) | (thursday_df["Source Port"] == 44464))])

label_flows(thursday_df, "Web Attack - Brute Force - Attempted", 1499343567660566000, 1499346011622209000,
             ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], attempted_category=0,
            additional_filters=
            [
                ~((thursday_df["Total Backward Packets"] > 20) | (thursday_df["Destination Port"] == 44464))
                & (thursday_df["Total Length of Bwd Packets"] == 0)
            ])

label_flows(thursday_df, "Web Attack - Brute Force - Attempted", 1499343567660566000, 1499346011622209000,
            ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], attempted_category=4,
            additional_filters=
            [
                (thursday_df["Total Length of Fwd Packets"] > 0) & ~(thursday_df["Source Port"] == 44464) &
                (thursday_df["Total Fwd Packets"] == 4) & (thursday_df["Total Backward Packets"] == 4)
            ])

label_flows(thursday_df, "Web Attack - Brute Force - Attempted", 1499343567660566000, 1499346011622209000,
             ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], attempted_category=4,
            additional_filters=
            [
                (thursday_df["Total Length of Bwd Packets"] > 0) & ~(thursday_df["Destination Port"] == 44464) &
                (thursday_df["Total Backward Packets"] == 4) & (thursday_df["Total Fwd Packets"] == 4)
            ])

# Web Attack - XSS
# ----------------

label_flows(thursday_df, "Web Attack - XSS", 1499346935283859000, 1499348121341704000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], additional_filters=
            [
                ~(thursday_df["Source Port"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &
                (thursday_df["Total Fwd Packets"] >= 150)
            ])
#Flip
label_flows(thursday_df, "Web Attack - XSS", 1499346935283859000, 1499348121341704000,
            ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], additional_filters=
            [
                ~(thursday_df["Destination Port"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &
                (thursday_df["Total Backward Packets"] >= 150)
            ])

label_flows(thursday_df, "Web Attack - XSS - Attempted", 1499346935283859000, 1499348121341704000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=
            [
                ~(thursday_df["Source Port"].isin([36180, 36182, 36184, 36186, 36188, 36190]))])
#Flip
label_flows(thursday_df, "Web Attack - XSS - Attempted", 1499346935283859000, 1499348121341704000, ["172.16.0.1"],
            ["192.168.10.50"], src_port_list=[80], attempted_category=0, additional_filters=
            [
                ~(thursday_df["Destination Port"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &
                (thursday_df["Total Length of Bwd Packets"] == 0)
            ])

label_flows(thursday_df, "Web Attack - XSS - Attempted", 1499346935283859000, 1499348121341704000, ["172.16.0.1"],
            ["192.168.10.50"], dst_port_list=[80], attempted_category=2, additional_filters=
            [
                ~(thursday_df["Source Port"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &
                (thursday_df["Total Length of Fwd Packets"] > 0) & (thursday_df["Total Fwd Packets"] < 150)
            ])

#Flip
label_flows(thursday_df, "Web Attack - XSS - Attempted", 1499346935283859000, 1499348121341704000,
            ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], attempted_category=2, additional_filters=
            [
                ~(thursday_df["Destination Port"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &
                (thursday_df["Total Length of Bwd Packets"] > 0) & (thursday_df["Total Backward Packets"] < 150)
            ])

# Web Attack - SQL Injection
# --------------------------

label_flows(thursday_df, "Web Attack - SQL Injection - Attempted", 1499348127852814000, 1499348145720612000,
            ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], attempted_category=2,
            additional_filters=[
                thursday_df["Source Port"].isin([36180, 36182, 36184, 36186, 36188])
            ])

#Flip
label_flows(thursday_df, "Web Attack - SQL Injection - Attempted", 1499348127852814000, 1499348145720612000,
             ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], attempted_category=2,
            additional_filters=[
                thursday_df["Destination Port"].isin([36180, 36182, 36184, 36186, 36188])
            ])

label_flows(thursday_df, "Web Attack - SQL Injection", 1499348145732950000, 1499348575320284000,
            ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80],
            additional_filters=[
                ~(thursday_df["Source Port"].isin([36180, 36182, 36184, 36186, 36188]))
            ])

#Flip
label_flows(thursday_df, "Web Attack - SQL Injection", 1499348145732950000, 1499348575320284000,
            ["192.168.10.50"],  ["172.16.0.1"], src_port_list=[80],
            additional_filters=[
                ~(thursday_df["Destination Port"].isin([36180, 36182, 36184, 36186, 36188]))
            ])
#Payload filter
label_flows(thursday_df, "Web Attack - SQL Injection - Attempted", 1499348127852814000, 1499348145720612000,
            ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], attempted_category=0,
           payload_filter=True, also_flip_flow_direction=True)


label_rest_as_benign_and_write_csv(thursday_df,
    OUTPUT_PATH + "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")

# Infiltration
# 5.1 Dropbox Download
# ------------
thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH +
                    "thursday/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv")

label_flows(thursday_df, "Infiltration", 1499361542547210000, 1499366769364731000, ["192.168.10.8"], ["205.174.165.73"],
            also_flip_flow_direction=True)

#Payload filter
label_flows(thursday_df, "Infiltration - Attempted", 1499361542547210000, 1499366769364731000, ["192.168.10.8"],
            ["205.174.165.73"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)


label_flows(thursday_df, "Infiltration - Attempted", 1499361228830533000, 1499361301251276000 , ["192.168.10.9"],
            ["205.174.165.73"], attempted_category=2, also_flip_flow_direction=True)

# 5.2 Cooldisk Mac

label_flows(thursday_df, "Infiltration", 1499363616453990000, 1499371339347892000, ["192.168.10.25"], ["205.174.165.73"],
            also_flip_flow_direction=True)

#Payload filter
label_flows(thursday_df, "Infiltration - Attempted", 1499363616453990000, 1499371339347892000, ["192.168.10.25"],
            ["205.174.165.73"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)


# 5.3 NMAP + Portscan

# Round 1

label_flows(thursday_df, "Infiltration - Portscan", 1499360400000000000, 1499360460000000000, ["172.16.0.1"],
            ["192.168.10.51"], additional_filters=[
        (thursday_df["Source Port"] == 50122)
    ])

# Round 2

label_flows(thursday_df, "Infiltration - Portscan", 1499362410884008000, 1499362444285175000, ["192.168.10.8"],
            ["192.168.10.5"])

# Round 3

label_flows(thursday_df, "Infiltration - Portscan", 1499364314425162000, 1499366764331875000, ["192.168.10.8"],
            ["192.168.10.5", "192.168.10.9", "192.168.10.12", "192.168.10.14", "192.168.10.15", "192.168.10.16",
            "192.168.10.17", "192.168.10.19", "192.168.10.25", "192.168.10.50", "192.168.10.51"], additional_filters= [
        ~((thursday_df["Fwd Packet Length Max"] == 408) & (thursday_df["Destination IP"] == "192.168.10.50")) &
        ~((thursday_df["Total Length of Fwd Packets"].isin([176, 20514])) & (thursday_df["Destination IP"] == "192.168.10.50"))
    ]
)

label_rest_as_benign_and_write_csv(thursday_df,
    OUTPUT_PATH + "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv")

thursday_df = None

  thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "thursday/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")


labels before pre-processing: BENIGN                        168186
Web Attack – Brute Force        1507
Web Attack – XSS                 652
Web Attack – Sql Injection        21
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    458968
Name: Label, dtype: int64
label count after labelling:
 BENIGN                                    455536
Web Attack - Brute Force - Attempted        2660
Web Attack - XSS - Attempted                 616
Web Attack - Brute Force                      74
Web Attack - SQL Injection - Attempted        39
Web Attack - SQL Injection                    25
Web Attack - XSS                              18
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    455653
 0      3222
 4        71
 2        22
Name: Attempted Category, dtype: int64
labels before pre-processing: BENIGN          288566
Infiltration        36
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    288602
Name: Label, dtype: int

In [6]:
#-------------------+
# FRIDAY 07-07-2017 |
#-------------------+

friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "friday/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv")

# Portscan
# --------

#First round
label_flows(friday_df, "Portscan", 1499446532117090000, 1499447948582083000, ["172.16.0.1"], ["192.168.10.50"],
            also_flip_flow_direction=True)


#Second round
label_flows(friday_df, "Portscan", 1499449860000000000, 1499449919000000000, ["172.16.0.1"], ["192.168.10.50"],
            additional_filters=[
                ~(friday_df["Source Port"].isin([0, 35952, 35954, 35956, 35958]))
            ]
)

label_flows(friday_df, "Portscan", 1499449920000000000, 1499451841699238000, ["172.16.0.1"], ["192.168.10.50"])

#Putting Bot labelling in here too because Bot occurs throughout the day
label_flows(friday_df, "Botnet", 1499432653990571000, 1499436122903736000, ["192.168.10.15", "192.168.10.9",
            "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], also_flip_flow_direction=True)

#Payload filter
label_flows(friday_df, "Botnet - Attempted", 1499432653990571000, 1499436122903736000, ["192.168.10.15", "192.168.10.9",
            "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], attempted_category=0,
            payload_filter=True, also_flip_flow_direction=True)


label_flows(friday_df, "Botnet - Attempted", 1499436180000000000, 1499457684606663000, ["192.168.10.15", "192.168.10.9",
            "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], attempted_category=1, also_flip_flow_direction=True)

label_rest_as_benign_and_write_csv(friday_df,
    OUTPUT_PATH + "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv")

# Botnet
# ------

friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "friday/Friday-WorkingHours-Morning.pcap_ISCX.csv")

label_flows(friday_df, "Botnet", 1499432653990571000, 1499436122903736000, ["192.168.10.15", "192.168.10.9",
            "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], also_flip_flow_direction=True)

#Payload filter
label_flows(friday_df, "Botnet - Attempted", 1499432653990571000, 1499436122903736000, ["192.168.10.15", "192.168.10.9",
            "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], attempted_category=0,
            payload_filter=True, also_flip_flow_direction=True)

label_flows(friday_df, "Botnet - Attempted", 1499436180000000000, 1499457684606663000, ["192.168.10.15", "192.168.10.9",
            "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], attempted_category=1, also_flip_flow_direction=True)

label_rest_as_benign_and_write_csv(friday_df,
    OUTPUT_PATH + "Friday-WorkingHours-Morning.pcap_ISCX.csv")

# DDoS
# ----

friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "friday/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")

label_flows(friday_df, "DDoS", 1499453791796937000, 1499454972216560000, ["172.16.0.1"], ["192.168.10.50"],
            also_flip_flow_direction=True)

# Payload filter
label_flows(friday_df, "DDoS - Attempted", 1499453791796937000, 1499454972216560000, ["172.16.0.1"], ["192.168.10.50"],
            attempted_category=0, payload_filter=True, also_flip_flow_direction=True)

label_flows(friday_df, "DDoS - Attempted", 1499453791796937000, 1499454972216560000, ["192.168.10.50"], ["172.16.0.1"],
            attempted_category=0, additional_filters=[
        (friday_df["Total Length of Bwd Packets"] == 0)
    ])

# Putting Bot labelling in here too because Bot occurs throughout the day
label_flows(friday_df, "Botnet", 1499432653990571000, 1499436122903736000, ["192.168.10.15", "192.168.10.9",
            "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], also_flip_flow_direction=True)

#Payload filter
label_flows(friday_df, "Botnet - Attempted", 1499432653990571000, 1499436122903736000, ["192.168.10.15", "192.168.10.9",
            "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], attempted_category=0,
            payload_filter=True, also_flip_flow_direction=True)

label_flows(friday_df, "Botnet - Attempted", 1499436180000000000, 1499457684606663000, ["192.168.10.15", "192.168.10.9",
            "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], attempted_category=1, also_flip_flow_direction=True)


label_rest_as_benign_and_write_csv(friday_df, OUTPUT_PATH + "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")

labels before pre-processing: PortScan    158930
BENIGN      127537
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    286467
Name: Label, dtype: int64
label count after labelling:
 Portscan              158939
BENIGN                126905
Botnet - Attempted       623
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    285844
 1       623
Name: Attempted Category, dtype: int64
labels before pre-processing: BENIGN    189067
Bot         1966
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    191033
Name: Label, dtype: int64
label count after labelling:
 BENIGN                189071
Botnet                  1472
Botnet - Attempted       490
Name: Label, dtype: int64
Attempted Category count after labelling:
 -1    190543
 1       490
Name: Attempted Category, dtype: int64
labels before pre-processing: DDoS      128027
BENIGN     97718
Name: Label, dtype: int64
labels after pre-processing: NeedManualLabel    225745
Nam