In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Mount Google Drive so the dataset stored there can be read from Colab
from google.colab import drive
drive.mount('/content/drive')

# Location of the NF-UNSW-NB15 subset on the mounted drive
dataset_path = "/content/drive/My Drive/Projects (Github)/NF-UNSW-NB15/subset-NF-UNSW-NB15-dataset.csv"

# Read the comma-separated, UTF-8 encoded file into a DataFrame
df = pd.read_csv(dataset_path, encoding='utf-8', sep=',')

# Report the dimensions as (rows, columns)
print(df.shape)

# Show the first three records transposed, so that each column is listed as a
# row and nothing is cut off by the display width
print(df.iloc[:3].T)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(24347, 14)
                                        0              1              2
IPV4_SRC_ADDR                  59.166.0.8     59.166.0.6     59.166.0.7
L4_SRC_PORT                         22023          26214          10429
IPV4_DST_ADDR               149.171.126.2  149.171.126.9  149.171.126.2
L4_DST_PORT                          1577          37823           2854
PROTOCOL                                6             17              6
L7_PROTO                             36.0           11.0            0.0
IN_BYTES                             2854            536           2766
OUT_BYTES                           30622            304          25812
IN_PKTS                                46              4             44
OUT_PKTS                               48              4             46
TCP_FLAGS                              27              0           

In [None]:
# Remove rows with missing values, then remove exact duplicate rows.
# Every step assigns a new frame back to `df` rather than using inplace=True:
# inplace brings no performance benefit and prevents method chaining.

# Treat infinite values (if any) as missing so the NaN handling below catches them
df = df.replace([np.inf, -np.inf], np.nan)
# Report how many rows contain at least one NaN
print(df.isna().any(axis=1).sum(), "rows with at least one NaN to remove")
# Drop those rows (the recorded run reported 0 such rows)
df = df.dropna()


# Report how many rows are exact duplicates of an earlier row
print(df.duplicated().sum(), "duplicate rows to remove")
# Keep the first occurrence of each row and renumber the index from 0,
# discarding the old index instead of keeping it as a column
df = df.drop_duplicates().reset_index(drop=True)



0 rows with at least one NaN to remove
15 duplicate rows to remove


In [None]:
# Overview after cleaning, printed in this order:
#   1. shape     - confirms the duplicate rows have been removed
#   2. dtypes    - the data type pandas inferred for each feature
#   3. nunique() - number of distinct values per feature, for a quick analysis
for overview in (df.shape, df.dtypes, df.nunique()):
    print(overview)

(24332, 14)
IPV4_SRC_ADDR                  object
L4_SRC_PORT                     int64
IPV4_DST_ADDR                  object
L4_DST_PORT                     int64
PROTOCOL                        int64
L7_PROTO                      float64
IN_BYTES                        int64
OUT_BYTES                       int64
IN_PKTS                         int64
OUT_PKTS                        int64
TCP_FLAGS                       int64
FLOW_DURATION_MILLISECONDS      int64
Label                           int64
Attack                         object
dtype: object
IPV4_SRC_ADDR                    32
L4_SRC_PORT                   20077
IPV4_DST_ADDR                    36
L4_DST_PORT                    8174
PROTOCOL                        112
L7_PROTO                         65
IN_BYTES                       1497
OUT_BYTES                      1699
IN_PKTS                         289
OUT_PKTS                        292
TCP_FLAGS                         9
FLOW_DURATION_MILLISECONDS     2357
Label     

In [None]:
# Split each IPv4 address into its four octets, stored as separate integer
# features - perhaps useful for later.
# Source octets are added first (IP_SRC_1..4), then destination (IP_DST_1..4).
for address_col, prefix in (('IPV4_SRC_ADDR', 'IP_SRC'), ('IPV4_DST_ADDR', 'IP_DST')):
    octet_names = [f'{prefix}_{position}' for position in range(1, 5)]

    # The dotted-quad string yields four string columns, one per octet
    df[octet_names] = df[address_col].str.split(".", expand=True)

    # Convert each octet column from string to integer
    for octet_name in octet_names:
        df[octet_name] = df[octet_name].astype(int)

df.dtypes

IPV4_SRC_ADDR                  object
L4_SRC_PORT                     int64
IPV4_DST_ADDR                  object
L4_DST_PORT                     int64
PROTOCOL                        int64
L7_PROTO                      float64
IN_BYTES                        int64
OUT_BYTES                       int64
IN_PKTS                         int64
OUT_PKTS                        int64
TCP_FLAGS                       int64
FLOW_DURATION_MILLISECONDS      int64
Label                           int64
Attack                         object
IP_SRC_1                        int64
IP_SRC_2                        int64
IP_SRC_3                        int64
IP_SRC_4                        int64
IP_DST_1                        int64
IP_DST_2                        int64
IP_DST_3                        int64
IP_DST_4                        int64
dtype: object

Internet Assigned Numbers Authority (IANA) has created the RFC 6335
that introduces a uniform procedure and conventions on port numbers. Below
is an overview of the port ranges and what they can be categorised as (Cotton
et al., 2011, p. 11):
• Range 0 - 1023 are system ports and are known as well known ports
assigned by IANA. These are the ports that system processes or services
listen to. For example, port 80 is used by HTTP and port 443 is used by
HTTPS.
• Range 1024 - 49151 are user ports and are known as the registered ports
also assigned by IANA. These can be registered by software applications
for use. For example, port 3306 is registered for MySQL.
• Range 49152 - 65535 are dynamic ports and are known as private or
ephemeral ports; these are never assigned. They are used for outbound connections on a temporary basis and are chosen at random when an application needs a port for a short period of time.

We will use this information to create three broad port groups, one per range. This may or may not be useful for the analysis.


In [None]:
# Derive a port-group feature for the destination and source ports.
# pd.cut uses right-closed intervals by default, so the edges below give:
#   1 -> up to and including 1023
#   2 -> 1024 up to and including 49151
#   3 -> 49152 and above
bins = [-np.inf, 1023, 49151, np.inf]
labels = [1, 2, 3]
for port_col, group_col in (('L4_DST_PORT', 'Port_dst_group'), ('L4_SRC_PORT', 'Port_src_group')):
    df[group_col] = pd.cut(df[port_col], bins=bins, labels=labels)


In [None]:
# Shape check: the column count should now include the newly added features.
n_rows, n_cols = df.shape
print((n_rows, n_cols))


(24332, 24)


# Examination of the data and each variable

First we will consider ports

In [None]:
# Share of rows falling in each port group, source first and then destination,
# to see that every group is represented and by how much.
for group_col in ('Port_src_group', 'Port_dst_group'):
    print(df[group_col].value_counts(normalize=True))

# Sanity check of the range boundaries: list the source ports assigned to
# group 2, sorted by port number, so the lowest and highest members are visible.
in_group_two = df['Port_src_group'] == 2
test_port_range = df.loc[in_group_two, 'L4_SRC_PORT'].value_counts().sort_index()
print(test_port_range)

2    0.734670
3    0.256822
1    0.008507
Name: Port_src_group, dtype: float64
1    0.483766
2    0.419324
3    0.096909
Name: Port_dst_group, dtype: float64
1024     1
1025     1
1026     2
1028     1
1032     2
        ..
49133    1
49137    1
49147    1
49150    1
49151    1
Name: L4_SRC_PORT, Length: 14890, dtype: int64


## L7 protocol

This feature is somewhat interesting, as it has more distinct values than there are protocols available, and it also contains float values. L7 protocol is the only feature with a float data type. Unfortunately, there is no documentation on how this feature is represented numerically and, unlike the protocol feature, there is no standard for numerically assigning an application layer protocol. Below is an attempt to examine the distribution and to count how many values are actual floats (have decimals that are not zero). Ultimately nothing is done for this feature, as any interpretation of the categorical values would be pure speculation on my part.

In [None]:
# Print the relative frequency of every L7_PROTO value; max_rows=None lifts
# the display limit so the whole distribution is shown, not a truncated view.
with pd.option_context('display.max_rows', None):
    print(df['L7_PROTO'].value_counts(normalize=True).round(6))

# True where the value has a non-zero fractional part. Taking the remainder of
# a division by 1 avoids casting to int, which raises if the column holds NaN.
mask = df['L7_PROTO'].mod(1).ne(0)

# Number of distinct values that are not whole numbers, and number of distinct
# values overall (nunique ignores NaN in both cases).
count = df.loc[mask, 'L7_PROTO'].nunique()
total = df['L7_PROTO'].nunique()

# Report the figures computed from the loaded data rather than quoting fixed
# numbers in a comment, which go stale when the dataset changes.
print(count, "of", total, "unique L7_PROTO values have a non-zero fractional part")

0.000      0.318059
36.000     0.170064
5.000      0.127445
7.000      0.106444
11.000     0.093211
3.000      0.045208
37.000     0.040893
4.000      0.026056
92.000     0.025234
1.000      0.022809
13.000     0.011220
10.160     0.002754
17.000     0.001438
41.000     0.000986
2.680      0.000945
127.000    0.000781
20.000     0.000740
115.000    0.000699
175.000    0.000699
85.000     0.000575
100.000    0.000575
81.000     0.000247
10.000     0.000247
96.000     0.000164
18.000     0.000164
80.000     0.000123
139.000    0.000123
131.700    0.000123
91.000     0.000123
243.000    0.000123
114.360    0.000082
7.131      0.000082
84.000     0.000082
78.000     0.000082
174.370    0.000082
111.000    0.000082
95.360     0.000082
181.110    0.000041
174.360    0.000041
85.360     0.000041
223.000    0.000041
14.000     0.000041
47.360     0.000041
79.000     0.000041
114.000    0.000041
132.370    0.000041
8.110      0.000041
77.000     0.000041
19.360     0.000041
164.000    0.000041


## TCP flags


TCP flags are represented numerically by treating
the TCP flags field as a 6-bit field, as follows:

URG - ACK - PSH - RST- SYN - FIN

0 -------- 0 ------ 0 ------ 0 ----- 0 ----- 0


Note that the order is important and the standard principle is: URG, ACK,
PSH, RST, SYN, FIN. Having this 6-bit field order, it allows us to count in
binary. Below we can see what the value would correspond to if
only one of the fields were assigned as 1.

URG - ACK - PSH - RST- SYN - FIN

32 ------ 16 ----- 8 ------ 4 ---- 2 ---- 1


So, when we encounter a value such as 36 for TCP flags, we
are able to convert it to binary to see which flags are set. 36 in binary is
100100 (32 + 4), which means the URG and RST flags are set. Considering
that we only have 9 unique values for this feature in this subset, it could be worthwhile to consider each individually and perhaps group them into common and uncommon
combinations. For now it is left as it is.

In [None]:
# TCP flags: relative frequency of each value. The feature is only inspected
# here; nothing is changed.
tcp_flag_share = df['TCP_FLAGS'].value_counts(normalize=True)
print(tcp_flag_share)

27    0.678119
0     0.280824
19    0.032015
18    0.003247
26    0.002260
25    0.001644
2     0.001438
24    0.000247
22    0.000205
Name: TCP_FLAGS, dtype: float64


## Label and Attacks (target variable)
Label = attack or not attack (binary)
Attack = what type of attack (note that the "Benign" value represents "not an attack")


In [None]:
# Binary target: absolute counts first, then the same distribution as proportions
label_values = df['Label']
print(label_values.value_counts())
print(label_values.value_counts(normalize=True))

# Attack type counts, to verify that they are consistent with the Label totals
print(df['Attack'].value_counts())

# Leave out the 'Benign' rows so the proportions describe the attack types only
filtered_df = df.loc[df['Attack'].ne('Benign')]

# Relative frequency of each attack type among the attack rows
print(filtered_df['Attack'].value_counts(normalize=True))


0    23246
1     1086
Name: Label, dtype: int64
0    0.955367
1    0.044633
Name: Label, dtype: float64
Benign            23246
Exploits            371
Fuzzers             292
Reconnaissance      184
Generic              84
DoS                  76
Analysis             30
Backdoor             27
Shellcode            20
Worms                 2
Name: Attack, dtype: int64
Exploits          0.341621
Fuzzers           0.268877
Reconnaissance    0.169429
Generic           0.077348
DoS               0.069982
Analysis          0.027624
Backdoor          0.024862
Shellcode         0.018416
Worms             0.001842
Name: Attack, dtype: float64


In [None]:
# Finalize the feature types: these features are identifiers or codes, so they
# are better represented as categorical values than as numbers or plain strings.
categorical_features = [
    'PROTOCOL',
    'L4_SRC_PORT',
    'L4_DST_PORT',
    'IPV4_DST_ADDR',
    'IPV4_SRC_ADDR',
    'L7_PROTO',
    'TCP_FLAGS',
    'Attack',
]
target_dtypes = {feature: 'category' for feature in categorical_features}
target_dtypes['Label'] = 'int'  # the binary target stays an integer
df = df.astype(target_dtypes)

# Shorter column names, for ease of writing
short_names = {
    'IPV4_SRC_ADDR': 'IP_src',
    'IPV4_DST_ADDR': 'IP_dst',
    'L4_SRC_PORT': 'Port_src',
    'L4_DST_PORT': 'Port_dst',
    'PROTOCOL': 'Proto',
    'L7_PROTO': 'L7proto',
    'TCP_FLAGS': 'TCP_flags',
    'IN_BYTES': 'InBytes',
    'OUT_BYTES': 'OutBytes',
    'IN_PKTS': 'InPkts',
    'OUT_PKTS': 'OutPkts',
    'FLOW_DURATION_MILLISECONDS': 'FlowDurMS',
}
df = df.rename(columns=short_names)


df.dtypes

IP_src            category
Port_src          category
IP_dst            category
Port_dst          category
Proto             category
L7proto           category
InBytes              int64
OutBytes             int64
InPkts               int64
OutPkts              int64
TCP_flags         category
FlowDurMS            int64
Label                int64
Attack            category
IP_SRC_1             int64
IP_SRC_2             int64
IP_SRC_3             int64
IP_SRC_4             int64
IP_DST_1             int64
IP_DST_2             int64
IP_DST_3             int64
IP_DST_4             int64
Port_dst_group    category
Port_src_group    category
dtype: object

In [None]:
# Move the target columns to the end ('Attack', then 'Label' as the very last
# column, our y) while keeping every other column in its current order.
target_columns = ['Attack', 'Label']
feature_columns = [name for name in df.columns if name not in target_columns]
df = df.reindex(columns=feature_columns + target_columns)

df.columns

Index(['IP_src', 'Port_src', 'IP_dst', 'Port_dst', 'Proto', 'L7proto',
       'InBytes', 'OutBytes', 'InPkts', 'OutPkts', 'TCP_flags', 'FlowDurMS',
       'IP_SRC_1', 'IP_SRC_2', 'IP_SRC_3', 'IP_SRC_4', 'IP_DST_1', 'IP_DST_2',
       'IP_DST_3', 'IP_DST_4', 'Port_dst_group', 'Port_src_group', 'Attack',
       'Label'],
      dtype='object')

In [None]:
# Final look at the frame before saving: column dtypes first, then (rows, columns)
for summary in (df.dtypes, df.shape):
    print(summary)

IP_src            category
Port_src          category
IP_dst            category
Port_dst          category
Proto             category
L7proto           category
InBytes              int64
OutBytes             int64
InPkts               int64
OutPkts              int64
TCP_flags         category
FlowDurMS            int64
IP_SRC_1             int64
IP_SRC_2             int64
IP_SRC_3             int64
IP_SRC_4             int64
IP_DST_1             int64
IP_DST_2             int64
IP_DST_3             int64
IP_DST_4             int64
Port_dst_group    category
Port_src_group    category
Attack            category
Label                int64
dtype: object
(24332, 24)


In [None]:
# Destination of the cleaned dataset on the mounted drive
save_path = "/content/drive/My Drive/Projects (Github)/NF-UNSW-NB15/subset-cleanedNFdata.csv"

# index=False keeps the row index out of the file.
# NOTE(review): CSV is plain text, so pandas dtypes such as 'category' are not
# stored in the file; whoever loads it has to re-apply them.
df.to_csv(path_or_buf=save_path, index=False)
