# Sampling
This notebook performs sampling of traffic packets for training/validation/testing datasets.

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import random
import os

In [2]:
# Root directory of the extracted Kitsune feature arrays. Device and file names are appended
# by plain string concatenation further down, so the trailing slash is required.
kitsune_features_path = "./kitsune_features/"
# Root directory of the per-packet CSV exports. The sample-type folder, device folder and
# file name are appended by string concatenation, so the trailing slash is required here too.
csv_data_path = "../69897e94e24170c0_UQIOT2022_A7369/data/"

In [2]:
# Seed Python's RNG so that the sampled packet indices are reproducible.
random.seed(42)

# Attack captures that exist for every individual device.
common_attack_types = [
    "ACK_Flooding", "ARP_Spoofing", "Port_Scanning",
    "Service_Detection", "SYN_Flooding", "UDP_Flooding",
]

# Traffic types available per device. Every ordinary device maps to the shared
# `common_attack_types` list; the Raspberry Pi additionally has HTTP flooding and
# telnet brute-force captures, and "General" holds the device-independent traffic.
device_attacks = dict.fromkeys(
    (
        "Cam_1",
        "Google-Nest-Mini_1",
        "Lenovo_Bulb_1",
        "Raspberry_Pi_telnet",
        "Smart_Clock_1",
        "Smartphone_1",
        "Smartphone_2",
        "SmartTV",
    ),
    common_attack_types,
)
device_attacks["Raspberry_Pi_telnet"] = common_attack_types + ["HTTP_Flooding", "Telnet-brute_Force"]
device_attacks["General"] = ["Host_Discovery", "Benign"]

## Number of Samples to Use

Requires the pcap files to have been converted to CSV (this can be done manually in Wireshark) and assumes the features have already been extracted using the Kitsune feature extractor (FE). Protocols are one-hot encoded.

In [3]:
# Number of packets available for every traffic type (rows) and device (columns).
data = {
    "Traffic_Type": [
        "ACK_Flooding", "ARP_Spoofing", "Port_Scanning", "Service_Detection", 
        "SYN_Flooding", "UDP_Flooding", "Telnet-brute_Force", "HTTP_Flooding", 
        "Host_Discovery", "Benign"
    ],
    "Cam_1": [
        1355787, 621, 4213, 7533, 797217, 1719733, 0, 0, 0, 0
    ],
    "Google-Nest-Mini_1": [
        1023660, 4275, 4223, 19233, 965978, 1270097, 0, 0, 0, 0
    ],
    "Lenovo_Bulb_1": [
        187622, 759, 4401, 5002, 187820, 297774, 0, 0, 0, 0
    ],
    "Raspberry_Pi_telnet": [
        1324669, 220, 4050, 4949, 661368, 460805, 128653, 683408, 0, 0
    ],
    "Smart_Clock_1": [
        897760, 2659, 4389, 28922, 598337, 561931, 0, 0, 0, 0
    ],
    "Smartphone_1": [
        1220486, 159, 4206, 4455, 1265998, 1478419, 0, 0, 0, 0
    ],
    "Smartphone_2": [
        1022158, 4275, 10189, 14865, 347909, 1627320, 0, 0, 0, 0
    ],
    "SmartTV": [
        1877857, 2334, 5953, 7685, 696542, 1145485, 0, 0, 0, 0
    ],
    "General": [
        0, 0, 0, 0, 0, 0, 0, 0, 25775, 17422159
    ]
}

# Create the original DataFrame
df_total_counts = pd.DataFrame(data)

# Device columns (everything except the textual "Traffic_Type" column)
numeric_cols = df_total_counts.select_dtypes(include=[np.number]).columns

# Row positions of the minority classes: ARP spoofing, port scanning, service detection,
# telnet brute force and host discovery.
minority_class_indices = [1, 2, 3, 6, 8]


def conditional_scaling(row, index, minority_fraction=0.15, majority_fraction=0.05):
    """
    Scales the counts of one traffic type by the fraction that applies to its class size.

    Parameters:
        row: Counts for one traffic type (a scalar or a Series of per-device counts).
        index: Row label of the traffic type; looked up in `minority_class_indices`.
        minority_fraction: Fraction kept when the row is a minority class.
        majority_fraction: Fraction kept when the row is a majority class.

    Returns:
        `row` multiplied by the selected fraction (not rounded).
    """
    if index in minority_class_indices:
        return row * minority_fraction  # Minority classes
    return row * majority_fraction  # Majority classes


def scale_counts(total_counts, minority_fraction, majority_fraction):
    """
    Builds a table of sample counts by scaling every row of `total_counts`.

    Parameters:
        total_counts: DataFrame of available packet counts (must contain `numeric_cols`).
        minority_fraction: Fraction taken from minority-class rows.
        majority_fraction: Fraction taken from majority-class rows.

    Returns:
        A copy of `total_counts` whose numeric columns hold the scaled, rounded counts.
    """
    scaled = total_counts.copy()
    scaled[numeric_cols] = scaled[numeric_cols].apply(
        lambda row: conditional_scaling(row[numeric_cols], row.name, minority_fraction, majority_fraction),
        axis=1,
    ).round()
    return scaled


# Training set: 70% of every minority class, 5% of every majority class
df_train_counts = scale_counts(df_total_counts, 0.7, 0.05)

# Validation/testing sets (each): 15% of every minority class, 5% of every majority class
df_test_counts = scale_counts(df_total_counts, 0.15, 0.05)

# Add a row for the sum of each column in DataFrame
df_total_counts.loc['Device Type Totals'] = df_total_counts[numeric_cols].sum()
df_train_counts.loc['Device Type Totals'] = df_train_counts[numeric_cols].sum()
df_test_counts.loc['Device Type Totals'] = df_test_counts[numeric_cols].sum()

# Add a column for the sum of each row in DataFrame
df_total_counts['Traffic Type Totals'] = df_total_counts[numeric_cols].sum(axis=1)
df_train_counts['Traffic Type Totals'] = df_train_counts[numeric_cols].sum(axis=1)
df_test_counts['Traffic Type Totals'] = df_test_counts[numeric_cols].sum(axis=1)

In [4]:
# Display the table of available packet counts; the bare trailing expression
# makes the notebook render the DataFrame as the cell output.
print("Total Counts DataFrame:")
df_total_counts

Total Counts DataFrame:


Unnamed: 0,Traffic_Type,Cam_1,Google-Nest-Mini_1,Lenovo_Bulb_1,Raspberry_Pi_telnet,Smart_Clock_1,Smartphone_1,Smartphone_2,SmartTV,General,Traffic Type Totals
0,ACK_Flooding,1355787.0,1023660.0,187622.0,1324669.0,897760.0,1220486.0,1022158.0,1877857.0,0.0,8909999.0
1,ARP_Spoofing,621.0,4275.0,759.0,220.0,2659.0,159.0,4275.0,2334.0,0.0,15302.0
2,Port_Scanning,4213.0,4223.0,4401.0,4050.0,4389.0,4206.0,10189.0,5953.0,0.0,41624.0
3,Service_Detection,7533.0,19233.0,5002.0,4949.0,28922.0,4455.0,14865.0,7685.0,0.0,92644.0
4,SYN_Flooding,797217.0,965978.0,187820.0,661368.0,598337.0,1265998.0,347909.0,696542.0,0.0,5521169.0
5,UDP_Flooding,1719733.0,1270097.0,297774.0,460805.0,561931.0,1478419.0,1627320.0,1145485.0,0.0,8561564.0
6,Telnet-brute_Force,0.0,0.0,0.0,128653.0,0.0,0.0,0.0,0.0,0.0,128653.0
7,HTTP_Flooding,0.0,0.0,0.0,683408.0,0.0,0.0,0.0,0.0,0.0,683408.0
8,Host_Discovery,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25775.0,25775.0
9,Benign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17422159.0,17422159.0


The classes are heavily imbalanced, so the large classes need to be undersampled and the small classes sampled at a much higher rate. For the training set, take 5% of each majority class (flooding attacks and benign) and 70% of each minority class (ARP spoofing, port scanning, service detection, telnet brute force, host discovery) to decrease the class imbalance. For the validation and testing sets, take a further 5% of the total samples of each majority class from those that do not exist in any other set, and for the minority classes split the remaining samples 50-50 between validation and testing (15% of the total each).

In [5]:
# Display the number of packets to draw for the training set, per traffic type and device.
print("Training Counts DataFrame:")
df_train_counts

Training Counts DataFrame:


Unnamed: 0,Traffic_Type,Cam_1,Google-Nest-Mini_1,Lenovo_Bulb_1,Raspberry_Pi_telnet,Smart_Clock_1,Smartphone_1,Smartphone_2,SmartTV,General,Traffic Type Totals
0,ACK_Flooding,67789.0,51183.0,9381.0,66233.0,44888.0,61024.0,51108.0,93893.0,0.0,445499.0
1,ARP_Spoofing,435.0,2992.0,531.0,154.0,1861.0,111.0,2992.0,1634.0,0.0,10710.0
2,Port_Scanning,2949.0,2956.0,3081.0,2835.0,3072.0,2944.0,7132.0,4167.0,0.0,29136.0
3,Service_Detection,5273.0,13463.0,3501.0,3464.0,20245.0,3118.0,10406.0,5380.0,0.0,64850.0
4,SYN_Flooding,39861.0,48299.0,9391.0,33068.0,29917.0,63300.0,17395.0,34827.0,0.0,276058.0
5,UDP_Flooding,85987.0,63505.0,14889.0,23040.0,28097.0,73921.0,81366.0,57274.0,0.0,428079.0
6,Telnet-brute_Force,0.0,0.0,0.0,90057.0,0.0,0.0,0.0,0.0,0.0,90057.0
7,HTTP_Flooding,0.0,0.0,0.0,34170.0,0.0,0.0,0.0,0.0,0.0,34170.0
8,Host_Discovery,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18042.0,18042.0
9,Benign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,871108.0,871108.0


In [6]:
# Display the number of packets to draw for the validation/testing sets,
# per traffic type and device.
print("Testing/Validation Sample Counts DataFrame:")
df_test_counts

Testing/Validation Sample Counts DataFrame:


Unnamed: 0,Traffic_Type,Cam_1,Google-Nest-Mini_1,Lenovo_Bulb_1,Raspberry_Pi_telnet,Smart_Clock_1,Smartphone_1,Smartphone_2,SmartTV,General,Traffic Type Totals
0,ACK_Flooding,67789.0,51183.0,9381.0,66233.0,44888.0,61024.0,51108.0,93893.0,0.0,445499.0
1,ARP_Spoofing,93.0,641.0,114.0,33.0,399.0,24.0,641.0,350.0,0.0,2295.0
2,Port_Scanning,632.0,633.0,660.0,608.0,658.0,631.0,1528.0,893.0,0.0,6243.0
3,Service_Detection,1130.0,2885.0,750.0,742.0,4338.0,668.0,2230.0,1153.0,0.0,13896.0
4,SYN_Flooding,39861.0,48299.0,9391.0,33068.0,29917.0,63300.0,17395.0,34827.0,0.0,276058.0
5,UDP_Flooding,85987.0,63505.0,14889.0,23040.0,28097.0,73921.0,81366.0,57274.0,0.0,428079.0
6,Telnet-brute_Force,0.0,0.0,0.0,19298.0,0.0,0.0,0.0,0.0,0.0,19298.0
7,HTTP_Flooding,0.0,0.0,0.0,34170.0,0.0,0.0,0.0,0.0,0.0,34170.0
8,Host_Discovery,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3866.0,3866.0
9,Benign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,871108.0,871108.0


Will use above numbers for building the training, validation and testing sets (validation and testing use same sizes but different samples).

## Creating Training, Validation and Testing Samples

In [7]:
# Row label of every traffic type in the count tables (same order as their "Traffic_Type" column).
traffic_row_map = {
    traffic_name: row_number
    for row_number, traffic_name in enumerate([
        "ACK_Flooding",
        "ARP_Spoofing",
        "Port_Scanning",
        "Service_Detection",
        "SYN_Flooding",
        "UDP_Flooding",
        "Telnet-brute_Force",
        "HTTP_Flooding",
        "Host_Discovery",
        "Benign",
    ])
}

# Integer class label written to the "label" column of the sampled datasets.
# Note that this numbering is independent of the row numbering above.
labels = {
    traffic_name: class_id
    for class_id, traffic_name in enumerate([
        "Benign",
        "ACK_Flooding",
        "ARP_Spoofing",
        "Port_Scanning",
        "Service_Detection",
        "SYN_Flooding",
        "UDP_Flooding",
        "HTTP_Flooding",
        "Telnet-brute_Force",
        "Host_Discovery",
    ])
}

In [None]:
# Collect every protocol name that occurs in any capture, so that all sampled
# datasets can share one set of one-hot encoded protocol columns.
possible_protocols = set()
for device, traffic_types in device_attacks.items():
    for traffic in traffic_types:
        # Benign captures are stored in a different top-level folder than attack captures
        if traffic == "Benign":
            sample_type = "benign_samples"
        else:
            sample_type = "attack_samples"
        # Only the protocol column is needed here
        df = pd.read_csv(
            f"{csv_data_path}{sample_type}/{device}/{traffic}_{device}.csv",
            encoding="latin1",
            usecols=["Protocol"],
        )
        possible_protocols |= set(df["Protocol"].unique())

In [10]:
def sample_traffic(device, traffic_type, df_total_counts, df_train_counts, df_test_counts, traffic_row_map, labels, possible_protocols, kitsune_path, csv_path, save_path):
    """
    Samples disjoint training/validation/testing packets for one device + traffic
    combination and writes each set to a CSV file.

    Every output row holds the packet length, the 100 Kitsune features, the one-hot
    encoded protocol and the class label.

    Parameters:
        device: Device type the traffic applies to (column in the count tables, folder name on disk).
        traffic_type: Type of traffic, used for locating the input files and labelling the dataset.
        df_total_counts: Table with the number of samples available per traffic type and device.
        df_train_counts: Table with the number of samples to take for the training set.
        df_test_counts: Table with the number of samples to take for each of the validation and testing sets.
        traffic_row_map: Maps a traffic type to its row label in the three count tables.
        labels: Maps a traffic type to its integer class label.
        possible_protocols: Every protocol name that needs a one-hot column.
        kitsune_path: Directory prefix containing "<device>/<traffic_type>.npy" Kitsune feature arrays.
        csv_path: Path to directory where raw packet data is stored.
        save_path: Output path prefix; "_train.csv", "_val.csv" and "_test.csv" are appended to it.

    Returns:
        None. The three datasets are written to disk.
    """
    # Number of samples available / requested for this device + traffic combination
    row_label = traffic_row_map[traffic_type]
    avail_samples = int(df_total_counts.loc[row_label, device])
    num_train_samples = min(int(df_train_counts.loc[row_label, device]), avail_samples)
    requested_eval_samples = int(df_test_counts.loc[row_label, device])

    # Validation and testing use the same size. Rounding of the requested counts can ask
    # for slightly more samples than exist, so clamp to what is left after training
    # (testing is filled first, validation absorbs any shortfall).
    leftover_samples = avail_samples - num_train_samples
    num_test_samples = min(requested_eval_samples, leftover_samples)
    num_val_samples = min(requested_eval_samples, leftover_samples - num_test_samples)

    # random.sample returns the indices in selection order, so consecutive slices of a
    # single draw are themselves disjoint random samples.
    sampled_indices = random.sample(range(avail_samples), num_train_samples + num_val_samples + num_test_samples)
    train_indices = sampled_indices[:num_train_samples]
    val_indices = sampled_indices[num_train_samples:num_train_samples + num_val_samples]
    test_indices = sampled_indices[num_train_samples + num_val_samples:]

    # Memory-map the extracted Kitsune features so only the sampled rows are read.
    # np.load parses the .npy header itself (offset, shape and dtype).
    kitsune_file = kitsune_path + device + "/" + traffic_type + ".npy"
    kitsune_features = np.load(kitsune_file, mmap_mode="r")

    sample_type = "benign_samples" if traffic_type == "Benign" else "attack_samples"
    raw_packet_data = pd.read_csv(csv_path + sample_type + "/" + device + "/" + traffic_type + "_" + device + ".csv", encoding="latin1", usecols=["Protocol", "Length"]) # Raw packet data (for protocol and length)

    # Columns for final datasets. The protocols are sorted so that the column order does
    # not depend on set iteration order, which differs between Python sessions.
    kitsune_columns = [f"kit_fe_{i}" for i in range(0, 100)]
    protocol_columns = [f"protocol_{col}" for col in sorted(possible_protocols, key=str)]
    column_order = ["length"] + kitsune_columns + protocol_columns + ["label"]

    # Function to create a dataset
    def create_dataset(indices, label, save_location):
        """Assembles the rows at `indices` into one labelled table and saves it as CSV."""
        selected_packets = raw_packet_data.iloc[indices].reset_index(drop=True)

        # Kitsune features for each packet (fails if the array does not have 100 columns)
        features = pd.DataFrame(np.asarray(kitsune_features[indices]), columns=kitsune_columns)

        # One-hot encode only the selected packets; protocols that do not occur among
        # them get an all-zero column, unknown protocols are dropped.
        protocols = pd.get_dummies(
            selected_packets["Protocol"], prefix="protocol", prefix_sep="_", dtype=int
        ).reindex(columns=protocol_columns, fill_value=0)

        dataset = pd.concat([selected_packets["Length"].rename("length"), features, protocols], axis=1)
        dataset["label"] = label

        dataset[column_order].to_csv(save_location, index=False)

    # Progress output
    print(device, traffic_type)

    # Create train, validation, and test sets
    create_dataset(train_indices, labels[traffic_type], save_path + "_train.csv")
    print("training set complete")
    create_dataset(val_indices, labels[traffic_type], save_path + "_val.csv")
    print("validation set complete")
    create_dataset(test_indices, labels[traffic_type], save_path + "_test.csv")
    print("testing set complete")
    print("")

In [11]:
# Sample and save the train/val/test CSVs for every device + traffic combination.
for device in device_attacks:
    # Each device gets its own output folder
    full_dataset_save_path = f"./sampled_datasets/{device}/"
    os.makedirs(os.path.dirname(full_dataset_save_path), exist_ok=True)

    for traffic in device_attacks[device]:
        sample_traffic(
            device=device,
            traffic_type=traffic,
            df_total_counts=df_total_counts,
            df_train_counts=df_train_counts,
            df_test_counts=df_test_counts,
            traffic_row_map=traffic_row_map,
            labels=labels,
            possible_protocols=possible_protocols,
            kitsune_path=kitsune_features_path,
            csv_path=csv_data_path,
            save_path=full_dataset_save_path + traffic,
        )

Cam_1 ACK_Flooding
training set complete
validation set complete
testing set complete

Cam_1 ARP_Spoofing
training set complete
validation set complete
testing set complete

Cam_1 Port_Scanning
training set complete
validation set complete
testing set complete

Cam_1 Service_Detection
training set complete
validation set complete
testing set complete

Cam_1 SYN_Flooding
training set complete
validation set complete
testing set complete

Cam_1 UDP_Flooding
training set complete
validation set complete
testing set complete

Google-Nest-Mini_1 ACK_Flooding
training set complete
validation set complete
testing set complete

Google-Nest-Mini_1 ARP_Spoofing
training set complete
validation set complete
testing set complete

Google-Nest-Mini_1 Port_Scanning
training set complete
validation set complete
testing set complete

Google-Nest-Mini_1 Service_Detection
training set complete
validation set complete
testing set complete

Google-Nest-Mini_1 SYN_Flooding
training set complete
validation s