# Sampling Strategy

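This notebook implements the sampling strategy used in the GAICIA project: the `BENIGN` class and the attack classes are each undersampled to 10,000 samples, after which the labels are converted to a binary target.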

In [1]:
import pandas as pd
import numpy as np

# Load the concatenated dataset (path is relative to the notebook's directory)
df = pd.read_csv(filepath_or_buffer="../data/concatenated/concat.csv")

In [2]:
# Keep only the two columns used below (renamed without the leading space) to speed up the process.
# Each column is looked up under its original name first and under its new name otherwise,
# so re-running this cell after `df` was already reduced does not raise a KeyError.
column_renames = {" Destination Port": "Port", " Label": "Label"}
columns_to_keep = [old if old in df.columns else new for old, new in column_renames.items()]
df = df[columns_to_keep].rename(columns=column_renames)

## Initial class distribution

In [3]:
# Number of rows per label before any resampling
df.loc[:, "Label"].value_counts()

Label
BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

### Define variables

In [4]:
# Total amount of benign samples we want after undersampling
TOTAL_BENIGN_TARGET_VALUE = 10_000

# Total amount of attack samples we want (summed over all attack labels)
TOTAL_ATTACK_TARGET_VALUE = 10_000

# Attack (non-BENIGN) labels with fewer samples than this are left as they are
ATTACK_BOTTOM_VALUE = 1_000

### Undersample BENIGN

In [5]:
from imblearn.under_sampling import RandomUnderSampler

def undersample_benign(df, target_value=None, random_state=287):
	"""
	Randomly undersample the 'BENIGN' label of the given DataFrame to a target count.

	The 'BENIGN' rows are reduced with RandomUnderSampler from the imbalanced-learn
	library only when there are more of them than the target. Only 'BENIGN' is listed
	in the sampling strategy, so the other labels keep all of their rows.

	Parameters:
	df (pd.DataFrame): The input DataFrame containing at least a 'Label' column.
	target_value (int, optional): Number of 'BENIGN' rows to keep. Defaults to the
		current value of TOTAL_BENIGN_TARGET_VALUE.
	random_state (int, optional): Seed passed to RandomUnderSampler. Defaults to 287.

	Returns:
	pd.DataFrame: A resampled DataFrame with the 'BENIGN' label undersampled, or the
		original DataFrame object when 'BENIGN' is absent or not above the target.
	"""

	# Resolve the default at call time so that changing the constant in the
	# configuration cell takes effect without re-running this definition.
	if target_value is None:
		target_value = TOTAL_BENIGN_TARGET_VALUE

	benign_count = df["Label"].value_counts().get("BENIGN", 0)

	# Nothing to undersample: return the original DataFrame untouched
	if benign_count <= target_value:
		return df

	undersample = RandomUnderSampler(sampling_strategy={"BENIGN": target_value}, random_state=random_state)
	df_resampled, _ = undersample.fit_resample(df, df["Label"])
	return df_resampled


In [6]:
# Undersample BENIGN and display the resulting number of rows per label
df_with_undersampled_benign = undersample_benign(df=df)
df_with_undersampled_benign.loc[:, "Label"].value_counts()

Label
DoS Hulk                      231073
PortScan                      158930
DDoS                          128027
DoS GoldenEye                  10293
BENIGN                         10000
FTP-Patator                     7938
SSH-Patator                     5897
DoS slowloris                   5796
DoS Slowhttptest                5499
Bot                             1966
Web Attack � Brute Force        1507
Web Attack � XSS                 652
Infiltration                      36
Web Attack � Sql Injection        21
Heartbleed                        11
Name: count, dtype: int64

`BENIGN` is now undersampled to 10,000 samples (`TOTAL_BENIGN_TARGET_VALUE`); the attack labels are unchanged.

### Handle attack classes

Now we just need to do 2 more steps:

1. First sum the samples of all attack labels that have fewer than `ATTACK_BOTTOM_VALUE` samples. We leave these labels as they are, but we need to know how many samples they contribute in total.
2. Then subtract that sum from `TOTAL_ATTACK_TARGET_VALUE` (10,000) and split the remainder evenly across the remaining attack labels, downsampling each of them to that share, so that all attack labels together add up to (approximately) 10,000 samples.

In [7]:
# 1. First sum the samples of all attack labels that have fewer than `ATTACK_BOTTOM_VALUE` samples. We leave these labels as they are, but we need to know how many samples they contribute in total.
# 2. Then subtract that sum from `TOTAL_ATTACK_TARGET_VALUE` (10,000) and split the remainder evenly across the remaining attack labels, downsampling each of them to that share, so that all attack labels together add up to (approximately) 10,000 samples.

def sample_or_leave_attack_labels(df, total_attack_target=None, attack_bottom_value=None, random_state=287):
	"""
	Undersample the non-'BENIGN' (attack) labels so that together they add up to about `total_attack_target` samples.

	Attack labels with fewer than `attack_bottom_value` samples are kept in full. Their combined
	size is subtracted from `total_attack_target`, and the remainder is split evenly (integer
	division) across the attack labels at or above the threshold. Each of those labels is
	undersampled to that share with RandomUnderSampler from the imbalanced-learn library.

	Because of the integer division the attack total can fall short of the target by fewer
	samples than there are labels above the threshold. A label above the threshold that has
	fewer samples than its share is kept in full rather than requested beyond its size.

	'BENIGN' is not part of the sampling strategy and keeps all of its rows.

	Parameters:
	df (pd.DataFrame): The input DataFrame containing at least a 'Label' column.
	total_attack_target (int, optional): Desired total number of attack samples. Defaults to
		the current value of TOTAL_ATTACK_TARGET_VALUE.
	attack_bottom_value (int, optional): Attack labels with fewer samples than this are left
		as they are. Defaults to the current value of ATTACK_BOTTOM_VALUE.
	random_state (int, optional): Seed passed to RandomUnderSampler. Defaults to 287.

	Returns:
	pd.DataFrame: A resampled DataFrame with the attack labels undersampled, or the original
		DataFrame object when no attack label reaches the threshold.

	Raises:
	ValueError: If the labels below the threshold already hold more samples than `total_attack_target`.
	"""

	# Resolve the defaults at call time so that changing the constants in the
	# configuration cell takes effect without re-running this definition.
	if total_attack_target is None:
		total_attack_target = TOTAL_ATTACK_TARGET_VALUE
	if attack_bottom_value is None:
		attack_bottom_value = ATTACK_BOTTOM_VALUE

	attack_label_counts = df["Label"].value_counts().loc[lambda counts: counts.index != "BENIGN"]
	is_above_threshold = attack_label_counts >= attack_bottom_value
	attack_labels_above_threshold = attack_label_counts[is_above_threshold]

	# Nothing to downsample (this also avoids dividing by zero below)
	if attack_labels_above_threshold.empty:
		return df

	# Samples held by the labels we leave untouched
	amount_of_attack_samples_below_threshold = int(attack_label_counts[~is_above_threshold].sum())

	# Budget that remains for the labels above the threshold
	leftover_attack_target_value = total_attack_target - amount_of_attack_samples_below_threshold
	if leftover_attack_target_value < 0:
		raise ValueError(
			f"Attack labels below the threshold already hold {amount_of_attack_samples_below_threshold} samples, "
			f"which exceeds the attack target of {total_attack_target}."
		)

	# Even share per label above the threshold
	per_label_target = leftover_attack_target_value // len(attack_labels_above_threshold)

	# Labels below the threshold keep their full count; labels above it get the even share,
	# capped at their own size because under-sampling cannot return more rows than exist.
	undersample_strategy = {
		label: min(int(count), per_label_target) if count >= attack_bottom_value else int(count)
		for label, count in attack_label_counts.items()
	}

	undersample = RandomUnderSampler(sampling_strategy=undersample_strategy, random_state=random_state)
	df_resampled, _ = undersample.fit_resample(df, df["Label"])
	return df_resampled

# Undersample the attack labels and display the resulting number of rows per label
df_with_undersampled_attack = sample_or_leave_attack_labels(df=df_with_undersampled_benign)
df_with_undersampled_attack.loc[:, "Label"].value_counts()

Label
BENIGN                        10000
Bot                             928
DDoS                            928
DoS GoldenEye                   928
DoS Hulk                        928
DoS Slowhttptest                928
DoS slowloris                   928
FTP-Patator                     928
PortScan                        928
SSH-Patator                     928
Web Attack � Brute Force        928
Web Attack � XSS                652
Infiltration                     36
Web Attack � Sql Injection       21
Heartbleed                       11
Name: count, dtype: int64

### Convert to binary classification

In [8]:
# Convert "Label" to a binary target: 0 for "BENIGN", 1 for every attack label.
# The value 0 is treated as benign as well, so re-running this cell on labels that were
# already converted leaves them unchanged instead of turning every row into 1.
df_with_undersampled_attack["Label"] = (
	~df_with_undersampled_attack["Label"].isin(["BENIGN", 0])
).astype("int64")
df_with_undersampled_attack["Label"].value_counts()

Label
0    10000
1    10000
Name: count, dtype: int64