# CICIDS

### **accelerated_random_forest.ipynb**

The CICIDS2017 dataset is a comprehensive dataset for network intrusion detection, created by the Canadian Institute for Cybersecurity. It includes a diverse set of attack scenarios and normal traffic, making it suitable for training and evaluating intrusion detection systems.

The dataset includes various types of attacks such as Brute Force, Heartbleed, Botnet, DoS (Denial of Service), DDoS (Distributed Denial of Service), Web attacks, and Infiltration of the network from inside.

**This notebook demonstrates how to accelerate Random Forest training using cuML's Random Forest implementation, and using Optuna for hyperparameter optimization.**

In [None]:
# Identifier of the model trained in this notebook.
model_name = "random_forest"

In [None]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
# Suppress UndefinedMetricWarning messages in every cell that runs after this one.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

## Step 1. Read data and import necessary libraries

In [None]:
import cudf
# Load the concatenated CSV into a cuDF DataFrame.
df_train = cudf.read_csv("../data/concatenated/concat.csv")

In [None]:
# Preview the first five rows.
df_train.head(5)

In [None]:
# (rows, columns) of the raw frame.
df_train.shape

In [None]:
# Column dtypes and non-null counts.
df_train.info()

In [None]:
# Summary statistics of the columns.
df_train.describe()

In [None]:
# Column names may carry surrounding whitespace; strip it so that later
# lookups such as "Flow Bytes/s" and "Label" match exactly.
df_train.columns = [column_name.strip() for column_name in df_train.columns]

In [None]:
# Show the column names after whitespace stripping.
df_train.columns

## Step 2. Data Cleaning

### A. Missing values

In [None]:
# Total number of missing cells across the whole frame.
print(df_train.isna().sum().sum())

In [None]:
# Discard every row whose "Flow Bytes/s" value is missing.
df_train = df_train.dropna(subset=["Flow Bytes/s"])

In [None]:
# Re-count missing cells after dropping rows without "Flow Bytes/s".
print(df_train.isna().sum().sum())

### B. Infinite values

In [None]:
import numpy as np

# Convert +inf / -inf to NaN first, so one dropna() call removes both the
# infinite and the missing entries.
df_train = df_train.replace([np.inf, -np.inf], np.nan)
nan_total = df_train.isna().sum().sum()
print("Number of NaNs in the dataset: ", nan_total)

df_train = df_train.dropna()
nan_total = df_train.isna().sum().sum()
print("Number of NaNs in the dataset after dropping: ", nan_total)

## Step 3. Data Preparation

### A. Normalise numeric features

In [None]:
# Get all numerical columns (these are the ones rescaled in the next cell)
numerical_columns = df_train.select_dtypes(include="number").columns

In [None]:
from cuml.preprocessing import MinMaxScaler # cuML's MinMaxScaler

# Rescale every numeric column with a scaler fitted on those same columns.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split done later in the notebook, so test-set rows influence the scaling.
# Consider fitting on the training split only.
scaler = MinMaxScaler()
df_train[numerical_columns] = scaler.fit_transform(df_train[numerical_columns])

### B. Map Labels to Multi-class

In [None]:
# Rows per label before the labels are mapped to integer codes.
df_train["Label"].value_counts()

In [None]:
# Integer class code for each traffic label; the code is the label's position
# in the tuple below (0 = BENIGN, 1-14 = attack types).
attack_mapping = {
    label_name: class_code
    for class_code, label_name in enumerate(
        (
            "BENIGN",
            "DoS Hulk",
            "PortScan",
            "DDoS",
            "DoS GoldenEye",
            "FTP-Patator",
            "SSH-Patator",
            "DoS slowloris",
            "DoS Slowhttptest",
            "Bot",
            "Web Attack � Brute Force",
            "Web Attack � XSS",
            "Infiltration",
            "Web Attack � Sql Injection",
            "Heartbleed",
        )
    )
}

# Replace the textual labels with their integer codes.
df_train["Label"] = df_train["Label"].map(attack_mapping)

In [None]:
# Rows per integer class code after the mapping.
df_train["Label"].value_counts()

### C. Data Splitting

In [None]:
# Remove every row that still contains a missing value in any column.
df_train = df_train.dropna()

In [None]:
# Total number of missing cells left after the drop above.
df_train.isna().sum().sum()

In [None]:
# Target vector: the integer class code; feature matrix: everything else.
y = df_train["Label"]
X = df_train.drop(columns=["Label"])

In [None]:
# Sanity check: count missing cells in the features and in the target.
print("Null values in X: ", X.isna().sum().sum())
print("Null values in y: ", y.isna().sum().sum())

In [None]:
from cuml.model_selection import train_test_split # cuML's train_test_split

# Hold out 20% of the rows for testing; the seed fixes the split.
# NOTE(review): no `stratify=` argument is passed, so very rare classes may be
# under-represented or absent in the test split — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### D. Undersample the large classes and apply SMOTE to balance the training data

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# imbalanced-learn validates its inputs through NumPy / scikit-learn and cannot
# consume cuDF objects, so the resampling is done on host (pandas) copies.
X_train_host = X_train.to_pandas()
y_train_host = y_train.to_pandas()

# 1. Undersample the large classes.
# Only labels 0-10 are capped; the remaining labels are left untouched here and
# are oversampled by SMOTE below. The cap is limited to the number of rows that
# actually exist, because RandomUnderSampler rejects a target larger than the
# class size.
MAX_SAMPLES_PER_CLASS = 1000
undersampling_strategy = {
    int(label): min(MAX_SAMPLES_PER_CLASS, int(count))
    for label, count in y_train_host.value_counts().items()
    if 0 <= label <= 10
}
rus = RandomUnderSampler(random_state=42, sampling_strategy=undersampling_strategy)
X_train_undersampled, y_train_undersampled = rus.fit_resample(X_train_host, y_train_host)

# 2. Oversample the minority classes ("auto" resamples every class except the
# largest one).
smote = SMOTE(random_state=42, sampling_strategy="auto")
X_balanced_host, y_balanced_host = smote.fit_resample(X_train_undersampled, y_train_undersampled)

# Move the balanced training set back to the GPU for cuML.
X_train_balanced = cudf.from_pandas(X_balanced_host)
y_train_balanced = cudf.from_pandas(y_balanced_host)

In [None]:
# Check class distribution after SMOTE
from collections import Counter


def _label_counter(labels):
    """Return a Counter of class labels for a host or cuDF label container.

    cuDF Series cannot be iterated directly, so objects exposing
    ``to_pandas()`` are copied to the host before counting.
    """
    if hasattr(labels, "to_pandas"):
        labels = labels.to_pandas()
    return Counter(labels)


print(f"Class distribution before SMOTE: {_label_counter(y_train)}")
print(f"Class distribution after SMOTE: {_label_counter(y_train_balanced)}")

## Step 4. Model

### A. Find best hyperparameters using Optuna

In [None]:
import cudf
import cupy as cp
import optuna
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.metrics import accuracy_score
from cuml.model_selection import train_test_split
# cuML provides neither f1_score / classification_report nor a
# RandomUnderSampler; use the scikit-learn and imbalanced-learn versions.
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm

In [None]:
# Create an objective function for Optuna to optimize
def objective(trial):
    """Train a cuML random forest with trial-suggested hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted forest on ``X_test`` / ``y_test``.
    """
    # NOTE(review): trials are scored on the same test split that is used for
    # the final evaluation, so the reported test metrics are optimistic.
    # Consider scoring on a separate validation split instead.

    # Define the hyperparameters to tune
    n_estimators = trial.suggest_int("n_estimators", 50, 300, step=50)
    # cuML requires an integer depth (None / unlimited is not accepted);
    # 16 is cuML's default depth.
    max_depth = trial.suggest_categorical("max_depth", [10, 16, 20, 30, 40])
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10, step=1)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4, step=1)
    # "auto" is not a valid cuML option (it used to be an alias of "sqrt").
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])

    # Create the cuML RandomForestClassifier
    rf = cuRF(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=42,
    )

    # Train the model
    rf.fit(X_train_balanced, y_train_balanced)

    # Predict and evaluate the model
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Optuna expects a plain Python float as the objective value.
    return float(accuracy)

In [None]:
# Create the Optuna study. The sampler is seeded so that repeated runs of the
# notebook explore the same sequence of hyperparameters (TPE is Optuna's
# default sampler, so only the seed changes).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and accuracy
print("Best hyperparameters found: ", study.best_params)
print("Best accuracy found: ", study.best_value)

### B. Training the final model with the best hyperparameters

In [None]:
# Build the final forest from the hyperparameters of the best Optuna trial.
best_params = study.best_params
rf_final = cuRF(
    **{
        param_name: best_params[param_name]
        for param_name in (
            "n_estimators",
            "max_depth",
            "min_samples_split",
            "min_samples_leaf",
            "max_features",
            "bootstrap",
        )
    },
    random_state=42,
)

In [None]:
# Fit the final model on the balanced training data.
rf_final.fit(X_train_balanced, y_train_balanced)

### C. Evaluating the model

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Maximum number of test rows kept per class
SAMPLE_TARGET = 1000

# imbalanced-learn works on host data, so copy the test split off the GPU.
X_test_host = X_test.to_pandas()
y_test_host = y_test.to_pandas()

# Cap every class that is present in the test split at SAMPLE_TARGET rows;
# smaller classes keep all of their rows. Building the strategy from the
# observed counts also copes with classes that are absent from the split.
undersampling_strategy_test_set = {
    int(label): min(SAMPLE_TARGET, int(count))
    for label, count in y_test_host.value_counts().sort_index().items()
}

# Undersample with imbalanced-learn's RandomUnderSampler
rus_test = RandomUnderSampler(random_state=42, sampling_strategy=undersampling_strategy_test_set)

# Balance the test set
X_test_balanced, y_test_balanced = rus_test.fit_resample(X_test_host, y_test_host)


In [None]:
# Final evaluation with cuML: predict the classes of the balanced test set.
# The predictions are stored as `y_pref_final`, the name the accuracy / F1
# cell reads.
y_pref_final = rf_final.predict(X_test_balanced)

In [None]:
# Overall accuracy and support-weighted F1 on the balanced test set.
f1 = f1_score(y_test_balanced, y_pref_final, average="weighted")
accuracy = accuracy_score(y_test_balanced, y_pref_final)

print(
    "\n".join(
        f"{metric_name}: {metric_value:.4f}"
        for metric_name, metric_value in (("Accuracy", accuracy), ("F1 Score", f1))
    )
)

In [None]:
from sklearn.metrics import classification_report

# scikit-learn needs host data: copy anything that still lives in cuDF.
# The predictions are stored under the name `y_pref_final`.
y_true_host, y_pred_host = (
    values.to_pandas() if hasattr(values, "to_pandas") else values
    for values in (y_test_balanced, y_pref_final)
)

# Pass the label codes explicitly so each row is paired with the right name
# even when a class is missing from the test set; zero_division=0 reports 0
# for such classes.
print(
    classification_report(
        y_true_host,
        y_pred_host,
        labels=list(attack_mapping.values()),
        target_names=list(attack_mapping.keys()),
        zero_division=0,
    )
)

### **Comparison with previous results:**

| Metric             | Previous    | Current     | Improvement?                           |
| ------------------ | ----------- | ----------- | -------------------------------------- |
| Accuracy           | 0.95        | 0.98        | <span style="color:#20ff20;">yes</span>|
| MA Range           | 0.95 - 0.95 | 0.89 - 0.90 | <span style="color:#ff4040;">no</span> |
| Precision range    | 0.66 - 1.00 | 0.36 - 1.00 | <span style="color:#ff4040;">no</span> |
| Recall range       | 0.57 - 1.00 | 0.55 - 1.00 | <span style="color:#ff4040;">no</span> |
| F1 range           | 0.63 - 1.00 | 0.43 - 1.00 | <span style="color:#ff4040;">no</span> |
 