# Active Learning with Uncertainty Sampling on the IDS2017

In this notebook, an active learning approach is used to progressively learn to detect attacks in the IDS2017 dataset from partially labelled data, leveraging uncertainty sampling. The fully labelled dataset is used to simulate a human oracle.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, average_precision_score, make_scorer, precision_score, accuracy_score, confusion_matrix, recall_score, f1_score, roc_auc_score
%matplotlib inline
%load_ext autoreload
%autoreload 2
# Location of the preprocessed IDS2017 CSV, relative to the notebook.
# Built with os.path.join so the path also resolves on non-Windows hosts,
# where a backslash is not a path separator.
file_path = os.path.join(
    "..", "CIC-IDS-2017", "CSVs", "GeneratedLabelledFlows",
    "TrafficLabelling", "processed", "ids2017_processed.csv",
)

def load_dataset(file_path):
    """Read the processed CSV and prepare it for the rest of the notebook.

    The ``label`` column is cast to ``category`` and an ``id`` column holding
    a copy of the row index is appended.  ``DataFrame.info()`` is printed as a
    side effect.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the extra ``id`` column.
    """
    frame = pd.read_csv(file_path).astype({'label': 'category'})
    frame['id'] = frame.index
    frame.info()
    return frame

# Lookup from the integer ``label_code`` to the human-readable class name.
# Entries are listed in ascending code order so gaps or duplicates are easy
# to spot.
attack_labels = {
    0: 'BENIGN',
    1: 'Bot',
    2: 'DDoS',
    3: 'DoS GoldenEye',
    4: 'DoS Hulk',
    5: 'DoS Slowhttptest',
    6: 'DoS slowloris',
    7: 'FTP-Patator',
    8: 'Heartbleed',
    9: 'Infiltration',
    10: 'PortScan',
    11: 'SSH-Patator',
    12: 'Web Attack - Brute Force',
    13: 'Web Attack - Sql Injection',
    14: 'Web Attack - XSS',
}

In [2]:
from imblearn.over_sampling import SMOTE

# Function to resample dataset using SMOTE
def resample_dataset(X, Y, min_samples, attack_labels):
    """Oversample every class to at least ``min_samples`` rows with SMOTE.

    The numeric target columns of ``Y`` are appended to ``X`` so they travel
    through SMOTE together with the features; the text ``label`` column is
    dropped beforehand and rebuilt afterwards from ``label_code``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature columns.
    Y : pandas.DataFrame
        Target columns.  Must contain ``label`` and ``label_code``; every
        other column must be numeric.
    min_samples : int
        Minimum number of rows each class should have after resampling.
        Classes that already have more rows keep their current size.
    attack_labels : dict
        Mapping from ``label_code`` to class name, used to rebuild ``label``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_resampled_df, Y_resampled_df)`` with a fresh ``RangeIndex``.
        ``Y_resampled_df`` holds the numeric target columns in their original
        dtypes plus a categorical ``label`` column.

    Raises
    ------
    ValueError
        If a class that has to be grown contains a single row (SMOTE needs at
        least one neighbour to interpolate with).
    """
    targets = Y.drop(columns=['label'])  # text labels cannot go through SMOTE
    n_target_cols = targets.shape[1]

    class_counts = targets['label_code'].value_counts()
    samples_number = {
        code: max(int(count), min_samples) for code, count in class_counts.items()
    }

    # SMOTE needs k_neighbors + 1 rows in every class it synthesises from.
    # Shrink k for tiny classes instead of failing inside the neighbour search.
    k_neighbors = 5  # imbalanced-learn's default
    growing_class_sizes = [int(count) for count in class_counts if count < min_samples]
    if growing_class_sizes:
        smallest = min(growing_class_sizes)
        if smallest < 2:
            raise ValueError(
                "SMOTE cannot oversample a class with a single sample; "
                "label more data for that class or lower min_samples."
            )
        k_neighbors = min(k_neighbors, smallest - 1)

    combined = pd.concat([X, targets], axis=1)
    resampler = SMOTE(
        random_state=42, sampling_strategy=samples_number, k_neighbors=k_neighbors
    )
    resampled_array, _ = resampler.fit_resample(
        combined.values, targets['label_code'].values
    )

    X_resampled_df = pd.DataFrame(resampled_array[:, :-n_target_cols], columns=X.columns)
    Y_resampled_df = pd.DataFrame(resampled_array[:, -n_target_cols:], columns=targets.columns)

    # The round-trip through a single NumPy array turns every column into one
    # common dtype.  Target columns are constant within a class, so SMOTE's
    # interpolation reproduces them exactly and the original dtypes can be
    # restored safely.
    Y_resampled_df = Y_resampled_df.astype(targets.dtypes.to_dict())
    Y_resampled_df['label'] = Y_resampled_df['label_code'].map(attack_labels).astype('category')
    return X_resampled_df, Y_resampled_df

# Function to dynamically calculate the minimum number of samples for SMOTE
def calculate_min_samples(labeled_data, base_ratio=0.1, min_value=1000):
    """Return the per-class sample floor to request from SMOTE.

    The floor is ``base_ratio`` times the number of labeled rows (truncated to
    an integer), but never less than ``min_value``.
    """
    proportional = int(len(labeled_data) * base_ratio)
    return min_value if min_value > proportional else proportional

## Preparing the dataset

In [3]:
# Load the processed CSV; load_dataset() also prints df.info() as a side effect.
df = load_dataset(file_path)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 97 columns):
 #   Column                       Dtype   
---  ------                       -----   
 0   destination_port             int64   
 1   protocol                     int64   
 2   flow_duration                int64   
 3   total_fwd_packets            int64   
 4   total_backward_packets       int64   
 5   total_length_of_fwd_packets  float64 
 6   total_length_of_bwd_packets  float64 
 7   fwd_packet_length_max        float64 
 8   fwd_packet_length_min        float64 
 9   fwd_packet_length_mean       float64 
 10  fwd_packet_length_std        float64 
 11  bwd_packet_length_max        float64 
 12  bwd_packet_length_min        float64 
 13  bwd_packet_length_mean       float64 
 14  bwd_packet_length_std        float64 
 15  flow_bytes_s                 float64 
 16  flow_packets_s               float64 
 17  flow_iat_mean                float64 
 18  flow_iat_std          

In [4]:
def replace_invalid(df):
    """Drop every row that has a NaN or +/-inf in any numeric column.

    Despite its name, nothing is replaced: offending rows are removed.
    Negative values are kept, and missing values in non-numeric columns are
    ignored.  The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose numeric columns are all finite, in their
        original order and with their original index labels.
    """
    numeric_columns = df.select_dtypes(include=[np.number]).columns

    # np.isfinite is False for NaN, +inf and -inf alike, so a single row mask
    # replaces the separate dropna pass and the per-column inf filtering (which
    # copied the whole frame once per affected column).
    finite_rows = np.isfinite(df[numeric_columns]).all(axis=1)
    return df[finite_rows]

# Rebinds `df` to the cleaned frame: rows with NaN/inf in numeric columns are dropped.
df = replace_invalid(df)

In [5]:
# Features are taken positionally (the first 79 columns); this relies on the
# column order of the processed CSV -- TODO confirm if the file layout changes.
X = df.iloc[:, :79]
# Targets: text label, integer class code and binary attack flag.
Y = df.loc[:, ['label', 'label_code', 'is_attack']]
X.info()
Y.info()
print(Y['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Index: 2827876 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   destination_port             int64  
 1   protocol                     int64  
 2   flow_duration                int64  
 3   total_fwd_packets            int64  
 4   total_backward_packets       int64  
 5   total_length_of_fwd_packets  float64
 6   total_length_of_bwd_packets  float64
 7   fwd_packet_length_max        float64
 8   fwd_packet_length_min        float64
 9   fwd_packet_length_mean       float64
 10  fwd_packet_length_std        float64
 11  bwd_packet_length_max        float64
 12  bwd_packet_length_min        float64
 13  bwd_packet_length_mean       float64
 14  bwd_packet_length_std        float64
 15  flow_bytes_s                 float64
 16  flow_packets_s               float64
 17  flow_iat_mean                float64
 18  flow_iat_std                 float64
 19  flow_

## Feature Selection

First, the columns with no variance are dropped as they have no impact on the target variables.

In [6]:
# Only the standard deviation is needed, so compute it directly on the numeric
# columns instead of calling describe() (which also computes quantiles) and
# re-running select_dtypes() on the full frame for every candidate column.
numeric_columns = X.select_dtypes(include=[np.number]).columns
std = X[numeric_columns].std()
features_no_var = std[std == 0.0].index
# `std` only covers numeric columns, so every zero-variance feature is numeric.
features_no_var_numeric = list(features_no_var)
print(features_no_var_numeric)

['bwd_psh_flags', 'bwd_urg_flags', 'fwd_avg_bytes_bulk', 'fwd_avg_packets_bulk', 'fwd_avg_bulk_rate', 'bwd_avg_bytes_bulk', 'bwd_avg_packets_bulk', 'bwd_avg_bulk_rate']


The destination port feature is dropped because it can act as a shortcut predictor and cause severe overfitting to the training set, as shown in this [paper](https://link.springer.com/chapter/10.1007/978-3-031-09484-2_2).

In [7]:
# Remove the zero-variance columns and the destination port in one pass.
columns_to_remove = [*features_no_var_numeric, 'destination_port']
X = X.drop(columns=columns_to_remove)
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2827876 entries, 0 to 2830742
Data columns (total 70 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   protocol                     int64  
 1   flow_duration                int64  
 2   total_fwd_packets            int64  
 3   total_backward_packets       int64  
 4   total_length_of_fwd_packets  float64
 5   total_length_of_bwd_packets  float64
 6   fwd_packet_length_max        float64
 7   fwd_packet_length_min        float64
 8   fwd_packet_length_mean       float64
 9   fwd_packet_length_std        float64
 10  bwd_packet_length_max        float64
 11  bwd_packet_length_min        float64
 12  bwd_packet_length_mean       float64
 13  bwd_packet_length_std        float64
 14  flow_bytes_s                 float64
 15  flow_packets_s               float64
 16  flow_iat_mean                float64
 17  flow_iat_std                 float64
 18  flow_iat_max                 float64
 19  flow_

### Remove Collinear Variables

In [8]:
def correlation_feature_selection(df, threshold=0.9):
    """Drop columns that are highly correlated with an earlier column.

    Absolute pairwise correlations are computed with ``DataFrame.corr()``.
    Only the strict upper triangle is inspected, so for each correlated pair
    the column that appears later in ``df`` is the one removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared.
    threshold : float, default 0.9
        A column is dropped when its absolute correlation with any earlier
        column is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the redundant columns.
    """
    abs_corr = df.corr().abs()
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper_only = abs_corr.where(above_diagonal)
    exceeds = (upper_only > threshold).any(axis=0)
    redundant = exceeds[exceeds].index.tolist()
    return df.drop(columns=redundant)
# Rebind X to the de-correlated feature set (default threshold 0.9) and show what is left.
X = correlation_feature_selection(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2827876 entries, 0 to 2830742
Data columns (total 39 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   protocol                     int64  
 1   flow_duration                int64  
 2   total_fwd_packets            int64  
 3   total_length_of_fwd_packets  float64
 4   fwd_packet_length_max        float64
 5   fwd_packet_length_min        float64
 6   fwd_packet_length_mean       float64
 7   bwd_packet_length_max        float64
 8   bwd_packet_length_min        float64
 9   flow_bytes_s                 float64
 10  flow_packets_s               float64
 11  flow_iat_mean                float64
 12  flow_iat_std                 float64
 13  flow_iat_min                 float64
 14  fwd_iat_min                  float64
 15  bwd_iat_total                float64
 16  bwd_iat_mean                 float64
 17  bwd_iat_std                  float64
 18  bwd_iat_max                  float64
 19  fwd_p

### Information Gain Selection

In [9]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

def oversample_minority_classes(X, Y, sample_size=1000):
    """Draw a stratified subset of the data, then balance its classes.

    A subset of ``sample_size`` rows is drawn first, stratified on
    ``Y["label_code"]``; that subset is then passed through
    ``RandomOverSampler``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    Y : pandas.DataFrame
        Target frame; only ``label_code`` is used.
    sample_size : int, default 1000
        Passed to ``train_test_split`` as ``train_size``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by
        ``RandomOverSampler.fit_resample``.
    """
    class_codes = Y["label_code"]
    # Subsample first (stratified), oversample second.
    X_subset, _, y_subset, _ = train_test_split(
        X, class_codes, train_size=sample_size, stratify=class_codes, random_state=42
    )
    balancer = RandomOverSampler(random_state=42)
    X_balanced, y_balanced = balancer.fit_resample(X_subset, y_subset)
    return X_balanced, y_balanced

def information_gain_feature_selection(X, Y, sample_size=1000, threshold=0.1, random_state=42):
    """Select features by mutual information with the binary attack target.

    Mutual information is estimated on a small, class-balanced subset (see
    ``oversample_minority_classes``) against the binary target
    ``label_code != 0``.  The full ranking is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    Y : pandas.DataFrame
        Target frame containing ``label_code``.
    sample_size : int, default 1000
        Size of the stratified subset drawn before oversampling.
    threshold : float, default 0.1
        Features whose information gain is strictly greater than this value
        are kept.
    random_state : int or None, default 42
        Seed forwarded to ``mutual_info_classif`` so that the ranking, and
        therefore the selected feature list, is reproducible across runs.

    Returns
    -------
    list of str
        Selected feature names, ordered by decreasing information gain.
    """
    # Create an oversampled subset of the data
    X_sample, y_sample = oversample_minority_classes(X, Y, sample_size)
    # Binary target: any non-zero label_code counts as an attack
    y_binary = (y_sample != 0).astype(int)
    # Seeded: the estimator is randomised, and an unseeded run can select a
    # different feature set each time the notebook is executed.
    info_gain = mutual_info_classif(X_sample, y_binary, random_state=random_state)
    info_gain_df = pd.DataFrame({'Feature': X.columns, 'Information Gain': info_gain})
    info_gain_df = info_gain_df.sort_values(by='Information Gain', ascending=False)
    print(info_gain_df)
    above_threshold = info_gain_df['Information Gain'] > threshold
    selected_features = info_gain_df.loc[above_threshold, 'Feature'].tolist()
    return selected_features

# Determine the selected features using the oversampled subset
# Rank the features on the balanced subset and keep the informative ones.
selected_features = information_gain_feature_selection(X, Y)

# Restrict the full dataset to the selected features (in ranking order).
X = X.loc[:, selected_features]

# Show the remaining feature columns.
X.info()

                        Feature  Information Gain
1                 flow_duration          0.226366
10               flow_packets_s          0.217815
3   total_length_of_fwd_packets          0.213526
11                flow_iat_mean          0.208003
9                  flow_bytes_s          0.203715
23                bwd_packets_s          0.199149
31       init_win_bytes_forward          0.186198
4         fwd_packet_length_max          0.185081
6        fwd_packet_length_mean          0.182463
12                 flow_iat_std          0.174562
7         bwd_packet_length_max          0.165083
32      init_win_bytes_backward          0.153307
18                  bwd_iat_max          0.150998
15                bwd_iat_total          0.146592
16                 bwd_iat_mean          0.145545
21            fwd_header_length          0.141878
24            min_packet_length          0.112994
5         fwd_packet_length_min          0.112187
13                 flow_iat_min          0.106014


## Split Dataset

The dataset is split into a labelled and an unlabelled part. The rows treated as unlabelled still carry their true label in the fully labelled copy of the dataset, so that copy acts as the human oracle: "querying the oracle" simply reveals the stored label.

In [10]:
# Apply SMOTE to the entire dataset before splitting
# NOTE(review): SMOTE runs on the whole dataset here, before split_dataset()
# carves out test_df. The held-out test set can therefore contain synthetic
# rows, and rows interpolated from test originals can end up in the training
# pool. Consider splitting first and resampling only the training portion --
# TODO confirm whether this is intended.
X_resampled, Y_resampled = resample_dataset(X, Y, min_samples=100000, attack_labels=attack_labels)

# Combine resampled X and Y
df_resampled = pd.concat([X_resampled, Y_resampled], axis=1)
df_resampled['id'] = range(len(df_resampled))  # Reassign IDs after resampling

In [11]:
from sklearn.model_selection import train_test_split

# Split the dataset into labeled, unlabeled, and test portions
def split_dataset(df, initial_labeled_ratio=0.05, test_size=0.2):
    """Split ``df`` into an active-learning pool and a held-out test set.

    The test set is carved out first (stratified on ``label``).  The remainder
    is split again, also stratified, into an initially labeled part
    (``Labeled == 1``) and an unlabeled part (``Labeled == 0``); both parts
    are concatenated, shuffled and re-indexed from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset; must contain a ``label`` column.
    initial_labeled_ratio : float, default 0.05
        Fraction of the non-test rows that starts out labeled.
    test_size : float, default 0.2
        Fraction of ``df`` reserved for testing.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(combined_df, test_df)``.  Only ``combined_df`` has a ``Labeled``
        column; ``test_df`` keeps the columns of ``df``.
    """
    # First split out the test data
    train_val_df, test_df = train_test_split(
        df, test_size=test_size, stratify=df['label'], random_state=42
    )
    # Then split the remaining data into labeled and unlabeled sets
    labeled_df, unlabeled_df = train_test_split(
        train_val_df,
        test_size=1 - initial_labeled_ratio,
        stratify=train_val_df['label'],
        random_state=42,
    )
    # assign() builds new frames instead of writing into the slices returned
    # by train_test_split, which avoids pandas' chained-assignment warning.
    labeled_df = labeled_df.assign(Labeled=1)      # initial small labeled portion
    unlabeled_df = unlabeled_df.assign(Labeled=0)  # everything else starts unlabeled
    combined_df = (
        pd.concat([labeled_df, unlabeled_df])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # Return an owning copy so later column writes on test_df do not warn either.
    return combined_df, test_df.copy()

# Build the shuffled pool (with its 'Labeled' flag) and the held-out test set,
# then show the pool's schema and how many rows start out labeled.
combined_df, test_df = split_dataset(df_resampled)
combined_df.info()
combined_df.Labeled.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3110618 entries, 0 to 3110617
Data columns (total 25 columns):
 #   Column                       Dtype   
---  ------                       -----   
 0   flow_duration                float64 
 1   flow_packets_s               float64 
 2   total_length_of_fwd_packets  float64 
 3   flow_iat_mean                float64 
 4   flow_bytes_s                 float64 
 5   bwd_packets_s                float64 
 6   init_win_bytes_forward       float64 
 7   fwd_packet_length_max        float64 
 8   fwd_packet_length_mean       float64 
 9   flow_iat_std                 float64 
 10  bwd_packet_length_max        float64 
 11  init_win_bytes_backward      float64 
 12  bwd_iat_max                  float64 
 13  bwd_iat_total                float64 
 14  bwd_iat_mean                 float64 
 15  fwd_header_length            float64 
 16  min_packet_length            float64 
 17  fwd_packet_length_min        float64 
 18  flow_iat_min          

Labeled
0    2955088
1     155530
Name: count, dtype: int64

In [12]:
from sklearn.preprocessing import StandardScaler

# Standardize the data and return the scaler used
def standardize_data(X_train, X_test=None):
    """Fit a ``StandardScaler`` on ``X_train`` and scale the given data.

    Parameters
    ----------
    X_train : array-like
        Data the scaler is fitted on.
    X_test : array-like, optional
        Additional data transformed with the fitted scaler.

    Returns
    -------
    tuple
        ``(X_train_scaled, scaler)`` when ``X_test`` is ``None``, otherwise
        ``(X_train_scaled, X_test_scaled, scaler)``.
    """
    fitted_scaler = StandardScaler()
    train_scaled = fitted_scaler.fit_transform(X_train)
    if X_test is None:
        return train_scaled, fitted_scaler
    test_scaled = fitted_scaler.transform(X_test)
    return train_scaled, test_scaled, fitted_scaler


## Model

In [13]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, LSTM, TimeDistributed, RepeatVector, Input
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# NOTE(review): `metrics` is not read or written anywhere else in the visible
# notebook -- candidate for removal; confirm before deleting.
metrics = {}
# Build a simple neural network model
# Define DNN model
def build_model(input_shape):
    """Build and compile the binary-classification DNN.

    Architecture: three ReLU hidden layers (64, 32 and 16 units) followed by
    a single sigmoid output unit.  Compiled with Adam (learning rate 0.001),
    binary cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    input_shape : int
        Number of input features.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    network = Sequential()
    network.add(Input(shape=(input_shape,)))
    for units in (64, 32, 16):
        network.add(Dense(units, activation='relu'))
    network.add(Dense(1, activation='sigmoid'))
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [14]:
# Modified training function with dynamic SMOTE resampling
def train_model_with_smote(combined_df, attack_labels, base_ratio=0.1, min_value=1000):
    """Train a fresh model on the currently labeled rows, balanced with SMOTE.

    Parameters
    ----------
    combined_df : pandas.DataFrame
        Active-learning pool.  Must contain the feature columns plus
        ``label``, ``label_code``, ``is_attack``, ``Labeled`` and ``id``.
        Only rows with ``Labeled == 1`` are used.
    attack_labels : dict
        Mapping from ``label_code`` to class name (forwarded to
        ``resample_dataset``).
    base_ratio, min_value
        Forwarded to ``calculate_min_samples`` to derive the per-class floor
        requested from SMOTE.

    Returns
    -------
    tuple
        ``(model, scaler)``.  ``scaler`` is a fitted ``ColumnTransformer``
        that standardises exactly the feature columns used for training.
        When given a DataFrame, its ``transform`` selects those columns by
        name and ignores any others, so callers may pass frames that still
        carry extra columns such as ``label_code``.

    Raises
    ------
    ValueError
        If ``combined_df`` has no labeled rows.
    """
    # Local import: ColumnTransformer is not part of the notebook's import cell.
    from sklearn.compose import ColumnTransformer

    labeled_data = combined_df[combined_df['Labeled'] == 1]
    if labeled_data.empty:
        raise ValueError("combined_df contains no rows with Labeled == 1")

    target_columns = ['label', 'is_attack', 'label_code']
    # 'label_code' must not be a feature: 'is_attack' is derived from it, so
    # leaving it in lets the model read the answer directly (target leakage).
    X_train = labeled_data.drop(columns=target_columns + ['Labeled', 'id'])
    Y_train = labeled_data[target_columns]

    # Calculate dynamic min_samples based on the size of the labeled data
    min_samples = calculate_min_samples(labeled_data, base_ratio, min_value)

    # Apply SMOTE to the labeled data
    X_resampled, Y_resampled = resample_dataset(X_train, Y_train, min_samples, attack_labels)

    # Standardise through a ColumnTransformer keyed on the feature names. With
    # remainder='drop', transform() accepts frames that carry additional
    # columns, so inference code that has not removed 'label_code' keeps working.
    feature_names = list(X_resampled.columns)
    scaler = ColumnTransformer(
        [('standardize', StandardScaler(), feature_names)], remainder='drop'
    )
    X_resampled_scaled = scaler.fit_transform(X_resampled)

    # Train the model
    model = build_model(X_resampled_scaled.shape[1])
    model.fit(X_resampled_scaled, Y_resampled['is_attack'], epochs=10, batch_size=32, verbose=2)

    return model, scaler

In [15]:
# Function for uncertainty sampling
def uncertainty_sampling(model, unlabeled_data, scaler, n_samples=1000):
    """Pick the unlabeled rows the model is least certain about.

    Uncertainty is measured as the distance of the predicted probability from
    0.5; the ``n_samples`` rows with the smallest distance are selected.

    Parameters
    ----------
    model
        Fitted binary classifier whose ``predict`` returns one probability per
        row.
    unlabeled_data : pandas.DataFrame
        Pool rows still unlabeled.  Must contain ``label``, ``Labeled``,
        ``id`` and ``is_attack``; these are removed before prediction.
    scaler
        Fitted transformer applied to the remaining columns.
    n_samples : int, default 1000
        Maximum number of rows to select.

    Returns
    -------
    list
        Index labels (from ``unlabeled_data.index``) of the selected rows,
        most uncertain first.  Empty when the pool is empty or ``n_samples``
        is not positive.
    """
    # Nothing to rank: return early instead of transforming/predicting on an
    # empty frame.
    if len(unlabeled_data) == 0 or n_samples <= 0:
        return []

    X_unlabeled = unlabeled_data.drop(columns=['label', 'Labeled', 'id', 'is_attack'])

    # Standardize the unlabeled data using the same scaler
    X_unlabeled_scaled = scaler.transform(X_unlabeled)

    preds = np.asarray(model.predict(X_unlabeled_scaled))
    uncertainty = np.abs(preds - 0.5).reshape(-1)
    # Stable sort keeps tied rows in pool order, so the selection is reproducible.
    uncertain_positions = np.argsort(uncertainty, kind='stable')[:n_samples]
    # Translate positions back to index labels without materialising the rows.
    return unlabeled_data.index[uncertain_positions].tolist()


In [None]:
# Active learning loop with dynamic SMOTE resampling
def active_learning_with_dynamic_smote(combined_df, test_df, n_rounds=5, attack_labels=None, base_ratio=0.1, min_value=1000):
    """Run the uncertainty-sampling active-learning loop.

    A model is trained on the initially labeled rows.  In each round the most
    uncertain unlabeled rows are marked as labeled, the model is retrained
    from scratch and its accuracy on ``test_df`` is recorded.

    Parameters
    ----------
    combined_df : pandas.DataFrame
        Active-learning pool with a ``Labeled`` flag.  **Modified in place**:
        the rows selected in each round get ``Labeled = 1``.
    test_df : pandas.DataFrame
        Held-out data containing the feature columns and ``is_attack``.
    n_rounds : int, default 5
        Maximum number of query/retrain rounds.
    attack_labels : dict
        Mapping from ``label_code`` to class name.  Required despite the
        ``None`` default.
    base_ratio, min_value
        Forwarded to ``train_model_with_smote``.

    Returns
    -------
    dict
        ``{round_number: test_accuracy}`` with 1-based round numbers.  Holds
        fewer than ``n_rounds`` entries if the unlabeled pool runs out.

    Raises
    ------
    ValueError
        If ``attack_labels`` is ``None``.
    """
    if attack_labels is None:
        # Fail before any training instead of deep inside the resampling step.
        raise ValueError("attack_labels is required to rebuild labels after SMOTE resampling")

    round_acc = {}

    # The test features and targets never change, so prepare them once.
    # errors='ignore': split_dataset() only adds 'Labeled' to the pool, so
    # test_df has no such column and a strict drop raises KeyError.
    X_test = test_df.drop(columns=['label', 'Labeled', 'id', 'is_attack'], errors='ignore')
    y_test = test_df['is_attack']

    # Initial training of the model before entering the loop
    model, scaler = train_model_with_smote(combined_df, attack_labels, base_ratio, min_value)

    for round_num in range(n_rounds):
        print(f"Round {round_num + 1} of Active Learning")

        # Unlabeled data
        unlabeled_data = combined_df[combined_df['Labeled'] == 0]
        if unlabeled_data.empty:
            print("No unlabeled samples left; stopping early.")
            break
        uncertain_indices = uncertainty_sampling(model, unlabeled_data, scaler)

        # "Label" the selected samples by setting 'Labeled' to 1
        combined_df.loc[uncertain_indices, 'Labeled'] = 1

        # Retrain the model with dynamic SMOTE resampling
        model, scaler = train_model_with_smote(combined_df, attack_labels, base_ratio, min_value)

        # The scaler is refitted every round, so the test data is rescaled too.
        X_test_scaled = scaler.transform(X_test)

        # Evaluate model performance
        _, accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
        print(f"Test Accuracy after Round {round_num + 1}: {accuracy}\n")

        # Store accuracy for this round
        round_acc[round_num + 1] = accuracy

    return round_acc

# Example usage
# Runs 10 rounds. combined_df is updated in place (its 'Labeled' flags change),
# so re-running this cell continues from the already enlarged labeled set.
round_acc = active_learning_with_dynamic_smote(combined_df, test_df, n_rounds=10, attack_labels=attack_labels, base_ratio=0.1, min_value=1000)

Epoch 1/10


In [None]:
import matplotlib.pyplot as plt

# Function to plot accuracy over rounds
def plot_accuracies(round_acc_dict, target_accuracy=0.97):
    """Plot test accuracy against the active-learning round.

    Parameters
    ----------
    round_acc_dict : dict
        Mapping ``round number -> accuracy``.
    target_accuracy : float or None, default 0.97
        Height of the dashed reference line.  Pass ``None`` to omit the line.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    rounds = list(round_acc_dict.keys())
    accuracies = list(round_acc_dict.values())
    ax.plot(rounds, accuracies, marker='.', label='Active Learning Accuracy')

    # Horizontal reference line for the desired accuracy (optional)
    if target_accuracy is not None:
        ax.axhline(
            y=target_accuracy,
            color='r',
            linestyle='--',
            label=f'Desired Accuracy = {target_accuracy}',
        )

    # Formatting
    ax.set_title('Model Accuracy by Active Learning Round')
    ax.set_xlabel('Round')
    ax.set_ylabel('Accuracy')
    ax.grid(True)
    ax.legend()
    plt.show()

# Plotting the results
# round_acc maps each round number to the test accuracy measured after it.
plot_accuracies(round_acc)
