In [1]:
!pip install gym



In [2]:
!pip install stable-baselines3[extra]



In [3]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
import gymnasium
from gymnasium import spaces
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env

In [4]:
# Read the flow-meter CSV, keep every column whose dtype is not `object`,
# then discard the two 'Unnamed' columns (presumably leftover index columns
# from earlier CSV exports — verify against the dataset).
df = (
    pd.read_csv(Path("dataset/ALLFLOWMETER_HIKARI2021.csv"))
    .select_dtypes(exclude=['object'])
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
)

  and should_run_async(code)


In [5]:
# Separate the target column from the feature matrix.
y = df['Label']
X = df.drop(columns=['Label'])

  and should_run_async(code)


In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  and should_run_async(code)


In [7]:
def drop_highly_correlated_columns(X_train, threshold=0.7):
    """Drop, in place, every column that is highly correlated with a later column.

    For each pair of columns (i, j) with i < j whose absolute correlation
    (as computed by ``DataFrame.corr``) is strictly greater than ``threshold``,
    the *earlier* column i is marked for removal. Pairs whose correlation is
    NaN (e.g. a constant column) never exceed the threshold.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame. NOTE: it is modified in place — the selected columns
        are removed from the object passed by the caller.
    threshold : float, default 0.7
        Absolute-correlation cut-off (exclusive).

    Returns
    -------
    set
        Names of the columns that were dropped, so the same columns can be
        removed from other frames (e.g. a test split).
    """
    corr_matrix = X_train.corr().abs()

    # Look only at the strict upper triangle (k=1): every unordered pair is
    # considered exactly once and the diagonal (self-correlation) is skipped.
    # This replaces a Python double loop of scalar `.iloc` lookups.
    exceeds = np.triu(corr_matrix.to_numpy() > threshold, k=1)

    # A row with any hit is an earlier column correlated with some later one.
    to_drop = set(corr_matrix.columns[exceeds.any(axis=1)])

    # Drop the identified columns from the caller's frame.
    X_train.drop(columns=to_drop, inplace=True)

    # Report what is left.
    print("Remaining columns after dropping highly correlated ones:", X_train.columns)

    return to_drop

In [8]:
# Choose the columns to drop using the training split only; the call also removes them from X_train in place.
dropped_columns = drop_highly_correlated_columns(X_train, threshold=0.7)

Remaining columns after dropping highly correlated ones: Index(['originp', 'flow_pkts_per_sec', 'down_up_ratio', 'flow_RST_flag_count',
       'flow_ACK_flag_count', 'fwd_URG_flag_count', 'bwd_URG_flag_count',
       'flow_CWR_flag_count', 'flow_ECE_flag_count', 'fwd_pkts_payload.avg',
       'bwd_pkts_payload.min', 'flow_pkts_payload.min',
       'flow_pkts_payload.std', 'bwd_iat.min', 'bwd_iat.tot', 'flow_iat.avg',
       'payload_bytes_per_second', 'bwd_subflow_bytes', 'fwd_bulk_bytes',
       'fwd_bulk_packets', 'bwd_bulk_packets', 'fwd_bulk_rate',
       'bwd_bulk_rate', 'active.std', 'idle.tot', 'idle.avg', 'idle.std',
       'fwd_init_window_size', 'bwd_init_window_size', 'fwd_last_window_size'],
      dtype='object')


In [9]:
# Remove from the test features the same columns that were dropped from the training features.
X_test = X_test.drop(columns=dropped_columns)

  and should_run_async(code)


In [10]:
def get_top_k_features(X, y, k=20, model_type="classifier", random_state=123):
    """Rank the columns of ``X`` by random-forest importance and pick the top ``k``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; ``X.columns`` supplies the feature names.
    y : array-like
        Target values passed to the forest's ``fit``.
    k : int, default 20
        Number of leading features to print and return.
    model_type : str, default "classifier"
        Only ``"classifier"`` is accepted.
    random_state : int, default 123
        Forwarded to ``RandomForestClassifier``.

    Returns
    -------
    tuple
        ``(top_k_features, feature_importance_df)`` — the list of selected
        feature names and the full importance table sorted from most to
        least important.

    Raises
    ------
    ValueError
        If ``model_type`` is anything other than ``"classifier"``.
    """
    # Guard clause: reject unsupported model types before doing any work.
    if model_type != "classifier":
        raise ValueError("model_type should be 'classifier'")

    forest = RandomForestClassifier(random_state=random_state)
    forest.fit(X, y)

    # One row per feature, ordered from most to least important.
    ranking = pd.DataFrame(
        {'Feature': X.columns, 'Importance': forest.feature_importances_}
    ).sort_values(by='Importance', ascending=False)
    print(ranking[:k])

    # Take the first k names, keeping only those that are columns of X.
    known_columns = X.columns
    leading_names = ranking['Feature'][:k].tolist()
    selected = [name for name in leading_names if name in known_columns]

    return selected, ranking

  and should_run_async(code)


In [11]:
# Rank features with a random forest fitted on the training split and keep the 20 most important.
top_k_features, feature_importance_df = get_top_k_features(X_train, y_train, k=20, model_type="classifier")

                     Feature  Importance
0                    originp    0.095590
16  payload_bytes_per_second    0.094157
14               bwd_iat.tot    0.092211
15              flow_iat.avg    0.088706
1          flow_pkts_per_sec    0.087247
9       fwd_pkts_payload.avg    0.073482
17         bwd_subflow_bytes    0.066258
12     flow_pkts_payload.std    0.064241
22             bwd_bulk_rate    0.055118
28      bwd_init_window_size    0.047781
13               bwd_iat.min    0.047473
2              down_up_ratio    0.042797
3        flow_RST_flag_count    0.036370
4        flow_ACK_flag_count    0.033751
29      fwd_last_window_size    0.020578
24                  idle.tot    0.012581
25                  idle.avg    0.010661
23                active.std    0.009060
20          bwd_bulk_packets    0.008244
11     flow_pkts_payload.min    0.005434


In [12]:
# Keep only the selected features, in importance order.
X_train = X_train.loc[:, top_k_features]

  and should_run_async(code)


In [13]:
# Give the test features the same columns, in the same order, as the training features.
X_test = X_test.loc[:, top_k_features]

  and should_run_async(code)


In [14]:
# Load a saved DQN agent from "traffic_dqn_model" (path relative to the working directory).
# NOTE(review): the model is not trained in the visible cells — presumably it was saved by
# another run; confirm it was trained on these same features in this column order.
model = DQN.load("traffic_dqn_model")

  and should_run_async(code)


In [15]:
# Show the first test observation as a raw array.
print(X_test.to_numpy()[0])

[5.39190000e+04 7.74952766e+03 1.07383728e+03 8.47363472e+03
 1.57350815e+02 3.10000000e+01 1.35000000e+02 2.69118930e+01
 0.00000000e+00 0.00000000e+00 1.07383728e+03 1.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.10000000e+01]


  and should_run_async(code)


In [16]:
# `deterministic=True` makes the DQN policy act greedily. With the default
# (`deterministic=False`) `predict` may substitute random exploration actions,
# so repeated evaluations could give different "predictions".
# The value is kept exactly as `predict` returns it, so later cells that index
# `y_pred[0]` keep working.
# NOTE(review): the features are passed unscaled — confirm the model was trained the same way.
y_pred = model.predict(X_test.values, deterministic=True)

In [23]:
# Spot-check the prediction at position 40; y_pred[0] is presumably the actions array returned by predict — TODO confirm.
print(y_pred[0][40])

0


  and should_run_async(code)


In [20]:
# True label at position 40 of the test split, for comparison with the prediction at the same position.
print(y_test.iloc[40])

0
