In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load your dataset (assuming it's a CSV or a Pandas DataFrame)
df = pd.read_csv('/kaggle/input/anomaly-full/NF-UNSW-NB15.csv')

# Drop 'IPV4_SRC_ADDR' and 'IPV4_DST_ADDR'
df = df.drop(['IPV4_SRC_ADDR', 'IPV4_DST_ADDR'], axis=1)

df

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,62073,56082,6,0.0,9672,416,11,8,25,15,0,Benign
1,32284,1526,6,0.0,1776,104,6,2,25,0,0,Benign
2,21,21971,6,1.0,1842,1236,26,22,25,1111,0,Benign
3,23800,46893,6,0.0,528,8824,10,12,27,124,0,Benign
4,63062,21,6,1.0,1786,2340,32,34,25,1459,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...
1623113,1640,53,17,0.0,130,162,2,2,0,0,0,Benign
1623114,3610,21,6,1.0,2044,2404,36,34,26,0,0,Benign
1623115,4667,40725,6,0.0,320,1918,6,8,27,0,0,Benign
1623116,5641,56243,6,0.0,528,8824,10,12,27,0,0,Benign


In [3]:
# Encode categorical columns
label_encoders = {}
categorical_columns = ['PROTOCOL', 'L7_PROTO', 'TCP_FLAGS', 'Attack']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [4]:
# Splitting data into features (X) and target (y)
X = df.drop(columns=['Attack'])
y = df['Attack']

In [5]:
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

In [6]:
# Handle imbalanced classes using class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

# Alternatively, use SMOTE to oversample minority classes
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [7]:
# Build the Deep Q-Learning Neural Network (DQN)
model = Sequential()

# Input layer
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.3))

# Hidden layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))

# Output layer - assuming 10 unique classes (0-9 in the 'Attack' column)
model.add(Dense(10, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train_resampled, y_train_resampled,
                    validation_data=(X_test, y_test),
                    epochs=50,
                    batch_size=1024,
                    class_weight=class_weights_dict,  # Handling imbalance
                    callbacks=[early_stopping])

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc}")
print(f"Test Loss: {test_loss}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m12115/12115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 10ms/step - accuracy: 0.4146 - loss: 50.9873 - val_accuracy: 0.9068 - val_loss: 0.3650
Epoch 2/50
[1m12115/12115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 10ms/step - accuracy: 0.5686 - loss: 37.8775 - val_accuracy: 0.9318 - val_loss: 0.2394
Epoch 3/50
[1m12115/12115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 10ms/step - accuracy: 0.5863 - loss: 36.7821 - val_accuracy: 0.9383 - val_loss: 0.1589
Epoch 4/50
[1m12115/12115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 10ms/step - accuracy: 0.5991 - loss: 35.4349 - val_accuracy: 0.9407 - val_loss: 0.1493
Epoch 5/50
[1m12115/12115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 10ms/step - accuracy: 0.6055 - loss: 35.0752 - val_accuracy: 0.9531 - val_loss: 0.1503
Epoch 6/50
[1m12115/12115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 10ms/step - accuracy: 0.6066 - loss: 34.9177 - val_accuracy: 0.9563

In [3]:
# Load your dataset (assuming it's a CSV or a Pandas DataFrame)
df = pd.read_csv('/kaggle/input/nf-unsw-nb15-v2csv/NF-UNSW-NB15-v2.csv')

# Drop 'IPV4_SRC_ADDR' and 'IPV4_DST_ADDR'
df = df.drop(['IPV4_SRC_ADDR', 'IPV4_DST_ADDR'], axis=1)

df

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,1305,21,6,1.0,9,1,193,3,24,24,...,0,7240,0,0,0,0,0,331.0,0,Benign
1,1305,21,6,1.0,261,5,469,7,24,24,...,8688,8688,18944,74,0,0,0,230.0,0,Benign
2,1305,21,6,1.0,481,9,750,11,24,24,...,10136,10136,33792,132,0,0,0,229.0,0,Benign
3,1305,21,6,1.0,701,13,1054,15,24,24,...,11584,11584,48640,190,0,0,0,125.0,0,Benign
4,1305,21,6,1.0,1031,19,1474,21,24,24,...,14480,13032,64256,251,0,0,0,230.0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390270,58663,5190,6,0.0,1064,12,2364,14,27,27,...,10136,10136,29696,116,0,0,0,0.0,0,Benign
2390271,60977,53,17,0.0,146,2,178,2,0,0,...,0,0,0,0,19348,1,60,0.0,0,Benign
2390272,54553,80,6,7.0,994,10,8896,10,26,26,...,10136,7240,7424,29,0,0,0,0.0,0,Benign
2390273,55026,8248,6,0.0,4014,68,60268,70,27,27,...,44888,14480,8960,35,0,0,0,0.0,0,Benign


In [4]:
# Encode categorical columns
label_encoders = {}
categorical_columns = ['PROTOCOL', 'L7_PROTO', 'TCP_FLAGS', 'Attack']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [5]:
# Splitting data into features (X) and target (y)
X = df.drop(columns=['Attack'])
y = df['Attack']

In [6]:
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

In [7]:
# Handle imbalanced classes using class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

# Alternatively, use SMOTE to oversample minority classes
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [8]:
# Build the Deep Q-Learning Neural Network (DQN)
model = Sequential()

# Input layer
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.3))

# Hidden layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))

# Output layer - assuming 10 unique classes (0-9 in the 'Attack' column)
model.add(Dense(10, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train_resampled, y_train_resampled,
                    validation_data=(X_test, y_test),
                    epochs=50,
                    batch_size=1024,
                    class_weight=class_weights_dict,  # Handling imbalance
                    callbacks=[early_stopping])

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc}")
print(f"Test Loss: {test_loss}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m17932/17932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 11ms/step - accuracy: 0.5150 - loss: 42.3333 - val_accuracy: 0.9195 - val_loss: 0.4469
Epoch 2/50
[1m17932/17932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 11ms/step - accuracy: 0.6579 - loss: 30.2326 - val_accuracy: 0.8839 - val_loss: 0.4082
Epoch 3/50
[1m17932/17932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 12ms/step - accuracy: 0.6677 - loss: 29.6633 - val_accuracy: 0.8360 - val_loss: 0.4805
Epoch 4/50
[1m17932/17932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 11ms/step - accuracy: 0.6730 - loss: 29.2861 - val_accuracy: 0.9018 - val_loss: 0.3226
Epoch 5/50
[1m17932/17932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 11ms/step - accuracy: 0.6796 - loss: 29.0857 - val_accuracy: 0.8895 - val_loss: 0.3467
Epoch 6/50
[1m17932/17932[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 12ms/step - accuracy: 0.6815 - loss: 28.8258 - val_accuracy: 0.8817