In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.models import Model, Sequential
from sklearn.metrics import classification_report

In [2]:
# Read the UNSW_NB15 training-set CSV into a DataFrame
dataset_path = "Datasets/UNSW_NB15_training_set.csv"
data = pd.read_csv(dataset_path)

In [3]:
# Names of the 12 input columns used by the models.
# The order listed here is the column order of the feature matrix built from it.
selected_features = [
    'dur',
    'spkts', 'dpkts',
    'sbytes', 'dbytes',
    'sload', 'dload',
    'sinpkt', 'dinpkt',
    'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
]

In [4]:
# Keep only the chosen feature columns plus the target column
columns_to_keep = [*selected_features, 'label']
data_selected = data[columns_to_keep]

In [5]:
# Separate the target column from the feature matrix
y = data_selected['label']
X = data_selected.drop(columns=['label'])

In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Fit the scaler on the training rows only, then apply that same
# transform to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Layer sizes for the autoencoder defined in the next cell
# Number of units in the middle hidden layer (the encoder's output width)
encoding_dim = 64
# Number of input columns, read from the scaled training matrix
input_dim = X_train_scaled.shape[1]

In [9]:
# Build the autoencoder layer by layer: 32 -> encoding_dim -> 32 -> input_dim
autoencoder = Sequential()
autoencoder.add(Dense(32, activation='relu', input_shape=(input_dim,)))
autoencoder.add(Dense(encoding_dim, activation='relu'))
autoencoder.add(Dense(32, activation='relu'))
# Output layer has no activation and one unit per input column
autoencoder.add(Dense(input_dim, activation=None))

In [10]:
# Train the autoencoder to reproduce its own scaled input (input and target are the same array)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(
    x=X_train_scaled,
    y=X_train_scaled,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1bfda9d0310>

In [11]:
# Disabled experiment: the whole cell is a bare string literal, so none of it executes
# (the cell merely displays the string).
# NOTE(review): as written it could not simply be re-enabled here — it reads
# X_train_encoded, which is only created in a later cell, and it indexes y_train as if
# it had several columns, while y_train is built from the single 'label' column.
'''

# Initialize a list to store resampled X_train and y_train for each output
X_train_resampled_list = []
y_train_resampled_list = []

# Get the maximum number of samples after resampling
max_samples = 0

# Loop through each output variable
for i in range(y_train.shape[1]):
    # Extract the target variable for the current output variable
    y_train_i = y_train.iloc[:, i]
    
    # Apply SMOTE to balance the classes for the current output variable
    smote = SMOTE(random_state=42)
    X_train_resampled_i, y_train_resampled_i = smote.fit_resample(X_train_encoded, y_train_i)
    
    # Update the maximum number of samples if needed
    max_samples = max(max_samples, len(X_train_resampled_i))
    
    # Append the resampled data to the list
    X_train_resampled_list.append(X_train_resampled_i)
    y_train_resampled_list.append(y_train_resampled_i)

# Pad the resampled data for each output variable to ensure they have the same number of samples
X_train_resampled_padded = []
y_train_resampled_padded = []

for i in range(len(X_train_resampled_list)):
    X_train_resampled_padded.append(np.pad(X_train_resampled_list[i], ((0, max_samples - len(X_train_resampled_list[i])), (0, 0)), mode='constant', constant_values=0))
    y_train_resampled_padded.append(np.pad(y_train_resampled_list[i], (0, max_samples - len(y_train_resampled_list[i])), mode='constant', constant_values=0))

# Concatenate the padded resampled data for all output variables
X_train_resampled = np.concatenate(X_train_resampled_padded, axis=1)
y_train_resampled = np.column_stack(y_train_resampled_padded)

'''

"\n\n# Initialize a list to store resampled X_train and y_train for each output\nX_train_resampled_list = []\ny_train_resampled_list = []\n\n# Get the maximum number of samples after resampling\nmax_samples = 0\n\n# Loop through each output variable\nfor i in range(y_train.shape[1]):\n    # Extract the target variable for the current output variable\n    y_train_i = y_train.iloc[:, i]\n    \n    # Apply SMOTE to balance the classes for the current output variable\n    smote = SMOTE(random_state=42)\n    X_train_resampled_i, y_train_resampled_i = smote.fit_resample(X_train_encoded, y_train_i)\n    \n    # Update the maximum number of samples if needed\n    max_samples = max(max_samples, len(X_train_resampled_i))\n    \n    # Append the resampled data to the list\n    X_train_resampled_list.append(X_train_resampled_i)\n    y_train_resampled_list.append(y_train_resampled_i)\n\n# Pad the resampled data for each output variable to ensure they have the same number of samples\nX_train_resampl

In [12]:
# Disabled step: the cell is a bare string literal, so SMOTE is NOT applied anywhere in
# this notebook; the classifier below is trained on the original y_train.
# NOTE(review): the quoted code reads X_train_encoded, which is only created in a later cell.
'''

# Apply SMOTE for class imbalance handling
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)

'''

'\n\n# Apply SMOTE for class imbalance handling\nsmote = SMOTE(random_state=42)\nX_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)\n\n'

In [13]:
# Reuse the first two (already trained) autoencoder layers as a feature extractor
# and run both feature matrices through it
encoder_layers = autoencoder.layers[:2]
encoder = Sequential(encoder_layers)
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)



In [14]:
# Insert a length-1 middle axis so each sample has shape (1, n_encoded_features),
# which is the input shape the LSTM below is declared with
X_train_reshaped = np.expand_dims(X_train_encoded, axis=1)
X_test_reshaped = np.expand_dims(X_test_encoded, axis=1)

In [15]:
# LSTM classifier: one 64-unit recurrent layer, dropout, then a single sigmoid unit
timesteps, n_encoded_features = X_train_reshaped.shape[1:]
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(timesteps, n_encoded_features)))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1, activation='sigmoid'))

In [16]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [17]:
# Fit the classifier on the reshaped encodings, reserving 20% of the training rows for validation
history = lstm_model.fit(
    x=X_train_reshaped,
    y=y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# Score the test set, then turn each sigmoid output into a hard 0/1 label (1 when above 0.5)
y_pred_prob = lstm_model.predict(X_test_reshaped)
y_pred_classes = np.greater(y_pred_prob, 0.5).astype(int)



In [19]:
# Build the per-class precision / recall / F1 table for the test set and print it under a heading
class_report = classification_report(y_test, y_pred_classes)
print("Classification Report:", class_report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.79      0.84     11169
           1       0.91      0.96      0.93     23900

    accuracy                           0.91     35069
   macro avg       0.90      0.88      0.89     35069
weighted avg       0.91      0.91      0.90     35069



In [20]:
# Compute loss and accuracy on the test data; evaluate() yields them in that order
# (the loss first, then the metric given to compile)
test_metrics = lstm_model.evaluate(X_test_reshaped, y_test)
test_loss, test_accuracy = test_metrics

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Test Loss: 0.2050
Test Accuracy: 0.9062


In [51]:
# Persist the trained LSTM classifier to a .keras file
keras_model_path = 'MODELS/test.keras'
lstm_model.save(keras_model_path)

In [52]:
# Save a second copy of the trained LSTM classifier to an .h5 file.
# NOTE: the .h5 extension means this is an HDF5 (legacy) file, not the native Keras
# format; the warning printed by this cell originates in saving_api.save_model.
lstm_model.save('MODELS/test.h5')

  saving_api.save_model(


In [55]:
# Save the encoder (built from the first two autoencoder layers), not the full autoencoder.
# NOTE(review): the file name 'autoencoder.h5' suggests the whole autoencoder — confirm the
# name is intended. The fitted scaler is not saved in this notebook.
encoder.save('MODELS/autoencoder.h5')



## TESTING

In [49]:
def process_test_case(test_input):
    """Classify one record as ATTACK or NORMAL.

    The record goes through the same steps as the training data: scaling with
    the fitted ``scaler``, encoding with ``encoder``, reshaping to
    ``(1, 1, n_encoded_features)``, and scoring with ``lstm_model``.

    Parameters
    ----------
    test_input : dict
        Maps each name in ``selected_features`` to a single scalar value.
        Key order does not matter; keys not in ``selected_features`` are ignored.

    Returns
    -------
    str
        ``"[1] ---> ATTACK"`` when the predicted probability is above 0.5,
        otherwise ``"[0] ---> NORMAL"``.

    Raises
    ------
    KeyError
        If ``test_input`` lacks one of the ``selected_features`` columns.
    """
    # Build a one-row frame and put its columns in the order used for the
    # training features. The scaler was fit on a DataFrame with named columns,
    # so the result must not depend on the key order of the incoming dict.
    test_df = pd.DataFrame(test_input, index=[0])[selected_features]

    # Scale the test input using the same scaler used for training data
    test_input_scaled = scaler.transform(test_df)

    # Encode the scaled test input using the encoder part of the trained autoencoder
    test_input_encoded = encoder.predict(test_input_scaled)

    # Add the length-1 middle axis the LSTM model expects
    test_input_reshaped = test_input_encoded.reshape(
        test_input_encoded.shape[0], 1, test_input_encoded.shape[1]
    )

    # Score the record and threshold the sigmoid output at 0.5
    y_pred_prob = lstm_model.predict(test_input_reshaped)
    y_pred_classes = (y_pred_prob > 0.5).astype(int)

    # Take the single prediction as a plain int rather than relying on the
    # truthiness of a one-element array
    predicted_label = int(y_pred_classes[0, 0])
    verdict = "ATTACK" if predicted_label == 1 else "NORMAL"

    # Same text as str(one-element array) + " ---> <verdict>"
    return f"[{predicted_label}] ---> {verdict}"

In [50]:
# Sample records to classify, one inner list per record
data_points = [
    [1.830248, 12, 8, 5116, 354, 20499.95508, 1355.007568, 162.58554, 249.264578, 5, 2, 2],
    [0.130145, 6, 2, 986, 86, 50528.25781, 2643.205566, 26.029, 0.002, 7, 5, 4],
    [0.353716, 6, 2, 986, 86, 18591.1875, 972.531677, 70.7432, 0.006, 7, 3, 2],
    [0.263532, 6, 2, 986, 86, 24953.32422, 1305.344238, 52.7064, 0.001, 7, 3, 2],
    [2.315174, 10, 8, 564, 354, 1755.375488, 1071.193726, 246.328778, 311.599438, 5, 1, 3]
]
# ANSWER --> 1, 0, 0, 0, 1

# Column name for each position within a record
columns = ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'sload', 'dload', 'sinpkt', 'dinpkt', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm']

# Pair every record's values with the column names to get one dict per test case
test_cases = [dict(zip(columns, values)) for values in data_points]

# Classify each test case, printing a blank line before and after its result
for case_number, case in enumerate(test_cases, 1):
    #print(f"Test Case {case_number}: {case}")
    print()
    print(process_test_case(case), end="\n\n")


[1] ---> ATTACK


[0] ---> NORMAL


[0] ---> NORMAL


[0] ---> NORMAL


[1] ---> ATTACK



In [53]:
# NOTE(review): this import sits outside the notebook's first import cell; consider moving it there.
import tensorflow as tf

# Check TensorFlow version (records the environment this notebook was run with)
print(tf.__version__)

2.13.0


In [54]:
# NOTE(review): this import sits outside the notebook's first import cell; consider moving it there.
import keras

# Check Keras version (records the environment this notebook was run with)
print(keras.__version__)

2.13.1
