In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler

# Load and preprocess data from a CSV file
def preprocess_data_from_csv(file_path):
    """Load hex-encoded packet rows from a CSV file and scale them to [0, 1].

    Only the first CSV column is used. Each cell is expected to hold
    whitespace-separated hexadecimal tokens, optionally mixed with the
    literal markers ``<head>``, ``<pkt>`` and ``</s>``, which are removed
    before parsing.

    For every row with at least 20 values, the values at positions 12-19 are
    dropped. (Per the original author these are the source/destination
    addresses of an IPv4 header; this assumes each row starts at the IP
    header -- TODO confirm against the capture format.)

    Rows are right-padded with zeros to a common width. That width is the
    token count of the longest row *before* the 8 values are removed, so
    when the longest row has 20 or more values the last columns of the
    result are padding only.
    NOTE(review): kept as-is so the feature dimension of existing models does
    not change; confirm whether the post-removal width was intended.

    Args:
        file_path: Path of the CSV file to read (no header row).

    Returns:
        A tuple ``(processed_data, scaler)``: the 2-D array produced by
        ``MinMaxScaler.fit_transform`` and the fitted scaler itself.

    Raises:
        ValueError: If a token is not valid hexadecimal.
    """
    # Read the file the caller asked for. The path used to be hard-coded
    # here, which silently ignored ``file_path``.
    data = pd.read_csv(file_path, header=None, on_bad_lines='warn')

    markers = ('<head>', '<pkt>', '</s>')
    processed_data = []
    max_length = 0

    for row in data[0]:
        for marker in markers:
            row = row.replace(marker, '')

        # str.split() with no argument already discards surrounding whitespace.
        int_values = [int(token, 16) for token in row.split()]

        # Track the raw (pre-removal) length: removal only ever shortens a
        # row, so this is the width every row ends up padded to.
        max_length = max(max_length, len(int_values))

        # Drop positions 12-19 (8 values) when the row is long enough.
        if len(int_values) >= 20:
            del int_values[12:20]

        processed_data.append(int_values)

    # Right-pad with zeros to a uniform width; ``[0] * 0`` is a no-op for
    # rows that are already full width.
    for int_values in processed_data:
        int_values.extend([0] * (max_length - len(int_values)))

    processed_data = np.array(processed_data)

    # Scale every column independently into [0, 1].
    scaler = MinMaxScaler()
    processed_data = scaler.fit_transform(processed_data)

    return processed_data, scaler

# Define Autoencoder Model
def build_autoencoder(input_dim):
    """Build and compile a fully connected autoencoder.

    The network maps ``input_dim`` features through ReLU layers of width
    64 -> 32 -> 16 (the code layer) -> 32 -> 64 and reconstructs the input
    with a sigmoid output layer of width ``input_dim``.

    Args:
        input_dim: Number of features per input sample.

    Returns:
        The Keras ``Model``, compiled with the Adam optimizer and MSE loss.
    """
    inputs = Input(shape=(input_dim,))

    # Encoder (64, 32), latent code (16), then the mirrored decoder (32, 64).
    hidden_widths = (64, 32, 16, 32, 64)
    x = inputs
    for units in hidden_widths:
        x = Dense(units, activation='relu')(x)

    # Sigmoid output: one value in (0, 1) per input feature.
    reconstruction = Dense(input_dim, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=reconstruction)
    model.compile(optimizer='adam', loss='mse')
    return model

# Train Autoencoder
def train_autoencoder(autoencoder, data, *, epochs=10, batch_size=64,
                      validation_split=0.2, patience=10):
    """Fit the autoencoder to reconstruct ``data``.

    ``data`` is used as both input and target. A fraction of it is held out
    for validation, and an ``EarlyStopping`` callback monitors ``val_loss``.

    Args:
        autoencoder: Compiled Keras model to train (modified in place).
        data: Training samples; also used as the reconstruction target.
        epochs: Maximum number of training epochs.
        batch_size: Samples per gradient update; adjust to available memory.
        validation_split: Fraction of ``data`` held out for validation.
        patience: Epochs without ``val_loss`` improvement before stopping.

    Returns:
        The ``History`` object returned by ``autoencoder.fit``.

    Note:
        The callback can only stop training when ``patience`` is smaller
        than ``epochs``. With the defaults (10 and 10) the run always uses
        the full epoch budget; pass a smaller ``patience`` or a larger
        ``epochs`` to make early stopping effective.
    """
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        mode='min',
        restore_best_weights=True,
    )

    history = autoencoder.fit(
        data, data,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        validation_split=validation_split,
        callbacks=[early_stopping],
    )

    return history

# Save Autoencoder Model
def save_autoencoder(autoencoder, filename='newest_model.keras'):
    """Write the model to disk by calling ``autoencoder.save(filename)``.

    Args:
        autoencoder: Object exposing a ``save(path)`` method (a Keras model).
        filename: Destination path; defaults to ``'newest_model.keras'``.
    """
    autoencoder.save(filename)

# Load Autoencoder Model
def load_autoencoder(filename='newest_model.keras'):
    """Load a saved model via ``tf.keras.models.load_model``.

    Args:
        filename: Path of the saved model; defaults to
            ``'newest_model.keras'``.

    Returns:
        Whatever ``tf.keras.models.load_model`` returns for ``filename``.
    """
    model = tf.keras.models.load_model(filename)
    return model


# ------------------
# Example usage
# ------------------

# Script entry point: preprocess the CSV, build the autoencoder, train it,
# and save the trained model to disk.
if __name__ == "__main__":

    # Path of the CSV file containing the training data
    csv_file_path = 'smaller_update.csv'

    # Step 1: Load and preprocess the training dataset.
    # `scaler` is the fitted MinMaxScaler returned alongside the scaled data;
    # it is not used further in this script.
    processed_data, scaler = preprocess_data_from_csv(csv_file_path)

    # Step 2: Build the autoencoder model.
    # The input width is the number of columns in the preprocessed array.
    input_dim = processed_data.shape[1]
    print("Input dimension after removing src/dst IPs:", input_dim)
    
    autoencoder = build_autoencoder(input_dim)

    # Step 3: Train the autoencoder (the returned history is kept in `history`)
    history = train_autoencoder(autoencoder, processed_data)

    # Step 4: Save the trained autoencoder
    save_autoencoder(autoencoder, filename='newest_model.keras')
    print("Finished Training! and Saved")

    # This script does not evaluate the model on any packet;
    # anomaly detection and evaluation are expected to happen separately.


Input dimension after removing src/dst IPs: 104
Epoch 1/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 611us/step - loss: 0.0191 - val_loss: 0.0168
Epoch 2/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 575us/step - loss: 0.0172 - val_loss: 0.0166
Epoch 3/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 560us/step - loss: 0.0171 - val_loss: 0.0166
Epoch 4/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 561us/step - loss: 0.0170 - val_loss: 0.0166
Epoch 5/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 561us/step - loss: 0.0169 - val_loss: 0.0165
Epoch 6/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 564us/step - loss: 0.0168 - val_loss: 0.0163
Epoch 7/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 565us/step - loss: 0.0167 - val_loss: 0.0163
Epoch 8/10
[1m250000/250000[0m [32m━━━━