In [49]:
import re
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.backend as K
import os

In [50]:

def parse_logs(file_paths):
    """Parse network log files into a DataFrame with one row per packet record.

    A record begins at a timestamp line (e.g. ``Thu Mar 20 23:55:13 2025``)
    and absorbs every recognised field line that follows, until the next
    timestamp line or the end of the file.  Field values are stored exactly
    as they appear in the log.

    Args:
        file_paths: Iterable of paths to text log files.

    Returns:
        pandas.DataFrame containing, for every record:
          * ``timestamp``, ``src_mac``, ``dst_mac``, ``src_ip``, ``dst_ip``:
            strings copied from the log (missing when the record had none).
          * ``src_port``, ``dst_port``: strings, ``'0'`` when missing.
          * ``packet_length``, ``payload_sum``, ``payload_len``: numbers,
            ``0`` when missing.
          * ``src_ip_num``, ``dst_ip_num``: integer encoding of the dotted
            IPv4 address, ``0`` when missing or malformed.
        Every column listed above is present even if no line in the input
        matched it (including completely empty input).
    """
    # Patterns are compiled once instead of being re-resolved for every line.
    # Runs of spaces are accepted between timestamp fields because
    # asctime-style stamps space-pad single-digit days ("Thu Mar  6 ...");
    # with a single-space pattern such a line would not start a new record.
    timestamp_re = re.compile(
        r'^[A-Za-z]{3} +[A-Za-z]{3} +\d{1,2} +\d{2}:\d{2}:\d{2} +\d{4}$')
    # MAC addresses are matched case-insensitively (a0:... and A0:...).
    mac_re = re.compile(
        r'^[0-9a-f]{2}(:[0-9a-f]{2}){5} -> [0-9a-f]{2}(:[0-9a-f]{2}){5}$',
        re.IGNORECASE)
    ipv4_re = re.compile(
        r'IPv4: (\d+\.\d+\.\d+\.\d+):(\d+) -> (\d+\.\d+\.\d+\.\d+):(\d+)')
    length_re = re.compile(r'DgmLen:(\d+)')
    hex_byte_re = re.compile(r'[0-9a-fA-F]{2}')

    data = []
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            entry = {}
            for line in file:
                line = line.strip()

                # Match timestamp (e.g., "Thu Mar 20 23:55:13 2025")
                if timestamp_re.match(line):
                    # A new timestamp closes the record collected so far.
                    if entry:
                        data.append(entry)
                        entry = {}
                    entry['timestamp'] = line

                # Match MAC addresses (e.g., "a0:59:50:93:56:70 -> 6a:61:ab:9d:ff:79")
                elif mac_re.match(line):
                    entry['src_mac'], entry['dst_mac'] = line.split(' -> ')

                # Match IPv4 information (e.g., "IPv4: 192.168.135.162:61426 -> 192.168.135.90:53")
                elif 'IPv4:' in line:
                    ip_match = ipv4_re.search(line)
                    if ip_match:
                        (entry['src_ip'], entry['src_port'],
                         entry['dst_ip'], entry['dst_port']) = ip_match.groups()

                # Match packet length information
                elif 'DgmLen:' in line:
                    length_match = length_re.search(line)
                    if length_match:
                        entry['packet_length'] = int(length_match.group(1))

                # Match payload if available
                elif 'PL(' in line and '):' in line:
                    hex_payload = line.split('):')[1].strip().split()
                    if hex_payload:
                        # Only well-formed two-digit hex tokens contribute to
                        # the sum; anything else is skipped instead of
                        # aborting the whole parse with a ValueError.
                        entry['payload_sum'] = sum(
                            int(token, 16) for token in hex_payload
                            if hex_byte_re.fullmatch(token))
                        # payload_len counts every whitespace-separated
                        # token, including any that the sum skipped.
                        entry['payload_len'] = len(hex_payload)

            # Flush the last record of this file.
            if entry:
                data.append(entry)

    # Create DataFrame while preserving all original values
    df = pd.DataFrame(data)

    # Guarantee the full schema so callers can select columns by name even
    # when the logs never contained a given field (or were empty).
    for column in ('timestamp', 'src_mac', 'dst_mac', 'src_ip', 'dst_ip'):
        if column not in df.columns:
            df[column] = None

    # Only fill missing numeric values, leave other fields as-is
    numeric_defaults = {
        'packet_length': 0,
        'payload_sum': 0,
        'payload_len': 0,
        'src_port': '0',  # Keep as string to match original format
        'dst_port': '0'
    }
    for column, default in numeric_defaults.items():
        if column not in df.columns:
            df[column] = default
    df = df.fillna(value=numeric_defaults)

    # Create numerical versions of IPs for modeling while preserving originals
    def ip_to_num(ip):
        """Encode a dotted IPv4 string as an integer; 0 if missing/malformed."""
        if pd.isna(ip):
            return 0
        try:
            value = 0
            for octet in ip.split('.'):
                value = value * 256 + int(octet)
            return value
        except (AttributeError, ValueError):
            # Not a string, or a part that is not an integer.
            return 0

    df['src_ip_num'] = df['src_ip'].apply(ip_to_num)
    df['dst_ip_num'] = df['dst_ip'].apply(ip_to_num)

    return df

In [51]:
def build_autoencoder(input_dim):
    """Construct a single-hidden-layer autoencoder.

    Args:
        input_dim: Number of features per input sample; also the width of
            the reconstruction produced by the output layer.

    Returns:
        An uncompiled Keras ``Model`` mapping an input vector to its
        reconstruction (32-unit ReLU hidden layer, sigmoid output).
    """
    inputs = Input(shape=(input_dim,))
    hidden = Dense(32, activation='relu')(inputs)
    reconstruction = Dense(input_dim, activation='sigmoid')(hidden)
    return Model(inputs, reconstruction)

In [52]:
def build_sparse_autoencoder(input_dim):
    """Construct an autoencoder whose hidden activations carry an L1 penalty.

    Args:
        input_dim: Number of features per input sample; also the width of
            the reconstruction produced by the output layer.

    Returns:
        An uncompiled Keras ``Model`` with a 32-unit ReLU hidden layer
        (L1 activity regularizer, factor ``10e-5``) and a sigmoid output.
    """
    inputs = Input(shape=(input_dim,))
    # Penalise the hidden layer's activations, not its weights.
    sparsity_penalty = regularizers.l1(10e-5)
    hidden_layer = Dense(32, activation='relu',
                         activity_regularizer=sparsity_penalty)
    hidden = hidden_layer(inputs)
    reconstruction = Dense(input_dim, activation='sigmoid')(hidden)
    return Model(inputs, reconstruction)

In [53]:
def build_variational_autoencoder(input_dim, latent_dim=16, hidden_dim=32):
    """Build an autoencoder with a sampled (variational-style) latent layer.

    The encoder produces a mean and a log-variance vector; the latent code
    is drawn as ``mean + exp(log_var / 2) * epsilon`` with ``epsilon`` taken
    from a standard normal on every forward pass.

    Args:
        input_dim: Number of features per input sample; also the width of
            the reconstruction produced by the output layer.
        latent_dim: Width of the latent code (default 16, the value that
            was previously hard-coded).
        hidden_dim: Width of the encoder and decoder hidden layers
            (default 32, the value that was previously hard-coded).

    Returns:
        An uncompiled Keras ``Model`` mapping an input vector to its
        reconstruction.

    Note:
        Only the reconstruction output is wired into the returned model; no
        KL-divergence term is attached here, so the training objective is
        whatever loss the caller compiles the model with.
    """
    # Encoder
    input_layer = Input(shape=(input_dim,))
    h = Dense(hidden_dim, activation='relu')(input_layer)
    z_mean = Dense(latent_dim)(h)
    z_log_var = Dense(latent_dim)(h)

    # Sampling function
    def sampling(args):
        mean, log_var = args
        # Batch size is only known at run time, so it is read from the
        # tensor; the latent width comes from the enclosing parameter so it
        # always agrees with the Dense layers above.
        epsilon = K.random_normal(shape=(K.shape(mean)[0], latent_dim))
        return mean + K.exp(log_var / 2) * epsilon

    z = Lambda(sampling)([z_mean, z_log_var])

    # Decoder
    decoder_h = Dense(hidden_dim, activation='relu')
    decoder_mean = Dense(input_dim, activation='sigmoid')
    h_decoded = decoder_h(z)
    x_decoded_mean = decoder_mean(h_decoded)

    vae = Model(input_layer, x_decoded_mean)
    return vae

In [54]:
def build_stacked_autoencoder(input_dim):
    """Construct a deeper autoencoder with 64-32-64 ReLU hidden layers.

    Args:
        input_dim: Number of features per input sample; also the width of
            the reconstruction produced by the output layer.

    Returns:
        An uncompiled Keras ``Model`` mapping an input vector to its
        sigmoid-activated reconstruction.
    """
    inputs = Input(shape=(input_dim,))

    # Hidden stack, applied in order: encoder 64 -> 32, then decoder 64.
    tensor = inputs
    for units in (64, 32, 64):
        tensor = Dense(units, activation='relu')(tensor)

    reconstruction = Dense(input_dim, activation='sigmoid')(tensor)
    return Model(inputs, reconstruction)

In [55]:
def detect_anomalies(reconstructions, original, threshold_percentile=95):
    """Flag samples whose reconstruction error exceeds a percentile cutoff.

    Args:
        reconstructions: 2-D array of model outputs, one row per sample.
        original: 2-D array of the inputs the model tried to reproduce,
            same shape as ``reconstructions``.
        threshold_percentile: Percentile of the per-sample errors used as
            the cutoff (default 95).

    Returns:
        Boolean array, one entry per sample; ``True`` where the sample's
        mean absolute error is strictly greater than the cutoff.
    """
    absolute_error = np.abs(original - reconstructions)
    per_sample_error = np.mean(absolute_error, axis=1)
    cutoff = np.percentile(per_sample_error, threshold_percentile)
    is_anomaly = per_sample_error > cutoff
    return is_anomaly

In [56]:

def main(log_files=None, output_path='network_anomalies.csv', epochs=50,
         batch_size=8, threshold_percentile=95):
    """Parse logs, train four autoencoders, and flag anomalies with the best.

    Steps: parse the log files, min-max scale five numeric features, train
    each autoencoder variant to reconstruct the scaled data, pick the one
    with the lowest reconstruction MSE, flag rows whose reconstruction error
    exceeds the percentile cutoff, and write the result to CSV.

    Args:
        log_files: List of log file paths.  Defaults to ``["logsys1.txt"]``.
        output_path: Destination CSV path.
        epochs: Training epochs per model.
        batch_size: Mini-batch size used for training.
        threshold_percentile: Percentile of reconstruction error above which
            a row is flagged as an anomaly.

    Returns:
        The parsed DataFrame with an added boolean ``Anomaly`` column.

    Raises:
        ValueError: If no records could be parsed from ``log_files``.
    """
    # List of log files to process
    if log_files is None:
        log_files = ["logsys1.txt"]  # Replace with your actual log file(s)

    # Parse logs while preserving original IPs and all data
    print("Parsing log files...")
    df = parse_logs(log_files)
    if df.empty:
        # Fail with a clear message instead of an opaque scaler error.
        raise ValueError("No log records were parsed; nothing to train on.")

    # Select features for anomaly detection (using numerical IPs)
    features = ['packet_length', 'src_ip_num', 'dst_ip_num', 'payload_sum', 'payload_len']
    X = df[features].values

    # Normalize features
    print("Normalizing data...")
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    n_features = X_scaled.shape[1]

    # Build and compare models
    models = {
        'AutoEncoder': build_autoencoder(n_features),
        'Sparse AutoEncoder': build_sparse_autoencoder(n_features),
        'Variational AutoEncoder': build_variational_autoencoder(n_features),
        'Stacked AutoEncoder': build_stacked_autoencoder(n_features)
    }

    results = {}
    # Reconstructions are kept per model so the winner's anomaly scores come
    # from the very same forward pass that produced its reported MSE.  A
    # second predict() would be redundant, and for the variational model
    # (which samples noise on every call) it would yield different values.
    reconstructions_by_model = {}
    print("\nTraining models...")

    for name, model in models.items():
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        start_time = time.time()
        model.fit(X_scaled, X_scaled,
                  epochs=epochs,
                  batch_size=batch_size,
                  shuffle=True,
                  validation_split=0.2,
                  verbose=0)
        training_time = time.time() - start_time

        reconstructions = model.predict(X_scaled)
        reconstructions_by_model[name] = reconstructions
        mse = mean_squared_error(X_scaled, reconstructions)
        # "Accuracy" here is simply 1 - MSE on the scaled data the model was
        # fitted on; it is a reconstruction score, not classification accuracy.
        accuracy = 1 - mse
        results[name] = {
            'Accuracy': accuracy,
            'MSE': mse,
            'Training Time': training_time
        }
        print(f"{name}: Accuracy={accuracy:.4f}, MSE={mse:.6f}, Training Time={training_time:.2f}s")

    # Select best model
    best_model_name = max(results, key=lambda k: results[k]['Accuracy'])
    print(f"\nBest Model: {best_model_name} with Accuracy {results[best_model_name]['Accuracy']:.4f}")

    # Detect anomalies with best model
    print("Detecting anomalies...")
    df['Anomaly'] = detect_anomalies(reconstructions_by_model[best_model_name],
                                     X_scaled,
                                     threshold_percentile=threshold_percentile)

    # Save results with original IPs and all data
    output_columns = ['timestamp', 'src_mac', 'dst_mac', 'src_ip', 'dst_ip',
                      'src_port', 'dst_port', 'packet_length', 'payload_len',
                      'payload_sum', 'Anomaly']

    # Ensure all columns exist (in case some logs didn't have all fields)
    for col in output_columns:
        if col not in df.columns:
            df[col] = None

    df[output_columns].to_csv(output_path, index=False)
    print(f"\nAnomaly detection complete. Results saved to '{output_path}'")

    # Show some detected anomalies
    print("\nSample anomalies detected:")
    print(df[df['Anomaly']][output_columns].head())

    return df

# Entry point: run the whole pipeline when this code is executed directly and
# bind the returned DataFrame to the module-level name `df`.
if __name__ == "__main__":
    df = main()

Parsing log files...
Normalizing data...

Training models...
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
AutoEncoder: Accuracy=0.9995, MSE=0.000527, Training Time=118.49s
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Sparse AutoEncoder: Accuracy=0.9999, MSE=0.000076, Training Time=115.43s
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Variational AutoEncoder: Accuracy=1.0000, MSE=0.000050, Training Time=116.44s
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Stacked AutoEncoder: Accuracy=1.0000, MSE=0.000029, Training Time=118.00s

Best Model: Stacked AutoEncoder with Accuracy 1.0000
Detecting anomalies...
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Anomaly detection complete. Results saved to 'network_anomalies.csv'

Sample anomalies detected:
                    timestamp            src_mac            dst_mac  \
49   Thu Mar 20 23:55:31 20