In [1]:
import re
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.backend as K
import os

In [2]:
# Load and preprocess log data from multiple files
def parse_logs(file_paths):
    data = []
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            entry = {}
            for line in file:
                line = line.strip()

                if re.match(r'^[A-Za-z]{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \d{4}$', line):
                    if entry:
                        data.append(entry)
                        entry = {}
                    entry['timestamp'] = line
                elif 'Packet length' in line:
                    entry['packet_length'] = int(re.search(r'\d+', line).group())
                elif 'Source:' in line:
                    entry['source_ip'] = line.split(': ')[1]
                elif 'Dest:' in line:
                    entry['dest_ip'] = line.split(': ')[1]
                elif 'Payload (hex):' in line:
                    hex_payload = line.split(': ')[1].split()
                    entry['payload_sum'] = sum(int(byte, 16) for byte in hex_payload)
                    entry['payload_len'] = len(hex_payload)

            if entry:
                data.append(entry)

    df = pd.DataFrame(data)
    df.fillna({'source_ip': '0.0.0.0', 'dest_ip': '0.0.0.0', 'packet_length': 0, 'payload_sum': 0, 'payload_len': 0}, inplace=True)
    return df


In [3]:
# Convert IPs to numerical features
def ip_to_int(ip):
    try:
        parts = ip.split('.')
        return sum(int(part) * (256 ** i) for i, part in enumerate(reversed(parts)))
    except:
        return 0

In [4]:
# List all uploaded log files with absolute paths
log_files = [
    "log.txt",
    "log1.txt",
    "log2.txt",
    "logfile_compressed.txt",
    "test.txt"
]

In [5]:
df = parse_logs(log_files)
df['source_ip'] = df['source_ip'].apply(ip_to_int)
df['dest_ip'] = df['dest_ip'].apply(ip_to_int)

In [6]:
# Normalize features
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df[['packet_length', 'source_ip', 'dest_ip', 'payload_sum', 'payload_len']])

input_dim = df_scaled.shape[1]

In [7]:
def detect_anomalies(reconstructions, threshold_percentile=95):
    anomaly_scores = np.mean(np.abs(df_scaled - reconstructions), axis=1)
    threshold = np.percentile(anomaly_scores, threshold_percentile)
    return anomaly_scores > threshold

In [11]:
def build_autoencoder():
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(32, activation='relu')(input_layer)  # Adjust the number of units as needed
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    autoencoder = Model(input_layer, decoded)
    return autoencoder

In [9]:
def build_sparse_autoencoder():
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(32, activation='relu', activity_regularizer=regularizers.l1(10e-5))(input_layer)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    autoencoder = Model(input_layer, decoded)
    return autoencoder

def build_variational_autoencoder():
    # This is a placeholder, needs implementation for VAE
    return build_autoencoder()

def build_stacked_autoencoder():
    # This is a placeholder, needs implementation for Stacked AE
    return build_autoencoder()


In [12]:
models = {
    'AutoEncoder': build_autoencoder(),
    'Sparse AutoEncoder': build_sparse_autoencoder(),
    'Variational AutoEncoder': build_variational_autoencoder(),
    'Stacked AutoEncoder': build_stacked_autoencoder()
}

In [13]:
results = {}

In [15]:
for name, model in models.items():
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    start_time = time.time()
    model.fit(df_scaled, df_scaled, epochs=10, batch_size=8, shuffle=True, validation_split=0.2, verbose=0)
    training_time = time.time() - start_time
    reconstructions = model.predict(df_scaled)
    mse = mean_squared_error(df_scaled, reconstructions)
    accuracy = 1 - mse
    results[name] = {'Accuracy': accuracy, 'MSE': mse, 'Training Time': training_time}
    print(f"{name}: Accuracy={accuracy:.4f}, MSE={mse:.6f}, Training Time={training_time:.2f}s")

[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
AutoEncoder: Accuracy=0.9740, MSE=0.025965, Training Time=70.06s
[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Sparse AutoEncoder: Accuracy=0.9719, MSE=0.028061, Training Time=87.39s
[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Variational AutoEncoder: Accuracy=0.9742, MSE=0.025812, Training Time=73.65s
[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Stacked AutoEncoder: Accuracy=0.9742, MSE=0.025807, Training Time=78.49s


In [16]:
best_model = max(results, key=lambda k: results[k]['Accuracy'])
print(f"Best Model: {best_model} with Accuracy {results[best_model]['Accuracy']:.4f}")

Best Model: Stacked AutoEncoder with Accuracy 0.9742


In [17]:
reconstructions = models[best_model].predict(df_scaled)
df['Anomaly'] = detect_anomalies(reconstructions)
df.to_csv('log_anomalies.csv', index=False)
print("Anomaly detection complete. Results saved to log_anomalies.csv.")

[1m785/785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Anomaly detection complete. Results saved to log_anomalies.csv.
