In [6]:
import ipaddress
import os
import re
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import tensorflow.keras.backend as K
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [7]:
class NetworkAnomalyDetector:
    """Autoencoder-based anomaly detector for plain-text network logs.

    Workflow:
        1. ``train(paths)`` parses the logs, derives a numeric feature matrix,
           fits a ``MinMaxScaler`` and four autoencoder variants, keeps the one
           with the lowest reconstruction MSE and learns an anomaly threshold
           (the ``threshold_percentile``-th percentile of the training scores).
        2. ``analyze_new_data(paths)`` parses further logs, projects them onto
           the *same* feature columns and flags records whose reconstruction
           error exceeds the threshold learned during training.

    Attributes set by ``train`` / ``_analyze_features``:
        feature_columns: ordered list of columns fed to the models.
        categorical_columns: source columns that were one-hot encoded.
        encoded_columns: names of the generated one-hot columns.
        best_model / best_model_name: the selected autoencoder.
        threshold: anomaly-score cut-off learned on the training data.
        results: per-model metrics from the last ``train`` call.
    """

    # Columns that identify a record; they are never used as raw features.
    _IDENTIFIER_COLUMNS = ('timestamp', 'src_mac', 'dst_mac', 'src_ip', 'dst_ip')
    # Address columns that get a numeric ``<name>_num`` companion feature.
    _IP_COLUMNS = ('src_ip', 'dst_ip')
    # A non-numeric column is one-hot encoded only below this many distinct values.
    _MAX_CATEGORIES = 20

    # "Mon Jan  5 10:00:00 2024" (whole line) or "2024-01-05 10:00:00..." (prefix).
    _TIMESTAMP_RE = re.compile(
        r'^(?:[A-Za-z]{3} [A-Za-z]{3}\s+\d{1,2} \d{2}:\d{2}:\d{2} \d{4}$'
        r'|\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
    )
    _MAC_RE = re.compile(
        r'^((?:[0-9a-f]{2}:){5}[0-9a-f]{2})'
        r'(?: -> ((?:[0-9a-f]{2}:){5}[0-9a-f]{2}))?$',
        re.IGNORECASE,
    )
    _LENGTH_RE = re.compile(r'(?:DgmLen:|Length:)\s*(\d+)', re.IGNORECASE)
    _PAYLOAD_RE = re.compile(r'(PL\(|Payload:).*([0-9a-f]{2}\s*)+', re.IGNORECASE)
    _HEX_BYTE_RE = re.compile(r'[0-9a-f]{2}', re.IGNORECASE)
    _PROTOCOL_RE = re.compile(r'(Proto:|Protocol:)\s*\w+', re.IGNORECASE)
    _LAST_WORD_RE = re.compile(r'\w+$')
    # "<endpoint> -> <endpoint>", optionally prefixed with "IPv4:" / "IPv6:".
    _ADDRESS_PAIR_RE = re.compile(
        r'^(?:IPv4:|IPv6:)?\s*([0-9A-Fa-f.:\[\]]+) -> ([0-9A-Fa-f.:\[\]]+)'
    )

    def __init__(self, threshold_percentile=95):
        """Create an untrained detector.

        Args:
            threshold_percentile: percentile (0-100) of the training anomaly
                scores used as the anomaly cut-off.
        """
        self.threshold_percentile = threshold_percentile
        self.scaler = MinMaxScaler()
        self.models = {}
        self.feature_processor = None
        self.best_model = None
        self.best_model_name = None
        self.feature_columns = None
        self.categorical_columns = []
        self.encoded_columns = []
        self.threshold = None
        self.results = {}

    # ------------------------------------------------------------------ parsing

    def parse_logs(self, file_paths, fit_features=True):
        """Parse log files into a DataFrame with one row per timestamped record.

        A line that looks like a timestamp starts a new record; the following
        lines contribute MAC addresses, IP endpoints, packet length, payload
        statistics and protocol to that record.

        Args:
            file_paths: iterable of paths, or a single path.
            fit_features: when True (default) the feature schema is re-derived
                from this data via ``_analyze_features``. When False the
                previously fitted schema is applied instead, so already
                trained models keep receiving the columns they were fitted on.

        Returns:
            pandas.DataFrame of parsed records plus derived feature columns.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(file_paths, (str, os.PathLike)):
            file_paths = [file_paths]

        records = []
        for file_path in file_paths:
            # Logs may contain stray non-UTF-8 bytes; do not abort on them.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
                entry = {}
                for raw_line in file:
                    line = raw_line.strip()
                    if not line:
                        continue
                    if self._TIMESTAMP_RE.match(line):
                        if entry:
                            records.append(entry)
                        entry = {'timestamp': line}
                    else:
                        self._parse_field(line, entry)
                if entry:
                    records.append(entry)

        df = pd.DataFrame(records)
        if fit_features:
            self._analyze_features(df)
        elif self.feature_columns:
            self._prepare_features(df)
        return df

    def _parse_field(self, line, entry):
        """Update ``entry`` in place with whatever field ``line`` carries."""
        mac_match = self._MAC_RE.match(line)
        if mac_match:
            src_mac, dst_mac = mac_match.groups()
            if dst_mac is not None:
                entry['src_mac'] = src_mac.lower()
                entry['dst_mac'] = dst_mac.lower()
            else:
                entry['mac'] = src_mac.lower()
            return

        # Keyword-prefixed fields are tested before the generic address
        # pattern so that e.g. a lower-case "dgmlen:" is not mistaken for an
        # address (its first letter is a hex digit).
        length_match = self._LENGTH_RE.match(line)
        if length_match:
            entry['packet_length'] = int(length_match.group(1))
            return

        if self._PAYLOAD_RE.match(line):
            hex_payload = self._HEX_BYTE_RE.findall(line.split(':')[-1])
            if hex_payload:
                entry['payload_sum'] = sum(int(byte, 16) for byte in hex_payload)
                entry['payload_len'] = len(hex_payload)
            return

        if self._PROTOCOL_RE.match(line):
            proto_match = self._LAST_WORD_RE.search(line)
            if proto_match:
                entry['protocol'] = proto_match.group().lower()
            return

        address_match = self._ADDRESS_PAIR_RE.match(line)
        if address_match:
            for side, token in zip(('src', 'dst'), address_match.groups()):
                host, port = self._split_endpoint(token)
                entry[f'{side}_ip'] = host
                if port is not None:
                    entry[f'{side}_port'] = port

    @staticmethod
    def _split_endpoint(token):
        """Split ``host[:port]`` into ``(host, port_or_None)``.

        Handles IPv4 ("10.0.0.1:80"), bracketed IPv6 ("[fe80::1]:443") and
        bare IPv6. A bare token that is itself a valid IPv6 address is kept
        whole with no port, because "addr:port" is ambiguous without brackets.
        """
        token = token.strip()
        if token.startswith('['):
            host, _, rest = token[1:].partition(']')
            port = rest[1:] if rest.startswith(':') else ''
            return host, (port or None)
        if token.count(':') <= 1:
            host, _, port = token.partition(':')
            return host, (port or None)
        try:
            ipaddress.ip_address(token)
            return token, None
        except ValueError:
            host, _, port = token.rpartition(':')
            return host, (port or None)

    # ----------------------------------------------------------------- features

    def _analyze_features(self, df):
        """Derive the feature schema from ``df`` and add feature columns in place.

        * numeric columns (or columns that convert cleanly) are used directly;
        * ``src_ip`` / ``dst_ip`` get a numeric ``*_num`` companion;
        * other columns with fewer than ``_MAX_CATEGORIES`` distinct values are
          one-hot encoded into ``<column>_<value>`` columns (the source column
          is kept so it still appears in the saved results);
        * missing numeric values are replaced by 0.

        Sets ``feature_columns``, ``categorical_columns`` and
        ``encoded_columns``. Returns None.
        """
        numeric_features = []
        categorical_columns = []

        for col in list(df.columns):
            if pd.api.types.is_numeric_dtype(df[col]):
                numeric_features.append(col)
            elif col not in self._IDENTIFIER_COLUMNS:
                try:
                    df[col] = pd.to_numeric(df[col])
                    numeric_features.append(col)
                except (ValueError, TypeError):
                    if df[col].nunique() < self._MAX_CATEGORIES:
                        categorical_columns.append(col)

        for col in self._IP_COLUMNS:
            if col in df.columns:
                num_col = f'{col}_num'
                # float64 keeps the column numeric even for 128-bit IPv6 values.
                df[num_col] = np.array(
                    [float(self._ip_to_num(value)) for value in df[col]],
                    dtype='float64',
                )
                if num_col not in numeric_features:
                    numeric_features.append(num_col)

        for col in numeric_features:
            df[col] = df[col].fillna(0)

        encoded_columns = []
        for col in categorical_columns:
            dummies = pd.get_dummies(df[col], prefix=col, dtype=float)
            for name in dummies.columns:
                df[name] = dummies[name]
                encoded_columns.append(name)

        self.categorical_columns = categorical_columns
        self.encoded_columns = encoded_columns
        self.feature_columns = numeric_features + encoded_columns
        print(f"Automatically selected features: {self.feature_columns}")

    def _prepare_features(self, df):
        """Project ``df`` (in place) onto the already fitted feature schema.

        Unlike ``_analyze_features`` this never changes ``feature_columns``:
        columns absent from the new data are filled with 0, unparseable values
        become 0 and categories unseen during training are ignored.
        """
        for col in self._IP_COLUMNS:
            num_col = f'{col}_num'
            if col in df.columns and num_col in self.feature_columns:
                df[num_col] = np.array(
                    [float(self._ip_to_num(value)) for value in df[col]],
                    dtype='float64',
                )

        known_encoded = set(self.encoded_columns)
        for col in self.categorical_columns:
            if col in df.columns:
                dummies = pd.get_dummies(df[col], prefix=col, dtype=float)
                for name in dummies.columns:
                    if name in known_encoded:
                        df[name] = dummies[name]

        for col in self.feature_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
            else:
                df[col] = 0.0

    def _ip_to_num(self, ip):
        """Return the integer value of an IPv4/IPv6 address, or 0 if invalid."""
        if not isinstance(ip, str):
            # Covers NaN / None from records without an address line.
            return 0
        try:
            return int(ipaddress.ip_address(ip.strip()))
        except ValueError:
            return 0

    # ------------------------------------------------------------------- models

    def _build_models(self, input_dim):
        """Return a name -> uncompiled Keras model mapping of all variants."""
        models = {
            'AutoEncoder': self._build_autoencoder(input_dim),
            'Sparse AutoEncoder': self._build_sparse_autoencoder(input_dim),
            'Variational AutoEncoder': self._build_variational_autoencoder(input_dim),
            'Stacked AutoEncoder': self._build_stacked_autoencoder(input_dim)
        }
        return models

    def _build_autoencoder(self, input_dim):
        """Single hidden layer (32 units) autoencoder with sigmoid output."""
        input_layer = Input(shape=(input_dim,))
        encoded = Dense(32, activation='relu')(input_layer)
        decoded = Dense(input_dim, activation='sigmoid')(encoded)
        return Model(input_layer, decoded)

    def _build_sparse_autoencoder(self, input_dim):
        """Like the plain autoencoder, with an L1 activity penalty on the code."""
        input_layer = Input(shape=(input_dim,))
        encoded = Dense(32, activation='relu',
                       activity_regularizer=regularizers.l1(10e-5))(input_layer)
        decoded = Dense(input_dim, activation='sigmoid')(encoded)
        return Model(input_layer, decoded)

    def _build_variational_autoencoder(self, input_dim):
        """Autoencoder with a sampled 16-dimensional latent layer.

        NOTE(review): no KL-divergence term is added to the loss (``train``
        compiles with plain MSE), and the sampling layer stays active at
        predict time, so reconstructions from this model are stochastic.
        """
        latent_dim = 16
        input_layer = Input(shape=(input_dim,))
        h = Dense(32, activation='relu')(input_layer)
        z_mean = Dense(latent_dim)(h)
        z_log_var = Dense(latent_dim)(h)

        def sampling(args):
            z_mean, z_log_var = args
            epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim))
            return z_mean + K.exp(z_log_var / 2) * epsilon

        z = Lambda(sampling)([z_mean, z_log_var])
        decoder_h = Dense(32, activation='relu')
        decoder_mean = Dense(input_dim, activation='sigmoid')
        h_decoded = decoder_h(z)
        x_decoded_mean = decoder_mean(h_decoded)
        return Model(input_layer, x_decoded_mean)

    def _build_stacked_autoencoder(self, input_dim):
        """Symmetric 64-32-64 autoencoder with sigmoid output."""
        input_layer = Input(shape=(input_dim,))
        encoded = Dense(64, activation='relu')(input_layer)
        encoded = Dense(32, activation='relu')(encoded)
        decoded = Dense(64, activation='relu')(encoded)
        decoded = Dense(input_dim, activation='sigmoid')(decoded)
        return Model(input_layer, decoded)

    # ------------------------------------------------------------------ scoring

    @staticmethod
    def _anomaly_scores(reconstructions, original):
        """Mean absolute reconstruction error per row."""
        difference = (np.asarray(original, dtype='float64')
                      - np.asarray(reconstructions, dtype='float64'))
        return np.mean(np.abs(difference), axis=1)

    def detect_anomalies(self, reconstructions, original, threshold=None):
        """Flag rows whose reconstruction error exceeds the threshold.

        Args:
            reconstructions: model output, shape (n_rows, n_features).
            original: model input, same shape.
            threshold: score cut-off. When None, the ``threshold_percentile``
                of *this batch's* scores is used (the original behaviour);
                pass the threshold learned in ``train`` to score new data
                against the training distribution.

        Returns:
            Boolean numpy array with one flag per row (empty for empty input).
        """
        anomaly_scores = self._anomaly_scores(reconstructions, original)
        if anomaly_scores.size == 0:
            return np.zeros(0, dtype=bool)
        if threshold is None:
            threshold = np.percentile(anomaly_scores, self.threshold_percentile)
        return anomaly_scores > threshold

    # ----------------------------------------------------------------- training

    def train(self, file_paths, epochs=50, batch_size=32):
        """Fit all models on the given logs and score the training records.

        Args:
            file_paths: log files to learn from.
            epochs: training epochs per model.
            batch_size: mini-batch size per model.

        Returns:
            The parsed DataFrame with added ``Anomaly`` and ``Anomaly_Score``.

        Raises:
            ValueError: if no usable feature could be derived from the logs.
        """
        print("Parsing log files...")
        df = self.parse_logs(file_paths)

        if not self.feature_columns:
            raise ValueError("No features detected in the log files")

        X = df[self.feature_columns].to_numpy(dtype='float64')
        print("Normalizing data...")
        X_scaled = self.scaler.fit_transform(X)

        print("\nBuilding models...")
        self.models = self._build_models(X_scaled.shape[1])

        results = {}
        print("\nTraining models...")

        for name, model in self.models.items():
            model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
            start_time = time.time()
            model.fit(X_scaled, X_scaled,
                     epochs=epochs,
                     batch_size=batch_size,
                     shuffle=True,
                     validation_split=0.2,
                     verbose=0)
            training_time = time.time() - start_time

            reconstructions = model.predict(X_scaled)
            mse = mean_squared_error(X_scaled, reconstructions)
            # "Accuracy" is only a reconstruction-quality proxy (1 - MSE on
            # min-max scaled data), not a classification accuracy.
            accuracy = 1 - mse
            results[name] = {
                'Accuracy': accuracy,
                'MSE': mse,
                'Training Time': training_time
            }
            print(f"{name}: Accuracy={accuracy:.4f}, MSE={mse:.6f}, Training Time={training_time:.2f}s")

        self.results = results
        self.best_model_name = max(results, key=lambda k: results[k]['Accuracy'])
        self.best_model = self.models[self.best_model_name]
        print(f"\nBest Model: {self.best_model_name} with Accuracy {results[self.best_model_name]['Accuracy']:.4f}")

        print("Detecting anomalies...")
        reconstructions = self.best_model.predict(X_scaled)
        anomaly_scores = self._anomaly_scores(reconstructions, X_scaled)
        # Remember the cut-off so that new data is judged against the
        # training distribution instead of against itself.
        self.threshold = float(np.percentile(anomaly_scores, self.threshold_percentile))
        df['Anomaly'] = anomaly_scores > self.threshold
        df['Anomaly_Score'] = anomaly_scores

        return df

    def analyze_new_data(self, file_paths):
        """Score new logs with the trained model, scaler, features and threshold.

        Returns:
            The parsed DataFrame with added ``Anomaly`` and ``Anomaly_Score``.

        Raises:
            ValueError: if the detector is untrained, has no features, or the
                files contain no records.
        """
        if self.best_model is None:
            raise ValueError("Model not trained yet. Call train() first.")

        print("Parsing new log files...")
        # fit_features=False: keep the schema the model was trained on.
        df = self.parse_logs(file_paths, fit_features=False)

        if not self.feature_columns:
            raise ValueError("No features detected in the log files")
        if len(df) == 0:
            raise ValueError("No log entries found in the log files")

        X = df[self.feature_columns].to_numpy(dtype='float64')
        X_scaled = self.scaler.transform(X)

        print("Detecting anomalies...")
        reconstructions = self.best_model.predict(X_scaled)
        df['Anomaly'] = self.detect_anomalies(reconstructions, X_scaled,
                                              threshold=self.threshold)
        df['Anomaly_Score'] = self._anomaly_scores(reconstructions, X_scaled)

        return df

    # ------------------------------------------------------------------- output

    def save_results(self, df, output_file='network_anomalies.csv'):
        """Write the results to CSV, omitting derived feature columns.

        Columns ending in ``_num`` and the generated one-hot columns are
        model inputs only and are left out of the file. Returns ``df``.
        """
        derived = set(self.encoded_columns)
        output_columns = [col for col in df.columns
                          if not col.endswith('_num') and col not in derived]
        df[output_columns].to_csv(output_file, index=False)
        print(f"\nResults saved to '{output_file}'")

        if 'Anomaly' in df.columns:
            print("\nSample anomalies detected:")
            print(df.loc[df['Anomaly'].astype(bool), output_columns].head())

        return df

In [8]:
if __name__ == "__main__":
    # Train on the three captured log files, then write every parsed record
    # together with its anomaly flag and score to the default CSV file.
    log_files = [
        "logsys1.txt",
        "logts1.txt",
        "logsnort1.txt",
    ]
    detector = NetworkAnomalyDetector(threshold_percentile=95)
    df = detector.train(log_files)
    detector.save_results(df)

Parsing log files...
Automatically selected features: ['src_port', 'dst_port', 'payload_sum', 'payload_len', 'src_ip_num', 'dst_ip_num']
Normalizing data...

Building models...

Training models...
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
AutoEncoder: Accuracy=0.9997, MSE=0.000320, Training Time=28.10s
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Sparse AutoEncoder: Accuracy=0.9994, MSE=0.000611, Training Time=28.63s
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Variational AutoEncoder: Accuracy=0.9999, MSE=0.000132, Training Time=32.87s
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Stacked AutoEncoder: Accuracy=0.9999, MSE=0.000085, Training Time=33.85s

Best Model: Stacked AutoEncoder with Accuracy 0.9999
Detecting anomalies...
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Results saved to 'network_anomalies.csv'

Sample anomalies dete