In [None]:
%pip install scikit-learn
%pip install matplotlib
%pip install pandas
%pip install tensorflow
%pip install numpy
%pip install ipympl
%pip install ipython
%pip install pyarrow
%pip install dask
%pip install joblib

In [None]:
import json
import multiprocessing as mp
import os
import time
import warnings
from datetime import datetime
from functools import partial
from multiprocessing import Pool
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from tensorflow.keras import layers, regularizers
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    ReduceLROnPlateau,
    TensorBoard,
)
from tqdm import tqdm
import joblib

# Lift the pandas limit on the number of columns shown when a frame is displayed.
pd.set_option("display.max_columns", None)
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
import logging
import sys

# Log file opened in append mode; the handle is never closed in this cell
# because the objects below keep a reference to it.
nblog = open("nb.log", "a+")
# NOTE(review): relies on the kernel's stdout/stderr objects honouring an
# `echo` attribute (presumably ipykernel's OutStream) — confirm on this kernel.
sys.stdout.echo = nblog
sys.stderr.echo = nblog

# Point the first handler of the IPython application logger at the same file
# and set that logger's level to INFO.
get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

In [None]:
# Labelled log files used for training. Each one is converted to
# "<name>.csv" by the conversion cell and then loaded with pandas.
TRAIN_FILES = [
    "conn.log.labeled",
    "conn2.log.labeled",
    "conn3.log.labeled",
    "conn4.log.labeled",
    "conn5.log.labeled",
]

# Column names passed as `names=` when the converted (header-less) CSVs are read.
columns = [
    "ts",
    "uid",
    "id.orig_h",
    "id.orig_p",
    "id.resp_h",
    "id.resp_p",
    "proto",
    "service",
    "duration",
    "orig_bytes",
    "resp_bytes",
    "conn_state",
    "local_orig",
    "local_resp",
    "missed_bytes",
    "history",
    "orig_pkts",
    "orig_ip_bytes",
    "resp_pkts",
    "resp_ip_bytes",
    "tunnel_parents",
    "label",
    "detailed-label",
]

In [None]:
# Write a comma-separated "<name>.csv" next to every training log.
for file in TRAIN_FILES:
    with open(file, "r") as src:
        raw_rows = src.readlines()

    # The first 8 lines and the final line are skipped; in the remaining rows
    # tabs and runs of three spaces both become commas.
    csv_rows = (
        row.replace("\t", ",").replace("   ", ",") for row in raw_rows[8:-1]
    )
    with open(f"{file}.csv", "w") as dst:
        dst.writelines(csv_rows)

In [None]:
# data_train = pd.concat([pd.read_csv(f'{file}.csv', names=columns) for file in TRAIN_FILES], ignore_index=True)


def load_csv_parallel(file, columns):
    """Read ``<file>.csv`` in 1,000,000-row chunks and return one DataFrame.

    Args:
        file: Path prefix of the CSV; ".csv" is appended to it.
        columns: Column names to assign, passed to pandas as ``names=``.

    Returns:
        The concatenation of all chunks with a fresh integer index.
    """
    reader_options = dict(
        names=columns,
        engine="c",
        low_memory=False,
        memory_map=True,
        cache_dates=True,
        chunksize=1_000_000,
    )
    # With `chunksize` set, read_csv yields the file piece by piece.
    chunk_iter = pd.read_csv(f"{file}.csv", **reader_options)
    return pd.concat(chunk_iter, ignore_index=True, copy=False)


# Size the pool from the work and the machine instead of assuming a 64-core
# host: one worker per file is the most that can be used, and two CPUs are
# left free for the system when the machine has them to spare.
available_cores = max(1, (os.cpu_count() or 1) - 2)
num_cores = max(1, min(len(TRAIN_FILES), available_cores))

# Load every converted CSV in its own worker process.
with mp.Pool(num_cores) as pool:
    dataframes = pool.starmap(
        load_csv_parallel, [(file, columns) for file in TRAIN_FILES]
    )

# Combine all files' dataframes
data_train = pd.concat(dataframes, ignore_index=True, copy=False)

# The per-file frames are no longer needed once concatenated.
del dataframes

In [None]:
# Display the first rows of the combined training frame.
data_train.head()

In [None]:
# Summary statistics rendered as a styled table: blue gradient background
# and the "Segoe UI" font family.
data_train.describe().style.background_gradient(cmap="Blues").set_properties(
    **{"font-family": "Segoe UI"}
)

In [None]:
def pie_plot(df, cols_list, rows, cols):
    """Draw one pie chart of value frequencies per column on a subplot grid.

    Args:
        df: DataFrame holding the columns to chart.
        cols_list: Column names to plot. They are paired with grid cells in
            flattened order; ``zip`` stops at the shorter of the two, so
            surplus names or surplus cells are left out.
        rows: Number of subplot rows.
        cols: Number of subplot columns.
    """
    # squeeze=False always yields a 2-D array of Axes, so a 1x1 grid works too
    # (otherwise plt.subplots returns a bare Axes, which has no .ravel()).
    # The figure size is set once here rather than on every plot call.
    _, axes = plt.subplots(rows, cols, figsize=(15, 15), squeeze=False)
    for ax, col in zip(axes.ravel(), cols_list):
        df[col].value_counts().plot(
            ax=ax, kind="pie", fontsize=10, autopct="%1.0f%%"
        )
        ax.set_title(str(col), fontsize=12)
    plt.show()


# Value distribution of the `label` and `proto` columns, side by side (1 row, 2 columns).
pie_plot(data_train, ["label", "proto"], 1, 2)

In [None]:
def Scaling(df_num, cols):
    """Fit a RobustScaler on ``df_num`` and return the scaled values as a frame.

    Args:
        df_num: Numeric DataFrame to scale.
        cols: Column labels for the returned frame.

    Returns:
        DataFrame of scaled values labelled with ``cols`` and carrying the
        index of ``df_num``.
    """
    print(f"Starting RobustScaler on {len(cols)} columns...")
    started = time.time()

    # Fit and apply in two explicit steps; the scaler works on a copy.
    robust = RobustScaler(copy=True)
    robust.fit(df_num)
    result = pd.DataFrame(
        robust.transform(df_num), columns=cols, index=df_num.index
    )

    print(f"Scaling completed in {time.time() - started:.2f}s")
    return result

def preprocess(dataframe):
    """Turn a raw connection-log frame into a numeric, model-ready frame.

    Steps, in order: drop identifier columns, treat "-" as missing, coerce
    the non-categorical columns to numbers, mean-impute and robust-scale
    them, binarise ``label`` (1 for anything other than "Benign") and one-hot
    encode the categorical columns. Progress and timings are printed.

    The imputer and the scaler are fitted on ``dataframe`` itself on every
    call, so two frames preprocessed separately are scaled with different
    statistics.

    Args:
        dataframe: Frame with a ``label`` column plus the raw log columns.
            The caller's object is not modified.

    Returns:
        A new DataFrame with scaled numeric columns, an int8 ``label`` column
        and dummy columns replacing proto/service/conn_state/history.
    """
    print("\n    Starting preprocessing pipeline...")
    print(f"    Initial dataframe shape: {dataframe.shape}")
    t_start = time.time()

    # Define columns to process
    cat_cols = ["proto", "service", "conn_state", "history"]
    drop_cols = [
        "ts", "uid", "id.orig_h", "id.resp_h", "id.orig_p",
        "id.resp_p", "tunnel_parents", "detailed-label",
    ]

    # 1. Drop unnecessary columns (returns a new frame; the input stays intact)
    print("\n    [1/7] Dropping unnecessary columns...")
    t0 = time.time()
    dataframe = dataframe.drop(columns=drop_cols, errors="ignore")
    print(f"    Columns dropped in {time.time() - t0:.2f}s")
    print(f"    Shape after dropping: {dataframe.shape}")

    # 2. Replace dashes with NaN
    print("\n    [2/7] Replacing dashes with NaN...")
    t0 = time.time()
    dataframe = dataframe.replace("-", np.nan)
    print(f"    Replacement completed in {time.time() - t0:.2f}s")

    # 3. Everything that is neither categorical nor the label is numeric
    print("\n    [3/7] Processing numeric columns...")
    t0 = time.time()
    numeric_cols = dataframe.columns.difference(cat_cols + ["label"])
    print(f"    Found {len(numeric_cols)} numeric columns")

    # Unparseable entries become NaN and are imputed below. (The former
    # "chunked" double loop visited the columns one by one anyway.)
    for col in numeric_cols:
        dataframe[col] = pd.to_numeric(dataframe[col], errors="coerce")

    print(f"    Numeric conversion completed in {time.time() - t0:.2f}s")

    # 4. Handle numeric data
    print("\n    [4/7] Processing numeric data and handling NaN values...")
    t0 = time.time()
    # Explicit copy: the zero-fill below must not be an assignment into a
    # slice of `dataframe`.
    df_num = dataframe[numeric_cols].copy()

    # A column without any observed value has no mean to impute with, and the
    # frame rebuilt below expects one output column per numeric column, so
    # such columns are set to 0 up front.
    all_nan_cols = df_num.columns[df_num.isna().all()]
    if len(all_nan_cols) > 0:
        print(f"    Found {len(all_nan_cols)} columns with all NaN values")
        df_num[all_nan_cols] = 0

    # Impute missing values
    print("Imputing missing values...")
    imputer = SimpleImputer(strategy="mean", copy=False)
    imputed_values = imputer.fit_transform(df_num)
    df_num = pd.DataFrame(imputed_values, columns=numeric_cols, index=dataframe.index)
    print(f"    Numeric processing completed in {time.time() - t0:.2f}s")

    # 5. Scale numeric data
    print("\n    [5/7] Scaling numeric data...")
    scaled_df = Scaling(df_num, df_num.columns)
    dataframe[df_num.columns] = scaled_df.values
    del scaled_df  # Free memory

    # 6. Convert labels: 0 for "Benign", 1 for every other value
    print("\n    [6/7] Converting labels...")
    t0 = time.time()
    dataframe["label"] = (dataframe["label"] != "Benign").astype(np.int8)
    print(f"    Label conversion completed in {time.time() - t0:.2f}s")

    # 7. One-hot encode categorical columns
    print("\n    [7/7] One-hot encoding categorical columns...")
    t0 = time.time()
    dataframe = pd.get_dummies(dataframe, columns=cat_cols, drop_first=True, sparse=False)
    print(f"    One-hot encoding completed in {time.time() - t0:.2f}s")

    # Final statistics
    print(f"\n    Final dataframe shape: {dataframe.shape}")
    print(f"    Memory usage: {dataframe.memory_usage().sum() / 1024**2:.2f} MB")
    print(f"    Total preprocessing time: {time.time() - t_start:.2f}s")

    return dataframe

In [None]:
print("[1/7] Starting preprocessing pipeline...")
print(f"Input shape: {data_train.shape}")
t0 = time.time()

# Preprocess data with progress tracking
print("[2/7] Preprocessing data...")
scaled_train = preprocess(data_train)
print(f"Preprocessing completed in {time.time() - t0:.2f}s")
print(f"Preprocessed shape: {scaled_train.shape}")

# Split features/target and convert to float32
print("[3/7] Converting features to float32...")
t1 = time.time()
# to_numpy(dtype=...) casts column by column; `.values` on a frame that mixes
# float and dummy columns would first build an object array of the whole data.
x = scaled_train.drop(["label"], axis=1, errors="ignore").to_numpy(dtype=np.float32)
print(f"Features conversion completed in {time.time() - t1:.2f}s")
print(f"Features shape: {x.shape}")

print("[4/7] Converting labels to int32...")
t2 = time.time()
y = np.asarray(scaled_train["label"].values, dtype=np.int32)
print(f"Labels conversion completed in {time.time() - t2:.2f}s")
print(f"Labels shape: {y.shape}")

# Free memory
print("[5/7] Clearing unused data to free memory...")
# Keep an empty frame with the training column layout: the inference cell
# aligns new data to `scaled_train`'s columns, so deleting the name outright
# makes that cell fail with NameError. The copy detaches it from the row data.
scaled_train = scaled_train.iloc[:0].copy()
del data_train

# PCA with progress info
print("[6/7] Performing PCA reduction to 20 components...")
t3 = time.time()
pca = PCA(n_components=20, random_state=42)
x_reduced = pca.fit_transform(x)
print(f"PCA completed in {time.time() - t3:.2f}s")
print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")
print(f"Reduced features shape: {x_reduced.shape}")

# Free more memory
del x

# Train test split
print("[7/7] Performing train-test split...")
t4 = time.time()
x_train, x_test, y_train, y_test = train_test_split(
    x_reduced,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,  # Explicit shuffle
    stratify=y,  # Maintain label distribution
)
print(f"Split completed in {time.time() - t4:.2f}s")

# Print final shapes
print("\nFinal shapes:")
print(f"x_train: {x_train.shape}")
print(f"x_test: {x_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

print(f"\nTotal pipeline time: {time.time() - t0:.2f}s")

# Memory cleanup
del x_reduced

# Save the fitted PCA so the same projection can be reloaded later
print("\nSaving PCA model...")
joblib.dump(pca, "pca_model.joblib")

In [None]:
# TensorBoard run directory, named after the time this cell was executed.
log_dir = f"logs/fit/{datetime.now():%Y%m%d-%H%M%S}"


def create_packet_classifier(input_shape, num_classes=1):
    """Build the (uncompiled) dense classifier used for packet classification.

    The network is: input -> batch norm -> three Dense/BatchNorm/Dropout
    blocks of 32, 64 and 256 ReLU units -> sigmoid output layer. Every hidden
    Dense layer uses the same kernel, bias and activity regularizers.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        num_classes: Number of units in the sigmoid output layer
            (default 1 for binary classification).

    Returns:
        A ``tf.keras.Sequential`` model.
    """
    kernel_reg = regularizers.L1L2(l1=1e-6, l2=1e-5)
    bias_reg = regularizers.L2(1e-5)
    activity_reg = regularizers.L2(1e-6)

    # (units, dropout rate) of each hidden block, in stacking order.
    hidden_blocks = [(32, 0.3), (64, 0.3), (256, 0.4)]

    stack = [
        layers.InputLayer(input_shape=input_shape),
        layers.BatchNormalization(),
    ]
    for units, drop_rate in hidden_blocks:
        stack.append(
            layers.Dense(
                units,
                activation="relu",
                kernel_regularizer=kernel_reg,
                bias_regularizer=bias_reg,
                activity_regularizer=activity_reg,
            )
        )
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(drop_rate))

    # Output layer
    stack.append(layers.Dense(num_classes, activation="sigmoid"))

    return tf.keras.Sequential(stack)


def train_model(model, x_train, y_train, x_val, y_val, batch_size=32, max_epochs=50):
    """Compile ``model`` and fit it, returning the Keras training history.

    The model is compiled with Adam (learning rate 1e-3, gradient norm clipped
    at 1.0) and binary cross-entropy on probabilities, tracking accuracy, AUC,
    precision and recall. Training stops early and halves the learning rate
    based on ``val_loss``, saves the model with the best ``val_accuracy`` to
    "best_packet_classifier.keras" and writes TensorBoard logs to the
    module-level ``log_dir``.

    Args:
        model: Keras model to train; it is compiled in place.
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels.
        batch_size: Mini-batch size passed to ``fit``.
        max_epochs: Upper bound on the number of epochs.

    Returns:
        The object returned by ``model.fit``.
    """
    model.compile(
        # clipnorm caps the gradient norm to guard against exploding gradients.
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1.0),
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=[
            "accuracy",
            tf.keras.metrics.AUC(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
        ],
    )

    # Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
    stop_early = EarlyStopping(
        monitor="val_loss", patience=5, restore_best_weights=True, verbose=1
    )
    # Halve the learning rate after 3 stagnant epochs, never going below 1e-6.
    shrink_lr = ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6, verbose=1
    )
    # Keep only the model with the best validation accuracy on disk.
    keep_best = ModelCheckpoint(
        "best_packet_classifier.keras",
        monitor="val_accuracy",
        save_best_only=True,
        verbose=1,
    )
    board = TensorBoard(log_dir=log_dir, histogram_freq=1)

    return model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        batch_size=batch_size,
        epochs=max_epochs,
        callbacks=[stop_early, shrink_lr, keep_best, board],
        verbose=1,
    )


# Example usage:
def preprocess_data(columns, data):
    """Standardise the numeric columns of ``data`` and one-hot encode the rest.

    Args:
        columns: Candidate column names. Those that are neither categorical
            nor one of the two label columns are treated as numeric.
        data: DataFrame containing the numeric columns selected from
            ``columns`` and all five categorical columns. Left unmodified.

    Returns:
        A new DataFrame with standardised numeric columns and the categorical
        columns replaced by dummy columns prefixed with the column name.
    """
    # Imported here because the notebook's import cell only brings in
    # RobustScaler; without this the function fails with NameError.
    from sklearn.preprocessing import StandardScaler

    categorical_columns = [
        "proto",
        "service",
        "conn_state",
        "history",
        "tunnel_parents",
    ]
    excluded = set(categorical_columns) | {"label", "detailed-label"}
    numerical_columns = [col for col in columns if col not in excluded]

    # Work on a copy so the caller's frame is not rescaled in place.
    data = data.copy()
    data[numerical_columns] = StandardScaler().fit_transform(data[numerical_columns])

    # One call encodes every categorical column; the default prefix is the
    # column name, and the dummy groups are appended in list order.
    return pd.get_dummies(data, columns=categorical_columns)

In [None]:
# Shape of one sample: every axis of x_train except the first (batch) axis.
input_shape = x_train.shape[1:]
print("Shaped")
model = create_packet_classifier(input_shape)
print("model")
# NOTE(review): the test split is passed as the validation set, so early
# stopping and checkpoint selection see the test data.
history = train_model(model, x_train, y_train, x_test, y_test)

In [None]:
# Convert the evaluation log to a comma-separated file, the same way the
# training logs were converted.
with open("test3.log.labeled", "r") as f:
    lines = f.readlines()

# The first 8 lines and the final line are skipped; tabs and runs of three
# spaces both become commas.
converted = (
    row.replace("\t", ",").replace("   ", ",") for row in lines[8:-1]
)
with open("test3.log.labeled.csv", "w") as ff:
    ff.writelines(converted)

In [None]:
# Column layout of the converted, header-less CSV.
columns = [
    "ts",
    "uid",
    "id.orig_h",
    "id.orig_p",
    "id.resp_h",
    "id.resp_p",
    "proto",
    "service",
    "duration",
    "orig_bytes",
    "resp_bytes",
    "conn_state",
    "local_orig",
    "local_resp",
    "missed_bytes",
    "history",
    "orig_pkts",
    "orig_ip_bytes",
    "resp_pkts",
    "resp_ip_bytes",
    "tunnel_parents",
    "label",
    "detailed-label",
]

# The file has no header row. Passing the names to read_csv keeps the first
# record as data; reading with the default header inference and renaming
# afterwards silently dropped it.
df = pd.read_csv("test3.log.labeled.csv", names=columns)

required_columns = [
    "proto",
    "service",
    "conn_state",
    "history",
    "local_orig",
    "local_resp",
    "orig_bytes",
    "resp_bytes",
    "orig_ip_bytes",
    "resp_ip_bytes",
    "resp_pkts",
]

# Defensive: make sure every feature the model relies on exists.
for col in required_columns:
    if col not in df.columns:
        df[col] = np.nan

# Untouched copy (true labels included) for the ground-truth summary.
df_new = df.copy()

# preprocess() requires a label column. The real labels are masked here; the
# resulting column is discarded by the column alignment below.
df["label"] = "unknown"

# NOTE(review): preprocess() fits a fresh imputer and scaler on this frame, so
# these features are scaled with this file's statistics, not the training ones.
preprocessed_data = preprocess(df)

# Align with the training layout: add dummy columns unseen in this file as 0,
# drop columns the model never saw, and restore the training column order.
train_columns = scaled_train.drop("label", axis=1).columns
missing_cols = set(train_columns) - set(preprocessed_data.columns)
preprocessed_data = preprocessed_data.reindex(columns=train_columns, fill_value=0)

# Convert to numpy array
x_new = preprocessed_data.to_numpy(dtype="float32")

# Transform with PCA
x_new_reduced = pca.transform(x_new)

# Make predictions: probabilities above 0.5 count as malicious (1)
predictions = model.predict(x_new_reduced)
predicted_outcomes = (predictions > 0.5).astype("int")

# Add predictions to original DataFrame (flattened to one value per row)
df["predicted_outcome"] = predicted_outcomes.ravel()

# Statistics of the model's predictions
total_packets = len(predicted_outcomes)
malicious_packets = np.sum(predicted_outcomes)
normal_packets = total_packets - malicious_packets
# Guard against an empty input file.
malicious_percentage = (malicious_packets / total_packets) * 100 if total_packets else 0.0

print("-" * 55)

print("Prediction Summary:")
print(f"Total packets analyzed: {total_packets}")
print(f"Malicious packets detected: {malicious_packets} ({malicious_percentage:.2f}%)")
print(f"Normal packets detected: {normal_packets} ({100-malicious_percentage:.2f}%)")

print("-" * 55)

# Actual summary
print("Summary of actual data")
print("Total packets: ", len(df_new))
print("Unique protocols: ", df_new["proto"].nunique())

# Count classes by label value, with the same rule preprocess() uses
# (anything other than "Benign" is malicious). Indexing value_counts() with
# 0/1 picked the most and second-most frequent label, whichever class that
# happened to be, and failed when only one class was present.
is_malicious = df_new["label"] != "Benign"
total_packets = len(df_new)
malicious_packets = int(is_malicious.sum())
normal_packets = total_packets - malicious_packets
malicious_percentage = (malicious_packets / total_packets) * 100 if total_packets else 0.0

print(" ")
# Print the statistics
print(f"Total packets analyzed: {total_packets}")
print(f"Malicious packets detected: {malicious_packets} ({malicious_percentage:.2f}%)")
print(f"Normal packets detected: {normal_packets} ({100-malicious_percentage:.2f}%)")

print("-" * 55)

pie_plot(df_new, ["label", "proto"], 1, 2)


# Create time-based analysis: bucket every record into its minute
df["timestamp"] = pd.to_datetime(df["ts"], unit="s")
# "min" is the minute frequency alias; the older "T" spelling is deprecated
# in recent pandas releases.
df["minute"] = df["timestamp"].dt.floor("min")

# Per minute: number of records (count) and number predicted malicious (sum of 0/1)
temporal_analysis = (
    df.groupby("minute").agg({"predicted_outcome": ["count", "sum"]}).reset_index()
)
# Flatten the two-level column header produced by the aggregation.
temporal_analysis.columns = ["minute", "total_packets", "malicious_packets"]
temporal_analysis["normal_packets"] = (
    temporal_analysis["total_packets"] - temporal_analysis["malicious_packets"]
)
temporal_analysis["malicious_ratio"] = (
    temporal_analysis["malicious_packets"] / temporal_analysis["total_packets"]
)

# Visualize results: two stacked panels sharing one 15x10 figure
plt.style.use("default")
fig, (ax_counts, ax_ratio) = plt.subplots(2, 1, figsize=(15, 10))
minutes = temporal_analysis["minute"]

# Panel 1: per-minute record counts for each predicted class
count_series = [
    ("normal_packets", "Normal", "#2ecc71"),
    ("malicious_packets", "Malicious", "#e74c3c"),
]
for series_name, legend_label, colour in count_series:
    ax_counts.plot(
        minutes,
        temporal_analysis[series_name],
        label=legend_label,
        color=colour,
        alpha=0.7,
        linewidth=2,
    )
ax_counts.set_title(
    "Packet Classification Over Time", pad=20, fontsize=12, fontweight="bold"
)
ax_counts.set_xlabel("Time", fontsize=10)
ax_counts.set_ylabel("Number of Packets", fontsize=10)
ax_counts.legend(frameon=True, fancybox=True, shadow=True)
ax_counts.grid(True, alpha=0.3, linestyle="--")

# Panel 2: share of records predicted malicious per minute
ax_ratio.plot(
    minutes,
    temporal_analysis["malicious_ratio"],
    color="#9b59b6",
    alpha=0.7,
    linewidth=2,
)
ax_ratio.set_title(
    "Ratio of Malicious Packets Over Time", pad=20, fontsize=12, fontweight="bold"
)
ax_ratio.set_xlabel("Time", fontsize=10)
ax_ratio.set_ylabel("Ratio of Malicious Packets", fontsize=10)
ax_ratio.grid(True, alpha=0.3, linestyle="--")

fig.tight_layout()
plt.show()

# Persist the per-record predictions and the per-minute aggregation
os.makedirs("data", exist_ok=True)
df.to_csv("data/predicted_new_data.csv", index=False)
temporal_analysis.to_csv("data/temporal_analysis.csv", index=False)

# Summary statistics, using whatever the summary variables hold at this point;
# values are cast to plain Python numbers so they serialise to JSON.
summary_stats = dict(
    total_packets=total_packets,
    malicious_packets=int(malicious_packets),
    normal_packets=int(normal_packets),
    malicious_percentage=float(malicious_percentage),
    analysis_timestamp=pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
    available_features=df.columns.tolist(),
)

with open("data/verification_results.json", "w") as f:
    f.write(json.dumps(summary_stats, indent=4))

print("\nResults saved to 'data' directory")