In [None]:
%pip install scikit-learn
%pip install matplotlib
%pip install pandas
%pip install tensorflow
%pip install numpy
%pip install ipympl
%pip install ipython
%pip install pyarrow
%pip install dask
%pip install joblib

In [None]:
%pip install seaborn

In [None]:
import json
import multiprocessing as mp
import os
import time
import warnings
from datetime import datetime
from functools import partial
from multiprocessing import Pool
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from tensorflow.keras import layers, regularizers
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    ReduceLROnPlateau,
    TensorBoard,
)
from tqdm import tqdm

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option("display.max_columns", None)
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# chained-assignment warnings that would otherwise flag real problems.
warnings.filterwarnings("ignore")

In [None]:
import logging
import sys

# Append-mode log file that notebook output is mirrored into.  The handle is
# kept open on purpose: it is referenced by the streams below for the whole session.
nblog = open("nb.log", "a+")
# NOTE(review): presumably relies on the kernel's stdout/stderr stream objects
# honouring an `echo` attribute to duplicate writes — confirm for the
# installed ipykernel version; on a plain file object this is just a new attribute.
sys.stdout.echo = nblog
sys.stderr.echo = nblog

# Redirect the first handler of the IPython application logger to the same
# file and lower its threshold to INFO.
get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

In [None]:
# Training files
# Labelled connection logs used for training.  Each entry is first converted
# to "{name}.csv" and then loaded from that CSV.
TRAIN_FILES = [
    "conn.log.labeled",
    "conn2.log.labeled",
    "conn3.log.labeled",
    # "conn4.log.labeled",
    "conn5.log.labeled",
]

# Column names applied, in order, to the header-less CSV files.
# NOTE(review): presumably the Zeek conn.log schema followed by the two
# label columns — confirm against the "#fields" line of the source logs.
columns = [
    "ts",
    "uid",
    "id.orig_h",
    "id.orig_p",
    "id.resp_h",
    "id.resp_p",
    "proto",
    "service",
    "duration",
    "orig_bytes",
    "resp_bytes",
    "conn_state",
    "local_orig",
    "local_resp",
    "missed_bytes",
    "history",
    "orig_pkts",
    "orig_ip_bytes",
    "resp_pkts",
    "resp_ip_bytes",
    "tunnel_parents",
    "label",
    "detailed-label",
]

In [None]:
# Convert every training log into a header-less CSV written next to it.
for file in TRAIN_FILES:
    with open(file, "r") as f:
        lines = f.readlines()

    # The first 8 lines and the final line are left out of the CSV; tabs and
    # three-space runs inside the remaining lines become commas.
    with open(f"{file}.csv", "w") as ff:
        ff.writelines(
            row.replace("\t", ",").replace("   ", ",") for row in lines[8:-1]
        )

In [None]:
def load_csv_parallel(file, columns):
    """Load ``{file}.csv`` chunk by chunk and return it as a single DataFrame.

    Args:
        file: Path of the source log without the ``.csv`` suffix; the suffix
            is appended here.
        columns: Column names to assign, since the CSV has no header row.

    Returns:
        One DataFrame holding every row of the file with a fresh 0..n-1 index.
    """
    csv_path = f"{file}.csv"
    reader_options = {
        "names": columns,
        "engine": "c",
        "low_memory": False,
        "memory_map": True,
        "cache_dates": True,
        # Read one million rows at a time instead of the whole file at once.
        "chunksize": 1_000_000,
    }
    chunk_reader = pd.read_csv(csv_path, **reader_options)
    combined = pd.concat(chunk_reader, ignore_index=True, copy=False)
    return combined

# Upper bound on the number of loader processes.
num_cores = 62

# There is exactly one task per file, so starting more workers than files
# only costs process start-up time and memory.  Cap the pool accordingly
# (and keep it at least 1, which is the minimum Pool accepts).
worker_count = max(1, min(num_cores, len(TRAIN_FILES)))

with mp.Pool(worker_count) as pool:
    dataframes = pool.starmap(
        load_csv_parallel, [(file, columns) for file in TRAIN_FILES]
    )

# Stack the per-file frames into one training frame with a fresh index.
data_train = pd.concat(dataframes, ignore_index=True, copy=False)

# Drop the list so the per-file frames are not kept referenced by it.
del dataframes

In [None]:
# Preview the first rows of the combined training frame.
data_train.head()

In [None]:
# Summary statistics rendered as a styled table: blue gradient per cell,
# "Segoe UI" as the font family.
data_train.describe().style.background_gradient(cmap="Blues").set_properties(
    **{"font-family": "Segoe UI"}
)

In [None]:
def pie_plot(df, cols_list, rows, cols):
    """Draw one value-count pie chart per column on a ``rows`` x ``cols`` grid.

    Args:
        df: DataFrame holding the columns to chart.
        cols_list: Column names to chart; paired with the grid cells in
            row-major order, extra columns or extra cells are ignored.
        rows: Number of subplot rows.
        cols: Number of subplot columns.
    """
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, where
    # plt.subplots would otherwise hand back a bare Axes without .ravel().
    # The figure size is set once here rather than on every individual pie.
    _fig, axes = plt.subplots(rows, cols, figsize=(15, 15), squeeze=False)
    for ax, col in zip(axes.ravel(), cols_list):
        df[col].value_counts().plot(
            ax=ax, kind="pie", fontsize=10, autopct="%1.0f%%"
        )
        ax.set_title(str(col), fontsize=12)
    plt.show()


pie_plot(data_train, ["detailed-label", "proto"], 1, 2)

In [None]:
def _distribution_counts(df, col):
    """Return value counts of ``df[col]``; "-" is reported as "Benign" for detailed-label."""
    series = df[col]
    if col == "detailed-label":
        # Relabel on the single column only; the rest of the frame is not copied.
        series = series.replace("-", "Benign")
    return series.value_counts()


def improved_distribution_plot(df, cols_list, rows, cols):
    """Plot labelled pie charts for the given columns and print their distributions.

    For ``"detailed-label"`` the "-" placeholder is shown as "Benign" and only
    categories holding at least 1% of the rows are drawn (the printed table
    still lists every category).  Other columns are drawn in full.

    Args:
        df: DataFrame holding the columns to chart; it is not modified.
        cols_list: Column names to chart, paired with grid cells in row-major order.
        rows: Number of subplot rows.
        cols: Number of subplot columns.
    """
    # Count once per column and reuse the result for both the chart and the table.
    counts_by_col = {col: _distribution_counts(df, col) for col in cols_list}

    # Honour the requested grid; squeeze=False keeps `axes` 2-D for any shape.
    _fig, axes = plt.subplots(rows, cols, figsize=(15, 15), squeeze=False)

    for ax, col in zip(axes.ravel(), cols_list):
        counts = counts_by_col[col]

        if col == "detailed-label":
            # Slices under 1% are left out of the pie, so the percentages
            # drawn are relative to the categories that remain.
            shown = counts[counts / counts.sum() >= 0.01]
            colors = plt.cm.Set3(np.linspace(0, 1, len(shown)))
            extra_pie_args = {"pctdistance": 0.85}
            legend_title = "Attack Types"
            legend_anchor = (1, 0.7)
        else:
            shown = counts
            colors = ["lightblue", "lightcoral"]
            extra_pie_args = {}
            legend_title = col.capitalize()
            legend_anchor = (1, 0.5)

        wedges, texts, autotexts = ax.pie(
            shown,
            explode=[0.05] * len(shown),
            labels=shown.index,
            colors=colors,
            autopct="%1.1f%%",
            **extra_pie_args,
        )

        ax.legend(
            wedges,
            shown.index,
            title=legend_title,
            loc="center left",
            bbox_to_anchor=legend_anchor,
        )

        plt.setp(autotexts, size=9, weight="bold")
        plt.setp(texts, size=10)

        ax.set_title(f"{col.capitalize()} Distribution", fontsize=12, pad=20)

    plt.tight_layout()
    plt.show()

    # Text table: every category with its count and share of all rows.
    for col in cols_list:
        print(f"\n{col.capitalize()} Distribution:")
        print("-" * 30)
        for idx, value in counts_by_col[col].items():
            print(f"{idx}: {value:,} ({value/len(df)*100:.2f}%)")


improved_distribution_plot(data_train, ["detailed-label", "proto"], 2, 1)

In [None]:
def Scaling(df_num, cols):
    """Fit a RobustScaler on ``df_num`` and return the scaled values as a DataFrame.

    Args:
        df_num: Numeric DataFrame to scale; its index is reused for the result.
        cols: Column labels for the returned frame.

    Returns:
        DataFrame of scaled values labelled with ``cols`` and ``df_num.index``.
    """
    print(f"Starting RobustScaler on {len(cols)} columns...")
    started = time.time()

    robust = RobustScaler(copy=True)
    result = pd.DataFrame(
        robust.fit_transform(df_num),
        columns=cols,
        index=df_num.index,
    )

    print(f"Scaling completed in {time.time() - started:.2f}s")
    return result

def preprocess(dataframe):
    """Clean, impute, scale and one-hot encode a raw connection-log frame.

    The pipeline drops identifier columns, turns "-" placeholders into NaN,
    coerces the non-categorical columns to numbers, mean-imputes and
    robust-scales them, converts ``label`` to 0/1 (anything other than
    "Benign" becomes 1) and one-hot encodes the categorical columns.

    Args:
        dataframe: Raw frame using the log column names.  It is not modified;
            all work happens on the copy returned by ``drop``.

    Returns:
        The processed DataFrame: scaled numeric columns, ``label`` (when
        present in the input) as int8, and one-hot columns.

    Note:
        The imputer, the scaler and the set of one-hot columns are all
        derived from the frame passed in.  Two frames processed separately
        can therefore end up with different column sets and scales.
    """
    print("\n    Starting preprocessing pipeline...")
    print(f"    Initial dataframe shape: {dataframe.shape}")
    t_start = time.time()

    cat_cols = ["proto", "service", "conn_state", "history"]
    drop_cols = [
        "ts", "uid", "id.orig_h", "id.resp_h", "id.orig_p",
        "id.resp_p", "tunnel_parents", "detailed-label"
    ]

    print("\n    [1/7] Dropping unnecessary columns...")
    t0 = time.time()
    # drop() returns a new frame, so the in-place edits below never touch
    # the caller's DataFrame.
    dataframe = dataframe.drop(columns=drop_cols, errors="ignore")
    print(f"    Columns dropped in {time.time() - t0:.2f}s")
    print(f"    Shape after dropping: {dataframe.shape}")

    print("\n    [2/7] Replacing dashes with NaN...")
    t0 = time.time()
    dataframe.replace("-", np.nan, inplace=True)
    print(f"    Replacement completed in {time.time() - t0:.2f}s")

    print("\n    [3/7] Processing numeric columns...")
    t0 = time.time()
    # Everything that is neither categorical nor the label is treated as numeric.
    numeric_cols = dataframe.columns.difference(cat_cols + ["label"])
    print(f"    Found {len(numeric_cols)} numeric columns")

    # Unparseable values become NaN and are imputed in the next step.
    for col in numeric_cols:
        dataframe[col] = pd.to_numeric(dataframe[col], errors="coerce")

    print(f"    Numeric conversion completed in {time.time() - t0:.2f}s")

    print("\n    [4/7] Processing numeric data and handling NaN values...")
    t0 = time.time()

    # Columns that are entirely NaN have no mean to impute from, so fill them
    # with 0 first.  Assigning on `dataframe` itself (rather than on a column
    # slice of it) avoids chained assignment on a temporary selection.
    all_nan_mask = dataframe[numeric_cols].isna().all().to_numpy()
    all_nan_cols = numeric_cols[all_nan_mask]
    if len(all_nan_cols) > 0:
        print(f"    Found {len(all_nan_cols)} columns with all NaN values")
        dataframe[all_nan_cols] = 0

    df_num = dataframe[numeric_cols]

    print("Imputing missing values...")
    imputer = SimpleImputer(strategy="mean", copy=False)
    imputed_values = imputer.fit_transform(df_num)
    df_num = pd.DataFrame(imputed_values, columns=numeric_cols, index=dataframe.index)
    print(f"    Numeric processing completed in {time.time() - t0:.2f}s")

    print("\n    [5/7] Scaling numeric data...")
    scaled_df = Scaling(df_num, df_num.columns)
    dataframe[df_num.columns] = scaled_df.values
    del scaled_df  # Free memory

    print("\n    [6/7] Converting labels...")
    t0 = time.time()
    # Unlabelled input (no "label" column) is passed through instead of
    # failing with a KeyError.
    if "label" in dataframe.columns:
        # Vectorised comparison: 0 for "Benign", 1 for everything else
        # (including NaN produced by the dash replacement above).
        dataframe["label"] = (dataframe["label"] != "Benign").astype(np.int8)
    print(f"    Label conversion completed in {time.time() - t0:.2f}s")

    print("\n    [7/7] One-hot encoding categorical columns...")
    t0 = time.time()
    dataframe = pd.get_dummies(dataframe, columns=cat_cols, drop_first=True, sparse=False)
    print(f"    One-hot encoding completed in {time.time() - t0:.2f}s")

    print(f"\n    Final dataframe shape: {dataframe.shape}")
    print(f"    Memory usage: {dataframe.memory_usage().sum() / 1024**2:.2f} MB")
    print(f"    Total preprocessing time: {time.time() - t_start:.2f}s")

    return dataframe

In [None]:
# StandardScaler is used for the PCA input below but is not among the
# notebook's top-level imports (only RobustScaler is), so import it here.
from sklearn.preprocessing import StandardScaler

print(f"[1/7] Starting preprocessing pipeline...")
print(f"Input shape: {data_train.shape}")
t0 = time.time()

print("[2/7] Preprocessing data...")
scaled_train = preprocess(data_train)
print(f"Preprocessing completed in {time.time() - t0:.2f}s")
print(f"Preprocessed shape: {scaled_train.shape}")

print("[3/7] Converting features to float32...")
t1 = time.time()
features_frame = scaled_train.drop(["label"], axis=1, errors="ignore")
# Remember the column order so inference code can line its own one-hot
# columns up with the layout the scaler and PCA were fitted on.
feature_columns = features_frame.columns.tolist()
# Convert straight to float32; going through `.values` on mixed bool/float
# columns would first build an object-dtype array.
x = features_frame.to_numpy(dtype=np.float32)
del features_frame
print(f"Features conversion completed in {time.time() - t1:.2f}s")
print(f"Features shape: {x.shape}")

print("[4/7] Converting labels to int32...")
t2 = time.time()
y = np.asarray(scaled_train["label"].values, dtype=np.int32)  # int32 is sufficient
print(f"Labels conversion completed in {time.time() - t2:.2f}s")
print(f"Labels shape: {y.shape}")

print("[5/7] Clearing unused data to free memory...")
del scaled_train
del data_train

print("[6/7] Performing PCA reduction...")
t3 = time.time()

# Report (but do not remove) near-constant features.
feature_variance = np.var(x, axis=0)
low_var_features = np.sum(feature_variance < 1e-6)
if low_var_features > 0:
    print(f"Warning: {low_var_features} features have very low variance")

# NOTE(review): the scaler and PCA are fitted on all rows before the
# train/test split below, so test rows influence the projection.
scaler = StandardScaler(copy=True)
x_scaled = scaler.fit_transform(x)

# A float n_components keeps as many components as needed to reach 95%
# of the variance.
pca = PCA(n_components=0.95, random_state=42)
x_reduced = pca.fit_transform(x_scaled)

cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

print(f"PCA completed in {time.time() - t3:.2f}s")
print("\nPCA Variance Analysis:")
print("-" * 50)
print(f"Total variance preserved: {pca.explained_variance_ratio_.sum():.3%}")
print(f"Number of components needed for 95% variance: {pca.n_components_}")
print(f"Original dimensionality: {x.shape[1]}")
print(f"Dimensionality reduction ratio: {pca.n_components_/x.shape[1]:.3%}")

# Print detailed component analysis
print("\nTop 5 components variance explanation:")
for i in range(min(5, pca.n_components_)):
    print(
        f"Component {i+1}: {pca.explained_variance_ratio_[i]:.3%} "
        f"(Cumulative: {cumulative_variance_ratio[i]:.3%})"
    )

print(f"\nReduced features shape: {x_reduced.shape}")

# Train test split
print("\n[7/7] Performing train-test split...")
t4 = time.time()
# Stratified 80/20 split so both parts keep the benign/malicious ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x_reduced, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)
print(f"Split completed in {time.time() - t4:.2f}s")

# Print final shapes
print("\nFinal shapes:")
print(f"x_train: {x_train.shape}")
print(f"x_test: {x_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")
print(f"\nTotal pipeline time: {time.time() - t0:.2f}s")

# Memory cleanup
del x_reduced
del x
del x_scaled

# Save PCA model with scaler (plus the feature column order they expect)
print("\nSaving PCA model and scaler...")
joblib.dump(
    {"pca": pca, "scaler": scaler, "feature_columns": feature_columns},
    "pca_model.joblib",
)

In [None]:
# TensorBoard output directory for this run, stamped with the current date
# and time; read by train_model() when it builds the TensorBoard callback.
log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")


def create_packet_classifier(input_shape, num_classes=1):
    """Build the (uncompiled) dense classifier.

    Layout: input, batch normalisation, then three hidden stages of
    Dense(ReLU) -> BatchNormalization -> Dropout with 32, 64 and 256 units,
    followed by a sigmoid output layer.

    Args:
        input_shape: Shape of one sample, without the batch dimension.
        num_classes: Number of units in the sigmoid output layer.

    Returns:
        A ``tf.keras.Sequential`` model.
    """
    kernel_reg = regularizers.L1L2(l1=1e-6, l2=1e-5)
    bias_reg = regularizers.L2(1e-5)
    activity_reg = regularizers.L2(1e-6)

    # (units, dropout rate) for each hidden stage, in order.
    hidden_stages = [(32, 0.3), (64, 0.3), (256, 0.4)]

    stack = [
        layers.InputLayer(input_shape=input_shape),
        layers.BatchNormalization(),
    ]
    for units, drop_rate in hidden_stages:
        stack.append(
            layers.Dense(
                units,
                activation="relu",
                kernel_regularizer=kernel_reg,
                bias_regularizer=bias_reg,
                activity_regularizer=activity_reg,
            )
        )
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(drop_rate))

    stack.append(layers.Dense(num_classes, activation="sigmoid"))

    return tf.keras.Sequential(stack)


def train_model(model, x_train, y_train, x_val, y_val, batch_size=32, max_epochs=50):
    """Compile ``model`` for binary classification and fit it.

    Args:
        model: Keras model to compile and train (modified in place).
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels.
        batch_size: Mini-batch size passed to ``fit``.
        max_epochs: Maximum number of epochs; early stopping may end sooner.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    # Stop after 5 epochs without val_loss improvement and roll back to the best weights.
    stop_early = EarlyStopping(
        monitor="val_loss", patience=5, restore_best_weights=True, verbose=1
    )
    # Halve the learning rate after 3 stagnant epochs, down to 1e-6.
    shrink_lr = ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6, verbose=1
    )
    # Keep only the checkpoint with the best validation accuracy on disk.
    keep_best = ModelCheckpoint(
        "best_packet_classifier.keras",
        monitor="val_accuracy",
        save_best_only=True,
        verbose=1,
    )
    # Writes to the notebook-level `log_dir`.
    board = TensorBoard(log_dir=log_dir, histogram_freq=1)

    adam = tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1.0)
    tracked_metrics = [
        "accuracy",
        tf.keras.metrics.AUC(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ]

    model.compile(
        optimizer=adam,
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=tracked_metrics,
    )

    return model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        batch_size=batch_size,
        epochs=max_epochs,
        callbacks=[stop_early, shrink_lr, keep_best, board],
        verbose=1,
    )

In [None]:
# Per-sample feature shape (everything after the batch axis).
input_shape = x_train.shape[1:]
print("Shaped")
model = create_packet_classifier(input_shape)
print("model")
# NOTE(review): the held-out test split is passed as validation data, so
# early stopping and checkpoint selection are tuned on the test set.
history = train_model(model, x_train, y_train, x_test, y_test)

In [None]:
# Convert the capture-34 log into a header-less CSV.
with open("34.conn.log.labeled", "r") as f:
    lines = f.readlines()

# The first 8 lines and the final line are left out; tabs and three-space
# runs in the remaining lines become commas.
with open("34.conn.log.labeled.csv", "w") as ff:
    ff.writelines(
        row.replace("\t", ",").replace("   ", ",") for row in lines[8:-1]
    )

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (confusion_matrix, roc_curve, precision_recall_curve, 
                           f1_score, auc, accuracy_score, precision_score, recall_score)
from sklearn.decomposition import PCA
from tensorflow.keras.models import load_model
import os
import time

# Load and preprocess data
#df = pd.read_csv("test3.log.labeled.csv")


def load_csv_parallel(file, columns):
    """Read ``{file}.csv`` in one-million-row chunks and return a single DataFrame.

    Args:
        file: Path of the source log without the ``.csv`` suffix.
        columns: Column names to assign to the header-less CSV.

    Returns:
        All rows of the file as one DataFrame with a fresh 0..n-1 index.
    """
    options = {
        "names": columns,
        "engine": "c",
        "low_memory": False,
        "memory_map": True,
        "cache_dates": True,
        "chunksize": 1_000_000,
    }
    pieces = pd.read_csv(f"{file}.csv", **options)
    whole = pd.concat(pieces, ignore_index=True, copy=False)
    return whole

# Upper bound on the number of loader processes.
num_cores = 62

# Evaluation captures; "{name}.csv" must already exist for each of them.
eval_files = ["8.conn.log.labeled", "34.conn.log.labeled"]

# One task per file: never start more workers than there are files.
with mp.Pool(max(1, min(num_cores, len(eval_files)))) as pool:
    dataframes = pool.starmap(
        load_csv_parallel, [(file, columns) for file in eval_files]
    )

# Stack the captures into one evaluation frame with a fresh index.
df = pd.concat(dataframes, ignore_index=True, copy=False)

del dataframes

# Column schema, restated here; it is assigned to `df` right below.
columns = [
    "ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p",
    "proto", "service", "duration", "orig_bytes", "resp_bytes",
    "conn_state", "local_orig", "local_resp", "missed_bytes",
    "history", "orig_pkts", "orig_ip_bytes", "resp_pkts",
    "resp_ip_bytes", "tunnel_parents", "label", "detailed-label"
]

df.columns = columns
# Columns that must exist before preprocessing; any that is missing is
# added as an all-NaN column.
required_columns = [
    "proto", "service", "conn_state", "history", "local_orig", "local_resp",
    "orig_bytes", "resp_bytes", "orig_ip_bytes", "resp_ip_bytes", "resp_pkts"
]

for col in required_columns:
    if col not in df.columns:
        df[col] = np.nan

# Store actual labels
# 'Benign' -> 0, 'Malicious' -> 1; any other value maps to NaN and is then
# counted as 0 (benign).
# NOTE(review): preprocess() encodes every value other than "Benign" as 1,
# so the two encodings disagree for unexpected label strings — confirm the
# labels only ever take these two values.
actual_labels = df['label'].map({'Benign': 0, 'Malicious': 1}).fillna(0).astype(int)

# Preprocess the data
# preprocess() works on a copy, so `df` keeps its raw columns (e.g. 'proto')
# for the later analysis; the returned frame still contains a `label` column.
preprocessed_data = preprocess(df)

# Load model and make predictions
model = load_model("best_packet_classifier.keras")

# Perform PCA
print("Performing PCA reduction...")
# NOTE(review): this fits a fresh 20-component PCA on the evaluation data
# instead of reusing the scaler/PCA stored in "pca_model.joblib" by the
# training cell; the projection therefore differs from the one the model
# was trained on, and 20 must happen to equal the model's input width.
pca = PCA(n_components=20, random_state=42)
# The ground-truth `label` column must not be part of the feature matrix:
# the model was trained without it, and leaving it in leaks the answer
# into the PCA input.
x_new = preprocessed_data.drop(columns=["label"], errors="ignore").to_numpy(
    dtype="float32"
)
x_new_reduced = pca.fit_transform(x_new)
print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")

# Get prediction probabilities
predictions_prob = model.predict(x_new_reduced)

# Create a more comprehensive threshold analysis
thresholds = np.linspace(0.1, 0.99, 90)  # 0.10, 0.11, ..., 0.99
metrics = []

print("Analyzing different threshold values...")
for threshold in thresholds:
    # Flatten the (n, 1) probability column to the 1-D shape the metric
    # functions expect.
    predictions = (predictions_prob > threshold).astype("int").ravel()

    # Calculate metrics
    acc = accuracy_score(actual_labels, predictions)
    prec = precision_score(actual_labels, predictions)
    rec = recall_score(actual_labels, predictions)
    f1 = f1_score(actual_labels, predictions)

    # Calculate true and false positive rates
    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs at
    # this threshold; without it the four-way unpack below would fail.
    tn, fp, fn, tp = confusion_matrix(
        actual_labels, predictions, labels=[0, 1]
    ).ravel()
    tpr = tp / (tp + fn)  # Recall
    fpr = fp / (fp + tn)

    metrics.append({
        'threshold': threshold,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'true_positive_rate': tpr,
        'false_positive_rate': fpr,
        'true_positives': tp,
        'false_positives': fp,
        'true_negatives': tn,
        'false_negatives': fn
    })

# Convert to DataFrame for easier analysis (one row per threshold)
metrics_df = pd.DataFrame(metrics)

# Plot comprehensive metrics across thresholds
# Top panel: accuracy, precision, recall and F1 as a function of the threshold.
plt.figure(figsize=(15, 8))
plt.subplot(2, 1, 1)
plt.plot(metrics_df['threshold'], metrics_df['accuracy'], label='Accuracy', linewidth=2)
plt.plot(metrics_df['threshold'], metrics_df['precision'], label='Precision', linewidth=2)
plt.plot(metrics_df['threshold'], metrics_df['recall'], label='Recall', linewidth=2)
plt.plot(metrics_df['threshold'], metrics_df['f1_score'], label='F1 Score', linewidth=2)
plt.grid(True, alpha=0.3)
plt.title('Model Metrics vs Confidence Threshold',
          pad=20, fontsize=14, fontweight='bold')
plt.xlabel('Confidence Threshold')
plt.ylabel('Score')
plt.legend()

# Plot true positives and false positives
# Bottom panel: raw counts rather than rates.
plt.subplot(2, 1, 2)
plt.plot(metrics_df['threshold'], metrics_df['true_positives'], 
         label='True Positives', color='green', linewidth=2)
plt.plot(metrics_df['threshold'], metrics_df['false_positives'], 
         label='False Positives', color='red', linewidth=2)
plt.grid(True, alpha=0.3)
plt.title('Classification Counts vs Confidence Threshold',
          pad=20, fontsize=14, fontweight='bold')
plt.xlabel('Confidence Threshold')
plt.ylabel('Number of Predictions')
plt.legend()

plt.tight_layout()
plt.show()

# Find optimal threshold based on different criteria
# idxmax returns the first row at which each metric peaks.
optimal_f1_threshold = metrics_df.loc[metrics_df['f1_score'].idxmax(), 'threshold']
optimal_accuracy_threshold = metrics_df.loc[metrics_df['accuracy'].idxmax(), 'threshold']
optimal_precision_threshold = metrics_df.loc[metrics_df['precision'].idxmax(), 'threshold']

# Print optimal thresholds and their metrics
print("\nOptimal Thresholds Analysis:")
print("-" * 50)
print(f"Optimal threshold (F1 Score): {optimal_f1_threshold:.3f}")
print(f"Optimal threshold (Accuracy): {optimal_accuracy_threshold:.3f}")
print(f"Optimal threshold (Precision): {optimal_precision_threshold:.3f}")

# Let's find a threshold that balances precision and recall
# i.e. the row where |precision - recall| is smallest.
balance_idx = (metrics_df['precision'] - metrics_df['recall']).abs().idxmin()
balanced_threshold = metrics_df.loc[balance_idx, 'threshold']
print(f"Balanced threshold (Precision ≈ Recall): {balanced_threshold:.3f}")

print("\nMetrics at different thresholds:")
thresholds_to_show = [0.1, 0.3, 0.5, 0.7, 0.85, 0.9, 0.925, 0.95, 0.99]
print("\nThreshold  Accuracy  Precision  Recall  F1-Score  TP  FP")
print("-" * 65)
for t in thresholds_to_show:
    # Rows are matched on the threshold rounded to two decimals, so 0.925 is
    # looked up (and printed) as its two-decimal rounding.
    # NOTE(review): .iloc[0] raises IndexError if no swept threshold rounds
    # to the requested value.
    row = metrics_df[metrics_df['threshold'].round(2) == round(t, 2)].iloc[0]
    print(f"{t:9.2f}  {row['accuracy']:8.3f}  {row['precision']:9.3f}  "
          f"{row['recall']:6.3f}  {row['f1_score']:8.3f}  "
          f"{int(row['true_positives']):3d}  {int(row['false_positives']):3d}")

# Use threshold of 0.9 as recommended
chosen_threshold = 0.9
predictions = (predictions_prob > chosen_threshold).astype("int")

# Create confusion matrix with chosen threshold
plt.figure(figsize=(10, 8))
# labels=[0, 1] guarantees a 2x2 matrix (rows = actual, columns = predicted)
# even if one class is absent; the labelled frame below and the later
# cm[0,0] ... cm[1,1] reads all rely on that shape.
cm = confusion_matrix(actual_labels, predictions, labels=[0, 1])

# Create labeled confusion matrix
labels = ['Benign', 'Malicious']
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# Calculate percentages for annotations (share of all samples per cell)
cm_norm = cm.astype('float') / cm.sum()
annotations = np.array([f'{count}\n({percentage:.1%})'
                       for count, percentage in zip(cm.flatten(), cm_norm.flatten())])
annotations = annotations.reshape(cm.shape)

# Plot confusion matrix with better styling
# fmt='' because the annotations are already formatted strings.
sns.heatmap(cm_df, 
            annot=annotations,
            fmt='',
            cmap='Blues',
            square=True,
            cbar=True)

plt.title(f'Confusion Matrix\nConfidence Threshold: {chosen_threshold:.3f}', 
          pad=20, fontsize=14, fontweight='bold')
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.tight_layout()
plt.show()

# ROC Curve
# Built from the raw probabilities, so it does not depend on chosen_threshold.
plt.figure(figsize=(10, 6))
fpr, tpr, _ = roc_curve(actual_labels, predictions_prob)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, color='darkorange', lw=2, 
         label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve', 
          pad=20, fontsize=14, fontweight='bold')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Protocol-wise analysis
# One row per connection: its protocol, true label and thresholded prediction.
protocol_comparison = pd.DataFrame({
    'Protocol': df['proto'],
    'Actual': actual_labels,
    'Predicted': predictions.flatten()
})

# Per protocol: share of malicious rows (actual and predicted) and row count.
protocol_stats = protocol_comparison.groupby('Protocol').agg({
    'Actual': ['mean', 'count'],
    'Predicted': 'mean'
})

protocol_stats.columns = ['Actual_Rate', 'Count', 'Predicted_Rate']
protocol_stats = protocol_stats.sort_values('Count', ascending=False)

# Convert to percentages
protocol_stats['Actual_Rate'] *= 100
protocol_stats['Predicted_Rate'] *= 100

plt.figure(figsize=(12, 6))
# Draw on the figure created above.  Without `ax`, DataFrame.plot opens a
# figure of its own, leaving the 12x6 one blank and its size unused.
ax = protocol_stats[['Actual_Rate', 'Predicted_Rate']].plot(
    kind='bar',
    rot=45,
    width=0.8,
    ax=plt.gca()
)

plt.title(f'Protocol-wise Classification Comparison\nConfidence Threshold: {chosen_threshold:.3f}', 
          pad=20, fontsize=14, fontweight='bold')
plt.xlabel('Protocol (sorted by frequency)')
plt.ylabel('Percentage Classified as Malicious')
plt.legend(['Actual', 'Predicted'])

# Add count annotations just above the taller bar of each protocol
for i, (idx, row) in enumerate(protocol_stats.iterrows()):
    plt.text(i, max(row['Actual_Rate'], row['Predicted_Rate']) + 1,
             f'n={int(row["Count"])}',
             ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Print final performance metrics
# All values below are computed at chosen_threshold.
print("\nFinal Model Performance Metrics:")
print("-" * 50)
print(f"Confidence Threshold: {chosen_threshold:.3f}")
print(f"Accuracy: {accuracy_score(actual_labels, predictions):.4f}")
print(f"Precision: {precision_score(actual_labels, predictions):.4f}")
print(f"Recall: {recall_score(actual_labels, predictions):.4f}")
print(f"F1 Score: {f1_score(actual_labels, predictions):.4f}")

# Rows of `cm` are actual classes, columns are predicted classes.
print("\nConfusion Matrix Interpretation:")
print("-" * 50)
print(f"True Negatives (Correctly identified benign traffic): {cm[0,0]}")
print(f"False Positives (Benign traffic misclassified as malicious): {cm[0,1]}")
print(f"False Negatives (Malicious traffic misclassified as benign): {cm[1,0]}")
print(f"True Positives (Correctly identified malicious traffic): {cm[1,1]}")

# Protocol-wise Performance
print("\nProtocol-wise Performance:")
print("-" * 50)
for protocol, stats in protocol_stats.iterrows():
    print(f"{protocol:10} - Actual: {stats['Actual_Rate']:.1f}% | "
          f"Predicted: {stats['Predicted_Rate']:.1f}% | "
          f"Count: {int(stats['Count'])}")

# Save results
# Per-row results: the raw frame plus true label, predicted label and probability.
os.makedirs("data", exist_ok=True)
df['actual_label'] = actual_labels
df['predicted_label'] = predictions
df['prediction_probability'] = predictions_prob
df.to_csv("data/comparison_results.csv", index=False)

# Save threshold analysis
metrics_df.to_csv("data/threshold_analysis.csv", index=False)

# Save summary statistics
# Values are cast to built-in int/float so json.dump can serialise them.
summary_stats = {
    "chosen_threshold": float(chosen_threshold),
    "optimal_thresholds": {
        "f1_score": float(optimal_f1_threshold),
        "accuracy": float(optimal_accuracy_threshold),
        "precision": float(optimal_precision_threshold),
        "balanced": float(balanced_threshold)
    },
    "total_packets": len(df),
    "actual_malicious": int(actual_labels.sum()),
    "predicted_malicious": int(predictions.sum()),
    "confusion_matrix": {
        "true_negatives": int(cm[0,0]),
        "false_positives": int(cm[0,1]),
        "false_negatives": int(cm[1,0]),
        "true_positives": int(cm[1,1])
    },
    "metrics": {
        "accuracy": float(accuracy_score(actual_labels, predictions)),
        "precision": float(precision_score(actual_labels, predictions)),
        "recall": float(recall_score(actual_labels, predictions)),
        "f1_score": float(f1_score(actual_labels, predictions)),
        "roc_auc": float(roc_auc)
    },
    "protocol_wise_stats": protocol_stats.to_dict(),
    "analysis_timestamp": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
}

with open("data/comparison_metrics.json", "w") as f:
    json.dump(summary_stats, f, indent=4)

print("\nResults saved to 'data' directory")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import auc, roc_curve

# Calculate ROC curve points
# Note: this rebinds the notebook-level name `thresholds` to the ROC thresholds.
fpr, tpr, thresholds = roc_curve(actual_labels, predictions_prob)
roc_auc = auc(fpr, tpr)

# Create figure
plt.figure(figsize=(15, 10))

# Plot main ROC curve
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.3f})")

# Add diagonal line (random classifier)
plt.plot(
    [0, 1], [0, 1], color="navy", linestyle="--", label="Random Classifier (AUC = 0.5)"
)

# Add key threshold points
key_thresholds = [0.9, 0.7, 0.5, 0.3]
colors = ["red", "green", "blue", "purple"]

for threshold, color in zip(key_thresholds, colors):
    # Find closest threshold value
    # (the marker sits on the nearest ROC point, which need not be exactly
    # at the requested threshold)
    idx = np.argmin(np.abs(thresholds - threshold))
    plt.plot(
        fpr[idx],
        tpr[idx],
        "o",
        color=color,
        markersize=10,
        label=f"Threshold = {threshold:.1f}",
    )

    # Add annotation with actual values
    plt.annotate(
        f"TPR: {tpr[idx]:.3f}\nFPR: {fpr[idx]:.3f}",
        xy=(fpr[idx], tpr[idx]),
        xytext=(10, -10),
        textcoords="offset points",
        ha="left",
        va="top",
        bbox=dict(boxstyle="round,pad=0.5", fc="white", alpha=0.7),
        arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0"),
    )

# Customize plot
# Limits extend slightly past [0, 1] so markers on the border stay visible.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False Positive Rate (1 - Specificity)", fontsize=12)
plt.ylabel("True Positive Rate (Sensitivity)", fontsize=12)
plt.title(
    "Receiver Operating Characteristic (ROC) Curve Analysis\n"
    "with Different Confidence Thresholds",
    pad=20,
    fontsize=14,
    fontweight="bold",
)

# Add grid
plt.grid(True, alpha=0.3)

# Add legend
plt.legend(loc="lower right", fontsize=10)

# Add explanatory text
plt.text(
    0.6,
    0.2,
    "ROC Curve Interpretation:\n\n"
    "- Closer to top-left corner = better performance\n"
    "- AUC = Area Under Curve (1.0 = perfect, 0.5 = random)\n"
    "- Higher threshold = fewer false positives\n"
    "- Lower threshold = fewer false negatives",
    bbox=dict(facecolor="white", alpha=0.7),
    fontsize=10,
)

plt.tight_layout()
plt.show()

# Print detailed analysis
print("\nROC Curve Analysis:")
print("-" * 50)
print(f"AUC Score: {roc_auc:.3f}")
print("\nPerformance at key thresholds:")
for threshold in key_thresholds:
    # Same nearest-point lookup as used for the markers above.
    idx = np.argmin(np.abs(thresholds - threshold))
    print(f"\nThreshold: {threshold:.1f}")
    print(f"True Positive Rate: {tpr[idx]:.3f}")
    print(f"False Positive Rate: {fpr[idx]:.3f}")
    print(f"Specificity (True Negative Rate): {1-fpr[idx]:.3f}")

In [None]:
# After calculating metrics for each threshold, but before the confusion matrix:

# Create performance comparison bar graph
plt.figure(figsize=(15, 6))

# Data for the graph
# NOTE(review): these numbers are literals, not read from metrics_df —
# presumably copied from an earlier run; they will not follow a re-trained
# model or different evaluation data.
# This also rebinds the notebook-level names `thresholds` and `x`.
thresholds = [0.10, 0.30, 0.50, 0.70, 0.85, 0.90, 0.93, 0.95, 0.99]
accuracies = [0.963, 0.957, 0.955, 0.958, 0.957, 0.958, 0.026, 0.038, 0.031]
precisions = [0.974, 0.974, 0.974, 0.978, 0.978, 0.979, 0.458, 0.692, 0.562]
recalls = [0.989, 0.982, 0.980, 0.980, 0.978, 0.977, 0.017, 0.016, 0.008]
f1_scores = [0.981, 0.978, 0.977, 0.979, 0.978, 0.978, 0.033, 0.032, 0.015]

x = np.arange(len(thresholds))
width = 0.2  # Width of bars

# Creating bars
# Four bars per threshold, centred on the tick: offsets of -1.5, -0.5, +0.5
# and +1.5 bar widths.
plt.bar(x - 1.5 * width, accuracies, width, label="Accuracy", color="skyblue")
plt.bar(x - 0.5 * width, precisions, width, label="Precision", color="lightgreen")
plt.bar(x + 0.5 * width, recalls, width, label="Recall", color="salmon")
plt.bar(x + 1.5 * width, f1_scores, width, label="F1-Score", color="mediumpurple")

# Customizing the plot
plt.xlabel("Confidence Threshold")
plt.ylabel("Score")
plt.title(
    "Performance Metrics Across Different Confidence Thresholds",
    pad=20,
    fontsize=14,
    fontweight="bold",
)
plt.xticks(x, [f"{t:.2f}" for t in thresholds], rotation=45)
plt.legend()

# Add grid for better readability
plt.grid(True, axis="y", alpha=0.3)

# Ensure layout is tight and nothing is cut off
plt.tight_layout()
plt.show()

# Continue with your existing confusion matrix code...

In [None]:
# Side-by-side bars: share of benign vs malicious traffic, actual against predicted.
plt.figure(figsize=(10, 6))

comparison_data = pd.DataFrame(
    {"Actual": actual_labels, "Predicted": predictions.flatten()}
)

# Class shares in percent, rounded to two decimals.
actual_pct = (
    comparison_data["Actual"].value_counts() / len(actual_labels) * 100
).round(2)
predicted_pct = (
    comparison_data["Predicted"].value_counts() / len(predictions) * 100
).round(2)

x = np.arange(2)
width = 0.35

# Bar heights for class 0 and class 1; a class that never occurs plots as 0.
actual_heights = [actual_pct.get(0, 0), actual_pct.get(1, 0)]
predicted_heights = [predicted_pct.get(0, 0), predicted_pct.get(1, 0)]

plt.bar(x - width / 2, actual_heights, width, label="Actual", color="lightblue")
plt.bar(
    x + width / 2, predicted_heights, width, label="Predicted", color="lightcoral"
)

plt.xlabel("Traffic Type")
plt.ylabel("Percentage of Total Traffic")
plt.title(
    f"Actual vs Predicted Traffic Distribution\nConfidence Threshold: {chosen_threshold:.2f}",
    pad=20,
    fontsize=14,
    fontweight="bold",
)
plt.xticks(x, ["Benign Traffic", "Malicious Traffic"])
plt.legend()

# Percentage label on top of every bar: actual bars first, then predicted.
for i, v in enumerate(actual_heights):
    plt.text(i - width / 2, v, f"{v:.1f}%", ha="center", va="bottom")
for i, v in enumerate(predicted_heights):
    plt.text(i + width / 2, v, f"{v:.1f}%", ha="center", va="bottom")

# Horizontal grid lines for readability.
plt.grid(True, axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
import glob
import os
import time

import tensorflow as tf
from tensorflow.python.summary.summary_iterator import summary_iterator


def find_latest_log_file(log_dir):
    """Return the most recently modified TensorBoard event file under log_dir.

    The search is recursive: files named ``events.out.tfevents.*`` are matched
    in ``log_dir`` itself and in any directory below it.

    Args:
        log_dir: Directory to search.

    Returns:
        Path (as returned by ``glob``) of the match with the newest mtime.

    Raises:
        FileNotFoundError: If no event file is found.
    """
    pattern = os.path.join(log_dir, "**", "events.out.tfevents.*")
    candidates = glob.glob(pattern, recursive=True)
    if candidates:
        return max(candidates, key=os.path.getmtime)
    raise FileNotFoundError(f"No event files found in {log_dir}")


def extract_metrics_from_logs(event_file):
    """Collect scalar training metrics from one TensorBoard event file.

    Only summary values tagged "loss", "accuracy", "val_loss" or
    "val_accuracy" that actually carry a ``simple_value`` are recorded.

    Args:
        event_file: Path of the event file to read.

    Returns:
        A dict with one list per tracked tag (values in file order) plus
        "steps", which holds ``event.step`` for every recorded "loss" value.
        Reading is best-effort: if the file cannot be read, the error is
        printed and whatever was collected so far is returned.
    """
    tracked_tags = ("loss", "accuracy", "val_loss", "val_accuracy")
    metrics = {tag: [] for tag in tracked_tags}
    metrics["steps"] = []
    try:
        for event in summary_iterator(event_file):
            if not event.HasField("summary"):
                continue
            for value in event.summary.value:
                tag = value.tag
                if tag not in tracked_tags:
                    continue
                # hasattr() is true for every declared protobuf field, set or
                # not, so it cannot tell a real scalar from an unset one (which
                # reads back as 0.0). HasField() reports whether this value
                # really holds a simple_value.
                # NOTE(review): scalars stored in `value.tensor` instead of
                # `simple_value` are skipped here - confirm which summary
                # writer produced the logs.
                if not value.HasField("simple_value"):
                    continue
                metrics[tag].append(value.simple_value)
                if tag == "loss":  # Only add step once per iteration
                    metrics["steps"].append(event.step)
    except Exception as e:
        # Deliberately broad: a damaged or unreadable log should not abort the
        # notebook cell; report it and return what was collected.
        print(f"Error reading event file: {e}")

    return metrics


# Load the newest TensorBoard event file under logs/fit, plot the loss and
# accuracy curves side by side, and print a short summary of the best values.
try:
    # Find and load the latest log file
    log_dir = "logs/fit"
    latest_event_file = find_latest_log_file(log_dir)
    print(f"Reading logs from: {latest_event_file}")

    # Extract metrics
    metrics = extract_metrics_from_logs(latest_event_file)

    if any(metrics.values()):
        import matplotlib.pyplot as plt
        import numpy as np

        steps = metrics["steps"]

        # Position of the lowest validation loss, or None when no validation
        # loss was logged. Computed up front because the loss plot, the
        # accuracy plot and the summary all use it; assigning it only inside
        # the loss-plot branch raised NameError whenever that branch was
        # skipped.
        min_loss_idx = (
            int(np.argmin(metrics["val_loss"])) if metrics["val_loss"] else None
        )
        # Step at which to draw the marker; None when there is no validation
        # loss or no step was recorded for that position.
        best_step = (
            steps[min_loss_idx]
            if min_loss_idx is not None and min_loss_idx < len(steps)
            else None
        )

        def mark_best_validation_loss(ax):
            """Draw the dashed marker at the lowest validation loss, if known."""
            if best_step is None:
                return
            ax.axvline(
                x=best_step,
                color="r",
                linestyle="--",
                label=f"Early Stopping\nEpoch {min_loss_idx}",
            )

        plt.style.use("default")
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Plot Loss
        if metrics["loss"] and metrics["val_loss"]:
            ax1.plot(steps, metrics["loss"], label="Training Loss", color="blue")
            ax1.plot(
                steps,
                metrics["val_loss"],
                label="Validation Loss",
                color="orange",
            )
            mark_best_validation_loss(ax1)

        ax1.set_title("Model Loss Over Time", pad=20, fontsize=14, fontweight="bold")
        ax1.set_xlabel("Steps")
        ax1.set_ylabel("Loss")
        ax1.grid(True, alpha=0.3)
        ax1.legend()

        # Plot Accuracy
        if metrics["accuracy"] and metrics["val_accuracy"]:
            ax2.plot(
                steps,
                metrics["accuracy"],
                label="Training Accuracy",
                color="blue",
            )
            ax2.plot(
                steps,
                metrics["val_accuracy"],
                label="Validation Accuracy",
                color="orange",
            )
            mark_best_validation_loss(ax2)

        ax2.set_title(
            "Model Accuracy Over Time", pad=20, fontsize=14, fontweight="bold"
        )
        ax2.set_xlabel("Steps")
        ax2.set_ylabel("Accuracy")
        ax2.grid(True, alpha=0.3)
        ax2.legend()

        plt.tight_layout()
        plt.show()

        # Print summary statistics
        print("\nTraining Summary:")
        print("-" * 50)
        if metrics["loss"]:
            print(f"Total steps: {len(steps)}")
            # Only report the best validation loss when one was logged;
            # min() on an empty list would raise ValueError.
            if best_step is not None:
                print(
                    f"Best validation loss: {min(metrics['val_loss']):.4f} at step {best_step}"
                )
            if metrics["accuracy"]:
                max_acc = max(metrics["accuracy"])
                max_acc_idx = np.argmax(metrics["accuracy"])
                print(
                    f"Best training accuracy: {max_acc:.4f} at step {steps[max_acc_idx]}"
                )
            if metrics["val_accuracy"]:
                max_val_acc = max(metrics["val_accuracy"])
                max_val_acc_idx = np.argmax(metrics["val_accuracy"])
                print(
                    f"Best validation accuracy: {max_val_acc:.4f} at step {steps[max_val_acc_idx]}"
                )
    else:
        print("No metrics found in the log file.")

except FileNotFoundError as e:
    # Raised by find_latest_log_file when no event file exists under log_dir.
    print(f"Error: {e}")
    print(
        "Please check if the log directory exists and contains TensorBoard event files."
    )