In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Concatenate

# Paths to datasets
csv_path = 'csv/'
cicids_path = 'cicids/'

# Function to load CSV files from a folder and merge them into a single DataFrame
def load_and_merge_csv(folder_path):
    all_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]
    dataframes = [pd.read_csv(file) for file in all_files]
    return pd.concat(dataframes, ignore_index=True)

# Load datasets
csv_data = load_and_merge_csv(csv_path)
cicids_data = load_and_merge_csv(cicids_path)

# Preprocessing for CSV dataset
def preprocess_csv_data(data):
    # Standardize numerical fields
    scaler = StandardScaler()
    numerical_cols = ['time', 'data_len', 'src_port', 'dst_port']
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Encode protocol field
    data['proto'] = LabelEncoder().fit_transform(data['proto'])

    # Convert IP addresses to numerical representations
    data['ip_src'] = data['ip_src'].apply(lambda x: int.from_bytes(map(int, x.split('.')), 'big'))
    data['ip_dst'] = data['ip_dst'].apply(lambda x: int.from_bytes(map(int, x.split('.')), 'big'))

    return data

# Preprocessing for CICIDS dataset
def preprocess_cicids_data(data):
    # Drop rows with any missing or infinite values
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.dropna(inplace=True)

    # Separate features and labels
    labels = data[' Label']  # Extract labels
    features = data.drop(columns=[' Label'])  # Remove label column from features

    # Ensure all features are numeric
    features = features.apply(pd.to_numeric, errors='coerce')  # Convert non-numeric values
    features.fillna(0, inplace=True)  # Replace remaining NaNs with 0

    # Encode labels (BENIGN = 0, others = 1)
    labels = LabelEncoder().fit_transform(labels)

    # Standardize numerical fields
    features = StandardScaler().fit_transform(features)

    return features, labels

# Model creation
def build_model(csv_input_shape, cicids_input_shape):
    # Input layers
    csv_input = Input(shape=(csv_input_shape,), name="csv_input")
    cicids_input = Input(shape=(cicids_input_shape,), name="cicids_input")

    # CSV branch
    csv_branch = Dense(64, activation='relu')(csv_input)
    csv_branch = BatchNormalization()(csv_branch)
    csv_branch = Dropout(0.3)(csv_branch)

    # CICIDS branch
    cicids_branch = Dense(64, activation='relu')(cicids_input)
    cicids_branch = BatchNormalization()(cicids_branch)
    cicids_branch = Dropout(0.3)(cicids_branch)

    # Concatenate branches
    merged = Concatenate()([csv_branch, cicids_branch])
    merged = Dense(128, activation='relu')(merged)
    merged = Dropout(0.4)(merged)
    merged = Dense(64, activation='relu')(merged)

    # Output layer
    output = Dense(1, activation='sigmoid', name="output")(merged)

    # Create model
    model = Model(inputs=[csv_input, cicids_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Preprocess both datasets
csv_data = preprocess_csv_data(csv_data)  # Returns a DataFrame
cicids_features, cicids_labels = preprocess_cicids_data(cicids_data)  # Features and labels

# Ensure both datasets have the same number of samples
min_samples = min(len(csv_data), len(cicids_features))
csv_data = csv_data.iloc[:min_samples]
cicids_features = cicids_features[:min_samples]
cicids_labels = cicids_labels[:min_samples]

# Split datasets
from sklearn.model_selection import train_test_split

# Split CSV data into training and testing sets
X_csv_train, X_csv_test = train_test_split(csv_data, test_size=0.2, random_state=42)

# Split CICIDS data into training and testing sets
X_cicids_train, X_cicids_test, y_cicids_train, y_cicids_test = train_test_split(
    cicids_features, cicids_labels, test_size=0.2, random_state=42
)

# Check if the number of samples match
print("X_csv_train shape:", X_csv_train.shape)
print("X_cicids_train shape:", X_cicids_train.shape)
print("y_cicids_train shape:", y_cicids_train.shape)

# Ensure the number of samples match between X_csv_train, X_cicids_train, and y_cicids_train
assert X_csv_train.shape[0] == X_cicids_train.shape[0] == y_cicids_train.shape[0], "Mismatched sample sizes!"

# Build the model
model = build_model(X_csv_train.shape[1], X_cicids_train.shape[1])

# Train the model
history = model.fit(
    [X_csv_train, X_cicids_train], y_cicids_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    verbose=1
)

# Save the model
model.save('self_healing_network_model.h5')

# Evaluate the model
evaluation = model.evaluate([X_csv_test, X_cicids_test], y_cicids_test, verbose=1)

print("Test Loss:", evaluation[0])
print("Test Accuracy:", evaluation[1])


X_csv_train shape: (2262300, 7)
X_cicids_train shape: (2262300, 78)
y_cicids_train shape: (2262300,)
Epoch 1/10
[1m56558/56558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 3ms/step - accuracy: 0.2527 - loss: -145214423040.0000 - val_accuracy: 0.0729 - val_loss: -1552586768384.0000
Epoch 2/10
[1m56558/56558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 3ms/step - accuracy: 0.2581 - loss: -7787124883456.0000 - val_accuracy: 0.0430 - val_loss: -20391265304576.0000
Epoch 3/10
[1m56558/56558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 3ms/step - accuracy: 0.2589 - loss: -61957403049984.0000 - val_accuracy: 0.0776 - val_loss: 2426378423107584.0000
Epoch 4/10
[1m56558/56558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 3ms/step - accuracy: 0.2640 - loss: -243487232491520.0000 - val_accuracy: 0.0737 - val_loss: 1589964278071296.0000
Epoch 5/10
[1m56558/56558[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 3ms/step - accuracy: 0.2650 - los



[1m17675/17675[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2ms/step - accuracy: 0.0794 - loss: 535270229228388352.0000
Test Loss: 4.20437990737707e+17
Test Accuracy: 0.07966037839651108


In [1]:
import os

# Define the path to the .h5 file
h5_file_path = 'self_healing_network_model.keras'

# Define the new .keras file path
keras_file_path = 'self_healing_network_model.h5'

# Rename the file
os.rename(h5_file_path, keras_file_path)

print(f"File renamed to: {keras_file_path}")


File renamed to: self_healing_network_model.h5
