In [None]:

def load_dataset_from_folder(folder_path):
    """Load all CSV files from folder and combine into single DataFrame"""
    dataframes = []
    
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)
            dataframes.append(df)
            print(f"Loaded {file}: {df.shape}")
    
    if not dataframes:
        raise ValueError("No CSV files found in the folder")
    
    # Combine all dataframes
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"\nFinal dataset shape: {combined_df.shape}")
    
    return combined_df

# Usage - replace with your actual folder path
attack_path = "c:\\Users\\Korad\\Downloads\\ctu_13_testing\\attack"
normal_path = "C:\\Users\\korad\\Downloads\\ctu_13_testing\\normal"
df_attack = load_dataset_from_folder(attack_path)
df_normal = load_dataset_from_folder(normal_path)

Loaded CTU13_Attack_Traffic.csv: (38898, 59)

Final dataset shape: (38898, 59)
Loaded CTU13_Normal_Traffic.csv: (53314, 59)

Final dataset shape: (53314, 59)


In [7]:
import pandas as pd
def get_feature_set(df: pd.DataFrame, set_type: str) -> pd.DataFrame:
    if set_type.upper() == "A":
        selected_features = [f for f in set_a_features if f in df.columns]
    elif set_type.upper() == "B":
        selected_features = [f for f in set_b_features if f in df.columns]
    else:
        raise ValueError("Invalid set_type. Use 'A' for LSTM/CNN or 'B' for Random Forest.")
    
    return df[selected_features].copy()


def extract_row_features(row: pd.Series, set_type: str) -> pd.DataFrame:
    # Convert row to DataFrame
    row_df = pd.DataFrame([row])
    return get_feature_set(row_df, set_type)

# ----------------------------
# Example: file-level usage
# ----------------------------

def process_file(file_path: str, set_type: str) -> pd.DataFrame:
    df = pd.read_csv(file_path)
    filtered_df = get_feature_set(df, set_type)
    return filtered_df
def all_feats(file_path: str) -> pd.DataFrame:
    df = pd.read_csv(file_path)
    return df



In [15]:

def ensure_features(df: pd.DataFrame, required_features: list) -> pd.DataFrame:
    for feature in required_features:
        if feature not in df.columns:
            df[feature] = 0
    return df
def conversion_list_to_df(df_attack_layer_2,available_set_a):
    df_attack_layer_2 = pd.DataFrame(df_attack_layer_2, columns=list(available_set_a) + ["label"])
    return df_attack_layer_2


In [9]:
df_normal=all_feats("C:\\Users\\korad\\Downloads\\benign.csv")
df_normal=ensure_features(df_normal,['A','B'])

In [18]:
import os
df_attack_layer_2 = []
folder_path ="C:\\Users\\korad\\Downloads\\attacks_file\\csv_output"

for file_path in os.listdir(folder_path):
    full_path = os.path.join(folder_path, file_path)
    file_name = os.path.splitext(file_path)[0]  # get file name without extension
    df = process_file(full_path, 'A')  # process the file
    if file_name == "neris":
        df["label"] = 1
    elif file_name in ["rbot_1", "rbot_2"]:
        df["label"] = 2
    else:
        df["label"] = 3
    
    df_attack_layer_2.append(df)
df_attack_layer_2 = pd.concat(df_attack_layer_2, ignore_index=True)


In [11]:
df_attack_layer_3=[]
for file_path in os.listdir(folder_path):
    full_path = os.path.join(folder_path, file_path)
    file_name = os.path.splitext(file_path)[0]  # get file name without extension
    df = process_file(full_path, 'B')  # process the file
    if file_name == "menti":
        df["label"] = 1
    elif file_name=="murlo":
        df["label"] = 2
    elif file_name=="nsisay":
        df["label"] = 3
    elif file_name=="virut":
        df["label"] = 4
    
    df_attack_layer_3.append(df)

In [12]:
df_attack_layer_4_menti=[]
df_attack_layer_4_murlo=[]
df_attack_layer_4_nsisay=[]
df_attack_layer_4_virut=[]

for file_path in os.listdir(folder_path):
    full_path = os.path.join(folder_path, file_path)
    file_name = os.path.splitext(file_path)[0]  # get file name without extension
    df = all_feats(full_path) 
    df=ensure_features(df,['A','B'])
    if file_name == "menti":
        df_attack_layer_4_menti.append(df)
    elif file_name=="murlo":
        df_attack_layer_4_murlo.append(df)
    elif file_name=="nsisay":
        df_attack_layer_4_nsisay.append(df)
    elif file_name=="murlo":
        df_attack_layer_4_virut.append(df)

In [None]:
def build_vae_simple(input_dim, latent_dim=8):
    """Simple VAE without complex tensor operations"""
    # Encoder
    inputs = layers.Input(shape=(input_dim,))
    
    # Encoder
    x = layers.Dense(32, activation='relu')(inputs)
    x = layers.Dense(16, activation='relu')(x)
    
    # Latent space
    z_mean = layers.Dense(latent_dim, name='z_mean')(x)
    z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
    
    # Simple sampling using Keras ops only
    def sampling(args):
        z_mean, z_log_var = args
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon
    
    z = layers.Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
    
    # Decoder
    x = layers.Dense(16, activation='relu')(z)
    x = layers.Dense(32, activation='relu')(x)
    outputs = layers.Dense(input_dim, activation='linear')(x)
    
    vae = Model(inputs, outputs)
    vae.compile(optimizer='adam', loss='mse')
    
    return vae

# Train function
def train_layer1_vae(df):
    numeric_df = df.select_dtypes(include=[np.number])
    input_dim = numeric_df.shape[1]
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(numeric_df)
    
    vae = build_vae_simple(input_dim)
    
    print("Training VAE Layer 1...")
    history = vae.fit(X_scaled, X_scaled, 
                     epochs=50, 
                     batch_size=32, 
                     validation_split=0.2,
                     verbose=1)
    
    return vae, scaler

# Train
vae_layer1, scaler_layer1 = train_layer1_vae(df_normal)

In [None]:

#NEEDS TO BE VERIFIED
def detect_attacks_layer1(test_df, vae_model, scaler, threshold=0.1):
    """Run trained VAE on test data and detect attacks based on threshold"""
    
    # Select numeric features and scale
    numeric_test = test_df.select_dtypes(include=[np.number])
    X_test_scaled = scaler.transform(numeric_test)
    
    # Get reconstructions
    reconstructions = vae_model.predict(X_test_scaled)
    
    # Calculate MAE (Mean Absolute Error)
    mae = np.mean(np.abs(X_test_scaled - reconstructions), axis=1)
    
    # Classify as attack if MAE > threshold
    attack_mask = mae > threshold
    attack_df = test_df[attack_mask].copy()
    
    print(f"Detected {len(attack_df)} attacks out of {len(test_df)} samples")
    print(f"Attack rate: {len(attack_df)/len(test_df):.2%}")
    
    return attack_df, mae

# Usage
attack_df_layer1, mae_scores = detect_attacks_layer1(df_attack, vae_layer1, scaler_layer1, threshold=0.1)

In [4]:
set_a_features = [
    # Flow time & rate features
    'Flow Duration', 'Flow Byts/s', 'Flow Pkts/s',
    'Fwd Pkts/s', 'Bwd Pkts/s',

  # Inter-arrival times
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
    'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Bwd IAT Tot', 'Fwd IAT Tot',

    # Packet size stats (for temporal variance)
    'Pkt Len Mean', 'Pkt Len Std', 'Pkt Size Avg',

    # Active/Idle patterns (C&C periodic behaviour)
    'Active Mean', 'Idle Mean',

    # Subflow dynamics (important for Neris/Rbot)
    'Subflow Fwd Byts', 'Subflow Bwd Byts'
]
set_b_features = [
    # Protocol and ports
    'Protocol',

    # Flag-based patterns
    'SYN Flag Cnt', 'ACK Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'FIN Flag Cnt',

    # Flow-level packet stats
    'Flow Duration', 'Flow Byts/s', 'Flow Pkts/s',
    'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Max', 'Pkt Len Min',

    # Header and ratio cues
    'Fwd Header Len', 'Bwd Header Len', 'Down/Up Ratio'
]


In [None]:
import numpy as np
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
available_set_a = set_a_features
def build_lstm_cnn_model(input_shape):
    """Build combined LSTM + CNN model for Layer 2 - Fixed version"""
    inputs = layers.Input(shape=input_shape)
    
    # LSTM branch for temporal patterns
    lstm_branch = layers.LSTM(64, return_sequences=False)(inputs)  # Remove return_sequences
    
    # CNN branch for local patterns - need to handle 1D conv properly
    # For 1D CNN, input should be (batch, steps, features)
    cnn_branch = layers.Conv1D(32, kernel_size=1, activation='relu')(inputs)  # kernel_size=1 for single step
    cnn_branch = layers.Conv1D(64, kernel_size=1, activation='relu')(cnn_branch)
    cnn_branch = layers.GlobalMaxPooling1D()(cnn_branch)
    
    # Combine both branches
    combined = layers.concatenate([lstm_branch, cnn_branch])
    combined = layers.Dense(32, activation='relu')(combined)
    combined = layers.Dropout(0.3)(combined)
    
    # Output layer
    outputs = layers.Dense(1, activation='sigmoid')(combined)
    
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

def prepare_layer2_data(attack_df, temporal_features):
    """Prepare data for Layer 2 LSTM+CNN model"""
    # Select temporal features
    X = attack_df[temporal_features].values
    
    # Reshape for LSTM/CNN (samples, timesteps=1, features)
    X_reshaped = X.reshape(X.shape[0], 1, X.shape[1])
    
    # Create binary labels (1 for Neris/Rbot, 0 for others)
    # You need to adjust this based on your actual label column
    y = attack_df['Label']
    return X_reshaped, y

# Train Layer 2
def train_layer2_model(df_attack_layer_2):
    """Train LSTM+CNN model on temporal features"""
    # Separate features and label
    X = df_attack_layer_2.drop("label", axis=1).values.astype(np.float32)
    y = df_attack_layer_2["label"].values.astype(np.int32)

    # Reshape for LSTM/CNN (samples, timesteps=1, features)
    X_reshaped = X.reshape(X.shape[0], 1, X.shape[1])

    # Build model
    input_shape = (1, X.shape[1])   # (timesteps=1, features)
    model = build_lstm_cnn_model(input_shape)

    print("Training Layer 2 (LSTM+CNN)...")
    history = model.fit(
        X_reshaped, y,
        epochs=30,
        batch_size=32,
        validation_split=0.2,
        verbose=1
    )

    return model

# Train Layer 2 on attacks from Layer 1
# df_attack_layer_2=conversion_list_to_df(df_attack_layer_2,set_a_features)
layer2_model = train_layer2_model(df_attack_layer_2)


In [40]:
def predict_layer2(model, test_data, set_a_features):
    """Predict using Layer 2 model and filter samples with label prediction 2"""
    # Select Set A features from test data
    available_features = [f for f in set_a_features if f in test_data.columns]
    X_test = test_data[available_features].values
    
    # Reshape for model input
    X_reshaped = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
    
    # Get predictions (probabilities)
    predictions = model.predict(X_reshaped)
    
    # Convert to class labels (0, 1, 2)
    # Assuming: 0=benign, 1=Neris/Rbot, 2=other attacks
    predicted_labels = np.argmax(predictions, axis=1) if predictions.shape[1] > 1 else (predictions > 0.5).astype(int)
    
    # Filter samples where prediction is 2
    mask_label_2 = predicted_labels == 2
    layer2_attacks = test_data[mask_label_2].copy()
    
    print(f"Layer 2 detected {len(layer2_attacks)} attacks with label 2")
    
    return layer2_attacks, predicted_labels

# Usage
layer2_attacks_df, all_predictions = predict_layer2(layer2_model,attack_df_layer1, available_set_a)

[1m1160/1160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 978us/step
Layer 2 detected 0 attacks with label 2


In [45]:
%pip install RandomForestClassifier

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement RandomForestClassifier (from versions: none)
ERROR: No matching distribution found for RandomForestClassifier


In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
def train_layer3_rf(df_attack):
    available_set_b = [f for f in set_b_features if f in df_attack.columns]
    X = df_attack[available_set_b]
    # Encode labels
    le = LabelEncoder()
    y = le.fit_transform(df_attack['Label'])
    
    # Train Random Forest
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X, y)
    
    print(f"Trained Layer 3 RF on {len(df_attack)} samples, {len(available_set_b)} features")
    print(f"Classes: {le.classes_}")
    
    return rf_model, le
    
    return rf_model, le
rf_layer3,label_encoder  = train_layer3_rf(df_attack)

Trained Layer 3 RF on 38898 samples, 17 features
Classes: [1]


In [53]:
def predict_layer3(rf_model, label_encoder, layer2_output_df):
    """Predict specific botnet classes using Random Forest"""
    # Select Set B features
    X = layer2_output_df[set_b_features]
    
    # Predict classes
    predictions = rf_model.predict(X)
    predicted_classes = label_encoder.inverse_transform(predictions)
    
    # Add predictions to dataframe
    result_df = layer2_output_df.copy()
    result_df['Predicted_Class'] = predicted_classes
    result_df['Predicted_Label'] = predictions
    
    print(f"Layer 3 predictions: {np.unique(predicted_classes, return_counts=True)}")
    
    return result_df



# Predict using Layer 3
layer3_results = predict_layer3(rf_layer3, label_encoder, df_attack)


Layer 3 predictions: (array([1], dtype=int64), array([38898], dtype=int64))


In [54]:
def build_four_vaes(df_attack):
    """Build 4 separate VAEs for each botnet class in Layer 4"""
    # Get unique botnet classes from labels
    unique_classes = df_attack['Label'].unique()
    vae_models = {}
    scalers = {}
    
    for botnet_class in unique_classes:
        print(f"Training VAE for {botnet_class}...")
        
        # Filter data for this specific botnet class
        class_data = df_attack[df_attack['Label'] == botnet_class]
        
        # Select numeric features
        numeric_data = class_data.select_dtypes(include=[np.number])
        
        if len(numeric_data) == 0:
            print(f"No numeric data for {botnet_class}, skipping")
            continue
            
        # Scale data
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(numeric_data)
        
        # Build and train VAE
        input_dim = X_scaled.shape[1]
        vae = build_vae_simple(input_dim)
        vae.fit(X_scaled, X_scaled, epochs=30, batch_size=16, verbose=0)
        
        # Store model and scaler
        vae_models[botnet_class] = vae
        scalers[botnet_class] = scaler
        
        print(f"Trained VAE for {botnet_class} on {len(class_data)} samples")
    
    return vae_models, scalers

def layer4_zero_day_detection(layer3_results, vae_models, scalers, threshold=0.1):
    """Layer 4: Check if predictions match known patterns using VAEs"""
    results = []
    
    for idx, row in layer3_results.iterrows():
        predicted_class = row['Predicted_Class']
        actual_data = row[set_b_features].values.reshape(1, -1)
        
        if predicted_class in vae_models:
            # Get the VAE and scaler for this class
            vae = vae_models[predicted_class]
            scaler = scalers[predicted_class]
            
            # Scale the data
            data_scaled = scaler.transform(actual_data)
            
            # Get reconstruction
            reconstruction = vae.predict(data_scaled)
            
            # Calculate MAE
            mae = np.mean(np.abs(data_scaled - reconstruction))
            
            # If MAE is low, it's a known attack; if high, it's zero-day
            is_known_attack = mae <= threshold
            results.append({
                'index': idx,
                'predicted_class': predicted_class,
                'mae': mae,
                'is_known_attack': is_known_attack,
                'is_zero_day': not is_known_attack
            })
        else:
            # No VAE for this class - treat as zero-day
            results.append({
                'index': idx,
                'predicted_class': predicted_class,
                'mae': None,
                'is_known_attack': False,
                'is_zero_day': True
            })
    
    return pd.DataFrame(results)

# Build 4 VAEs for Layer 4
vae_models_layer4, scalers_layer4 = build_four_vaes(df_attack)

# Apply Layer 4 detection
layer4_results = layer4_zero_day_detection(layer3_results, vae_models_layer4, scalers_layer4)

print("Layer 4 Zero-day Detection Complete:")
print(f"Known attacks: {len(layer4_results[layer4_results['is_known_attack']])}")
print(f"Zero-day attacks: {len(layer4_results[layer4_results['is_zero_day']])}")

Training VAE for 1...
Trained VAE for 1 on 38898 samples




ValueError: X has 17 features, but StandardScaler is expecting 59 features as input.