In [4]:
import pandas as pd
import os

# Folder containing the CSV files to inspect; passed to check_columns_in_files() further down.
# NOTE(review): absolute, machine-specific path - adjust for the local environment.
folder_path = 'F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential'


In [3]:
# Column names that every CSV file is expected to provide.
# The order is significant: reports list present/missing columns in this order.
selected_features = [
    "EventID",
    "Version",
    "Level",
    "Task",
    "Opcode",
    "SystemTime",
    "EventRecordID",
    "Execution_ProcessID",
    "ProcessID",
    "ThreadID",
    "Channel",
    "Computer",
    "UserID",
    "RuleName",
    "UtcTime",
    "ProcessGuid",
    "ProcessId",
    "Image",
    "CommandLine",
    "CurrentDirectory",
    "ParentProcessGuid",
    "ParentProcessId",
    "ParentImage",
    "ParentCommandLine",
    "Protocol",
    "SourceIsIpv6",
    "SourceIp",
    "SourcePort",
    "DestinationIsIpv6",
    "DestinationIp",
    "DestinationPort",
    "Signed",
    "Signature",
    "SignatureStatus",
    "TargetFilename",
    "CreationUtcTime",
    "EventType",
    "TargetObject",
    "Details",
    "QueryName",
    "QueryStatus",
    "IsExecutable",
    "SourceProcessGuid",
    "TargetProcessGuid",
]

# Function to check columns in each CSV file
def check_columns_in_files(folder_path, features=None):
    """
    Report, for every CSV file in a folder, which expected columns it contains.

    Parameters:
        folder_path (str): Directory to scan (sub-directories are not visited).
        features (iterable of str, optional): Column names to look for.
            Defaults to the module-level `selected_features` list.

    For each CSV file, visited in sorted file-name order, prints a
    "File / Present columns / Missing columns" report. Returns None.
    """
    if features is None:
        features = selected_features
    features = list(features)  # allow one-shot iterables; we iterate twice per file

    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)

        # Only regular files with a .csv extension (any letter case);
        # a directory that happens to be named "*.csv" is skipped.
        if not (os.path.isfile(file_path) and file_name.lower().endswith('.csv')):
            continue

        try:
            # nrows=0 parses the header line only, so no data rows are read
            available_columns = set(pd.read_csv(file_path, nrows=0).columns)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header: report every feature as missing
            available_columns = set()

        # Determine which columns are present and which are missing
        present_columns = [col for col in features if col in available_columns]
        missing_columns = [col for col in features if col not in available_columns]

        # Print the results for each file
        print(f"\nFile: {file_name}")
        print(f"Present columns: {present_columns}")
        print(f"Missing columns: {missing_columns}")

# Run the column check on the folder configured in `folder_path`.
# The function prints one report per CSV file.
check_columns_in_files(folder_path)



File: LMD-2022 [870K Elements][Labelled].csv
Present columns: ['EventID', 'Version', 'Level', 'Task', 'Opcode', 'SystemTime', 'EventRecordID', 'Execution_ProcessID', 'ProcessID', 'ThreadID', 'Channel', 'Computer', 'UserID', 'RuleName', 'UtcTime', 'ProcessGuid', 'ProcessId', 'Image', 'CommandLine', 'CurrentDirectory', 'ParentProcessGuid', 'ParentProcessId', 'ParentImage', 'ParentCommandLine', 'Protocol', 'SourceIsIpv6', 'SourceIp', 'SourcePort', 'DestinationIsIpv6', 'DestinationIp', 'DestinationPort', 'Signed', 'Signature', 'SignatureStatus', 'TargetFilename', 'CreationUtcTime', 'EventType', 'TargetObject', 'Details', 'QueryName', 'QueryStatus', 'IsExecutable', 'SourceProcessGuid', 'TargetProcessGuid']
Missing columns: []

File: LMD-2023 [1.75M Elements][Labelled]checked.csv
Present columns: ['EventID', 'Version', 'Level', 'Task', 'Opcode', 'SystemTime', 'EventRecordID', 'Execution_ProcessID', 'ProcessID', 'ThreadID', 'Channel', 'Computer', 'UserID', 'RuleName', 'UtcTime', 'ProcessGuid

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer

def preprocess_chunk(data, category_maps=None):
    """
    Preprocess a single chunk of data according to the specified encoding and transformations.

    Every feature group below is optional: a listed column that is absent from
    the chunk is skipped, and columns not named in any group pass through
    unchanged. The input frame is not modified and the result keeps its index.

    Encodings (bit columns hold 0/1, most significant bit first):
        * low cardinality  -> integer category code as `<feature>_bin_0..7`
        * high cardinality -> hashed-token indicators `<feature>_hash_0..7`
        * textual          -> column kept; each value becomes a list of the
                              non-empty tokens obtained by splitting on '/' and '\\'
        * ID and port      -> low 16 bits of the value as `<feature>_bin_0..15`
        * IP address       -> dotted IPv4 address as `<feature>_bin_0..31`
        * binary           -> column kept; each value becomes 0 or 1

    A column listed as both high cardinality and textual ('ParentImage') gets
    its hash columns and is also kept in tokenised form.

    Parameters:
        data (DataFrame): A chunk of data to preprocess.
        category_maps (dict, optional): `{feature: {value: code}}` used for the
            low-cardinality features and updated in place. Pass the same dict
            for every chunk to give a value the same code in all chunks. When
            omitted, codes come from this chunk alone (distinct values in
            sorted order, then missing), so they can differ between chunks.
    Returns:
        DataFrame: The preprocessed chunk.
    """
    import zlib  # local import: only used to reduce non-numeric ID values

    # Define feature categories based on requirements

    # Low Cardinality Features (Categorical features with few unique values)
    low_cardinality_features = [
        'Level',            # log.level
        'Task',             # winlog.task
        'Opcode',           # winlog.opcode
        'Channel',          # winlog.channel
        'Signed',           # winlog.event_data.Signed
        'SignatureStatus'   # winlog.event_data.SignatureStatus
    ]

    # High Cardinality Features (Categorical features with many unique values)
    high_cardinality_features = [
        'Computer',         # winlog.computer_name
        'UserID',           # winlog.user.identifier
        'RuleName',         # winlog.event_data.RuleName
        'ProcessGuid',      # winlog.event_data.ProcessGuid
        'ParentProcessGuid',# winlog.event_data.ParentProcessGuid
        'ParentImage'       # winlog.event_data.ParentImage
    ]

    # Textual Features (Text data requiring tokenization, including paths)
    textual_features = [
        'message',          # Message field if available, not listed here but common
        'Image',            # winlog.event_data.Image
        'TargetFilename',   # winlog.event_data.TargetFilename
        'CommandLine',      # winlog.event_data.CommandLine
        'CurrentDirectory', # winlog.event_data.CurrentDirectory
        'ParentImage',      # winlog.event_data.ParentImage
        'ParentCommandLine' # winlog.event_data.ParentCommandLine
    ]

    # ID Features (Numerical IDs that can benefit from binary encoding)
    id_features = [
        'EventRecordID',        # winlog.record_id
        'Execution_ProcessID',  # winlog.process.pid
        'ProcessID',            # winlog.event_data.ProcessId
        'ThreadID',             # winlog.process.thread.id
        'SourceProcessGuid',    # winlog.event_data.SourceProcessGuid
        'TargetProcessGuid'     # winlog.event_data.TargetProcessGuid
    ]

    # Binary Features (True/False or binary values)
    binary_features = [
        'SourceIsIpv6',         # winlog.event_data.SourceIsIpv6
        'DestinationIsIpv6',    # winlog.event_data.DestinationIsIpv6
        'IsExecutable'          # winlog.event_data.IsExecutable
    ]

    # Network Features, split by kind because addresses and ports need different widths
    ip_features = [
        'SourceIp',             # winlog.event_data.SourceIp
        'DestinationIp'         # winlog.event_data.DestinationIp
    ]
    port_features = [
        'SourcePort',           # winlog.event_data.SourcePort
        'DestinationPort'       # winlog.event_data.DestinationPort
    ]

    def bit_columns(codes, feature, length):
        """Expand integer codes into `length` 0/1 columns named `<feature>_bin_<i>`."""
        # Masking guarantees exactly `length` bits per row, whatever the code size
        values = np.asarray(list(codes), dtype=np.int64) & ((1 << length) - 1)
        shifts = np.arange(length - 1, -1, -1, dtype=np.int64)
        bits = ((values[:, None] >> shifts) & 1).astype(np.uint8)
        names = [f"{feature}_bin_{i}" for i in range(length)]
        # Reusing data.index keeps rows aligned for chunks that do not start at 0
        return pd.DataFrame(bits, index=data.index, columns=names)

    def integer_code(value, length):
        """Reduce one raw ID/port value to an integer in [0, 2**length)."""
        if pd.isna(value):
            return 0
        try:
            number = int(value)
        except (TypeError, ValueError, OverflowError):
            try:
                number = int(float(value))  # e.g. the string "443.0"
            except (TypeError, ValueError, OverflowError):
                # Non-numeric text (e.g. a GUID string): CRC-32 gives a value
                # that is stable across runs, unlike the built-in hash()
                number = zlib.crc32(str(value).encode("utf-8"))
        return number & ((1 << length) - 1)

    def ipv4_code(value):
        """Pack a dotted IPv4 string into a 32-bit integer; anything else maps to 0."""
        octets = str(value).strip().split('.')
        if len(octets) != 4 or not all(o.isdecimal() and int(o) <= 255 for o in octets):
            return 0  # missing value, IPv6 address or malformed text
        first, second, third, fourth = (int(o) for o in octets)
        return (first << 24) | (second << 16) | (third << 8) | fourth

    def path_tokens(value):
        """Split a path-like value on '/' and '\\' and drop empty tokens."""
        if pd.isna(value):
            return []
        # NOTE(review): backslashes are treated as separators on the assumption
        # that these fields hold Windows-style paths - confirm against the data
        return [token for token in str(value).replace("\\", "/").split("/") if token]

    def flag(value):
        """Map a boolean-like value to 1 or 0 (missing counts as 0)."""
        if pd.isna(value):
            return 0
        if isinstance(value, str):
            return int(value.strip().lower() in ("true", "1", "yes"))
        return int(bool(value))

    if category_maps is None:
        category_maps = {}

    encoded_parts = []  # new column blocks, concatenated once at the end
    consumed = []       # source columns that are replaced by an encoded block

    # Low cardinality: integer category code, then its 8-bit binary expansion
    for feature in low_cardinality_features:
        if feature not in data.columns:
            continue
        column = data[feature]
        mapping = category_maps.setdefault(feature, {})
        observed = pd.unique(column.dropna()).tolist()
        try:
            observed.sort()
        except TypeError:
            observed.sort(key=str)  # mixed value types cannot be compared directly
        for value in observed:
            mapping.setdefault(value, len(mapping))
        if column.isna().any():
            mapping.setdefault(None, len(mapping))  # missing is its own category
        codes = [mapping[None] if pd.isna(value) else mapping[value] for value in column]
        encoded_parts.append(bit_columns(codes, feature, 8))
        consumed.append(feature)

    # High cardinality: hashed-token presence flags
    for feature in high_cardinality_features:
        if feature not in data.columns:
            continue
        # norm=None keeps the 0/1 values (the default 'l2' norm rescales each row);
        # alternate_sign=False turns off signed hashing
        hasher = HashingVectorizer(n_features=8, binary=True, norm=None, alternate_sign=False)
        text = ["" if pd.isna(value) else str(value) for value in data[feature]]
        hashed = hasher.transform(text).toarray().astype(np.uint8)
        names = [f"{feature}_hash_{i}" for i in range(hashed.shape[1])]
        encoded_parts.append(pd.DataFrame(hashed, index=data.index, columns=names))
        if feature not in textual_features:
            consumed.append(feature)

    # IDs and ports: low 16 bits of the value
    for feature in id_features + port_features:
        if feature in data.columns:
            codes = [integer_code(value, 16) for value in data[feature]]
            encoded_parts.append(bit_columns(codes, feature, 16))
            consumed.append(feature)

    # IP addresses: four octets packed into 32 bits
    for feature in ip_features:
        if feature in data.columns:
            codes = [ipv4_code(value) for value in data[feature]]
            encoded_parts.append(bit_columns(codes, feature, 32))
            consumed.append(feature)

    # drop() returns a new frame, so the caller's chunk stays untouched
    result = data.drop(columns=consumed)

    # Tokenize textual data for later positional encoding
    for feature in textual_features:
        if feature in result.columns:
            result[feature] = data[feature].map(path_tokens)

    # Convert binary features to 1 and 0
    for feature in binary_features:
        if feature in result.columns:
            result[feature] = data[feature].map(flag).astype(int)

    return pd.concat([result] + encoded_parts, axis=1)  # Return the preprocessed chunk


In [None]:
# Source folder with the CSV files to preprocess and destination folder for the per-chunk output.
# NOTE(review): absolute, machine-specific paths; this cell has no imports of its own and
# relies on `os` and `pd` having been imported by earlier cells.
input_folder = 'F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential'
output_folder = 'F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential_preprocessed'

# Ensure output directory exists (exist_ok=True makes re-running the cell harmless)
os.makedirs(output_folder, exist_ok=True)

def load_and_process_in_chunks(input_folder, output_folder, chunk_size=100000):
    """
    Load and preprocess data in chunks from CSV files, applying transformations,
    and save the processed chunks to the output folder.

    Parameters:
        input_folder (str): Directory scanned (non-recursively) for CSV files.
        output_folder (str): Directory handed to save_processed_chunk() for output.
        chunk_size (int): Maximum number of rows read per chunk.

    Files are handled in sorted name order and chunks are numbered from 0 per
    file. Each chunk goes through preprocess_chunk() and is then written by
    save_processed_chunk(); a progress line is printed per chunk.
    """
    # Regular files with a .csv extension (any letter case), in a stable order
    csv_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith('.csv') and os.path.isfile(os.path.join(input_folder, f))
    )

    # Process each file in the input folder
    for file_name in csv_files:
        file_path = os.path.join(input_folder, file_name)

        # The chunked reader holds the file open; the context manager closes it
        # even when preprocessing or saving raises part-way through the file.
        with pd.read_csv(file_path, chunksize=chunk_size) as reader:
            for chunk_number, chunk in enumerate(reader):
                # Apply preprocessing to the chunk
                processed_chunk = preprocess_chunk(chunk)

                # Save processed chunk to output folder
                save_processed_chunk(processed_chunk, output_folder, file_name, chunk_number)

                print(f"Processed and saved chunk {chunk_number} for file {file_name}")


def save_processed_chunk(processed_chunk, output_folder, original_file_name, chunk_number):
    """
    Save the processed chunk to the output folder with a chunk identifier in the file name.

    Parameters:
        processed_chunk (DataFrame): Data to write (the index is not written).
        output_folder (str): Destination directory; created if it does not exist.
        original_file_name (str): Name of the source file; its extension and any
            directory part are dropped when building the output name.
        chunk_number (int): Chunk identifier embedded in the output name.
    Returns:
        str: Path of the CSV file that was written.
    """
    # Create the destination on demand so the function does not depend on the
    # caller having prepared the folder beforehand
    os.makedirs(output_folder, exist_ok=True)

    # Construct a unique file name for each chunk; basename() keeps the result
    # inside output_folder even if a path is passed instead of a bare name
    base_name = os.path.splitext(os.path.basename(original_file_name))[0]
    output_file_name = f"{base_name}_processed_chunk_{chunk_number}.csv"
    output_path = os.path.join(output_folder, output_file_name)

    # Save the processed chunk as a CSV
    processed_chunk.to_csv(output_path, index=False)
    print(f"Saved processed chunk to {output_path}")
    return output_path

# Execute the function to process and save all files in chunks, using the
# input/output folders defined at the top of this cell and the default chunk size.
load_and_process_in_chunks(input_folder, output_folder)

In [5]:
import os
import pandas as pd

# Folder whose CSV headers are listed by this cell
folder_path = "F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential_processed"

# Header column names, keyed by the full path of each file that could be read
file_column_names = {}

# Full paths of every entry in the folder whose name ends with '.csv'
csv_names = [name for name in os.listdir(folder_path) if name.endswith('.csv')]
csv_files = [os.path.join(folder_path, name) for name in csv_names]

# Read each header; a file that cannot be read is reported and left out of the results
for file_index, file_path in enumerate(csv_files, start=1):
    print(f"Processing File {file_index}/{len(csv_files)}: {file_path}")
    try:
        # nrows=0 loads the header row without any data rows
        column_names = pd.read_csv(file_path, nrows=0).columns.tolist()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
    else:
        file_column_names[file_path] = column_names

# Show the collected column names, one file after another
for file, columns in file_column_names.items():
    print(f"\nFile: {file}\nColumns: {columns}")


Processing File 1/4: F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential_processed\LMD-2022 [870K Elements][Labelled].csv
Processing File 2/4: F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential_processed\LMD-2023 [1.75M Elements][Labelled]checked.csv
Processing File 3/4: F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential_processed\LMD-2023 [1.87M Elements][Labelled]checked.csv
Processing File 4/4: F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential_processed\LMD-2023 [2.3M Elements][Labelled]checked.csv

File: F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential_processed\LMD-2022 [870K Elements][Labelled].csv
Columns: ['SystemTime', 'UtcTime', 'Image', 'Description', 'Product', 'Company', 'CommandLine', 'CurrentDirectory', 'ParentImage', 'ParentCommandLine', 'Initiated', 'SourceIsIpv6', 'DestinationIsIpv6',

In [3]:
import tensorflow as tf

# Query the visible GPU devices once and derive both reports from that list.
# tf.test.is_gpu_available() is deprecated; TensorFlow's documentation points to
# tf.config.list_physical_devices('GPU') as the replacement.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
print("Is GPU available: ", bool(gpu_devices))


Num GPUs Available:  0
Is GPU available:  False
