In [16]:
import pandas as pd
import numpy as np
import os
import mmh3  # For MurmurHash
from sklearn.preprocessing import MinMaxScaler
import re
import ipaddress
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize TF-IDF vectorizer globally to ensure consistency across chunks
tfidf_vectorizer = TfidfVectorizer(max_features=300)  # Limit to 100 features for scalability
from transformers import BertTokenizer, BertModel
import torch

In [17]:
# Path to the folder containing the CSV files
folder_path = "F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential"
output_folder_path = "F:/Intrusion detection datasets/Lateral-Movement-Dataset--LMD_Collections/LMD_essential_processed"

# Parameters for encoding
HASH_BITS = 16  # For high cardinality features
BINARY_BITS_LOW = 8  # For low cardinality features
BINARY_BITS_HIGH = 16  # For high cardinality features

# List of features to drop
features_to_drop = ["Opcode", "Correlation", "Name", "Keywords", "Channel", "Archived"]

# Initialize MinMaxScaler for temporal normalization
scaler = MinMaxScaler()


In [18]:
def preprocess_temporal_features(df):
    temporal_features = ["SystemTime"]
    duration_features = ["PreviousCreationUtcTime", "UtcTime", "CreationUtcTime"]

    # Process datetime-based temporal features
    for feature in temporal_features:
        if feature in df.columns:
            # Convert to UNIX timestamp if not already in numeric form
            df[feature] = pd.to_datetime(df[feature], errors='coerce').astype(np.int64) // 10**9

    # Process duration-based temporal features
    def convert_to_seconds(time_str):
        try:
            # Ensure the value is a string before attempting to split
            if isinstance(time_str, str):
                minutes, seconds = map(float, time_str.split(':'))
                return minutes * 60 + seconds
            else:
                return None  # Handle non-string values gracefully
        except ValueError:
            return None  # Handle invalid formats gracefully


    for feature in duration_features:
        if feature in df.columns:
            df[feature] = df[feature].apply(convert_to_seconds)

    # Combine temporal and duration features for normalization
    all_temporal_features = [f for f in temporal_features + duration_features if f in df.columns]

    # Normalize all temporal features
    if all_temporal_features:
        df[all_temporal_features] = scaler.fit_transform(df[all_temporal_features])
    
    return df


In [19]:
# Function to preprocess path features
def preprocess_path_features(df):
    path_features = [
        "Image", "ParentImage", "SourceImage", "TargetImage", "SourceUser", "TargetUser",
        "CommandLine", "ParentCommandLine", "CurrentDirectory", "ImageLoaded",
        "TargetFilename", "TargetObject", "PipeName", "CallTrace", "Details"
    ]

    def tokenize_path(path):
        try:
            # Handle NaN or empty values by returning "0"
            if pd.isna(path) or not path:
                return "0"
            # Convert to lowercase
            path = path.lower()
            # Split by common delimiters (\\, /, and .)
            tokens = re.split(r"[\\/\.]+", path)
            # Filter out empty tokens
            return " ".join([token for token in tokens if token])
        except Exception:
            return "0"  # Return "0" in case of unexpected errors

    for feature in path_features:
        if feature in df.columns:
            df[feature] = df[feature].astype(str).apply(tokenize_path)

    return df


In [20]:
# Function to preprocess IP address features
def preprocess_ip_address_features(df, prefix_length=24, hash_bits=16):
    """
    Preprocesses IP address features by generating two hashes:
    - One for the full IP address
    - One for the network prefix (based on CIDR prefix length)
    Drops the original columns after processing.

    Args:
        df (pd.DataFrame): Input DataFrame.
        prefix_length (int): Length of the network prefix for CIDR notation.
        hash_bits (int): Number of bits for hashing.

    Returns:
        pd.DataFrame: Processed DataFrame with hashed features.
    """
    ip_features = ["SourceIp", "DestinationIp"]

    def get_network_prefix(ip, prefix_length):
        """Extract the network prefix from an IP address."""
        try:
            network = ipaddress.ip_network(f"{ip}/{prefix_length}", strict=False)
            return str(network.network_address)
        except ValueError:
            return "0.0.0.0"  # Default for invalid IPs

    for feature in ip_features:
        if feature in df.columns:
            # Generate hash for the full IP address
            df[f"{feature}_hash"] = df[feature].astype(str).apply(
                lambda x: mmh3.hash(x, signed=False) % (2 ** hash_bits)
            )
            
            # Extract network prefixes
            df[f"{feature}_prefix"] = df[feature].astype(str).apply(
                lambda x: get_network_prefix(x, prefix_length=prefix_length)
            )
            
            # Generate hash for the network prefix
            df[f"{feature}_prefix_hash"] = df[f"{feature}_prefix"].apply(
                lambda x: mmh3.hash(x, signed=False) % (2 ** hash_bits)
            )
            
            # Drop the original IP column and intermediate prefix column
            df = df.drop(columns=[feature, f"{feature}_prefix"])

    return df


In [21]:
# # Function to preprocess textual features
# def preprocess_textual_features(df):
#     textual_features = [ "Description", "Product", "Company"]
#     for feature in textual_features:
#         if feature in df.columns:
#             # Apply TF-IDF vectorization
#             tfidf_vectors = tfidf_vectorizer.fit_transform(df[feature].fillna("").astype(str)).toarray()
#             tfidf_columns = [f"{feature}_tfidf_{i}" for i in range(tfidf_vectors.shape[1])]
#             tfidf_df = pd.DataFrame(tfidf_vectors, columns=tfidf_columns)
#             df = df.drop(columns=[feature]).join(tfidf_df)
#     return df


In [22]:
# Function to preprocess numerical features as categorical
def preprocess_numerical_features_as_categorical(df, hash_bits=16):
    """
    Converts numerical features into hashed columns without binary encoding.
    """
    numerical_features = [
        "EventID", "ProcessID", "ThreadID", "Execution_ProcessID", 
        "ParentProcessId", "TargetProcessId", "EventRecordID", 
        "SourcePort", "DestinationPort", "ProcessId"
    ]

    for feature in numerical_features:
        if feature in df.columns:
            # Convert strings to numeric values, handling NaN and invalid values
            df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0).astype(int)

            # Hash the feature values
            df[f"{feature}_hash"] = df[feature].astype(str).apply(
                lambda x: mmh3.hash(x, signed=False) % (2 ** hash_bits)
            )

            # Drop the original feature column
            df = df.drop(columns=[feature])

    return df

In [23]:
# Function to preprocess binary features into a single column
def preprocess_binary_features(df):
    binary_features = ["SourceIsIpv6", "DestinationIsIpv6", "Signed", "IsExecutable", "Initiated","Signed","SignatureStatus","State"]
    for feature in binary_features:
        if feature in df.columns:
            # Standardize values to 0/1
            df[feature] = df[feature].fillna("0")  # Replace NaN with "0"
            df[feature] = df[feature].astype(str).str.lower().apply(
                lambda x: 1 if x in ["true", "1", "yes","Started","Valid"] else 0
            )
    return df


In [24]:
# Function to preprocess low-cardinality categorical features
def preprocess_low_card_categorical_features(df, hash_bits=16):
    """
    Preprocess low-cardinality categorical features by hashing their values and storing the hash in a single column.
    """
    low_card_features = [
        "Level", "Task", "Protocol", "Version2", "SchemaVersion", 
        "Signature", "EventType", "StartFunction", "ID", "Configuration", 
        "ConfigurationFileHash", "IntegrityLevel", "UserID", "Computer", 
        "RuleName", "TerminalSessionId", "Version", "User", "ParentUser","FileVersion"
    ]
    
    for feature in low_card_features:
        if feature in df.columns:
            # Ensure all values are strings and handle missing values
            df[feature] = df[feature].fillna("0").astype(str)
            
            # Apply hashing
            df[f"{feature}_hash"] = df[feature].apply(
                lambda x: mmh3.hash(x, signed=False) % (2 ** hash_bits)
            )
            
            # Drop the original feature column
            df = df.drop(columns=[feature])
    
    return df


In [25]:
# Function to preprocess high-cardinality categorical features
def preprocess_high_card_categorical_features(df, hash_bits=16):
    """
    Preprocess high-cardinality categorical features by hashing their values and storing the hash in a single column.
    """
    high_card_features = [
        "Guid", "ProcessGuid", "ParentProcessGuid", "SourceProcessGUID", "TargetProcessGUID", "SourceProcessGuid",
        "LogonGuid", "ThreadID", "LogonId", "SourceHostname", "OriginalFileName", "TargetProcessGuid",
        "DestinationHostname", "SourcePortName", "DestinationPortName", "StartAddress", "StartModule",
        "NewThreadId", "GrantedAccess", "QueryName", "QueryResults", "Hashes", "Hash", "Contents"
    ]

    for feature in high_card_features:
        if feature in df.columns:
            # Apply MurmurHash to hash feature values
            df[f"{feature}_hash"] = df[feature].astype(str).apply(
                lambda x: mmh3.hash(x, signed=False) % (2 ** hash_bits)
            )
            # Drop the original feature column
            df = df.drop(columns=[feature])

    return df



In [26]:
def preprocess_all_features(df):
    df = preprocess_temporal_features(df)  # Temporal Features
    print("Temporal Features")
    df = preprocess_low_card_categorical_features(df)  # Low Cardinality Features
    print("Low Cardinality Features")
    df = preprocess_high_card_categorical_features(df)  # High Cardinality Features
    print("High Cardinality Features")
    df = preprocess_ip_address_features(df)  # IP Address Features
    print("IP Address Features")
    df = preprocess_path_features(df) # Path Features
    print("Path Features")
    # df = preprocess_textual_features(df)  # Textual Features
    # print("Textual")
    df = preprocess_numerical_features_as_categorical(df)  # Numerical Features
    print("Numerical")
    df = preprocess_binary_features(df)  # Binary Features
    print("Binary")
    return df


In [27]:
def process_files_in_chunks(input_folder, output_folder):
    all_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
    
    for file in all_files:
        output_file = os.path.join(output_folder, os.path.basename(file))
        chunk_iter = pd.read_csv(file, chunksize=100000)  # Adjust chunksize based on memory
        
        for chunk_idx, chunk in enumerate(chunk_iter):
            # Print progress information
            print(f"Processing file: {os.path.basename(file)}, Chunk number: {chunk_idx + 1}")
            
            # Drop unnecessary features
            chunk.drop(columns=features_to_drop, inplace=True, errors='ignore')
            
            # Preprocess all feature types using the unified function
            chunk = preprocess_all_features(chunk)
            
            # Save processed chunk to output CSV
            if chunk_idx == 0:
                chunk.to_csv(output_file, mode='w', index=False)
            else:
                chunk.to_csv(output_file, mode='a', index=False, header=False)
            


In [28]:
# Execute preprocessing
process_files_in_chunks(folder_path, output_folder_path)

  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2022 [870K Elements][Labelled].csv, Chunk number: 1


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary
Processing file: LMD-2022 [870K Elements][Labelled].csv, Chunk number: 2


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2022 [870K Elements][Labelled].csv, Chunk number: 3


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2022 [870K Elements][Labelled].csv, Chunk number: 4


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary
Processing file: LMD-2022 [870K Elements][Labelled].csv, Chunk number: 5


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2022 [870K Elements][Labelled].csv, Chunk number: 6


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2022 [870K Elements][Labelled].csv, Chunk number: 7


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2022 [870K Elements][Labelled].csv, Chunk number: 8


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2022 [870K Elements][Labelled].csv, Chunk number: 9
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 1
Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary
Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 2
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 3
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 4


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 5
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 6
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 7


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 8
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 9
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 10
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 11


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 12


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 13
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 14


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 15


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 16
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 17


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.75M Elements][Labelled]checked.csv, Chunk number: 18
Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 1
Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary
Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 2
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 3
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 4


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 5
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 6
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 7


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 8


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 9
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 10
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 11


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 12


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 13
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 14


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 15


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 16
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 17


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 18


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 19
Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary
Processing file: LMD-2023 [1.87M Elements][Labelled]checked.csv, Chunk number: 20
Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 1
Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 2


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 3
Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 4


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 5
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 6


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 7


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 8


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 9


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 10


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 11


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 12
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary
Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 13
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 14


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary
Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 15
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 16


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 17


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 18
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 19
Temporal Features


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 20


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 21


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary


  for chunk_idx, chunk in enumerate(chunk_iter):
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Processing file: LMD-2023 [2.3M Elements][Labelled]checked.csv, Chunk number: 22
Temporal Features
Low Cardinality Features
High Cardinality Features
IP Address Features
Path Features
Numerical
Binary
