In [1]:
import pandas as pd
import re
import os
from scipy.stats import entropy

In [16]:
# Colab-specific setup: mount Google Drive at /content/drive so the dataset
# folders referenced later in the notebook can be read from (and written to) Drive.
# force_remount=True is passed explicitly to drive.mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [17]:

def read_and_combine_csv_files(directory_path):
    """
    Read every CSV file located directly inside ``directory_path`` into one DataFrame.

    A single CSV file is returned as read; several CSV files are concatenated
    row-wise with a fresh 0..n-1 index. Files are processed in sorted file-name
    order so the row order of the result is reproducible.

    When the columns 'process.executable' / 'process.parent.executable' exist,
    the columns 'process.name' / 'process.parent.name' are added, holding the
    last backslash-separated component of the path. Non-string values (for
    example NaN produced by empty cells) are passed through unchanged.

    Args:
        directory_path (str): Path to the directory containing the CSV file(s).

    Returns:
        pd.DataFrame: The combined data, plus the derived name columns when applicable.

    Raises:
        ValueError: If the directory contains no file ending in '.csv'.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the result deterministic.
    csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))

    if not csv_files:
        raise ValueError("No CSV files found in the specified directory.")

    frames = [pd.read_csv(os.path.join(directory_path, f)) for f in csv_files]
    df = frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)

    def _last_path_component(path):
        # Empty CSV cells arrive as NaN (a float); calling .split on them would raise.
        return path.split('\\')[-1] if isinstance(path, str) else path

    derived_name_columns = (
        ('process.executable', 'process.name'),
        ('process.parent.executable', 'process.parent.name'),
    )
    for source_column, name_column in derived_name_columns:
        if source_column in df.columns:
            df[name_column] = df[source_column].apply(_last_path_component)

    return df

In [18]:
def read_csvs_from_subfolders(root_folder):
    """
    Build one DataFrame per immediate subfolder of ``root_folder``.

    Every directory entry of ``root_folder`` that is itself a directory is handed
    to `read_and_combine_csv_files`. A subfolder for which that call raises
    ValueError is reported with a printed warning and left out of the result.
    Entries that are not directories are ignored.

    Args:
        root_folder (str): Path to the root folder containing subfolders with CSV files.

    Returns:
        list: DataFrames in the order the subfolders were listed; skipped
              subfolders contribute no element.
    """
    collected = []

    for entry_name in os.listdir(root_folder):
        subfolder_path = os.path.join(root_folder, entry_name)

        # Loose files in the root folder are not datasets; move on.
        if not os.path.isdir(subfolder_path):
            continue

        try:
            collected.append(read_and_combine_csv_files(subfolder_path))
        except ValueError as e:
            print(f"Warning: {e} in subfolder {subfolder_path}. Skipping this subfolder.")

    return collected


In [19]:
# Root folder on the mounted Drive; its subfolders are scanned for CSV files.
root_folder = '/content/drive/MyDrive/Colab Notebooks/paper'
# List of DataFrames, one per subfolder for which CSV files were found.
dataframes = read_csvs_from_subfolders(root_folder)


In [20]:
import os
import pandas as pd
from scipy.stats import entropy

# Return an independent deep copy of the dataset (the random-sampling step is currently commented out)
def load_and_sample_data(df):
    """
    Return a deep copy of ``df`` so later steps can add columns without
    touching the caller's DataFrame.

    Sampling is currently disabled (it was ``.sample(frac=1.0)``); only the copy is made.
    """
    duplicate = df.copy(deep=True)
    return duplicate

# Clean the data by removing columns with identical values or high missing values
def clean_data(df, threshold=0.9):
    """
    Drop uninformative columns and return the remaining ones as a new DataFrame.

    A column is dropped when either
      * it has exactly one distinct non-missing value (missing values are not
        counted, so a column holding one value plus NaNs is dropped as well), or
      * the fraction of missing values in it is strictly greater than ``threshold``.

    Args:
        df (pd.DataFrame): Input data; it is not modified.
        threshold (float): Maximum tolerated fraction of missing values per column.

    Returns:
        pd.DataFrame: ``df`` without the dropped columns.
    """
    distinct_per_column = df.nunique()
    missing_share_per_column = df.isna().mean()

    single_valued = distinct_per_column[distinct_per_column == 1].index
    mostly_missing = missing_share_per_column[missing_share_per_column > threshold].index

    unwanted = set(single_valued) | set(mostly_missing)
    return df.drop(columns=unwanted)

# Engineer features from the 'event.action' column
def engineer_event_action_features(df, engineered_features):
    """
    Append one-hot indicator columns for a fixed list of 'event.action' values.

    Every action in ``selected_actions`` always gets its own 0/1 column, even when
    it never occurs in ``df``, so the feature schema is the same for every dataset.
    Columns are ordered by the sorted full action string and named after the part
    before ' (rule', with spaces replaced by underscores (e.g. 'File_created').
    Rows whose action is not in the list (or is missing) are 0 in all six columns.

    Side effect: adds the helper column 'event.action_filtered' to ``df``.

    Args:
        df (pd.DataFrame): Data containing an 'event.action' column.
        engineered_features (pd.DataFrame): Features collected so far.

    Returns:
        pd.DataFrame: ``engineered_features`` with the indicator columns appended.
    """
    selected_actions = [
        'File created (rule: FileCreate)', 'File Delete archived (rule: FileDelete)',
        'File creation time changed (rule: FileCreateTime)', 'Registry value set (rule: RegistryEvent)',
        'Process Create (rule: ProcessCreate)', 'Pipe Created (rule: PipeEvent)'
    ]

    df['event.action_filtered'] = df['event.action'].apply(lambda x: x if x in selected_actions else None)

    # get_dummies only creates columns for values that are present; reindex to the
    # full (sorted) action list so absent actions still yield an all-zero column.
    one_hot_encoded_actions = (
        pd.get_dummies(df['event.action_filtered'])
        .reindex(columns=sorted(selected_actions), fill_value=0)
        .astype(int)
    )

    short_column_names = {action: action.split(' (rule')[0].replace(' ', '_') for action in selected_actions}
    engineered_features = pd.concat(
        [engineered_features, one_hot_encoded_actions.rename(columns=short_column_names)], axis=1
    )
    return engineered_features

# Engineer event-type group features from the 'event.category' column
def engineer_event_type_features(df, engineered_features):
    """
    Append one-hot indicator columns for coarse groups of 'event.category'.

    Each 'event.category' value is mapped to a group ('process-related',
    'file-related', 'network-related', 'error-related', 'driver-related',
    'registry-related', otherwise 'miscellaneous'). Five of those groups are
    emitted as 0/1 columns, always in the same order; a group that does not occur
    in ``df`` yields an all-zero column instead of raising KeyError.
    'error-related' and 'miscellaneous' are computed but not emitted.

    Side effect: adds the helper column 'event_type_group' to ``df``.

    Args:
        df (pd.DataFrame): Data containing an 'event.category' column.
        engineered_features (pd.DataFrame): Features collected so far.

    Returns:
        pd.DataFrame: ``engineered_features`` with the five group columns appended.
    """
    # Raw 'event.category' value -> group label.
    group_by_category = {
        'process': 'process-related',
        'file': 'file-related',
        'network': 'network-related',
        'error': 'error-related',
        'driver': 'driver-related',
        # NOTE(review): this is one combined string, not two separate values;
        # presumably it matches a multi-valued category exported as a joined
        # string. Confirm against the input data.
        'configuration, registry': 'registry-related',
    }
    emitted_groups = ['process-related', 'file-related', 'network-related', 'driver-related', 'registry-related']

    def categorize_event_type(event_type):
        # Non-string values (e.g. NaN for a missing category) never match a known category.
        if isinstance(event_type, str):
            return group_by_category.get(event_type, 'miscellaneous')
        return 'miscellaneous'

    df['event_type_group'] = df['event.category'].apply(categorize_event_type)

    # Reindex so that every emitted group exists even when it is absent from the data.
    event_type_encoded = (
        pd.get_dummies(df['event_type_group'])
        .reindex(columns=emitted_groups, fill_value=0)
        .astype(int)
    )
    engineered_features = pd.concat([engineered_features, event_type_encoded], axis=1)
    return engineered_features

# Engineer features from 'process.executable' column
def engineer_process_executable_features(df, engineered_features):
    """
    Derive four path-based features from 'process.executable'.

    Features written to ``engineered_features``:
      * suspicious_path   - 1 if the path contains any of the suspicious keywords (substring match).
      * system_executable - 1 if the path starts with one of the trusted directories.
      * path_length       - number of characters in the path.
      * directory_depth   - number of backslashes in the path.
    Non-string paths (e.g. NaN) give 0 for every feature.

    Side effect: the same four columns are also stored on ``df`` (the two flags
    unconverted, i.e. as returned by the per-row checks).

    Args:
        df (pd.DataFrame): Data containing a 'process.executable' column.
        engineered_features (pd.DataFrame): Features collected so far.

    Returns:
        pd.DataFrame: ``engineered_features`` with the four columns set.
    """
    suspicious_keywords = ["Temp", "AppData", "Roaming", "Startup", "Downloads", "ProgramData", "Users"]
    trusted_dirs = ['C:\\Windows\\System32', 'C:\\Program Files', 'C:\\Windows']

    def mentions_suspicious_keyword(path):
        if not isinstance(path, str):
            return 0
        return any(keyword in path for keyword in suspicious_keywords)

    def starts_in_trusted_dir(path):
        if not isinstance(path, str):
            return 0
        return path.startswith(tuple(trusted_dirs))

    def character_count(path):
        return len(path) if isinstance(path, str) else 0

    def backslash_count(path):
        return path.count('\\') if isinstance(path, str) else 0

    executable_paths = df['process.executable']
    per_row_checks = (
        ('suspicious_path', mentions_suspicious_keyword),
        ('system_executable', starts_in_trusted_dir),
        ('path_length', character_count),
        ('directory_depth', backslash_count),
    )
    for column_name, check in per_row_checks:
        df[column_name] = executable_paths.apply(check)

    # The two flags may hold a mix of booleans and 0, so normalise them to integers.
    for flag_column in ('suspicious_path', 'system_executable'):
        engineered_features[flag_column] = df[flag_column].astype(int)
    for numeric_column in ('path_length', 'directory_depth'):
        engineered_features[numeric_column] = df[numeric_column]

    return engineered_features

# Engineer features from 'process.name' and 'process.parent.name'
def engineer_process_name_features(df, engineered_features):
    """
    Derive frequency and length features from 'process.name' and 'process.parent.name'.

    Features written to ``engineered_features``:
      * process_vs_parent_freq_ratio - occurrences of the row's process name in ``df``
        divided by (occurrences of the row's parent name + 1). The result is NaN when
        either name is missing, because missing names have no count.
      * process_name_length - character count of the process name, 0 for non-strings.

    Side effect: adds 'process_name_freq', 'parent_name_freq',
    'process_vs_parent_freq_ratio' and 'process_name_length' to ``df``.

    Args:
        df (pd.DataFrame): Data containing 'process.name' and 'process.parent.name'.
        engineered_features (pd.DataFrame): Features collected so far.

    Returns:
        pd.DataFrame: ``engineered_features`` with the two columns set.
    """
    names = df['process.name']
    parent_names = df['process.parent.name']

    def name_length(value):
        if isinstance(value, str):
            return len(value)
        return 0

    # Look up, per row, how often that row's name occurs in the whole frame.
    df['process_name_freq'] = names.map(names.value_counts())
    df['parent_name_freq'] = parent_names.map(parent_names.value_counts())

    smoothed_parent_freq = df['parent_name_freq'] + 1
    df['process_vs_parent_freq_ratio'] = df['process_name_freq'] / smoothed_parent_freq
    df['process_name_length'] = names.apply(name_length)

    for feature in ('process_vs_parent_freq_ratio', 'process_name_length'):
        engineered_features[feature] = df[feature]

    return engineered_features

# Engineer additional process and parent process features
def engineer_parent_process_features(df, engineered_features):
    """
    Derive features that relate a process executable to its parent executable.

    Features written to ``engineered_features``:
      * executable_depth_diff - absolute difference between the backslash counts
        of 'process.executable' and 'process.parent.executable'.
      * parent_is_system_executable - 1 if the parent path contains one of the
        system directories (substring match), else 0.
      * extension_similarity - 1 if both paths have the same extension, else 0.
        The extension is the text after the last '.' in the whole path, or None
        when the path is not a string or has no '.'.
        NOTE(review): two missing extensions compare equal (None == None), so a
        row where both paths are missing gets 1. Confirm this is intended.

    Side effect: adds 'executable_depth', 'parent_executable_depth',
    'executable_depth_diff', 'parent_is_system_executable', 'process_extension',
    'parent_process_extension' and 'extension_similarity' to ``df``.

    Args:
        df (pd.DataFrame): Data containing 'process.executable' and
            'process.parent.executable'. May have zero rows.
        engineered_features (pd.DataFrame): Features collected so far.

    Returns:
        pd.DataFrame: ``engineered_features`` with the three columns set.
    """
    system_directories = ['C:\\Windows\\System32', 'C:\\Program Files']

    def _depth(path):
        return path.count('\\') if isinstance(path, str) else 0

    def _extension(path):
        return path.split('.')[-1] if isinstance(path, str) and '.' in path else None

    def _in_system_directory(path):
        is_system = isinstance(path, str) and any(system_dir in path for system_dir in system_directories)
        return 1 if is_system else 0

    executables = df['process.executable']
    parent_executables = df['process.parent.executable']

    df['executable_depth'] = executables.apply(_depth)
    df['parent_executable_depth'] = parent_executables.apply(_depth)
    df['executable_depth_diff'] = (df['executable_depth'] - df['parent_executable_depth']).abs()

    df['parent_is_system_executable'] = parent_executables.apply(_in_system_directory)

    df['process_extension'] = executables.apply(_extension)
    df['parent_process_extension'] = parent_executables.apply(_extension)

    # Pairwise Python comparison instead of a row-wise DataFrame.apply: same
    # equality semantics, no per-row Series construction, and it still yields a
    # plain (empty) column when the frame has no rows.
    df['extension_similarity'] = [
        1 if own_ext == parent_ext else 0
        for own_ext, parent_ext in zip(df['process_extension'], df['parent_process_extension'])
    ]

    engineered_features['executable_depth_diff'] = df['executable_depth_diff']
    engineered_features['parent_is_system_executable'] = df['parent_is_system_executable']
    engineered_features['extension_similarity'] = df['extension_similarity']

    return engineered_features

# Shannon entropy of a string's characters (used below for the 'file.name' column)
def calculate_entropy(string):
    """
    Return the base-2 Shannon entropy of the character distribution of ``string``.

    An empty (falsy) input yields 0. Otherwise each distinct character's relative
    frequency is passed to ``scipy.stats.entropy`` with ``base=2``.
    """
    if not string:
        return 0

    total_chars = len(string)
    distinct_chars = set(string)
    probabilities = []
    for char in distinct_chars:
        probabilities.append(float(string.count(char)) / total_chars)

    return entropy(probabilities, base=2)

def engineer_file_name_features(df, engineered_features):
    """
    Add the character entropy of 'file.name' as the feature 'file_name_entropy'.

    Missing file names are replaced by the empty string before the entropy is
    computed with `calculate_entropy`, so they score 0.

    Side effect: the column 'file_name_entropy' is also stored on ``df``.

    Args:
        df (pd.DataFrame): Data containing a 'file.name' column.
        engineered_features (pd.DataFrame): Features collected so far.

    Returns:
        pd.DataFrame: ``engineered_features`` with 'file_name_entropy' set.
    """
    file_names = df['file.name'].fillna('')
    df['file_name_entropy'] = file_names.apply(calculate_entropy)

    engineered_features['file_name_entropy'] = df['file_name_entropy']
    return engineered_features

# Main feature engineering function
def engineer_features(df):
    """
    Run the complete feature-engineering pipeline on one raw DataFrame.

    Steps: copy the input, clean it with `clean_data`, run every
    ``engineer_*_features`` step, then append the label columns.

    `clean_data` removes constant and mostly-missing columns. That can include
    columns the steps below index unconditionally (for example 'target-class'
    when a dataset holds a single class), which used to end in a KeyError. Such
    required columns are therefore copied back from the uncleaned data.

    Args:
        df (pd.DataFrame): Raw event data. It is not modified.

    Returns:
        pd.DataFrame: The engineered features followed by 'target-class', then
        'target-class-name' (or, if that is absent, 'target') and 'phase' when
        those columns survive cleaning.

    Raises:
        KeyError: If a required input column is missing from ``df`` itself.
    """
    # Input columns that the feature steps and the label step access without a presence check.
    required_columns = [
        'event.action', 'event.category',
        'process.executable', 'process.parent.executable',
        'process.name', 'process.parent.name',
        'file.name', 'target-class',
    ]

    df_source = load_and_sample_data(df)
    df_cleaned = clean_data(df_source)

    # Put back required inputs that cleaning dropped; columns absent from the
    # source are left alone so the original KeyError still signals bad input.
    for column in required_columns:
        if column in df_source.columns and column not in df_cleaned.columns:
            df_cleaned[column] = df_source[column]

    engineered_features = pd.DataFrame()

    # Apply all feature engineering steps
    engineered_features = engineer_event_action_features(df_cleaned, engineered_features)
    engineered_features = engineer_event_type_features(df_cleaned, engineered_features)
    engineered_features = engineer_process_executable_features(df_cleaned, engineered_features)
    engineered_features = engineer_process_name_features(df_cleaned, engineered_features)
    engineered_features = engineer_parent_process_features(df_cleaned, engineered_features)
    engineered_features = engineer_file_name_features(df_cleaned, engineered_features)

    # Append the label columns after all engineered features.
    engineered_features['target-class'] = df_cleaned['target-class']

    # Some inputs (the data-drift CSV) carry 'target' / 'phase' instead of or in
    # addition to 'target-class-name'; copy whichever is present and skip the rest.
    if "target-class-name" in df_cleaned.columns:
        engineered_features['target-class-name'] = df_cleaned['target-class-name']
    elif "target" in df_cleaned.columns:
        engineered_features['target'] = df_cleaned['target']

    if 'phase' in df_cleaned.columns:
        engineered_features['phase'] = df_cleaned['phase']

    return engineered_features

# Save each DataFrame with engineered features to a new CSV file in each subfolder
def process_and_save_features(root_folder):
    """
    Engineer features for every subfolder of ``root_folder`` and save them.

    For each immediate subfolder that contains CSV files, the CSVs are read with
    `read_and_combine_csv_files`, passed through `engineer_features`, and written
    to ``<subfolder>/Engineered_Features/processed.csv`` (the directory is created
    when missing, an existing file is overwritten).

    Each DataFrame is paired with the subfolder it was read from inside the same
    loop iteration. Pairing a pre-built list of DataFrames with a second
    ``os.listdir`` call goes out of step as soon as the root folder holds a plain
    file or a subfolder without CSVs, and the output then lands in the wrong folder.

    Entries that are not directories are ignored; subfolders without CSV files
    are reported with a printed warning and skipped.

    Args:
        root_folder (str): Path to the root folder containing the dataset subfolders.
    """
    # Sorted so the processing order is reproducible (os.listdir order is arbitrary).
    for subfolder in sorted(os.listdir(root_folder)):
        subfolder_path = os.path.join(root_folder, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        try:
            df = read_and_combine_csv_files(subfolder_path)
        except ValueError as e:
            print(f"Warning: {e} in subfolder {subfolder_path}. Skipping this subfolder.")
            continue

        engineered_features = engineer_features(df)

        # Create "Engineered_Features" directory if it doesn't exist
        output_dir = os.path.join(subfolder_path, "Engineered_Features")
        os.makedirs(output_dir, exist_ok=True)

        # Save engineered features to CSV
        output_path = os.path.join(output_dir, "processed.csv")
        engineered_features.to_csv(output_path, index=False)
        print(f"Processed {subfolder} and saved engineered features to {output_path}")

# Engineer all features here: run the pipeline over every subfolder of the root
# folder and write the resulting processed.csv files.
# NOTE(review): root_folder is assigned the same value in an earlier cell; this
# re-assignment keeps the cell runnable on its own.
root_folder = '/content/drive/MyDrive/Colab Notebooks/paper'
process_and_save_features(root_folder)


Processed drift-data and saved engineered features to /content/drive/MyDrive/Colab Notebooks/paper/drift-data/Engineered_Features/processed.csv
