In [1]:
import os
import pandas as pd
import numpy as np
import datetime
from sentence_transformers.util import cos_sim
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch_geometric.data import Data
import networkx as nx
import pytz
import hdbscan
from tqdm import tqdm 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import normalize


extract_dir = "extracted_dataset"
parquet_processed_filename = "processed_dataset_with_labels.parquet"

In [2]:
df = pd.read_parquet(os.path.join(extract_dir, parquet_processed_filename))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2600263 entries, 0 to 2600262
Data columns (total 19 columns):
 #   Column             Dtype  
---  ------             -----  
 0   full_log           object 
 1   @timestamp         object 
 2   location           object 
 3   id                 float64
 4   filename           object 
 5   agent_ip           object 
 6   data_srcip         object 
 7   rule_firedtimes    int64  
 8   rule_level         int64  
 9   rule_pci_dss       object 
 10  rule_tsc           object 
 11  rule_description   object 
 12  rule_groups        object 
 13  rule_id            object 
 14  rule_nist_800_53   object 
 15  rule_gdpr          object 
 16  unix_timestamp     float64
 17  type_attack_label  object 
 18  attack_label       object 
dtypes: float64(2), int64(2), object(15)
memory usage: 376.9+ MB


In [3]:
df.head()

Unnamed: 0,full_log,@timestamp,location,id,filename,agent_ip,data_srcip,rule_firedtimes,rule_level,rule_pci_dss,rule_tsc,rule_description,rule_groups,rule_id,rule_nist_800_53,rule_gdpr,unix_timestamp,type_attack_label,attack_label
0,Jan 15 02:32:32 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,1,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
1,Jan 15 02:32:32 taylorcruz-mail freshclam[2851...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,192.168.128.170,,2,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
2,Jan 15 02:32:37 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:37.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,3,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
3,Jan 15 02:32:42 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:42.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,4,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
4,Jan 15 02:32:47 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:47.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,5,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2600263 entries, 0 to 2600262
Data columns (total 19 columns):
 #   Column             Dtype  
---  ------             -----  
 0   full_log           object 
 1   @timestamp         object 
 2   location           object 
 3   id                 float64
 4   filename           object 
 5   agent_ip           object 
 6   data_srcip         object 
 7   rule_firedtimes    int64  
 8   rule_level         int64  
 9   rule_pci_dss       object 
 10  rule_tsc           object 
 11  rule_description   object 
 12  rule_groups        object 
 13  rule_id            object 
 14  rule_nist_800_53   object 
 15  rule_gdpr          object 
 16  unix_timestamp     float64
 17  type_attack_label  object 
 18  attack_label       object 
dtypes: float64(2), int64(2), object(15)
memory usage: 376.9+ MB


In [5]:
df['type_attack_label'].value_counts()

type_attack_label
dirb                    1671940
false_positive           882739
wpscan                    28021
dnsteal                    8603
cracking                   5271
service_scans              1768
network_scans              1570
privilege_escalation        158
webshell                    109
reverse_shell                80
service_stop                  4
Name: count, dtype: int64

In [6]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence Transformer model.
# This model is excellent for semantic similarity tasks.
print("\nLoading sentence transformer model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded.")


Loading sentence transformer model...
Model loaded.


In [7]:
logs_df = df.sort_values(by='@timestamp').reset_index(drop=True)

In [None]:
# --- Vectorization Step ---

# Get the list of descriptions to encode.
descriptions = logs_df['rule_description'].tolist()

batch_size = 256

# Encode the descriptions into vectors.
# The model.encode() method processes the list of strings and returns a list of numpy arrays (vectors).
print("\nEncoding descriptions into vectors...")
description_vectors = embedding_model.encode(
    descriptions, 
    batch_size=batch_size,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    show_progress_bar=True)

description_vectors = normalize(description_vectors)

--- DataFrame before vectorization ---
                          @timestamp         agent_ip  \
0        2022-01-14T00:00:09.728242Z  192.168.104.218   
1        2022-01-14T00:00:09.728242Z  192.168.104.218   
2        2022-01-14T00:00:09.729820Z       10.229.0.4   
3        2022-01-14T00:00:09.729820Z       10.229.0.4   
4        2022-01-14T00:00:23.976670Z       10.229.0.4   
...                              ...              ...   
2600258  2022-02-08T23:47:19.000000Z     10.237.2.255   
2600259  2022-02-08T23:47:22.000000Z   10.182.193.181   
2600260  2022-02-08T23:47:22.000000Z   10.182.193.181   
2600261  2022-02-08T23:47:24.000000Z     10.237.2.255   
2600262  2022-02-08T23:47:24.000000Z     10.237.2.255   

                                          rule_description  
0        Suricata: Alert - ET INFO Observed DNS Query t...  
1                  First time this IDS alert is generated.  
2        Suricata: Alert - ET INFO Observed DNS Query t...  
3                  First time th

Batches:   0%|          | 0/10158 [00:00<?, ?it/s]

In [8]:
# Save the DataFrame (without the vector column) to Parquet.
# This is fast and efficient for tabular data.
print("Saving DataFrame to Parquet...")
logs_df.to_parquet(os.path.join(extract_dir, "sorted_ds_with_labels.parquet"))

Saving DataFrame to Parquet...


In [None]:
# Save the vectors using NumPy's native, efficient binary format.
# This is the key step to avoid the memory crash.
print("Saving vectors to .npy file...")
np.save(os.path.join(extract_dir, "vectorized_descr.npy"), description_vectors)

In [9]:
# Load the data you saved previously
logs_df = pd.read_parquet(os.path.join(extract_dir, 'sorted_ds_with_labels.parquet'))

description_vectors = np.load(os.path.join(extract_dir, 'vectorized_descr.npy'))

In [10]:
# --- Add the vectorized data to your DataFrame ---
# (Assuming 'logs_df' is your DataFrame and 'description_vectors' is the numpy array from the last step)
logs_df['description_vector'] = list(description_vectors)

In [11]:
embeddings = np.vstack(logs_df['description_vector'].values)
embeddings = normalize(embeddings)

In [12]:
def check_vectorization(embeddings, index_to_check):
    text_from_df = logs_df['rule_description'].iloc[index_to_check]
    print(f"Original text at index {index_to_check}: '{text_from_df}'")

    # Get the pre-computed vector from your saved NumPy array
    precomputed_vector = embeddings[index_to_check]

    # Generate a new vector from ONLY that specific text
    newly_generated_vector = embedding_model.encode([text_from_df])[0]

    # precomputed_vector = normalize(precomputed_vector.reshape(1, -1))[0]
    newly_generated_vector = normalize(newly_generated_vector.reshape(1, -1))[0]

    sim = np.dot(precomputed_vector, newly_generated_vector)
    print(f"Cosine similarity between precomputed and newly generated vector: {sim}")
    if sim >= 0.8:
        return True
    else: return False

check_vectorization(embeddings, 15)

Original text at index 15: 'IDS event.'
Cosine similarity between precomputed and newly generated vector: 1.0000001192092896


True

In [None]:
# --- Check if vectorized description at index X corresponds to X-th log description ---

def check_vectorized_descr(index_to_check):
    # Get the original text description from the DataFrame at that index
    text_from_df = logs_df['rule_description'].iloc[index_to_check]
    print(f"Original text at index {index_to_check}: '{text_from_df}'")

    # Get the pre-computed vector from your saved NumPy array
    precomputed_vector = logs_df['description_vector'].iloc[index_to_check]

    # Generate a new vector from ONLY that specific text
    newly_generated_vector = embedding_model.encode([text_from_df])[0]

    # Compare the two vectors
    # Use np.allclose() to account for tiny floating-point inaccuracies
    are_vectors_the_same = np.allclose(precomputed_vector, newly_generated_vector, atol=1e-6)

    print(f"\nVerification successful: {are_vectors_the_same}")

    # Print the actual difference between the vectors
    vector_difference = precomputed_vector - newly_generated_vector
    print(f"\nMax difference between any two elements: {np.max(np.abs(vector_difference))}")

    # You can also check their similarity score, which should be ~1.0
    similarity = cos_sim(precomputed_vector, newly_generated_vector)
    print(f"Cosine Similarity: {similarity.item()}")

check_vectorized_descr(10)

Original text at index 10: 'Suricata: Alert - ET POLICY GNU/Linux APT User-Agent Outbound likely related to package management'

Verification successful: True

Max difference between any two elements: 7.07896106177941e-08
Cosine Similarity: 1.0


In [None]:
# --- Preprocess Multi-Label Columns (NIST & GDPR) ---
print("--- Preprocessing list-like columns (NIST, GDPR)... ---")

# Handle potential NaNs by filling them with empty tuples, which the binarizer can handle.
df['rule_nist_800_53'] = df['rule_nist_800_53'].apply(lambda x: x if isinstance(x, list) else [])
df['rule_gdpr'] = df['rule_gdpr'].apply(lambda x: x if isinstance(x, list) else [])

# Use MultiLabelBinarizer for each column
mlb_nist = MultiLabelBinarizer()
nist_features = mlb_nist.fit_transform(df['rule_nist_800_53'])

mlb_gdpr = MultiLabelBinarizer()
gdpr_features = mlb_gdpr.fit_transform(df['rule_gdpr'])

print(f"NIST features shape: {nist_features.shape}")
print(f"GDPR features shape: {gdpr_features.shape}")

# --- Preprocess the rest of the tabular features ---
print("--- Preprocessing other numerical/categorical columns... ---")

# Define features, EXCLUDING the ones we just handled
other_features_df = df.drop(columns=[
    'rule_description', 'attack_label', 'rule_nist_800_53', 'rule_gdpr' 
])

# These are the simple categorical features left
simple_categorical_features = ['data_srcip', 'location', 'rule_id']
numeric_features = ['rule_level']

# Create a pipeline for simple categorical features to first impute, then encode.
# This fixes the logical bug where the imputer was defined but not used.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Create the ColumnTransformer for the remaining features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', categorical_pipeline, simple_categorical_features)
    ],
    remainder='drop'
)

other_features_processed = preprocessor.fit_transform(other_features_df)
print(f"Other processed features shape: {other_features_processed.shape}")


# --- Combine ALL Tabular Features ---
# Stack the multi-label features and the other processed features together
processed_tabular_features = np.hstack([nist_features, gdpr_features, other_features_processed])
print(f"Final combined tabular features shape: {processed_tabular_features.shape}")


# --- Encode Labels ---
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['attack_label'])


# --- Save Processed Data to Disk (to avoid MemoryError) ---
# This is the key to handling large datasets.
print("--- Saving processed arrays to disk... ---")
np.save('processed_tabular.npy', processed_tabular_features)
np.save('labels.npy', y_encoded)


# --- Create a Memory-Efficient PyTorch Dataset ---
# This class loads data from the saved files ON-THE-FLY, solving the MemoryError.
class ScalableAlertDataset(Dataset):
    def __init__(self, tabular_path, vectors_path, labels_path, indices):
        self.indices = indices
        # Load arrays in memory-map mode. It doesn't load the full file into RAM.
        self.tabular_data = np.load(tabular_path, mmap_mode='r')
        self.vector_data = np.load(vectors_path, mmap_mode='r')
        self.labels = np.load(labels_path, mmap_mode='r')

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        # Get the actual index from our train/test split
        data_idx = self.indices[idx]
        
        # Fetch only the required data from the memory-mapped files
        tabular_features = self.tabular_data[data_idx]
        vector_features = self.vector_data[data_idx]
        label = self.labels[data_idx]
        
        # Combine the features for this single item
        combined_features = np.concatenate((vector_features, tabular_features))
        
        return (
            torch.tensor(combined_features, dtype=torch.float32), 
            torch.tensor(label, dtype=torch.float32).unsqueeze(0)
        )

# --- Create DataLoaders ---
# Create indices for train/test split instead of splitting the giant tensor
num_samples = len(y_encoded)
indices = np.arange(num_samples)
train_indices, test_indices, _, _ = train_test_split(
    indices, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

train_dataset = ScalableAlertDataset(
    'processed_tabular.npy', 'vectorized_descr.npy', 'labels.npy', train_indices
)
test_dataset = ScalableAlertDataset(
    'processed_tabular.npy', 'vectorized_descr.npy', 'labels.npy', test_indices
)

BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("\n--- DataLoaders created successfully! Ready for training. ---")

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['NoneType', 'ndarray']

In [None]:
# --- Model Definition (No Change Needed) ---
# Assuming your SimpleClassifier class is defined as before
class SimpleClassifier(nn.Module):
    def __init__(self, input_features):
        super(SimpleClassifier, self).__init__()
        self.layer_1 = nn.Linear(input_features, 64)
        self.layer_2 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        x = self.relu(self.layer_1(x))
        x = self.dropout(x)
        x = self.relu(self.layer_2(x))
        x = self.sigmoid(self.output_layer(x))
        return x

# --- Get Input Size and Instantiate Model ---
# This is the new, robust way to get the input size.
first_features, _ = train_dataset[0]
input_size = first_features.shape[0]

# Set up device (use GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"--- Using device: {device} ---")

model = SimpleClassifier(input_features=input_size).to(device)
print("\n--- Model Architecture ---")
print(model)

In [None]:
# --- Train and Evaluate the Model ---
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 10 # Increase for a real dataset

print("\n--- Starting Training ---")
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_features, batch_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        # Move data to the selected device
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device)
        
        # Forward pass
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{epochs}], Training Loss: {running_loss/len(train_loader):.4f}")

    # --- Evaluation loop within training ---
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in test_loader:
            features = features.to(device)
            labels = labels.to(device)
            
            outputs = model(features)
            predicted = (outputs.data > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")


# --- Prediction Function for a New Sample ---
def predict_single(new_data_dict):
    """
    Preprocesses and predicts the class for a single new log entry.
    Requires the fitted preprocessors from your previous script:
    - embedding_model
    - mlb_nist
    - mlb_gdpr
    - preprocessor (the ColumnTransformer)
    - label_encoder
    - model
    """
    print("\n--- Making a prediction on a new sample ---")
    model.eval() # Ensure model is in evaluation mode

    new_df = pd.DataFrame([new_data_dict])

    # Preprocess text
    text_vector = embedding_model.encode(new_df['rule_description'].tolist())

    # Preprocess list-like columns (handle missing keys)
    nist_list = new_df.get('rule_nist_8_53', pd.Series([[] for _ in range(len(new_df))]))
    gdpr_list = new_df.get('rule_gdpr', pd.Series([[] for _ in range(len(new_df))]))
    
    nist_features = mlb_nist.transform(nist_list)
    gdpr_features = mlb_gdpr.transform(gdpr_list)

    # Preprocess other tabular features
    # Ensure all required columns exist, fill with None if not
    for col in preprocessor.feature_names_in_:
        if col not in new_df.columns:
            new_df[col] = None
    
    other_features_processed = preprocessor.transform(new_df)
    
    # Combine all features in the correct order
    combined_features_np = np.hstack([nist_features, gdpr_features, other_features_processed])
    
    # The text vector must be the first part of the final combined array
    final_combined_features = np.hstack([text_vector, combined_features_np])

    # Convert to tensor and predict
    new_data_tensor = torch.tensor(final_combined_features, dtype=torch.float32).to(device)
    
    with torch.no_grad():
        prediction_prob = model(new_data_tensor).item()
        prediction_class_idx = 1 if prediction_prob > 0.5 else 0
        prediction_label = label_encoder.inverse_transform([prediction_class_idx])[0]

    print(f"Prediction Probability: {prediction_prob:.4f}")
    print(f"Predicted Label: {prediction_label}")
    return prediction_label, prediction_prob

# --- Example Usage of the Prediction Function ---
# Create a new sample log entry as a dictionary
new_log = {
    'rule_description': 'sshd: brute force attack',
    'data_srcip': '192.168.1.105',
    'location': '/var/log/auth.log',
    'rule_id': 5712,
    'rule_level': 10,
    'rule_nist_800_53': ['AC.7', 'SI.4'],
    'rule_gdpr': ['IV_32.2']
}

# Call the function to get a prediction
predicted_label, _ = predict_single(new_log)

In [30]:
# --- Sessionization by Timestamp ---

logs_df['@timestamp'] = pd.to_datetime(logs_df['@timestamp'])

# Sort values to ensure correct time difference calculation
logs_df = logs_df.sort_values(by=['agent_ip', '@timestamp']).reset_index(drop=True)

# Define the inactivity threshold for a new session (e.g., 15 minutes)
session_timeout = pd.Timedelta(minutes=15)

# Calculate the time difference between consecutive logs for each agent
time_diffs = logs_df.groupby('agent_ip')['@timestamp'].diff()

# A new session starts where the time difference is greater than the timeout
new_session_flags = (time_diffs > session_timeout).astype(int)

# Create a session ID by taking the cumulative sum of the new session flags for each agent
session_ids = new_session_flags.groupby(logs_df['agent_ip']).cumsum()

# Create a globally unique session identifier
logs_df['session_id'] = logs_df['agent_ip'] + '_' + session_ids.astype(str)

print(f"\n--- Sessionization Complete ---")
print(f"Total unique sessions created: {logs_df['session_id'].nunique()}")
print(logs_df[['@timestamp', 'agent_ip', 'session_id']])


--- Sessionization Complete ---
Total unique sessions created: 4846
                              @timestamp      agent_ip        session_id
0       2022-01-19 01:02:04.669876+00:00   10.132.56.1     10.132.56.1_0
1       2022-01-19 01:02:04.669876+00:00   10.132.56.1     10.132.56.1_0
2       2022-01-19 01:02:04.676411+00:00   10.132.56.1     10.132.56.1_0
3       2022-01-19 01:02:04.676411+00:00   10.132.56.1     10.132.56.1_0
4       2022-01-19 01:02:04.704695+00:00   10.132.56.1     10.132.56.1_0
...                                  ...           ...               ...
2600258        2022-01-23 19:25:33+00:00  192.168.99.0  192.168.99.0_115
2600259        2022-01-23 20:25:33+00:00  192.168.99.0  192.168.99.0_116
2600260        2022-01-23 21:25:33+00:00  192.168.99.0  192.168.99.0_117
2600261        2022-01-23 22:25:33+00:00  192.168.99.0  192.168.99.0_118
2600262        2022-01-23 23:25:33+00:00  192.168.99.0  192.168.99.0_119

[2600263 rows x 3 columns]


In [31]:
# --- Clustering setup ---
grouped_sessions = logs_df.groupby('session_id')
categorical_features = ['data_srcip', 'rule_id']
numerical_features = ['rule_level']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough')

In [32]:
# --- Enhanced Session Analysis ---
print("--- Analyzing All Sessions ---")
session_analysis_results = []

for session_name, session_group in grouped_sessions:
    session_analysis_results.append({
        'session_id': session_name,
        'log_count': len(session_group),
        'filename_variety': session_group['filename'].nunique(),
        'unique_filenames': list(session_group['filename'].unique()), # Get the actual unique values
        'attack_label_variety': session_group['attack_label'].nunique(),
        'unique_attack_labels': list(session_group['attack_label'].unique()), # Get the actual unique values
        'start_time': session_group['@timestamp'].min(),
        'end_time': session_group['@timestamp'].max(),
    })

# Convert analysis results to a DataFrame for a clean summary view
analysis_df = pd.DataFrame(session_analysis_results)
print("Session Analysis Summary:")
print(analysis_df.to_string()) # .to_string() ensures all columns are displayed

print("\n--- Analysis Highlights ---")

# Calculate the number of sessions with just one log
single_log_sessions_count = (analysis_df['log_count'] == 1).sum()
print(f"Number of sessions with only one log: {single_log_sessions_count}")

multi_log_sessions_count = (analysis_df['log_count'] > 1).sum()
print(f"Number of sessions with more than one log: {multi_log_sessions_count}")

# Find the session with the maximum number of logs
max_logs_session = analysis_df.loc[analysis_df['log_count'].idxmax()]
print(f"Session with maximum logs ('{max_logs_session['session_id']}'): {max_logs_session['log_count']} logs")

# Find the session with the minimum number of logs
min_logs_session = analysis_df.loc[analysis_df['log_count'].idxmin()]
print(f"Session with minimum logs ('{min_logs_session['session_id']}'): {min_logs_session['log_count']} logs")

max_attack_label_session = analysis_df.loc[analysis_df['attack_label_variety'].idxmax()]
print(f"Session with maximum attack label variety ('{max_attack_label_session['session_id']}'): {max_attack_label_session['attack_label_variety']} unique attack labels")

max_scenario_session = analysis_df.loc[analysis_df['filename_variety'].idxmax()]
print(f"Session with maximum filename variety ('{max_scenario_session['session_id']}'): {max_scenario_session['filename_variety']} unique filenames")

--- Analyzing All Sessions ---
Session Analysis Summary:
               session_id  log_count  filename_variety   unique_filenames  attack_label_variety                                                                                                                          unique_attack_labels                       start_time                         end_time
0         10.132.56.171_0          6                 1         [wardbeck]                     1                                                                                                                              [false_positive]        2022-01-19 00:38:02+00:00        2022-01-19 00:38:22+00:00
1         10.132.56.171_1          6                 1         [wardbeck]                     1                                                                                                                              [false_positive]        2022-01-19 01:38:22+00:00        2022-01-19 01:38:42+00:00
2        10.132.56.171_10      

In [33]:
# Define the session ID you want to inspect
target_session_id = '10.38.242.195_10'

# Filter the original logs_df to get only the rows matching that session ID
specific_session_df = logs_df[logs_df['session_id'] == target_session_id]

# Print the resulting DataFrame containing all logs for that session
print(f"--- Displaying all {len(specific_session_df)} logs for session: {target_session_id} ---")

# Use .to_string() to ensure all rows and columns are displayed without truncation
print(specific_session_df.to_string())

--- Displaying all 1344 logs for session: 10.38.242.195_10 ---
                                                                                                                                                                        full_log                @timestamp            location            id filename       agent_ip data_srcip  rule_firedtimes  rule_level rule_pci_dss               rule_tsc                 rule_description                        rule_groups rule_id rule_nist_800_53    rule_gdpr  unix_timestamp    attack_label                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [25]:
# --- DIAGNOSE SESSION SIZES ---
print("--- Analyzing Session Sizes ---")
session_sizes = grouped_sessions.size().sort_values(ascending=False)
print("Largest sessions:")
# session sizes with at most 5000 logs
print(len(session_sizes[session_sizes <= 5000]))
print(len(session_sizes[session_sizes == 1]))
print(len(session_sizes[session_sizes > 10000]))

# You can set a threshold for what you consider "too large"
LARGE_SESSION_THRESHOLD = 5000

--- Analyzing Session Sizes ---
Largest sessions:
4808
2000
22


In [None]:
import warnings

# --- Session Clustering Phase ---
print("--- Starting Clustering Phase ---")
clustering_results = {}

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)
    for session_name, session_group in tqdm(grouped_sessions, desc="Clustering Sessions"):
        # A guard clause to skip sessions that are too small to cluster
        if len(session_group) < 2:
            continue

        # --- A) Prepare Data for Clustering ---
        # Stack the pre-computed description vectors into a single NumPy array
        description_vectors = np.vstack(session_group['description_vector'].values)
        
        # Use the preprocessor to scale numerical and one-hot encode categorical features
        prepared_tabular_data = preprocessor.fit_transform(session_group[numerical_features + categorical_features])
        
        # If the output of the preprocessor is a sparse matrix, convert it to a dense array
        if hasattr(prepared_tabular_data, "toarray"):
            prepared_tabular_data = prepared_tabular_data.toarray()

        # --- B) Combine All Features ---
        # Horizontally stack the tabular data and the text vectors into one final feature matrix
        feature_matrix = np.hstack([prepared_tabular_data, description_vectors])

        # --- C) Run the Clustering Algorithm ---
        # - min_cluster_size: The smallest number of points to be considered a cluster.
        # - min_samples: Controls how conservative the clustering is (higher value = more noise).
        # - core_dist_n_jobs=-1: Use all available CPU cores to speed up computation
        clusterer = hdbscan.HDBSCAN(min_cluster_size=15, 
                                    min_samples=2, 
                                    metric='euclidean',
                                    core_dist_n_jobs=-1)
        
        # Fit the model and predict the cluster for each log in the session
        cluster_labels = clusterer.fit_predict(feature_matrix)

        # --- D) Store the Results ---
        # Add the cluster labels as a new column to the session's DataFrame
        session_group_with_clusters = session_group.copy()
        session_group_with_clusters['cluster_id'] = cluster_labels
        
        # Store the resulting DataFrame in a dictionary, keyed by the session ID
        clustering_results[session_name] = session_group_with_clusters

--- Starting Clustering Phase ---


Clustering Sessions:  26%|██▌       | 1257/4846 [39:27<01:52, 31.77it/s]  

In [None]:
# --- Display Clustering Results ---
print(f"\n--- Detailed Clustering Results ({len(clustering_results)} sessions clustered) ---")
for session_id, clustered_data in clustering_results.items():
    print(f"\n--- Session ID: {session_id} ---")
    
    # Calculate summary statistics for this session's clustering results
    # A cluster_id of -1 indicates a "noise" point (an outlier)
    clusters_found = len(set(clustered_data['cluster_id']) - {-1})
    noise_points = np.sum(clustered_data['cluster_id'] == -1)

    print(f"  Clusters Found: {clusters_found}")
    print(f"  Noise Points (Anomalies): {noise_points}")
    print("  Clustered Log Details:")
    # Display the most relevant fields along with the new cluster_id
    print(clustered_data[['@timestamp', 'data_srcip', 'rule_description', 'cluster_id']].to_string())

In [53]:
def convert_timestamp(timestp):
    log_time = datetime.strptime(timestp, "%Y-%m-%dT%H:%M:%S.%f%z")  # example: 2022-01-21T00:17:54.308261+0000
    log_time = log_time.replace(tzinfo=pytz.utc)
    log_time = log_time.timestamp()
    return int(log_time)


# --- Feature preparation ---
categorical_features = ['data_srcip', 'rule_id']
numerical_features = ['rule_level']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='drop'
)

logs_df["@timestamp"] = pd.to_datetime(logs_df["@timestamp"])
logs_df["description_vector"] = logs_df["description_vector"].apply(np.array)

# Handle missing IPs (replace None with "unknown")
logs_df["data_srcip"] = logs_df["data_srcip"].fillna("unknown")

# Prepare tabular features
tabular_features = preprocessor.fit_transform(logs_df[numerical_features + categorical_features])

# If you also have text vectors (e.g., BERT embeddings)
if "description_vector" in logs_df.columns:
    desc_vectors = np.vstack(logs_df["description_vector"].values)
    node_features = np.hstack([tabular_features, desc_vectors])
else:
    node_features = tabular_features

print(f"Final node feature matrix shape: {node_features.shape}")


MemoryError: Unable to allocate 3.72 GiB for an array with shape (2600263, 384) and data type float32

In [None]:
# ---------------------------
# Graph Construction
# ---------------------------

# Initialize a NetworkX graph
G = nx.Graph()

# Add nodes — each row in logs_df is a node
for idx, row in logs_df.iterrows():
    G.add_node(idx)

# ---- Add edges for same agent_ip ----
for agent_ip, group in logs_df.groupby("agent_ip"):
    indices = group.index.tolist()
    for i in range(len(indices) - 1):
        G.add_edge(indices[i], indices[i + 1])  # link consecutive alerts

# ---- 2) Add edges for same rule_id ----
for rule_id, group in logs_df.groupby("rule_id"):
    indices = group.index.tolist()
    for i in range(len(indices) - 1):
        G.add_edge(indices[i], indices[i + 1])

# ---- 3) Add edges based on semantic similarity (if you have embeddings) ----
if "description_vector" in logs_df.columns:
    desc_matrix = np.vstack(logs_df["description_vector"])
    sim_matrix = cosine_similarity(desc_matrix)
    threshold = 0.85  # tune this threshold
    for i in range(len(sim_matrix)):
        for j in range(i + 1, len(sim_matrix)):
            if sim_matrix[i, j] > threshold:
                G.add_edge(i, j)

print(f"Total nodes: {G.number_of_nodes()}, Total edges: {G.number_of_edges()}")


# ---------------------------
# Convert to PyTorch Geometric graph
# ---------------------------
edge_index = torch.tensor(list(G.edges)).t().contiguous()
x = torch.tensor(node_features, dtype=torch.float)

data = Data(x=x, edge_index=edge_index)

print(data)

In [46]:
# ---------------- Load labels.csv ----------------
labels_path = os.path.join(extract_dir, "labels.csv")
if os.path.exists(labels_path):
    labels_df = pd.read_csv(labels_path, sep=",")
else:
    print("labels.csv not found inside inner zip")

# Sort by scenario and start time to ensure correct order
df_sorted = labels_df.sort_values(by=['scenario', 'start']).reset_index(drop=True)

# Group by scenario and calculate the gap from the previous event's end time
df_sorted['previous_end'] = df_sorted.groupby('scenario')['end'].shift(1)
df_sorted['gap_from_previous_sec'] = df_sorted['start'] - df_sorted['previous_end']

# Analyze the distribution of these gaps
gap_analysis = df_sorted['gap_from_previous_sec'].dropna()

print("--- Statistical Analysis of Time Gaps (in seconds) ---")
print(gap_analysis.describe())

--- Statistical Analysis of Time Gaps (in seconds) ---
count        71.000000
mean      10061.394366
std       33845.977209
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max      219451.000000
Name: gap_from_previous_sec, dtype: float64
