In [1]:
import os
import pandas as pd
import numpy as np
import datetime
from sentence_transformers.util import cos_sim
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch_geometric.data import Data
import networkx as nx
import pytz
import hdbscan
from tqdm import tqdm 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import normalize


extract_dir = "extracted_dataset"
parquet_processed_filename = "processed_dataset_with_labels.parquet"

In [2]:
df = pd.read_parquet(os.path.join(extract_dir, parquet_processed_filename))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2600263 entries, 0 to 2600262
Data columns (total 19 columns):
 #   Column             Dtype  
---  ------             -----  
 0   full_log           object 
 1   @timestamp         object 
 2   location           object 
 3   id                 float64
 4   filename           object 
 5   agent_ip           object 
 6   data_srcip         object 
 7   rule_firedtimes    int64  
 8   rule_level         int64  
 9   rule_pci_dss       object 
 10  rule_tsc           object 
 11  rule_description   object 
 12  rule_groups        object 
 13  rule_id            object 
 14  rule_nist_800_53   object 
 15  rule_gdpr          object 
 16  unix_timestamp     float64
 17  type_attack_label  object 
 18  attack_label       object 
dtypes: float64(2), int64(2), object(15)
memory usage: 376.9+ MB


In [3]:
df.head()

Unnamed: 0,full_log,@timestamp,location,id,filename,agent_ip,data_srcip,rule_firedtimes,rule_level,rule_pci_dss,rule_tsc,rule_description,rule_groups,rule_id,rule_nist_800_53,rule_gdpr,unix_timestamp,type_attack_label,attack_label
0,Jan 15 02:32:32 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,1,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
1,Jan 15 02:32:32 taylorcruz-mail freshclam[2851...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,192.168.128.170,,2,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
2,Jan 15 02:32:37 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:37.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,3,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
3,Jan 15 02:32:42 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:42.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,4,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
4,Jan 15 02:32:47 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:47.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,5,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2600263 entries, 0 to 2600262
Data columns (total 19 columns):
 #   Column             Dtype  
---  ------             -----  
 0   full_log           object 
 1   @timestamp         object 
 2   location           object 
 3   id                 float64
 4   filename           object 
 5   agent_ip           object 
 6   data_srcip         object 
 7   rule_firedtimes    int64  
 8   rule_level         int64  
 9   rule_pci_dss       object 
 10  rule_tsc           object 
 11  rule_description   object 
 12  rule_groups        object 
 13  rule_id            object 
 14  rule_nist_800_53   object 
 15  rule_gdpr          object 
 16  unix_timestamp     float64
 17  type_attack_label  object 
 18  attack_label       object 
dtypes: float64(2), int64(2), object(15)
memory usage: 376.9+ MB


In [5]:
df['type_attack_label'].value_counts()

type_attack_label
dirb                    1671940
false_positive           882739
wpscan                    28021
dnsteal                    8603
cracking                   5271
service_scans              1768
network_scans              1570
privilege_escalation        158
webshell                    109
reverse_shell                80
service_stop                  4
Name: count, dtype: int64

In [6]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence Transformer model.
# This model is excellent for semantic similarity tasks.
print("\nLoading sentence transformer model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded.")


Loading sentence transformer model...
Model loaded.


In [7]:
logs_df = df.sort_values(by='@timestamp').reset_index(drop=True)

In [None]:
# --- Vectorization Step ---

# Get the list of descriptions to encode.
descriptions = logs_df['rule_description'].tolist()

batch_size = 256

# Encode the descriptions into vectors.
# The model.encode() method processes the list of strings and returns a list of numpy arrays (vectors).
print("\nEncoding descriptions into vectors...")
description_vectors = embedding_model.encode(
    descriptions, 
    batch_size=batch_size,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    show_progress_bar=True)

description_vectors = normalize(description_vectors)

--- DataFrame before vectorization ---
                          @timestamp         agent_ip  \
0        2022-01-14T00:00:09.728242Z  192.168.104.218   
1        2022-01-14T00:00:09.728242Z  192.168.104.218   
2        2022-01-14T00:00:09.729820Z       10.229.0.4   
3        2022-01-14T00:00:09.729820Z       10.229.0.4   
4        2022-01-14T00:00:23.976670Z       10.229.0.4   
...                              ...              ...   
2600258  2022-02-08T23:47:19.000000Z     10.237.2.255   
2600259  2022-02-08T23:47:22.000000Z   10.182.193.181   
2600260  2022-02-08T23:47:22.000000Z   10.182.193.181   
2600261  2022-02-08T23:47:24.000000Z     10.237.2.255   
2600262  2022-02-08T23:47:24.000000Z     10.237.2.255   

                                          rule_description  
0        Suricata: Alert - ET INFO Observed DNS Query t...  
1                  First time this IDS alert is generated.  
2        Suricata: Alert - ET INFO Observed DNS Query t...  
3                  First time th

Batches:   0%|          | 0/10158 [00:00<?, ?it/s]

In [8]:
# Save the DataFrame (without the vector column) to Parquet.
# This is fast and efficient for tabular data.
print("Saving DataFrame to Parquet...")
logs_df.to_parquet(os.path.join(extract_dir, "sorted_ds_with_labels.parquet"))

Saving DataFrame to Parquet...


In [None]:
# Save the vectors using NumPy's native, efficient binary format.
# This is the key step to avoid the memory crash.
print("Saving vectors to .npy file...")
np.save(os.path.join(extract_dir, "vectorized_descr.npy"), description_vectors)

In [9]:
# Load the data you saved previously
logs_df = pd.read_parquet(os.path.join(extract_dir, 'sorted_ds_with_labels.parquet'))

description_vectors = np.load(os.path.join(extract_dir, 'vectorized_descr.npy'))

In [10]:
# --- Add the vectorized data to your DataFrame ---
# (Assuming 'logs_df' is your DataFrame and 'description_vectors' is the numpy array from the last step)
logs_df['description_vector'] = list(description_vectors)

In [11]:
embeddings = np.vstack(logs_df['description_vector'].values)
embeddings = normalize(embeddings)

In [12]:
def check_vectorization(embeddings, index_to_check):
    text_from_df = logs_df['rule_description'].iloc[index_to_check]
    print(f"Original text at index {index_to_check}: '{text_from_df}'")

    # Get the pre-computed vector from your saved NumPy array
    precomputed_vector = embeddings[index_to_check]

    # Generate a new vector from ONLY that specific text
    newly_generated_vector = embedding_model.encode([text_from_df])[0]

    # precomputed_vector = normalize(precomputed_vector.reshape(1, -1))[0]
    newly_generated_vector = normalize(newly_generated_vector.reshape(1, -1))[0]

    sim = np.dot(precomputed_vector, newly_generated_vector)
    print(f"Cosine similarity between precomputed and newly generated vector: {sim}")
    if sim >= 0.8:
        return True
    else: return False

check_vectorization(embeddings, 15)

Original text at index 15: 'IDS event.'
Cosine similarity between precomputed and newly generated vector: 1.0000001192092896


True

In [None]:
# --- Check if vectorized description at index X corresponds to X-th log description ---

def check_vectorized_descr(index_to_check):
    # Get the original text description from the DataFrame at that index
    text_from_df = logs_df['rule_description'].iloc[index_to_check]
    print(f"Original text at index {index_to_check}: '{text_from_df}'")

    # Get the pre-computed vector from your saved NumPy array
    precomputed_vector = logs_df['description_vector'].iloc[index_to_check]

    # Generate a new vector from ONLY that specific text
    newly_generated_vector = embedding_model.encode([text_from_df])[0]

    # Compare the two vectors
    # Use np.allclose() to account for tiny floating-point inaccuracies
    are_vectors_the_same = np.allclose(precomputed_vector, newly_generated_vector, atol=1e-6)

    print(f"\nVerification successful: {are_vectors_the_same}")

    # Print the actual difference between the vectors
    vector_difference = precomputed_vector - newly_generated_vector
    print(f"\nMax difference between any two elements: {np.max(np.abs(vector_difference))}")

    # You can also check their similarity score, which should be ~1.0
    similarity = cos_sim(precomputed_vector, newly_generated_vector)
    print(f"Cosine Similarity: {similarity.item()}")

check_vectorized_descr(10)

Original text at index 10: 'Suricata: Alert - ET POLICY GNU/Linux APT User-Agent Outbound likely related to package management'

Verification successful: True

Max difference between any two elements: 7.07896106177941e-08
Cosine Similarity: 1.0
