In [7]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import pytz

# Directory the processed parquet and labels.csv are read from (and results are written to).
extract_dir = "extracted_dataset"
# File name of the processed logs inside `extract_dir`.
parquet_processed_filename = "processed_dataset.parquet"
# Directory tree that is walked for per-scenario "*_alerts.txt" label files.
labels_dir = os.path.join("alert-data-set", "alerts_csv")

In [8]:
# Read the processed log dataset from parquet and preview its first rows.
processed_dataset_path = os.path.join(extract_dir, parquet_processed_filename)
df = pd.read_parquet(processed_dataset_path)

df.head()

Unnamed: 0,full_log,@timestamp,location,id,filename,agent_ip,data_srcip,rule_firedtimes,rule_level,rule_pci_dss,rule_tsc,rule_description,rule_groups,rule_id,rule_nist_800_53,rule_gdpr
0,Jan 15 02:32:32 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,1,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d]
1,Jan 15 02:32:32 taylorcruz-mail freshclam[2851...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,192.168.128.170,,2,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d]
2,Jan 15 02:32:37 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:37.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,3,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d]
3,Jan 15 02:32:42 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:42.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,4,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d]
4,Jan 15 02:32:47 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:47.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,5,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d]


In [16]:
def convert_timestamp(timestp):
    """Convert an ISO-8601 timestamp string to Unix epoch seconds.

    Args:
        timestp: Timestamp string in the format ``%Y-%m-%dT%H:%M:%S.%f%z``,
            e.g. ``2022-01-21T00:17:54.308261+0000`` or
            ``2022-01-15T02:32:32.000000Z``.

    Returns:
        float: Seconds since the Unix epoch, including the fractional part.

    Raises:
        ValueError: If ``timestp`` does not match the expected format.
    """
    # The format requires %z, so strptime always returns a timezone-aware
    # datetime and .timestamp() already honours the parsed UTC offset.
    # Do NOT overwrite tzinfo with UTC here: replace() relabels the wall-clock
    # time instead of converting it, which would shift any timestamp carrying
    # a non-zero offset by exactly that offset.
    log_time = datetime.strptime(timestp, "%Y-%m-%dT%H:%M:%S.%f%z")
    return log_time.timestamp()

In [15]:
# ---------------- Load labels.csv ----------------
# Reads the labels file from `extract_dir` when it is present.
# NOTE(review): when the file is missing only a message is printed, so
# `labels_df` stays undefined and later cells that use it will fail.
labels_path = os.path.join(extract_dir, "labels.csv")
if not os.path.exists(labels_path):
    print("labels.csv not found inside inner zip")
else:
    labels_df = pd.read_csv(labels_path, sep=",")

In [25]:
# Work on a copy so the loaded `df` is not modified by the new columns.
logs_df = df.copy()
logs_df['unix_timestamp'] = logs_df['@timestamp'].apply(convert_timestamp)

# ---  OPTIMIZED LABELING ---

# Step A: Build the lookup {scenario: [(start, end, attack), ...]} once, so each
# group of logs only has to scan the intervals of its own scenario.
print("Creating an optimized lookup dictionary for labels...")
labels_dict = {
    scenario: list(
        scenario_rows[['start', 'end', 'attack']].itertuples(index=False, name=None)
    )
    for scenario, scenario_rows in labels_df.groupby('scenario')
}

# Step B: Define labeling function
def find_attack_label(timestamp, intervals):
    """Return the label of the first interval that contains ``timestamp``.

    Args:
        timestamp: Value compared against the interval bounds.
        intervals: Iterable of ``(start, end, attack_label)`` tuples; both
            bounds are inclusive and earlier entries take precedence.

    Returns:
        The ``attack_label`` of the first matching interval, or
        ``'false_positive'`` when no interval matches.
    """
    matching_labels = (
        label for lower, upper, label in intervals if lower <= timestamp <= upper
    )
    return next(matching_labels, 'false_positive')

# Step C: Define the group function
def get_labels_for_group(group):
    """
    Label every row of one groupby group and return ONLY the Series of labels.

    The group's key is read from ``group.name`` and used to look up that
    scenario's intervals in the module-level ``labels_dict``. A scenario with
    no entry gets an empty interval list, so ``find_attack_label`` returns
    'false_positive' for all of its rows.
    """
    scenario_intervals = labels_dict.get(group.name, [])

    # Element-wise lookup; the returned Series keeps the group's index.
    return group['unix_timestamp'].apply(
        lambda ts: find_attack_label(ts, scenario_intervals)
    )

print("Applying labels to logs using the grouped method...")
# Step D: Apply the function per scenario and assign the result to logs_df.
# - `group_keys=False` prevents pandas from adding the group names ('filename')
#   to the index of the result, so the labels keep the original row index.
# - Selecting `[['unix_timestamp']]` before `.apply` hands the callback only the
#   column it reads. Without the selection, apply operates on the grouping
#   column as well, which is deprecated and emits a warning on every run.
#   `group.name` is still set to the group key on each sub-frame.
attack_label_series = (
    logs_df
    .groupby('filename', group_keys=False)[['unix_timestamp']]
    .apply(get_labels_for_group)
)

# The Series is indexed like logs_df, so assignment aligns row by row.
logs_df['attack_label'] = attack_label_series

# --- VERIFY THE RESULTS ---

# Distribution of the assigned labels.
print(
    "\n--- Summary of Assigned Labels in original logs_df ---",
    logs_df['attack_label'].value_counts(),
    sep="\n",
)

# First rows, rendered without column truncation.
print(
    "\n--- Final logs_df with new column ---",
    logs_df.head().to_string(),
    sep="\n",
)

Creating an optimized lookup dictionary for labels...
Applying labels to logs using the grouped method...


  attack_label_series = logs_df.groupby('filename', group_keys=False).apply(get_labels_for_group)



--- Summary of Assigned Labels in original logs_df ---
attack_label
dirb                    1671940
false_positive           882739
wpscan                    28021
dnsteal                    8603
cracking                   5271
service_scans              1768
network_scans              1570
privilege_escalation        158
webshell                    109
reverse_shell                80
service_stop                  4
Name: count, dtype: int64

--- Final logs_df with new column ---
                                                                                                                                  full_log                   @timestamp         location            id filename         agent_ip data_srcip  rule_firedtimes  rule_level rule_pci_dss rule_tsc        rule_description                rule_groups rule_id rule_nist_800_53    rule_gdpr  unix_timestamp    attack_label
0             Jan 15 02:32:32 mail freshclam[29266]: Sat Jan 15 02:32:32 2022 -> ClamAV update process sta

In [26]:
# Preview the first five labelled rows (rich display of the last expression).
logs_df.head(n=5)

Unnamed: 0,full_log,@timestamp,location,id,filename,agent_ip,data_srcip,rule_firedtimes,rule_level,rule_pci_dss,rule_tsc,rule_description,rule_groups,rule_id,rule_nist_800_53,rule_gdpr,unix_timestamp,attack_label
0,Jan 15 02:32:32 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,1,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive
1,Jan 15 02:32:32 taylorcruz-mail freshclam[2851...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,192.168.128.170,,2,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive
2,Jan 15 02:32:37 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:37.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,3,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive
3,Jan 15 02:32:42 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:42.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,4,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive
4,Jan 15 02:32:47 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:47.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,5,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive


In [29]:
# Persist the interval-labelled logs next to the processed dataset.
# NOTE(review): a later cell writes `merged_df` to this same path, so
# whichever cell runs last determines the file's contents.
labelled_output_path = os.path.join(extract_dir, "processed_dataset_with_labels.parquet")
logs_df.to_parquet(path=labelled_output_path)

In [4]:
# Collect every per-scenario "*.txt" label file under `labels_dir` into one frame.
all_labels_dfs = []

print(f"Starting to load label files from: {labels_dir}")

for root, dirs, files in os.walk(labels_dir):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # `dirs` in place (which steers the top-down traversal) and iterating the
    # files sorted keeps the row order of the combined frame reproducible.
    # That matters because a later cell keeps only the FIRST row per id.
    dirs.sort()
    for f in sorted(files):
        if f.endswith(".txt"):
            csv_path = os.path.join(root, f)
            try:
                csv_df = pd.read_csv(csv_path, sep=',')
                # The scenario is the file-name prefix before the first "_",
                # e.g. "fox_alerts.txt" -> "fox".
                scenario_name = f.split("_")[0]
                csv_df['scenario'] = scenario_name
                all_labels_dfs.append(csv_df)
                print(f"Loaded {f} ({len(csv_df)} rows) from scenario: {scenario_name}")

            except Exception as e:
                # Best effort: report the unreadable file and continue with the rest.
                print(f"Could not read {f} at {csv_path}. Skipping. Error: {e}")

if all_labels_dfs:
    labels_data = pd.concat(all_labels_dfs, ignore_index=True)

    print("\n--- Combined Labels Dataset ---")
    print(f"Total rows loaded: {len(labels_data)}")
    print("Column information:")
    labels_data.info()
    print("\nFirst 5 rows of combined labels_data:")
    print(labels_data.head())
else:
    # NOTE(review): `labels_data` stays undefined in this branch, so cells
    # that use it will fail until label files are available.
    print("\nNo label files (.txt) were found in the specified directory.")

Starting to load label files from: alert-data-set\alerts_csv
Loaded fox_alerts.txt (462523 rows) from scenario: fox
Loaded harrison_alerts.txt (583754 rows) from scenario: harrison
Loaded russellmitchell_alerts.txt (41488 rows) from scenario: russellmitchell
Loaded santos_alerts.txt (126513 rows) from scenario: santos
Loaded shaw_alerts.txt (68539 rows) from scenario: shaw
Loaded wardbeck_alerts.txt (88204 rows) from scenario: wardbeck
Loaded wheeler_alerts.txt (603939 rows) from scenario: wheeler
Loaded wilson_alerts.txt (625303 rows) from scenario: wilson

--- Combined Labels Dataset ---
Total rows loaded: 2600263
Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2600263 entries, 0 to 2600262
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           float64
 1   time         int64  
 2   name         object 
 3   ip           object 
 4   host         object 
 5   short        object 
 6   time_label   object 
 7   event

In [11]:
merge_key = 'id'

# Columns to keep from the labels_data (right side).
# We keep the original 'time' and the label columns for the final dataset.
desired_label_columns = ['time', 'name', 'short', 'time_label']

# Build the minimal right-hand table:
# - drop rows without an id first: pandas matches null keys against each other,
#   so a single NaN-id label row would otherwise be attached to every log row
#   whose id is NaN;
# - then de-duplicate on the id, keeping the first occurrence, to protect
#   against the label source containing duplicate rows for one id.
labels_to_merge = (
    labels_data[[merge_key] + desired_label_columns]
    .dropna(subset=[merge_key])
    .drop_duplicates(subset=[merge_key], keep='first')
)

# Perform the single-key left merge.
merged_df = pd.merge(
    df,                          # logs (keep all rows)
    labels_to_merge,             # labels
    on=merge_key,                # use the common 'id' column
    how='left',
    suffixes=('_log', '_label'),  # distinguishes any conflicting column names
    validate='many_to_one',      # fail loudly if label ids are ever not unique
)
# Unmatched logs have a missing 'time'; the nullable Int64 dtype keeps the
# matched values as integers instead of floats.
merged_df['time'] = merged_df['time'].astype('Int64')

# --- 3. OUTPUT ---

print("--- Resulting Merged DataFrame ---")
merged_df.info()

--- Resulting Merged DataFrame ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2600263 entries, 0 to 2600262
Data columns (total 20 columns):
 #   Column            Dtype  
---  ------            -----  
 0   full_log          object 
 1   @timestamp        object 
 2   location          object 
 3   id                float64
 4   filename          object 
 5   agent_ip          object 
 6   data_srcip        object 
 7   rule_firedtimes   int64  
 8   rule_level        int64  
 9   rule_pci_dss      object 
 10  rule_tsc          object 
 11  rule_description  object 
 12  rule_groups       object 
 13  rule_id           object 
 14  rule_nist_800_53  object 
 15  rule_gdpr         object 
 16  time              Int64  
 17  name              object 
 18  short             object 
 19  time_label        object 
dtypes: Int64(1), float64(1), int64(2), object(16)
memory usage: 399.2+ MB


In [14]:
# Number of rows whose 'time_label' is missing (null/NaN).
missing_time_label_mask = merged_df['time_label'].isna()
none_time_label_count = missing_time_label_mask.sum()

# Report the count, then show the full label distribution including NaN.
print(f"Number of logs with a 'None' time_label: {none_time_label_count}")

merged_df['time_label'].value_counts(dropna=False)

Number of logs with a 'None' time_label: 750159


time_label
dirb                    1155766
NaN                      750159
false_positive           662608
wpscan                    19521
dnsteal                    6042
cracking                   3685
network_scans              1116
service_scans              1113
privilege_escalation        117
webshell                     81
reverse_shell                51
service_stop                  4
Name: count, dtype: int64

In [22]:
# Preview the first five merged rows (rich display of the last expression).
merged_df.head(n=5)

Unnamed: 0,full_log,@timestamp,location,id,filename,agent_ip,data_srcip,rule_firedtimes,rule_level,rule_pci_dss,rule_tsc,rule_description,rule_groups,rule_id,rule_nist_800_53,rule_gdpr,time,name,short,time_label
0,Jan 15 02:32:32 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,1,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642213952,Wazuh: ClamAV database update,W-Sys-Cav,false_positive
1,Jan 15 02:32:32 taylorcruz-mail freshclam[2851...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,192.168.128.170,,2,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642213952,Wazuh: ClamAV database update,W-Sys-Cav,false_positive
2,Jan 15 02:32:37 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:37.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,3,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642213957,Wazuh: ClamAV database update,W-Sys-Cav,false_positive
3,Jan 15 02:32:42 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:42.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,4,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642213962,Wazuh: ClamAV database update,W-Sys-Cav,false_positive
4,Jan 15 02:32:47 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:47.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,5,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642213967,Wazuh: ClamAV database update,W-Sys-Cav,false_positive


In [7]:
# Persist the id-merged logs next to the processed dataset.
# NOTE(review): an earlier cell writes `logs_df` to this same path, so this
# call overwrites that file when both cells are run.
merged_output_path = os.path.join(extract_dir, "processed_dataset_with_labels.parquet")
merged_df.to_parquet(path=merged_output_path)