# Labels processing

- Script 3

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import pytz

extract_dir = "extracted_dataset"
parquet_processed_filename = "processed_dataset.parquet"

In [2]:
df = pd.read_parquet(os.path.join(extract_dir, parquet_processed_filename))

df.head()

Unnamed: 0,full_log,@timestamp,location,id,filename,agent_ip,data_srcip,rule_firedtimes,rule_level,rule_pci_dss,rule_tsc,rule_description,rule_groups,rule_id,rule_nist_800_53,rule_gdpr
0,Jan 15 02:32:32 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,1,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d]
1,Jan 15 02:32:32 taylorcruz-mail freshclam[2851...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,192.168.128.170,,2,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d]
2,Jan 15 02:32:37 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:37.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,3,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d]
3,Jan 15 02:32:42 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:42.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,4,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d]
4,Jan 15 02:32:47 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:47.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,5,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d]


In [3]:
def convert_timestamp(timestp):
    log_time = datetime.strptime(timestp, "%Y-%m-%dT%H:%M:%S.%f%z") # 2022-01-21T00:17:54.308261+0000
    log_time = log_time.replace(tzinfo=pytz.utc)
    log_time = log_time.timestamp()
    
    return log_time

In [4]:
# ---------------- Load labels.csv ----------------
labels_path = os.path.join(extract_dir, "labels.csv")
if os.path.exists(labels_path):
    labels_df = pd.read_csv(labels_path, sep=",")
else:
    print("labels.csv not found inside inner zip")

In [5]:
logs_df = df.copy()
logs_df['unix_timestamp'] = logs_df['@timestamp'].apply(convert_timestamp)

# ---  OPTIMIZED LABELING ---

# Step A: Create lookup dictionary
print("Creating an optimized lookup dictionary for labels...")
labels_dict = {}
for scenario, group in labels_df.groupby('scenario'):
    labels_dict[scenario] = list(group[['start', 'end', 'attack']].itertuples(index=False, name=None))

# Step B: Define labeling function
def find_attack_label(timestamp, intervals):
    for start, end, attack_label in intervals:
        if start <= timestamp <= end:
            return attack_label
    return 'false_positive'

# Step C: Define the group function
def get_labels_for_group(group):
    """
    This function now returns ONLY the Series of labels for the group.
    """
    scenario_name = group.name
    intervals_for_scenario = labels_dict.get(scenario_name, [])
    
    # Return just the result of the .apply() call
    return group['unix_timestamp'].apply(
        find_attack_label,
        args=(intervals_for_scenario,)
    )

print("Applying labels to logs using the grouped method...")
# Step D: Apply the function and assign the result directly to logs_df (MODIFIED call)
# The `group_keys=False` argument is key here. It prevents pandas from
# adding the group names ('filename') to the index of the result.
attack_label_series = logs_df.groupby('filename', group_keys=False).apply(get_labels_for_group)

# Now, assign this perfectly aligned Series as a new column in the original logs_df
logs_df['type_attack_label'] = attack_label_series
logs_df ['attack_label'] = np.where(logs_df['type_attack_label'] != 'false_positive', 'attack', 'benign')

# --- VERIFY THE RESULTS ---

print("\n--- Summary of Assigned Labels in original logs_df ---")
print(logs_df['type_attack_label'].value_counts())

print("\n--- Final logs_df with new column ---")
print(logs_df.head().to_string())

Creating an optimized lookup dictionary for labels...
Applying labels to logs using the grouped method...


  attack_label_series = logs_df.groupby('filename', group_keys=False).apply(get_labels_for_group)



--- Summary of Assigned Labels in original logs_df ---
type_attack_label
dirb                    1671940
false_positive           882739
wpscan                    28021
dnsteal                    8603
cracking                   5271
service_scans              1768
network_scans              1570
privilege_escalation        158
webshell                    109
reverse_shell                80
service_stop                  4
Name: count, dtype: int64

--- Final logs_df with new column ---
                                                                                                                                  full_log                   @timestamp         location            id filename         agent_ip data_srcip  rule_firedtimes  rule_level rule_pci_dss rule_tsc        rule_description                rule_groups rule_id rule_nist_800_53    rule_gdpr  unix_timestamp type_attack_label attack_label
0             Jan 15 02:32:32 mail freshclam[29266]: Sat Jan 15 02:32:32 2022 -> ClamA

In [6]:
logs_df.head()

Unnamed: 0,full_log,@timestamp,location,id,filename,agent_ip,data_srcip,rule_firedtimes,rule_level,rule_pci_dss,rule_tsc,rule_description,rule_groups,rule_id,rule_nist_800_53,rule_gdpr,unix_timestamp,type_attack_label,attack_label
0,Jan 15 02:32:32 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,1,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
1,Jan 15 02:32:32 taylorcruz-mail freshclam[2851...,2022-01-15T02:32:32.000000Z,/var/log/syslog,1686147000.0,fox,192.168.128.170,,2,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
2,Jan 15 02:32:37 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:37.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,3,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
3,Jan 15 02:32:42 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:42.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,4,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign
4,Jan 15 02:32:47 mail freshclam[29266]: Sat Jan...,2022-01-15T02:32:47.000000Z,/var/log/syslog,1686147000.0,fox,172.17.131.81,,5,3,[5.2],[A1.2],ClamAV database update,"[clamd, freshclam, virus]",52507,[SI.3],[IV_35.7.d],1642214000.0,false_positive,benign


In [7]:
logs_df.to_parquet(path=os.path.join(extract_dir, "processed_dataset_with_labels.parquet"))