In [7]:
import pandas as pd

# === 1. File paths (Windows-safe raw strings) ===
alert_file = r"C:\Users\USER\Desktop\Capstone\GeneratedLabelledFlows\TrafficLabelling\IMP\SNORT\alert.csv"
cicids_file = r"C:\Users\USER\Desktop\Capstone\GeneratedLabelledFlows\TrafficLabelling\IMP\SNORT\Wednesday-workingHours-preprocessed.csv"
true_labels_file = r"C:\Users\USER\Desktop\Capstone\GeneratedLabelledFlows\TrafficLabelling\IMP\SNORT\true_labels.csv"

# === 2. Snort alert layout (alert.csv has no header row) ===
num_cols = 27  # adjust if necessary
col_names = [f"col{i}" for i in range(num_cols)]

# Positional alert columns that carry the flow 5-tuple.
alert_renames = {
    "col5": "protocol",   # TCP/UDP
    "col6": "src_ip",
    "col7": "src_port",
    "col8": "dst_ip",
    "col9": "dst_port",
}

# Map protocol text to CICIDS numeric codes
proto_map = {'TCP': 6, 'UDP': 17}

# === 3. CICIDS chunked reading settings ===
merge_keys = ['src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol']
use_cols = ['Source IP', 'Source Port', 'Destination IP', 'Destination Port', 'Protocol', 'Label']
dtype = {
    'Source IP': 'string',
    'Destination IP': 'string',
    'Source Port': 'Int32',
    'Destination Port': 'Int32',
    'Protocol': 'Int8',
    'Label': 'string'
}
# CICIDS column names -> the names used on the alert side.
flow_renames = {
    'Source IP': 'src_ip',
    'Source Port': 'src_port',
    'Destination IP': 'dst_ip',
    'Destination Port': 'dst_port',
    'Protocol': 'protocol',
    'Label': 'label',
}
chunk_size = 100000  # adjust for available memory


def _normalize_protocol(values):
    """Return *values* as protocol-code strings comparable across both files.

    Names listed in ``proto_map`` are translated to their numeric code
    (case-insensitively, after stripping whitespace); anything else is kept
    as its stripped text. Whole-number floats such as ``6.0`` are rendered
    as ``"6"``.

    Mapping names to integers and stringifying afterwards is not safe: a
    single unmapped protocol turns the mapped column into floats, so TCP
    becomes ``"6.0"`` and never equals the ``"6"`` produced from the CICIDS
    integer column.
    """
    name_to_code = {name.upper(): str(code) for name, code in proto_map.items()}
    text = values.astype(str).str.strip()
    # A numeric column that contained missing values is float, so 6 prints as "6.0".
    text = text.str.replace(r"^(\d+)\.0+$", r"\1", regex=True)
    return text.map(
        lambda token: name_to_code.get(token.upper(), token),
        na_action="ignore",
    )


def _normalize_keys(frame):
    """Coerce the five merge-key columns of *frame* in place and return it.

    IPs become stripped strings, ports become ints (unparseable or missing
    ports become 0), and the protocol goes through ``_normalize_protocol``.
    Callers pass a frame they own (e.g. the result of ``rename``).
    """
    for column in ('src_ip', 'dst_ip'):
        frame[column] = frame[column].astype(str).str.strip()
    for column in ('src_port', 'dst_port'):
        frame[column] = pd.to_numeric(frame[column], errors='coerce').fillna(0).astype(int)
    frame['protocol'] = _normalize_protocol(frame['protocol'])
    return frame


def prepare_alerts(raw_alerts):
    """Name and normalize the 5-tuple columns and add a positional ``alert_id``.

    Args:
        raw_alerts: alert rows whose columns are named ``col0`` ... as in
            ``col_names``.

    Returns:
        A new DataFrame with ``alert_id`` (the former row index) as its
        first column and normalized merge-key columns.
    """
    prepared = _normalize_keys(raw_alerts.rename(columns=alert_renames))
    # Add a stable alert_id for alignment later
    return prepared.reset_index(drop=False).rename(columns={'index': 'alert_id'})


def load_alerts(path):
    """Read the header-less Snort alert CSV at *path* and prepare it for matching."""
    return prepare_alerts(pd.read_csv(path, header=None, names=col_names))


def match_alert_labels(alerts, flow_chunks):
    """Assign each alert the first non-null flow label sharing its 5-tuple.

    Args:
        alerts: output of ``prepare_alerts``; ``alert_id`` must be the row
            position (0 .. len(alerts) - 1).
        flow_chunks: iterable of CICIDS DataFrames carrying the ``use_cols``
            columns, in file order.

    Returns:
        A list with one entry per alert: the matched label, or ``None`` when
        no flow with a non-null label matched.
    """
    labels = [None] * len(alerts)
    # Only alerts that are still unlabeled take part in later merges.
    pending = alerts[['alert_id'] + merge_keys]

    for chunk in flow_chunks:
        if pending.empty:
            break  # every alert already has a label; the rest cannot change anything

        flows = _normalize_keys(chunk.rename(columns=flow_renames))
        # Keep the first labeled flow per 5-tuple so the merge yields at most
        # one row per alert instead of one row per duplicate flow.
        flows = (
            flows.dropna(subset=['label'])
            .drop_duplicates(subset=merge_keys, keep='first')
        )

        hits = pending.merge(flows[merge_keys + ['label']], how='inner', on=merge_keys)
        if hits.empty:
            continue

        for alert_id, label in zip(hits['alert_id'], hits['label']):
            labels[alert_id] = label
        pending = pending[~pending['alert_id'].isin(hits['alert_id'])]

    return labels


# In a notebook cell or when run as a script __name__ is "__main__", so the
# pipeline below still executes and leaves `alerts` / `labels_out` as globals.
if __name__ == "__main__":
    # === Load Snort alerts (no headers) ===
    alerts = load_alerts(alert_file)
    print(f"[INFO] Loaded {len(alerts)} alerts from {alert_file}")

    # === 4. Process CICIDS file in chunks and assign labels ===
    # The context manager closes the file even if matching stops early.
    with pd.read_csv(cicids_file, usecols=use_cols, dtype=dtype, chunksize=chunk_size) as reader:
        labels_out = match_alert_labels(alerts, reader)

    # === 5. Default unmatched alerts to BENIGN ===
    # NOTE(review): this also labels alerts for which no flow was found at all
    # as BENIGN; confirm that is the intended ground truth.
    labels_out = [lbl if lbl is not None else "BENIGN" for lbl in labels_out]

    # === 6. Save aligned labels ===
    pd.DataFrame({'label': labels_out}).to_csv(true_labels_file, index=False)

    print(f"[INFO] Saved aligned true_labels.csv with {len(labels_out)} rows, matches alert.csv row count.")
    print("[INFO] Final label distribution:")
    print(pd.Series(labels_out).value_counts(dropna=False))


[INFO] Loaded 400170 alerts from C:\Users\USER\Desktop\Capstone\GeneratedLabelledFlows\TrafficLabelling\IMP\SNORT\alert.csv
[INFO] Saved aligned true_labels.csv with 400170 rows, matches alert.csv row count.
[INFO] Final label distribution:
BENIGN              240155
DoS Hulk             68439
DoS Slowhttptest     47415
DoS slowloris        44161
Name: count, dtype: int64
