In [3]:
import os
import pandas as pd


In [4]:
import re

def split_log(log: str):
    # Priority patterns (put <*> at the top to match it first)
    patterns = [
        r'<\*>',                                      # The <*> token
        r'\b\d{1,3}(?:\.\d{1,3}){3}\b',               # IP addresses
        r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b',             # Email addresses
        r'\b\d{4}-\d{2}-\d{2}\b',                     # Date: YYYY-MM-DD
        r'\b\d{2}/\d{2}/\d{4}\b',                     # Date: DD/MM/YYYY
        r'\b\d{2}:\d{2}:\d{2}\b',                     # Time: HH:MM:SS
        r'\b\d+\b',                                   # Numbers
        r'\b\w+\b',                                   # Alphanumeric words
        r'\S'                                         # Other non-whitespace characters
    ]

    # Compile the full regex
    combined_pattern = '|'.join(patterns)

    # Find all tokens
    tokens = re.findall(combined_pattern, log)

    return tokens


In [5]:
paths = [
    r"nulog\logs\Linux\Linux_2k.log_templates.csv",
    r"nulog\logs\Andriod\Andriod_2k.log_templates.csv",     
    r"nulog\logs\Apache\Apache_2k.log_templates.csv",
    r"nulog\logs\BGL\BGL_2k.log_templates.csv",
    r"nulog\logs\Hadoop\Hadoop_2k.log_templates.csv",
    r"nulog\logs\HDFS\HDFS_2k.log_templates.csv",
    r"nulog\logs\HealthApp\HealthApp_2k.log_templates.csv",
    r"nulog\logs\HPC\HPC_2k.log_templates.csv",
    r"nulog\logs\Mac\Mac_2k.log_templates.csv",
    r"nulog\logs\OpenSSH\OpenSSH_2k.log_templates.csv",
    r"nulog\logs\OpenStack\OpenStack_2k.log_templates.csv",
    r"nulog\logs\Proxifier\Proxifier_2k.log_templates.csv",
    r"nulog\logs\Spark\Spark_2k.log_templates.csv",
    r"nulog\logs\Thunderbird\Thunderbird_2k.log_templates.csv",
    r"nulog\logs\Windows\Windows_2k.log_templates.csv",
    r"nulog\logs\Zookeeper\Zookeeper_2k.log_templates.csv"
    ]



In [6]:
example = df["EventTemplate"][12]
example

NameError: name 'df' is not defined

In [7]:
split_log(example)

NameError: name 'example' is not defined

In [20]:
dictionary = set()

In [21]:
for path in paths:

    df = pd.read_csv(path)

    for log in df["EventTemplate"]:
        tokens = split_log(log)
        
        for token in tokens:
            if token != '<*>':
                dictionary.add(token)

In [22]:
len(dictionary)

3193

In [11]:
import hashlib
import socket
import struct
import re

def stable_hash(value: str, mod: int, offset: int) -> int:
    """Hash a string value to a range [offset, offset+mod-1]."""
    h = int(hashlib.md5(value.encode()).hexdigest(), 16)
    return offset + (h % mod)


In [12]:
def hash_ip(ip: str) -> int:
    """Hash IP address into [1000–1999]"""
    try:
        packed_ip = struct.unpack("!I", socket.inet_aton(ip))[0]
        return 10000 + (packed_ip % 3000)
    except Exception:
        return stable_hash(ip, 3000, 10000)

def hash_email(email: str) -> int:
    """Hash email into [2000–2999]"""
    return stable_hash(email.lower(), 3000, 13000)

def hash_large_number(number_str: str) -> int:
    """Hash numbers > 10000 into [3000–3999]"""
    return stable_hash(number_str, 3000, 16000)

def hash_date(date_str: str) -> int:
    """Hash date string into [4000–4999]"""
    return stable_hash(date_str, 3000, 19000)

def hash_oov_word(word: str) -> int:
    """Hash unknown word into [5000–5999]"""
    return stable_hash(word.lower(), 3000, 22000)

def hash_unknown(token: str) -> int:
    """Catch-all hash into [6000–6999]"""
    return stable_hash(token, 3000, 25000)


In [13]:
print(hash_ip("192.168.0.1"))         # ➜ 1000–1999
print(hash_email("admin@test.com"))   # ➜ 2000–2999
print(hash_large_number("123456"))    # ➜ 3000–3999
print(hash_date("2024-04-08"))        # ➜ 4000–4999
print(hash_oov_word("john123"))       # ➜ 5000–5999
print(hash_unknown("!!!"))            # ➜ 6000–6999


12521
15906
18358
20214
22835
25226


In [14]:
def hash_token(token: str) -> tuple[str, int]:
    token = token.strip()

    # Detect IP
    ip_pattern = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$')
    if ip_pattern.match(token):
        return "ip", hash_ip(token)

    # Detect email
    email_pattern = re.compile(r'^[\w\.-]+@[\w\.-]+\.\w+$')
    if email_pattern.match(token):
        return "email", hash_email(token)

    # Detect date
    date_patterns = [
        r'^\d{4}-\d{2}-\d{2}$',           # 2025-04-08
        r'^\d{2}/\d{2}/\d{4}$',           # 08/04/2025
        r'^\d{4}/\d{2}/\d{2}$',           # 2025/04/08
        r'^\d{2}-\d{2}-\d{4}$',           # 08-04-2025
        r'^\d{4}\.\d{2}\.\d{2}$',         # 2025.04.08
    ]
    for pat in date_patterns:
        if re.match(pat, token):
            return "date", hash_date(token)

    # Detect large number
    if token.isdigit() and int(token) > 10000:
        return "large_number", hash_large_number(token)

    # Detect OOV word (contains letters and/or numbers)
    if re.match(r'^[\w\-]+$', token):
        return "word", hash_oov_word(token)

    # Fallback
    return "other", hash_unknown(token)


In [15]:
examples = [
    "192.168.0.1",
    "admin@example.com",
    "2025-04-08",
    "123456",
    "john123",
    "<*>"
]

for t in examples:
    kind, hashed = hash_token(t)
    print(f"{t} → {kind} → {hashed}")

192.168.0.1 → ip → 12521
admin@example.com → email → 13745
2025-04-08 → date → 20061
123456 → large_number → 18358
john123 → word → 22835
<*> → other → 26512


In [None]:
def log_to_token_seq(log):
    tokens = split_log(log)
    return [hash_token(t)[1] for t in tokens]
        