In [1]:
import pandas as pd
import re

In [12]:
with open('./data/test.txt', 'r') as f:
    log_data = f.read()

In [None]:
print(log_data)

In [38]:
from typing import List, Dict

def normalize_audit_log(data: str) -> List[Dict]:
    lines = data.strip().split('\n')
    events = []

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # 1. แยก type และส่วน msg=audit(...)
        header_match = re.match(r'type=([A-Z_]+) msg=audit\([^)]+\): (.*)', line)
        if not header_match:
            continue

        event_type = header_match.group(1)
        rest = header_match.group(2)

        # 2. Dictionary สำหรับเก็บ key-value ทั้งหมด
        pairs: Dict[str, str] = {}

        # 3. จับ key=value ที่อยู่นอก msg='' ก่อน (เช่น pid, uid, auid, ...)
        for match in re.finditer(r'(\w+)=([^ ]+)(?=\s|$)', rest):
            key, value = match.groups()
            # ลบเครื่องหมาย " หรือ ' ที่อาจหลงเหลือ
            value = value.strip('"').strip("'")
            pairs[key] = value

        # 4. จับส่วน msg='...' หรือ msg="..." แล้วแยก key=value ภายใน
        msg_match = re.search(r"msg=['\"](.*?)['\"](?=\s|$|\w+=)", rest, re.DOTALL)
        if msg_match:
            msg_content = msg_match.group(1)
            # แยก key=value ภายใน msg (รองรับทั้ง quoted และ unquoted)
            i = 0
            while i < len(msg_content):
                # ข้าม whitespace
                while i < len(msg_content) and msg_content[i].isspace():
                    i += 1
                if i >= len(msg_content):
                    break

                # หา key=
                eq_pos = msg_content.find('=', i)
                if eq_pos == -1:
                    break
                key = msg_content[i:eq_pos].strip()
                i = eq_pos + 1

                # ดูว่าค่าเริ่มด้วย quote หรือไม่
                if i < len(msg_content) and msg_content[i] in ("'", '"'):
                    quote = msg_content[i]
                    i += 1
                    end_quote = msg_content.find(quote, i)
                    if end_quote == -1:
                        end_quote = len(msg_content)
                    value = msg_content[i:end_quote]
                    i = end_quote + 1
                else:
                    # ไม่มี quote → จนถึง space หรือจบ
                    end = msg_content.find(' ', i)
                    if end == -1:
                        end = len(msg_content)
                    value = msg_content[i:end]
                    i = end

                pairs[key] = value.strip()

        # 5. สร้าง event ที่ standardize แล้ว
        event = {
            "event_type": event_type,
            "pid": pairs.get("pid"),
            "uid": pairs.get("uid"),
            "auid": pairs.get("auid"),
            "ses": pairs.get("ses"),
            "exe": pairs.get("exe"),
            "acct": pairs.get("acct"),
            "op": pairs.get("op"),
            "unit": pairs.get("unit"),
            "comm": pairs.get("comm"),
            "cmd": pairs.get("cmd"),               # จะได้ hex หรือ plain text
            "cwd": pairs.get("cwd"),
            "terminal": pairs.get("terminal"),
            "hostname": pairs.get("hostname"),
            "addr": pairs.get("addr"),
            "res": pairs.get("res"),
            "success": pairs.get("success"),       # สำหรับ SYSCALL
            "proctitle": pairs.get("proctitle"),   # hex หรือ decoded
        }

        # เพิ่ม field เฉพาะบาง type ถ้าต้องการ
        if event_type in ("SYSCALL", "LOGIN"):
            event["syscall"] = pairs.get("syscall")
            event["arch"] = pairs.get("arch")

        events.append(event)

    return events

In [46]:
events = normalize_audit_log(log_data)
df = pd.DataFrame(events)

In [48]:
df[:3]

Unnamed: 0,event_type,pid,uid,auid,ses,exe,acct,op,unit,comm,cmd,cwd,terminal,hostname,addr,res,success,proctitle,syscall,arch
0,CRED_REFR,2861,1000,1000,2,/usr/bin/sudo,root,PAM:setcred,,,,,/dev/tty1,front,?,"success'UID=""user",,,,
1,USER_START,2861,1000,1000,2,/usr/bin/sudo,root,PAM:session_open,,,,,/dev/tty1,front,?,"success'UID=""user",,,,
2,USER_END,2861,1000,1000,2,/usr/bin/sudo,root,PAM:session_close,,,,,/dev/tty1,front,?,"success'UID=""user",,,,


In [53]:
useful_cols = [
    'event_type', 'uid', 'auid', 'exe', 'terminal', 'res', 'op', 'acct'
]
df = df[useful_cols + [c for c in df.columns if c not in useful_cols]]

In [57]:
df[:3]

Unnamed: 0,event_type,uid,auid,exe,terminal,res,op,acct,success,pid,ses,unit,comm,cmd,cwd,hostname,addr,proctitle,syscall,arch
0,CRED_REFR,1000,1000,/usr/bin/sudo,/dev/tty1,"success'UID=""user",PAM:setcred,root,,2861,2,,,,,front,?,,,
1,USER_START,1000,1000,/usr/bin/sudo,/dev/tty1,"success'UID=""user",PAM:session_open,root,,2861,2,,,,,front,?,,,
2,USER_END,1000,1000,/usr/bin/sudo,/dev/tty1,"success'UID=""user",PAM:session_close,root,,2861,2,,,,,front,?,,,


In [None]:
# ภาพรวม
print(df['event_type'].value_counts())

event_type
SERVICE_START    4
CRED_REFR        3
USER_START       3
SERVICE_STOP     3
USER_END         2
CRED_DISP        2
USER_ACCT        2
USER_CMD         1
CRED_ACQ         1
LOGIN            1
SYSCALL          1
PROCTITLE        1
Name: count, dtype: int64


In [None]:
# sudo
sudo_events = df[df['exe'].str.contains('sudo', na=False)]
print(sudo_events[['uid', 'auid', 'res']])

    uid  auid                res
0  1000  1000  success'UID="user
1  1000  1000  success'UID="user
2  1000  1000  success'UID="user
3  1000  1000  success'UID="user
6  1000  1000  success'UID="user
7  1000  1000  success'UID="user
8  1000  1000  success'UID="user
9  1000  1000  success'UID="user


In [None]:
# คำสั่งที่รันบ่อย
cmd_count = df['cmd'].value_counts().head(10)
print("คำสั่งที่รันบ่อย:\n", cmd_count)

คำสั่งที่รันบ่อย:
 cmd
7461696C202D66202F7661722F6C6F672F61756469742F61756469742E6C6F67    1
Name: count, dtype: int64


In [None]:
# Event ที่ล้มเหลว
failed = df[df['res'].str.contains('fail|no|error', na=False, case=False)]
print("เหตุการณ์ล้มเหลว:\n", failed[['event_type', 'exe', 'cmd']])

เหตุการณ์ล้มเหลว:
 Empty DataFrame
Columns: [event_type, exe, cmd]
Index: []


In [None]:
# User ธรรมดาเปลี่ยนไป root
root_acct = df[df['acct'] == 'root']
print("พยายามเปลี่ยนเป็น root:\n", root_acct[['uid', 'auid', 'exe', 'cmd']])

พยายามเปลี่ยนเป็น root:
      uid        auid             exe  cmd
0   1000        1000   /usr/bin/sudo  NaN
1   1000        1000   /usr/bin/sudo  NaN
2   1000        1000   /usr/bin/sudo  NaN
3   1000        1000   /usr/bin/sudo  NaN
8   1000        1000   /usr/bin/sudo  NaN
9   1000        1000   /usr/bin/sudo  NaN
10     0  4294967295  /usr/sbin/cron  NaN
11     0  4294967295  /usr/sbin/cron  NaN
15     0           0  /usr/sbin/cron  NaN
16     0           0  /usr/sbin/cron  NaN
17     0           0  /usr/sbin/cron  NaN
18     0           0  /usr/sbin/cron  NaN


In [67]:
suspicious = df[
    # ตัวอย่าง rule
    (df['exe'].str.contains('sudo', na=False)) &
    (df['uid'] != '0') &
    (df['auid'] != '0') &
    (~df['cmd'].str.contains('tail|less|cat|nano|vim', na=True, case=False))
]
print("อาจน่าสงสัย:", suspicious)

อาจน่าสงสัย:   event_type   uid  auid            exe terminal                res   op acct  \
7   USER_CMD  1000  1000  /usr/bin/sudo     tty1  success'UID="user  NaN  NaN   

  success   pid ses unit comm  \
7     NaN  2870   2  NaN  NaN   

                                                 cmd         cwd hostname  \
7  7461696C202D66202F7661722F6C6F672F61756469742F...  /home/user      NaN   

  addr proctitle syscall arch  
7  NaN       NaN     NaN  NaN  
