In [1]:
%load_ext lab_black
%cd ..

/home/shim/cev/dl/log-analytics


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    BertConfig,
)
from torch.optim import AdamW
import torch.nn.functional as F
from tqdm import tqdm
import sys
import re
from collections import defaultdict

In [3]:
def first_word(text, deli=" "):
    for i, t in enumerate(text):
        if t == deli:
            break
    return text[:i]

In [4]:
train = pd.read_csv("data/ori/train.csv")

In [5]:
train

Unnamed: 0,id,level,full_log
0,0,0,"Sep 24 10:02:22 localhost kibana: {""type"":""err..."
1,1,0,Feb 8 16:21:00 localhost logstash: [2021-02-0...
2,2,0,"Jan 13 01:50:40 localhost kibana: {""type"":""err..."
3,3,0,"Jan 4 10:18:31 localhost kibana: {""type"":""err..."
4,4,1,type=SYSCALL msg=audit(1603094402.016:52981): ...
...,...,...,...
472967,472967,0,Feb 28 10:10:06 localhost logstash: 7738 error:
472968,472968,1,type=SYSCALL msg=audit(1611890993.458:321827):...
472969,472969,0,"Oct 12 02:20:29 localhost kibana: {""type"":""log..."
472970,472970,0,"Jan 15 09:02:43 localhost kibana: {""type"":""err..."


In [6]:
train.to_numpy()[:, 0]

array([0, 1, 2, ..., 472969, 472970, 472971], dtype=object)

In [8]:
first_word(train.iloc[0]["full_log"])

'Sep'

In [9]:
seasons = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

In [10]:
first_words = set()
for row, (id, level, full_log) in tqdm(train.iterrows(), total=len(train), ncols=100, file=sys.stdout):
    t = first_word(full_log)
    first_words.add(t)
    # break

100%|████████████████████████████████████████████████████| 472972/472972 [00:24<00:00, 19115.08it/s]


In [23]:
first_words

{'--MARK--:',
 '2020',
 '2021',
 'Dec',
 'E:',
 'Feb',
 'File',
 'Jan',
 'Mar',
 'NTFS',
 'Nov',
 'Oct',
 'OpenSCAP',
 'Sep',
 'System',
 'The',
 'Trojaned',
 'Windows',
 'error:',
 'junipe',
 'level',
 'oscap:',
 'ossec:',
 'type=AVC',
 'type=SYSCALL',
 'type=USER_AVC'}

In [31]:
first_word_cnt = defaultdict(int)
for row, (id, level, full_log) in tqdm(train.iterrows(), total=len(train), ncols=100, file=sys.stdout):
    t = first_word(full_log)
    first_word_cnt[(level, t)] += 1
    # break

100%|████████████████████████████████████████████████████| 472972/472972 [00:28<00:00, 16653.52it/s]


In [43]:
for key in sorted(first_word_cnt.keys()):
    print(f"{str(key):20} :", first_word_cnt[key])

(0, '--MARK--:')     : 12
(0, 'Dec')           : 24370
(0, 'E:')            : 297
(0, 'Feb')           : 33114
(0, 'Jan')           : 200574
(0, 'Mar')           : 11368
(0, 'Nov')           : 18980
(0, 'Oct')           : 20973
(0, 'OpenSCAP')      : 3
(0, 'Sep')           : 22392
(0, 'error:')        : 459
(0, 'junipe')        : 4
(0, 'level')         : 1519
(1, 'Dec')           : 1498
(1, 'Feb')           : 2381
(1, 'Jan')           : 3019
(1, 'Mar')           : 804
(1, 'Nov')           : 2646
(1, 'Oct')           : 2904
(1, 'Sep')           : 799
(1, 'System')        : 820
(1, 'Windows')       : 13
(1, 'level')         : 736
(1, 'ossec:')        : 29
(1, 'type=AVC')      : 369
(1, 'type=SYSCALL')  : 116496
(1, 'type=USER_AVC') : 3
(2, 'Sep')           : 1
(2, 'The')           : 11
(3, '2020')          : 9
(3, '2021')          : 22
(3, 'Dec')           : 14
(3, 'Feb')           : 44
(3, 'File')          : 181
(3, 'Jan')           : 3414
(3, 'Mar')           : 1
(3, 'Nov')           :

### 날짜 제거?

만약에 날짜 정보를 제거하면 첫 단어가 어떻게 될지?

In [11]:
def remove_pattern(pattern, full_log):
    for s in re.finditer(pattern, full_log):
        a, b = s.span()
        full_log = (full_log[:a] + full_log[b:]).strip()
    return full_log

In [14]:
def remove_date(full_log):
    t = first_word(full_log)
    if len(t) == 4 and t.isdigit() and t[:2] in ("19", "20", "21"):
        full_log = full_log[5:].strip()

    t = first_word(full_log)
    if len(t) == 3 and t in seasons:
        full_log = full_log[4:].strip()

        t = first_word(full_log)
        if t.isdigit():
            full_log = full_log[len(t) + 1 :].strip()

    # 00:00:00 형식의 시간 이면?
    if re.match(r"\d{2}:\d{2}:\d{2}", full_log):
        full_log = full_log[9:].strip()

    if full_log.startswith("localhost"):
        full_log = full_log[10:].strip()

    # sshd[pid] 에서 pid 제거
    # t = first_word(full_log)
    # if re.match(r"[\w\d]+\[\d+\]", t):
    #    u = first_word(t, deli="[")
    #    full_log = (u + " " + full_log[len(t) + 1 :]).strip()

    # @timestamp: "~~~~Z"
    full_log = remove_pattern(r'"@timestamp"\s?:\s?"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z?",?', full_log)
    # "pid": "4567"
    full_log = remove_pattern(r'"pid"\s?:\s?\d+,?', full_log)
    # [pid]
    full_log = remove_pattern(r"\[\d+\](:,)?", full_log)
    
    # 2021-05-05 추가
    ############################################################################
    # IP주소 + port
    full_log = remove_pattern(r"\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}:\d+,?", full_log)
    full_log = remove_pattern(r"\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3},?", full_log)
    # port
    full_log = remove_pattern(r"port \d{1,5},?", full_log)
    # level
    full_log = remove_pattern(r"level\s*:\s*\d,?", full_log)
    # log
    full_log = remove_pattern(r"log\s*:", full_log)
    # uid, gid
    full_log = remove_pattern(r"[ug]id=\d+,?", full_log)
    # user
    full_log = remove_pattern(r"user=\w+", full_log)
    # logname=
    full_log = remove_pattern(r"logname=", full_log)
    ############################################################################

    full_log = full_log.replace("\n", " ")
    full_log = full_log.replace("\r", " ")
    full_log = re.sub(r"\s+", " ", full_log)

    return full_log

In [15]:
first_words = set()
for row, (id, level, full_log) in tqdm(train.iterrows(), total=len(train), ncols=100, file=sys.stdout):
    full_log = remove_date(full_log)
    t = first_word(full_log)
    first_words.add(t)
    # break

100%|█████████████████████████████████████████████████████| 472972/472972 [00:54<00:00, 8611.62it/s]


In [16]:
first_words

{"'commit",
 '***SCTP',
 '--MARK--:',
 '0,',
 'A',
 'ALG',
 'AMT',
 'APPID',
 'Aborted.',
 'Aborting',
 'Aborting,',
 'Aborting...A',
 'Ack',
 'Acquire',
 'Adding',
 'All',
 'An',
 'Another',
 'AntiSpam:',
 'AntiVirus:',
 'AntiVirus:The',
 'AppTrack',
 'Application',
 'ApplicationProxy:',
 'Assertion',
 'Assumed',
 'Assuming',
 'Attempt',
 'Authenticated',
 'Authentication',
 'Automatic',
 'BFD',
 'BGP',
 'Backup',
 'Bandwidth',
 'Blade',
 'Boot',
 'CB',
 'CFM',
 'CLEAR:',
 'CLI',
 'COSMAN:',
 'Cannot',
 'Certificate',
 'Change',
 'Chassis',
 'Child',
 'Circuit',
 'Clearing',
 'Client',
 'Command',
 'Commandstopped:PID',
 'Commencing',
 'Commit',
 'Committing',
 'Compilation',
 'Cond-Groups:',
 'Connection',
 'Console',
 'Content',
 'Corruption',
 'Could',
 'Counter',
 'DDOS',
 'DFW:',
 'DHCP',
 'Daemon',
 'Database',
 'Decryption',
 'Deleting',
 'Designated',
 'Destination',
 'Detail',
 'Discarded',
 'Dropped',
 'Dropping',
 'Duplicate',
 'Dynamic',
 'E:',
 'ERR:',
 'ERROR:error-messa

In [195]:
first_word_cnt = defaultdict(int)
for row, (id, level, full_log) in tqdm(train.iterrows(), total=len(train), ncols=100, file=sys.stdout):
    full_log = remove_date(full_log)
    t = first_word(full_log)
    first_word_cnt[(level, t)] += 1
    # break

100%|████████████████████████████████████████████████████| 472972/472972 [00:35<00:00, 13260.90it/s]


In [196]:
for key in sorted(first_word_cnt.keys()):
    print(f"{str(key):20} :", first_word_cnt[key])

(0, '--MARK--:')     : 12
(0, 'E:')            : 297
(0, 'OpenSCAP')      : 3
(0, 'auditd:')       : 510
(0, 'augenrules:')   : 8
(0, 'bluetoothd:')   : 2
(0, 'dbus-daemon:')  : 1
(0, 'dbus:')         : 1
(0, 'elasticsearch:') : 7
(0, 'error:')        : 459
(0, 'esild-ml-start.sh:') : 4383
(0, 'gnome-session:') : 1
(0, 'journal:')      : 92
(0, 'junipe')        : 4
(0, 'kdumpctl:')     : 1
(0, 'kernel:')       : 146
(0, 'kibana:')       : 170219
(0, 'level')         : 1519
(0, 'logstash:')     : 138436
(0, 'm2datateksolaris') : 4
(0, 'mcelog:')       : 2
(0, 'polkitd:')      : 2
(0, 'pulseaudio:')   : 1
(0, 'rc.local:')     : 2
(0, 'sshd:')         : 3
(0, 'suricata:')     : 17948
(0, 'sv260')         : 2
(1, 'System')        : 820
(1, 'Windows')       : 13
(1, 'kernel:')       : 22
(1, 'level')         : 736
(1, 'ossec:')        : 29
(1, 'postfix/master:') : 3
(1, 'sshd:')         : 210
(1, 'su:')           : 2
(1, 'sudo:')         : 13812
(1, 'sv260')         : 2
(1, 'type=AVC')     

"E:"는 아마도 ERROR 라는 뜻인듯?

logstach, kibana 관련 오류는 수는 많은데 전부 0임.

sshd가 가장 다양하게 갖고있고, --> 레벨별로 메세지 정리해서 확인해보기

System, Windows 같은거는 1정도를 갖는듯?

되도록 학습으로 처리해달라고 한 이유는, kibana, logstash라도 1 이상의 상활이 혹시 발생할지 모르기 때문이 아닐지?
그러면 최종 결과에 weight를 주는 방법으로?

In [204]:
i = 0

In [218]:
print(train.level[i], remove_date(train.full_log[i]))
i += 1

1 type=SYSCALL msg=audit(1611889244.855:247124): arch=c000003e syscall=2 success=yes exit=3 a0=7f1c14d535a4 a1=80000 a2=1 a3=7f1c14f594f8 items=1 ppid=100038 pid=100039 auid=4294967295 uid=0 gid=980 euid=0 suid=0 fsuid=0 egid=980 sgid=980 fsgid=980 tty=(none) ses=4294967295 comm="ps" exe="/usr/bin/ps" subj=system_u:system_r:unconfined_service_t:s0 key="audit-wazuh-r" type=CWD msg=audit(1611889244.855:247124): cwd="/" type=PATH msg=audit(1611889244.855:247124): item=0 name="/etc/ld.so.cache" inode=35395307 dev=fd:00 mode=0100644 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:ld_so_cache_t:s0 objtype=NORMAL cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0 type=PROCTITLE msg=audit(1611889244.855:247124): proctitle=2F62696E2F7073002D700032353336
