In [1]:
%load_ext lab_black
%cd ..

/home/shim/cev/dl/log-analytics


In [2]:
import argparse
import math
import multiprocessing
import sys
from datetime import datetime
from pathlib import Path
from pprint import pformat

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_optimizer
import yaml
from easydict import EasyDict
from pytorch_transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AlbertForSequenceClassification,
    AlbertTokenizer,
    DebertaForSequenceClassification,
    DebertaTokenizer,
    SqueezeBertTokenizer,
    SqueezeBertForSequenceClassification,
    XLNetTokenizer,
    XLNetForSequenceClassification,
)

from datasets import load_test_data, load_train_data, load_train_total_data
from utils import SAM, AverageMeter, CustomLogger, FocalLoss, seed_everything

from main import MyTrainer
from collections import defaultdict
import matplotlib.pyplot as plt
import random
from sklearn.metrics import f1_score, classification_report

In [3]:
df = pd.read_csv("data/ori/train.csv")

In [7]:
for text in df.full_log[df.level == 6]:
    print(text)

Jan 30 08:23:17 localhost sshd[18415]: Bad protocol version identification '\003' from 78.128.113.18 port 1073
Feb  8 16:16:47 localhost kernel: device virbr0-nic entered promiscuous mode
Mar  8 15:29:30 localhost kernel: device enp2s0 entered promiscuous mode
Jan 29 11:28:59 localhost useradd[88679]: new group: name=test, GID=1001
Jan 22 15:18:49 localhost sshd[19015]: error: maximum authentication attempts exceeded for root from 192.168.0.197 port 61153 ssh2 [preauth]
Feb  2 17:22:58 localhost kernel: device virbr0-nic entered promiscuous mode
Feb  4 09:59:33 localhost kernel: device virbr0-nic entered promiscuous mode
Feb  4 09:59:43 localhost kernel: device enp2s0 entered promiscuous mode


In [8]:
for text in df.full_log[df.level == 4]:
    print(text)

Nov 29 05:44:21 localhost sshd[6008]: Did not receive identification string from 211.253.243.66 port 57487
Nov 29 22:16:32 sv260 sshd[39585]: Did not receive identification string from 211.253.243.66
Nov 3 10:38:05 localhost sshd[76373]: Did not receive identification string from 192.168.0.195 port 3819
Nov 21 01:02:56 sv260 sshd[9196]: Did not receive identification string from 211.253.243.66
Nov 27 08:32:12 sv260 sshd[3110]: Did not receive identification string from 192.168.0.195
Nov 19 01:03:09 sv260 sshd[13841]: Did not receive identification string from 192.168.0.195
Nov 12 04:27:24 localhost sshd[98418]: Did not receive identification string from 61.41.101.142 port 8975
Jan 27 20:20:54 localhost sshd[5025]: Did not receive identification string from 35.184.222.44 port 46232
Nov 3 06:03:17 localhost sshd[4346]: Did not receive identification string from 192.168.0.181 port 82066
Nov 8 20:25:42 sv260 sshd[37597]: Did not receive identification string from 192.168.0.181


In [9]:
for text in df.full_log[df.level == 2]:
    print(text)

The average number of logs between 10:00 and 11:00 is 5399. We reached 37090.
The average number of logs between 5:00 and 6:00 is 222. We reached 77946.
The average number of logs between 6:00 and 7:00 is 6420. We reached 28494.
The average number of logs between 21:00 and 22:00 is 5325. We reached 95.
The average number of logs between 5:00 and 6:00 is 8362. We reached 93476.
The average number of logs between 4:00 and 5:00 is 2806. We reached 11476.
The average number of logs between 14:00 and 15:00 is 9734. We reached 26958.
The average number of logs between 15:00 and 16:00 is 4411. We reached 9487.
Sep 18 11:02:59 localhost sudo:  apache : TTY=unknown ; PWD=/var/www/html/management ; USER=root ; COMMAND=/bin/curl -XGET localhost:9200/_cat/snapshots/esild_backup
The average number of logs between 2:00 and 3:00 is 9428. We reached 75572.
The average number of logs between 10:00 and 11:00 is 12612. We reached 31532.
The average number of logs between 13:00 and 14:00 is 2697. We reach

In [18]:
tokendic = defaultdict(list)
for text in df.full_log[df.level == 2]:
    for i, token in enumerate(text.split()):
        tokendic[i].append(token)

In [23]:
df[df.level == 6]

Unnamed: 0,id,level,full_log
87783,87783,6,Jan 30 08:23:17 localhost sshd[18415]: Bad pro...
170386,170386,6,Feb 8 16:16:47 localhost kernel: device virbr...
213304,213304,6,Mar 8 15:29:30 localhost kernel: device enp2s...
221847,221847,6,Jan 29 11:28:59 localhost useradd[88679]: new ...
223644,223644,6,Jan 22 15:18:49 localhost sshd[19015]: error: ...
361823,361823,6,Feb 2 17:22:58 localhost kernel: device virbr...
406796,406796,6,Feb 4 09:59:33 localhost kernel: device virbr...
429489,429489,6,Feb 4 09:59:43 localhost kernel: device enp2s...


In [63]:
dic = df[df.level == 6].to_dict()
dic["id"] = list(range(1000000, 1000000 + len(dic["id"])))
dic["level"] = dic["level"].values()
dic["full_log"] = dic["full_log"].values()
# dic = pd.DataFrame(dic)

In [64]:
dic

{'id': [1000000,
  1000001,
  1000002,
  1000003,
  1000004,
  1000005,
  1000006,
  1000007],
 'level': dict_values([6, 6, 6, 6, 6, 6, 6, 6]),
 'full_log': dict_values(["Jan 30 08:23:17 localhost sshd[18415]: Bad protocol version identification '\\003' from 78.128.113.18 port 1073", 'Feb  8 16:16:47 localhost kernel: device virbr0-nic entered promiscuous mode', 'Mar  8 15:29:30 localhost kernel: device enp2s0 entered promiscuous mode', 'Jan 29 11:28:59 localhost useradd[88679]: new group: name=test, GID=1001', 'Jan 22 15:18:49 localhost sshd[19015]: error: maximum authentication attempts exceeded for root from 192.168.0.197 port 61153 ssh2 [preauth]', 'Feb  2 17:22:58 localhost kernel: device virbr0-nic entered promiscuous mode', 'Feb  4 09:59:33 localhost kernel: device virbr0-nic entered promiscuous mode', 'Feb  4 09:59:43 localhost kernel: device enp2s0 entered promiscuous mode'])}

In [65]:
pd.DataFrame(dic)

Unnamed: 0,id,level,full_log
0,1000000,6,Jan 30 08:23:17 localhost sshd[18415]: Bad pro...
1,1000001,6,Feb 8 16:16:47 localhost kernel: device virbr...
2,1000002,6,Mar 8 15:29:30 localhost kernel: device enp2s...
3,1000003,6,Jan 29 11:28:59 localhost useradd[88679]: new ...
4,1000004,6,Jan 22 15:18:49 localhost sshd[19015]: error: ...
5,1000005,6,Feb 2 17:22:58 localhost kernel: device virbr...
6,1000006,6,Feb 4 09:59:33 localhost kernel: device virbr...
7,1000007,6,Feb 4 09:59:43 localhost kernel: device enp2s...


In [67]:
dic["full_log"].values()

AttributeError: 'dict_values' object has no attribute 'values'

In [78]:
list(dic["full_log"]) * 4

["Jan 30 08:23:17 localhost sshd[18415]: Bad protocol version identification '\\003' from 78.128.113.18 port 1073",
 'Feb  8 16:16:47 localhost kernel: device virbr0-nic entered promiscuous mode',
 'Mar  8 15:29:30 localhost kernel: device enp2s0 entered promiscuous mode',
 'Jan 29 11:28:59 localhost useradd[88679]: new group: name=test, GID=1001',
 'Jan 22 15:18:49 localhost sshd[19015]: error: maximum authentication attempts exceeded for root from 192.168.0.197 port 61153 ssh2 [preauth]',
 'Feb  2 17:22:58 localhost kernel: device virbr0-nic entered promiscuous mode',
 'Feb  4 09:59:33 localhost kernel: device virbr0-nic entered promiscuous mode',
 'Feb  4 09:59:43 localhost kernel: device enp2s0 entered promiscuous mode',
 "Jan 30 08:23:17 localhost sshd[18415]: Bad protocol version identification '\\003' from 78.128.113.18 port 1073",
 'Feb  8 16:16:47 localhost kernel: device virbr0-nic entered promiscuous mode',
 'Mar  8 15:29:30 localhost kernel: device enp2s0 entered promiscuou

In [4]:
len(df)

472972

In [4]:
from datasets.loader import oversample

In [7]:
df = oversample(df, 4)

In [6]:
len(df)

472972

In [7]:
df = oversample(df, 4)

In [8]:
len(df)

473092

In [10]:
scale = 50

In [11]:
dic = df[df.level == 2].to_dict()
dic["id"] = list(range(1000000, 1000000 + len(dic["id"]) * scale))
dic["level"] = list(dic["level"]) * scale
dic["full_log"] = list(dic["full_log"]) * scale
df.append(pd.DataFrame(dic))

Unnamed: 0,id,level,full_log
0,0,0,"Sep 24 10:02:22 localhost kibana: {""type"":""err..."
1,1,0,Feb 8 16:21:00 localhost logstash: [2021-02-0...
2,2,0,"Jan 13 01:50:40 localhost kibana: {""type"":""err..."
3,3,0,"Jan 4 10:18:31 localhost kibana: {""type"":""err..."
4,4,1,type=SYSCALL msg=audit(1603094402.016:52981): ...
...,...,...,...
595,1000595,345903,345903
596,1000596,365348,365348
597,1000597,376681,376681
598,1000598,380583,380583


In [12]:
df

Unnamed: 0,id,level,full_log
0,0,0,"Sep 24 10:02:22 localhost kibana: {""type"":""err..."
1,1,0,Feb 8 16:21:00 localhost logstash: [2021-02-0...
2,2,0,"Jan 13 01:50:40 localhost kibana: {""type"":""err..."
3,3,0,"Jan 4 10:18:31 localhost kibana: {""type"":""err..."
4,4,1,type=SYSCALL msg=audit(1603094402.016:52981): ...
...,...,...,...
472967,472967,0,Feb 28 10:10:06 localhost logstash: 7738 error:
472968,472968,1,type=SYSCALL msg=audit(1611890993.458:321827):...
472969,472969,0,"Oct 12 02:20:29 localhost kibana: {""type"":""log..."
472970,472970,0,"Jan 15 09:02:43 localhost kibana: {""type"":""err..."
