In [1]:
%load_ext lab_black
%cd ..

/mnt/h/hev/log-analytics


In [2]:
import argparse
import math
import multiprocessing
import sys
from datetime import datetime
from pathlib import Path
from pprint import pformat

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_optimizer
import yaml
from easydict import EasyDict
from pytorch_transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AlbertForSequenceClassification,
    AlbertTokenizer,
    DebertaForSequenceClassification,
    DebertaTokenizer,
    SqueezeBertTokenizer,
    SqueezeBertForSequenceClassification,
    XLNetTokenizer,
    XLNetForSequenceClassification,
)

from datasets import load_test_data, load_train_data, MyDataset
from utils import SAM, AverageMeter, CustomLogger, FocalLoss, seed_everything

from main import MyTrainer
from collections import defaultdict
import matplotlib.pyplot as plt
import random

In [5]:
# Nearest-neighbour ("dist") results saved for the distilbert-base-uncased run.
DIST_NPZ_PATH = (
    "results/distilbert-base-uncased/"
    "distilbert-base-uncased-focal-AdamW-lr1e-05_1-dist.npz"
)

# np.load on an .npz returns a lazy NpzFile that keeps the archive open; the
# context manager closes it as soon as the four arrays have been read into memory.
with np.load(DIST_NPZ_PATH) as data:
    sfeats = torch.from_numpy(data["sfeats"])
    dists = torch.from_numpy(data["dists"])
    tlevels = torch.from_numpy(data["tlevels"])
    fclevels = torch.from_numpy(data["fclevels"])

In [15]:
sfeats  # middle-layer feature data --> probably not used here?

tensor([[ 0.2330,  0.1847,  1.0939,  ..., -0.3893, -0.4712, -0.3539],
        [ 0.2157,  0.2010,  1.0847,  ..., -0.3552, -0.4799, -0.3493],
        [ 1.5141, -0.1199,  1.4249,  ..., -0.4438,  0.0475, -0.0883],
        ...,
        [ 1.5236, -0.1345,  1.4231,  ..., -0.4462,  0.0464, -0.0912],
        [ 0.4066,  0.2460,  0.9621,  ..., -0.4044, -0.5476, -0.1343],
        [ 0.2157,  0.2009,  1.0846,  ..., -0.3553, -0.4799, -0.3495]])

In [13]:
dists  # distances from each sfeats row to its 8 nearest entries in the deck (the feature bank built from all train data)

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0308, 0.0312, 0.0324,  ..., 0.0374, 0.0380, 0.0389],
        ...,
        [0.0142, 0.0167, 0.0245,  ..., 0.0288, 0.0303, 0.0305],
        [0.0403, 0.0409, 0.0412,  ..., 0.0486, 0.0487, 0.0488],
        [0.0000, 0.0027, 0.0027,  ..., 0.0027, 0.0027, 0.0027]])

In [21]:
dists.size()  # one row per sample, one column per stored nearest-neighbour distance

torch.Size([1418916, 8])

In [8]:
tlevels  # actual level of the entry with the smallest distance

tensor([0, 0, 1,  ..., 1, 0, 0])

In [9]:
fclevels  # levels obtained through the FullyConnected layer (some differ from tlevels. TODO: needs to be checked)

tensor([[ 4.6420, -3.8289, -8.8472,  ..., -9.0210, -4.3306, -8.9362],
        [ 4.5163, -3.7682, -8.6209,  ..., -8.8145, -4.2965, -8.7409],
        [-2.8406,  4.8419, -5.8681,  ..., -7.0440, -3.7289, -6.9653],
        ...,
        [-2.8393,  4.8370, -5.8676,  ..., -7.0415, -3.7251, -6.9628],
        [ 4.7729, -3.9848, -8.6707,  ..., -8.9454, -4.3192, -8.7292],
        [ 4.5166, -3.7683, -8.6212,  ..., -8.8145, -4.2965, -8.7409]])

In [36]:
# Raw test logs; later cells look rows up with indices taken from the .npz arrays,
# so the row order is assumed to match those arrays -- TODO confirm.
df = pd.read_csv("data/ori/test.csv")

In [37]:
df  # preview of the loaded test logs

Unnamed: 0,id,full_log
0,1000000,"Feb 8 15:47:26 localhost kibana: {""type"":""err..."
1,1000001,"Sep 24 03:46:39 localhost kibana: {""type"":""err..."
2,1000002,type=SYSCALL msg=audit(1611888200.428:210563):...
3,1000003,"Jan 18 11:24:06 localhost kibana: {""type"":""err..."
4,1000004,type=SYSCALL msg=audit(1603081202.050:46851): ...
...,...,...
1418911,2418911,"Jan 13 05:07:11 localhost kibana: {""type"":""err..."
1418912,2418912,"Jan 5 02:24:50 localhost kibana: {""type"":""log..."
1418913,2418913,type=SYSCALL msg=audit(1611884593.462:38222): ...
1418914,2418914,Jan 22 01:38:19 localhost logstash: [2021-01-2...


## 실험

### dist의 변화 폭이 가장 큰 것은?

In [22]:
# Per-sample spread between the last and the first stored neighbour distance
# (rows appear to be sorted ascending, so this is farthest minus nearest).
# The last column is addressed with -1 instead of a hard-coded 7 so the cell
# keeps working if a different number of neighbours is stored.
ddists = dists[:, -1] - dists[:, 0]

In [28]:
torch.argmax(ddists)  # index of the sample whose neighbour distances spread the most

tensor(601290)

In [30]:
# Derive the row from ddists.argmax() instead of pasting its printed value
# (601290), so this cell stays correct if the .npz file is regenerated.
dists[ddists.argmax().item()]

tensor([0.0393, 5.0289, 5.3250, 9.8381, 9.8963, 9.8963, 9.8963, 9.8968])

In [31]:
# Nearest-neighbour level of the max-spread sample; the index is computed from
# ddists.argmax() rather than hard-coded (was 601290).
tlevels[ddists.argmax().item()]

tensor(3)

In [32]:
# FullyConnected-layer output for the max-spread sample; the index is computed
# from ddists.argmax() rather than hard-coded (was 601290).
fclevels[ddists.argmax().item()]

tensor([ 0.2962, -0.0976, -8.0913, -1.0417, -8.2778, -2.7040, -8.2528])

In [39]:
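# It would have been nice to be able to tell which text this was...
# Also, what if we could see which entry of the deck it points to? --> the index should have been saved, and the train/valid datasets built without shuffling the order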

In [38]:
# Raw log text of the max-spread sample. The index is computed from
# ddists.argmax() rather than hard-coded (was 601290), and .iloc makes the lookup
# positional, matching how rows are addressed in the tensors
# (assumes df rows are in the same order as the .npz arrays -- TODO confirm).
df.full_log.iloc[ddists.argmax().item()]

'Oct 21 10:20:00 localhost suricata[1444]: [1:2010494:4] ET SCAN Multiple MySQL Login Failures Possible Brute Force Attempt [Classification: Attempted Information Leak] [Priority: 2] {TCP} 192.168.0.42:3306 -> 192.168.0.160:35798'