In [1]:
%load_ext lab_black
%cd ..

/mnt/h/hev/log-analytics


In [2]:
import argparse
import math
import multiprocessing
import sys
from datetime import datetime
from pathlib import Path
from pprint import pformat

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_optimizer
import yaml
from easydict import EasyDict
from pytorch_transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AlbertForSequenceClassification,
    AlbertTokenizer,
    DebertaForSequenceClassification,
    DebertaTokenizer,
    SqueezeBertTokenizer,
    SqueezeBertForSequenceClassification,
    XLNetTokenizer,
    XLNetForSequenceClassification,
)

from datasets import load_test_data, load_train_data, MyDataset, load_train_total_data
from utils import SAM, AverageMeter, CustomLogger, FocalLoss, seed_everything

from main import MyTrainer
from collections import defaultdict
import matplotlib.pyplot as plt
import random

In [4]:
# Read the experiment configuration; the file handle is only needed for parsing.
with open("config/distilbert-base-uncased.yaml", "r") as f:
    C = EasyDict(yaml.load(f, yaml.FullLoader))

# Turn the path-like entries into Path objects, then call the project's seeding helper.
C.result_dir = Path(C.result_dir)
C.dataset.dir = Path(C.dataset.dir)
seed_everything(C.seed, deterministic=False)

In [5]:
# Display the loaded configuration for reference.
C

{'model': {'name': 'distilbert-base-uncased'},
 'comment': None,
 'result_dir': PosixPath('results/distilbert-base-uncased'),
 'debug': False,
 'seed': 20210425,
 'train': {'SAM': False,
  'folds': [1],
  'checkpoints': [None],
  'loss': {'name': 'focal', 'gamma': 2},
  'optimizer': {'name': 'AdamW'},
  'finetune': {'do': True, 'step1_epochs': 2, 'step2_epochs': 4},
  'max_epochs': 10,
  'lr': 1e-05,
  'scheduler': {'name': 'ReduceLROnPlateau',
   'params': {'factor': 0.5, 'patience': 3, 'verbose': True}}},
 'dataset': {'dir': PosixPath('data/ori'), 'batch_size': 30, 'num_workers': 8}}

In [6]:
# Build the project trainer from the config. The third argument is the checkpoint to load
# (the cell output reports "Load pretrained <path>"). The literal 1 is presumably the fold
# index (the config lists folds: [1]) — TODO confirm against main.MyTrainer.
trainer = MyTrainer(C, 1, "results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_1.pth")

Load pretrained results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_1.pth


In [7]:
# Inference only: take the trainer's model, switch it to eval mode and turn autograd off
# globally for the rest of the session.
model = trainer.model
model.eval()
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7fa02d56de10>

In [8]:
# Module-level buffer that the forward hook appends to, one tensor per forward pass.
activation = []


def hook(model, input, output):
    """Forward hook that stores a detached CPU copy of the hooked layer's output.

    Args:
        model: the module the hook is attached to (unused).
        input: the inputs passed to that module (unused).
        output: the module's output tensor.
    """
    captured = output.detach().cpu()
    activation.append(captured)

In [9]:
# Capture the output of `pre_classifier` on every forward pass via `hook`.
# The returned handle is kept so the hook can be detached later. If this cell is run again
# without calling `hook_handle.remove()` first, a second copy of the hook is attached and
# every batch is recorded twice in `activation`.
hook_handle = model.pre_classifier.register_forward_hook(hook)
hook_handle

<torch.utils.hooks.RemovableHandle at 0x7fa02d869150>

In [10]:
# Loader built by the project helper from the dataset directory and the trainer's tokenizer.
# The trailing 100 and 6 are presumably batch size and worker count — TODO confirm against
# datasets.load_train_total_data.
dl = load_train_total_data(C.dataset.dir, trainer.tokenizer, 100, 6)

## tdeck 만들기 (모든 train 데이터에 대한 집합)

In [12]:
# Build the train-set deck: model outputs, true levels, predicted levels and original text.
# Rebinding `activation` to a fresh list means the forward hook (which looks the name up at
# call time) collects only the `pre_classifier` outputs produced by this loop.
activation = []
deck = {
    "fcfeat": [],
    "tlevel": [],
    "fclevel": [],
    "otext": [],
}
with tqdm(total=len(dl.dataset), ncols=100, file=sys.stdout) as t:
    # The batch ids are bound to `ids`, not `id`, so the builtin `id()` is not shadowed
    # for the rest of the kernel session.
    for ids, text, tlevel, otext in dl:
        # First element of the model output, moved to the CPU so GPU memory stays flat.
        pred = model(text.cuda(non_blocking=True))[0].cpu()
        deck["fcfeat"].append(pred)
        deck["tlevel"].append(tlevel)
        deck["fclevel"].append(pred.argmax(dim=1))
        deck["otext"].extend(otext)

        t.update(len(ids))

100%|██████████████████████████████████████████████████████| 472972/472972 [23:49<00:00, 330.85it/s]


In [13]:
# Merge the per-batch model outputs collected by the loop into one tensor.
# The list is replaced in place, so run this cell once per freshly built deck.
deck["fcfeat"] = torch.cat(deck["fcfeat"])

In [14]:
# Merge the per-batch true levels into one tensor (replaces the list in place).
deck["tlevel"] = torch.cat(deck["tlevel"])

In [15]:
# Merge the per-batch argmax predictions into one tensor (replaces the list in place).
deck["fclevel"] = torch.cat(deck["fclevel"])

In [16]:
# `activation` was filled by the forward hook while the loop ran; concatenate the captured
# `pre_classifier` outputs into a single feature tensor.
deck["feat"] = torch.cat(activation)

In [17]:
# Sanity check: list the fields now present in the deck.
deck.keys()

dict_keys(['fcfeat', 'tlevel', 'fclevel', 'otext', 'feat'])

In [18]:
# Persist the tensor fields of the train-set deck as a compressed .npz archive
# (the "otext" strings are not written).
DECK1_FIELDS = ("fcfeat", "tlevel", "fclevel", "feat")
np.savez_compressed(
    "results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_1-deck1.npz",
    **{field: deck[field].numpy() for field in DECK1_FIELDS},
)

## sdeck 만들기 (모든 test 데이터에 대한 집합)

reload (restart the kernel and re-run the setup cells above before running the cells below)

In [11]:
# Collect the test-set features (sdeck).
# A fresh `activation` list makes the forward hook record only this loop's batches.
activation = []
deck = {"fcfeat": [], "fclevel": [], "otext": []}
test_loader = trainer.dl_test
with tqdm(total=len(test_loader.dataset), ncols=100, file=sys.stdout) as t:
    for _, text, otext in test_loader:
        out = model(text.cuda(non_blocking=True))
        pred = out[0].cpu()
        deck["fcfeat"].append(pred)
        deck["fclevel"].append(pred.argmax(dim=1))
        deck["otext"].extend(otext)
        t.update(len(text))

100%|██████████████████████████████████████████████████| 1418916/1418916 [1:11:35<00:00, 330.29it/s]


In [12]:
# Merge the per-batch lists into single tensors; "feat" comes from the hook's `activation`
# buffer. The lists are replaced in place, so run this cell once per freshly built deck.
deck["fcfeat"] = torch.cat(deck["fcfeat"])
deck["fclevel"] = torch.cat(deck["fclevel"])
deck["feat"] = torch.cat(activation)

In [13]:
# Sanity check: list the fields now present in the deck.
deck.keys()

dict_keys(['fcfeat', 'fclevel', 'otext', 'feat'])

In [14]:
# Persist the tensor fields of the test-set deck as a compressed .npz archive
# (the "otext" strings are not written).
DECK2_FIELDS = ("fcfeat", "fclevel", "feat")
np.savez_compressed(
    "results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_1-deck2.npz",
    **{field: deck[field].numpy() for field in DECK2_FIELDS},
)

## dist값, KNN level, FC level 을 저장

여기서 reload (커널을 재시작한 뒤, 위에서 저장한 deck1/deck2 npz 파일을 불러와서 진행)

In [4]:
# Re-open the archives written earlier: deck1 = train-set deck, deck2 = test-set deck.
deck1 = np.load("results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_1-deck1.npz")
deck2 = np.load("results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_1-deck2.npz")

In [6]:
# Train-side lookup table: features go to the GPU for the distance search, levels stay on the CPU.
tdeck = {"feat": torch.from_numpy(deck1["feat"]).cuda(), "tlevel": torch.from_numpy(deck1["tlevel"])}

In [7]:
# Test-side queries: features go to the GPU for the distance search; the model outputs and
# predicted levels stay on the CPU.
sdeck = {
    "feat": torch.from_numpy(deck2["feat"]).cuda(),
    "fcfeat": torch.from_numpy(deck2["fcfeat"]),
    "fclevel": torch.from_numpy(deck2["fclevel"]),
}

In [9]:
# Sanity check: (rows, feature width) of the train and test feature matrices.
tdeck["feat"].shape, sdeck["feat"].shape

(torch.Size([472972, 768]), torch.Size([1418916, 768]))

In [31]:
def get_dist(deck, feat, topk):
    """Find the `topk` rows of `deck` that are closest to `feat` in Euclidean distance.

    Args:
        deck: 2-D tensor whose rows are the reference feature vectors.
        feat: 1-D feature vector, broadcast against every row of `deck`.
        topk: number of nearest rows to return.

    Returns:
        (values, indices): the `topk` smallest distances in ascending order and the
        matching row indices into `deck`.
    """
    offsets = deck - feat.unsqueeze(0)
    distances = torch.norm(offsets, p=2, dim=1)
    nearest = torch.topk(distances, topk, largest=False)  # knn
    return nearest.values, nearest.indices

In [43]:
# Experiment: the 8 nearest training rows for test row 2, with their distances and levels.
values, indices = get_dist(tdeck["feat"], sdeck["feat"][2], 8)
print(values)
print(indices)
print(tdeck["tlevel"][indices])

tensor([0.0308, 0.0312, 0.0324, 0.0325, 0.0374, 0.0374, 0.0380, 0.0389],
       device='cuda:0')
tensor([465707, 165781, 324718, 398767,  98264, 468848,  79222, 446725],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1])


In [46]:
# Compute the 8 nearest training neighbours of every test row.
dists, indices, fcfeats, tlevels = [], [], [], []
n_queries = len(sdeck["feat"])
with tqdm(total=n_queries, ncols=100, file=sys.stdout) as t:
    for i in range(n_queries):
        nn_dist, nn_index = get_dist(tdeck["feat"], sdeck["feat"][i], 8)
        # Move the results to the CPU; the neighbour indices then look up the CPU-side levels.
        nn_index = nn_index.cpu()
        dists.append(nn_dist.cpu())
        indices.append(nn_index)
        fcfeats.append(sdeck["fcfeat"][i])
        tlevels.append(tdeck["tlevel"][nn_index])

        t.update()

100%|██████████████████████████████████████████████████| 1418916/1418916 [3:18:55<00:00, 118.88it/s]


In [47]:
# Peek at the results collected for the first test row.
dists[0], indices[0], fcfeats[0], tlevels[0]

(tensor([0., 0., 0., 0., 0., 0., 0., 0.]),
 tensor([3201, 2550, 1541, 2047,  406,  215,  521,  748]),
 tensor([ 4.6420, -3.8289, -8.8472, -3.1232, -9.0210, -4.3306, -8.9362]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]))

In [48]:
# Per-row shapes of the four collected results.
dists[0].shape, indices[0].shape, fcfeats[0].shape, tlevels[0].shape

(torch.Size([8]), torch.Size([8]), torch.Size([7]), torch.Size([8]))

In [49]:
# Stack the per-row results into one tensor each (one row per test sample).
dists_ = torch.stack(dists)
indices_ = torch.stack(indices)
fcfeats_ = torch.stack(fcfeats)
tlevels_ = torch.stack(tlevels)

In [50]:
# Persist the neighbour-search results as a compressed .npz archive.
deck3_tensors = {
    "dists": dists_,
    "indices": indices_,
    "fcfeats": fcfeats_,
    "tlevels": tlevels_,
}
np.savez_compressed(
    "results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_1-deck3.npz",
    **{key: tensor.numpy() for key, tensor in deck3_tensors.items()},
)

## deck결과 연구

In [38]:
# Reload the raw data for manual inspection: `df` is the test CSV, `tdf` the train CSV.
# The tokenizer is created directly here rather than taken from a trainer.
dl = load_train_total_data(C.dataset.dir, DistilBertTokenizer.from_pretrained(C.model.name), 100, 6)
ds = dl.dataset
df = pd.read_csv("data/ori/test.csv")
tdf = pd.read_csv("data/ori/train.csv")

In [9]:
# Re-open the neighbour-search archive written earlier.
deck3 = np.load("results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_1-deck3.npz")

In [12]:
# Convert every stored array to a torch tensor. This rebinds `deck3` from the loaded archive
# to a plain dict, so run it once per load.
deck3 = {k: torch.from_numpy(v) for k, v in deck3.items()}

In [None]:
# Predicted level per test row = argmax over the stored model outputs.
# Later cells read deck3["fclevels"], so this cell must be run before them.
deck3["fclevels"] = deck3["fcfeats"].argmax(1)

In [227]:
# Display the whole deck3 dict (large output).
deck3

{'dists': tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0308, 0.0312, 0.0324,  ..., 0.0374, 0.0380, 0.0389],
         ...,
         [0.0142, 0.0167, 0.0245,  ..., 0.0288, 0.0303, 0.0305],
         [0.0403, 0.0409, 0.0412,  ..., 0.0486, 0.0487, 0.0488],
         [0.0027, 0.0027, 0.0027,  ..., 0.0027, 0.0027, 0.0027]]),
 'indices': tensor([[  3201,   2550,   1541,  ...,    215,    521,    748],
         [    17,     13,      8,  ...,      0,      3,      6],
         [465707, 165781, 324718,  ..., 468848,  79222, 446725],
         ...,
         [224941, 102391, 402115,  ..., 396347, 270139, 114644],
         [121620, 393346, 291825,  ..., 183438, 211321, 181783],
         [    17,     13,      8,  ...,      0,      3,      6]]),
 'fcfeats': tensor([[ 4.6420, -3.8289, -8.8472,  ..., -9.0210, -4.3306, -8.9362],
         [ 4.5163, -3.7682, -8.6209,  ..., -8.8145, -4.2965, -8.7409],
         [-2.84

In [149]:
# Despite the name, `knn` holds the 2000 test rows whose column-0 distance is LARGEST
# (topk defaults to largest=True), i.e. the rows that are least like anything in training.
knn = deck3["dists"][:, 0].topk(2000)
print(" ".join(map(str, knn.indices.tolist())))

885168 102363 877420 1085297 1321286 776033 631395 933984 1274395 711251 93934 536615 1090762 968749 329118 384420 833900 1285687 100875 409965 1335932 349066 586315 892213 868277 138723 340297 7890 689722 660701 649234 868865 1293703 912728 1361861 54247 930198 867344 1209119 613148 563330 1401434 1248775 1380816 1336665 673688 843165 1203974 1136864 639179 342753 555778 834516 240023 159643 175138 1025057 492659 1116409 774349 1310723 918705 615806 470928 351430 488060 169905 61121 1352554 1348825 903635 823869 89806 1060154 724797 905187 245082 994160 143352 27854 203669 1367641 58797 385750 1003823 1350456 692950 16932 1344909 1249219 589640 964905 38592 1053197 1393580 1290929 117120 97456 1282788 1093714 79586 1001476 282414 984313 391260 713456 612374 656900 871076 1314694 1036955 672860 698144 1224682 498019 1320714 280846 148979 50899 147281 133911 860037 557779 222456 103617 555595 1378228 523049 113459 725843 201568 538927 262584 324320 1268785 618541 1267055 1038198 507487 

In [226]:
# Check whether test row 1075055 is among the 2000 rows selected above.
1075055 in knn.indices

False

In [103]:
# Policy notes
# Level 3 logs often contain user names / IP addresses etc., so they end up with a small but
# non-zero dist, around 0.5~0.6 (possibly smaller?).
# Check whether every level in tlevels is the same; if they are all 3 or all 5, use 3 even when
# dist is large. (The politic_draw implementations below return tlevels[0], i.e. 3 or 5.)
# If dist[0] > 0.3 and tlevels flips between 1 and 0 -> level 7? Not sure about this one.
# If dist is greater than 3 -> level 7 regardless of anything else.
# If dist is greater than 1 and tlevels is mixed across the 8 neighbours -> level 7.
# If dist[7] - dist[0] > 1 -> level 7 ??
# If dist[7] - dist[0] <= 0.4 and all tlevels are the same -> use tlevel[0].
# If dist[7] - dist[0] <= 0.4 and tlevels flips between 0 and 1 -> use fclevel.

# Preprocessing to add
# Are dates definitely being removed?
# 1. IP address: "\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"  (dots escaped; a bare "." matches any character)
# 2. port: "port \d{1,5}"
# 3. level: "level\s*:\s*\d,?"
# 4. log: "log\s*:"
# 5. uid, gid: "[ug]id=\d+,?"
# 6. user: "user=root"
# 7. logname=: "logname="


In [231]:
# Manual inspection of one test row: its neighbour distances, neighbour indices, predicted
# level, neighbour levels and the raw test log line.
j = 208421
print(deck3["dists"][j])
print(deck3["indices"][j])
print(deck3["fclevels"][j])
print(deck3["tlevels"][j])
print(df.full_log[j])

print()
# Then the level and raw log line of each of its 8 training neighbours.
for k in range(8):
    kk = deck3["indices"][j][k].item()
    print(tdf.level[kk], tdf.full_log[kk])
    print()

tensor([0.1488, 0.2238, 1.5763, 1.7137, 1.9992, 1.9994, 2.0052, 2.0060])
tensor([398128, 342699, 207799,  66367, 370437,    900, 216371, 440727])
tensor(0)
tensor([0, 0, 0, 0, 0, 0, 0, 0])
Jan 20 11:06:15 localhost kernel: traps: python3.8[21760] trap invalid opcode ip:7efc19ee3e51 sp:7fffa52d9630 error:0 in _pywrap_tensorflow_internal.so[7efc0d698000+23a66000]

0 Jan 20 11:03:55 localhost kernel: traps: python3.8[21047] trap invalid opcode ip:7fab61e2ce51 sp:7ffdf46f5180 error:0 in _pywrap_tensorflow_internal.so[7fab555e1000+23a66000]

0 Jan 20 11:13:31 localhost kernel: traps: python3.8[3367] trap invalid opcode ip:7f654f0dae51 sp:7ffd06665c70 error:0 in _pywrap_tensorflow_internal.so[7f654288f000+23a66000]

0 Jan 20 13:04:43 localhost kernel: traps: python[30618] trap invalid opcode ip:7f5d8db18050 sp:7ffe69ca7b98 error:0 in _pywrap_tensorflow_internal.so[7f5d82a83000+47368000]

0 Jan 20 13:06:06 localhost kernel: traps: python[30800] trap invalid opcode ip:7fee924d8050 sp:7ffe7fc1f

In [205]:
# Among the rows selected in `knn`, print those whose first (nearest) training neighbour has level 2.
for i in knn.indices:
    if deck3["tlevels"][i][0] in [2]:
        print(i)

## submission 파일 만들기

In [235]:
# ver1
def politic_draw(dists, indices, fclevel, tlevels):
    """Pick the submitted level for one test sample from its nearest training neighbours (rule set ver1).

    Later cells in this notebook redefine `politic_draw`; whichever cell ran last is the one in effect.

    Args:
        dists: 1-D tensor of neighbour distances; dists[0] is used as the nearest distance
            and dists[-1] - dists[0] as the spread.
        indices: neighbour indices; accepted for call-site compatibility but not used by the rules.
        fclevel: 0-dim tensor holding the classifier prediction.
        tlevels: 1-D tensor with the training level of each neighbour.

    Returns:
        int: the chosen level.
    """
    dist = dists[0]
    dd = dists[-1] - dist
    first = tlevels[0].item()
    # True when every neighbour has the same level. This used to be `.sum() == 8`, which only
    # worked for exactly eight neighbours; `.all()` gives the same answer for 8 and holds for any k.
    same = bool((tlevels == tlevels[0]).all())

    # policy1: regardless of dist, if every neighbour is level 3 or every neighbour is level 5, return it
    if same and first in (3, 5):
        return first

    # policy2: regardless of dist, if the first four neighbours are all 2, all 4 or all 6, return it
    if first in (2, 4, 6) and (tlevels[1:4] == tlevels[0]).sum() == 3:
        return first

    # policy3: nearest distance above 3 -> level 7
    if dist > 3:
        return 7

    # policy4: nearest distance above 1 and the neighbours disagree -> level 7 (tentative)
    if dist > 1 and not same:
        return 7

    # policy5: distance spread above 0.5 and the neighbours disagree -> level 7 (tentative)
    if dd > 0.5 and not same:
        return 7

    # policy6: nearest distance above 0.5 -> trust the classifier prediction
    if dist > 0.5:
        return fclevel.item()

    # otherwise: level of the nearest neighbour
    return first

In [236]:
# ver2
def politic_draw(dists, indices, fclevel, tlevels):
    """Pick the submitted level for one test sample from its nearest training neighbours (rule set ver2).

    This redefines the `politic_draw` from the earlier cell; a later cell redefines it again.

    Args:
        dists: 1-D tensor of neighbour distances; only dists[0] (the nearest) is used.
        indices: neighbour indices; accepted for call-site compatibility but not used by the rules.
        fclevel: 0-dim tensor holding the classifier prediction.
        tlevels: 1-D tensor with the training level of each neighbour.

    Returns:
        int: the chosen level.
    """
    dist = dists[0]
    first = tlevels[0].item()
    # True when every neighbour has the same level. This used to be `.sum() == 8`, which only
    # worked for exactly eight neighbours; `.all()` gives the same answer for 8 and holds for any k.
    same = bool((tlevels == tlevels[0]).all())

    # policy1: regardless of dist, if every neighbour is level 3 or every neighbour is level 5, return it
    if same and first in (3, 5):
        return first

    # policy2: regardless of dist, if the first four neighbours are all 2, all 4 or all 6, return it
    if first in (2, 4, 6) and (tlevels[1:4] == tlevels[0]).sum() == 3:
        return first

    # policy3: nearest distance above 0.5 -> trust the classifier prediction
    # (the old comment said "level 7", but this version returns fclevel)
    if dist > 0.5:
        return fclevel.item()

    # otherwise: level of the nearest neighbour
    return first

In [244]:
# ver3
def politic_draw(dists, indices, fclevel, tlevels):
    """Pick the submitted level for one test sample from its nearest training neighbours (rule set ver3).

    This is the last definition of `politic_draw` in the notebook, so it is the one in effect
    when the cells are run top to bottom.

    Args:
        dists: 1-D tensor of neighbour distances; only dists[0] (the nearest) is used.
        indices: neighbour indices; accepted for call-site compatibility but not used by the rules.
        fclevel: classifier prediction; accepted for call-site compatibility but not used by this version.
        tlevels: 1-D tensor with the training level of each neighbour.

    Returns:
        int: the chosen level.
    """
    dist = dists[0]
    first = tlevels[0].item()
    # True when every neighbour has the same level. This used to be `.sum() == 8`, which only
    # worked for exactly eight neighbours; `.all()` gives the same answer for 8 and holds for any k.
    same = bool((tlevels == tlevels[0]).all())

    # policy1: regardless of dist, if every neighbour is level 3 or every neighbour is level 5, return it
    if same and first in (3, 5):
        return first

    # policy2: regardless of dist, if the first four neighbours are all 2, all 4 or all 6, return it
    if first in (2, 4, 6) and (tlevels[1:4] == tlevels[0]).sum() == 3:
        return first

    # policy3: nearest distance above 0.5 -> level 7
    if dist > 0.5:
        return 7

    # otherwise: level of the nearest neighbour
    return first

In [245]:
# Spot-check the currently defined politic_draw on test row 100.
i = 100
politic_draw(deck3["dists"][i], deck3["indices"][i], deck3["fclevels"][i], deck3["tlevels"][i])

0

In [None]:
# Apply the currently defined politic_draw to every test row and collect the levels.
N = len(deck3["dists"])
# Submission ids are the fixed inclusive range 1000000..2418915.
outdic = {"id": list(range(1000000, 2418915 + 1)), "level": []}
levels = outdic["level"]
with tqdm(total=N, ncols=100, file=sys.stdout) as t:
    for i in range(N):
        sample = (deck3["dists"][i], deck3["indices"][i], deck3["fclevels"][i], deck3["tlevels"][i])
        levels.append(politic_draw(*sample))
        t.update()

 72%|████████████████████████████████████▋              | 1021915/1418916 [02:05<00:48, 8248.16it/s]

In [None]:
# Turn the id/level columns into a DataFrame (both lists must have the same length).
outdf = pd.DataFrame(outdic)

In [None]:
# Write the submission CSV without the index column; the "ver3" suffix names the rule set used.
outdf.to_csv("results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_1-ver3.csv", index=False)