In [1]:
# Load the lab_black extension for this kernel session.
%load_ext lab_black
# Step up one directory: the relative "config/", "data/" and "results/" paths used
# below are resolved from the directory printed in this cell's output.
%cd ..

/mnt/h/hev/log-analytics


In [2]:
import argparse
import math
import multiprocessing
import sys
from datetime import datetime
from pathlib import Path
from pprint import pformat

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_optimizer
import yaml
from easydict import EasyDict
from pytorch_transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AlbertForSequenceClassification,
    AlbertTokenizer,
    DebertaForSequenceClassification,
    DebertaTokenizer,
    SqueezeBertTokenizer,
    SqueezeBertForSequenceClassification,
    XLNetTokenizer,
    XLNetForSequenceClassification,
)

from datasets import load_test_data, load_train_data, MyDataset, load_train_total_data
from utils import SAM, AverageMeter, CustomLogger, FocalLoss, seed_everything

from main import MyTrainer
from collections import defaultdict
import matplotlib.pyplot as plt
import random

In [3]:
# Name stem shared by the checkpoint (`{postfix}.pth`) and every `{postfix}-deck*.npz`
# / submission file written or read below.
postfix = "distilbert-base-uncased-focal-AdamW-lr1e-05-dsver2_2"

In [4]:
# Read the experiment config; the file handle is only needed for parsing.
with open("config/distilbert-base-uncased-ver2.yaml", "r") as config_file:
    # NOTE(review): FullLoader is acceptable for a trusted local config; prefer
    # yaml.safe_load if this path could ever point at untrusted input.
    C = EasyDict(yaml.load(config_file, yaml.FullLoader))

# Turn the two directory entries into Path objects for later path handling.
C.result_dir = Path(C.result_dir)
C.dataset.dir = Path(C.dataset.dir)

# Seed via the project helper using the configured seed.
seed_everything(C.seed, deterministic=False)

In [5]:
C

{'model': {'name': 'distilbert-base-uncased'},
 'comment': None,
 'result_dir': PosixPath('results/distilbert-base-uncased'),
 'debug': False,
 'seed': 20210425,
 'train': {'SAM': False,
  'folds': [3],
  'checkpoints': ['results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05-dsver2_3.pth'],
  'loss': {'name': 'focal', 'gamma': 2},
  'optimizer': {'name': 'AdamW'},
  'finetune': {'do': True, 'step1_epochs': 2, 'step2_epochs': 4},
  'max_epochs': 10,
  'lr': 1e-05,
  'scheduler': {'name': 'ReduceLROnPlateau',
   'params': {'factor': 0.5, 'patience': 3, 'verbose': True}}},
 'dataset': {'dir': PosixPath('data/ori'),
  'ver': 2,
  'batch_size': 35,
  'num_workers': 8}}

In [5]:
# Build the project trainer from the config and the saved checkpoint; the log line
# printed below confirms the weights were loaded from that path.
# NOTE(review): the positional `2` presumably selects the fold matching the `_2`
# suffix of `postfix` — confirm against main.MyTrainer.
trainer = MyTrainer(C, 2, f"results/distilbert-base-uncased/{postfix}.pth")

Load pretrained results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05-dsver2_2.pth


In [6]:
# Inference only from here on: put the model in eval mode and switch autograd off
# globally (the setting stays in effect for every later cell in this kernel).
model = trainer.model
model.eval()
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f6449cb2150>

In [7]:
# Buffer filled by `hook`. Later cells rebind this name to a fresh list before each
# pass; that works because the hook looks the global name up on every call.
activation = []


def hook(model, input, output):
    """Forward hook: store a detached CPU copy of the layer output in ``activation``."""
    captured = output.detach().cpu()
    activation.append(captured)

In [8]:
# Record the output of `pre_classifier` on every forward pass through `hook`.
# The handle is kept so the hook can be detached with `hook_handle.remove()`;
# re-running this cell without removing it first would register a second hook and
# every batch would then be appended to `activation` twice.
hook_handle = model.pre_classifier.register_forward_hook(hook)
hook_handle

<torch.utils.hooks.RemovableHandle at 0x7f6449cd14d0>

In [9]:
# Loader built by the project helper with the trainer's tokenizer.
# NOTE(review): the positional 100 and 6 are presumably batch size and worker
# count — confirm against datasets.load_train_total_data.
dl = load_train_total_data(C.dataset.dir, trainer.tokenizer, 100, 6, ver=2)

## tdeck 만들기 (모든 train 데이터에 대한 집합)

In [10]:
# Start from an empty hook buffer so `activation` only holds this pass.
activation = []
# Per-batch results; the tensor lists are concatenated in the following cells.
deck = {
    "fcfeat": [],  # first model output per batch (moved to CPU)
    "tlevel": [],  # label tensor yielded by the loader
    "fclevel": [],  # argmax of `fcfeat` along dim 1
    "otext": [],  # original text strings, flattened across batches
}
with tqdm(total=len(dl.dataset), ncols=100, file=sys.stdout) as t:
    # `ids` (not `id`) so the builtin `id` is not shadowed for the rest of the session.
    for ids, text, tlevel, otext in dl:
        pred = model(text.cuda(non_blocking=True))[0].cpu()
        deck["fcfeat"].append(pred)
        deck["tlevel"].append(tlevel)
        deck["fclevel"].append(pred.argmax(dim=1))
        deck["otext"].extend(otext)

        # Advance the bar by the number of samples in this batch.
        t.update(len(ids))

100%|██████████████████████████████████████████████████████| 472972/472972 [25:39<00:00, 307.15it/s]


In [11]:
# Replace the list of per-batch outputs with one tensor. Not idempotent: re-running
# this cell calls torch.cat on a tensor instead of a list.
deck["fcfeat"] = torch.cat(deck["fcfeat"])

In [12]:
# Replace the list of per-batch label tensors with one tensor.
deck["tlevel"] = torch.cat(deck["tlevel"])

In [13]:
# Replace the list of per-batch argmax predictions with one tensor.
deck["fclevel"] = torch.cat(deck["fclevel"])

In [14]:
# `activation` holds what the forward hook appended during the pass above, one
# entry per forward call; concatenate them into a single feature tensor.
deck["feat"] = torch.cat(activation)

In [15]:
deck.keys()

dict_keys(['fcfeat', 'tlevel', 'fclevel', 'otext', 'feat'])

In [16]:
# Persist the train-side deck so later sections can start from disk.
deck1_path = f"results/distilbert-base-uncased/{postfix}-deck1.npz"
np.savez_compressed(
    deck1_path,
    fcfeat=deck["fcfeat"].numpy(),
    tlevel=deck["tlevel"].numpy(),
    fclevel=deck["fclevel"].numpy(),
    feat=deck["feat"].numpy(),
    otext=deck["otext"],  # plain Python list; numpy converts it to an array on save
)

## sdeck 만들기 (모든 test 데이터에 대한 집합)

reload — 커널을 재시작한 뒤 위의 설정 셀(모델 로드와 hook 등록까지)을 다시 실행하고 이 섹션을 진행한다.

In [10]:
# Collect the test-side features (sfeats).
activation = []  # fresh hook buffer for this pass
deck = dict(fcfeat=[], fclevel=[], otext=[])
num_test_samples = len(trainer.dl_test.dataset)
with tqdm(total=num_test_samples, ncols=100, file=sys.stdout) as t:
    for _, text, otext in trainer.dl_test:
        batch_out = model(text.cuda(non_blocking=True))[0]
        pred = batch_out.cpu()
        deck["fcfeat"].append(pred)
        deck["fclevel"].append(pred.argmax(dim=1))
        deck["otext"].extend(otext)
        # Advance by the number of samples in the batch.
        t.update(len(text))

100%|██████████████████████████████████████████████████| 1418916/1418916 [1:17:44<00:00, 304.22it/s]


In [17]:
# Collapse the per-batch lists into single tensors; `feat` comes from the forward
# hook's `activation` buffer. Not idempotent: a second run would pass tensors,
# not lists, to torch.cat.
deck["fcfeat"] = torch.cat(deck["fcfeat"])
deck["fclevel"] = torch.cat(deck["fclevel"])
deck["feat"] = torch.cat(activation)

torch.Size([1418916, 7])

In [22]:
deck.keys()

dict_keys(['fcfeat', 'fclevel', 'otext', 'feat'])

In [23]:
# Persist the test-side deck so later sections can start from disk.
deck2_path = f"results/distilbert-base-uncased/{postfix}-deck2.npz"
np.savez_compressed(
    deck2_path,
    fcfeat=deck["fcfeat"].numpy(),
    fclevel=deck["fclevel"].numpy(),
    feat=deck["feat"].numpy(),
    otext=deck["otext"],  # plain Python list; numpy converts it to an array on save
)

## dist값, KNN level, FC level 을 저장

여기서 reload — 커널을 재시작하고 import / `postfix` / config 셀만 다시 실행한다. 아래 셀은 저장된 deck1/deck2 `.npz` 파일에서 시작한다.

In [5]:
# Re-open the two archives written by the deck-building sections above
# (deck1: train side, deck2: test side).
deck1 = np.load(f"results/distilbert-base-uncased/{postfix}-deck1.npz")
deck2 = np.load(f"results/distilbert-base-uncased/{postfix}-deck2.npz")

In [6]:
# Train-side deck: features live on the GPU for distance computation, labels stay
# on the CPU (they are indexed with CPU neighbor indices later).
tdeck = {
    "feat": torch.from_numpy(deck1["feat"]).cuda(),
    "tlevel": torch.from_numpy(deck1["tlevel"]),
}

In [7]:
# Test-side deck: only `feat` is moved to the GPU; the classifier outputs and
# argmax levels are kept on the CPU.
sdeck = {
    "feat": torch.from_numpy(deck2["feat"]).cuda(),
    "fcfeat": torch.from_numpy(deck2["fcfeat"]),
    "fclevel": torch.from_numpy(deck2["fclevel"]),
}

In [8]:
tdeck["feat"].shape, sdeck["feat"].shape

(torch.Size([472972, 768]), torch.Size([1418916, 768]))

In [9]:
def get_dist(deck, feat, topk):
    """Return the ``topk`` smallest L2 distances from ``feat`` to the rows of ``deck``.

    Args:
        deck: reference features, one per row (the norm is taken over dim 1).
        feat: single query feature; a leading axis is added so it broadcasts
            against every row of ``deck``.
        topk: number of nearest rows to return.

    Returns:
        ``(values, indices)``: the distances, smallest first, and the matching
        row indices into ``deck``.
    """
    offsets = deck - feat.unsqueeze(0)
    dist = torch.norm(offsets, p=None, dim=1)
    nearest = dist.topk(topk, largest=False)  # k nearest neighbors
    return nearest.values, nearest.indices

In [11]:
# Experiment: query a single test feature (row 2) against the train deck and show
# the 8 nearest distances, their train indices, and the labels of those rows.
values, indices = get_dist(tdeck["feat"], sdeck["feat"][2], 8)
print(values)
print(indices)
print(tdeck["tlevel"][indices])

tensor([0.0368, 0.0373, 0.0453, 0.0457, 0.0458, 0.0467, 0.0468, 0.0484],
       device='cuda:0')
tensor([329544, 324718, 432974, 465707, 192642, 438212, 271713, 300748],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1])


In [12]:
# Compute the 8 nearest train neighbors for every test feature.
dists, indices, fcfeats, tlevels = [], [], [], []
num_queries = len(sdeck["feat"])
with tqdm(total=num_queries, ncols=100, file=sys.stdout) as t:
    for i in range(num_queries):
        nn_dist, nn_index = get_dist(tdeck["feat"], sdeck["feat"][i], 8)
        # Bring the results to the CPU; the label tensor being indexed lives there.
        nn_index = nn_index.cpu()
        dists.append(nn_dist.cpu())
        indices.append(nn_index)
        fcfeats.append(sdeck["fcfeat"][i])
        tlevels.append(tdeck["tlevel"][nn_index])

        t.update()

100%|██████████████████████████████████████████████████| 1418916/1418916 [3:14:42<00:00, 121.46it/s]


In [13]:
dists[0], indices[0], fcfeats[0], tlevels[0]

(tensor([0., 0., 0., 0., 0., 0., 0., 0.]),
 tensor([3201, 2550, 1541, 2047,  406,  215,  521,  748]),
 tensor([  5.6391,  -6.5677, -10.9962,  -2.0778, -11.2075,  -5.0317, -10.6327]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]))

In [14]:
dists[0].shape, indices[0].shape, fcfeats[0].shape, tlevels[0].shape

(torch.Size([8]), torch.Size([8]), torch.Size([7]), torch.Size([8]))

In [15]:
# Stack the per-sample results into 2-D tensors (one row per test sample); the
# trailing underscore distinguishes them from the source lists.
dists_ = torch.stack(dists)
indices_ = torch.stack(indices)
fcfeats_ = torch.stack(fcfeats)
tlevels_ = torch.stack(tlevels)

In [16]:
# Persist the k-NN results so the analysis/submission sections can start from disk.
deck3_path = f"results/distilbert-base-uncased/{postfix}-deck3.npz"
np.savez_compressed(
    deck3_path,
    dists=dists_.numpy(),
    indices=indices_.numpy(),
    fcfeats=fcfeats_.numpy(),
    tlevels=tlevels_.numpy(),
)

## deck결과 연구

In [18]:
# Rebuild the loader with a freshly loaded tokenizer (no trainer needed here) and
# read the raw CSVs used to print log lines next to the k-NN results.
# NOTE(review): the positional 100 and 6 are presumably batch size and worker
# count — confirm against datasets.load_train_total_data.
dl = load_train_total_data(C.dataset.dir, DistilBertTokenizer.from_pretrained(C.model.name), 100, 6, ver=2)
ds = dl.dataset
df = pd.read_csv("data/ori/test.csv")  # test rows; `full_log` is read below
tdf = pd.read_csv("data/ori/train.csv")  # train rows; `level` and `full_log` are read below

In [19]:
# Open the k-NN archive (dists / indices / fcfeats / tlevels) saved above.
deck3 = np.load(f"results/distilbert-base-uncased/{postfix}-deck3.npz")

In [20]:
# Convert every stored array to a tensor. `torch.as_tensor` accepts both numpy
# arrays and tensors, so re-running this cell after `deck3` has already been
# converted no longer raises (`torch.from_numpy` rejects tensors).
deck3 = {k: torch.as_tensor(v) for k, v in deck3.items()}

In [21]:
# Classifier-predicted level per test row: argmax over the stored output row.
deck3["fclevels"] = deck3["fcfeats"].argmax(1)

In [22]:
deck3

{'dists': tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0368, 0.0373, 0.0453,  ..., 0.0467, 0.0468, 0.0484],
         ...,
         [0.0062, 0.0072, 0.0085,  ..., 0.0104, 0.0109, 0.0109],
         [0.0027, 0.0027, 0.0027,  ..., 0.0027, 0.0027, 0.0027],
         [0.0025, 0.0025, 0.0025,  ..., 0.0025, 0.0025, 0.0025]]),
 'indices': tensor([[  3201,   2550,   1541,  ...,    215,    521,    748],
         [    17,     13,      8,  ...,      0,      3,      6],
         [329544, 324718, 432974,  ..., 438212, 271713, 300748],
         ...,
         [224941, 102391, 309258,  ..., 237236, 446597,  95542],
         [   102,     71,     66,  ...,     12,     60,     61],
         [    17,     13,      8,  ...,      0,      3,      6]]),
 'fcfeats': tensor([[  5.6391,  -6.5677, -10.9962,  ..., -11.2075,  -5.0317, -10.6327],
         [  6.0033,  -6.9053, -11.4707,  ..., -11.7123,  -5.1550, -11.0984],
   

In [23]:
# Despite the name, `knn` holds the 2000 rows whose *nearest*-neighbor distance
# (column 0 of `dists`) is largest — `topk` defaults to largest=True — i.e. the
# test samples farthest from anything in the train deck.
knn = deck3["dists"][:, 0].topk(2000)
print(" ".join(map(str, knn.indices.tolist())))

776033 1321286 812987 833900 100875 878261 1380816 930198 843165 1401434 1209119 351430 867344 1248775 868865 1336665 912728 613148 54247 1203974 649234 673688 488060 1361861 823869 868277 563330 305976 179014 1028956 1293703 1216814 560070 968749 102363 877420 7890 68203 1101250 1030009 363378 66649 229188 411340 1257904 1001613 1375881 633250 265925 1111068 64630 1380802 146134 22246 784227 148482 885168 79909 133353 435978 640872 507906 948013 66248 464309 1032137 93830 1169358 26611 625020 124131 417984 699735 41200 278138 830753 1201878 1147646 190307 65387 459701 1288254 442860 1111058 74637 198598 1086224 323664 1318463 70089 340297 200861 411194 541624 316400 818149 897437 471875 153382 765739 1267226 758508 1386509 296054 76838 1179074 287035 72520 487726 12250 1240629 1398587 447285 986930 1197207 1294230 400172 792421 254063 283776 1129966 689722 536615 711251 384420 1090762 1392979 1285687 1239920 631395 933984 937013 1186929 723219 634373 683371 1315846 1289695 362550 1170

In [23]:
# Inspect one test row: its neighbor distances/indices, the classifier level, the
# neighbor labels, and the raw log line.
j = 848132
print(deck3["dists"][j])
print(deck3["indices"][j])
print(deck3["fclevels"][j])
print(deck3["tlevels"][j])
print(df.full_log[j])

print()
# Walk the stored neighbors directly rather than assuming there are exactly 8, and
# show each neighbor's train label next to its log line.
for kk in deck3["indices"][j].tolist():
    print(tdf.level[kk], tdf.full_log[kk])
    print()

tensor([ 1.9472,  3.7096,  3.7096,  3.8276,  3.8276,  3.8276,  6.8809, 12.9704])
tensor([221847, 429489, 213304, 406796, 361823, 170386,  87783,  64919])
tensor(6)
tensor([6, 6, 6, 6, 6, 6, 6, 4])
Sep 18 10:31:30 localhost groupadd[5200]: new group: name=kibana, GID=977

6 Jan 29 11:28:59 localhost useradd[88679]: new group: name=test, GID=1001

6 Feb  4 09:59:43 localhost kernel: device enp2s0 entered promiscuous mode

6 Mar  8 15:29:30 localhost kernel: device enp2s0 entered promiscuous mode

6 Feb  4 09:59:33 localhost kernel: device virbr0-nic entered promiscuous mode

6 Feb  2 17:22:58 localhost kernel: device virbr0-nic entered promiscuous mode

6 Feb  8 16:16:47 localhost kernel: device virbr0-nic entered promiscuous mode

6 Jan 30 08:23:17 localhost sshd[18415]: Bad protocol version identification '\003' from 78.128.113.18 port 1073

4 Nov 29 05:44:21 localhost sshd[6008]: Did not receive identification string from 211.253.243.66 port 57487



In [20]:
# Among the far-away samples, list those whose nearest train neighbor has label 6.
for i in knn.indices:
    nearest_label = deck3["tlevels"][i][0]
    if nearest_label.item() == 6:
        print(i)

tensor(563330)
tensor(179014)
tensor(1293703)
tensor(848132)
tensor(64268)
tensor(147553)


## level7 validation 확인

In [24]:
# Same loading step as in the k-NN section above, repeated so this section can be
# run on its own from the saved archives.
deck1 = np.load(f"results/distilbert-base-uncased/{postfix}-deck1.npz")
deck2 = np.load(f"results/distilbert-base-uncased/{postfix}-deck2.npz")

In [25]:
# Train-side deck: features on the GPU, labels on the CPU.
tdeck = {
    "feat": torch.from_numpy(deck1["feat"]).cuda(),
    "tlevel": torch.from_numpy(deck1["tlevel"]),
}

In [26]:
# Test-side deck: only `feat` is moved to the GPU.
sdeck = {
    "feat": torch.from_numpy(deck2["feat"]).cuda(),
    "fcfeat": torch.from_numpy(deck2["fcfeat"]),
    "fclevel": torch.from_numpy(deck2["fclevel"]),
}

In [27]:
tdeck["feat"].shape, sdeck["feat"].shape

(torch.Size([472972, 768]), torch.Size([1418916, 768]))

In [28]:
def get_dist(deck, feat, topk):
    """Return the ``topk`` smallest L2 distances from ``feat`` to the rows of ``deck``.

    This repeats the definition used in the k-NN section so the cell works after
    a kernel restart.

    Args:
        deck: reference features, one per row (the norm is taken over dim 1).
        feat: single query feature; a leading axis is added so it broadcasts
            against every row of ``deck``.
        topk: number of nearest rows to return.

    Returns:
        ``(values, indices)``: the distances, smallest first, and the matching
        row indices into ``deck``.
    """
    offsets = deck - feat.unsqueeze(0)
    dist = torch.norm(offsets, p=None, dim=1)
    nearest = dist.topk(topk, largest=False)  # k nearest neighbors
    return nearest.values, nearest.indices

### 1

In [29]:
# Needs `trainer` (and later `model` plus the registered hook) from the setup
# cells; the NameError recorded below comes from running this after a kernel
# restart without them.
# `sample_id` instead of `id` so the builtin is not shadowed.
sample_id, text, otext = trainer.dl_test2.dataset[0]

NameError: name 'trainer' is not defined

In [None]:
# Reset the hook buffer, then run the single sample (a batch axis is added with
# `text[None]`). `fclevel` is the model's first output moved to the CPU; no argmax
# is applied here, so it is not yet a level.
activation = []
fclevel = model(text[None].cuda())[0].cpu()
fclevel

In [None]:
# `activation[0][0]`: first row of the first hooked output, i.e. the feature of
# the single sample just run. Show its 8 nearest train neighbors and the raw text.
get_dist(tdeck["feat"], activation[0][0].cuda(), 8), otext

### 2

In [35]:
# Second validation sample. `sample_id` instead of `id` so the builtin is not
# shadowed for the rest of the session.
sample_id, text, otext = trainer.dl_test2.dataset[1]

In [36]:
# Reset the hook buffer, then run the single sample (batch axis added with
# `text[None]`). Despite its name, `fclevel` holds the raw output row shown below,
# not an argmax level.
activation = []
fclevel = model(text[None].cuda())[0].cpu()
fclevel

tensor([[ -4.0427,  -3.1190, -10.1804,   1.6434,  -9.0297,  -4.3041,  -8.9703]])

In [37]:
# Feature of the sample just run (first row of the first hooked output): show its
# 8 nearest train neighbors together with the raw text.
get_dist(tdeck["feat"], activation[0][0].cuda(), 8), otext

((tensor([1.6136, 1.6294, 1.7145, 1.7167, 1.7201, 1.7224, 1.7280, 1.7363],
         device='cuda:0'),
  tensor([163298, 299508, 310029, 328946,  11113, 240492, 316291, 260730],
         device='cuda:0')),
 'oscap: msg: "xccdf-result", scan-id: "0001600739632", content: "ssg-centos-7-ds.xml", title: "Prevent Log In to Accounts With Empty Password", id: "xccdf_org.ssgproject.content_rule_no_empty_passwords", result: "fail", severity: "high", description: "If an account is configured for password authentication but does not have an assigned password, it may be possible to log into the account without authentication. Remove any instances of the nullok option in /etc/pam.d/system-auth to prevent logins with empty p')

### 3

In [38]:
# Third validation sample. `sample_id` instead of `id` so the builtin is not
# shadowed for the rest of the session.
sample_id, text, otext = trainer.dl_test2.dataset[2]

In [39]:
# Reset the hook buffer, then run the single sample (batch axis added with
# `text[None]`). Despite its name, `fclevel` holds the raw output row shown below,
# not an argmax level.
activation = []
fclevel = model(text[None].cuda())[0].cpu()
fclevel

tensor([[ 0.3696, -0.4240, -5.7242, -2.2731, -6.0591, -1.3458, -5.9721]])

In [40]:
# Feature of the sample just run (first row of the first hooked output): show its
# 8 nearest train neighbors together with the raw text.
get_dist(tdeck["feat"], activation[0][0].cuda(), 8), otext

((tensor([3.5613, 3.5856, 3.6131, 3.6169, 3.6803, 3.6803, 3.6803, 3.6897],
         device='cuda:0'),
  tensor([ 14398, 340609, 433752, 412324, 379846, 236552,  38289, 115741],
         device='cuda:0')),
 'kernel: Out of memory: Kill process 1736 (probe_rpmverify) score 243 or sacrifice child')

## submission 파일 만들기

In [6]:
# Same loading step as in the analysis section, repeated so the submission
# section can be run after a kernel restart.
# NOTE(review): `dl`, `ds`, `df` and `tdf` are not read by the submission cells
# visible below — confirm whether this cell is still needed.
dl = load_train_total_data(C.dataset.dir, DistilBertTokenizer.from_pretrained(C.model.name), 100, 6, ver=2)
ds = dl.dataset
df = pd.read_csv("data/ori/test.csv")
tdf = pd.read_csv("data/ori/train.csv")

In [7]:
# Open the k-NN archive (dists / indices / fcfeats / tlevels) written earlier.
deck3 = np.load(f"results/distilbert-base-uncased/{postfix}-deck3.npz")

In [8]:
# Convert every stored array to a tensor. `torch.as_tensor` accepts both numpy
# arrays and tensors, so re-running this cell after `deck3` has already been
# converted no longer raises (`torch.from_numpy` rejects tensors).
deck3 = {k: torch.as_tensor(v) for k, v in deck3.items()}

In [9]:
# Classifier-predicted level per test row: argmax over the stored output row.
deck3["fclevels"] = deck3["fcfeats"].argmax(1)

In [10]:
# ver1
def politic_draw(dists, indices, fclevel, tlevels):
    """Choose the final level for one test sample from its k-NN evidence (policy ver1).

    Later cells redefine ``politic_draw``; whichever definition was executed last
    is the one in effect.

    Args:
        dists: 1-D tensor of neighbor distances, nearest first (as returned by
            ``get_dist``).
        indices: neighbor indices; accepted for call compatibility, not used.
        fclevel: 0-dim tensor holding the classifier's predicted level.
        tlevels: 1-D tensor with the train label of each neighbor, nearest first.
            At least 4 entries are needed for policy2 to be able to fire.

    Returns:
        int: the chosen level. 7 is returned for samples judged too far from, or
        too inconsistent with, their train neighbors.
    """
    nearest_dist = dists[0]
    spread = dists[-1] - dists[0]  # gap between farthest and nearest returned neighbor
    top_level = tlevels[0].item()
    # Every neighbor shares the nearest neighbor's label. Uses `.all()` rather than
    # a hard-coded count of 8 so the rule holds for any number of neighbors.
    unanimous = bool((tlevels == tlevels[0]).all())

    # policy1: regardless of distance, if all neighbors agree on level 3 or 5, use it
    if unanimous and top_level in (3, 5):
        return top_level

    # policy2: regardless of distance, if the first 4 neighbors agree on level 2, 4 or 6, use it
    if top_level in (2, 4, 6) and bool((tlevels[1:4] == tlevels[0]).sum() == 3):
        return top_level

    # policy3: nearest distance above 3 -> level 7
    if nearest_dist > 3:
        return 7

    # policy4: nearest distance above 1 and neighbors disagree -> level 7 (tentative)
    if nearest_dist > 1 and not unanimous:
        return 7

    # policy5: distance spread above 0.5 and neighbors disagree -> level 7 (tentative)
    if spread > 0.5 and not unanimous:
        return 7

    # policy6: nearest distance above 0.5 -> trust the classifier's level
    if nearest_dist > 0.5:
        return fclevel.item()

    # otherwise: label of the nearest neighbor
    return top_level

In [26]:
# ver2
def politic_draw(dists, indices, fclevel, tlevels):
    """Choose the final level for one test sample from its k-NN evidence (policy ver2).

    Later cells redefine ``politic_draw``; whichever definition was executed last
    is the one in effect. This version never returns 7 on its own: distant samples
    fall back to the classifier's level.

    Args:
        dists: 1-D tensor of neighbor distances, nearest first (as returned by
            ``get_dist``).
        indices: neighbor indices; accepted for call compatibility, not used.
        fclevel: 0-dim tensor holding the classifier's predicted level.
        tlevels: 1-D tensor with the train label of each neighbor, nearest first.
            At least 4 entries are needed for policy2 to be able to fire.

    Returns:
        int: the chosen level.
    """
    nearest_dist = dists[0]
    top_level = tlevels[0].item()
    # Every neighbor shares the nearest neighbor's label. Uses `.all()` rather than
    # a hard-coded count of 8 so the rule holds for any number of neighbors.
    unanimous = bool((tlevels == tlevels[0]).all())

    # policy1: regardless of distance, if all neighbors agree on level 3 or 5, use it
    if unanimous and top_level in (3, 5):
        return top_level

    # policy2: regardless of distance, if the first 4 neighbors agree on level 2, 4 or 6, use it
    if top_level in (2, 4, 6) and bool((tlevels[1:4] == tlevels[0]).sum() == 3):
        return top_level

    # policy3: nearest distance above 0.5 -> trust the classifier's level
    if nearest_dist > 0.5:
        return fclevel.item()

    # otherwise: label of the nearest neighbor
    return top_level

In [244]:
# ver3
def politic_draw(dists, indices, fclevel, tlevels):
    """Choose the final level for one test sample from its k-NN evidence (policy ver3).

    Later cells redefine ``politic_draw``; whichever definition was executed last
    is the one in effect.

    Args:
        dists: 1-D tensor of neighbor distances, nearest first (as returned by
            ``get_dist``).
        indices: neighbor indices; accepted for call compatibility, not used.
        fclevel: classifier's predicted level; accepted for call compatibility,
            not used by this version.
        tlevels: 1-D tensor with the train label of each neighbor, nearest first.
            At least 4 entries are needed for policy2 to be able to fire.

    Returns:
        int: the chosen level; 7 when the nearest neighbor is farther than 0.5 and
        no label-agreement rule applied.
    """
    nearest_dist = dists[0]
    top_level = tlevels[0].item()
    # Every neighbor shares the nearest neighbor's label. Uses `.all()` rather than
    # a hard-coded count of 8 so the rule holds for any number of neighbors.
    unanimous = bool((tlevels == tlevels[0]).all())

    # policy1: regardless of distance, if all neighbors agree on level 3 or 5, use it
    if unanimous and top_level in (3, 5):
        return top_level

    # policy2: regardless of distance, if the first 4 neighbors agree on level 2, 4 or 6, use it
    if top_level in (2, 4, 6) and bool((tlevels[1:4] == tlevels[0]).sum() == 3):
        return top_level

    # policy3: nearest distance above 0.5 -> level 7
    if nearest_dist > 0.5:
        return 7

    # otherwise: label of the nearest neighbor
    return top_level

In [10]:
# ver4
def politic_draw(dists, indices, fclevel, tlevels):
    """Choose the final level for one test sample from its k-NN evidence (policy ver4).

    This is the last definition of ``politic_draw`` in the notebook and therefore
    the one used when the cells are run top to bottom.

    Args:
        dists: 1-D tensor of neighbor distances, nearest first (as returned by
            ``get_dist``).
        indices: neighbor indices; accepted for call compatibility, not used.
        fclevel: classifier's predicted level; accepted for call compatibility,
            not used by this version.
        tlevels: 1-D tensor with the train label of each neighbor, nearest first.
            At least 5 entries are needed for policy1 and 3 for policy2 to fire.

    Returns:
        int: the chosen level; 7 when the nearest neighbor is farther than 0.4 and
        no label-agreement rule applied.
    """
    nearest_dist = dists[0]
    top_level = tlevels[0].item()

    # policy1: regardless of distance, if the first 5 neighbors agree on level 3 or 5, use it
    if top_level in (3, 5) and bool((tlevels[:5] == tlevels[0]).sum() == 5):
        return top_level

    # policy2: regardless of distance, if the first 3 neighbors agree on level 2, 4 or 6, use it
    if top_level in (2, 4, 6) and bool((tlevels[:3] == tlevels[0]).sum() == 3):
        return top_level

    # policy3: nearest distance above 0.4 -> level 7
    if nearest_dist > 0.4:
        return 7

    # otherwise: label of the nearest neighbor
    return top_level

In [11]:
# Smoke test: apply the currently defined policy to a single test row.
i = 100
politic_draw(deck3["dists"][i], deck3["indices"][i], deck3["fclevels"][i], deck3["tlevels"][i])

0

In [12]:
N = len(deck3["dists"])
# Submission ids run consecutively from 1000000, one per row of `deck3`. Deriving
# the end from N keeps the id list and the level list the same length if the deck
# size ever changes (for the current N this is the original 1000000..2418915).
# NOTE(review): assumes the ids/row order match data/ori/test.csv — confirm.
first_id = 1000000
outdic = {"id": list(range(first_id, first_id + N)), "level": []}
with tqdm(total=N, ncols=100, file=sys.stdout) as t:
    for i in range(N):
        # Apply the currently defined policy to row i.
        v = politic_draw(deck3["dists"][i], deck3["indices"][i], deck3["fclevels"][i], deck3["tlevels"][i])
        outdic["level"].append(v)
        t.update()

100%|███████████████████████████████████████████████████| 1418916/1418916 [02:43<00:00, 8652.41it/s]


In [13]:
# Two-column frame (`id`, `level`) built from the lists collected above.
outdf = pd.DataFrame(outdic)

In [14]:
# Write the submission CSV. The `-ver4` suffix is hard-coded, so it only matches
# the output when the ver4 `politic_draw` cell was the last one executed.
outdf.to_csv(f"results/distilbert-base-uncased/{postfix}-ver4.csv", index=False)