In [1]:
# Load the lab_black IPython extension.
%load_ext lab_black
# Step up one directory; the cell output shows the resulting working directory.
# The relative paths used below (config/, data/, results/) are resolved from there.
%cd ..

/home/shim/cev/dl/log-analytics


In [2]:
import argparse
import math
import multiprocessing
import sys
from datetime import datetime
from pathlib import Path
from pprint import pformat

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_optimizer
import yaml
from easydict import EasyDict
from pytorch_transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AlbertForSequenceClassification,
    AlbertTokenizer,
    DebertaForSequenceClassification,
    DebertaTokenizer,
    SqueezeBertTokenizer,
    SqueezeBertForSequenceClassification,
    XLNetTokenizer,
    XLNetForSequenceClassification,
)

from datasets import load_test_data, load_train_data, MyDataset, load_train_total_data
from utils import SAM, AverageMeter, CustomLogger, FocalLoss, seed_everything

from main import MyTrainer
from collections import defaultdict
import matplotlib.pyplot as plt
import random

In [3]:
# Parse the YAML experiment configuration; the file handle is only needed for parsing.
with open("config/squeezebert-uncased.yaml", "r") as config_file:
    raw_config = yaml.load(config_file, yaml.FullLoader)

# Attribute-style access to the configuration, with path-like entries turned into Path.
C = EasyDict(raw_config)
C.result_dir = Path(C.result_dir)
C.dataset.dir = Path(C.dataset.dir)

# Seed the random number generators from the configured seed.
seed_everything(C.seed, deterministic=False)

In [4]:
# Display the loaded configuration.
C

{'model': {'name': 'squeezebert/squeezebert-uncased'},
 'comment': None,
 'result_dir': PosixPath('results/squeezebert-uncased'),
 'debug': False,
 'seed': 20210425,
 'train': {'SAM': False,
  'folds': [4],
  'checkpoints': [None],
  'loss': {'name': 'focal', 'gamma': 2},
  'optimizer': {'name': 'AdamW'},
  'finetune': {'do': True, 'step1_epochs': 3, 'step2_epochs': 5},
  'max_epochs': 10,
  'lr': 1e-05,
  'scheduler': {'name': 'ReduceLROnPlateau',
   'params': {'factor': 0.5, 'patience': 3, 'verbose': True}}},
 'dataset': {'dir': PosixPath('data/ori'), 'batch_size': 20, 'num_workers': 8}}

In [6]:
# Build the trainer from the configuration and a saved checkpoint; the cell output
# reports "Load pretrained <path>" for the path given here.
# NOTE(review): the second argument (1) is presumably the fold index, matching the
# "_1" suffix of the checkpoint file name -- confirm against MyTrainer in main.py.
trainer = MyTrainer(
    C, 1, "results/squeezebert-uncased/squeezebert-uncased-focal-AdamW-lr1e-05_1.pth"
)

Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing SqueezeBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of SqueezeBertForSequenceClassification were no

Load pretrained results/squeezebert-uncased/squeezebert-uncased-focal-AdamW-lr1e-05_1.pth


In [7]:
# Inference only: take the trainer's model and switch it to eval mode.
model = trainer.model
model.eval()
# Disable autograd globally. This stays in effect for every later cell run in
# this kernel, not only for this one.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f23843b9c90>

In [8]:
# Display the module tree (used to locate the layer that receives the forward hook).
model

SqueezeBertForSequenceClassification(
  (transformer): SqueezeBertModel(
    (embeddings): SqueezeBertEmbeddings(
      (word_embeddings): Embedding(30528, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): SqueezeBertEncoder(
      (layers): ModuleList(
        (0): SqueezeBertModule(
          (attention): SqueezeBertSelfAttention(
            (query): Conv1d(768, 768, kernel_size=(1,), stride=(1,), groups=4)
            (key): Conv1d(768, 768, kernel_size=(1,), stride=(1,), groups=4)
            (value): Conv1d(768, 768, kernel_size=(1,), stride=(1,), groups=4)
            (dropout): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
            (matmul_qk): MatMulWrapper()
            (matmul_qkv): MatMulWrapper()
          )
          (post_attention): ConvDr

In [9]:
# Outputs captured by `hook`, one entry per forward pass, in call order.
activation = []


def hook(model, input, output):
    """Forward hook that stores a detached CPU copy of the hooked module's output.

    The signature follows the PyTorch forward-hook convention
    ``hook(module, input, output)``; only ``output`` is used. The copy is
    appended to the notebook-level ``activation`` list.
    """
    captured = output.detach().cpu()
    activation.append(captured)

In [10]:
# Capture the output of the pooler's dense layer on every forward pass.
# Keep the returned handle so the hook can be removed later. If this cell is
# re-run, remove the previous registration first; otherwise `hook` would fire
# twice per forward pass and `activation` would hold duplicated batches.
if "hook_handle" in globals():
    hook_handle.remove()
hook_handle = model.transformer.pooler.dense.register_forward_hook(hook)
hook_handle

<torch.utils.hooks.RemovableHandle at 0x7f23843b9750>

In [11]:
# Data loader built by load_train_total_data from the dataset directory and the
# trainer's tokenizer; it is iterated below as (id, text, tlevel, otext) batches.
# NOTE(review): 100 and 6 are presumably batch size and number of workers --
# confirm against load_train_total_data in datasets.py.
dl = load_train_total_data(C.dataset.dir, trainer.tokenizer, 100, 6)

## tdeck 만들기 (모든 train 데이터에 대한 집합)

In [13]:
# Run the model over every batch of `dl` and collect, per batch:
#   fcfeat  - first element of the model output (classification scores)
#   tlevel  - the level labels yielded by the loader
#   fclevel - argmax of the classification scores
#   otext   - the original text yielded by the loader
# The forward hook registered earlier appends the pooler output of each forward
# pass to `activation`, so the list is reset here before the pass starts.
activation = []
deck = {
    "fcfeat": [],
    "tlevel": [],
    "fclevel": [],
    "otext": [],
}
with tqdm(total=len(dl.dataset), ncols=100, file=sys.stdout) as t:
    # `ids` rather than `id`: loop variables are notebook globals, and `id`
    # would shadow the builtin for every later cell.
    for ids, text, tlevel, otext in dl:
        pred = model(text.cuda(non_blocking=True))[0].cpu()
        deck["fcfeat"].append(pred)
        deck["tlevel"].append(tlevel)
        deck["fclevel"].append(pred.argmax(dim=1))
        deck["otext"].extend(otext)

        # Exactly one hooked activation must exist per processed batch. A
        # mismatch means the hook is missing or registered more than once, which
        # would silently misalign "feat" with "fcfeat" -- fail on the first batch
        # instead of after the whole pass.
        if len(activation) != len(deck["fcfeat"]):
            raise RuntimeError(
                f"forward hook captured {len(activation)} activations for "
                f"{len(deck['fcfeat'])} batches; check the hook registration"
            )

        t.update(len(ids))

100%|██████████████████████████████████████████████████████| 472972/472972 [42:06<00:00, 187.18it/s]


In [14]:
# Merge the per-batch score tensors into a single tensor.
# NOTE: rebinding in place means this cell is meant to be run once per pass.
deck["fcfeat"] = torch.cat(deck["fcfeat"])

In [15]:
# Merge the per-batch label tensors into a single tensor (run once per pass).
deck["tlevel"] = torch.cat(deck["tlevel"])

In [16]:
# Merge the per-batch argmax predictions into a single tensor (run once per pass).
deck["fclevel"] = torch.cat(deck["fclevel"])

In [17]:
# Concatenate the batches captured by the forward hook into one feature tensor.
deck["feat"] = torch.cat(activation)

In [18]:
# Sanity check: list the entries now present in the deck.
deck.keys()

dict_keys(['fcfeat', 'tlevel', 'fclevel', 'otext', 'feat'])

In [21]:
# Persist the tensor entries of the training deck as NumPy arrays.
# The "otext" entry is not written to the archive.
deck1_arrays = {
    key: deck[key].numpy() for key in ("fcfeat", "tlevel", "fclevel", "feat")
}
np.savez_compressed(
    "results/squeezebert-uncased/squeezebert-uncased-focal-AdamW-lr1e-05_1-deck1.npz",
    **deck1_arrays,
)

## sdeck 만들기 (모든 test 데이터에 대한 집합)

여기서 커널을 재시작(reload)한 뒤, 위의 설정 셀들을 다시 실행하고 이어서 진행한다.

In [12]:
# Collect the test-set deck: run the model over every batch of the trainer's
# test loader and keep the classification scores ("fcfeat"), their argmax
# ("fclevel") and the original text ("otext"). The forward hook appends the
# pooler output of each forward pass to `activation`, so reset it first.
activation = []
deck = {"fcfeat": [], "fclevel": [], "otext": []}
with tqdm(total=len(trainer.dl_test.dataset), ncols=100, file=sys.stdout) as t:
    for _, text, otext in trainer.dl_test:
        pred = model(text.cuda(non_blocking=True))[0].cpu()
        deck["fcfeat"].append(pred)
        deck["fclevel"].append(pred.argmax(dim=1))
        deck["otext"].extend(otext)

        # Exactly one hooked activation must exist per processed batch. A
        # mismatch means the hook is missing or registered more than once, which
        # would silently misalign "feat" with "fcfeat" -- fail on the first batch
        # instead of after the whole (multi-hour) pass.
        if len(activation) != len(deck["fcfeat"]):
            raise RuntimeError(
                f"forward hook captured {len(activation)} activations for "
                f"{len(deck['fcfeat'])} batches; check the hook registration"
            )

        t.update(len(text))

100%|██████████████████████████████████████████████████| 1418916/1418916 [2:16:34<00:00, 173.16it/s]


In [13]:
# Merge the per-batch lists into single tensors; "feat" comes from the batches
# captured by the forward hook. Rebinding in place: run once per pass.
deck["fcfeat"] = torch.cat(deck["fcfeat"])
deck["fclevel"] = torch.cat(deck["fclevel"])
deck["feat"] = torch.cat(activation)

In [14]:
# Persist the tensor entries of the test deck as NumPy arrays.
# The "otext" entry is not written to the archive.
deck2_arrays = {key: deck[key].numpy() for key in ("fcfeat", "fclevel", "feat")}
np.savez_compressed(
    "results/squeezebert-uncased/squeezebert-uncased-focal-AdamW-lr1e-05_1-deck2.npz",
    **deck2_arrays,
)

## dist 값, KNN level, FC level을 저장

여기서 커널을 재시작(reload)한다. 이후 셀은 저장해 둔 deck1/deck2 npz 파일을 불러와서 진행한다.

In [3]:
# Reopen the archives written earlier: deck1 holds the training-set arrays,
# deck2 the test-set arrays.
deck1 = np.load(
    "results/squeezebert-uncased/squeezebert-uncased-focal-AdamW-lr1e-05_1-deck1.npz"
)
deck2 = np.load(
    "results/squeezebert-uncased/squeezebert-uncased-focal-AdamW-lr1e-05_1-deck2.npz"
)

In [4]:
# Training deck used as the kNN reference set: features are moved to the GPU,
# the labels stay on the CPU.
tdeck = {
    "feat": torch.from_numpy(deck1["feat"]).cuda(),
    "tlevel": torch.from_numpy(deck1["tlevel"]),
}

In [5]:
# Test deck used as the kNN queries: features are moved to the GPU, the
# classification scores and their argmax stay on the CPU.
sdeck = {
    "feat": torch.from_numpy(deck2["feat"]).cuda(),
    "fcfeat": torch.from_numpy(deck2["fcfeat"]),
    "fclevel": torch.from_numpy(deck2["fclevel"]),
}

In [6]:
# Sanity check: (number of samples, feature width) for the train and test decks.
tdeck["feat"].shape, sdeck["feat"].shape

(torch.Size([472972, 768]), torch.Size([1418916, 768]))

In [7]:
def get_dist(deck, feat, topk):
    """Return the ``topk`` nearest rows of ``deck`` to ``feat`` by Euclidean distance.

    Args:
        deck: 2-D tensor of reference features, one row per sample.
        feat: 1-D query feature with the same width as the rows of ``deck``.
        topk: number of nearest neighbours requested. If it exceeds the number
            of rows in ``deck``, all rows are returned instead of raising.

    Returns:
        ``(values, indices)``: the distances in ascending order and the matching
        row indices into ``deck``, both on the device of the inputs.
    """
    # Explicit p=2 (Euclidean) instead of relying on what p=None falls back to.
    dist = torch.norm(deck - feat[None], p=2, dim=1)
    # topk raises when k is larger than the dimension size, so clamp it.
    k = min(topk, dist.shape[0])
    values, indices = dist.topk(k, largest=False)  # knn
    return values, indices

In [8]:
# Experiment: 8 nearest training samples for one test sample (row 2), printing
# their distances, their indices and the training labels at those indices.
values, indices = get_dist(tdeck["feat"], sdeck["feat"][2], 8)
print(values)
print(indices)
print(tdeck["tlevel"][indices])

tensor([0.0308, 0.0328, 0.0368, 0.0369, 0.0370, 0.0381, 0.0381, 0.0382],
       device='cuda:0')
tensor([324718, 252143, 449088, 356215,  83123, 329544, 159483, 327411],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1])


In [10]:
# Compute, for every test sample, its 8 nearest training samples.
# Per sample this stores the neighbour distances, the neighbour indices, the
# sample's own classification scores and the neighbours' training labels.
dists, indices, fcfeats, tlevels = [], [], [], []
num_queries = len(sdeck["feat"])
with tqdm(total=num_queries, ncols=100, file=sys.stdout) as t:
    for i in range(num_queries):
        knn_dist, knn_index = get_dist(tdeck["feat"], sdeck["feat"][i], 8)
        # Results come back on the GPU; keep CPU copies only.
        knn_index = knn_index.cpu()
        dists.append(knn_dist.cpu())
        indices.append(knn_index)
        fcfeats.append(sdeck["fcfeat"][i])
        tlevels.append(tdeck["tlevel"][knn_index])

        t.update()

100%|██████████████████████████████████████████████████| 1418916/1418916 [3:04:35<00:00, 128.11it/s]


In [11]:
# Inspect the collected results for the first test sample.
dists[0], indices[0], fcfeats[0], tlevels[0]

(tensor([0., 0., 0., 0., 0., 0., 0., 0.]),
 tensor([3201, 2550, 1541, 2047,  406,  215,  521,  748]),
 tensor([ 4.4087, -1.7369, -3.3400, -2.2295, -4.4363, -2.2275, -3.1217]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]))

In [12]:
# Per-sample shapes: 8 neighbours (dists / indices / tlevels) and 7 class scores.
dists[0].shape, indices[0].shape, fcfeats[0].shape, tlevels[0].shape

(torch.Size([8]), torch.Size([8]), torch.Size([7]), torch.Size([8]))

In [13]:
# Stack the per-sample lists into single tensors with one row per test sample.
dists_ = torch.stack(dists)
indices_ = torch.stack(indices)
fcfeats_ = torch.stack(fcfeats)
tlevels_ = torch.stack(tlevels)

In [14]:
# Persist the stacked kNN results (one row per test sample) as NumPy arrays.
deck3_arrays = {
    "dists": dists_.numpy(),
    "indices": indices_.numpy(),
    "fcfeats": fcfeats_.numpy(),
    "tlevels": tlevels_.numpy(),
}
np.savez_compressed(
    "results/squeezebert-uncased/squeezebert-uncased-focal-AdamW-lr1e-05_1-deck3.npz",
    **deck3_arrays,
)

## deck 결과 연구

In [5]:
# Rebuild the training loader (this time with a freshly loaded tokenizer) and read
# the raw CSVs so kNN indices can be mapped back to the original log lines.
# NOTE(review): the CSV paths repeat the directory held in C.dataset.dir as literals.
dl = load_train_total_data(C.dataset.dir, SqueezeBertTokenizer.from_pretrained(C.model.name), 100, 6)
ds = dl.dataset
df = pd.read_csv("data/ori/test.csv")  # test rows, indexed like the rows of deck3
tdf = pd.read_csv("data/ori/train.csv")  # training rows, looked up via deck3["indices"]

In [7]:
# Reopen the kNN results archive written earlier.
deck3 = np.load("results/squeezebert-uncased/squeezebert-uncased-focal-AdamW-lr1e-05_1-deck3.npz")

In [8]:
# Replace the loaded archive with a plain dict of torch tensors.
# NOTE: `deck3` is rebound to a different kind of object here, so this cell is
# meant to be run once after the load cell.
deck3 = {k: torch.from_numpy(v) for k, v in deck3.items()}

In [12]:
# Classifier prediction per test sample: argmax over the class scores.
deck3["fclevels"] = deck3["fcfeats"].argmax(1)

In [13]:
# Display the whole dict (tensor reprs are abbreviated by torch).
deck3

{'dists': tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0308, 0.0328, 0.0368,  ..., 0.0381, 0.0381, 0.0382],
         ...,
         [0.0237, 0.0278, 0.0351,  ..., 0.0372, 0.0389, 0.0396],
         [0.0228, 0.0236, 0.0245,  ..., 0.0262, 0.0279, 0.0287],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]),
 'indices': tensor([[  3201,   2550,   1541,  ...,    215,    521,    748],
         [    17,     13,      8,  ...,      0,      3,      6],
         [324718, 252143, 449088,  ..., 329544, 159483, 327411],
         ...,
         [206732,  10751, 222608,  ..., 145230, 221758, 216741],
         [153755, 387601, 124924,  ...,  97088, 177289, 290495],
         [    17,     13,      8,  ...,      0,      3,      6]]),
 'fcfeats': tensor([[ 4.4087, -1.7369, -3.3400,  ..., -4.4363, -2.2275, -3.1217],
         [ 4.4719, -1.7817, -3.2795,  ..., -4.4334, -2.2657, -3.2150],
         [-1.68

In [10]:
# The 2000 test samples whose nearest training neighbour is farthest away
# (largest first-neighbour distance); print their row indices.
knn = deck3["dists"][:, 0].topk(2000)
print(" ".join(str(row) for row in knn.indices.tolist()))

1274395 536615 812987 7890 1170801 349066 586315 892213 1239920 1392979 706410 93934 1019475 648933 340297 956511 329693 247539 160810 1413155 1271251 737453 81139 384420 199860 77440 1085297 1223462 171740 427956 64268 1131555 672208 660701 1313835 1007511 581422 451841 100875 968749 503551 507397 792421 1059158 1093095 1405043 843241 1132003 776033 1321286 85474 763582 290496 276564 880670 667667 1090762 258420 268382 848132 741654 1373814 1284064 398587 82287 649135 539834 153891 214469 521295 1224816 711251 1356744 182445 1213976 122720 1014018 227078 1350761 102363 877420 963100 671198 1030009 1213279 6003 1118924 1257904 147553 1111068 138723 594674 829306 1391989 1376627 926229 526783 51953 784227 633250 64630 66649 538051 22246 1375881 1293703 146134 1001613 1169229 363378 229188 68203 1101250 764979 594972 179014 1400122 148482 547899 872936 521634 184448 540106 397914 933984 631395 451943 170457 607089 134165 464615 337847 652696 1388489 725055 583008 82047 1335932 1229763 69

In [19]:
# Same selection as the previous cell, printing the first-neighbour distances
# themselves, rounded to two decimals.
knn = deck3["dists"][:, 0].topk(2000)
print(" ".join(f"{dist:.2f}" for dist in knn.values.tolist()))

13.02 11.84 10.90 10.74 10.59 9.97 9.97 9.97 9.14 7.39 7.16 6.89 6.76 6.65 6.63 6.58 6.56 6.49 6.44 6.44 6.35 6.32 6.27 6.27 6.24 6.19 6.17 6.12 6.09 6.02 5.84 5.81 5.67 5.62 5.61 5.60 5.58 5.58 5.56 5.53 5.52 5.52 5.51 5.51 5.50 5.48 5.47 5.44 5.43 5.43 5.42 5.40 5.39 5.39 5.39 5.36 5.35 5.30 5.30 5.27 5.27 5.25 5.22 5.22 5.18 5.16 5.16 5.16 5.15 5.15 5.15 5.14 5.13 5.11 5.11 5.09 5.09 5.09 5.09 5.08 5.08 5.05 5.05 5.04 5.03 5.03 5.03 5.03 5.02 5.02 5.02 5.00 5.00 5.00 5.00 4.99 4.99 4.98 4.98 4.98 4.98 4.97 4.97 4.97 4.97 4.96 4.95 4.95 4.95 4.95 4.94 4.93 4.93 4.93 4.92 4.92 4.91 4.90 4.89 4.88 4.88 4.86 4.86 4.84 4.83 4.83 4.82 4.82 4.82 4.81 4.80 4.80 4.79 4.78 4.78 4.77 4.76 4.75 4.74 4.74 4.73 4.72 4.72 4.72 4.72 4.70 4.70 4.70 4.69 4.69 4.68 4.67 4.67 4.67 4.66 4.66 4.66 4.64 4.63 4.63 4.63 4.61 4.60 4.58 4.57 4.57 4.56 4.56 4.54 4.54 4.53 4.53 4.50 4.48 4.47 4.46 4.45 4.42 4.42 4.42 4.41 4.41 4.40 4.40 4.38 4.38 4.37 4.36 4.36 4.34 4.34 4.32 4.32 4.30 4.30 4.29 4.29 4.28 4.28 

In [21]:
# Inspect one test sample: its kNN distances, neighbour indices, classifier
# level and neighbour labels, followed by its raw log line.
j = 538051
for field in ("dists", "indices", "fclevels", "tlevels"):
    print(deck3[field][j])
print(df.full_log[j])

print()
# Then show the label and raw log line of each of its 8 nearest training samples.
for kk in deck3["indices"][j][:8].tolist():
    print(tdf.level[kk], tdf.full_log[kk])
    print()

tensor([4.9702, 4.9781, 4.9800, 4.9847, 4.9921, 5.0372, 5.0385, 5.0829])
tensor([339998, 227437, 240241, 328741, 202870,  10516, 110767,  17628])
tensor(1)
tensor([1, 1, 1, 1, 1, 1, 1, 1])
type=ANOM_ABEND msg=audit(1610091955.122:54204): auid=4294967295 uid=48 gid=48 ses=4294967295 subj=system_u:system_r:httpd_t:s0 pid=25413 comm="httpd" reason="memory violation" sig=11

1 type=USER_AVC msg=audit(1613363937.137:2481): pid=813 uid=81 auid=4294967295 ses=4294967295 subj=system_u:system_r:system_dbusd_t:s0-s0:c0.c1023 msg='avc:  denied  { send_msg } for msgtype=method_call interface=org.freedesktop.login1.Manager member=CreateSession dest=org.freedesktop.login1 spid=30614 tpid=860 scontext=system_u:system_r:httpd_t:s0 tcontext=system_u:system_r:systemd_logind_t:s0 tclass=dbus  exe="/usr/bin/dbus-daemon" sauid=81 hostname=? addr=? terminal=?'

1 type=AVC msg=audit(1608006885.884:140093): avc:  denied  { nlmsg_relay } for  pid=9856 comm="sudo" scontext=system_u:system_r:httpd_t:s0 tcontext=

In [27]:
# ver2
def politic_draw(dists, indices, fclevel, tlevels, dist_threshold=0.5):
    """Choose the final level for one test sample from its kNN result (version 2).

    NOTE: a later cell defines ``politic_draw`` again (ver3); when the notebook
    is run top to bottom, that definition replaces this one.

    Args:
        dists: 1-D tensor of neighbour distances, nearest first.
        indices: neighbour indices; accepted for a uniform call signature but
            not used by the policy.
        fclevel: 0-d tensor with the classifier's predicted level.
        tlevels: 1-D tensor with the training labels of the neighbours, in the
            same order as ``dists``.
        dist_threshold: nearest-neighbour distance above which the neighbours
            are no longer trusted (policy 3).

    Returns:
        The chosen level as a Python int.
    """
    nearest_level = tlevels[0].item()
    # "All neighbours agree" is checked against the actual neighbour count
    # rather than a hard-coded 8, so other values of k work as well.
    all_same = bool((tlevels == tlevels[0]).all())

    # policy1: regardless of distance, if every neighbour is level 3 or 5, use it
    if all_same and nearest_level in (3, 5):
        return nearest_level

    # policy2: regardless of distance, if the first four neighbours share
    # level 2, 4 or 6, use it
    if nearest_level in (2, 4, 6) and bool((tlevels[1:4] == tlevels[0]).sum() == 3):
        return nearest_level

    # policy3: if the nearest neighbour is farther than the threshold, fall
    # back to the classifier's own prediction
    if dists[0] > dist_threshold:
        return fclevel.item()

    # otherwise: trust the nearest neighbour
    return nearest_level

In [28]:
# ver3
def politic_draw(
    dists, indices, fclevel, tlevels, dist_threshold=0.5, unknown_level=7
):
    """Choose the final level for one test sample from its kNN result (version 3).

    Args:
        dists: 1-D tensor of neighbour distances, nearest first.
        indices: neighbour indices; accepted for a uniform call signature but
            not used by the policy.
        fclevel: classifier's predicted level; accepted for a uniform call
            signature but not used by this version.
        tlevels: 1-D tensor with the training labels of the neighbours, in the
            same order as ``dists``.
        dist_threshold: nearest-neighbour distance above which the sample is
            treated as unlike anything in the training set (policy 3).
        unknown_level: level returned for such samples. The default 7 lies
            outside the classifier's 7 scores (levels 0-6) -- presumably the
            "unseen" level; confirm against the task definition.

    Returns:
        The chosen level as a Python int.
    """
    nearest_level = tlevels[0].item()
    # "All neighbours agree" is checked against the actual neighbour count
    # rather than a hard-coded 8, so other values of k work as well.
    all_same = bool((tlevels == tlevels[0]).all())

    # policy1: regardless of distance, if every neighbour is level 3 or 5, use it
    if all_same and nearest_level in (3, 5):
        return nearest_level

    # policy2: regardless of distance, if the first four neighbours share
    # level 2, 4 or 6, use it
    if nearest_level in (2, 4, 6) and bool((tlevels[1:4] == tlevels[0]).sum() == 3):
        return nearest_level

    # policy3: if the nearest neighbour is farther than the threshold, return
    # the unknown level
    if dists[0] > dist_threshold:
        return unknown_level

    # otherwise: trust the nearest neighbour
    return nearest_level

In [29]:
# Spot-check the policy on a single test row.
i = 100
politic_draw(deck3["dists"][i], deck3["indices"][i], deck3["fclevels"][i], deck3["tlevels"][i])

0

In [30]:
# Apply the policy to every test row and collect the resulting levels.
# The id column is the hard-coded inclusive range 1000000..2418915, which has
# the same length as the deck in the run shown (1418916 rows).
N = len(deck3["dists"])
outdic = {"id": list(range(1000000, 2418915 + 1)), "level": []}
policy_inputs = ("dists", "indices", "fclevels", "tlevels")
with tqdm(total=N, ncols=100, file=sys.stdout) as t:
    for i in range(N):
        outdic["level"].append(politic_draw(*(deck3[key][i] for key in policy_inputs)))
        t.update()

100%|██████████████████████████████████████████████████| 1418916/1418916 [02:08<00:00, 11068.10it/s]


In [31]:
# Two-column frame (id, level); construction fails if the two lists differ in length.
outdf = pd.DataFrame(outdic)

In [32]:
# Write the predictions without the index column; the "ver3" suffix in the file
# name refers to the policy version used above.
outdf.to_csv("results/squeezebert-uncased/squeezebert-uncased-focal-AdamW-lr1e-05_1-ver3.csv", index=False)