In [1]:
# Load the lab_black extension (presumably auto-formats cells with black — confirm).
%load_ext lab_black
# Move the working directory three levels up so that the relative "config/..." and
# "results/..." paths used in later cells resolve from there.
# NOTE(review): a relative %cd is not idempotent — re-running this cell moves up again.
%cd ../../..

/home/shim/cev/dl/log-analytics


In [12]:
import argparse
import math
import multiprocessing
import sys
from datetime import datetime
from pathlib import Path
from pprint import pformat

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_optimizer
import yaml
from easydict import EasyDict
from pytorch_transformers import DistilBertForSequenceClassification, DistilBertTokenizer

from datasets.dataset_ver6 import DatasetGeneratorVer6
from utils import SAM, AverageMeter, CustomLogger, FocalLoss, seed_everything

from main_ver6 import MyTrainer
from collections import defaultdict
import matplotlib.pyplot as plt
import random
from sklearn.metrics import f1_score, classification_report
from tqdm import tqdm

In [3]:
# Run selection. `fold` is handed to the trainer; `postfix` is the run name used to
# build the checkpoint path and the file names of the saved decks.
fold = 1
postfix = "distilbert-base-uncased-focal-AdamW-lr2e-05-ver6-os100_1"

In [4]:
# Parse the experiment configuration into an attribute-accessible EasyDict.
# NOTE(review): yaml.FullLoader is kept as in the original; if the config only holds
# plain scalars/lists/maps, yaml.safe_load would be the more conservative choice.
with open("config/distilbert-base-uncased-ver6.yaml", "r") as f:
    C = EasyDict(yaml.load(f, yaml.FullLoader))

# The file is no longer needed once parsed: turn the path-like entries into Path
# objects and seed the RNGs from the configured seed.
C.result_dir = Path(C.result_dir)
C.dataset.dir = Path(C.dataset.dir)
seed_everything(C.seed, deterministic=False)

In [5]:
# Display the parsed configuration for reference.
C

{'model': {'name': 'distilbert-base-uncased'},
 'comment': None,
 'result_dir': PosixPath('results/distilbert-base-uncased-ver6'),
 'debug': False,
 'seed': 20210425,
 'ver': 6,
 'train': {'SAM': False,
  'folds': [1],
  'checkpoints': [None],
  'loss': {'name': 'focal',
   'params': {'gamma': 2.0, 's': 45.0, 'm': 0.1, 'crit': 'focal'}},
  'optimizer': {'name': 'AdamW'},
  'finetune': {'do': True, 'step1_epochs': 2, 'step2_epochs': 3},
  'max_epochs': 6,
  'lr': 2e-05,
  'scheduler': {'name': 'ReduceLROnPlateau',
   'params': {'factor': 0.5, 'patience': 3, 'verbose': True}}},
 'dataset': {'dir': PosixPath('data/ver6'),
  'batch_size': 35,
  'num_workers': 8,
  'oversampling': True,
  'oversampling_scale': 100}}

In [6]:
# Build the trainer for the selected fold, loading the pretrained weights of the
# run named by `postfix` from its checkpoint file.
checkpoint_path = f"results/distilbert-base-uncased-ver6/{postfix}.pth"
trainer = MyTrainer(C, fold, checkpoint_path)

Load pretrained results/distilbert-base-uncased-ver6/distilbert-base-uncased-focal-AdamW-lr2e-05-ver6-os100_1.pth


In [7]:
# Inference only: put the model in eval mode (nn.Module.eval() returns the module
# itself) and switch autograd off for the rest of the session (global setting).
model = trainer.model.eval()
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7fa431f32e50>

In [8]:
# Buffer that collects one tensor per forward pass of the hooked layer.
activation = []


def hook(model, input, output):
    """Forward hook that stores a detached CPU copy of the layer output.

    The signature follows the forward-hook convention (module, inputs, output);
    only `output` is used. The copy is appended to the module-level `activation`
    list, which is looked up at call time, so rebinding `activation` to a new
    list redirects subsequent captures.
    """
    captured = output.detach()
    activation.append(captured.cpu())

In [9]:
# Register the capture hook on the layer feeding the classifier head.
# The handle is kept so the hook can be removed later, and any hook left over from a
# previous run of this cell is removed first: otherwise every re-run would stack
# another hook and each forward pass would append to `activation` more than once.
if "hook_handle" in globals():
    hook_handle.remove()
hook_handle = model.pre_classifier.register_forward_hook(hook)
hook_handle

<torch.utils.hooks.RemovableHandle at 0x7fa593263250>

In [10]:
# Loader used to build the train deck (presumably a DataLoader over the training
# split only — confirm against DatasetGeneratorVer6.train_only).
# NOTE(review): the validation loader `vdl` used by the valid-deck cell is not
# created in any visible cell; it must be defined before that cell is run.
train_dl = trainer.dsgen.train_only()

## 계획

train / valid deck을 만들고, valid에 대해 euclidean/cosine distance를 통해 구한 결과의 F1-score 차이를 구해본다. 어떤 metric이 좋을지 확인.

## make train deck

In [None]:
# Run the whole training loader through the model. The forward hook on
# `model.pre_classifier` appends to `activation` during every forward pass, so the
# buffer is reset first; the captured features are concatenated in a later cell.
activation = []
deck = dict(feat=[], otext=[], tlevel=[], fclevel=[])
with tqdm(total=len(train_dl.dataset), ncols=100, file=sys.stdout) as pbar:
    for text, tlevel, otext in train_dl:
        # First model output, arg-maxed over dim 1 -> predicted level per sample.
        head_out = model(text.cuda(non_blocking=True))[0]
        fclevel = head_out.argmax(dim=1).cpu()
        deck["tlevel"].append(tlevel)
        deck["fclevel"].append(fclevel)
        deck["otext"].extend(otext)
        pbar.update(text.size(0))

 12%|██████▎                                                | 48685/421079 [02:23<18:20, 338.26it/s]

In [15]:
# Collapse the per-batch lists into single tensors.
# `feat` comes from the hook buffer; the label lists are only concatenated while they
# are still lists, so re-running this cell does not fail on an already-built tensor.
deck["feat"] = torch.cat(activation)
for key in ("tlevel", "fclevel"):
    if isinstance(deck[key], list):
        deck[key] = torch.cat(deck[key])

# One feature row per label. A mismatch means the hook fired a different number of
# times than expected (e.g. it was registered twice, or the buffer was not reset).
assert (
    deck["feat"].size(0) == deck["tlevel"].size(0) == deck["fclevel"].size(0)
), "feature/label row counts differ - check the forward hook and the activation buffer"

In [16]:
# Sanity check: true and predicted label tensors should have the same length.
deck["tlevel"].shape, deck["fclevel"].shape

(torch.Size([378377]), torch.Size([378377]))

In [17]:
# Sanity check: one captured feature row per sample.
deck["feat"].shape

torch.Size([378377, 768])

In [18]:
# Persist the train deck (features, true labels, classifier predictions, raw text)
# as a compressed .npz archive.
# NOTE(review): this directory differs from C.result_dir
# ('results/distilbert-base-uncased-ver6') - confirm that is intended.
tdeck_path = f"results/distilbert-base-uncased/{postfix}-tdeck.npz"
tdeck_arrays = {
    "feat": deck["feat"].numpy(),
    "tlevel": deck["tlevel"].numpy(),
    "fclevel": deck["fclevel"].numpy(),
    "otext": deck["otext"],
}
np.savez_compressed(tdeck_path, **tdeck_arrays)

## make valid deck

In [19]:
# Run the validation loader through the model, mirroring the train-deck cell.
# The forward hook on `model.pre_classifier` appends to `activation` during every
# forward pass, so the buffer is reset first; features are concatenated afterwards.
# NOTE(review): `vdl` is not defined in any visible cell - it has to be created
# (from the trainer's dataset generator, presumably) before this cell can run.
activation = []
deck = {"feat": [], "otext": [], "tlevel": [], "fclevel": []}
with tqdm(total=len(vdl.dataset), ncols=100, file=sys.stdout) as t:
    # `ids` instead of `id`: loop variables leak into the notebook's global
    # namespace, and `id` would shadow the builtin for every later cell.
    for ids, text, tlevel, otext in vdl:
        # First model output, arg-maxed over dim 1 -> predicted level per sample.
        fclevel = model(text.cuda(non_blocking=True))[0].argmax(dim=1).cpu()
        deck["fclevel"].append(fclevel)
        deck["tlevel"].append(tlevel)
        deck["otext"].extend(otext)
        t.update(len(ids))

100%|████████████████████████████████████████████████████████| 94595/94595 [05:05<00:00, 309.63it/s]


In [20]:
# Concatenate the per-batch activations captured by the forward hook.
deck["feat"] = torch.cat(activation)

In [21]:
# Concatenate the per-batch true labels.
# NOTE(review): not idempotent - the list is replaced by a tensor, so this cell
# cannot be run twice without rebuilding the deck.
deck["tlevel"] = torch.cat(deck["tlevel"])

In [22]:
# Concatenate the per-batch classifier predictions.
# NOTE(review): not idempotent - the list is replaced by a tensor, so this cell
# cannot be run twice without rebuilding the deck.
deck["fclevel"] = torch.cat(deck["fclevel"])

In [23]:
# Sanity check: true and predicted label tensors should have the same length.
deck["tlevel"].shape, deck["fclevel"].shape

(torch.Size([94595]), torch.Size([94595]))

In [24]:
# Sanity check: one captured feature row per sample.
deck["feat"].shape

torch.Size([94595, 768])

In [25]:
# Persist the valid deck (features, true labels, classifier predictions, raw text)
# as a compressed .npz archive.
# NOTE(review): this directory differs from C.result_dir
# ('results/distilbert-base-uncased-ver6') - confirm that is intended.
vdeck_path = f"results/distilbert-base-uncased/{postfix}-vdeck.npz"
vdeck_arrays = {
    "feat": deck["feat"].numpy(),
    "tlevel": deck["tlevel"].numpy(),
    "fclevel": deck["fclevel"].numpy(),
    "otext": deck["otext"],
}
np.savez_compressed(vdeck_path, **vdeck_arrays)

## valid 데이터에 대해 euclidean distance를 구한다

In [6]:
# Reload the decks written by the cells above, so this section can be run without
# repeating the forward passes (it only needs `postfix` and the saved files).
tdeck = np.load(f"results/distilbert-base-uncased/{postfix}-tdeck.npz")
vdeck = np.load(f"results/distilbert-base-uncased/{postfix}-vdeck.npz")

In [7]:
# Wrap the stored arrays as tensors: features and true labels of the train deck
# (t*) and of the valid deck (v*).
tfeat, tlevel = (torch.from_numpy(tdeck[key]) for key in ("feat", "tlevel"))
vfeat, vlevel = (torch.from_numpy(vdeck[key]) for key in ("feat", "tlevel"))

In [8]:
# GPU copies of both feature matrices (trailing underscore = CUDA tensor); the
# nearest-neighbour searches below run on these.
tfeat_ = tfeat.cuda()
vfeat_ = vfeat.cuda()

In [9]:
# Classifier-head predictions for the validation set, as stored in the valid deck;
# used later as the baseline to compare the nearest-neighbour results against.
vfclevel = torch.from_numpy(vdeck["fclevel"])

In [10]:
# Brute-force nearest-neighbour search: for every validation feature, compute the
# distance to every training feature and keep the TOP_K closest ones.
# `dists` collects the distances, `distis` the row indices into the train deck.
TOP_K = 4  # neighbours kept per validation sample

dists, distis = [], []
# Iterate the validation rows directly; the labels are not needed here.
for v_ in tqdm(vfeat_, total=vfeat_.size(0), ncols=100, file=sys.stdout):
    # p=None is kept from the original call (presumably resolves to the 2-norm,
    # matching the Euclidean distance this section is about - confirm).
    dist_, disti_ = (tfeat_ - v_[None]).norm(p=None, dim=1).topk(TOP_K, largest=False)
    dists.append(dist_.cpu())
    distis.append(disti_.cpu())

100%|████████████████████████████████████████████████████████| 94595/94595 [09:47<00:00, 161.11it/s]


In [11]:
# Stack the per-sample top-k distances into one tensor (one row per validation sample).
euc_dists = torch.stack(dists)

In [12]:
# Stack the per-sample top-k train-deck indices (one row per validation sample).
euc_distis = torch.stack(distis)

In [13]:
# Accuracy of the 1-nearest-neighbour label (Euclidean): the label of the closest
# training sample is compared with the true validation label.
# The mean is taken over a float mask instead of dividing an integer count, which
# floors or raises on older torch releases.
(tlevel[euc_distis[:, 0]] == vlevel).float().mean()

tensor(0.9975)

In [14]:
# Macro F1 of the 1-nearest-neighbour label (Euclidean) against the true validation
# labels; only the closest training sample (column 0) is used.
f1_score(vlevel, tlevel[euc_distis[:, 0]], average="macro", zero_division=1)

0.9943748841526491

In [15]:
# Per-class precision/recall/F1 for the Euclidean 1-nearest-neighbour labels.
print(classification_report(vlevel, tlevel[euc_distis[:, 0]], zero_division=1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     66813
           1       1.00      1.00      1.00     26504
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00       828
           4       1.00      1.00      1.00         2
           5       0.97      0.97      0.97       444
           6       1.00      1.00      1.00         2

    accuracy                           1.00     94595
   macro avg       1.00      0.99      0.99     94595
weighted avg       1.00      1.00      1.00     94595



In [16]:
# Baseline: macro F1 of the classifier-head predictions stored in the valid deck,
# for comparison with the nearest-neighbour scores.
f1_score(vlevel, vfclevel, average="macro", zero_division=1)

0.7965123538395745

In [17]:
# Per-class precision/recall/F1 for the classifier-head predictions.
print(classification_report(vlevel, vfclevel, zero_division=1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     66813
           1       1.00      0.99      1.00     26504
           2       0.50      1.00      0.67         2
           3       1.00      0.95      0.97       828
           4       1.00      1.00      1.00         2
           5       0.91      0.97      0.94       444
           6       1.00      0.00      0.00         2

    accuracy                           1.00     94595
   macro avg       0.92      0.84      0.80     94595
weighted avg       1.00      1.00      1.00     94595



## valid 데이터에 대해 cosine distance를 구한다.

In [18]:
# L2 norm of every training feature row, computed once so the cosine loop below does
# not recompute it per query (sqrt_ works in place on the temporary row sums).
tfeat_star_ = tfeat_.pow(2).sum(dim=1).sqrt_()

In [19]:
# Brute-force nearest-neighbour search by cosine similarity: for every validation
# feature, score it against every training feature and keep the TOP_K best matches.
# NOTE: despite the `dists` name, the stored values are similarities - higher means
# closer, hence topk(largest=True). `distis` holds row indices into the train deck.
TOP_K = 4  # neighbours kept per validation sample

dists, distis = [], []
# Iterate the validation rows directly; the labels are not needed here.
for v_ in tqdm(vfeat_, total=vfeat_.size(0), ncols=100, file=sys.stdout):
    vstar_ = v_[None].pow(2).sum(dim=1).sqrt_()  # L2 norm of the query row
    # dot(t, v) / (|t| * |v|); the clamp guards against division by zero.
    sim_ = (tfeat_ * v_[None]).sum(dim=1) / (tfeat_star_ * vstar_).clamp_(min=1e-8)
    sim_, simi_ = sim_.topk(TOP_K, largest=True)
    dists.append(sim_.cpu())
    distis.append(simi_.cpu())

100%|████████████████████████████████████████████████████████| 94595/94595 [10:32<00:00, 149.61it/s]


In [20]:
# Stack the per-sample top-k cosine scores into one tensor (one row per validation sample).
ang_dists = torch.stack(dists)

In [21]:
# Stack the per-sample top-k train-deck indices (one row per validation sample).
ang_distis = torch.stack(distis)

In [22]:
# Accuracy of the 1-nearest-neighbour label (cosine): the label of the best-matching
# training sample is compared with the true validation label.
# The mean is taken over a float mask instead of dividing an integer count, which
# floors or raises on older torch releases.
(tlevel[ang_distis[:, 0]] == vlevel).float().mean()

tensor(0.9975)

In [23]:
# Macro F1 of the 1-nearest-neighbour label (cosine) against the true validation
# labels; only the best-matching training sample (column 0) is used.
f1_score(vlevel, tlevel[ang_distis[:, 0]], average="macro", zero_division=1)

0.9944678530549746

In [24]:
# Per-class precision/recall/F1 for the cosine 1-nearest-neighbour labels.
print(classification_report(vlevel, tlevel[ang_distis[:, 0]], zero_division=1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     66813
           1       1.00      1.00      1.00     26504
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00       828
           4       1.00      1.00      1.00         2
           5       0.97      0.97      0.97       444
           6       1.00      1.00      1.00         2

    accuracy                           1.00     94595
   macro avg       1.00      0.99      0.99     94595
weighted avg       1.00      1.00      1.00     94595



In [25]:
# Persist the top-k results of both searches for the validation set: scores and
# train-deck indices for the Euclidean (euc_*) and the cosine (ang_*) variant.
valid_dists_path = f"results/distilbert-base-uncased/{postfix}-valid_dists.npz"
valid_dists_arrays = {
    "euc_dists": euc_dists.numpy(),
    "euc_distis": euc_distis.numpy(),
    "ang_dists": ang_dists.numpy(),
    "ang_distis": ang_distis.numpy(),
}
np.savez_compressed(valid_dists_path, **valid_dists_arrays)

## 사담

### 과연 ArcFace Loss를 쓰는게 좋을까?