In [1]:
# Load the lab_black extension (formats cells with black).
%load_ext lab_black
# Move the working directory up one level; the resulting path is echoed in the output.
%cd ..

/data3/SIG/log-analytics


In [28]:
import argparse
import math
import multiprocessing
import sys
from datetime import datetime
from pathlib import Path
from pprint import pformat

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_optimizer
import yaml
from easydict import EasyDict
from pytorch_transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AlbertForSequenceClassification,
    AlbertTokenizer,
    DebertaForSequenceClassification,
    DebertaTokenizer,
    SqueezeBertTokenizer,
    SqueezeBertForSequenceClassification,
    XLNetTokenizer,
    XLNetForSequenceClassification,
)

from datasets import load_test_data, load_train_data, MyDataset, load_train_total_data
from utils import SAM, AverageMeter, CustomLogger, FocalLoss, seed_everything

from main import MyTrainer
from collections import defaultdict
import matplotlib.pyplot as plt
import random
from sklearn.metrics import f1_score, classification_report

In [3]:
import os

# Expose only physical GPU 3 to this process. This has to be set before CUDA is first
# initialized, otherwise it has no effect on the already-created context.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [4]:
# Fold index to evaluate; it is passed to MyTrainer and load_train_data further down.
fold = 4
# Checkpoint stem, also reused to name the saved deck / distance files.
# The trailing number is derived from `fold` so the two values cannot drift apart.
postfix = f"distilbert-base-uncased-focal-AdamW-lr1e-05_{fold}"

In [5]:
# Read the experiment configuration; EasyDict allows attribute-style access (C.seed, C.dataset.dir, ...).
# NOTE(review): yaml.FullLoader is acceptable for a trusted, in-repo config file only.
with open("config/distilbert-base-uncased.yaml", "r") as config_file:
    C = EasyDict(yaml.load(config_file, yaml.FullLoader))

# Turn the directory entries into Path objects.
C.dataset.dir = Path(C.dataset.dir)
C.result_dir = Path(C.result_dir)

# seed_everything is a project helper from utils; presumably it seeds the RNGs — confirm there.
seed_everything(C.seed, deterministic=False)

In [6]:
# Display the loaded configuration for reference.
C

{'model': {'name': 'distilbert-base-uncased'},
 'comment': None,
 'result_dir': PosixPath('results/distilbert-base-uncased'),
 'debug': False,
 'seed': 20210425,
 'train': {'SAM': False,
  'folds': [2, 3],
  'checkpoints': ['results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_2.pth',
   'results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_3.pth'],
  'loss': {'name': 'focal', 'gamma': 2},
  'optimizer': {'name': 'AdamW'},
  'finetune': {'do': True, 'step1_epochs': 2, 'step2_epochs': 4},
  'max_epochs': 12,
  'lr': 1e-05,
  'scheduler': {'name': 'ReduceLROnPlateau',
   'params': {'factor': 0.5, 'patience': 3, 'verbose': True}}},
 'dataset': {'dir': PosixPath('data/ori'),
  'ver': 1,
  'batch_size': 35,
  'num_workers': 8}}

In [7]:
# Build the project trainer for this fold and hand it the fold's checkpoint path
# (the log line in the output reports the checkpoint being loaded).
trainer = MyTrainer(C, fold, f"results/distilbert-base-uncased/{postfix}.pth")

Load pretrained results/distilbert-base-uncased/distilbert-base-uncased-focal-AdamW-lr1e-05_4.pth


In [8]:
# Inference only: put the model in eval mode and switch gradient tracking off globally,
# which stays in effect for every later cell in this kernel session.
model = trainer.model
model.eval()
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f05a06d79b0>

In [9]:
# Buffer filled by `hook`: one entry is appended per hook call.
activation = []


def hook(model, input, output):
    """Forward-hook callback that stores a detached CPU copy of `output` in `activation`.

    `model` and `input` belong to the forward-hook call signature and are not used here.
    """
    captured = output.detach()
    activation.append(captured.cpu())

In [10]:
# Capture the output of `pre_classifier` on every forward pass via `hook`.
# Caution: re-running this cell registers the hook a second time, so each forward pass
# would then be captured twice. The returned handle (shown in the output) has .remove().
model.pre_classifier.register_forward_hook(hook)

<torch.utils.hooks.RemovableHandle at 0x7f05a06ce208>

In [11]:
# Train / validation loaders for this fold; train_shuffle=False is passed so the training
# loader is not shuffled.
# NOTE(review): the positional literal 50 is not taken from the config — presumably the
# batch size (C.dataset.batch_size is 35); confirm against load_train_data's signature.
tdl, vdl = load_train_data(
    C.dataset.dir, C.seed, fold, trainer.tokenizer, 50, C.dataset.num_workers, C.dataset.ver, train_shuffle=False
)

## 계획

train / valid deck을 만들고, valid에 대해 Euclidean/cosine distance를 통해 구한 결과의 F1-score 차이를 구해 본다. 어떤 metric이 좋을지 확인한다.

## make train deck

In [12]:
# Build the training "deck": for every training batch collect the classifier prediction
# (fclevel = argmax over the first model output), the label tensor yielded by the loader
# (tlevel) and the accompanying text entries (otext — presumably the original text).
# The pre_classifier features are captured as a side effect by the forward hook into the
# global `activation`, so that buffer is reset before the pass.
activation = []
deck = {"feat": [], "otext": [], "tlevel": [], "fclevel": []}
with tqdm(total=len(tdl.dataset), ncols=100, file=sys.stdout) as t:
    # `ids`, not `id`: a top-level loop variable named `id` would rebind the builtin
    # id() for every later cell in the kernel.
    for ids, text, tlevel, otext in tdl:
        fclevel = model(text.cuda(non_blocking=True))[0].argmax(dim=1).cpu()
        deck["fclevel"].append(fclevel)
        deck["tlevel"].append(tlevel)
        deck["otext"].extend(otext)
        t.update(len(ids))

# The hook must have fired exactly once per batch; a mismatch means the hook is missing
# or was registered more than once, which would misalign features and labels.
assert len(activation) == len(deck["fclevel"]), (
    f"captured {len(activation)} feature batches for {len(deck['fclevel'])} forward passes"
)

100%|██████████████████████████████████████████████████████| 378378/378378 [39:31<00:00, 159.58it/s]


In [13]:
# Concatenate the per-batch features captured by the hook into one tensor.
deck["feat"] = torch.cat(activation)

In [14]:
# Merge the per-batch label tensors. Not idempotent: after this runs, deck["tlevel"] is a
# tensor rather than a list, so re-running the cell will not work as intended.
deck["tlevel"] = torch.cat(deck["tlevel"])

In [15]:
# Merge the per-batch classifier predictions (same re-run caveat as for the labels:
# the list is replaced by a tensor in place).
deck["fclevel"] = torch.cat(deck["fclevel"])

In [16]:
# Sanity check: labels and predictions should have the same length.
deck["tlevel"].shape, deck["fclevel"].shape

(torch.Size([378378]), torch.Size([378378]))

In [17]:
# Sanity check: one feature row per sample.
deck["feat"].shape

torch.Size([378378, 768])

In [18]:
# Persist the training deck next to the checkpoint so the distance experiments further
# down can reload it with np.load instead of repeating the forward pass.
np.savez_compressed(
    f"results/distilbert-base-uncased/{postfix}-tdeck.npz",
    feat=deck["feat"].numpy(),
    tlevel=deck["tlevel"].numpy(),
    fclevel=deck["fclevel"].numpy(),
    otext=deck["otext"],  # a Python list; NumPy converts it to an array when saving
)

## make valid deck

In [12]:
# Build the validation "deck" exactly like the training one, but from `vdl`: per batch
# collect the classifier prediction (fclevel = argmax over the first model output), the
# label tensor yielded by the loader (tlevel) and the accompanying text entries (otext).
# The pre_classifier features are captured as a side effect by the forward hook into the
# global `activation`, so that buffer is reset before the pass.
activation = []
deck = {"feat": [], "otext": [], "tlevel": [], "fclevel": []}
with tqdm(total=len(vdl.dataset), ncols=100, file=sys.stdout) as t:
    # `ids`, not `id`: a top-level loop variable named `id` would rebind the builtin
    # id() for every later cell in the kernel.
    for ids, text, tlevel, otext in vdl:
        fclevel = model(text.cuda(non_blocking=True))[0].argmax(dim=1).cpu()
        deck["fclevel"].append(fclevel)
        deck["tlevel"].append(tlevel)
        deck["otext"].extend(otext)
        t.update(len(ids))

# The hook must have fired exactly once per batch; a mismatch means the hook is missing
# or was registered more than once, which would misalign features and labels.
assert len(activation) == len(deck["fclevel"]), (
    f"captured {len(activation)} feature batches for {len(deck['fclevel'])} forward passes"
)

100%|████████████████████████████████████████████████████████| 94594/94594 [10:30<00:00, 150.08it/s]


In [13]:
# Concatenate the per-batch features captured by the hook into one tensor.
deck["feat"] = torch.cat(activation)

In [14]:
# Merge the per-batch label tensors. Not idempotent: after this runs, deck["tlevel"] is a
# tensor rather than a list, so re-running the cell will not work as intended.
deck["tlevel"] = torch.cat(deck["tlevel"])

In [15]:
# Merge the per-batch classifier predictions (same re-run caveat as for the labels:
# the list is replaced by a tensor in place).
deck["fclevel"] = torch.cat(deck["fclevel"])

In [16]:
# Sanity check: labels and predictions should have the same length.
deck["tlevel"].shape, deck["fclevel"].shape

(torch.Size([94594]), torch.Size([94594]))

In [17]:
# Sanity check: one feature row per sample.
deck["feat"].shape

torch.Size([94594, 768])

In [18]:
# Persist the validation deck next to the checkpoint so the distance experiments further
# down can reload it with np.load instead of repeating the forward pass.
np.savez_compressed(
    f"results/distilbert-base-uncased/{postfix}-vdeck.npz",
    feat=deck["feat"].numpy(),
    tlevel=deck["tlevel"].numpy(),
    fclevel=deck["fclevel"].numpy(),
    otext=deck["otext"],  # a Python list; NumPy converts it to an array when saving
)

## valid 데이터에 대해 euclidean distance를 구한다

In [7]:
# Reload the saved decks. np.load on an .npz returns a lazy archive: each array is read
# from disk when its key is accessed. Requires `postfix` from the earlier cell.
tdeck = np.load(f"results/distilbert-base-uncased/{postfix}-tdeck.npz")
vdeck = np.load(f"results/distilbert-base-uncased/{postfix}-vdeck.npz")

In [8]:
# CPU tensors for the saved features and labels: t* = training deck, v* = validation deck.
tfeat = torch.from_numpy(tdeck["feat"])
tlevel = torch.from_numpy(tdeck["tlevel"])
vfeat = torch.from_numpy(vdeck["feat"])
vlevel = torch.from_numpy(vdeck["tlevel"])

In [9]:
# GPU copies of both feature matrices (trailing underscore marks the CUDA version);
# the nearest-neighbour searches below run on these.
tfeat_ = tfeat.cuda()
vfeat_ = vfeat.cuda()

In [30]:
# Classifier-head predictions stored in the validation deck, used as the baseline.
vfclevel = torch.from_numpy(vdeck["fclevel"])

In [11]:
# Euclidean nearest neighbours: for each validation feature, find the 4 closest training
# features. `dists` collects the distances, `distis` the matching row indices into the
# training deck; both are brought back to the CPU per query.
dists, distis = [], []
with tqdm(total=vfeat.size(0), ncols=100, file=sys.stdout) as t:
    # Only the features are needed for the search, so iterate over them directly.
    for v_ in vfeat_:
        # Broadcast one query row against all training rows; p=2 states the L2 norm
        # explicitly instead of relying on the default order. largest=False keeps the
        # smallest distances, i.e. the nearest neighbours, in ascending order.
        dist_, disti_ = (tfeat_ - v_[None]).norm(p=2, dim=1).topk(4, largest=False)
        dists.append(dist_.cpu())
        distis.append(disti_.cpu())
        t.update()

100%|████████████████████████████████████████████████████████| 94594/94594 [13:22<00:00, 117.92it/s]


In [15]:
# Stack the per-query results into one tensor (one row per validation sample, 4 neighbours).
# Must run before `dists` is reused by the cosine section.
euc_dists = torch.stack(dists)

In [18]:
# Matching training-row indices of those neighbours; column 0 is the nearest one.
# Must run before `distis` is reused by the cosine section.
euc_distis = torch.stack(distis)

In [26]:
# Accuracy of 1-nearest-neighbour labelling (Euclidean): label of the closest training
# feature compared with the true validation label.
(tlevel[euc_distis[:, 0]] == vlevel).sum() / vlevel.size(0)

tensor(0.9975)

In [29]:
# Macro F1 of the 1-nearest-neighbour prediction (Euclidean, nearest neighbour only).
# zero_division=1 sets the value sklearn reports for metrics whose denominator is zero.
f1_score(vlevel, tlevel[euc_distis[:, 0]], average="macro", zero_division=1)

0.85166449681792

In [46]:
# Per-class breakdown for the Euclidean 1-nearest-neighbour prediction.
print(classification_report(vlevel, tlevel[euc_distis[:, 0]], zero_division=1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     66813
           1       1.00      1.00      1.00     26503
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00       828
           4       1.00      1.00      1.00         2
           5       0.98      0.96      0.97       444
           6       1.00      0.00      0.00         1

    accuracy                           1.00     94594
   macro avg       1.00      0.85      0.85     94594
weighted avg       1.00      1.00      1.00     94594



In [32]:
# Baseline: macro F1 of the model's own classifier-head predictions (fclevel from the deck).
f1_score(vlevel, vfclevel, average="macro", zero_division=1)

0.8534070957211375

In [47]:
# Per-class breakdown for the classifier-head baseline.
print(classification_report(vlevel, vfclevel, zero_division=1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     66813
           1       1.00      0.99      1.00     26503
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00       828
           4       1.00      1.00      1.00         2
           5       1.00      0.96      0.98       444
           6       1.00      0.00      0.00         1

    accuracy                           1.00     94594
   macro avg       1.00      0.85      0.85     94594
weighted avg       1.00      1.00      1.00     94594



## valid 데이터에 대해 cosine distance를 구한다.

In [33]:
# L2 norm of every training feature row, computed once so the loop below can reuse it.
tfeat_star_ = tfeat_.pow(2).sum(dim=1).sqrt_()

In [48]:
# Cosine nearest neighbours: for each validation feature, find the 4 training features
# with the highest cosine similarity. Note that despite the names, `dists` holds
# similarities here (larger = closer), which is why topk uses largest=True.
# Depends on `tfeat_star_` (row norms of the training features) from the previous cell.
dists, distis = [], []
with tqdm(total=vfeat.size(0), ncols=100, file=sys.stdout) as t:
    # Only the features are needed for the search, so iterate over them directly.
    for v_ in vfeat_:
        # Norm of the query row.
        vstar_ = v_[None].pow(2).sum(dim=1).sqrt_()
        # Dot product divided by the product of norms; the clamp avoids division by zero.
        dist_ = (tfeat_ * v_[None]).sum(dim=1) / (tfeat_star_ * vstar_).clamp_(min=1e-8)
        dist_, disti_ = dist_.topk(4, largest=True)
        dists.append(dist_.cpu())
        distis.append(disti_.cpu())
        t.update()

100%|████████████████████████████████████████████████████████| 94594/94594 [14:16<00:00, 110.46it/s]


In [51]:
# Stack the per-query top-4 values. These are cosine similarities (higher = closer),
# not distances, despite the variable name.
ang_dists = torch.stack(dists)

In [52]:
# Matching training-row indices; column 0 is the most similar training feature.
ang_distis = torch.stack(distis)

In [53]:
# Accuracy of 1-nearest-neighbour labelling (cosine similarity): label of the most similar
# training feature compared with the true validation label.
(tlevel[ang_distis[:, 0]] == vlevel).sum() / vlevel.size(0)

tensor(0.9975)

In [54]:
# Macro F1 of the 1-nearest-neighbour prediction (cosine similarity, nearest neighbour only).
# zero_division=1 sets the value sklearn reports for metrics whose denominator is zero.
f1_score(vlevel, tlevel[ang_distis[:, 0]], average="macro", zero_division=1)

0.8516586039454035

In [55]:
# Per-class breakdown for the cosine 1-nearest-neighbour prediction.
print(classification_report(vlevel, tlevel[ang_distis[:, 0]], zero_division=1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     66813
           1       1.00      1.00      1.00     26503
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00       828
           4       1.00      1.00      1.00         2
           5       0.98      0.96      0.97       444
           6       1.00      0.00      0.00         1

    accuracy                           1.00     94594
   macro avg       1.00      0.85      0.85     94594
weighted avg       1.00      1.00      1.00     94594



In [56]:
# Persist both nearest-neighbour results for the validation set.
# euc_* are Euclidean distances / indices; ang_* are cosine similarities / indices.
np.savez_compressed(
    f"results/distilbert-base-uncased/{postfix}-valid_dists.npz",
    euc_dists=euc_dists.numpy(),
    euc_distis=euc_distis.numpy(),
    ang_dists=ang_dists.numpy(),
    ang_distis=ang_distis.numpy(),
)

## 사담

### 과연 ArcFace Loss를 쓰는 게 좋을까?