In [1]:
# Load the lab_black IPython extension (presumably auto-formats cells with black — confirm).
%load_ext lab_black
# Move the working directory up one level so the relative paths used below
# ("config/...", "results/...", "data/ori/...") resolve.
# NOTE(review): not idempotent — re-running this cell moves up one more level each time.
%cd ..

/data3/SIG/log-analytics


In [None]:
import argparse
import math
import multiprocessing
import sys
from datetime import datetime
from pathlib import Path
from pprint import pformat

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_optimizer
import yaml
from easydict import EasyDict
from pytorch_transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AlbertForSequenceClassification,
    AlbertTokenizer,
    DebertaForSequenceClassification,
    DebertaTokenizer,
    SqueezeBertTokenizer,
    SqueezeBertForSequenceClassification,
    XLNetTokenizer,
    XLNetForSequenceClassification,
)

from datasets import load_test_data, load_train_data, load_train_total_data
from utils import SAM, AverageMeter, CustomLogger, FocalLoss, seed_everything
from copy import deepcopy

from main import MyTrainer
from collections import defaultdict
import matplotlib.pyplot as plt
import random
from sklearn.metrics import f1_score, classification_report

In [3]:
import os

# Expose only GPU "3" to this process.
# NOTE(review): this is set after `import torch`; it presumably still takes effect because
# CUDA has not been initialised yet at this point — confirm.
os.environ.update(CUDA_VISIBLE_DEVICES="3")

In [4]:
# Which trained run to inspect:
#   fold    - fold index handed to MyTrainer and load_train_data below
#   postfix - file stem of the checkpoint under results/distilbert-base-uncased/
fold = 1
postfix = "distilbert-base-uncased-arcface-AdamW-lr5e-05-dsver5_1"

In [5]:
# Read the experiment YAML into an attribute-style dict. Only the file read needs the
# open handle; the post-processing below runs after the file is closed.
# NOTE(review): yaml.FullLoader is more permissive than yaml.safe_load — acceptable for a
# trusted local config, but do not point this at untrusted files.
with open("config/distilbert-base-uncased-ver5-arcface_focal.yaml", "r") as f:
    C = EasyDict(yaml.load(f, Loader=yaml.FullLoader))

# Convert the two directory entries to Path objects and attach a logger to the config.
C.result_dir = Path(C.result_dir)
C.dataset.dir = Path(C.dataset.dir)
C.log = CustomLogger()

# Seed with the value stored in the config (non-deterministic mode requested).
seed_everything(C.seed, deterministic=False)

In [6]:
# Echo the loaded configuration so the effective settings are visible in the notebook.
C

{'model': {'name': 'distilbert-base-uncased'},
 'comment': None,
 'result_dir': PosixPath('results/distilbert-base-uncased'),
 'debug': False,
 'seed': 20210425,
 'train': {'SAM': False,
  'folds': [1],
  'checkpoints': [None],
  'loss': {'name': 'arcface',
   'params': {'gamma': 2.0, 's': 45.0, 'm': 0.1, 'crit': 'focal'}},
  'optimizer': {'name': 'AdamW'},
  'finetune': {'do': True, 'step1_epochs': 3, 'step2_epochs': 5},
  'max_epochs': 12,
  'lr': 5e-05,
  'scheduler': {'name': 'ReduceLROnPlateau',
   'params': {'factor': 0.5, 'patience': 3, 'verbose': True}}},
 'dataset': {'dir': PosixPath('data/ori'),
  'ver': 5,
  'batch_size': 35,
  'num_workers': 8,
  'oversampling': True,
  'oversampling_scale': 20},
 'log': <utils.CustomLogger at 0x7fa7f7529b00>}

In [7]:
# Build the trainer for the selected fold from the saved checkpoint of this run
# (the recorded output shows it logging "Load pretrained <path>").
checkpoint_path = f"results/distilbert-base-uncased/{postfix}.pth"
trainer = MyTrainer(C, fold, checkpoint_path)

Load pretrained results/distilbert-base-uncased/distilbert-base-uncased-arcface-AdamW-lr5e-05-dsver5_1.pth
[34m[2021-05-09 21:46:40  INFO] Oversampling with scale 20[0m


In [8]:
# Take the network from the trainer and switch it to evaluation mode.
model = trainer.model
model.eval()
# Disable autograd globally: the cells shown in this notebook only run forward passes and
# plain tensor math. This call is the cell's last expression, so its return value is what
# the cell displays.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7fa7f75255f8>

In [9]:
# Module-level buffer that `hook` fills; one entry is appended per call.
activation = []


def hook(model, input, output):
    """Store a detached CPU copy of ``output`` in the global ``activation`` list.

    The three-argument signature matches its registration as a forward hook
    in the next cell. ``model`` and ``input`` are accepted but not used.
    Returns ``None``.
    """
    captured = output.detach()
    activation.append(captured.cpu())

In [10]:
# Attach `hook` to the model's `pre_classifier` submodule so every forward pass records
# that layer's output in `activation`.
#
# The returned handle is kept in `hook_handle` so the hook can be detached later with
# `hook_handle.remove()`. A handle left over from an earlier run of this cell is removed
# first; otherwise each re-run would register one more copy of the hook and every forward
# pass would append duplicate entries to `activation`.
try:
    hook_handle.remove()
except NameError:
    # First run in this kernel: nothing has been registered yet.
    pass
hook_handle = model.pre_classifier.register_forward_hook(hook)
hook_handle

<torch.utils.hooks.RemovableHandle at 0x7fa7f7560438>

In [11]:
# Load the data for this fold with training-set shuffling turned off, so that the order of
# items is stable from run to run. `tdl` / `vdl` are presumably the train and validation
# DataLoaders — confirm against datasets.load_train_data.
tdl, vdl = load_train_data(
    C.dataset.dir,
    C.seed,
    fold,
    trainer.tokenizer,
    50,  # NOTE(review): hard-coded positional value (config batch_size is 35) — confirm which parameter this is
    C.dataset.num_workers,
    C.dataset.ver,
    train_shuffle=False,
)

----

In [37]:
# Raw train / test tables; used further down to look up the original log text by row index.
tdf, sdf = (pd.read_csv(path) for path in ("data/ori/train.csv", "data/ori/test.csv"))

In [12]:
# Load the cached train-side arrays from the deck1 archive: "tlevel" (labels) and "feat"
# (feature vectors).
#
# np.load on an .npz returns a lazily-read NpzFile that keeps the archive open, so it is
# used as a context manager and closed as soon as both arrays have been copied into
# tensors (torch.tensor copies its input). `deck1` stays bound afterwards but is closed.
with np.load(
    "results/distilbert-base-uncased/distilbert-base-uncased-arcface-AdamW-lr5e-05-dsver5_1-deck1.npz"
) as deck1:
    ttlevels = torch.tensor(deck1["tlevel"])
    tfeats = torch.tensor(deck1["feat"])

In [13]:
# Load the cached test-side feature vectors ("feat") from the deck2 archive.
#
# np.load on an .npz returns a lazily-read NpzFile that keeps the archive open, so it is
# used as a context manager and closed once the array has been copied into a tensor
# (torch.tensor copies its input). `deck2` stays bound afterwards but is closed.
with np.load(
    "results/distilbert-base-uncased/distilbert-base-uncased-arcface-AdamW-lr5e-05-dsver5_1-deck2.npz"
) as deck2:
    sfeats = torch.tensor(deck2["feat"])

In [14]:
# Sanity check: show the shapes of the train features, test features and train labels
# (the train features and train labels should have the same number of rows).
tfeats.shape, sfeats.shape, ttlevels.shape

(torch.Size([472972, 768]), torch.Size([1418916, 768]), torch.Size([472972]))

In [15]:
# Copy both feature matrices to the GPU. In this notebook a trailing underscore marks the
# device-side tensors (tfeats_, sfeats_, dist_, ...).
tfeats_, sfeats_ = tfeats.cuda(), sfeats.cuda()

In [16]:
# L2 norm of every training feature row, computed once so it can be reused as the
# denominator term for each query below.
tfeats_star_ = torch.sum(tfeats_ ** 2, dim=1).sqrt()

In [77]:
# Query vector: row 7 of the test-side features (the same index, 7, is used for
# sdf.full_log at the end of the notebook).
sfeat_ = sfeats_[7]

In [78]:
# L2 norm of the query. The leading axis added by unsqueeze(0) is kept, so the result has
# shape (1,) and broadcasts against the per-row training norms.
sfeat_star_ = sfeat_.unsqueeze(0).pow(2).sum(dim=1).sqrt()

In [79]:
# Cosine similarity between the query and every training row: dot product divided by the
# product of the two norms, with the denominator clamped to avoid division by zero.
# Despite the name `dist_`, larger values mean "more similar" (the top-k below is taken
# with largest=True).
dist_ = torch.sum(tfeats_ * sfeat_.unsqueeze(0), dim=1) / torch.clamp(
    tfeats_star_ * sfeat_star_, min=1e-8
)

In [84]:
# The 100 training rows most similar to the query, best first. topk already returns
# exactly 100 entries, so both tensors are printed whole.
# NOTE(review): in the recorded output all 100 similarities print as 1.0000 — worth
# checking whether the features really are that close or something upstream is off.
values_, indices_ = torch.topk(dist_, k=100, dim=0, largest=True)
print(values_, indices_, sep="\n")

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000], device='cuda:0')
tensor([358099, 355501, 358837, 362058, 275334, 274706, 275514, 277581, 250713,
      

In [88]:
# Label stored for training feature row 355501, one of the top-k indices printed above.
ttlevels[355501]

tensor(3)

In [89]:
# Raw log line at the same index in train.csv.
# NOTE(review): assumes row i of the deck1 features corresponds to row i of train.csv —
# confirm (the feature matrix has 472,972 rows per the shape output above).
tdf.full_log[355501]

'Jan 27 20:28:22 localhost sshd[6222]: Failed password for invalid user www from 35.184.222.44 port 49768 ssh2'

In [83]:
# Raw log line of the query (test.csv row 7), for comparison with the retrieved train line.
# NOTE(review): assumes row i of the deck2 features corresponds to row i of test.csv — confirm.
sdf.full_log[7]

'Jan 22 14:13:43 localhost systemd: Unit esild-ml.service entered failed state.'