# HierBERT

In [8]:
import os

import hydra
from omegaconf import OmegaConf
from IPython.display import HTML, display
import torch
import pytorch_lightning as pl
from pytorch_lightning.plugins import DDPPlugin
from transformers import BertJapaneseTokenizer

from src.visualization.plot_attention import plot_word_attentions, plot_sent_attentions

In [9]:
# os.path.abspath drops the trailing slash, so it is re-added in the
# `data.dir` override below.
abs_data_path = os.path.abspath("data/nested_sample/")
baseline_name = 'ave_pooled_base-v2'

# Overrides applied on top of config/defaults.yaml for the HierBERT baseline.
hierbert_overrides = [
    f"name={baseline_name}",
    "model=HierBERT",
    "data=wereWolf_sample",
    f"data.dir={abs_data_path}/",
    "trainer.gpus=[6]",
    "model.tokenizer.pretrained_model=cl-tohoku/bert-base-japanese-v2",
    "model.sent_level_BERT_config.hidden_size=768",
    "model.sent_level_BERT_config.num_hidden_layers=12",
    "model.sent_level_BERT_config.num_attention_heads=12",
]

with hydra.initialize(config_path='config'):
    cfg = hydra.compose(config_name="defaults.yaml", overrides=hierbert_overrides)

# Print the composed configuration with interpolations resolved.
print(OmegaConf.to_yaml(cfg, resolve=True))

# `_recursive_=False` is forwarded to Hydra so that the nested config nodes
# passed as keyword arguments (tokenizer, BERT config, optimizer) reach the
# target as config objects instead of being instantiated by Hydra itself.
data_module = hydra.utils.instantiate(
    cfg.model.data_module,
    data_dir=cfg.data.dir,
    tokenizer=cfg.model.tokenizer,
    _recursive_=False,
)

model = hydra.utils.instantiate(
    cfg.model.model,
    pretrained_model=cfg.model.tokenizer.pretrained_model,
    sent_level_BERT_config=cfg.model.sent_level_BERT_config,
    optim=cfg.optim,
    _recursive_=False,
)

# Trainer options come from the `trainer` config group; logging is disabled
# because this notebook only runs prediction.
trainer_kwargs = OmegaConf.to_container(cfg.trainer)
trainer = pl.Trainer(logger=False, plugins=DDPPlugin(), **trainer_kwargs)



model:
  name: HierBERT
  model:
    num_labels: 2
    _target_: src.model.HierBERT.HierchicalBERT
    use_ave_pooled_output: true
    output_attentions: true
    is_japanese: true
  tokenizer:
    _target_: src.tokenizer.tokenizer_HierBERT.HierBertTokenizer
    sent_length: 256
    doc_length: 256
    pretrained_model: cl-tohoku/bert-base-japanese-v2
  data_module:
    _target_: src.model.HierBERTDataModule.CreateHierBertDataModule
    batch_size: 64
  sent_level_BERT_config:
    _target_: transformers.BertConfig
    hidden_size: 768
    num_hidden_layers: 12
    num_attention_heads: 12
data:
  name: wereWolf_sample
  dir: /disk/ssd14tb/haoki/Documents/vscode-workplaces/lie_detector/data/nested_sample/
optim:
  name: AdamW
  optimizer:
    _target_: torch.optim.AdamW
    lr: 0.001
    weight_decay: 0.01
experiment: baseline
name: ave_pooled_base-v2
message: null
trainer:
  accumulate_grad_batches: 1
  benchmark: true
  deterministic: true
  fast_dev_run: false
  gpus:
  - 6
  max_epoc

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification

In [10]:
# Epoch number of the checkpoint to load for prediction.
best_epoch = 2
# TODO: make the data.name part `wereWolf` (the dataset segment of the path is hard-coded for now).
ckpt_path = f'outputs/wereWolf/{cfg.model.name}/baseline/{cfg.name}/checkpoints/epoch={best_epoch}.ckpt'
print(ckpt_path)
# Run prediction; the checkpoint path is handed to the trainer via `ckpt_path`.
outputs = trainer.predict(model=model, datamodule=data_module, ckpt_path=ckpt_path)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


outputs/wereWolf/HierBERT/baseline/ave_pooled_base-v2/checkpoints/epoch=2.ckpt


Predicting: 0it [00:00, ?it/s]

In [None]:
# Concatenate the per-batch prediction dictionaries into single tensors and
# move them to the CPU, as the HAN collection cell in this notebook does, so
# the decoding/plotting cells below do not depend on the prediction device.
# `.cpu()` is a no-op for tensors that already live on the CPU.
logits = torch.cat([batch['logits'] for batch in outputs], dim=0).cpu()
# Each batch stores word attentions as a sequence of tensors: stack them and
# swap the first two axes before concatenating across batches.
# NOTE(review): presumably this moves the batch axis to the front — confirm
# against the model's predict_step.
word_attentions = torch.cat(
    [torch.stack(batch['word_attentions']).permute(1, 0, 2) for batch in outputs]
).cpu()
sent_attentions = torch.cat([batch['sent_attentions'] for batch in outputs]).cpu()
input_ids = torch.cat([batch['input_ids'] for batch in outputs]).cpu()
labels = torch.cat([batch['labels'] for batch in outputs]).cpu()

# Tokenizer used only for decoding ids back to text in the plots; it is built
# from the same pretrained model name as the config, plus the `<person>`
# special token.
tokenizer = BertJapaneseTokenizer.from_pretrained(
    cfg.model.tokenizer.pretrained_model,
    additional_special_tokens=['<person>'],
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Word attention

In [None]:
# One rendered plot per document, built from its input ids and word-level attention weights.
ploted_doc = []
for _input_ids, _word_attentions in zip(input_ids, word_attentions):
    # Decode every row of ids and strip the spaces from each decoded string.
    tokens = [
        [piece.replace(' ', '') for piece in tokenizer.batch_decode(ids)]
        for ids in _input_ids
    ]
    ploted_doc.append(
        plot_word_attentions(doc=tokens, weights_list=_word_attentions, threshold=0.01, size=3)
    )

In [None]:
# Render the entry at index 5 of `ploted_doc` inline as HTML.
display(HTML(ploted_doc[5]))

## Sentence attention

In [None]:
# One rendered plot per document, built from its input ids and sentence-level attention weights.
# Note: this rebinds `ploted_doc`, replacing the word-attention plots from the previous section.
ploted_doc = []
for _input_ids, _sent_attentions in zip(input_ids, sent_attentions):
    # Decode every row of ids and strip the spaces from each decoded string.
    doc = [
        [piece.replace(' ', '') for piece in tokenizer.batch_decode(ids)]
        for ids in _input_ids
    ]
    ploted_doc.append(
        plot_sent_attentions(
            doc=doc,
            weights_list=_sent_attentions,
            threshold=0.0001,
            size=3,
            color_level=50,
        )
    )

In [None]:
# Render the entry at index 5 of `ploted_doc` inline as HTML.
display(HTML(ploted_doc[5]))

# HAN

In [1]:
import os

import hydra
from omegaconf import OmegaConf
from IPython.display import HTML, display
import torch
import pytorch_lightning as pl
from pytorch_lightning.plugins import DDPPlugin
from transformers import BertJapaneseTokenizer

from src.visualization.plot_attention import plot_attentions, plot_word_attentions, plot_sent_attentions

2021-10-29 18:14:03.221346: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-29 18:14:03.221394: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# os.path.abspath drops trailing slashes, so they are re-added in the
# overrides below.
abs_data_path = os.path.abspath("data/nested_sample/")
abs_cache_dir = os.path.abspath("src/tokenizer")
baseline_name = '200dim'

# Overrides applied on top of config/defaults.yaml for the HAN baseline.
han_overrides = [
    f"name={baseline_name}",
    "model=HAN",
    "data=wereWolf_sample",
    f"data.dir={abs_data_path}/",
    "trainer.gpus=[7]",
    f"model.tokenizer.cache_dir={abs_cache_dir}/",
]

with hydra.initialize(config_path='config'):
    cfg = hydra.compose(config_name="defaults.yaml", overrides=han_overrides)

# Print the composed configuration with interpolations resolved.
print(OmegaConf.to_yaml(cfg, resolve=True))

# The tokenizer is instantiated first because both the data module and the
# model (via `embedding_matrix`) are built from it.
tokenizer = hydra.utils.instantiate(cfg.model.tokenizer, data_dir=cfg.data.dir)

data_module = hydra.utils.instantiate(
    cfg.model.data_module,
    data_dir=cfg.data.dir,
    tokenizer=tokenizer,
)

# `_recursive_=False` is forwarded to Hydra so the `optim` config node reaches
# the model as a config object instead of being instantiated by Hydra itself.
model = hydra.utils.instantiate(
    cfg.model.general,
    optim=cfg.optim,
    embedding_matrix=tokenizer.embedding_matrix,
    _recursive_=False,
)

# Trainer options come from the `trainer` config group; logging is disabled
# because this notebook only runs prediction.
trainer_kwargs = OmegaConf.to_container(cfg.trainer)
trainer = pl.Trainer(logger=False, plugins=DDPPlugin(), **trainer_kwargs)

experiment: baseline
name: 200dim
message: null
mode: train
best_epoch: 0
trainer:
  accumulate_grad_batches: 1
  benchmark: true
  deterministic: true
  fast_dev_run: false
  gpus:
  - 7
  limit_test_batches: 1.0
  max_epochs: 10
  overfit_batches: 0.0
  precision: 16
  terminate_on_nan: true
early_stopping:
  _target_: pytorch_lightning.callbacks.EarlyStopping
  monitor: val_loss
  min_delta: 0.005
  patience: 3
  mode: min
  check_on_train_epoch_end: false
model:
  name: HAN
  general:
    _target_: src.model.HAN.HierAttnNet
    vocab_size: 32000
    weight_drop: 0.0
    locked_drop: 0.0
    embed_drop: 0.0
    last_drop: 0.0
    word_hidden_dim: 50
    sent_hidden_dim: 50
    padding_idx: 1
    num_class: 2
  data_module:
    _target_: src.model.HANDataModule.CreateHANDataModule
    batch_size: 64
  tokenizer:
    _target_: src.tokenizer.tokenizer_HAN.HANtokenizer
    sent_length: 256
    doc_length: 256
    vocab_size: 32000
    min_freq: 1
    embed_dim: 200
    tokenizer_type: m

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  self.word_embed.weight = nn.Parameter(torch.tensor(embedding_matrix))
Using native 16bit precision.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [3]:
# Epoch number of the checkpoint to load for prediction.
best_epoch = 0
# TODO: change the model.
ckpt_path = f'outputs/wereWolf/{cfg.model.name}/baseline/{cfg.name}/checkpoints/epoch={best_epoch}.ckpt'
print(ckpt_path)
# `load_from_checkpoint` is invoked through the existing instance; its return
# value (not `model` itself) is what gets passed to `trainer.predict`.
predict_model = model.load_from_checkpoint(ckpt_path)
outputs = trainer.predict(model=predict_model, datamodule=data_module)

initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/1
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All DDP processes registered. Starting ddp with 1 processes
----------------------------------------------------------------------------------------------------



outputs/wereWolf/HAN/baseline/200dim/checkpoints/epoch=0.ckpt


A100-PCIE-40GB with CUDA capability sm_80 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the A100-PCIE-40GB GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


Predicting: 0it [00:00, ?it/s]

In [5]:
# Merge the per-batch prediction dictionaries into single CPU tensors.
logits = torch.cat([batch['logits'] for batch in outputs], dim=0).cpu()
word_attentions = torch.cat([batch['word_attentions'] for batch in outputs]).cpu()
# Drop the size-1 axis at position 2 of each batch's sentence attentions before concatenating.
sent_attentions = torch.cat(
    [batch['sent_attentions'].squeeze(2) for batch in outputs]
).cpu()
input_ids = torch.cat([batch['input_ids'] for batch in outputs]).cpu()
labels = torch.cat([batch['labels'] for batch in outputs]).cpu()

In [7]:
# Softmax over dim=1, used below to turn each row of logits into values that sum to 1.
# `torch` is already imported in this section's import cell, so the import is
# not repeated here.
m = torch.nn.Softmax(dim=1)

In [14]:
# Show the concatenated labels tensor as the cell output.
labels

tensor([0, 1, 0, 0, 1, 1], device='cuda:7')

In [13]:
# Show the concatenated logits as a CPU tensor.
logits.cpu()

tensor([[ 0.4109, -0.4001],
        [ 0.9790, -0.8857],
        [ 0.4670, -0.4424],
        [-0.3625,  0.2448],
        [ 0.9790, -0.8857],
        [ 0.9790, -0.8857]], dtype=torch.float16)

In [11]:
# Apply the softmax module `m` to the logits and show the result.
preds = m(logits)
preds

tensor([[0.6924, 0.3076],
        [0.8657, 0.1342],
        [0.7129, 0.2871],
        [0.3528, 0.6475],
        [0.8657, 0.1342],
        [0.8657, 0.1342]], dtype=torch.float16)

In [9]:
# Index of the largest logit along dim=1 for every row, shown as the cell output.
argmax = logits.argmax(dim=1)
argmax

In [10]:
# First column of the first row of `preds`.
preds[0, 0]

tensor(0.6924, device='cuda:7', dtype=torch.float16)

## Plot attention

In [10]:
# Tokens passed to the plotting helper as `ignore_tokens`.
ignore_tokens = ['<PAD>', '<unk>']

# One rendered plot per document, combining word-level and sentence-level attention weights.
ploted_doc = []
for _input_ids, _word_attentions, _sent_attentions in zip(input_ids, word_attentions, sent_attentions):
    # Decode every row of ids (as a plain list) and strip the spaces from each decoded string.
    doc = [
        [piece.replace(' ', '') for piece in tokenizer.batch_decode(ids.tolist())]
        for ids in _input_ids
    ]
    ploted_doc.append(
        plot_attentions(
            doc=doc,
            word_weights=_word_attentions,
            sent_weights=_sent_attentions,
            threshold=0.01,
            word_cmap="Blues",
            sent_cmap="Reds",
            word_color_level=5,
            sent_color_level=40,
            size=3,
            ignore_tokens=ignore_tokens,
            pad_token='<PAD>',
        )
    )

In [11]:
# Render the entry at index 5 of `ploted_doc` inline as HTML.
display(HTML(ploted_doc[5]))

## Word attentions

In [None]:
# One rendered plot per document, built from its input ids and word-level attention weights.
# Note: this rebinds `ploted_doc`, replacing the plots from the previous section.
ploted_doc = []
for _input_ids, _word_attentions in zip(input_ids, word_attentions):
    # Pass plain Python lists to the HAN tokenizer, as the combined-attention
    # cell in this section does, instead of raw tensor rows; then strip the
    # spaces from each decoded string.
    tokens = [
        [piece.replace(' ', '') for piece in tokenizer.batch_decode(ids.tolist())]
        for ids in _input_ids
    ]
    ploted_doc.append(
        plot_word_attentions(doc=tokens, weights_list=_word_attentions, threshold=0.01, size=3)
    )

In [4]:
# Render the entry at index 5 of `ploted_doc` inline as HTML.
# `ploted_doc` must already be defined by an earlier cell, otherwise this raises NameError.
display(HTML(ploted_doc[5]))

NameError: name 'ploted_doc' is not defined

## Sentence attentions

In [None]:
# One rendered plot per document, built from its input ids and sentence-level attention weights.
# Note: this rebinds `ploted_doc`, replacing the plots from the previous section.
ploted_doc = []
for _input_ids, _sent_attentions in zip(input_ids, sent_attentions):
    # Pass plain Python lists to the HAN tokenizer, as the combined-attention
    # cell in this section does, instead of raw tensor rows; then strip the
    # spaces from each decoded string.
    doc = [
        [piece.replace(' ', '') for piece in tokenizer.batch_decode(ids.tolist())]
        for ids in _input_ids
    ]
    ploted_doc.append(
        plot_sent_attentions(
            doc=doc,
            weights_list=_sent_attentions,
            threshold=0.0001,
            size=3,
            color_level=50,
        )
    )

In [None]:
# Render the entry at index 5 of `ploted_doc` inline as HTML.
display(HTML(ploted_doc[5]))