# HierBERT

In [None]:
import os

import hydra
from omegaconf import OmegaConf
from IPython.display import HTML, display
import torch
import pytorch_lightning as pl
from pytorch_lightning.plugins import DDPPlugin
from transformers import BertJapaneseTokenizer

from src.visualization.plot_attention import plot_word_attentions, plot_sent_attentions

In [3]:
# Compose the Hydra config for HierBERT prediction, then build the data module,
# the model and the Lightning trainer from it.
baseline_name = 'ave_pooled_base-v2'
abs_data_path = os.path.abspath("data/nested_sample/")

# Overrides applied on top of config/defaults.yaml.
hierbert_overrides = [
    "experiment=predict",
    f"name={baseline_name}",
    "model=HierBERT",
    "data=wereWolf_sample",
    f"data.dir={abs_data_path}/",
    "trainer.gpus=[6]",
    "model.tokenizer.pretrained_model=cl-tohoku/bert-base-japanese-v2",
    "model.sent_level_BERT_config.hidden_size=768",
    "model.sent_level_BERT_config.num_hidden_layers=12",
    "model.sent_level_BERT_config.num_attention_heads=12",
]

with hydra.initialize(config_path='config'):
    cfg = hydra.compose(config_name="defaults.yaml", overrides=hierbert_overrides)

# Dump the fully resolved config so the run settings are visible in the output.
print(OmegaConf.to_yaml(cfg, resolve=True))

# `_recursive_=False`: nested config nodes (e.g. the tokenizer config) are handed
# to the target as configs rather than being instantiated by Hydra.
data_module = hydra.utils.instantiate(
    cfg.model.data_module,
    data_dir=cfg.data.dir,
    tokenizer=cfg.model.tokenizer,
    _recursive_=False,
)

model = hydra.utils.instantiate(
    cfg.model.model,
    pretrained_model=cfg.model.tokenizer.pretrained_model,
    sent_level_BERT_config=cfg.model.sent_level_BERT_config,
    optim=cfg.optim,
    _recursive_=False,
)

# TensorBoard logging is currently disabled:
#tb_logger = pl.loggers.TensorBoardLogger(".", "", "", log_graph=True, default_hp_metric=False)

# Trainer options come from the `trainer` config group, converted to a plain dict.
trainer_kwargs = OmegaConf.to_container(cfg.trainer)
trainer = pl.Trainer(
    **trainer_kwargs,
#    callbacks=[tb_logger],
    plugins=DDPPlugin(),
)



model:
  name: HierBERT
  model:
    num_labels: 2
    _target_: src.model.HierBERT.HierchicalBERT
    use_ave_pooled_output: true
    output_attentions: true
    is_japanese: true
  tokenizer:
    _target_: src.tokenizer.tokenizer_HierBERT.HierBertTokenizer
    sent_length: 256
    doc_length: 256
    pretrained_model: cl-tohoku/bert-base-japanese-v2
  data_module:
    _target_: src.model.HierBERTDataModule.CreateHierBertDataModule
    batch_size: 64
  sent_level_BERT_config:
    _target_: transformers.BertConfig
    hidden_size: 768
    num_hidden_layers: 12
    num_attention_heads: 12
data:
  name: wereWolf_sample
  dir: /disk/ssd14tb/haoki/Documents/vscode-workplaces/lie_detector/data/nested_sample/
optim:
  name: AdamW
  optimizer:
    _target_: torch.optim.AdamW
    lr: 0.001
    weight_decay: 0.01
experiment: predict
name: ave_pooled_base-v2
message: null
trainer:
  accumulate_grad_batches: 1
  benchmark: true
  deterministic: true
  fast_dev_run: false
  gpus:
  - 6
  max_epoch

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification

In [7]:
# Epoch number used to pick the checkpoint file below.
best_epoch = 2
# TODO: set the data.name part to "weweWolf" (sic; presumably "wereWolf", the dataset segment hard-coded in the path below).
ckpt_path = f'outputs/wereWolf/{cfg.model.name}/baseline/{cfg.name}/checkpoints/epoch={best_epoch}.ckpt'
# Echo the resolved path so a wrong model/run name is easy to spot.
print(ckpt_path)
# Run prediction with ckpt_path as the checkpoint; `outputs` is read by the following cells.
outputs = trainer.predict(model=model, datamodule=data_module, ckpt_path=ckpt_path)

outputs/wereWolf/HAN/baseline/200dim_no_dropout/checkpoints/epoch=2.ckpt


MisconfigurationException: Dataloader not found for `Trainer.predict`

In [5]:
# Concatenate the per-entry prediction dicts in `outputs` into one tensor per field (along dim 0).
logits = torch.cat([batch['logits'] for batch in outputs], dim=0)
# 'word_attentions' holds a sequence of tensors per entry: stack them into one
# 3-D tensor and swap its first two axes before concatenating.
# (presumably this moves the batch axis to the front — TODO confirm)
word_attentions = torch.cat(
    [torch.stack(batch['word_attentions']).permute(1, 0, 2) for batch in outputs],
    dim=0,
)
sent_attentions = torch.cat([batch['sent_attentions'] for batch in outputs], dim=0)
input_ids = torch.cat([batch['input_ids'] for batch in outputs], dim=0)
labels = torch.cat([batch['labels'] for batch in outputs], dim=0)

# Tokenizer used by the plotting cells to turn input ids back into text;
# '<person>' is registered as an additional special token.
tokenizer = BertJapaneseTokenizer.from_pretrained(
    f'{cfg.model.tokenizer.pretrained_model}',
    additional_special_tokens=['<person>'],
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Word attention

In [6]:
# Build one word-attention rendering per entry of input_ids / word_attentions.
ploted_doc = []
for doc_ids, doc_word_attn in zip(input_ids, word_attentions):
    # batch_decode is applied to each row of the entry; spaces are stripped from
    # every decoded string.
    tokens = [
        [piece.replace(' ', '') for piece in tokenizer.batch_decode(row_ids)]
        for row_ids in doc_ids
    ]
    rendered = plot_word_attentions(
        doc=tokens,
        weights_list=doc_word_attn,
        threshold=0.01,
        size=3,
    )
    ploted_doc.append(rendered)

In [21]:
# Render the entry at index 5 of ploted_doc as HTML in the notebook output.
display(HTML(ploted_doc[5]))

## Sentence attention

In [19]:
# Build one sentence-attention rendering per entry of input_ids / sent_attentions.
# Note: this rebinds ploted_doc, replacing the list built for the word-attention plots.
ploted_doc = []
for doc_ids, doc_sent_attn in zip(input_ids, sent_attentions):
    # batch_decode is applied to each row of the entry; spaces are stripped from
    # every decoded string.
    doc = [
        [piece.replace(' ', '') for piece in tokenizer.batch_decode(row_ids)]
        for row_ids in doc_ids
    ]
    rendered = plot_sent_attentions(
        doc=doc,
        weights_list=doc_sent_attn,
        threshold=0.0001,
        size=3,
        color_level=50,
    )
    ploted_doc.append(rendered)

In [None]:
# Render the entry at index 5 of ploted_doc as HTML in the notebook output.
display(HTML(ploted_doc[5]))

# HAN

In [1]:
import os

import hydra
from omegaconf import OmegaConf
from IPython.display import HTML, display
import torch
import pytorch_lightning as pl
from pytorch_lightning.plugins import DDPPlugin
from transformers import BertJapaneseTokenizer

from src.visualization.plot_attention import plot_word_attentions, plot_sent_attentions

2021-10-21 21:40:53.984034: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-21 21:40:53.984062: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Compose the Hydra config for HAN prediction, then build the tokenizer, the
# data module, the model and the Lightning trainer from it.
baseline_name = '200dim_no_dropout'
abs_data_path = os.path.abspath("data/nested_sample/")
abs_cache_dir = os.path.abspath("src/tokenizer")

# Overrides applied on top of config/defaults.yaml.
han_overrides = [
    "experiment=predict",
    f"name={baseline_name}",
    "model=HAN",
    "data=wereWolf_sample",
    f"data.dir={abs_data_path}/",
    "trainer.gpus=[6]",
    f"model.tokenizer.cache_dir={abs_cache_dir}/",
]

with hydra.initialize(config_path='config'):
    cfg = hydra.compose(config_name="defaults.yaml", overrides=han_overrides)

# Dump the fully resolved config so the run settings are visible in the output.
print(OmegaConf.to_yaml(cfg, resolve=True))

#print(cfg.data.dir)

# The tokenizer is instantiated first: the data module receives the instance and
# the model receives its embedding_matrix.
tokenizer = hydra.utils.instantiate(cfg.model.tokenizer, data_dir=cfg.data.dir)

data_module = hydra.utils.instantiate(
    cfg.model.data_module,
    data_dir=cfg.data.dir,
    tokenizer=tokenizer,
)

# `_recursive_=False`: the `optim` config node is handed to the model as a
# config rather than being instantiated by Hydra.
model = hydra.utils.instantiate(
    cfg.model.general,
    optim=cfg.optim,
    embedding_matrix=tokenizer.embedding_matrix,
    _recursive_=False,
)

# TensorBoard logging is currently disabled:
#tb_logger = pl.loggers.TensorBoardLogger(".", "", "", log_graph=True, default_hp_metric=False)

# Trainer options come from the `trainer` config group, converted to a plain dict.
trainer_kwargs = OmegaConf.to_container(cfg.trainer)
trainer = pl.Trainer(
    **trainer_kwargs,
#    callbacks=[tb_logger],
    plugins=DDPPlugin(),
)



model:
  name: HAN
  general:
    _target_: src.model.HAN.HierAttnNet
    vocab_size: 32000
    weight_drop: 0.0
    locked_drop: 0.0
    embed_drop: 0.0
    last_drop: 0.0
    word_hidden_dim: 512
    sent_hidden_dim: 512
    padding_idx: 1
    num_class: 2
  data_module:
    _target_: src.model.HANDataModule.CreateHANDataModule
    batch_size: 64
  tokenizer:
    _target_: src.tokenizer.tokenizer_HAN.HANtokenizer
    sent_length: 256
    doc_length: 256
    vocab_size: 32000
    min_freq: 1
    embed_dim: 200
    tokenizer: mecab-wordpiece
    cache_dir: /disk/ssd14tb/haoki/Documents/vscode-workplaces/lie_detector/src/tokenizer/
data:
  name: wereWolf_sample
  dir: /disk/ssd14tb/haoki/Documents/vscode-workplaces/lie_detector/data/nested_sample/
optim:
  name: AdamW
  optimizer:
    _target_: torch.optim.AdamW
    lr: 0.001
    weight_decay: 0.01
experiment: predict
name: 200dim_no_dropout
message: null
trainer:
  accumulate_grad_batches: 1
  benchmark: true
  deterministic: true
  fa

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  self.word_embed.weight = nn.Parameter(torch.tensor(embedding_matrix))
Using native 16bit precision.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [4]:
# Epoch number used to pick the checkpoint file below.
best_epoch = 9
# TODO: set the data.name part to "weweWolf" (sic; presumably "wereWolf", the dataset segment hard-coded in the path below).
ckpt_path = f'outputs/wereWolf/{cfg.model.name}/baseline/{cfg.name}/checkpoints/epoch={best_epoch}.ckpt'
# Echo the resolved path so a wrong model/run name is easy to spot.
print(ckpt_path)
# Run prediction with ckpt_path as the checkpoint; `outputs` is read by the following cells.
outputs = trainer.predict(model=model, datamodule=data_module, ckpt_path=ckpt_path)

outputs/wereWolf/HAN/baseline/200dim_no_dropout/checkpoints/epoch=9.ckpt


  rank_zero_deprecation(
  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


TypeError: array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0. has type numpy.ndarray, but expected one of: int, long, float

In [None]:
# Concatenate the per-entry prediction dicts in `outputs` into one tensor per field (along dim 0).
logits = torch.cat([p['logits'] for p in outputs], dim=0)
# 'word_attentions' holds a sequence of tensors per entry: stack them into one 3-D
# tensor and swap its first two axes before concatenating.
# (presumably this moves the batch axis to the front — TODO confirm)
word_attentions = torch.cat([torch.stack(p['word_attentions']).permute(1, 0, 2) for p in outputs])
sent_attentions = torch.cat([p['sent_attentions'] for p in outputs])
input_ids = torch.cat([p['input_ids'] for p in outputs])
labels = torch.cat([p['labels'] for p in outputs])

# NOTE(review): the HAN config composed in this section shows no
# `model.tokenizer.pretrained_model` key, so this lookup looks like a copy from the
# HierBERT section and will likely fail or yield None — confirm which tokenizer
# should decode HAN input ids.
# NOTE(review): this also rebinds `tokenizer`, which the HAN setup cell created via
# hydra.utils.instantiate — verify that the replacement is intended.
tokenizer = BertJapaneseTokenizer.from_pretrained(f'{cfg.model.tokenizer.pretrained_model}', additional_special_tokens=['<person>'])

## Word attentions

In [None]:
# Build one word-attention rendering per entry of input_ids / word_attentions.
ploted_doc = []
for doc_ids, doc_word_attn in zip(input_ids, word_attentions):
    # batch_decode is applied to each row of the entry; spaces are stripped from
    # every decoded string.
    tokens = [
        [piece.replace(' ', '') for piece in tokenizer.batch_decode(row_ids)]
        for row_ids in doc_ids
    ]
    rendered = plot_word_attentions(
        doc=tokens,
        weights_list=doc_word_attn,
        threshold=0.01,
        size=3,
    )
    ploted_doc.append(rendered)

In [None]:
# Render the entry at index 5 of ploted_doc as HTML in the notebook output.
display(HTML(ploted_doc[5]))

## Sentence attentions

In [None]:
# Build one sentence-attention rendering per entry of input_ids / sent_attentions.
# Note: this rebinds ploted_doc, replacing the list built for the word-attention plots.
ploted_doc = []
for doc_ids, doc_sent_attn in zip(input_ids, sent_attentions):
    # batch_decode is applied to each row of the entry; spaces are stripped from
    # every decoded string.
    doc = [
        [piece.replace(' ', '') for piece in tokenizer.batch_decode(row_ids)]
        for row_ids in doc_ids
    ]
    rendered = plot_sent_attentions(
        doc=doc,
        weights_list=doc_sent_attn,
        threshold=0.0001,
        size=3,
        color_level=50,
    )
    ploted_doc.append(rendered)

In [None]:
# Render the entry at index 5 of ploted_doc as HTML in the notebook output.
display(HTML(ploted_doc[5]))