# BertViz on FBDD-LLM
# Manas Mahale <manas.mahale@bcp.edu.in>

In [None]:
import os
from transformers import BertForMaskedLM, BertModel, pipeline, PreTrainedTokenizerFast, utils
from tokenizers.processors import TemplateProcessing
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit

In [None]:
from bertviz import model_view

In [None]:
# Restrict transformers' logging to error-level messages so that warnings and
# informational output do not clutter the notebook cells.
utils.logging.set_verbosity_error()

In [None]:
# Load the fast tokenizer from the local tokenizer directory.
tokenizer_dir = './tokenizer/'
tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)

In [None]:
# Register the special tokens on the loaded tokenizer, one attribute at a
# time, in a fixed order.
for special_attr, special_token in (
    ("mask_token", "[MASK]"),
    ("unk_token", "[UNK]"),
    ("pad_token", "[PAD]"),
    ("sep_token", "[SEP]"),
    ("cls_token", "[CLS]"),
):
    setattr(tokenizer, special_attr, special_token)

In [None]:
# Load the masked-LM checkpoint and ask the model to return its attention
# weights, which the BertViz views below consume.
checkpoint_dir = os.path.join('./model/', "checkpoint-250")
model = BertForMaskedLM.from_pretrained(checkpoint_dir, output_attentions=True)

In [None]:
# torch is imported here so that this cell is self-contained; it is already
# required by return_tensors='pt'.
import torch

# Encode one whitespace-separated input sequence containing a [MASK] token
# into a PyTorch tensor of token ids.
inputs = tokenizer.encode("N c1ccc2ncccc2c1 [MASK] c1cscn1 CC", return_tensors='pt')

# The forward pass is used for visualisation only, so gradient tracking is
# disabled to avoid building an autograd graph that is never used.
with torch.no_grad():
    outputs = model(inputs)

In [None]:
# The model was loaded with output_attentions=True; the last element of its
# output is taken as the attention data passed to the BertViz views.
attention = outputs[-1]

In [None]:
# Turn the ids of the first (and only) encoded sequence back into token
# strings so the visualisations can label each position.
first_sequence_ids = inputs[0]
tokens = tokenizer.convert_ids_to_tokens(first_sequence_ids)

In [None]:
# Render BertViz's model view from the attention data and token labels
# computed in the preceding cells.
model_view(attention, tokens)

In [None]:
# Render BertViz's head view for the same attention data and token labels.
from bertviz import head_view
head_view(attention, tokens)

In [None]:
# Neuron view: BertViz provides its own model classes that additionally
# return the query/key vectors this view needs.
# NOTE: this import rebinds the name BertForMaskedLM that was imported from
# transformers at the top of the notebook, and the assignments below rebind
# `model` and `tokenizer`. Re-running earlier cells after this one will pick
# up the neuron-view class.
from bertviz.transformers_neuron_view import BertForMaskedLM, PreTrainedTokenizer
from bertviz.neuron_view import show

# Load the same checkpoint through the neuron-view model class.
neuron_checkpoint = os.path.join('./model/', "checkpoint-250")
model = BertForMaskedLM.from_pretrained(neuron_checkpoint, output_attentions=True)

# Load the tokenizer again and register its special tokens.
tokenizer = PreTrainedTokenizerFast.from_pretrained('./tokenizer/')
for token_attr, token_text in (
    ("mask_token", "[MASK]"),
    ("unk_token", "[UNK]"),
    ("pad_token", "[PAD]"),
    ("sep_token", "[SEP]"),
    ("cls_token", "[CLS]"),
):
    setattr(tokenizer, token_attr, token_text)

# Two input sequences that differ only in their third whitespace-separated
# element.
sentence_a = "N c1ccc2ncccc2c1 CC=O c1cscn1 CC"
sentence_b = "N c1ccc2ncccc2c1 S c1cscn1 CC"

# Open the neuron view, initially focused on layer 2, head 0.
show(model, 'bert', tokenizer, sentence_a, sentence_b, layer=2, head=0)