In [21]:
import torch

# BERT 💥

In [206]:
from transformers import (
    AutoConfig, 
    AutoModel, 
    AutoTokenizer
)

# Identifier of the pretrained checkpoint that every object below is loaded from.
backbone = 'Maltehb/danish-bert-botxo'

# The three loads are independent of each other; each one resolves the same
# identifier, so tokenizer, config and model all come from one checkpoint.
tokenizer = AutoTokenizer.from_pretrained(backbone)
config = AutoConfig.from_pretrained(backbone)
model = AutoModel.from_pretrained(backbone)

Some weights of the model checkpoint at Maltehb/danish-bert-botxo were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Tokenizer 🧩

Convert text to tensors using the tokenizer!

In [207]:
# Tokenize a single sentence; return_tensors='pt' asks for PyTorch tensors.
encoding = tokenizer(text='Jeg sender dig en fødselsdagshilsnen!', return_tensors='pt')

# Inspect what came back: the container type, the keys it exposes and the
# shape of the token-id tensor.
print('type:\t\t', type(encoding))
print('dict keys:\t', encoding.keys())
print('input_ids shape:', encoding['input_ids'].shape)

# String form of the tokens for the encoded sentence.
print('\ntokens:\t\t', encoding.tokens())


type:		 <class 'transformers.tokenization_utils_base.BatchEncoding'>
dict keys:	 dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
input_ids shape: torch.Size([1, 11])

tokens:		 ['[CLS]', 'jeg', 'sender', 'dig', 'en', 'fødselsdag', '##sh', '##ils', '##nen', '!', '[SEP]']


## BERT Model 😎

Run the tokenized text through BERT!

Choose whether or not to return the hidden states and the attention weights with the <i>output_hidden_states</i> and <i>output_attentions</i> args.

<b>Bonus tip:</b><br>
Read the documentation for BertModel's forward method:<br>
https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel

In [208]:
# This cell only inspects the outputs, so run the forward pass under
# torch.no_grad(): autograd then records nothing, and the returned hidden
# states and attention maps do not keep a computation graph alive.
with torch.no_grad():
    bert_output = model(
        **encoding,
        output_hidden_states=True,   # also return the per-layer hidden states
        output_attentions=True,      # also return the per-layer attention maps
    )

print('output keys:'.upper())
print(bert_output.keys())

OUTPUT KEYS:
odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'attentions'])


In [209]:
# Each section prints an upper-cased heading followed by one value per line.
# sep='\n' makes a single print() call emit exactly the lines that a run of
# consecutive print() calls would.

# Container returned by the model call.
print('BERT Output:'.upper(), type(bert_output), sep='\n')

# Attentions: container type, number of entries, then type and shape of the first.
print(
    '\n\nAttention Output:'.upper(),
    type(bert_output.attentions),
    len(bert_output.attentions),
    type(bert_output.attentions[0]),
    bert_output.attentions[0].shape,
    sep='\n',
)

# Hidden states: same walk-through; the length line carries a 'len:' label.
print('\n\nOutput Hidden States:'.upper(), type(bert_output.hidden_states), sep='\n')
print('len:', len(bert_output.hidden_states))
print(type(bert_output.hidden_states[0]), bert_output.hidden_states[0].shape, sep='\n')

# Pooler output: only its type is shown.
print('\n\nPooler Output:'.upper(), type(bert_output.pooler_output), sep='\n')


BERT OUTPUT:
<class 'transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions'>


ATTENTION OUTPUT:
<class 'tuple'>
12
<class 'torch.Tensor'>
torch.Size([1, 12, 11, 11])


OUTPUT HIDDEN STATES:
<class 'tuple'>
len: 13
<class 'torch.Tensor'>
torch.Size([1, 11, 768])


POOLER OUTPUT:
<class 'torch.Tensor'>


# BERT Submodules 🔥

Link to BERT submodules: <br>
https://github.com/huggingface/transformers/blob/2aa3cd935d0f3bcd04ce66be6af4b760493d2ffe/src/transformers/models/bert/modeling_bert.py

In [198]:
from transformers.models.bert.modeling_bert import (
    BertSelfAttention, 
    BertEmbeddings,
    BertLayer
)


## Embedding Module

In [217]:
# BertEmbeddings(config) builds a new module from the config alone: no
# checkpoint is loaded by this call, so its weights are freshly initialised
# and are not the pretrained embeddings held by `model`.
# A newly constructed nn.Module starts in training mode, where the module's
# dropout (when the configured probability is > 0) randomly zeroes activations.
# .eval() returns the module itself with dropout disabled, so this demo
# forward pass is not randomly perturbed.
embeddings = BertEmbeddings(config).eval()

# Only the token ids are passed; the other forward arguments keep their defaults.
bert_input = embeddings(encoding['input_ids'])
print(bert_input.shape)

torch.Size([1, 11, 768])


## BERT Layer 🍰

In [221]:
# A single layer built from the config alone (no checkpoint weights are loaded
# by this call). A new nn.Module starts in training mode; .eval() returns the
# same module with dropout disabled so the forward pass below is not randomly
# perturbed.
bl = BertLayer(config).eval()

# The layer returns a tuple; element 0 is the tensor whose shape is displayed.
# Kept as the cell's last top-level expression so Jupyter renders it.
bl(bert_input)[0].shape

torch.Size([1, 11, 768])

## Self-attention Module

In [223]:
# Stand-alone self-attention module built from the config alone; this
# constructor call loads no checkpoint weights.
self_attention = BertSelfAttention(config)