In [4]:
import torch
from transformers import BertTokenizer, BertModel

# Data preprocessing

## How to load tokenizer

In [5]:
# Build the tokenizer from the local "./pytorch-ernie" checkpoint directory.
# do_lower_case=True lower-cases the input text before it is split into tokens.
tokenizer = BertTokenizer.from_pretrained(
    "./pytorch-ernie",
    do_lower_case=True,
)

### Convert a batch of sentences to token IDs

In [6]:
# Three example sentences of different lengths, so the shorter ones get padded.
batch_sentences = [
    "Hello I'm a single sentence",
    "And another sentence",
    "And the very very last one",
]

# Encode the whole batch in a single call: special tokens are added, every row
# is padded to the length of the longest sentence, and the result is returned
# as PyTorch tensors ("pt").
encoded_inputs = tokenizer(
    batch_sentences,
    add_special_tokens=True,
    padding=True,
    return_tensors="pt",
)

In [7]:
# Display the tokenizer output. The rendered result below has three entries:
# 'input_ids', 'token_type_ids' and 'attention_mask', one row per sentence.
encoded_inputs

{'input_ids': tensor([[    1,  6368,  1675, 17963,  1979,  1545,  7512,  8090,  9595, 10483,
             2],
        [    1,  1662,  7076,  8090,  9595, 10483,     2,     0,     0,     0,
             0],
        [    1,  1662,  1499,  6318,  6318,  6975,  3777,     2,     0,     0,
             0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

### Make sure the results are correct

In [8]:
# Round-trip check: turn each row of ids back into text and print one decoded
# sentence per line, so the special and padding tokens can be inspected.
decoded_rows = (tokenizer.decode(row_ids) for row_ids in encoded_inputs["input_ids"])
print("\n".join(decoded_rows))

[CLS] hello i [UNK] m a single sentence [SEP]
[CLS] and another sentence [SEP] [PAD] [PAD] [PAD] [PAD]
[CLS] and the very very last one [SEP] [PAD] [PAD] [PAD]


## Load the pretrained model

In [9]:
# Restore the BERT-architecture encoder from the same local checkpoint
# directory that the tokenizer was loaded from.
model = BertModel.from_pretrained(
    "./pytorch-ernie",
)

In [11]:
# Show the architecture once. `model.modules()` yields the model itself and
# then every nested submodule, so printing each yielded item repeats the same
# layers many times over; the top-level repr already contains the whole tree.
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(18000, 768, padding_idx=0)
    (position_embeddings): Embedding(513, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

### Convert token IDs to a PyTorch tensor

In [13]:
# `encoded_inputs["input_ids"]` is already a tensor (the batch was encoded with
# return_tensors="pt"). Re-wrapping a tensor in torch.tensor() emits a
# copy-construct UserWarning, so take the copy with clone().detach() instead.
input_tensor = encoded_inputs["input_ids"].clone().detach()

  """Entry point for launching an IPython kernel.


### Feed into model and print output shape

In [12]:
# The batch is padded, so the attention mask must be passed along with the ids;
# without it the model attends to the [PAD] positions of the shorter sentences
# and their hidden states / pooled outputs are computed over padding as well.
output = model(input_tensor, attention_mask=encoded_inputs["attention_mask"])

# output[0]: per-token hidden states, output[1]: pooled per-sentence output.
# Shapes are read straight from the tensors (no detach/NumPy round trip) and
# converted to plain tuples so the printed text keeps the "(3, 11, 768)" form.
print(tuple(output[0].shape), tuple(output[1].shape))

(3, 11, 768) (3, 768)
