# Embedding with the DistilBERT model from the `transformers` package, using simple averaging across all token embeddings

In [None]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer

In [None]:
# Model class, tokenizer class and checkpoint name used for loading further down.
model_class = DistilBertModel
tokenizer_class = DistilBertTokenizer
pretrained_weights = 'distilbert-base-cased'

In [None]:
# Load the pretrained tokenizer and model
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# Text to embed
sentences = [
    'We are very happy to include pipeline into the transformers repository.',
    'Another sentence but shorter.',
]

# Encode every sentence exactly once. add_special_tokens=True lets the
# tokenizer insert the model-specific markers ([CLS], [SEP], <s>, ...).
encoded = [tokenizer.encode(s, add_special_tokens=True) for s in sentences]
lengths = [len(e) for e in encoded]
max_encoded_len = max(lengths)

# Right-pad each sequence to the longest one with the tokenizer's pad id.
# Padding by hand avoids a second encoding pass and the deprecated
# `pad_to_max_length` argument.
pad_id = tokenizer.pad_token_id
encoded = [e + [pad_id] * (max_encoded_len - len(e)) for e in encoded]

input_ids = torch.tensor(encoded)
# 1 marks a real token, 0 marks padding. Without this mask the model attends
# to the padding, and the hidden states of a short sentence would change with
# the length of the longest sentence in the batch.
attention_mask = torch.tensor(
    [[1] * length + [0] * (max_encoded_len - length) for length in lengths]
)

with torch.no_grad():
    # Element 0 of the model output is the last hidden state.
    last_hidden_states = model(input_ids, attention_mask=attention_mask)[0]

In [None]:
# Show the dimensions of the model output computed above.
last_hidden_states.size()

In [None]:
# Mean-pool over the sequence dimension (dim 1), counting only real tokens.
# Positions holding the tokenizer's pad id are zeroed out and excluded from
# the divisor, so a sentence's embedding does not change with the amount of
# padding that was added to match the longest sentence.
token_mask = (input_ids != tokenizer.pad_token_id).unsqueeze(-1).to(last_hidden_states.dtype)
summed = (last_hidden_states * token_mask).sum(dim=1)
token_counts = token_mask.sum(dim=1).clamp(min=1)  # never divide by zero
n = (summed / token_counts).numpy()
n.shape

In [None]:
# Turn the 2-D array into one plain Python list per row,
# then display how many values each row holds.
l = [list(row) for row in n]
[len(vector) for vector in l]

# Legacy: Embedding a single document using a pipeline

In [None]:
from transformers import pipeline

In [None]:
# Build a feature-extraction pipeline. No model is named here, so the
# library chooses whichever model it uses by default for this task.
nlp = pipeline(task='feature-extraction')

In [None]:
# Run the pipeline on a single sentence and keep its raw output.
embed = nlp(
    'We are very happy to include pipeline into the transformers repository.'
)

In [None]:
# Number of values in the innermost sequence at embed[0][0]
# (presumably the feature vector of the first token of the first input --
# TODO confirm against the pipeline's output layout).
first_vector = embed[0][0]
len(first_vector)

In [None]:
# Peek at the first five values stored at embed[0][0].
first_entry = embed[0]
first_entry[0][:5]