# Feature Extraction

In [1]:
import sys
sys.path.append("..")
from mangoes.modeling import PretrainedTransformerModelForFeatureExtraction

First, we load a pretrained model:

In [2]:
pretrained_model = PretrainedTransformerModelForFeatureExtraction.load("bert-base-uncased", "bert-base-uncased")

  return torch._C._cuda_getDeviceCount() > 0


Next, we define the text we would like to extract features for:

In [3]:
text = ["I'm a test sentence.", "This is another test sentence"]

We can use the predict function to obtain the last hidden state from the bert model:

In [4]:
outputs = pretrained_model.predict(text)
print(len(outputs))           # one list of hidden layer outputs per input sentences
print(len(outputs[0]))        # sequence length
print(len(outputs[0][-1]))    # size of hidden state of last layer

2
9
768


We can get the same output plus all the previous layer's hidden states, and the attention matrices if needed, using the generate_outputs function:

In [5]:
outputs = pretrained_model.generate_outputs(text, output_hidden_states=True, output_attentions=True)
print(outputs.keys())

dict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'attentions', 'offset_mappings'])


In [6]:
print(outputs["hidden_states"][-1].shape)  # batch_size, max_sequence_length, hidden_size
print(outputs["attentions"][-1].shape)     # batch_size, num_attention_heads, max_sequence_length, max_sequence_length
print(outputs["offset_mappings"].shape)    # batch_size, max_sequence_length, 2:(start and end indices)

torch.Size([2, 9, 768])
torch.Size([2, 12, 9, 9])
torch.Size([2, 9, 2])


We can pair hidden states with subtokens using the offset mappings that are part of the output. For example, say we want to create a list of (string, 5th hidden state) tuples for the first sentence:

In [7]:
string_representations = []
for i in range(len(outputs["offset_mappings"][0])):
    start, end = outputs["offset_mappings"][0][i]
    if not start == end:   # skip special tokens
        string = text[0][start:end]
        string_representations.append((string, outputs["hidden_states"][5][0][i]))
    
for (string, hs) in string_representations:
    print(f"{string} : {hs.numpy().shape}")

I : (768,)
' : (768,)
m : (768,)
a : (768,)
test : (768,)
sentence : (768,)
. : (768,)


Sometimes, users want full word embeddings instead of subword token embeddings. The generate outputs method provides functionality for averaging subword tokens into word embeddings. Just set the "word_embeddings" argument to True. Here's an example where we get the word embeddings for each word in the second sentence, averaging sub token embeddings if there are multiple in the same word:

In [8]:
outputs = pretrained_model.generate_outputs(text, output_hidden_states=True, word_embeddings=True)
words = text[0].split()
print(len(words))
print(outputs["hidden_states"][-1][0].shape)
print(outputs["hidden_states"][-1][0][-1][:10]) # padded extra word

4
torch.Size([5, 768])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


Sometime users only have access to pre-split text, meaning text that has already been split on whitespace or punctuation. The generate outputs function can handle this data as well, just set the pretokenized argument to True:

In [9]:
split_text = [sentence.split("'") for sentence in text]
print(split_text)
outputs = pretrained_model.generate_outputs(split_text, pre_tokenized=True,
                                            output_hidden_states=True, word_embeddings=True)
print(len(split_text[0]))
print(outputs["hidden_states"][-1][0].shape)

[['I', 'm a test sentence.'], ['This is another test sentence']]
2
torch.Size([2, 768])


Note that the PretrainedTransformerModelForFeatureExtraction class works for many other Huggingface model architectures besides BERT, such as ALBERT, which adds parameter sharing to the architecture:

In [10]:
pretrained_albert = PretrainedTransformerModelForFeatureExtraction.load("albert-base-v1", "albert-base-v1")

outputs = pretrained_albert.generate_outputs(text[1], output_hidden_states=True, word_embeddings=True)
print(len(split_text[1]))
print(outputs["hidden_states"][-1][0].shape)

1
torch.Size([5, 768])
