In [1]:
## How to build a sentence vector?

In [46]:
text = "Hello world what a time to be alive!"

In [47]:
# Import pytorch and transformers

In [48]:
from transformers import AutoTokenizer, AutoModel
import torch

In [49]:
## Initialize tokenizer and the model

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

In [50]:
# tokenize the text

tokens = tokenizer.encode_plus(text, max_length=128, truncation=True, padding='max_length', return_tensors='pt')

In [51]:
outputs = model(**tokens)

In [52]:
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 3.0681e-01, -7.8805e-02,  1.7431e+00,  ..., -2.5348e-02,
          -1.1080e-01,  4.8310e-02],
         [ 7.1302e-01,  1.0437e-01,  1.8346e+00,  ...,  1.1343e-01,
          -7.5563e-02,  1.2668e-01],
         [ 8.1722e-01,  1.1321e-01,  1.5408e+00,  ..., -3.8067e-01,
           8.7479e-02, -1.9020e-01],
         ...,
         [ 5.4669e-01,  1.7181e-01,  1.1392e+00,  ...,  3.8549e-02,
          -1.5396e-01,  2.3015e-01],
         [ 3.4457e-01,  1.3151e-01,  1.1324e+00,  ..., -1.4217e-03,
          -1.7517e-01,  1.5220e-01],
         [ 3.2320e-01,  3.3350e-03,  1.1888e+00,  ...,  1.6736e-02,
          -2.0864e-01,  8.9315e-02]]], grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-7.1334e-01, -2.7907e-01,  7.7808e-01, -1.9139e-01, -1.4626e-01,
         -2.2970e-01,  4.3867e-01, -1.3565e-01,  4.1930e-01, -8.8569e-01,
          3.7006e-01, -3.7974e-01,  7.3329e-01, -2.6172e-01,  8.7081e-01,
         -3.7279e-

In [53]:
# extract last hidden state tensor

In [54]:
embeddings = outputs.last_hidden_state
embeddings

tensor([[[ 3.0681e-01, -7.8805e-02,  1.7431e+00,  ..., -2.5348e-02,
          -1.1080e-01,  4.8310e-02],
         [ 7.1302e-01,  1.0437e-01,  1.8346e+00,  ...,  1.1343e-01,
          -7.5563e-02,  1.2668e-01],
         [ 8.1722e-01,  1.1321e-01,  1.5408e+00,  ..., -3.8067e-01,
           8.7479e-02, -1.9020e-01],
         ...,
         [ 5.4669e-01,  1.7181e-01,  1.1392e+00,  ...,  3.8549e-02,
          -1.5396e-01,  2.3015e-01],
         [ 3.4457e-01,  1.3151e-01,  1.1324e+00,  ..., -1.4217e-03,
          -1.7517e-01,  1.5220e-01],
         [ 3.2320e-01,  3.3350e-03,  1.1888e+00,  ...,  1.6736e-02,
          -2.0864e-01,  8.9315e-02]]], grad_fn=<NativeLayerNormBackward0>)

In [55]:
embeddings.shape

torch.Size([1, 128, 768])

- To produce a dense vector we need to perform mean  pooling operation.

- We need to multiply each value in our embeddings tensor by its respective attention mask value so that we ignore non real tokens

- Mask values are 0 and 1s



- First resize the attention mask vector

In [56]:
attention_mask = tokens['attention_mask']

In [57]:
attention_mask.shape

torch.Size([1, 128])

In [58]:
# expand the attention mask vector to be the same as embeddings one
mask = attention_mask.unsqueeze(-1).expand(embeddings.shape).float() #gives an extra dimension

In [59]:
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

In [60]:
masked_embeddings = embeddings * mask

masked_embeddings.shape


torch.Size([1, 128, 768])

In [61]:
masked_embeddings

tensor([[[ 0.3068, -0.0788,  1.7431,  ..., -0.0253, -0.1108,  0.0483],
         [ 0.7130,  0.1044,  1.8346,  ...,  0.1134, -0.0756,  0.1267],
         [ 0.8172,  0.1132,  1.5408,  ..., -0.3807,  0.0875, -0.1902],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000, -0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000]]],
       grad_fn=<MulBackward0>)

In [62]:
## Mean pooling

In [63]:
summed = torch.sum(masked_embeddings, 1)

In [64]:
summed.shape

torch.Size([1, 768])

In [65]:
counts = torch.clamp(mask.sum(1), min=1e-9)

In [66]:
counts.shape

torch.Size([1, 768])

In [67]:
mean_pooled = summed / counts

In [68]:
mean_pooled.shape

torch.Size([1, 768])

In [69]:
#### Using cosine similarity

In [78]:
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "Standing on one's head at job interviews forms a lasting impression.",
    "It took him a month to finish the meal.",
    "He found a leprechaun in his walnut shell."
]

In [79]:
sentences[0]

'Three years later, the coffin was still full of Jello.'

In [80]:
# initialize a dictionary that will contain tokenized sentences
tokens = {'input_ids': [], 'attention_mask': []}

for sentence in sentences:
    #tokenize sentences and append it to a dictionary list
    new_tokens = tokenizer.encode_plus(sentence, max_length=128, truncation=True,
                                      padding='max_length', return_tensors='pt')
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    
#reformat a list of tensor into a single tensor
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

In [81]:
tokens['input_ids'].shape

torch.Size([6, 128])

In [82]:
outputs = model(**tokens)

In [83]:
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [84]:
embeddings = outputs.last_hidden_state

In [85]:
embeddings.shape

torch.Size([6, 128, 768])

In [86]:
embeddings

tensor([[[-6.9230e-02,  6.2300e-01,  3.5369e-02,  ...,  8.0334e-01,
           1.6314e+00,  3.2812e-01],
         [ 3.6729e-02,  6.8419e-01,  1.9460e-01,  ...,  8.4759e-02,
           1.4747e+00, -3.0080e-01],
         [-1.2140e-02,  6.5431e-01, -7.2718e-02,  ..., -3.2600e-02,
           1.7717e+00, -6.8121e-01],
         ...,
         [ 1.9532e-01,  1.1085e+00,  3.3905e-01,  ...,  1.2826e+00,
           1.0114e+00, -7.2754e-02],
         [ 9.0217e-02,  1.0288e+00,  3.2973e-01,  ...,  1.2940e+00,
           9.8651e-01, -1.1125e-01],
         [ 1.2404e-01,  9.7365e-01,  3.9329e-01,  ...,  1.1359e+00,
           8.7685e-01, -1.0435e-01]],

        [[-3.2124e-01,  8.2512e-01,  1.0554e+00,  ..., -1.8555e-01,
           1.5169e-01,  3.9366e-01],
         [-7.1457e-01,  1.0297e+00,  1.1217e+00,  ...,  3.3118e-02,
           2.3820e-01, -1.5632e-01],
         [-2.3522e-01,  1.1353e+00,  8.5941e-01,  ..., -4.3096e-01,
          -2.7241e-02, -2.9676e-01],
         ...,
         [-5.4000e-01,  3

## Mean Pooling

In [87]:
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([6, 128])

In [88]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([6, 128, 768])

In [95]:
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 

In [96]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([6, 128, 768])

In [97]:
masked_embeddings

tensor([[[-0.0692,  0.6230,  0.0354,  ...,  0.8033,  1.6314,  0.3281],
         [ 0.0367,  0.6842,  0.1946,  ...,  0.0848,  1.4747, -0.3008],
         [-0.0121,  0.6543, -0.0727,  ..., -0.0326,  1.7717, -0.6812],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000]],

        [[-0.3212,  0.8251,  1.0554,  ..., -0.1855,  0.1517,  0.3937],
         [-0.7146,  1.0297,  1.1217,  ...,  0.0331,  0.2382, -0.1563],
         [-0.2352,  1.1353,  0.8594,  ..., -0.4310, -0.0272, -0.2968],
         ...,
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000,  0.0000]],

        [[-0.7576,  0.8399, -0.3792,  ...,  0.1271,  1.2514,  0.1365],
         [-0.6591,  0.7614, -0.4662,  ...,  0

In [98]:
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([6, 768])

In [99]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([6, 768])

In [100]:
counts

tensor([[11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11., 11.,
         11., 11., 11., 11., 11., 11., 11., 11., 11.

In [101]:
mean_pooled = summed / summed_mask

In [102]:
mean_pooled

tensor([[ 0.0745,  0.8637,  0.1795,  ...,  0.7734,  1.7247, -0.1803],
        [-0.3715,  0.9729,  1.0840,  ..., -0.2552, -0.2759,  0.0358],
        [-0.5030,  0.7950, -0.1240,  ...,  0.1441,  0.9704, -0.1791],
        [-0.0132,  0.9773,  1.4516,  ..., -0.8462, -1.4004, -0.4118],
        [-0.2019,  0.0597,  0.8603,  ..., -0.0100,  0.8431, -0.0841],
        [-0.2131,  1.0175, -0.8833,  ...,  0.7371,  0.1947, -0.3011]],
       grad_fn=<DivBackward0>)

-----------------------

In [103]:
from sklearn.metrics.pairwise import cosine_similarity

In [104]:
# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().numpy()

# calculate
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)


array([[0.3308892 , 0.7219259 , 0.17475471, 0.44709635, 0.5548363 ]],
      dtype=float32)

### With the sentence transformer

In [112]:



model_name = 'bert-base-nli-mean-tokens'

In [113]:
from sentence_transformers import SentenceTransformer

In [114]:
model = SentenceTransformer(model_name)

In [115]:
embeddings = model.encode(sentences)

In [116]:
embeddings

array([[ 0.07446156,  0.86369616,  0.17946291, ...,  0.77344   ,
         1.7247493 , -0.1802747 ],
       [-0.37146357,  0.97290134,  1.0839922 , ..., -0.25521314,
        -0.27593705,  0.03575896],
       [-0.50298285,  0.79498583, -0.12402609, ...,  0.14406338,
         0.9703752 , -0.179116  ],
       [-0.01324293,  0.97728604,  1.4515941 , ..., -0.84616524,
        -1.4004319 , -0.41184407],
       [-0.20192575,  0.05970386,  0.8602744 , ..., -0.01000801,
         0.84306234, -0.08407753],
       [-0.21311863,  1.017493  , -0.88327694, ...,  0.7371028 ,
         0.1946914 , -0.30111343]], dtype=float32)

In [117]:
embeddings.shape

(6, 768)

In [118]:
cosine_similarity(
    [embeddings[0]],
    embeddings[1:]

)

array([[0.3308892 , 0.7219259 , 0.17475471, 0.44709635, 0.5548363 ]],
      dtype=float32)