In [2]:
# Optional, currently disabled: route HTTP(S) traffic through a local proxy
# listening on 127.0.0.1:7890 (presumably for downloading the pretrained
# models - uncomment only if that proxy is actually needed).
# import os
# os.environ['http_proxy'] = 'http://127.0.0.1:7890'
# os.environ['https_proxy'] = 'http://127.0.0.1:7890'

In [3]:
import torch
import torch.nn.functional as F
from copy import deepcopy

In [4]:
# IPython `??` introspection: show the docstring/source of F.cosine_similarity.
F.cosine_similarity??

[0;31mDocstring:[0m
cosine_similarity(x1, x2, dim=1, eps=1e-8) -> Tensor

Returns cosine similarity between ``x1`` and ``x2``, computed along dim. ``x1`` and ``x2`` must be broadcastable
to a common shape. ``dim`` refers to the dimension in this common shape. Dimension ``dim`` of the output is
squeezed (see :func:`torch.squeeze`), resulting in the
output tensor having 1 fewer dimension.

.. math ::
    \text{similarity} = \dfrac{x_1 \cdot x_2}{\max(\Vert x_1 \Vert _2, \epsilon) \cdot \max(\Vert x_2 \Vert _2, \epsilon)}

Supports :ref:`type promotion <type-promotion-doc>`.

Args:
    x1 (Tensor): First input.
    x2 (Tensor): Second input.
    dim (int, optional): Dimension along which cosine similarity is computed. Default: 1
    eps (float, optional): Small value to avoid division by zero.
        Default: 1e-8

Example::

    >>> input1 = torch.randn(100, 128)
    >>> input2 = torch.randn(100, 128)
    >>> output = F.cosine_similarity(input1, input2)
    >>> print(output)
[0;31mTy

$$
\text{similarity} = \dfrac{x_1 \cdot x_2}{\max(\Vert x_1 \Vert _2, \epsilon) \cdot \max(\Vert x_2 \Vert _2, \epsilon)} 
$$

In [5]:
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample, models
from torch.utils.data import DataLoader

# Load the pretrained 'all-MiniLM-L6-v2' SentenceTransformer and keep it on the CPU.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

  from tqdm.autonotebook import tqdm, trange


## Model

In [6]:
# Display the module pipeline of the loaded SentenceTransformer.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

- `model[0]`: `token_embeddings` 
- `model[1]`: `sentence_embedding` (`pooling_mode_mean_tokens`)

> 其他的 Pooling 方法（添加 cls token 进行 pooling）
> ```python
> pooling_model = models.Pooling(word_embed_model.get_word_embedding_dimension(),
>                                pooling_mode='cls',
>                                pooling_mode_cls_token=True,
>                                pooling_mode_mean_tokens=False)
> ```

## Dataloader

In [7]:
# Training data: a single positive sentence pair (label=1).
# The negative pair (label=0) is intentionally left disabled.
train_examples = [
    InputExample(
        texts=['This is a positive pair', 'Where the distance will be minimized'],
        label=1,
    ),
    # InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)
]

In [8]:
# Sentence-pair input: wrap the examples in a shuffled DataLoader that
# yields batches of up to two examples.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=2,
    shuffle=True,
)

In [9]:
# Inspect the collate function the DataLoader uses before it is replaced
# (the recorded output shows torch's default_collate).
train_dataloader.collate_fn

<function torch.utils.data._utils.collate.default_collate(batch)>

In [10]:
# Replace the default collate function with the model's smart_batching_collate.
train_dataloader.collate_fn = model.smart_batching_collate
# Fetch the first batch from the loader and display it. In the recorded output
# the batch is a tuple: (list of tokenized feature dicts, one per sentence of
# the pair; tensor of labels).
batch = next(iter(train_dataloader))
batch

([{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3893, 3940,  102]]),
   'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]),
   'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])},
  {'input_ids': tensor([[  101,  2073,  1996,  3292,  2097,  2022, 18478,  2094,   102]]),
   'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]),
   'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}],
 tensor([1]))

## Losses

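- $(a,b)$: the embeddings of a sentence pair
- $d(a,b)$: the distance between them (cosine distance, $1-\cos(a,b)$, by default)
- $\epsilon$: the margin (default $0.5$); $\ell$: the pair label

$$
\mathcal{L}(a,b,\ell)=
\begin{cases}
\frac12\, d(a,b)^2, & \ell=1\\
\frac12\, \text{ReLU}^2\big(\epsilon-d(a,b)\big), & \ell=0
\end{cases}
$$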

In [11]:
# IPython `??` introspection: show the init signature and docstring/source of ContrastiveLoss.
losses.ContrastiveLoss??

[0;31mInit signature:[0m
[0mlosses[0m[0;34m.[0m[0mContrastiveLoss[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0;34m'SentenceTransformer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdistance_metric[0m[0;34m=[0m[0;34m<[0m[0mfunction[0m [0mSiameseDistanceMetric[0m[0;34m.[0m[0;34m<[0m[0;32mlambda[0m[0;34m>[0m [0mat[0m [0;36m0x7f78730e9c10[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmargin[0m[0;34m:[0m [0;34m'float'[0m [0;34m=[0m [0;36m0.5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msize_average[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'None'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in
a tree structure. You can assign the submodules as regular attributes::

  

In [12]:
# Contrastive loss wrapping `model`; distance metric, margin and size_average
# are left at their defaults.
train_loss = losses.ContrastiveLoss(model=model)

In [13]:
# Shape of the first parameter registered on the loss module. The parameters
# come from the wrapped model; presumably this first one is the word-embedding
# matrix (vocab_size x hidden_size) - TODO confirm.
next(iter(train_loss.named_parameters()))[1].shape

torch.Size([30522, 384])

In [14]:
# Inspect the distance function the loss uses. With the defaults it is
# SiameseDistanceMetric.COSINE_DISTANCE, i.e.
#   lambda x, y: 1 - F.cosine_similarity(x, y)
train_loss.distance_metric??

[0;31mSignature:[0m [0mtrain_loss[0m[0;34m.[0m[0mdistance_metric[0m[0;34m([0m[0mx[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m        [0mCOSINE_DISTANCE[0m [0;34m=[0m [0;32mlambda[0m [0mx[0m[0;34m,[0m [0my[0m[0;34m:[0m [0;36m1[0m [0;34m-[0m [0mF[0m[0;34m.[0m[0mcosine_similarity[0m[0;34m([0m[0mx[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/miniconda3/envs/nlp_study/lib/python3.8/site-packages/sentence_transformers/losses/ContrastiveLoss.py
[0;31mType:[0m      function

## Model.forward

In [15]:
# Re-display the batch fetched earlier: a (features, labels) tuple.
batch

([{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3893, 3940,  102]]),
   'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]),
   'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])},
  {'input_ids': tensor([[  101,  2073,  1996,  3292,  2097,  2022, 18478,  2094,   102]]),
   'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]),
   'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}],
 tensor([1]))

In [16]:
# Second element of the batch: the label tensor.
batch[1]

tensor([1])

In [17]:
# Shape of the token ids of the first sentence; the recorded output is [1, 7]
# (presumably batch size 1 x 7 tokens).
batch[0][0]['input_ids'].shape

torch.Size([1, 7])

In [18]:
# Unpack the batch into the per-sentence feature dicts and the labels,
# then display the features.
features, labels = batch
features

[{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3893, 3940,  102]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[  101,  2073,  1996,  3292,  2097,  2022, 18478,  2094,   102]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}]

In [19]:
# Show the pipeline again for reference:
# model[0] = Transformer, model[1] = Pooling, model[2] = Normalize.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [20]:
# Keep an independent copy of the tokenized features before running the model
# on them. NOTE(review): presumably needed because the model's modules add
# their outputs to the input dict in place - confirm.
feature_cpy = deepcopy(features)

In [21]:
# Run the first two modules by hand (Transformer, then Pooling; Normalize is
# skipped) and show the first 5 dimensions of the pooled sentence embedding.
model[1](model[0](features[0]))['sentence_embedding'][0, :5]

tensor([-0.2930,  0.3243, -0.6169, -0.0097, -0.1806], grad_fn=<SliceBackward0>)

In [22]:
# Mean pooling from scratch: average the token embeddings over the sequence
# dimension. The divisor is the number of real tokens, taken from the attention
# mask instead of a hard-coded sequence length, so the cell stays correct for
# sentences of any length (and for padded batches).
token_embeddings = model[0](feature_cpy[0])['token_embeddings']
# (batch, seq_len) -> (batch, seq_len, 1) so the mask broadcasts over the hidden dimension.
token_mask = feature_cpy[0]['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
(torch.sum(token_embeddings * token_mask, dim=1) / token_mask.sum(dim=1))[0, :5]

tensor([-0.2930,  0.3243, -0.6169, -0.0097, -0.1806], grad_fn=<SliceBackward0>)

## Forward loss

In [23]:
# Encode both sentences of the pair with the full pipeline and keep only the
# pooled sentence vector of each.
sent1_embed, sent2_embed = (
    model(sentence_features)['sentence_embedding']
    for sentence_features in features
)

In [24]:
# Distance between the two sentence embeddings, using the loss's own metric.
train_loss.distance_metric(sent1_embed, sent2_embed)

tensor([0.9867], grad_fn=<RsubBackward1>)

In [25]:
# The same distance computed by hand: cosine distance = 1 - cosine similarity.
1 - F.cosine_similarity(sent1_embed, sent2_embed)

tensor([0.9867], grad_fn=<RsubBackward1>)

In [26]:
# Contrastive loss of the batch (a single positive pair, label=1).
train_loss(features, labels)

tensor(0.4868, grad_fn=<MeanBackward0>)

In [27]:
# Positive-pair (label=1) loss by hand: half the squared cosine distance.
# It reproduces the value returned by train_loss in the recorded outputs.
0.5 * (1 - F.cosine_similarity(sent1_embed, sent2_embed)).pow(2)

tensor([0.4868], grad_fn=<MulBackward0>)

## Pooling methods

In [28]:
# Rebuild the single positive-pair batch so this section starts from fresh,
# unmodified feature dicts.
train_examples = [
    InputExample(
        texts=['This is a positive pair', 'Where the distance will be minimized'],
        label=1,
    ),
    # InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)
]
# Sentence inputs: the model's collate function is passed to the loader directly.
train_dataloader = DataLoader(
    train_examples,
    shuffle=True,
    batch_size=2,
    collate_fn=model.smart_batching_collate,
)
# First batch -> (features, labels); display the tokenized features.
batch = next(iter(train_dataloader))
features, labels = batch
features

[{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3893, 3940,  102]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[  101,  2073,  1996,  3292,  2097,  2022, 18478,  2094,   102]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}]

### pooling_mode_mean_tokens

In [29]:
# Token-level encoder: 'bert-base-uncased'.
word_embed_model = models.Transformer('bert-base-uncased')
# Pooling layer over the token embeddings: only mean pooling is enabled.
pooling_model = models.Pooling(
    word_embedding_dimension=word_embed_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_sqrt_len_tokens=False,
)
# NOTE: this rebinds `model`; from here on it is the BERT + mean-pooling
# pipeline, not the all-MiniLM-L6-v2 model loaded at the top.
model = SentenceTransformer(modules=[word_embed_model, pooling_model], device='cpu')

In [30]:
# First 5 dimensions of the sentence embedding of the first sentence, as
# produced by the mean-pooling pipeline.
model(features[0])['sentence_embedding'][0][:5]

tensor([-0.0863, -0.2668,  0.5492, -0.4936, -0.1374], grad_fn=<SliceBackward0>)

### cls pooling method

In [31]:
# Token-level encoder: 'bert-base-uncased' (loaded again for this variant).
word_embed_model = models.Transformer('bert-base-uncased')
# Pooling layer over the token embeddings that keeps the CLS token.
# NOTE(review): pooling_mode='cls' presumably already implies the two boolean
# flags below, which would make them redundant - confirm against the Pooling docs.
pooling_model = models.Pooling(
    word_embedding_dimension=word_embed_model.get_word_embedding_dimension(),
    pooling_mode='cls',
    pooling_mode_cls_token=True,
    pooling_mode_mean_tokens=False,
)
# NOTE: `model` is rebound once more; it is now the BERT + CLS-pooling pipeline.
model = SentenceTransformer(modules=[word_embed_model, pooling_model], device='cpu')

In [32]:
# First 5 dimensions of the sentence embedding of the first sentence, as
# produced by the CLS-pooling pipeline.
model(features[0])['sentence_embedding'][0][:5]

tensor([-0.1775, -0.0474,  0.1351, -0.3242, -0.5006], grad_fn=<SliceBackward0>)

### cls pooling from scratch

In [33]:
# Shape of the token-level output of the Transformer module; the recorded
# output is [1, 7, 768] (presumably batch x tokens x hidden size).
model[0](features[0])['token_embeddings'].shape

torch.Size([1, 7, 768])

In [34]:
# CLS pooling by hand: take the embedding of the first token (index 0) of the
# first sequence. In the recorded outputs it equals the sentence embedding
# returned by the CLS-pooling pipeline.
model[0](features[0])['token_embeddings'][0, 0][:5]

tensor([-0.1775, -0.0474,  0.1351, -0.3242, -0.5006], grad_fn=<SliceBackward0>)