In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
# to put the package in sys path
# Alternate: make the package pip installable!

import sys
sys.path.append("../")

In [3]:
from pprint import pprint

In [4]:
from transformers import AutoModel

In [5]:
from sentform.modeling import SentenceTransformer
from sentform.pooling import MeanPooling
from sentform.utils import pairwise_cosine_similarity, set_seed

In [6]:
set_seed(42)

# SentenceTransformer Embeddings

The `SentenceTransformer` is able to take in any backbone that is supported.
In general, these backbones are BERT-based / BERT variants which give embeddings for each token.
So, to get the embeddings for the whole sentence, we need a mechanism to aggregate these token embeddings.
We can use `sentform.pooling.PoolingLayer` to do so. `MeanPooling` is a standard approach to aggregate.

In [7]:
backbone = AutoModel.from_pretrained("bert-base-uncased")

In [8]:
sentformer = SentenceTransformer(
    backbone=backbone,
    pooling_layer=MeanPooling()
)



In [9]:
sentformer

SentenceTransformer(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [10]:
sentformer.embedding_dim

768

In [11]:
sentences = [
    "I love cats.",
    "I don't like mangoes.",
    "They are using NLP in the company Fetch."
]


In [12]:
embeddings = sentformer.encode(sentences)
embeddings.shape

torch.Size([3, 768])

In [13]:
embeddings

tensor([[ 0.5344,  0.3247, -0.1033,  ..., -0.0295,  0.2302,  0.2154],
        [ 0.2443,  0.2077, -0.2987,  ...,  0.1340,  0.0335, -0.0820],
        [ 0.0744, -0.1423,  0.2127,  ..., -0.4782,  0.1212,  0.1719]])

In [14]:
# sanity-check similarity
pairwise_cosine_similarity(embeddings)

tensor([[1.0000, 0.7191, 0.4666],
        [0.7191, 1.0000, 0.4731],
        [0.4666, 0.4731, 1.0000]])

# Multi-Task learner

Here, we implement `MultiTaskFormer` which takes in any backbone mentioned in the previous section.
Plus, it also takes arbitrary number of `NetworkHead`.

In [15]:
from sentform.modeling import MultiTaskFormer
from sentform.heads import ClassificationHead, NERHead

In [21]:
# Needs fine-tuning of these heads
# Left the tuning part for brevity as per assignment
multi_tasker = MultiTaskFormer(
    heads=[
        ClassificationHead(
            backbone.config.hidden_size,
            num_classes=3,
            labels=["Positive", "Neutral", "Negative"],
            multi_label=True
        ),
        NERHead(
            backbone.config.hidden_size,
            num_tags=3,
            ner_tags=["Person", "Organization", "Location"],
            multi_label=False
        )
    ],
    backbone=backbone,
)

In [22]:
outputs = multi_tasker(sentences)

In [23]:
outputs

{'head_0': {'logits': tensor([[-0.1656, -0.3476,  0.4844],
          [-0.0076, -0.5315,  0.1446],
          [ 0.0027, -0.2253,  0.3223]]),
  'predicted_labels': [['Negative'], ['Negative'], ['Positive', 'Negative']]},
 'head_1': {'logits': tensor([[[ 0.3650,  0.1345, -0.2514],
           [ 0.5034,  0.0140, -0.2033],
           [ 0.2411, -0.0171, -0.1203],
           [ 0.4246, -0.0356, -0.1672],
           [ 0.2063,  0.3407,  0.4070],
           [ 0.2124, -0.2770,  0.1768],
           [ 0.3093, -0.0320, -0.1235],
           [ 0.4071, -0.0214, -0.0909],
           [ 0.4120,  0.0069, -0.1272],
           [ 0.3855, -0.0453,  0.0112],
           [ 0.3581, -0.0378, -0.0568],
           [ 0.2946, -0.0275, -0.1267]],
  
          [[ 0.3164,  0.2235, -0.2897],
           [ 0.3439, -0.0085, -0.2358],
           [ 0.4287,  0.1661, -0.1273],
           [ 0.1695, -0.1226, -0.1536],
           [ 0.0375,  0.3355,  0.1454],
           [-0.2184,  0.0718,  0.0032],
           [ 0.3735,  0.3854, -0.5621]

In [31]:
for i, sentence in enumerate(sentences):
    print(f"Sentence: {sentence}")
    for head_key, head_output in outputs.items():
        predicted_labels = head_output["predicted_labels"][i]
        logits_shape = head_output["logits"][i].shape
        print(f"{head_key} | Labels: {predicted_labels} | Logits shape: {logits_shape}")
    print("-" * 7)


Sentence: I love cats.
head_0 | Labels: ['Negative'] | Logits shape: torch.Size([3])
head_1 | Labels: ['Person', 'Person', 'Person', 'Person', 'Location', 'Person'] | Logits shape: torch.Size([12, 3])
-------
Sentence: I don't like mangoes.
head_0 | Labels: ['Negative'] | Logits shape: torch.Size([3])
head_1 | Labels: ['Person', 'Person', 'Person', 'Person', 'Organization', 'Organization', 'Organization', 'Location', 'Person', 'Location'] | Logits shape: torch.Size([12, 3])
-------
Sentence: They are using NLP in the company Fetch.
head_0 | Labels: ['Positive', 'Negative'] | Logits shape: torch.Size([3])
head_1 | Labels: ['Organization', 'Organization', 'Organization', 'Person', 'Person', 'Person', 'Location', 'Location', 'Organization', 'Organization', 'Person', 'Organization'] | Logits shape: torch.Size([12, 3])
-------
