In [2]:
!pip install transformers
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install datasets

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m91.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multi

In [5]:
%%writefile sentenceTransformer.py

#Task 1
#Sentence transformer architecture
#the base model is bert-base-uncased (Encoder)
#to obtain sentence embeddings, two approaches can be used:
#the first is to mean-pool the embeddings of the individual tokens in a sentence
#the second is to use the embedding of the [CLS] token, which summarizes the entire sentence


import torch
from transformers import AutoTokenizer, AutoModel


class SentenceTransformer(torch.nn.Module):
    """Encode text into one fixed-size vector per input sequence.

    A pretrained transformer encoder produces one embedding per token; these are
    reduced to a single sentence embedding either by attention-masked mean pooling
    (default) or by taking the embedding of the first token ([CLS]).
    """

    def __init__(self, model_name_or_path = 'bert-base-uncased', non_mean_pooling: bool = False, display_token_embeddings: bool = False, train_model: bool = False):
        """
        param model_name_or_path: Name or path passed to AutoTokenizer/AutoModel.from_pretrained,
        param non_mean_pooling: If True use [CLS] pooling, otherwise mean pooling,
        param display_token_embeddings: If True, print token and sentence embeddings in forward(),
        param train_model: Stored on the instance; not read anywhere in this class.
        """
        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
        self.display_token_embeddings = display_token_embeddings
        # The flag must NOT be stored as ``self.non_mean_pooling``: an instance attribute
        # with that name shadows the ``non_mean_pooling`` method and makes the call in
        # forward() fail with "'bool' object is not callable".
        self.use_cls_pooling = non_mean_pooling
        self.train_model = train_model

    def mean_pooling(self, model_output, attention_mask):
        """
        Mean Pooling - Take the average of all non-padding tokens in a sequence to get a single vector.
        param model_output: Model output exposing ``last_hidden_state``,
        param attention_mask: Attention mask of the input sequence (1 = real token, 0 = padding),
        return: Sentence embeddings (single vector per sequence)
        """
        token_embeddings = model_output.last_hidden_state

        # Broadcast the mask over the hidden dimension so padded positions contribute zero.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # The clamp avoids a division by zero for a sequence whose mask is all zeros.
        token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        return summed_embeddings / token_counts

    def non_mean_pooling(self, model_output, attention_mask):
        """
        Non-Mean Pooling - Take the last hidden state of the first token ([CLS]) in the sequence to get a single vector.
        param model_output: Model output exposing ``last_hidden_state``,
        param attention_mask: Attention mask of the input sequence (unused; kept for a signature parallel to mean_pooling),
        return: Sentence embeddings
        """
        return model_output.last_hidden_state[:, 0]

    def forward(self, input_ids, attention_mask):
        """
        Run the encoder and pool its token embeddings.
        param input_ids: Token ids of the input sequences,
        param attention_mask: Attention mask of the input sequences,
        return: Sentence embeddings, one vector per input sequence
        """
        model_output = self.model(input_ids, attention_mask)

        if self.display_token_embeddings:
            print("embeddings of individual tokens in the sentence/text/sequence")
            print(model_output.last_hidden_state[0])
            print('size of the embeddings before transformation')
            print(model_output.last_hidden_state[0].size())

        if self.use_cls_pooling:
            sentence_embeddings = self.non_mean_pooling(model_output, attention_mask)
        else:
            sentence_embeddings = self.mean_pooling(model_output, attention_mask)

        if self.display_token_embeddings:
            print("sentence embeddings for the sentence/text/sequence")
            print(sentence_embeddings)
            print('size of the embeddings after transformation')
            print(sentence_embeddings.size())

        return sentence_embeddings

    def encode_text(self, text):
        """
        Tokenize raw text and return its sentence embeddings.
        param text: A string or a list of strings,
        return: Sentence embeddings, one vector per input text
        """
        tokenized_input = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(self.device)
        return self(tokenized_input['input_ids'], tokenized_input['attention_mask'])



Writing sentenceTransformer.py


In [20]:
# The %%writefile cell above only writes sentenceTransformer.py to disk; import the
# class from that module so this cell also works on a fresh kernel.
from sentenceTransformer import SentenceTransformer

model = SentenceTransformer(display_token_embeddings=True)


In [21]:
# Encode a single sentence and report the shape of the pooled sentence embedding.
output = model.encode_text(["hello world how are you"])
print(output.shape)

embeddings of individual tokens in the sentence/text/sequence
tensor([[-0.0624, -0.0145,  0.0069,  ..., -0.4263,  0.1253,  0.0850],
        [-0.2205, -0.0817,  1.0345,  ...,  0.1185,  0.9602,  0.0598],
        [-0.2860,  0.2364,  1.3245,  ..., -0.6330,  0.5996, -0.1818],
        ...,
        [ 0.3956, -1.2445,  0.4668,  ..., -0.2550,  0.3197, -0.2652],
        [-0.2639, -0.9359,  0.7665,  ..., -0.0939,  0.2596, -0.8225],
        [ 0.5287, -0.0212, -0.2933,  ...,  0.1731, -0.3957, -0.2407]],
       device='cuda:0', grad_fn=<SelectBackward0>)
size of the embeddings before transformation
torch.Size([7, 768])
sentence embeddings for the sentence/text/sequence
tensor([[-4.5395e-02, -3.8827e-01,  5.1562e-01, -3.7692e-01,  1.3871e-01,
         -4.2901e-01,  4.0577e-01,  5.9627e-01,  3.2398e-02, -3.1430e-01,
         -2.1739e-02, -3.6843e-01,  1.9200e-02,  2.9716e-01, -2.8543e-01,
          2.9637e-01, -1.9475e-01,  3.2351e-01,  1.8238e-01,  6.1712e-01,
          2.2128e-01, -1.6681e-01,  7.54

In [10]:
%%writefile multi_task_sentence_transformer.py

#task 2: Multi task learning

#Head A - text classification
#use the same model from the task 1 (Sentence transformer)
#Add a classification head on top
#It is a feed-forward (linear) layer: input_dim = encoder hidden size (768 for bert-base), output_dim = number of classes
#The head returns raw logits; apply a softmax to them to obtain the probability of each class

#Head B- NER
#modify the sentence transformer model for NER task.
#Currently, the sentence transformer returns a single vector for a text.
#The model needs to be modified to obtain the vectors for all the tokens.
#Each token vector needs to be classified as an NER tags (BIO tags representation)
#Each Entity class will have B and I tags indicating the beginning and middle of the entity respectively.
#For ex: Donald Trump is the president. Donald: B-PER; Trump: I-PER; is - O

import torch
from transformers import AutoTokenizer, AutoModel


class MultiTaskSentenceTransformer(torch.nn.Module):
    """Shared transformer encoder with two linear heads.

    * ``classifier_head`` maps the pooled sentence vector to sentence-level class logits.
    * ``ner_head`` maps every token vector to per-token tag logits.

    Both heads return raw logits (no softmax is applied).
    """

    def __init__(self,
                 model_name,
                 num_classifier_labels: int = 1,
                 num_ner_labels: int = 1,
                 non_mean_pooling: bool = False,
                 display_token_embeddings: bool = False):
        """
        param model_name: Name or path passed to AutoTokenizer/AutoModel.from_pretrained,
        param num_classifier_labels: Output size of the sentence classification head,
        param num_ner_labels: Output size of the per-token NER head,
        param non_mean_pooling: If True pool with the [CLS] token, otherwise mean-pool,
        param display_token_embeddings: Stored on the instance; not read anywhere in this class.
        """
        super().__init__()

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        hidden_dim = self.model.config.hidden_size
        self.classifier_head = torch.nn.Linear(hidden_dim, num_classifier_labels).to(self.device)
        self.ner_head = torch.nn.Linear(hidden_dim, num_ner_labels).to(self.device)

        # Keep the pooling choice under a name that does not collide with the
        # ``non_mean_pooling`` method. Previously the argument was dropped, so forward()
        # tested the bound method itself (always truthy) and ignored the caller's choice.
        self.use_cls_pooling = non_mean_pooling
        self.display_token_embeddings = display_token_embeddings

    def mean_pooling(self, model_output, attention_mask):
        """Average the embeddings of the non-padding tokens of each sequence."""
        token_embeddings = model_output.last_hidden_state
        # Broadcast the mask over the hidden dimension so padded positions contribute zero.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # The clamp avoids a division by zero for a sequence whose mask is all zeros.
        token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed_embeddings / token_counts

    def non_mean_pooling(self, model_output, attention_mask):
        """Use the embedding of the first token ([CLS]) as the sentence vector.

        ``attention_mask`` is unused; it is accepted so both pooling methods share a signature.
        """
        return model_output.last_hidden_state[:, 0]  # [CLS] token

    def forward(self, input_ids, attention_mask):
        """
        Run the shared encoder and both heads.
        param input_ids: Token ids of the input sequences,
        param attention_mask: Attention mask of the input sequences,
        return: (classifier_logits, ner_logits) - one row of class logits per sequence
                and one row of tag logits per token
        """
        output = self.model(input_ids, attention_mask)
        if self.use_cls_pooling:
            sentence_vector = self.non_mean_pooling(output, attention_mask)
        else:
            sentence_vector = self.mean_pooling(output, attention_mask)
        token_vectors = output.last_hidden_state

        classifier_logits = self.classifier_head(sentence_vector)
        ner_logits = self.ner_head(token_vectors)

        return classifier_logits, ner_logits

    def predict(self, input_ids, attention_mask):
        """
        Return the arg-max class id per sequence and the arg-max tag id per token.

        Runs without gradient tracking and in eval mode (so dropout is disabled);
        the module's previous train/eval mode is restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                classifier_logits, ner_logits = self.forward(input_ids, attention_mask)
                return torch.argmax(classifier_logits, dim=-1), torch.argmax(ner_logits, dim=-1)
        finally:
            self.train(was_training)



Overwriting multi_task_sentence_transformer.py


In [32]:
# The %%writefile cell above only writes the module to disk; import what this cell uses
# so it also runs on a fresh kernel.
import torch
from multi_task_sentence_transformer import MultiTaskSentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"
# The constructor takes one size per head (there is no `num_labels` parameter).
model = MultiTaskSentenceTransformer(model_name='bert-base-uncased', num_classifier_labels=4, num_ner_labels=4)
tokenized_input = model.tokenizer(["hello world how are you"], return_tensors="pt")
classifier_logits, ner_logits = model(tokenized_input['input_ids'].to(device), tokenized_input['attention_mask'].to(device))
print(classifier_logits)
print(ner_logits)

tensor([[ 0.3089,  0.1999,  0.0633, -0.0440]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([[[ 0.3072, -0.2406,  0.5344, -0.0937],
         [ 0.4566, -0.1178,  0.3803, -0.4081],
         [-0.1791,  0.1307,  0.0569, -0.2245],
         [ 0.1328,  0.3651,  0.6373,  0.0726],
         [-0.1102,  0.3038,  0.6543, -0.0767],
         [-0.0080,  0.0835,  0.4231, -0.2158],
         [ 0.1748,  0.6065,  0.1471,  0.6369]]], device='cuda:0',
       grad_fn=<ViewBackward0>)


In [7]:
%%writefile trainer_sentence_transformer.py


#A trainer class for training the multi-task transformer model
#Two kinds of labels are supported: classification labels and NER tags
#The loss of each head is calculated and the two are added into a cumulative loss
#Both the heads and the transformer parameters are updated
#Options are available to freeze the parameters of the transformer or of either head


import torch
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

class SentenceTransformerTrainer():
  """Train a multi-task model (shared encoder + classifier head + NER head).

  The dataset must provide ``train`` and ``test`` splits. Which heads are trained is
  decided from the columns of the train split:

  * ``label``                -> the classification head is trained
  * ``ner_tags`` + ``words`` -> the NER head is trained

  When both are present the two losses are summed.
  """

  # Label value ignored by torch.nn.CrossEntropyLoss (its default ``ignore_index``).
  IGNORE_INDEX = -100

  def __init__(self,
               model,
               dataset,
               freeze_transformer: bool = False,
               freeze_classifier: bool = False,
               freeze_ner: bool = False):
    """
    param model: Module exposing ``model`` (encoder), ``classifier_head``, ``ner_head`` and
                 ``tokenizer``; calling it returns ``(classifier_logits, ner_logits)``,
    param dataset: Mapping of split name -> dataset with ``train`` and ``test`` splits,
    param freeze_transformer: Do not update the encoder parameters,
    param freeze_classifier: Do not update the classification head,
    param freeze_ner: Do not update the NER head.
    raises ValueError: if the train split has neither a ``label`` nor a ``ner_tags`` column.
    """
    self.model = model
    self.dataset = dataset

    #freezing layers
    if freeze_transformer:
      self._freeze(self.model.model)
    if freeze_classifier:
      self._freeze(self.model.classifier_head)
    if freeze_ner:
      self._freeze(self.model.ner_head)

    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.model.to(self.device)
    self.tokenizer = self.model.tokenizer

    # Decide which heads to train from *all* columns of the train split; checking
    # only the first column would see the text column and detect no labels.
    train_columns = self.dataset.column_names['train']
    has_classifier_labels = 'label' in train_columns
    has_ner_labels = 'ner_tags' in train_columns
    if not (has_classifier_labels or has_ner_labels):
      raise ValueError("dataset needs a 'label' and/or a 'ner_tags' column to train on")

    # Tag names ordered by id (None when they cannot be determined) and the inverse mapping.
    self.ner_label_list = self._resolve_ner_label_list() if has_ner_labels else None
    self.ner_label_to_id = {name: idx for idx, name in enumerate(self.ner_label_list or [])}

    self.train_loader, self.test_loader = self.pre_process_data(
        classifier=has_classifier_labels,
        ner=has_ner_labels
    )

    # Separate loss function for each head
    self.classifier_loss_function = torch.nn.CrossEntropyLoss()
    self.ner_loss_function = torch.nn.CrossEntropyLoss(ignore_index=self.IGNORE_INDEX)
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-5)

  @staticmethod
  def _freeze(module):
    """Exclude every parameter of ``module`` from gradient updates."""
    for param in module.parameters():
      param.requires_grad = False

  def _resolve_ner_label_list(self):
    """Return the NER tag names ordered by id, or None if they cannot be determined.

    Integer tags take their names from the column's class-label feature when it has
    one; string tags are collected from every split and sorted so ids are stable.
    """
    column_feature = self.dataset["train"].features["ner_tags"]
    names = getattr(getattr(column_feature, "feature", None), "names", None)
    if names:
      return list(names)

    tags = set()
    for split in self.dataset.values():
      for sequence in split["ner_tags"]:
        tags.update(sequence)
    if tags and all(isinstance(tag, str) for tag in tags):
      return sorted(tags)
    return None

  def _tag_to_id(self, tag):
    """Map one tag (string name or integer id) to its integer id."""
    if isinstance(tag, str):
      return self.ner_label_to_id[tag]
    return int(tag)

  def _is_outside_tag(self, tag):
    """True when ``tag`` is the outside tag "O"; False when that cannot be determined."""
    if isinstance(tag, str):
      return tag == "O"
    if self.ner_label_list is not None:
      return self.ner_label_list[tag] == "O"
    return False

  def tokenize_classifier_data(self, batch):
    """Tokenize the ``text`` column of a batch to fixed-length (128) inputs."""
    # No return_tensors: the result is stored by ``dataset.map`` as plain lists.
    return self.tokenizer(batch["text"], padding='max_length', truncation=True, max_length=128)

  def tokenize_ner_data(self, batch):
    """Tokenize pre-split ``words`` and align the word-level ``ner_tags`` to sub-tokens.

    Adds an ``ner_labels`` entry with one label id per token:

    * special/padding tokens (no word)   -> IGNORE_INDEX
    * first sub-token of a word          -> the word's tag id
    * further sub-tokens of a word       -> the word's tag id, except "O" -> IGNORE_INDEX
    """
    #the words key has the list of words for each sentence
    tokenized_inputs = self.tokenizer(batch["words"], padding='max_length', is_split_into_words=True, truncation=True, max_length=128)

    labels = []
    for i, word_tags in enumerate(batch["ner_tags"]):
      word_ids = tokenized_inputs.word_ids(batch_index=i)
      previous_word_idx = None
      label_ids = []
      for word_idx in word_ids:
        if word_idx is None:
          # [CLS], [SEP] and padding belong to no word and must not affect the loss.
          label_ids.append(self.IGNORE_INDEX)
        else:
          tag = word_tags[word_idx]
          is_first_sub_token = word_idx != previous_word_idx
          if is_first_sub_token or not self._is_outside_tag(tag):
            label_ids.append(self._tag_to_id(tag))
          else:
            label_ids.append(self.IGNORE_INDEX)
        previous_word_idx = word_idx
      labels.append(label_ids)

    tokenized_inputs["ner_labels"] = labels
    return tokenized_inputs

  def _to_tensor_dataset(self, split):
    """Build a TensorDataset of (input_ids, attention_mask[, label][, ner_labels])."""
    tensors = [
        torch.tensor(list(split["input_ids"])),
        torch.tensor(list(split["attention_mask"])),
    ]
    # Only add the label tensors that exist; TensorDataset cannot hold None.
    if self.has_classifier_labels:
      tensors.append(torch.tensor(list(split["label"])))
    if self.has_ner_labels:
      tensors.append(torch.tensor(list(split["ner_labels"])))
    return TensorDataset(*tensors)

  def pre_process_data(self, classifier: bool = False, ner: bool = False):
    """Tokenize the dataset and create the train and test DataLoaders.

    param classifier: The dataset has a ``label`` column to train the classifier head on,
    param ner: The dataset has ``words``/``ner_tags`` columns to train the NER head on,
    return: (train_loader, test_loader)
    raises ValueError: if both flags are False (there would be nothing to train on).
    """
    if not (classifier or ner):
      raise ValueError("at least one of `classifier` / `ner` must be True")

    # run() reads these flags to know which label tensors each batch contains.
    self.has_classifier_labels = classifier
    self.has_ner_labels = ner

    # NER needs the word-aligned tokenization; the sentence label is per example, so
    # it can be used with either tokenization.
    tokenize = self.tokenize_ner_data if ner else self.tokenize_classifier_data
    tokenized_dataset = self.dataset.map(tokenize, batched=True)

    train_loader = DataLoader(self._to_tensor_dataset(tokenized_dataset["train"]), batch_size=16, shuffle=True)
    # Evaluation data does not need shuffling; a fixed order keeps results comparable.
    test_loader = DataLoader(self._to_tensor_dataset(tokenized_dataset["test"]), batch_size=16, shuffle=False)
    return train_loader, test_loader

  def run(self, epochs=3):
    """Train for ``epochs`` epochs and print the mean training loss of each epoch.

    return: list with the mean training loss of every epoch.
    """
    epoch_losses = []
    self.model.train()
    for epoch in range(epochs):
      total_loss = 0.0
      num_batches = 0
      for batch in self.train_loader:
        tensors = [tensor.to(self.device) for tensor in batch]
        input_ids, attention_mask = tensors[0], tensors[1]
        # The remaining tensors are, in order, the labels of the enabled heads.
        remaining = tensors[2:]
        labels = remaining.pop(0) if self.has_classifier_labels else None
        ner_labels = remaining.pop(0) if self.has_ner_labels else None

        self.optimizer.zero_grad()
        classifier_logits, ner_logits = self.model(input_ids, attention_mask)

        losses = []
        if labels is not None:
          losses.append(self.classifier_loss_function(classifier_logits, labels))
        if ner_labels is not None:
          # Flatten (batch, seq, tags) vs (batch, seq) so every token is one sample.
          losses.append(self.ner_loss_function(ner_logits.reshape(-1, ner_logits.shape[-1]), ner_labels.reshape(-1)))

        #cumulative loss when both kinds of labels are present
        loss = torch.stack(losses).sum()

        loss.backward()
        self.optimizer.step()

        total_loss += loss.item()
        num_batches += 1

      mean_loss = total_loss / num_batches if num_batches else float("nan")
      epoch_losses.append(mean_loss)
      print(f"Epoch: {epoch}, Loss: {mean_loss}")
    return epoch_losses

Writing trainer_sentence_transformer.py


In [3]:
from datasets import load_dataset, DatasetDict


# Small subsets keep the demo fast; raise these for a real training run.
TRAIN_SIZE = 1000
TEST_SIZE = 200

raw_dataset = load_dataset("ag_news")

train_dataset = raw_dataset["train"].select(range(TRAIN_SIZE))
test_dataset = raw_dataset["test"].select(range(TEST_SIZE))
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

'trainer = SentenceTransformerTrainer(model, dataset, task_type=0)\ntrainer.run()'

In [9]:
%%writefile create_ner_dataset.py

#Dataset creation
#adding NER tags to ag-news dataset
#dataset will then have labels for classifier and NER


import spacy
from datasets import load_dataset, DatasetDict
# spaCy pipeline `en_core_web_sm`, loaded once at import time and reused by get_ner_tags.
nlp = spacy.load("en_core_web_sm")
# Entity types that get_ner_tags does not keep: tokens of these types are tagged "O".
NOT_REQUIRED_ENT_TYPES = ['CARDINAL', 'DATE', 'QUANTITY', 'ORDINAL', 'TIME']

def get_ner_tags(text):
    """Tokenize ``text`` with spaCy and return one ``(token_text, bio_tag)`` pair per token.

    Tokens inside an entity get ``"B-<TYPE>"`` / ``"I-<TYPE>"``. Tokens outside any
    entity, and tokens whose entity type is listed in NOT_REQUIRED_ENT_TYPES, get ``"O"``.
    """
    doc = nlp(text)
    ner_tags = []
    for token in doc:
        # spaCy marks tokens outside an entity with ent_iob_ == "O" and an *empty*
        # ent_type_, so test the IOB code; comparing ent_type_ to "O" never matches
        # and would produce the malformed tag "O-".
        inside_entity = token.ent_iob_ in ("B", "I")
        if inside_entity and token.ent_type_ not in NOT_REQUIRED_ENT_TYPES:
            tag = token.ent_iob_ + "-" + token.ent_type_
        else:
            tag = "O"
        ner_tags.append((token.text, tag))

    return ner_tags


def process_dataset(dataset):
    """Add ``ner_tags`` and ``words`` columns derived from each example's ``text``.

    ``words`` holds the spaCy token texts and ``ner_tags`` the BIO tag of each token;
    both lists have the same length. The text is tagged once per example in a single
    pass over the dataset.
    """
    def add_tags_and_words(example):
        tagged_tokens = get_ner_tags(example["text"])
        return {
            "ner_tags": [tag for _, tag in tagged_tokens],
            "words": [word for word, _ in tagged_tokens],
        }

    return dataset.map(add_tags_and_words, batched=False)

def create_dataset(train_size: int = 1000, test_size: int = 200):
  """Load ag_news, keep the first examples of each split and add NER columns.

  param train_size: Number of leading train examples to keep (capped at the split size),
  param test_size: Number of leading test examples to keep (capped at the split size),
  return: DatasetDict with ``train`` and ``test`` splits, each processed by process_dataset.
  """
  raw_dataset = load_dataset("ag_news")

  train_subset = raw_dataset["train"].select(range(min(train_size, len(raw_dataset["train"]))))
  test_subset = raw_dataset["test"].select(range(min(test_size, len(raw_dataset["test"]))))

  return DatasetDict({
      "train": process_dataset(train_subset),
      "test": process_dataset(test_subset),
  })


Overwriting create_ner_dataset.py


In [11]:
%%writefile main.py

from create_ner_dataset import create_dataset
from sentenceTransformer import SentenceTransformer
from multi_task_sentence_transformer import MultiTaskSentenceTransformer
from trainer_sentence_transformer import SentenceTransformerTrainer


dataset = create_dataset()

# Size each head from the data. The class count comes from the ``label`` column's
# class-label feature; the NER head gets one output per distinct tag string found
# in any split (sorted so the count and order are deterministic).
num_classifier_labels = dataset["train"].features["label"].num_classes
ner_label_names = sorted({
    tag
    for split in dataset.values()
    for tags in split["ner_tags"]
    for tag in tags
})

model = MultiTaskSentenceTransformer(
    model_name='bert-base-uncased',
    num_classifier_labels=num_classifier_labels,
    num_ner_labels=len(ner_label_names),
)
trainer = SentenceTransformerTrainer(model, dataset)
trainer.run()

Writing main.py
