In [4]:
!pip install transformers
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install datasets
!pip install seqeval

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
#Task 1
#Sentence transformer architecture
#the base model is bert-base-uncased (Encoder)
#to obtain sentence embeddings, two approaches can be used
#first is to mean pool embeddings of the individual tokens in a sentence
#second is to use the [CLS] token as it stores the information of the entire sentence


import torch
from transformers import AutoTokenizer, AutoModel


class SentenceTransformer(torch.nn.Module):
    def __init__(self, model_name_or_path = 'bert-base-uncased', non_mean_pooling: bool = False, display_token_embeddings: bool = False, train_model: bool = False):

        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)
        self.display_token_embeddings = display_token_embeddings
        self.non_mean_pooling = non_mean_pooling
        self.train_model = train_model

    def mean_pooling(self, model_output, attention_mask):
        """
        Mean Pooling - Take the average of all tokens in a sequence to get a single vector.
        param model_output: Last hidden states of the model,
        param attention_mask: Attention mask of the input sequence,
        return: Sentence embeddings (single vector)
        """
        token_embeddings = model_output.last_hidden_state

        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sentence_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        return sentence_embeddings

    def non_mean_pooling(self, model_output, attention_mask):
        """
        Non-Mean Pooling - Take the last hidden state of the first token ([CLS]) in the sequence to get a single vector.
        param model_output: Last hidden states of the model,
        param attention_mask: Attention mask of the input sequence,
        return: Sentence embeddings
        """
        token_embeddings = model_output.last_hidden_state

        sentence_embeddings = token_embeddings[:, 0]

        return sentence_embeddings

    def forward(self, input_ids, attention_mask):

        model_output = self.model(input_ids, attention_mask)

        if self.display_token_embeddings:
            print("embeddings of individual tokens in the sentence/text/sequence")
            print(model_output.last_hidden_state[0])
            print('size of the embeddings before transformation')
            print(model_output.last_hidden_state[0].size())

        if self.non_mean_pooling:
            sentence_embeddings = self.non_mean_pooling(model_output, attention_mask)
        else:
          sentence_embeddings = self.mean_pooling(model_output, attention_mask)

        if self.display_token_embeddings:
            print("sentence embeddings for the sentence/text/sequence")
            print(sentence_embeddings)
            print('size of the embeddings after transformation')
            print(sentence_embeddings.size())

        return sentence_embeddings

    def encode_text(self, text):
        tokenized_input = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(self.device)
        sentence_embeddings = self.forward(tokenized_input['input_ids'], tokenized_input['attention_mask'])
        return sentence_embeddings


In [6]:
model = SentenceTransformer(display_token_embeddings=True)
output = model.encode_text(["hello world how are you"])
print(output.size())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


embeddings of individual tokens in the sentence/text/sequence
tensor([[-0.0624, -0.0145,  0.0069,  ..., -0.4263,  0.1253,  0.0850],
        [-0.2205, -0.0817,  1.0345,  ...,  0.1185,  0.9602,  0.0598],
        [-0.2860,  0.2364,  1.3245,  ..., -0.6330,  0.5996, -0.1818],
        ...,
        [ 0.3956, -1.2445,  0.4668,  ..., -0.2550,  0.3197, -0.2652],
        [-0.2639, -0.9359,  0.7665,  ..., -0.0939,  0.2596, -0.8225],
        [ 0.5287, -0.0212, -0.2933,  ...,  0.1731, -0.3957, -0.2407]],
       device='cuda:0', grad_fn=<SelectBackward0>)
size of the embeddings before transformation
torch.Size([7, 768])
sentence embeddings for the sentence/text/sequence
tensor([[-4.5395e-02, -3.8827e-01,  5.1562e-01, -3.7692e-01,  1.3871e-01,
         -4.2901e-01,  4.0577e-01,  5.9627e-01,  3.2398e-02, -3.1430e-01,
         -2.1739e-02, -3.6843e-01,  1.9200e-02,  2.9716e-01, -2.8543e-01,
          2.9637e-01, -1.9475e-01,  3.2351e-01,  1.8238e-01,  6.1712e-01,
          2.2128e-01, -1.6681e-01,  7.54

In [7]:
import torch
from transformers import AutoTokenizer, AutoModel


class MultiTaskSentenceTransformer(torch.nn.Module):
    def __init__(self,
                 model_name,
                 num_classifier_labels: int = 1,
                 num_ner_labels: int = 1,
                 non_mean_pooling: bool = False,
                 display_token_embeddings: bool = False):

        super().__init__()

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        hidden_dim = self.model.config.hidden_size
        self.classifier_head = torch.nn.Linear(hidden_dim, num_classifier_labels).to(self.device)
        self.ner_head = torch.nn.Linear(hidden_dim, num_ner_labels).to(self.device)

    def mean_pooling(self, model_output, attention_mask):

        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sentence_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sentence_embeddings

    def non_mean_pooling(self, model_output, attention_mask):

        token_embeddings = model_output.last_hidden_state
        sentence_embeddings = token_embeddings[:, 0] #[CLS] token
        return sentence_embeddings

    def forward(self, input_ids, attention_mask):

        output = self.model(input_ids, attention_mask)
        sentence_vector = self.mean_pooling(output, attention_mask) if self.non_mean_pooling else self.non_mean_pooling(output, attention_mask)
        token_vectors = output.last_hidden_state

        classifier_logits = self.classifier_head(sentence_vector)
        ner_logits = self.ner_head(token_vectors)

        return classifier_logits, ner_logits

    def predict(self, input_ids, attention_mask):
        with torch.no_grad():
          classifier_logits, ner_logits = self.forward(input_ids, attention_mask)
          return torch.argmax(classifier_logits, dim=-1), torch.argmax(ner_logits, dim=-1)

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = MultiTaskSentenceTransformer(model_name='bert-base-uncased', num_classifier_labels=2, num_ner_labels=4)
tokenized_input = model.tokenizer(["hello world how are you"], return_tensors="pt")
classifier_logits, ner_logits = model(tokenized_input['input_ids'].to(device), tokenized_input['attention_mask'].to(device))
print(f"vector size of input: {tokenized_input}")
print(f"vector size of classifier ouput: {classifier_logits.size()}")
print(f"vector size for ner output: {ner_logits.size()}")

print(f"classifier output vector: {classifier_logits}")
print(f"ner output vector: {ner_logits}")

pred_label, pred_ner_label = model.predict(tokenized_input['input_ids'].to(device), tokenized_input['attention_mask'].to(device))
print(f"predicted label: {pred_label}")
print(f"predicted ner label: {pred_ner_label}")

vector size of input: {'input_ids': tensor([[ 101, 7592, 2088, 2129, 2024, 2017,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
vector size of classifier ouput: torch.Size([1, 2])
vector size for ner output: torch.Size([1, 7, 4])
classifier output vector: tensor([[0.0683, 0.1030]], device='cuda:0', grad_fn=<AddmmBackward0>)
ner output vector: tensor([[[-0.1856,  0.4300,  0.4030, -0.2387],
         [ 0.0310,  0.3503,  0.1555,  0.0433],
         [ 0.5703, -0.3340, -0.6750,  0.1094],
         [-0.2701,  0.1710, -0.2786, -0.2670],
         [-0.3352, -0.1313, -0.4721, -0.1442],
         [-0.1336, -0.2988, -0.4066,  0.1017],
         [-0.3462, -0.3016, -0.0708, -0.2890]]], device='cuda:0',
       grad_fn=<ViewBackward0>)
predicted label: tensor([1], device='cuda:0')
predicted ner label: tensor([[1, 1, 0, 1, 1, 3, 2]], device='cuda:0')


In [61]:
#Dataset creation
#adding NER tags to ag-news dataset
#dataset will then have labels for classifier and NER


import spacy
from datasets import load_dataset, DatasetDict
nlp = spacy.load("en_core_web_sm")
NOT_REQUIRED_ENT_TYPES = ['CARDINAL', 'DATE', 'QUANTITY', 'ORDINAL', 'TIME', 'FAC', 'LAW', 'PERCENT', 'MONEY', 'NORP']
TAGS_SET = set()


def get_ner_tags(text):
    doc = nlp(text)
    ner_tags = []
    j = 0
    for token in doc:
        if token.ent_type_ not in NOT_REQUIRED_ENT_TYPES:

          entity_tag = None
          if token.ent_iob_ != 'O':
            entity_tag = token.ent_iob_ + "-" + token.ent_type_
          else:
            entity_tag = token.ent_iob_

        else:
          entity_tag = "O"

        ner_tags.append((token.text, entity_tag))
        TAGS_SET.add(entity_tag)


    return ner_tags


def process_dataset(dataset):
    processed_dataset = dataset.map(
        lambda example: {"ner_tags": get_ner_tags(example["text"])},
        batched=False
    )
    processed_dataset = processed_dataset.map(
        lambda example: {"words": [token[0] for token in example["ner_tags"]]},
        batched=False
    )
    processed_dataset = processed_dataset.map(
        lambda example: {"ner_tags": [token[1] for token in example["ner_tags"]]},
        batched=False
    )
    return processed_dataset

def create_dataset():
  dataset = load_dataset("ag_news")

  train_dataset = dataset["train"].select(range(5000))
  test_dataset = dataset["test"].select(range(1000))
  dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

  train_dataset = process_dataset(train_dataset)
  test_dataset = process_dataset(test_dataset)
  dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
  return dataset, TAGS_SET


In [62]:
dataset, tag_set = create_dataset()

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [63]:
tag_set

{'B-EVENT',
 'B-GPE',
 'B-LANGUAGE',
 'B-LOC',
 'B-ORG',
 'B-PERSON',
 'B-PRODUCT',
 'B-WORK_OF_ART',
 'I-EVENT',
 'I-GPE',
 'I-LOC',
 'I-ORG',
 'I-PERSON',
 'I-PRODUCT',
 'I-WORK_OF_ART',
 'O'}

In [12]:
print(dataset['train'].column_names)

['text', 'label', 'ner_tags', 'words']


In [13]:
print(f"number of NER tags in a sample: {len(dataset['train'][0]['ner_tags'])}")
print(f"number of words in a sample: {len(dataset['train'][0]['words'])}")


number of NER tags in a sample: 31
number of words in a sample: 31


In [86]:
#Created a trainer class for training the multi task transformer model
#datasets of two types are created one for classification and the other for NER
#loss for both is calculated and added for a cumalative loss
#both the heads and the transformer parameters are updated.
#available options to freeze the parameters of trnasformer or the either head.


import torch
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from tqdm import tqdm
from seqeval.metrics import classification_report

class SentenceTransformerTrainer():
  def __init__(self,
               model,
               dataset,
               tag_set,
               freeze_entire_network: bool = False,
               freeze_transformer: bool = False,
               freeze_classifier: bool = False,
               freeze_ner: bool = False):


    self.model = model
    self.dataset = dataset
    self.label_map = {label: i for i, label in enumerate(tag_set)}
    self.reverse_label_map = {i: label for label, i in self.label_map.items()}

    #freezing layers
    if freeze_transformer:
      for param in self.model.model.parameters():
        param.requires_grad = False
    if freeze_classifier:
      for param in self.model.classifier_head.parameters():
        param.requires_grad = False
    if freeze_ner:
      for param in self.model.ner_head.parameters():
        param.requires_grad = False
    if freeze_entire_network:
      for param in self.model.parameters():
        param.requires_grad = False

    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.model.to(self.device)
    self.tokenizer = self.model.tokenizer


    #Depending upon the label type
    #if both types of labels are present
    #both the heads will be trained


    self.train_loader, self.test_loader = self.pre_process_data(
        classifier=True if 'label' in self.dataset.column_names["train"] else False,
        ner=True if 'ner_tags' in self.dataset.column_names["train"] else False
    )

    # Separate loss function for each head



  def tokenize_classifier_data(self, batch):
    return self.tokenizer(batch["text"], padding='max_length', truncation=True, max_length=128, return_tensors='pt')

  def tokenize_ner_data(self, batch):
    tokenized_inputs = self.tokenizer(batch["words"], padding='max_length', is_split_into_words = True,   truncation=True, max_length=128, return_tensors='pt')
    #the words key has the list of words for each sentence

    # aligning tags and words
    # adding -100 for 'O' Tags as they are irrelevant
    # -100 allows the loss function to ignore the O tags

    # Create a label mapping if it doesn't exist


    labels = []
    for i, label in enumerate(batch["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Map the label to its numerical ID
                label_ids.append(self.label_map.get(label[word_idx], -100))
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["ner_labels"] = labels
    return tokenized_inputs

  def pre_process_data(self, classifier: bool = False, ner: bool = False):

    #creating loaders for train and test data

    if ner:
      tokenized_dataset = self.dataset.map(self.tokenize_ner_data, batched=True)
    else:
      tokenized_dataset = self.dataset.map(self.tokenize_classifier_data, batched=True)

    train = TensorDataset(
        torch.tensor(tokenized_dataset["train"]["input_ids"]),
        torch.tensor(tokenized_dataset["train"]["attention_mask"]),
        torch.tensor(tokenized_dataset["train"]["label"]),
        torch.tensor(tokenized_dataset["train"]["ner_labels"])
    )
    test = TensorDataset(
        torch.tensor(tokenized_dataset["test"]["input_ids"]),
        torch.tensor(tokenized_dataset["test"]["attention_mask"]),
        torch.tensor(tokenized_dataset["test"]["label"]),
        torch.tensor(tokenized_dataset["test"]["ner_labels"])
    )
    train_loader = DataLoader(train, batch_size=16, shuffle=True)
    test_loader = DataLoader(test, batch_size=16, shuffle=True)
    return train_loader, test_loader

  def run(self, epochs=3):
    classifier_loss_function = torch.nn.CrossEntropyLoss()
    ner_loss_function = torch.nn.CrossEntropyLoss(ignore_index=-100)
    optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-5)

    for epoch in tqdm(range(epochs)):

        self.model.train()
        for batch in self.train_loader:
            input_ids, attention_mask, labels, ner_labels = batch
            input_ids, attention_mask, labels, ner_labels = input_ids.to(self.device), attention_mask.to(self.device), labels.to(self.device), ner_labels.to(self.device)

            optimizer.zero_grad()

            classifier_logits, ner_logits = self.model(input_ids, attention_mask)

            classifier_loss = classifier_loss_function(classifier_logits, labels)

            ner_loss = ner_loss_function(ner_logits.view(-1, ner_logits.shape[-1]), ner_labels.view(-1))

            #cumalative loss
            loss = classifier_loss + ner_loss

            loss.backward()
            optimizer.step()

        print(f"Epoch: {epoch}, Loss: {loss.item()}")

  def evaluate(self):
    self.model.eval()
    total_correct = 0
    total_samples = 0
    gold_labels = {
        "classifier": [],
        "ner": []
    }
    predicted_labels = {
        "classifier": [],
        "ner": []
    }
    with torch.no_grad():
      total_correct = {
          "classifier": 0,
          "ner": 0
      }
      total_samples = {
          "classifier": 0,
          "ner": 0
      }
      for batch in self.test_loader:
        input_ids, attention_mask, labels, ner_labels = batch
        input_ids, attention_mask, labels, ner_labels = input_ids.to(self.device), attention_mask.to(self.device), labels.to(self.device) , ner_labels.to(self.device)

        #classifier labels
        classifier_label_pred, ner_label_pred = self.model.predict(input_ids, attention_mask)


        total_correct["classifier"] += (classifier_label_pred == labels).sum().item()
        total_samples["classifier"] += labels.size(0)
        total_correct["ner"] += (ner_label_pred == ner_labels).sum().item()
        total_samples["ner"] += ner_labels.size(0) * ner_labels.size(1)

        batch_size = ner_labels.shape[0]
        seq_len = ner_labels.shape[1]
        for i in range(batch_size):
          gold_labels["ner"].append([self.reverse_label_map.get(idx.item(), 'O') for idx in ner_labels[i]])
          predicted_labels["ner"].append([self.reverse_label_map.get(idx.item(), 'O') for idx in ner_label_pred[i]])

    accuracy = {
        "classifier": (total_correct["classifier"] / total_samples["classifier"]) * 100,
        "ner": (total_correct["ner"] / total_samples["ner"]) * 100
    }
    print(f"total_correct: {total_correct}")
    print(f"total_samples: {total_samples}")
    print(f"Test Accuracy: {accuracy}")
    print(f"Classification Report for NER: \n{classification_report(gold_labels['ner'], predicted_labels['ner'])}")


In [64]:
model = MultiTaskSentenceTransformer(model_name='bert-base-uncased',
                                     num_classifier_labels=4,
                                     num_ner_labels=len(list(tag_set)))

trainer = SentenceTransformerTrainer(model, dataset, tag_set)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [65]:
trainer.run(epochs=5)

 20%|██        | 1/5 [01:44<06:56, 104.22s/it]

Epoch: 0, Loss: 0.5549182891845703


 40%|████      | 2/5 [03:28<05:13, 104.50s/it]

Epoch: 1, Loss: 0.4512898921966553


 60%|██████    | 3/5 [05:13<03:29, 104.54s/it]

Epoch: 2, Loss: 0.4452778398990631


 80%|████████  | 4/5 [06:58<01:44, 104.62s/it]

Epoch: 3, Loss: 0.4984246492385864


100%|██████████| 5/5 [08:42<00:00, 104.59s/it]

Epoch: 4, Loss: 0.381743460893631





In [66]:
trainer.evaluate()

total_correct: {'classifier': 906, 'ner': 41035}
total_samples: {'classifier': 1000, 'ner': 128000}
Test Accuracy: {'classifier': 90.60000000000001, 'ner': 32.05859375}


  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for NER: 
              precision    recall  f1-score   support

       EVENT       0.32      0.55      0.40        93
         GPE       0.62      0.81      0.70      1196
    LANGUAGE       0.00      0.00      0.00         3
         LOC       0.00      0.00      0.00        90
         ORG       0.36      0.42      0.39      1879
      PERSON       0.23      0.34      0.28      1029
     PRODUCT       0.00      0.00      0.00        40
 WORK_OF_ART       0.00      0.00      0.00        29

   micro avg       0.40      0.50      0.44      4359
   macro avg       0.19      0.27      0.22      4359
weighted avg       0.39      0.50      0.44      4359



In [77]:
model = MultiTaskSentenceTransformer(model_name='bert-base-uncased',
                                     num_classifier_labels=4,
                                     num_ner_labels=len(list(tag_set)))
trainer_forzen_transformer = SentenceTransformerTrainer(model, dataset, tag_set, freeze_transformer=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [78]:
trainer_forzen_transformer.run(epochs=3)

 33%|███▎      | 1/3 [00:33<01:07, 33.95s/it]

Epoch: 0, Loss: 3.632878303527832


 67%|██████▋   | 2/3 [01:09<00:35, 35.09s/it]

Epoch: 1, Loss: 3.0409512519836426


100%|██████████| 3/3 [01:44<00:00, 34.96s/it]

Epoch: 2, Loss: 2.5653743743896484





In [79]:
trainer_forzen_transformer.evaluate()

total_correct: {'classifier': 587, 'ner': 37471}
total_samples: {'classifier': 1000, 'ner': 128000}
Test Accuracy: {'classifier': 58.699999999999996, 'ner': 29.274218749999996}


  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for NER: 
              precision    recall  f1-score   support

       EVENT       0.00      0.00      0.00        93
         GPE       0.30      0.05      0.08      1196
    LANGUAGE       0.00      0.00      0.00         3
         LOC       0.00      0.00      0.00        90
         ORG       0.09      0.03      0.04      1879
      PERSON       0.09      0.00      0.00      1029
     PRODUCT       0.00      0.00      0.00        40
 WORK_OF_ART       0.00      0.00      0.00        29

   micro avg       0.09      0.02      0.04      4359
   macro avg       0.06      0.01      0.02      4359
weighted avg       0.14      0.02      0.04      4359



In [89]:
model = MultiTaskSentenceTransformer(model_name='bert-base-uncased',
                                     num_classifier_labels=4,
                                     num_ner_labels=len(list(tag_set)))
trainer_frozen_classifier = SentenceTransformerTrainer(model, dataset, tag_set, freeze_classifier=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [90]:
trainer_frozen_classifier.run(epochs=10)

 10%|█         | 1/10 [01:42<15:23, 102.64s/it]

Epoch: 0, Loss: 0.5792714953422546


 20%|██        | 2/10 [03:27<13:49, 103.72s/it]

Epoch: 1, Loss: 0.5215522646903992


 30%|███       | 3/10 [05:11<12:08, 104.14s/it]

Epoch: 2, Loss: 0.6388567686080933


 40%|████      | 4/10 [06:56<10:26, 104.36s/it]

Epoch: 3, Loss: 0.40041327476501465


 50%|█████     | 5/10 [08:41<08:42, 104.43s/it]

Epoch: 4, Loss: 0.2887763977050781


 60%|██████    | 6/10 [10:25<06:57, 104.50s/it]

Epoch: 5, Loss: 0.27558404207229614


 70%|███████   | 7/10 [12:10<05:13, 104.52s/it]

Epoch: 6, Loss: 0.2733636498451233


 80%|████████  | 8/10 [13:54<03:29, 104.57s/it]

Epoch: 7, Loss: 0.7582882642745972


 90%|█████████ | 9/10 [15:39<01:44, 104.53s/it]

Epoch: 8, Loss: 0.5312201976776123


100%|██████████| 10/10 [17:23<00:00, 104.39s/it]

Epoch: 9, Loss: 0.21184474229812622





In [91]:
trainer_frozen_classifier.evaluate()

total_correct: {'classifier': 909, 'ner': 41282}
total_samples: {'classifier': 1000, 'ner': 128000}
Test Accuracy: {'classifier': 90.9, 'ner': 32.2515625}


  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for NER: 
              precision    recall  f1-score   support

       EVENT       0.41      0.70      0.52        93
         GPE       0.68      0.81      0.74      1196
    LANGUAGE       0.00      0.00      0.00         3
         LOC       0.38      0.28      0.32        90
         ORG       0.31      0.46      0.37      1879
      PERSON       0.27      0.39      0.32      1029
     PRODUCT       0.00      0.00      0.00        40
 WORK_OF_ART       0.00      0.00      0.00        29

   micro avg       0.39      0.53      0.45      4359
   macro avg       0.26      0.33      0.28      4359
weighted avg       0.40      0.53      0.46      4359



In [83]:
model = MultiTaskSentenceTransformer(model_name='bert-base-uncased',
                                     num_classifier_labels=4,
                                     num_ner_labels=len(list(tag_set)))
trainer_frozen_ner = SentenceTransformerTrainer(model, dataset, tag_set, freeze_ner=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [84]:
trainer_frozen_ner.run(epochs=3)

 33%|███▎      | 1/3 [01:44<03:28, 104.46s/it]

Epoch: 0, Loss: 0.6478793025016785


 67%|██████▋   | 2/3 [03:29<01:44, 104.58s/it]

Epoch: 1, Loss: 0.5591111183166504


100%|██████████| 3/3 [05:13<00:00, 104.60s/it]

Epoch: 2, Loss: 0.4231916069984436





In [85]:
trainer_frozen_ner.evaluate()

total_correct: {'classifier': 905, 'ner': 40748}
total_samples: {'classifier': 1000, 'ner': 128000}
Test Accuracy: {'classifier': 90.5, 'ner': 31.834374999999998}


  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for NER: 
              precision    recall  f1-score   support

       EVENT       0.45      0.32      0.38        93
         GPE       0.55      0.80      0.65      1196
    LANGUAGE       0.00      0.00      0.00         3
         LOC       0.00      0.00      0.00        90
         ORG       0.33      0.39      0.36      1879
      PERSON       0.22      0.31      0.26      1029
     PRODUCT       0.00      0.00      0.00        40
 WORK_OF_ART       0.00      0.00      0.00        29

   micro avg       0.37      0.47      0.42      4359
   macro avg       0.19      0.23      0.21      4359
weighted avg       0.36      0.47      0.40      4359



In [87]:
model = MultiTaskSentenceTransformer(model_name='bert-base-uncased',
                                     num_classifier_labels=4,
                                     num_ner_labels=len(list(tag_set)))

frozen_network_trainer= SentenceTransformerTrainer(model, dataset, tag_set, freeze_entire_network=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]