In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [2]:
from transformers import BertTokenizerFast, BertForTokenClassification, BertForSequenceClassification
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
from transformers import TextDataset

In [None]:
!pip install python-docx

In [4]:
import json
# Read the JSON file
path = "/content/drive/MyDrive/model/B_data.json"
with open(path, 'r') as f:
    dataset = json.load(f)

In [5]:
path = "/content/drive/MyDrive/model/training_set.json"
with open(path, 'r') as f:
    test_dataset = json.load(f)

In [6]:
text, intent, ner = [], [], []
for i in dataset:
    text.append(i['text'])
    intent.append(i['intent'])
    ner.append(i['entities'].split())

In [14]:
test_text, test_intent, test_ner = [], [], []
for i in test_dataset:
    test_text.append(i['text'])
    test_intent.append(i['intent'])
    test_ner.append(i['entities'].split())

In [6]:
o = ner[0]
o

['O', 'O', 'O', 'O', 'B-DUR', 'I-DUR']

In [7]:
o = text[0].strip().split()
o

['Set', 'a', 'timer', 'for', '10', 'minutes.']

Just test

In [15]:
test_ner[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'B-TASK',
 'I-TASK',
 'B-DATE',
 'I-DATE',
 'O',
 'B-TIME',
 'I-TIME']

In [8]:
unique_ = set(test_intent)
num_i = len(unique_)

unique_, num_i

({"'Schedule Appointment'",
  "'Schedule Meeting'",
  "'Set Alarm'",
  "'Set Reminder'",
  "'Set Timer'"},
 5)

In [16]:
one_r = [tag for subset in test_ner for tag in subset]
uni = set(one_r)
num_ = len(uni)
uni, num_

({'B-DATE',
  'B-DUR',
  'B-TASK',
  'B-TIME',
  'I-DATE',
  'I-DUR',
  'I-TASK',
  'I-TIME',
  'O'},
 9)

Delete what is above

In [10]:
unique_intents = set(intent)
num_intent_labels = len(unique_intents)

unique_intents, num_intent_labels

({"'Schedule Appointment'",
  "'Schedule Meeting'",
  "'Set Alarm'",
  "'Set Reminder'",
  "'Set Timer'"},
 5)

In [11]:
one_dimensional_ner = [tag for subset in ner for tag in subset ]
unique_ner = set(one_dimensional_ner)
num_ner_labels = len(unique_ner)
unique_ner, num_ner_labels

({'B-DATE',
  'B-DUR',
  'B-TASK',
  'B-TIME',
  'I-DATE',
  'I-DUR',
  'I-TASK',
  'I-TIME',
  'O'},
 9)

In [10]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
ner_model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_ner_labels)
intent_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_intent_labels)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
label_ner_map = {'O': 0, 'B-DATE': 1, 'I-DATE': 2, 'B-TIME': 3, 'I-TIME': 4, 'B-TASK': 5, 'I-TASK': 6, 'B-DUR': 7, 'I-DUR': 8}

In [12]:
label_intent_map = {
    "'Schedule Appointment'": 0,
    "'Schedule Meeting'": 1,
    "'Set Alarm'": 2,
    "'Set Reminder'": 3,
    "'Set Timer'": 4
}

In [13]:
class dataset(Dataset):
    def __init__(self, text, intent, ner, tokenizer, max_len=128):
        self.len = len(text)
        self.text = text
        self.intent = intent
        self.ner = ner
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: get the sentence, ner label, and intent_label
        sentence = self.text[index].strip()
        intent_label = self.intent[index].strip()
        ner_labels = self.ner[index]

        # step 2: use tokenizer to encode a sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" which highlights where each token starts and ends
        encoding = self.tokenizer(
            sentence,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )

        # step 3: create ner token labels only for first word pieces of each tokenized word
        tokenized_ner_labels = [label_ner_map[label] for label in ner_labels]
        # create an empty array of -100 of length max_length
        encoded_ner_labels = np.ones(len(encoding['offset_mapping']), dtype=int) * -100

        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        prev = -1
        for idx, mapping in enumerate(encoding['offset_mapping']):
            if mapping[0] == mapping[1] == 0:
                continue
            if mapping[0] != prev:
                # overwrite label
                encoded_ner_labels[idx] = tokenized_ner_labels[i]
                prev = mapping[1]
                i += 1
            else:
                prev = mapping[1]

        # create intent token labels
        tokenized_intent_label = label_intent_map[intent_label]

        # step 4: turn everything into Pytorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['ner_labels'] = torch.as_tensor(encoded_ner_labels)
        item['intent_labels'] = torch.as_tensor(tokenized_intent_label)

        return item

    def __len__(self):
        return self.len

In [14]:
training_set = dataset(text, intent, ner, tokenizer)

In [15]:
training_set[5]

{'input_ids': tensor([ 101, 2275, 2019, 8598, 2005, 1021, 2572, 1012,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0

Let us verify that the input ids and corresponding targets are correct:

In [18]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[20]['input_ids']), training_set[20]['ner_labels']):
    print(f"{token} -- {label}")

[CLS] -- -100
schedule -- 0
a -- 0
dentist -- 5
appointment -- 6
for -- 0
april -- 1
5th -- 2
at -- 0
11 -- 3
: -- -100
00 -- -100
in -- 4
the -- 4
morning -- 4
. -- -100
[SEP] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PA

# I HAVE TO GET RID OF " IN MY DATASET

In [19]:
# The dataset is small, batch_size of 1 would not impact the training time significantly
training_loader = DataLoader(training_set, batch_size=1)

In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [28]:
device

'cuda'

In [29]:
ner_model.to(device)
intent_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

The initial loss of the model should be close to -ln(1/num_labels)=-ln(1/9). In this case it is 2.20.
Why? Because we are using cross entropy loss. The cross entropy loss is defined as -ln(probability score of the model for the correct class). In the beginning, the weights are random, so the probability distribution for all of the classes for a given token will be uniform, meaning that the probability for the correct class will be near 1/9. The loss for a given token will thus be -ln(1/9). As PyTorch's CrossEntropyLoss (which is used by BertForTokenClassification) uses mean reduction by default, it will compute the mean loss for each of the tokens in the sequence for which a label is provided.

In [30]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["ner_labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = ner_model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(2.1809, device='cuda:0', grad_fn=<NllLossBackward0>)

The shape of logits must be __[batch_size, sequence_length, num_labels]__

In [31]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 9])

# Training

In [32]:
optimizer = torch.optim.Adam([
    {'params': ner_model.parameters()},
    {'params': intent_model.parameters()}
], lr=1e-5)

# Just testing the models; delete it after finished

____

In [33]:
def train(epoch):
    tr_ner_loss, tr_ner_accuracy = 0, 0
    tr_intent_loss, tr_intent_accuracy = 0, 0
    nb_tr_steps = 0
    tr_ner_preds, tr_ner_labels = [], []
    tr_intent_labels, tr_intent_predictions = [], []
    ner_model.train()
    intent_model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        ner_labels = batch['ner_labels'].to(device, dtype=torch.long)
        intent_labels = batch['intent_labels'].to(device, dtype=torch.long)

        ner_logits = ner_model(input_ids=ids, attention_mask=mask, labels=ner_labels)

        # here we train an intent_model
        intent_logits = intent_model(input_ids=ids, attention_mask=mask, labels=intent_labels)

        ner_loss = ner_logits.loss
        intent_loss = intent_logits.loss

        comb_loss = ner_loss + intent_loss
        # till here

        tr_loss += ner_logits['loss'] + intent_logits['loss']
        nb_tr_steps += 1

        if idx % 1 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per {idx} training steps: {loss_step}")

        # compute training accuracy (FOR NER)
        flattened_ner_targets = ner_labels.view(-1) # shape (batch_size * seq_len)
        active_ner_logits = ner_logits.logits.view(-1, ner_model.num_labels) # shape (batch_size*seq_len, num_labels)
        flattened_ner_predictions = torch.argmax(active_ner_logits, axis=1) # shape (batch_size * seq_len)

        # compute accuracy only at active labels
        active_ner_accuracy = ner_labels.view(-1) != -100 # shape (batch_size, seq_len)
        ac_ner_labels = torch.masked_select(flattened_ner_targets, active_ner_accuracy)
        ner_predictions = torch.masked_select(flattened_ner_predictions, active_ner_accuracy)

        tr_ner_labels.extend(ac_ner_labels)
        tr_ner_preds.extend(ner_predictions)

        # compute accuracy for intent_model
        # I CAN MAKE THE CALCULATION MUCH EASIER
        # FIGURE IT OUT
        flattened_intent_targets = intent_labels.view(-1)
        active_intent_logits = intent_logits.logits.view(-1, intent_model.num_labels)
        flattened_intent_predictions = torch.argmax(active_intent_logits, axis=1)

        sample_intent_accuracy = intent_labels.view(-1)
        active_intent_accuracy = torch.ones_like(sample_intent_accuracy, dtype=torch.bool)

        ac_intent_labels = torch.masked_select(flattened_intent_targets, active_intent_accuracy)
        intent_predictions = torch.masked_select(flattened_intent_predictions, active_intent_accuracy)

        tr_intent_labels.extend(ac_intent_labels)
        tr_intent_predictions.extend(intent_predictions)

        tmp_tr_ner_accuracy = accuracy_score(ac_ner_labels.cpu().numpy(), ner_predictions.cpu().numpy())
        tmp_tr_intent_accuracy = accuracy_score(ac_intent_labels.cpu().numpy(), intent_predictions.cpu().numpy())
        tr_accuracy += tmp_tr_ner_accuracy + tmp_tr_intent_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=ner_model.parameters(), max_norm=10
        )

        torch.nn.utils.clip_grad_norm_(
            parameters=intent_model.parameters(), max_norm=10
        )

        # backward pass
        optimizer.zero_grad()
        comb_loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")


In [34]:
for epoch in range(3):
    print(f"Training epoch: {epoch+1}")
    print("----------------------------")
    train(epoch)

Training epoch: 1
----------------------------
Training loss per 0 training steps: 3.8046998977661133
Training loss per 1 training steps: 3.8781731128692627
Training loss per 2 training steps: 3.715243101119995
Training loss per 3 training steps: 3.6687569618225098
Training loss per 4 training steps: 3.6605567932128906
Training loss per 5 training steps: 3.7009940147399902
Training loss per 6 training steps: 3.655125379562378
Training loss per 7 training steps: 3.6041133403778076
Training loss per 8 training steps: 3.6191465854644775
Training loss per 9 training steps: 3.5886037349700928
Training loss per 10 training steps: 3.5063629150390625
Training loss per 11 training steps: 3.4274165630340576
Training loss per 12 training steps: 3.422764778137207
Training loss per 13 training steps: 3.4241292476654053
Training loss per 14 training steps: 3.3743255138397217
Training loss per 15 training steps: 3.337383985519409
Training loss per 16 training steps: 3.3314597606658936
Training loss p

# Evaluating the model

In [None]:
def eval(epoch):
