In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
from transformers import BertTokenizerFast, BertForTokenClassification, BertForSequenceClassification
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import TextDataset

In [None]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0


In [None]:
import json

# Load the annotated examples from Google Drive.
# The following cell reads the 'text', 'intent' and 'entities' keys of every record.
path = "/content/drive/MyDrive/model/B_data.json"
# Decode explicitly as UTF-8 so the result does not depend on the platform's locale encoding.
with open(path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

In [None]:
# Split the loaded records into three parallel lists:
#   text   - raw sentence
#   intent - intent label string
#   ner    - whitespace-separated entity tags, split into a list per sentence
text = [record['text'] for record in dataset]
intent = [record['intent'] for record in dataset]
ner = [record['entities'].split() for record in dataset]

In [None]:
# Inspect the NER tags of the first example.
o = ner[0]
o

['O', 'O', 'O', 'O', 'B-DUR', 'I-DUR']

In [None]:
# Inspect the whitespace-split words of the first example's text.
o = text[0].strip().split()
o

['"Set', 'a', 'timer', 'for', '10', 'minutes."']

In [None]:
# Distinct intent strings found in the data; their count sizes the intent classifier head.
unique_intents = {label for label in intent}
num_intent_labels = len(unique_intents)

(unique_intents, num_intent_labels)

({"'Schedule Appointment'",
  "'Schedule Meeting'",
  "'Set Alarm'",
  "'Set Reminder'",
  "'Set Timer'"},
 5)

In [None]:
# Flatten the per-sentence tag lists into one list, then collect the distinct tags.
one_dimensional_ner = []
for sentence_tags in ner:
    one_dimensional_ner.extend(sentence_tags)

unique_ner = set(one_dimensional_ner)
num_ner_labels = len(unique_ner)
(unique_ner, num_ner_labels)

({'B-DATE',
  'B-DUR',
  'B-TASK',
  'B-TIME',
  'I-DATE',
  'I-DUR',
  'I-TASK',
  'I-TIME',
  'O'},
 9)

In [None]:
# The tokenizer and both task models are created from the same pretrained checkpoint.
# Each model gets a classification head sized to its own label count.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_CHECKPOINT)
ner_model = BertForTokenClassification.from_pretrained(
    PRETRAINED_CHECKPOINT, num_labels=num_ner_labels
)
intent_model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT, num_labels=num_intent_labels
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# NER tag -> class id. 'O' is 0; each entity type then takes a consecutive B-/I- pair of ids.
label_ner_map = {
    tag: class_id
    for class_id, tag in enumerate(
        ['O', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-TASK', 'I-TASK', 'B-DUR', 'I-DUR']
    )
}

In [None]:
# Intent string -> class id. The keys keep the single quotes that surround
# the intent strings in the data (see the unique_intents output).
label_intent_map = dict(zip(
    (
        "'Schedule Appointment'",
        "'Schedule Meeting'",
        "'Set Alarm'",
        "'Set Reminder'",
        "'Set Timer'",
    ),
    range(5),
))

In [None]:
class dataset(Dataset):
    """Torch dataset yielding tokenized sentences with NER and intent targets.

    Each item is a dict with every tokenizer output converted to a tensor, plus:
      * ``ner_labels``: one id per token position; ``-100`` at special/padding
        tokens and at every token that continues a word, the word's tag id at
        the first token of each whitespace-delimited word.
      * ``intent_labels``: the class id of the sentence's intent.

    NOTE(review): this class name rebinds the module-level name ``dataset``
    that the JSON-loading cell assigns, so the cell that iterates over the
    loaded records cannot be re-run after this one without reloading the data.
    """

    def __init__(self, text, intent, ner, tokenizer, max_len=128,
                 ner_label_map=None, intent_label_map=None):
        """Store the parallel example lists and the encoding configuration.

        Args:
            text: sequence of raw sentences.
            intent: sequence of intent label strings, parallel to ``text``.
            ner: sequence of per-word NER tag lists, parallel to ``text``.
            tokenizer: callable tokenizer that supports
                ``return_offsets_mapping=True``.
            max_len: length every sentence is padded / truncated to.
            ner_label_map: optional tag -> id mapping; when omitted the
                module-level ``label_ner_map`` is used.
            intent_label_map: optional intent -> id mapping; when omitted the
                module-level ``label_intent_map`` is used.
        """
        self.len = len(text)
        self.text = text
        self.intent = intent
        self.ner = ner
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Fall back to the notebook-level maps so existing call sites keep working.
        self.ner_label_map = label_ner_map if ner_label_map is None else ner_label_map
        self.intent_label_map = (
            label_intent_map if intent_label_map is None else intent_label_map
        )

    def __getitem__(self, index):
        """Encode example ``index`` and attach its NER / intent targets.

        Raises:
            IndexError: if ``index`` is out of range, or if the sentence
                contains more whitespace-delimited words than NER tags.
            KeyError: if a tag or the intent string is missing from its map.
        """
        # step 1: get the sentence, ner labels, and intent label
        sentence = self.text[index].strip()
        intent_label = self.intent[index].strip()
        word_label_ids = [self.ner_label_map[tag] for tag in self.ner[index]]

        # step 2: encode the sentence (padding / truncation up to max length);
        # the offset mapping gives the character span of every token
        encoding = self.tokenizer(
            sentence,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )

        # step 3: label only the first token of every word; everything else stays -100
        offsets = encoding['offset_mapping']
        encoded_ner_labels = np.full(len(offsets), -100, dtype=int)

        word_idx = 0
        prev_end = -1
        for pos, (start, end) in enumerate(offsets):
            # (0, 0) spans are skipped and left at -100 (e.g. [CLS] / [SEP] / [PAD])
            if start == 0 and end == 0:
                continue
            # A token that does not start where the previous one ended is
            # separated from it by whitespace, i.e. it begins a new word.
            if start != prev_end:
                if word_idx >= len(word_label_ids):
                    raise IndexError(
                        f"example {index}: text has more whitespace-delimited words "
                        f"than NER labels ({len(word_label_ids)})"
                    )
                encoded_ner_labels[pos] = word_label_ids[word_idx]
                word_idx += 1
            prev_end = end

        # intent target
        tokenized_intent_label = self.intent_label_map[intent_label]

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['ner_labels'] = torch.as_tensor(encoded_ner_labels)
        item['intent_labels'] = torch.as_tensor(tokenized_intent_label)

        return item

    def __len__(self):
        """Return the number of examples (length of ``text`` at construction)."""
        return self.len

In [None]:
# Wrap the parallel text / intent / NER lists in the torch dataset (max_len left at its default).
training_set = dataset(text, intent, ner, tokenizer)

In [None]:
# Display the first encoded example.
training_set[0]

{'input_ids': tensor([  101,  1000,  2275,  1037, 25309,  2005,  2184,  2781,  1012,  1000,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

Let us verify that the input ids and corresponding targets are correct:

In [None]:
# Print every token of the first example next to the NER target assigned to it.
first_item = training_set[0]
first_tokens = tokenizer.convert_ids_to_tokens(first_item['input_ids'])
for token, label in zip(first_tokens, first_item['ner_labels']):
    print(f"{token} -- {label}")

[CLS] -- -100
" -- 0
set -- -100
a -- 0
timer -- 0
for -- 0
10 -- 7
minutes -- 8
. -- -100
" -- -100
[SEP] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PAD] -- -100
[PA

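# TODO: remove the stray `"` characters from the dataset (the opening quote is tokenized separately and currently receives the first word's label)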

In [None]:
# The dataset is small, batch_size of 1 would not impact the training time significantly
# No `shuffle` argument is passed, so examples are served in dataset order (the DataLoader default).
training_loader = DataLoader(training_set, batch_size=1)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The later cells move every input tensor to `device`, so both models must live
# there as well; otherwise the forward pass fails with a device mismatch on GPU.
# This is done here, before the optimizer is constructed over their parameters.
ner_model.to(device)
intent_model.to(device);

The initial loss of the model should be close to $-\ln(1/\text{num\_labels}) = -\ln(1/9) \approx 2.20$.
Why? Because we are using cross-entropy loss, which is defined as $-\ln$ of the probability the model assigns to the correct class. In the beginning, the weights of the classification head are random, so the probability distribution over the classes for a given token will be roughly uniform, meaning that the probability for the correct class will be near 1/9. The loss for a given token will thus be about $-\ln(1/9)$. As PyTorch's CrossEntropyLoss (which is used by BertForTokenClassification) uses mean reduction by default, the reported loss is the mean over the tokens in the sequence for which a label is provided.

In [None]:
# Sanity check: loss of the not-yet-trained NER model on a single example.
inputs = training_set[2]

# Add a leading batch dimension of size 1 and place each tensor on the target device.
input_ids = inputs["input_ids"].unsqueeze(0).to(device)
attention_mask = inputs["attention_mask"].unsqueeze(0).to(device)
labels = inputs["ner_labels"].unsqueeze(0).to(device)

# Passing labels makes the model return the loss as the first output field.
outputs = ner_model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs.loss
initial_loss

tensor(2.3805, grad_fn=<NllLossBackward0>)

The shape of logits must be __[batch_size, sequence_length, num_labels]__

In [None]:
# The logits are the second field of the output when a loss is present.
tr_logits = outputs.logits
tr_logits.shape

torch.Size([1, 128, 9])

# Training

In [None]:
# One Adam optimizer over both models, one parameter group per model, shared learning rate.
optimizer = torch.optim.Adam(
    [{'params': net.parameters()} for net in (ner_model, intent_model)],
    lr=1e-5,
)

# Just testing the models; delete this section once finished

In [None]:
# Scratch pass over the training data: runs both models on every batch,
# reports the running loss, tracks NER and intent accuracy, and takes one
# optimizer step per batch on the sum of the two losses.
tr_loss, tr_accuracy, tr_intent_accuracy = 0.0, 0.0, 0.0
nb_tr_steps = 0
nb_ner_acc_steps = 0  # batches that had at least one labelled token
tr_ner_preds, tr_ner_labels = [], []
tr_intent_predictions, tr_intent_labels = [], []
ner_model.train()
intent_model.train()

for idx, batch in enumerate(training_loader):
    ids = batch['input_ids'].to(device, dtype=torch.long)
    mask = batch['attention_mask'].to(device, dtype=torch.long)
    ner_labels = batch['ner_labels'].to(device, dtype=torch.long)
    intent_labels = batch['intent_labels'].to(device, dtype=torch.long)

    # The models return output objects; loss and logits are read from them by key.
    ner_outputs = ner_model(input_ids=ids, attention_mask=mask, labels=ner_labels)
    intent_outputs = intent_model(input_ids=ids, attention_mask=mask, labels=intent_labels)
    ner_loss, intent_loss = ner_outputs['loss'], intent_outputs['loss']
    print(ner_loss, intent_loss)

    # .item() keeps the running total a plain float instead of a tensor
    # that would hold on to every batch's autograd graph.
    tr_loss += ner_loss.item() + intent_loss.item()
    nb_tr_steps += 1

    if idx % 50 == 0:
        loss_step = tr_loss / nb_tr_steps
        print(f"Training loss per {idx} training steps: {loss_step}")

    # compute training accuracy (FOR NER)
    flattened_ner_targets = ner_labels.view(-1)  # shape (batch_size * seq_len)
    active_ner_logits = ner_outputs['logits'].detach().view(-1, ner_model.num_labels)  # shape (batch_size * seq_len, num_labels)
    flattened_ner_predictions = torch.argmax(active_ner_logits, dim=1)  # shape (batch_size * seq_len)

    # compute accuracy only at active labels (positions whose target is not -100)
    active_ner_accuracy = flattened_ner_targets != -100  # boolean mask, shape (batch_size * seq_len)

    ac_ner_labels = torch.masked_select(flattened_ner_targets, active_ner_accuracy)
    ner_predictions = torch.masked_select(flattened_ner_predictions, active_ner_accuracy)

    tr_ner_labels.extend(ac_ner_labels.tolist())
    tr_ner_preds.extend(ner_predictions.tolist())

    # A batch without any labelled token has no defined accuracy; skip it.
    if ac_ner_labels.numel() > 0:
        tr_accuracy += (ner_predictions == ac_ner_labels).float().mean().item()
        nb_ner_acc_steps += 1

    # compute accuracy for intent_model: every example carries exactly one
    # intent label, so no masking is needed here
    flattened_intent_targets = intent_labels.view(-1)
    active_intent_logits = intent_outputs['logits'].detach().view(-1, intent_model.num_labels)
    intent_predictions = torch.argmax(active_intent_logits, dim=1)

    tr_intent_labels.extend(flattened_intent_targets.tolist())
    tr_intent_predictions.extend(intent_predictions.tolist())
    tr_intent_accuracy += (intent_predictions == flattened_intent_targets).float().mean().item()

    # backward pass on the joint loss so that both models receive gradients
    optimizer.zero_grad()
    (ner_loss + intent_loss).backward()

    # gradient clipping must run after backward() and before step() to have any effect
    torch.nn.utils.clip_grad_norm_(parameters=ner_model.parameters(), max_norm=10)
    torch.nn.utils.clip_grad_norm_(parameters=intent_model.parameters(), max_norm=10)

    optimizer.step()

# max(..., 1) avoids a ZeroDivisionError when the loader yields no batches
epoch_loss = tr_loss / max(nb_tr_steps, 1)
tr_accuracy = tr_accuracy / max(nb_ner_acc_steps, 1)
tr_intent_accuracy = tr_intent_accuracy / max(nb_tr_steps, 1)
print(f"Training loss epoch: {epoch_loss}")
print(f"Training accuracy epoch: {tr_accuracy}")
print(f"Training intent accuracy epoch: {tr_intent_accuracy}")

tensor(2.0202, grad_fn=<NllLossBackward0>) tensor(1.0957, grad_fn=<NllLossBackward0>)


AttributeError: ignored

____

In [None]:
def train(epoch):
    """Run one training pass over ``training_loader`` for both models.

    For every batch the NER model and the intent model are run on the same
    inputs, their losses are summed and back-propagated together, the
    gradients of each model are clipped to a max norm of 10, and the shared
    ``optimizer`` takes a step. The running mean loss is printed every 50
    steps; the mean loss and the mean per-batch NER token accuracy are printed
    at the end.

    Uses the module-level ``ner_model``, ``intent_model``, ``training_loader``,
    ``optimizer`` and ``device``.
    NOTE(review): assumes both models already live on ``device`` — confirm.

    Args:
        epoch: epoch index supplied by the caller; not used inside the body.

    Returns:
        None. Results are reported via ``print``.
    """
    tr_loss, tr_accuracy = 0.0, 0.0
    nb_tr_steps = 0
    nb_acc_steps = 0  # batches that had at least one labelled token
    ner_model.train()
    intent_model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        ner_labels = batch['ner_labels'].to(device, dtype=torch.long)
        intent_labels = batch['intent_labels'].to(device, dtype=torch.long)

        # Read loss / logits by attribute: the models return dict-like output
        # objects, and tuple-unpacking those yields their keys, not the tensors.
        ner_outputs = ner_model(input_ids=ids, attention_mask=mask, labels=ner_labels)
        intent_outputs = intent_model(input_ids=ids, attention_mask=mask, labels=intent_labels)
        ner_loss, ner_logits = ner_outputs.loss, ner_outputs.logits
        intent_loss = intent_outputs.loss

        tr_loss += ner_loss.item() + intent_loss.item()
        nb_tr_steps += 1

        if idx % 50 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per {idx} training steps: {loss_step}")

        # compute training accuracy (FOR NER)
        flattened_ner_targets = ner_labels.view(-1)  # shape (batch_size * seq_len)
        active_ner_logits = ner_logits.detach().view(-1, ner_model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_ner_predictions = torch.argmax(active_ner_logits, dim=1)  # shape (batch_size * seq_len)

        # compute accuracy only at active labels (positions whose target is not -100)
        active_ner_accuracy = flattened_ner_targets != -100  # boolean mask, shape (batch_size * seq_len)

        labels = torch.masked_select(flattened_ner_targets, active_ner_accuracy)
        predictions = torch.masked_select(flattened_ner_predictions, active_ner_accuracy)

        # Fraction of labelled tokens predicted correctly, computed with torch
        # only. A batch without labelled tokens has no defined accuracy and is skipped.
        if labels.numel() > 0:
            tr_accuracy += (predictions == labels).float().mean().item()
            nb_acc_steps += 1

        # backward pass on the joint loss so that both models receive gradients
        # (the optimizer holds the parameters of both)
        optimizer.zero_grad()
        (ner_loss + intent_loss).backward()

        # gradient clipping must run after backward() and before step() to have any effect
        torch.nn.utils.clip_grad_norm_(
            parameters=ner_model.parameters(), max_norm=10
        )
        torch.nn.utils.clip_grad_norm_(
            parameters=intent_model.parameters(), max_norm=10
        )

        optimizer.step()

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches
    epoch_loss = tr_loss / max(nb_tr_steps, 1)
    tr_accuracy = tr_accuracy / max(nb_acc_steps, 1)
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
