In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [2]:
from transformers import BertTokenizer, BertForTokenClassification, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import TextDataset

In [None]:
!pip install python-docx

In [23]:
import json
# Read the JSON file
path = "/content/drive/MyDrive/model/B_data.json"
with open(path, 'r') as f:
    dataset = json.load(f)

# Assuming the dataset is a list of dictionaries

# Define a custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Create an instance of CustomDataset
custom_dataset = CustomDataset(dataset)

# Create a DataLoader
batch_size = 1
train_dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=False)

In [24]:
key = []
for sample in train_dataloader:
    key.append(sample['entities'])

In [25]:
entities = []
for substr in key:
    tokens = substr[0].split()
    entities.extend(tokens)

In [None]:
entities

In [27]:
key = set(entities)
print(key)
print(len(key))
num_ner_labels = len(key)

{'I-TASK', 'O', 'B-TIME', 'B-DATE', 'B-TASK', 'B-DUR', 'I-TIME', 'I-DUR', 'I-DATE'}
9


In [28]:
l = []
for sample in train_dataloader:
    l.extend(sample['intent'])

In [29]:
l = set(l)
print(l)
print(len(l))
num_intent_labels=len(l)

{"'Set Timer'", "'Set Alarm'", "'Set Reminder'", "'Schedule Meeting'", "'Schedule Appointment'"}
5


In [None]:
# Load the BERT tokenizer and models for token classification and sequence classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ner_model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_ner_labels)
intent_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_intent_labels)

# Tokenizing the dataset

Maybe I should try to set trincation and padding to False.

In [31]:
tokenized_inputs = []

max_sequence_length = 0  # Initialize maximum sequence length

for batch in train_dataloader:
    text = batch['text']
    batch_tokenized = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    input_ids = batch_tokenized['input_ids']
    attention_mask = batch_tokenized['attention_mask']
    tokenized_inputs.append((input_ids, attention_mask))

    # Update maximum sequence length
    max_sequence_length = max(max_sequence_length, input_ids.shape[1])

# Pad sequences to the maximum sequence length
for i in range(len(tokenized_inputs)):
    input_ids, attention_mask = tokenized_inputs[i]
    pad_length = max_sequence_length - input_ids.shape[1]
    padded_input_ids = torch.nn.functional.pad(input_ids, (0, pad_length), value=tokenizer.pad_token_id)
    padded_attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=0)  # Assuming 0 for padding mask
    tokenized_inputs[i] = (padded_input_ids, padded_attention_mask)

# Concatenate input IDs and attention masks separately
input_ids = torch.cat([tensor[0] for tensor in tokenized_inputs], dim=0)
attention_mask = torch.cat([tensor[1] for tensor in tokenized_inputs], dim=0)



In [None]:
input_ids

In [35]:
# task -> B-TASK
# date -> B-DATE
# duration -> B-DUR
# days -> B_DAY
# time -> B-TIME
# specific_time
encoded_ner_labels = []
label_ner_map = {'O': 0, 'B-DATE': 1, 'I-DATE': 2, 'B-TIME': 3, 'I-TIME': 4, 'B-TASK': 5, 'I-TASK': 6, 'B-DUR': 7, 'I-DUR': 8}
for sample in train_dataloader:
    encoded_ner_labels.extend([label_ner_map[label] for label in sent_label.split()] for sent_label in sample['entities'])
encoded_ner_labels

[[0, 0, 0, 0, 7, 8],
 [0, 0, 0, 0, 0, 0, 3, 4, 1],
 [0, 0, 0, 0, 1, 2, 0, 3, 4],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 3, 4],
 [0, 0, 0, 0, 3, 4],
 [0, 0, 0, 5, 6, 0, 7, 8],
 [0, 0, 0, 0, 1, 2, 3],
 [0, 0, 0, 0, 0, 0, 5, 0, 7, 8],
 [0, 0, 0, 0, 5, 6, 0, 3, 4, 0, 1],
 [0, 0, 0, 0, 0, 1, 2, 0, 3, 4],
 [0, 0, 0, 0, 0, 3, 5],
 [0, 0, 0, 5, 6, 1, 3],
 [0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 3, 4]]

In [36]:
for i in train_dataloader:
    print(i['entities'])

['O O O O B-DUR I-DUR']
['O O O O O O B-TIME I-TIME B-DATE']
['O O O O B-DATE I-DATE O B-TIME I-TIME']
['O O O O O O O O O O B-DATE']
['O O O O O O O O B-DATE I-DATE I-DATE I-DATE O B-TIME I-TIME']
['O O O O B-TIME I-TIME']
['O O O B-TASK I-TASK O B-DUR I-DUR']
['O O O O B-DATE I-DATE B-TIME']
['O O O O O O B-TASK O B-DUR I-DUR']
['O O O O B-TASK I-TASK O B-TIME I-TIME O B-DATE']
['O O O O O B-DATE I-DATE O B-TIME I-TIME']
['O O O O O B-TIME B-TASK']
['O O O B-TASK I-TASK B-DATE B-TIME']
['O O O O O O B-DATE I-DATE I-DATE I-DATE I-DATE O B-TIME I-TIME']


In [37]:
from sklearn.preprocessing import LabelEncoder
sample_entities = []
for sample in train_dataloader:
    sample_entities.extend(sample['entities'])

# Flatten the list of strings into a single list of tokens
all_entities = [entity for sample in sample_entities for entity in sample.split()]

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder to all unique entity labels
label_encoder.fit(all_entities)

# Encode entities for each sample in sample_entities
encoded_entities = [
    label_encoder.transform(entity.split()).tolist() for entity in sample_entities
]

print(encoded_entities)


[[8, 8, 8, 8, 1, 5], [8, 8, 8, 8, 8, 8, 3, 7, 0], [8, 8, 8, 8, 0, 4, 8, 3, 7], [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0], [8, 8, 8, 8, 8, 8, 8, 8, 0, 4, 4, 4, 8, 3, 7], [8, 8, 8, 8, 3, 7], [8, 8, 8, 2, 6, 8, 1, 5], [8, 8, 8, 8, 0, 4, 3], [8, 8, 8, 8, 8, 8, 2, 8, 1, 5], [8, 8, 8, 8, 2, 6, 8, 3, 7, 8, 0], [8, 8, 8, 8, 8, 0, 4, 8, 3, 7], [8, 8, 8, 8, 8, 3, 2], [8, 8, 8, 2, 6, 0, 3], [8, 8, 8, 8, 8, 8, 0, 4, 4, 4, 4, 8, 3, 7]]


In [38]:
intent_labels = list(l)

encoded_intent_labels = label_encoder.fit_transform(intent_labels)

In [39]:
class TokenizedDataset(Dataset):
    def __init__(self, tokenized_inputs, encoded_ner_labels, encoded_intent_labels):
        self.tokenized_inputs = tokenized_inputs
        self.encoded_ner_labels = encoded_ner_labels
        self.encoded_intent_labels = encoded_intent_labels

    def __len__(self):
        return len(self.tokenized_inputs)

    def __getitem__(self, idx):
        inputs = self.tokenized_inputs[idx]
        ner_labels = self.encoded_ner_labels[idx]
        intent_labels = self.encoded_intent_labels[idx]

        return {
            'inputs': inputs,
            'ner_labels': ner_labels,
            'intent_labels': intent_labels
        }

In [40]:
cust_dataset = CustomDataset(tokenized_inputs, encoded_ner_labels, encoded_intent_labels)
data_loader = DataLoader(cust_dataset, batch_size=1, shuffle=True)

TypeError: ignored

In [None]:
# ner_model architecture
ner_model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
intent_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [41]:
for sample in train_dataloader:
    print(sample['entities'])

['O O O O B-DUR I-DUR']
['O O O O O O B-TIME I-TIME B-DATE']
['O O O O B-DATE I-DATE O B-TIME I-TIME']
['O O O O O O O O O O B-DATE']
['O O O O O O O O B-DATE I-DATE I-DATE I-DATE O B-TIME I-TIME']
['O O O O B-TIME I-TIME']
['O O O B-TASK I-TASK O B-DUR I-DUR']
['O O O O B-DATE I-DATE B-TIME']
['O O O O O O B-TASK O B-DUR I-DUR']
['O O O O B-TASK I-TASK O B-TIME I-TIME O B-DATE']
['O O O O O B-DATE I-DATE O B-TIME I-TIME']
['O O O O O B-TIME B-TASK']
['O O O B-TASK I-TASK B-DATE B-TIME']
['O O O O O O B-DATE I-DATE I-DATE I-DATE I-DATE O B-TIME I-TIME']


In [42]:
optimizer = torch.optim.Adam([
    {'params': ner_model.parameters()},
    {'params': intent_model.parameters()}
], lr=1e-5)  # Define optimizer for both models

num_epochs = 3  # Define number of epochs

for epoch in range(num_epochs):
    for batch in train_dataloader:  # Iterate over your dataset batches
        inputs = tokenizer(batch['text'], return_tensors="pt", padding=True, truncation=True)

        # Forward pass for intent classification
        outputs_intent = intent_model(**inputs)
        intent_labels = batch['intent']
        intent_loss = torch.nn.CrossEntropyLoss()(outputs_intent.logits, intent_labels)

        # Forward pass for NER
        outputs_ner = ner_model(**inputs)
        ner_labels = batch['entities']
        ner_loss = torch.nn.CrossEntropyLoss()(outputs_ner.logits.view(-1, num_ner_labels), ner_labels.view(-1))

        # Total loss (you might adjust weights for different tasks)
        total_loss = ner_loss + intent_loss

        # Backpropagation
        total_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

TypeError: ignored

In [None]:
optimizer = torch.optim.Adam([
    {'params': ner_model.parameters()},
    {'params': intent_model.parameters()}
], lr=1e-5)  # Define optimizer for both models

num_epochs = 3  # Define number of epochs

for epoch in range(num_epochs):
    for batch in train_dataloader:  # Iterate over your dataset batches
        inputs = tokenizer(batch['text'], return_tensors="pt", padding=True, truncation=True)

        # Forward pass for intent classification
        outputs_intent = intent_model(**inputs)
        intent_labels = batch['intent']
        intent_loss = torch.nn.CrossEntropyLoss()(outputs_intent.logits, intent_labels)

        # Forward pass for NER
        outputs_ner = ner_model(**inputs)
        ner_labels = batch['entities']
        ner_loss = torch.nn.CrossEntropyLoss()(outputs_ner.logits.view(-1, num_ner_labels), ner_labels.view(-1))

        # Total loss (you might adjust weights for different tasks)
        total_loss = ner_loss + intent_loss

        # Backpropagation
        total_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # No idea yet

Building the Multi-Task Architecture

In [None]:
class MultiTaskBertWrapper(nn.Module):
    def __init__(self, ner_model, intent_model):
        super().__init__()
        self.ner_model = ner_model
        self.intent_model = intent_model

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None, task_type=None):
        if task_type = 'ner':
            self.ner_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task_type = 'intent':
            self.intent_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError('Invalid task_type. Use "ner" or "intent"')

In [None]:
model = MultiTaskBertWrapper(ner_model, intent_model)

In [None]:
model

In [None]:
params = list(model.ner_model.parameters()) + list(model.intent_model.parameters())
optimizer = torch.optim.AdamW(params, lr=1e-5)

ner_loss_function = torch.nn.CrossEntropyLoss()
intent_loss_function = torch.nn.CrossEntropyLoss()

In [None]:
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        inputs = batch['text']
        attention_masks = batch['attention_mask']
        labels_ner = batch['entities']
        labels_intent = batch['intent']

        optimizer.zero_grad()

        # Forward pass for NER task
        ner_outputs = model.ner_model(input_ids=inputs, attention_mask=attention_masks, labels=labels_ner, task_type='ner')
        ner_loss = ner_loss_function(ner_outputs.logits.view(-1, num_ner_labels), labels_ner.view(-1))

        ner_loss.backward()

        # Forward pass for Intent Classification task
        intent_outputs = model.intent_model(input_ids=inputs, attention_mask=attention_masks, labels=labels_intent, task_type='intent')
        intent_loss = intent_loss_function(intent_outputs.logits.view(-1, num_intent_labels), labels_intent.view(-1))

        intent_loss.backward()

        optimizer.step()