In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [3]:
from transformers import BertTokenizer, BertForTokenClassification, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import TextDataset

In [None]:
!pip install python-docx

In [9]:
import json

# Read the structured dataset from the JSON file at `path`.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
path = "/content/drive/MyDrive/model/structured_data.json"
with open(path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Assuming the dataset is a list of dictionaries

# Define a custom Dataset class
class CustomDataset(Dataset):
    """Map-style dataset that exposes an in-memory indexable collection as-is."""

    def __init__(self, data):
        """Keep a reference to ``data``; the collection is not copied."""
        self.data = data

    def __len__(self):
        """Return how many samples the wrapped collection holds."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``, untransformed."""
        sample = self.data[idx]
        return sample

# Wrap the loaded records so PyTorch can index them
custom_dataset = CustomDataset(dataset)

# Serve one sample per batch, reshuffled on every pass over the data
batch_size = 1
train_dataloader = DataLoader(
    dataset=custom_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [33]:
# Gather the entity-type names attached to every sample (duplicates are kept)
key = []
for sample in train_dataloader:
    key += sample['entities'].keys()

In [36]:
# Collapse duplicates: the distinct entity-type names define the NER label count.
# NOTE(review): this counts entity types only; if tokens outside any entity need
# their own class, the label count must include it — confirm the labelling scheme.
key = set(key)
num_ner_labels = len(key)
print(key)
print(num_ner_labels)

{'date', 'time', 'days', 'specific_time', 'task', 'duration'}
6


In [25]:
# Gather the intent value of every sample (duplicates are kept)
l = []
for sample in train_dataloader:
    l += sample['intent']

In [28]:
# Collapse duplicates: the distinct intent values define the intent label count
l = set(l)
num_intent_labels = len(l)
print(l)
print(num_intent_labels)

{"'Schedule Meeting'", "'Set Alarm'", "'Schedule Appointment'", "'Set Reminder'", "'Set Timer'"}
5


In [None]:
# One pretrained checkpoint backs the tokenizer and both task-specific models:
# a token-level classifier for NER and a sequence-level classifier for intent.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
ner_model = BertForTokenClassification.from_pretrained(
    pretrained_name,
    num_labels=num_ner_labels,
)
intent_model = BertForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=num_intent_labels,
)

In [None]:
# from_pretrained() hands back models in evaluation mode (dropout disabled);
# switch both to training mode before fine-tuning.
ner_model.train()
intent_model.train()

# A single optimizer updates both models with a shared learning rate.
optimizer = torch.optim.Adam([
    {'params': ner_model.parameters()},
    {'params': intent_model.parameters()}
], lr=1e-5)

# The loss module holds no state, so build it once and reuse it for both tasks.
criterion = torch.nn.CrossEntropyLoss()

num_epochs = 3  # Number of full passes over the training data

for epoch in range(num_epochs):
    for batch in train_dataloader:  # Iterate over the dataset batches
        # Gradients live on the parameters, not the optimizer, so clear them
        # first: an interrupted earlier run of this cell may have left stale
        # values that would otherwise be accumulated into this step.
        optimizer.zero_grad()

        inputs = tokenizer(batch['text'], return_tensors="pt", padding=True, truncation=True)

        # Forward pass for NER: flatten per-token logits and labels for the loss.
        # NOTE(review): earlier cells only read 'text', 'entities' and 'intent'
        # from each sample — confirm the dataset really provides 'ner_labels'
        # (one class id per token) and 'intent_labels' as tensors.
        outputs_ner = ner_model(**inputs)
        ner_labels = batch['ner_labels']
        ner_loss = criterion(outputs_ner.logits.view(-1, num_ner_labels), ner_labels.view(-1))

        # Forward pass for intent classification: one prediction per sequence.
        outputs_intent = intent_model(**inputs)
        intent_labels = batch['intent_labels']
        intent_loss = criterion(outputs_intent.logits, intent_labels)

        # Total loss (unweighted sum; adjust weights to rebalance the tasks)
        total_loss = ner_loss + intent_loss

        # Backpropagation and parameter update
        total_loss.backward()
        optimizer.step()