In [1]:
# %pip install transformers datasets torch scikit-learn
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import f1_score
from datasets import load_dataset

In [2]:
# Step 1: Download dataset from HuggingFace
def download_dataset():
    """Fetch the ``pfb30/multi_woz_v22`` dataset from the Hugging Face Hub.

    Returns:
        The object produced by ``load_dataset`` for that repository id.
    """
    # trust_remote_code=True is passed through unchanged; it is required for
    # Hub datasets that ship their own loading script.
    return load_dataset("pfb30/multi_woz_v22", trust_remote_code=True)


In [3]:

# Step 2: Data preprocessing for (X, y)
class MultiWozDataset(Dataset):
    """Turn-level classification dataset built from MultiWOZ-style dialogues.

    Every turn that carries a non-empty ``'dialogue_acts'`` entry contributes
    one example: the turn text is the model input and the label is the string
    ``"<act>|<slot>=<value>|..."`` built from the turn's first act. Label
    strings are mapped to integer class ids so that default ``DataLoader``
    collation produces a tensor (strings would be collated into a plain list,
    which cannot be moved to a device or fed to a loss).

    NOTE(review): the turn schema used here (``'text'``, ``'dialogue_acts'``,
    ``[0]['slots']`` as an iterable of ``(slot, value)`` pairs) is carried over
    from the original code and has not been checked against the downloaded
    dataset -- confirm before training.

    Args:
        dialogues: Either an iterable of dialogue dicts, each with a
            ``'turns'`` entry, or a column-oriented dict
            (``{'turns': [...], ...}``) such as ``Dataset.to_dict()`` returns.
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is padded/truncated to.
        label2id: Optional existing ``{label string: class id}`` mapping.
            Pass the training split's ``label2id`` when building the
            validation/test splits so that ids agree across splits. Labels
            missing from the mapping are appended with new ids.
    """

    def __init__(self, dialogues, tokenizer, max_length=128, label2id=None):
        self.inputs = []
        # Human-readable label string of each example, parallel to ``inputs``.
        self.label_names = []

        for dialogue in self._as_records(dialogues):
            for turn in self._as_records(dialogue['turns']):
                if 'dialogue_acts' not in turn:
                    continue
                acts = turn['dialogue_acts']
                if not acts:
                    # No act to derive a label from (empty dict / null).
                    continue
                act = next(iter(acts))  # First act type
                entries = acts[act]
                # An act without any entry is labelled with an empty slot list.
                slot_pairs = entries[0]['slots'] if entries else []
                slots = [f"{k}={v}" for k, v in slot_pairs]
                self.inputs.append(turn['text'])
                self.label_names.append(f"{act}|{'|'.join(slots)}")

        # Copy so the caller's mapping is never mutated.
        self.label2id = dict(label2id) if label2id is not None else {}
        next_id = max(self.label2id.values(), default=-1) + 1
        # Sorted order makes the ids reproducible for identical data.
        for name in sorted(set(self.label_names)):
            if name not in self.label2id:
                self.label2id[name] = next_id
                next_id += 1

        # Integer class id of each example, parallel to ``inputs``.
        self.labels = [self.label2id[name] for name in self.label_names]

        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _as_records(table):
        """Turn a column-oriented dict into a list of row dicts; pass anything else through."""
        if isinstance(table, dict):
            columns = list(table)
            return [dict(zip(columns, row)) for row in zip(*table.values())]
        return table

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the tokenized example ``idx`` together with its class id."""
        encoded = self.tokenizer(
            self.inputs[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        return {
            # squeeze(0) drops the leading batch dimension of the encoding so
            # the DataLoader can stack examples itself.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [4]:
# Step 3: Model and training loop
def train_model(train_loader, val_loader, num_labels, device='cuda', num_epochs=3, lr=5e-5):
    """Fine-tune ``bert-base-uncased`` as a sequence classifier.

    After every epoch the model is evaluated on ``val_loader`` and the mean
    training loss, mean validation loss and weighted F1 score are printed.

    Args:
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries; each entry must be a
            tensor because it is moved with ``.to(device)``.
        val_loader: Same batch layout, used for evaluation only.
        num_labels: Size of the classification head; every label id must be
            smaller than this value.
        device: Device the model and batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to ``AdamW``.

    Returns:
        The fine-tuned model (left in eval mode by the last validation pass).
    """
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Passing labels makes the model compute the loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print(f"Epoch {epoch + 1}: Train Loss = {train_loss / max(len(train_loader), 1):.4f}")

        # Validation
        model.eval()
        val_loss = 0.0
        predictions, true_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                val_loss += criterion(logits, labels).item()

                preds = torch.argmax(logits, dim=1).cpu().tolist()
                predictions.extend(preds)
                true_labels.extend(labels.cpu().tolist())

        # The validation loss was previously accumulated but never shown.
        print(f"Epoch {epoch + 1}: Validation Loss = {val_loss / max(len(val_loader), 1):.4f}")
        f1 = f1_score(true_labels, predictions, average='weighted')
        print(f"Validation F1 Score: {f1:.4f}")

    return model

In [5]:
# Step 4: Testing
def test_model(model, test_loader, device='cuda', output_path='predictions.json'):
    """Evaluate ``model`` on ``test_loader`` and report the weighted F1 score.

    Predictions and true labels are written to ``output_path`` as JSON
    (keys ``'predictions'`` and ``'true_labels'``) so metrics can be recomputed
    later with ``json.load`` without re-running the model.

    Args:
        model: Classifier whose output exposes ``.logits``.
        test_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the model and batches are moved to.
        output_path: Destination of the JSON dump.

    Returns:
        The weighted F1 score over the whole loader.
    """
    # The batches are moved to `device` below, so the model has to live there too.
    model.to(device)
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().tolist()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().tolist())

    # Save predictions, true labels
    with open(output_path, 'w') as f:
        json.dump({'predictions': predictions, 'true_labels': true_labels}, f)

    # Calculate F1 score
    f1 = f1_score(true_labels, predictions, average='weighted')
    print(f"Test F1 Score: {f1:.4f}")
    return f1

In [6]:
# import json
# import torch
# from torch.utils.data import DataLoader, Dataset
# from transformers import BertTokenizer, BertForSequenceClassification

# # Step 1: Download dataset from HuggingFace
# def download_dataset():
#     from datasets import load_dataset
#     dataset = load_dataset("pfb30/multi_woz_v22", trust_remote_code=True)
#     return dataset

# # Step 2: Custom Dataset Class for MultiWoz
# class MultiWozDataset(Dataset):
#     def __init__(self, data, tokenizer, max_len=128):
#         self.data = data
#         self.tokenizer = tokenizer
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.data["utterance"])

#     def __getitem__(self, idx):
#         utterance = self.data["utterance"][idx]
#         labels = self.data["label"][idx]

#         # Tokenize input
#         inputs = self.tokenizer(
#             utterance, 
#             max_length=self.max_len, 
#             padding="max_length", 
#             truncation=True, 
#             return_tensors="pt"
#         )
#         inputs["labels"] = torch.tensor(labels, dtype=torch.long)
#         return {key: val.squeeze(0) for key, val in inputs.items()}

# # Step 3: Training Function
# def train_model(train_loader, val_loader, num_labels, device):
#     model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
#     model.to(device)

#     optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
#     loss_fn = torch.nn.CrossEntropyLoss()

#     num_epochs = 3
#     for epoch in range(num_epochs):
#         model.train()
#         train_loss = 0.0
#         for batch in train_loader:
#             inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
#             labels = batch["labels"].to(device)

#             optimizer.zero_grad()
#             outputs = model(**inputs)
#             loss = loss_fn(outputs.logits, labels)
#             loss.backward()
#             optimizer.step()
#             train_loss += loss.item()

#         print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader)}")

#     # Validation Step
#     model.eval()
#     val_loss = 0.0
#     with torch.no_grad():
#         for batch in val_loader:
#             inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
#             labels = batch["labels"].to(device)
#             outputs = model(**inputs)
#             loss = loss_fn(outputs.logits, labels)
#             val_loss += loss.item()

#     print(f"Validation Loss: {val_loss/len(val_loader)}")
#     return model

# # Step 4: Testing Function
# def test_model(model, test_loader, device):
#     from sklearn.metrics import f1_score
#     model.to(device)
#     model.eval()

#     all_preds, all_labels = [], []
#     with torch.no_grad():
#         for batch in test_loader:
#             inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
#             labels = batch["labels"].to(device)
#             outputs = model(**inputs)
#             preds = torch.argmax(outputs.logits, dim=-1)
#             all_preds.extend(preds.cpu().numpy())
#             all_labels.extend(labels.cpu().numpy())

#     f1 = f1_score(all_labels, all_preds, average='macro')
#     print(f"F1 Score: {f1}")

# Main Functions
def main1():
    """Stage 1: download the dataset and cache every split in one JSON file.

    Each split is stored as the result of its ``to_dict()`` call under the
    split's name in ``dataset_multiwoz.json``.
    """
    print("Download dataset")
    downloaded = download_dataset()

    print("Save dataset to disk")
    serializable = {}
    for split_name in downloaded:
        serializable[split_name] = downloaded[split_name].to_dict()

    with open('dataset_multiwoz.json', 'w') as handle:
        json.dump(serializable, handle)

def main2():
    """Stage 2: turn the cached JSON splits into ``MultiWozDataset`` objects.

    Reads ``dataset_multiwoz.json``, builds one dataset per split with the
    ``bert-base-uncased`` tokenizer and saves each with ``torch.save``.
    """
    print("Load dataset from disk")
    with open('dataset_multiwoz.json', 'r') as handle:
        raw_splits = json.load(handle)

    print("Tokenizer")
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    print("Create train, val, test data")
    # (split key in the JSON file, file the built dataset is saved to)
    targets = (
        ('train', 'train_data.pt'),
        ('validation', 'val_data.pt'),
        ('test', 'test_data.pt'),
    )
    built = [
        (MultiWozDataset(raw_splits[split_name], bert_tokenizer), out_path)
        for split_name, out_path in targets
    ]

    print("Save train, val, test data to disk")
    for split_dataset, out_path in built:
        torch.save(split_dataset, out_path)

def main3():
    """Stage 3: wrap the saved datasets in ``DataLoader`` objects and save them.

    Reads ``train_data.pt``, ``val_data.pt`` and ``test_data.pt`` and writes
    ``train_loader.pt``, ``val_loader.pt`` and ``test_loader.pt``. Only the
    training loader shuffles.
    """
    print("Load train, val, test data from disk")
    # These files are full pickles of Python objects, not tensors/state dicts.
    # PyTorch >= 2.6 defaults to weights_only=True, which refuses to unpickle
    # them, so the flag is set explicitly.
    # SECURITY: unpickling executes code -- only load files you created yourself.
    train_data = torch.load('train_data.pt', weights_only=False)
    val_data = torch.load('val_data.pt', weights_only=False)
    test_data = torch.load('test_data.pt', weights_only=False)

    print("Create data loaders")
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=32)
    test_loader = DataLoader(test_data, batch_size=32)

    print("Save data loaders to disk")
    torch.save(train_loader, 'train_loader.pt')
    torch.save(val_loader, 'val_loader.pt')
    torch.save(test_loader, 'test_loader.pt')

def main4():
    """Stage 4: train on CPU with the saved loaders and save the model.

    Reads ``train_loader.pt`` and ``val_loader.pt`` and writes the trained
    model object to ``model.pt``.
    """
    print("Load data loaders from disk")
    # The loader files are full pickles; PyTorch >= 2.6 defaults to
    # weights_only=True and would refuse them.
    # SECURITY: unpickling executes code -- only load files you created yourself.
    train_loader = torch.load('train_loader.pt', weights_only=False)
    val_loader = torch.load('val_loader.pt', weights_only=False)

    # Size the classification head from the labels that are actually present
    # instead of a fixed guess. Both loaders are counted because both are
    # passed through a loss inside train_model.
    # NOTE(review): if the labels are integer ids this assumes they are
    # numbered contiguously from 0 -- confirm.
    distinct_labels = set(train_loader.dataset.labels) | set(val_loader.dataset.labels)
    num_labels = len(distinct_labels)

    print("Train model")
    model = train_model(train_loader, val_loader, num_labels=num_labels, device='cpu')

    print("Save model to disk")
    torch.save(model, 'model.pt')

def main5():
    """Stage 5: load the trained model and the test loader, then evaluate on CPU."""
    print("Load model from disk")
    # model.pt holds the whole pickled model object, not a state dict, and
    # PyTorch >= 2.6 defaults to weights_only=True, which would refuse it.
    # SECURITY: unpickling executes code -- only load files you created yourself.
    model = torch.load('model.pt', weights_only=False)

    print("Load test data loader")
    test_loader = torch.load('test_loader.pt', weights_only=False)

    print("Test model")
    test_model(model, test_loader, device='cpu')


In [None]:
# Main
if __name__ == "__main__":
    # The pipeline is run one stage at a time; each stage reads the files the
    # previous one wrote. Uncomment an entry to enable that stage.
    stages = [
        # ("Main1", main1),
        ("Main2", main2),
        # ("Main3", main3),
        # ("Main4", main4),
        # ("Main5", main5),
    ]

    for title, stage in stages:
        print(title)
        stage()

    print("Done")


Main1
Download dataset
Save dataset to disk
Done


Main1
Download dataset
Save dataset to disk
Done

---

