In [187]:
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForTokenClassification, RobertaTokenizerFast
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset


In [188]:
# Step 1: Load RoBERTa-base model and tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
model = RobertaForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=13  # number of NER labels
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [189]:

# Step 2: Define a function to preprocess the dataset
def preprocess_data(examples):
    """
    Tokenize input text and align labels with tokens.
    Handles subwords by assigning -100 to non-aligned tokens.
    """
    print(examples["label_list"]["label_list"])
    tokenized_inputs = tokenizer(examples["train"]["train.SRC"], truncation=True, padding=True, is_split_into_words=True)
    print(tokenized_inputs)
    labels = []
    for i, label in enumerate(examples["label_list"]["label_list"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [190]:
# Step 3: Load the pizza order dataset
data_path = "./test.json"  
try:
    data = load_dataset('json', data_files=data_path)
except Exception as e:
    raise ValueError(f"Failed to load dataset from {data_path}: {e}")

print(data)
print(data['train'])
print(data['train']['train.SRC'])


DatasetDict({
    train: Dataset({
        features: ['train.SRC', 'train.EXR', 'train.TOP', 'train.TOP-DECOUPLED'],
        num_rows: 3
    })
})
Dataset({
    features: ['train.SRC', 'train.EXR', 'train.TOP', 'train.TOP-DECOUPLED'],
    num_rows: 3
})
["i'd like a pizza with carrots barbecue pulled pork and cheeseburger without thin crust", "i'd like a pizza with banana pepper grilled chicken and white onions without thin crust", 'i want one regular pizza without any fried onions']


In [191]:
# Step 4: Define label mapping
labels = []
with open("unique_labels.txt", 'r') as file:
    labels = file.read()
labels = labels[:-1]


label_list = labels.split("\n")
print(label_list)

num_labels = len(label_list)
print(num_labels)

model.config.num_labels = num_labels

['CONTAINERTYPE', 'ORDER', 'NOT', 'SIZE', 'DRINKORDER', 'TOPPING', 'NUMBER', 'STYLE', 'PIZZAORDER', 'COMPLEX_TOPPING', 'DRINKTYPE', 'VOLUME', 'QUANTITY']
13


In [192]:
# Step 5: Split and preprocess the dataset
label_list = Dataset.from_dict({"label_list": label_list})

data = DatasetDict({
    "train": data["train"].select_columns("train.SRC"),
    "validation": data["train"].select_columns("train.TOP-DECOUPLED"),
    "label_list": label_list
})

data = preprocess_data(data)

data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

['CONTAINERTYPE', 'ORDER', 'NOT', 'SIZE', 'DRINKORDER', 'TOPPING', 'NUMBER', 'STYLE', 'PIZZAORDER', 'COMPLEX_TOPPING', 'DRINKTYPE', 'VOLUME', 'QUANTITY']
{'input_ids': [0, 939, 1017, 101, 10, 9366, 19, 28488, 18906, 2468, 12072, 8, 21629, 3209, 25278, 396, 7174, 22196, 939, 1017, 101, 10, 9366, 19, 23611, 10702, 20346, 5884, 8, 1104, 21568, 396, 7174, 22196, 939, 236, 65, 1675, 9366, 396, 143, 16708, 21568, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


ValueError: word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast` class).

In [None]:
# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

In [None]:
# Step 7: Define evaluation metrics
def compute_metrics(pred):
    """
    Compute evaluation metrics using sklearn's classification report.
    Filters out ignored tokens (-100) from predictions and labels.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_preds = [[label_list[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]
    from sklearn.metrics import classification_report
    return classification_report(true_labels, true_preds, output_dict=True)


In [None]:
# Step 8: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:
# Step 9: Train and save the model
print("Starting training...")
trainer.train()
print("Training completed. Saving model...")
trainer.save_model("./trained_model")
tokenizer.save_pretrained("./trained_model")
print("Model and tokenizer saved successfully.")
