In [43]:
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForTokenClassification, RobertaTokenizerFast
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset
import re



In [44]:
import re
def parse_tc(train_SRC,train_TOP):
    train_SRC = "i'd like a pizza with banana pepper grilled chicken and white onions without thin crust"
    train_TOP = "(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza with (TOPPING banana pepper ) (TOPPING grilled chicken ) and (TOPPING white onions ) without (NOT (STYLE thin crust ) ) ) )"

    def parse_sexp(s):
        s = s.replace('(', ' ( ').replace(')', ' ) ')
        tokens = s.split()
        def helper(tokens):
            token = tokens.pop(0)
            if token == '(':
                L = []
                while tokens[0] != ')':
                    L.append(helper(tokens))
                tokens.pop(0)
                return L
            else:
                return token
        return helper(tokens.copy())

    tree = parse_sexp(train_TOP)

    entities = []

    def extract_entities(tree, current_label=None, text_accumulator=[]):
        if isinstance(tree, list):
            label = tree[0]
            content = tree[1:]
            text = []
            for item in content:
                extract_entities(item, label, text)
            entity_text = ' '.join(text)
            if label in ['ORDER', 'PIZZAORDER', 'NOT'] or label not in ['NUMBER']:
                match = re.search(re.escape(entity_text), train_SRC)
                if match:
                    entities.append({
                        'label': label,
                        'word': match.group(),
                    })
            text_accumulator.extend(text)
        else:
            text_accumulator.append(tree)

    extract_entities(tree)

    result = {
        'sentence': train_SRC,
        'entities': entities
    }
    print(result)
    return result

In [45]:
parse_tc("","")

{'sentence': "i'd like a pizza with banana pepper grilled chicken and white onions without thin crust", 'entities': [{'label': 'TOPPING', 'word': 'banana pepper'}, {'label': 'TOPPING', 'word': 'grilled chicken'}, {'label': 'TOPPING', 'word': 'white onions'}, {'label': 'STYLE', 'word': 'thin crust'}, {'label': 'NOT', 'word': 'thin crust'}, {'label': 'PIZZAORDER', 'word': 'a pizza with banana pepper grilled chicken and white onions without thin crust'}, {'label': 'ORDER', 'word': "i'd like a pizza with banana pepper grilled chicken and white onions without thin crust"}]}


{'sentence': "i'd like a pizza with banana pepper grilled chicken and white onions without thin crust",
 'entities': [{'label': 'TOPPING', 'word': 'banana pepper'},
  {'label': 'TOPPING', 'word': 'grilled chicken'},
  {'label': 'TOPPING', 'word': 'white onions'},
  {'label': 'STYLE', 'word': 'thin crust'},
  {'label': 'NOT', 'word': 'thin crust'},
  {'label': 'PIZZAORDER',
   'word': 'a pizza with banana pepper grilled chicken and white onions without thin crust'},
  {'label': 'ORDER',
   'word': "i'd like a pizza with banana pepper grilled chicken and white onions without thin crust"}]}

In [46]:
# Step 1: Load RoBERTa-base model and tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
model = RobertaForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=13  # number of NER labels
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:

# Step 2: Define a function to preprocess the dataset
def preprocess_data(examples):
    """
    Tokenize input text and align labels with tokens.
    Handles subwords by assigning -100 to non-aligned tokens.
    """
    print(examples["train"]['train.SRC'])    
    tokenized_inputs = tokenizer(examples["train"]['train.SRC'], truncation=True, padding=True, is_split_into_words=True)
    print(tokenized_inputs)
    labels = []
    print(examples["label_list"]['label_list'])
    for i, label in enumerate(examples["label_list"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100 if (word_idx is None or word_idx == word_ids[j-1]) else label[word_idx] 
                     for j, word_idx in enumerate(word_ids)]
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [48]:
# Step 3: Load the pizza order dataset
data_path = "./test.json"  
try:
    data = load_dataset('json', data_files=data_path)
except Exception as e:
    raise ValueError(f"Failed to load dataset from {data_path}: {e}")

print(data)
print(data['train'])
print(data['train']['train.SRC'])


DatasetDict({
    train: Dataset({
        features: ['train.SRC', 'train.EXR', 'train.TOP', 'train.TOP-DECOUPLED'],
        num_rows: 3
    })
})
Dataset({
    features: ['train.SRC', 'train.EXR', 'train.TOP', 'train.TOP-DECOUPLED'],
    num_rows: 3
})
["i'd like a pizza with carrots barbecue pulled pork and cheeseburger without thin crust", "i'd like a pizza with banana pepper grilled chicken and white onions without thin crust", 'i want one regular pizza without any fried onions']


In [49]:
# Step 4: Define label mapping
labels = []
with open("unique_labels.txt", 'r') as file:
    labels = file.read()
labels = labels[:-1]


label_list = labels.split("\n")
print(label_list)

num_labels = len(label_list)
print(num_labels)

model.config.num_labels = num_labels

['CONTAINERTYPE', 'ORDER', 'NOT', 'SIZE', 'DRINKORDER', 'TOPPING', 'NUMBER', 'STYLE', 'PIZZAORDER', 'COMPLEX_TOPPING', 'DRINKTYPE', 'VOLUME', 'QUANTITY']
13


In [50]:
# Step 5: Split and preprocess the dataset
label_list = Dataset.from_dict({"label_list": label_list})

data = DatasetDict({
    "train": data["train"].select_columns("train.SRC"),
    "validation": data["train"].select_columns("train.TOP-DECOUPLED"),
    "label_list": label_list
})

data = preprocess_data(data)

data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

["i'd like a pizza with carrots barbecue pulled pork and cheeseburger without thin crust", "i'd like a pizza with banana pepper grilled chicken and white onions without thin crust", 'i want one regular pizza without any fried onions']
{'input_ids': [0, 939, 1017, 101, 10, 9366, 19, 28488, 18906, 2468, 12072, 8, 21629, 3209, 25278, 396, 7174, 22196, 939, 1017, 101, 10, 9366, 19, 23611, 10702, 20346, 5884, 8, 1104, 21568, 396, 7174, 22196, 939, 236, 65, 1675, 9366, 396, 143, 16708, 21568, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['CONTAINERTYPE', 'ORDER', 'NOT', 'SIZE', 'DRINKORDER', 'TOPPING', 'NUMBER', 'STYLE', 'PIZZAORDER', 'COMPLEX_TOPPING', 'DRINKTYPE', 'VOLUME', 'QUANTITY']


KeyError: 0

In [None]:
# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

In [9]:
# Step 7: Define evaluation metrics
def compute_metrics(pred):
    """
    Compute evaluation metrics using sklearn's classification report.
    Filters out ignored tokens (-100) from predictions and labels.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_preds = [[label_list[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]
    from sklearn.metrics import classification_report
    return classification_report(true_labels, true_preds, output_dict=True)


In [None]:
# Step 8: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


In [20]:
# Step 9: Train and save the model
print("Starting training...")
trainer.train()
print("Training completed. Saving model...")
trainer.save_model("./trained_model")
tokenizer.save_pretrained("./trained_model")
print("Model and tokenizer saved successfully.")


Starting training...


ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [train.SRC]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.