### Read Data

In [4]:
import pandas as pd
# Dataset locations, relative to the notebook's working directory.
train_path = '../dataset/PIZZA_train.json'
# NOTE: despite its name, test_path points at the *dev* split file.
test_path = '../dataset/PIZZA_dev.json'
# lines=True makes pandas read the files as line-delimited JSON (one record per line).
df = pd.read_json(train_path, lines=True)
dev = pd.read_json(test_path, lines=True)

In [31]:
# Per-column summary of the training frame (count / unique / top / freq for the string columns).
df.describe()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
count,2456446,2456446,2456446,2456446
unique,2456446,694346,2456446,1425035
top,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER three ) (NOT (TOPPI...
freq,1,1999,1,167


In [32]:
# Inputs are the raw utterances (SRC); targets are the TOP parse strings.
X_train = df['train.SRC']
y_train = df['train.TOP']
X_test = dev['dev.SRC']
y_test = dev['dev.TOP']
# Sanity check: print one training pair (row label 476368) and the first dev utterance.
print(X_train[476368])
print(y_train[476368])
print(dev['dev.SRC'][0])

i want three pies with parmesan cheese and without any sauce
(ORDER i want (PIZZAORDER (NUMBER three ) pies with (TOPPING parmesan cheese ) and without any (NOT (TOPPING sauce ) ) ) )
i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage


In [33]:
# Display the training utterances; pandas abbreviates the Series to its head and tail.
X_train

0                         can i have a large bbq pulled pork
1          large pie with green pepper and with extra pep...
2                          i'd like a large vegetarian pizza
3          party size stuffed crust pie with american che...
4                    can i have one personal sized artichoke
                                 ...                        
2456441    i'd like a pizza with arugula ricotta cheese a...
2456442    i'd like a pizza with yellow peppers fried oni...
2456443    i'd like a pizza with olives roasted tomatoes ...
2456444    i'd like a pizza with mozzarella jalapeno and ...
2456445    i'd like a pizza with hot pepper pecorino chee...
Name: train.SRC, Length: 2456446, dtype: object

In [34]:
# Longest source utterance and longest TOP string, measured in *characters*
# (len() of the str), not in tokens as produced by the tokenizer defined below.
max_str_1 = len(max(X_train, key=len))
max_str_2 = len(y_train[y_train.str.len().idxmax()])
max_str_1, max_str_2

(133, 335)

### Calculate Vocabulary

In [35]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_output(output):
    """
    Split a source utterance or structured TOP string into tokens.

    Each parenthesis is its own token, runs of word characters form a token,
    and any other run of non-space, non-parenthesis characters forms a token
    (so "i'd" becomes "i", "'d").

    Example:
        Input: "(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) ) )"
        Output: ["(", "ORDER", "(", "PIZZAORDER", "(", "NUMBER", "a", ")",
                 "(", "SIZE", "large", ")", ")", ")"]
    """
    return [match.group(0) for match in re.finditer(r"\(|\)|\w+|[^\s()]+", output)]

def build_vocab(outputs):
    """
    Build a token -> integer id mapping from an iterable of strings.

    Ids 0, 1 and 2 are reserved for "<PAD>", "<SOS>" and "<EOS>"; every other
    token receives the next free id in order of first appearance.
    """
    token_to_id = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2}  # Special tokens
    for text in outputs:
        for tok in tokenize_output(text):
            # The next free id always equals the current vocabulary size.
            token_to_id.setdefault(tok, len(token_to_id))
    return token_to_id
def encode_outputs(outputs, vocab):
    """
    Convert each string into a list of integer ids wrapped in <SOS> ... <EOS>.

    Tokens that are missing from `vocab` are silently skipped, so an encoded
    sequence can be shorter than its token list.
    """
    sequences = []
    for text in outputs:
        known_ids = (vocab[tok] for tok in tokenize_output(text) if tok in vocab)
        sequences.append([vocab["<SOS>"], *known_ids, vocab["<EOS>"]])
    return sequences

def pad_sequences_to_fixed_length(sequences, max_len):
    """
    Pad every id sequence on the right with 0 (the <PAD> id) up to `max_len`.

    NOTE(review): `truncating` is left at the Keras default ('pre' per the
    Keras docs), so sequences longer than `max_len` would lose their leading
    ids, including <SOS> - confirm that this is acceptable.
    """
    options = {"maxlen": max_len, "padding": "post", "value": 0}
    return pad_sequences(sequences, **options)

def decode_sequence(sequence, vocab):
    """
    Turn a sequence of ids back into a space-separated structured string.

    Ids that are not in `vocab`, and the <SOS>/<EOS>/<PAD> ids, are dropped.
    The space after every "(" is removed so "( ORDER" reads "(ORDER"; spaces
    before ")" are left untouched.
    """
    id_to_token = dict(zip(vocab.values(), vocab.keys()))  # Reverse the vocabulary
    words = []
    for idx in sequence:
        if idx not in id_to_token:
            continue
        if idx in {vocab["<SOS>"], vocab["<EOS>"], vocab["<PAD>"]}:
            continue
        words.append(id_to_token[idx])

    text = " ".join(words)
    text = text.replace(" ( ", " (")
    return text.replace("( ", "(")

def decode_sequence_2(sequence, vocab):
    """
    Concatenate the tokens for every positive id in `sequence` with no separator.

    Ids <= 0 (i.e. padding) are skipped, ids missing from `vocab` contribute an
    empty string, and the <SOS>/<EOS> tokens are kept in the result.
    """
    id_to_token = dict(zip(vocab.values(), vocab.keys()))  # Reverse the vocabulary
    pieces = []
    for idx in sequence:
        if idx > 0:
            pieces.append(id_to_token.get(idx, ""))
    return "".join(pieces)



In [20]:


def prepare_data(
    X_train, y_train, X_test, y_test, max_len_1=20, max_len_2=20
):
    """
    Build vocabularies from the training split and encode + pad both splits.

    The source side (X) and the target side (y) each get their own vocabulary,
    built from the training strings only; the test strings are encoded with the
    matching training vocabulary. Sources are padded to `max_len_1` ids and
    targets to `max_len_2` ids.

    Returns:
        (X_train_processed, X_test_processed, y_train_processed,
         y_test_processed, X_vocab, vocab), where `vocab` is the target-side
        vocabulary needed for decoding.
    """

    def _fit_encode_pad(train_texts, test_texts, max_len):
        # Vocabulary comes from the training texts only.
        mapping = build_vocab(train_texts)
        train_ids = encode_outputs(train_texts, mapping)
        test_ids = encode_outputs(test_texts, mapping)
        train_padded = pad_sequences_to_fixed_length(train_ids, max_len)
        test_padded = pad_sequences_to_fixed_length(test_ids, max_len)
        return train_padded, test_padded, mapping

    X_train_processed, X_test_processed, X_vocab = _fit_encode_pad(X_train, X_test, max_len_1)
    y_train_processed, y_test_processed, vocab = _fit_encode_pad(y_train, y_test, max_len_2)

    return (
        X_train_processed,
        X_test_processed,
        y_train_processed,
        y_test_processed,
        X_vocab,
        vocab,  # Return vocabulary for decoding
    )


In [21]:
# Encode both splits and pad sources and targets to 250 ids each (overriding the defaults of 20).
X_train_processed, X_test_processed, y_train_processed, y_test_processed, X_vocab, vocab = prepare_data( X_train, y_train, X_test, y_test,max_len_1=250, max_len_2=250)

In [188]:
# Drop the raw frames and string Series now that the padded arrays exist.
# Any later cell that needs these names has to reload them from disk.
del df, dev, X_train, X_test, y_train, y_test

In [23]:
# Source-side vocabulary: token -> id, ids assigned in order of first appearance.
X_vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 'can': 3,
 'i': 4,
 'have': 5,
 'a': 6,
 'large': 7,
 'bbq': 8,
 'pulled': 9,
 'pork': 10,
 'pie': 11,
 'with': 12,
 'green': 13,
 'pepper': 14,
 'and': 15,
 'extra': 16,
 'peperonni': 17,
 "'d": 18,
 'like': 19,
 'vegetarian': 20,
 'pizza': 21,
 'party': 22,
 'size': 23,
 'stuffed': 24,
 'crust': 25,
 'american': 26,
 'cheese': 27,
 'mushroom': 28,
 'one': 29,
 'personal': 30,
 'sized': 31,
 'artichoke': 32,
 'banana': 33,
 'peppperonis': 34,
 'low': 35,
 'fat': 36,
 'want': 37,
 'regular': 38,
 'without': 39,
 'any': 40,
 'fried': 41,
 'onions': 42,
 'little': 43,
 'bit': 44,
 'of': 45,
 'high': 46,
 'rise': 47,
 'dough': 48,
 'lot': 49,
 'olive': 50,
 'pesto': 51,
 'sauce': 52,
 'peperonis': 53,
 'yellow': 54,
 'meatball': 55,
 '-': 56,
 'bean': 57,
 'big': 58,
 'meat': 59,
 'mushrooms': 60,
 'pecorino': 61,
 'balsamic': 62,
 'glaze': 63,
 'black': 64,
 'chicken': 65,
 'mozzarella': 66,
 'italian': 67,
 'sausage': 68,
 'olives': 69,
 'pestos':

In [24]:
# Target-side vocabulary: includes "(" / ")" and the slot labels (ORDER, PIZZAORDER, ...).
vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 '(': 3,
 'ORDER': 4,
 'can': 5,
 'i': 6,
 'have': 7,
 'PIZZAORDER': 8,
 'NUMBER': 9,
 'a': 10,
 ')': 11,
 'SIZE': 12,
 'large': 13,
 'TOPPING': 14,
 'bbq': 15,
 'pulled': 16,
 'pork': 17,
 'pie': 18,
 'with': 19,
 'green': 20,
 'pepper': 21,
 'and': 22,
 'COMPLEX_TOPPING': 23,
 'QUANTITY': 24,
 'extra': 25,
 'peperonni': 26,
 "'d": 27,
 'like': 28,
 'STYLE': 29,
 'vegetarian': 30,
 'pizza': 31,
 'party': 32,
 'size': 33,
 'stuffed': 34,
 'crust': 35,
 'american': 36,
 'cheese': 37,
 'mushroom': 38,
 'one': 39,
 'personal': 40,
 'sized': 41,
 'artichoke': 42,
 'banana': 43,
 'peppperonis': 44,
 'low': 45,
 'fat': 46,
 'want': 47,
 'regular': 48,
 'without': 49,
 'any': 50,
 'NOT': 51,
 'fried': 52,
 'onions': 53,
 'little': 54,
 'bit': 55,
 'of': 56,
 'high': 57,
 'rise': 58,
 'dough': 59,
 'lot': 60,
 'olive': 61,
 'pesto': 62,
 'sauce': 63,
 'peperonis': 64,
 'yellow': 65,
 'meatball': 66,
 '-': 67,
 'bean': 68,
 'big': 69,
 'meat': 70,
 'mushro

### Save SRC as input

In [None]:
# X_train is deleted earlier in the notebook to free memory, so a top-to-bottom
# run would hit a NameError here. Reload the SRC column from the training file.
src_data = pd.read_json(train_path, lines=True)["train.SRC"].tolist()
# One utterance per line; the encoding is pinned so the file is identical on every platform.
with open("../dataset/src_data.txt", "w", encoding="utf-8") as src_file:
    src_file.write("\n".join(src_data))

### Turn TOP into a JSON tree

In [8]:
import json
import re

def tokenize(s):
    """Split `s` into "(" / ")" tokens and runs of non-space, non-parenthesis characters."""
    # Unlike a word-based split, "i'd" and "500-ml" stay single tokens here.
    return [match.group(0) for match in re.finditer(r'\(|\)|[^\s()]+', s)]

def parse_tokens(tokens):
    """
    Parse a flat token list into nested Python lists.

    Each "(" opens a new sub-list and each ")" closes the current one; all
    other tokens are appended to the list that is currently open. The return
    value is always the root list, so a single well-formed expression comes
    back as a one-element list containing its tree.

    Groups still open at the end of input are closed implicitly, so a truncated
    string yields its partial tree under the root rather than only the
    innermost fragment.

    Raises:
        IndexError: if a ")" appears while no group is open.
    """
    enclosing = []  # lists waiting for their child to be closed, outermost first
    current = []
    for position, token in enumerate(tokens):
        if token == '(':
            enclosing.append(current)
            current = []
        elif token == ')':
            if not enclosing:
                raise IndexError(
                    f"unbalanced ')' at token {position}: no group is open"
                )
            parent = enclosing.pop()
            parent.append(current)
            current = parent
        else:
            current.append(token)

    # Close whatever is still open so the caller always receives the root list.
    while enclosing:
        parent = enclosing.pop()
        parent.append(current)
        current = parent
    return current

# Labels whose children are plain words that get joined into one string value.
_SCALAR_KEYS = ("NUMBER", "SIZE", "STYLE", "VOLUME", "DRINKTYPE", "CONTAINERTYPE", "QUANTITY")
# Scalar fields copied onto each order dict, in output order.
_PIZZA_FIELDS = ("NUMBER", "SIZE", "STYLE")
_DRINK_FIELDS = ("NUMBER", "VOLUME", "DRINKTYPE", "CONTAINERTYPE")


def _joined_words(children):
    """Join the plain-string children with single spaces; nested lists are skipped."""
    return " ".join(el for el in children if isinstance(el, str)).strip()


def _dict_children(children):
    """Yield the normalized form of every child that normalizes to a dict."""
    for child in children:
        node = normalize_structure(child)
        if isinstance(node, dict):
            yield node


def _group_orders(pizzaorders, drinkorders):
    """Build the PIZZAORDER/DRINKORDER mapping, leaving out empty lists."""
    grouped = {}
    if pizzaorders:
        grouped["PIZZAORDER"] = pizzaorders
    if drinkorders:
        grouped["DRINKORDER"] = drinkorders
    return grouped


def _collect_fields(children, fields):
    """Return (scalar values in `fields` order, topping nodes in input order)."""
    values = {}
    toppings = []
    for node in _dict_children(children):
        node_type = node.get("TYPE")
        if node_type in fields:
            values[node_type] = node["VALUE"]  # a repeated field keeps the last value
        elif node_type == "TOPPING":
            toppings.append(node)
    ordered = {name: values[name] for name in fields if name in values}
    return ordered, toppings


def _normalize_order(key, children):
    """ORDER: group the child pizza and drink orders; {} when there are none."""
    pizzaorders, drinkorders = [], []
    for node in _dict_children(children):
        node_type = node.get("TYPE")
        if node_type == "PIZZAORDER":
            pizzaorders.append(node)
        elif node_type == "DRINKORDER":
            drinkorders.append(node)
    grouped = _group_orders(pizzaorders, drinkorders)
    return {"ORDER": grouped} if grouped else {}


def _normalize_pizzaorder(key, children):
    """PIZZAORDER: NUMBER/SIZE/STYLE plus an "AllTopping" list when toppings exist."""
    result, toppings = _collect_fields(children, _PIZZA_FIELDS)
    if toppings:
        result["AllTopping"] = toppings
    # Mark type internally; remove_type_keys strips it later.
    result["TYPE"] = "PIZZAORDER"
    return result


def _normalize_drinkorder(key, children):
    """DRINKORDER: NUMBER/VOLUME/DRINKTYPE/CONTAINERTYPE; other children are ignored."""
    # NOTE(review): a SIZE child of a DRINKORDER is ignored here - confirm that is intended.
    result, _ = _collect_fields(children, _DRINK_FIELDS)
    result["TYPE"] = "DRINKORDER"
    return result


def _normalize_scalar(key, children):
    """Scalar slot (NUMBER, SIZE, ...): the words of the slot joined into VALUE."""
    return {"TYPE": key, "VALUE": _joined_words(children)}


def _normalize_topping(key, children):
    """TOPPING: a non-negated topping without a quantity."""
    return {
        "TYPE": "TOPPING",
        "NOT": False,
        "Quantity": None,
        "Topping": _joined_words(children),
    }


def _normalize_complex_topping(key, children):
    """COMPLEX_TOPPING: a topping together with its QUANTITY."""
    quantity = None
    topping = None
    for node in _dict_children(children):
        node_type = node.get("TYPE")
        if node_type == "QUANTITY":
            quantity = node["VALUE"]
        elif node_type == "TOPPING":
            topping = node["Topping"]
    return {
        "TYPE": "TOPPING",
        "NOT": False,
        "Quantity": quantity,
        "Topping": topping,
    }


def _normalize_not(key, children):
    """NOT: the first topping child, flagged as negated; None if there is no topping."""
    for node in _dict_children(children):
        if node.get("TYPE") == "TOPPING":
            node["NOT"] = True
            node.setdefault("Quantity", None)
            return node
    # e.g. (NOT (STYLE ...)) has no topping child and is dropped.
    return None


def _merge_unlabelled(tree):
    """List without a leading label: merge every order found among its elements."""
    pizzaorders, drinkorders = [], []
    found_order = False
    for node in _dict_children(tree):
        if "ORDER" in node:
            found_order = True
            order_node = node["ORDER"]
            pizzaorders.extend(order_node.get("PIZZAORDER", []))
            drinkorders.extend(order_node.get("DRINKORDER", []))
        elif node.get("TYPE") == "PIZZAORDER":
            found_order = True
            pizzaorders.append(node)
        elif node.get("TYPE") == "DRINKORDER":
            found_order = True
            drinkorders.append(node)

    if not found_order:
        return None
    grouped = _group_orders(pizzaorders, drinkorders)
    return {"ORDER": grouped} if grouped else {}


# Dispatch table: leading label of a list -> function that normalizes it.
_HANDLERS = {
    "ORDER": _normalize_order,
    "PIZZAORDER": _normalize_pizzaorder,
    "DRINKORDER": _normalize_drinkorder,
    "TOPPING": _normalize_topping,
    "COMPLEX_TOPPING": _normalize_complex_topping,
    "NOT": _normalize_not,
}
_HANDLERS.update(dict.fromkeys(_SCALAR_KEYS, _normalize_scalar))


def normalize_structure(tree):
    """
    Convert a nested token list (as built by parse_tokens) into order dictionaries.

    A list whose first element is a known label (ORDER, PIZZAORDER, TOPPING, ...)
    is normalized according to that label; plain words between the slots are
    ignored. A list without a leading label - such as the root list - is
    searched for orders, which are merged into one {"ORDER": {...}} dict.

    Every dict except the {"ORDER": ...} wrapper carries an internal "TYPE" key
    that callers are expected to strip afterwards.

    Returns:
        - {"ORDER": {"PIZZAORDER": [...], "DRINKORDER": [...]}} (empty lists are
          omitted) for a labelled ORDER or a list containing orders;
        - {} for a labelled ORDER that contains no pizza or drink order;
        - a slot/topping/order dict for the other labels;
        - None for non-list input, for a NOT without a topping child, and for an
          unlabelled list in which no order is found.
    """
    if not isinstance(tree, list):
        return None

    head = tree[0] if tree else None
    handler = _HANDLERS.get(head) if isinstance(head, str) else None
    if handler is None:
        return _merge_unlabelled(tree)
    return handler(head, tree[1:])

def remove_type_keys(obj):
    """Strip every "TYPE" key from `obj` in place, descending through dicts and lists."""
    if isinstance(obj, dict):
        obj.pop("TYPE", None)
        for child in obj.values():
            remove_type_keys(child)
    elif isinstance(obj, list):
        for element in obj:
            remove_type_keys(element)


def preprocess(text):
    """Parse a TOP string into its order dictionary with the internal "TYPE" keys removed."""
    tree = normalize_structure(parse_tokens(tokenize(text)))
    remove_type_keys(tree)  # strips in place; does nothing when tree is None
    return tree

# Worked example: filler words, two pizza orders (one with negated toppings) and three drink orders.
input_str = "(ORDER potato potato junior (PIZZAORDER (NUMBER one) (SIZE large) (STYLE thin crust) (TOPPING cheese) (TOPPING pepperoni) ) (PIZZAORDER (NUMBER two) (SIZE medium) (STYLE deep dish) (NOT (TOPPING mushrooms) ) (NOT (COMPLEX_TOPPING (QUANTITY extra) (TOPPING olives) ) ) ) (DRINKORDER (NUMBER five) (VOLUME one liter) (DRINKTYPE lemon ice tea) (CONTAINERTYPE bottles)) (DRINKORDER (NUMBER three) (VOLUME two liters) (DRINKTYPE cola) (CONTAINERTYPE cans)) (DRINKORDER (NUMBER three) (VOLUME two liters) (DRINKTYPE cola) (CONTAINERTYPE cans) ) )"

# Same four steps that preprocess() performs, spelled out one by one.
tokens = tokenize(input_str)
parsed = parse_tokens(tokens)
result = normalize_structure(parsed)
remove_type_keys(result)

print(json.dumps(result, indent=2))


{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "one",
        "SIZE": "large",
        "STYLE": "thin crust",
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "cheese"
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepperoni"
          }
        ]
      },
      {
        "NUMBER": "two",
        "SIZE": "medium",
        "STYLE": "deep dish",
        "AllTopping": [
          {
            "NOT": true,
            "Quantity": null,
            "Topping": "mushrooms"
          },
          {
            "NOT": true,
            "Quantity": "extra",
            "Topping": "olives"
          }
        ]
      }
    ],
    "DRINKORDER": [
      {
        "NUMBER": "five",
        "VOLUME": "one liter",
        "DRINKTYPE": "lemon ice tea",
        "CONTAINERTYPE": "bottles"
      },
      {
        "NUMBER": "three",
        "VOLUME": "two liters",
    

In [20]:


# Drink-only order: checks multi-word VOLUME values and an optional CONTAINERTYPE.
input_text = "(ORDER (DRINKORDER (NUMBER two ) (VOLUME 20 fl ounce ) (DRINKTYPE diet sprites ) (CONTAINERTYPE in cans ) ) (DRINKORDER (NUMBER five ) (VOLUME 500 milliliter ) (DRINKTYPE ice teas ) ) (DRINKORDER (NUMBER three ) (VOLUME 500-ml ) (DRINKTYPE san pellegrinos ) ) )"
result = preprocess(input_text)
print(json.dumps(result, indent=2))

{
  "ORDER": {
    "DRINKORDER": [
      {
        "NUMBER": "two",
        "VOLUME": "20 fl ounce",
        "DRINKTYPE": "diet sprites",
        "CONTAINERTYPE": "in cans"
      },
      {
        "NUMBER": "five",
        "VOLUME": "500 milliliter",
        "DRINKTYPE": "ice teas"
      },
      {
        "NUMBER": "three",
        "VOLUME": "500-ml",
        "DRINKTYPE": "san pellegrinos"
      }
    ]
  }
}


In [11]:
# Filler words between and around the slots must not appear in the parsed order.
input_text = "(ORDER okay i'm all set i need (PIZZAORDER (NUMBER a ) pizza and make it (SIZE medium ) for toppings i'd like (TOPPING mushrooms ) (TOPPING pepperoni ) and i'd like (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) on there beautiful )"

result = preprocess(input_text)
print(json.dumps(result, indent=2))

{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "a",
        "SIZE": "medium",
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "mushrooms"
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepperoni"
          },
          {
            "NOT": false,
            "Quantity": "extra",
            "Topping": "cheese"
          }
        ]
      }
    ]
  }
}


In [12]:
# Mixed pizza + drink order; note the ")(" with no whitespace between two groups.
input_text = '(ORDER (PIZZAORDER (NUMBER one) (SIZE large) (STYLE thin crust) (TOPPING cheese) (TOPPING pepperoni) ) (PIZZAORDER (NUMBER two) (SIZE medium) (STYLE deep dish) (NOT (TOPPING mushrooms) ) (COMPLEX_TOPPING (QUANTITY extra) (TOPPING olives) ) ) (DRINKORDER (NUMBER five) (VOLUME one liter) (DRINKTYPE lemon ice tea) (CONTAINERTYPE bottles) )(DRINKORDER (NUMBER three) (VOLUME two liters) (DRINKTYPE cola) (CONTAINERTYPE cans) ) (DRINKORDER (NUMBER three) (VOLUME two liters) (DRINKTYPE cola) (CONTAINERTYPE cans) ) )'

result = preprocess(input_text)
print(json.dumps(result, indent=2))

{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "one",
        "SIZE": "large",
        "STYLE": "thin crust",
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "cheese"
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepperoni"
          }
        ]
      },
      {
        "NUMBER": "two",
        "SIZE": "medium",
        "STYLE": "deep dish",
        "AllTopping": [
          {
            "NOT": true,
            "Quantity": null,
            "Topping": "mushrooms"
          },
          {
            "NOT": false,
            "Quantity": "extra",
            "Topping": "olives"
          }
        ]
      }
    ],
    "DRINKORDER": [
      {
        "NUMBER": "five",
        "VOLUME": "one liter",
        "DRINKTYPE": "lemon ice tea",
        "CONTAINERTYPE": "bottles"
      },
      {
        "NUMBER": "three",
        "VOLUME": "two liters",
   

In [13]:
# CONTAINERTYPE precedes DRINKTYPE in the input; the parsed dict uses a fixed key order instead.
input_text = '(ORDER place a rush (PIZZAORDER (NUMBER two ) (SIZE large ) (TOPPING pepperoni ) (TOPPING ham ) ) on them (DRINKORDER (NUMBER two ) (CONTAINERTYPE cans ) (DRINKTYPE sprite ) ) )'

result = preprocess(input_text)
print(json.dumps(result, indent=2))

{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "two",
        "SIZE": "large",
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepperoni"
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "ham"
          }
        ]
      }
    ],
    "DRINKORDER": [
      {
        "NUMBER": "two",
        "DRINKTYPE": "sprite",
        "CONTAINERTYPE": "cans"
      }
    ]
  }
}


### JSON exact match

In [14]:
import json

def json_equal(obj1, obj2):
    """Return True if two decoded JSON objects (Python dicts/lists) are equal, False otherwise.

    This is plain `==`: dict key order is irrelevant, list order matters.
    """
    same = obj1 == obj2
    return same

# Example usage: two identical JSON documents, decoded and compared with json_equal.
json_str_a = """
{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "one",
        "SIZE": "large",
        "STYLE": "thin crust",
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "cheese"
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepperoni"
          }
        ]
      }
    ]
  }
}
"""

json_str_b = """
{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "one",
        "SIZE": "large",
        "STYLE": "thin crust",
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "cheese"
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepperoni"
          }
        ]
      }
    ]
  }
}
"""

# Compare the decoded objects, not the raw strings, so whitespace in the text is irrelevant.
obj_a = json.loads(json_str_a)
obj_b = json.loads(json_str_b)

print(json_equal(obj_a, obj_b))  # Prints True: both documents decode to equal objects.


True


### Turn TOP into TOP-DECOUPLED then tree

In [16]:
import pandas as pd
# The dev frame and its columns were deleted earlier in the notebook, so reload them here.
dev = pd.read_json('../dataset/PIZZA_dev.json', lines=True)
X_test = dev['dev.SRC']
y_test = dev['dev.TOP']

In [17]:
# Show the dev TOP strings as a NumPy array of Python strings.
print(y_test.values)

['(ORDER i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ) )'
 '(ORDER (PIZZAORDER (NUMBER five ) (SIZE medium ) pizzas with (TOPPING tomatoes ) and (TOPPING ham ) ) )'
 '(ORDER i need to order (PIZZAORDER (NUMBER one ) (SIZE large ) (STYLE vegetarian ) pizza with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING banana peppers ) ) ) )'
 "(ORDER i'd like to order (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING onion ) and (TOPPING pepper ) pizza ) )"
 "(ORDER i'll have (PIZZAORDER (NUMBER one ) pie along with (TOPPING pesto ) and (TOPPING ham ) but avoid (NOT (TOPPING olives ) ) ) )"
 '(ORDER i need to order (PIZZAORDER (NUMBER one ) (SIZE large ) pizza with (TOPPING ham ) (TOPPING bacon

In [19]:
# Parse every dev TOP string into its order dictionary. Previously the append
# was commented out, which left parsed_data empty for the cells that follow.
parsed_data = [preprocess(row) for row in y_test]

# Spot-check the same slice of rows that was inspected during development.
for row, parsed_entry in zip(y_test[190:250], parsed_data[190:250]):
    print(row)
    print(parsed_entry)

(ORDER i want (PIZZAORDER (NUMBER a ) pizza with (TOPPING sausage ) (TOPPING bacon ) and no (NOT (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) ) )
{'ORDER': {'PIZZAORDER': [{'NUMBER': 'a', 'AllTopping': [{'NOT': False, 'Quantity': None, 'Topping': 'sausage'}, {'NOT': False, 'Quantity': None, 'Topping': 'bacon'}, {'NOT': True, 'Quantity': 'extra', 'Topping': 'cheese'}]}]}}
(ORDER i need to get (PIZZAORDER (NUMBER six ) (SIZE small ) pizzas with (TOPPING peppers ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) )
{'ORDER': {'PIZZAORDER': [{'NUMBER': 'six', 'SIZE': 'small', 'AllTopping': [{'NOT': False, 'Quantity': None, 'Topping': 'peppers'}, {'NOT': False, 'Quantity': 'extra', 'Topping': 'cheese'}]}]}}
(ORDER i would like to order (PIZZAORDER (NUMBER a ) (SIZE large ) pie with (TOPPING mushrooms ) and a (STYLE thin crust ) but i do not want any (NOT (TOPPING pepperoni ) ) ) on it please )
{'ORDER': {'PIZZAORDER': [{'NUMBER': 'a', 'SIZE': 'large', 'STYLE': 'thin c

In [None]:
# Inspect the list of parsed entries before writing it to disk.
parsed_data

In [None]:

# Save the parsed data to a file as one JSON array (json.dump with default settings).
output_path = "../dataset/parsed_dev_order_data.json"
with open(output_path, "w") as parsed_file:
    json.dump(parsed_data, parsed_file)