### Read Data

In [4]:
import pandas as pd
train_path = '../dataset/PIZZA_train.json'
test_path = '../dataset/PIZZA_dev.json'
df = pd.read_json(train_path, lines=True)
dev = pd.read_json(test_path, lines=True)

In [31]:
df.describe()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
count,2456446,2456446,2456446,2456446
unique,2456446,694346,2456446,1425035
top,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER three ) (NOT (TOPPI...
freq,1,1999,1,167


In [32]:
X_train = df['train.SRC']
y_train = df['train.TOP']
X_test = dev['dev.SRC']
y_test = dev['dev.TOP']
print(X_train[476368])
print(y_train[476368])
print(dev['dev.SRC'][0])

i want three pies with parmesan cheese and without any sauce
(ORDER i want (PIZZAORDER (NUMBER three ) pies with (TOPPING parmesan cheese ) and without any (NOT (TOPPING sauce ) ) ) )
i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage


In [33]:
X_train

0                         can i have a large bbq pulled pork
1          large pie with green pepper and with extra pep...
2                          i'd like a large vegetarian pizza
3          party size stuffed crust pie with american che...
4                    can i have one personal sized artichoke
                                 ...                        
2456441    i'd like a pizza with arugula ricotta cheese a...
2456442    i'd like a pizza with yellow peppers fried oni...
2456443    i'd like a pizza with olives roasted tomatoes ...
2456444    i'd like a pizza with mozzarella jalapeno and ...
2456445    i'd like a pizza with hot pepper pecorino chee...
Name: train.SRC, Length: 2456446, dtype: object

In [34]:
max_str_1 = len(max(X_train, key=len))
max_str_2 = len(y_train[y_train.str.len().idxmax()])
max_str_1, max_str_2

(133, 335)

### Calculate Vocabulary

In [35]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_output(output):
    """
    Tokenizes the structured output into meaningful tokens.
    Example:
        Input: "(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )"
        Output: ["(ORDER", "(PIZZAORDER", "(NUMBER", "a", "(SIZE", "large", "(TOPPING", "bbq", "pulled", "pork", ")", ")", ")", ")"]
    """
    tokens = re.findall(r"\(|\)|\w+|[^\s()]+", output)
    return tokens

def build_vocab(outputs):
    """
    Builds a vocabulary from tokenized outputs.
    """
    vocab = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2}  # Special tokens
    i = 3
    for output in outputs:
        tokens = tokenize_output(output)
        for token in tokens:
            if token not in vocab:
                vocab[token] = i
                i += 1
    return vocab
def encode_outputs(outputs, vocab):
    """
    Encodes tokenized outputs into sequences of integers.
    """
    encoded = []
    for output in outputs:
        tokens = tokenize_output(output)
        sequence = [vocab["<SOS>"]] + [vocab[token] for token in tokens if token in vocab] + [vocab["<EOS>"]]
        encoded.append(sequence)
    return encoded

def pad_sequences_to_fixed_length(sequences, max_len):
    """
    Pads sequences to a fixed length.
    """
    return pad_sequences(sequences, maxlen=max_len, padding="post", value=0)

def decode_sequence(sequence, vocab):
    """
    Decodes a sequence of integers back into the structured output string.
    """
    inv_vocab = {v: k for k, v in vocab.items()}  # Reverse the vocabulary
    tokens = [inv_vocab[idx] for idx in sequence if idx in inv_vocab and idx not in {vocab["<SOS>"], vocab["<EOS>"], vocab["<PAD>"]}]

    output = " ".join(tokens)
    output = output.replace(" ( ", " (").replace("( ", "(") #.replace(" )", ")")
    return output

def decode_sequence_2(sequence, vocab):
    """
    Decodes a sequence of integers back into the structured output string.
    """
    inv_vocab = {v: k for k, v in vocab.items()}  # Reverse the vocabulary
    # sequence = sequence.cpu().tolist()  # Convert tensor to a list of integers
    tokens = [inv_vocab.get(idx, "") for idx in sequence if idx > 0]  # Ignore unknown and put empty char
    return "".join(tokens)



In [20]:


def prepare_data(
    X_train, y_train, X_test, y_test, max_len_1=20, max_len_2 = 20
):

    X_vocab = build_vocab(X_train)  # Build vocabulary from training outputs
    X_train_encoded = encode_outputs(X_train, X_vocab)  # Encode training outputs
    X_test_encoded = encode_outputs(X_test, X_vocab)  # Encode testing outputs
    X_train_processed = pad_sequences_to_fixed_length(X_train_encoded, max_len_1)
    X_test_processed = pad_sequences_to_fixed_length(X_test_encoded, max_len_1)

    vocab = build_vocab(y_train)  # Build vocabulary from training outputs
    y_train_encoded = encode_outputs(y_train, vocab)  # Encode training outputs
    y_test_encoded = encode_outputs(y_test, vocab)  # Encode testing outputs
    y_train_processed = pad_sequences_to_fixed_length(y_train_encoded, max_len_2)
    y_test_processed = pad_sequences_to_fixed_length(y_test_encoded, max_len_2)


    return (
        X_train_processed,
        X_test_processed,
        y_train_processed,
        y_test_processed,
        X_vocab,
        vocab,  # Return vocabulary for decoding
    )


In [21]:
X_train_processed, X_test_processed, y_train_processed, y_test_processed, X_vocab, vocab = prepare_data( X_train, y_train, X_test, y_test,max_len_1=250, max_len_2=250)

In [188]:
del df
del dev
del X_train
del X_test
del y_train
del y_test

In [23]:
X_vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 'can': 3,
 'i': 4,
 'have': 5,
 'a': 6,
 'large': 7,
 'bbq': 8,
 'pulled': 9,
 'pork': 10,
 'pie': 11,
 'with': 12,
 'green': 13,
 'pepper': 14,
 'and': 15,
 'extra': 16,
 'peperonni': 17,
 "'d": 18,
 'like': 19,
 'vegetarian': 20,
 'pizza': 21,
 'party': 22,
 'size': 23,
 'stuffed': 24,
 'crust': 25,
 'american': 26,
 'cheese': 27,
 'mushroom': 28,
 'one': 29,
 'personal': 30,
 'sized': 31,
 'artichoke': 32,
 'banana': 33,
 'peppperonis': 34,
 'low': 35,
 'fat': 36,
 'want': 37,
 'regular': 38,
 'without': 39,
 'any': 40,
 'fried': 41,
 'onions': 42,
 'little': 43,
 'bit': 44,
 'of': 45,
 'high': 46,
 'rise': 47,
 'dough': 48,
 'lot': 49,
 'olive': 50,
 'pesto': 51,
 'sauce': 52,
 'peperonis': 53,
 'yellow': 54,
 'meatball': 55,
 '-': 56,
 'bean': 57,
 'big': 58,
 'meat': 59,
 'mushrooms': 60,
 'pecorino': 61,
 'balsamic': 62,
 'glaze': 63,
 'black': 64,
 'chicken': 65,
 'mozzarella': 66,
 'italian': 67,
 'sausage': 68,
 'olives': 69,
 'pestos':

In [24]:
vocab

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 '(': 3,
 'ORDER': 4,
 'can': 5,
 'i': 6,
 'have': 7,
 'PIZZAORDER': 8,
 'NUMBER': 9,
 'a': 10,
 ')': 11,
 'SIZE': 12,
 'large': 13,
 'TOPPING': 14,
 'bbq': 15,
 'pulled': 16,
 'pork': 17,
 'pie': 18,
 'with': 19,
 'green': 20,
 'pepper': 21,
 'and': 22,
 'COMPLEX_TOPPING': 23,
 'QUANTITY': 24,
 'extra': 25,
 'peperonni': 26,
 "'d": 27,
 'like': 28,
 'STYLE': 29,
 'vegetarian': 30,
 'pizza': 31,
 'party': 32,
 'size': 33,
 'stuffed': 34,
 'crust': 35,
 'american': 36,
 'cheese': 37,
 'mushroom': 38,
 'one': 39,
 'personal': 40,
 'sized': 41,
 'artichoke': 42,
 'banana': 43,
 'peppperonis': 44,
 'low': 45,
 'fat': 46,
 'want': 47,
 'regular': 48,
 'without': 49,
 'any': 50,
 'NOT': 51,
 'fried': 52,
 'onions': 53,
 'little': 54,
 'bit': 55,
 'of': 56,
 'high': 57,
 'rise': 58,
 'dough': 59,
 'lot': 60,
 'olive': 61,
 'pesto': 62,
 'sauce': 63,
 'peperonis': 64,
 'yellow': 65,
 'meatball': 66,
 '-': 67,
 'bean': 68,
 'big': 69,
 'meat': 70,
 'mushro

### Save SRC as input

In [None]:
src_data = [entry for entry in X_train ]
with open("../dataset/src_data.txt", "w") as src_file:
    src_file.write("\n".join(src_data))

### Turn TOP-DECOUPLED into json tree

In [2]:
import re
import json
def parse_element(element):
    #print(f"Parsing element: {element}")
    
    # Helper function to parse individual elements
    matches = [y.group() for y in re.finditer(r'\b(PIZZAORDER|DRINKORDER)\b|\((?:[^()]*|\([^()]*\))*\)', element)]

    #print(f"Parts found: {matches}")

    if matches[0] == 'PIZZAORDER':
        pizza = {
            'NUMBER': None,
            'SIZE': None,
            'STYLE': None,
            'AllTopping': []
        }
        for token in matches[1:]:
            #print(f"Processing token: {token}")
            if token.startswith('(NUMBER'):
                pizza['NUMBER'] = token.split(' ', 1)[1].strip(')')
            elif token.startswith('(SIZE'):
                pizza['SIZE'] = token.split(' ', 1)[1].strip(')')
            elif token.startswith('(STYLE'):
                pizza['STYLE'] = token.split(' ', 1)[1].strip(')')
            elif token.startswith('(TOPPING'):
                topping = token.split(' ', 1)[1].strip(')')
                pizza['AllTopping'].append({
                    'NOT': False,
                    'Quantity': None,
                    'Topping': topping
                })
            elif token.startswith('(NOT'):
                # Extract inner TOPPING from NOT
                not_match = re.search(r'\(TOPPING\s([^\)]+)\)', token)
                if not_match:
                    topping = not_match.group(1)
                    pizza['AllTopping'].append({
                        'NOT': True,
                        'Quantity': None,
                        'Topping': topping
                    })
            elif token.startswith('(COMPLEX_TOPPING'):
                # Extract QUANTITY and TOPPING from COMPLEX_TOPPING
                complex_matches = re.findall(r'\(QUANTITY\s([^\)]+)\)\s*\(TOPPING\s([^\)]+)\)', token)
                for quantity, topping in complex_matches:
                    pizza['AllTopping'].append({
                        'NOT': False,
                        'Quantity': quantity,
                        'Topping': topping
                    })

        return pizza
    else:
        matches = re.finditer(
            r'\b(PIZZAORDER|DRINKORDER)\b|'                              # Match PIZZAORDER or DRINKORDER
            r'\w+\(|\([^()]+\)|'                                # Match simple key-value pairs like (KEY VALUE)
            r'\([^()]+\(.*?\)\)',                                # Match nested structures
            element
        )
        matches = [match.group() for match in matches]
        #print(f"Parts found: {matches}")
        if matches[0] == 'PIZZAORDER':
            pizza = {
                'NUMBER': None,
                'SIZE': None,
                'STYLE': None,
                'AllTopping': []
            }
            for token in matches[1:]:
                #print(f"Processing token: {token}")
                if token.startswith('(NUMBER'):
                    pizza['NUMBER'] = token.split(' ', 1)[1].strip(')')
                elif token.startswith('(SIZE'):
                    pizza['SIZE'] = token.split(' ', 1)[1].strip(')')
                elif token.startswith('(STYLE'):
                    pizza['STYLE'] = token.split(' ', 1)[1].strip(')')
                elif token.startswith('(TOPPING'):
                    topping = token.split(' ', 1)[1].strip(')')
                    pizza['AllTopping'].append({
                        'NOT': False,
                        'Quantity': None,
                        'Topping': topping
                    })
                elif token.startswith('(NOT'):
                    # Extract inner TOPPING from NOT
                    not_match = re.search(r'\(TOPPING\s([^\)]+)\)', token)
                    if not_match:
                        topping = not_match.group(1)
                        pizza['AllTopping'].append({
                            'NOT': True,
                            'Quantity': None,
                            'Topping': topping
                        })
                elif token.startswith('(COMPLEX_TOPPING'):
                    # Extract QUANTITY and TOPPING from COMPLEX_TOPPING
                    complex_matches = re.findall(r'\(QUANTITY\s([^\)]+)\)\s*\(TOPPING\s([^\)]+)\)', token)
                    for quantity, topping in complex_matches:
                        pizza['AllTopping'].append({
                            'NOT': False,
                            'Quantity': quantity,
                            'Topping': topping
                        })

            return pizza    
        elif matches[0] == 'DRINKORDER':
            drink = {
                'NUMBER': None,
                'VOLUME': None,
                'DRINKTYPE': None,
                'CONTAINERTYPE': None
            }
            for token in matches[1:]:
                #print(f"Processing drink token: {token}")
                if token.startswith('(NUMBER'):
                    drink['NUMBER'] = token.split(' ', 1)[1].strip(')')
                elif token.startswith('(VOLUME'):
                    drink['VOLUME'] = token.split(' ', 1)[1].strip(')')
                elif token.startswith('(DRINKTYPE'):
                    drink['DRINKTYPE'] = token.split(' ', 1)[1].strip(')')
                elif token.startswith('(CONTAINERTYPE'):
                    drink['CONTAINERTYPE'] = token.split(' ', 1)[1].strip(')')

            return drink
    

    return None


def parse_order(input_text):
    
    order = {"ORDER": {"PIZZAORDER": [], "DRINKORDER": []}}
    
    #print(f"Full input text: {input_text}")

    order_elements = [x.group() for x in  re.finditer(r'\((?:PIZZAORDER|DRINKORDER)(?:[^()]*|\((?:[^()]*|\([^()]*\))*\))*\)', input_text)]
    
    #print(f"Found order elements: {order_elements}")  # Debug print
    
    for e in order_elements:
        parsed = parse_element(e)
        if parsed:
            if 'SIZE' in parsed:
                order['ORDER']['PIZZAORDER'].append(parsed)
            elif 'VOLUME' in parsed:
                order['ORDER']['DRINKORDER'].append(parsed)
    
    return order

# Example usage
input_text = '(ORDER (PIZZAORDER (NUMBER one) (SIZE large) (STYLE thin crust) (TOPPING cheese) (TOPPING pepperoni) ) (PIZZAORDER (NUMBER two) (SIZE medium) (STYLE deep dish) (NOT (TOPPING mushrooms) ) (COMPLEX_TOPPING (QUANTITY extra) (TOPPING olives) ) ) (DRINKORDER (NUMBER five) (VOLUME one liter) (DRINKTYPE lemon ice tea) (CONTAINERTYPE bottles) )(DRINKORDER (NUMBER three) (VOLUME two liters) (DRINKTYPE cola) (CONTAINERTYPE cans) ) (DRINKORDER (NUMBER three) (VOLUME two liters) (DRINKTYPE cola) (CONTAINERTYPE cans) ) )'

result = parse_order(input_text)
print(json.dumps(result, indent=2))

{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "one",
        "SIZE": "large",
        "STYLE": "thin crust",
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "cheese"
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepperoni"
          }
        ]
      },
      {
        "NUMBER": "two",
        "SIZE": "medium",
        "STYLE": "deep dish",
        "AllTopping": [
          {
            "NOT": true,
            "Quantity": null,
            "Topping": "mushrooms"
          },
          {
            "NOT": false,
            "Quantity": "extra",
            "Topping": "olives"
          }
        ]
      }
    ],
    "DRINKORDER": [
      {
        "NUMBER": "five",
        "VOLUME": "one liter",
        "DRINKTYPE": "lemon ice tea",
        "CONTAINERTYPE": "bottles"
      },
      {
        "NUMBER": "three",
        "VOLUME": "two liters",
   

In [4]:
input_text = '(ORDER i want (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING onion ) and (TOPPING pepper ) (NOT (TOPPING ham ) ) ) )'

result = parse_order(input_text)
print(json.dumps(result, indent=2))

{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "a ",
        "SIZE": "large ",
        "STYLE": null,
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "onion "
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepper "
          },
          {
            "NOT": true,
            "Quantity": null,
            "Topping": "ham "
          }
        ]
      }
    ],
    "DRINKORDER": []
  }
}


In [16]:
import re
import json
def parse_element(element):
    print(f"Parsing element: {element}")
    
    # Helper function to parse individual elements
    matches = [y.group() for y in re.finditer(r'\b(PIZZAORDER|DRINKORDER)\b|\((?:[^()]*|\([^()]*\))*\)', element)]

    print(f"Parts found: {matches}")

    if matches[0] == 'PIZZAORDER':
        pizza = {
            'NUMBER': None,
            'SIZE': None,
            'STYLE': None,
            'AllTopping': []
        }
        for token in matches[1:]:
            print(f"Processing token: {token}")
            if token.startswith('(NUMBER'):
                pizza['NUMBER'] = token.split(' ', 1)[1].strip(' )')
            elif token.startswith('(SIZE'):
                pizza['SIZE'] = token.split(' ', 1)[1].strip(' )')
            elif token.startswith('(STYLE'):
                pizza['STYLE'] = token.split(' ', 1)[1].strip(' )')
            elif token.startswith('(TOPPING'):
                topping = token.split(' ', 1)[1].strip(' )')
                pizza['AllTopping'].append({
                    'NOT': False,
                    'Quantity': None,
                    'Topping': topping
                })
            elif token.startswith('(NOT'):
                # Extract inner TOPPING from NOT
                not_match = re.search(r'\(TOPPING\s([^\)]+)\)', token)
                if not_match:
                    topping = not_match.group(1)
                    pizza['AllTopping'].append({
                        'NOT': True,
                        'Quantity': None,
                        'Topping': topping
                    })
            elif token.startswith('(COMPLEX_TOPPING'):
                # Extract QUANTITY and TOPPING from COMPLEX_TOPPING
                complex_matches = re.findall(r'\(QUANTITY\s([^\)]+)\)\s*\(TOPPING\s([^\)]+)\)', token)
                for quantity, topping in complex_matches:
                    pizza['AllTopping'].append({
                        'NOT': False,
                        'Quantity': quantity,
                        'Topping': topping
                    })

        return pizza
    else:
        matches = re.finditer(
            r'\b(PIZZAORDER|DRINKORDER)\b|'                              # Match PIZZAORDER or DRINKORDER
            r'\w+\(|\([^()]+\)|'                                # Match simple key-value pairs like (KEY VALUE)
            r'\([^()]+\(.*?\)\)',                                # Match nested structures
            element
        )
        matches = [match.group() for match in matches]
        print(f"Parts found: {matches}")
        if matches[0] == 'PIZZAORDER':
            pizza = {
                'NUMBER': None,
                'SIZE': None,
                'STYLE': None,
                'AllTopping': []
            }
            for token in matches[1:]:
                print(f"Processing token: {token}")
                if token.startswith('(NUMBER'):
                    pizza['NUMBER'] = token.split(' ', 1)[1].strip(' )')
                elif token.startswith('(SIZE'):
                    pizza['SIZE'] = token.split(' ', 1)[1].strip(' )')
                elif token.startswith('(STYLE'):
                    pizza['STYLE'] = token.split(' ', 1)[1].strip(' )')
                elif token.startswith('(TOPPING'):
                    topping = token.split(' ', 1)[1].strip(' )')
                    pizza['AllTopping'].append({
                        'NOT': False,
                        'Quantity': None,
                        'Topping': topping
                    })
                elif token.startswith('(NOT'):
                    # Extract inner TOPPING from NOT
                    not_match = re.search(r'\(TOPPING\s([^\)]+)\)', token)
                    if not_match:
                        topping = not_match.group(1)
                        pizza['AllTopping'].append({
                            'NOT': True,
                            'Quantity': None,
                            'Topping': topping
                        })
                elif token.startswith('(COMPLEX_TOPPING'):
                    # Extract QUANTITY and TOPPING from COMPLEX_TOPPING
                    complex_matches = re.findall(r'\(QUANTITY\s([^\)]+)\)\s*\(TOPPING\s([^\)]+)\)', token)
                    for quantity, topping in complex_matches:
                        pizza['AllTopping'].append({
                            'NOT': False,
                            'Quantity': quantity,
                            'Topping': topping
                        })

            return pizza    
        elif matches[0] == 'DRINKORDER':
            drink = {
                'NUMBER': None,
                'VOLUME': None,
                'DRINKTYPE': None,
                'CONTAINERTYPE': None
            }
            for token in matches[1:]:
                print(f"Processing drink token: {token}")
                if token.startswith('(NUMBER'):
                    drink['NUMBER'] = token.split(' ', 1)[1].strip(' )')
                elif token.startswith('(VOLUME'):
                    drink['VOLUME'] = token.split(' ', 1)[1].strip(' )')
                elif token.startswith('(DRINKTYPE'):
                    drink['DRINKTYPE'] = token.split(' ', 1)[1].strip(' )')
                elif token.startswith('(CONTAINERTYPE'):
                    drink['CONTAINERTYPE'] = token.split(' ', 1)[1].strip(' )')

            return drink
    

    return None

def sanitize_input(input_text):
    # Step 1: Remove unnecessary words outside parentheses
    sanitized = re.sub(
        r'\b(?!ORDER|PIZZAORDER|DRINKORDER|NUMBER|SIZE|STYLE|TOPPING|NOT|COMPLEX_TOPPING|QUANTITY|DRINKTYPE|VOLUME|CONTAINERTYPE)\w+\b(?![^()]*\)\s)',
        '',
        input_text
    )
    # Step 2: Remove empty parentheses caused by removing words
    sanitized = re.sub(r'\(\s*\)', '', sanitized)
    # Step 3: Remove redundant spaces
    sanitized = re.sub(r'\s+', ' ', sanitized).strip()
    return sanitized


def parse_order(input_text):
    input_text = sanitize_input(input_text)
    order = {"ORDER": {"PIZZAORDER": [], "DRINKORDER": []}}
    
    print(f"Full input text: {input_text}")

    order_elements = [x.group() for x in  re.finditer(r'\((?:PIZZAORDER|DRINKORDER)(?:[^()]*|\((?:[^()]*|\([^()]*\))*\))*\)', input_text)]
    
    print(f"Found order elements: {order_elements}")  # Debug print
    
    for e in order_elements:
        parsed = parse_element(e)
        if parsed:
            if 'SIZE' in parsed:
                order['ORDER']['PIZZAORDER'].append(parsed)
            elif 'VOLUME' in parsed:
                order['ORDER']['DRINKORDER'].append(parsed)
    
    return order


# Example input
input_text = "(ORDER (PIZZAORDER (TOPPING garlic powder ) (NUMBER fourteen ) ) (DRINKORDER (NUMBER 6 ) (DRINKTYPE diet ice teas ) ) )"

result = parse_order(input_text)
print(json.dumps(result, indent=2))


Full input text: (ORDER (PIZZAORDER (TOPPING garlic powder ) (NUMBER fourteen ) ) (DRINKORDER (NUMBER 6 ) (DRINKTYPE diet ice teas ) ) )
Found order elements: ['(PIZZAORDER (TOPPING garlic powder ) (NUMBER fourteen ) )', '(DRINKORDER (NUMBER 6 ) (DRINKTYPE diet ice teas ) )']
Parsing element: (PIZZAORDER (TOPPING garlic powder ) (NUMBER fourteen ) )
Parts found: ['(PIZZAORDER (TOPPING garlic powder ) (NUMBER fourteen ) )']
Parts found: ['PIZZAORDER', '(TOPPING garlic powder )', '(NUMBER fourteen )']
Processing token: (TOPPING garlic powder )
Processing token: (NUMBER fourteen )
Parsing element: (DRINKORDER (NUMBER 6 ) (DRINKTYPE diet ice teas ) )
Parts found: ['(DRINKORDER (NUMBER 6 ) (DRINKTYPE diet ice teas ) )']
Parts found: ['DRINKORDER', '(NUMBER 6 )', '(DRINKTYPE diet ice teas )']
Processing drink token: (NUMBER 6 )
Processing drink token: (DRINKTYPE diet ice teas )
{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "fourteen",
        "SIZE": null,
        "STYLE": nul

In [2]:


input_text = "(ORDER (PIZZAORDER (TOPPING garlic powder ) (NUMBER fourteen ) ) (DRINKORDER (NUMBER 6 ) (DRINKTYPE diet ice teas ) ) )"
result = sanitize_input(input_text)
result

'(ORDER (PIZZAORDER (NUMBER two ) ) )'

In [4]:
input_text = "(ORDER okay i'm all set i need (PIZZAORDER (NUMBER a ) pizza and make it (SIZE medium ) for toppings i'd like (TOPPING mushrooms ) (TOPPING pepperoni ) and i'd like (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) on there beautiful )"

result = parse_order(input_text)
print(json.dumps(result, indent=2))

{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "a ",
        "SIZE": "medium ",
        "STYLE": null,
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "mushrooms "
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepperoni "
          },
          {
            "NOT": false,
            "Quantity": "extra ",
            "Topping": "cheese "
          }
        ]
      }
    ],
    "DRINKORDER": []
  }
}


In [10]:
input_text = '(ORDER (PIZZAORDER (NUMBER one) (SIZE large) (STYLE thin crust) (TOPPING cheese) (TOPPING pepperoni) ) (PIZZAORDER (NUMBER two) (SIZE medium) (STYLE deep dish) (NOT (TOPPING mushrooms) ) (COMPLEX_TOPPING (QUANTITY extra) (TOPPING olives) ) ) (DRINKORDER (NUMBER five) (VOLUME one liter) (DRINKTYPE lemon ice tea) (CONTAINERTYPE bottles) )(DRINKORDER (NUMBER three) (VOLUME two liters) (DRINKTYPE cola) (CONTAINERTYPE cans) ) (DRINKORDER (NUMBER three) (VOLUME two liters) (DRINKTYPE cola) (CONTAINERTYPE cans) ) )'

result = parse_order(input_text)
print(json.dumps(result, indent=2))

{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "one",
        "SIZE": "large",
        "STYLE": "thin crust",
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "cheese"
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepperoni"
          }
        ]
      },
      {
        "NUMBER": "two",
        "SIZE": "medium",
        "STYLE": "deep dish",
        "AllTopping": [
          {
            "NOT": true,
            "Quantity": null,
            "Topping": "mushrooms"
          },
          {
            "NOT": false,
            "Quantity": "extra",
            "Topping": "olives"
          }
        ]
      }
    ],
    "DRINKORDER": [
      {
        "NUMBER": "five",
        "VOLUME": "one liter",
        "DRINKTYPE": "lemon ice tea",
        "CONTAINERTYPE": "bottles"
      },
      {
        "NUMBER": "three",
        "VOLUME": "two liters",
   

In [11]:
input_text = '(ORDER place a rush (PIZZAORDER (NUMBER two ) (SIZE large ) (TOPPING pepperoni ) (TOPPING ham ) ) on them (DRINKORDER (NUMBER two ) (CONTAINERTYPE cans ) (DRINKTYPE sprite ) ) )'

result = parse_order(input_text)
print(json.dumps(result, indent=2))

{
  "ORDER": {
    "PIZZAORDER": [
      {
        "NUMBER": "two ",
        "SIZE": "large ",
        "STYLE": null,
        "AllTopping": [
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "pepperoni "
          },
          {
            "NOT": false,
            "Quantity": null,
            "Topping": "ham "
          }
        ]
      }
    ],
    "DRINKORDER": [
      {
        "NUMBER": "two ",
        "VOLUME": null,
        "DRINKTYPE": "sprite ",
        "CONTAINERTYPE": "cans "
      }
    ]
  }
}


### Save TOP-DECOUPLED

In [6]:

# Extract parsed data from the DataFrame
parsed_data = []
for _, row in df.iterrows():
    if "train.TOP-DECOUPLED" in row:
        parsed_entry = parse_order(row["train.TOP-DECOUPLED"])
        parsed_data.append(parsed_entry)

# Save the parsed data to a file
output_path = "../dataset/parsed_order_data.json"
with open(output_path, "w") as parsed_file:
    json.dump(parsed_data, parsed_file)

In [None]:
del parsed_data

In [None]:
del src_data

In [13]:
del df
del dev

In [None]:

del X_test_processed
del X_train_processed
del y_test_processed
del y_train_processed

### Load train.TOP-DECOUPLED

In [None]:
import json

# Step 1: Open the JSON file containing multiple objects
with open('../dataset/parsed_order_data.json', 'r') as file:
    # Step 2: Read each JSON object (assuming each JSON object is on a new line)
    for line in file:
        data = json.loads(line.strip())  # Parse the JSON object

In [None]:
data

### Turn TOP into TOP-DECOUPLED

In [5]:
import re
def clean_text(text):
    """
    Remove special characters and unnecessary symbols from text.
    """
    #stop_words = set(stopwords.words('english')) # takes much time
    stop_words = [
    "an", "the", "and", "or", "but", "if", "in",  "at", 
    "by", "from", "to", "of", "for", "this", "that", "those", "these", 
    "can", "could", "would", "should", "will", "might", "may", "i", "you", 
    "we", "he", "she", "it", "they", "is", "are", "was", "were", "be", 
    "been", "have", "has", "had", "please","'", "d","without", "with", "any", "s", "no"
    ]#### i'd with without no "on", "with", ,    "a", 
    custom_remove = [
    r"please",
    r"thank\s?you", 
    r"kindly", 
    r"just", 
    r"really",
    r"actually",
    r"like",
    r"want",
    r"pizza",
    r"pie",
    r"need",
    r"hold",
    r"also",
    r"hate",
    r"avoid",
    ]
    # Remove special characters
    #text = re.sub(r"[^\w\s]", " ", text)  # Remove punctuation and special characters
    # Remove "i'd"
    text = re.sub(r"\bi'd\b", "", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    # Remove stopwords
    if stop_words:
        text = " ".join([word for word in text.split() if word.lower() not in stop_words])
    # Remove custom characters or substrings
    if custom_remove:
        for pattern in custom_remove:
            text = re.sub(pattern, "", text)
    # Remove extra whitespace (convert any 2 or more spaces into 1)
    text = re.sub(r"\s{2,}", " ", text)
    return text


In [77]:
TOP = df['train.TOP'].apply(clean_text)

In [78]:
TOP = TOP.apply(clean_text)## twice
TOP[2]

'(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) ) )'

In [None]:
DECOUPLED = df['train.TOP-DECOUPLED'].apply(clean_text)
DECOUPLED[2]

In [74]:
DECOUPLED[2] == TOP[2]

True

### Ensure Transformation works right

In [79]:
total_sequences = 0
correct_sequences = 0

for src,tgt in zip(TOP, DECOUPLED):

    if src == tgt:
        # print(pred)
        # print(tgt)
        correct_sequences += 1
    else:
        print(src)
        print(tgt)
        
    total_sequences += 1

print(f"Correct {correct_sequences}, Total {total_sequences}")
sequence_accuracy = correct_sequences / total_sequences if total_sequences > 0 else 0
sequence_accuracy * 100

Correct 2456446, Total 2456446


100.0

In [2]:
import pandas as pd
dev = pd.read_json('../dataset/PIZZA_dev.json', lines=True)
X_test = dev['dev.SRC']
y_test = dev['dev.TOP']

In [3]:
import re
def clean_text_dev(text):
    """
    Remove special characters and unnecessary symbols from text.
    """
    #stop_words = set(stopwords.words('english')) # takes much time
    stop_words = [
    "an", "the", "and", "or", "but", "if", "in",  "at", 
    "by", "from", "to", "of", "for", "this", "that", "those", "these", 
    "can", "could", "would", "should", "will", "might", "may", "i", "you", 
    "we", "he", "she", "it", "they", "is", "are", "was", "were", "be", 
    "been", "have", "has", "had", "please","'", "d","without", "with", "any", "s", "no", "i'll", "don't", "let's", "ing"
    ]#### i'd with without no "on", "with", ,    "a", 
    custom_remove = [
    r"please",
    r"thank\s?you", 
    r"kindly", 
    r"just", 
    r"really",
    r"actually",
    r"like",
    r"want",
    r"pizza",
    r"pie",
    r"need",
    r"hold",
    r"also",
    r"hate",
    r"avoid",
    r"order",
    r"get",
    r"me",
    r"ed",
    r"try",
    r"go",
    r"prefer",
    r"lets",
    r"place",
    r"size",
    r"along",
    r"which",
    r"which",
    r"not",
    r"let",
    r"let",
    ]
    # Remove special characters
    #text = re.sub(r"[^\w\s]", " ", text)  # Remove punctuation and special characters
    # Remove "i'd"
    text = re.sub(r"\bi'd\b", "", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    # Remove stopwords
    if stop_words:
        text = " ".join([word for word in text.split() if word.lower() not in stop_words])
    # Remove custom characters or substrings
    if custom_remove:
        for pattern in custom_remove:
            text = re.sub(pattern, "", text)
    # Remove extra whitespace (convert any 2 or more spaces into 1)
    text = re.sub(r"\s{2,}", " ", text)
    return text


In [4]:
TOP = dev['dev.TOP'].apply(clean_text_dev)
TOP = TOP.apply(clean_text_dev)

In [5]:
print(TOP[:,].values)


['(ORDER (PIZZAORDER (NUMBER two ) (SIZE dium ) (TOPPING sausage ) (TOPPING black olives ) ) (PIZZAORDER (NUMBER two ) (SIZE dium ) (TOPPING pepperoni ) (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) (PIZZAORDER (NUMBER three ) (SIZE large ) (TOPPING pepperoni ) (TOPPING sausage ) ) )'
 '(ORDER (PIZZAORDER (NUMBER five ) (SIZE dium ) (TOPPING tomatoes ) (TOPPING ham ) ) )'
 '(ORDER (PIZZAORDER (NUMBER one ) (SIZE large ) (STYLE vearian ) (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING banana peppers ) ) ) )'
 '(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING onion ) (TOPPING pepper ) ) )'
 '(ORDER (PIZZAORDER (NUMBER one ) (TOPPING pesto ) (TOPPING ham ) (NOT (TOPPING olives ) ) ) )'
 '(ORDER (PIZZAORDER (NUMBER one ) (SIZE large ) (TOPPING ham ) (TOPPING bacon ) (TOPPING onions ) (TOPPING black olives ) ) (PIZZAORDER (NUMBER one ) (SIZE dium ) (TOPPING sausage ) (TOPPING onions ) ) (DRINKORDER (NUMBER six ) (SIZE large ) (DRINKTYPE cokes ) ) )'
 '(ORDER (PIZZAORDER (NUMBER 

In [38]:
src_data = [entry for entry in X_test ]
with open("../dataset/dev_src_data.txt", "w") as src_file:
    src_file.write("\n".join(src_data))

In [12]:
row = "(ORDER (PIZZAORDER (NUMBER a ) (SIZE dium ) (TOPPING italian sausage ) (TOPPING onions ) (TOPPING green peppers ) ) (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING pepperoni ) (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) won't drinks )"
parsed_entry = parse_order(row)
parsed_entry

{'ORDER': {'PIZZAORDER': [{'NUMBER': 'a ',
    'SIZE': 'dium ',
    'STYLE': None,
    'AllTopping': [{'NOT': False,
      'Quantity': None,
      'Topping': 'italian sausage '},
     {'NOT': False, 'Quantity': None, 'Topping': 'onions '},
     {'NOT': False, 'Quantity': None, 'Topping': 'green peppers '}]},
   {'NUMBER': 'a ',
    'SIZE': 'large ',
    'STYLE': None,
    'AllTopping': [{'NOT': False, 'Quantity': None, 'Topping': 'pepperoni '},
     {'NOT': False, 'Quantity': 'extra ', 'Topping': 'cheese '}]}],
  'DRINKORDER': []}}

In [None]:
# Extract parsed data from the DataFrame
parsed_data = []
l = TOP[:,].values
for row in l:
    print(row)
    parsed_entry = parse_order(row)
    parsed_data.append(parsed_entry)

In [None]:
parsed_data

In [None]:

# Save the parsed data to a file
output_path = "../dataset/parsed_dev_order_data.json"
with open(output_path, "w") as parsed_file:
    json.dump(parsed_data, parsed_file)