### Model 1 data generation

In [None]:
import json
import re

# Annotation keywords that mark an entity inside an order.
ENTITY_KEYS = {
    "NUMBER",
    "SIZE",
    "STYLE",
    "TOPPING",
    "COMPLEX_TOPPING",
    "QUANTITY",
    "VOLUME",
    "DRINKTYPE",
    "CONTAINERTYPE",
}
# Annotation keywords that open an order group.
ORDER_KEYS = {
    "PIZZAORDER",
    "DRINKORDER",
}

def tokenize(s):
    """Split *s* into "(" tokens, ")" tokens, and runs of other non-whitespace characters."""
    return re.findall(r'\(|\)|[^\s()]+', s)

def tokenize2(s):
    """Return the whitespace-separated pieces of *s*.

    Unlike ``tokenize``, parentheses are not split off: "(a" stays one piece.
    """
    pieces = s.split()
    return pieces

def parse_tokens(tokens):
    """Build a nested list structure from a flat token sequence.

    "(" opens a new sub-list, ")" closes the current sub-list and appends it
    to its parent; every other token is appended to the current list as-is.

    Args:
        tokens: iterable of tokens, e.g. the output of ``tokenize``.

    Returns:
        The top-level list; each parenthesised group appears as a nested list.

    Raises:
        ValueError: if the parentheses are unbalanced (a ")" with no open
            group, or a "(" that is never closed).
    """
    stack = []
    current_list = []
    for position, token in enumerate(tokens):
        if token == '(':
            stack.append(current_list)
            current_list = []
        elif token == ')':
            if not stack:
                raise ValueError(f"Unbalanced ')' at token index {position}")
            finished = current_list
            current_list = stack.pop()
            current_list.append(finished)
        else:
            current_list.append(token)
    if stack:
        # Returning here would hand back only the innermost open group and
        # silently drop everything that was parsed before it.
        raise ValueError(f"{len(stack)} unclosed '(' at end of input")
    return current_list
def extract_orders(structure, order_index=1):
    """Collect (token, order_type, sequence) triples for every order group.

    Walks a parsed structure (nested lists of tokens). A list whose first
    element is one of ORDER_KEYS is an order group: it takes the current
    ``order_index`` as its sequence number, and every token returned by
    ``collect_tokens`` for its children is emitted with that number. Any other
    list is simply searched recursively; non-list or empty input yields nothing.

    Args:
        structure: parsed structure, or any nested part of it.
        order_index: sequence number to give the next order group found.

    Returns:
        A ``(triples, next_order_index)`` tuple.
    """
    if not isinstance(structure, list) or not structure:
        return [], order_index

    head = structure[0]
    if isinstance(head, str) and head in ORDER_KEYS:
        kind = "DRINKORDER" if head != "PIZZAORDER" else "PIZZAORDER"
        sequence = order_index
        triples = [
            (tok, kind, sequence)
            for child in structure[1:]
            for tok in collect_tokens(child)
        ]
        return triples, order_index + 1

    # Not an order group itself: look for order groups in every element.
    triples = []
    for child in structure:
        found, order_index = extract_orders(child, order_index)
        triples += found
    return triples, order_index
def collect_tokens(node):
    """Return the leaf tokens under *node*, depth first.

    Parenthesis tokens and tokens for which ``is_structural_key`` is true are
    left out.
    """
    if not isinstance(node, list):
        keep = node not in ["(", ")"] and not is_structural_key(node)
        return [node] if keep else []

    leaves = []
    for child in node:
        leaves += collect_tokens(child)
    return leaves

def is_structural_key(token):
    """Tell whether *token* is one of the annotation keywords listed below."""
    structural = (
        "ORDER",
        "PIZZAORDER",
        "DRINKORDER",
        "NUMBER",
        "SIZE",
        "STYLE",
        "TOPPING",
        "COMPLEX_TOPPING",
        "QUANTITY",
        "VOLUME",
        "DRINKTYPE",
        "CONTAINERTYPE",
        "NOT",
    )
    return token in structural


In [None]:

def transform_to_labels(input_array):
    """Encode (token, label, sequence) triples as integer order labels.

    'PIZZAORDER' entries become ``10 + sequence``, 'DRINKORDER' entries become
    ``20 + sequence``, and every other label becomes 0.

    Args:
        input_array: iterable of (token, label, sequence) triples; the token
            itself is ignored.

    Returns:
        A list with one integer per input triple.

    Raises:
        ValueError: if an order's sequence is outside 0..9. Such a value would
            leave the 10-19 / 20-29 ranges that ``transform_labeling`` decodes;
            for example pizza order 10 would be encoded as 20 and read back as
            a drink order.
    """
    labeled_numbers = []

    for _, label, sequence in input_array:
        if label == 'PIZZAORDER':
            base = 10  # Range 10-19 is reserved for pizza orders
        elif label == 'DRINKORDER':
            base = 20  # Range 20-29 is reserved for drink orders
        else:
            # 'O' and any unrecognised label are neutral/irrelevant.
            labeled_numbers.append(0)
            continue

        if not 0 <= sequence <= 9:
            raise ValueError(
                f"Order sequence {sequence} for {label} does not fit the "
                f"{base}-{base + 9} label range"
            )
        labeled_numbers.append(base + sequence)

    return labeled_numbers

# Example call from the notebook.
# NOTE(review): `input_label_sequence` is not defined in the visible part of
# this file — confirm it is created in an earlier cell.
transform_to_labels(input_label_sequence)

# Presumably the cell output recorded for the call above.
[0, 0, 11, 11, 11, 11, 11, 11, 0, 22, 22, 22, 22]

In [None]:
def transform_labeling(labels):
    """Convert numeric order labels into integer BIO tag ids.

    0 maps to the "O" id. A value in 10-19 is a pizza order and a value in
    20-29 is a drink order; the first time a given value is seen it gets the
    "B-" id of its type, and every later occurrence gets the "I-" id.

    Args:
        labels: iterable of integers as produced by ``transform_to_labels``.

    Returns:
        A list of tag ids, one per input label.

    Raises:
        ValueError: for a non-zero label outside 10-29.
    """
    tag_ids = {
        "B-PIZZAORDER": 1,
        "I-PIZZAORDER": 2,
        "B-DRINKORDER": 3,
        "I-DRINKORDER": 4,
        "O": 5
    }

    started = set()  # order numbers that already received their B- tag
    encoded = []

    for label in labels:
        if label == 0:
            encoded.append(tag_ids["O"])
            continue

        if 10 <= label < 20:
            label_type = "PIZZAORDER"
        elif 20 <= label < 30:
            label_type = "DRINKORDER"
        else:
            raise ValueError(f"Invalid label encountered: {label}")

        if label in started:
            encoded.append(tag_ids[f"I-{label_type}"])
        else:
            started.add(label)
            encoded.append(tag_ids[f"B-{label_type}"])

    return encoded


In [None]:
def create_model1_training_data(input_file: str, output_file: str) -> None:
    """Write Model 1 training instances, one JSON object per line.

    Each non-blank line of *input_file* is parsed as JSON. Its "train.SRC" and
    "train.TOP" fields are passed to ``label_input``; the result is encoded
    with ``transform_to_labels`` and then ``transform_labeling``, and a
    ``{"text": ..., "labels": ...}`` object is written to *output_file*.

    Args:
        input_file: path of the JSON-lines input file.
        output_file: path of the JSON-lines output file (overwritten).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks "train.SRC" or "train.TOP".
    """
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue

            record = json.loads(line)
            src = record["train.SRC"]
            top = record["train.TOP"]

            # NOTE(review): label_input is called with two arguments here, but
            # the only definition visible in this file takes a single one —
            # confirm the two-argument version is defined before this runs.
            labeled_input = label_input(src, top)
            numerical_labels = transform_to_labels(labeled_input)
            bio_labels = transform_labeling(numerical_labels)

            training_instance = {
                "text": src,
                "labels": bio_labels
            }

            outfile.write(json.dumps(training_instance) + "\n")

# Input JSON-lines file and the destination for the Model 1 training data.
input_file = "../dataset3/PIZZA_train.json"
output_file = "../dataset3/train_data_model1.json"

# Generate the training data (runs immediately when this cell executes).
create_model1_training_data(input_file,output_file)

### Model 2 data generation

In [None]:
import json
import re
import json
import re

def tokenize(s):
    """Return the tokens of *s*: each parenthesis on its own, plus every
    maximal run of characters that are neither whitespace nor parentheses."""
    return re.findall(r'\(|\)|[^\s()]+', s)

def parse_tokens(tokens):
    """Parse a flat token sequence into a nested list structure.

    "(" starts a new sub-list, ")" finishes the current sub-list and appends
    it to its parent; all other tokens are appended to the current list.

    Args:
        tokens: iterable of tokens, e.g. the output of ``tokenize``.

    Returns:
        The top-level list, with one nested list per parenthesised group.

    Raises:
        ValueError: if the parentheses are unbalanced (a ")" with no open
            group, or a "(" that is never closed).
    """
    stack = []
    current_list = []
    for position, token in enumerate(tokens):
        if token == '(':
            stack.append(current_list)
            current_list = []
        elif token == ')':
            if not stack:
                raise ValueError(f"Unbalanced ')' at token index {position}")
            finished = current_list
            current_list = stack.pop()
            current_list.append(finished)
        else:
            current_list.append(token)
    if stack:
        # Without this check only the innermost open group would be returned
        # and everything parsed before it would be lost silently.
        raise ValueError(f"{len(stack)} unclosed '(' at end of input")
    return current_list

# Entity keys: the annotation keywords whose contents get a B-/I- label.
ENTITY_KEYS = {
    "NUMBER",
    "SIZE",
    "STYLE",
    "TOPPING",
    "COMPLEX_TOPPING",
    "QUANTITY",
    "VOLUME",
    "DRINKTYPE",
    "CONTAINERTYPE",
}

# Order keys: the higher-level groups that contain entities.
ORDER_KEYS = {
    "PIZZAORDER",
    "DRINKORDER",
}

def label_entity_tokens(values, entity_type, negated=False):
    """
    Tag the tokens of one entity with B-/I- labels.

    The first token gets "B-<type>", every following token "I-<type>".
    When *negated* is true the type is prefixed with "NEG_"
    (e.g. "B-NEG_TOPPING").

    values: list of tokens found inside the entity
    entity_type: e.g. "NUMBER", "TOPPING", etc.
    negated: whether the entity sits inside a negation

    Returns a list of (token, label) pairs, one per token.
    """
    marker = "NEG_" if negated else ""
    return [
        (token, ("B-" if position == 0 else "I-") + marker + entity_type)
        for position, token in enumerate(values)
    ]

def flatten_nested_structure(structure, order_context=None, negated=False):
    """
    Recursively flatten a parsed structure into (token, label) pairs,
    keeping the tokens in the order they appear in the input.

    Keys (ORDER, PIZZAORDER, NOT, NUMBER, ...) are emitted as (key, "O");
    a VOLUME key is emitted and labelled as SIZE. Tokens directly under an
    entity key get B-/I- labels from ``label_entity_tokens``; every other
    plain token is labelled "O".

    structure: a token, or a (possibly nested) list as built by parse_tokens
    order_context: the enclosing PIZZAORDER/DRINKORDER key, if any. It is
        passed down the recursion but does not influence the labels.
    negated: True while inside a NOT block; adds the "NEG_" label prefix

    Returns a list of (token, label) pairs.
    """
    if not isinstance(structure, list):
        # A bare token outside any entity.
        return [(structure, "O")]

    if len(structure) == 0:
        return []

    first = structure[0]
    if isinstance(first, list):
        # No key at the start, just a sequence of nested groups.
        result = []
        for elem in structure:
            result.extend(flatten_nested_structure(elem, order_context=order_context, negated=negated))
        return result

    key = first
    if key == "VOLUME":
        # Volumes share the SIZE label.
        key = "SIZE"

    children = structure[1:]
    result = [(key, "O")]

    if key == "ORDER":
        # Top-level wrapper: children start with a clean context.
        child_context, child_negated = None, False
    elif key in ORDER_KEYS:
        child_context, child_negated = key, negated
    elif key == "NOT":
        child_context, child_negated = order_context, True
    elif key in ENTITY_KEYS:
        # Label all direct tokens as one entity (B- for the first, I- for the
        # rest), but emit each one at its own position. Appending them after
        # the nested groups would reorder the output relative to the input
        # whenever a word precedes a nested sub-entity.
        direct_tokens = [elem for elem in children if not isinstance(elem, list)]
        labeled_direct = iter(label_entity_tokens(direct_tokens, key, negated=negated))
        for elem in children:
            if isinstance(elem, list):
                # Nested sub-entities (e.g. QUANTITY/TOPPING inside
                # COMPLEX_TOPPING) label their own tokens.
                result.extend(flatten_nested_structure(elem, order_context=order_context, negated=negated))
            else:
                result.append(next(labeled_direct))
        return result
    else:
        # Unrecognised key: treat it as an ordinary word.
        child_context, child_negated = order_context, negated

    for elem in children:
        result.extend(flatten_nested_structure(elem, order_context=child_context, negated=child_negated))
    return result


def label_input(text):
    """Turn an annotated string into (token, label) pairs for its words.

    The text is tokenized, parsed into a nested structure and flattened with
    ``flatten_nested_structure``; pairs whose token is an entity key, an
    order key, "ORDER" or "NOT" are then dropped.
    """
    structure = parse_tokens(tokenize(text))

    pairs = []
    for node in structure:
        pairs += flatten_nested_structure(node, order_context=None, negated=False)

    def is_keyword(token):
        # Matches the annotation keywords that must not appear in the output.
        return token in ENTITY_KEYS or token in ORDER_KEYS or token == "ORDER" or token == "NOT"

    return [pair for pair in pairs if not is_keyword(pair[0])]

# Example: label one annotated string that contains a negated (NOT) topping.
input_str = "(ORDER i want (PIZZAORDER (NUMBER a ) pizza with (TOPPING pesto ) and (TOPPING mushrooms ) but no (NOT (TOPPING pineapple ) ) ) )"
labeled_output = label_input(input_str)
labeled_output


# Presumably the cell output recorded for the example above.
[('i', 'O'),
 ('want', 'O'),
 ('a', 'B-NUMBER'),
 ('pizza', 'O'),
 ('with', 'O'),
 ('pesto', 'B-TOPPING'),
 ('and', 'O'),
 ('mushrooms', 'B-TOPPING'),
 ('but', 'O'),
 ('no', 'O'),
 ('pineapple', 'B-NEG_TOPPING')]

In [None]:
# Annotation keywords that mark an entity (same set as defined earlier in the file).
ENTITY_KEYS = {
    "NUMBER",
    "SIZE",
    "STYLE",
    "TOPPING",
    "COMPLEX_TOPPING",
    "QUANTITY",
    "VOLUME",
    "DRINKTYPE",
    "CONTAINERTYPE",
}

# Integer id for every B-/I- entity label, plus the "O" label.
# There is no VOLUME entry: SIZE and VOLUME are treated as the same label.
LABEL_MAP = {
    'B-DRINKTYPE': 1,
    'I-DRINKTYPE': 2,
    'B-SIZE': 3,
    'I-SIZE': 4,
    'B-NUMBER': 5,
    'I-NUMBER': 6,
    'B-CONTAINERTYPE': 7,
    'I-CONTAINERTYPE': 8,
    'B-COMPLEX_TOPPING': 9,
    'I-COMPLEX_TOPPING': 10,
    'B-TOPPING': 11,
    'I-TOPPING': 12,
    'B-NEG_TOPPING': 13,
    'I-NEG_TOPPING': 14,
    'B-NEG_STYLE': 15,
    'I-NEG_STYLE': 16,
    'B-STYLE': 17,
    'I-STYLE': 18,
    'B-QUANTITY': 19,
    'I-QUANTITY': 20,
    'O': 21,
}

def transform_to_labels(labeled_output):
    """Replace each (token, label) pair by the LABEL_MAP id of its label.

    A label that has no LABEL_MAP entry gets the id of "O".
    """
    ids = []
    for _token, tag in labeled_output:
        ids.append(LABEL_MAP.get(tag, LABEL_MAP["O"]))
    return ids


def create_model2_training_data(input_file, output_file):
    """Write Model 2 training instances, one JSON object per line.

    Each non-blank line of *input_file* is parsed as JSON. Its "train.TOP"
    field is labelled with ``label_input`` and converted to ids with
    ``transform_to_labels``; the ids are written to *output_file* together
    with the "train.SRC" text as ``{"text": ..., "labels": ...}``.

    Args:
        input_file: path of the JSON-lines input file.
        output_file: path of the JSON-lines output file (overwritten).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks "train.SRC" or "train.TOP".
    """
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue

            record = json.loads(line)
            src_text = record["train.SRC"]
            src_top = record["train.TOP"]
            labeled_output = label_input(src_top)
            numerical_labels = transform_to_labels(labeled_output)
            training_instance = {
                "text": src_text,
                "labels": numerical_labels
            }
            outfile.write(json.dumps(training_instance) + "\n")

# Input JSON-lines file and the destination for the Model 2 training data.
# These rebind the `input_file` / `output_file` names used for Model 1 above.
input_file = "../dataset3/PIZZA_train.json"
output_file = "../dataset3/PIZZA_train_model2.json"

# Process the data (runs immediately when this cell executes).
create_model2_training_data(input_file, output_file)