In [14]:
import json

def read_conll(filename):
    """
    Parse a CoNLL-style file into per-sentence token/label lists.

    Blank lines separate examples. On every other line the first
    whitespace-separated column is the token and the last column is the
    BIO label; lines with fewer than two columns are ignored.

    Returns:
        A list of (tokens, labels) tuples, one per example, in file order.
    """
    examples = []
    cur_tokens, cur_labels = [], []
    with open(filename, 'r', encoding='utf-8') as handle:
        for raw in handle:
            columns = raw.split()
            if not columns:
                # Whitespace-only line: close the example in progress, if any.
                if cur_tokens:
                    examples.append((cur_tokens, cur_labels))
                    cur_tokens, cur_labels = [], []
                continue
            if len(columns) < 2:
                continue
            cur_tokens.append(columns[0])
            cur_labels.append(columns[-1])
    # The file may end without a trailing blank line.
    if cur_tokens:
        examples.append((cur_tokens, cur_labels))
    return examples

def extract_entities(tokens, labels):
    """
    Converts BIO tags into a list of entities.

    Each entity is represented as a dictionary with:
      - "entity_type": the entity category (without the "B-" or "I-" prefix)
      - "entity": the concatenated token span (joined by a space)
      - "start": the start token index in the sentence
      - "end": the end token index (exclusive)

    Tag handling:
      - "O" and malformed tags (no "-" separator) close any open span, so a
        span's text always covers exactly the tokens in [start, end).
      - "I-X" extends the open span only when that span has type X.
      - Any other "<prefix>-X" tag (including "B-X" and a mismatched "I-X")
        closes the open span and starts a new one of type X.
    """
    entities = []
    entity = None        # type of the span currently open, or None
    current_tokens = []  # tokens collected for the open span
    start_idx = None     # token index where the open span began

    def _close(end_idx):
        # Emit the open span (if any) ending at end_idx, then reset the state.
        nonlocal entity, current_tokens, start_idx
        if entity is not None:
            entities.append({
                "entity_type": entity,
                "entity": " ".join(current_tokens),
                "start": start_idx,
                "end": end_idx
            })
        entity, current_tokens, start_idx = None, [], None

    for idx, (token, tag) in enumerate(zip(tokens, labels)):
        # "B-VAR" -> ("B", "-", "VAR"); "O" and other dash-less tags give sep == "".
        prefix, sep, ent_type = tag.partition("-")
        if not sep:
            _close(idx)
            continue

        if prefix == "I" and entity == ent_type:
            # Continuation of the open span.
            current_tokens.append(token)
            continue

        # "B-" tag, or an inconsistent tag (e.g. an "I-" that does not match
        # the open span): finish the previous span and start a new one here.
        _close(idx)
        entity, current_tokens, start_idx = ent_type, [token], idx

    # Catch any remaining entity at the end of the sentence.
    _close(len(tokens))
    return entities

def convert_to_subtask2_format(conll_filename, output_filename):
    """
    Turn a CoNLL file (sub-task 1 data) into a JSON Lines file for sub-task 2.

    One JSON object is written per example, holding:
      - "problem_description": the example's tokens joined by single spaces
      - "entities": the entity dictionaries produced by extract_entities
    """
    parsed = read_conll(conll_filename)
    with open(output_filename, 'w', encoding='utf-8') as sink:
        for sentence_tokens, sentence_labels in parsed:
            # (An "order_mapping" field could be added to this record if needed.)
            record = {
                "problem_description": " ".join(sentence_tokens),
                "entities": extract_entities(sentence_tokens, sentence_labels)
            }
            sink.write(json.dumps(record) + "\n")

# Example usage: convert one CoNLL file to the sub-task 2 JSON Lines format.
# NOTE(review): both paths are absolute, machine-specific Windows paths; the
# variable is called `conll_train_file` but points at `test_output.conll`.
conll_train_file = r"D:\LLM\NER\nl4opt-subtask1-baseline\test_output.conll"      # input: sub-task 1 data (CoNLL format)
output_jsonl_file = r"D:\LLM\NER\nl4opt-subtask1-baseline\train_subtask2_test.jsonl"  # output: one JSON object per line

convert_to_subtask2_format(conll_train_file, output_jsonl_file)
print(f"Converted data written to {output_jsonl_file}")


Converted data written to D:\LLM\NER\nl4opt-subtask1-baseline\train_subtask2_test.jsonl


In [17]:
import json

def read_conll(filename):
    """
    Load examples from a CoNLL-formatted file.

    The token is taken from the first column and the tag from the last
    column of each line; lines with fewer than two columns are skipped and
    empty lines end the current example.

    Returns a list of (tokens, labels) tuples.
    """
    with open(filename, "r", encoding="utf-8") as f:
        rows = [raw.strip() for raw in f]
    # A trailing empty row flushes the last example even when the file does
    # not end with a blank line.
    rows.append("")

    examples = []
    tokens, labels = [], []
    for row in rows:
        if row:
            fields = row.split()
            if len(fields) >= 2:
                tokens.append(fields[0])
                labels.append(fields[-1])
        elif tokens:
            examples.append((tokens, labels))
            tokens, labels = [], []
    return examples

def _make_span(label, span_tokens, begin, stop):
    """Build one span dictionary covering token indices [begin, stop)."""
    return {
        "text": " ".join(span_tokens),
        "token_start": begin,
        "token_end": stop,
        "start": begin,  # simplified proxy for the character offset
        "end": stop,
        "label": label
    }


def extract_entities(tokens, labels):
    """
    Extracts entity spans from a token list with BIO tags.

    Each entity is returned as a dictionary containing:
       - "text": the concatenated tokens (joined with a space)
       - "token_start": the index of the first token of the entity
       - "token_end": the index (exclusive) of the entity
       - "start": the character start (here, token_start is used as a proxy)
       - "end": the character end (here, token_end is used as a proxy)
       - "label": the entity type (without the B- or I- prefix)

    "O" tags and malformed tags (no "-" separator) close any open span, so a
    span's text always matches the tokens in [token_start, token_end). An
    "I-X" tag extends the open span only if that span has type X; every other
    "<prefix>-X" tag ends the open span and starts a new span of type X.
    """
    spans = []
    current_entity = None
    current_tokens = []
    start_idx = None

    for idx, (token, tag) in enumerate(zip(tokens, labels)):
        # "B-VAR" -> ("B", "-", "VAR"); dash-less tags (including "O") give dash == "".
        prefix, dash, ent_type = tag.partition("-")
        well_formed = dash == "-"

        if well_formed and prefix == "I" and current_entity == ent_type:
            current_tokens.append(token)
            continue

        # Every remaining case ends whatever span is currently open.
        if current_entity is not None:
            spans.append(_make_span(current_entity, current_tokens, start_idx, idx))

        if well_formed:
            # "B-" tag or out-of-sequence tag: start a new span at this token.
            current_entity, current_tokens, start_idx = ent_type, [token], idx
        else:
            current_entity, current_tokens, start_idx = None, [], None

    # End of sentence: add the remaining entity, if any.
    if current_entity is not None:
        spans.append(_make_span(current_entity, current_tokens, start_idx, len(tokens)))
    return spans

def build_order_mapping(vars_list):
    """
    Map each variable name in vars_list to its position in the list.

    If a name occurs more than once, the index of its last occurrence wins.
    """
    return {name: position for position, name in enumerate(vars_list)}

def convert_subtask1_to_subtask2(conll_filename, output_filename):
    """
    Converts sub-task 1 output (in CoNLL format) to sub-task 2 input format (JSON).

    The result is written as a single JSON object mapping a document id to an
    example object with these fields:
      - "document": the tokens joined into one string with single spaces
      - "tokens": the token list
      - "spans": the entity spans returned by extract_entities
      - "vars": unique variable names (texts of spans labeled "VAR"), in order
        of first appearance
      - "var_mentions": texts of all "VAR" spans, in order
      - "params": texts of all "PARAM" spans, in order
      - "var_mention_to_first_var": maps each distinct mention text to itself
        (mentions are only matched by exact text)
      - "first_var_to_mentions": maps each distinct mention text to the list of
        its occurrences
      - "obj_declaration": a stub built from "OBJ_DIR"/"OBJ_NAME" spans, with
        "terms" pairing the i-th variable with the i-th parameter
      - "const_declarations": a stub list pairing every "CONST_DIR" span with
        the first "LIMIT" span that starts after it
      - "order_mapping": maps each variable name to its order index

    The document id is the SHA-1 hex digest of the document text, so ids are
    stable across runs (the built-in hash() of a str is salted per process).
    If the same document text occurs more than once, later occurrences get a
    "-1", "-2", ... suffix instead of overwriting the earlier entry.
    """
    import hashlib  # local import: only needed for the stable document id

    examples = read_conll(conll_filename)
    output_dict = {}

    for tokens, labels in examples:
        document = " ".join(tokens)
        spans = extract_entities(tokens, labels)

        # Collect variables and parameters from spans.
        vars_list = []
        var_mentions = []
        params = []
        for span in spans:
            if span["label"] == "VAR":
                var_mentions.append(span["text"])
                if span["text"] not in vars_list:
                    vars_list.append(span["text"])
            elif span["label"] == "PARAM":
                params.append(span["text"])

        # Group mentions by exact text; the first occurrence is the canonical one.
        var_mention_to_first_var = {}
        first_var_to_mentions = {}
        for mention in var_mentions:
            if mention not in var_mention_to_first_var:
                var_mention_to_first_var[mention] = mention
                first_var_to_mentions[mention] = [mention]
            else:
                first_var_to_mentions[mention].append(mention)

        # Stub objective: the last OBJ_DIR span wins; OBJ_NAME spans are concatenated.
        obj_dir = None
        obj_name = None
        for span in spans:
            if span["label"] == "OBJ_DIR":
                obj_dir = span["text"]
            elif span["label"] == "OBJ_NAME":
                if not obj_name:
                    obj_name = span["text"]
                else:
                    obj_name += " " + span["text"]

        # Simple heuristic: assign the first PARAM to the first variable, etc.
        # zip stops at the shorter list, so surplus variables get no term.
        terms = dict(zip(vars_list, params))
        obj_declaration = {
            "type": "objective",
            "direction": obj_dir if obj_dir else "",
            "name": obj_name if obj_name else "",
            "terms": terms
        }

        # Stub constraints: pair each CONST_DIR span with the first LIMIT span
        # that starts after it; CONST_DIR spans without such a LIMIT are skipped.
        const_declarations = []
        for span in spans:
            if span["label"] != "CONST_DIR":
                continue
            limit_span = next(
                (other for other in spans
                 if other["label"] == "LIMIT" and other["token_start"] > span["token_start"]),
                None,
            )
            if limit_span is None:
                continue
            direction_text = span["text"].lower()
            is_minimum = "minimum" in direction_text
            const_declarations.append({
                # Heuristic: the constraint type is guessed from the CONST_DIR wording.
                "type": "ratio" if is_minimum or "no more than" in direction_text else "sum",
                "direction": span["text"],
                "limit": limit_span["text"],
                # In a more complete solution, ratio constraints would also name a variable.
                "operator": "GREATER_OR_EQUAL" if is_minimum else "LESS_OR_EQUAL"
            })

        # Determine the order mapping for variables.
        order_mapping = build_order_mapping(vars_list)

        # Build the JSON object for this example.
        example_json = {
            "document": document,
            "tokens": tokens,
            "spans": spans,
            "vars": vars_list,
            "var_mentions": var_mentions,
            "params": params,
            "var_mention_to_first_var": var_mention_to_first_var,
            "first_var_to_mentions": first_var_to_mentions,
            "obj_declaration": obj_declaration,
            "const_declarations": const_declarations,
            "order_mapping": order_mapping
        }

        # Stable, content-derived id; disambiguate repeated documents so that
        # no example is silently dropped.
        base_id = hashlib.sha1(document.encode("utf-8")).hexdigest()
        doc_id = base_id
        duplicate_no = 1
        while doc_id in output_dict:
            doc_id = f"{base_id}-{duplicate_no}"
            duplicate_no += 1
        output_dict[doc_id] = example_json

    # Save the output as a single JSON object (not JSON Lines).
    with open(output_filename, "w", encoding="utf-8") as out_file:
        json.dump(output_dict, out_file, indent=2)

# Example usage: build the sub-task 2 input from the sub-task 1 CoNLL output.
# NOTE(review): the output file carries a `.jsonl` extension, but
# convert_subtask1_to_subtask2 writes one indented JSON object via json.dump,
# not JSON Lines. Both paths are absolute and machine-specific.
conll_train_file = r"D:\LLM\NER\nl4opt-subtask1-baseline\test_output.conll"          # Sub-task 1 data in CoNLL format
output_jsonl_file = r"D:\LLM\NER\nl4opt-subtask1-baseline\train_subtask2_test_2.jsonl"  # Output file for sub-task 2 input

convert_subtask1_to_subtask2(conll_train_file, output_jsonl_file)
print(f"Conversion complete. Output saved to {output_jsonl_file}")

Conversion complete. Output saved to D:\LLM\NER\nl4opt-subtask1-baseline\train_subtask2_test_2.jsonl


In [19]:
# Show the CoNLL-formatted prediction string.
# NOTE(review): `conll_output` is not assigned in any cell above this one; it
# is only assigned in the prediction cell further down (from get_conll_format),
# so this cell fails on a fresh kernel run top to bottom.
print(conll_output)

-DOCSTART-	_	_	O

Cautious	_	_	O
Asset	_	_	O
Investment	_	_	O
has	_	_	O
a	_	_	O
total	_	_	O
of	_	_	O
$150,000	_	_	O
to	_	_	O
manage	_	_	O
and	_	_	O
decides	_	_	O
to	_	_	O
invest	_	_	O
it	_	_	O
in	_	_	O
money	_	_	B-VAR
market	_	_	I-VAR
fund,	_	_	I-VAR
which	_	_	O
yields	_	_	O
a	_	_	O
2%	_	_	B-PARAM
return	_	_	B-OBJ_NAME
as	_	_	O
well	_	_	O
as	_	_	O
in	_	_	O
foreign	_	_	B-VAR
bonds,	_	_	I-VAR
which	_	_	O
gives	_	_	O
and	_	_	O
average	_	_	O
rate	_	_	O
of	_	_	O
return	_	_	B-OBJ_NAME
of	_	_	O
10.2%.	_	_	B-PARAM
Internal	_	_	O
policies	_	_	O
require	_	_	O
PAI	_	_	O
to	_	_	O
diversify	_	_	O
the	_	_	O
asset	_	_	O
allocation	_	_	O
so	_	_	O
that	_	_	O
the	_	_	O
minimum	_	_	B-CONST_DIR
investment	_	_	O
in	_	_	O
money	_	_	B-VAR
market	_	_	I-VAR
fund	_	_	I-VAR
is	_	_	O
40%	_	_	B-LIMIT
of	_	_	O
the	_	_	O
total	_	_	O
investment.	_	_	O
Due	_	_	O
to	_	_	O
the	_	_	O
risk	_	_	O
of	_	_	O
default	_	_	O
of	_	_	O
foreign	_	_	O
countries,	_	_	O
no	_	_	B-CONST_DIR
more	_	_	I-CONST_DIR
than	_	_	I-CONST_DIR
40%	

In [24]:
import json
import re

# --- Helper: Read CoNLL data from file ---
def read_conll(filename):
    """
    Read a CoNLL-format file.

    Returns two parallel lists: the token list of every sentence and the
    matching tag list of every sentence. Sentences are separated by blank
    lines; the first column of a line is the token, the last column is the
    tag, and lines with fewer than two columns are ignored.
    """
    sentences, tags = [], []
    pairs = []  # (token, tag) pairs of the sentence being read
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if fields:
                if len(fields) >= 2:
                    pairs.append((fields[0], fields[-1]))
            elif pairs:
                sentences.append([tok for tok, _ in pairs])
                tags.append([tag for _, tag in pairs])
                pairs = []
    # Flush a final sentence that is not followed by a blank line.
    if pairs:
        sentences.append([tok for tok, _ in pairs])
        tags.append([tag for _, tag in pairs])
    return sentences, tags


# --- Helper: Convert a BIO-tagged sentence to an XML-like annotated string ---
def bio_to_xml(tokens, bio_tags):
    """
    Convert tokens and their BIO tags to an XML-like annotated string.

    For example, tokens = ['A', 'foldable', 'cell-phone'] with tags
    ['O', 'B-VAR', 'I-VAR'] yields "A <VAR>foldable cell-phone</VAR>".

    Tag handling:
      - "B-X" opens a new <X> span (closing any span that is open).
      - "I-X" continues an open <X> span; if no <X> span is open it is
        treated like "B-X".
      - Everything else ("O", tags without "-", tags whose prefix is neither
        "B" nor "I") is emitted as a plain token and closes any open span,
        so no input token is ever dropped from the output.
      - The tag is split on the first "-" only, so an entity type may itself
        contain "-".

    Tokens are joined with single spaces and are NOT XML-escaped.
    """
    output_tokens = []
    current_entity = None  # e.g., "VAR", "CONST_DIR", etc.

    for token, tag in zip(tokens, bio_tags):
        prefix, dash, entity_type = tag.partition("-")
        is_entity_tag = dash == "-" and prefix in ("B", "I")

        if not is_entity_tag:
            # Plain token: close any open entity span first.
            if current_entity is not None:
                output_tokens[-1] += f"</{current_entity}>"
                current_entity = None
            output_tokens.append(token)
        elif prefix == "I" and current_entity == entity_type:
            # Continue the entity span.
            output_tokens.append(token)
        else:
            # "B-" tag, or an "I-" tag that does not match the open span:
            # close what is open and start a new span at this token.
            if current_entity is not None:
                output_tokens[-1] += f"</{current_entity}>"
            output_tokens.append(f"<{entity_type}>{token}")
            current_entity = entity_type

    # Close any open entity span at the end.
    if current_entity is not None:
        output_tokens[-1] += f"</{current_entity}>"
    # Join tokens with a single space.
    return " ".join(output_tokens)


# --- Main script: Process train.txt and produce train_bart_ready_1.jsonl ---
def main(input_file=r"D:\LLM\DATA\train.txt",
         output_file=r"D:\LLM\DATA\train_bart_ready_1.jsonl"):
    """
    Convert a CoNLL file into a JSON Lines file of XML-annotated examples.

    Args:
        input_file: path of the CoNLL-format input. The default is the
            original hard-coded training file.
        output_file: path of the JSON Lines file to write. The default is the
            original hard-coded output path.

    Each output line is a JSON object with:
      - "input": the XML-like annotation produced by bio_to_xml
      - "original_text": the tokens joined with single spaces (an
        approximation of the untokenized text)
    """
    # Read tokenized examples and their BIO tags.
    sentences, tag_sequences = read_conll(input_file)
    print(f"Read {len(sentences)} examples from {input_file}")

    # Build every record before opening the output file, so a conversion
    # error cannot leave a half-written file behind.
    json_objects = [
        {
            "input": bio_to_xml(tokens, tags),
            "original_text": " ".join(tokens)  # kept for reference
        }
        for tokens, tags in zip(sentences, tag_sequences)
    ]

    # Write out to a JSONL file.
    with open(output_file, "w", encoding="utf-8") as out_f:
        for obj in json_objects:
            out_f.write(json.dumps(obj) + "\n")

    print(f"Saved {len(json_objects)} examples to {output_file}")

if __name__ == "__main__":
    main()

Read 714 examples from D:\LLM\DATA\train.txt
Saved 714 examples to D:\LLM\DATA\train_bart_ready_1.jsonl


In [4]:
import os
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer,
                          AutoModelForTokenClassification,
                          DataCollatorForTokenClassification,
                          TrainingArguments,
                          Trainer)
import evaluate

# ----------------------------------------------------------
# Helper function to read CoNLL format data from a file.
# Each sentence is separated by an empty line.
# Assumes token is in the first column and the entity tag in the last column.
# ----------------------------------------------------------
def read_conll(filename):
    """
    Read CoNLL-format data and return (sentences, tags).

    `sentences` holds one token list per sentence and `tags` the parallel
    label lists. A blank line ends a sentence; the token is the first column
    and the label the last column of a line. Lines with a single column are
    skipped.
    """
    sentences = []
    tags = []
    pending = {"tokens": [], "labels": []}

    def flush():
        # Store the sentence in progress (if it has any tokens) and start anew.
        if pending["tokens"]:
            sentences.append(pending["tokens"])
            tags.append(pending["labels"])
            pending["tokens"], pending["labels"] = [], []

    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            cols = line.strip().split()
            if not cols:
                flush()
            elif len(cols) > 1:
                pending["tokens"].append(cols[0])
                pending["labels"].append(cols[-1])
    # The last sentence may not be followed by a blank line.
    flush()
    return sentences, tags

# ----------------------------------------------------------
# Load the train, dev, and test data from files.
# Each read_conll call returns (sentences, tags) as parallel lists.
# NOTE(review): absolute, machine-specific paths.
# ----------------------------------------------------------
train_tokens, train_tags = read_conll(r"D:\LLM\DATA\train.txt")
dev_tokens, dev_tags = read_conll(r"D:\LLM\DATA\dev.txt")
test_tokens, test_tags = read_conll(r"D:\LLM\DATA\test.txt")

# Wrap each split in a Dataset with "tokens" and (string) "labels" columns.
train_dataset = Dataset.from_dict({"tokens": train_tokens, "labels": train_tags})
dev_dataset   = Dataset.from_dict({"tokens": dev_tokens, "labels": dev_tags})
test_dataset  = Dataset.from_dict({"tokens": test_tokens, "labels": test_tags})

# The dev split is exposed under the "validation" key.
dataset = DatasetDict({
    "train": train_dataset,
    "validation": dev_dataset,
    "test": test_dataset
})

# ----------------------------------------------------------
# Build the label mapping.
# We extract the set of all unique labels from the training set only.
# NOTE(review): a label that occurs only in dev/test is missing from
# label_to_id, and tokenize_and_align_labels would raise KeyError on it.
# ----------------------------------------------------------
unique_labels = set()
for seq in train_tags:
    unique_labels.update(seq)
# Sorting makes the label -> id assignment deterministic for a given label set.
label_list = sorted(list(unique_labels))
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

print("Label set:", label_list)

# ----------------------------------------------------------
# Load the XLM-RoBERTa tokenizer.
# ----------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# ----------------------------------------------------------
# Tokenize the data and align the labels.
# For sub-word tokens, we assign a label only to the first sub-token and -100 to the remaining (ignored in loss).
# ----------------------------------------------------------
def tokenize_and_align_labels(batch):
    """
    Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    For every word, only its first sub-token receives the word's label id
    (looked up in the module-level `label_to_id`); special tokens and the
    remaining sub-tokens of a word get -100. The aligned ids are stored under
    the "labels" key of the returned encoding.
    """
    encodings = tokenizer(batch["tokens"],
                          truncation=True,
                          is_split_into_words=True)
    aligned = []
    for row, word_labels in enumerate(batch["labels"]):
        row_label_ids = []
        last_word = None
        for word_idx in encodings.word_ids(batch_index=row):
            if word_idx is None or word_idx == last_word:
                # Special token, or a non-initial sub-token of the same word.
                row_label_ids.append(-100)
            else:
                row_label_ids.append(label_to_id[word_labels[word_idx]])
            last_word = word_idx
        aligned.append(row_label_ids)
    encodings["labels"] = aligned
    return encodings

# Apply the tokenization to every split of the dataset, in batches.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# ----------------------------------------------------------
# Load the XLM-RoBERTa-base model for token classification.
# Set the number of output labels and provide label mappings, so the
# label names are stored in the model config.
# ----------------------------------------------------------
model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-base",
                                                        num_labels=len(label_list),
                                                        id2label=id_to_label,
                                                        label2id=label_to_id)

# ----------------------------------------------------------
# Define training arguments.
# NOTE(review): no evaluation strategy is set here; presumably validation
# metrics are then only computed by explicit trainer.evaluate() calls —
# confirm against the installed transformers version. No seed is set here
# either.
# ----------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./xlm_roberta_token_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch"  # drop this argument if the installed version rejects it
)

# ----------------------------------------------------------
# Define the data collator for token classification.
# It dynamically pads the input sequences.
# ----------------------------------------------------------
data_collator = DataCollatorForTokenClassification(tokenizer)

# ----------------------------------------------------------
# Define the evaluation metric using the seqeval library.
# ----------------------------------------------------------
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """
    Compute seqeval precision/recall/F1/accuracy for a (predictions, labels) pair.

    Predictions are arg-maxed over axis 2. Positions whose gold label is -100
    are dropped from both the predicted and the gold sequences before ids are
    mapped back to label strings through the module-level `id_to_label`.
    """
    scores, gold = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold):
        # Keep only positions that carry a real label (skip ignored index -100).
        kept = [(pr, gd) for pr, gd in zip(pred_row, gold_row) if gd != -100]
        true_predictions.append([id_to_label[pr] for pr, _ in kept])
        true_labels.append([id_to_label[gd] for _, gd in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    metric_names = ("precision", "recall", "f1", "accuracy")
    return {name: results[f"overall_{name}"] for name in metric_names}

# ----------------------------------------------------------
# Initialize the Trainer.
# NOTE(review): the captured output contains a stray `trainer = Trainer(`
# line, which looks like the tail of a warning pointing at this call —
# possibly about the `tokenizer=` argument; confirm against the installed
# transformers version.
# ----------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ----------------------------------------------------------
# Train the model.
# ----------------------------------------------------------
trainer.train()

# ----------------------------------------------------------
# Evaluate the model on the test set.
# ----------------------------------------------------------
test_results = trainer.evaluate(tokenized_datasets["test"])
print("Test set evaluation:", test_results)

# ----------------------------------------------------------
# (Optional) Predict on the test set.
# `predictions` starts as the raw model output and is then replaced by the
# arg-max label ids.
# ----------------------------------------------------------
predictions, _, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# (Optional) Post-process and print a few example predictions.
# NOTE(review): range(3) assumes the test split has at least three examples.
for i in range(3):
    tokens = tokenized_datasets["test"][i]["tokens"]
    pred_label_ids = predictions[i]
    # Convert sub-token predictions back to word-level labels.
    # tokenize_and_align_labels does not add a "word_ids" column, so this
    # lookup presumably always yields None and the fallback below runs.
    word_ids = tokenized_datasets["test"][i].get("word_ids", None)
    if word_ids is None:
        # If word_ids are not stored, re-run tokenizer for the single example.
        # NOTE(review): unlike tokenize_and_align_labels, this call does not
        # pass truncation=True, so for an over-long example word_ids could be
        # longer than the prediction row.
        encoded = tokenizer(tokens, is_split_into_words=True)
        word_ids = encoded.word_ids()
    word_preds = []
    previous = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            # Special token: no word to label.
            continue
        if word_idx != previous:
            # First sub-token of a new word carries the word-level prediction.
            word_preds.append(id_to_label[pred_label_ids[idx]])
            previous = word_idx
    print("Tokens:", tokens)
    print("Predicted Labels:", word_preds)
    print()

Label set: ['B-CONST_DIR', 'B-LIMIT', 'B-OBJ_DIR', 'B-OBJ_NAME', 'B-PARAM', 'B-VAR', 'I-CONST_DIR', 'I-LIMIT', 'I-OBJ_NAME', 'I-PARAM', 'I-VAR', 'O']


Map:   0%|          | 0/714 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/290 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
NVIDIA GeForce RTX 5070 Ti with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90 compute_37.
If you want to use the NVIDIA GeForce RTX 5070 Ti GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

  trainer = Trainer(


Step,Training Loss
50,0.9228
100,0.1959


Test set evaluation: {'eval_loss': 0.2724146246910095, 'eval_precision': 0.8777260018639329, 'eval_recall': 0.8166840097121054, 'eval_f1': 0.8461054712065402, 'eval_accuracy': 0.938971528362491, 'eval_runtime': 1.6785, 'eval_samples_per_second': 172.778, 'eval_steps_per_second': 11.32, 'epoch': 3.0}
Tokens: ['-DOCSTART-']
Predicted Labels: ['O']

Tokens: ['A', 'flooring', 'company', 'produces', 'engineered', 'hardwood', 'and', 'vinyl', 'planks', '.', 'Their', 'sales', 'forecasts', 'show', 'an', 'expected', 'demand', 'of', 'at', 'least', '20,000', 'square', 'foot', 'of', 'hardwood', 'and', '10,000', 'square', 'feet', 'of', 'vinyl', 'planks', 'each', 'week', '.', 'To', 'satisfy', 'a', 'shipping', 'contract', ',', 'a', 'total', 'of', 'at', 'least', '60,000', 'square', 'feet', 'of', 'flooring', 'much', 'be', 'shipped', 'each', 'week', '.', 'Due', 'to', 'a', 'labor', 'shortage', 'issue', ',', 'no', 'more', 'than', '50,000', 'square', 'feet', 'of', 'hardwood', 'and', '30,000', 'square', 'fee

In [2]:
# Save the model and tokenizer to the specified directory.
# NOTE(review): this relies on `model` and `tokenizer` still being defined in
# the kernel (they come from the training cell). The NameError recorded below
# shows the cell was run in a session where `model` did not exist, so that
# run saved nothing.
model.save_pretrained("./xlmr_lp_model_1")
tokenizer.save_pretrained("./xlmr_lp_model_1")

NameError: name 'model' is not defined

In [4]:
import torch
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load the post-trained XLM-RoBERTa model and tokenizer for sub-task 1.
# "./xlmr_lp_model_1" is the path the save cell writes to.
# NOTE(review): the HFValidationError recorded for this cell reports the path
# being validated as a Hub repo id — presumably because the directory did not
# exist (the save cell's recorded run failed); confirm the directory is
# present before running this cell.
model = AutoModelForTokenClassification.from_pretrained("./xlmr_lp_model_1")
tokenizer = AutoTokenizer.from_pretrained("./xlmr_lp_model_1")

def predict_entities(text: str, max_length=512):
    """
    Predict one BIO tag per whitespace-separated word of `text`.

    The text is split on whitespace, tokenized with word boundaries preserved
    (truncated to `max_length` sub-tokens) and passed through the module-level
    `model`. For every word the prediction of its first sub-token is used.

    Returns:
        (final_words, final_tags): the words that received a prediction and
        their predicted label strings, as two parallel lists.
    """
    # Simple whitespace tokenization of the plain-text description.
    words = text.split()

    batch = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(model.device)

    # Forward pass without gradient tracking; logits: (1, seq_length, num_labels).
    with torch.no_grad():
        logits = model(**batch).logits

    # Highest-scoring label id for each sub-token of the single example.
    label_ids = np.argmax(logits.cpu().detach().numpy(), axis=2)[0]

    final_words, final_tags = [], []
    seen_word = None
    for position, word_idx in enumerate(batch.word_ids(batch_index=0)):
        # Skip special tokens and the non-initial sub-tokens of a word.
        if word_idx is None or word_idx == seen_word:
            continue
        final_words.append(words[word_idx])
        final_tags.append(model.config.id2label[label_ids[position]])
        seen_word = word_idx
    return final_words, final_tags

def get_conll_format(words, tags):
    """
    Render words and tags as a CoNLL-style string.

    The output starts with a -DOCSTART- header followed by an empty line,
    then one "word<TAB>_<TAB>_<TAB>tag" line per word. No trailing newline
    is added after the last word.
    """
    # The header carries its own newline, so joining yields a blank line after it.
    header = "-DOCSTART-\t_\t_\tO\n"
    rows = [f"{word}\t_\t_\t{tag}" for word, tag in zip(words, tags)]
    return "\n".join([header, *rows])

def save_conll_format(output_str, filename):
    """
    Write a CoNLL-style string to `filename` as UTF-8 text.

    An existing file at that path is overwritten.
    """
    with open(filename, "w", encoding="utf-8") as handle:
        handle.write(output_str)

# --- Sample Input for Sub-task 1 ---
# For sub-task 1 the input is simply the problem description (without additional XML markup).
sample_input = (
    "A farmer has 100 acres to plant corn and wheat; corn yields $200/acre and uses 3 fertilizer units, wheat yields $300/acre and uses 2 fertilizer units, and only 240 fertilizer units are available—how many acres of each should be planted to maximize profit?"
)

# Get predictions from the model.
# predict_entities splits on whitespace only, so punctuation stays attached
# to the neighbouring word (e.g. "wheat;").
words, predicted_tags = predict_entities(sample_input)

# Convert the predictions to CoNLL format.
conll_output = get_conll_format(words, predicted_tags)
print("Predicted Output in CoNLL Format:\n")
print(conll_output)

# Save the output to a file (relative path: written to the kernel's current
# working directory).
output_filename = "single_test_output-sub1.conll"
save_conll_format(conll_output, output_filename)
print(f"\nSaved the predicted output to {output_filename}")

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './xlmr_lp_model_1'.

In [6]:
# Run the external conll2bart_ready.py script on the single-example CoNLL
# prediction file, writing bart_inputs_single_test.jsonl.
# NOTE(review): conll2bart_ready.py is not part of this notebook — presumably
# it performs the same CoNLL -> XML-annotated JSONL conversion as main()
# above; confirm. The --conll path is absolute and machine-specific.
!python conll2bart_ready.py \
  --conll D:/LLM/NER/nl4opt-subtask1-baseline/single_test_output-sub1.conll \
  --out bart_inputs_single_test.jsonl