In [4]:
import os
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer,
                          AutoModelForTokenClassification,
                          DataCollatorForTokenClassification,
                          TrainingArguments,
                          Trainer)
import evaluate

# ----------------------------------------------------------
# Helper function to read CoNLL format data from a file.
# Each sentence is separated by an empty line.
# Assumes token is in the first column and the entity tag in the last column.
# ----------------------------------------------------------
def read_conll(filename):
    sentences = []
    tags = []
    with open(filename, "r", encoding="utf-8") as f:
        tokens = []
        labels = []
        for line in f:
            line = line.strip()
            if line == "":  # end of sentence
                if tokens:
                    sentences.append(tokens)
                    tags.append(labels)
                    tokens = []
                    labels = []
            else:
                # split line; token is first, tag is last column
                parts = line.split()
                if len(parts) >= 2:
                    tokens.append(parts[0])
                    labels.append(parts[-1])
        if tokens:  # if last sentence is not followed by a newline
            sentences.append(tokens)
            tags.append(labels)
    return sentences, tags

# ----------------------------------------------------------
# Load the train, dev, and test data from files.
# ----------------------------------------------------------
train_tokens, train_tags = read_conll(r"D:\LLM\DATA\train.txt")
dev_tokens, dev_tags = read_conll(r"D:\LLM\DATA\dev.txt")
test_tokens, test_tags = read_conll(r"D:\LLM\DATA\test.txt")

train_dataset = Dataset.from_dict({"tokens": train_tokens, "labels": train_tags})
dev_dataset   = Dataset.from_dict({"tokens": dev_tokens, "labels": dev_tags})
test_dataset  = Dataset.from_dict({"tokens": test_tokens, "labels": test_tags})

dataset = DatasetDict({
    "train": train_dataset,
    "validation": dev_dataset,
    "test": test_dataset
})

# ----------------------------------------------------------
# Build the label mapping.
# We extract the set of all unique labels from the training set.
# ----------------------------------------------------------
unique_labels = set()
for seq in train_tags:
    unique_labels.update(seq)
label_list = sorted(list(unique_labels))
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

print("Label set:", label_list)

# ----------------------------------------------------------
# Load the XLM-RoBERTa tokenizer.
# ----------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# ----------------------------------------------------------
# Tokenize the data and align the labels.
# For sub-word tokens, we assign a label only to the first sub-token and -100 to the remaining (ignored in loss).
# ----------------------------------------------------------
def tokenize_and_align_labels(batch):
    tokenized_inputs = tokenizer(batch["tokens"],
                                 truncation=True,
                                 is_split_into_words=True)
    all_labels = []
    for i, labels in enumerate(batch["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[labels[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# Apply the tokenization to the entire dataset.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# ----------------------------------------------------------
# Load the XLM-RoBERTa-base model for token classification.
# Set the number of output labels and provide label mappings.
# ----------------------------------------------------------
model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-base",
                                                        num_labels=len(label_list),
                                                        id2label=id_to_label,
                                                        label2id=label_to_id)

# ----------------------------------------------------------
# Define training arguments.
# ----------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./xlm_roberta_token_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch"  # or remove if your version does not support it either
)

# ----------------------------------------------------------
# Define the data collator for token classification.
# It dynamically pads the input sequences.
# ----------------------------------------------------------
data_collator = DataCollatorForTokenClassification(tokenizer)

# ----------------------------------------------------------
# Define the evaluation metric using the seqeval library.
# ----------------------------------------------------------
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id_to_label[pred] for pred, label in zip(prediction, label_seq) if label != -100]
        for prediction, label_seq in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[label] for pred, label in zip(prediction, label_seq) if label != -100]
        for prediction, label_seq in zip(predictions, labels)
    ]
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# ----------------------------------------------------------
# Initialize the Trainer.
# ----------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ----------------------------------------------------------
# Train the model.
# ----------------------------------------------------------
trainer.train()

# ----------------------------------------------------------
# Evaluate the model on the test set.
# ----------------------------------------------------------
test_results = trainer.evaluate(tokenized_datasets["test"])
print("Test set evaluation:", test_results)

# ----------------------------------------------------------
# (Optional) Predict on the test set.
# ----------------------------------------------------------
predictions, _, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# (Optional) Post-process and print a few example predictions.
for i in range(3):
    tokens = tokenized_datasets["test"][i]["tokens"]
    pred_label_ids = predictions[i]
    # Convert sub-token predictions back to word-level labels.
    word_ids = tokenized_datasets["test"][i].get("word_ids", None)
    if word_ids is None:
        # If word_ids are not stored, re-run tokenizer for the single example.
        encoded = tokenizer(tokens, is_split_into_words=True)
        word_ids = encoded.word_ids()
    word_preds = []
    previous = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        if word_idx != previous:
            word_preds.append(id_to_label[pred_label_ids[idx]])
            previous = word_idx
    print("Tokens:", tokens)
    print("Predicted Labels:", word_preds)
    print()

Label set: ['B-CONST_DIR', 'B-LIMIT', 'B-OBJ_DIR', 'B-OBJ_NAME', 'B-PARAM', 'B-VAR', 'I-CONST_DIR', 'I-LIMIT', 'I-OBJ_NAME', 'I-PARAM', 'I-VAR', 'O']


Map:   0%|          | 0/714 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/290 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
NVIDIA GeForce RTX 5070 Ti with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90 compute_37.
If you want to use the NVIDIA GeForce RTX 5070 Ti GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

  trainer = Trainer(


Step,Training Loss
50,0.9228
100,0.1959


Test set evaluation: {'eval_loss': 0.2724146246910095, 'eval_precision': 0.8777260018639329, 'eval_recall': 0.8166840097121054, 'eval_f1': 0.8461054712065402, 'eval_accuracy': 0.938971528362491, 'eval_runtime': 1.6785, 'eval_samples_per_second': 172.778, 'eval_steps_per_second': 11.32, 'epoch': 3.0}
Tokens: ['-DOCSTART-']
Predicted Labels: ['O']

Tokens: ['A', 'flooring', 'company', 'produces', 'engineered', 'hardwood', 'and', 'vinyl', 'planks', '.', 'Their', 'sales', 'forecasts', 'show', 'an', 'expected', 'demand', 'of', 'at', 'least', '20,000', 'square', 'foot', 'of', 'hardwood', 'and', '10,000', 'square', 'feet', 'of', 'vinyl', 'planks', 'each', 'week', '.', 'To', 'satisfy', 'a', 'shipping', 'contract', ',', 'a', 'total', 'of', 'at', 'least', '60,000', 'square', 'feet', 'of', 'flooring', 'much', 'be', 'shipped', 'each', 'week', '.', 'Due', 'to', 'a', 'labor', 'shortage', 'issue', ',', 'no', 'more', 'than', '50,000', 'square', 'feet', 'of', 'hardwood', 'and', '30,000', 'square', 'fee

In [5]:
# Save the model and tokenizer to the specified directory
model.save_pretrained("./xlmr_lp_model_1")
tokenizer.save_pretrained("./xlmr_lp_model_1")

('./xlmr_lp_model_1\\tokenizer_config.json',
 './xlmr_lp_model_1\\special_tokens_map.json',
 './xlmr_lp_model_1\\tokenizer.json')

In [1]:
import torch
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load the post-trained XLM-RoBERTa model and tokenizer for sub-task 1
model = AutoModelForTokenClassification.from_pretrained("./xlmr_lp_model_1")
tokenizer = AutoTokenizer.from_pretrained("./xlmr_lp_model_1")

def predict_entities(text: str, max_length=512):
    """
    Given a plain-text optimization problem description, this function tokenizes the text,
    runs the model to predict BIO tags, and returns two lists:
      - words: the original tokens (word-level)
      - predicted_tags: the predicted label for each word (taking the first sub-token only)
    """
    # Simple whitespace tokenization: for sub-task 1 the input is a plain text description.
    words = text.split()
    
    # Tokenize the list of words while preserving word boundaries.
    encoded = tokenizer(words,
                        is_split_into_words=True,
                        return_tensors="pt",
                        truncation=True,
                        max_length=max_length)
    encoded = encoded.to(model.device)
    
    # Obtain logits from the model
    with torch.no_grad():
        logits = model(**encoded).logits  # shape: (1, seq_length, num_labels)
    
    # Choose the label with the highest logit for each token (sub-token)
    predictions = np.argmax(logits.cpu().detach().numpy(), axis=2)[0]
    
    # Get the mapping of sub-tokens to original word indices.
    word_ids = encoded.word_ids(batch_index=0)
    
    final_words = []
    final_tags = []
    previous_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        # Only take the first sub-token for each word (to avoid duplicate labels for a single word)
        if word_idx != previous_word_idx:
            final_words.append(words[word_idx])
            final_tags.append(model.config.id2label[predictions[idx]])
            previous_word_idx = word_idx
    return final_words, final_tags

def get_conll_format(words, tags):
    """
    Generate a string in CoNLL format with each token on a new line.
    """
    lines = []
    # DOCSTART header.
    lines.append("-DOCSTART-\t_\t_\tO\n")
    for word, tag in zip(words, tags):
        lines.append(f"{word}\t_\t_\t{tag}")
    return "\n".join(lines)

def save_conll_format(output_str, filename):
    """
    Save the given CoNLL-style string to the specified file.
    """
    with open(filename, "w", encoding="utf-8") as f:
        f.write(output_str)

In [22]:
# --- Sample Input for Sub-task 1 ---
# For sub-task 1 the input is simply the problem description (without additional XML markup).
sample_input = (
    "A man  only eats vegetable and fruits. A serving of vegetables contains 2 units of vitamins and 3 units of minerals. A serving of fruit contains 4 units of vitamins and 1 unit of minerals. He wants to eat at least 20 units of vitamins and 30 units of minerals. If vegetables cost $3 per serving and fruits cost $5 per serving, how many servings of each should he eat to minimize his cost?"
)

# Get predictions from the model.
words, predicted_tags = predict_entities(sample_input)

# Convert the predictions to CoNLL format.
conll_output = get_conll_format(words, predicted_tags)
print("Predicted Output in CoNLL Format:\n")
print(conll_output)

# Save the output to a file.
output_filename = "single_test_output-sub1.conll"
save_conll_format(conll_output, output_filename)
print(f"\nSaved the predicted output to {output_filename}")

Predicted Output in CoNLL Format:

-DOCSTART-	_	_	O

A	_	_	O
man	_	_	O
only	_	_	O
eats	_	_	O
vegetable	_	_	B-VAR
and	_	_	O
fruits.	_	_	B-VAR
A	_	_	O
serving	_	_	O
of	_	_	O
vegetables	_	_	B-VAR
contains	_	_	O
2	_	_	B-PARAM
units	_	_	O
of	_	_	O
vitamins	_	_	O
and	_	_	O
3	_	_	B-PARAM
units	_	_	O
of	_	_	O
minerals.	_	_	O
A	_	_	O
serving	_	_	O
of	_	_	O
fruit	_	_	B-VAR
contains	_	_	O
4	_	_	B-PARAM
units	_	_	O
of	_	_	O
vitamins	_	_	O
and	_	_	O
1	_	_	B-PARAM
unit	_	_	O
of	_	_	O
minerals.	_	_	O
He	_	_	O
wants	_	_	O
to	_	_	O
eat	_	_	O
at	_	_	B-CONST_DIR
least	_	_	I-CONST_DIR
20	_	_	B-LIMIT
units	_	_	O
of	_	_	O
vitamins	_	_	O
and	_	_	O
30	_	_	B-LIMIT
units	_	_	O
of	_	_	O
minerals.	_	_	O
If	_	_	O
vegetables	_	_	B-VAR
cost	_	_	B-OBJ_NAME
$3	_	_	B-PARAM
per	_	_	O
serving	_	_	O
and	_	_	O
fruits	_	_	B-VAR
cost	_	_	B-OBJ_NAME
$5	_	_	B-PARAM
per	_	_	O
serving,	_	_	O
how	_	_	O
many	_	_	O
servings	_	_	O
of	_	_	O
each	_	_	O
should	_	_	O
he	_	_	O
eat	_	_	O
to	_	_	O
minimize	_	_	B-OBJ_DIR
his	_	_	O
cost?	_	_

In [23]:
!python conll2bart_ready.py \
  --conll D:/LLM/NER/nl4opt-subtask1-baseline/single_test_output-sub1.conll \
  --out bart_inputs_single_test.jsonl