In [16]:
import os
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np
from seqeval.metrics import classification_report

In [17]:
# Dataset path
data_dir = r"D:\LLM\DATA"

def read_ner_file(filename):
    path = os.path.join(data_dir, filename)
    with open(path, 'r', encoding='utf-8') as f:
        raw = f.read().strip().split("-DOCSTART-")[1:]
    examples = []
    for block in raw:
        lines = block.strip().split("\n")
        tokens, labels = [], []
        for line in lines:
            if line.strip() == "":
                continue
            parts = line.split()
            if len(parts) == 2:
                token, label = parts
            elif len(parts) == 3:
                token, _, label = parts
            else:
                continue  # skip malformed lines
            tokens.append(token)
            labels.append(label)
        if tokens and labels:
            examples.append({"tokens": tokens, "ner_tags": labels})
    return examples

ner_dataset = DatasetDict({
    "train": Dataset.from_list(read_ner_file("train.txt")),
    "validation": Dataset.from_list(read_ner_file("dev.txt")),
    "test": Dataset.from_list(read_ner_file("test.txt")),
})

label_list = sorted(set(label for ex in ner_dataset["train"] for label in ex["ner_tags"]))
label_to_id = {label: i for i, label in enumerate(label_list)}

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            else:
                label_ids.append(label_to_id[label[word_idx]] if label[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [40]:
import json
from pathlib import Path

# Paths
input_path = Path(r"D:\LLM\DATA\test.jsonl")
output_path = Path(r"D:\LLM\DATA\test_bart_ready.jsonl")

# Tags to wrap from spans
TAGS = {
    "VAR": "var",
    "PARAM": "param",
    "OBJ_NAME": "obj_name",
    "CONST_DIR": "const_dir",
    "LIMIT": "limit",
    "OBJ_DIR": "obj_dir"
}

# Replace spaces in variable names to keep LP output valid
def sanitize(varname):
    return varname.replace(" ", "_")

# Wrap XML tags around the span-marked words
def wrap_spans(text, spans):
    spans = sorted(spans, key=lambda x: x["start"])  # Sort by position
    wrapped = ""
    last_idx = 0
    for span in spans:
        start, end = span["start"], span["end"]
        label = span["label"]
        tag = TAGS.get(label)
        if not tag:
            continue
        wrapped += text[last_idx:start]
        wrapped += f"<{tag}>{text[start:end]}</{tag}>"
        last_idx = end
    wrapped += text[last_idx:]
    return wrapped

# Convert one entry from original format to {"input": ..., "output": ...}
def convert_entry(entry):
    entry = next(iter(entry.values()))  # unwrap hash-keyed entry
    raw_text = entry["document"]
    spans = entry.get("spans", [])
    tagged_input = wrap_spans(raw_text, spans)

    # LP objective
    obj = entry["obj_declaration"]
    obj_str = f"{obj['direction'][:3]}: "
    obj_terms = [f"{coef} {sanitize(var)}" for var, coef in obj.get("terms", {}).items()]
    obj_str += " + ".join(obj_terms)

    # Constraints
    const_strs = []
    for c in entry.get("const_declarations", []):
        if c["type"] == "sum":
            const_strs.append(f"{' + '.join(sanitize(v) for v in obj['terms'].keys())} <= {c['limit']}")
        elif c["type"] == "lowerbound":
            const_strs.append(f"{sanitize(c['var'])} >= {c['limit']}")
        elif c["type"] == "upperbound":
            const_strs.append(f"{sanitize(c['var'])} <= {c['limit']}")

    output = obj_str + "\nst: " + "\n".join(const_strs)

    return {
        "input": tagged_input.strip(),
        "output": output.strip()
    }

# Process all lines
with open(input_path, "r", encoding="utf-8") as fin, open(output_path, "w", encoding="utf-8") as fout:
    for line in fin:
        data = json.loads(line)
        try:
            converted = convert_entry(data)
            fout.write(json.dumps(converted) + "\n")
        except Exception as e:
            print("Error processing line:", e)

print("Converted file saved to:", output_path)


Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
Error processing line: 'x'
E

In [19]:
# posttrain_bart_lp.py
import os
import json
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset, DatasetDict

# Data paths
DATA_DIR = r"D:\LLM\DATA"
train_file = os.path.join(DATA_DIR, "train_bart_ready.jsonl")
dev_file = os.path.join(DATA_DIR, "dev_bart_ready.jsonl")
test_file = os.path.join(DATA_DIR, "test_bart_ready.jsonl")

def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line.strip()) for line in f]

dataset = DatasetDict({
    "train": Dataset.from_list(load_jsonl(train_file)),
    "validation": Dataset.from_list(load_jsonl(dev_file)),
    "test": Dataset.from_list(load_jsonl(test_file)),
})

# Load BART tokenizer & model
checkpoint = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)

# Preprocess (tokenization)
def preprocess(example):
    input_enc = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        label_enc = tokenizer(example["output"], max_length=256, truncation=True, padding="max_length")
    input_enc["labels"] = label_enc["input_ids"]
    return input_enc

tokenized_dataset = dataset.map(preprocess, batched=True)

# Training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_lp_model",
    num_train_epochs=10,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    logging_dir="./logs",
    do_eval=True,
    save_steps=500,
    logging_steps=100,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

trainer.train()


Map:   0%|          | 0/713 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/289 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Step,Training Loss
100,4.2713
200,0.0879
300,0.0425
400,0.0331
500,0.0266
600,0.0182
700,0.017
800,0.0122
900,0.0116
1000,0.0361




TrainOutput(global_step=1790, training_loss=0.2572715645395844, metrics={'train_runtime': 13792.0482, 'train_samples_per_second': 0.517, 'train_steps_per_second': 0.13, 'total_flos': 7725727906529280.0, 'train_loss': 0.2572715645395844, 'epoch': 10.0})

In [20]:
model.save_pretrained("./bart_lp_model")
tokenizer.save_pretrained("./bart_lp_model")

('./bart_lp_model\\tokenizer_config.json',
 './bart_lp_model\\special_tokens_map.json',
 './bart_lp_model\\vocab.json',
 './bart_lp_model\\merges.txt',
 './bart_lp_model\\added_tokens.json')

In [21]:
results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(results)

{'eval_loss': 0.027730919420719147, 'eval_runtime': 12.8924, 'eval_samples_per_second': 22.416, 'eval_steps_per_second': 5.662, 'epoch': 10.0}


In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration
from pathlib import Path

# 1. Use pure local Path object without slashes
model_dir = Path("D:/LLM/BART/bart_lp_model").resolve()

# 2. Confirm required files exist
print("Files in model folder:", list(model_dir.glob("*")))

# 3. Load model + tokenizer safely
tokenizer = BartTokenizer.from_pretrained(str(model_dir), local_files_only=True)
model = BartForConditionalGeneration.from_pretrained(str(model_dir), local_files_only=True)
model.eval()

Files in model folder: [WindowsPath('D:/LLM/BART/bart_lp_model/checkpoint-1500'), WindowsPath('D:/LLM/BART/bart_lp_model/checkpoint-1790'), WindowsPath('D:/LLM/BART/bart_lp_model/config.json'), WindowsPath('D:/LLM/BART/bart_lp_model/generation_config.json'), WindowsPath('D:/LLM/BART/bart_lp_model/merges.txt'), WindowsPath('D:/LLM/BART/bart_lp_model/model.safetensors'), WindowsPath('D:/LLM/BART/bart_lp_model/special_tokens_map.json'), WindowsPath('D:/LLM/BART/bart_lp_model/tokenizer_config.json'), WindowsPath('D:/LLM/BART/bart_lp_model/vocab.json')]


BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [2]:
import torch

def generate_lp(statement: str, max_len=256):
    inputs = tokenizer(statement, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_len, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with a tagged statement:
tagged_input = (
    "A grocery store wants to liquidate its <const_dir>stock</const_dir> of <limit>10</limit> apples, <limit>20</limit> bananas, and <limit>80</limit> grapes. Given past experience, the store knows that they can propose a <var>banana-haters package</var> with <param>6</param> apples and <param>30</param> grapes and that this package will bring a <obj_name>profit</obj_name> of <param>six</param> euros. Similarly, they can prepare a <var>combo package</var> with <param>5</param> apples, <param>6</param> bananas, and <param>20</param> grapes, yielding a <obj_name>profit</obj_name> of <param>seven</param> euros. They know they can sell any quantity of these two packages within the availability of its stock. What quantity of each package, <var>banana-haters packages</var> and <var>combo packages</var>, should the store prepare to <obj_dir>maximize</obj_dir> <obj_name>net profit</obj_name>?"
)

print("Input:\n", tagged_input)
print("Output:\n", generate_lp(tagged_input))


Input:
 A grocery store wants to liquidate its <const_dir>stock</const_dir> of <limit>10</limit> apples, <limit>20</limit> bananas, and <limit>80</limit> grapes. Given past experience, the store knows that they can propose a <var>banana-haters package</var> with <param>6</param> apples and <param>30</param> grapes and that this package will bring a <obj_name>profit</obj_name> of <param>six</param> euros. Similarly, they can prepare a <var>combo package</var> with <param>5</param> apples, <param>6</param> bananas, and <param>20</param> grapes, yielding a <obj_name>profit</obj_name> of <param>seven</param> euros. They know they can sell any quantity of these two packages within the availability of its stock. What quantity of each package, <var>banana-haters packages</var> and <var>combo packages</var>, should the store prepare to <obj_dir>maximize</obj_dir> <obj_name>net profit</obj_name>?
Output:
 max: six banana-haters_package + seven combo_package
st:


In [9]:
# New conversion

import json
from pathlib import Path

# Paths
input_path = Path(r"D:\LLM\DATA\train.jsonl")
output_path = Path(r"D:\LLM\DATA\train_bart_ready_1_bart.jsonl")

# Tags to wrap from spans
TAGS = {
    "VAR": "var",
    "PARAM": "param",
    "OBJ_NAME": "obj_name",
    "CONST_DIR": "const_dir",
    "LIMIT": "limit",
    "OBJ_DIR": "obj_dir"
}

def sanitize(varname):
    return varname.replace(" ", "_")

def wrap_spans(text, spans):
    spans = sorted(spans, key=lambda x: x["start"])
    wrapped = ""
    last_idx = 0
    for span in spans:
        start, end = span["start"], span["end"]
        label = span["label"]
        tag = TAGS.get(label)
        if not tag:
            continue
        wrapped += text[last_idx:start]
        wrapped += f"<{tag}>{text[start:end]}</{tag}>"
        last_idx = end
    wrapped += text[last_idx:]
    return wrapped

def convert_entry(entry):
    entry = next(iter(entry.values()))
    raw_text = entry["document"]
    spans = entry.get("spans", [])
    tagged_input = wrap_spans(raw_text, spans)

    # Objective
    obj = entry["obj_declaration"]
    obj_str = f"{obj['direction'][:3]}: "
    obj_terms = [f"{coef} {sanitize(var)}" for var, coef in obj.get("terms", {}).items()]
    obj_str += " + ".join(obj_terms)

    # Constraints
    const_strs = []
    for c in entry.get("const_declarations", []):
        ctype = c.get("type")

        if ctype == "sum":
            const_strs.append(f"{' + '.join(sanitize(v) for v in obj['terms'].keys())} <= {c['limit']}")

        elif ctype == "lowerbound":
            const_strs.append(f"{sanitize(c['var'])} >= {c['limit']}")

        elif ctype == "upperbound":
            const_strs.append(f"{sanitize(c['var'])} <= {c['limit']}")

        elif ctype == "linear":
            terms = [f"{coef} {sanitize(var)}" for var, coef in c.get("terms", {}).items()]
            operator = {
                "LESS_OR_EQUAL": "<=",
                "GREATER_OR_EQUAL": ">=",
                "EQUAL": "="
            }.get(c.get("operator", ""), "<=")
            const_strs.append(f"{' + '.join(terms)} {operator} {c['limit']}")

        elif ctype == "xby":
            x = sanitize(c["x_var"])
            y = sanitize(c["y_var"])
            const_strs.append(f"{x} >= {c['factor']} {y}")

        elif ctype == "ratio":
            x = sanitize(c["x_var"])
            y = sanitize(c["y_var"])
            direction = c.get("direction", "").lower()
            op = {
                "less than": "<=",
                "greater than": ">=",
                "equal to": "="
            }.get(direction, "<=")
            const_strs.append(f"{x} {op} {c['ratio']} {y}")

        elif ctype == "xy":
            x = sanitize(c["x_var"])
            y = sanitize(c["y_var"])
            direction = c.get("direction", "").lower()
            op = {
                "less than": "<=",
                "greater than": ">=",
                "equal to": "="
            }.get(direction, "<=")
            const_strs.append(f"{x} {op} {y}")

    output = obj_str
    if const_strs:
        output += "\nst: " + "\n     ".join(const_strs)

    return {
        "input": tagged_input.strip(),
        "output": output.strip()
    }

# Run conversion
with open(input_path, "r", encoding="utf-8") as fin, open(output_path, "w", encoding="utf-8") as fout:
    for line in fin:
        data = json.loads(line)
        try:
            converted = convert_entry(data)
            fout.write(json.dumps(converted) + "\n")
        except Exception as e:
            print("Error processing entry:", e)

print(f"Converted file saved to:\n{output_path}")


Error processing entry: 'factor'
Error processing entry: 'x_var'
Error processing entry: 'x_var'
Error processing entry: 'factor'
Error processing entry: 'factor'
Error processing entry: 'factor'
Error processing entry: 'x_var'
Error processing entry: 'x_var'
Error processing entry: 'factor'
Error processing entry: 'x_var'
Error processing entry: 'factor'
Error processing entry: 'factor'
Error processing entry: 'x_var'
Error processing entry: 'factor'
Error processing entry: 'factor'
Error processing entry: 'x_var'
Error processing entry: 'x_var'
Error processing entry: 'factor'
Error processing entry: 'x_var'
Error processing entry: 'x_var'
Error processing entry: 'x_var'
Error processing entry: 'x_var'
Error processing entry: 'x_var'
Error processing entry: 'factor'
Error processing entry: 'factor'
Error processing entry: 'factor'
Error processing entry: 'factor'
Error processing entry: 'factor'
Error processing entry: 'factor'
Error processing entry: 'factor'
Error processing entry:

In [20]:
# posttrain_bart_lp.py
import os
import json
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset, DatasetDict

# Data paths
DATA_DIR = r"D:\LLM\DATA"
train_file = os.path.join(DATA_DIR, "train_bart_ready_1_bart.jsonl")
dev_file = os.path.join(DATA_DIR, "dev_bart_ready_1.jsonl")
test_file = os.path.join(DATA_DIR, "test_bart_ready_1.jsonl")

def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line.strip()) for line in f]

dataset = DatasetDict({
    "train": Dataset.from_list(load_jsonl(train_file)),
    "validation": Dataset.from_list(load_jsonl(dev_file)),
    "test": Dataset.from_list(load_jsonl(test_file)),
})

# Load BART tokenizer & model
checkpoint = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)

# Preprocess (tokenization)
def preprocess(example):
    input_enc = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        label_enc = tokenizer(example["output"], max_length=256, truncation=True, padding="max_length")
    input_enc["labels"] = label_enc["input_ids"]
    return input_enc

tokenized_dataset = dataset.map(preprocess, batched=True)

# Training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_lp_model",
    num_train_epochs=10,
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # safe with 16GB + fp16
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    logging_dir="./logs",
    do_eval=True,
    save_steps=500,
    logging_steps=100,
    predict_with_generate=True,
    fp16=True,  # Recommended for faster and more memory-efficient training
    save_total_limit=2,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

trainer.train()

Map:   0%|          | 0/521 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

NVIDIA GeForce RTX 5070 Ti with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90 compute_37.
If you want to use the NVIDIA GeForce RTX 5070 Ti GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

  trainer = Seq2SeqTrainer(


Step,Training Loss
100,4.4504
200,0.1281
300,0.0664
400,0.0447
500,0.0333
600,0.023
700,0.0169
800,0.0145
900,0.012
1000,0.0083




TrainOutput(global_step=1310, training_loss=0.36783181967796713, metrics={'train_runtime': 493.0485, 'train_samples_per_second': 10.567, 'train_steps_per_second': 2.657, 'total_flos': 5645307488501760.0, 'train_loss': 0.36783181967796713, 'epoch': 10.0})

In [22]:
model.save_pretrained("./bart_lp_model_2")
tokenizer.save_pretrained("./bart_lp_model_2")

SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })

In [32]:
from transformers import BartTokenizer, BartForConditionalGeneration
from pathlib import Path

# 1. Use pure local Path object without slashes
model_dir = Path("D:/LLM/BART/bart_lp_model_2").resolve()

# 2. Confirm required files exist
print("Files in model folder:", list(model_dir.glob("*")))

# 3. Load model + tokenizer safely
tokenizer = BartTokenizer.from_pretrained(str(model_dir), local_files_only=True)
model = BartForConditionalGeneration.from_pretrained(str(model_dir), local_files_only=True)
model.eval()

Files in model folder: [WindowsPath('D:/LLM/BART/bart_lp_model_2/config.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2/generation_config.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2/merges.txt'), WindowsPath('D:/LLM/BART/bart_lp_model_2/model.safetensors'), WindowsPath('D:/LLM/BART/bart_lp_model_2/special_tokens_map.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2/tokenizer_config.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2/vocab.json')]


BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [33]:
import torch

def generate_lp(statement: str, max_len=256):
    inputs = tokenizer(statement, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_len, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with a tagged statement:
tagged_input = (
    "A juice bar sells <var>apple juice</var> and <var>orange juice.</var> They can make <const_dir>at most</const_dir> <limit>120</limit> <var>apple juices</var> and <limit>90</limit> <var>orange juices</var> each day. For customer satisfaction, they must sell <const_dir>at least</const_dir> <limit>40</limit> <var>apple juices</var> and <limit>30</limit> <var>orange juices.</var> Each <var>apple juice</var> <obj_name>earns</obj_name> $3 and each <var>orange juice</var> <obj_name>earns</obj_name> $4. How many of each should they sell to <obj_dir>maximize</obj_dir> <obj_name>profit?</obj_name>"
)

print("Input:\n", tagged_input)
print("Output:\n", generate_lp(tagged_input))


Input:
 A juice bar sells <var>apple juice</var> and <var>orange juice.</var> They can make <const_dir>at most</const_dir> <limit>120</limit> <var>apple juices</var> and <limit>90</limit> <var>orange juices</var> each day. For customer satisfaction, they must sell <const_dir>at least</const_dir> <limit>40</limit> <var>apple juices</var> and <limit>30</limit> <var>orange juices.</var> Each <var>apple juice</var> <obj_name>earns</obj_name> $3 and each <var>orange juice</var> <obj_name>earns</obj_name> $4. How many of each should they sell to <obj_dir>maximize</obj_dir> <obj_name>profit?</obj_name>
Output:
 max: 3 apple_juice + 4.4 orange_jjuice
st: apple_juries <= 120
    + orange_juices <= 90
  + 40
  >= 30


In [4]:
from transformers import BartTokenizer, BartForConditionalGeneration
from pathlib import Path
import json

# 1. Use pure local Path object without slashes
model_dir = Path("D:/LLM/BART/bart_lp_model_2").resolve()

# 2. Confirm required files exist
print("Files in model folder:", list(model_dir.glob("*")))

# 3. Load model + tokenizer safely
tokenizer = BartTokenizer.from_pretrained(str(model_dir), local_files_only=True)
model = BartForConditionalGeneration.from_pretrained(str(model_dir), local_files_only=True)
model.eval()

Files in model folder: [WindowsPath('D:/LLM/BART/bart_lp_model_2/config.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2/generation_config.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2/merges.txt'), WindowsPath('D:/LLM/BART/bart_lp_model_2/model.safetensors'), WindowsPath('D:/LLM/BART/bart_lp_model_2/special_tokens_map.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2/tokenizer_config.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2/vocab.json')]


BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [5]:
from datasets import Dataset
import json
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load new training data
def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line.strip()) for line in f]

new_data_path = Path("D:/LLM/DATA/generated_samples_1000.jsonl")
new_dataset = Dataset.from_list(load_jsonl(new_data_path))

# Preprocess
def preprocess(example):
    inputs = tokenizer(example["input"], padding="max_length", truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["output"], padding="max_length", truncation=True, max_length=256)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = new_dataset.map(preprocess, batched=True)

# Training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_lp_model_2_with_generated_samples_1000",
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    logging_dir="./logs_continue",
    save_steps=500,
    logging_steps=100,
    predict_with_generate=True,
    fp16=True,
    save_total_limit=2,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

NVIDIA GeForce RTX 5070 Ti with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90 compute_37.
If you want to use the NVIDIA GeForce RTX 5070 Ti GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

  trainer = Seq2SeqTrainer(


Step,Training Loss
100,0.0341
200,0.0191
300,0.0167
400,0.0103
500,0.007
600,0.0045
700,0.0037
800,0.0422
900,0.0026
1000,0.0021


TrainOutput(global_step=1250, training_loss=0.015147188782691955, metrics={'train_runtime': 461.682, 'train_samples_per_second': 10.83, 'train_steps_per_second': 2.707, 'total_flos': 5417761505280000.0, 'train_loss': 0.015147188782691955, 'epoch': 5.0})

In [6]:
model.save_pretrained("./bart_lp_model_2_with_generated_samples_1000")
tokenizer.save_pretrained("./bart_lp_model_2_with_generated_samples_1000")

SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })

In [11]:
from transformers import BartTokenizer, BartForConditionalGeneration
from pathlib import Path
import json
import torch

# 1. Use pure local Path object without slashes
model_dir = Path("D:/LLM/BART/bart_lp_model_2_with_generated_samples_1000").resolve()

# 2. Confirm required files exist
print("Files in model folder:", list(model_dir.glob("*")))

# 3. Load model + tokenizer safely
tokenizer = BartTokenizer.from_pretrained(str(model_dir), local_files_only=True)
model = BartForConditionalGeneration.from_pretrained(str(model_dir), local_files_only=True)
model.eval()

Files in model folder: [WindowsPath('D:/LLM/BART/bart_lp_model_2_with_generated_samples_1000/checkpoint-1000'), WindowsPath('D:/LLM/BART/bart_lp_model_2_with_generated_samples_1000/checkpoint-1250'), WindowsPath('D:/LLM/BART/bart_lp_model_2_with_generated_samples_1000/config.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2_with_generated_samples_1000/generation_config.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2_with_generated_samples_1000/merges.txt'), WindowsPath('D:/LLM/BART/bart_lp_model_2_with_generated_samples_1000/model.safetensors'), WindowsPath('D:/LLM/BART/bart_lp_model_2_with_generated_samples_1000/special_tokens_map.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2_with_generated_samples_1000/tokenizer_config.json'), WindowsPath('D:/LLM/BART/bart_lp_model_2_with_generated_samples_1000/vocab.json')]


BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [12]:
# LP generation function
def generate_lp(statement: str, max_len=256):
    inputs = tokenizer(statement, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_len, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Read from file and generate LPs
input_file = Path("D:/LLM/NER/nl4opt-subtask1-baseline/bart_inputs_single_test.jsonl")
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        tagged_input = data.get("input", "")
        if tagged_input:
            print("\nInput:\n", tagged_input)
            print("Output:\n", generate_lp(tagged_input))



Input:
 A bubble tea store sells <var>peach</var> and <var>mango</var> flavored drinks. The store can make <const_dir>at most</const_dir> <limit>788</limit> drinks in total. To stay in business, they must sell <const_dir>at least</const_dir> <limit>53</limit> <var>mango drinks</var> and <limit>89</limit> <var>peach drinks.</var> However, due to fruit shortages, they can make <const_dir>at most</const_dir> <limit>560</limit> <var>mango drinks</var> and <const_dir>at most</const_dir> <limit>64</limit> <var>peach drinks.</var> The <obj_name>profit</obj_name> per <var>mango drink</var> is <param>$3,</param> and the <obj_name>profit</obj_name> per <var>peach drink</var> is <param>$1.</param> How many of each drink should they sell to <obj_dir>maximize</obj_dir> <obj_name>profit?</obj_name>
Output:
 max: 1 mango_drink + 1 peach_name
st: mango_jacket + peach_ink <= 788
   _ mango_jackets >= 53
  peach_drinks <= 560
 _   peach-drinks >= 89
  <= 64
