In [1]:
import subprocess
import json
import re
from openai import OpenAI
from typing import Tuple, Iterator, List, Dict
from pathlib import Path

In [2]:
# Text file holding the OpenAI API key, and the chat model used for every query.
api_key_path = r"../openai_key.txt"
model = "gpt-4o"

with open(api_key_path, 'r') as f:
    # Strip surrounding whitespace: key files usually end with a newline, and a
    # newline inside the Authorization header makes the HTTP request invalid.
    key = f.read().strip()
client = OpenAI(api_key=key)


In [3]:

# Location of the FF planner executable; run_ff passes it to `wsl` as the program to run.
ff_loc = "/home/exale/FF-v2.3/ff"

def run_ff(domain: str, problem: str, ff_loc: str) -> str:
    """Run the FF planner through ``wsl`` and return its textual output.

    Args:
        domain: Path handed to FF's ``-o`` option.
        problem: Path handed to FF's ``-f`` option.
        ff_loc: Path of the FF executable.

    Returns:
        The process's stdout when it exits with status 0, otherwise its stderr.
        Failures are reported as text; nothing is raised for a non-zero exit.
    """
    completed = subprocess.run(
        ["wsl", ff_loc, "-o", domain, "-f", problem],
        capture_output=True,
        text=True,
    )
    return completed.stdout if completed.returncode == 0 else completed.stderr

In [13]:
def run_VAL(domain: str, problem: str, candidate: str) -> str:
    """Run the ``validate`` command on a candidate plan and return its output.

    Args:
        domain: Domain file path (first argument to ``validate``).
        problem: Problem file path (second argument).
        candidate: Plan file path to check (third argument).

    Returns:
        The process's stdout when it exits with status 0, otherwise its stderr.
    """
    completed = subprocess.run(
        ["validate", domain, problem, candidate],
        capture_output=True,
        text=True,
    )
    return completed.stdout if completed.returncode == 0 else completed.stderr

In [4]:
def query_iterator(json_path: str) -> Iterator[Tuple[int, str]]:
    """Lazily yield ``(id, nl_query)`` pairs from a JSON file.

    The file is expected to contain a sequence of objects, each with the keys
    ``"id"`` and ``"nl_query"``; pairs are produced in file order.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)
    yield from ((record["id"], record["nl_query"]) for record in records)

In [5]:
def _extract(text: str, anchor: str) -> str:
    start = text.find(anchor)
    if start == -1:
        raise ValueError(f"Anchor not found: {anchor!r}")
    depth = 0
    end = None
    for i, c in enumerate(text[start:], start):
        if c == '(':
            depth += 1
        elif c == ')':
            depth -= 1
            if depth == 0:
                end = i
                break
    if end is None:
        raise ValueError(f"Unbalanced parentheses for anchor {anchor!r}")
    return text[start:end+1]

In [6]:
def extract_d_p(source: str, domain_out: str, problem_out: str) -> None:
    """Pull the domain and problem definitions out of ``source`` and save them.

    Both definitions are extracted before anything is written, so a failed
    extraction leaves neither output file touched.

    Args:
        source: Text containing a ``(define (domain ...`` and a
            ``(define (problem ...`` expression.
        domain_out: Destination path for the domain text.
        problem_out: Destination path for the problem text.
    """
    anchors = ("(define (domain", "(define (problem")
    extracted = [_extract(source, anchor) for anchor in anchors]
    for destination, content in zip((domain_out, problem_out), extracted):
        Path(destination).write_text(content, encoding='utf-8')

In [7]:
def extract_plan(ff_output: str) -> str:
    """Reduce raw FF output to its numbered plan steps.

    Returns:
        * ``'uns'`` as soon as a scanned line contains ``unsolvable``;
        * otherwise the steps found between the "found legal plan" header and
          the "time spent" line, one ``"<n>: ACTION ARGS"`` entry per line;
        * an empty string when no header or no steps were seen.
    """
    # Accepts both "step    0: ..." and continuation lines such as "    1: ...".
    step_re = re.compile(r'^\s*(?:step\s*)?(\d+:\s*\S.*)$', re.IGNORECASE)

    collected = []
    in_plan = False
    for raw in ff_output.splitlines():
        if 'unsolvable' in raw:
            return 'uns'
        if in_plan:
            # The timing summary marks the end of the plan listing.
            if raw.strip().lower().startswith('time spent'):
                break
            hit = step_re.match(raw)
            if hit is not None:
                collected.append(hit.group(1).strip())
        elif 'ff: found legal plan as follows' in raw.lower():
            in_plan = True
    return "\n".join(collected)

In [None]:
def to_sol(extracted_plan: str, soln_path: str) -> None:
    """Write a plan produced by ``extract_plan`` as a ``.sol`` file for VAL.

    Every ``"<n>: ACTION ARG ..."`` line becomes ``"(action ARG ...)"``: the
    step number is dropped and the action name is lower-cased, while the
    arguments are kept as given. Blank lines are skipped, and an action
    without arguments is written as ``(action)`` with no trailing space.

    Args:
        extracted_plan: Newline-separated plan steps.
        soln_path: Destination file path.

    Raises:
        ValueError: If a non-blank line has no ``:`` separator or no action
            after it (for example the ``'uns'`` marker). Nothing is written
            in that case.
    """
    sol_lines = []
    for line in extracted_plan.splitlines():
        if not line.strip():
            continue
        _, sep, rest = line.partition(':')
        tokens = rest.split()
        if not sep or not tokens:
            raise ValueError(f"Not a plan step: {line!r}")
        action, args = tokens[0].lower(), tokens[1:]
        sol_lines.append(f"({' '.join([action, *args])})")
    Path(soln_path).write_text("\n".join(sol_lines), encoding='utf-8')

In [8]:
def _parse_plan(steps_str: str) -> List[Tuple[str, List[str]]]:
    step_pattern = re.compile(r'^\s*\d+:\s*(.+)$')
    actions: List[Tuple[str, List[str]]] = []
    for line in steps_str.splitlines():
        m = step_pattern.match(line)
        if not m:
            continue
        action_line = m.group(1).strip()
        parts = action_line.split()
        if not parts:
            continue
        name = parts[0]
        args = parts[1:]
        actions.append((name, args))
    return actions

def plan_matches(plan1: str, plan2: str) -> bool:
    """
    Decide whether two extracted plans are the same up to action renaming.

    The plans match when they have the same number of steps, every step pair
    has identical argument lists, and the action names of one plan map onto
    those of the other consistently in both directions (a 1-1 renaming).
    """
    first = _parse_plan(plan1)
    second = _parse_plan(plan2)
    if len(first) != len(second):
        return False

    forward: dict = {}   # action name in plan1 -> action name in plan2
    backward: dict = {}  # action name in plan2 -> action name in plan1
    for (left_name, left_args), (right_name, right_args) in zip(first, second):
        if left_args != right_args:
            return False
        # setdefault records the pairing on first sight and returns the
        # earlier pairing afterwards, so any inconsistency shows up here.
        if forward.setdefault(left_name, right_name) != right_name:
            return False
        if backward.setdefault(right_name, left_name) != left_name:
            return False
    return True


In [9]:
def query_to_pddl(model: str, client: object, query: str, d_out: str, p_out: str) -> None:
    """Send ``query`` as a single user message and save the PDDL in the reply.

    Args:
        model: Model name passed to the chat-completions call.
        client: Client object exposing ``chat.completions.create``.
        query: Full prompt text.
        d_out: Path the extracted domain definition is written to.
        p_out: Path the extracted problem definition is written to.
    """
    user_message = {"role": "user", "content": query}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    # Only the first choice of the reply is used.
    extract_d_p(completion.choices[0].message.content, d_out, p_out)

In [10]:
def gt_match(idx: int, gt_domain: str, gt_problem: str, test_domain: str, test_problem: str) -> Tuple[bool, str]:
    """Solve the ground-truth and candidate PDDL pairs with FF and compare plans.

    Args:
        idx: Instance id. Not used by the comparison; kept so existing callers
            keep working.
        gt_domain: Ground-truth domain file path.
        gt_problem: Ground-truth problem file path.
        test_domain: Candidate domain file path.
        test_problem: Candidate problem file path.

    Returns:
        A ``(matches, candidate_plan)`` tuple: ``matches`` is the result of
        ``plan_matches`` on the two extracted plans, and ``candidate_plan`` is
        the ``extract_plan`` result for the candidate files (plan text,
        ``'uns'``, or an empty string).
    """
    gt_ff_result = run_ff(gt_domain, gt_problem, ff_loc)
    lm_ff_result = run_ff(test_domain, test_problem, ff_loc)

    gt_ff_plan = extract_plan(gt_ff_result)
    lm_ff_output = extract_plan(lm_ff_result)
    return plan_matches(gt_ff_plan, lm_ff_output), lm_ff_output

In [36]:
"""
Main loop running through 30 blocksworld instances from PlanBench on single agent.
"""
lim = 1  # Maximum number of instances to evaluate in this run.
correct = 0
incorrect = []
total = 0  # Instances actually evaluated; used as the accuracy denominator.
query_mode = "pddl"
gt_domain = "./blocksworld/blocksworld_domain.pddl"
task_query = {"pddl": "\nGenerate the contents for a domain.pddl and a problem.pddl file that captures this task.",
              "plan": "\nCome up with a sequence of steps to complete this task."}
with open("./blocksworld/blocksworld_domain_nl.txt", 'r') as f:
    domain_nl = f.read()
json_path = "./blocksworld/blocksworld_problems_nl.json"

# The generated PDDL files are written here, so make sure the directory exists.
Path("./temp").mkdir(parents=True, exist_ok=True)

for idx, p_query in query_iterator(json_path):
    # Stop on the number of instances processed rather than on a particular id,
    # so the limit also holds when ids are not exactly 1..lim.
    if total >= lim:
        break
    query = domain_nl + '\n\n' + p_query + task_query[query_mode]
    domain_out = f"./temp/domain{idx}.pddl"
    problem_out = f"./temp/problem{idx}.pddl"
    query_to_pddl(model, client, query, domain_out, problem_out)
    gt_problem = f"./blocksworld/blocksworld_problems_pddl/instance-{idx}.pddl"
    match_correct, _ = gt_match(idx, gt_domain, gt_problem, domain_out, problem_out)

    total += 1
    if match_correct:
        correct += 1
    else:
        incorrect.append(idx)

# Divide by what was actually run; guard against an empty run.
accuracy = correct / total if total else 0.0
print(f"Correct: {correct}/{total}, Accuracy: {accuracy}")
print(f"Incorrect instances on first iteration: {incorrect}")


Correct: 1/1, Accuracy: 1.0
Incorrect instances on first iteration: []


In [16]:
# Plan FF finds for the generated files of instance 1 (written under ./temp).
# To inspect a ground-truth instance instead, point `domain` / `problem` at
#   "./blocksworld/blocksworld_domain.pddl"
#   "./blocksworld/blocksworld_problems_pddl/instance-10.pddl"
domain, problem = "./temp/domain1.pddl", "./temp/problem1.pddl"
plan_a = extract_plan(run_ff(domain=domain, problem=problem, ff_loc=ff_loc))
print(plan_a)

0: PICK-UP H
1: STACK H L
2: PICK-UP B
3: STACK B H
4: PICK-UP D
5: STACK D B
6: PICK-UP G
7: STACK G D
8: PICK-UP I
9: STACK I G
10: PICK-UP F
11: STACK F I
12: PICK-UP J
13: STACK J F


In [14]:
# Plan FF finds for the ground-truth domain and instance 1.
domain, problem = (
    "./blocksworld/blocksworld_domain.pddl",
    "./blocksworld/blocksworld_problems_pddl/instance-1.pddl",
)
plan_b = extract_plan(run_ff(domain=domain, problem=problem, ff_loc=ff_loc))
print(plan_b)

0: PICK-UP H
1: STACK H L
2: PICK-UP B
3: STACK B H
4: PICK-UP D
5: STACK D B
6: PICK-UP G
7: STACK G D
8: PICK-UP I
9: STACK I G
10: PICK-UP F
11: STACK F I
12: PICK-UP J
13: STACK J F


In [11]:
# Compare the plan from the generated files (plan_a) with the ground-truth plan (plan_b).
# NOTE(review): the saved output of this cell carries execution count 11, which is
# earlier than the counts of the cells that assign plan_a (16) and plan_b (14) as
# shown, so it may not reflect the plans printed above - re-run top to bottom to confirm.
plan_matches(plan_a, plan_b)

False

In [None]:
# Sanity check of the VAL pipeline: write the ground-truth plan (plan_b) as a
# .sol file and validate it. `domain` and `problem` are whatever the most
# recently executed cell assigned to them.
sol_path = "./blocksworld/sols/gt_soln1.sol"
to_sol(extracted_plan=plan_b, soln_path=sol_path)
run_VAL(domain=domain, problem=problem, candidate=sol_path)

'Checking plan: ./blocksworld/sols/gt_soln1.sol\nPlan executed successfully - checking goal\nPlan valid\nFinal value: 14 \n\nSuccessful plans:\nValue: 14\n ./blocksworld/sols/gt_soln1.sol 14 \n\n'

In [13]:
def build_query_map(json_path: str) -> Dict[int, str]:
    """Load the query JSON file into an ``id -> nl_query`` dictionary.

    When the same id appears more than once, the last entry wins.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)
    lookup: Dict[int, str] = {}
    for record in records:
        lookup[record["id"]] = record["nl_query"]
    return lookup

In [None]:
"""
Iterative refinement on failed instances using feedback from FF.
"""
debug = True
incorrect = [1, 7] # Manually set here to avoid running on all 30 again
max_iters = 10  # Cap on refinement rounds per instance so a stuck instance cannot loop forever.
gt_domain = "./blocksworld/blocksworld_domain.pddl"
p_query_path = "./blocksworld/blocksworld_problems_nl.json"
p_query_dict = build_query_map(p_query_path)
with open("./blocksworld/blocksworld_domain_nl.txt", 'r') as f:
    domain_nl = f.read()

fails = set(incorrect)
for fail in sorted(fails):  # sorted for a deterministic processing order

    domain_path = f"./temp/domain{fail}.pddl"
    problem_path = f"./temp/problem{fail}.pddl"
    gt_problem = f"./blocksworld/blocksworld_problems_pddl/instance-{fail}.pddl"

    p_query = p_query_dict[fail]
    # For fully automated, might combine this with the previous loop. Then use matching, output = gt_match(...) instead of defining separately.
    output = extract_plan(run_ff(domain_path, problem_path, ff_loc))

    ### Maybe better to have stateful agents for iterative? Because now we'd need a large context prompt to brief the previous interaction (and only the previous interaction)
    context = "Domain and problem pddl files were generated to capture the following task:\n"
    context = context + domain_nl + p_query

    matching = False
    no_iter = 0  # Refinement rounds completed for THIS instance.
    # Keep refining while the instance still fails and the round budget is not used up.
    while not matching and no_iter < max_iters:
        no_iter += 1

        # Re-read the files every round: query_to_pddl overwrites them, and the
        # feedback below describes the plan produced by the *current* files, so
        # the prompt must show those same files rather than the first attempt.
        domain_pddl = Path(domain_path).read_text(encoding='utf-8')
        problem_pddl = Path(problem_path).read_text(encoding='utf-8')

        if not output: # if empty output i.e. error in FF compile
            feedback = "The files are invalid and could not be solved. Check the syntax provide a new set of corrected pddl files."
        elif output == 'uns': # if can compile but unsolvable
            feedback = "The problem under the provided domain is unsolvable. Check that the actions are not missing preconditions or effects " \
                       "and that the initial and goal conditions match, and provide a new set of corrected pddl files."
        elif output[0] == "0": # compiles but incorrect plan (actions work but are wrong or init/goal wrong (less likely))
            feedback = "The current problem definition results in an incorrect plan, possibly due to incorrect action specifications. " \
                       "Reread the original task statement carefully and provide a new set of corrected pddl files, double checking " \
                       "each action has all necessary preconditions and effects."
        else:
            raise ValueError(f"Unexpected output from plan extraction: {output}")

        query = context + f"\n[Domain]: {domain_pddl}\n\n" + f"[Problem]: {problem_pddl}\n\n" + feedback
        query_to_pddl(model, client, query, domain_path, problem_path)
        # debug: `output` here is still the plan that triggered this round's feedback
        if debug:
            print(f"Output: {output}")
            print(f"Iter: {no_iter}")
            print(f"Id: {fail}")

        # Not efficient for multi-iteration because re-solves gt. But ok for simple cases
        matching, output = gt_match(fail, gt_domain, gt_problem, domain_path, problem_path)

    if matching:
        print(f"{fail} produced correct output in {no_iter} total iterations.")
    else:
        print(f"{fail} still incorrect after {no_iter} iterations; giving up.")


Output: 0: PICK-UP J
1: STACK J F
2: PICK-UP F
3: STACK F I
4: PICK-UP I
5: STACK I G
6: PICK-UP G
7: STACK G D
8: PICK-UP D
9: STACK D B
10: PICK-UP B
11: STACK B H
12: PICK-UP H
13: STACK H L
Iter: 1
Output: 0: PICK-UP J
1: STACK J F
2: PICK-UP F
3: STACK F I
4: PICK-UP I
5: STACK I G
6: PICK-UP G
7: STACK G D
8: PICK-UP D
9: STACK D B
10: PICK-UP B
11: STACK B H
12: PICK-UP H
13: STACK H L
Iter: 2
Output: 0: PICK-UP J
1: STACK J F
2: PICK-UP F
3: STACK F I
4: PICK-UP I
5: STACK I G
6: PICK-UP G
7: STACK G D
8: PICK-UP D
9: STACK D B
10: PICK-UP B
11: STACK B H
12: PICK-UP H
13: STACK H L
Iter: 3
Output: uns
Iter: 4
Output: 0: PICK-UP J
1: STACK J I
2: PICK-UP I
3: STACK I F
4: PICK-UP F
5: STACK F G
6: PICK-UP G
7: STACK G D
8: PICK-UP D
9: STACK D B
10: PICK-UP B
11: STACK B H
12: PICK-UP H
13: STACK H L
Iter: 5
Output: uns
Iter: 6


KeyboardInterrupt: 

Strangely, after multiple iterations (6) with what I think is quite obvious prompting telling the LM to look at the action definitions, it still gets it wrong, even though it somehow got the remaining 28 right in one try. In fact, re-running *without* any feedback at all resolves the issue in a single iteration! So, at least for the *stateless* case, it seems that relying on model temperature and keeping the prompt simple works better (especially given the high pass rate on these relatively simple blocksworld tasks) than iterating with the previous iteration's PDDL files plus feedback.