In [None]:
import os
import json


def parse_entailmentbank_entry(entry):
    """Flatten one raw record into the fields used by the rest of the notebook.

    Args:
        entry: A decoded JSON object. It must provide "hypothesis" and a
            "meta" mapping containing "triples"; "question", "proof" and
            meta["intermediate_conclusions"] are optional.

    Returns:
        A dict with the keys "input", "facts", "intermediate_conclusions",
        "proof_steps" and "final_conclusion".

    Raises:
        KeyError: If "meta", meta["triples"] or "hypothesis" is missing.
    """
    # Prefer the top-level question; fall back to the copy stored in the
    # metadata only when the top-level value is missing or empty.
    question = entry.get("question")
    if not question:
        question = entry["meta"].get("question_text")

    meta = entry["meta"]
    facts = meta["triples"]
    intermediates = meta.get("intermediate_conclusions", {})
    proof = entry.get("proof", "")
    hypothesis = entry["hypothesis"]

    return dict(
        input=question,
        facts=facts,
        intermediate_conclusions=intermediates,
        proof_steps=proof,
        final_conclusion=hypothesis,
    )


def load_all_jsonl_in_folder(folder_path):
    """Load and parse every record from all ``.jsonl`` files in a folder.

    Files are visited in sorted filename order so the resulting list is the
    same on every run and every platform. Blank lines inside a file are
    skipped rather than passed to the JSON decoder.

    Args:
        folder_path: Directory to scan (not recursive). Only entries whose
            name ends with ".jsonl" are read.

    Returns:
        A list with one parsed entry (see ``parse_entailmentbank_entry``) per
        non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    parsed_entries = []

    # os.listdir returns names in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".jsonl"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. extra newlines at the end of the
                # file) instead of crashing inside json.loads.
                if not line.strip():
                    continue
                raw = json.loads(line)
                parsed_entries.append(parse_entailmentbank_entry(raw))

    return parsed_entries


# Example usage: parse every .jsonl file found in the "EntailmentDataset"
# folder and pretty-print the first parsed record as a quick sanity check.
# NOTE: indexing [0] raises IndexError if no records were loaded.
parsed_entries = load_all_jsonl_in_folder("EntailmentDataset")
print(json.dumps(parsed_entries[0], indent=2))

In [None]:
import re


def build_rilith_text(entry):
    """Render an entry's proof as a tagged, newline-joined reasoning trace.

    The proof string is split on ";" into steps shaped like
    "sent1 & sent3 -> int1: ..." or "int1 & sent2 -> hypothesis". For each
    step, every premise that has not been emitted yet becomes a ``<|fact|>``
    line (names starting with "sent" that exist in ``facts``) or an
    ``<|intermediate|>`` line (names starting with "int" that exist in
    ``intermediate_conclusions``). The step's target then adds an
    ``<|intermediate|>`` line, or the ``<|conclusion|>`` line when the target
    is "hypothesis". Each name is emitted at most once; steps that do not
    match the pattern and names that cannot be resolved are skipped silently.

    Args:
        entry: Mapping with "proof_steps", "facts" and "final_conclusion"
            keys, plus an optional "intermediate_conclusions" key.

    Returns:
        The trace as a single string, wrapped in ``<|start_of_reasoning|>``
        and ``<|end_of_reasoning|>`` lines.
    """
    proof_text = entry["proof_steps"]
    fact_by_id = entry["facts"]
    intermediate_by_id = entry.get("intermediate_conclusions", {})
    hypothesis_text = entry["final_conclusion"]

    emitted = set()
    trace = ["<|start_of_reasoning|>"]

    def emit(key, rendered_line):
        # Record the line and remember the key so it is never written twice.
        trace.append(rendered_line)
        emitted.add(key)

    for fragment in proof_text.split(";"):
        step = fragment.strip()
        if not step:
            continue

        # "<premises> -> <target>"; anything after the target (such as the
        # ": text" that follows an intermediate id) is ignored.
        parsed = re.match(r"(.+?)\s*->\s*(\w+)", step)
        if parsed is None:
            continue
        premises, target = parsed.group(1), parsed.group(2)

        # Left-hand side: premises joined by "&".
        for name in (part.strip() for part in premises.split("&")):
            if name in emitted:
                continue
            if name.startswith("sent") and name in fact_by_id:
                emit(name, f"<|fact|> {fact_by_id[name]}")
            elif name.startswith("int") and name in intermediate_by_id:
                emit(name, f"<|intermediate|> {intermediate_by_id[name]}")

        # Right-hand side: either the final hypothesis or an intermediate.
        if target == "hypothesis":
            if "conclusion" not in emitted:
                emit("conclusion", f"<|conclusion|> {hypothesis_text}")
        elif (
            target.startswith("int")
            and target in intermediate_by_id
            and target not in emitted
        ):
            emit(target, f"<|intermediate|> {intermediate_by_id[target]}")

    trace.append("<|end_of_reasoning|>")
    return "\n".join(trace)


# Batch helper: apply build_rilith_text to a whole list of parsed entries.
def generate_rilith_batch(entries):
    """Turn parsed entries into chat-style training records.

    Args:
        entries: Iterable of parsed entries, each providing "input" and
            "final_conclusion" plus whatever ``build_rilith_text`` reads.

    Returns:
        A list with one ``{"messages": [...]}`` dict per entry. Each record
        holds three messages in order: the "user" question, the "reasoning"
        trace from ``build_rilith_text``, and the "assistant" conclusion.
    """

    def to_record(item):
        return {
            "messages": [
                {"role": "user", "content": item["input"]},
                {"role": "reasoning", "content": build_rilith_text(item)},
                {"role": "assistant", "content": item["final_conclusion"]},
            ]
        }

    return [to_record(item) for item in entries]


# Convert every parsed entry into a chat-style training record
# (user question / reasoning trace / assistant conclusion).
train_dataset = generate_rilith_batch(parsed_entries)

In [None]:
# Show the first generated record as a sanity check
# (raises IndexError if train_dataset is empty).
print(train_dataset[0])

In [None]:
# Persist the dataset as JSON Lines: one JSON object per line.
# NOTE: the file keeps its ".json" name, but the content is JSONL, not a
# single JSON document, so read it back line by line.
# open() does not create missing directories, so make sure the output folder
# exists before writing (otherwise a fresh checkout fails here).
os.makedirs("RILITHDataset", exist_ok=True)
with open("RILITHDataset/train_dataset.json", "w", encoding="utf-8") as f:
    for entry in train_dataset:
        json.dump(entry, f)
        f.write("\n")