# Tutorial of LangDa

### Schema of Facts and Views

##### 1.1 Define FactSchema

In [1]:
# === Define FactSchema ===
# Usage:
# - FactSchema is the canonical set of fact-layer predicates allowed in your system.
# - Each predicate is defined with a signature: a list of ArgSpec entries.
#
# Design rationale:
# - We separate names and argument signatures so that predicates are stable, hashable, and
#   unambiguous even if names collide across domains (namespace/role).
# - This enables FactView filtering and stable predicate_id references for LLM decoding.
#
# ArgSpec fields:
# - datatype: the data type of the argument, e.g., string/int/float
# - namespace: business domain/module/ontology (optional)
# - role: semantic role of the parameter (optional)
import pprint as pp
from symir.rule_ir import ArgSpec, Fact, Rel, FactLayer

person_name = ArgSpec("Name:string", namespace="person", role="key")
# default arg_name: Param, Param2 when omitted
person_address = ArgSpec("string", namespace="address")
person_age = ArgSpec("int", namespace="person")

city_name = ArgSpec("City:string", namespace="geo", role="name")
company_name = ArgSpec("Company:string", namespace="org", role="key")
company_revenue = ArgSpec("Revenue:float", namespace="org")
country_name = ArgSpec("Country:string", namespace="geo", role="key")

# richer fact to show multi-arity + default arg_name
person_profile = Fact(
    "person_profile",
    [person_name, person_address, person_age],
    description="Person with address and age",
)

person = Fact(
    "person", 
    args=[person_name], 
    description="A person entity",
    merge_policy="keep_all"
)
city = Fact("city", [city_name], description="A city entity")

# use composite key fields for company to show flexibility of referencing non-name fields as keys
company = Fact("company", [company_name, company_revenue, country_name], description="A company entity")
country = Fact("country", [country_name], description="A country entity")

lives = Rel("lives_in", sub=person, obj=city, description="person lives in city")
works = Rel("works_at", sub=person, obj=company, description="person works at company")
located = Rel("located_in", sub=city, obj=country, description="city located in country")
hq = Rel("company_hq", sub=company, obj=city, description="company HQ city")
friend = Rel("are_friends", sub=person, obj=person, description="person knows person")



In [2]:
# relation with properties
employment = Rel(
    "employment",
    sub=person,
    obj=company,
    props=[ArgSpec("since:int", namespace="time",role="key"), ArgSpec("title:string", namespace="org")],
    description="employment relation with attributes",
)
print(employment.key_fields)

None


In [3]:
company.key_fields

['Company', 'Country']

##### 1.2 Registry Mechamism:

In [4]:
registry = FactLayer(
    [person, city, company, country, person_profile, lives, works, located, hq, friend, employment]
)
print("SCHEMA:")
pp.pprint(registry.to_dict())

SCHEMA:
{'predicates': [{'arity': 1,
                 'description': 'A city entity',
                 'key_fields': ['City'],
                 'kind': 'fact',
                 'name': 'city',
                 'schema_id': '76ce9a0f477f12a51b0eafa5246dc3ea5272433cc766695ca7a8b9f85c2a060a',
                 'signature': [{'arg_name': 'City',
                                'datatype': 'string',
                                'namespace': 'geo',
                                'role': 'name'}]},
                {'arity': 3,
                 'description': 'A company entity',
                 'key_fields': ['Company', 'Country'],
                 'kind': 'fact',
                 'name': 'company',
                 'schema_id': '9ee7711705be81cec696a4eff71c3b719af5dfe4eef30f94b603ec3fd99c19c3',
                 'signature': [{'arg_name': 'Company',
                                'datatype': 'string',
                                'namespace': 'org',
                                'rol

In [5]:
# === Test FactSchema APIs ===
all_preds = registry.predicates()
person_schema = registry.get(person.schema_id)
person_schema = registry.fact("person")
lives_schema = registry.rel("lives_in")
person_id = registry.resolve("fact", "person")
lives_id = registry.resolve("rel", "lives_in")
lives_schema = registry.rel_of_ids("lives_in", person.schema_id, city.schema_id)

pp.pprint(lives_schema.to_dict())

{'arity': 2,
 'derived_signature': {'derived': True,
                       'obj_args': [{'arg_name': 'Obj', 'datatype': 'Fact'},
                                    {'arg_name': 'obj_City',
                                     'datatype': 'string',
                                     'namespace': 'geo',
                                     'role': 'obj_key'}],
                       'prop_args': [],
                       'sub_args': [{'arg_name': 'Sub', 'datatype': 'Fact'},
                                    {'arg_name': 'sub_Name',
                                     'datatype': 'string',
                                     'namespace': 'person',
                                     'role': 'sub_key'}]},
 'description': 'person lives in city',
 'endpoints': {'obj_key_fields': ['City'], 'sub_key_fields': ['Name']},
 'kind': 'rel',
 'name': 'lives_in',
 'obj_schema_id': '76ce9a0f477f12a51b0eafa5246dc3ea5272433cc766695ca7a8b9f85c2a060a',
 'props': [],
 'schema_id': '1427b55d4a682e0

##### Serialization/Deserialization and Save/Load

In [6]:
# Serialization test
import json
payload = registry.to_dict()
with open("schema_dump.json", "w") as f:
    json.dump(payload, f, indent=2)

# Load back from dict
loaded = FactLayer.from_dict(payload)
print("LOADED SCHEMA:")
pp.pprint(loaded.to_dict())

LOADED SCHEMA:
{'predicates': [{'arity': 1,
                 'description': 'A city entity',
                 'key_fields': ['City'],
                 'kind': 'fact',
                 'name': 'city',
                 'schema_id': '76ce9a0f477f12a51b0eafa5246dc3ea5272433cc766695ca7a8b9f85c2a060a',
                 'signature': [{'arg_name': 'City',
                                'datatype': 'string',
                                'namespace': 'geo',
                                'role': 'name'}]},
                {'arity': 3,
                 'description': 'A company entity',
                 'key_fields': ['Company', 'Country'],
                 'kind': 'fact',
                 'name': 'company',
                 'schema_id': '9ee7711705be81cec696a4eff71c3b719af5dfe4eef30f94b603ec3fd99c19c3',
                 'signature': [{'arg_name': 'Company',
                                'datatype': 'string',
                                'namespace': 'org',
                             

##### 1.3 Build Views

In [7]:
registry.facts()        # list of fact predicates
registry.rels()         # list of rel predicates
registry.names()        # {"facts": [...], "rels": [...]}

{'facts': ['person', 'city', 'company', 'country', 'person_profile'],
 'rels': ['lives_in',
  'works_at',
  'located_in',
  'company_hq',
  'are_friends',
  'employment']}

In [8]:
# === Build FactView ===
# Usage:
# - FactView is the subset of FactSchema that a rule generator may reference.
# - Use it to constrain LLM outputs or rule construction to known predicates.
#
# Design rationale:
# - This isolates the rule-authoring surface from the full schema (security + clarity).
# - It also makes JSON schema constraints smaller and more stable.

import pprint as pp
from symir.ir.filters import filter_from_dict

load_with_filters = False  # set to True to test Option B below
if not load_with_filters:
    # Option A: directly select schema_ids
    # view = schema.view([person.schema_id, lives.schema_id])
    view = registry.view([pred.schema_id for pred in registry.predicates()])

else:
    # Option B: Filter AST / dict sugar
    # - Filter AST supports combinators (And/Or/Not) and predicate matching.
    # - Dict sugar is a short-hand that compiles into the filter AST.
    filt = filter_from_dict({
    "and": [
        {"predicate": {"name": "person"}},
        {"predicate": {"name": "lives_in"}},
    ]
    })
    view = registry.view_from_filter(filt)

# print("\nVIEW PREDICATES:")
view_preds = view.predicates()
# pp.pprint(view_preds)

# Test that the view allows the correct predicates
view.allows(person.schema_id)

True

In [9]:
registry.describe(person.schema_id)

{'schema_id': '947dcb81ad1ff55ae09ddff33b1a4cd7f259db9fd3a8b60f1be1186ee6ff2504',
 'kind': 'fact',
 'name': 'person',
 'arity': 1,
 'key_fields': ['Name'],
 'signature': [{'datatype': 'string',
   'role': 'key',
   'namespace': 'person',
   'arg_name': 'Name'}]}

##### 1.3 Use of Filters on view

In [10]:
import json
# === 2.1) Filter dict usage examples (all supported forms) ===
# Usage:
# - These dicts are parsed by filter_from_dict into an AST (And/Or/Not/PredMatch).
# - Use them with schema.view_from_filter(...) to build a FactView.
#
# Design rationale:
# - Dict sugar keeps user input minimal, while AST nodes enforce structure.

filter_examples = [
    {"name": "lives_in"},
    {"arity": 2},
    {"name": "lives_in", "arity": 2, "datatype": "string"},
    {"role": "subject"},
    {"namespace": "geo"},

    {"match": {"name": "lives_in"}},
    {"match": {"name": "lives_in", "arity": 2}},
    {"match": {"datatype": "string", "namespace": "geo"}},
    
    {"and": [{"match": {"name": "lives_in"}}, {"match": {"arity": 2}}]},
    {"or": [{"name": "person"}, {"name": "city"}]},
    {"not": {"match": {"namespace": "geo"}}},
    {
        "and": [
            {"match": {"arity": 2}},
            {"not": {"match": {"name": "lives_in"}}},
            {"or": [{"match": {"namespace": "geo"}}, {"match": {"namespace": "org"}}]},
        ]
    },
]

print("\nFILTER EXAMPLES:")
for idx, example in enumerate(filter_examples, start=1):
    filt = filter_from_dict(example)
    view_from_filter = registry.view_from_filter(filt)
    print(f"\nFilter {idx}: {json.dumps(example, ensure_ascii=False)}")
    print([pred.name for pred in view_from_filter.predicates()])


FILTER EXAMPLES:

Filter 1: {"name": "lives_in"}
['lives_in']

Filter 2: {"arity": 2}
['located_in', 'are_friends', 'lives_in']

Filter 3: {"name": "lives_in", "arity": 2, "datatype": "string"}
['lives_in']

Filter 4: {"role": "subject"}
[]

Filter 5: {"namespace": "geo"}
['located_in', 'country', 'city']

Filter 6: {"match": {"name": "lives_in"}}
['lives_in']

Filter 7: {"match": {"name": "lives_in", "arity": 2}}
['lives_in']

Filter 8: {"match": {"datatype": "string", "namespace": "geo"}}
['located_in', 'country', 'city']

Filter 9: {"and": [{"match": {"name": "lives_in"}}, {"match": {"arity": 2}}]}
['lives_in']

Filter 10: {"or": [{"name": "person"}, {"name": "city"}]}
['city', 'person']

Filter 11: {"not": {"match": {"namespace": "geo"}}}
['company_hq', 'employment', 'are_friends', 'person', 'company', 'person_profile', 'works_at', 'lives_in']

Filter 12: {"and": [{"match": {"arity": 2}}, {"not": {"match": {"name": "lives_in"}}}, {"or": [{"match": {"namespace": "geo"}}, {"match": 

##### Use cache mechanism

In [11]:
from symir.ir.fact_schema import cache_predicate_schema

cache_predicate_schema(friend)

from symir.ir.fact_schema import load_predicate_schemas_from_cache
# This should load the 'friend' predicate schema we just cached, and print it out.
loaded_schemas = load_predicate_schemas_from_cache()
print("\nLOADED SCHEMAS:")
pp.pprint([s.to_dict() for s in loaded_schemas],sort_dicts=False)

# Cleanup cache after test
from diskcache import Cache
from symir.ir.fact_schema import _predicate_schema_cache_dir

cache = Cache(str(_predicate_schema_cache_dir()))
cache.clear()
cache.close()



LOADED SCHEMAS:
[{'name': 'works_at',
  'arity': 5,
  'schema_id': 'a5f8858737e6269dd45bd5226251545913b643d22cdaa6df5babccdf8aaedae8',
  'description': None,
  'kind': 'rel',
  'sub_schema_id': '947dcb81ad1ff55ae09ddff33b1a4cd7f259db9fd3a8b60f1be1186ee6ff2504',
  'obj_schema_id': '9ee7711705be81cec696a4eff71c3b719af5dfe4eef30f94b603ec3fd99c19c3',
  'endpoints': {'sub_key_fields': ['Name'],
                'obj_key_fields': ['Company', 'Country']},
  'props': [{'datatype': 'int',
             'role': None,
             'namespace': None,
             'arg_name': 'since'},
            {'datatype': 'string',
             'role': None,
             'namespace': None,
             'arg_name': 'title'}],
  'derived_signature': {'derived': True,
                        'sub_args': [{'arg_name': 'Sub', 'datatype': 'Fact'},
                                     {'datatype': 'string',
                                      'role': 'sub_key',
                                      'namespace': 'per

### Define Fact Instances

##### 2.1 Define facts directly

In [12]:
# === 3) Manual facts (FactInstance) ===
# Usage:
# - FactInstance is a single fact record with a predicate_id and constant terms.
# - You can inject facts from any source, not just CSV.
#
# Design rationale:
# - Data access is abstracted by DataProvider, but manual facts are still a first-class path.
# - This is useful for programmatic data or unit tests.
import pprint as pp

from symir.ir.instance import Instance
from symir.probability import ProbabilityConfig, resolve_probability

# list/tuple: order matches Fact.signature
alice = Instance(schema=person, terms=["alice"], prob=0.9)
# dict: keys must match Fact.signature
bob = Instance(schema=person, terms={"Name": "bob"})
openai = Instance(schema=company, terms={"Company": "openai", "Revenue": 1e9, "Country": "USA"})

meta = {
    "source": "csv",
    "observed_at": "2026-02-10T12:00:00Z",
    "ingested_at": "2026-02-10T12:03:00Z",
    "confidence": 0.82,
    "status": "asserted",
    "evidence_id": "row:123",
    "trace_id": "import_20260210_001",
    "provenance": {"file": "people.csv", "row": 123},
    "tags": ["imported", "manual_ok"],
}
works_at_fact = Instance(
    schema=works,
    terms={
        "sub_key": {"Name": "bob"},
        "obj_key": {"Company": "openai", "Country":"USA"},
    },
    meta=meta
)


In [13]:
alice.to_dict()

{'schema_id': '947dcb81ad1ff55ae09ddff33b1a4cd7f259db9fd3a8b60f1be1186ee6ff2504',
 'kind': 'fact',
 'props': {'Name': 'alice'},
 'prob': 0.9,
 'meta': {},
 'entity_id': '7dd896b2bbcf3636ad19456db864df6a6de067fdd7f8abe4c9ea8a7604b86010',
 'record_id': '4336077a5bba051661612d17c257dae23ae1deee2183ad3108643883cf6d093f'}

In [14]:
works_at = Rel(
    "works_at",
    sub=person,
    obj=company,
    props=[
        ArgSpec("since:int"),
        ArgSpec("title:string"),
    ],
)

In [15]:
Instance(
    schema=works_at,
    terms=[bob, openai, 2020, "researcher"],
).to_dict()

{'schema_id': 'a5f8858737e6269dd45bd5226251545913b643d22cdaa6df5babccdf8aaedae8',
 'kind': 'rel',
 'props': {'since': 2020, 'title': 'researcher'},
 'prob': None,
 'meta': {},
 'sub_entity_id': '66762e5b241bb085374c975f01e29918288d72e4b71cbc0915071b7b9242eb01',
 'obj_entity_id': '092f311f8ee8b73b3658607d8bc0ce6d2e2696107eb5f24b9e1c23ea19856c06'}

In [16]:
Instance(
    schema=works_at,
    terms=[bob, openai, 2020, "researcher"],
).to_dict()

{'schema_id': 'a5f8858737e6269dd45bd5226251545913b643d22cdaa6df5babccdf8aaedae8',
 'kind': 'rel',
 'props': {'since': 2020, 'title': 'researcher'},
 'prob': None,
 'meta': {},
 'sub_entity_id': '66762e5b241bb085374c975f01e29918288d72e4b71cbc0915071b7b9242eb01',
 'obj_entity_id': '092f311f8ee8b73b3658607d8bc0ce6d2e2696107eb5f24b9e1c23ea19856c06'}

In [17]:
Instance(
    schema=works_at,
    terms={
        "sub_ref": bob,
        "obj_ref": openai,
        "since": 2020,
        "title": "researcher",
    },
).to_dict()

{'schema_id': 'a5f8858737e6269dd45bd5226251545913b643d22cdaa6df5babccdf8aaedae8',
 'kind': 'rel',
 'props': {'since': 2020, 'title': 'researcher'},
 'prob': None,
 'meta': {},
 'sub_entity_id': '66762e5b241bb085374c975f01e29918288d72e4b71cbc0915071b7b9242eb01',
 'obj_entity_id': '092f311f8ee8b73b3658607d8bc0ce6d2e2696107eb5f24b9e1c23ea19856c06'}

In [18]:
Instance(
    schema=works_at,
    terms={
        "sub_key": {"Name": "bob"},
        "obj_key": {"Company": "openai", "Country":"USA"},
        "since": 2020,
        "title": "researcher",
    },
).to_dict()

{'schema_id': 'a5f8858737e6269dd45bd5226251545913b643d22cdaa6df5babccdf8aaedae8',
 'kind': 'rel',
 'props': {'since': 2020, 'title': 'researcher'},
 'prob': None,
 'meta': {},
 'sub_entity_id': '66762e5b241bb085374c975f01e29918288d72e4b71cbc0915071b7b9242eb01',
 'obj_entity_id': '092f311f8ee8b73b3658607d8bc0ce6d2e2696107eb5f24b9e1c23ea19856c06'}

In [19]:
Instance(
    schema=works_at,
    terms={
        "sub_ref": bob,
        "obj_ref": openai,
        "props": {"since": 2020, "title": "researcher"},
    },
).to_dict()

{'schema_id': 'a5f8858737e6269dd45bd5226251545913b643d22cdaa6df5babccdf8aaedae8',
 'kind': 'rel',
 'props': {'since': 2020, 'title': 'researcher'},
 'prob': None,
 'meta': {},
 'sub_entity_id': '66762e5b241bb085374c975f01e29918288d72e4b71cbc0915071b7b9242eb01',
 'obj_entity_id': '092f311f8ee8b73b3658607d8bc0ce6d2e2696107eb5f24b9e1c23ea19856c06'}

In [20]:
Instance(
    schema=works_at,
    terms={
        "sub_key": {"Name": "bob"},
        "obj_key": {"Company": "openai", "Country":"USA"},
        "props": {"since": 2020, "title": "researcher"},
    },
).to_dict(include_keys=True)


{'schema_id': 'a5f8858737e6269dd45bd5226251545913b643d22cdaa6df5babccdf8aaedae8',
 'kind': 'rel',
 'props': {'since': 2020, 'title': 'researcher'},
 'prob': None,
 'meta': {},
 'sub_entity_id': '66762e5b241bb085374c975f01e29918288d72e4b71cbc0915071b7b9242eb01',
 'obj_entity_id': '092f311f8ee8b73b3658607d8bc0ce6d2e2696107eb5f24b9e1c23ea19856c06',
 'sub_key': {'Name': 'bob'},
 'obj_key': {'Company': 'openai', 'Country': 'USA'}}

In [21]:
from pathlib import Path
from symir.fact_store.provider import CSVProvider, CSVSource
from symir.fact_store.rel_builder import RelBuilder

# === 3) Load facts from CSV with CSVProvider and CSVSource ===
# define some manual facts to test with, since we don't have actual CSV files in this environment
facts = [
    Instance(schema=person, terms=["alice"], prob=0.9),
    Instance(schema=company, terms=["openai", 1e9, "USA"]),
]

# load facts from CSV files using CSVProvider and CSVSource
sources=[
        CSVSource(predicate_id=person.schema_id, file="people_rich.csv", columns=["name"],prob_column="prob"),
        CSVSource(predicate_id=city.schema_id, file="cities_rich.csv", columns=["Company","Revenue","Country"], prob_column="prob"),
        CSVSource(predicate_id=company.schema_id, file="companies.csv", columns=["name"], prob_column=None),
        CSVSource(predicate_id=country.schema_id, file="countries.csv", columns=["name"]),
        CSVSource(predicate_id=lives.schema_id, file="lives_in_rich.csv", columns=["person", "city"]),
        CSVSource(predicate_id=works.schema_id, file="works_at.csv", columns=["person", "company"]),
        CSVSource(predicate_id=located.schema_id, file="located_in.csv", columns=["city", "country"]),
        CSVSource(predicate_id=hq.schema_id, file="company_hq.csv", columns=["company", "city"]),
        CSVSource(predicate_id=friend.schema_id, file="friends.csv", columns=["person_a", "person_b"]),
]

provider = CSVProvider(
    schema=registry,
    base_path=Path("samples"),
    sources=sources,
    datatype_cast="coerce",
)
view_fact = registry.view([person.schema_id, company.schema_id])

facts = provider.query(view_fact)


# Define some manual relation facts to test with, since we don't have actual CSV files in this environment
rels = [
    Instance(schema=works_at, terms=[alice, openai, 2020, "researcher"]),
]

# Load relation facts from CSV files using CSVProvider and CSVSource, and RelBuilder
rel_source = CSVSource(
    predicate_id=works.schema_id,
    file="works_at.csv",
    columns=["person", "company", "since"],
    prob_column="prob",
)

rel_provider = CSVProvider(
    schema=registry,
    base_path=Path("samples"),
    sources=[rel_source],
    datatype_cast="coerce",
)

builder = RelBuilder(
    rel=works,
    match_keys={
        "sub": {"Name": "person"},
        "obj": {"Company": "Company"},
    },
    key_mode="partial",     # 要求 key_fields 全部匹配
    multi="cartesian",
)


rels = provider.build_relations(
    builder=builder,
    facts=facts,
    source=rel_source,
)


ProviderError: CSV column mapping arity mismatch for company: expected 3

In [None]:
facts

[Instance(schema_id='947dcb81ad1ff55ae09ddff33b1a4cd7f259db9fd3a8b60f1be1186ee6ff2504', kind='fact', props={'Name': 'alice'}, prob=1.0, meta={}, entity_id='7dd896b2bbcf3636ad19456db864df6a6de067fdd7f8abe4c9ea8a7604b86010', sub_entity_id=None, obj_entity_id=None, record_id='4336077a5bba051661612d17c257dae23ae1deee2183ad3108643883cf6d093f', _sub_key_props=None, _obj_key_props=None),
 Instance(schema_id='947dcb81ad1ff55ae09ddff33b1a4cd7f259db9fd3a8b60f1be1186ee6ff2504', kind='fact', props={'Name': 'bob'}, prob=1.0, meta={}, entity_id='66762e5b241bb085374c975f01e29918288d72e4b71cbc0915071b7b9242eb01', sub_entity_id=None, obj_entity_id=None, record_id='9c1c825658817e7e8232851573f55a2ade946fdb6dda9ee7e7fbdbf365771740', _sub_key_props=None, _obj_key_props=None),
 Instance(schema_id='947dcb81ad1ff55ae09ddff33b1a4cd7f259db9fd3a8b60f1be1186ee6ff2504', kind='fact', props={'Name': 'carol'}, prob=1.0, meta={}, entity_id='0fe8575a24819e53086a8cf80b4c07dc9aa11fe22058ce86bcc2169a6b04475d', sub_entity_

##### 2.2 Load facts from csv files

In [None]:
# === 4) CSVProvider facts ===
# Usage:
# - CSVProvider is a DataProvider implementation that loads facts from CSV.
# - Mapping from CSV columns to predicate arguments is explicit (CSVSource).
#
# Design rationale:
# - DataProvider abstracts access so we can add DBProvider later with the same API.
# - FactView filtering is handled by the provider, not hard-coded into CSV parsing.
from pathlib import Path

from symir.fact_store.provider import CSVProvider, CSVSource
from symir.probability import ProbabilityConfig
registry = FactLayer(
    [person, city, company, country, person_profile, lives, works, located,],
)
view = registry.view([pred.schema_id for pred in registry.predicates()])
sources=[
        CSVSource(predicate_id=person.schema_id, file="people_rich.csv", columns=["name"],prob_column="prob"),
        CSVSource(predicate_id=city.schema_id, file="cities_rich.csv", columns=["name"], prob_column="prob"),
        CSVSource(predicate_id=company.schema_id, file="companies.csv", columns=["name"], prob_column=None),
        CSVSource(predicate_id=country.schema_id, file="countries.csv", columns=["name"]),
        CSVSource(predicate_id=lives.schema_id, file="lives_in_rich.csv", columns=["person", "city"]),
        CSVSource(predicate_id=works.schema_id, file="works_at.csv", columns=["person", "company"]),
        CSVSource(predicate_id=located.schema_id, file="located_in.csv", columns=["city", "country"]),
        CSVSource(predicate_id=hq.schema_id, file="company_hq.csv", columns=["company", "city"]),
        CSVSource(predicate_id=friend.schema_id, file="friends.csv", columns=["person_a", "person_b"]),
]

provider = CSVProvider(
    schema=registry,
    base_path=Path("samples"),
    sources=sources,
    prob_config=ProbabilityConfig(default_fact_prob=1.0, missing_prob_policy="inject_default"),
    datatype_cast="coerce"
)
print("\nCSVProvider VIEW PREDICATES:")
pp.pprint(provider.sources)


CSVProvider VIEW PREDICATES:
{'0d7f321526421614cf75eb9cd18a2a7abfa8d1ca9cd3d19d327221903921fa8f': CSVSource(predicate_id='0d7f321526421614cf75eb9cd18a2a7abfa8d1ca9cd3d19d327221903921fa8f',
                                                                               file='countries.csv',
                                                                               columns=['name'],
                                                                               prob_column=None),
 '11d1c1e1efdd2cc30ec433b92b4d11dbb8f58df72725dab8da91f00f687f085e': CSVSource(predicate_id='11d1c1e1efdd2cc30ec433b92b4d11dbb8f58df72725dab8da91f00f687f085e',
                                                                               file='company_hq.csv',
                                                                               columns=['company',
                                                                                        'city'],
                                                        

In [None]:
facts_from_csv = provider.query(view)
print("\nFACTS (csv, first 3):")
pp.pprint(facts_from_csv[:3])

ProviderError: CSV column mapping arity mismatch for works_at: expected 3

### Define Rule Instances

##### 3.1 Define rules directly

In [23]:
from symir.rule_ir import (
    ArgSpec, Fact, Rel, Cond, Rule,
    Var, Ref, Unify, Const
)

person = Fact("person", [ArgSpec("Name:string"), ArgSpec("Addr:string"), ArgSpec("Age:int")])
company = Fact("company", [ArgSpec("Company:string")])

employment = Rel(
    "employment",
    sub=person,
    obj=company,
    props=[ArgSpec("Since:int"), ArgSpec("Title:string")],
)

# conditions 只存用户写的东西
cond = Cond(literals=[
    Ref(schema=person, terms=[Var("SubName"), Var("SubAddr"), Var("SubAge")]),
    Ref(schema=company, terms=[Var("ObjName")]),
    Unify(Var("Sub"), Ref(schema=person, terms=[Var("SubName"), Var("SubAddr"), Const(23)])),
    Unify(Var("Obj"), Ref(schema=company, terms=[Var("ObjName")])),
])

rule = Rule(predicate=employment, conditions=[cond])
payload = rule.to_dict()


In [24]:
import pprint as pp
print("\nRULE DICT:")
pp.pprint(payload)


RULE DICT:
{'arity': 4,
 'conditions': [{'literals': [{'kind': 'ref',
                               'negated': False,
                               'schema_id': '885a30de415408ad8e6c4e91e072f83fca34ad1c1e29b7c729d57e92e26a6e85',
                               'terms': [{'datatype': None,
                                          'kind': 'var',
                                          'name': 'SubName'},
                                         {'datatype': None,
                                          'kind': 'var',
                                          'name': 'SubAddr'},
                                         {'datatype': None,
                                          'kind': 'var',
                                          'name': 'SubAge'}]},
                              {'kind': 'ref',
                               'negated': False,
                               'schema_id': '81da3f15ef119e821f2750a6de1cf20f8c556f1eddad290bfee376d8d278d0d5',
                       

In [25]:
from symir.rule_ir import If, Call, Expr, NotExpr, Ref
i = Expr(
    If(
        cond=Call("gt", [Var("Age"), Const(18)]),
        then=Const(True),
        else_=Const(False),
    )
)
n = Expr(
    NotExpr(Ref(schema=person, terms=[Var("Name"), Const("USA"),Const(30)])),
)

Expr(Call("gt", [Var("Age"), Const(18)]))

# 或者直接用 Ref(negated=True)
neg_ref = Ref(schema=person, terms=[Var("Name"), Const("DE"),Const(22)], negated=True)


In [29]:
# === Construct rules (Head var-only) ===
# Usage:
# - HeadSchema defines the head predicate + variables (var-only).
# - Body is a list of literals with clause-level probability.
#
# Design rationale:
# - "head var-only" prevents constants in the head; constants must appear in bodies.
# - Multiple bodies represent multiple rule branches (Head :- Body_i).
import pprint as pp

from symir.ir.fact_schema import ArgSpec, PredicateSchema
from symir.ir.expr_ir import Var, Const, Call, Unify, If, NotExpr
from symir.ir.rule_schema import Ref, Expr, Body, Rule

relocation_candidate = PredicateSchema("relocation_candidate", 2, [ArgSpec("string"), ArgSpec("string")])

# Body1: WorksAt(X, C) and CompanyHQ(C, Y)
# - Reuse head vars in body to ensure logical linkage.
body1 = Body(
    literals=[
        Ref(schema=works.schema_id, terms=[Var("X", "string"), Var("C", "string")]),
        Ref(schema=hq.schema_id, terms=[Var("C", "string"), Var("Y", "string")]),
    ],
    prob=0.7,
)

# Body2: Social + location signals and an if-then-else expression
# - Expr holds structured ExprIR (no raw strings).
expr = If(
    cond=Call("eq", [Var("Y", "string"), Const("seattle", "string")]),
    then=Unify(Var("Flag", "bool"), Const(True, "bool")),
    else_=Unify(Var("Flag", "bool"), Const(False, "bool")),
)
body2 = Body(
    literals=[
        Ref(schema=lives.schema_id, terms=[Var("X", "string"), Var("Y", "string")]),
        Ref(schema=friend.schema_id, terms=[Var("X", "string"), Var("F", "string")]),
        Ref(schema=lives.schema_id, terms=[Var("F", "string"), Var("Y", "string")]),
        Ref(schema=located.schema_id, terms=[Var("Y", "string"), Var("Country", "string")]),
        Expr(expr=expr),
    ],
    prob=None,  # will use default rule prob during render
)

rule_from_user = Rule(relocation_candidate, conditions=[body1, body2])

print("\nRULE (user):")
pp.pprint(rule_from_user)


RULE (user):
Rule(predicate=PredicateSchema(name='relocation_candidate',
                               arity=2,
                               signature=[ArgSpec(datatype='string',
                                                  role=None,
                                                  namespace=None,
                                                  name='Param'),
                                          ArgSpec(datatype='string',
                                                  role=None,
                                                  namespace=None,
                                                  name='Param2')],
                               description=None,
                               kind='fact',
                               sub_schema_id=None,
                               obj_schema_id=None,
                               props=None,
                               key_fields=['Param'],
                               endpoints=None,
                        

3.1.? Special: Render rule as code

In [None]:
from symir.probability import ProbabilityConfig
from symir.mappers.renderers import (
    ProbLogRenderer,
    RenderContext,
)
render_cfg = ProbabilityConfig(default_rule_prob=1.0, missing_prob_policy="inject_default")
ctx = RenderContext(schema=schema, prob_config=render_cfg)

problog_text = ProbLogRenderer().render_program([], [rule_from_user], ctx, queries=[])
print("\nPROBLOG (facts + rules):")
print(problog_text)



PROBLOG (facts + rules):
0.7::relocation_candidate(X, Y) :- works_at(X, C), company_hq(C, Y).
1.0::relocation_candidate(X, Y) :- lives_in(X, Y), are_friends(X, F), lives_in(F, Y), located_in(Y, Country), ((Y = seattle, Flag = true) ; (\+ (Y = seattle), Flag = false)).


##### 3.2 Literal syntax

In [None]:
# === Literal syntax examples (all forms) ===
# Usage:
# - Demonstrate all literal/expr forms: RefLiteral, negated RefLiteral,
#   Unify, Call, If, NotExpr.
#
# Design rationale:
# - Keep this section lightweight by showing literal instances only.
# - These literals can be combined into bodies to form full rules.
print("\nLITERAL SYNTAX EXAMPLES (all forms):")

literals = [
    # RefLiteral (positive)
    RefLiteral(schema=lives.schema_id, terms=[Var("X"), Var("Y")]),
    # RefLiteral (negated)
    RefLiteral(schema=lives.schema_id, terms=[Var("X"), Var("Y")], negated=True),
    # Expr: Unify (equality/assignment)
    Expr(expr=Unify(Var("Y"), Const("seattle", "string"))),
    # Expr: Call (comparison)
    Expr(expr=Call("eq", [Var("Y"), Const("seattle", "string")])),
    # Expr: Call (arithmetic) + Unify
    Expr(expr=Unify(Var("Y", "int"), Call("add", [Var("X", "int"), Const(1, "int")]))),
    # Expr: If (if-then-else)
    Expr(
        expr=If(
            cond=Call("gt", [Var("X", "int"), Const(0, "int")]),
            then=Unify(Var("Y", "int"), Call("add", [Var("X", "int"), Const(1, "int")])),
            else_=Unify(Var("Y", "int"), Const(0, "int")),
        )
    ),
    # Expr: NotExpr (expression-level negation)
    Expr(expr=NotExpr(Call("eq", [Var("Y"), Const("seattle", "string")]))),
]

for idx, literal in enumerate(literals, start=1):
    print(f"\nLiteral Example {idx}:")
    pp.pprint(literal)


LITERAL SYNTAX EXAMPLES (all forms):

Literal Example 1:
Ref(schema_id='b6f6ac3180a13b0934fb7688c195bb4f63b31f70f6a0e3d0a42b7fdf17ba7261',
           terms=[Var(name='X', datatype=None), Var(name='Y', datatype=None)],
           negated=False)

Literal Example 2:
Ref(schema_id='b6f6ac3180a13b0934fb7688c195bb4f63b31f70f6a0e3d0a42b7fdf17ba7261',
           terms=[Var(name='X', datatype=None), Var(name='Y', datatype=None)],
           negated=True)

Literal Example 3:
Expr(expr=Unify(lhs=Var(name='Y', datatype=None),
                       rhs=Const(value='seattle', datatype='string')))

Literal Example 4:
Expr(expr=Call(op='eq',
                      args=[Var(name='Y', datatype=None),
                            Const(value='seattle', datatype='string')]))

Literal Example 5:
Expr(expr=Unify(lhs=Var(name='Y', datatype='int'),
                       rhs=Call(op='add',
                                args=[Var(name='X', datatype='int'),
                                      Const(value=1

##### Libraries

In [None]:
from symir.rules.library import Library, LibrarySpec
from symir.rules.library_runtime import LibraryRuntime

# === Libraries and runtime libraries ===
# Usage:
# - Library holds special predicates that can be referenced in rules but are not part of the FactSchema.
# - Runtime library provides implementations for these predicates during rendering.
# Design rationale:
# - This allows us to reference external predicates (e.g., "member") without bloating the FactSchema.

lib = Library()
lib.register(
    LibrarySpec(
        name="member",
        arity=2,
        kind="predicate",
        description="Check if an element is a member of a list",
        signature=["term", "list"],
    )
)
lib.register(
    LibrarySpec(
        name="is_even",
        arity=1,
        kind="expr",
        description="Check if a number is even",
        signature=["term"],
    )
)

runtime = LibraryRuntime(lib)
runtime.register(
    name="member",
    arity=2,
    kind="predicate",
    backend="problog",
    handler=lambda args: f"member({args[0]}, {args[1]})",
)

### Langda Generation

##### Schemas for Constrained Decoding

In [None]:
# === Constraint schemas (LLM decoding) ===
# Usage:
# - Library holds predicate-like or expr-like specs (metadata).
# - Constraint schemas are generated to restrict LLM output.
#
# Design rationale:
# - LibrarySpec is serializable metadata for prompt + schema use.
# - Runtime implementations are attached later in LibraryRuntime for rendering.
import pprint as pp

from symir.rules.constraint_schemas import (
    build_pydantic_rule_model,
    build_responses_schema,
    build_predicate_catalog,
)

json_schema = build_responses_schema(view, lib, mode="compact")
pydantic_model = build_pydantic_rule_model(view, lib, mode="compact")
catalog = build_predicate_catalog(view, lib)  # prompt-only catalog

print("\n### JSON SCHEMA (responses):")
pp.pprint(json_schema)
print("\n")
print("\n### PYDANTIC MODEL:")
pp.pprint(pydantic_model.schema())
print("\n")
print("\n### PREDICATE CATALOG (prompt-only):")
pp.pprint(catalog)



### JSON SCHEMA (responses):
{'$defs': {'call': {'additionalProperties': False,
                    'properties': {'args': {'items': {'$ref': '#/$defs/expr'},
                                            'type': 'array'},
                                   'kind': {'const': 'call', 'type': 'string'},
                                   'op': {'type': 'string'}},
                    'required': ['args', 'kind', 'op'],
                    'type': 'object'},
           'const': {'additionalProperties': False,
                     'properties': {'datatype': {'type': 'string'},
                                    'kind': {'const': 'const',
                                             'type': 'string'},
                                    'value': {'type': ['string',
                                                       'number',
                                                       'boolean']}},
                     'required': ['datatype', 'kind', 'value'],
                     'type': 'o

/var/folders/05/6btr2vg13b9gvgs3gxt8fw_40000gn/T/ipykernel_92726/3187763343.py:25: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  pp.pprint(pydantic_model.schema())


##### USER INPUT INSTRUCTION

In [None]:
head_pred_for_agent = PredicateSchema("relocation_candidate", 2, [ArgSpec("string"), ArgSpec("string")])
head_for_agent = HeadSchema(predicate=head_pred_for_agent, terms=[Var("X"), Var("Y")])

instructions = (
    "Decide whether a person should be considered a relocation candidate for a city. "
    "Use real-world signals such as where they work, where the company's headquarters are, "
    "where they currently live, and whether they have close connections in the target city. "
    "Consider that cities in the same country may be more plausible. "
    "Provide multiple plausible cases with probabilities."
)

##### Prompts

In [None]:
# === LLM prompt example (head is fixed externally) ===
# Usage:
# - The LLM only outputs bodies. The head is fixed outside decoding.
# - Use catalog for human/LLM prompt, schema for strict decoding.
#
# Design rationale:
# - Splitting head/bodies avoids complex dependency analysis in generation.
# - Compact mode keeps schema small and stable.
import json

from openai import OpenAI

head_vars = ", ".join([t.name for t in head_for_agent.terms])
head_params = ", ".join([a.datatype for a in head_for_agent.predicate.signature])

system_prompt = """
You output ONLY JSON that conforms to the provided JSON Schema (no extra text).
Context:
- The predicate head is fixed externally and must NOT appear in the output.
- You are given a fact-layer schema and nodes/relationships as formatted data.
- You are also given a predicate registry. You MUST reference only existing predicates from the registry.

Task:
- Generate one or more cases (bodies) as conditional statements to solve the user task.
- Each case has: probability (0..1) and conditions (structured).
- Reuse the head variable names when possible (to link head and body).
"""

user_prompt = f"""
## Fixed head (do NOT output it):
- Head Name: {head_for_agent.predicate.name}
- Head Arity: {head_for_agent.predicate.arity}
- Head Param Types: {head_params}
- Head Vars: {head_vars}

## Known predicates:
{json.dumps(catalog, ensure_ascii=False, indent=2)}

## Instructions:
{instructions}
"""

print("\nSYSTEM PROMPT:")
print(system_prompt.strip())
print("\nUSER PROMPT:")
print(user_prompt.strip())



SYSTEM PROMPT:
You output ONLY JSON that conforms to the provided JSON Schema (no extra text).
Context:
- The predicate head is fixed externally and must NOT appear in the output.
- You are given a fact-layer schema and nodes/relationships as formatted data.
- You are also given a predicate registry. You MUST reference only existing predicates from the registry.

Task:
- Generate one or more cases (bodies) as conditional statements to solve the user task.
- Each case has: probability (0..1) and conditions (structured).
- Reuse the head variable names when possible (to link head and body).

USER PROMPT:
## Fixed head (do NOT output it):
- Head Name: relocation_candidate
- Head Arity: 2
- Head Param Types: string, string
- Head Vars: X, Y

## Known predicates:
{
  "7040837e0e071ab8de18bfe91abe3a83f492093bd5bf8c1f8ab407f14bf14d27": {
    "name": "works_at",
    "arity": 2,
    "arg_types": [
      "string",
      "string"
    ],
    "description": "Relationship: person works at company"


##### Call Openai Api

In [None]:
client = OpenAI()
resp = client.responses.create(
    model="gpt-4o-2024-08-06",
    instructions=system_prompt,
    input=user_prompt,
    text={
        "format": {
            "type": "json_schema",
            "name": "cases_only_contract",
            "schema": json_schema,
            "strict": True,
        }
    },
)

##### Convert output of llm

In [None]:
# === Convert LLM output -> Rule IR ===
# Usage:
# - Use resp_to_rule to parse Responses API output into Rule IR.
#
# Design rationale:
# - Parsing goes through Pydantic validation to guarantee structure.

import pprint as pp

from symir.examples.parse_llm_response import resp_to_rule

# rule_from_agent = resp_to_rule(
#     _MockResp(), head=head_for_agent, view=view, library=lib, mode="compact"
# )
rule_from_agent = resp_to_rule(
    resp, head=head_for_agent, view=view, library=lib, mode="compact"
)
print("\nRULE (from LLM):")
pp.pprint(rule_from_agent)



RULE (from LLM):
Rule(head=HeadSchema(predicate=PredicateSchema(name='relocation_candidate',
                                               arity=2,
                                               signature=[ArgSpec(datatype='string',
                                                                  role=None,
                                                                  namespace=None,
                                                                  arg_name='Param'),
                                                          ArgSpec(datatype='string',
                                                                  role=None,
                                                                  namespace=None,
                                                                  arg_name='Param2')],
                                               description=None,
                                               kind='fact',
                                               sub_schema_id=None

##### Validate

In [None]:
# === Validate rules ===
# Usage:
# - RuleValidator enforces: FactView references, arity, recursion rules, etc.
#
# Design rationale:
# - LLM output is always validated before execution or rendering.
from symir.rules.validator import RuleValidator

RuleValidator(view, library=lib).validate(rule_from_agent)


### Render as logic code

##### Render as problog code

In [None]:
# === Render ===
# Usage:
# - Renderers turn Rule IR into backend-specific code.
# - LibraryRuntime supplies backend handlers for library predicates/exprs.
#
# Design rationale:
# - Rendering is decoupled from IR so new backends can be plugged in.
from symir.probability import ProbabilityConfig
from symir.rules.library_runtime import LibraryRuntime
from symir.mappers.renderers import (
    ProbLogRenderer,
    PrologRenderer,
    DatalogRenderer,
    CypherRenderer,
    RenderContext,
)
from symir.ir.rule_schema import Query

render_cfg = ProbabilityConfig(default_rule_prob=0.6, missing_prob_policy="inject_default")
ctx = RenderContext(schema=schema, library=lib, library_runtime=runtime, prob_config=render_cfg)

all_facts = facts_from_user + facts_from_csv
all_rules = [rule_from_user, rule_from_agent]
all_queries = [
    Query(predicate=head_pred, terms=[Var("X"), Var("Y")]),
    Query(predicate_id=person.schema_id, terms=[Const("alice", "string")]),
]

problog_text = ProbLogRenderer().render_program(all_facts, all_rules, ctx, queries=all_queries)
print("\nPROBLOG (facts + rules):")
print(problog_text)



PROBLOG (facts + rules):
0.9::person(alice).
1.0::person(bob).
1.0::city(seattle).
1.0::company(openai).
1.0::country(usa).
1.0::works_at(alice, openai).
1.0::works_at(bob, acme_corp).
1.0::works_at(carol, globex).
1.0::works_at(dave, initech).
1.0::works_at(emma, umbrella).
1.0::works_at(frank, stark_industries).
1.0::works_at(grace, wayne_enterprises).
1.0::works_at(heidi, wonka).
1.0::works_at(ivan, tyrell).
1.0::works_at(judy, cyberdyne).
1.0::works_at(mallory, openai).
1.0::works_at(niaj, acme_corp).
1.0::works_at(olivia, globex).
1.0::works_at(peggy, initech).
1.0::works_at(quinn, umbrella).
1.0::works_at(ruth, stark_industries).
1.0::works_at(sybil, wayne_enterprises).
1.0::works_at(trent, wonka).
1.0::works_at(victor, tyrell).
1.0::works_at(walter, cyberdyne).
1.0::works_at(xavier, openai).
1.0::works_at(yvonne, acme_corp).
1.0::works_at(zara, globex).
1.0::works_at(oscar, initech).
1.0::lives_in(alice, seattle).
1.0::lives_in(bob, san_francisco).
1.0::lives_in(carol, new_york

##### execute problog code

In [None]:
from problog.program import PrologString
from problog import get_evaluatable

result = get_evaluatable().create_from(PrologString(problog_text)).evaluate()
print("\nPROBLOG EVALUATION RESULT:")
pp.pprint(result)

UnknownClause: No clauses found for 'member/2' at 159:90.

In [None]:
for r in result:
    print(r.args)

(alice, san_francisco)
(bob, chicago)
(carol, new_york)
(dave, austin)
(emma, boston)
(frank, los_angeles)
(grace, dallas)
(heidi, denver)
(ivan, seattle)
(judy, houston)
(mallory, san_francisco)
(niaj, chicago)
(olivia, new_york)
(peggy, austin)
(quinn, boston)
(ruth, los_angeles)
(sybil, dallas)
(trent, denver)
(victor, seattle)
(walter, houston)
(xavier, san_francisco)
(yvonne, chicago)
(zara, new_york)
(oscar, austin)
(alice, seattle)
(alice, new_york)
(alice, chicago)
(alice, austin)
(alice, boston)
(alice, denver)
(alice, miami)
(alice, portland)
(alice, atlanta)
(alice, los_angeles)
(alice, dallas)
(alice, houston)
(bob, seattle)
(bob, san_francisco)
(bob, new_york)
(bob, austin)
(bob, boston)
(bob, denver)
(bob, miami)
(bob, portland)
(bob, atlanta)
(bob, los_angeles)
(bob, dallas)
(bob, houston)
(carol, seattle)
(carol, san_francisco)
(carol, chicago)
(carol, austin)
(carol, boston)
(carol, denver)
(carol, miami)
(carol, portland)
(carol, atlanta)
(carol, los_angeles)
(carol, 

In [None]:
import json
json.dumps(result, indent=2)

TypeError: keys must be str, int, float, bool or None, not Term

##### Render as other codes...

In [None]:
# Other backends (stubs for now) not implemented yet
from symir.mappers.renderers import RenderError
try:
    _ = PrologRenderer().render_rule(rule_from_agent, ctx)
    _ = DatalogRenderer().render_rule(rule_from_agent, ctx)
    _ = CypherRenderer().render_rule(rule_from_agent, ctx)
except RenderError as e:
    print("\nOther renderers not implemented yet.")


Other renderers not implemented yet.
