In [1]:
# CELL 1 – Install & imports

# Optional: install dependencies in a fresh environment
# (use %pip rather than !pip so the packages land in the running kernel's environment)
# %pip install neo4j certifi

import os
import re
import xml.etree.ElementTree as ET
from typing import Dict, List, Tuple

from neo4j import GraphDatabase
import certifi  # kept for consistency with other notebooks

In [4]:
# CELL 2 – Configuration (paths, Neo4j, namespaces)
#
# Secrets are read from environment variables instead of being hardcoded, so
# they never end up in version control or in a shared copy of this notebook.
# Set them before starting Jupyter, e.g.:
#   export NEO4J_PASSWORD="..."
#   export OPENAI_API_KEY="..."
# Any credential that was ever pasted into a notebook cell should be treated
# as compromised and rotated.

# Path to your ArchiMate XML model
# Adjust this to where you store the file in your project
XML_PATH = "./sources/Data/Cross_App_Data_Flow.xml"

# Neo4j connection (same Aura instance as textbook)
NEO4J_URI = "neo4j+s://fde218db.databases.neo4j.io"
NEO4J_USER = "neo4j"
# Empty string when unset, so the name keeps its `str` type for downstream cells.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")
NEO4J_DB = "neo4j"

# Embedding model configuration – IDENTICAL to textbook parser
EMBEDDING_MODEL = "text-embedding-3-small"  # 1536 dimensions
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

print("XML path:", XML_PATH)
print("Neo4j URI:", NEO4J_URI)
print("Embedding model:", EMBEDDING_MODEL)
if not NEO4J_PASSWORD:
    # Warn instead of raising so the rest of the configuration is still defined;
    # the connectivity cell will report the authentication failure.
    print("⚠️ NEO4J_PASSWORD is not set in the environment; Neo4j authentication will fail.")

# XML namespaces
ARCHIMATE_NS = {"a": "http://www.opengroup.org/xsd/archimate/3.0/"}
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"

XML path: ./sources/Data/Cross_App_Data_Flow.xml
Neo4j URI: neo4j+s://fde218db.databases.neo4j.io
Embedding model: text-embedding-3-small


In [5]:
# CELL 3 – Connectivity helpers (same style as other notebooks)

# Short aliases for the connection settings defined in CELL 2.
# try_connect() below reads AUTH and DB as module-level globals.
URI  = NEO4J_URI
AUTH = (NEO4J_USER, NEO4J_PASSWORD)  # (user, password) tuple passed as `auth=` to the driver
DB   = NEO4J_DB

def nuke_proxies():
    """Strip proxy settings from the process environment.

    Removes the upper- and lower-case variants of HTTPS_PROXY, HTTP_PROXY and
    ALL_PROXY if present, then gives NO_PROXY a default covering the Neo4j
    Aura domains. An already-set NO_PROXY is left untouched.
    """
    proxy_vars = [
        variant
        for base in ("HTTPS_PROXY", "HTTP_PROXY", "ALL_PROXY")
        for variant in (base, base.lower())
    ]
    for proxy_var in proxy_vars:
        if proxy_var in os.environ:
            del os.environ[proxy_var]

    # Only fill in NO_PROXY when the user has not configured one themselves.
    if "NO_PROXY" not in os.environ:
        os.environ["NO_PROXY"] = "databases.neo4j.io,.neo4j.io"

def try_connect(tag: str, uri: str, driver_kwargs=None) -> bool:
    """Probe a Neo4j endpoint and report whether it is usable.

    Opens a temporary driver with the module-level ``AUTH`` credentials,
    verifies connectivity, and runs ``RETURN 1`` against the module-level
    ``DB`` database.

    Args:
        tag: Human-readable label printed alongside the URI.
        uri: Neo4j connection URI to test.
        driver_kwargs: Optional extra keyword arguments forwarded to
            ``GraphDatabase.driver``.

    Returns:
        True if the probe query succeeded, False if any step raised. The
        exception is printed, not re-raised, so a notebook run continues.
    """
    print(f"Testing {tag} → {uri}")
    driver_kwargs = driver_kwargs or {}
    try:
        # Use the driver as a context manager so it is closed even when
        # verify_connectivity() or the probe query raises; previously the
        # driver was only closed on the success path and leaked on failure.
        with GraphDatabase.driver(uri, auth=AUTH, **driver_kwargs) as drv:
            drv.verify_connectivity()
            with drv.session(database=DB) as s:
                s.run("RETURN 1").consume()
        print("OK ✅")
        return True
    except Exception as e:
        # Deliberately broad: this is a best-effort diagnostic helper.
        print(f"Failed: {e}")
        return False

# Clear proxy settings first, then probe the configured endpoint once.
nuke_proxies()
ok = try_connect("neo4j+s default", URI)
print("All good. ✅" if ok else "⚠️ Connectivity test failed. Check URI/credentials/proxy.")


Testing neo4j+s default → neo4j+s://fde218db.databases.neo4j.io
OK ✅
All good. ✅


In [6]:
# CELL 4 – Helper: slugify for stable Application keys

def slugify(text: str) -> str:
    """Turn free text into a lowercase, underscore-separated key.

    The input is trimmed and lowercased; every run of characters outside
    ``a-z0-9`` acts as a separator, and the remaining pieces are joined with a
    single ``_`` (so the result never starts or ends with an underscore).
    """
    pieces = re.split(r"[^a-z0-9]+", text.strip().lower())
    # Splitting leaves empty strings at the edges when the text starts or
    # ends with a separator; dropping them is what trims the underscores.
    return "_".join(piece for piece in pieces if piece)


In [7]:
# CELL 5 – Parse Cross-App Data-Flow ArchiMate XML into Python structures

def parse_cross_app_model(path: str) -> Tuple[List[Dict], List[Dict]]:
    """
    Read the Cross-App Data-Flow ArchiMate XML file and extract applications and flows.

    Steps:
      1. Index every child of <elements> that has an ``identifier`` attribute.
      2. Collect the ``identifierRef`` values listed directly under the
         top-level <organizations> item whose label is exactly 'Application'.
      3. Keep the children of <relationships> whose source AND target are both
         in that Application id set.

    Args:
        path: Location of the ArchiMate XML file.

    Returns:
        ``(applications, flows)`` where
          * each application dict has the keys ``id``, ``name``,
            ``archimate_type``, ``documentation`` and ``category``
            (``category`` is the value of property ``propid-1``), ordered by id;
          * each flow dict has the keys ``id``, ``archimate_type``, ``label``,
            ``source_id``, ``target_id``, ``source_name`` and ``target_name``.

    Raises:
        ValueError: if the <elements>, <organizations> or <relationships>
            section is missing.
    """
    ns = ARCHIMATE_NS
    type_attr = f"{{{XSI_NS}}}type"
    root = ET.parse(path).getroot()

    def section(tag):
        """Return the named top-level section or raise if it is absent."""
        node = root.find(f"a:{tag}", ns)
        if node is None:
            raise ValueError(f"No <{tag}> section found in XML.")
        return node

    def optional_text(node):
        """Stripped text of ``node``; None when the node or its text is missing/empty."""
        if node is None or not node.text:
            return None
        return node.text.strip()

    # --- 1) Elements: identifier -> description dict ---
    elements_by_id: Dict[str, Dict] = {}
    for element in section("elements"):
        identifier = element.attrib.get("identifier")
        if not identifier:
            continue

        # A present-but-empty <name> yields "", a missing <name> yields None.
        name_node = element.find("a:name", ns)
        if name_node is None:
            name = None
        else:
            name = (name_node.text or "").strip()

        # Properties keyed by their propertyDefinitionRef (Category is 'propid-1').
        properties: Dict[str, str] = {}
        properties_node = element.find("a:properties", ns)
        property_nodes = (
            properties_node.findall("a:property", ns) if properties_node is not None else []
        )
        for property_node in property_nodes:
            definition_ref = property_node.attrib.get("propertyDefinitionRef")
            value = optional_text(property_node.find("a:value", ns))
            if definition_ref and value is not None:
                properties[definition_ref] = value

        elements_by_id[identifier] = {
            "id": identifier,
            "name": name,
            "archimate_type": element.attrib.get(type_attr),
            "documentation": optional_text(element.find("a:documentation", ns)),
            "properties": properties,
        }

    # --- 2) Organizations: ids referenced under the 'Application' folder ---
    application_ids = set()
    for folder in section("organizations").findall("a:item", ns):
        if optional_text(folder.find("a:label", ns)) != "Application":
            continue
        child_refs = (child.attrib.get("identifierRef") for child in folder.findall("a:item", ns))
        application_ids.update(ref for ref in child_refs if ref)

    # Sorted ids give a deterministic application order; refs without a
    # matching element are dropped.
    applications: List[Dict] = []
    for app_id in sorted(application_ids):
        record = elements_by_id.get(app_id)
        if not record:
            continue
        applications.append(
            {
                "id": record["id"],
                "name": record["name"],
                "archimate_type": record["archimate_type"],
                "documentation": record["documentation"],
                "category": record.get("properties", {}).get("propid-1"),
            }
        )

    # --- 3) Relationships: keep Application-to-Application ones only ---
    flows: List[Dict] = []
    for relation in section("relationships"):
        rel_id = relation.attrib.get("identifier")
        source_id = relation.attrib.get("source")
        target_id = relation.attrib.get("target")
        if not (rel_id and source_id and target_id):
            continue
        if not (source_id in application_ids and target_id in application_ids):
            continue

        source_record = elements_by_id.get(source_id)
        target_record = elements_by_id.get(target_id)

        flows.append(
            {
                "id": rel_id,
                # e.g. Flow, Triggering, Serving, Aggregation, Realization...
                "archimate_type": relation.attrib.get(type_attr),
                # optional textual label describing the payload, e.g. "order data"
                "label": optional_text(relation.find("a:name", ns)),
                "source_id": source_id,
                "target_id": target_id,
                "source_name": source_record["name"] if source_record else None,
                "target_name": target_record["name"] if target_record else None,
            }
        )

    print(f"Parsed {len(applications)} Application elements.")
    print(f"Parsed {len(flows)} Application-to-Application relationships.")

    return applications, flows


# Parse the model once; `applications` and `flows` are reused by the Neo4j
# upsert cells further down.
applications, flows = parse_cross_app_model(XML_PATH)

# quick peek: the bare tuple is the cell's last expression, so Jupyter renders
# the first three entries of each list as the cell output
applications[:3], flows[:3]


Parsed 44 Application elements.
Parsed 50 Application-to-Application relationships.


([{'id': 'id-03cdb4dc95c14054848e160f72c80d3b',
   'name': 'RabbitMQ',
   'archimate_type': 'ApplicationComponent',
   'documentation': 'Purpose: This application is somewhat special within the SpeedParcel landscape\nbecause it is a message-oriented middleware (MOM) through which all data passes. This\nmeans that all data flows through this application and thus it technically enables all\ncapabilities. In order to not have to connect it with every capability, we have the edges\nout of the landscape map. Again, in order to keep the landscape readable, RabbitMQ is\nleft out, as every data flow connection would have to run through it. It is technically in the\nmiddle between each connection in the diagram.\n\nBusiness Objects: n/a',
   'category': None},
  {'id': 'id-08a4e24a9b0d42c880efb48bf3387033',
   'name': 'IBM OMS',
   'archimate_type': 'ApplicationComponent',
   'documentation': 'Purpose: IBM OMS is an order management system that was already in place at\nSpeedParcel. It is one of

In [8]:
# CELL 6 – Connect to Neo4j (driver) and basic schema

# Module-level driver shared by the upsert and sanity-check cells below;
# it is closed explicitly in the final clean-up cell.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
driver.verify_connectivity()  # fail early here if the endpoint cannot be reached
print("✅ Connected to Neo4j")

with driver.session(database=NEO4J_DB) as session:
    # NOTE: this block is currently a placeholder – no Cypher is executed, so
    # nothing is actually checked or created in the database here.
    #
    # We reuse existing Application constraints from your other notebooks.
    # If you haven't created them yet, you can (uncomment if needed):
    #
    # session.run("""
    #     CREATE CONSTRAINT appName IF NOT EXISTS
    #     FOR (a:Application) REQUIRE a.name IS UNIQUE;
    # """)
    #
    # No extra constraints for DATA_FLOW relationships needed.
    pass

print("Schema check done.")


✅ Connected to Neo4j
Schema check done.


In [9]:
# CELL 7 – Upsert Application nodes from Cross-App Data-Flow diagram

def upsert_applications_in_neo4j(apps: List[Dict]):
    """Merge parsed Application elements into Neo4j as ``:Application`` nodes.

    Nodes are keyed by ``name``. New nodes receive all properties; existing
    nodes only have missing properties filled in (``coalesce``), and
    ``'cross_app_data_flow'`` is appended to ``sourceModels`` if absent.

    Args:
        apps: Application dicts with the keys ``id``, ``name``,
            ``archimate_type`` and ``category`` (as produced by
            ``parse_cross_app_model``). Entries with an empty or missing name
            are skipped.

    Returns:
        None. A summary is printed: how many named elements were merged, how
        many distinct names they map to, and how many were skipped.
    """
    merged_elements = 0
    skipped = 0
    distinct_names = set()

    with driver.session(database=NEO4J_DB) as session:
        for app in apps:
            name = app["name"]
            if not name:
                # Ignore nameless application elements, but count them so the
                # summary below is accurate.
                skipped += 1
                continue

            # consume() forces the statement to complete here, so a failure is
            # raised for this application rather than at a later statement.
            session.run(
                """
                MERGE (a:Application {name: $name})
                ON CREATE SET
                    a.key           = $key,
                    a.archimateId   = $archimate_id,
                    a.archimateType = $archimate_type,
                    a.category      = $category,
                    a.sourceModels  = [$source_model]
                ON MATCH SET
                    a.key           = coalesce(a.key, $key),
                    a.archimateId   = coalesce(a.archimateId, $archimate_id),
                    a.archimateType = coalesce(a.archimateType, $archimate_type),
                    a.category      = coalesce(a.category, $category),
                    a.sourceModels  = (
                        CASE
                            WHEN a.sourceModels IS NULL THEN [$source_model]
                            WHEN NOT $source_model IN a.sourceModels
                                THEN a.sourceModels + $source_model
                            ELSE a.sourceModels
                        END
                    )
                """,
                name=name,
                key=slugify(name),
                archimate_id=app["id"],
                archimate_type=app["archimate_type"],
                category=app["category"],
                source_model="cross_app_data_flow",
            ).consume()
            merged_elements += 1
            distinct_names.add(name)

    # Several model elements can share one name and therefore collapse into a
    # single node, so report both figures instead of len(apps).
    print(
        f"Upserted/merged {merged_elements} named Application elements "
        f"into {len(distinct_names)} distinct Application names (Cross-App Data-Flow)."
    )
    if skipped:
        print(f"Skipped {skipped} Application elements without a name.")


# Write the applications parsed in CELL 5 to Neo4j (uses the module-level `driver`).
upsert_applications_in_neo4j(applications)


Upserted/merged 44 Application nodes (Cross-App Data-Flow).


In [10]:
# CELL 8 – Create DATA_FLOW relationships between Applications

def create_data_flows_in_neo4j(flows: List[Dict]):
    """Merge Application-to-Application relationships into Neo4j as ``DATA_FLOW``.

    Endpoints are looked up by Application ``name``; the relationship is keyed
    by ``archimateRelId``. Existing relationships only have missing properties
    filled in (``coalesce``).

    Args:
        flows: Flow dicts with the keys ``id``, ``archimate_type``, ``label``,
            ``source_name`` and ``target_name`` (as produced by
            ``parse_cross_app_model``).

    Returns:
        None. A summary is printed: relationships actually created/merged,
        flows skipped because a name was missing, and flows whose endpoint
        nodes were not found in the database.
    """
    merged = 0
    skipped = 0
    unmatched = 0

    with driver.session(database=NEO4J_DB) as session:
        for rel in flows:
            src = rel["source_name"]
            tgt = rel["target_name"]
            if not src or not tgt:
                skipped += 1
                continue

            # RETURN count(r) always yields exactly one row: 0 when either
            # MATCH found no node (so nothing was merged), >= 1 otherwise.
            record = session.run(
                """
                MATCH (src:Application {name: $src_name})
                MATCH (tgt:Application {name: $tgt_name})
                MERGE (src)-[r:DATA_FLOW {archimateRelId: $rel_id}]->(tgt)
                ON CREATE SET
                    r.flowKind   = $archimate_type,
                    r.label      = $label,
                    r.source     = $source_model
                ON MATCH SET
                    r.flowKind   = coalesce(r.flowKind, $archimate_type),
                    r.label      = coalesce(r.label, $label),
                    r.source     = coalesce(r.source, $source_model)
                RETURN count(r) AS merged
                """,
                src_name=src,
                tgt_name=tgt,
                rel_id=rel["id"],
                archimate_type=rel["archimate_type"],
                label=rel["label"],
                source_model="cross_app_data_flow",
            ).single()

            # Only count flows that really produced a relationship; previously
            # every executed statement was counted even when it matched nothing.
            if record["merged"]:
                merged += 1
            else:
                unmatched += 1

    print(f"Created/merged {merged} DATA_FLOW relationships.")
    if skipped:
        print(f"Skipped {skipped} relationships without resolvable source/target names.")
    if unmatched:
        print(
            f"Skipped {unmatched} relationships whose source/target Application "
            "node was not found in Neo4j."
        )


# Write the relationships parsed in CELL 5 to Neo4j (uses the module-level `driver`).
create_data_flows_in_neo4j(flows)


Created/merged 50 DATA_FLOW relationships.


In [11]:
# CELL 9 – Quick sanity checks (optional)

# Count what this notebook's source model contributed to the graph.
# Query parameters are passed as an explicit dict.
with driver.session(database=NEO4J_DB) as session:
    # Application nodes tagged with this source model
    app_count = session.run(
        """
        MATCH (a:Application)
        WHERE $source_model IN coalesce(a.sourceModels, [])
        RETURN count(a) AS c
        """,
        {"source_model": "cross_app_data_flow"},
    ).single()["c"]

    # DATA_FLOW relationships whose `source` property is this model
    flow_count = session.run(
        """
        MATCH ()-[r:DATA_FLOW]->()
        WHERE r.source = $source_model
        RETURN count(r) AS c
        """,
        {"source_model": "cross_app_data_flow"},
    ).single()["c"]

print("Application nodes from Cross-App model:", app_count)
print("DATA_FLOW relationships from Cross-App model:", flow_count)


Application nodes from Cross-App model: 25
DATA_FLOW relationships from Cross-App model: 50


In [12]:
# CELL 10 – Close driver (optional clean-up)

# Shut down the module-level driver created in CELL 6. Any cell that uses
# `driver` afterwards needs CELL 6 to be re-run first.
driver.close()
print("Neo4j driver closed.")

Neo4j driver closed.
