In [2]:
# Input locations, given relative to the notebook's working directory.
xml_file = "thesaurus/DRZE/BETHES8_201209.xml"  # thesaurus XML that the parsing cells read via etree.iterparse
dtd_file = "thesaurus/DRZE/thesauri.dtd"  # NOTE(review): not referenced by any cell visible here — confirm it is still needed

In [27]:
from lxml import etree
from collections import defaultdict

# Parse the thesaurus XML and write a single text file containing one line of
# pipe-separated terms per supergroup.
#
# LANGID values found in the XML:
#   1 = German
#   2 = English
#   3 = French
LANG_ID_ENGLISH = "2"
output_filename = "thesaurus_by_supergroup.txt"

# --- Lookup tables filled by the parsing passes ---
# One-to-one mappings.
termid_to_keyword = dict()   # DESCRIPTOR TERMID -> English TERM text
supergroup_names = dict()    # SUPERGROUP ID -> English DESIGNATOR text

# One-to-many mappings.
keyword_synonyms = defaultdict(list)           # USEPTR value -> synonym TERM texts
descriptor_to_supergroups = defaultdict(list)  # DESCRIPTOR TERMID -> GROUPPTR values
descriptor_narrower_terms = defaultdict(list)  # DESCRIPTOR TERMID -> NTPTR values

# --- Pass 1: Collect keywords from DESCRIPTORS ---
# Store the English TERM text of every DESCRIPTOR under its TERMID.
print("Pass 1: Collecting descriptor terms...")
english_langdata_path = f"LANGDATA[@LANGID='{LANG_ID_ENGLISH}']"
for _, descriptor in etree.iterparse(xml_file, events=("end",), tag="DESCRIPTOR"):
    english_block = descriptor.find(english_langdata_path)
    # findtext() gives None for a missing TERM and "" for an empty one, so a
    # single truthiness check rejects both.
    label = None if english_block is None else english_block.findtext("TERM")
    if label:
        termid_to_keyword[descriptor.get("TERMID")] = label.strip()
    descriptor.clear()  # drop the element's content once it has been read

# --- Pass 2: Collect synonyms from SYNONYMS ---
# Each SYNONYM contributes its TERM text to the list stored under its USEPTR.
print("Pass 2: Collecting synonyms...")
for _, synonym_group in etree.iterparse(xml_file, events=("end",), tag="SYNONYMS"):
    if synonym_group.get("LANGID") == LANG_ID_ENGLISH:
        for entry in synonym_group.iterfind("SYNONYM"):
            target_id = entry.findtext("USEPTR")
            synonym_text = entry.findtext("TERM")
            if not (target_id and synonym_text):
                continue  # incomplete entry: nothing to attach or nowhere to attach it
            keyword_synonyms[target_id.strip()].append(synonym_text.strip())
    synonym_group.clear()

# --- Pass 3: Collect Supergroup names ---
# Store the English DESIGNATOR of every SUPERGROUP under its ID.
print("Pass 3: Collecting supergroup names...")
english_langentry_path = f"LANGENTRY[@LANGID='{LANG_ID_ENGLISH}']"
for _, supergroup in etree.iterparse(xml_file, events=("end",), tag="SUPERGROUP"):
    english_entry = supergroup.find(english_langentry_path)
    designator_text = None if english_entry is None else english_entry.findtext("DESIGNATOR")
    if designator_text:
        supergroup_names[supergroup.get("ID")] = designator_text.strip()
    supergroup.clear()

# --- Pass 4: Map descriptors to supergroups and collect narrower terms ---
# NOTE(review): this is a second scan over the same DESCRIPTOR elements that
# Pass 1 reads; the two could share one parse of the file.
print("Pass 4: Mapping relationships...")
for _, descriptor in etree.iterparse(xml_file, events=("end",), tag="DESCRIPTOR"):
    descriptor_id = descriptor.get("TERMID")

    # GROUPPTR values are treated as supergroup IDs by the assembly step.
    group_ids = [ptr.text.strip() for ptr in descriptor.iterfind("GROUPPTR") if ptr.text]
    if group_ids:  # guard keeps the defaultdict free of empty entries
        descriptor_to_supergroups[descriptor_id].extend(group_ids)

    # NTPTR values are the term IDs of this descriptor's narrower terms.
    child_ids = [ptr.text.strip() for ptr in descriptor.iterfind("NTPTR") if ptr.text]
    if child_ids:
        descriptor_narrower_terms[descriptor_id].extend(child_ids)

    descriptor.clear()

# --- Final Assembly and File Writing ---
# For every descriptor, gather its keyword, its synonyms and the keywords of
# its direct narrower terms, then file that bundle under each supergroup the
# descriptor points to. One output line is written per supergroup.
print(f"Final Assembly: Building the data and writing to {output_filename}...")
supergroup_all_terms = defaultdict(list)  # supergroup id -> every term filed under it (may repeat)

for termid, keyword in termid_to_keyword.items():
    all_related_terms = [keyword]
    all_related_terms.extend(keyword_synonyms.get(termid, []))

    # Resolve narrower-term IDs to keywords. IDs without a collected keyword
    # are skipped: substituting a placeholder string would write that
    # placeholder into the output file as if it were a real thesaurus term.
    all_related_terms.extend(
        termid_to_keyword[nt_id]
        for nt_id in descriptor_narrower_terms.get(termid, [])
        if nt_id in termid_to_keyword
    )

    # A descriptor may belong to several supergroups; each receives the bundle.
    for sg_id in descriptor_to_supergroups.get(termid, []):
        supergroup_all_terms[sg_id].extend(all_related_terms)

# Write the collected data to the text file, one "<name>:<t1>|<t2>|..." line
# per supergroup.
with open(output_filename, "w", encoding="utf-8") as outfile:
    for sg_id, terms_list in supergroup_all_terms.items():
        supergroup_name = supergroup_names.get(sg_id, "Unknown Supergroup")

        # set() removes repeated terms; sorted() makes the line order stable.
        line_content = "|".join(sorted(set(terms_list)))

        outfile.write(f"{supergroup_name}:{line_content}\n")

print("Done. The output has been saved to the file.")

Pass 1: Collecting descriptor terms...
Pass 2: Collecting synonyms...
Pass 3: Collecting supergroup names...
Pass 4: Mapping relationships...
Final Assembly: Building the data and writing to thesaurus_by_supergroup.txt...
Done. The output has been saved to the file.


In [3]:
## Recursively collects the narrower terms of each top-level descriptor and dumps them, grouped by supergroup, into a JSON file


from lxml import etree
from collections import defaultdict
import json

# xml_file is taken from the notebook's first cell.
# LANGID "1" selects the German labels (1 = German, 2 = English, 3 = French),
# matching the "_DE" suffix of the output file. The constant is named for the
# language it selects so that running this cell does not overwrite an
# English language-ID constant in the kernel with the German value.
LANG_ID_GERMAN = "1"
output_filename = "thesaurus_supergroup_hierarchy_DE.json"

termid_to_keyword = {}                         # DESCRIPTOR TERMID -> German TERM text
supergroup_names = {}                          # SUPERGROUP ID -> German DESIGNATOR text
descriptor_to_supergroups = defaultdict(list)  # DESCRIPTOR TERMID -> GROUPPTR values
descriptor_narrower_terms = defaultdict(list)  # DESCRIPTOR TERMID -> NTPTR values

# --- Pass 1: collect descriptor terms and their relationships ---
# Labels, GROUPPTR links and NTPTR links all live on the same DESCRIPTOR
# elements, so they are read in one scan instead of parsing the file twice.
german_langdata_path = f"LANGDATA[@LANGID='{LANG_ID_GERMAN}']"
for _, elem in etree.iterparse(xml_file, events=("end",), tag="DESCRIPTOR"):
    termid = elem.get("TERMID")

    langdata = elem.find(german_langdata_path)
    if langdata is not None:
        term_elem = langdata.find("TERM")
        if term_elem is not None and term_elem.text:
            termid_to_keyword[termid] = term_elem.text.strip()

    for group_ptr in elem.findall("GROUPPTR"):
        if group_ptr.text:
            descriptor_to_supergroups[termid].append(group_ptr.text.strip())

    for nt_ptr in elem.findall("NTPTR"):
        if nt_ptr.text:
            descriptor_narrower_terms[termid].append(nt_ptr.text.strip())

    elem.clear()  # drop the element's content once it has been read

# --- Pass 2: collect supergroup names ---
german_langentry_path = f"LANGENTRY[@LANGID='{LANG_ID_GERMAN}']"
for _, elem in etree.iterparse(xml_file, events=("end",), tag="SUPERGROUP"):
    sg_id = elem.get("ID")
    lang_entry = elem.find(german_langentry_path)
    if lang_entry is not None:
        designator = lang_entry.findtext("DESIGNATOR")
        if designator:
            supergroup_names[sg_id] = designator.strip()
    elem.clear()

# --- Resolve all narrower terms recursively ---
def get_all_narrowers(termid, seen=None):
    """Return the keywords of every term reachable below ``termid``.

    Follows the links in the module-level ``descriptor_narrower_terms`` mapping
    depth-first and translates each visited term ID through
    ``termid_to_keyword``. IDs without a keyword are still traversed but
    contribute nothing to the result.

    Args:
        termid: ID of the term whose descendants are wanted.
        seen: Set of term IDs that must not be visited (again). It is mutated
            in place. When omitted, a fresh set seeded with ``termid`` is used.

    Returns:
        List of keywords in depth-first visiting order; each reachable term
        appears at most once.
    """
    if seen is None:
        # Seed with the root: in a cyclic chain (A -> B -> A) the root would
        # otherwise be reported as one of its own narrower terms.
        seen = {termid}
    result = []
    for child in descriptor_narrower_terms.get(termid, []):
        if child in seen:
            continue
        seen.add(child)
        if child in termid_to_keyword:
            result.append(termid_to_keyword[child])
        result.extend(get_all_narrowers(child, seen))
    return result

# --- Collect the IDs of all narrower terms so they are not repeated at top level ---
all_narrower_ids = {
    child_id
    for child_ids in descriptor_narrower_terms.values()
    for child_id in child_ids
}

# --- Build the hierarchy ---
# supergroup name -> {top-level descriptor keyword -> sorted narrower keywords}
supergroup_dict = defaultdict(dict)

for termid, sg_ids in descriptor_to_supergroups.items():
    # Only descriptors that are nobody's narrower term become top-level keys.
    if termid not in all_narrower_ids:
        narrower = get_all_narrowers(termid)
        # Descriptors with fewer than two resolved narrower terms are left out.
        if len(narrower) >= 2:
            descriptor = termid_to_keyword.get(termid, f"Unknown descriptor {termid}")
            for sg_id in sg_ids:
                sg_name = supergroup_names.get(sg_id, f"Unknown Supergroup {sg_id}")
                supergroup_dict[sg_name][descriptor] = sorted(set(narrower))

# --- Save as JSON ---
# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
with open(output_filename, "w", encoding="utf-8") as json_file:
    json.dump(supergroup_dict, json_file, indent=2, ensure_ascii=False)

print("Done.")


Done.
