In [2]:
import gedcom7

# Helper function to extract important info from individual or family structure
def parse_individual(structure):
    person_id = structure.pointer
    person = {
            'id': person_id,  # Capture the ID (pointer)
            'name': None,
            'sex': None,
            'birth': None,
            'family': None
        }
    for child in structure.children:
        if child.tag == 'NAME':
            person['name'] = child.text
        elif child.tag == 'SEX':
            person['sex'] = child.text
        elif child.tag == 'BIRT':
            for birth_child in child.children:
                if birth_child.tag == 'DATE':
                    person['birth'] = birth_child.text
        elif child.tag == 'FAMC':
            person['family'] = child.text
    
    # Debugging statement to check what is being parsed
    print(f"Parsed Individual: {person}")
    return person

def parse_family(structure):
# Try to manually extract pointer if it is not captured
    family_id = structure.pointer
    family = {
        'id': family_id,  # Capture the family ID (pointer)
        'husband': None,
        'wife': None,
        'marriage_date': None
    }    
    for child in structure.children:
        if child.tag == 'HUSB':
            family['husband'] = child.text
        elif child.tag == 'WIFE':
            family['wife'] = child.text
        elif child.tag == 'MARR':
            for marr_child in child.children:
                if marr_child.tag == 'DATE':
                    family['marriage_date'] = marr_child.text
    
    # Debugging statement to check what is being parsed
    print(f"Parsed Family: {family}")
    return family

def extract_gedcom_data(gedcom_structures):
    individuals = {}
    families = {}
    # Parse top-level structures
    for structure in gedcom_structures:
        #print(structure.pointer)
        if structure.tag == 'INDI':  # Individual
            individual = parse_individual(structure)
            individuals[individual['id']] = individual

        elif structure.tag == 'FAM':  # Family
            family = parse_family(structure)
            families[family['id']] = family
        else:
            # Debugging statement for unexpected structures
            print(f"Unexpected structure: {structure.tag} with pointer: {structure.pointer}")

    return {'individuals': individuals, 'families': families}

# Sample usage with debugging
with open("smallgedcom70.ged", "r", encoding="utf-8") as f:
    gedcom_data = f.read()

gedcom_structures = gedcom7.loads(gedcom_data)
data = extract_gedcom_data(gedcom_structures)

# Print the extracted data
from pprint import pprint
pprint(data)


Unexpected structure: HEAD with pointer: None
Parsed Individual: {'id': None, 'name': 'Kate /Perk/', 'sex': 'F', 'birth': '1999', 'family': None}
Parsed Individual: {'id': None, 'name': 'Keith /Perk/', 'sex': 'M', 'birth': '1965', 'family': None}
Parsed Family: {'id': None, 'husband': None, 'wife': None, 'marriage_date': None}
Unexpected structure: TRLR with pointer: None
{'families': {None: {'husband': None,
                     'id': None,
                     'marriage_date': None,
                     'wife': None}},
 'individuals': {None: {'birth': '1965',
                        'family': None,
                        'id': None,
                        'name': 'Keith /Perk/',
                        'sex': 'M'}}}


In [9]:
import gedcom

def print_structure(structure, level=0):
    indent = "  " * level  # Indent to visualize hierarchy
    print(f"{indent}Tag: {structure.tag}, Text: {structure.text}")

    # Recursively print children, if any
    for child in structure.children:
        print_structure(child, level + 1)  # Increase indentation for children

with open("smallgedcom70.ged", "r", encoding="utf-8") as f:
    string = f.read()

records = gedcom7.loads(string)
for structure in records:
    print_structure(structure)

Tag: HEAD, Text: None
  Tag: SOUR, Text: Gramps
    Tag: VERS, Text: 5.2.3
    Tag: NAME, Text: Gramps
  Tag: DATE, Text: 5 OCT 2024
    Tag: TIME, Text: 21:22:51
  Tag: SUBM, Text: None
  Tag: FILE, Text: /Users/georgiadanehy/Downloads/kate.ged
  Tag: COPR, Text: Copyright (c) 2024 .
  Tag: GEDC, Text: None
    Tag: VERS, Text: 5.5.1
    Tag: FORM, Text: LINEAGE-LINKED
  Tag: CHAR, Text: UTF-8
  Tag: LANG, Text: English
Tag: SUBM, Text: None
  Tag: NAME, Text: None
Tag: INDI, Text: None
  Tag: NAME, Text: Kate /Perk/
    Tag: TYPE, Text: birth
    Tag: GIVN, Text: Kate
    Tag: SURN, Text: Perkins
  Tag: SEX, Text: F
  Tag: BIRT, Text: None
    Tag: DATE, Text: 1999
  Tag: FAMC, Text: None
    Tag: PEDI, Text: birth
  Tag: CHAN, Text: None
    Tag: DATE, Text: 5 OCT 2024
      Tag: TIME, Text: 21:16:33
Tag: INDI, Text: None
  Tag: NAME, Text: Keith /Perk/
    Tag: TYPE, Text: birth
    Tag: GIVN, Text: Keith
    Tag: SURN, Text: Perkins
  Tag: SEX, Text: M
  Tag: BIRT, Text: None
    