## Skript zum Harvesten von Ortsdaten aus iDAI.chronontology

Mithilfe dieses einfachen Skripts lassen sich Ortszuweisungen in Chronontology extrahieren.

Eingelesen wird ein Auszug der Chronontology Konkordanz.

Ausgegeben wird eine Liste aller Ortsnamen und IDs im iDAI.gazetteer, die mit den eingegebenen Perioden primär assoziiert sind. Dies bedeutet, dass Kerngebiete gegenüber Regionen bevorzugt werden und Regionen wiederum gegenüber namensgebenden Orten.

In [None]:
#Import modules

import pandas as pd
import requests
import time

In [None]:
#Read the concordance

#Choose CSV or Excel file
#df_konkordanz = pd.read_excel("konkordanz_20240527.xlsx", sheet_name="nur SPP")
konkordanz_csv = "konkordanz_nurSPP.csv"
df_konkordanz = pd.read_csv(konkordanz_csv, sep=",", encoding="utf-8")
print(df_konkordanz)

In [None]:
#List of all ChronOntology IDs found in the concordance
#NOTE: the column is taken as is; empty cells and duplicates are not filtered here
chronoids = df_konkordanz["chronontologyID"].tolist()

#print(chronoids)

#For a test run, use only the first 10 ChronoIDs:
#chronoids_test = chronoids[:10]

In [None]:
#Retrieve title, core area, region (spatiallyPartOfRegion) or name-giving place from the ChronOntology API

#Base address of the period endpoint; the get_chrono_* functions append a period ID to it
url = "https://chronontology.dainst.org/data/period/"

#Function to query the name of a period
def get_chrono_name(chrono_id, timeout=30):
    """Return the first name listed for a ChronOntology period.

    Requests ``{url}{chrono_id}`` (``url`` is the module-level API base
    address) and reads ``resource.names`` from the JSON answer. The first
    entry of the first non-empty name list is returned.

    Args:
        chrono_id: Identifier of the period; appended to ``url``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The period name, or the string ``"ERROR"`` if the request fails,
        the status code is not 200, the body is not valid JSON, or the
        record carries no name.
    """
    try:
        # Without a timeout a stalled connection would block the harvest forever.
        response = requests.get(f"{url}{chrono_id}", timeout=timeout)
        if response.status_code != 200:
            return "ERROR"
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report it like an HTTP error
        # instead of aborting a long-running loop.
        return "ERROR"

    resource = data.get('resource') or {}
    names = resource.get('names') or {}
    # presumably a mapping of language code -> list of names; verify against the API
    for variants in names.values():
        if variants:
            return variants[0]
    return "ERROR"

#Function to query the core area of a period
def get_chrono_core_area(chrono_id, timeout=30):
    """Return the first ``hasCoreArea`` entry of a ChronOntology period.

    Requests ``{url}{chrono_id}`` (``url`` is the module-level API base
    address) and looks up ``resource.hasCoreArea`` in the JSON answer.

    Args:
        chrono_id: Identifier of the period; appended to ``url``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first core-area entry, ``"no_core_area"`` if the record has no
        (or an empty) ``hasCoreArea`` list, or ``"ERROR"`` if the request
        fails, the status code is not 200, the body is not valid JSON, or
        the record has no ``resource`` part.
    """
    try:
        # Without a timeout a stalled connection would block the harvest forever.
        response = requests.get(f"{url}{chrono_id}", timeout=timeout)
        if response.status_code != 200:
            return "ERROR"
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report it like an HTTP error.
        return "ERROR"

    resource = data.get('resource')
    if not resource:
        return "ERROR"
    core_areas = resource.get('hasCoreArea')
    # An empty list is treated like a missing key instead of raising IndexError.
    return core_areas[0] if core_areas else "no_core_area"

#Function to query the region of a period
def get_chrono_region(chrono_id, timeout=30):
    """Return the first ``spatiallyPartOfRegion`` entry of a ChronOntology period.

    Requests ``{url}{chrono_id}`` (``url`` is the module-level API base
    address) and looks up ``resource.spatiallyPartOfRegion`` in the JSON
    answer.

    Args:
        chrono_id: Identifier of the period; appended to ``url``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first region entry, ``"no_region"`` if the record has no (or an
        empty) ``spatiallyPartOfRegion`` list, or ``"ERROR"`` if the request
        fails, the status code is not 200, the body is not valid JSON, or
        the record has no ``resource`` part.
    """
    try:
        # Without a timeout a stalled connection would block the harvest forever.
        response = requests.get(f"{url}{chrono_id}", timeout=timeout)
        if response.status_code != 200:
            return "ERROR"
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report it like an HTTP error.
        return "ERROR"

    resource = data.get('resource')
    if not resource:
        return "ERROR"
    regions = resource.get('spatiallyPartOfRegion')
    # An empty list is treated like a missing key instead of raising IndexError.
    return regions[0] if regions else "no_region"

#Function to query the name-giving place of a period
def get_chrono_named_after(chrono_id, timeout=30):
    """Return the first ``isNamedAfter`` entry of a ChronOntology period.

    Requests ``{url}{chrono_id}`` (``url`` is the module-level API base
    address) and looks up ``resource.isNamedAfter`` in the JSON answer.

    Args:
        chrono_id: Identifier of the period; appended to ``url``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first name-giving entry, ``"no_named_after"`` if the record has
        no (or an empty) ``isNamedAfter`` list, or ``"ERROR"`` if the
        request fails, the status code is not 200, the body is not valid
        JSON, or the record has no ``resource`` part.
    """
    try:
        # Without a timeout a stalled connection would block the harvest forever.
        response = requests.get(f"{url}{chrono_id}", timeout=timeout)
        if response.status_code != 200:
            return "ERROR"
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report it like an HTTP error.
        return "ERROR"

    resource = data.get('resource')
    if not resource:
        return "ERROR"
    name_givers = resource.get('isNamedAfter')
    # An empty list is treated like a missing key instead of raising IndexError.
    return name_givers[0] if name_givers else "no_named_after"


In [None]:
#Function to query the name of a place in the Gazetteer
def get_gaz_title(gaz_id, timeout=30):
    """Return the preferred title of an iDAI.gazetteer place.

    Args:
        gaz_id: Gazetteer ID or URI; only the part after the last ``/`` is
            used to build the ``/doc/<id>.json`` address.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``prefName.title`` of the place document,
        ``"ERROR_no_title!!!"`` if the document has no preferred title, or
        ``"ERROR"`` if the request fails, the status code is not 200, or the
        body is not valid JSON. (Previously a non-200 answer fell through
        and yielded ``None``.)
    """
    gaz_id = gaz_id.split("/")[-1]
    doc_url = f"https://gazetteer.dainst.org/doc/{gaz_id}.json"
    try:
        # Without a timeout a stalled connection would block the harvest forever.
        response = requests.get(doc_url, timeout=timeout)
        if response.status_code != 200:
            return "ERROR"
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report it like an HTTP error.
        return "ERROR"

    pref_name = data.get('prefName') or {}
    return pref_name.get('title', "ERROR_no_title!!!")

In [None]:
#Main Loop
#For every period ID: fetch the period name, then the best available place
#reference (core area, else region, else name-giving place) and the title of
#that place in the Gazetteer.

#ids to test the loop
#chronoids = ['oUj62ZmY8g8U','bPXYskijLKfh','iHHsmCseJynu']

processed_chrono_ids = []
result = []
names = []
gaz_titles = []

# Lookups in order of preference, each paired with the marker it returns when
# the period has no such entry.
lookups = (
    (get_chrono_core_area, "no_core_area"),
    (get_chrono_region, "no_region"),
    (get_chrono_named_after, "no_named_after"),
)

total = len(chronoids)

for x, chrono_id in enumerate(chronoids, start=1):
    print(f"processing {chrono_id}, nr. {x} of {total}")
    processed_chrono_ids.append(chrono_id)
    name = get_chrono_name(chrono_id)
    names.append(name)

    # Defaults used when none of the lookups yields a place
    location = "no_localization"
    gaz_title = "no_gaz_title"

    # Later lookups are only requested if the earlier ones found nothing.
    for lookup, missing_marker in lookups:
        candidate = lookup(chrono_id)
        if candidate == "ERROR":
            # A failed request must not be mistaken for a place reference
            # and passed on to the Gazetteer; flag the row instead.
            location = "ERROR"
            gaz_title = "ERROR"
            break
        if candidate != missing_marker:
            location = candidate
            gaz_title = get_gaz_title(candidate)
            break

    result.append(location)
    gaz_titles.append(gaz_title)
    time.sleep(1)  # Sleep for 1 second to avoid overloading the API

# Create a DataFrame with the results
# NOTE(review): 'ChonoID' looks like a typo for 'ChronoID'; the header is kept
# unchanged because files produced earlier may rely on it.
df_results = pd.DataFrame({
    'ChonoID': processed_chrono_ids,
    'PeriodName': names,
    'Localization': gaz_titles,
    'GazetteerID': result
})

print(df_results)

In [None]:
#Export the result table to an Excel file (index=False: the DataFrame index is not written)
df_results.to_excel("chronontology_gazetteer_mapping.xlsx", index=False)


Um nur einzigartige Ortsbezeichnungen zu erhalten, kann der folgende Code auf die zuvor exportierte Tabelle angewendet werden:

In [None]:
#Get unique Gazetteer IDs with place names
df_gazetteer_mapping = pd.read_excel("chronontology_gazetteer_mapping.xlsx", usecols=["GazetteerID"])
unique_gazetteer_ids = df_gazetteer_mapping['GazetteerID'].unique()

placenames = []

for gaz_id in unique_gazetteer_ids:
    if gaz_id == "no_localization":
        # Placeholder written by the main loop for periods without any place;
        # it is not a Gazetteer ID, so no request is made for it.
        print(f"{gaz_id}: no_gaz_title")
        placenames.append("no_gaz_title")
        continue

    gaz_id = gaz_id.split("/")[-1]
    # Deliberately not named `url`: that global holds the ChronOntology base
    # address used by the get_chrono_* functions and must not be overwritten.
    gaz_url = f"https://gazetteer.dainst.org/doc/{gaz_id}.json"
    try:
        # Without a timeout a stalled connection would block the loop forever.
        response = requests.get(gaz_url, timeout=30)
        data = response.json() if response.status_code == 200 else None
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: handled like an HTTP error below.
        data = None

    if data is None:
        print(f"{gaz_id}: ERROR fetching data")
        title = "ERROR fetching data"
    elif 'prefName' in data:
        title = data['prefName']['title']
        print(f"{gaz_id}: {title}")
    else:
        print(f"{gaz_id}: ERROR_no_title!!!")
        title = "ERROR_no_title!!!"
    placenames.append(title)

# Create a DataFrame with the unique Gazetteer IDs and their place names
df_placenames = pd.DataFrame({
    'GazetteerID': unique_gazetteer_ids,
    'PlaceName': placenames
})

# Export the DataFrame to an Excel file
df_placenames.to_excel("unique_gazetteer_placenames.xlsx", index=False)

Entwickelt von Lukas Lammers im Projekt FAIR.rdm, Teil des DFG-geförderten SPP 2143 "Entangled Africa".

Mit Unterstützung von GitHub Copilot (KI-Assistent) bei der Code-Entwicklung und -Optimierung.

Version 1.1, 30.07.2025
