# Extract all unique entities from MSE turtle file

In [6]:
import rdflib
from collections import defaultdict
import pandas as pd

# Path to the RDF file (Turtle format)
file_path = 'Material/output.ttl'

# Create a new RDF graph
g = rdflib.Graph()

# Try to parse the RDF file
try:
    g.parse(file_path, format='turtle')
except Exception as e:
    print(f"Error parsing RDF file: {e}")

# Initialize data structures
entities = set()
subject_entities = set()
relationships = set()
label_dict = {}
descriptions = defaultdict(list)  # Initialize descriptions with default empty lists

# Set of URIs for description
description_info_set = {
    rdflib.URIRef("http://purl.org/dc/terms/description"),
    rdflib.URIRef("http://purls.helmholtz-metadaten.de/mwo/description"),
    rdflib.URIRef("http://www.w3.org/2000/01/rdf-schema#comment"),
    rdflib.URIRef("http://purl.obolibrary.org/obo/IAO_0000115"),
    rdflib.URIRef("http://www.w3.org/2004/02/skos/core#definition")
}

# Iterate over the graph
for subject, predicate, obj in g:
    relationships.add(predicate)  # Add predicate to relationships

    # Add entities
    if isinstance(subject, rdflib.URIRef):
        subject_entities.add(subject)
        entities.add(subject)
    if isinstance(obj, rdflib.URIRef):
        entities.add(obj)

    # Extract labels for subject entities
    if predicate == rdflib.RDFS.label or predicate == rdflib.URIRef("http://purl.org/dc/terms/title") and isinstance(subject, rdflib.URIRef):
        label_dict[subject] = str(obj)

    # Append descriptions
    if predicate in description_info_set:
        descriptions[subject].append(str(obj))

# Combine multiple descriptions into a single string for each entity
for entity, desc_list in descriptions.items():
    descriptions[entity] = ' '.join(desc_list)

# Use URI as fallback label for subject entities without explicit label
for entity in entities:
    if entity not in label_dict:
        label_dict[entity] = str(entity) if entity in subject_entities else None

# Create DataFrame for entities with labels and descriptions
entity_list = []
for entity in entities:
    entity_label = label_dict.get(entity, "")
    entity_description = descriptions.get(entity, "")
    entity_list.append([str(entity), entity_label, entity_description])

df_entity = pd.DataFrame(entity_list, columns=['entity_uri', 'entity_label', 'entity_description'])

# To export the DataFrame 'df_entity' to an Excel file
df_entity.to_excel( 'extracted_entity_list .xlsx' , index=False)


Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#time, Converter=<function parse_time at 0x000002A2D3AB5550>
Traceback (most recent call last):
  File "d:\Anaconda\envs\iseenv\lib\site-packages\rdflib\term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "d:\Anaconda\envs\iseenv\lib\site-packages\isodate\isotime.py", line 148, in parse_time
    raise ISO8601Error('Unrecognised ISO 8601 time format: %r' % timestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 time format: '2x90 min/week'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#time, Converter=<function parse_time at 0x000002A2D3AB5550>
Traceback (most recent call last):
  File "d:\Anaconda\envs\iseenv\lib\site-packages\rdflib\term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "d:\Anaconda\envs\iseenv\lib\site-packages\isodate\

Unnamed: 0,entity_uri,entity_label,entity_description
0,http://demo.fiz-karlsruhe.de/matwerk/E47361,Homogenization of elastic,
1,http://demo.fiz-karlsruhe.de/matwerk/E602157,MetaStore2,General purpose metadata repository and schema...
2,http://demo.fiz-karlsruhe.de/matwerk/E1153922,Electronic Structure Common Data Format (ESCDF),This project gathers the working documents use...
3,https://www.tf.uni-kiel.de/matwis/instmat/de,,
4,https://ams3.digitaloceanspaces.com/externalpu...,,
...,...,...,...
2587,http://demo.fiz-karlsruhe.de/matwerk/E370371,https://materials.hybrid3.duke.edu/,
2588,http://demo.fiz-karlsruhe.de/matwerk/E85667,http://www.bio.uni-freiburg.de/fakultaet/insti...,
2589,http://demo.fiz-karlsruhe.de/matwerk/E388641,https://github.com/Atomistica/atomistica,
2590,https://www.ctcms.nist.gov/potentials/,,


# Extract all unique predicates/relations from MSE turtle file

In [5]:
from rdflib import Graph
from rdflib.namespace import RDF
import pandas as pd

# Define the namespaces
namespaces = {
    "dc": "http://purl.org/dc/elements/1.1/",
    "dcterms": "http://purl.org/dc/terms/",
    "default1": "https://nfdi.fiz-karlsruhe.de/ontology/",
    "emmo": "http://emmo.info/emmo#",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "modsci": "https://w3id.org/skgo/modsci#",
    "mwo": "http://purls.helmholtz-metadaten.de/mwo/",
    "nfdicore": "http://nfdi.fiz-karlsruhe.de/ontology/",
    "ns1": "https://w3id.org/scholarlydata/ontology/conference-ontology.owl#",
    "ns2": "http://purl.obolibrary.org/obo/",
    "ns3": "http://www.ebi.ac.uk/swo/",
    "ns4": "http://www.geneontology.org/formats/oboInOwl#",
    "owl": "http://www.w3.org/2002/07/owl#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "schema": "https://schema.org/",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "void": "http://rdfs.org/ns/void#",
    "xsd": "http://www.w3.org/2001/XMLSchema#"
}

# Load the Turtle file into an RDF graph
g = Graph()
g.parse("Material\output.ttl", format="turtle")

# Extract predicates
predicates_with_namespace = set()
predicate_uri = set()

for s, p, o in g:
    # Check if predicate is not an RDF reserved predicate
    if not p.startswith(RDF):
        for prefix, namespace in namespaces.items():
            if p.startswith(namespace):
                predicates_with_namespace.add(f"{prefix}:{p.replace(namespace, '')}")
                predicate_uri.add(p)
                break

# Create a DataFrame
df_relation = pd.DataFrame({
    'Predicate with Namespace': list(predicates_with_namespace),
    'Predicate_uri': list(predicate_uri)
})

# Save DataFrame to Excel file
df_relation.to_excel("extracted_relation_list.xlsx", index=False)


file:///c:/Users/ZHAOTAI/Desktop/Data%20Processing/Material\output.ttl does not look like a valid URI, trying to serialize this will break.
file:///c:/Users/ZHAOTAI/Desktop/Data%20Processing/Material\output.ttl does not look like a valid URI, trying to serialize this will break.
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#time, Converter=<function parse_time at 0x000002A2D3AB5550>
Traceback (most recent call last):
  File "d:\Anaconda\envs\iseenv\lib\site-packages\rdflib\term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "d:\Anaconda\envs\iseenv\lib\site-packages\isodate\isotime.py", line 148, in parse_time
    raise ISO8601Error('Unrecognised ISO 8601 time format: %r' % timestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 time format: '2x90 min/week'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#time, Converter=<function parse