In [1]:

import rdflib
import pyshacl


### Importing the RDF Graph

In [5]:
# Read the .ttl file into an in-memory rdflib graph.
graph_path = r"C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_Tests/Hydrogen_Refined_Graph.ttl"
q = rdflib.Graph()
q.parse(graph_path)

# Select every (subject, predicate, object) triple stored in the graph.
query = """SELECT ?s ?p ?o WHERE {?s ?p ?o .}"""
results = q.query(query)

# Print each result row; `results` is reused by a later cell.
for triple_row in results:
    print(triple_row)

(rdflib.term.URIRef('http://fraunhofer.de/data/Channel_10'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://fraunhofer.de/MarketActivities/Supplier'))
(rdflib.term.URIRef('http://fraunhofer.de/data/Bloom'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://fraunhofer.de/MarketActivities/Manufacturer'))
(rdflib.term.URIRef('http://fraunhofer.de/data/Perovskites_crystals'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://fraunhofer.de/MarketActivities/inProduction'))
(rdflib.term.URIRef('http://fraunhofer.de/data/hydrogen_technology'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://fraunhofer.de/MarketActivities/Manufacturer'))
(rdflib.term.URIRef('http://fraunhofer.de/data/Energy_Department'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://

### get_delimiter returns the character on which a URL is split: '#' if the URL contains one, otherwise '/'. The part of the subject, predicate and object after that delimiter is then used as its short name.

In [6]:
def get_delimiter(url):
    """Choose the character on which `url` should be split.

    Returns '#' when `url` contains a '#' anywhere, and '/' otherwise
    (including when `url` contains neither character).
    """
    return '#' if '#' in url else '/'

# First, we create a list, temporary_list_of_triples, whose elements are lists holding the subject, predicate, and object of each triple. Then we create a dictionary, case_list, which maps each subject (the key) to a list of single-entry dictionaries, one per predicate-object pair related to that subject. The format is: {subject1: [{predicate1: object1}, {predicate2: object2}, ...], subject2: [{predicate1: object1}, {predicate2: object2}, ...], ...}.

In [7]:
import itertools
def _local_name(term):
    """Return the text of `term` after its last delimiter ('#' if present, else '/')."""
    text = str(term)
    return text.split(get_delimiter(text))[-1]


# One [subject, predicate, object] list of short names per query result row.
temporary_list_of_triples = [
    [_local_name(row[0]), _local_name(row[1]), _local_name(row[2])]
    for row in results
]

# Unique subject names, and the placeholder mapping {subject: None}.
subjects = {triple[0] for triple in temporary_list_of_triples}
empty_dict = dict.fromkeys(subjects)

# Group the {predicate: object} pairs by subject in a single pass over the
# triples. Each triple belongs to exactly one subject, so there is no need to
# compare every subject against every triple. Keys appear in order of first
# occurrence in the query results.
case_list = {}
for subject_name, predicate_name, object_name in temporary_list_of_triples:
    # Skip subjects whose URL had nothing after the last '/' or '#'.
    if not subject_name:
        continue
    # Heuristic found by inspecting the RDF graph: all organization names
    # start with a capital letter, so only those subjects are kept.
    if not subject_name[0].isupper():
        continue
    case_list.setdefault(subject_name, []).append({predicate_name: object_name})
print(case_list)

{'Advance_Article': [{'type': 'RegionalTruck'}, {'type': 'Supplier'}, {'type': 'inProduction'}, {'type': 'inTesting'}], 'DOI': [{'type': 'Supplier'}, {'type': 'Customer'}, {'type': 'Manufacturer'}, {'type': 'Researcher'}], 'Switch_to_Hydrogen_Buses': [{'type': 'Bus'}], 'CrystEngComm': [{'type': 'Supplier'}, {'type': 'Manufacturer'}, {'type': 'Researcher'}], 'ALICE': [{'type': 'Manufacturer'}, {'type': 'Supplier'}], 'High_Pressure_Hydrogen_Tanks': [{'type': 'Researcher'}, {'type': 'Supplier'}, {'type': 'Manufacturer'}], 'Fukushima_Prefecture': [{'type': 'Researcher'}], 'Lithium_Ion_Battery_and_Fuel_Cell': [{'type': 'inResearch'}, {'type': 'LithiumBattery'}, {'type': 'Product'}, {'type': 'Battery'}, {'type': 'inProduction'}, {'type': 'Reconversion'}, {'type': 'inTesting'}], 'CAF': [{'type': 'Researcher'}], 'Featured_Company_Featured_Company_DNI_Metals_Inc': [{'type': 'Researcher'}, {'type': 'RegisteredOrganization'}], 'ACWA_Power': [{'type': 'Researcher'}, {'type': 'Supplier'}, {'type': 

### The code below groups together the objects of each predicate for a given subject. For example, 'Daimler_Truck_Fuel_Cell': [{'type': 'Customer'}, {'type': 'Supplier'}, ...] becomes a row with Entity 'Daimler_Truck_Fuel_Cell' and 'type': 'Customer,Supplier,...'.

In [12]:
import pandas as pd
# Build one record per entity: the 'Entity' column holds the subject name and
# every predicate becomes a column whose value is the comma-joined objects.
rows = []
for entity, pairs in case_list.items():
    record = {'Entity': entity}
    for pair in pairs:
        for predicate, value in pair.items():
            existing = record.get(predicate, '')
            if existing:
                # Predicate already seen for this entity: append to its value.
                record[predicate] = existing + ',' + value
            else:
                # First (or so-far empty) value for this predicate.
                record[predicate] = value
    rows.append(record)

# Create the frame once from the list of records.
df = pd.DataFrame(rows)


### Finally, the results are exported to Initial_Results.csv

In [15]:
# Write the aggregated table to disk with pandas' default CSV settings.
initial_results_csv = r"C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_Tests/Initial_Results.csv"
df.to_csv(initial_results_csv)