In [35]:

import rdflib
import pyshacl


### Import the given RDF file so that it can be processed into a more readable format. The SPARQL query collects every distinct term (subject, predicate or object) that occurs in the triples of the graph.


In [36]:
def import_rdf_graph(path=r"C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_Tests/statements.ttl"):
    """Parse a Turtle file and return every distinct term that occurs in it.

    Parameters
    ----------
    path : str, optional
        Location of the Turtle file to load. Defaults to the statements.ttl
        file this notebook was written for, so a call without arguments
        behaves as before.

    Returns
    -------
    The rdflib query result. Each row carries a single value bound to
    ``?url``: a subject, predicate or object of some triple in the graph.
    ``DISTINCT`` removes repeated terms.
    """
    g = rdflib.Graph()
    g.parse(path, format="turtle")
    # Three UNION branches bind ?url to the subject, the predicate and the
    # object of every triple in turn.
    query = """SELECT DISTINCT ?url WHERE
    {
    {?s ?p ?o.
        BIND (?s AS ?url) }
         UNION
        { ?s ?p ?o. BIND (?p as ?url) }
        UNION
        { ?s ?p ?o. BIND (?o as ?url) }
    }   
    """
    return g.query(query)


### Define a function `get_delimiter` that returns the delimiter, '/' or '#', which separates the namespace from the last part of a URL. For example, passing the URL https://fuelcellsworks.com/subscribers/first-injection-of-green-hydrogen-into-the-existing-northern-german-gas-pipeline-network/New_York to this function returns '/', because '/' is the last delimiter before the final part of the URL, 'New_York'.

In [37]:
def get_delimiter(url):
    """Return the delimiter that separates the namespace from the local name.

    The delimiter is whichever of '#' and '/' occurs last in ``url``.

    Parameters
    ----------
    url : str
        The URL (or any string) to inspect.

    Returns
    -------
    str
        '#' if the last '#' comes after the last '/', otherwise '/'.
        When neither character is present, '/' is returned.
    """
    # str.rfind returns -1 for a missing character, so a URL without '#'
    # (or without either character) falls through to '/'.
    return '#' if url.rfind('#') > url.rfind('/') else '/'

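### Process the query results further using the function `further_processing_for_namespaces`: each URI is reduced to its namespace (everything up to and including the last delimiter), without the extra commas and opening brackets of the raw result rows, and the distinct namespaces are collected.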

In [38]:
def further_processing_for_namespaces(results):
    """Collect the distinct namespaces of all URI terms in ``results``.

    The namespace of a URI is everything up to and including its last
    delimiter, as chosen by ``get_delimiter``.

    Parameters
    ----------
    results : iterable
        Rows of a query result, each a tuple whose first element is the bound
        term. Bare terms (not wrapped in a tuple) are accepted as well.

    Returns
    -------
    set of str
        The distinct namespace strings. Terms that are not
        ``rdflib.term.URIRef`` instances, and URIs that do not contain the
        delimiter at all, contribute nothing.
    """
    namespaces_found = set()
    for result in results:
        # Work on the term itself rather than on the textual repr of the row,
        # so brackets or quotes inside a URI cannot corrupt the result.
        term = result[0] if isinstance(result, tuple) else result
        if not isinstance(term, rdflib.term.URIRef):
            continue

        uri_str = str(term)
        delimiter = get_delimiter(uri_str)
        namespace, found, _local_name = uri_str.rpartition(delimiter)
        if not found:
            # No delimiter in this URI, so there is no namespace to split off.
            continue
        namespaces_found.add(namespace + delimiter)

    return namespaces_found
        


### I will now generate prefixes of length 3 for each of the namespaces using the random module. The prefixes, along with the namespaces, will be exported to the prefixes.txt text file.

In [39]:
def namespaces(processed_results, output_path='C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_tests/prefixes.txt'):
    """Assign a unique random three-letter prefix to every namespace and save them.

    One line of the form ``@prefix abc: <namespace> .`` is written per
    namespace.

    Parameters
    ----------
    processed_results : iterable of str
        The namespaces to create prefixes for.
    output_path : str, optional
        File to write the prefix declarations to. Defaults to the
        prefixes.txt location this notebook was written for.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If there are more namespaces than distinct three-letter prefixes
        (26 ** 3), raised by ``random.sample``.
    """
    import itertools
    import random
    import string

    # Materialise once so the prefixes and the write loop see the same
    # namespaces in the same order, even for a one-shot iterable.
    namespace_list = list(processed_results)

    # Sampling without replacement guarantees that no two namespaces end up
    # with the same prefix, which independent random draws cannot.
    candidates = [''.join(letters) for letters in itertools.product(string.ascii_lowercase, repeat=3)]
    prefix_list = random.sample(candidates, len(namespace_list))

    # The context manager closes the file even if a write fails.
    with open(output_path, 'w') as df:
        for prefix, result in zip(prefix_list, namespace_list):
            df.write("@prefix {}: <{}> .".format(prefix, result))
            df.write('\n')




### Now the triples are exported to a fresh file, output.txt. We then re-read the contents of output.txt and remove the closing '>' of every subject, predicate and object URL. This is necessary for putting all subjects, predicates and objects in 'prefix:abc' format: without it, replacing the namespaces with their prefixes would leave us with something like 'prefix:abc>'. We then write the modified triples to output1.txt.

In [40]:
def put_whitespaces():
    """Export the graph's triples to output.txt and a '>'-free copy to output1.txt.

    Every triple of statements.ttl is written to output.txt as
    ``<subject> <predicate> <object>.`` followed by an empty line. The file is
    then read back, every '>' character is removed, and the result is written
    to output1.txt. Both files are placed in the First_tests directory.

    Returns
    -------
    None
    """
    from rdflib import Graph

    source_ttl = r"C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_Tests/statements.ttl"
    # Written and read back through the same path, so the cleaned copy is
    # always derived from the export that was just produced.
    triples_txt = "C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_tests/output.txt"
    cleaned_txt = "C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_tests/output1.txt"

    # Load the Turtle file into an RDF graph
    g = Graph()
    g.parse(source_ttl, format="turtle")

    # One triple per line, each followed by an empty line
    with open(triples_txt, "w") as f:
        for subject, predicate, obj in g:
            f.write(f"<{subject}> <{predicate}> <{obj}>.\n")
            f.write('\n')

    with open(triples_txt, "r") as f:
        contents = f.read()

    # Drop every '>' so that no stray '>' remains once namespaces are
    # replaced by prefixes
    new_contents = contents.replace(">", "")

    with open(cleaned_txt, "w") as f:
        f.write(new_contents)



### It is important to note that this step involves a little manual work. The text from prefixes.txt was copied into a CSV file, Prefix_Processing.csv, and split into columns at the characters '@' and '>' so that the prefixes can be matched with the namespaces in output1.txt. The code below expects this file to be semicolon-separated, with the columns PREFIX and SITE.

In [41]:
def read_file():
    """Load the prefix table and return it ordered by namespace length, longest first.

    Reads the semicolon-separated Prefix_Processing.csv, orders the rows by the
    length of the SITE column in descending order, renumbers the index from 0
    and makes sure the prefix column is called 'PREFIX'.

    Returns
    -------
    pandas.DataFrame
        The re-ordered table.
    """
    import pandas as pd

    prefix_table = pd.read_csv(
        r"C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_tests/Prefix_Processing.csv",
        encoding='unicode_escape',
        delimiter=';',
    )

    # Longest namespaces must come first: if two namespaces start with the
    # same text, the shorter one would otherwise be matched first and the
    # longer one would receive the wrong prefix.
    longest_first = prefix_table.SITE.str.len().sort_values(ascending=False).index

    ordered_table = prefix_table.reindex(longest_first).reset_index(drop=True)

    # The first header may carry the characters 'ï»¿' in front of 'PREFIX'
    # (presumably a byte-order mark read through the 'unicode_escape' codec).
    return ordered_table.rename(columns={'ï»¿PREFIX': 'PREFIX'})



### Finally, we replace all the namespaces in output1.txt with their correct prefixes.

In [42]:
def replace_namespaces_with_prefixes(new_test_df):
    """Substitute every namespace in output1.txt by its prefix and save as output2.txt.

    Parameters
    ----------
    new_test_df : pandas.DataFrame
        Table with a 'SITE' column (text to look for) and a 'PREFIX' column
        (text to put in its place). Rows are applied in table order.

    Returns
    -------
    None
    """
    source_path = "C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_tests/output1.txt"
    target_path = 'C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_tests/output2.txt'

    with open(source_path, 'r') as reader:
        text = reader.read()

    # Each replacement works on the result of the previous one, so the row
    # order of the table matters.
    for _, entry in new_test_df.iterrows():
        text = text.replace(entry['SITE'], entry['PREFIX'])

    with open(target_path, 'w') as writer:
        writer.write(text)

### At this point it turns out that several triples contain special characters, e.g. abc:New-York, which cannot be parsed correctly by rdflib or pyshacl.
### We therefore replace these special characters (hyphens, periods other than the one that ends a statement, '%', '=' and '?') with '_'.

In [43]:
def _underscore_inner_periods(line):
    """Replace every period in ``line`` by '_' except one that ends the statement.

    A line whose stripped text ends with '.' is returned stripped, with that
    final period kept, followed by a newline. Any other line (blank or not
    terminated by a period) keeps its surrounding whitespace, has all of its
    periods replaced, and is returned with exactly one trailing newline.
    """
    statement = line.strip()
    if statement.endswith('.'):
        return statement[:-1].replace('.', '_') + '.\n'
    return line.rstrip('\n').replace('.', '_') + '\n'


def replace_special_chars(base_dir='C:/Users/mahar/OneDrive/Desktop/Fraunhofer/Hydrogen/First_tests', verbose=True):
    """Replace special characters in the prefixed triples with underscores.

    Reads ``output2.txt`` from ``base_dir`` and writes three files there:

    * ``output3.txt``      - every '-' replaced by '_'
    * ``output4.txt``      - additionally every '.' replaced by '_', except
      the period that terminates a statement
    * ``final_output.txt`` - additionally every '%', '=' and '?' replaced
      by '_'

    Parameters
    ----------
    base_dir : str, optional
        Directory holding ``output2.txt`` and receiving the output files.
        Defaults to the directory this notebook was written for.
    verbose : bool, optional
        When true (the default), print every line of the ``output4.txt``
        stage, stripped of surrounding whitespace.

    Returns
    -------
    None
    """
    with open(f'{base_dir}/output2.txt', 'r') as file:
        lines = file.readlines()

    # Stage 1: hyphens
    lines = [line.replace('-', '_') for line in lines]
    with open(f'{base_dir}/output3.txt', 'w') as file:
        file.writelines(lines)

    # Stage 2: periods inside names. Each line is handled on its own, so the
    # result does not depend on what the last line of the file looks like.
    lines = [_underscore_inner_periods(line) for line in lines]
    if verbose:
        for line in lines:
            print(line.strip())
    with open(f'{base_dir}/output4.txt', 'w') as file:
        file.writelines(lines)

    # Stage 3: the remaining special characters
    to_underscore = str.maketrans({'%': '_', '=': '_', '?': '_'})
    lines = [line.translate(to_underscore) for line in lines]
    with open(f'{base_dir}/final_output.txt', 'w') as file:
        file.writelines(lines)
    




In [44]:
# Run the whole pipeline, one stage per statement.

# 1. Distinct terms of the graph -> set of namespaces
query_rows = import_rdf_graph()
first_variable = further_processing_for_namespaces(query_rows)

# 2. Write a prefix declaration for every namespace to prefixes.txt.
#    namespaces() has no return statement, so second_variable is None.
second_variable = namespaces(first_variable)

# 3. Export the triples (output.txt) and a copy without '>' (output1.txt)
put_whitespaces()

# 4. Replace namespaces by prefixes (output2.txt). read_file() loads
#    Prefix_Processing.csv, which the notebook text describes as prepared by
#    hand from prefixes.txt. Step 2 draws new random prefixes on every run, so
#    make sure the CSV matches the prefixes.txt you intend to use.
#    replace_namespaces_with_prefixes() has no return statement either, so
#    third_variable is None.
prefix_table = read_file()
third_variable = replace_namespaces_with_prefixes(prefix_table)

# 5. Replace special characters and write final_output.txt
replace_special_chars()




tfw:highly_compressed_hydrogen sfi:type ttl:inProduction.

tfw:the_Suiso_Frontier sfi:type ttl:inProduction.

tfw:Storengy sfi:type ttl:Manufacturer.

tfw:Embracing_Hydrogen_and_Fuel_Cell_Technologies_Energy_Department sfi:type ttl:inProduction.

tfw:production sfi:type ttl:inResearch.

tfw:solar sfi:type ttl:inResearch.

tfw:LONDON sfi:type ttl:Customer.

tfw:a_fuel_cell_electric sfi:type ttl:inResearch.

tfw:Volvo_Group sfi:type ttl:Supplier.

tfw:Climate_Action_Champions sfi:type ttl:inProduction.

tfw:NJC sfi:type ttl:Supplier.

vcb:Initiative_seeks_to_reduce_green_hydrogen_cost_htm sfi:type bbq:BibliographicResource.

tfw:Toyota_Motor_Corporation sfi:type ttl:Supplier.

tfw:Hydrogen_NEXO_Vehicle_POSCO sfi:type czz:Road.

tfw:University sfi:type ttl:Supplier.

tfw:Hydrogen sfi:type ttl:inTesting.

tfw:UK sfi:type ttl:Customer.

tfw:technology sfi:type czz:Product.

tfw:Amazon sfi:type ttl:Manufacturer.

tfw:hydrogen_fuel_and sfi:type ttl:inResearch.

tfw:hydrogen_fuel_cell sfi:type

### You can now simply copy the contents of final_output.txt to a Turtle file in VS Code. I have called it Hydrogen_Refined_Graph.ttl