In [1]:
import datetime
import itertools
import numpy as np
import pandas as pd

import rdflib
import owlrl
import pyshacl

# Create KG resources from CSV dumps

## Load CSV file

In [2]:
studies_df = pd.read_csv("./data/dataset.csv", sep=';')
studies_df.head()

Unnamed: 0,title,abstract,topic,tags,Group,Authors,Release Date,Identifier,Permanent Identifier (DOI),Permanent Identifier (URI),...,License,Public Access Level,Contact Name,Contact Email,Plasma Source Procedure,Plasma Medium Procedure,Plasma Diagnostics Procedure,Plasma Target Name,Plasma Target Properties,Plasma Target Procedure
0,Verified modeling of a low pressure hydrogen p...,A self-consistent ﬂuid model has been successf...,Materials / Surfaces; Plasma Chemical Processes,ECR plasma; Model validation; fluid modelling;...,INP,"Sigeneger, Florian; Ellis, James; Harhausen, J...",2022-10-10,dc66dc03-a157-4179-8d8c-19999bcfe8ff,10.34711/inptdat.599,https://www.inptdat.de/node/599,...,Creative Commons Attribution 4.0 International...,Public,Florian Sigeneger,sigeneger@inp-greifswald.de,,,,,,
1,Effect of a spatially fluctuating heating of p...,The work is concerned with the effect of a spa...,Materials / Surfaces,plasma processing,INP,"Zhu, Tao; Baeva, Margarita; Testrich, Holger; ...",2022-05-13,7a581f22-499c-4050-b2bb-09dab8f881bd,10.34711/inptdat.571,https://www.inptdat.de/node/571,...,Creative Commons Attribution 4.0 International...,Public,Margarita Baeva,baeva@inp-greifswald.de,Al2O3 powder is injected into the plasma jet. ...,A plasma jet is generated in a mixture of Ar (...,,,,
2,Upscaling from single- to multi-filament diele...,A study on the scalability of discharge charac...,Plasma Chemical Processes,streamer discharge; basic research,INP,"Höft, Hans; Becker, Markus M.; Kettlitz, Manfr...",2022-08-26,51e4aceb-eb8c-44d4-ac6a-295fdbd79d84,10.34711/inptdat.572,https://www.inptdat.de/node/572,...,Creative Commons Attribution 4.0 International...,Public,"Höft, Hans",hans.hoeft@inp-greifswald.de,Unipolar square wave pulses with 10kV amplitud...,The total gas flow through the cell was set to...,Optical diagnostics were performed with a fast...,,,
3,The effect of oxygen admixture on the properti...,This work presents the datasets of the results...,Plasma Chemical Processes,argon; atmospheric pressure; fluid modelling; ...,INP; Leibniz-IPHT,"Baeva, Margarita; Stankov, Marjan; Trautvetter...",2021-03-01,adeb6746-9957-4b9c-afcf-f9b2e14b21bc,10.34711/inptdat.378,https://www.inptdat.de/node/378,...,Creative Commons Attribution 4.0 International...,Public,"Baeva, Margarita",baeva@inp-greifswald.de,The microwave plasma is generated in a quartz ...,The increase of the O2-admixture has to be fol...,,,,
4,Modelling and experimental evidence of the cat...,The lifetime of tungsten cathodes used in plas...,Materials / Surfaces; Welding / Switching,plasma spray torch; erosion; tungsten cathode;...,INP; Universidade da Madeira,"Baeva, Margarita; Benilov, Mikhail S.; Zhu, Ta...",2022-07-07,f7116dbe-455b-451d-b7d7-8a8a48d82993,10.34711/inptdat.512,https://www.inptdat.de/node/512,...,Creative Commons Attribution 4.0 International...,Public,Margarita Baeva,baeva@inp-greifswald.de,,,The erosion of the cathode made of lanthanated...,tungsten,Tungsten can withstand temperatures up to abou...,"Due to the erosion, the cathode has to be repl..."


In [3]:
studies_df.describe()

Unnamed: 0,title,abstract,topic,tags,Group,Authors,Release Date,Identifier,Permanent Identifier (DOI),Permanent Identifier (URI),...,License,Public Access Level,Contact Name,Contact Email,Plasma Source Procedure,Plasma Medium Procedure,Plasma Diagnostics Procedure,Plasma Target Name,Plasma Target Properties,Plasma Target Procedure
count,29,29,29,27,28,29,29,29,29,29,...,29,29,29,29,14,7,18,6,6,6
unique,29,29,6,26,12,26,26,29,29,29,...,1,1,16,14,12,6,18,5,6,5
top,Verified modeling of a low pressure hydrogen p...,A self-consistent ﬂuid model has been successf...,Plasma Chemical Processes,GMAW; high-current arc,INP,"Loffhagen, Detlef; Pinhao, Nuno R.; Vass, Mate...",2020-01-30,dc66dc03-a157-4179-8d8c-19999bcfe8ff,10.34711/inptdat.599,https://www.inptdat.de/node/599,...,Creative Commons Attribution 4.0 International...,Public,"Baeva, Margarita",baeva@inp-greifswald.de,The drift cell is situated within a vacuum cha...,Standard conditions of the argon gas are assured.,Optical diagnostics were performed with a fast...,mild steel,Tungsten can withstand temperatures up to abou...,Bead-on-plate welding
freq,1,1,9,2,15,3,3,1,1,1,...,29,29,4,7,3,2,1,2,1,2


### Preprocess CSV

In [53]:
studies_df["formatted_authors"] = [[tuple([p.strip() for p in n.split(',')]) for n in row.split(';')] 
                           if row==row else [] for row in studies_df["Authors"]]
studies_df["formatted_topic"] = [[p.strip() for p in row.split(';')] if row==row else [] for row in studies_df["topic"]]
studies_df["formatted_tags"] = [[p.strip() for p in row.split(';')] if row==row else [] for row in studies_df["tags"]]
studies_df["formatted_Source_Name"] = [[p.strip() for p in row.split(';')] if row==row else [] for row in studies_df["Plasma Source Name"]]
studies_df["formatted_Target_Name"] = [[p.strip() for p in row.split(';')] if row==row else [] for row in studies_df["Plasma Target Name"]]
studies_df["formatted_Medium_Name"] = [[p.strip() for p in row.split(';')] if row==row else [] for row in studies_df["Plasma Medium Name"]]
studies_df["formatted_Diagnostics_Name"] = [[p.strip() for p in row.split(';')] if row==row else [] for row in studies_df["Plasma Diagnostics Name"]]

## Create KG
KG
Controller

In [5]:
rdf_namespace = rdflib.Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfs_namespace = rdflib.Namespace("http://www.w3.org/2000/01/rdf-schema#")
owl_namespace = rdflib.Namespace("http://www.w3.org/2002/07/owl#")
xsd_namespace = rdflib.Namespace("http://www.w3.org/2001/XMLSchema#")
foaf_namespace = rdflib.Namespace("http://xmlns.com/foaf/0.1/")
qpt_namespace = rdflib.Namespace("http://qptdat.plasma-mds.org/ontology/")
qpt_source_namespace = rdflib.Namespace("http://qptdat.plasma-mds.org/source_ontology/")
qptkg_namespace = rdflib.Namespace("http://qptdat.plasma-mds.org/resource/")

In [6]:
remote_graph = rdflib.Graph()
remote_graph.bind("qpt", qpt_namespace)
remote_graph.bind("qpt_source", qpt_source_namespace)
remote_graph.bind("rdf", rdf_namespace)
remote_graph.bind("rdfs", rdfs_namespace)
remote_graph.bind("owl", owl_namespace)
remote_graph.bind("xsd", xsd_namespace)
remote_graph.bind("foaf", foaf_namespace)
remote_graph.bind("qptkg", qptkg_namespace)

In [7]:
class KG_Controller():    
    def __init__(self, remote_graph, namespace, prefix="e"):
        self.remote_graph = remote_graph
        
        self.namespace=namespace
        self.prefix=prefix
        
        max_index_query = """
            SELECT (MAX(?x) as ?max_index)
            WHERE {
                SERVICE <http://qptdat.plasma-mds.org/sparql> {
                    ?s ?p ?o .
                    FILTER(REGEX(STR(?s),"http://qptdat.plasma-mds.org/resource/e[0-9]+$"))
                    BIND (REPLACE(STR(?s), "http://qptdat.plasma-mds.org/resource/e([0-9]+)$", "$1") as ?x)
                }
            }
            """
        qres = self.remote_graph.query(max_index_query)
        self.index=int([row.max_index for row in qres][0])+1
    
    def _exists(self,uri):
        exists_query = """
            ASK
            WHERE {
                SERVICE <http://qptdat.plasma-mds.org/sparql> {
                    <"""+uri+"""> ?p ?o . 
                }
            }
            """
        qres = self.remote_graph.query(exists_query)
        return bool([row for row in qres][0])
    def _getUri(self,):
        return self.namespace + self.prefix + f"{self.index}"
    def getNewEntity(self,):
        while self._exists(self._getUri()):
            self.index += 1
        entity = rdflib.URIRef(self._getUri())
        self.index +=1 
        return entity

    def getNewEntities(self, length):
        return [self.getNewEntity() for _ in range(length)]

controller = KG_Controller(remote_graph, rdflib.Namespace("http://qptdat.plasma-mds.org/resource/"), "e")
g = rdflib.Graph()

### Create plasma studies 

In [8]:
studies_dict = {i:controller.getNewEntity() for i in studies_df.index}

In [9]:
for k,v in studies_dict.items():
    g.add((v,rdf_namespace.type,qpt_namespace.PlasmaStudy))
    
    g.add((v,qpt_namespace.studyTitle,rdflib.Literal(studies_df["title"].loc[k], lang="en")))
    g.add((v,qpt_namespace.studyAbstract,rdflib.Literal(studies_df["abstract"].loc[k], lang="en")))   

### Create dataset

In [10]:
dataset_dict = {i:controller.getNewEntity() for i in studies_df.index}
for k,v in dataset_dict.items():
    g.add((v, rdf_namespace.type, qpt_namespace.Dataset))
    g.add((studies_dict[k], qpt_namespace.hasOutputVar, dataset_dict[k]))
    
    dataset_title = studies_df["title"].loc[k]
    g.add((v, rdfs_namespace.label, rdflib.Literal(dataset_title, lang="en")))
    if studies_df["Release Date"].loc[k]:
        publication_date = datetime.date.fromisoformat(studies_df["Release Date"].loc[k])
        g.add((v, qpt_namespace.publicationDate, rdflib.Literal(publication_date, datatype=xsd_namespace.date) ))
    if studies_df['Permanent Identifier (DOI)'].loc[k]:
        doi = f"http://doi.org/{studies_df['Permanent Identifier (DOI)'].loc[k]}"
        g.add((v, qpt_namespace.identifier, rdflib.URIRef(doi)))
    if studies_df["Is supplementing"].loc[k]:
        supplementing = f"http://doi.org/{studies_df['Is supplementing'].loc[k]}"
        g.add((v, qpt_namespace.isSupplementing, rdflib.URIRef(supplementing))) 
        g.add((studies_dict[k], qpt_namespace.isSupplementing, rdflib.URIRef(supplementing)))

### topics

In [11]:
search_dict = {f"\"^{x}$\"":x for x in set(itertools.chain(*studies_df["formatted_topic"])) }

In [12]:
topic_query = """
SELECT ?search ?topic
WHERE {
    SERVICE <http://qptdat.plasma-mds.org/sparql> {
        ?topic a qpt:ResearchTopic .
        ?topic rdfs:label ?label .
        FILTER (LANG(?label) = 'en') 
        FILTER( REGEX(?label, ?search, "i") )

        VALUES ?search { """ + " ".join(search_dict.keys()) + """ }
    }
}
"""
qres = remote_graph.query(topic_query)
result_dict = {search_dict[f"\"{row.search}\""]:row.topic for row in qres} 
unmapped = [v for k,v in search_dict.items() if v not in result_dict]
print(f"unmapped strings: {unmapped}")

unmapped strings: ['Materials / Surfaces', 'Plasma Chemical Processes', 'Plasma Medicine', 'Welding / Switching']


In [13]:
for i,row in studies_df.iterrows():
    for topic in row["formatted_topic"]: 
        if topic in result_dict:
            g.add((studies_dict[i] ,qpt_namespace.hasResearchTopic, result_dict[topic] ))

### tags

In [14]:
search_dict = {f"\"^{x}$\"":x for x in set(itertools.chain(*studies_df["formatted_tags"])) }

In [15]:
tag_query = """
SELECT ?search ?topic
WHERE {
    SERVICE <http://qptdat.plasma-mds.org/sparql> {
        ?topic a qpt:PlasmaStudyTag .
        ?topic rdfs:label ?label .
        FILTER (LANG(?label) = 'en') 
        FILTER( REGEX(?label, ?search, "i") )

        VALUES ?search { """ + " ".join(search_dict.keys()) + """ }
    }
}
"""
qres = remote_graph.query(tag_query)
result_dict = {search_dict[f"\"{row.search}\""]:row.topic for row in qres} 
unmapped = [v for k,v in search_dict.items() if v not in result_dict]
print(f"unmapped strings: {unmapped}")

unmapped strings: ['plasma polymerization', 'C2H4', 'ultra low-k etching', 'plasma processing', 'argon', 'ionization layer', 'radicals', 'plasma jet', 'MHD model', 'basic research', 'electric arcs', 'plasma chemistry', 'tungsten cathode', 'experiment', 'EEDF', 'thermal plasma', 'convection', 'isotopic labelling', 'field reversal', 'high-current arc', 'reactive oxygen species', 'absorption spectroscopy', 'plasma spraying', 'fluid modelling', 'TMS', 'ECR plasma', 'microwave plasma', 'erosion', 'self-controlling mechanism', 'non-equilibrium plasma', 'infrared laser', 'evaporation', 'microarc', 'space charge', 'ellipsometry', 'HMDSO', 'Ammonia production mechanisms', 'Model validation', 'plasma microprinting', 'TALIF', 'atmospheric pressure', 'XPS', 'oxygen admixture', 'electron Boltzmann equation', 'EPR spectroscopy', 'plasma parameters', 'Monte Carlo Flux', 'electron swarm parameters', 'C2H6', 'benchmark data', 'unified description', 'streamer discharge', 'self-organisation', 'DC torch',

In [16]:
for i,row in studies_df.iterrows():
    for tag in row["formatted_tags"]: 
        if tag in result_dict:
            g.add((studies_dict[i] ,qpt_namespace.hasPlasmaStudyTag, result_dict[tag] ))

### Authors


In [17]:
search_dict = {f"(\"{x[0]}\"@en \"{x[1]}\"@en)":x for x in set(itertools.chain(*studies_df["formatted_authors"])) }

In [18]:
person_query = """
SELECT ?search_surname ?search_givenname ?person
WHERE {
    SERVICE <http://qptdat.plasma-mds.org/sparql> {
        ?person a qpt:Person .
        ?person foaf:surname ?search_surname .
        ?person foaf:givenname ?search_givenname .

        VALUES (?search_surname ?search_givenname)  { """ + " ".join(search_dict.keys()) + """ }
    }
}
"""
qres = remote_graph.query(person_query)
result_dict = {search_dict[f"(\"{row.search_surname}\"@en \"{row.search_givenname}\"@en)"]:
               row.person for row in qres} 
unmapped = [v for k,v in search_dict.items() if v not in result_dict]
print(f"unmapped strings: {unmapped}")

unmapped strings: [('Wubs', 'Jente'), ('Schäfer', 'Jan'), ('Schröter', 'Sandra'), ('Melzer', 'Marcel'), ('Bosnjakovic', 'Danko'), ('Loffhagen', 'Detlef'), ('Araoud', 'Zouhour'), ('Baeva', 'Margarita'), ('Kewitz', 'Thorben'), ('Gonzalez', 'Diego'), ('Ecke', 'Ramona'), ('Manfred', 'Katherine M.'), ('Zimmermann', 'Sven'), ('Longo', 'Savino'), ('Gorbanev', 'Yury'), ('Foest', 'Rüdiger'), ('Ellis', 'James'), ('Testrich', 'Holger'), ('Stankov', 'Marjan'), ('Lang', 'Norbert'), ('Trautvetter', 'Tom'), ('Höft', 'Hans'), ('Kählert', 'Hanno'), ("O'Connell", 'Deborah'), ('Klages', 'Claus-Peter'), ('Veklich', 'Anatoly'), ('Zhang', 'Guokai'), ('Niemi', 'Kari'), ('Harhausen', 'Jens'), ('Lozano', 'Philipp'), ('Uhrlandt', 'Dirk'), ('Hempel', 'Frank'), ('Murmantsev', 'Oleksandr'), ('Riedel', 'Frederik'), ('Brandenburg', 'Ronny'), ('Norman', 'Helen C.'), ('Charrada', 'Kamel'), ('Semenov', 'Igor'), ('Mohsni', 'Chayma'), ('Dujko', 'Sasa'), ('Haase', 'Micha'), ('Vass', 'Mate'), ('Vialetto', 'Luca'), ('Chechi

Add new persons to KG

In [19]:
person_dict = {p:controller.getNewEntity() for p in unmapped}
for k,v in person_dict.items():
    g.add((v, rdf_namespace.type, foaf_namespace.Person))
    g.add((v, foaf_namespace.givenname , rdflib.Literal(k[1], lang="en")))
    g.add((v, foaf_namespace.surename , rdflib.Literal(k[0], lang="en")))
    g.add((v, foaf_namespace.name , rdflib.Literal(f"{k[1]} {k[0]}", lang="en")))
    g.add((v, rdf_namespace.label , rdflib.Literal(f"{k[1]} {k[0]}", lang="en")))
    result_dict[k]=v

In [20]:
for i,row in studies_df.iterrows():
    for person in row["formatted_authors"]: 
        if person in result_dict:
            g.add((studies_dict[i], qpt_namespace.contributor, result_dict[person]))
            g.add((dataset_dict[i], qpt_namespace.author, result_dict[person]))

### License, Access Level

In [21]:
license_dict = {"Creative Commons Attribution 4.0 International (CC BY 4.0)":rdflib.URIRef("")}
access_level_dict = {"Public":qpt_namespace.publicAccessLevel}

In [22]:
for i,row in studies_df.iterrows():
    if row["License"] in license_dict:
        g.add((dataset_dict[i],qpt_namespace.license, license_dict[row["License"]]))
    if row["Public Access Level"] in access_level_dict:
        g.add((dataset_dict[i],qpt_namespace.hasAccessLevel, access_level_dict[row["Public Access Level"]]))

### Source
In general we assume each study was performed on *one* specific source. This assumption might not be valid for all plasma studies, but the CSV data does not contain such information. 

We assume the studies ["278af539-7a4e-4fe5-8064-37c53a0ee7f5","8c6093fc-0672-4f1e-8dad-eb414320bead", 
 "95b7984b-bd8d-48f0-b5cf-c469d9fd17ff"] used the same souce base on the avalible data. 

In [23]:
studies_df[["Plasma Source Name","Plasma Source Properties", "Plasma Source Procedure"]].dropna(how="all")

Unnamed: 0,Plasma Source Name,Plasma Source Properties,Plasma Source Procedure
0,AURA-WAVE (Sairem),Electron cyclotron resonance (ECR) coaxial pla...,
1,Oerlikon Metco F4MB-XL,The plasma spray torch considered in the prese...,Al2O3 powder is injected into the plasma jet. ...
2,DBD; single-filament DBD; multi-filament DBD,Two different arrangements were used. The sing...,Unipolar square wave pulses with 10kV amplitud...
3,atmospheric pressure MW discharge; plasma torch,"For an incoming power of 1 kW, the predicted a...",The microwave plasma is generated in a quartz ...
4,Oerlikon Metco F4MB-XL,"The torch is typically operated in pure argon,...",
5,scanning drift tube apparatus,Time-of-flight (TOF) experiment; ultraviolet l...,The drift cell is situated within a vacuum cha...
6,scanning drift tube apparatus,Time-of-flight (TOF) experiment; ultraviolet l...,The drift cell is situated within a vacuum cha...
7,scanning drift tube apparatus,Time-of-flight (TOF) experiment; ultraviolet l...,The drift cell is situated within a vacuum cha...
8,DBD,Atmospheric-pressure DBD between plane steel m...,
9,single-filament DBD,The object of interest is streamer-surface int...,


In [24]:
source_groups = [["278af539-7a4e-4fe5-8064-37c53a0ee7f5","8c6093fc-0672-4f1e-8dad-eb414320bead", 
 "95b7984b-bd8d-48f0-b5cf-c469d9fd17ff"]]

source_dict=dict()
for group in source_groups:
    group_entity = controller.getNewEntity() 
    for x in group:
        i = studies_df.index[studies_df["Identifier"] == x][0]
        source_dict[i]=group_entity
for i in studies_df.dropna(subset=["Plasma Source Name"]).index:
    if not i in source_dict:
        source_dict[i]=controller.getNewEntity()     

In [25]:
for k,v in source_dict.items():
    g.add((v, rdf_namespace.type, qpt_namespace.Source))
    g.add((studies_dict[k], qpt_namespace.hasInputVar, v))

#### Find suitable subclasses of source based on the name

In [26]:
search_dict = {f"\"{x}\"":x for x in set(itertools.chain(*studies_df["formatted_Source_Name"])) }

In [27]:
source_query = """
SELECT ?search_label ?source_class
WHERE {
    SERVICE <http://qptdat.plasma-mds.org/sparql> {
        ?source_class rdfs:subClassOf* qpt:Source .
        ?source_class rdfs:label ?source_label .
        FILTER(REGEX(?source_label,?search_label,"i"))

        VALUES ?search_label { """ + " ".join(search_dict.keys()) + """ }
    }
}
"""
qres = remote_graph.query(source_query)
result_dict = {search_dict[f"\"{row.search_label}\""]:
               row.source_class for row in qres} 
unmapped = [v for k,v in search_dict.items() if v not in result_dict]
print(f"unmapped strings: {unmapped}")

unmapped strings: ['kHz plasma jet', 'DC arc', 'HF plasma jet', 'CCP', 'AURA-WAVE (Sairem)', 'Applied Materials Centura 5200', 'kINPen-sci', 'mircoarc-plasma source', 'ntAPPJ', 'multi-filament DBD', 'Oerlikon Metco F4MB-XL', 'atmospheric pressure MW discharge', 'single-filament DBD', 'KINPen-sci']


In [28]:
for i,row in studies_df.iterrows():
    for source_label in row["formatted_Source_Name"]:
        if source_label in result_dict:
            g.add((source_dict[i],rdf_namespace.type,result_dict[source_label]))

The remaining unstructured data of each source has to be extended manually. Below an example is shown. 

In [29]:
print(studies_df["Plasma Source Properties"].iloc[0])
print("-----")
print(studies_df["Plasma Source Procedure"].iloc[0])

Electron cyclotron resonance (ECR) coaxial plasma source AURA-WAVE (Sairem): microwave frequency excitation 2.45 GHz, applied power range 50 – 150 W, volume of the plasma reactor about 70 l. 
-----
nan


In [30]:
source_0 = source_dict[0]

In [31]:
frequency = controller.getNewEntity()
g.add((frequency, rdf_namespace.type, qpt_namespace.Frequency))
g.add((source_0, qpt_namespace.hasConfiguration, frequency))
g.add((frequency, qpt_namespace.value, rdflib.Literal(2.45, datatype=xsd_namespace.decimal)))
g.add((frequency, qpt_namespace.measuredIn, qpt_namespace.gigaherz))

power = controller.getNewEntity()
g.add((power, rdf_namespace.type, qpt_namespace.Power))
g.add((power, rdf_namespace.type, qpt_namespace.Range))
g.add((source_0, qpt_namespace.hasConfiguration, power))
g.add((power, qpt_namespace.lowerLimit, rdflib.Literal(50, datatype=xsd_namespace.decimal)))
g.add((power, qpt_namespace.upperLimit, rdflib.Literal(150, datatype=xsd_namespace.decimal)))
g.add((power, qpt_namespace.measuredIn, qpt_namespace.watt))

reactorVolume = controller.getNewEntity()
g.add((reactorVolume, rdf_namespace.type, qpt_namespace.ReactorVolume))
g.add((source_0, qpt_namespace.hasConfiguration, reactorVolume))
g.add((reactorVolume, qpt_namespace.value, rdflib.Literal(70, datatype=xsd_namespace.decimal)))
g.add((reactorVolume, qpt_namespace.measuredIn, qpt_namespace.litre))


<Graph identifier=N2a821d40de3443eaa752da727af27551 (<class 'rdflib.graph.Graph'>)>

### Target

In [32]:
studies_df[["Plasma Target Name","Plasma Target Properties", "Plasma Target Procedure"]].dropna(how="all")

Unnamed: 0,Plasma Target Name,Plasma Target Properties,Plasma Target Procedure
4,tungsten,Tungsten can withstand temperatures up to abou...,"Due to the erosion, the cathode has to be repl..."
17,mild steel,"Mild steel: wire material G3Si1, base material...",Bead-on-plate welding
21,mild steel,"Mild steel: wire material ER70 S-6, base mater...",Bead-on-plate welding
23,Si wafer; SiOCH,Cutted pieces of 10 times 10 cm from a 300 mm ...,Each peace is placed on an Al wafer with 200 m...
26,H2O2; H2SO4; NaN3; D2O; PBN; TEMP; TEMPO; sodi...,"Hydrogen peroxide H₂O₂ (30%), sulphuric acid H...","In spin trapping experiments, a 100 mM solutio..."
28,Polyamide 12,Polyamide 12 powder with mean diameter of 60 µ...,Dosis: 10 mg in 50 ms introduced in the argon ...


Each element of the semicolon spearated list is treated as a separete target of a study. 

In [33]:
target_dict = {i:{k:controller.getNewEntity() for k in row} 
               for i,row in studies_df["formatted_Target_Name"].items() if len(row)>0}

In [34]:
for k,v in target_dict.items():
    for x in v.values():
        g.add((x, rdf_namespace.type, qpt_namespace.Target))
        g.add((studies_dict[k], qpt_namespace.hasInputVar, x))

search for chemical compounds to connect to target 

In [35]:
search_dict = {f"\"{x}\"":x for x in set(itertools.chain(*studies_df["formatted_Target_Name"])) }

In [36]:
target_query = """
SELECT ?search_label ?entity
WHERE {
    SERVICE <http://qptdat.plasma-mds.org/sparql> {
        ?entity rdf:type/rdfs:subClassOf* qpt:ChemicalCompound .
        ?entity rdfs:label ?entity_label .
        FILTER(REGEX(?entity_label,?search_label,"i"))

        VALUES ?search_label { """ + " ".join(search_dict.keys()) + """ }
    }
}
"""# TODO check result_dict
qres = remote_graph.query(target_query)
result_dict = {search_dict[f"\"{row.search_label}\""]:
               row.entity for row in qres} 
unmapped = [v for k,v in search_dict.items() if v not in result_dict]
print(f"unmapped strings: {unmapped}")

unmapped strings: ['H2O', 'Si wafer', 'TEMPO', 'NaN3', 'Polyamide 12', 'DMPO', 'H2O2', 'DEPMPO', 'TEMP', 'SiOCH', 'mild steel', 'tungsten', 'sodium tosylate', 'H2SO4', 'potassium bis(oxalato)oxotitanate(IV) dihydrate', 'PBN', 'D2O']


In [37]:
for i,row in studies_df.iterrows():
    for target in row["formatted_Target_Name"]: 
        if target in result_dict:
            g.add((target_dict[i][target], qpt_namespace.composedOf, result_dict[target]))

The remaining unstructured data of each target has to be extended manually. Below an example is shown. 

In [38]:
print(studies_df["Plasma Target Properties"].iloc[17])
print("----")
print(studies_df["Plasma Target Procedure"].iloc[17])

Mild steel: wire material G3Si1, base material S235
----
Bead-on-plate welding


In [39]:
target_17 = target_dict[17]["mild steel"]

In [40]:
#TODO add properties of target 
process = controller.getNewEntity()
g.add((process, rdf_namespace.type, qpt_namespace.ExperimentStep))
g.add((studies_dict[17], qpt_namespace.hasExperimentStep, process))
g.add((process, qpt_namespace.hasInputVar, target_17))
g.add((process, rdfs_namespace.label, rdflib.Literal("Bead-on-plate welding", lang="en")))

<Graph identifier=N2a821d40de3443eaa752da727af27551 (<class 'rdflib.graph.Graph'>)>

### Medium

In [41]:
studies_df[["Plasma Medium Name","Plasma Medium Properties", "Plasma Medium Procedure"]].dropna(how="all")

Unnamed: 0,Plasma Medium Name,Plasma Medium Properties,Plasma Medium Procedure
0,H2,Hydrogen with a gas purity of 6.0 and a flow o...,
1,Ar; H2,The plasma is generated in a mixture of Ar (40...,A plasma jet is generated in a mixture of Ar (...
2,N2/O2,A gas mixture of 0.1 vol% O2 (purity 4.8) in N...,The total gas flow through the cell was set to...
3,Ar; Ar/O2,"Mixture of Ar and O2, the O2-admixture is vari...",The increase of the O2-admixture has to be fol...
4,Ar,"Thermal plasma in the arc column, non-equilibr...",
5,C2H6,Gas temperature: 293.15 K\nPressure: 5 to 1000 Pa,
6,,Gas temperature: 293.15 K\nPressure: 5 to 1000 Pa,
7,C2H2,Gas temperature: 293.15 K\nPressure: 5 to 1000 Pa,
8,Ar; TMS,Gas temperature: 300 K; Pressure: 1 atm; Gas m...,
9,Ar,"Pure argon, gas pressure is 760 Torr, constant...",


The semicolon separeted list of names is treated as mixture and each study is connected to only one medium. 

In [42]:
medium_dict = {i:controller.getNewEntity() for i in studies_df.dropna(subset=["Plasma Medium Name", "Plasma Medium Properties", 
                                                                              "Plasma Medium Procedure"], how="all").index}

In [43]:
search_dict = {f"\"{x}\"":x for x in set(itertools.chain(*studies_df["formatted_Medium_Name"])) }

In [44]:
medium_query = """
SELECT ?search_label ?entity
WHERE {
    SERVICE <http://qptdat.plasma-mds.org/sparql> {
        ?entity rdf:type/rdfs:subClassOf* qpt:ChemicalCompound .
        ?entity rdfs:label ?entity_label .
        FILTER(REGEX(?entity_label,?search_label,"i"))

        VALUES ?search_label { """ + " ".join(search_dict.keys()) + """ }
    }
}
"""# TODO check result_dict
qres = remote_graph.query(medium_query)
result_dict = {search_dict[f"\"{row.search_label}\""]:
               row.entity for row in qres} 
unmapped = [v for k,v in search_dict.items() if v not in result_dict]
print(f"unmapped strings: {unmapped}")

unmapped strings: ['H2O', 'C2H6', 'H2', 'N2', 'HMDSO', 'Ar', 'He', 'CHF3', 'N2/O2', 'Ar/O2', 'TMS', 'CF4', 'Cu', 'C2H2', 'CO2']


In [45]:
for i,row in studies_df.iterrows():
    if i in medium_dict:
        g.add((medium_dict[i],rdf_namespace.type,qpt_namespace.Medium))
        g.add((studies_dict[i],qpt_namespace.hasInputVar , medium_dict[i]))
        for medium in row["formatted_Medium_Name"]:
            if medium in result_dict:
                g.add((medium_dict[i], qpt_namespace.composedOf, result_dict[medium]))

The remaining unstructured data of each medium has to be extended manually. Below an example is shown.

In [46]:
print(studies_df["Plasma Medium Properties"].iloc[1])
print("----")
print(studies_df["Plasma Medium Procedure"].iloc[1])

The plasma is generated in a mixture of Ar (40 NLPM) and H2 (14 NLPM) and in pure Ar.
----
A plasma jet is generated in a mixture of Ar (40 NLPM) and H2 (14 NLPM) and in pure Ar at an electric current of 600 A. 


In [47]:
medium_1 = medium_dict[1]

In [48]:
#TODO add properties of medium 
process = controller.getNewEntity()
g.add((process, rdf_namespace.type, qpt_namespace.ExperimentStep))
g.add((studies_dict[1], qpt_namespace.hasExperimentStep, process))
g.add((process, qpt_namespace.hasInputVar, medium_1))
g.add((process, rdfs_namespace.label, rdflib.Literal("A plasma jet is generated in a mixture of Ar (40 NLPM) and H2 (14 NLPM) and in pure Ar at an electric current of 600 A.", lang="en")))


<Graph identifier=N2a821d40de3443eaa752da727af27551 (<class 'rdflib.graph.Graph'>)>

### Diagnostics

In [52]:
studies_df[["Plasma Diagnostics Name","Plasma Diagnostics Properties", "Plasma Diagnostics Procedure"]].dropna(how="all")

Unnamed: 0,Plasma Diagnostics Name,Plasma Diagnostics Properties,Plasma Diagnostics Procedure
0,fluid model; TALIF; RFEA,The low-pressure hydrogen plasma driven by the...,
1,CFD simulations; turbulence model,,
2,electrical measurements; current measurement; ...,Voltage probe: Tektronix P6015A; current probe...,Optical diagnostics were performed with a fast...
4,boundary layer model; microarc model,,The erosion of the cathode made of lanthanated...
5,swarm map recording; electron Boltzmann calcul...,The experiments are based on a scanning drift ...,
6,swarm map recording; electron Boltzmann calcul...,The experiments are based on a scanning drift ...,
7,swarm map recording; electron Boltzmann calcul...,The experiments are based on a scanning drift ...,
8,fluid-Poisson model; current measurement; volt...,Fluid-Poisson model:; \nMesh size: Non-equidis...,"Fluid-Poisson model:; \nThe time-dependent, sp..."
9,fluid-Poisson model; plasma chemical model,Model: Fluid-Poisson model in cylindrical geom...,An atmospheric-pressure DBD in argon is invest...
10,electrical measurements; current measurement; ...,Electrical measurements were performed with fa...,All data were recorded spectrally-integrated.; \n


In [58]:
diagnostics_groups = [["278af539-7a4e-4fe5-8064-37c53a0ee7f5","8c6093fc-0672-4f1e-8dad-eb414320bead", 
 "95b7984b-bd8d-48f0-b5cf-c469d9fd17ff"]]

diagnostic_dict=dict()
for group in diagnostics_groups:
    group_dict= dict()
    i = studies_df.index[studies_df["Identifier"] == group[0]][0]
    for name in studies_df["formatted_Diagnostics_Name"].loc[i]:
        group_dict[name] = controller.getNewEntity() 
    
    for x in group:
        i = studies_df.index[studies_df["Identifier"] == x][0]
        diagnostic_dict[i]=group_dict    
        
for i in studies_df.dropna(subset=["formatted_Diagnostics_Name"]).index:
    if not i in diagnostic_dict:
        diagnostic_dict[i]={name:controller.getNewEntity() for name in studies_df["formatted_Diagnostics_Name"].loc[i]}

In [None]:
for k,v in diagnostic_dict.items():
    for n,e in v.items():
        g.add((e, rdf_namespace.type, qpt_namespace.DiagnosticDevice))
        g.add((studies_dict[k], qpt_namespace.hasInputVar, e))

#### Find suitable subclasses of source based on the name

In [None]:
search_dict = {f"\"{x}\"":x for x in set(itertools.chain(*studies_df["formatted_Diagnostics_Name"])) }

In [None]:
source_query = """
SELECT ?search_label ?device_class
WHERE {
    SERVICE <http://qptdat.plasma-mds.org/sparql> {
        ?device_class rdfs:subClassOf* qpt:DiagnosticDevice .
        ?device_class rdfs:label ?device_label .
        FILTER(REGEX(?device_label,?search_label,"i"))

        VALUES ?search_label { """ + " ".join(search_dict.keys()) + """ }
    }
}
"""
qres = remote_graph.query(source_query)
result_dict = {search_dict[f"\"{row.search_label}\""]:
               row.device_class for row in qres} 
unmapped = [v for k,v in search_dict.items() if v not in result_dict]
print(f"unmapped strings: {unmapped}")

In [None]:
for i,row in studies_df.iterrows():
    for device_label in row["formatted_Diagnostics_Name"]:
        if device_label in result_dict:
            g.add((diagnostic_dict[i][device_label],rdf_namespace.type,result_dict[device_label]))

The remaining unstructured data of each source has to be extended manually. Below an example is shown.

In [60]:
print(studies_df["Plasma Diagnostics Properties"].iloc[0])
print("-----")
print(studies_df["Plasma Diagnostics Procedure"].iloc[0])

The low-pressure hydrogen plasma driven by the commercial ECR source was investigated by a self-consistent fluid model which is implemented in Comsol Multiphysics. The model includes equations for the permanent magnetic field, the microwave field and fluid equations to describe the plasma generation. The stationary spatially homogeneous electron Boltzmann equation is solved once to determine the rate coefficients of reactions between electrons and heavy particle as functions of the mean energy. The static magnetic field is determined for each geometric configuration in advance. The remaining equations describing the microwave and the plasma are solved in a coupled system where the electron density and the absorbed power determine the coupling between the microwave and plasma parts.; 
; Two-photon absorption laser induced ﬂuorescence (TALIF) was used for the detection of hydrogen atoms. For this photons with a wavelength of 205 nm are created through the combination of a frequency doubl

## Validate local experiments with SHACL

In [49]:
shape_graph = rdflib.Graph()
s = """
@prefix qpt_shape: <http://qptdat.plasma-mds.org/shape/> .

@prefix qpt: <http://qptdat.plasma-mds.org/ontology/> .
@prefix qpt_source: <http://qptdat.plasma-mds.org/source_ontology/> .

@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

@prefix sh: <http://www.w3.org/ns/shacl#> .                                                 

qpt_shape:SimplePlasmaStudy a rdfs:Class, sh:NodeShape ;
    sh:targetClass qpt:PlasmaStudy ;
    sh:property
        [
            sh:path qpt:studyTitle ;
            sh:minCount 1 ;
        ] .
"""
shape_graph.parse(data=s)

<Graph identifier=Na932fc81f71049e2a42d04cea041b813 (<class 'rdflib.graph.Graph'>)>

In [50]:
shape_url = "http://qptdat.plasma-mds.org/raw/shape.ttl"
shape_graph = rdflib.Graph()
shape_graph.parse(shape_url)

HTTPError: HTTP Error 500: Internal Server Error

In [None]:
conforms, results_graph, results_text = pyshacl.validate(g,shacl_graph=shape_graph, 
                                                         ont_graph=remote_graph, inference='rdfs',
      abort_on_first=False,
      allow_infos=True,
      allow_warnings=False,
      meta_shacl=False,
      advanced=True,
      js=False,
      debug=False)
print(conforms)

In [None]:
print(results_text)

Write RDF data into local file 

In [None]:
g.serialize(destination="new_studies.ttl")

Loading the created dump into the triplestore

The commands are based on this wiki page: https://vos.openlinksw.com/owiki/wiki/VOS/VirtBulkRDFLoader
1. Copy the new data dump into the triple store container. 
```
docker cp new_studies.ttl knowledge_graph_virtuoso_1:/usr/local/virtuoso-opensource/var/lib/virtuoso/db/dumps
```
2. Start a bash in the triple store container
```
docker exec -it knowledge_graph_virtuoso_1 bash
```
3. Run inside of the container an ISQL terminal
```
isql-v -U dba -P $DBA_PASSWORD
```
4. Run in the ISQL terminal the following comands to load the data into the http://qptdat.plasma-mds.org/resource/ graph.
```
ld_dir('dumps', 'new_studies.ttl', 'http://qptdat.plasma-mds.org/resource/');
rdf_loader_run();
```
5. Close ISQL with exit command 
```
exit;
```