## Import the cdm excel

In [57]:
import pandas as pd
import numpy as np
from pathlib import Path
from tabulate import tabulate

# save xlsx to xls format, because pandas doesn't read xslx
# f1 = Path('epnd_minimal_dataset_draft for review_20220411.xlsx')
# f2 = Path('epnd_minimal_dataset_draft for review_20220411.xlsx').stem + ".xls"

cwd = os.getcwd()
df = pd.read_excel(os.path.join(cwd, 'EPNDCS1Terminology.xls'), usecols=["Description", "OntologyClass", "rangeMin", "rangeMax", "type", "ValueClass", "unitClass", "Unit", "Domain"])
#change all NaN to none values
df = df.replace({np.nan: None})

#replace all spaces and special characters for the shape descriptions
df['Description'] = df['Description'].str.replace(' ','_')
df['Description'] = df['Description'].str.replace('\W', '')
df['Domain'] = df['Domain'].str.replace(' ','_')
df['Domain'] = df['Domain'].str.replace('\W', '')

#print df in a tabular form
print(tabulate(df, headers = 'keys', tablefmt = 'github'))

|    | Domain              | OntologyClass                  | Description                                                                                                | Unit              | unitClass        | type    | ValueClass                                                       |   rangeMin |   rangeMax |
|----|---------------------|--------------------------------|------------------------------------------------------------------------------------------------------------|-------------------|------------------|---------|------------------------------------------------------------------|------------|------------|
|  0 | Demographics        | OBI:0003076                    | An_age_since_birth_measurement_datum_at_the_time_of_a_clinical_visit                                       | day               | UO:UO_0000033    | integer |                                                                  |          0 |        130 |
|  1 | Demographics        | EPND:AgeAtSampleDate           |

## Create an empty SHACL file

In [58]:
#create a shacl.ttl file and write down the prefixes 
f = open("EPNDCS1shacl.ttl", "w")
f.write ("""
@prefix OBI: <http://purl.obolibrary.org/obo>.
@prefix ncbi: <https://www.ncbi.nlm.nih.gov/>.
@prefix LNC: <http://purl.bioontology.org/ontology/LNC>.
@prefix ex: <http://example/#> .
@prefix schema: <http://schema.org/> .
@prefix dash: <http://datashapes.org/dash#> . 
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . 
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . 
@prefix sio: <http://semanticscience.org/resource/>. 
@prefix snomed: <http://purl.bioontology.org/ontology/SNOMEDCT/>. 
@prefix UO: <http://purl.obolibrary.org/obo/>. 
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix sty: <http://purl.bioontology.org/ontology/STY/>. 
@prefix loinc: <http://purl.bioontology.org/ontology/LNC/>. 
@prefix EPND: <http://epnd.org/ontology/>. 

""")


874

## Add temporary EPND ontology class label triples

In [59]:

f.write ('''
sty:T081 rdfs:label "quantitative concept".
sty:T080 rdfs:label "qualitative concept".
sty:T079 rdfs:label "temporal concept".
OBI:0003076 rdfs:label "An age since birth at the time of a clinical visit.".
snomed:429019009 rdfs:label "Finding related to biological sex".
snomed:224288002 rdfs:label "Duration of formal education (observable entity)".
snomed:439272007 rdfs:label "Date of procedure (observable entity)".
snomed:459651000124107 rdfs:label "Montreal cognitive assessment score".
snomed:273617000 rdfs:label "Mini Mental State Examination".
snomed:273633002 rdfs:label "Motor assessment scale score".
ncbi:25728 rdfs:label "APOE genotype category".
LNC:MTHU015751 rdfs:label "collection date ".
sio:SIO_000137 rdfs:label "category".
sio:SIO_000914 rdfs:label "binary scale".
sio:SIO_000915 rdfs:label "numeric scale".

EPND:AgeAtSampleDate rdfs:label "Age when biosample was taken.".
EPND:ClinicalDiagnosis  rdfs:label "Diagnosis of cognitive performance, function".
EPND:ClinicalDiagnosisDate rdfs:label "Date of clinical diagnosis".
EPND:ClinicalDementiaDiagnosis rdfs:label "type of dementia diagnosis".
EPND:ADDiseaseStaging rdfs:label "Staging according to a combination of amyloid pathology and cognitive impairment".
EPND:MemoryAssessment rdfs:label "Memory assessment".
EPND:MemoryRawScore rdfs:label "Raw score of Memory assessment".
EPND:MemoryLocalNorms rdfs:label "Local norms of memory assessment".
EPND:standerdisedZScore rdfs:label "Z-score of memory assessment".
EPND:APOE4Status rdfs:label "APOE 4 allele status".
EPND:GBAGenotypeStatus rdfs:label "GBA Genotype status".
EPND:CSFAmyloidBeta42Cutoff rdfs:label "CSF Amyloid-beta42 cut off dependent on the assay and for different pre-analytical procedures in the cohort. ".
EPND:AmyloidBeta42Status  rdfs:label "Amyloid beta 42 status ".
EPND:AmyloidBeta42 rdfs:label "Amyloid beta 42".

snomed:248153007 rdfs:label "Male".
snomed:248152002 rdfs:label "Female".
snomed:32570691000036108 rdfs:label "Intersex".
EPND:NC rdfs:label "Normal cognition".
EPND:MCI rdfs:label "Mild cognitive impairment".
EPND:Dementia rdfs:label "dementia (mild-moderate-severe)".
EPND:ADDementia rdfs:label "Alzheimer disease dementia".
EPND:DLBDementia rdfs:label "Dementia with Lewy bodies ".
EPND:PDDementia rdfs:label "Parkinson's disease dementia".
EPND:PreclinicalAD rdfs:label "Combination of amyloid pathology and no cognitive impairment: Amyloid + NC/SCD".
EPND:ProdromalAD rdfs:label "Combination of amyloid pathology and and cognitive impairment: Amyloid + MCI".
EPND:ADDementia rdfs:label "Combination of amyloid pathology and dementia: Amyloid + dementia".
EPND:E2E2 rdfs:label "APOE allele E2E2".
EPND:E2E3 rdfs:label "APOE allele E2E3".
EPND:E2E4 rdfs:label "APOE allele E2E4".
EPND:E3E3 rdfs:label "APOE allele E3E3".
EPND:E3E4 rdfs:label "APOE allele E3E4".
EPND:E4E4 rdfs:label "APOE allele E4E4".
sio:SIO_000269 rdfs:label "true".
sio:SIO_000270 rdfs:label "false".

''')

2820

## Create continous variable SHACL shapes

In [60]:
from string import Template

#create a class for continous variables. extract columns from df and enter in the template shacle
class ContinousVariable:
    def __init__(self, description, OntologyClass, unitClass, rangeMin, domain, rangeMax):
        """
        The continousVariable class has the variables needed to create a shacl shape for a continous variable .

        Input parameters:
            self:   
            description = human readable description of the variable
            unitClass = units of measurement ontology class id for the unit that the variable is measured in
            rangeMin = smallest value that is accepted for the variable
            rangeMax = largest value that is accepted for the variable
            subdomain = 
        """
        self.description = description
        self.OntologyClass = OntologyClass
        self.rangeMin = rangeMin
        self.rangeMax = rangeMax
        self.unitClass = unitClass
        self.domain = domain

    def shape(self):
        """
        the shape function uses the parameters and fills the template shacl shape.
        Input parameters:

        Return value: shape template with parameters, link to domain shape and link to person shape
        """
        
        #add optional rangeMax shacle shape
        rangeMax = ''
        if self.rangeMax is None:
            rangeMax = ''
        else:
            rangeMax = Template(""";
                            sh:property [ 
                                sh:path sio:SIO_000300; #has value
                                sh:datatype xsd:int; 
                                sh:name ${OntologyClass}; 
                                sh:minCount 1; 
                                sh:maxCount 1; 
                                sh:maxInclusive ${rangeMaxVal};
                            ]""").substitute(rangeMaxVal=self.rangeMax, OntologyClass=self.OntologyClass)

        #change range minimum because it can't be none
        if self.rangeMin is None:
            self.rangeMin = 0.0

        #shacle shape template    
        return Template(
"""ex:${description}_Shape a sh:NodeShape, sio:SIO_000915; #numeric
                            sh:targetClass ${OntologyClass};
                            sh:closed true;
                            sh:property [ 
                                sh:path rdf:type; 
                                sh:hasValue ${unitClass}; 
                                sh:minCount 1;
                            ];
                            sh:property [ 
                                sh:path sio:SIO_000300; #has value
                                sh:datatype xsd:int; 
                                sh:name ${OntologyClass}; 
                                sh:minCount 1; 
                                sh:maxCount 1; 
                                sh:minInclusive ${rangeMin}; 
                            ]${rangeMax}.                                            

ex:${domain}_Shape a sh:NodeShape;
                            sh:targetClass sty:T081; #quantitative concept
                            sh:property [
                                sh:path rdf:type ;
                                sh:hasValue sty:T081; #quantitative concept
                            ];
                            sh:property [
                                sh:path sio:SIO_000011; #is attribute of
                                sh:node ex:${description}_Shape;
                                sh:minCount 1;            
                            ].
                            
"""
        ).substitute(description=self.description, unitClass=self.unitClass, rangeMin=self.rangeMin, rangeMax=rangeMax, OntologyClass=self.OntologyClass, domain=self.domain)


for i, row in df.iterrows():
    if row['type'] == "integer" or row['type'] == "float" or row['type'] == "numeric":
        shacleShape = ContinousVariable(row.Description, row.OntologyClass, row.unitClass, row.rangeMin, row.Domain, row.rangeMax)
        f.write(shacleShape.shape())
        print(shacleShape.shape())



ex:An_age_since_birth_measurement_datum_at_the_time_of_a_clinical_visit_Shape a sh:NodeShape, sio:SIO_000915; #numeric
                            sh:targetClass OBI:0003076;
                            sh:closed true;
                            sh:property [ 
                                sh:path rdf:type; 
                                sh:hasValue UO:UO:UO_0000033; 
                                sh:minCount 1;
                            ];
                            sh:property [ 
                                sh:path sio:SIO_000300; #has value
                                sh:datatype xsd:int; 
                                sh:name OBI:0003076; 
                                sh:minCount 1; 
                                sh:maxCount 1; 
                                sh:minInclusive 0.0; 
                            ];
                            sh:property [ 
                                sh:path sio:SIO_000300; #has value
                                sh:da

## Create date variable SHACL shapes

In [61]:
from string import Template
#Todo create column with all the ontology classes for the date shapes
#create a class for continous variables. extract columns from df and enter in the template shacle
class DateVariable:
    def __init__(self, description, OntologyClass, Unit, domain):
        """
        The dateVariable class has the variables needed to create a shacl shape for a date variable .

        Input parameters:
            self:   
            description = human readable description of the variable
            format = format of date eg YYYY-MM-DD
            domain
        """
        self.description = description
        self.OntologyClass = OntologyClass
        self.Unit = Unit
        self.domain = domain

    def shape(self):
        """
        the shape function uses the parameters and fills the template shacl shape.
        Input parameters:

        Return value: shape template with parameters, link to domain shape
        """

        #shacle shape template    
        return Template(
"""ex:${description}_Shape a sh:NodeShape, sio:SIO_000418; #time instant
                            sh:closed true;
                            sh:targetClass ${OntologyClass};
                            sh:property [ 
                                sh:path rdf:type; 
                                sh:hasValue ${OntologyClass};
                            ];
                            sh:property [ 
                                sh:path sio:SIO_000221; #has unit
                                sh:node ex:dateFormat;
                            ].                                               

ex:dateFormat a sh:NodeShape;
                sh:targetClass schema:Date; #date in ISO 8601 date format
                sh:closed true;
                sh:property [
                    sh:path rdf:type ;
                    sh:hasValue schema:Date; #date in ISO 8601 date format
                ];
                sh:property [
                    sh:path sio:SIO_000300; #has_value
                    sh:datatype xsd:date;
                    skos:altLabel '${Unit}';
                ].

ex:${domain}_Shape a sh:NodeShape;
                            sh:targetClass sty:T079; #temporal concept
                            sh:property [
                                sh:path rdf:type ;
                                sh:hasValue sty:T079; #temporal concept
                            ];
                            sh:property [
                                sh:path sio:SIO_000011; #is attribute of
                                sh:node ex:${description}_Shape;
                                sh:minCount 1;
                            ].            
                    
"""
        ).substitute(description=self.description, OntologyClass=self.OntologyClass, Unit=self.Unit, domain=self.domain)


for i, row in df.iterrows():
    if row['type'] == "date":
        shacleDateShape = DateVariable(row.Description, row.OntologyClass, row.Unit, row.Domain)
        f.write(shacleDateShape.shape())
        print(shacleDateShape.shape())



ex:Date_of_clinical_diagnosis_Shape a sh:NodeShape, sio:SIO_000418; #time instant
                            sh:closed true;
                            sh:targetClass EPND:ClinicalDiagnosisDate;
                            sh:property [ 
                                sh:path rdf:type; 
                                sh:hasValue EPND:ClinicalDiagnosisDate;
                            ];
                            sh:property [ 
                                sh:path sio:SIO_000221; #has unit
                                sh:node ex:dateFormat;
                            ].                                               

ex:dateFormat a sh:NodeShape;
                sh:targetClass schema:Date; #date in ISO 8601 date format
                sh:closed true;
                sh:property [
                    sh:path rdf:type ;
                    sh:hasValue schema:Date; #date in ISO 8601 date format
                ];
                sh:property [
                    sh:path sio:SI

## Create factor variable SHACL shapes

In [62]:
from string import Template

#create a class for factor variables. extract columns from df and enter in the template shacle
class FactorVariable:
    def __init__(self, description, OntologyClass, ValueClass, domain):
        """
        The factorVariable class has the variables needed to create a shacl shape for a date variable .

        Input parameters:
            self:   
            description 
            domain
        """
        self.description = description
        self.OntologyClass = OntologyClass
        self.domain = domain
        self.ValueClass = ValueClass

    def shape(self):
        """
        the shape function uses the parameters and fills the template shacl shape.
        Input parameters:

        Return value: shape template with parameters, link to domain shape
        """
        categoryList = []
        categoryString = ''

        if self.ValueClass is not None: 
            categoryList = self.ValueClass.split(",")
            categoryString = ''.join(map(str, categoryList))
        OntologyClass = self.OntologyClass
        optionList = []
        
        
        for i in categoryList:
            
            option = Template('''   [
                                        sh:property [
                                                sh:path rdf:type;
                                                sh:hasValue ${OntologyClass};
                                                sh:minCount 2;
                                                sh:maxCount 2;
                                        ];
                                        sh:property [
                                                sh:path rdf:type;
                                                sh:hasValue ${i};
                                                sh:minCount 2;
                                                sh:maxCount 2;
                                        ];
                                    ]''').substitute(OntologyClass=self.OntologyClass, i=i)

            optionList.append(option) 

        optionString = ' '.join(map(str, optionList))

        #shacl shape template    
        return Template(
''' 
ex:${description}_Shape  a   sh:NodeShape, sio:SIO_000137; #category
                                sh:closed true;
                                sh:targetClass ${OntologyClass};
                                sh:xone 
                                ( 
                                ${optionString}
                                );
                                sh:property [
                                    sh:path rdf:type ;
                                    sh:in ( ${categoryString} ${OntologyClass} ) ;
                                    sh:minCount 2; # columnClass and valueClass combination 
                                ].
                                               
ex:${domain}_Shape a sh:NodeShape;
                        sh:targetClass sty:T080; #qualitative concept
                        sh:property [
                            sh:path rdf:type ;
                            sh:hasValue sty:T080; #qualitative concept
                        ];
                        sh:property [
                            sh:path sio:SIO_000137; #is attribute of
                            sh:node ex:${description}_Shape;
                            sh:minCount 1;  
                        ].
                                
''' 
                                ).substitute(description=self.description, OntologyClass=self.OntologyClass, ValueClass=self.ValueClass, domain=self.domain, optionString=optionString, categoryString=categoryString)



for i, row in df.iterrows():
    if row['type'] == "factor" or row['type'] == "boolean":
        shaclFactorShape = FactorVariable(row.Description, row.OntologyClass, row.ValueClass, row.Domain)
        f.write(shaclFactorShape.shape())
        print(shaclFactorShape.shape())
      

 
ex:Finding_related_to_biological_sex_Shape  a   sh:NodeShape, sio:SIO_000137; #category
                                sh:closed true;
                                sh:targetClass snomed:429019009;
                                sh:xone 
                                ( 
                                   [
                                        sh:property [
                                                sh:path rdf:type;
                                                sh:hasValue snomed:429019009;
                                                sh:minCount 2;
                                                sh:maxCount 2;
                                        ];
                                        sh:property [
                                                sh:path rdf:type;
                                                sh:hasValue snomed:248153007;
                                                sh:minCount 2;
                                                sh:maxCo

## Create string variable SHACL shapes

In [63]:
from string import Template

#create a class for continous variables. extract columns from df and enter in the template shacle
class StringVariable:
    def __init__(self, description, OntologyClass, domain):
        """
        The stringVariable class has the variables needed to create a shacl shape for a string variable .

        Input parameters:
            self:   
            description = human readable description of the variable
            domain = 
        """
        self.description = description
        self.OntologyClass = OntologyClass
        self.domain = domain

    def shape(self):
        """
        the shape function uses the parameters and fills the template shacl shape.
        Input parameters:

        Return value: shape template with parameters, link to domain shape and link to domain shape
        """
    
        #shacle shape template    
        return Template("""
ex:${description}_Shape a sh:NodeShape;
                            sh:closed true;
                            sh:property [ 
                                sh:path sio:SIO_000300; #has value
                                sh:datatype xsd:string; 
                                sh:name ${OntologyClass};   
                            ].                                             

ex:${domain}_Shape a sh:NodeShape, sio:SIO_000651; #textual entitiy
                            sh:targetClass sty:T080; #qualitative concept
                            sh:property [
                                sh:path rdf:type ;
                                sh:hasValue sty:T080; #qualitative concept
                            ];
                            sh:property [
                                sh:path sio:SIO_000011; #is attribute of
                                sh:node ex:${description}_Shape;
                                sh:minCount 1;            
                            ].
                            
"""
        ).substitute(description=self.description, OntologyClass=self.OntologyClass, domain=self.domain)


for i, row in df.iterrows():
    if row['type'] == "string" :
        shacleShape = StringVariable(row.Description, row.OntologyClass, row.Domain)
        f.write(shacleShape.shape())
        print(shacleShape.shape())


: 