In [1]:
%%capture
!pip install openpyxl

In [2]:
import pandas as pd
import openpyxl
from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import DCTERMS, DC, XSD, RDFS, DCAT, RDF, FOAF
import os
import urllib.parse
from datetime import datetime

g = Graph()
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")

In [3]:
fileURI = URIRef("https://g-68f8be.81de.36fe.data.globus.org/Pages/Sample%20clinical%20data_July%202022.xlsx")

g.add((fileURI, RDF.type, DCAT.Dataset))
g.add((fileURI, DCTERMS.format, WD.Q63082925))
filename = 'Sample clinical data_July 2022.xlsx'
g.add((fileURI, RDFS.label, Literal(filename, datatype=XSD.string)))
datecreated = os.path.getctime(filename)
g.add((fileURI, DCTERMS.issued, Literal(datetime.fromtimestamp(datecreated).strftime('%Y-%m-%dT%H:%M:%S'), datatype=XSD.date)))
datemodified = os.path.getmtime(filename)
g.add((fileURI, DCTERMS.modified, Literal(datetime.fromtimestamp(datemodified).strftime('%Y-%m-%dT%H:%M:%S'), datatype=XSD.date)))

<Graph identifier=N74a8376450e8453d8da0a4495a791012 (<class 'rdflib.graph.Graph'>)>

In [4]:
print(g.serialize(format="turtle"))

@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://g-68f8be.81de.36fe.data.globus.org/Pages/Sample%20clinical%20data_July%202022.xlsx> a dcat:Dataset ;
    rdfs:label "Sample clinical data_July 2022.xlsx"^^xsd:string ;
    dcterms:format <http://www.wikidata.org/entity/Q63082925> ;
    dcterms:issued "2022-09-02"^^xsd:date ;
    dcterms:modified "2022-09-01"^^xsd:date .




In [5]:
pd.set_option('display.max_columns', None)
xls = pd.ExcelFile(filename)
subjects = pd.read_excel(xls)
subjects

Unnamed: 0,ID Lab site,ID clinical site,DOB,Age,Gender,Pathology,Localization,IDH1/2 status,MGMT status,ECOG,Nature of initial resection,lymphocyte counts at V1 in G/L,cummulative dose preV1 steroid use in mg,steroid use after V1,cummulative steroid dose in mg,Cycles adj TMZ,doses bevacizumab,Best response,pseudo-progression (1=yes 0=no),œdema,PP or œdema,progression (1=yes 0=no),date of progression,months PFS,deceased (1=yes 0=no),months OS
0,Ge 829,DAG,1953-05-16,62,F,GBM,parietal R,wt,UnMeth,1,biopsy,0.69,5420,no,5420.0,2,12,PD,0,1,1,1,2013-11-11,3,1,19
1,Ge 835,EAG,1964-06-18,50,M,GBM,temporal D,wt,UnMeth,0,subtot,1.05,12,yes,506.5,3,0,PD,1,1,1,1,2014-01-21,6,1,10
2,Ge 849,FAG,1957-12-15,57,F,GBM,frontal R,wt,UnMeth,0,subtot,0.64,0,yes,404.0,6,9,PD,1,0,1,1,2014-09-02,11,1,18
3,Ge 852,GAG,1960-03-22,55,M,GBM,fronto insular L,wt,UnMeth,0,subtot,2.1,0,yes,260.0,6,16,SD,0,1,1,1,2014-10-01,11,1,21
4,Ge 882,IAG,1948-05-06,67,M,GBM,fronto insular R,wt,UnMeth,0,subtot,0.68,0,yes,240.0,6,4,PD,0,1,1,1,2014-12-22,18,1,24
5,Ge 893*,KAG,1958-07-12,56,M,GBM,temporal R,wt,Meth,0,subtot,1.05,0,no,0.0,6,0,SD,0,0,0,1,2015-02-04,26,0,44
6,Ge 901,LAG,1991-06-09,23,F,AIII,fronto temporal insular R,mut,UnMeth,0,subtot,0.99,0,no,0.0,6,29,SD,1,0,1,1,2016-07-18,17,1,37
7,Ge 904,MAG,1949-10-13,65,M,GBM,parieto occipital L,wt,Meth,0,subtot,1.67,0,no,0.0,6,6,PR,1,1,1,1,2016-12-12,29,1,35
8,Ge 939,PAG,1972-04-10,43,F,AIII,temporal R,wt,UnMeth,1,biopsy,1.37,0,no,0.0,2,42,PD,1,1,1,1,2015-06-12,6,1,32
9,Ge 941*,OAG,1954-01-07,61,M,GBM,frontal L,mut,Meth,0,subtot,0.94,0,no,0.0,6,0,CR,1,0,1,0,NaT,38,0,38


In [6]:
BASE = Namespace("http://idr.semscape.org/")
SUBJECT = Namespace(BASE.subject)
OBO = Namespace("http://purl.obolibrary.org/obo/")
g.bind("obo", "http://purl.obolibrary.org/obo/")
SCHEMA = Namespace("https://schema.org/")
SIO = Namespace("http://semanticscience.org/resource/")
NODE = Namespace("http://idr.semscape.org/nodeURI/")

In [7]:
for index, row in subjects.iterrows():
    # Row Subject
    patientID = rllib.parse.quote(row["Sample ID"])
    g.add((SUBJECT["/"+patientID], RDF.type, OBO.NCIT_C16960))
    g.add((SUBJECT["/"+patientID], DCTERMS.identifier, Literal(patientID, datatype=XSD.string)))
    
    subjectProperties = BNode()
    g.add((SUBJECT["/"+patientID], OBO.RO_0000053, subjectProperties))
    # Row Age
    ## please store birthdate, since Age does not age well in data. 
    
    # g.add((subjectProperties, SCHEMA.birthDate, Literal(str(int(row["Age (year of birth)"])), datatype=XSD.date)))
    
      

    
    # Family history
    familyhistory = BNode()
    g.add((familyhistory, RDF.type, RDF.Bag))
    g.add((familyhistory, RDFS.domain, OBO.NCIT_C17726))
    g.add((subjectProperties, OBO.BFO_0000185, familyhistory))
    if row["Premen BRCA"] == 1:
        premenbrca = BNode()
        g.add((familyhistory, RDFS.member, premenbrca))
        g.add((premenbrca, RDF.type, OBO.NCIT_C49152))
        g.add((premenbrca, RDFS.domain, OBO.DOID_1612))
        g.add((premenbrca, URIRef("http://rs.tdwg.org/dwc/terms/lifeStage"), OBO.OMIT_0018003))
        g.add((premenbrca, RDF.value, Literal(int(row["Premen BRCA"]), datatype=XSD.int)))

    if row["Post BRCA"] == 1:
        postbrca = BNode()
        g.add((familyhistory, RDFS.member, postbrca))
        g.add((postbrca, RDF.type, OBO.NCIT_C49152))
        g.add((postbrca, RDFS.domain, OBO.DOID_1612))
        g.add((postbrca, URIRef("http://rs.tdwg.org/dwc/terms/lifeStage"), OBO.OMIT_0023947))
        g.add((postbrca, RDF.value, Literal(int(row["Post BRCA"]), datatype=XSD.int)))

    if row["OVCA"] == 1:
        ovca = BNode()
        g.add((familyhistory, RDFS.member, ovca))
        g.add((ovca, RDF.type, OBO.NCIT_C49152))
        g.add((ovca, RDFS.domain, OBO.DOID_2394))
        g.add((ovca, RDF.value, Literal(int(row["OVCA"]), datatype=XSD.int)))
        
    if row["Male BrCA"] == 1:
        malebrca = BNode()
        g.add((familyhistory, RDFS.member, malebrca))
        g.add((malebrca, RDF.type, OBO.NCIT_C15709))
        g.add((malebrca, RDFS.domain, OBO.DOID_1614))
        g.add((malebrca, RDF.value, Literal(int(row["OVCA"]), datatype=XSD.int)))
        
    
    if row["BRCA1.1"] == 1:
        fambrca1test = BNode()
        g.add((familyhistory, RDFS.member, fambrca1test))
        g.add((fambrca1test, RDF.type, OBO.NCIT_C15709))
        g.add((fambrca1test, RDFS.domain, OBO.NCIT_C17965))
         
    if row["BRCA2.1"] == 1:
        fambrca2test = BNode()
        g.add((familyhistory, RDFS.member, fambrca2test))
        g.add((fambrca1test, RDF.type, OBO.NCIT_C15709))
        g.add((fambrca2test, RDFS.domain, OBO.NCIT_C18120))
    if row["PALB2"] == 1:
        fampalb2test = BNode()
        g.add((familyhistory, RDFS.member, fampalb2test))
        g.add((fambrca1test, RDF.type, OBO.NCIT_C15709))
        g.add((fampalb2test, RDFS.domain, OBO.NCIT_C84945))
    if row["Other"] == 1:
        othertest = BNode()
        g.add((familyhistory, RDFS.member, othertest))
        g.add((othertest, RDF.type, OBO.NCIT_C15709))
        g.add((fampalb2test, RDFS.domain, OBO.NCIT_C113067))
   
    # Prior Diagnosis
    medicalhistory = BNode()
    g.add((medicalhistory, RDF.type, RDF.Bag))
    g.add((medicalhistory, RDFS.domain, OBO.SIO_010673))
    g.add((subjectProperties, OBO.BFO_0000185, medicalhistory))
    if row["Breast.1"] == 1:
        past_diagnosis = BNode()
        g.add((medicalhistory, RDFS.member, past_diagnosis))
        g.add((past_diagnosis, RDF.type, OBO.C4872))
        g.add((past_diagnosis, DC.date, Literal(str(int(row["Year  Dx"]))+"-"+str(int(row["Month Dx"])), datatype=XSD.date)))
      
    # Menopause
    ## premenopause
    if row["Pre"] == 1:
        g.add((subjectProperties, URIRef("http://rs.tdwg.org/dwc/terms/lifeStage"), OBO.OMIT_0018003))
    if row["Peri"] == 1:
        g.add((subjectProperties, URIRef("http://rs.tdwg.org/dwc/terms/lifeStage"), OBO.OMIT_0023947))
    if row["Post"] == 1:
        g.add((subjectProperties, URIRef("http://rs.tdwg.org/dwc/terms/lifeStage"), OBO.OMIT_0018004))

    subjectArchivedSpecimen = BNode()
    g.add((SUBJECT["/"+row["Subject"]], OBO.OBI_0100051, subjectArchivedSpecimen))
    g.add((subjectArchivedSpecimen, RDF.type, OBO.NCIT_C19157))
    
    #Diagnosis
    diagnosis = BNode()
    g.add((subjectArchivedSpecimen, OBO.RO_0040035, diagnosis))
    g.add((medicalhistory, RDFS.member, diagnosis))
    if row["Invasive CA"] == 1:
        g.add((diagnosis, RDF.type, OBO.NCIT_C9480)) # Invasive carcinoma
    if row["DCIS"] == 1:
        g.add((diagnosis, RDF.type, OBO.NCIT_C2924)) # DCIS
    if row["LCIS"] == 1:
        g.add((diagnosis, RDF.type, OBO.NCIT_C137839)) # LCIS = Breast Pleomorphic Lobular Carcinoma In Situ?
    if row["Atypia"] == 1:
        g.add((diagnosis, RDF.type, OBO.NCIT_C9478)) # Atypia
    g.add((diagnosis, DC.date, Literal(str(int(row["Year"]))+"-"+str(int(row["Month"])), datatype=XSD.date)))
 
    # How should "Normal" be interpreted?
    
    tfinding = BNode()
    g.add((diagnosis, OBO.NCIT_R108, tfinding))
    g.add((tfinding, RDF.type, OBO.NCIT_C48879))
    g.add((tfinding, RDF.value, Literal("T"+str(row["T"]), datatype=XSD.string)))
    
    nfinding = BNode()
    g.add((diagnosis, OBO.NCIT_R108, nfinding))
    g.add((nfinding, RDF.type, OBO.NCIT_C48879))
    g.add((nfinding, RDF.value, Literal("N"+str(row["N"]), datatype=XSD.string)))
    
    mfinding = BNode()
    g.add((diagnosis,OBO.NCIT_R108, mfinding))
    g.add((mfinding, RDF.type, OBO.NCIT_C48879))
    g.add((mfinding, RDF.value, Literal("M"+str(row["M"]), datatype=XSD.string)))
    
    # Expression
    if not pd.isna(row["ER"]):
        er = BNode()
        g.add((subjectArchivedSpecimen, OBO.TXPO_0001873, er))
        g.add((er, RDF.type, OBO.NCIT_C25209))
        g.add((er, RDFS.range, OBO.NCIT_C16150))
        if row["ER"] == 1:
            g.add((er, RDF.value, Literal(True)))
        if row["ER"] == 0:
            g.add((er, RDF.value, Literal(False)))
            
    if not pd.isna(row["PR"]):
        pr = BNode()
        g.add((subjectArchivedSpecimen, OBO.TXPO_0001873, pr))
        g.add((pr, RDF.type, OBO.NCIT_C25209))
        g.add((pr, RDFS.range, OBO.NCIT_C16149))
        if row["PR"] == 1:
            g.add((pr, RDF.value, Literal(True)))
        if row["PR"] == 0:
            g.add((pr, RDF.value, Literal(False)))
            
    if not pd.isna(row["HER2"]):
        her = BNode()
        g.add((subjectArchivedSpecimen, OBO.TXPO_0001873, her))
        g.add((her, RDF.type, OBO.NCIT_C25209))
        g.add((her, RDFS.range, OBO.NCIT_C184942))
        if row["HER2"] == 1:
            g.add((her, RDF.value, Literal(True)))
        if row["HER2"] == 0:
            g.add((her, RDF.value, Literal(False)))
    
    if not pd.isna(row["Ki67 (%)"]):
        ki67 = BNode()
        g.add((subjectArchivedSpecimen, OBO.TXPO_0001873, ki67))
        g.add((her, RDF.type, OBO.NCIT_C25209))
        g.add((her, RDF.value, Literal(row["Ki67 (%)"])))
        g.add((her, SIO.SIO_000221, SIO.SIO_001413))
        
            
    # collection method
    collectionmethod = BNode()
    g.add((subjectArchivedSpecimen, OBO.HSO_0000288, collectionmethod))
    g.add((collectionmethod, RDF.type, OBO.NCIT_C70700))
    if row["core Biopsy (y/n)"] == 1:
        g.add((collectionmethod, RDF.value, OBO.NCIT_C15189))
    if row["Excisional biopsy"] == 1:
        g.add((collectionmethod, RDF.value, OBO.NCIT_C51633))
    if row["Mastectomy"] == 1:
        g.add((collectionmethod, RDF.value, OBO.NCIT_C15279))
    
    # Hypoxia time (mins)
    specimenHandling = BNode()
    g.add((subjectArchivedSpecimen, OBO.RO_0002334, specimenHandling))
    g.add((specimenHandling, RDF.type, OBO.NCIT_C179745))
    g.add((specimenHandling, RDFS.range, OBO.OMIT_0016357))
    g.add((specimenHandling, RDF.value, Literal(row["Hypoxia time (mins)"])))
    g.add((specimenHandling, SIO.SIO_000221, SIO.SIO_000434)) # unit minutes 
    
    # Formalin Fix Time (mins)
    specimenHandling = BNode()
    g.add((subjectArchivedSpecimen, OBO.RO_0002334, specimenHandling))
    g.add((specimenHandling, RDF.type, OBO.NCIT_C179745))
    g.add((specimenHandling, RDFS.range, OBO.OBIB_0000718))
    g.add((specimenHandling, RDF.value, Literal(row["Formalin Fix Time"])))
    g.add((specimenHandling, SIO.SIO_000221, SIO.SIO_000434)) # unit minutes 
    
    # Mutations
    if row["BRCA1"] == 1:
        known_mutation = BNode()
        g.add((SUBJECT["/"+row["Subject"]], OBO.CLO_0037333, known_mutation))
        g.add((known_mutation, RDF.type, OBO.NCIT_C36391))
        g.add((known_mutation, RDFS.range, OBO.NCIT_C17965))
        g.add((known_mutation, RDFS.comment, Literal(row["Specific MT (text)"])))
    if row["BRCA2"] == 1:
        known_mutation = BNode()
        g.add((SUBJECT["/"+row["Subject"]], OBO.CLO_0037333, known_mutation))
        g.add((known_mutation, RDF.type, OBO.NCIT_C36391))
        g.add((known_mutation, RDFS.range, OBO.NCIT_C18120))
        g.add((known_mutation, RDFS.comment, Literal(row["Specific MT (text)"])))
    if row["other"] == 1:
        known_mutation = BNode()
        g.add((SUBJECT["/"+row["Subject"]], OBO.CLO_0037333, known_mutation))
        g.add((known_mutation, RDF.type, OBO.NCIT_C36391))
        g.add((known_mutation, RDFS.range, OBO.NCIT_C16612))
        g.add((known_mutation, RDFS.comment, Literal(row["Specific MT (text)"])))
    
    # Treatment
    if row["NACT (y/n)"] == 1:
        treatment = BNode()
        g.add((subjectArchivedSpecimen, WDT.P2176, treatment))
        g.add((treatment, RDF.type, OBO.NCIT_C15665))
        if not pd.isna(row["Year.1"]):
            g.add((treatment, SCHEMA.startDate, Literal(str(row["Year.1"])+"-"+str(row["Month.1"]), datatype=XSD.date)))
        if not pd.isna(row["Year.2"]):
            g.add((treatment, SCHEMA.endDate, Literal(str(row["Year.2"])+"-"+str(row["Month.2"]), datatype=XSD.date)))
    if row["PARP-I y/n"] == 1:
        treatment = BNode()
        g.add((subjectArchivedSpecimen, WDT.P2176, treatment))
        g.add((treatment, RDF.type, OBO.NCIT_C62554))
        if not pd.isna(row["Year.1"]):
            g.add((treatment, SCHEMA.startDate, Literal(str(row["Year.1"])+"-"+str(row["Month.1"]), datatype=XSD.date)))
        if not pd.isna(row["Year.2"]):
            g.add((treatment, SCHEMA.endDate, Literal(str(row["Year.2"])+"-"+str(row["Month.2"]), datatype=XSD.date)))
   
    
        
        
    
    
    
    

KeyError: 'Subject'

In [None]:
print(g.serialize(destination=filename+".ttl", format="turtle"))
print(g.serialize(format="turtle"))


In [None]:
for node in g.all_nodes():
    if isinstance(node, URIRef):
        try:
            g.parse(node)
        except:
            print(node)

In [None]:
g.parse("http://www.wikidata.org/entity/Q63082925.ttl")

# Extract schema

In [None]:
from shexer.shaper import Shaper
from shexer.consts import NT, TURTLE

q = "select ?class where { ?item rdf:type ?class }"
target_classes = []
x = g.query(q)
for target_class in x:
  if str(target_class["class"]) not in target_classes:
    target_classes.append(str(target_class["class"]))

shex_target_file = filename+".shex"

shaper = Shaper(target_classes=target_classes,
                rdflib_graph=g,
                input_format=TURTLE,
                )  # Default rdf:type
            
shaper.shex_graph(output_file=shex_target_file)


In [None]:
command = f"cat {filename}.shex"

In [None]:
!command

In [None]:
import os
os.system(command)