In [18]:
!pip install rdflib
!pip install pandas
!pip install pydicom


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [19]:
import os

# Get the directory of the current script
base_dir = os.getcwd()
try:
    if folder:
        base_dir = os.path.join(base_dir, folder)
except:
    pass

In [20]:
from rdflib import *
from hashlib import sha256
tbox = Namespace('http://www.semanticweb.org/acraf/ontologies/2024/healthmesh/tbox#')
abox = Namespace('http://www.semanticweb.org/acraf/ontologies/2024/healthmesh/abox#')
dcat = Namespace('https://www.w3.org/ns/dcat#')
dcterms = Namespace('http://purl.org/dc/terms/')
dqv = Namespace('http://www.w3.org/ns/dqv#')

In [None]:
#Profiler Class
import os
import pandas as pd
import pydicom
class Profiler:
    
    def __init__(self, file_path, owner="Unknown"):
        self.file_path = file_path
        self.source_graph = self.create_graph()
        self.datasetname = self.self_get_dataset_name()
        self.set_owner = self.set_owner(owner)
        self.file_extension = self.get_file_extension()
        self.technology = self.add_technology()
        self.source_graph = self.extract_metadata()
        
    def create_graph(self):
        """
        This function returns a graph object with the necessary prefixes
        :return: RDF Graph
        """
        g = Graph()
        g.bind('tb', 'http://www.semanticweb.org/acraf/ontologies/2024/healthmesh/tbox#')
        g.bind('ab', 'http://www.semanticweb.org/acraf/ontologies/2024/healthmesh/abox#')
        return g
    
    def set_owner(self, owner):
        # Owner Metadata
        self.source_graph.add((abox[self.datasetname],  tbox.owner, Literal(owner)))
        return owner
    
    def self_get_dataset_name(self):
        
        name = os.path.basename(self.file_path).replace('.', '')
        # Name Metadata
        self.source_graph.add((abox[name], RDF.type, tbox.DataProduct))
        identifier = sha256(name.encode('utf-8')).hexdigest()
        self.source_graph.add((abox[name],dcterms.identifier, Literal(identifier)))
        return name
        

    def get_file_extension(self):
        file_name, file_extension = os.path.splitext(self.file_path)
        format = abox.Tabular
        if file_extension.lower() == '.csv':
            format = abox.Tabular
        elif file_extension.lower() == '.dcm':
            format = abox.Image
        
        # DataSetTypeTemplate Metadata
        self.source_graph.add((format, RDF.type, tbox.DatasetTypeTemplate))
        self.source_graph.add((format, dcterms['format'], Literal(file_extension)))  # Correct usage of the namespace
        self.source_graph.add((abox[self.datasetname], tbox.hasDTT, format))

        return file_extension.lower()
    
    def generate_unique_uri(self, base_uri):
        import uuid
        unique_identifier = str(uuid.uuid4())
        return URIRef(f"{base_uri}{unique_identifier}")
    
    def add_technology(self):
        #triple
        self.source_graph.add((abox[self.datasetname+"_TA"], RDF.type, tbox.TechnologyAspects))
        self.source_graph.add((abox[self.datasetname], tbox.hasTA, abox[self.datasetname+"_TA"]))

        acces_uri = self.generate_unique_uri(abox)
        self.source_graph.add((abox[self.datasetname+"_TA"], tbox.typeAcces, acces_uri))
        self.source_graph.add((acces_uri, RDF.type, tbox.Acces))
        self.source_graph.add((acces_uri, RDFS.label, abox.Static))
        # PATH
        self.source_graph.add((acces_uri, tbox.path, Literal(self.file_path)))

    def extract_metadata(self):
        if self.file_extension.lower() == '.csv':
            return self.extract_csv_metadata()
        elif self.file_extension.lower() == '.dcm':
            return self.extract_dicom_metadata()
        else:
            raise ValueError(f"Unsupported file extension: {self.file_extension}")

    def extract_csv_metadata(self):
        df = pd.read_csv(self.file_path)
        for column in df.columns:
            self.source_graph.add((abox[column], RDF.type, tbox.Attribute))
            self.source_graph.add((abox[self.datasetname], tbox.hasAttribute, abox[column]))
            self.source_graph.add((abox[column], tbox.attribute, Literal(column)))
        return self.source_graph

    def extract_dicom_metadata(self, n_attributes=50):
        ds = pydicom.dcmread(self.file_path)
        # Iterate over all attributes
        for attribute in dir(ds)[:n_attributes]:
            if attribute[0].isalpha():
                if hasattr(ds, attribute):
                    self.source_graph.add((abox[attribute], RDF.type, tbox.Attribute))
                    self.source_graph.add((abox[self.datasetname], tbox.hasAttribute, abox[attribute]))
                    self.source_graph.add((abox[attribute], tbox.attribute, Literal(attribute)))
        return self.source_graph

    def get_source_graph(self):
        return self.source_graph


In [22]:
try:
    if file_path:
        file_path = os.path.join(base_dir, file_path)
except:
    file_path = input("Enter file path: ")


In [23]:
profiler = Profiler(file_path)

In [24]:
graph = profiler.get_source_graph()

Save Graph to File

In [25]:
sdm = Graph().parse(os.path.join(base_dir, '../../FederatedComputationalGovernance/SemanticDataModel/sdm.ttl'), format='turtle')

In [26]:
sdm = Graph().parse(os.path.join(base_dir, '../../FederatedComputationalGovernance/SemanticDataModel/sdm.ttl'), format='turtle')
sdm += graph
sdm.serialize(destination=os.path.join(base_dir, '../../FederatedComputationalGovernance/SemanticDataModel/sdm.ttl'), format='turtle')

<Graph identifier=Nd04fce0f30ff4ec1aebbfe1d0beb7fa9 (<class 'rdflib.graph.Graph'>)>