Return the results of a sparql query as a pandas dataframe

LinkedEarth · Feb 18, 2023 · 27fe55b · 27fe55b
1 parent 36ca318
commit 27fe55b
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 7 deletions.
diff --git a/pylipd/lipd.py b/pylipd/lipd.py
@@ -8,13 +8,14 @@
 import re
 import os.path
 import tempfile
+import pandas as pd
 
 from rdflib import ConjunctiveGraph, Namespace
 from pylipd.multi_processing import multi_convert_to_pickle, multi_convert_to_rdf
 
 from pylipd.rdf_to_lipd import RDFToLiPD
 from pylipd.legacy_utils import LiPD_Legacy
-from pylipd.utils import sanitizeId
+from pylipd.utils import sanitizeId, sparql_results_to_df
 
 from .globals.urls import NSURL, ONTONS
 
@@ -261,6 +262,9 @@ def query(self, query, result="sparql"):
 
         result : dict
             Dictionary of sparql variable and binding values
+        
+        result_df : pandas.DataFrame
+            The query results above, converted to a pandas.DataFrame
     
         Examples
         --------
@@ -282,17 +286,21 @@ def query(self, query, result="sparql"):
                             ?ds a le:Dataset .
                             ?ds le:hasUrl ?url
                         }"""
-                result = lipd.query(query)
-                print(result)
+                result, result_df = lipd.query(query)
+                result_df
         '''
 
         if self.remote:
             matches = re.match(r"\s*SELECT\s+(.+)\s+WHERE\s+{(.+)}\s*", query, re.DOTALL)
             if matches:
                 vars = matches.group(1)
                 where = matches.group(2)
-                query = f"SELECT {vars} WHERE {{ SERVICE <{self.endpoint}> {{ {where} }} }}"
-        return self.graph.query(query, result=result)
+                query = f"SELECT {vars} WHERE {{ SERVICE <{self.endpoint}> {{ {where} }} }}"   
+
+        result = self.graph.query(query)
+        result_df = sparql_results_to_df(result)
+
+        return result, result_df 
 
     def load_remote_datasets(self, dsids):
         '''Loads remote datasets into cache if a remote endpoint is set

diff --git a/pylipd/tests/test_LiPD.py b/pylipd/tests/test_LiPD.py
@@ -80,7 +80,7 @@ def test_query_t0(self):
         if  __name__=="__main__":
             lipd = LiPD()
             lipd.load(url)
-            result = lipd.query(query)
+            result, result_df = lipd.query(query)
 
 
 
diff --git a/pylipd/utils.py b/pylipd/utils.py
@@ -4,6 +4,9 @@
 import math
 import random
 
+from pandas import DataFrame
+from rdflib.plugins.sparql.processor import SPARQLResult
+
 import zlib, json, base64
 
 def ucfirst(s):
@@ -78,4 +81,15 @@ def expand_schema(schema) :
                 for altkey in pdetails["alternates"]: 
                     xschema[key][altkey] = pdetails
     xschema["__expanded"] = True
-    return xschema
+    return xschema
+
+
+def sparql_results_to_df(results: SPARQLResult) -> DataFrame:
+    """
+    Export results from an rdflib SPARQL query into a `pandas.DataFrame`,
+    using Python types. See https://github.com/RDFLib/rdflib/issues/1179.
+
+    Parameters
+    ----------
+    results : SPARQLResult
+        Result object of an rdflib SPARQL query. It is iterated row by row,
+        and its ``vars`` attribute supplies the column names.
+
+    Returns
+    -------
+    DataFrame
+        One row per row of ``results`` and one column per entry of
+        ``results.vars`` (each name converted with ``str``). Values that are
+        ``None`` stay ``None``; every other value is replaced by the return
+        value of its ``toPython()`` method.
+    """
+    # NOTE(review): ``results.vars`` is iterated unconditionally -- confirm it
+    # is always populated for the kinds of query passed in here.
+    return DataFrame(
+        data=([None if x is None else x.toPython() for x in row] for row in results),
+        columns=[str(x) for x in results.vars],
+    )