In [1]:

from rdflib.serializer import Serializer
import configparser
import corpus
import csv
import glob
import json
import rdflib
import sys


In [4]:

# Turtle header emitted once at the top of the generated file: the base IRI
# followed by the namespace prefixes that the templates below rely on.
PREAMBLE = (
    "\n"
    "@base <https://github.com/Coleridge-Initiative/adrf-onto/wiki/Vocabulary> .\n"
    "@prefix cito:<http://purl.org/spar/cito/> .\n"
    "@prefix dct:<http://purl.org/dc/terms/> .\n"
    "@prefix foaf:<http://xmlns.com/foaf/0.1/> .\n"
    "@prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n"
    "@prefix xsd:<http://www.w3.org/2001/XMLSchema#> .\n"
)

# Opening of one dataset description.
# Positional slots: local id, page URL, publisher, title.
# The statement is deliberately left open (last line ends with ";") so the
# loading code can append further predicates before closing it with ".".
TEMPLATE_DATASET = (
    "\n"
    ":{}\n"
    "  rdf:type :Dataset ;\n"
    '  foaf:page "{}"^^xsd:anyURI ;\n'
    '  dct:publisher "{}" ;\n'
    '  dct:title "{}" ;\n'
)

# Opening of one publication description.
# Positional slots: local id, page URL, title, identifier.
# Left open in the same way as TEMPLATE_DATASET; the cito:citesAsDataSource
# links and the closing "." are appended by the loading code.
# (An earlier, fuller draft with an "@en" language tag plus dct:language and
# dct:subject slots has been dropped from this cell.)
TEMPLATE_PUBLICATION = (
    "\n"
    ":{}\n"
    "  rdf:type :ResearchPublication ;\n"
    '  foaf:page "{}"^^xsd:anyURI ;\n'
    '  dct:title "{}" ;\n'
    '  dct:identifier "{}" ;\n'
)

In [11]:

# Start the Turtle output afresh with the header; every later cell appends
# to this list, one text fragment per entry.
out_buf = [ PREAMBLE.lstrip() ]

## load the datasets
# NOTE(review): absolute, user-specific path -- this cell only runs on a
# machine that has the RCDatasets checkout at exactly this location.
dataset_path = "/Users/sophierand/RCDatasets/datasets.json"

# Maps each dataset id from the JSON file to the local name used for it in
# the Turtle output (currently the id itself); the publication cell looks
# dataset references up in this dict.
known_datasets = {}

with open(dataset_path, "r", encoding="utf-8") as f:
    for elem in json.load(f):
        dat_id = elem["id"]
        known_datasets[dat_id] = dat_id

        # datasets without a "url" entry get a placeholder page
        url = elem.get("url", "http://example.com")

        # The values end up inside double-quoted Turtle literals, so quotes,
        # backslashes and control characters must be escaped or the output
        # does not parse. JSON string escapes (\" \\ \n \r \t \b \f \u00XX)
        # are all valid Turtle escapes; ensure_ascii=False keeps non-ASCII
        # text verbatim, and [1:-1] drops the quotes json.dumps adds.
        page, publisher, title = (
            json.dumps(str(text), ensure_ascii=False)[1:-1]
            for text in (url, elem["provider"], elem["title"])
        )

        out_buf.append(
            TEMPLATE_DATASET.format(
                known_datasets[dat_id],
                page,
                publisher,
                title
                ).strip()
            )

        # optional alternative titles, one predicate per entry
        if "alt_title" in elem:
            for alt_title in elem["alt_title"]:
                alt_text = json.dumps(str(alt_title), ensure_ascii=False)[1:-1]
                out_buf.append("  dct:alternative \"{}\" ;".format(alt_text))

        # close the statement left open by the template
        out_buf.append(".\n")

    

In [9]:
# out_buf

In [15]:
## load the publications
# NOTE: this cell appends to `out_buf`. Re-running it without first
# re-running the dataset cell (which resets `out_buf`) emits every
# publication a second time.
filename = "metadata_final/20191021_pub_md.json"

with open(filename, encoding="utf-8") as f:
    for elem in json.load(f):
        link_map = elem["datasets"]

        # only publications that reference at least one dataset are emitted
        if len(link_map) > 0:
            pub_id = elem["pub_id"]

            # The values end up inside double-quoted Turtle literals, so
            # quotes, backslashes and control characters must be escaped or
            # the output does not parse. JSON string escapes are all valid
            # Turtle escapes; ensure_ascii=False keeps non-ASCII text
            # verbatim, and [1:-1] drops the quotes json.dumps adds.
            page, title, doi = (
                json.dumps(str(text), ensure_ascii=False)[1:-1]
                for text in (elem["pub_url"], elem["title"], elem["doi"])
            )

            out_buf.append(
                TEMPLATE_PUBLICATION.format(
                    pub_id,
                    page,
                    title,
                    doi
                    ).strip()
                )

            # A referenced dataset id that was not loaded into
            # `known_datasets` raises KeyError here.
            dat_list = [ ":{}".format(known_datasets[dat_id]) for dat_id in link_map ]
            out_buf.append("  cito:citesAsDataSource {} ;".format(", ".join(dat_list)))

            # close the statement left open by the template
            out_buf.append(".\n")

   

In [20]:
# Inspect the dataset references built for the last publication that was
# emitted by the loading loop above.
dat_list

[':dataset-301']

In [19]:
# Number of text fragments collected so far (header, statement openings,
# extra predicates and statement terminators each count as one entry).
len(out_buf)

2667

In [23]:
## write the TTL output
filename = "tmp.ttl"

# Turtle documents are defined to be UTF-8, so the encoding is set explicitly
# instead of relying on the platform's locale default (which can corrupt or
# reject non-ASCII titles).
with open(filename, "w", encoding="utf-8") as f:
    # each buffered fragment goes on its own line
    for text in out_buf:
        f.write(text)
        f.write("\n")

# ## load the TTL output as a graph
# graph = rdflib.Graph()
# graph.parse(filename, format="n3")

# ## transform graph into JSON-LD
# with open("corpus/vocab.json", "r") as f:
#     context = json.load(f)

# with open("tmp.jsonld", "wb") as f:
#     f.write(graph.serialize(format="json-ld", context=context, indent=2))

# ## read back
# graph = rdflib.Graph()
# graph.parse("tmp.jsonld", format="json-ld")
