In [17]:
from rdflib import Graph, Namespace, RDF

def ttl_corpora_tags(filename:str) -> list[dict]:
    """Extract annotated documents from a NIF file serialised as Turtle.

    Every resource typed ``nif:Context`` yields one entry.  Its text is taken
    from ``nif:isString`` and its tags from all resources that point back to
    it through ``nif:referenceContext``.

    Args:
        filename: Source passed to ``Graph.parse`` with ``format="turtle"``.

    Returns:
        A list of ``{"corpus": str | None, "tags": list[dict]}`` entries.
        A tag may carry ``text`` (``nif:anchorOf``), ``beginIndex`` /
        ``endIndex`` (``nif:beginIndex`` / ``nif:endIndex``, converted with
        ``int``) and ``uri`` (``itsrdf:taIdentRef``); only the properties
        present in the graph are set, and a span with none of them is
        skipped.  When a property has several values, the last one iterated
        wins.
    """
    nif = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
    itsrdf = Namespace("http://www.w3.org/2005/11/its/rdf#")

    graph = Graph()
    graph.parse(filename, format="turtle")

    # (output key, predicate, converter), listed in the order the keys are
    # written into each tag dictionary.
    span_fields = (
        ("text", nif.anchorOf, str),
        ("beginIndex", nif.beginIndex, int),
        ("endIndex", nif.endIndex, int),
        ("uri", itsrdf.taIdentRef, str),
    )

    documents = []
    for ctx in graph.subjects(RDF.type, nif.Context):
        corpus = None
        for literal in graph.objects(ctx, nif.isString):
            corpus = str(literal)

        tags = []
        for span in graph.subjects(nif.referenceContext, ctx):
            tag = {}
            for key, predicate, convert in span_fields:
                for value in graph.objects(span, predicate):
                    tag[key] = convert(value)
            # A span that exposes none of the properties is not a usable tag.
            if tag:
                tags.append(tag)

        documents.append({"corpus": corpus, "tags": tags})
    return documents

In [None]:
import re
def tsv_to_json(filename):
    """Convert a token-per-line, tab-separated annotation file into documents.

    The file is processed line by line:

    * a line starting with ``-DOCSTART-`` closes the current document (if it
      has any text) and starts a new one;
    * a line with exactly four tab-separated fields is read as
      ``token, mention_type, mention_string, entity``;
    * any other non-empty line without tabs is appended to the text as is;
    * blank lines and tabbed lines with a different field count are skipped.

    Tokens are joined with single spaces and the space in front of ``,`` and
    ``.`` is removed.  Rows whose entity is ``--NME--`` contribute text but
    no tag.  A row with ``mention_type == "B"`` opens a new tag; any other
    linked row extends the most recent tag of the current document, or opens
    a new tag when the document has none yet.

    Args:
        filename: Path of the TSV file; it is opened as UTF-8.

    Returns:
        A list of ``{"corpus": str, "tags": list[dict]}`` entries, one per
        document, including the last document of the file.  Each tag has the
        keys ``text``, ``beginIndex``, ``endIndex`` and ``uri`` (the entity
        prefixed with ``https://en.wikipedia.org/wiki/``).
    """
    data = []
    current_doc_corpus = ""
    current_doc_tags = []

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Resolve literal backslash escapes (e.g. "\u00fc").  The
            # unicode_escape codec reads the bytes as Latin-1, so genuine
            # UTF-8 characters come out garbled; the latin1 -> utf-8 round
            # trip below restores them.
            line = line.strip().encode("utf-8").decode("unicode_escape")
            try:
                line = line.encode("latin1").decode("utf-8")
            except UnicodeError:
                # The line holds characters that do not survive the round
                # trip (e.g. produced by an escape); keep it as decoded.
                pass
            if line.startswith('-DOCSTART-'):
                if current_doc_corpus:
                    data.append({"corpus":current_doc_corpus.strip(), "tags":current_doc_tags})
                    current_doc_corpus = ""
                    current_doc_tags = []
                continue
            if not line:
                continue
            if '\t' in line:
                parts = line.split('\t')
                if len(parts) == 4:
                    token, mention_type, mention_string, entity = parts
                    if entity != "--NME--":
                        # NOTE(review): offsets are taken before the
                        # punctuation collapse at the end of the loop body, so
                        # a mention token that itself starts with "," or "."
                        # would be shifted by one character.
                        if mention_type == "B" or not current_doc_tags:
                            # Either a regular mention start, or a
                            # continuation row with nothing to continue:
                            # open a new tag instead of indexing an empty list.
                            current_doc_tags.append(
                                {
                                    "text": token,
                                    "beginIndex": len(current_doc_corpus),
                                    "endIndex": len(current_doc_corpus) + len(token),
                                    "uri": f"https://en.wikipedia.org/wiki/{entity}"
                                }
                            )
                        else:
                            current_doc_tags[-1]['text'] += f" {token}"
                            current_doc_tags[-1]['endIndex'] = len(current_doc_corpus) + len(token)
                    current_doc_corpus += f"{token} "
            else:
                current_doc_corpus += f"{line} "
            current_doc_corpus = current_doc_corpus.replace(" ,",",").replace(" .",".")

    # The last document is not followed by another -DOCSTART- line, so it has
    # to be emitted here; otherwise it would be silently dropped.
    if current_doc_corpus:
        data.append({"corpus":current_doc_corpus.strip(), "tags":current_doc_tags})
    return data

In [19]:
import json
# Convert each Turtle corpus into ../../data/jsons/<name>.json.
for file in [
    "../../data/N3/RSS-500.ttl",
    "../../data/N3/Reuters-128.ttl",
    "../../data/MSNBC/MSNBCt.ttl",
    "../../data/OKE/evaluation-dataset-task1.ttl"
    ]:

    # Keep only documents that have both text and at least one tag.
    data = [x for x in ttl_corpora_tags(file) if x['corpus'] and x['tags']]
    # Base name of the input: no directory, nothing after the first dot.
    fname_short = file.split("/")[-1].split(".")[0]
    # ensure_ascii=False writes non-ASCII characters verbatim (the file is
    # opened as UTF-8) instead of \uXXXX escapes, the same way the KORE50
    # export is written.
    with open(f"../../data/jsons/{fname_short}.json","w",encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

In [20]:
# Convert the KORE50 TSV corpus into ../../data/jsons/<name>.json.
for file in ("../../data/KORE50/KORE50.tsv",):
    # Drop documents that lack text or have no tags at all.
    data = list(filter(lambda x: x['corpus'] and x['tags'], tsv_to_json(file)))
    # Last path component, cut at its first dot.
    fname_short = file.rsplit("/", 1)[-1].partition(".")[0]
    with open(f"../../data/jsons/{fname_short}.json", "w+", encoding="utf-8") as f:
        # Non-ASCII characters are written as is rather than escaped.
        json.dump(data, f, indent=4, ensure_ascii=False)