In [57]:
import logging
import rdflib
import pandas as pd
import csv
import os
import jsonlines
import re

#### binding namespaces

In [2]:
from rdflib import Namespace, Graph
from rdflib.namespace import RDF, RDFS, OWL, XSD, DCTERMS

# Module-level graph shared by every function in this notebook.
g = Graph()

# Lexical / linguistic vocabularies.
SCHEMA = Namespace("https://schema.org/")
ONTOLEX = Namespace("http://www.w3.org/ns/lemon/ontolex#")
VARTRANS = Namespace("http://www.w3.org/ns/lemon/vartrans#")
LOC = Namespace("http://id.loc.gov/vocabulary/")
LEXINFO = Namespace("http://www.lexinfo.net/ontology/2.0/lexinfo#")
LIME = Namespace("http://www.w3.org/ns/lemon/lime#")
WORDNET = Namespace("https://globalwordnet.github.io/schemas/wn#")
LEXVO = Namespace("http://lexvo.org/id/term/")
LILA = Namespace("http://lila-erc.eu/ontologies/lila/")

# Wikidata / Wikibase vocabularies.
WIKIENTITY = Namespace("http://www.wikidata.org/entity/")
WIKIPROP = Namespace("http://www.wikidata.org/prop/direct/")
WIKIBASE = Namespace("http://wikiba.se/ontology#")
BIGDATA = Namespace("http://www.bigdata.com/rdf#")
PROP = Namespace("http://www.wikidata.org/prop/")
STATEMENT = Namespace("http://www.wikidata.org/prop/statement/")

# Prefix -> namespace pairs, bound on the graph in the listed order so the
# Turtle output uses readable prefixes.
_PREFIX_BINDINGS = (
    ("rdf", RDF),
    ("rdfs", RDFS),
    ("xsd", XSD),
    ("dct", DCTERMS),
    ("owl", OWL),
    ("schema", SCHEMA),
    ("ontolex", ONTOLEX),
    ("vartrans", VARTRANS),
    ("loc", LOC),
    ("lexinfo", LEXINFO),
    ("lime", LIME),
    ("wn", WORDNET),
    ("lexvo", LEXVO),
    ("lila", LILA),
    ("wd", WIKIENTITY),
    ("wdt", WIKIPROP),
    ("wikibase", WIKIBASE),
    ("bd", BIGDATA),
    ("p", PROP),
    ("ps", STATEMENT),
)

for _prefix, _namespace in _PREFIX_BINDINGS:
    g.bind(_prefix, _namespace)


#### define SPARQL queries

In [3]:
from rdflib.plugins.sparql import prepareQuery

# Pre-parsed SPARQL query used to look up a lemma in the LiLa knowledge base.
# The SERVICE clause targets the remote LiLa SPARQL endpoint. ?entry (written
# representation) and ?pos (LiLa part of speech) are not bound in the query
# text; callers supply them through `initBindings` when executing it.
# The `ontolex:` and `lila:` prefixes are resolved through `initNs`.
lemma_query = prepareQuery(""" SELECT ?lemma 
    WHERE {
        SERVICE <https://lila-erc.eu/sparql/lila_knowledge_base/sparql> {
            ?lemma ontolex:writtenRep ?entry ;
            lila:hasPOS ?pos .             
        }
    }                    
    """, initNs = {'ontolex' : ONTOLEX, 'lila': LILA})

#### dataset

In [62]:
from rdflib import Literal, URIRef, BNode

# Input: JSON Lines dump read by the populate_* functions.
lkgDataset = '../data/lkg/dataset.jsonl'
# Output: Turtle file the final graph is serialized to.
llkgGraph = '../data/llkg/llkg.ttl'
# Node standing for the lexicon itself; used as the subject of the lexicon
# metadata and of every lime:entry triple.
# NOTE(review): this is a Literal used in subject position; RDF subjects are
# normally IRIs or blank nodes -- confirm whether a URIRef was intended.
llkg = Literal('LLKG')

In [68]:
def setup_graph():
    """Add the lexicon-level metadata triples to the module graph ``g``.

    Declares ``llkg`` as a ``lime:Lexicon`` with an English label, a language
    and a contact e-mail, and describes the 'English' language node with its
    label and two LOC ISO 639 identifiers.

    NOTE(review): the 'English' node is a Literal used in subject position;
    confirm whether an IRI was intended.
    """
    english = Literal('English')
    metadata_triples = [
        # The lexicon itself.
        (llkg, RDF.type, LIME.Lexicon),
        (llkg, RDFS.label, Literal('Linked Linguistic Knowledge Graph', lang='en')),
        (llkg, DCTERMS.language, english),
        # The language the lexicon metadata is declared in.
        (english, RDF.type, DCTERMS.LinguisticSystem),
        (english, RDFS.label, Literal('English', lang='en')),
        (english, DCTERMS.identifier, URIRef(LOC+'iso639-1/en')),
        (english, DCTERMS.identifier, URIRef(LOC+'iso639-2/eng')),
        # Contact information.
        (llkg, SCHEMA.email, Literal('e.ghizzota@studenti.uniba.it')),
    ]
    for triple in metadata_triples:
        g.add(triple)

def update_entry(entry):
    """Attach ``entry`` to the lexicon node ``llkg`` via ``lime:entry``.

    Args:
        entry: RDF node to register as an entry of the lexicon.
    """
    membership = (llkg, LIME.entry, entry)
    g.add(membership)


In [69]:
def populate_lemmas():
    """Add every 'Lemma' node of the dataset to the module graph ``g``.

    For each dataset line with ``jtype == 'node'`` and ``label == 'Lemma'``,
    ``lemma_query`` is run with the lemma's written representation and LiLa
    part of speech. Every lemma IRI returned is registered as a lexicon entry
    and described with its type, label, identifiers, written representation
    and LexInfo part of speech.

    Lines whose ``posTag`` has no mapping are skipped with a warning instead
    of aborting the whole run with a ``KeyError``.

    Returns:
        None. The graph ``g`` is modified in place.
    """
    lila_pos = {'N' : LILA.noun, 'ADJ' : LILA.adjective, 'V' : LILA.verb}
    lexinfo_pos = {'N' : LEXINFO.noun , 'ADJ' : LEXINFO.adjective, 'V' : LEXINFO.verb}

    # The context manager closes the reader; no explicit close() is needed.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        lemmas = (line for line in lkg if line['jtype'] == 'node' and line['label'] == 'Lemma')

        # Sequential counter behind the 'lemma_N' identifiers; it advances once
        # per lemma IRI returned by the query, not once per dataset line.
        lemma_id = 1
        for line in lemmas:
            props = line['properties']
            written_rep = props['value']
            pos_tag = props['posTag']

            if pos_tag not in lila_pos:
                logging.warning('Skipping lemma %r: unsupported posTag %r', written_rep, pos_tag)
                continue

            result = g.query(
                lemma_query,
                initBindings={'entry': Literal(written_rep), 'pos': lila_pos[pos_tag]},
            )
            for row in result:
                lemma = row.lemma
                update_entry(lemma)
                g.add((lemma, RDF.type, ONTOLEX.Form))
                g.add((lemma, RDFS.label, Literal(written_rep)))
                # Dataset node id; populate_relations resolves nodes through it.
                g.add((lemma, DCTERMS.identifier, Literal(line['identity'], datatype=XSD.unsignedInt)))
                g.add((lemma, DCTERMS.identifier, Literal('lemma_{}'.format(lemma_id), datatype=XSD.string)))
                # Original author's note: there is no standard for Latin.
                g.add((lemma, ONTOLEX.writtenRep, Literal(written_rep, lang='la')))
                g.add((lemma, LEXINFO.partOfSpeech, lexinfo_pos[pos_tag]))
                lemma_id += 1
    
def populate_lexicalEntries():
    """Add every 'InflectedWord' node of the dataset to the module graph ``g``.

    First describes the 'Latin' language node. Then, for each dataset line
    with ``jtype == 'node'`` and ``label == 'InflectedWord'``, builds a Lexvo
    term IRI from the lower-cased value. The first time an IRI is seen it is
    registered as a lexicon entry and given a type (multiword expression,
    affix or word), a label, a language and a sequential ``le_N`` identifier.
    Every occurrence, first or repeated, contributes its dataset ``identity``
    as an additional ``dct:identifier``.

    The value is percent-encoded before being appended to the IRI. Values
    containing whitespace would otherwise yield an invalid IRI that rdflib
    refuses to serialize as Turtle.

    NOTE(review): the 'Latin' node is a Literal used in subject position;
    confirm whether an IRI was intended.

    Returns:
        None. The graph ``g`` is modified in place.
    """
    from urllib.parse import quote

    latin = Literal('Latin')
    g.add((latin, RDF.type, DCTERMS.LinguisticSystem))
    g.add((latin, RDFS.label, Literal('Latin', lang='en')))
    g.add((latin, DCTERMS.identifier, URIRef(LOC+'iso639-1/la')))
    g.add((latin, DCTERMS.identifier, URIRef(LOC+'iso639-2/lat')))

    # The context manager closes the reader; no explicit close() is needed.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        lexical_entries = (line for line in lkg if line['jtype'] == 'node' and line['label'] == 'InflectedWord')

        # Sequential counter behind the 'le_N' identifiers (distinct words only).
        le_id = 1
        for line in lexical_entries:
            value = line['properties']['value'].lower()
            # Encode spaces and other characters that are illegal inside an IRI.
            word = URIRef(LEXVO+'lat/'+quote(value, safe=''))

            if (word, None, None) not in g:
                update_entry(word)
                if re.search(r'\s', value):
                    g.add((word, RDF.type, ONTOLEX.MultiwordExpression))
                elif value.startswith('-') or value.endswith('-'):
                    g.add((word, RDF.type, ONTOLEX.Affix))
                else:
                    g.add((word, RDF.type, ONTOLEX.Word))
                g.add((word, RDFS.label, Literal(value)))
                g.add((word, DCTERMS.language, latin))
                g.add((word, DCTERMS.identifier, Literal('le_{}'.format(le_id), datatype=XSD.string)))
                le_id += 1

            # Recorded for new and repeated words alike, so every dataset node
            # id resolves to its entry when relations are built.
            g.add((word, DCTERMS.identifier, Literal(line['identity'], datatype=XSD.unsignedInt)))

def populate_lexicalSenses():
    """Placeholder for lexical-sense population; adds nothing to the graph.

    Returns:
        int: always ``0``.
    """
    placeholder_result = 0
    return placeholder_result

def populate_relations():
    """Link entries to their lemmas from the dataset's 'HAS_LEMMA' relations.

    For each dataset line with ``jtype == 'relationship'`` and
    ``name == 'HAS_LEMMA'``, the subject and object ids are resolved to graph
    nodes through their ``dct:identifier`` and, when they are distinct nodes,
    an ``ontolex:canonicalForm`` triple is added between them.

    Relations whose subject or object id is not present in the graph are
    skipped. ``g.value`` returns ``None`` for them, and they previously ended
    up as bogus ``<None>`` nodes.

    Returns:
        None. The graph ``g`` is modified in place.
    """
    # The context manager closes the reader; no explicit close() is needed.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        relations = (line for line in lkg if line['jtype'] == 'relationship')

        for line in relations:
            if line['name'] != 'HAS_LEMMA':
                continue

            subj = g.value(predicate=DCTERMS.identifier, object=Literal(line['subject'], datatype=XSD.unsignedInt))
            obj = g.value(predicate=DCTERMS.identifier, object=Literal(line['object'], datatype=XSD.unsignedInt))

            # An endpoint that was never added to the graph cannot be linked.
            if subj is None or obj is None:
                continue
            # A node is not linked to itself.
            if subj != obj:
                g.add((subj, ONTOLEX.canonicalForm, obj))
    


In [70]:
# Remove every triple (all-wildcard pattern) so that re-running this cell
# rebuilds the graph from scratch instead of adding to earlier runs.
g.remove((None, None, None))

# Build the graph. Order matters: populate_relations resolves nodes through
# the identifiers added by populate_lemmas and populate_lexicalEntries.
setup_graph()
populate_lemmas()
populate_lexicalEntries()
populate_relations()

# Write the graph as Turtle to `llkgGraph`. What gets printed is the return
# value of serialize(), not the Turtle text.
print(g.serialize(destination=llkgGraph,format='ttl'))

[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].
