In [43]:
import logging
import pandas as pd
import csv
import jsonlines
import re
from urllib.parse import quote
import os

#### binding namespaces

In [60]:
from rdflib import Namespace, Graph, Literal, URIRef, BNode
from rdflib.namespace import RDF, RDFS, OWL, XSD, DCTERMS

# Single in-memory graph that every cell below reads from and writes to.
g = Graph()

# --- Linguistic / lexical vocabularies ---------------------------------------
SCHEMA = Namespace("https://schema.org/")
ONTOLEX = Namespace("http://www.w3.org/ns/lemon/ontolex#")
VARTRANS = Namespace("http://www.w3.org/ns/lemon/vartrans#")
LEXINFO = Namespace("http://www.lexinfo.net/ontology/2.0/lexinfo#")
LIME = Namespace("http://www.w3.org/ns/lemon/lime#")
WORDNET = Namespace("https://globalwordnet.github.io/schemas/wn#")
LEXVO = Namespace("http://lexvo.org/id/term/")
LVONT = Namespace("http://lexvo.org/ontology#")
LILA = Namespace("http://lila-erc.eu/ontologies/lila/")
# NOTE(review): this is not the W3C SKOS core namespace
# (http://www.w3.org/2004/02/skos/core#). Presumably it mirrors the namespace
# used inside the Lexvo dump that languageNodes() parses - confirm before changing.
SKOS = Namespace("http://www.w3.org/2008/05/skos#")

# --- Wikidata vocabularies -----------------------------------------------------
WIKIENTITY = Namespace("http://www.wikidata.org/entity/")
WIKIPROP = Namespace("http://www.wikidata.org/prop/direct/")
WIKIBASE = Namespace("http://wikiba.se/ontology#")

# Placeholder namespace for project-specific resources and properties.
DUMMY = Namespace("http://dummy.com/")

# Register every prefix on the graph, in a fixed order, so that serialized
# output uses the short prefixed form.
for _prefix, _namespace in (
    ("rdf", RDF),
    ("rdfs", RDFS),
    ("xsd", XSD),
    ("dct", DCTERMS),
    ("owl", OWL),
    ("schema", SCHEMA),
    ("ontolex", ONTOLEX),
    ("vartrans", VARTRANS),
    ("lexinfo", LEXINFO),
    ("lime", LIME),
    ("wn", WORDNET),
    ("lexvo", LEXVO),
    ("lvont", LVONT),
    ("lila", LILA),
    ("skos", SKOS),
    ("wd", WIKIENTITY),
    ("wdt", WIKIPROP),
    ("wikibase", WIKIBASE),
    ("dummy", DUMMY),
):
    g.bind(_prefix, _namespace)
del _prefix, _namespace  # keep the notebook namespace free of loop leftovers

logger = logging.getLogger(__name__)
# Resource standing for the knowledge graph (lexicon) itself.
llkg = URIRef(DUMMY["LLKG"])
# Turtle file the graph is written to by the driver cells.
llkgGraph = '../data/llkg/llkg.ttl'

#### graph setup

In [61]:
def setupGraph():
    """Describe the LLKG resource itself: lexicon type, English label and contact e-mail."""
    lexiconTriples = (
        (llkg, RDF.type, LIME.Lexicon),
        (llkg, RDFS.label, Literal('Linked Linguistic Knowledge Graph', lang='en')),
        (llkg, SCHEMA.email, Literal('e.ghizzota@studenti.uniba.it')),
    )
    for triple in lexiconTriples:
        g.add(triple)

def updateEntry(entry):
    """Attach ``entry`` to the LLKG lexicon through ``lime:entry``."""
    membership = (llkg, LIME.entry, entry)
    g.add(membership)


#### define SPARQL queries

In [62]:
# Federated query against the LiLA SPARQL endpoint: selects the ?lemma whose
# ontolex:writtenRep is ?entry and whose lila:hasPOS is ?pos.
# ?entry and ?pos are not bound in the query text; lemmaNodes() supplies them
# through `initBindings`, and the `ontolex:` / `lila:` prefixes through `initNs`.
lemma_query = (""" SELECT ?lemma 
    WHERE {
        SERVICE <https://lila-erc.eu/sparql/lila_knowledge_base/sparql> {
            ?lemma ontolex:writtenRep ?entry ;
            lila:hasPOS ?pos .             
        }
    }""")

# str.format() template: the single `{}` receives a title (tagged @la), while the
# doubled braces `{{` / `}}` are literal SPARQL braces after formatting.
# The `wdt:`, `wd:`, `wikibase:` and `bd:` prefixes are not declared in the text,
# so whoever runs the query has to provide them.
# NOTE(review): no active code in the visible notebook uses this template; the
# documentNodes() stub that mentions a SPARQL query is disabled.
document_query = '''SELECT ?document
    WHERE {{
        VALUES ?title {{ "{}"@la}}
        ?document wdt:P31 wd:Q7725634 ;
           wdt:P1476 ?title.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}   
    }} ''' 


#### dataset - EtymWN

In [65]:
# Folder holding the EtymWN inputs read by the functions below
# (the Lexvo dump under lexvo/, words.csv and relations.csv).
etymFolder = '../data/etymwn' 

def languageNodes():
    """Copy every Lexvo language into the shared graph ``g``.

    Parses the Lexvo N-Triples dump found under ``etymFolder`` and, for each
    resource typed ``lvont:Language``, adds a ``dct:LinguisticSystem`` node with:

    * an ``rdfs:label`` tagged ``@en`` (the English ``skos:prefLabel`` when one
      exists, otherwise whichever preferred label is available);
    * ``dummy:iso6391`` / ``dummy:iso6392`` / ``dummy:iso6393`` string literals,
      only for the codes the dump actually provides.

    Finally links the LLKG lexicon to the node labelled "English"@en via
    ``dct:language``.

    Values that are absent from the dump are skipped: wrapping a missing value
    in ``Literal`` would store the text "None" as if it were real data.
    """
    logger.info('Generating language nodes...')

    # (predicate written to g, predicate read from the Lexvo dump)
    isoCodes = (
        (DUMMY.iso6391, LVONT.iso639P1Code),
        (DUMMY.iso6392, LVONT.iso6392TCode),
        (DUMMY.iso6393, LVONT.iso639P3PCode),
    )

    l = Graph()
    try:
        l.parse(os.path.join(etymFolder, 'lexvo/lexvo_2013-02-09.nt'))

        for subj in l.subjects(predicate=RDF.type, object=LVONT.Language):
            language = URIRef(str(subj))
            g.add((language, RDF.type, DCTERMS.LinguisticSystem))

            # Prefer the label that really is English, since it gets an @en tag.
            labels = list(l.objects(subject=subj, predicate=SKOS.prefLabel))
            label = next((lab for lab in labels if getattr(lab, 'language', None) == 'en'), None)
            if label is None and labels:
                label = labels[0]
            if label is not None:
                g.add((language, RDFS.label, Literal(str(label), lang='en')))

            for target, source in isoCodes:
                # any=False keeps the original strictness: rdflib raises when a
                # language carries more than one value for the same code.
                code = l.value(subject=subj, predicate=source, object=None, any=False)
                if code is not None:
                    g.add((language, target, Literal(str(code), datatype=XSD.string)))
    finally:
        # Release the temporary graph even when parsing or copying fails.
        l.close()

    english = g.value(subject=None, predicate=RDFS.label, object=Literal("English", lang='en'))
    if english is None:
        # g.add() rejects None, so report the problem instead of failing obscurely.
        logger.warning('No language node labelled "English"@en found; lexicon language not set')
    else:
        g.add((llkg, DCTERMS.language, english))

    logger.info('Serializing...')
    # NOTE(review): the serialized text is discarded; the driver cell is what
    # writes the graph to disk.
    g.serialize(format='ttl')

def etymNodes():
    """Create one lexical-entry node per row of ``words.csv``.

    Each row is read as ``identifier, word, language code``. The node URI is
    ``LEXVO + <language code>/<percent-encoded word>``. The first time a URI is
    seen it is registered as a lexicon entry, typed (multiword expression if it
    contains whitespace, affix if it starts or ends with ``-``, word otherwise),
    labelled, and linked to its language node. Every row, new or not, adds its
    identifier as a ``dct:identifier`` so that etymRelations() can resolve it.

    ``dct:language`` points at the language node itself (found through its
    ``dummy:iso6393`` code) rather than at a literal wrapping that node's URI;
    when no language node matches, the triple is omitted. Rows with fewer than
    three columns are skipped.
    """
    logger.info('Generating words nodes...')

    # Language code -> language node (or None). Language nodes are not modified
    # here, so each code needs to be resolved only once.
    languageByCode = {}

    # newline='' is what the csv module expects from the file object it reads.
    with open(os.path.join(etymFolder, 'words.csv'), mode='r', encoding='utf-8', newline='') as file:
        for value in csv.reader(file):
            if len(value) < 3:
                continue  # blank or malformed row

            identifier, wordString, languageCode = value[0], str(value[1]), value[2]
            word = URIRef(LEXVO + languageCode + '/' + quote(wordString))

            if (word, None, None) not in g:
                updateEntry(word)
                if re.search(r'\s', wordString):
                    g.add((word, RDF.type, ONTOLEX.MultiwordExpression))
                elif wordString.startswith('-') or wordString.endswith('-'):
                    g.add((word, RDF.type, ONTOLEX.Affix))
                else:
                    g.add((word, RDF.type, ONTOLEX.Word))
                g.add((word, RDFS.label, Literal(wordString, datatype=XSD.string)))

                if languageCode not in languageByCode:
                    languageByCode[languageCode] = g.value(
                        subject=None,
                        predicate=DUMMY.iso6393,
                        object=Literal(languageCode, datatype=XSD.string),
                    )
                language = languageByCode[languageCode]
                if language is not None:
                    g.add((word, DCTERMS.language, language))

            # Added for new and already-known words alike.
            g.add((word, DCTERMS.identifier, Literal(identifier, datatype=XSD.string)))

    logger.info('Serializing...')
    g.serialize(format='ttl')

In [None]:
def etymRelations():
    """Link word nodes according to ``relations.csv``.

    Each row is read as ``subject identifier, relation, object identifier``.
    Both identifiers are resolved to nodes through their ``dct:identifier``
    (as written by etymNodes()). Recognised relations are mapped to a
    ``dummy:`` predicate; ``has_derived_form`` additionally adds the inverse
    ``dummy:is_derived_from`` triple. Unrecognised relations are ignored.

    Rows whose subject or object cannot be resolved are skipped and counted,
    instead of being turned into triples about a bogus ``None`` resource.
    """
    logger.info('Connecting nodes...')

    predicates = {
        'etymology': DUMMY.etymology,
        'etymological_origin_of': DUMMY.etymologicalOriginOf,
        'etymologically_related': DUMMY.etymologically_related,
        'has_derived_form': DUMMY.has_derived_form,
        'variant:orthography': DUMMY.orthographyVariant,
    }

    unresolved = 0
    # Same encoding as words.csv; newline='' as the csv module expects.
    with open(os.path.join(etymFolder, 'relations.csv'), 'r', encoding='utf-8', newline='') as file:
        for line in csv.reader(file):
            if len(line) < 3:
                continue  # blank or malformed row

            relation = line[1]
            predicate = predicates.get(relation)
            if predicate is None:
                continue

            subj = g.value(predicate=DCTERMS.identifier, object=Literal(line[0], datatype=XSD.string))
            obj = g.value(predicate=DCTERMS.identifier, object=Literal(line[2], datatype=XSD.string))
            if subj is None or obj is None:
                unresolved += 1
                continue

            g.add((subj, predicate, obj))
            if relation == 'has_derived_form':
                g.add((obj, DUMMY.is_derived_from, subj))

    if unresolved:
        logger.warning('Skipped %d relations with an unknown subject or object', unresolved)

    logger.info('Nodes successfully connected!')
    logger.info('Serializing...')
    g.serialize(format='ttl')

In [66]:
# Uncomment to empty the graph before rebuilding it:
#g.remove((None, None, None))

# Build the EtymWN part of the graph: language nodes first, because etymNodes()
# looks languages up by the ISO code that languageNodes() stores.
# NOTE(review): neither setupGraph() nor etymRelations() is called in this cell.
languageNodes()
etymNodes()

# Write the graph to disk as Turtle.
g.serialize(destination=llkgGraph, format='ttl')

2024-03-25 00:41:33,338 Generating language nodes...
2024-03-25 00:42:09,293 Serializing...
2024-03-25 00:42:11,230 Generating words nodes...


#### dataset - LKG

In [3]:
# JSON Lines export of the LKG; the functions below read `jtype`, `label`,
# `identity` and `properties` for nodes, and `name`, `subject`, `object` for relationships.
lkgDataset = '../data/lkg/dataset.jsonl'
# Folder with the TSV files that map LKG authors and occupations to Wikidata ids.
wikidataMap = '../data/lkg/wikidata_metadata/'

In [6]:
# Language node labelled "Latin"@en, reused as the dct:language of LKG entries and texts.
# The lookup runs when this cell executes, so the language nodes must already be in `g`.
# NOTE(review): if no such node exists the lookup presumably yields None, and this
# line then silently produces URIRef('None') - verify the graph is populated first.
latin = URIRef(str(g.value(subject=None, predicate=RDFS.label, object=Literal('Latin', lang='en'), any=False)))

def lemmaNodes():
    """Create a form node for every LKG lemma that the LiLA endpoint knows.

    For each ``Lemma`` node of the dataset, ``lemma_query`` is run with the
    lemma's written form and its LiLA part of speech bound. Every lemma URI
    returned is typed ``ontolex:Form`` and receives a label, the dataset
    identity, a sequential ``lemma_N`` identifier, a Latin written
    representation, a LexInfo part of speech and, when available, the Latin
    language node.

    Lemmas whose POS tag is not one of ``N``, ``ADJ``, ``V`` are skipped with
    a warning instead of aborting the whole run.
    """
    logger.info('Generating lemma nodes...')

    lilaPosMapping = {'N' : LILA.noun, 'ADJ' : LILA.adjective, 'V' : LILA.verb}
    lexinfoPosMapping = {'N' : LEXINFO.noun , 'ADJ' : LEXINFO.adjective, 'V' : LEXINFO.verb}

    # Resolved once: nothing in this function adds an "@en"-tagged label, so the
    # answer cannot change while lemmas are being added.
    latinNode = g.value(subject=None, predicate=RDFS.label, object=Literal("Latin", lang='en'))
    if latinNode is None:
        logger.warning('No language node labelled "Latin"@en found; lemmas will have no dct:language')

    lemmaID = 1
    # The context manager closes the reader; no explicit close() is needed.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] != 'node' or line['label'] != 'Lemma':
                continue

            props = line['properties']
            writtenForm = props['value']
            posTag = props.get('posTag')
            if posTag not in lilaPosMapping:
                logger.warning('Skipping lemma %r: unsupported POS tag %r', writtenForm, posTag)
                continue

            result = g.query(
                lemma_query,
                initNs={'ontolex': ONTOLEX, 'lila': LILA},
                initBindings={'entry': Literal(writtenForm), 'pos': lilaPosMapping[posTag]},
            )
            for r in result:
                lemma = r.lemma
                g.add((lemma, RDF.type, ONTOLEX.Form))
                g.add((lemma, RDFS.label, Literal(writtenForm)))
                g.add((lemma, DCTERMS.identifier, Literal(line['identity'], datatype=XSD.unsignedInt)))
                g.add((lemma, DCTERMS.identifier, Literal('lemma_{}'.format(lemmaID), datatype=XSD.string)))
                g.add((lemma, ONTOLEX.writtenRep, Literal(writtenForm, lang='la')))
                g.add((lemma, LEXINFO.partOfSpeech, lexinfoPosMapping[posTag]))
                if latinNode is not None:
                    g.add((lemma, DCTERMS.language, latinNode))
                lemmaID += 1

    logger.info('Serializing...')
    g.serialize(format='ttl')
    
def entryNodes():
    """Create one lexical-entry node per ``InflectedWord`` of the LKG dataset.

    The lower-cased word is percent-encoded and appended to ``LEXVO + 'lat/'``,
    the same URI scheme etymNodes() uses, so a word present in both datasets
    maps to a single node and words containing spaces or non-ASCII characters
    still yield a valid URI. New nodes are registered as lexicon entries, typed
    (multiword expression / affix / word), labelled, marked as Latin and given
    a sequential ``le_N`` identifier. The dataset identity is added as a
    ``dct:identifier`` for new and already-known words alike.
    """
    logger.info('Generating entries nodes...')

    leID = 1
    # The context manager closes the reader; no explicit close() is needed.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] != 'node' or line['label'] != 'InflectedWord':
                continue

            value = line['properties']['value'].lower()
            word = URIRef(LEXVO + 'lat/' + quote(value))

            if (word, None, None) not in g:
                updateEntry(word)
                if re.search(r'\s', value):
                    g.add((word, RDF.type, ONTOLEX.MultiwordExpression))
                elif value.startswith('-') or value.endswith('-'):
                    g.add((word, RDF.type, ONTOLEX.Affix))
                else:
                    g.add((word, RDF.type, ONTOLEX.Word))
                g.add((word, RDFS.label, Literal(value)))
                g.add((word, DCTERMS.language, latin))
                g.add((word, DCTERMS.identifier, Literal('le_{}'.format(leID), datatype=XSD.string)))
                leID += 1

            # Needed by lkgRelations() whether or not the word was already known.
            g.add((word, DCTERMS.identifier, Literal(line['identity'], datatype=XSD.unsignedInt)))

    logger.info('Serializing...')
    g.serialize(format='ttl')

def senseNodes():
    """Create a lexical-sense node per ``LexiconConcept`` of the LKG dataset.

    Two source resources are declared (Lewis-Short and Latin WordNet). Concepts
    are handled according to their ``resource`` property:

    * Lewis-Short: URI from ``id``, description from ``alias``, ``ls_N`` identifier;
    * Latin WordNet: URI from ``alias``, description from ``gloss``, ``lwn_N`` identifier.

    Each sense is typed ``ontolex:LexicalSense``, linked to its source resource
    and given the dataset identity as a further identifier. Concepts from any
    other resource are ignored.
    """
    logger.info('Generating lexical sense nodes...')

    LS = URIRef("https://lila-erc.eu/data/lexicalResources/LewisShort/Lexicon")
    g.add((LS, RDF.type, RDFS.Resource))
    g.add((LS, RDFS.label, Literal('Lewis-Short Dictionary', lang='en')))
    LWN = URIRef("https://lila-erc.eu/data/lexicalResources/LatinWordNet/Lexicon")
    g.add((LWN, RDF.type, RDFS.Resource))
    g.add((LWN, RDFS.label, Literal('Latin WordNet', lang='en')))

    # resource name -> (source node, property holding the URI,
    #                   property holding the description, identifier prefix)
    layouts = {
        'Lewis-Short Dictionary': (LS, 'id', 'alias', 'ls'),
        'Latin WordNet': (LWN, 'alias', 'gloss', 'lwn'),
    }
    # One independent counter per resource, each starting at 1.
    counters = {resource: 1 for resource in layouts}

    # The context manager closes the reader; no explicit close() is needed.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] != 'node' or line['label'] != 'LexiconConcept':
                continue

            props = line['properties']
            resource = props['resource']
            if resource not in layouts:
                continue
            source, uriKey, descriptionKey, prefix = layouts[resource]

            sense = URIRef(props[uriKey])
            g.add((sense, RDF.type, ONTOLEX.LexicalSense))
            g.add((sense, DCTERMS.source, source))
            g.add((sense, DCTERMS.description, Literal(props[descriptionKey], lang='en')))
            g.add((sense, DCTERMS.identifier, Literal(line['identity'], datatype=XSD.unsignedInt)))
            g.add((sense, DCTERMS.identifier, Literal('{}_{}'.format(prefix, counters[resource]), datatype=XSD.string)))
            counters[resource] += 1

    logger.info('Serializing...')
    g.serialize(format='ttl')

def authorNodes():
    """Create a ``schema:Person`` node for every LKG author mapped to Wikidata.

    The author mapping TSV provides name, last name and Wikidata id. Each
    ``Person`` node of the dataset is matched by name (and last name when it
    has one); the first matching id becomes the node URI under ``WIKIENTITY``.
    Matched authors receive given name, family name (if any) and the dataset
    identity as ``dct:identifier``. Authors without a mapping are skipped.

    Only the author node is used as a triple subject: RDF does not allow
    literals in subject position, so the name strings are attached as objects only.
    """
    logger.info('Generating author nodes...')

    authors_df = pd.read_csv(os.path.join(wikidataMap, 'latinISE_author_mapping.tsv'), sep='\t', header=None, usecols=[2,3,4,5], names=['name', 'lastname', 'title', 'id'])
    # Rows without an id cannot be turned into a Wikidata URI.
    authors_df = authors_df.dropna(subset=['id'])
    authors_df = authors_df.drop_duplicates(subset=['id'])
    authors_df = authors_df.fillna('')

    # The context manager closes the reader; no explicit close() is needed.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] != 'node' or line['label'] != 'Person':
                continue

            name = line['properties']['name']
            lastname = line['properties']['lastname']

            matches = authors_df['name'] == name
            if lastname:
                matches = matches & (authors_df['lastname'] == lastname)
            wikiEntity = authors_df.loc[matches, 'id'].values

            if wikiEntity.size == 0 or not str(wikiEntity[0]):
                continue

            author = URIRef(WIKIENTITY + str(wikiEntity[0]))
            g.add((author, RDF.type, SCHEMA.Person))
            g.add((author, SCHEMA.givenName, Literal(name)))
            # Truthiness also covers a last name that is None, not just ''.
            if lastname:
                g.add((author, SCHEMA.familyName, Literal(lastname)))
            g.add((author, DCTERMS.identifier, Literal(line['identity'], datatype=XSD.unsignedInt)))

    logger.info('Serializing...')
    g.serialize(format='ttl')

def occupationNodes():
    """Create a ``schema:Occupation`` node per ``Occupation`` of the LKG dataset.

    ``occupations_map.tsv`` is read as ``Wikidata id <TAB> occupation name`` and
    turned into a name -> id dictionary. Each occupation of the dataset is then
    given the URI ``WIKIENTITY + id``, its name as label and the dataset identity
    as ``dct:identifier``. Occupations missing from the map are logged and
    skipped rather than aborting the run.
    """
    logger.info('Creating dictionary...')

    occupationDict = {}
    # `with` guarantees the file is closed even if a row cannot be processed.
    with open(os.path.join(wikidataMap, 'occupations_map.tsv'), encoding='utf-8', mode='r', newline='') as file:
        for row in csv.reader(file, delimiter='\t'):
            if len(row) >= 2:
                occupationDict[row[1]] = row[0]

    logger.info('Dictionary created')
    logger.info('Generating occupation nodes...')

    # The context manager closes the reader; no explicit close() is needed.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] != 'node' or line['label'] != 'Occupation':
                continue

            value = line['properties']['value']
            entityId = occupationDict.get(value)
            if not entityId:
                logger.warning('No Wikidata mapping for occupation %r; skipping', value)
                continue

            occupation = URIRef(WIKIENTITY + entityId)
            g.add((occupation, RDF.type, SCHEMA.Occupation))
            g.add((occupation, RDFS.label, Literal(value, datatype=XSD.string)))
            g.add((occupation, DCTERMS.identifier, Literal(line['identity'], datatype=XSD.unsignedInt)))

    logger.info('Nodes generated')

    logger.info('Serializing...')
    g.serialize(format='ttl')

def textNodes():
    """Create a ``schema:Quotation`` node for every text that is an occurrence.

    A first pass over the dataset collects the ``object`` ids of all
    ``HAS_OCCURRENCE`` relationships; a second pass turns each node with one of
    those ids into a quotation carrying its text and the Latin language node.

    Each text gets its own URI, ``DUMMY + 'text_N'`` with ``N`` increasing, so
    that texts stay distinct and the subject is a resource rather than a
    literal. The dataset identity is stored as ``dct:identifier``, as for every
    other node type, because lkgRelations() resolves nodes through it.
    """
    logger.info('Generating text nodes...')

    # A set makes the per-node membership test constant-time.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        ids = {line['object'] for line in lkg if line['jtype'] == 'relationship' and line['name'] == 'HAS_OCCURRENCE'}

    textID = 1
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] != 'node' or line['identity'] not in ids:
                continue

            localId = 'text_{}'.format(textID)
            text = DUMMY[localId]
            g.add((text, RDF.type, SCHEMA.Quotation))
            g.add((text, SCHEMA.text, Literal(line['properties']['value'], datatype=XSD.string)))
            g.add((text, DCTERMS.language, latin))
            g.add((text, DCTERMS.identifier, Literal(line['identity'], datatype=XSD.unsignedInt)))
            g.add((text, DCTERMS.identifier, Literal(localId, datatype=XSD.string)))
            textID += 1

# Disabled, unfinished stub: the whole definition sits inside a string literal,
# so nothing here is executed and no documentNodes() function exists.
# NOTE(review): the text would not be valid Python if simply un-quoted (the body
# of `with` is not indented and the loop body holds only a comment).
'''def documentNodes():
    with jsonlines.open(lkgDataset, 'r') as lkg:
    documents = [line for line in lkg if line['jtype'] == 'node' and line['label'] == 'Document']

    for line in documents:
        #QUERY SPARQL'''

def corpusNodes():
    """Create a ``schema:Collection`` node per ``Corpus`` of the LKG dataset.

    Corpus nodes are selected by their ``label`` (the node type), like every
    other node kind in this notebook; ``identity`` holds the node id and never
    equals the string 'Corpus'. Each corpus gets the URI ``DUMMY + 'c_N'`` so
    that the subject is a resource rather than a literal, its name as
    ``rdfs:label``, and both the dataset identity and ``c_N`` as
    ``dct:identifier`` so that lkgRelations() can resolve it.
    """
    logger.info('Generating corpus nodes...')

    cID = 1
    # The context manager closes the reader; no explicit close() is needed.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] != 'node' or line['label'] != 'Corpus':
                continue

            localId = 'c_{}'.format(cID)
            corpus = DUMMY[localId]
            g.add((corpus, RDF.type, SCHEMA.Collection))
            g.add((corpus, RDFS.label, Literal(line['properties']['name'], datatype=XSD.string)))
            g.add((corpus, DCTERMS.identifier, Literal(line['identity'], datatype=XSD.unsignedInt)))
            g.add((corpus, DCTERMS.identifier, Literal(localId, datatype=XSD.string)))
            cID += 1

    logger.info('Serializing...')
    g.serialize(format='ttl')


In [10]:
def lkgRelations():
    """Connect the LKG nodes according to the dataset's relationships.

    For every ``relationship`` line, subject and object are resolved to graph
    nodes through their ``dct:identifier`` (typed ``xsd:unsignedInt``) and a
    triple is added depending on the relationship name:

    * ``HAS_LEMMA``: ``ontolex:canonicalForm`` (not for self-links);
    * ``HAS_CONCEPT``: ``ontolex:sense`` and the inverse ``ontolex:isSenseOf``,
      only when the subject has an ``rdf:type`` other than ``ontolex:Form``;
    * ``HAS_EXAMPLE``: ``wn:example``, plus ``dummy:grade`` on the example when
      the relationship carries a grade;
    * the remaining names map one-to-one to a predicate (see ``simplePredicates``).

    Relationships with an unresolved subject or object are skipped and counted,
    instead of producing triples about a bogus ``None`` resource.
    """
    logger.info('Connecting nodes...')

    # Relationship name -> predicate, for relations that add a single triple.
    # NOTE(review): schema:datePublished is given a node as object here -
    # confirm that this is intended rather than a date literal.
    simplePredicates = {
        'HAS_SUBCLASS': VARTRANS.senseRel,
        'SAME_AS': OWL.sameAs,
        'HAS_OCCUPATION': SCHEMA.hasOccupation,
        'HAS_OCCURRENCE': DCTERMS.isPartOf,
        'HAS_AUTHOR': SCHEMA.author,
        'PUBLISHED_IN': SCHEMA.datePublished,
        'BELONG_TO': SCHEMA.isPartOf,
    }

    def nodeFor(identity):
        """Return the node carrying ``identity`` as dct:identifier, or None."""
        return g.value(predicate=DCTERMS.identifier, object=Literal(identity, datatype=XSD.unsignedInt))

    unresolved = 0
    # The context manager closes the reader; no explicit close() is needed.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] != 'relationship':
                continue

            relation = line['name']
            subj = nodeFor(line['subject'])
            obj = nodeFor(line['object'])
            if subj is None or obj is None:
                unresolved += 1
                continue

            if relation == 'HAS_LEMMA':
                if subj != obj:
                    g.add((subj, ONTOLEX.canonicalForm, obj))

            elif relation == 'HAS_CONCEPT':
                # According to the Ontolex schema, Form entities are not directly
                # linked to LexicalSense entities. The pair is added once, however
                # many types the subject has.
                if any(t != ONTOLEX.Form for t in g.objects(subject=subj, predicate=RDF.type)):
                    g.add((subj, ONTOLEX.sense, obj))
                    g.add((obj, ONTOLEX.isSenseOf, subj))

            elif relation == 'HAS_EXAMPLE':
                g.add((subj, WORDNET.example, obj))
                grade = (line.get('properties') or {}).get('grade')
                if grade is not None:
                    g.add((obj, DUMMY.grade, Literal(grade, datatype=XSD.float)))

            elif relation in simplePredicates:
                g.add((subj, simplePredicates[relation], obj))
                # POWLA start/end offsets for HAS_OCCURRENCE are not added:
                # the data is unavailable.

    if unresolved:
        logger.warning('Skipped %d relationships with an unknown subject or object', unresolved)

    logger.info('Nodes successfully connected!')
    logger.info('Serializing...')
    g.serialize(format='ttl')

In [None]:
# Uncomment to empty the graph before rebuilding it:
#g.remove((None, None, None))

# Show INFO-level progress messages with a timestamp.
# NOTE(review): this is the only logging configuration in the visible notebook and
# it sits in the last cell, so on a fresh kernel the earlier cells run before it.
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

# The LKG build steps are currently disabled: the calls sit inside a string
# literal and are not executed.
# NOTE(review): the list ends with relations(), which is not defined in the
# visible source; the relation builder defined above is lkgRelations().
# occupationNodes() is not in the list either.
'''lemmaNodes()
entryNodes()
senseNodes()
authorNodes()
textNodes()
corpusNodes()
relations()'''

# Write the graph to disk as Turtle.
g.serialize(destination=llkgGraph,format='ttl')