In [26]:
import logging
import pandas as pd
import csv
import jsonlines
import re
from urllib.parse import quote
from nltk.corpus import wordnet as wn
import os
from py4j.java_gateway import JavaGateway

import nodes
import relations

import importlib

#### binding namespaces

In [27]:
from rdflib import Namespace, Graph, Literal, URIRef, BNode
from rdflib.namespace import RDF, RDFS, OWL, XSD, DCTERMS

# Single shared graph: every builder function in this notebook adds to it.
g = Graph()

# Vocabularies referenced while building the knowledge graph.
SCHEMA = Namespace("https://schema.org/")
ONTOLEX = Namespace("http://www.w3.org/ns/lemon/ontolex#")
VARTRANS = Namespace("http://www.w3.org/ns/lemon/vartrans#")
LEXINFO = Namespace("http://www.lexinfo.net/ontology/2.0/lexinfo#")
LIME = Namespace("http://www.w3.org/ns/lemon/lime#")
WORDNET = Namespace("https://globalwordnet.github.io/schemas/wn#")
LEXVO = Namespace("http://lexvo.org/id/term/")
LVONT = Namespace("http://lexvo.org/ontology#")
UWN = Namespace("http://www.lexvo.org/uwn/entity/s/")
LILA = Namespace("http://lila-erc.eu/ontologies/lila/")
# NOTE(review): the W3C SKOS Reference gives the SKOS namespace as
# http://www.w3.org/2004/02/skos/core# -- confirm the URI below is intended.
SKOS = Namespace("http://www.w3.org/2008/05/skos#")

# Wikidata / Wikibase vocabularies.
WIKIENTITY = Namespace("http://www.wikidata.org/entity/")
WIKIPROP = Namespace("http://www.wikidata.org/prop/direct/")
WIKIBASE = Namespace("http://wikiba.se/ontology#")

# Placeholder namespace for project-local terms (lexicon URI, source-id predicates).
DUMMY = Namespace("http://dummy.com/")

# Register every prefix on the shared graph, in the same order as before.
for _prefix, _namespace in (
    ("rdf", RDF),
    ("rdfs", RDFS),
    ("xsd", XSD),
    ("dct", DCTERMS),
    ("owl", OWL),
    ("schema", SCHEMA),
    ("ontolex", ONTOLEX),
    ("vartrans", VARTRANS),
    ("lexinfo", LEXINFO),
    ("lime", LIME),
    ("wn", WORDNET),
    ("lexvo", LEXVO),
    ("lvont", LVONT),
    ("uwn", UWN),
    ("lila", LILA),
    ("skos", SKOS),
    ("wd", WIKIENTITY),
    ("wdt", WIKIPROP),
    ("wikibase", WIKIBASE),
    ("dummy", DUMMY),
):
    g.bind(_prefix, _namespace)

logger = logging.getLogger(__name__)
llkg = URIRef(DUMMY.LLKG)  # URI of the lexicon resource itself
llkgGraph = '../data/llkg/llkg.ttl'  # file the finished graph is serialised to

#### graph setup

In [28]:
def setupGraph():
    """Add the lexicon-level metadata triples to the shared graph ``g``.

    Declares ``llkg`` as a ``lime:Lexicon`` and attaches its English
    ``rdfs:label`` and a ``schema:email`` contact. Returns ``None``.
    """
    # No g.serialize() here: without a destination it only builds and returns
    # the serialised text, which was being thrown away. The graph is written
    # to disk once, by the driver cell.
    g.add((llkg, RDF.type, LIME.Lexicon))
    g.add((llkg, RDFS.label, Literal('Linked Linguistic Knowledge Graph', lang='en')))
    g.add((llkg, SCHEMA.email, Literal('e.ghizzota@studenti.uniba.it')))

#### dataset - EtymWN

In [29]:
# Directory holding the EtymWN inputs read below (lexvo dump, words.csv, relations.csv).
etymFolder = '../data/etymwn'

def languageNodes():
    """Create the language nodes from the Lexvo dump and set the lexicon language.

    Parses ``lexvo/lexvo_2013-02-09.nt`` (under ``etymFolder``) into a
    temporary graph and hands every subject typed ``lvont:Language`` to
    ``nodes.addLanguageNode``. Afterwards the lexicon ``llkg`` is linked via
    ``dcterms:language`` to the subject in ``g`` labelled ``"English"@en``.

    Raises:
        LookupError: if no subject in ``g`` carries the label ``"English"@en``.
    """
    logger.info('Generating language nodes...')

    lexvo = Graph()
    lexvo.parse(os.path.join(etymFolder, 'lexvo/lexvo_2013-02-09.nt'))
    for item in lexvo.subjects(predicate=RDF.type, object=LVONT.Language):
        # ``l`` is the keyword name expected by the helper; it receives the Lexvo graph.
        nodes.addLanguageNode(language=item, l=lexvo, g=g)
    lexvo.close()

    # Graph.value returns None when nothing matches; adding None as an object
    # would either trip an rdflib assertion or (under -O) corrupt the graph,
    # so fail with an explicit message instead.
    english = g.value(predicate=RDFS.label, object=Literal("English", lang='en'))
    if english is None:
        raise LookupError('No node labelled "English"@en found; cannot set the lexicon language')
    g.add((llkg, DCTERMS.language, english))
    # No g.serialize() here: without a destination its result was discarded.

def etymNodes():
    """Create one lexical-entry node per row of the EtymWN ``words.csv``.

    Each row is read as ``id, word, language`` (columns 0, 1, 2) and passed to
    ``nodes.addEtymLexicalEntryNode`` together with ``iso='3'``, the lexicon
    ``llkg`` and the shared graph ``g``. Blank rows are skipped.
    """
    logger.info('Generating words nodes...')

    # ``with`` closes the file even if a row fails; newline='' is what the csv
    # module requires so quoted fields containing newlines parse correctly.
    words_path = os.path.join(etymFolder, 'words.csv')
    with open(words_path, mode='r', encoding='utf-8', newline='') as file:
        for line in csv.reader(file):
            if not line:
                continue
            nodes.addEtymLexicalEntryNode(word=line[1], language=line[2], iso='3', id=line[0], llkg=llkg, g=g)
    # The per-row g.serialize() was removed: it re-serialised the whole,
    # growing graph on every row and discarded the result each time.
    

In [30]:
def etymRelations():
    """Connect EtymWN nodes according to ``relations.csv``.

    Each row is read as ``subject_id, relation, object_id``. Both ids are
    resolved to nodes in ``g`` through the ``dummy:etymwnID`` predicate, then
    the matching helper(s) in ``relations`` are called. Rows whose relation
    name is not handled are ignored; blank rows and rows whose subject or
    object cannot be resolved are skipped and counted in a final warning.
    """
    logger.info('Connecting nodes...')

    skipped = 0
    relations_path = os.path.join(etymFolder, 'relations.csv')
    # utf-8 to match how words.csv is read; ``with`` guarantees the file is closed.
    with open(relations_path, mode='r', encoding='utf-8', newline='') as file:
        for line in csv.reader(file):
            if not line:
                continue

            subj = g.value(predicate=DUMMY.etymwnID, object=Literal(line[0], datatype=XSD.string))
            obj = g.value(predicate=DUMMY.etymwnID, object=Literal(line[2], datatype=XSD.string))
            if subj is None or obj is None:
                # Graph.value found no node for one of the ids; there is nothing to link.
                skipped += 1
                continue

            relation = line[1]  # renamed from ``property`` to stop shadowing the builtin
            if relation == 'etymology':
                relations.addEtymology(subj, obj, g)
                relations.addEtymologicalOrigin(obj, subj, g)
            elif relation == 'etymologically_related':
                relations.addEtymologicallyRelated(subj, obj, g)
            elif relation == 'has_derived_form':
                relations.addHasDerivedForm(subj, obj, g)
                # NOTE(review): the 'etymology' branch swaps the arguments for its
                # second call, this one does not -- confirm the order
                # addIsDerivedFrom expects.
                relations.addIsDerivedFrom(subj, obj, g)
            elif relation == 'variant:orthography':
                relations.addOrthographyVariant(subj, obj, g)
    # The per-row g.serialize() was removed: it re-serialised the whole graph
    # for every relation and discarded the result.

    if skipped:
        logger.warning('Skipped %d relations with an unresolved subject or object', skipped)
    logger.info('Nodes successfully connected!')

#### dataset - LKG

In [31]:
# JSON Lines export of the LKG; every record is read below via its 'jtype' key
# ('node' or 'relationship').
lkgDataset = '../data/lkg/dataset.jsonl'
# Folder with the TSV mappings read by authorNodes() and occupationNodes().
wikidataMap = '../data/lkg/wikidata_metadata/'

In [32]:
def resourceNodes():
    """Register the three lexical resources as nodes in ``g``.

    Calls ``nodes.addResourceNode`` once for each of the Lewis-Short
    Dictionary, Latin WordNet and Universal WordNet, in that order.
    """
    logger.info('Generating resources nodes...')
    resources = (
        ('https://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0059', 'Lewis-Short Dictionary'),
        ('https://lila-erc.eu/data/lexicalResources/LatinWordNet/Lexicon', 'Latin WordNet'),
        ('https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/yago-naga/uwn', 'Universal WordNet'),
    )
    for resource, label in resources:
        nodes.addResourceNode(resource=resource, label=label, g=g)
    # No g.serialize() here: without a destination its result was discarded.

def lemmaNodes():
    """Create a form node for every ``Lemma`` node record in the LKG dataset.

    Streams ``lkgDataset`` and, for each record with ``jtype == 'node'`` and
    ``label == 'Lemma'``, calls ``nodes.addFormNode`` with the record's
    ``properties.value``, ``properties.posTag`` and ``identity``.
    """
    logger.info('Generating lemma nodes...')
    # The ``with`` block closes the reader; the explicit close() was redundant.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] == 'node' and line['label'] == 'Lemma':
                props = line['properties']
                nodes.addFormNode(writtenRep=props['value'], pos=props['posTag'], id=line['identity'], g=g)
    # No g.serialize() here: without a destination its result was discarded.
    
def entryNodes():
    """Create a lexical-entry node for every ``InflectedWord`` node record.

    Streams ``lkgDataset`` and, for each record with ``jtype == 'node'`` and
    ``label == 'InflectedWord'``, calls ``nodes.addLexicalEntryNode`` with the
    record's ``properties.value`` and ``identity``, ``language='Latin'`` and
    the lexicon ``llkg``.
    """
    logger.info('Generating entries nodes...')
    # The ``with`` block closes the reader; the explicit close() was redundant.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] == 'node' and line['label'] == 'InflectedWord':
                nodes.addLexicalEntryNode(entry=line['properties']['value'], id=line['identity'], language='Latin', llkg=llkg, g=g)
    # No g.serialize() here: without a destination its result was discarded.

def senseNodes():
    """Create a lexical-sense node for every ``LexiconConcept`` node record.

    The arguments passed to ``nodes.addLexicalSenseNode`` depend on the
    record's ``properties.resource``:

    * ``'Lewis-Short Dictionary'``: sense is ``properties.id``, gloss is
      ``properties.alias``, resource is passed through unchanged.
    * ``'Latin WordNet'``: sense is ``properties.alias``, gloss is
      ``properties.gloss``, and the resource is passed as
      ``'Universal WordNet'``.

    Records from any other resource are ignored.
    """
    logger.info('Generating lexical sense nodes...')
    # The ``with`` block closes the reader; the explicit close() was redundant.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if not (line['jtype'] == 'node' and line['label'] == 'LexiconConcept'):
                continue
            props = line['properties']
            resource = props['resource']
            if resource == 'Lewis-Short Dictionary':
                nodes.addLexicalSenseNode(resource=resource, sense=props['id'], gloss=props['alias'], id=line['identity'], g=g)
            elif resource == 'Latin WordNet':
                nodes.addLexicalSenseNode(resource='Universal WordNet', sense=props['alias'], gloss=props['gloss'], id=line['identity'], g=g)
    # No g.serialize() here: without a destination its result was discarded.

def authorNodes():
    """Create a person node for every ``Person`` node record in the LKG dataset.

    Loads ``latinISE_author_mapping.tsv`` (columns 2-5 as ``name``,
    ``lastname``, ``title``, ``id``), keeps the first row per ``id`` and
    replaces missing values with ``''``. The resulting frame is passed to
    ``nodes.addPersonNode`` along with each record's ``properties.name``,
    ``properties.lastname`` and ``identity``.
    """
    logger.info('Generating author nodes...')
    authors_df = (
        pd.read_csv(
            os.path.join(wikidataMap, 'latinISE_author_mapping.tsv'),
            sep='\t',
            header=None,
            usecols=[2, 3, 4, 5],
            names=['name', 'lastname', 'title', 'id'],
        )
        .drop_duplicates(subset=['id'])
        .fillna('')
    )
    # The ``with`` block closes the reader; the explicit close() was redundant.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] == 'node' and line['label'] == 'Person':
                props = line['properties']
                nodes.addPersonNode(firstname=props['name'], lastname=props['lastname'], id=line['identity'], df=authors_df, g=g)
    # No g.serialize() here: without a destination its result was discarded.

def occupationNodes():
    """Create an occupation node for every ``Occupation`` node record.

    First builds a lookup from ``occupations_map.tsv`` that maps column 1 to
    column 0 of each row (blank rows are skipped). That mapping is passed to
    ``nodes.addOccupationNode`` together with each record's
    ``properties.value`` and ``identity``.
    """
    logger.info('Creating dictionary...')
    # ``with`` closes the file even if a row is malformed; newline='' is what
    # the csv module requires.
    map_path = os.path.join(wikidataMap, 'occupations_map.tsv')
    with open(map_path, mode='r', encoding='utf-8', newline='') as file:
        occupationDict = {row[1]: row[0] for row in csv.reader(file, delimiter='\t') if row}
    logger.info('Dictionary created')

    logger.info('Generating occupation nodes...')
    # The ``with`` block closes the reader; the explicit close() was redundant.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] == 'node' and line['label'] == 'Occupation':
                # ``dict`` is the keyword name the helper expects.
                nodes.addOccupationNode(occupation=line['properties']['value'], id=line['identity'], dict=occupationDict, g=g)
    # No g.serialize() here: without a destination its result was discarded.

def textNodes():
    """Create a quotation node for every node that is the object of ``HAS_OCCURRENCE``.

    Makes two passes over ``lkgDataset``: the first collects the ``object``
    ids of all ``HAS_OCCURRENCE`` relationships, the second passes every node
    record whose ``identity`` is among them to ``nodes.addQuotationNode`` with
    ``language='Latin'``.
    """
    # A set gives constant-time membership tests; the previous list made the
    # second pass O(nodes x occurrences).
    with jsonlines.open(lkgDataset, 'r') as lkg:
        ids = {
            line['object']
            for line in lkg
            if line['jtype'] == 'relationship' and line['name'] == 'HAS_OCCURRENCE'
        }

    logger.info('Generating text nodes...')
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] == 'node' and line['identity'] in ids:
                nodes.addQuotationNode(quotation=line['properties']['value'], language='Latin', id=line['identity'], g=g)
    # Removed: an unused ``textID`` counter, redundant close() calls inside the
    # ``with`` blocks, and a g.serialize() whose result was discarded.

def documentNodes():
    """Create a creative-work node for every ``Document`` node record.

    Reads all matching records from ``lkgDataset`` first, then passes each
    one's ``properties.title`` and ``identity`` to
    ``nodes.addCreativeWorkNode``.
    """
    def _is_document(record):
        return record['jtype'] == 'node' and record['label'] == 'Document'

    with jsonlines.open(lkgDataset, 'r') as lkg:
        # Materialise the matches before adding anything, as the original did.
        document_records = list(filter(_is_document, lkg))
        for record in document_records:
            nodes.addCreativeWorkNode(
                title=record['properties']['title'],
                id=record['identity'],
                g=g,
            )

def corpusNodes():
    """Create a collection node for every ``Corpus`` node record.

    Streams ``lkgDataset`` and, for each record with ``jtype == 'node'`` and
    ``label == 'Corpus'``, calls ``nodes.addCollectionNode`` with the record's
    ``properties.name`` and ``identity``.
    """
    logger.info('Generating corpora nodes...')
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            # Filter on 'label', like every other node builder. The previous
            # test compared 'identity' (the node id) with 'Corpus', so no
            # record could ever match.
            if line['jtype'] == 'node' and line['label'] == 'Corpus':
                nodes.addCollectionNode(title=line['properties']['name'], id=line['identity'], g=g)
    # Removed a redundant close() inside the ``with`` block and a
    # g.serialize() whose result was discarded.


In [33]:
def lkgRelations():
    """Connect the LKG nodes according to the dataset's relationship records.

    For every record with ``jtype == 'relationship'`` the ``subject`` and
    ``object`` ids are resolved to nodes in ``g`` through the ``dummy:lkgID``
    predicate (typed ``xsd:unsignedInt``), and the helper in ``relations``
    matching the record's ``name`` is called. Relationship names without a
    branch below are ignored. Records whose subject or object cannot be
    resolved are skipped and counted in a final warning.
    """
    logger.info('Connecting nodes...')

    skipped = 0
    # The ``with`` block closes the reader. The old trailing ``lkg.close``
    # had no parentheses and therefore never called anything.
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] != 'relationship':
                continue

            subj = g.value(predicate=DUMMY.lkgID, object=Literal(line['subject'], datatype=XSD.unsignedInt))
            obj = g.value(predicate=DUMMY.lkgID, object=Literal(line['object'], datatype=XSD.unsignedInt))
            if subj is None or obj is None:
                # Graph.value found no node for one of the ids; there is nothing to link.
                skipped += 1
                continue

            relation = line['name']  # renamed from ``property`` to stop shadowing the builtin
            if relation == 'HAS_LEMMA':
                relations.addCanonicalForm(subj, obj, g)
            elif relation == 'HAS_CONCEPT':
                relations.addSense(subj, obj, g)
            elif relation == 'HAS_SUBCLASS':
                relations.addSenseRel(subj, obj, g)
            elif relation == 'SAME_AS':
                relations.addSameAs(subj, obj, g)
            elif relation == 'HAS_OCCURRENCE':
                relations.addDCTIsPartOf(subj, obj, g)
            elif relation == 'HAS_EXAMPLE':
                # This helper also receives the raw record.
                relations.addExample(subj, obj, line, g)
            elif relation == 'HAS_AUTHOR':
                relations.addAuthor(subj, obj, g)
            elif relation == 'HAS_OCCUPATION':
                relations.addHasOccupation(subj, obj, g)
            elif relation == 'PUBLISHED_IN':
                relations.addDatePublished(subj, obj, g)
            elif relation == 'BELONG_TO':
                relations.addSCHEMAIsPartOf(subj, obj, g)

    if skipped:
        logger.warning('Skipped %d relationships with an unresolved subject or object', skipped)
    logger.info('Nodes successfully connected!')
    # No g.serialize() here: without a destination its result was discarded.

In [36]:
# Re-import the helper modules so edits made to them are picked up.
for _module in (nodes, relations):
    importlib.reload(_module)

# Empty the shared graph so the build starts from scratch.
g.remove((None, None, None))
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

# Node builders, run in this order. etymNodes, etymRelations, documentNodes
# and corpusNodes are defined above but are not part of this run.
_node_builders = (
    setupGraph,
    languageNodes,
    resourceNodes,
    lemmaNodes,
    entryNodes,
    senseNodes,
    authorNodes,
    occupationNodes,
    textNodes,
)
for _build in _node_builders:
    _build()

# Relations are added once all the nodes above exist.
lkgRelations()

# Write the finished graph to disk; kept as the last expression so the cell
# still displays the returned value.
g.serialize(destination=llkgGraph, format='ttl')

2024-03-26 13:29:18,804 Generating language nodes...
2024-03-26 13:29:55,305 Serializing...


<Graph identifier=N706b279f98d0496489f3f323b31a5137 (<class 'rdflib.graph.Graph'>)>