In [94]:
import logging
import pandas as pd
import csv
import jsonlines
#import re
#from urllib.parse import quote
#from nltk.corpus import wordnet as wn
import os
from py4j.java_gateway import JavaGateway
from rdflib import Graph, Literal, URIRef, BNode
from rdflib.namespace import RDF, RDFS, OWL, XSD, DCTERMS

import nodes
import relations
import queries
from namespaces import *
import importlib

In [95]:
# Re-import the local helper modules so edits made to them on disk are picked up
# without restarting the kernel.
for _module in (nodes, relations):
    importlib.reload(_module)

# Timestamped INFO-level logging for the progress messages emitted by the cells below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

#### binding namespaces

In [96]:
# The single graph every function in this notebook writes into.
g = Graph()

# (prefix, namespace) pairs, bound in exactly this order.
_PREFIX_BINDINGS = (
    # core RDF vocabularies shipped with rdflib
    ("rdf", RDF),
    ("rdfs", RDFS),
    ("xsd", XSD),
    ("dct", DCTERMS),
    ("owl", OWL),
    # namespaces provided by the local `namespaces` module
    ("schema", SCHEMA),
    ("ontolex", ONTOLEX),
    ("vartrans", VARTRANS),
    ("lexinfo", LEXINFO),
    ("lime", LIME),
    ("wn", WORDNET),
    ("lexvo", LEXVO),
    ("lvont", LVONT),
    ("uwn", UWN),
    ("lila", LILA),
    ("skos", SKOS),
    ("wd", WIKIENTITY),
    ("wdt", WIKIPROP),
    ("wikibase", WIKIBASE),
    ("bd", BIGDATA),
    ("dummy", DUMMY),
)
for _prefix, _namespace in _PREFIX_BINDINGS:
    g.bind(_prefix, _namespace)

logger = logging.getLogger(__name__)

# Node standing for the knowledge graph itself (typed as a lime:Lexicon in setupGraph).
llkg = URIRef(DUMMY.LLKG)
# Destination file of the final Turtle serialization.
llkgGraph = '../data/llkg/llkg.ttl'

#### graph setup

In [97]:
def setupGraph():
    """Add the triples that describe the knowledge graph resource itself.

    The `llkg` node is typed as a lime:Lexicon and given an English label and a
    contact e-mail. The graph is then serialized to Turtle; the return value of
    that call is not used.
    """
    lexicon_triples = (
        (llkg, RDF.type, LIME.Lexicon),
        (llkg, RDFS.label, Literal('Linked Linguistic Knowledge Graph', lang='en')),
        (llkg, SCHEMA.email, Literal('e.ghizzota@studenti.uniba.it')),
    )
    for triple in lexicon_triples:
        g.add(triple)

    g.serialize(format='ttl')

#### dataset - EtymWN

In [98]:
# Folder holding the EtymWN input files read by the functions below
# (the lexvo N-Triples dump, words.csv and relations.csv).
etymFolder = '../data/etymwn' 

def languageNodes():
    """Create the language nodes from the lexvo dump and tag the lexicon as English.

    The lexvo N-Triples file is parsed into a temporary graph; every subject typed
    as lvont:Language is handed to `nodes.addLanguageNode` together with that
    temporary graph and the main graph `g`. Afterwards `llkg` is linked through
    dcterms:language to the subject in `g` that carries the rdfs:label
    "English"@en, and the graph is serialized to Turtle (result unused).
    """
    logger.info('Generating language nodes...')

    lexvo = Graph()
    lexvo.parse(os.path.join(etymFolder, 'lexvo/lexvo_2013-02-09.nt'))
    for language in lexvo.subjects(predicate=RDF.type, object=LVONT.Language):
        nodes.addLanguageNode(language=language, l=lexvo, g=g)

    # Look the English language node up by its label rather than by a fixed URI.
    english = g.value(subject=None, predicate=RDFS.label, object=Literal("English", lang='en'))
    g.add((llkg, DCTERMS.language, english))
    lexvo.close()

    logger.info('Serializing...')
    g.serialize(format='ttl')

def etymNodes():
    """Create one lexical-entry node per row of the EtymWN words.csv file.

    Each row is read as (id, word, language): column 0 is passed as `id`,
    column 1 as `word` and column 2 as `language` to
    `nodes.addEtymLexicalEntryNode`, always with iso='3'.

    The graph is serialized once after all rows have been added. Serializing
    inside the loop re-rendered the whole graph for every row and threw the
    result away.
    """
    logger.info('Generating words nodes...')

    # newline='' is what the csv module asks for; `with` closes the file even
    # when a row is malformed or the helper raises.
    with open(os.path.join(etymFolder, 'words.csv'), mode='r', encoding='utf-8', newline='') as file:
        for line in csv.reader(file):
            nodes.addEtymLexicalEntryNode(word=line[1], language=line[2], iso='3', id=line[0], llkg=llkg, g=g)

    logger.info('Serializing...')
    g.serialize(format='ttl')
    

In [99]:
def etymRelations():
    """Connect the EtymWN word nodes according to relations.csv.

    Each row is read as (subject id, relation name, object id). Both ids are
    resolved to graph nodes through their DUMMY.etymwnID literal, and the
    relation name selects which helper(s) of the `relations` module are called.
    Rows with any other relation name are ignored.

    The graph is serialized once after all rows have been processed, not once
    per row, and the file is read as UTF-8 like words.csv.
    """
    logger.info('Connecting nodes...')

    with open(os.path.join(etymFolder, 'relations.csv'), mode='r', encoding='utf-8', newline='') as file:
        for line in csv.reader(file):
            subj = g.value(predicate=DUMMY.etymwnID, object=Literal(line[0], datatype=XSD.string))
            obj = g.value(predicate=DUMMY.etymwnID, object=Literal(line[2], datatype=XSD.string))

            # Named `relation` so the builtin `property` is not shadowed.
            relation = line[1]

            if relation == 'etymology':
                relations.addEtymology(subj, obj, g)
                # The inverse direction is recorded with the arguments swapped.
                relations.addEtymologicalOrigin(obj, subj, g)
            elif relation == 'etymologically_related':
                relations.addEtymologicallyRelated(subj, obj, g)
            elif relation == 'has_derived_form':
                relations.addHasDerivedForm(subj, obj, g)
                # NOTE(review): unlike the etymology branch, the arguments are not
                # swapped for what looks like the inverse relation - confirm
                # against relations.addIsDerivedFrom.
                relations.addIsDerivedFrom(subj, obj, g)
            elif relation == 'variant:orthography':
                relations.addOrthographyVariant(subj, obj, g)

    g.serialize(format='ttl')

    logger.info('Nodes successfully connected!')

#### dataset - LKG

In [100]:
# JSON Lines export of the LKG; the functions below read one record per line and
# look at its 'jtype' key ('node' or 'relationship').
lkgDataset = '../data/lkg/dataset.jsonl'
# Folder with the Wikidata mapping tables (author and occupation TSV files).
wikidataMap = '../data/lkg/wikidata_metadata/'

In [101]:
def resourceNodes():
    """Register the three lexical resources as nodes of the graph.

    Each (URL, label) pair is passed to `nodes.addResourceNode`; the graph is
    then serialized to Turtle (result unused).
    """
    logger.info('Generating resources nodes...')

    lexical_resources = (
        ('https://www.perseus.tufts.edu/hopper/text?doc=Perseus:text:1999.04.0059', 'Lewis-Short Dictionary'),
        ('https://lila-erc.eu/data/lexicalResources/LatinWordNet/Lexicon', 'Latin WordNet'),
        ('https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/yago-naga/uwn', 'Universal WordNet'),
    )
    for url, name in lexical_resources:
        nodes.addResourceNode(resource=url, label=name, g=g)

    logger.info('Serializing...')
    g.serialize(format='ttl')

def lemmaNodes():
    """Create a form node for every 'Lemma' node record of the LKG dataset.

    The record's properties 'value' and 'posTag' and its 'identity' are forwarded
    to `nodes.addFormNode`. Records are processed as they are read.
    """
    logger.info('Generating lemma nodes...')
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for record in lkg:
            # Skip everything that is not a Lemma node.
            if record['jtype'] != 'node' or record['label'] != 'Lemma':
                continue
            props = record['properties']
            nodes.addFormNode(writtenRep=props['value'], pos=props['posTag'], id=record['identity'], g=g)
    logger.info('Serializing...')
    g.serialize(format='ttl')
    
def entryNodes():
    """Create a lexical-entry node for every 'InflectedWord' node record.

    The record's 'value' property and 'identity' are forwarded to
    `nodes.addLexicalEntryNode`, always with language='Latin'. Records are
    processed as they are read.
    """
    logger.info('Generating entries nodes...')
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for record in lkg:
            # Skip everything that is not an InflectedWord node.
            if record['jtype'] != 'node' or record['label'] != 'InflectedWord':
                continue
            nodes.addLexicalEntryNode(
                entry=record['properties']['value'],
                id=record['identity'],
                language='Latin',
                llkg=llkg,
                g=g,
            )
    logger.info('Serializing...')
    g.serialize(format='ttl')

def senseNodes():
    """Create a lexical-sense node for every 'LexiconConcept' node record.

    The record's 'resource' property decides which fields are used:
    - 'Lewis-Short Dictionary': sense comes from 'id', gloss from 'alias';
    - 'Latin WordNet': sense comes from 'alias', gloss from 'gloss', and the node
      is registered under the resource name 'Universal WordNet'.
    Records with any other resource are ignored.
    """
    logger.info('Generating lexical sense nodes...')
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for record in lkg:
            if record['jtype'] != 'node' or record['label'] != 'LexiconConcept':
                continue
            props = record['properties']
            resource = props['resource']
            if resource == 'Lewis-Short Dictionary':
                nodes.addLexicalSenseNode(resource=resource, sense=props['id'], gloss=props['alias'], id=record['identity'], g=g)
            elif resource == 'Latin WordNet':
                nodes.addLexicalSenseNode(resource='Universal WordNet', sense=props['alias'], gloss=props['gloss'], id=record['identity'], g=g)
    logger.info('Serializing...')
    g.serialize(format='ttl')

def authorNodes():
    """Create a person node for every 'Person' node record of the LKG dataset.

    The author mapping table (columns 2-5 of the header-less TSV, named name,
    lastname, title and id) is loaded, de-duplicated on 'id', and has its missing
    values replaced by empty strings. It is handed to `nodes.addPersonNode`
    together with each record's name, lastname and identity.
    """
    logger.info('Generating author nodes...')

    mapping_path = os.path.join(wikidataMap, 'latinISE_author_mapping.tsv')
    authors_df = (
        pd.read_csv(mapping_path, sep='\t', header=None, usecols=[2,3,4,5], names=['name', 'lastname', 'title', 'id'])
        .drop_duplicates(subset=['id'])
        .fillna('')
    )

    with jsonlines.open(lkgDataset, 'r') as lkg:
        # All matching records are collected before any node is added.
        persons = [record for record in lkg if record['jtype'] == 'node' and record['label'] == 'Person']
        for person in persons:
            props = person['properties']
            nodes.addPersonNode(firstname=props['name'], lastname=props['lastname'], id=person['identity'], df=authors_df, g=g)

    logger.info('Serializing...')
    g.serialize(format='ttl')

def occupationNodes():
    """Create an occupation node for every 'Occupation' node record.

    First a lookup dictionary is built from occupations_map.tsv, mapping the
    second column of each row to the first. It is passed, together with each
    record's 'value' property and 'identity', to `nodes.addOccupationNode`.

    The TSV is read inside a `with` block so the handle is released even when a
    row has fewer than two columns, and it is opened with newline='' as the csv
    module asks.
    """
    logger.info('Creating dictionary...')
    with open(os.path.join(wikidataMap, 'occupations_map.tsv'), mode='r', encoding='utf-8', newline='') as file:
        # Later rows win when the same key occurs more than once.
        occupationDict = {row[1]: row[0] for row in csv.reader(file, delimiter='\t')}
    logger.info('Dictionary created')

    logger.info('Generating occupation nodes...')
    with jsonlines.open(lkgDataset, 'r') as lkg:
        occupations = [line for line in lkg if line['jtype'] == 'node' and line['label'] == 'Occupation']
        for line in occupations:
            # `dict` is the keyword name expected by the helper.
            nodes.addOccupationNode(occupation=line['properties']['value'], id=line['identity'], dict=occupationDict, g=g)
    logger.info('Serializing...')
    g.serialize(format='ttl')

def textNodes():
    """Create a quotation node for every node targeted by a HAS_OCCURRENCE relationship.

    The dataset is read twice: the first pass collects the 'object' ids of all
    HAS_OCCURRENCE relationships, the second pass selects the node records whose
    'identity' is among them and passes their 'value' property and identity to
    `nodes.addQuotationNode` (always with language='Latin').

    The ids are kept in a set so the per-node membership test is constant time
    and does not scan a list for every record.
    """
    with jsonlines.open(lkgDataset, 'r') as lkg:
        ids = {line['object'] for line in lkg if line['jtype'] == 'relationship' and line['name'] == 'HAS_OCCURRENCE'}
    logger.info('Generating text nodes...')
    with jsonlines.open(lkgDataset, 'r') as lkg:
        occurrences = [line for line in lkg if line['jtype'] == 'node' and line['identity'] in ids]
        for line in occurrences:
            nodes.addQuotationNode(quotation=line['properties']['value'], language='Latin', id=line['identity'], g=g)
    logger.info('Serializing...')
    g.serialize(format='ttl')

def documentNodes():
    """Create a creative-work node for every 'Document' node record.

    The record's 'title' property and 'identity' are forwarded to
    `nodes.addCreativeWorkNode`. Unlike the sibling functions, this one neither
    logs nor serializes the graph.
    """
    with jsonlines.open(lkgDataset, 'r') as lkg:
        # All matching records are collected before any node is added.
        documents = [record for record in lkg if record['jtype'] == 'node' and record['label'] == 'Document']
        for document in documents:
            title = document['properties']['title']
            nodes.addCreativeWorkNode(title=title, id=document['identity'], g=g)

def corpusNodes():
    """Create a collection node for every 'Corpus' node record of the LKG dataset.

    The record's 'name' property and 'identity' are forwarded to
    `nodes.addCollectionNode`.

    Records are selected on their 'label', as in every other node-generating
    function of this notebook. The previous filter compared 'identity' (the
    record id that is also passed on as `id`) with the string 'Corpus'.
    """
    logger.info('Generating corpora nodes...')
    with jsonlines.open(lkgDataset, 'r') as lkg:
        corpora = [line for line in lkg if line['jtype'] == 'node' and line['label'] == 'Corpus']
        for line in corpora:
            nodes.addCollectionNode(title=line['properties']['name'], id=line['identity'], g=g)
    logger.info('Serializing...')
    g.serialize(format='ttl')



In [102]:
def dateDictionary():
    """Build the lookup tables used to resolve dates in the LKG dataset.

    The dataset is scanned once (previously three times) and three kinds of
    records are picked up:
    - 'startTime' relationships: subject id -> object id;
    - 'endTime' relationships: subject id -> object id;
    - 'TimePoint' nodes: identity -> the 'Year' property.

    Returns:
        A tuple `(intervalsDict, pointsDict)`. `intervalsDict` maps the subject
        id of a startTime/endTime pair to `(start object id, end object id)`;
        `pointsDict` maps a TimePoint identity to its year.

    A subject that has a startTime but no endTime is left out of
    `intervalsDict` and reported with a warning; it no longer aborts the whole
    run with a bare KeyError.
    """
    logger.info('Generating dates dictionary...')

    startDict = {}
    endDict = {}
    pointsDict = {}
    with jsonlines.open(lkgDataset, 'r') as lkg:
        for line in lkg:
            if line['jtype'] == 'relationship':
                name = line['name']
                if name == 'startTime':
                    startDict[line['subject']] = line['object']
                elif name == 'endTime':
                    endDict[line['subject']] = line['object']
            elif line['jtype'] == 'node' and line['label'] == 'TimePoint':
                pointsDict[line['identity']] = line['properties']['Year']

    intervalsDict = {}
    for k, start in startDict.items():
        if k not in endDict:
            logger.warning('Interval %s has a startTime but no endTime; skipping it', k)
            continue
        intervalsDict[k] = (start, endDict[k])

    return intervalsDict, pointsDict

def lkgRelations():
    """Connect the LKG nodes according to the dataset's relationship records.

    For every relationship, subject and object are resolved to graph nodes
    through their DUMMY.lkgID literal (typed xsd:unsignedInt), and the
    relationship name selects the helper of the `relations` module to call:
    - seven names map directly onto a `relations.<helper>(subj, obj, g)` call;
    - HAS_OCCURRENCE and HAS_EXAMPLE first look up the node that is
      dcterms:isPartOf the object and link the subject to that node;
    - PUBLISHED_IN, BORN and DIED are resolved to a year interval or a single
      year through the tables returned by `dateDictionary`.
    Relationships with any other name are ignored.

    NOTE(review): the recorded run of this notebook stopped while this function
    was executing, with "TypeError: addDatePointPublished() takes 3 positional
    arguments but 4 were given". That function is not called from here
    directly, so the mismatch is presumably inside the `relations` module -
    check the signatures behind relations.addDatePoint.
    """
    logger.info('Connecting nodes...')

    intervalsDict, pointsDict = dateDictionary()

    # Relationship name -> name of the relations helper taking (subj, obj, g).
    # Helpers are looked up by name only when needed.
    directHelpers = {
        'HAS_LEMMA': 'addCanonicalForm',
        'HAS_CONCEPT': 'addSense',
        'HAS_SUBCLASS': 'addSenseRel',
        'SAME_AS': 'addSameAs',
        'HAS_AUTHOR': 'addAuthor',
        'HAS_OCCUPATION': 'addHasOccupation',
        'BELONG_TO': 'addSCHEMAIsPartOf',
    }
    dateRelations = ('PUBLISHED_IN', 'BORN', 'DIED')

    with jsonlines.open(lkgDataset, 'r') as lkg:
        relationships = [record for record in lkg if record['jtype'] == 'relationship']

        for record in relationships:
            name = record['name']
            subj = g.value(predicate=DUMMY.lkgID, object=Literal(record['subject'], datatype=XSD.unsignedInt))
            obj = g.value(predicate=DUMMY.lkgID, object=Literal(record['object'], datatype=XSD.unsignedInt))

            if name in directHelpers:
                getattr(relations, directHelpers[name])(subj, obj, g)
            elif name == 'HAS_OCCURRENCE':
                occurrence = g.value(subject=None, predicate=DCTERMS.isPartOf, object=obj)
                relations.addDCTIsPartOf(subj, occurrence, g)
            elif name == 'HAS_EXAMPLE':
                example = g.value(subject=None, predicate=DCTERMS.isPartOf, object=obj)
                relations.addExample(subj, example, record['properties']['grade'], g)
            elif name in dateRelations:
                target = record['object']
                if target in intervalsDict:
                    # The interval stores time-point ids; translate both to years.
                    startID, endID = intervalsDict[target]
                    start = pointsDict[startID]
                    end = pointsDict[endID]
                    relations.addDateInterval(subj, start, end, name, g)
                elif target in pointsDict:
                    relations.addDatePoint(subj, pointsDict[target], name, g)

    logger.info('Nodes successfully connected!')
    logger.info('Serializing...')
    g.serialize(format='ttl')

In [103]:
# Reload the helper modules so the run uses their latest on-disk version.
for _module in (nodes, relations):
    importlib.reload(_module)

# Empty the graph first so re-running this cell does not accumulate triples.
g.remove((None, None, None))
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

# Build steps, run in exactly this order. The EtymWN word/relation functions
# defined above are not part of this run.
_PIPELINE = (
    setupGraph,
    languageNodes,
    resourceNodes,
    lemmaNodes,
    entryNodes,
    authorNodes,
    occupationNodes,
    senseNodes,
    textNodes,
    documentNodes,
    corpusNodes,
    lkgRelations,
)
for _step in _PIPELINE:
    _step()

# Write the finished graph to disk as Turtle.
g.serialize(destination=llkgGraph, format='ttl')

2024-03-27 19:47:24,173 Generating language nodes...


2024-03-27 19:47:58,014 Serializing...
2024-03-27 19:47:59,596 Generating resources nodes...
2024-03-27 19:47:59,596 Serializing...
2024-03-27 19:48:00,729 Generating lemma nodes...
2024-03-27 19:48:07,625 Serializing...
2024-03-27 19:48:08,741 Generating entries nodes...
2024-03-27 19:48:08,841 Serializing...
2024-03-27 19:48:10,007 Generating author nodes...
2024-03-27 19:48:10,290 Serializing...
2024-03-27 19:48:11,571 Creating dictionary...
2024-03-27 19:48:11,592 Dictionary created
2024-03-27 19:48:11,592 Generating occupation nodes...
2024-03-27 19:48:11,689 Serializing...
2024-03-27 19:48:12,957 Generating lexical sense nodes...
2024-03-27 19:48:13,258 Serializing...
2024-03-27 19:48:14,741 Generating text nodes...
2024-03-27 19:48:15,233 Serializing...
2024-03-27 19:48:17,420 Generating corpora nodes...
2024-03-27 19:48:17,498 Serializing...
2024-03-27 19:48:19,635 Connecting nodes...
2024-03-27 19:48:19,635 Generating dates dictionary...


TypeError: addDatePointPublished() takes 3 positional arguments but 4 were given

In [52]:
# LINK DOCUMENT TO WIKIDATA VIA TITLE

from SPARQLWrapper import SPARQLWrapper, JSON

def transform2dicts(results):
    """Flatten SPARQL-style bindings into plain dictionaries.

    Args:
        results: an iterable of mappings whose values are themselves mappings
            with a 'value' key.

    Returns:
        A list with one dict per input mapping, keeping the same keys and
        replacing every value by its 'value' entry.
    """
    return [
        {variable: binding[variable]['value'] for variable in binding}
        for binding in results
    ]


# Public Wikidata SPARQL endpoint.
endpoint = "https://query.wikidata.org/sparql"
# Title substituted into the query template of the local `queries` module.
documentTitle = "Ab Urbe condĭta"

sparql = SPARQLWrapper(endpoint)
sparql.setReturnFormat(JSON)
sparql.setQuery(queries.documentQuery.format(documentTitle))

# Sends the query over the network and stores the converted response.
results = sparql.queryAndConvert()


In [53]:
# The cell's last expression is displayed by the notebook, so an extra
# print() of the same object only duplicated the output.
results

{'head': {'vars': ['document']}, 'results': {'bindings': [{'document': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1155892'}}]}}


{'head': {'vars': ['document']},
 'results': {'bindings': [{'document': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q1155892'}}]}}

In [66]:
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)


logger.info('Generating date nodes...')

# startTime relationships: subject id -> object id.
with jsonlines.open(lkgDataset, 'r') as lkg:
    startTimes = [line for line in lkg if line['jtype'] == 'relationship' and line['name'] == 'startTime']
startDict = {}
for line in startTimes:
    startDict[line['subject']] = line['object']

# endTime relationships: subject id -> object id. The `with` block already
# closes the reader, so no explicit close() call is needed.
with jsonlines.open(lkgDataset, 'r') as lkg:
    endTimes = [line for line in lkg if line['jtype'] == 'relationship' and line['name'] == 'endTime']
endDict = {}
for line in endTimes:
    endDict[line['subject']] = line['object']

# Subject id -> (start object id, end object id); raises KeyError when a
# subject has a startTime but no endTime.
intervalsDict = {}
for k in startDict.keys():
    intervalsDict.update({k : (startDict[k], endDict[k])})

logger.info('Generatin time points')

# One date node per TimePoint record, plus identity -> year for the lookups below.
with jsonlines.open(lkgDataset, 'r') as lkg:
    timePoints = [line for line in lkg if line['jtype'] == 'node' and line['label'] == 'TimePoint']
pointsDict = {}
for line in timePoints:
    nodes.addDateNode(line['properties']['Year'], line['identity'], g)
    pointsDict[line['identity']] = line['properties']['Year']

with jsonlines.open(lkgDataset, 'r') as lkg:
    timeIntervals = [line for line in lkg if line['jtype'] == 'node' and line['label'] == 'TimeInterval']
# NOTE(review): this loop resolves each interval's name and start/end years but
# does nothing with them; the output recorded below this cell suggests they were
# once printed. It looks unfinished - confirm what should be added to the graph.
for line in timeIntervals:
    description = line['properties']['name']
    s, e = intervalsDict[line['identity']]
    start = pointsDict[s]
    end = pointsDict[e]

            

2024-03-27 17:02:37,453 Generating date nodes...
2024-03-27 17:02:37,669 Generatin time points


-2000 -1901 / -1999 -1900
-1900 -1801 / -1899 -1800
-1800 -1701 / -1799 -1700
-1700 -1601 / -1699 -1600
-1600 -1501 / -1599 -1500
-1500 -1401 / -1499 -1400
-1400 -1301 / -1399 -1300
-1300 -1201 / -1299 -1200
-1200 -1101 / -1199 -1100
-1100 -1001 / -1099 -1000
-1000 -901 / -0999 -0900
-900 -801 / -0899 -0800
-800 -701 / -0799 -0700
-700 -601 / -0699 -0600
-600 -501 / -0599 -0500
-500 -401 / -0499 -0400
-400 -301 / -0399 -0300
-300 -201 / -0299 -0200
-200 -101 / -0199 -0100
-100 -1 / -0099 +0000
1 100 / +0001 +0100
101 200 / +0101 +0200
201 300 / +0201 +0300
301 400 / +0301 +0400
401 500 / +0401 +0500
501 600 / +0501 +0600
601 700 / +0601 +0700
701 800 / +0701 +0800
801 900 / +0801 +0900
901 1000 / +0901 +1000
1001 1100 / +1001 +1100
1101 1200 / +1101 +1200
1201 1300 / +1201 +1300
1301 1400 / +1301 +1400
1401 1500 / +1401 +1500
1501 1600 / +1501 +1600
1601 1700 / +1601 +1700
1701 1800 / +1701 +1800
1801 1900 / +1801 +1900
1901 2000 / +1901 +2000


In [83]:
# NOTE(review): logging.basicConfig is documented to do nothing once the root
# logger has handlers, so after the earlier cells this call is presumably a
# no-op - confirm.
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

# NOTE(review): the output recorded below this cell includes a bare year that no
# statement here prints; the cell appears to have been trimmed after it last ran.
logger.info('Generating date nodes...')




2024-03-27 18:34:06,305 Generating date nodes...


1894
