In [1]:
%%time
# load YSO vocabulary
import rdflib

# Location of the YSO SKOS file (Turtle serialization), relative to this notebook.
YSOPATH='../../../vocab/yso-skos.ttl'

# Read the whole vocabulary into an rdflib graph; later cells query this object.
yso = rdflib.Graph()
yso.parse(source=YSOPATH, format='turtle')

# Display the triple count as a quick sanity check that the load succeeded.
len(yso)

CPU times: user 1min 16s, sys: 415 ms, total: 1min 16s
Wall time: 1min 16s


751624

In [2]:
# collect the Wikidata mappings from within YSO into a dict
from rdflib.namespace import SKOS

# Maps a YSO concept URI (str) to the Wikidata entity URI (str) it is matched to.
yso_to_wd = {}

# closeMatch is processed before exactMatch, so when a concept has both kinds
# of mapping the exactMatch target is the one that ends up in the dict.
for prop in (SKOS.closeMatch, SKOS.exactMatch):
    yso_to_wd.update(
        (str(ysoc), str(item))
        for ysoc, item in yso.subject_objects(prop)
        # keep only match targets in the Wikidata entity namespace
        if str(item).startswith('http://www.wikidata.org/entity/')
    )

# Display how many concepts have a Wikidata mapping inside YSO itself.
len(yso_to_wd)

5517

In [3]:
# query Wikidata for the mappings to/from YSO and add them to the dict

from SPARQLWrapper import SPARQLWrapper, JSON

# Select every Wikidata item that has a P2347 value and expand that value into
# a full YSO concept URI, so the result keys match the ones already in yso_to_wd.
queryString = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT ?ysoc ?item
WHERE {
    ?item wdt:P2347 ?ysoid .
    BIND(IRI(CONCAT('http://www.yso.fi/onto/yso/p', ?ysoid)) AS ?ysoc)
}
"""

# Send the query to the Wikidata SPARQL endpoint and decode the JSON response.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery(queryString)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Merge the result rows into the dict. A mapping found on the Wikidata side
# replaces any mapping already recorded for the same YSO concept.
yso_to_wd.update(
    (row["ysoc"]["value"], row["item"]["value"])
    for row in results["results"]["bindings"]
)

# Display the total number of mappings after merging both sources.
len(yso_to_wd)

13759

In [4]:
# process the yso-finna corpus file, converting YSO URIs to Wikidata URIs

import gzip

INPUT_CORPUS = '../../yso-finna-en.tsv.gz'
OUTPUT_CORPUS = '../finna-wikidata-en.tsv.gz'

samples = 0    # number of lines read from the input corpus
converted = 0  # number of lines written to the output corpus

# Each input line is parsed as "<text>\t<whitespace-separated YSO URIs>".
# The encoding is passed explicitly so that reading and writing do not depend
# on the platform's locale default.
# NOTE(review): assumes the corpus is UTF-8 encoded - confirm.
with gzip.open(INPUT_CORPUS, 'rt', encoding='utf-8') as incorpus, \
        gzip.open(OUTPUT_CORPUS, 'wt', encoding='utf-8') as outcorpus:
    for line in incorpus:
        samples += 1
        # partition() always returns three parts, so a blank line, a line
        # without a tab, or a line whose URI column is empty (strip() removes
        # the trailing tab) yields an empty URI list and is skipped below,
        # instead of raising ValueError and aborting the run mid-file.
        text, sep, uris = line.strip().partition('\t')
        uris = uris.split()
        # Translate each YSO URI; URIs without a known mapping are dropped.
        mapped = (yso_to_wd.get(uri) for uri in uris)
        wd_uris = [wd_uri for wd_uri in mapped if wd_uri]
        if not wd_uris:
            continue  # no Wikidata URIs found for this sample
        print("{}\t{}".format(text, ' '.join(wd_uris)), file=outcorpus)
        converted += 1

print("converted {} of {} samples".format(converted, samples))

converted 1781559 of 1969586 samples
