In [1]:
# load current YSA
# The vocabulary is fetched over HTTP from the Finto REST API, so this cell
# needs network access and the triple count printed below depends on the data
# published at the time the cell is run.
from rdflib import Graph
ysa = Graph()
# no format argument is passed to parse(); Turtle is requested via the URL's format parameter
ysa.parse('http://finto.fi/rest/v1/ysa/data?format=text/turtle')
print(len(ysa))  # show number of triples

433048


In [2]:
# load current Allärs
# Fetched over HTTP from the Finto REST API (network access required); relies
# on Graph having been imported by the YSA cell above.
allars = Graph()
# no format argument is passed to parse(); Turtle is requested via the URL's format parameter
allars.parse('http://finto.fi/rest/v1/allars/data?format=text/turtle')
print(len(allars))  # show number of triples

412097


In [3]:
# load YSO and YSO Places
# Unlike YSA and Allärs this graph is read from a local Turtle file; the path
# is relative to the directory the notebook is run from.
yso = Graph()
yso.parse('../vocab/yso-skos.ttl', format='turtle')
print(len(yso))  # show number of triples

740137


In [4]:
import unicodedata

from rdflib import Literal, Namespace, URIRef
from rdflib.namespace import SKOS, OWL

# Namespace prefix of YSO concept URIs; the lookup code uses it with
# startswith() to keep only URIs that belong to YSO.
YSO = Namespace('http://www.yso.fi/onto/yso/')
# Default for the complain parameter of label_to_yso_uris (bound when that function is defined).
COMPLAIN = False  # whether to complain about unknown labels

def is_deprecated(ysouri):
    """Return True if the YSO graph flags ``ysouri`` as deprecated.

    The owl:deprecated values of the concept are read from the graph and
    evaluated for truthiness, which is the same approach check_concept uses.
    The previous implementation tested ``(ysouri, OWL.deprecated, True) in yso``;
    a bare Python ``True`` is not an RDF term, so that membership test is not
    expected to match a stored boolean literal.

    Returns False when the concept has no owl:deprecated value at all.
    """
    return any(bool(flag) for flag in yso.objects(ysouri, OWL.deprecated))

def label_to_yso_uris(label, source, voc, lang, complain=COMPLAIN):
    """Resolve a subject label to a list of YSO concept URIs.

    The label is NFC-normalized and looked up in ``voc`` as a literal tagged
    with ``lang``, first as skos:prefLabel and then as skos:altLabel. A
    concept inside the YSO namespace is returned directly; for any other
    concept its skos:exactMatch targets within YSO are returned, or failing
    that its skos:closeMatch targets within YSO.

    When nothing is found and ``source`` is 'yso/fin' or 'yso/swe', the
    lookup is retried against YSA or Allärs respectively. Returns an empty
    list when the label cannot be resolved; with ``complain`` set, that case
    is also reported on stdout.
    """
    literal = Literal(unicodedata.normalize('NFC', label), lang)

    for label_prop in (SKOS.prefLabel, SKOS.altLabel):
        concept = voc.value(None, label_prop, literal, any=True)
        if concept is None:
            continue
        if concept.startswith(YSO):
            # the vocabulary concept is itself a YSO concept
            return [concept]
        # otherwise follow the mappings, preferring exact over close matches
        for mapping_prop in (SKOS.exactMatch, SKOS.closeMatch):
            yso_targets = [target for target in voc.objects(concept, mapping_prop)
                           if target.startswith(YSO)]
            if yso_targets:
                return yso_targets

    # hackish fallbacks for cases like "kulttuuri", where YSO Cicero is out of date: look up via ysa/allars
    if source == 'yso/fin':
        via_ysa = label_to_yso_uris(label, "ysa", ysa, lang)
        if via_ysa:
            print("missing yso/fin label '{}' found via ysa".format(label))
            return via_ysa
    elif source == 'yso/swe':
        via_allars = label_to_yso_uris(label, "allars", allars, lang)
        if via_allars:
            print("missing yso/swe label '{}' found via allars".format(label))
            return via_allars

    if complain:
        print("Unknown label '{}' in source {}".format(label, source))
    return []


# Smoke tests for label_to_yso_uris. The trailing comment on each call names
# the YSO concept(s) the label is expected to resolve to; the final call uses
# a label that is not expected to exist and should print an empty list.
print(label_to_yso_uris('kissa', 'yso/fin', yso, 'fi'))  # YSO: kissa
print(label_to_yso_uris('Ingmanin talo', 'yso/fin', yso, 'fi')) # YSO: Casagranden talo
print(label_to_yso_uris('siirtäminen', 'ysa', ysa, 'fi')) # YSO: siirto (liikuttaminen) + siirto (viestintä)
print(label_to_yso_uris('lähioikeudet', 'yso/fin', yso, 'fi')) # YSO: lähioikeudet
print(label_to_yso_uris('kulttuuri', 'yso/fin', yso, 'fi')) # YSO: kulttuuri
print(label_to_yso_uris('Helsinki -- Kallio', 'ysa', ysa, 'fi'))  # YSO-paikat: Kallio (Helsinki)
print(label_to_yso_uris('Zambia', 'yso/fin', yso, 'fi'))  # YSO-paikat: Sambia
print(label_to_yso_uris('not found', 'yso/fin', yso, 'fi'))

[rdflib.term.URIRef('http://www.yso.fi/onto/yso/p19378')]
[rdflib.term.URIRef('http://www.yso.fi/onto/yso/p18095')]
[rdflib.term.URIRef('http://www.yso.fi/onto/yso/p5700')]
[rdflib.term.URIRef('http://www.yso.fi/onto/yso/p11910')]
[rdflib.term.URIRef('http://www.yso.fi/onto/yso/p372')]
[rdflib.term.URIRef('http://www.yso.fi/onto/yso/p105606')]
[rdflib.term.URIRef('http://www.yso.fi/onto/yso/p104983')]
[]


In [5]:
import functools

from rdflib.namespace import RDF, OWL, DCTERMS, SKOS

@functools.lru_cache(maxsize=30000)
def replace_concept(uri):
    """Return the YSO concept(s) that should be used in place of ``uri``.

    Successors are the dcterms:isReplacedBy targets of ``uri`` followed by
    its skos:narrowMatch targets, restricted to URIs in the YSO namespace.
    When there are none, ``[uri]`` is returned unchanged. A replacement is
    reported on stdout, with an extra warning when there is more than one
    successor. Results are memoized, so the messages appear only the first
    time a given URI is processed.
    """
    successors = []
    for link_prop in (DCTERMS.isReplacedBy, SKOS.narrowMatch):
        for target in yso.objects(uri, link_prop):
            if target.startswith(YSO):
                successors.append(target)

    if not successors:
        return [uri]

    print("replacing", uri, "with", ' '.join(str(successor) for successor in successors))
    if len(successors) > 1:
        print("warning: multiple replacements for", uri)
    return successors

@functools.lru_cache(maxsize=30000)
def check_concept(uri):
    """Tell whether ``uri`` is usable as a subject: a non-deprecated skos:Concept in YSO.

    Returns False, after printing the reason, when the URI is not typed as
    skos:Concept in the YSO graph or when it has a truthy owl:deprecated
    value; returns True otherwise. Results are memoized, so each reason is
    printed only the first time a given URI is checked.
    """
    is_concept = (uri, RDF.type, SKOS.Concept) in yso
    if not is_concept:
        print(str(uri), "not a skos:Concept")
        return False

    deprecated_flag = yso.value(uri, OWL.deprecated, None, any=True)
    if deprecated_flag:
        print(str(uri), "is deprecated")
        return False

    return True

print(replace_concept(URIRef('http://www.yso.fi/onto/yso/p23766')))  # should be replaced by p1947 and p22036
print(check_concept(URIRef('http://www.yso.fi/onto/yso/p23766')))    # is deprecated, should return False

replacing http://www.yso.fi/onto/yso/p23766 with http://www.yso.fi/onto/yso/p1947 http://www.yso.fi/onto/yso/p22036
[rdflib.term.URIRef('http://www.yso.fi/onto/yso/p1947'), rdflib.term.URIRef('http://www.yso.fi/onto/yso/p22036')]
http://www.yso.fi/onto/yso/p23766 is deprecated
False


In [6]:
import json

def get_subject_uris(subject_dicts_in):
    """Map the subject headings of one record to current YSO concept URIs.

    Each item of ``subject_dicts_in`` is expected to be a dict with a
    ``'source'`` key naming the vocabulary the heading comes from and a
    ``'heading'`` key holding the list of heading parts. Only the sources
    ``ysa``, ``allars``, ``yso/fin`` and ``yso/swe`` are handled; items with
    any other (or no) source, or with a missing/empty heading, are skipped.

    A multi-part heading is first looked up as one precoordinated label
    (parts joined with ``' -- '``); if that finds nothing, every part is
    looked up on its own. Each URI found is passed through replace_concept
    and kept only if check_concept accepts it.

    Returns a set of URIs, so duplicates across headings are collapsed.
    """
    subjects_out = set()
    for subject_dict in subject_dicts_in:
        source = subject_dict.get('source')
        # Pick the vocabulary graph and label language for the source. The
        # graphs are referenced only inside the matching branch, so an
        # unrelated source never touches them.
        if source == 'ysa':
            voc, lang = ysa, 'fi'
        elif source == 'allars':
            voc, lang = allars, 'sv'
        elif source == 'yso/fin':
            voc, lang = yso, 'fi'
        elif source == 'yso/swe':
            # this branch used to repeat the 'yso/fin' test and was unreachable
            voc, lang = yso, 'sv'
        else:
            continue

        heading = subject_dict.get('heading')
        if not heading:
            continue  # nothing to look up

        uris = []
        if len(heading) > 1:
            # complain=False: a miss here is normal, the parts are tried next
            uris = label_to_yso_uris(' -- '.join(heading), source, voc, lang, complain=False)

        if not uris:
            # not found as a precoordinated subject, try the individual parts instead
            uris = []
            for label in heading:
                uris.extend(label_to_yso_uris(label, source, voc, lang))

        for uri in uris:
            for newuri in replace_concept(uri):
                if check_concept(newuri):
                    subjects_out.add(newuri)

    return subjects_out

def print_title_with_subject_uris(title, subjects):
    """Print one tab-separated line: the title followed by its subject URIs.

    ``subjects`` is an iterable of ``(label, lang)`` pairs. The title is
    NFC-normalized and each URI is wrapped in angle brackets. Nothing is
    printed when no URI is found for any of the labels.
    """
    # NOTE(review): this function previously called label_to_yso_uri(label, lang),
    # which is not defined in this notebook. Routing the lookup through
    # label_to_yso_uris with the YSO source for the language is an assumption -
    # confirm that this is the intended vocabulary.
    sources_by_lang = {'fi': 'yso/fin', 'sv': 'yso/swe'}

    uris = []
    for label, lang in subjects:
        source = sources_by_lang.get(lang)
        if source is None:
            continue  # no vocabulary configured for this language
        uris.extend(label_to_yso_uris(label, source, yso, lang))

    if not uris:
        return  # no URIs found - omit the line

    # 'NFC' is the valid normalization form name; the former 'NDC' raised ValueError
    print(unicodedata.normalize('NFC', title) + '\t' + '\t'.join(
        '<' + uri + '>' for uri in uris
    ))


def main(ndjson_in, output):
    """Convert newline-delimited JSON records into tab-separated lines.

    Every line of ``ndjson_in`` is parsed as one JSON object. Records that
    lack a 'title' or a 'subjectsExtended' key, or whose subjects yield no
    URIs, are skipped. For the rest, the title (nimike) followed by the URIs
    of its subjects (aiheet) is written to ``output`` as one tab-separated
    line.
    """
    for raw_line in ndjson_in:
        record = json.loads(raw_line)
        if 'title' in record and 'subjectsExtended' in record:
            uris = get_subject_uris(record['subjectsExtended'])
            if uris:
                columns = '\t'.join(str(uri) for uri in uris)
                print(record['title'] + '\t' + columns, file=output)

In [7]:
import gzip
# Common filename prefix of the Finna dump; the conversion cells below append
# a language suffix and extension to it for both the input and output files.
FINNA_BASE = 'finna-all-2020-02'

In [8]:
%%time

# Convert the Swedish dump. Both files are opened as UTF-8 text explicitly:
# without an encoding argument gzip.open() uses the platform's locale encoding
# in text mode, which would make the result depend on the machine.
with gzip.open(FINNA_BASE + '-swe.ndjson.gz', 'rt', encoding='utf-8') as inputf, \
        gzip.open(FINNA_BASE + '-swe.tsv.gz', 'wt', encoding='utf-8') as outputf:
    main(inputf, outputf)

replacing http://www.yso.fi/onto/yso/p3936 with http://www.yso.fi/onto/yso/p38317 http://www.yso.fi/onto/yso/p38418 http://www.yso.fi/onto/yso/p38444
replacing http://www.yso.fi/onto/yso/p21011 with http://www.yso.fi/onto/yso/p1631
replacing http://www.yso.fi/onto/yso/p5559 with http://www.yso.fi/onto/yso/p485
replacing http://www.yso.fi/onto/yso/p23620 with http://www.yso.fi/onto/yso/p7500
replacing http://www.yso.fi/onto/yso/p14259 with http://www.yso.fi/onto/yso/p107968
replacing http://www.yso.fi/onto/yso/p14065 with http://www.yso.fi/onto/yso/p130575
replacing http://www.yso.fi/onto/yso/p5842 with http://www.yso.fi/onto/yso/p38653
replacing http://www.yso.fi/onto/yso/p21622 with http://www.yso.fi/onto/yso/p6895
replacing http://www.yso.fi/onto/yso/p12978 with http://www.yso.fi/onto/yso/p1804
missing yso/fin label 'toimittajat' found via ysa
replacing http://www.yso.fi/onto/yso/p7812 with http://www.yso.fi/onto/yso/p10515
replacing http://www.yso.fi/onto/yso/p26246 with http://www.

missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'kuolevaisuus' found via ysa
missing yso/fin label 'balladit' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
missing yso/fin label 'toimittajat' found via ysa
mi

In [None]:
%%time

# Convert the English dump. Both files are opened as UTF-8 text explicitly:
# without an encoding argument gzip.open() uses the platform's locale encoding
# in text mode, which would make the result depend on the machine.
with gzip.open(FINNA_BASE + '-eng.ndjson.gz', 'rt', encoding='utf-8') as inputf, \
        gzip.open(FINNA_BASE + '-eng.tsv.gz', 'wt', encoding='utf-8') as outputf:
    main(inputf, outputf)

In [None]:
%%time

# Convert the Finnish dump. Both files are opened as UTF-8 text explicitly:
# without an encoding argument gzip.open() uses the platform's locale encoding
# in text mode, which would make the result depend on the machine.
with gzip.open(FINNA_BASE + '-fin.ndjson.gz', 'rt', encoding='utf-8') as inputf, \
        gzip.open(FINNA_BASE + '-fin.tsv.gz', 'wt', encoding='utf-8') as outputf:
    main(inputf, outputf)

# Converting to the final format

    zcat finna-all-2020-02-swe.tsv.gz |shuf|gzip >yso-finna-sv.tsv.gz
    zcat finna-all-2020-02-eng.tsv.gz |shuf|gzip >yso-finna-en.tsv.gz
    
The Finnish file must additionally be split into parts of at most 2,000,000 lines each, which are then compressed separately:

    zcat finna-all-2020-02-fin.tsv.gz |shuf|split -l 2000000 --numeric-suffixes=1 --additional-suffix=.tsv - yso-finna-fi-
    gzip yso-finna-fi-*.tsv
   
