In [1]:
from rdflib import Graph, Namespace, Literal
from rdflib.namespace import SKOS
import requests

YSO = Namespace('http://www.yso.fi/onto/yso/')

In [68]:
# Load YSA, YSO and YSO Places - this may take a few minutes
# The parsing is done in parallel on multiple CPUs, to save some time

import time
starttime = time.time()
import multiprocessing

def load_graph(url):
    g = Graph()
    g.load(url)
    return g

urls = [
    'http://api.finto.fi/rest/v1/ysa/data',
    'http://api.finto.fi/rest/v1/yso/data',
    'http://api.finto.fi/rest/v1/yso-paikat/data'
]

# create a pool of processes and parse each vocabulary in a separate process
with multiprocessing.Pool(processes=len(urls)) as pool:
    ysa, yso, ysop = pool.map(load_graph, urls)

elapsed = time.time() - starttime
print("Time taken:", elapsed)

Time taken: 169.1254231929779


In [76]:
import os
import os.path


URLBASE = 'https://jyx2.jyu.fi'

LABELLANG = {
    'fin': 'fi',
    'swe': 'sv',
    'eng': 'en'
}

MIN_SUBJECTS = 3

def handle(doc):
    return doc['handle'].split('/')[-1] # use only the last part since the first part is always 123456789

def pdf_url(doc):
    for bitstream in doc['bitstreams']:
        if bitstream['bundleName'] == 'ORIGINAL' and bitstream['mimeType'] == 'application/pdf':
            return URLBASE + bitstream['retrieveLink']

def language(doc):
    for mdfld in doc['metadata']:
        if mdfld['element'] == 'language':
            return mdfld['value']

def ysa_subjects(doc):
    subjs = []
    for mdfld in doc['metadata']:
        if mdfld['key'] == 'dc.subject.ysa':
            subjs.append(mdfld['value'])
    return subjs

def ysalabel_to_ysauri(label):
    # prefLabel
    uri = ysa.value(None, SKOS.prefLabel, Literal(label, "fi"))
    if uri is not None:
        return uri
    # altLabel
    uri = ysa.value(None, SKOS.altLabel, Literal(label, "fi"))
    if uri is not None:
        return uri
    return None

def ysauri_to_ysouris(ysauri):
    if ysauri is None:
        return []
    return [uri for prop in (SKOS.closeMatch, SKOS.exactMatch)
                for uri in ysa.objects(ysauri, prop)
                if uri.startswith(YSO)]

def ysouri_label(uri, lang):
    for voc in (yso, ysop):
        labels = voc.preferredLabel(uri, lang=lang)
        if len(labels) > 0:
            return labels[0][1]

def to_yso_subjects(ysalabels, lang):
    yso_uris_labels = []
    for ysalabel in ysalabels:
        ysauri = ysalabel_to_ysauri(ysalabel)
        if ysauri is None:
            continue # not found in YSA, skip
        for ysouri in ysauri_to_ysouris(ysauri):
            ysolabel = ysouri_label(ysouri, lang)
            if ysolabel is not None:
                yso_uris_labels.append((ysouri, str(ysolabel)))
    return yso_uris_labels

def fetch_items(url):
    req = requests.get(url)
    data = req.json()
    return data['items']

def extract_doc_metadata(items):
    docs = []
    for doc in items:
        lang = language(doc)
        if lang not in LABELLANG:
            continue # skip doc if the language is not in YSO
        subjects = ysa_subjects(doc)
        yso_subjects = to_yso_subjects(ysa_subjects(doc), LABELLANG[lang])
        if len(yso_subjects) < MIN_SUBJECTS:
            continue # skip doc if there are not enough YSO subjects
        docs.append((handle(doc), lang, pdf_url(doc), yso_subjects))
    return docs

def store_doc(doc):
    hdl, lang, url, subjects = doc
    if not os.path.exists(lang):
        os.mkdir(lang)
    with open(os.path.join(lang, hdl + '.url'), 'w') as urlfile:
        print(url, file=urlfile)
    with open(os.path.join(lang, hdl + '.tsv'), 'w') as tsvfile:
        for uri, label in subjects:
            print("<%s>\t%s" % (uri, label), file=tsvfile)
    with open(os.path.join(lang, hdl + '.key'), 'w') as keyfile:
        for url, label in subjects:
            print(label, file=keyfile)


In [77]:
master_url = URLBASE + '/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=b657e8ab-2cd2-42c6-b680-085a802e0c1e&expand=parentCollection,metadata,bitstreams'
diss_url = URLBASE + '/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=a39f7178-a5ff-4242-8e6d-316be022727a&expand=parentCollection,metadata,bitstreams'

PAGESIZE=100

starttime = time.time()
total = 0
for year in range(2010, 2018):
    for name, urlpat in (("Master's", master_url), ("doctoral", diss_url)):
        for page in range(10):
            print("Fetching %s theses for year %d, page %d: " % (name, year, page), end='')
            url = urlpat % (PAGESIZE, page * PAGESIZE, year)
            items = fetch_items(url)
            docs = extract_doc_metadata(items)
            print("%d found, of which %d were stored." % (len(items), len(docs)))
            total += len(docs)
            for doc in docs:
                store_doc(doc)
            if len(items) < PAGESIZE:
                break # no more results expected

elapsed = time.time() - starttime
print("Total documents stored:", total, "elapsed time:", elapsed)

https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=b657e8ab-2cd2-42c6-b680-085a802e0c1e&expand=parentCollection,metadata,bitstreams
Fetching Master's theses for year 2010, page 0: 100 found, of which 92 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=b657e8ab-2cd2-42c6-b680-085a802e0c1e&expand=parentCollection,metadata,bitstreams
Fetching Master's theses for year 2010, page 1: 100 found, of which 90 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=b657e8ab-2cd2-42c6-b680-085a802e0c1e&expand=parentCollection,metadata,bitstreams
Fetching Master's theses for year 2010, page 2: 100 found, of which 94 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_

Fetching Master's theses for year 2013, page 0: 100 found, of which 93 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=b657e8ab-2cd2-42c6-b680-085a802e0c1e&expand=parentCollection,metadata,bitstreams
Fetching Master's theses for year 2013, page 1: 100 found, of which 87 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=b657e8ab-2cd2-42c6-b680-085a802e0c1e&expand=parentCollection,metadata,bitstreams
Fetching Master's theses for year 2013, page 2: 100 found, of which 92 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=b657e8ab-2cd2-42c6-b680-085a802e0c1e&expand=parentCollection,metadata,bitstreams
Fetching Master's theses for year 2013, page 3: 100 found, of which 95 were stored.
https://jyx2.jyu.fi/rest/fil

Fetching Master's theses for year 2015, page 5: 100 found, of which 82 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=b657e8ab-2cd2-42c6-b680-085a802e0c1e&expand=parentCollection,metadata,bitstreams
Fetching Master's theses for year 2015, page 6: 100 found, of which 84 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=b657e8ab-2cd2-42c6-b680-085a802e0c1e&expand=parentCollection,metadata,bitstreams
Fetching Master's theses for year 2015, page 7: 100 found, of which 95 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=b657e8ab-2cd2-42c6-b680-085a802e0c1e&expand=parentCollection,metadata,bitstreams
Fetching Master's theses for year 2015, page 8: 100 found, of which 91 were stored.
https://jyx2.jyu.fi/rest/fil

Fetching Master's theses for year 2017, page 9: 100 found, of which 90 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=a39f7178-a5ff-4242-8e6d-316be022727a&expand=parentCollection,metadata,bitstreams
Fetching doctoral theses for year 2017, page 0: 100 found, of which 95 were stored.
https://jyx2.jyu.fi/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=a39f7178-a5ff-4242-8e6d-316be022727a&expand=parentCollection,metadata,bitstreams
Fetching doctoral theses for year 2017, page 1: 32 found, of which 31 were stored.
Total documents stored: 7279 elapsed time: 2016.2948818206787
