In [1]:
from rdflib import Graph, Namespace, Literal
from rdflib.namespace import SKOS
import requests

# Namespace under which YSO concept URIs live; it serves as the URI prefix
# when picking the YSO targets out of the YSA mapping relations.
YSO = Namespace('http://www.yso.fi/onto/yso/')

In [2]:
# Fetch and parse the three vocabularies: YSA, YSO and YSO Places.
# This can take a few minutes, so each vocabulary is parsed in its own
# worker process and the whole step is timed.

import multiprocessing
import time

starttime = time.time()

def load_graph(url):
    """Download and parse one RDF vocabulary into a new rdflib Graph.

    Args:
        url: address of the RDF data to load.

    Returns:
        A Graph holding the triples parsed from `url`.
    """
    g = Graph()
    # Graph.load() was only a deprecated alias of parse() whose format
    # defaulted to "xml", and it is no longer available in current rdflib.
    # Calling parse() with that format spelled out keeps the old behaviour
    # and works on both old and new rdflib versions.
    g.parse(url, format='xml')
    return g

urls = [
    'http://api.finto.fi/rest/v1/ysa/data',
    'http://api.finto.fi/rest/v1/yso/data',
    'http://api.finto.fi/rest/v1/yso-paikat/data'
]

# One worker process per vocabulary. map() returns the parsed graphs in the
# same order as `urls`, so they can be unpacked positionally afterwards.
with multiprocessing.Pool(processes=len(urls)) as workers:
    graphs = workers.map(load_graph, urls)
ysa, yso, ysop = graphs

elapsed = time.time() - starttime
print("Time taken:", elapsed)

Time taken: 88.10392665863037


In [4]:
URLBASE = 'https://jyx2.jyu.fi'


def find_collection_uuid(name):
    """Return the UUID of the repository collection called `name`.

    The collection name is POSTed as UTF-8 bytes to the find-collection
    endpoint and the 'uuid' field of the JSON response is returned.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that a failed lookup is reported as such instead of surfacing
            later as a JSON decoding error or a missing key.
    """
    response = requests.post(URLBASE + '/rest/collections/find-collection',
                             data=name.encode('UTF-8'))
    response.raise_for_status()
    return response.json()['uuid']


doctoral_uuid = find_collection_uuid('Väitöskirjat')
masters_uuid = find_collection_uuid('Pro gradu -tutkielmat')
print(doctoral_uuid, masters_uuid)

ac0b9293-18ab-4eee-ab0b-ddee87fb1a1c a5ae67f3-9d8f-4458-af1d-80a1b8d3c48d


In [13]:
import os
import os.path

# Maps the three-letter language code found in a document's metadata to the
# two-letter language tag used when looking up YSO labels. Documents whose
# language is not listed here are skipped.
LABELLANG = {'fin': 'fi', 'swe': 'sv', 'eng': 'en'}

# A document is kept only if at least this many of its subjects could be
# mapped to YSO.
MIN_SUBJECTS = 3

def handle(doc):
    """Return the value stored under the 'handle' key of `doc`."""
    value = doc['handle']
    return value

def doc_id(doc):
    """Return the part of the document's handle that follows the last '/'.

    Only the last part is used since the first part is always 123456789.
    """
    _prefix, _slash, last_part = handle(doc).rpartition('/')
    return last_part

def pdf_url(doc):
    """Return the download URL of the document's first original PDF bitstream.

    A bitstream qualifies when its bundle is 'ORIGINAL' and its MIME type is
    'application/pdf'. Returns None when no bitstream qualifies.
    """
    for stream in doc['bitstreams']:
        if stream['bundleName'] != 'ORIGINAL':
            continue
        if stream['mimeType'] != 'application/pdf':
            continue
        prefix = URLBASE + '/dspace/bitstream/handle/' + handle(doc)
        return prefix + '/' + stream['name']
    return None

def issued(doc):
    """Return the value of the first 'dc.date.issued' metadata field of `doc`.

    Returns None when the document has no such field.
    """
    values = (field['value'] for field in doc['metadata']
              if field['key'] == 'dc.date.issued')
    return next(values, None)

def language(doc):
    """Return the value of the first metadata field whose element is 'language'.

    Returns None when the document has no such field.
    """
    for entry in doc['metadata']:
        if entry['element'] != 'language':
            continue
        return entry['value']
    return None

def ysa_subjects(doc):
    """Return the values of all 'dc.subject.ysa' metadata fields of `doc`.

    The values are returned as a list, in the order they occur in the
    metadata; the list is empty when there are none.
    """
    return [field['value'] for field in doc['metadata']
            if field['key'] == 'dc.subject.ysa']

def ysalabel_to_ysauri(label):
    """Find the YSA concept that carries `label` as a Finnish label.

    The preferred labels are consulted first and the alternative labels
    second. Returns the concept found, or None when neither lookup matches.
    """
    for label_property in (SKOS.prefLabel, SKOS.altLabel):
        concept = ysa.value(None, label_property, Literal(label, "fi"))
        if concept is not None:
            return concept
    return None

def ysauri_to_ysouris(ysauri):
    """List the YSO URIs that the YSA concept `ysauri` is mapped to.

    Both closeMatch and exactMatch relations are followed (in that order) and
    only targets that start with the YSO namespace are kept. Returns an empty
    list when `ysauri` is None.
    """
    targets = []
    if ysauri is not None:
        for relation in (SKOS.closeMatch, SKOS.exactMatch):
            for target in ysa.objects(ysauri, relation):
                if target.startswith(YSO):
                    targets.append(target)
    return targets

def ysouri_label(uri, lang):
    """Return the preferred label of `uri` in language `lang`, or None.

    YSO is searched first and YSO Places second. Within each vocabulary the
    skos:prefLabel values are tried before the rdfs:label values, and the
    first label whose language tag matches is returned.

    Graph.preferredLabel(), which used to do this lookup, has been removed
    from current rdflib, so the same search is spelled out with
    Graph.objects(), which is available in old and new versions alike.

    Args:
        uri: the concept to look up.
        lang: language tag the label must carry. As with preferredLabel(),
            None accepts any label and "" accepts only untagged labels.

    Returns:
        The matching label, or None if neither vocabulary has one.
    """
    from rdflib.namespace import RDFS  # only needed for this lookup

    def language_matches(label):
        tag = getattr(label, 'language', None)
        if lang is None:
            return True
        if lang == "":
            return tag is None
        return tag == lang

    for voc in (yso, ysop):
        for label_property in (SKOS.prefLabel, RDFS.label):
            for label in voc.objects(uri, label_property):
                if language_matches(label):
                    return label
    return None

def to_yso_subjects(ysalabels, lang):
    """Translate YSA subject labels into (YSO URI, YSO label) pairs.

    Each label is resolved to a YSA concept, the concept to its mapped YSO
    URIs, and each YSO URI to its label in `lang`. Labels that are not found
    in YSA and YSO URIs without a label in `lang` are left out.

    Returns:
        A list of (uri, label) tuples with the label converted to str.
    """
    pairs = []
    for subject_label in ysalabels:
        concept = ysalabel_to_ysauri(subject_label)
        if concept is not None:  # labels not found in YSA are skipped
            for target in ysauri_to_ysouris(concept):
                target_label = ysouri_label(target, lang)
                if target_label is not None:
                    pairs.append((target, str(target_label)))
    return pairs

def fetch_items(url):
    """Fetch `url` and return the 'items' entry of its JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that a failed request is reported as such rather than as a JSON
            decoding error or a missing 'items' key.
    """
    response = requests.get(url)
    response.raise_for_status()
    return response.json()['items']

def extract_doc_metadata(items):
    """Select the usable documents among `items` and extract their metadata.

    A document is kept only if its language is one of the LABELLANG keys, at
    least MIN_SUBJECTS of its YSA subjects could be mapped to YSO, and it has
    a PDF URL.

    Returns:
        A list of (doc_id, language, issued, pdf_url, yso_subjects) tuples,
        where yso_subjects is a list of (uri, label) pairs.
    """
    docs = []
    for doc in items:
        lang = language(doc)
        if lang not in LABELLANG:
            continue # skip doc if the language is not in YSO
        # Extract the YSA subjects once and reuse them for the mapping.
        subjects = ysa_subjects(doc)
        yso_subjects = to_yso_subjects(subjects, LABELLANG[lang])
        if len(yso_subjects) < MIN_SUBJECTS:
            continue # skip doc if there are not enough YSO subjects
        pdfurl = pdf_url(doc)
        if pdfurl is None:
            continue # skip if we didn't find a PDF URL
        docs.append((doc_id(doc), lang, issued(doc), pdfurl, yso_subjects))
    return docs

def store_doc(doctype, doc):
    """Write the PDF URL and the YSO subjects of one document to disk.

    Two files named "<issued>-<doctype>-<id>" are written into a directory
    named after the document language, relative to the current working
    directory: a '.url' file holding the PDF URL, and a '.tsv' file with one
    tab-separated "<uri>" and label line per subject.

    Args:
        doctype: short document type marker used in the file name.
        doc: tuple of (id, language, issued, url, subjects), where subjects
            is an iterable of (uri, label) pairs.
    """
    ident, lang, iss, url, subjects = doc
    # exist_ok avoids the check-then-create race and repeated existence tests.
    os.makedirs(lang, exist_ok=True)
    filenamebase = "%s-%s-%s" % (iss, doctype, ident)
    # Labels contain non-ASCII characters, so the encoding is fixed to UTF-8
    # instead of depending on the platform's default.
    with open(os.path.join(lang, filenamebase + '.url'), 'w', encoding='utf-8') as urlfile:
        print(url, file=urlfile)
    with open(os.path.join(lang, filenamebase + '.tsv'), 'w', encoding='utf-8') as tsvfile:
        for uri, label in subjects:
            print("<%s>\t%s" % (uri, label), file=tsvfile)


In [14]:
# Query URL template; the placeholders are, in order: page size, result
# offset, issue year and collection UUID.
urlpat = URLBASE + '/rest/filtered-items?limit=%d&offset=%d&query_field[]=dc.date.issued&query_op[]=equals&query_val[]=%d&collSel[]=%s&expand=parentCollection,metadata,bitstreams'

PAGESIZE=100
MAX_PAGES = 20  # at most this many result pages are requested per year and collection

starttime = time.time()
total = 0
for year in range(2010, 2018):
    for name, uuid in (("Master's", masters_uuid), ("Doctoral", doctoral_uuid)):
        doctype = name[0] # "M" or "D"
        for page in range(MAX_PAGES):
            print("Fetching %s theses for year %d, page %d: " % (name, year, page), end='')
            offset = page * PAGESIZE
            url = urlpat % (PAGESIZE, offset, year, uuid)
            items = fetch_items(url)
            docs = extract_doc_metadata(items)
            print("%d found, of which %d were stored." % (len(items), len(docs)))
            total += len(docs)
            for doc in docs:
                store_doc(doctype, doc)
            # a page that is not full means there are no more results
            if len(items) < PAGESIZE:
                break

elapsed = time.time() - starttime
print("Total documents stored:", total, "elapsed time:", elapsed)

Fetching Master's theses for year 2010, page 0: 100 found, of which 76 were stored.
Fetching Master's theses for year 2010, page 1: 100 found, of which 77 were stored.
Fetching Master's theses for year 2010, page 2: 100 found, of which 83 were stored.
Fetching Master's theses for year 2010, page 3: 100 found, of which 74 were stored.
Fetching Master's theses for year 2010, page 4: 100 found, of which 79 were stored.
Fetching Master's theses for year 2010, page 5: 98 found, of which 80 were stored.
Fetching Doctoral theses for year 2010, page 0: 100 found, of which 86 were stored.
Fetching Doctoral theses for year 2010, page 1: 12 found, of which 8 were stored.
Fetching Master's theses for year 2011, page 0: 100 found, of which 74 were stored.
Fetching Master's theses for year 2011, page 1: 100 found, of which 88 were stored.
Fetching Master's theses for year 2011, page 2: 100 found, of which 83 were stored.
Fetching Master's theses for year 2011, page 3: 100 found, of which 75 were sto