Check that specific metadata values are actually represented in the text of the publication PDFs


In [1]:
import glob
import json
import re

# Load every record from all JSON Lines metadata files. The file list is
# sorted so that the load order does not depend on directory listing order.
metadata_files = sorted(glob.glob("../metadata/*.jsonl"))
records = []

for mdfile in metadata_files:
    # JSON text is UTF-8; decode explicitly instead of relying on the
    # platform's default encoding.
    with open(mdfile, encoding="utf-8") as inf:
        for line in inf:
            if not line.strip():
                continue  # tolerate blank lines, which json.loads would reject
            records.append(json.loads(line))

def extract_number(s):
    """Return the first run of decimal digits found in ``s`` as an ``int``.

    Args:
        s: A string expected to contain at least one digit, e.g. a row id
            such as ``"article16"``.

    Returns:
        The integer value of the first contiguous digit sequence in ``s``.

    Raises:
        ValueError: If ``s`` contains no digits at all.
    """
    # Only the first digit run is needed, so search() suffices; there is no
    # need to collect every match with findall().
    match = re.search(r'\d+', s)
    if match is None:
        raise ValueError(f"no digits found in {s!r}")
    return int(match.group())

# Order the records by document type first and, within a type, by the
# integer embedded in the row id (numeric rather than lexicographic order).
def _doctype_and_number(rec):
    """Sort key: (doctype, numeric part of rowid)."""
    return rec['doctype'], extract_number(rec['rowid'])

records.sort(key=_doctype_and_number)

print(len(records))

800


In [2]:
# convert already downloaded PDFs to text
#!for pdf in ../pdfs/*.pdf; do pdftotext $pdf; done

def id_to_txtfn(identifier):
    """Map a URI identifier to the path of its extracted-text file.

    The ``https://`` prefix is dropped and every ``/`` is replaced by ``_``;
    the result is placed under ``../pdfs/`` with a ``.txt`` extension.
    """
    stem = identifier.replace('https://', '')
    stem = stem.replace('/', '_')
    return f"../pdfs/{stem}.txt"

In [3]:
# Check for ISSNs in metadata that are not given in the PDF document text

def find_issn_in_text(issn, text):
    """Search ``text`` for ``issn``, tolerating different separator characters.

    Args:
        issn: The ISSN from the metadata. The first four characters and the
            characters from index 5 onwards are used as the two halves; the
            character at index 4 (the separator) is ignored.
        text: The document text to search.

    Returns:
        The ``re.Match`` object of the first occurrence, or ``None`` if the
        ISSN does not occur in ``text``.
    """
    # Optional separator between the halves: space, ASCII hyphen-minus,
    # U+2010 hyphen, U+2011 non-breaking hyphen, U+2012 figure dash,
    # U+2013 en dash, U+2014 em dash, U+2212 minus sign.
    # The hyphen-minus MUST be escaped: written unescaped between the space
    # and U+2010 it forms the range U+0020..U+2010, which makes the class
    # match almost any character (all ASCII letters and digits included).
    separator = r"[ \-\u2010\u2011\u2012\u2013\u2014\u2212]?"
    # Escape the halves so the metadata value is always matched literally.
    pattern = re.escape(issn[:4]) + separator + re.escape(issn[5:])
    return re.search(pattern, text)

# Report every record whose print ISSN or electronic ISSN (taken from the
# metadata) cannot be found in the text extracted from its PDF.
# Each entry is (label used in the report, metadata field name).
ISSN_FIELDS = (("pissn", 'dc.relation.pissn'), ("eissn", 'dc.relation.eissn'))

nrec = 0  # number of records with at least one ISSN missing from the text
for rec in records:
    record_ok = True

    fn = id_to_txtfn(rec['id'])
    # Decode explicitly as UTF-8 (pdftotext's default output encoding) so the
    # Unicode dash variants are read correctly regardless of the locale.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    for label, field in ISSN_FIELDS:
        issn = rec.get(field)
        # Nothing to verify when the field is absent/empty; nothing to report
        # when the ISSN is present in the text.
        if not issn or find_issn_in_text(issn, text):
            continue
        print(f"{rec['rowid']}: {label} {issn} not found in text {rec['url']}")
        # Diagnostic aid: echo every (case-insensitive) occurrence of the
        # word "ISSN", showing whether the document mentions an ISSN at all.
        for match in re.findall(r"ISSN", text, re.I):
            print(match)
        record_ok = False

    if not record_ok:
        nrec += 1

print(f"{nrec} records with ISSN mismatches")

0 records with ISSN mismatches


In [4]:
# Check for publishers in metadata that are not mentioned in the PDF document text

import regex

def normalize_hyphens(text):
    """Return ``text`` with every Unicode dash punctuation character
    (general category ``Pd``) replaced by an ASCII hyphen-minus."""
    # \p{Pd} is a Unicode property escape provided by the third-party
    # `regex` module.
    dash_pattern = regex.compile(r'\p{Pd}')
    return dash_pattern.sub('-', text)

def find_publisher_in_text(publisher, text):
    """Check whether every comma-separated part of ``publisher`` occurs in ``text``.

    Both strings are normalized the same way before comparison: dash
    variants become ``-``, letters are lowercased, and every run of
    whitespace collapses to a single space. Previously only ``text`` was
    dash- and whitespace-normalized, so a publisher name containing e.g. an
    en dash or a doubled space could never match.

    Args:
        publisher: Publisher name from the metadata; commas separate parts
            that are looked up independently.
        text: The document text to search.

    Returns:
        ``True`` if all parts are found as substrings of the normalized
        text, ``False`` otherwise.
    """
    def _normalize(value):
        # Unify dashes, fold case, then collapse all whitespace to single spaces.
        return " ".join(normalize_hyphens(value).lower().split())

    haystack = _normalize(text)
    parts = (part.strip() for part in _normalize(publisher).split(','))
    return all(part in haystack for part in parts)

def is_inferred_value(val):
    """Return True when ``val`` is wrapped in square brackets, i.e. it begins
    with ``[`` and ends with ``]``."""
    if not val.startswith('['):
        return False
    return val.endswith(']')

# Compare each record's publisher metadata with the text of its PDF:
#   * a plain publisher value should be found in the text;
#   * a bracketed value (treated as inferred, see is_inferred_value) should
#     NOT be found in the text; if it is, the brackets are questionable.
nrec = 0  # number of records with at least one publisher discrepancy
for rec in records:
    record_ok = True

    fn = id_to_txtfn(rec['id'])
    # Decode explicitly as UTF-8 (pdftotext's default output encoding)
    # instead of relying on the platform's default encoding.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    # A record without 'dc.publisher' (or with a null value) has nothing to
    # check; iterating over None would raise TypeError.
    publishers = rec.get('dc.publisher') or []
    for publisher in publishers:
        if is_inferred_value(publisher):
            # Strip the surrounding brackets before searching the text.
            if find_publisher_in_text(publisher[1:-1], text):
                print(f"{rec['rowid']}: inferred publisher {publisher} WAS found in text {rec['url']}")
                record_ok = False
        else:
            if not find_publisher_in_text(publisher, text):
                print(f"{rec['rowid']}: publisher '{publisher}' NOT found in text {rec['url']}")
                record_ok = False

    if not record_ok:
        nrec += 1

print(f"{nrec} records with publisher mismatches")

article16: publisher 'Suomen Pankki' NOT found in text https://publications.bof.fi/bitstream/handle/10024/43381/30012015Valimaki.pdf
article38: inferred publisher [The Donner Institute] WAS found in text https://www.doria.fi/bitstream/handle/10024/184051/114539-Article Text-224982-1-10-20220228 (1) editorial.pdf
article39: inferred publisher [The Donner Institute] WAS found in text https://www.doria.fi/bitstream/handle/10024/180209/editorial.pdf
article40: inferred publisher [The Donner Institute] WAS found in text https://www.doria.fi/bitstream/handle/10024/185537/117088-Article Text-240731-1-10-20220626 (1) editorial.pdf
article42: inferred publisher [Tutkijaliitto] WAS found in text https://taju.uniarts.fi/bitstream/handle/10024/6009/AA-ME-rinnakkaistallennettavaksi.pdf
article125: inferred publisher [Kansanvalistusseura] WAS found in text https://taju.uniarts.fi/bitstream/handle/10024/5998/Laes_Rautiainen_Osallistuminentaiteeseenjakulttuuriin.pdf
article126: inferred publisher [Tut