Check that specific metadata values are actually represented in the publication PDFs


In [1]:
import glob
import json
import re

# Collect every record from all JSONL metadata files (one JSON object per line)
# into a single list.
metadata_files = sorted(glob.glob("../metadata/*.jsonl"))
records = []

for mdfile in metadata_files:
    # Decode explicitly as UTF-8 so that loading does not depend on the
    # platform's locale encoding (the records contain non-ASCII text).
    with open(mdfile, encoding="utf-8") as inf:
        for line in inf:
            if not line.strip():
                # tolerate blank lines, e.g. an extra newline at the end of a file
                continue
            records.append(json.loads(line))

def extract_number(s):
    """Return the first run of decimal digits found in ``s`` as an int.

    Raises IndexError when ``s`` contains no digits at all.
    """
    digit_runs = re.findall(r'\d+', s)
    first_run = digit_runs[0]
    return int(first_run)

# Order the records in place: first by document type, then by the number
# embedded in the rowid
records[:] = sorted(
    records,
    key=lambda r: (r['doctype'], extract_number(r['rowid'])),
)

print(len(records))

800


In [2]:
# convert already downloaded PDFs to text
#!for pdf in ../pdfs/*.pdf; do pdftotext $pdf; done

def id_to_txtfn(identifier):
    """Map a URI identifier to the path of its text file under ../pdfs/.

    Every 'https://' is removed and every '/' is turned into '_'; the result
    gets a '.txt' suffix.
    """
    stem = identifier.replace('https://', '')
    stem = stem.replace('/', '_')
    return f"../pdfs/{stem}.txt"

In [3]:
# Check for titles in metadata that don't match what's in the PDF document text

import re

def normalize_text(text):
    """Reduce ``text`` to its letters and digits, lowercased.

    Whitespace, punctuation and underscores are all removed, so differences in
    line breaks and punctuation do not prevent a substring match.
    """
    # \W alone leaves '_' in place because '_' counts as a word character;
    # list it explicitly so that really everything non-alphanumeric is stripped.
    return re.sub(r'[\W_]+', '', text, flags=re.UNICODE).lower()

def find_title_in_text(title, text):
    """Return True when every ' : '-separated part of ``title`` occurs in ``text``.

    Both the text and each title part are compared in the form produced by
    normalize_text.
    """
    haystack = normalize_text(text)
    needles = (normalize_text(piece) for piece in title.split(' : '))
    return all(needle in haystack for needle in needles)

nrec = 0  # number of records whose title was not found in the PDF text
for rec in records:
    fn = id_to_txtfn(rec['id'])
    # The text files come from pdftotext, whose default output encoding is
    # UTF-8. Decode explicitly so that non-ASCII titles compare correctly even
    # where the platform's locale encoding is something else.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    # records without a title in the ground truth are not checked
    title = rec["ground_truth"].get('title')
    if title and not find_title_in_text(title, text):
        print(f"{rec['rowid']}: title '{title}' NOT found in text {rec['url']}")
        nrec += 1

print(f"{nrec} records with title mismatches")

article16: title 'Arvopaperiostoja pääoma-avaimella' NOT found in text https://publications.bof.fi/bitstream/handle/10024/43381/30012015Valimaki.pdf
article66: title 'IMF ennustaa talouskasvua euroalueelle ja Suomelle' NOT found in text https://publications.bof.fi/bitstream/handle/10024/43364/15042015Toivanen.pdf
article78: title 'Kansalliset keskuspankit tuntevat oman maansa markkinapaikat' NOT found in text https://publications.bof.fi/bitstream/handle/10024/43379/02022015Hievanen.pdf
article127: title 'Pankkiongelmat leviäisivät Pohjoismaissa nopeasti ja laajasti' NOT found in text https://publications.bof.fi/bitstream/handle/10024/43405/24062014Koskinen.pdf
book50: title 'Kartta, kompassi & kalenteri : Projektiarvioinnin opas' NOT found in text https://www.theseus.fi/bitstream/handle/10024/344934/Arviointiopas_web.pdf
book78: title 'Orden speglar genren : Kvantitativ analys av lexikala mönster i Paul Sinebrychoffs korrespondens' NOT found in text https://www.theseus.fi/bitstream/han

In [4]:
# Check for creators in metadata that are not mentioned in the PDF document text

import regex

def find_creator_in_text(creator, text):
    """Return True when every comma-separated part of ``creator`` occurs in ``text``.

    The comparison is lowercase on both sides, and any run of whitespace in
    ``text`` counts as a single space.
    """
    # collapse line breaks, tabs and repeated spaces into single spaces
    haystack = " ".join(text.lower().split())
    name_parts = [piece.strip() for piece in creator.lower().split(',')]
    return all(piece in haystack for piece in name_parts)

nrec = 0  # number of records with at least one creator missing from the PDF text
for rec in records:
    record_ok = True

    fn = id_to_txtfn(rec['id'])
    # The text files come from pdftotext, whose default output encoding is
    # UTF-8. Decode explicitly so that non-ASCII names compare correctly even
    # where the platform's locale encoding is something else.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    # a missing or empty creator list means there is nothing to check
    creators = rec["ground_truth"].get('creator') or []
    for creator in creators:
        if not find_creator_in_text(creator, text):
            print(f"{rec['rowid']}: creator '{creator}' NOT found in text {rec['url']}")
            record_ok = False

    # count the record once, however many of its creators are missing
    if not record_ok:
        nrec += 1

print(f"{nrec} records with creator mismatches")

article16: creator 'Välimäki, Tuomas' NOT found in text https://publications.bof.fi/bitstream/handle/10024/43381/30012015Valimaki.pdf
article66: creator 'Toivanen, Mervi' NOT found in text https://publications.bof.fi/bitstream/handle/10024/43364/15042015Toivanen.pdf
article78: creator 'Hievanen, Laura' NOT found in text https://publications.bof.fi/bitstream/handle/10024/43379/02022015Hievanen.pdf
article113: creator 'Jouhki, Maija-Riitta' NOT found in text https://www.theseus.fi/bitstream/handle/10024/348965/Niko_-97-hanke.pdf
article113: creator 'Oksanen, Jukka' NOT found in text https://www.theseus.fi/bitstream/handle/10024/348965/Niko_-97-hanke.pdf
article113: creator 'Sinisaari-Eskelinen, Maarit' NOT found in text https://www.theseus.fi/bitstream/handle/10024/348965/Niko_-97-hanke.pdf
article120: creator 'Pajula, Carolina' NOT found in text https://www.theseus.fi/bitstream/handle/10024/304336/Opiskelija_keikkatyon_kiemuroissa.pdf
article127: creator 'Koskinen, Kimmo' NOT found in t

In [5]:
# Check for publication years in metadata that are not mentioned in the PDF document text

def find_year_in_text(year, text):
    """Search ``text`` for ``year`` delimited by word boundaries on both sides.

    Returns the re.Match of the first occurrence, or None when there is none.
    """
    year_re = re.compile(rf"\b{year}\b")
    return year_re.search(text)

nrec = 0  # number of records whose publication year was not found in the PDF text
for rec in records:
    fn = id_to_txtfn(rec['id'])
    # The text files come from pdftotext, whose default output encoding is
    # UTF-8. Decode explicitly so that reading does not depend on the
    # platform's locale encoding.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    # records without a year in the ground truth are not checked
    year = rec["ground_truth"].get('year')
    if year and not find_year_in_text(year, text):
        print(f"{rec['rowid']}: year {year} not found in text {rec['url']}")
        nrec += 1

print(f"{nrec} records with year mismatches")

article78: year 2015 not found in text https://publications.bof.fi/bitstream/handle/10024/43379/02022015Hievanen.pdf
article146: year 2017 not found in text https://taju.uniarts.fi/bitstream/handle/10024/5991/Gröndahl_SpeakingaboutReality.pdf
2 records with year mismatches


In [6]:
# Check for publishers in metadata that are not mentioned in the PDF document text

import regex

def normalize_hyphens(text):
    """Return ``text`` with every Unicode dash punctuation character replaced by ASCII '-'."""
    # \p{Pd} is the Unicode "Punctuation, dash" category; property escapes
    # need the third-party `regex` module, the standard `re` has none
    return regex.sub(r'\p{Pd}', '-', text)

def find_publisher_in_text(publisher, text):
    """Return True when every comma-separated part of ``publisher`` occurs in ``text``.

    Both sides are lowercased and have their dash characters normalized to
    ASCII '-'; any run of whitespace in ``text`` counts as a single space.
    """
    haystack = " ".join(normalize_hyphens(text).lower().split())

    # The publisher name needs the same dash normalization as the text:
    # otherwise a name containing e.g. an en dash could never match, because
    # every dash in the text has already been rewritten to '-'.
    needle = normalize_hyphens(publisher).lower()

    return all(part.strip() in haystack for part in needle.split(','))

nrec = 0  # number of records with at least one publisher missing from the PDF text
for rec in records:
    record_ok = True

    fn = id_to_txtfn(rec['id'])
    # The text files come from pdftotext, whose default output encoding is
    # UTF-8. Decode explicitly so that non-ASCII publisher names compare
    # correctly even where the platform's locale encoding is something else.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    # a missing or empty publisher list means there is nothing to check
    publishers = rec["ground_truth"].get('publisher') or []
    for publisher in publishers:
        if not find_publisher_in_text(publisher, text):
            print(f"{rec['rowid']}: publisher '{publisher}' NOT found in text {rec['url']}")
            record_ok = False

    # count the record once, however many of its publishers are missing
    if not record_ok:
        nrec += 1

print(f"{nrec} records with publisher mismatches")

article16: publisher 'Suomen Pankki' NOT found in text https://publications.bof.fi/bitstream/handle/10024/43381/30012015Valimaki.pdf
article174: publisher 'Finanssivalvonta' NOT found in text https://publications.bof.fi/bitstream/handle/10024/46291/Fiva_blogi_Vaalitulos-ja-ilmastonmuutos.pdf
report42: publisher 'Kansanterveyden neuvottelukunta' NOT found in text https://www.julkari.fi/bitstream/handle/10024/141035/KTNK Hyvinvoinnin terveyden ja turvallisuuden edistäminen infograafit.pdf
report89: publisher 'Terveyden ja hyvinvoinnin laitos' NOT found in text https://www.julkari.fi/bitstream/handle/10024/140770/TEAviisari2020_Etelä-Savon-maakunta.pdf
report90: publisher 'Terveyden ja hyvinvoinnin laitos' NOT found in text https://www.julkari.fi/bitstream/handle/10024/140775/TEAviisari2020_Kymenlaakso.pdf
report91: publisher 'Terveyden ja hyvinvoinnin laitos' NOT found in text https://www.julkari.fi/bitstream/handle/10024/140779/TEAviisari2020_Pohjois-Karjala.pdf
report97: publisher 'Ty

In [None]:
# Check for DOIs in metadata that are not given in the PDF document text

def normalize_text_doi(text):
    """Drop every character that is not an ASCII letter, a digit or one of / . _ : -

    Whitespace (including line breaks) is among the characters removed.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9/._:-]')
    return disallowed.sub('', text)

def find_doi_in_text(doi, text):
    """Return True when ``doi`` occurs in ``text``, ignoring letter case.

    DOI names are case-insensitive, so "10.1000/ABC" and "10.1000/abc"
    identify the same object and must count as a match.
    """
    return doi.lower() in text.lower()

nrec = 0  # number of records whose DOI was not found in the PDF text
for rec in records:
    fn = id_to_txtfn(rec['id'])
    # The text files come from pdftotext, whose default output encoding is
    # UTF-8. Decode explicitly so that reading does not depend on the
    # platform's locale encoding.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    # drop whitespace and other non-DOI characters so that a DOI broken across
    # lines in the PDF can still be found
    text = normalize_text_doi(text)

    # records without a DOI in the ground truth are not checked
    doi = rec["ground_truth"].get('doi')
    if doi and not find_doi_in_text(doi, text):
        print(f"{rec['rowid']}: DOI {doi} not found in text {rec['url']}")
        nrec += 1

print(f"{nrec} records with DOI mismatches")

book28: DOI 10.1016/B978-0-12-817792-1.00014-9 not found in text https://osuva.uwasa.fi/bitstream/handle/10024/11551/Osuva_Björk__Kauppinen-Räisänen_2020.pdf


In [None]:
# Check for ISBNs in metadata that are not given in the PDF document text

def normalize_text_alnum(text):
    """Return ``text`` with everything except letters and digits removed (case is kept)."""
    # \W alone leaves '_' in place because '_' counts as a word character;
    # list it explicitly so that really everything non-alphanumeric is stripped.
    return re.sub(r'[\W_]+', '', text, flags=re.UNICODE)

def find_isbn_in_text(isbn, text):
    """Return True when ``isbn``, reduced by normalize_text_alnum, is a substring of ``text``.

    ``text`` is used as given; only the ISBN is normalized here.
    """
    bare_isbn = normalize_text_alnum(isbn)
    return bare_isbn in text

def is_valid_isbn13(isbn):
    """Return True when ``isbn`` is a 13-digit string with a correct ISBN-13 checksum.

    The digits are weighted 1, 3, 1, 3, ... from the left; the weighted sum
    must be a multiple of 10.
    """
    if not (len(isbn) == 13 and isbn.isdigit()):
        return False

    checksum = sum(
        int(ch) * (3 if pos % 2 else 1)
        for pos, ch in enumerate(isbn)
    )
    return checksum % 10 == 0

def find_possible_isbns(text):
    """Return the 13-digit runs in ``text`` that look like ISBN-13s and pass the checksum.

    Matches are non-overlapping and returned in order of appearance;
    duplicates are kept.
    """
    # ISBN-13 uses the prefixes 978 and 979. 979-0 is excluded because that
    # range is reserved for ISMNs (music numbers), which share the checksum.
    isbn_pattern = r'97(?:8\d|9[1-9])\d{9}'
    possible_isbns = re.findall(isbn_pattern, text)
    return [isbn for isbn in possible_isbns if is_valid_isbn13(isbn)]

nrec = 0  # number of records with at least one ISBN discrepancy
for rec in records:
    record_ok = True

    fn = id_to_txtfn(rec['id'])
    # The text files come from pdftotext, whose default output encoding is
    # UTF-8. Decode explicitly so that reading does not depend on the
    # platform's locale encoding.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    # strip separators so that ISBNs match however they are hyphenated or spaced
    text = normalize_text_alnum(text)

    pisbns = rec["ground_truth"].get('p-isbn') or []
    for pisbn in pisbns:
        if not find_isbn_in_text(pisbn, text):
            print(f"{rec['rowid']}: p-isbn {pisbn} not found in text {rec['url']}")
            record_ok = False

    eisbns = rec["ground_truth"].get('e-isbn') or []
    for eisbn in eisbns:
        if not find_isbn_in_text(eisbn, text):
            print(f"{rec['rowid']}: e-isbn {eisbn} not found in text {rec['url']}")
            record_ok = False

    # check for ISBNs found in the PDF that are not in the metadata
    # (only done when the metadata lists fewer than two ISBNs)
    known_isbns = pisbns + eisbns
    if len(known_isbns) < 2:
        # ISBNs found in the text are bare digit strings, so the metadata
        # values must be stripped the same way before comparing; otherwise a
        # hyphenated metadata ISBN would always be reported as missing.
        known_bare = {normalize_text_alnum(isbn) for isbn in known_isbns}
        for isbn in sorted(set(find_possible_isbns(text))):
            if isbn not in known_bare:
                print(f"{rec['rowid']}: ISBN {isbn} not in metadata: {known_isbns} {rec['url']}")
                record_ok = False

    # count the record once, however many discrepancies it has
    if not record_ok:
        nrec += 1

print(f"{nrec} records with ISBN mismatches")

In [None]:
# Check for ISSNs in metadata that are not given in the PDF document text

def find_issn_in_text(issn, text):
    """Search ``text`` for ``issn``, allowing a space, any of several dashes or nothing between its halves.

    ``issn`` is split as its first four characters and everything from index 5
    on (i.e. it is assumed to look like "NNNN-NNNC").
    Returns the re.Match of the first occurrence, or None when there is none.
    """
    # The '-' inside the class is escaped on purpose: unescaped, " -‐" is read
    # as a character RANGE from space to U+2010, which accepts nearly every
    # letter and digit as a "separator".
    pattern = rf"{re.escape(issn[:4])}[ \-‐‑—‒–−]?{re.escape(issn[5:])}"
    return re.search(pattern, text)

nrec = 0  # number of records with at least one ISSN missing from the PDF text
for rec in records:
    record_ok = True

    fn = id_to_txtfn(rec['id'])
    # The text files come from pdftotext, whose default output encoding is
    # UTF-8. Decode explicitly so that the non-ASCII dash characters ISSNs may
    # be printed with are read correctly on every platform.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    # the print and the electronic ISSN are checked the same way, in this order
    for key, label in (('p-issn', 'pissn'), ('e-issn', 'eissn')):
        issn = rec["ground_truth"].get(key)
        if issn and not find_issn_in_text(issn, text):
            print(f"{rec['rowid']}: {label} {issn} not found in text {rec['url']}")
            # print one line per "ISSN" mention (any case) found in the text
            for match in re.findall(r"ISSN", text, re.I):
                print(match)
            record_ok = False

    # count the record once, even when both ISSNs are missing
    if not record_ok:
        nrec += 1

print(f"{nrec} records with ISSN mismatches")