Check that specific metadata values actually appear in the text of each publication's PDF


In [1]:
import glob
import json
import re

# Load every record from the JSON Lines metadata files.
# Files are visited in sorted order so the load order is reproducible.
metadata_files = sorted(glob.glob("../metadata/*.jsonl"))
records = []

for mdfile in metadata_files:
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(mdfile, encoding="utf-8") as inf:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
        records.extend(json.loads(line) for line in inf if line.strip())

def extract_number(s):
    """Return the first run of decimal digits found in ``s`` as an ``int``.

    Raises ``IndexError`` when ``s`` contains no digits at all.
    """
    digit_runs = re.findall(r'\d+', s)
    first_run = digit_runs[0]
    return int(first_run)

# Order the records by document type first, then by the number embedded in the row id
def _doctype_then_rownum(record):
    """Sort key: (doctype, numeric part of rowid)."""
    return record['doctype'], extract_number(record['rowid'])

records.sort(key=_doctype_then_rownum)

print(len(records))

800


In [2]:
# convert already downloaded PDFs to text
#!for pdf in ../pdfs/*.pdf; do pdftotext $pdf; done

def id_to_txtfn(identifier):
    """Map a URI identifier to the path of its text file under ``../pdfs/``.

    Every ``https://`` occurrence is removed and every ``/`` becomes ``_``; the
    result is wrapped in the ``../pdfs/`` prefix and a ``.txt`` suffix.
    """
    stem = identifier.replace('https://', '')
    stem = stem.replace('/', '_')
    return f"../pdfs/{stem}.txt"

In [3]:
# Check for ISSNs in metadata that are not given in the PDF document text

def find_issn_in_text(issn, text):
    """Search ``text`` for ``issn``, tolerating different separator characters.

    The first four characters of ``issn`` and everything after its fifth
    character are matched literally, with at most one space or dash-like
    character allowed between them (so the fifth character of ``issn`` is
    treated as the separator position).

    Returns the ``re.Match`` of the first occurrence, or ``None``.
    """
    # The ASCII hyphen is escaped on purpose: written unescaped right after the
    # space, it turns " -<U+2010>" into a character *range* from U+0020 to
    # U+2010, which accepts nearly any character (letters and digits included)
    # as the separator.
    # Accepted: space, hyphen-minus, U+2010 hyphen, U+2011 non-breaking hyphen,
    # U+2012 figure dash, U+2013 en dash, U+2014 em dash, U+2212 minus sign.
    separator = r"[ \-\u2010\u2011\u2012\u2013\u2014\u2212]?"
    # re.escape keeps the ISSN halves literal whatever characters they contain.
    pattern = re.escape(issn[:4]) + separator + re.escape(issn[5:])
    return re.search(pattern, text)

# Count the records for which at least one ISSN from the metadata cannot be
# found in the extracted PDF text.
nrec = 0
for rec in records:
    record_ok = True

    fn = id_to_txtfn(rec['id'])
    # pdftotext writes UTF-8 by default; decode explicitly instead of relying on
    # the locale's encoding so the Unicode dash variants are read correctly.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    # (ground-truth key, label used in the report line)
    for key, label in (('p-issn', 'pissn'), ('e-issn', 'eissn')):
        issn = rec["ground_truth"].get(key)
        if not issn or find_issn_in_text(issn, text):
            continue
        print(f"{rec['rowid']}: {label} {issn} not found in text {rec['url']}")
        # Debugging aid: echo every "ISSN" mention (any letter case) in the text
        for match in re.findall(r"ISSN", text, re.I):
            print(match)
        record_ok = False

    if not record_ok:
        nrec += 1

print(f"{nrec} records with ISSN mismatches")

0 records with ISSN mismatches


In [None]:
# Check for ISBNs in metadata that are not given in the PDF document text

def normalize_text_alnum(text):
    """Return ``text`` with every non-alphanumeric character removed.

    Letters and digits are kept in their original case; whitespace, punctuation
    and underscores are dropped.
    """
    # \W alone leaves "_" in place because "_" counts as a word character,
    # so it is listed explicitly.
    return re.sub(r'[\W_]+', '', text)

def find_isbn_in_text(isbn, text):
    """Tell whether ``isbn``, reduced by ``normalize_text_alnum``, occurs in ``text``.

    ``text`` is searched as given; it is not normalized here.
    """
    needle = normalize_text_alnum(isbn)
    return needle in text


# Count the records for which at least one ISBN from the metadata cannot be
# found in the extracted PDF text.
nrec = 0
for rec in records:
    record_ok = True

    fn = id_to_txtfn(rec['id'])
    # pdftotext writes UTF-8 by default; do not rely on the locale's encoding.
    with open(fn, encoding="utf-8") as infile:
        # Normalize once per record; every ISBN is compared against this text.
        text = normalize_text_alnum(infile.read())

    # The ground-truth key doubles as the label in the report line.
    for key in ('p-isbn', 'e-isbn'):
        for isbn in rec["ground_truth"].get(key) or ():
            if not find_isbn_in_text(isbn, text):
                print(f"{rec['rowid']}: {key} {isbn} not found in text {rec['url']}")
                record_ok = False

    if not record_ok:
        nrec += 1

print(f"{nrec} records with ISBN mismatches")

book77: e-isbn 9789525959468 not found in text https://taju.uniarts.fi/bitstream/handle/10024/7007/OperaontheMove.pdf


In [None]:
# Check for publication years in metadata that are not mentioned in the PDF document text

def find_year_in_text(year, text):
    """Look for ``year`` in ``text`` delimited by word boundaries on both sides.

    Returns the first ``re.Match``, or ``None`` when there is no such occurrence.
    """
    year_token = re.compile(rf"\b{year}\b")
    return year_token.search(text)

# Count the records whose publication year from the metadata is not mentioned
# in the extracted PDF text.
nrec = 0
for rec in records:
    fn = id_to_txtfn(rec['id'])
    # pdftotext writes UTF-8 by default; do not rely on the locale's encoding.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    year = rec["ground_truth"].get('year')
    # Records without a year in the metadata are not checked.
    if year and not find_year_in_text(year, text):
        print(f"{rec['rowid']}: year {year} not found in text {rec['url']}")
        nrec += 1

print(f"{nrec} records with year mismatches")

In [None]:
# Check for publishers in metadata that are not mentioned in the PDF document text

import regex

def normalize_hyphens(text):
    """Return ``text`` with every Unicode dash-punctuation character replaced by ``-``."""
    # \p{Pd} is the Unicode property escape for the "Punctuation, dash" category;
    # it is provided by the `regex` module.
    return regex.sub(r'\p{Pd}', '-', text)

def find_publisher_in_text(publisher, text):
    """Check that every comma-separated part of ``publisher`` occurs in ``text``.

    Both sides are compared after the same normalization: dash variants mapped
    to ``-`` (via ``normalize_hyphens``), lowercased, and whitespace runs
    collapsed to single spaces.

    Returns ``True`` when all parts are found, ``False`` otherwise.
    """
    def _normalize(value):
        return " ".join(normalize_hyphens(value).lower().split())

    haystack = _normalize(text)
    # The parts are normalized as well: the text has had its dashes and
    # whitespace rewritten, so a raw part containing e.g. an en dash or a
    # doubled space could otherwise never match.
    return all(_normalize(part) in haystack for part in publisher.split(','))

# Count the records for which at least one publisher from the metadata is not
# mentioned in the extracted PDF text.
nrec = 0
for rec in records:
    fn = id_to_txtfn(rec['id'])
    # pdftotext writes UTF-8 by default; do not rely on the locale's encoding.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    publishers = rec["ground_truth"].get('publisher') or ()
    missing = [p for p in publishers if not find_publisher_in_text(p, text)]
    for publisher in missing:
        print(f"{rec['rowid']}: publisher '{publisher}' NOT found in text {rec['url']}")

    # A record counts once, however many of its publishers are missing.
    if missing:
        nrec += 1

print(f"{nrec} records with publisher mismatches")

In [None]:
# Check for creators in metadata that are not mentioned in the PDF document text

import regex

def find_creator_in_text(creator, text):
    """Report whether every comma-separated part of ``creator`` appears in ``text``.

    The comparison is case-insensitive, and runs of whitespace in ``text`` are
    collapsed to single spaces before searching.
    """
    haystack = " ".join(text.lower().split())
    name_parts = [piece.strip() for piece in creator.lower().split(',')]
    return all(piece in haystack for piece in name_parts)

# Count the records for which at least one creator from the metadata is not
# mentioned in the extracted PDF text.
nrec = 0
for rec in records:
    fn = id_to_txtfn(rec['id'])
    # pdftotext writes UTF-8 by default; do not rely on the locale's encoding.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    creators = rec["ground_truth"].get('creator') or ()
    missing = [c for c in creators if not find_creator_in_text(c, text)]
    for creator in missing:
        print(f"{rec['rowid']}: creator '{creator}' NOT found in text {rec['url']}")

    # A record counts once, however many of its creators are missing.
    if missing:
        nrec += 1

print(f"{nrec} records with creator mismatches")

In [None]:
# Check for titles in metadata that don't match what's in the PDF document text

import re

def normalize_text(text):
    """Return ``text`` lowercased, with every non-alphanumeric character removed."""
    # "_" is a word character as far as \W is concerned, so it has to be listed
    # explicitly to be stripped along with the other punctuation.
    return re.sub(r'[\W_]+', '', text).lower()

def find_title_in_text(title, text):
    """Check that each ``' : '``-separated part of ``title`` occurs in ``text``.

    Both the text and every title part go through ``normalize_text`` before the
    substring comparison.
    """
    haystack = normalize_text(text)
    return all(normalize_text(segment) in haystack for segment in title.split(' : '))

# Count the records whose title from the metadata cannot be found in the
# extracted PDF text.
nrec = 0
for rec in records:
    fn = id_to_txtfn(rec['id'])
    # pdftotext writes UTF-8 by default; do not rely on the locale's encoding.
    with open(fn, encoding="utf-8") as infile:
        text = infile.read()

    title = rec["ground_truth"].get('title')
    # Records without a title in the metadata are not checked.
    if title and not find_title_in_text(title, text):
        print(f"{rec['rowid']}: title '{title}' NOT found in text {rec['url']}")
        nrec += 1

print(f"{nrec} records with title mismatches")