# Metadata extraction using Meteor

This notebook attempts to extract metadata from the FinGreyLit test set documents using [Meteor](https://github.com/NationalLibraryOfNorway/meteor), an open source metadata extraction tool developed by the National Library of Norway.

It assumes that Meteor is installed and running locally on http://127.0.0.1:5000/ (the default).

In [1]:
import glob
import json

# metadata_files = glob.glob("../../metadata/*.jsonl")  # train + test (used for refining code & fixing dataset)
# glob returns matches in arbitrary order; sorting makes the order of `records`
# (and of everything derived from it) reproducible across runs and machines.
metadata_files = sorted(glob.glob("../../metadata/*-test.jsonl"))  # test set only (used for final evaluation)
records = []

for mdfile in metadata_files:
    # JSON Lines input: one JSON document per line. The encoding is given
    # explicitly so that decoding does not depend on the platform's locale.
    with open(mdfile, encoding="utf-8") as inf:
        for line in inf:
            if not line.strip():
                continue  # tolerate blank lines, e.g. an extra newline at end of file
            rec = json.loads(line)
            records.append(rec)

# number of records loaded
print(len(records))

179


In [2]:
%%time

import os
import requests

# URL that each PDF is POSTed to; the response body is parsed as JSON.
METEOR_API_URL = 'http://127.0.0.1:5000/json'

# Pairs of (field name in the Meteor output, key used for that field in the
# prediction). Only fields listed here are carried over to the output records.
FIELDS_MAP = dict(
    [
        ("year", "year"),
        ("language", "language"),
        ("title", "title"),
        ("publisher", "publisher"),
        ("authors", "creator"),
        ("isbn", "e-isbn"),
        ("issn", "e-issn"),
    ]
)


def id_to_fn(identifier):
    """Build the local PDF path that corresponds to a URI identifier.

    Every occurrence of 'https://' is removed and every '/' is turned into
    '_', giving a flat file name that is placed in '../../pdfs/' with a
    '.pdf' suffix.
    """
    without_scheme = identifier.replace('https://', '')
    flat_name = '_'.join(without_scheme.split('/'))
    return f"../../pdfs/{flat_name}.pdf"

def download(file_url, identifier):
    """Download a PDF file unless a non-empty local copy already exists.

    Args:
        file_url: URL to fetch the PDF from.
        identifier: URI identifier of the document; id_to_fn() turns it into
            the local file path.

    Returns:
        Path to the local PDF file.

    Raises:
        requests.HTTPError: if the server responds with an error status.
            Nothing is written in that case, so an error page can never end
            up cached on disk as if it were the PDF.
        requests.RequestException: on connection failures or timeouts.
    """
    path = id_to_fn(identifier)
    # An empty file is treated as a failed earlier attempt and fetched again.
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    # Without a timeout a stalled server would hang the notebook indefinitely.
    # The value bounds connecting and each wait for data, not the whole download.
    response = requests.get(file_url, timeout=60)
    # Check the status BEFORE writing: otherwise the body of a 404/500 response
    # would be saved, and the size check above would accept it on later runs.
    response.raise_for_status()

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)  # first run: the PDF directory may not exist yet
    with open(path, "wb") as f:
        f.write(response.content)
    print(f"wrote {file_url} as {path}")
    return path

def map_prediction(key, value):
    """Convert data from Meteor schema to FinGreyLit schema.

    Args:
        key: field name as used in the Meteor output (a key of FIELDS_MAP).
        value: what Meteor returned for that field. For "authors" this is
            expected to be a list of dicts with "lastname" and "firstname";
            for other fields a dict with a "value" entry. May be None or
            empty when nothing was extracted.

    Returns:
        - "authors": list of "lastname, firstname" strings (empty list when
          there are no authors)
        - "isbn" and "publisher": one-element list holding the extracted value
        - any other field: the extracted value converted to str
        - None when the field has no usable "value" entry
    """
    if key == "authors":
        # creator is a list. `value or []` makes a null author list yield an
        # empty result instead of raising TypeError, in line with the other
        # fields, which degrade gracefully when the value is missing.
        return [
            f"{author['lastname']}, {author['firstname']}"
            for author in value or []
        ]
    elif key == "isbn" and value:
        return [value["value"]]  # e-isbn is a list
    elif key == "publisher" and value:
        return [value["value"]]  # publisher is a list
    try:
        return str(value["value"])
    except (KeyError, TypeError):
        # value is None / not subscriptable, or has no "value" entry
        return None


out_records = []

for rec in records:
    path = download(rec['url'], rec['id'])
    # Shallow copy: the prediction is attached to the copy, so the dicts in
    # `records` stay exactly as they were loaded from the metadata files.
    out_rec = dict(rec)

    # Send the PDF to Meteor as a multipart file upload. The `with` block
    # closes the file handle once the request is done; opening the file inline
    # would leave one handle open per record.
    with open(path, 'rb') as pdf_file:
        filedata = {'fileInput': (path, pdf_file, 'application/pdf')}
        response = requests.post(METEOR_API_URL, files=filedata)

    try:
        meteor_output = response.json()
    except requests.JSONDecodeError:
        # The response body was not JSON: keep the record, with an empty prediction.
        print(f"JSON decode error for {rec['id']} / {rec['url']}")
        out_rec['prediction'] = {}
    else:
        # Keep only the fields listed in FIELDS_MAP, renamed and converted
        # for the output record.
        mapped_prediction = {
            FIELDS_MAP[key]: map_prediction(key, value)
            for key, value in sorted(meteor_output.items())
            if key in FIELDS_MAP
            }
        out_rec['prediction'] = mapped_prediction
    out_records.append(out_rec)

# Save the results in JSON Lines format: one serialized record per line
with open('test-records.jsonl', 'w') as outfile:
    for rec in out_records:
        outfile.write(json.dumps(rec) + "\n")

CPU times: user 492 ms, sys: 658 ms, total: 1.15 s
Wall time: 35.4 s
