# Metadata extraction using Meteor

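This notebook attempts to extract metadata from the FinGreyLit documents using [Meteor](https://github.com/NationalLibraryOfNorway/meteor), an open source metadata extraction tool developed by the National Library of Norway. By default it processes both the train and test sets (useful while refining the code and fixing the dataset); for the final evaluation, switch the `glob` pattern in the first code cell so that only the test set is used.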

It assumes that Meteor is installed and running locally on http://127.0.0.1:5000/ (the default).

In [1]:
import glob
import json

# Select the FinGreyLit metadata files to process. Swap the two assignments
# below to switch between development (all files) and final evaluation (test only).
metadata_files = glob.glob("../../metadata/*.jsonl")  # train + test (used for refining code & fixing dataset)
#metadata_files = glob.glob("../../metadata/*-test.jsonl")  # test set only (used for final evaluation)
records = []

for mdfile in metadata_files:
    # Read as UTF-8 explicitly: the records contain non-ASCII characters and the
    # platform default encoding is not UTF-8 everywhere.
    with open(mdfile, encoding="utf-8") as inf:
        for line in inf:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline at end of file)
            records.append(json.loads(line))

# Number of records loaded across all selected files
print(len(records))

724


In [2]:
%%time

import os
import requests

# Endpoint of the locally running Meteor service; PDFs are POSTed here and the
# reply is parsed as JSON.
METEOR_API_URL = "http://127.0.0.1:5000/json"

# Two-letter language codes -> three-letter codes stored in dc.language.iso.
LANG_MAP = dict(fi="fin", sv="swe", en="eng")

# Meteor output key -> FinGreyLit field name. Meteor keys not listed here are
# ignored when predictions are mapped.
FIELDS_MAP = dict(
    [
        ("year", "dc.date.issued"),
        ("language", "dc.language.iso"),
        ("title", "dc.title"),
        ("publisher", "dc.publisher"),
        ("authors", "dc.contributor.author"),
        ("isbn", "dc.identifier.isbn"),
        ("issn", "dc.relation.eissn"),
    ]
)


def id_to_fn(identifier):
    """Derive the local PDF path for a document from its URI identifier.

    Every 'https://' occurrence is removed and every slash becomes an
    underscore; the result is placed under '../../pdfs/' with a '.pdf' suffix.
    """
    without_scheme = identifier.replace('https://', '')
    flat_name = '_'.join(without_scheme.split('/'))
    return f"../../pdfs/{flat_name}.pdf"

def download(file_url, identifier):
    """Fetch the PDF at `file_url` into the local PDF directory and return its path.

    The target path is derived from `identifier` via id_to_fn(). When a
    non-empty file already exists at that path, nothing is downloaded and the
    existing path is returned.
    """
    path = id_to_fn(identifier)

    # A zero-byte file does not count as cached and is fetched again.
    already_cached = os.path.exists(path) and os.path.getsize(path) > 0
    if not already_cached:
        # NOTE(review): the HTTP status is not checked, so an error page would be
        # stored as the PDF and treated as cached afterwards -- confirm this is acceptable.
        response = requests.get(file_url)
        with open(path, "wb") as pdf_file:
            pdf_file.write(response.content)
            print(f"wrote {file_url} as {path}")

    return path

def map_record(rec):
    """Split a metadata record into bookkeeping keys and ground-truth values.

    Keys that do not start with 'dc.' are copied to the top level of the
    result in their original order. All 'dc.*' keys are gathered, sorted by
    key, into a dict stored under 'ground_truth'.
    """
    out = {}
    for key in rec:
        if not key.startswith("dc."):
            out[key] = rec[key]

    ground_truth = {}
    for key in sorted(name for name in rec if name.startswith("dc.")):
        ground_truth[key] = rec[key]

    out["ground_truth"] = ground_truth
    return out

def map_prediction(key, value):
    """Convert data from Meteor schema to FinGreyLit schema.

    Args:
        key: Meteor field name, e.g. "authors", "isbn", "language" or "year".
        value: Meteor's output for that field. For "authors" this is a list of
            dicts with 'lastname' and 'firstname' keys; for other fields it is
            a dict carrying the prediction under 'value'. May be None or empty
            when Meteor found nothing.

    Returns:
        - "authors": a list of "Lastname, Firstname" strings (empty if none).
        - "isbn": a one-element list holding the predicted ISBN.
        - "language": the three-letter code from LANG_MAP.
        - any other key: the prediction converted to str.
        None is returned whenever there is no usable prediction, including a
        language code that LANG_MAP does not cover.

    The `value` argument is never modified.
    """
    if key == "authors":
        # dc.contributor.author is a list; `or []` tolerates a missing author list
        return [
            f"{author['lastname']}, {author['firstname']}"
            for author in value or []
        ]

    try:
        predicted = value["value"]
    except (KeyError, TypeError):
        # value is None, empty, or not shaped like {"value": ...}
        return None

    if key == "language":
        # Convert to the three-letter language code; look it up without writing
        # back into the caller's dict. Unknown codes map to None.
        predicted = LANG_MAP.get(predicted)

    if predicted is None:
        # Return a real None rather than the string "None"
        return None

    if key == "isbn":
        return [predicted]  # dc.identifier.isbn is a list

    return str(predicted)


# One output record per input record: the non-'dc.' keys, the ground truth,
# and Meteor's prediction mapped to FinGreyLit field names.
out_records = []

for rec in records:
    path = download(rec['url'], rec['id'])
    out_rec = map_record(rec)

    # Send the PDF to Meteor as a multipart upload. The context manager closes
    # the file handle after each request instead of leaking one descriptor per
    # document.
    with open(path, 'rb') as pdf_file:
        filedata = {'fileInput': (path, pdf_file, 'application/pdf')}
        response = requests.post(METEOR_API_URL, files=filedata)

    try:
        meteor_output = response.json()
    except requests.JSONDecodeError:
        # Meteor did not answer with JSON for this document; keep the record
        # with an empty prediction so that it is still part of the output.
        print(f"JSON decode error for {rec['id']} / {rec['url']}")
        out_rec['prediction'] = {}
    else:
        # Keep only the Meteor fields that have a FinGreyLit counterpart
        mapped_prediction = {
            FIELDS_MAP[key]: map_prediction(key, value)
            for key, value in sorted(meteor_output.items())
            if key in FIELDS_MAP
            }
        out_rec['prediction'] = mapped_prediction
    out_records.append(out_rec)

# Serialise the output records as JSONL: one JSON document per line
with open('test-records-meteor.jsonl', 'w') as outfile:
    for rec in out_records:
        outfile.write(json.dumps(rec) + "\n")

JSON decode error for https://www.doria.fi/handle/10024/181710 / https://www.doria.fi/bitstream/handle/10024/181710/Laura Hollsten ÅBO AKADEMI OCH KUNSKAPEN doria 23.8.2021.pdf
JSON decode error for https://www.doria.fi/handle/10024/181709 / https://www.doria.fi/bitstream/handle/10024/181709/Nils Villstrand ÅBO AKADEMI I SIN BÖRJAN doria 2021.pdf
CPU times: user 3.41 s, sys: 6.26 s, total: 9.67 s
Wall time: 3min 56s


In [3]:
# Analyze the extracted metadata: score the predictions written to the JSONL
# file against the ground truth stored in the same records.

import pandas as pd
import sys
# The eval module is looked up in the parent directory, so that directory must
# be on sys.path before the import below.
sys.path.append('..')
from eval import MetadataEvaluator

# JSONL file of output records (ground truth + prediction)
prediction_records_file = 'test-records-meteor.jsonl'

evaluator = MetadataEvaluator(prediction_records_file)
results = evaluator.evaluate_records()

# One row per evaluated (record, field) pair; displayed as the cell output
df = pd.DataFrame(results)
df

Unnamed: 0,rowid,language,field,predicted_val,true_val,match_type,score
0,serial3,fin,dc.contributor.author,[],"[Antila, Heli, Lähteenmäki, Eija]",not-found,0
1,serial3,fin,dc.identifier.isbn,,,not-relevant,1
2,serial3,fin,dc.relation.eissn,,,not-relevant,1
3,serial3,fin,dc.language.iso,fin,fin,exact,1
4,serial3,fin,dc.publisher,,[Tampereen ammattikorkeakoulu],not-found,0
...,...,...,...,...,...,...,...
5049,mono294,eng,dc.relation.eissn,,,not-relevant,1
5050,mono294,eng,dc.language.iso,eng,eng,exact,1
5051,mono294,eng,dc.publisher,,[Academy of Fine Arts of the University of the...,not-found,0
5052,mono294,eng,dc.title,,Constellations,not-found,0


In [4]:
# Average score for every (language, field) combination
scores_by_language_field = df.groupby(['language', 'field'])['score']
scores_by_language_field.mean()

language  field                
eng       dc.contributor.author    0.608856
          dc.date.issued           0.804428
          dc.identifier.isbn       0.797048
          dc.language.iso          0.981550
          dc.publisher             0.014760
          dc.relation.eissn        0.756458
          dc.title                 0.623616
fin       dc.contributor.author    0.578231
          dc.date.issued           0.812925
          dc.identifier.isbn       0.727891
          dc.language.iso          0.948980
          dc.publisher             0.034014
          dc.relation.eissn        0.768707
          dc.title                 0.506803
swe       dc.contributor.author    0.668790
          dc.date.issued           0.694268
          dc.identifier.isbn       0.904459
          dc.language.iso          0.904459
          dc.publisher             0.101911
          dc.relation.eissn        0.923567
          dc.title                 0.324841
Name: score, dtype: float64

In [5]:
# Count how often each match type occurs per language, field and score,
# and print the complete table as plain text.
grouping_columns = ['language', 'field', 'score']
value_counts_df = df.groupby(grouping_columns)['match_type'].value_counts()
print(value_counts_df.to_string())

language  field                  score  match_type       
eng       dc.contributor.author  0      not-found             61
                                        wrong                 35
                                        overlap                5
                                        found-nonexistent      4
                                        subset                 1
                                 1      exact                105
                                        superset              48
                                        not-relevant          12
          dc.date.issued         0      not-found             36
                                        wrong                 17
                                 1      exact                218
          dc.identifier.isbn     0      related-isbn          22
                                        not-found             17
                                        found-nonexistent     10
                                

In [6]:
# Show every matching row instead of pandas' truncated default view
pd.set_option('display.max_rows', None)
# The 'field' column holds FinGreyLit field names such as 'dc.relation.eissn',
# not Meteor keys, so comparing against 'issn' could never match any row.
# NOTE(review): 'printed-issn' is presumably a match type emitted by the
# evaluator for ISSN predictions -- confirm against eval.py.
df[(df['match_type'] == 'printed-issn') & (df['field'] == 'dc.relation.eissn')]

Unnamed: 0,rowid,language,field,predicted_val,true_val,match_type,score
