# Metadata extraction using Meteor

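This notebook attempts to extract metadata from the FinGreyLit documents using [Meteor](https://github.com/NationalLibraryOfNorway/meteor), an open source metadata extraction tool developed by the National Library of Norway. By default the first cell loads both the train and test sets (useful while refining the code and fixing the dataset); switch to the commented-out `*-test.jsonl` glob to evaluate on the test set only.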

It assumes that Meteor is installed and running locally on http://127.0.0.1:5000/ (the default).

In [1]:
import glob
import json

# Ground-truth metadata files, one JSON record per line.
# glob returns matches in arbitrary order, so sort them to keep the record
# order (and therefore the output file and row numbering) stable across runs.
metadata_files = sorted(glob.glob("../../metadata/*.jsonl"))  # train + test (used for refining code & fixing dataset)
#metadata_files = sorted(glob.glob("../../metadata/*-test.jsonl"))  # test set only (used for final evaluation)
records = []

for mdfile in metadata_files:
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding; the records contain non-ASCII text.
    with open(mdfile, encoding="utf-8") as inf:
        for line in inf:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline at end of file)
            records.append(json.loads(line))

print(len(records))

724


In [2]:
%%time

import os
import requests

# URL of the locally running Meteor service that each PDF is POSTed to below;
# the response body is parsed as JSON.
METEOR_API_URL = 'http://127.0.0.1:5000/json'

def id_to_fn(identifier):
    """Map a URI identifier to the path of its local PDF file.

    The 'https://' prefix is dropped and every '/' is replaced by '_', so the
    result is a flat file name inside the '../../pdfs/' directory.
    """
    stem = identifier.replace('https://', '')
    stem = stem.replace('/', '_')
    return f"../../pdfs/{stem}.pdf"

def download(file_url, identifier):
    """Download a PDF file, with the given identifier, from the given URL
    (unless this was done already) and return a path to the PDF file.

    Args:
        file_url: URL to fetch the PDF from.
        identifier: URI identifier of the document; determines the local file name.

    Returns:
        Path of the local PDF file.

    Raises:
        requests.HTTPError: if the server answers with an error status. Nothing
            is written in that case, so an error page is never cached as a PDF.
        requests.RequestException: on connection problems or timeouts.
    """
    path = id_to_fn(identifier)
    # A non-empty file at the target path is treated as a finished download.
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    # Without a timeout an unresponsive server would stall the notebook indefinitely.
    response = requests.get(file_url, timeout=60)
    response.raise_for_status()

    # Write to a temporary name first and rename afterwards, so an interrupted
    # write cannot leave a truncated file that passes the cache check above.
    tmp_path = path + ".part"
    with open(tmp_path, "wb") as f:
        f.write(response.content)
    os.replace(tmp_path, path)
    print(f"wrote {file_url} as {path}")
    return path

out_records = []

for rec in records:
    path = download(rec['url'], rec['id'])

    # Open the PDF in a context manager so the handle is closed as soon as the
    # upload is done, instead of leaking one open file per record.
    with open(path, 'rb') as pdf_file:
        # Create a dictionary containing the file to be sent
        filedata = {'fileInput': (path, pdf_file, 'application/pdf')}

        # Send the POST request with the file
        response = requests.post(METEOR_API_URL, files=filedata)

    try:
        rec['meteor_output'] = response.json()
    except requests.JSONDecodeError:
        # Meteor did not answer with JSON; record an empty result so that the
        # document still appears in the output.
        print(f"JSON decode error for {rec['id']} / {rec['url']}")
        rec['meteor_output'] = {}
    out_records.append(rec)

# write output to JSONL file
with open('test-records-meteor.jsonl', 'w') as outfile:
    for rec in out_records:
        json.dump(rec, outfile)
        outfile.write("\n")

JSON decode error for https://www.doria.fi/handle/10024/181710 / https://www.doria.fi/bitstream/handle/10024/181710/Laura Hollsten ÅBO AKADEMI OCH KUNSKAPEN doria 23.8.2021.pdf
JSON decode error for https://www.doria.fi/handle/10024/181709 / https://www.doria.fi/bitstream/handle/10024/181709/Nils Villstrand ÅBO AKADEMI I SIN BÖRJAN doria 2021.pdf
CPU times: user 2.08 s, sys: 2.54 s, total: 4.62 s
Wall time: 2min 44s


In [9]:
# Analyze the extracted metadata

import json
import pandas as pd
import Levenshtein

# Similarity threshold to be considered "almost correct"
ALMOST_THRESHOLD = 0.95

# Three-letter language codes used in the ground truth -> two-letter codes
LANGMAP = {'fin': 'fi', 'swe': 'sv', 'eng': 'en'}

# Meteor output field -> ground-truth metadata key it is compared against
FIELDS = {
    'year': 'dc.date.issued',
    'language': 'dc.language.iso',
    'title': 'dc.title',
    'publisher': 'dc.publisher',
    'authors': 'dc.contributor.author',
    'isbn': 'dc.identifier.isbn',
    'issn': 'dc.relation.eissn',
}

# Records written by the extraction cell, one JSON document per line
records_meteor = []
with open('test-records-meteor.jsonl') as infile:
    records_meteor.extend(map(json.loads, infile))

def join_authors(authors):
    """Format author dicts as a list of "lastname, firstname" strings.

    Each item of `authors` must provide the keys 'lastname' and 'firstname'.
    """
    names = []
    for person in authors:
        last, first = person['lastname'], person['firstname']
        names.append(f"{last}, {first}")
    return names

def compare_authors(rec):
    """Compare the set of authors found by Meteor with the ground-truth authors.

    Returns a (match_type, score) tuple. A missing or malformed author list in
    the Meteor output counts as "no authors predicted".
    """
    expected = set(rec.get('dc.contributor.author', []))
    try:
        extracted = set(join_authors(rec['meteor_output']['authors']))
    except (KeyError, TypeError):
        extracted = set()

    # No ground truth: correct only if nothing was predicted either
    if not expected:
        return ('found-nonexistent', 0) if extracted else ('not-relevant', 1)
    if not extracted:
        return ('not-found', 0)
    if expected == extracted:
        return ('exact', 1)
    # All true authors were found, plus some extra ones
    if expected <= extracted:
        return ('superset', 1)
    # Only some of the true authors were found, without any extra ones
    if expected >= extracted:
        return ('subset', 0)
    if expected & extracted:
        return ('overlap', 0)
    return ('wrong', 0)

def meteor_prediction(rec, meteor_key):
    """Return Meteor's prediction for one field of a record.

    Args:
        rec: record dict; the Meteor response is expected under 'meteor_output'.
        meteor_key: name of the field in the Meteor output (e.g. 'title').

    Returns:
        For 'authors', a list of "lastname, firstname" strings; for any other
        field, the predicted value converted to a string. None if the record
        has no usable prediction for the field.
    """
    # Every lookup happens inside the try block so that a missing, null or
    # malformed Meteor output yields None for all fields, authors included.
    # This matches compare_authors(), which treats such output as "no authors".
    try:
        output = rec['meteor_output']
        if meteor_key == 'authors' and meteor_key in output:
            return join_authors(output[meteor_key])
        return str(output[meteor_key]['value'])
    except (KeyError, TypeError):
        return None

def compare(rec, dc_key, meteor_key):
    """Compare Meteor's prediction for one field with the ground-truth value.

    Args:
        rec: record dict holding the ground-truth metadata and 'meteor_output'.
        dc_key: ground-truth key to compare against (e.g. 'dc.title').
        meteor_key: corresponding field name in the Meteor output (e.g. 'title').

    Returns:
        A (match_type, score) tuple, where score is 1 if the prediction counts
        as a success and 0 otherwise.
    """
    # special case for "authors" field which may contain multiple values
    if dc_key == 'dc.contributor.author':
        return compare_authors(rec)

    true_val = rec.get(dc_key)

    # field-specific adjustments
    if dc_key == 'dc.language.iso':
        true_val = LANGMAP[true_val]  # convert to ISO 639-1 2-letter language code
    elif dc_key == 'dc.date.issued' and true_val is not None:
        true_val = true_val[:4]  # compare only the year
    elif dc_key == 'dc.identifier.isbn':
        # compare only against first (usually only) ISBN, with dashes stripped;
        # an empty value is treated as "no ISBN" so it cannot reach the string
        # comparisons below
        true_val = true_val[0].replace('-', '') if true_val else None
    elif dc_key == 'dc.publisher':
        # compare only against first (usually only) publisher; an empty value
        # is treated as "no publisher"
        true_val = true_val[0] if true_val else None

    predicted_val = meteor_prediction(rec, meteor_key)

    if predicted_val is None and true_val is None:
        return ('not-relevant', 1)
    elif predicted_val == true_val:
        return ('exact', 1)
    elif predicted_val is None:
        return ('not-found', 0)
    elif dc_key == 'dc.relation.eissn' and predicted_val == rec.get('dc.relation.pissn'):
        if true_val is None:
            return ('printed-issn', 1)  # this is the only ISSN available, so counts as a success
        else:
            return ('printed-issn', 0)  # Meteor chose the wrong (printed) ISSN even though an e-ISSN was available
    elif (dc_key == 'dc.identifier.isbn'
          # "or ['']" also covers a null or empty related-ISBN list
          and predicted_val == (rec.get('dc.relation.isbn') or [''])[0].replace('-', '')):
        return ('related-isbn', 0)
    elif true_val is None:
        return ('found-nonexistent', 0)
    elif true_val in predicted_val:
        return ('superset', 1)
    elif true_val.lower() == predicted_val.lower():
        return ('case', 1)
    elif true_val.lower() in predicted_val.lower():
        return ('superset-case', 1)
    elif Levenshtein.ratio(true_val, predicted_val) >= ALMOST_THRESHOLD:
        return ('almost', 1)
    elif Levenshtein.ratio(true_val.lower(), predicted_val.lower()) >= ALMOST_THRESHOLD:
        return ('almost-case', 1)
    else:
        return ('wrong', 0)

# One row per (record, field) pair with the prediction, the ground truth and the verdict
results = []

for rec in records_meteor:
    for meteor_field, dc_field in FIELDS.items():
        match_type, score = compare(rec, dc_field, meteor_field)
        row = dict(
            rowid=rec['rowid'],
            language=rec['dc.language.iso'],
            field=meteor_field,
            prediction=meteor_prediction(rec, meteor_field),
            true_val=rec.get(dc_field),
            match_type=match_type,
            score=score,
        )
        results.append(row)

df = pd.DataFrame(results)
df

Unnamed: 0,rowid,language,field,prediction,true_val,match_type,score
0,serial4,eng,year,2019,2019-02-15,exact,1
1,serial4,eng,language,en,eng,exact,1
2,serial4,eng,title,Bank of Finland Research Discussion Papers 4 •...,Model-based regulation and firms' access to fi...,superset,1
3,serial4,eng,publisher,,,not-relevant,1
4,serial4,eng,authors,"[Tuuli, Saara]","[Tuuli, Saara]",exact,1
5,serial4,eng,isbn,9789523232617,[978-952-323-261-7],exact,1
6,serial4,eng,issn,1456-6184,1456-6184,exact,1
7,serial6,eng,year,,2019-04-30,not-found,0
8,serial6,eng,language,en,eng,exact,1
9,serial6,eng,title,PROFESSIONALISM IN ESPORT: BENEFITS IN SKILLS ...,Professionalism in Esport: Benefits in Skills ...,superset-case,1


In [10]:
# Mean score (share of successful predictions) per document language and field
(
    df
    .groupby(by=['language', 'field'])['score']
    .mean()
)

language  field    
eng       authors      0.608856
          isbn         0.797048
          issn         0.837638
          language     0.970480
          publisher    0.036900
          title        0.623616
          year         0.804428
fin       authors      0.578231
          isbn         0.727891
          issn         0.816327
          language     0.952381
          publisher    0.115646
          title        0.506803
          year         0.816327
swe       authors      0.666667
          isbn         0.899371
          issn         0.937107
          language     0.886792
          publisher    0.106918
          title        0.320755
          year         0.698113
Name: score, dtype: float64

In [11]:
# Number of occurrences of each match type, broken down by language, field and score
value_counts_df = (
    df
    .groupby(['language', 'field', 'score'])['match_type']
    .value_counts()
)
print(value_counts_df.to_string())

language  field      score  match_type       
eng       authors    0      not-found             61
                            wrong                 34
                            overlap                5
                            found-nonexistent      4
                            subset                 2
                     1      exact                106
                            superset              47
                            not-relevant          12
          isbn       0      related-isbn          22
                            not-found             17
                            found-nonexistent     10
                            wrong                  6
                     1      not-relevant         118
                            exact                 98
          issn       0      printed-issn          39
                            not-found              5
                     1      not-relevant         174
                            exact                 31


In [12]:
# Show every row instead of a truncated view
pd.set_option('display.max_rows', None)

# Rows where the language prediction was wrong
is_wrong = df['match_type'] == 'wrong'
is_language = df['field'] == 'language'
df[is_wrong & is_language]

Unnamed: 0,rowid,language,field,prediction,true_val,match_type,score
407,serial22,fin,language,en,fin,wrong,0
449,serial79,fin,language,en,fin,wrong,0
771,thes151,swe,language,en,swe,wrong,0
1198,mono254,fin,language,en,fin,wrong,0
1870,serial2343,fin,language,en,fin,wrong,0
1968,docthes16,eng,language,fi,eng,wrong,0
1989,docthes19,eng,language,sv,eng,wrong,0
2486,docthes605,eng,language,sv,eng,wrong,0
2626,thes34,fin,language,en,fin,wrong,0
2808,thes77,fin,language,en,fin,wrong,0
