# Metadata extraction using Meteor

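This notebook attempts to extract metadata from the FinGreyLit documents using [Meteor](https://github.com/NationalLibraryOfNorway/meteor), an open-source metadata extraction tool developed by the National Library of Norway. By default it processes both the train and test sets; switch the `glob` pattern in the first code cell to evaluate on the test set only.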

It assumes that Meteor is installed and running locally on http://127.0.0.1:5000/ (the default).

In [1]:
import glob
import json

# train + test (used for refining code & fixing dataset); sorted so that the
# order of the records does not depend on the file system's listing order
metadata_files = sorted(glob.glob("../../metadata/*.jsonl"))
# test set only (used for final evaluation):
#metadata_files = sorted(glob.glob("../../metadata/*-test.jsonl"))
records = []

for mdfile in metadata_files:
    # The metadata contains non-ASCII characters (e.g. in file URLs), so read
    # it as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(mdfile, encoding="utf-8") as inf:
        for line in inf:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline at end of file)
            records.append(json.loads(line))

print(len(records))

740


In [2]:
%%time

import os
import requests

# JSON endpoint of the locally running Meteor service; each PDF is POSTed here
METEOR_API_URL = 'http://127.0.0.1:5000/json'

def id_to_fn(identifier):
    """Map a URI identifier to the path of the local PDF file used to store the document.

    The 'https://' prefix is dropped and every '/' is replaced with '_' so that
    the identifier becomes a flat file name inside the ../../pdfs/ directory.
    """
    stem = identifier.replace('https://', '')
    stem = stem.replace('/', '_')
    return f"../../pdfs/{stem}.pdf"

def download(file_url, identifier, timeout=60):
    """Download a PDF file, with the given identifier, from the given URL
    (unless this was done already) and return a path to the PDF file.

    Args:
        file_url: URL the PDF is fetched from.
        identifier: URI identifier of the document; determines the local file name.
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole run.

    Returns:
        Path of the local PDF file.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    path = id_to_fn(identifier)
    # A non-empty file from an earlier run is reused as a cache
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    response = requests.get(file_url, timeout=timeout)
    # Fail loudly on HTTP errors: otherwise the body of an error page would be
    # written to disk and then be treated as a cached PDF on every later run.
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)
    print(f"wrote {file_url} as {path}")
    return path

out_records = []

for rec in records:
    path = download(rec['url'], rec['id'])

    # Send the PDF to Meteor as a multipart file upload. The context manager
    # closes the file as soon as the request is done, so that no file handle
    # is left open for each processed record.
    with open(path, 'rb') as pdf_file:
        filedata = {'fileInput': (path, pdf_file, 'application/pdf')}
        response = requests.post(METEOR_API_URL, files=filedata)

    try:
        rec['meteor_output'] = response.json()
    except requests.JSONDecodeError:
        # Keep the record with an empty result so that it is still part of the evaluation
        print(f"JSON decode error for {rec['id']} / {rec['url']}")
        rec['meteor_output'] = {}
    out_records.append(rec)

# write output to JSONL file (one record per line)
with open('test-records-meteor.jsonl', 'w') as outfile:
    for rec in out_records:
        json.dump(rec, outfile)
        outfile.write("\n")

JSON decode error for https://www.doria.fi/handle/10024/181710 / https://www.doria.fi/bitstream/handle/10024/181710/Laura Hollsten ÅBO AKADEMI OCH KUNSKAPEN doria 23.8.2021.pdf
JSON decode error for https://www.doria.fi/handle/10024/181709 / https://www.doria.fi/bitstream/handle/10024/181709/Nils Villstrand ÅBO AKADEMI I SIN BÖRJAN doria 2021.pdf
CPU times: user 2.28 s, sys: 2.88 s, total: 5.16 s
Wall time: 3min 9s


In [36]:
# Analyze the extracted metadata

import json
import pandas as pd
import Levenshtein

# Filled with the records (ground truth + Meteor output) read from the JSONL file
records_meteor = list()

# similarity threshold (Levenshtein ratio) to be considered "almost correct"
ALMOST_THRESHOLD = 0.95

# three-letter language code of the ground truth -> two-letter code to compare against
LANGMAP = dict(fin='fi', swe='sv', eng='en')

# Meteor output field -> key of the corresponding ground-truth value in a record
FIELDS = dict([
    ('year', 'dc.date.issued'),
    ('language', 'dc.language.iso'),
    ('title', 'dc.title'),
    ('publisher', 'dc.publisher'),
    ('authors', 'dc.contributor.author'),
    ('isbn', 'dc.identifier.isbn'),
    ('issn', 'dc.relation.eissn'),
])

# Read the records back from the JSONL file: one JSON document per line
with open('test-records-meteor.jsonl') as infile:
    records_meteor.extend(json.loads(line) for line in infile)

def compare_authors(rec):
    """Score the authors predicted by Meteor against the ground-truth authors.

    The true authors are read from rec['dc.contributor.author']; the predicted
    ones are built as "lastname, firstname" strings from
    rec['meteor_output']['authors']. If the prediction is missing or malformed,
    it is treated as an empty set.

    Returns a (match_type, score) tuple, where score is 1 for 'not-relevant',
    'exact' and 'superset', and 0 for 'found-nonexistent', 'not-found',
    'subset', 'overlap' and 'wrong'.
    """
    expected = set(rec.get('dc.contributor.author', []))
    try:
        found = {
            f"{person['lastname']}, {person['firstname']}"
            for person in rec['meteor_output']['authors']
        }
    except (KeyError, TypeError):
        found = set()

    # Nothing to find: correct only if nothing was predicted either
    if not expected:
        return ('found-nonexistent', 0) if found else ('not-relevant', 1)
    if not found:
        return ('not-found', 0)
    if expected == found:
        return ('exact', 1)
    if expected <= found:
        # every true author was found (plus some extra ones)
        return ('superset', 1)
    if expected >= found:
        # only some of the true authors were found
        return ('subset', 0)
    if expected & found:
        return ('overlap', 0)
    return ('wrong', 0)

def compare(rec, dc_key, meteor_key):
    """Compare Meteor's prediction for one field against the ground truth.

    Args:
        rec: record dict holding the ground-truth value under `dc_key` and
            the Meteor response under 'meteor_output'.
        dc_key: key of the ground-truth value in `rec`.
        meteor_key: name of the field in the Meteor output; the prediction is
            read from rec['meteor_output'][meteor_key]['value'].

    Returns:
        A (match_type, score) tuple. The score is 1 for 'not-relevant',
        'exact', 'superset', 'case', 'superset-case', 'almost' and
        'almost-case', and 0 for 'not-found', 'found-nonexistent' and 'wrong'.
        Authors are delegated to compare_authors().
    """
    # special case for "authors" field which may contain multiple values
    if dc_key == 'dc.contributor.author':
        return compare_authors(rec)

    true_val = rec.get(dc_key)

    # multi-valued fields (e.g. ISBN, publisher): compare only against the
    # first (usually only) value; an empty list means there is no true value
    if isinstance(true_val, list):
        true_val = true_val[0] if true_val else None

    # field-specific adjustments
    if dc_key == 'dc.language.iso':
        # convert to ISO 639-1 2-letter language code; a code missing from
        # LANGMAP is kept as-is so that it is scored instead of raising KeyError
        true_val = LANGMAP.get(true_val, true_val)
    elif dc_key == 'dc.date.issued' and true_val is not None:
        true_val = true_val[:4]  # compare only the year

    try:
        raw_prediction = rec['meteor_output'][meteor_key]['value']
    except (KeyError, TypeError):
        raw_prediction = None
    # an explicit null value counts as "nothing predicted", not as the string "None"
    predicted_val = None if raw_prediction is None else str(raw_prediction)

    if predicted_val is None and true_val is None:
        return ('not-relevant', 1)
    elif predicted_val == true_val:
        return ('exact', 1)
    elif predicted_val is None:
        return ('not-found', 0)
    elif true_val is None:
        return ('found-nonexistent', 0)
    elif true_val in predicted_val:
        return ('superset', 1)
    elif true_val.lower() == predicted_val.lower():
        return ('case', 1)
    elif true_val.lower() in predicted_val.lower():
        return ('superset-case', 1)
    elif Levenshtein.ratio(true_val, predicted_val) >= ALMOST_THRESHOLD:
        return ('almost', 1)
    elif Levenshtein.ratio(true_val.lower(), predicted_val.lower()) >= ALMOST_THRESHOLD:
        return ('almost-case', 1)
    else:
        return ('wrong', 0)

# One row per (record, field) pair: the document language together with the
# outcome of comparing Meteor's prediction against the ground truth
results = [
    {
        'language': rec['dc.language.iso'],
        'field': meteor_field,
        'match_type': match_type,
        'score': score
    }
    for rec in records_meteor
    for meteor_field, dc_field in FIELDS.items()
    for match_type, score in [compare(rec, dc_field, meteor_field)]
]

df = pd.DataFrame(results)
df

Unnamed: 0,language,field,match_type,score
0,eng,year,exact,1
1,eng,language,exact,1
2,eng,title,superset,1
3,eng,publisher,not-relevant,1
4,eng,authors,exact,1
...,...,...,...,...
5175,eng,title,wrong,0
5176,eng,publisher,not-found,0
5177,eng,authors,not-found,0
5178,eng,isbn,not-found,0


In [37]:
# Mean score per document language and metadata field
grouped_by_language_and_field = df.groupby(['language', 'field'])
grouped_by_language_and_field['score'].mean()

language  field    
eng       authors      0.597826
          isbn         0.782609
          issn         0.644928
          language     0.956522
          publisher    0.039855
          title        0.608696
          year         0.797101
fin       authors      0.561056
          isbn         0.686469
          issn         0.712871
          language     0.937294
          publisher    0.112211
          title        0.498350
          year         0.788779
swe       authors      0.664596
          isbn         0.869565
          issn         0.807453
          language     0.888199
          publisher    0.105590
          title        0.322981
          year         0.689441
Name: score, dtype: float64

In [39]:
# Number of occurrences of each match type within every (language, field, score) group
match_types_by_group = df.groupby(['language', 'field', 'score'])['match_type']
value_counts_df = match_types_by_group.value_counts()
print(value_counts_df.to_string())

language  field      score  match_type       
eng       authors    0      not-found             65
                            wrong                 36
                            overlap                5
                            found-nonexistent      4
                            subset                 1
                     1      exact                108
                            superset              45
                            not-relevant          12
          isbn       0      wrong                 28
                            not-found             20
                            found-nonexistent     12
                     1      not-relevant         119
                            exact                 97
          issn       0      wrong                 40
                            not-found             35
                            found-nonexistent     23
                     1      not-relevant         148
                            exact                 30
