# Metadata extraction using Meteor

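This notebook attempts to extract metadata from the FinGreyLit test set documents using [Meteor](https://github.com/NationalLibraryOfNorway/meteor), an open source metadata extraction tool developed by the National Library of Norway. Note that the first code cell can load either the test set alone (for final evaluation) or the train and test sets together (for refining the code and fixing the dataset).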

It assumes that Meteor is installed and running locally on http://127.0.0.1:5000/ (the default).

In [1]:
import glob
import json

# Sorted so that the record order (and therefore the output file) does not
# depend on the order in which the OS happens to list the directory.
metadata_files = sorted(glob.glob("../../metadata/*.jsonl"))  # train + test (used for refining code & fixing dataset)
#metadata_files = sorted(glob.glob("../../metadata/*-test.jsonl"))  # test set only (used for final evaluation)
records = []

for mdfile in metadata_files:
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # encoding, which may not be UTF-8.
    with open(mdfile, encoding="utf-8") as inf:
        for line in inf:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            records.append(json.loads(line))

print(len(records))

724


In [2]:
%%time

import os
import requests

# URL that each PDF is POSTed to further down in this cell; the response body is parsed as JSON.
METEOR_API_URL = 'http://127.0.0.1:5000/json'

def id_to_fn(identifier):
    """Map a URI identifier to the relative path of its local PDF copy.

    Every occurrence of 'https://' is removed and each '/' is turned into '_',
    giving a flat file name that is placed under ../../pdfs/ with a .pdf suffix.
    """
    without_scheme = identifier.replace('https://', '')
    flat_name = '_'.join(without_scheme.split('/'))
    return f"../../pdfs/{flat_name}.pdf"

def download(file_url, identifier, timeout=60):
    """Download a PDF (unless this was done already) and return its local path.

    The target path is derived from ``identifier`` via ``id_to_fn``. If a
    non-empty file already exists there, it is reused and nothing is fetched.

    Args:
        file_url: URL to fetch the PDF from.
        identifier: URI identifier of the record; determines the file name.
        timeout: seconds to wait for the server before giving up (handed to
            ``requests.get``).

    Returns:
        Path to the local PDF file.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server responds with an HTTP error status.
    """
    path = id_to_fn(identifier)
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path

    response = requests.get(file_url, timeout=timeout)
    # Fail loudly on 4xx/5xx: otherwise the server's error page would be
    # stored as the "PDF" and be accepted as a cached download on later runs.
    response.raise_for_status()

    # Write under a temporary name first so that an interrupted write cannot
    # leave a truncated file that passes the size check above.
    tmp_path = path + ".part"
    with open(tmp_path, "wb") as f:
        f.write(response.content)
    os.replace(tmp_path, path)
    print(f"wrote {file_url} as {path}")
    return path

# Records with a 'meteor_output' entry added, in processing order.
out_records = []

for rec in records:
    # A single unreachable document should not abort the whole batch; report
    # it and store an empty result, like the JSON decode error case below.
    try:
        path = download(rec['url'], rec['id'])
    except requests.RequestException as err:
        print(f"Download error for {rec['id']} / {rec['url']}: {err}")
        rec['meteor_output'] = {}
        out_records.append(rec)
        continue

    # Open the PDF in a context manager so the handle is closed after the
    # upload instead of staying open for every processed record.
    with open(path, 'rb') as pdf_file:
        # Create a dictionary containing the file to be sent
        filedata = {'fileInput': (path, pdf_file, 'application/pdf')}

        # Send the POST request with the file
        response = requests.post(METEOR_API_URL, files=filedata)

    try:
        rec['meteor_output'] = response.json()
    except requests.JSONDecodeError:
        # The response body was not valid JSON; keep the record with an empty result.
        print(f"JSON decode error for {rec['id']} / {rec['url']}")
        rec['meteor_output'] = {}
    out_records.append(rec)

# Serialise the processed records as JSONL: one JSON document per line.
with open('test-records-meteor.jsonl', 'w') as jsonl_out:
    jsonl_out.writelines(json.dumps(entry) + "\n" for entry in out_records)

JSON decode error for https://www.doria.fi/handle/10024/181710 / https://www.doria.fi/bitstream/handle/10024/181710/Laura Hollsten ÅBO AKADEMI OCH KUNSKAPEN doria 23.8.2021.pdf
JSON decode error for https://www.doria.fi/handle/10024/181709 / https://www.doria.fi/bitstream/handle/10024/181709/Nils Villstrand ÅBO AKADEMI I SIN BÖRJAN doria 2021.pdf
CPU times: user 1.71 s, sys: 2.31 s, total: 4.02 s
Wall time: 2min 15s


In [2]:
# Analyze the extracted metadata

import pandas as pd
from eval import MetadataEvaluator

# JSONL file written by the extraction cell, and the key under which each
# record stores the response received from Meteor.
prediction_records_file = 'test-records-meteor.jsonl'
prediction_output_key = "meteor_output"

# MetadataEvaluator is imported from the project's `eval` module, whose code is
# not part of this notebook; it is given the records file and the key to read.
evaluator = MetadataEvaluator(prediction_records_file, prediction_output_key)
results = evaluator.evaluate_records()

# Tabulate the evaluation results; `df` is reused by the analysis cells that follow.
df = pd.DataFrame(results)
df

Unnamed: 0,rowid,language,field,predicted_val,true_val,match_type,score
0,serial3,fin,year,,2019,not-found,0
1,serial3,fin,language,fi,fin,exact,1
2,serial3,fin,title,Please note! This is a self-archived version o...,Virtain metsä- ja liiketalouden opiskelijat po...,superset,1
3,serial3,fin,publisher,,[Tampereen ammattikorkeakoulu],not-found,0
4,serial3,fin,authors,,"[Antila, Heli, Lähteenmäki, Eija]",not-found,0
...,...,...,...,...,...,...,...
5063,mono294,eng,title,,Constellations,not-found,0
5064,mono294,eng,publisher,,[Academy of Fine Arts of the University of the...,not-found,0
5065,mono294,eng,authors,,"[Feehily, Fergus, Suutari, Inkeri, Demozay, An...",not-found,0
5066,mono294,eng,isbn,9789527131510,[978-952-7131-51-0],exact,1


In [9]:
# Average score for every (language, field) combination.
df.groupby(["language", "field"])["score"].agg("mean")

language  field    
eng       authors      0.608856
          isbn         0.793358
          issn         0.833948
          language     0.974170
          publisher    0.036900
          title        0.619926
          year         0.808118
fin       authors      0.576792
          isbn         0.730375
          issn         0.819113
          language     0.941980
          publisher    0.116041
          title        0.508532
          year         0.815700
swe       authors      0.668750
          isbn         0.900000
          issn         0.937500
          language     0.887500
          publisher    0.106250
          title        0.325000
          year         0.693750
Name: score, dtype: float64

In [10]:
# Count the match types within each (language, field, score) group and print
# the complete listing without truncation.
value_counts_df = (
    df.groupby(["language", "field", "score"])["match_type"]
    .value_counts()
)
print(value_counts_df.to_string())

language  field      score  match_type       
eng       authors    0      not-found             61
                            wrong                 34
                            overlap                5
                            found-nonexistent      4
                            subset                 2
                     1      exact                106
                            superset              47
                            not-relevant          12
          isbn       0      related-isbn          23
                            not-found             17
                            found-nonexistent     10
                            wrong                  6
                     1      not-relevant         117
                            exact                 98
          issn       0      printed-issn          40
                            not-found              5
                     1      not-relevant         173
                            exact                 31


In [11]:
# Lift the row display limit so every matching row is shown, then list the
# ISSN comparisons that were classified as 'printed-issn'.
pd.set_option("display.max_rows", None)
df.loc[df["match_type"].eq("printed-issn") & df["field"].eq("issn")]

Unnamed: 0,rowid,language,field,prediction,true_val,match_type,score
13,serial6,eng,issn,2243-3384,,printed-issn,1
90,serial57,eng,issn,2242-6418,2242-6426,printed-issn,0
405,serial7,fin,issn,1797-5743,,printed-issn,1
412,serial22,fin,issn,1456-002X,,printed-issn,1
454,serial79,fin,issn,1239-3908,2342-1150,printed-issn,0
517,serial2355,fin,issn,1237-4334,,printed-issn,1
531,docthes11,eng,issn,2343-3159,2343-3167,printed-issn,0
566,docthes32,eng,issn,0355-9483,2343-3213,printed-issn,0
580,docthes43,eng,issn,0082-7002,2343-3175,printed-issn,0
587,docthes61,eng,issn,1456-4491,,printed-issn,1
