# Metadata Extraction using local LLM inference

In [1]:
# import necessary libraries

#import transformers
import requests
import fitz
import io
import re
import regex
import peft
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load and test the language model

MODEL = "NatLibFi/Nous-Hermes-2-Mistral-7B-DPO-meteor"

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftConfig

# The PEFT config of MODEL names the base model, which is where the tokenizer is loaded from
config = PeftConfig.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Smoke test: generate a short completion to confirm that model and tokenizer work together
input_text = "Write a poem about Machine Learning."
# device_map="auto" chooses the placement, so follow the model instead of hard-coding "cuda"
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=128, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(outputs[0]))

Loading checkpoint shards: 100%|██████████| 3/3 [02:38<00:00, 52.80s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<s> Write a poem about Machine Learning.

Machine Learning

The machines are learning,
Their minds are expanding,
Their knowledge is growing,
Their potential is landing.

They learn from data,
They learn from experience,
They learn from patterns,
They learn from persistence.

They learn to recognize,
They learn to predict,
They learn to optimize,
They learn to detect.

They learn to classify,
They learn to cluster,
They learn to regress,
They learn to forecast.

They learn to recommend,
They learn to segment,
They learn to


In [3]:
# ask for PDF URL

# Strip surrounding whitespace so a pasted URL is requested exactly as intended
url = input("Please enter a PDF URL: ").strip()
if not url:
    # Fail here with a clear message rather than later inside the download step
    raise ValueError("No PDF URL was entered")
print(f"You entered: {url}")

Please enter a PDF URL:  https://www.doria.fi/bitstream/handle/10024/180306/Automaattisen%20kuvailun%20palvelun%20integroiminen%20Kansalliskirjaston%20j%c3%a4rjestelm%c3%a4kokonaisuuteen.pdf?sequence=5&isAllowed=y


You entered: https://www.doria.fi/bitstream/handle/10024/180306/Automaattisen%20kuvailun%20palvelun%20integroiminen%20Kansalliskirjaston%20j%c3%a4rjestelm%c3%a4kokonaisuuteen.pdf?sequence=5&isAllowed=y


In [4]:
# download the PDF and extract the relevant text

# Tunable settings read by extract_text()

# Page indices to analyze: the first 8 pages plus the last page (-1)
PAGES = [*range(8), -1]

# Paragraphs with fewer characters than this are kept (unless they look like ToC entries)
THRESHOLD = 100

# Pages on which a limited number of longer paragraphs is accepted as well
LONG_PARAGRAPH_PAGES = [0, 1]

# Upper limit of long paragraphs kept on each of those pages (the counter restarts per page)
LONG_PARAGRAPH_MAX = 2

# PDF metadata fields that are left out of the extracted text
PDF_METADATA_SKIP = {'format', 'creator', 'producer'}

def download_and_open_pdf(url, timeout=60):
    """Download a PDF over HTTP and open it from memory with PyMuPDF.

    Args:
        url: address of the PDF file to fetch.
        timeout: seconds passed to requests.get as its timeout, so a stalled
            server cannot block the notebook indefinitely.

    Returns:
        The document object returned by fitz.open for the downloaded bytes.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.RequestException: for other request failures (e.g. timeout).
    """
    response = requests.get(url, timeout=timeout)
    # Stop on HTTP errors; otherwise the body of an error page would be handed to fitz as a PDF
    response.raise_for_status()
    pdf_stream = io.BytesIO(response.content)
    return fitz.open(stream=pdf_stream, filetype="pdf")

def extract_text(pdf):
    """Collect the metadata-relevant text of a PDF as one newline-joined string.

    The result starts with one "key: value" line for every non-empty PDF
    metadata field that is not listed in PDF_METADATA_SKIP. After that, the
    pages listed in PAGES are scanned: each page's text is split into
    paragraphs and a paragraph is kept when it is shorter than THRESHOLD, or
    when the page is in LONG_PARAGRAPH_PAGES and fewer than LONG_PARAGRAPH_MAX
    long paragraphs have been kept on that page so far. Paragraphs containing
    dot leaders (table-of-contents entries) are always dropped.

    Args:
        pdf: an opened document supporting len(), indexing by page number and
            a ``metadata`` mapping (as returned by download_and_open_pdf).

    Returns:
        The kept lines joined with newline characters.
    """
    collected = [
        f"{field}: {value}"
        for field, value in pdf.metadata.items()
        if field not in PDF_METADATA_SKIP and value
    ]

    # Non-negative indices above this value are skipped; this excludes the
    # final page, which is reached through the -1 entry of PAGES instead.
    highest_index = len(pdf) - 2

    for page_no in PAGES:
        if page_no > highest_index:
            continue

        page_text = pdf[page_no].get_text(sort=True)
        long_allowed = page_no in LONG_PARAGRAPH_PAGES
        long_kept = 0  # restarts for every page

        # A paragraph boundary is one or more newlines followed by an upper case character
        for chunk in regex.split(r'\n+(?=\p{Lu})', page_text, flags=re.UNICODE):
            # Collapse all whitespace runs (including line breaks) into single spaces
            normalized = " ".join(chunk.split())

            # Dot leaders indicate a ToC entry
            if '.....' in normalized or '. . . . .' in normalized:
                continue

            if len(normalized) >= THRESHOLD:
                # Long paragraph: only accepted on selected pages, up to the per-page limit
                if not long_allowed or long_kept >= LONG_PARAGRAPH_MAX:
                    continue
                long_kept += 1

            collected.append(normalized)

    return '\n'.join(collected)
# Fetch the document, reduce it to the relevant text and show what will be sent to the LLM
pdf = download_and_open_pdf(url)
doc_text = extract_text(pdf)

# One call emits the length line, a blank line and then the extracted text
print(f"text length: {len(doc_text)} characters", "", doc_text, sep="\n")

text length: 1961 characters

title: Automaattisen kuvailun palvelun integroiminen Kansalliskirjaston järjestelmäkokonaisuuteen 
author: Mona Lehtinen
creationDate: D:20210211163354+02'00'
modDate: D:20210211163359+02'00'
Automaattisen kuvailun palvelun integroiminen
Kansalliskirjaston järjestelmäkokonaisuuteen - tietovirrat ja prosessit
Mona Lehtinen, Satu Niininen, Juho Inkinen, Mikko Lappalainen
Image by DavidRockDesign from Pixabay
Kansalliskirjaston raportteja ja selvityksiä 1/2021
ISBN 978-951-51-6986-0
ISSN 2242–8119
Sisältö 1
Johdanto 2
Annif 2
Kansalliskirjaston tietovirtoihin liittyvät järjestelmät 6
Kansalliskirjaston tietovirrat ja kuvailuprosessit 8
Annif-integraatioiden nykytila 9
Johtopäätökset
Johdanto
Mona Lehtinen, Satu Niininen, Juho Inkinen, Mikko Lappalainen 1
Annifin algoritmien toiminta perustuu koneoppi-
Edellä mainitussa artikkelissa käydään myös läpi
Kansalliskirjaston tietovirtoihin liittyvät järjestelmät
Annifin käyttöönottoa suunniteltaessa on hyvä tuntea
A

In [5]:
%%time

# submit the text to the LLM and display results

import pprint

# Pretty-printer used to display the parsed LLM response
pp = pprint.PrettyPrinter(indent=4)

# System message of the chat sent to the model
SYSTEM_PROMPT = (
    "You are a skilled librarian specialized in "
    "meticulous cataloguing of digital documents."
)

# Instruction that is placed before the document text in the user message
INSTRUCTION = (
    "Extract metadata from this document. "
    "Return as JSON."
)

def generate(doc_text):
    """Ask the language model to extract metadata from the given document text.

    A two-message chat is built (SYSTEM_PROMPT as the system message, and
    INSTRUCTION followed by the document text as the user message), rendered
    with the tokenizer's chat template and passed to the model, which may
    generate up to 1024 new tokens. The prompt length in tokens is printed.

    Args:
        doc_text: text appended to the instruction in the user message.

    Returns:
        The decoded completion only: prompt tokens are cut off and special
        tokens are skipped.
    """
    messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': INSTRUCTION + "\n\n" + doc_text}
    ]

    # Follow the model's own device instead of hard-coding "cuda"
    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to(model.device)
    prompt_length = len(input_ids[0])
    print(f"input length: {prompt_length} tokens")

    outputs = model.generate(input_ids,
                             max_new_tokens=1024,
                             pad_token_id=tokenizer.pad_token_id)
    # The output sequence begins with the prompt; keep only what was generated after it
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

response = generate(doc_text)
try:
    extracted_data = json.loads(response)
except json.JSONDecodeError:
    # The model is only asked for JSON, not forced to produce it;
    # show the raw reply so the failure can be diagnosed, then fail as before
    print("Model response is not valid JSON:")
    print(response)
    raise
print()
pp.pprint(extracted_data)
print()

input length: 944 tokens

{   'dc.contributor.author': [   'Lehtinen, Mona',
                                 'Niininen, Satu',
                                 'Inkinen, Juho',
                                 'Lappalainen, Mikko'],
    'dc.date.issued': '2021',
    'dc.identifier.isbn': ['9789515169860'],
    'dc.language.iso': 'fin',
    'dc.publisher': ['Kansalliskirjaston tiedotus- ja analyysioikeus'],
    'dc.relation.eissn': '2242-8119',
    'dc.title': 'Automaattisen kuvailun palvelun integroiminen '
                'Kansalliskirjaston järjestelmäkokonaisuuteen'}

CPU times: user 8.7 s, sys: 228 ms, total: 8.93 s
Wall time: 9.52 s


# Comparison with Meteor output

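Command (note: this queries a different PDF than the doria.fi document analysed above, so the two outputs are not directly comparable):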

    curl -s -d fileUrl=https://www.regjeringen.no/contentassets/7464f476cb4744e59554c2cb4b192df5/no/pdfs/dataspillstrategi.pdf http://127.0.0.1:5000/json|jq .


Output:

```json
{
  "year": {
    "origin": {
      "type": "PDFINFO",
      "pageNumber": 52
    },
    "value": 2023
  },
  "language": {
    "origin": {
      "type": "LANGUAGE_MODEL"
    },
    "value": "no"
  },
  "title": {
    "origin": {
      "type": "PDFINFO",
      "pageNumber": 1
    },
    "value": "Tid for spill – regjeringens dataspillstrategi 2024–2026"
  },
  "publisher": {
    "origin": {
      "type": "PAGE",
      "pageNumber": 52
    },
    "value": "Kultur- og likestillingsdepartementet"
  },
  "publicationType": null,
  "authors": [],
  "isbn": null,
  "issn": null
}

```
