# Metadata Extraction using local LLM inference

In [1]:
# import necessary libraries

#import transformers
import requests
import fitz
import io
import re
import regex
import peft
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load and test the language model

MODEL = "NatLibFi/Nous-Hermes-2-Mistral-7B-DPO-meteor"

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftConfig

# The PEFT config of MODEL names the base model; the tokenizer is loaded from that base model.
config = PeftConfig.from_pretrained(MODEL)
# device_map="auto" delegates device placement to the library, so the target
# device is not necessarily "cuda".
model = AutoModelForCausalLM.from_pretrained(MODEL, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

input_text = "Write a poem about Machine Learning."
# NOTE: despite its name this is the full tokenizer output (it is unpacked with ** below).
# Move it to the device the model reports instead of a hard-coded "cuda",
# so the smoke test also works when the model was not placed on a CUDA device.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=128, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(outputs[0]))

Loading checkpoint shards: 100%|██████████| 3/3 [02:50<00:00, 56.97s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


NotImplementedError: Cannot copy out of meta tensor; no data!

In [3]:
# ask for PDF URL

# Strip surrounding whitespace so a copy-pasted URL with stray spaces or a
# trailing newline is still a valid request target.
url = input("Please enter a PDF URL: ").strip()
print(f"You entered: {url}")

Please enter a PDF URL:  https://osuva.uwasa.fi/bitstream/handle/10024/16903/978-952-395-133-4.pdf?sequence=2&isAllowed=y


You entered: https://osuva.uwasa.fi/bitstream/handle/10024/16903/978-952-395-133-4.pdf?sequence=2&isAllowed=y


In [4]:
# download the PDF and extract the relevant text

# tunables for the text extraction step

# PDF metadata keys that are left out of the extracted text
PDF_METADATA_SKIP = {'producer', 'creator', 'format'}

# page indices to analyze: the first 8 pages plus the last two
# (negative values count from the end of the document)
PAGES = [*range(8), -2, -1]

# paragraphs shorter than this many characters are kept (unless they look like ToC entries)
THRESHOLD = 100

# pages on which a few paragraphs of THRESHOLD characters or more are also accepted ...
LONG_PARAGRAPH_PAGES = [0, 1]
# ... and how many such long paragraphs are kept on each of those pages
LONG_PARAGRAPH_MAX = 2

def download_and_open_pdf(url, timeout=60):
    """Download a PDF over HTTP and open it from memory with PyMuPDF.

    Args:
        url: address of the PDF document.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The document object returned by ``fitz.open``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail early on 4xx/5xx instead of handing an HTML error page to the PDF parser.
    response.raise_for_status()
    pdf_stream = io.BytesIO(response.content)
    return fitz.open(stream=pdf_stream, filetype="pdf")

def extract_text(pdf):
    """Build a compact text excerpt of a PDF for metadata extraction.

    The excerpt consists of the PDF metadata fields (except those listed in
    PDF_METADATA_SKIP) followed by selected paragraphs from the pages listed
    in PAGES. Paragraphs shorter than THRESHOLD characters are kept; on the
    pages in LONG_PARAGRAPH_PAGES up to LONG_PARAGRAPH_MAX longer paragraphs
    per page are kept as well. Paragraphs that look like table-of-contents
    entries are dropped.

    Args:
        pdf: an opened document supporting ``len()``, page indexing and a
            ``metadata`` mapping (as returned by ``download_and_open_pdf``).

    Returns:
        The selected lines joined with newlines.
    """
    texts = []

    # metadata fields come first; empty values are omitted
    for key, value in (pdf.metadata or {}).items():
        if key not in PDF_METADATA_SKIP and value:
            texts.append(f"{key}: {value}")

    page_count = len(pdf)
    seen_pages = set()

    for page in PAGES:
        # Resolve negative entries (counted from the end) to absolute page
        # numbers so that each existing page is read exactly once. In short
        # documents the first-N and last-N selections overlap; without this
        # step the same page would be extracted twice.
        page_no = page + page_count if page < 0 else page
        if not 0 <= page_no < page_count or page_no in seen_pages:
            continue
        seen_pages.add(page_no)

        text = pdf[page_no].get_text(sort=True)
        # Use regular expression to split text into paragraphs
        # Delimiter: newline(s) followed by an upper case character
        paragraphs = regex.split(r'\n+(?=\p{Lu})', text, flags=regex.UNICODE)
        long_paragraph_count = 0  # the allowance is counted per page

        for paragraph in paragraphs:
            # collapse all runs of whitespace (including line breaks) into single spaces
            paragraph = " ".join(paragraph.split())

            if not paragraph:  # blank page or empty fragment, nothing to keep
                continue
            if '.....' in paragraph or '………' in paragraph or '. . . . .' in paragraph:
                # looks like a ToC entry, skip it
                continue
            if len(paragraph) < THRESHOLD:  # short paragraph, keep it
                texts.append(paragraph)
            elif page_no in LONG_PARAGRAPH_PAGES and long_paragraph_count < LONG_PARAGRAPH_MAX:
                # allow some long paragraphs on the first two pages
                long_paragraph_count += 1
                texts.append(paragraph)
            # any other long paragraph is dropped
    return '\n'.join(texts)
# fetch the document at the entered URL and extract the text to analyze
pdf = download_and_open_pdf(url)
doc_text = extract_text(pdf)

# report the size of the extract, leave one blank line, then show the extract itself
print(f"text length: {len(doc_text)} characters", "", doc_text, sep="\n")

text length: 5196 characters

title: Hallintaa epävarmuudessa – Informaatioresilienssi kriiseissä ja niihin varautumisessa
author: Aino Rantamäki
keywords: Informaatioresilienssi, tietopohjainen päätöksenteko, episteeminen hallinta, resilienssi, kriisi, varautuminen
creationDate: D:20240205103358+02'00'
modDate: D:20240215105801+02'00'
Aino Rantamäki
Hallintaa epävarmuudessa
Informaatioresilienssi kriiseissä ja niihin varautumisessa 
ACTA WASAENSIA 530
Copyright © Vaasan yliopisto ja tekijänoikeuksien haltijat.
ISBN 978-952-395-132-7 (painettu) 978-952-395-133-4 (verkkoaineisto)
ISSN 0355-2667 (Acta Wasaensia 530, painettu) 2323-9123 (Acta Wasaensia 530, verkkoaineisto)
URN https://urn.fi/URN:ISBN:978-952-395-133-4
Hansaprint Oy, Turenki, 2024.
Artikkeliväitöskirja, Johtamisen akateeminen yksikkö, Sosiaali- ja terveyshallin- totiede
Tekijä
Aino Rantamäki https://orcid.org/0000-0001-9828-0511
Ohjaaja(t)
Professori Harri Jalonen
Vaasan yliopisto. Johtamisen akateeminen yksikkö, Sosiaa

In [5]:
%%time

# submit the text to the LLM and display results

import pprint

# user-turn instruction; the extracted document text is appended after it
INSTRUCTION = "Extract metadata from this document. Return as JSON."
# system-turn message sent ahead of the instruction
SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous cataloguing of digital documents."

# pretty-printer used to display the parsed response
pp = pprint.PrettyPrinter(indent=4)

def generate(doc_text, max_new_tokens=1024):
    """Ask the loaded model to extract metadata from a document excerpt.

    Builds a two-message chat (SYSTEM_PROMPT, then INSTRUCTION followed by the
    document text), runs generation with the module-level ``model`` and
    ``tokenizer``, and prints the prompt length in tokens.

    Args:
        doc_text: text excerpt of the document.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded completion only (prompt tokens removed, special tokens skipped).
    """
    messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': INSTRUCTION + "\n\n" + doc_text}
    ]

    # Send the prompt to the device the model reports rather than a hard-coded
    # "cuda", so this also works when the model was not placed on a CUDA device.
    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to(model.device)
    prompt_length = len(input_ids[0])
    print(f"input length: {prompt_length} tokens")

    outputs = model.generate(input_ids,
                             max_new_tokens=max_new_tokens,
                             pad_token_id=tokenizer.pad_token_id)
    # the output sequence starts with the prompt; decode only what follows it
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

response = generate(doc_text)
try:
    extracted_data = json.loads(response)
except json.JSONDecodeError:
    # The prompt only asks the model for JSON, so the reply may not parse.
    # Show the raw reply to make the failure diagnosable, then re-raise.
    print("Model response is not valid JSON:")
    print(response)
    raise
print()
pp.pprint(extracted_data)
print()

input length: 2590 tokens

{   'dc.contributor.author': ['Rantamäki, Aino'],
    'dc.date.issued': '2024',
    'dc.identifier.isbn': ['9789523951334'],
    'dc.language.iso': 'fin',
    'dc.publisher': ['Vaasan yliopisto'],
    'dc.relation.eissn': '2323-9123',
    'dc.title': 'Hallintaa epävarmuudessa – Informaatoresilienssi kriiseissä '
                'ja niihin varautumisessa'}

CPU times: user 8.59 s, sys: 778 ms, total: 9.37 s
Wall time: 9.64 s


# Comparison with Meteor output

Command:

    curl -s -d fileUrl=https://www.regjeringen.no/contentassets/7464f476cb4744e59554c2cb4b192df5/no/pdfs/dataspillstrategi.pdf http://127.0.0.1:5000/json|jq .


Output:

```json
{
  "year": {
    "origin": {
      "type": "PDFINFO",
      "pageNumber": 52
    },
    "value": 2023
  },
  "language": {
    "origin": {
      "type": "LANGUAGE_MODEL"
    },
    "value": "no"
  },
  "title": {
    "origin": {
      "type": "PDFINFO",
      "pageNumber": 1
    },
    "value": "Tid for spill – regjeringens dataspillstrategi 2024–2026"
  },
  "publisher": {
    "origin": {
      "type": "PAGE",
      "pageNumber": 52
    },
    "value": "Kultur- og likestillingsdepartementet"
  },
  "publicationType": null,
  "authors": [],
  "isbn": null,
  "issn": null
}

```
