# Metadata Extraction using local LLM inference

In [1]:
# import necessary libraries

#import transformers
import requests
import fitz
import io
import re
import regex
import peft
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load and test the language model

# Alternative (7B) checkpoint that can be swapped in:
#MODEL = "NatLibFi/Nous-Hermes-2-Mistral-7B-DPO-meteor"
MODEL = "NatLibFi/stablelm-2-zephyr-1_6b-meteor"

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftConfig

# The PEFT config is only needed if the tokenizer has to be loaded from the
# base model instead (see the commented-out alternative below).
config = PeftConfig.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
#tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Smoke test: generate a short completion to confirm the model works.
input_text = "Write a poem about Machine Learning."
# Move the inputs to wherever device_map="auto" placed the model instead of
# assuming a CUDA device is present.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=128, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(outputs[0]))

Write a poem about Machine Learning. 

Machine Learning: A Journey Through the Digital Universe

In the vast digital universe,  
A machine learning journey begins  
A neural network, a self-learning system  
Guiding us through the unknown terrain  

The first step, a dataset to feed  
A vast sea of data, a treasure trove  
Structured and unstructured,  
A treasure trove of knowledge to explore  

The algorithm, a master of learning  
A self-adapting system,  
A journey through the unknown  
A journey of discovery, a journey of learning  

The neural network, a self-learning system  
A journey through the digital universe  
A journey of discovery


In [8]:
# ask for PDF URL

# Strip surrounding whitespace: URLs pasted from a browser or a document often
# carry a trailing space or newline that would otherwise end up in the request.
url = input("Please enter a PDF URL: ").strip()
print(f"You entered: {url}")

Please enter a PDF URL:  https://ffi-publikasjoner.archive.knowledgearc.net/bitstream/handle/20.500.12242/3133/22-01384.pdf


You entered: https://ffi-publikasjoner.archive.knowledgearc.net/bitstream/handle/20.500.12242/3133/22-01384.pdf


In [9]:
# download the PDF and extract the relevant text

# settings for text extraction

# Page indexes to analyze: the first 8 pages, plus -1 for the last page.
PAGES = [*range(8), -1]

# Paragraphs shorter than this many characters are always kept.
THRESHOLD = 100

# On the first two pages some long paragraphs are accepted as well...
LONG_PARAGRAPH_PAGES = [0, 1]
# ...up to this many per page.
LONG_PARAGRAPH_MAX = 2

# PDF metadata fields that are not included in the extracted text.
PDF_METADATA_SKIP = {
    'format',
    'creator',
    'producer',
}

def download_and_open_pdf(url, timeout=60):
    """Download a PDF over HTTP(S) and open it from memory.

    Args:
        url: Address of the PDF document.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot hang the notebook indefinitely.

    Returns:
        The document object returned by ``fitz.open`` for the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail early on HTTP errors rather than handing an HTML error page to the
    # PDF parser, which would produce a confusing parse failure.
    response.raise_for_status()
    pdf_stream = io.BytesIO(response.content)
    return fitz.open(stream=pdf_stream, filetype="pdf")

def extract_text(pdf):
    """Build the condensed text of a PDF that is submitted to the model.

    The result is a newline-joined string made of:

    * one ``key: value`` line per non-empty PDF metadata field that is not
      listed in ``PDF_METADATA_SKIP``;
    * the paragraphs of the pages listed in ``PAGES``, where paragraphs
      containing dot leaders are dropped, paragraphs shorter than
      ``THRESHOLD`` characters are kept, and at most ``LONG_PARAGRAPH_MAX``
      longer paragraphs are kept on each page in ``LONG_PARAGRAPH_PAGES``.

    Args:
        pdf: An opened document exposing ``metadata``, ``len()`` and page
            indexing, as returned by ``download_and_open_pdf``.

    Returns:
        The selected metadata lines and paragraphs joined with newlines.
    """
    collected = [
        f"{field}: {value}"
        for field, value in pdf.metadata.items()
        if field not in PDF_METADATA_SKIP and value
    ]

    # Highest non-negative index handled directly; the final page is reached
    # through the -1 entry of PAGES, so it is never processed twice.
    highest_direct_index = len(pdf) - 2

    for page_no in PAGES:
        if page_no > highest_direct_index:
            continue

        page_text = pdf[page_no].get_text(sort=True)
        # A paragraph boundary is one or more newlines followed by an
        # upper case character.
        chunks = regex.split(r'\n+(?=\p{Lu})', page_text, flags=re.UNICODE)
        long_kept = 0

        for chunk in chunks:
            # Collapse all runs of whitespace into single spaces.
            normalized = " ".join(chunk.split())

            # Dot leaders look like a table-of-contents entry: drop it.
            if '.....' in normalized or '. . . . .' in normalized:
                continue

            if len(normalized) >= THRESHOLD:
                # Long paragraphs are only allowed on the designated pages,
                # and only until the per-page quota is used up.
                quota_left = long_kept < LONG_PARAGRAPH_MAX
                if page_no not in LONG_PARAGRAPH_PAGES or not quota_left:
                    continue
                long_kept += 1

            collected.append(normalized)

    return '\n'.join(collected)
# Fetch the document, reduce it to the text sent to the model, and show it.
pdf = download_and_open_pdf(url)
doc_text = extract_text(pdf)

# Length line, a blank line, then the extracted text itself.
print(f"text length: {len(doc_text)} characters", "", doc_text, sep="\n")

text length: 2666 characters

title: Bør vi samarbeide? – en litteraturstudie om valg av sourcingstrategi
author: Olger Breivik Pedersen
keywords: 

creationDate: D:20230110100249+01'00'
modDate: D:20230110112249+01'00'
encryption: Standard V4 R4 128-bit AES
22/01384
FFI-RAPPORT
Bør vi samarbeide? – en litteraturstudie om valg av sourcingstrategi
Olger Breivik Pedersen


Bør vi samarbeide? – en litteraturstudie om valg av sourcingstrategi
Olger Breivik Pedersen
Forsvarets forskningsinstitutt (FFI) 2. januar 2023
FFI-RAPPORT 22/01384 1

Emneord
Outsourcing
Sourcing
Industrisamarbeid
Samarbeid
Strategisk partnerskap
Forsvarsanskaffelser
FFI-rapport 22/01384
Prosjektnummer 1545
Elektronisk ISBN 978-82-464-3444-5
Engelsk tittel
Sourcing strategies – a literature review
Godkjennere
Ane Ofstad Presterud, forskningsleder
Sverre Nyhus Kvalvik, forskningssjef
Dokumentet er elektronisk godkjent og har derfor ikke håndskreven signatur
FFI-RAPPORT 22/01384

Sammendrag
FFI-RAPPORT 22/01384 3

Summa

In [10]:
%%time

# submit the text to the LLM and display results

import pprint

# Pretty-printer (4-space indent) used to display the parsed metadata.
pp = pprint.PrettyPrinter(indent=4)

# System message of the chat prompt.
SYSTEM_PROMPT = (
    "You are a skilled librarian specialized in "
    "meticulous cataloguing of digital documents."
)
# User instruction that is placed in front of the extracted document text.
INSTRUCTION = "Extract metadata from this document. Return as JSON."

def generate(doc_text):
    """Ask the language model to extract metadata from a document's text.

    A two-message chat (system prompt, then instruction plus document text) is
    rendered with the tokenizer's chat template and passed to the model.

    Args:
        doc_text: Text of the document, e.g. the output of ``extract_text``.

    Returns:
        The decoded model output without the prompt and without special
        tokens.
    """
    messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': INSTRUCTION + "\n\n" + doc_text}
    ]

    # Place the prompt on the device the model was loaded onto instead of
    # assuming CUDA, so this also runs when device_map="auto" picks another
    # device.
    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to(model.device)
    prompt_length = len(input_ids[0])
    print(f"input length: {prompt_length} tokens")

    outputs = model.generate(input_ids,
                             max_new_tokens=2048,
                             pad_token_id=tokenizer.pad_token_id,
                             eos_token_id=tokenizer.eos_token_id)
    # The returned sequence starts with the prompt tokens; keep only what was
    # generated after them.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

response = generate(doc_text)

try:
    extracted_data = json.loads(response)
except json.JSONDecodeError:
    # The model is not guaranteed to return well-formed JSON. Show the raw
    # response so the failure can be diagnosed, then let the error propagate.
    print("Model response is not valid JSON:")
    print(response)
    raise

print()
pp.pprint(extracted_data)
print()

input length: 1250 tokens

{   'dc.contributor.author': ['Pedersen, Olger Breivik'],
    'dc.date.issued': '2023',
    'dc.identifier.isbn': ['9788246434445'],
    'dc.language.iso': 'swe',
    'dc.publisher': ['Forsvarets forskningsinstitutt (FFI)'],
    'dc.relation.eissn': '1545',
    'dc.title': 'Bør vi samarbeide? å en litteraturstudie om valg av '
                'sourcingstrategi'}

CPU times: user 4.55 s, sys: 50.6 ms, total: 4.6 s
Wall time: 4.63 s


# Comparison with Meteor output

Command:

    curl -s -d fileUrl=https://www.regjeringen.no/contentassets/7464f476cb4744e59554c2cb4b192df5/no/pdfs/dataspillstrategi.pdf http://127.0.0.1:5000/json|jq .


Output:

```json
{
  "year": {
    "origin": {
      "type": "PDFINFO",
      "pageNumber": 52
    },
    "value": 2023
  },
  "language": {
    "origin": {
      "type": "LANGUAGE_MODEL"
    },
    "value": "no"
  },
  "title": {
    "origin": {
      "type": "PDFINFO",
      "pageNumber": 1
    },
    "value": "Tid for spill – regjeringens dataspillstrategi 2024–2026"
  },
  "publisher": {
    "origin": {
      "type": "PAGE",
      "pageNumber": 52
    },
    "value": "Kultur- og likestillingsdepartementet"
  },
  "publicationType": null,
  "authors": [],
  "isbn": null,
  "issn": null
}

```
