# Metadata Extraction using local LLM inference

In [1]:
# import necessary libraries

#import transformers
import requests
import fitz
import io
import re
import regex
import peft
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load and test the language model

MODEL = "NatLibFi/zephyr-7b-meteor-ludwig"

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftConfig

# The PEFT config of MODEL names the base model; it is only used here to
# pick the tokenizer that belongs to that base model.
config = PeftConfig.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Smoke test: generate a short completion to confirm that the model works.
input_text = "Write me a poem about Machine Learning."
# Despite its name, `input_ids` holds the tokenizer's complete output (it is
# unpacked with ** below). Move it to the device the model was actually
# placed on by device_map="auto" instead of assuming "cuda".
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=128)
print(tokenizer.decode(outputs[0]))

Loading checkpoint shards: 100%|██████████| 8/8 [02:44<00:00, 20.62s/it]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> Write me a poem about Machine Learning.

In a world where data is the new gold,
Machine Learning is the alchemist's tale,
Transforming raw data into insights bold,
With algorithms that never fail.

It's a science that's both art and craft,
A symphony of math and code,
A journey that's both deep and vast,
A story that's yet to be told.

It's a language that's spoken by the stars,
A language that's written in the sky,
A language that's whispered in the wind,
A language that'


In [13]:
# ask for PDF URL

# Strip surrounding whitespace so that a pasted URL with stray spaces or a
# trailing newline is still a valid address for the download step.
url = input("Please enter a PDF URL: ").strip()
print(f"You entered: {url}")

Please enter a PDF URL:  https://www.regjeringen.no/contentassets/7464f476cb4744e59554c2cb4b192df5/no/pdfs/dataspillstrategi.pdf


You entered: https://www.regjeringen.no/contentassets/7464f476cb4744e59554c2cb4b192df5/no/pdfs/dataspillstrategi.pdf


In [14]:
# download the PDF and extract the relevant text

# settings for text extraction

# Page indices to analyze: the first 8 pages plus the last page (-1)
PAGES = [*range(8), -1]
# Paragraphs shorter than this many characters are always kept
THRESHOLD = 100
# Pages on which some long paragraphs are accepted (the first two)
LONG_PARAGRAPH_PAGES = [0, 1]
# How many long paragraphs to keep on each of those pages
LONG_PARAGRAPH_MAX = 2
# PDF metadata fields not to include in the extracted text
PDF_METADATA_SKIP = {'format', 'creator', 'producer'}

def download_and_open_pdf(url, timeout=60):
    """Download a PDF over HTTP and open it from memory.

    Args:
        url: Address of the PDF file to fetch.
        timeout: Seconds to wait on the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The document object produced by ``fitz.open`` for the downloaded bytes.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On other download failures, e.g. a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Stop here on an error status rather than handing an error page to the
    # PDF parser, which would fail with a far less helpful message.
    response.raise_for_status()
    pdf_stream = io.BytesIO(response.content)
    return fitz.open(stream=pdf_stream, filetype="pdf")

def extract_text(pdf):
    """Collect the metadata-bearing text of an opened PDF as one string.

    The result starts with one ``key: value`` line per non-empty PDF metadata
    field (except those in PDF_METADATA_SKIP), followed by selected
    paragraphs from the pages listed in PAGES. Lines are joined with newlines.

    Paragraph selection, per page:
      * paragraphs containing dot leaders are treated as table-of-contents
        entries and dropped;
      * paragraphs shorter than THRESHOLD characters are always kept;
      * on pages in LONG_PARAGRAPH_PAGES, up to LONG_PARAGRAPH_MAX longer
        paragraphs are kept as well;
      * every other long paragraph is dropped.
    """
    collected = [
        f"{field}: {value}"
        for field, value in pdf.metadata.items()
        if field not in PDF_METADATA_SKIP and value
    ]

    # Non-negative indices at or beyond the final page are skipped; the final
    # page itself is reached through the -1 entry of PAGES.
    highest_index = len(pdf) - 2

    for page_no in PAGES:
        if page_no > highest_index:
            continue

        page_text = pdf[page_no].get_text(sort=True)
        long_kept = 0  # the allowance for long paragraphs restarts on every page

        # A paragraph boundary is one or more newlines followed by an
        # upper case character.
        for chunk in regex.split(r'\n+(?=\p{Lu})', page_text, flags=re.UNICODE):
            chunk = " ".join(chunk.split())  # collapse all whitespace runs

            if '.....' in chunk or '. . . . .' in chunk:
                continue  # looks like a ToC entry
            if len(chunk) < THRESHOLD:
                collected.append(chunk)
                continue
            if page_no in LONG_PARAGRAPH_PAGES and long_kept < LONG_PARAGRAPH_MAX:
                long_kept += 1
                collected.append(chunk)

    return '\n'.join(collected)
# fetch the document, reduce it to the selected text and show what was extracted
pdf = download_and_open_pdf(url)
doc_text = extract_text(pdf)

# length summary, an empty line, then the extracted text itself
print(f"text length: {len(doc_text)} characters", "", doc_text, sep="\n")

text length: 1485 characters

title: Tid for spill – regjeringens dataspillstrategi 2024–2026
author: Kultur- og likestillingsdepartementet
subject: Tid for spill – regjeringens dataspillstrategi 2024–2026
creationDate: D:20231212102035+01'00'
modDate: D:20231212150502+01'00'
Kultur- og likestillingsdepartementet
Strategi
Tid for spill – regjeringens dataspillstrategi 2024–2026
Illustrasjonsfoto: AdobeStock 2
Vurdering av dagens virkemidler i
Forsideillustrasjon: Sunlight, Krillbite Studio AS 3
Illustrasjonsfoto: AdobeStock 4
Forord
Lubna Jaffery
KULTUR- OG LIKESTILLINGSMINISTER
Foto: Ilja C. Hendel/KUD
Takk til Norsk filminstitutt (NFI), Medietilsynet, Kulturtanken, Innovasjon
Illustrasjonsfoto: AdobeStock 6
Statsminister Jonas Gahr Støre
Dataspill er næring
Dataspill er kultur
Dataspill er internasjonalisering
Dataspill er innovasjon
Chart: Are You Not Entertained? | Statista 7
Om dataspillstrategien
Dataspill er kunnskap
Arbeidet med regjeringens dataspillstrategi er ledet av Kultur

In [15]:
%%time

# submit the text to the LLM and display results

import pprint

# pretty-printer used to display the parsed metadata
pp = pprint.PrettyPrinter(indent=4)

# Prompt sent to the model: a fixed instruction, the document text in the
# {text} placeholder, and an open "Response" section for the model to fill.
PROMPT_TEMPLATE = (
    "### Instruction:\n"
    "Extract metadata from the following document. Return as JSON.\n"
    "\n"
    "### Input:\n"
    "{text}\n"
    "\n"
    "### Response:\n"
)

def generate(doc_text, max_new_tokens=1024):
    """Run the language model on a document's text and return its reply.

    Args:
        doc_text: Text to insert into the {text} slot of PROMPT_TEMPLATE.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt removed, special tokens skipped).

    Side effects:
        Prints the prompt length in tokens.
    """
    input_text = PROMPT_TEMPLATE.format(text=doc_text)
    # Follow the device the model lives on instead of assuming "cuda".
    input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
    # Token count of the single prompt in the batch, read from the tensor shape.
    prompt_length = input_ids["input_ids"].shape[1]
    print(f"input length: {prompt_length} tokens")

    outputs = model.generate(**input_ids, max_new_tokens=max_new_tokens)
    # The output sequence begins with the prompt tokens; decode only what follows.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

response = generate(doc_text)
try:
    extracted_data = json.loads(response)
except json.JSONDecodeError:
    # Nothing forces the model to emit valid JSON. Show the raw reply so the
    # failure can be diagnosed, then let the error propagate as before.
    print("Model response is not valid JSON:")
    print(response)
    raise
print()
pp.pprint(extracted_data)
print()

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


input length: 680 tokens

{   'dc.contributor.author': ['Jaffery, Lubna'],
    'dc.date.issued': '2023-12-12',
    'dc.language.iso': 'nor',
    'dc.publisher': ['Kultur- og likestillingsdepartementet'],
    'dc.title': 'Tid for spill : regjeringens dataspillstrategi 2024-2026'}

CPU times: user 3.45 s, sys: 72.7 ms, total: 3.52 s
Wall time: 3.62 s


# Comparison with Meteor output

Command:

    curl -s -d fileUrl=https://www.regjeringen.no/contentassets/7464f476cb4744e59554c2cb4b192df5/no/pdfs/dataspillstrategi.pdf http://127.0.0.1:5000/json|jq .


Output:

```json
{
  "year": {
    "origin": {
      "type": "PDFINFO",
      "pageNumber": 52
    },
    "value": 2023
  },
  "language": {
    "origin": {
      "type": "LANGUAGE_MODEL"
    },
    "value": "no"
  },
  "title": {
    "origin": {
      "type": "PDFINFO",
      "pageNumber": 1
    },
    "value": "Tid for spill – regjeringens dataspillstrategi 2024–2026"
  },
  "publisher": {
    "origin": {
      "type": "PAGE",
      "pageNumber": 52
    },
    "value": "Kultur- og likestillingsdepartementet"
  },
  "publicationType": null,
  "authors": [],
  "isbn": null,
  "issn": null
}

```
