# Extract metadata from a PDF document using an LLM API service

This notebook demonstrates how to extract metadata from a PDF document using an API service that provides access to a fine-tuned LLM that performs the main extraction task.

For testing, you need the following:

1. Install [llama.cpp](https://github.com/ggerganov/llama.cpp) on your computer (on Linux, using CPU only: `git clone` the repository and run `make` to compile it)
2. Download a fine-tuned model such as [NatLibFi/Qwen2-0.5B-Instruct-FinGreyLit-GGUF](https://huggingface.co/NatLibFi/Qwen2-0.5B-Instruct-FinGreyLit-GGUF) in GGUF format, i.e. [Qwen2-0.5B-Instruct-FinGreyLit-Q4_K_M.gguf](https://huggingface.co/NatLibFi/Qwen2-0.5B-Instruct-FinGreyLit-GGUF/blob/main/Qwen2-0.5B-Instruct-FinGreyLit-Q4_K_M.gguf)
3. Start the llama.cpp server using the GGUF model: `./llama-server -m Qwen2-0.5B-Instruct-FinGreyLit-Q4_K_M.gguf` and leave it running
4. Install the Python dependencies that this notebook needs from `requirements.txt`
5. Now you can run this notebook! Adjust LLM_API_URL below accordingly if your server is not running on `localhost:8080`

In [1]:
import os

# Talk to the local LLM server directly: exempt localhost from proxying and
# drop any proxy settings inherited from the environment.
os.environ["NO_PROXY"] = "localhost"

# pop(..., None) instead of del: the variables may not be set at all, and on
# platforms with case-insensitive environment names (Windows) removing
# "HTTP_PROXY" already removes "http_proxy", so a second del would raise KeyError.
for _proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy"):
    os.environ.pop(_proxy_var, None)

#os.environ

In [9]:
# set config
# LLM_API_URL is the chat-completions endpoint that llm_request() posts to.
# Exactly one of the alternatives below should be active; uncomment the one
# matching the server you started.
# NOTE(review): the active line targets a local vLLM server on port 8000, while
# the introduction describes a llama.cpp server on localhost:8080 - switch to the
# first line if you followed those instructions.

#LLM_API_URL = "http://localhost:8080/chat/completions"  # local llama.cpp server
#LLM_API_URL = "http://127.0.0.1:30000/v1/chat/completions"  # local SGLang
LLM_API_URL = "http://127.0.0.1:8000/v1/chat/completions"  # local vLLM

# MODEL_NAME is sent as the "model" field of every request.
MODEL_NAME = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"  # llama-server doesn't care about this, it will simply use the model file you gave it

In [10]:
# import necessary libraries

import requests
import json
import io
import fitz
import regex
import re

In [21]:
%%time
# test the language model using a simple request

def llm_request(message, system_prompt="You are a helpful assistant.", temperature=0.0, max_tokens=50, timeout=300):
    """Send a single-turn chat completion request to the LLM API and return the reply text.

    The request is posted to LLM_API_URL for the model MODEL_NAME and asks for a
    JSON object response (``response_format = json_object``).

    Parameters:
        message: content of the user message.
        system_prompt: content of the system message.
        temperature: sampling temperature passed to the API.
        max_tokens: maximum number of tokens to generate.
        timeout: seconds to wait for the server before giving up; without a
            timeout, requests would wait indefinitely on a hung server.

    Returns:
        The content string of the first choice's message on HTTP 200, otherwise
        None (after printing the status code and response body).

    Raises:
        requests.RequestException: if the server cannot be reached or times out.
    """
    import os  # local import: keeps this cell independent of other cells' imports

    # The API key is read from the environment so that real credentials never
    # need to be written into the notebook; the fallback is a dummy token that
    # is sufficient for local servers which do not check it.
    api_key = os.environ.get("LLM_API_KEY", "token-abc123")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    data = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": message},
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
        "response_format": {"type": "json_object"}
    }

    response = requests.post(LLM_API_URL, headers=headers, json=data, timeout=timeout)

    if response.status_code == 200:
        return response.json()['choices'][0]['message']['content']

    # Best-effort error reporting: callers must be prepared to receive None.
    print("Error:", response.status_code, response.text)
    return None

# Smoke test: prints the model's reply; if the server answers with a non-200
# status, llm_request prints the error and this prints None.
print(llm_request("Write some JSON that describes a poem collection about machine learning", temperature=0.5))



CPU times: user 3.84 ms, sys: 1.92 ms, total: 5.75 ms
Wall time: 776 ms


In [4]:
# ask for PDF URL

# Strip surrounding whitespace, which easily sneaks in when a URL is pasted.
url = input("Please enter a PDF URL: ").strip()

Please enter a PDF URL:  https://www.doria.fi/bitstream/handle/10024/188075/Tekoa%cc%88lykahvit_%20Extracting%20metadata%20using%20LLMs.pdf?sequence=1&isAllowed=y


In [5]:
# download the PDF and extract the relevant text

# settings for text extraction
# (page indices are 0-based; negative indices count from the end of the document)
PAGES = [0, 1, 2, 3, 4, 5, 6, 7, -2, -1]  # pages to analyze: first 8 pages + last 2 pages
THRESHOLD = 100                       # paragraphs shorter than this (in characters) are kept, unless they look like ToC entries
LONG_PARAGRAPH_PAGES = [0, 1]         # on first two pages, some long paragraphs are accepted
LONG_PARAGRAPH_MAX = 2                # how many long paragraphs to keep on each of the first two pages
PDF_METADATA_SKIP = {'format', 'creator', 'producer'}  # PDF metadata fields not to include in extracted text

def download_and_open_pdf(url, timeout=60):
    """Download the PDF at the given URL and open it in memory with PyMuPDF.

    Parameters:
        url: address of the PDF file.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The opened fitz document.

    Raises:
        requests.HTTPError: if the server answers with an error status, so that
            an HTML error page is never handed to the PDF parser.
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    pdf_stream = io.BytesIO(response.content)
    return fitz.open(stream=pdf_stream, filetype="pdf")

def extract_content(pdf):
    """Extract the pdfinfo metadata and the text of selected pages from the given PDF.

    The pages listed in PAGES are analyzed (negative entries count from the end
    of the document). Each page is split into paragraphs; ToC-like lines are
    dropped, paragraphs shorter than THRESHOLD are kept, and on the pages in
    LONG_PARAGRAPH_PAGES up to LONG_PARAGRAPH_MAX longer paragraphs are kept too.

    Parameters:
        pdf: an opened fitz document.

    Returns:
        A dict ``{"pdfinfo": {...}, "pages": [{"page": <0-based number>, "text": <str>}, ...]}``.
        Pages that yield no text are omitted, and every page appears at most once.
    """

    pdfinfo = {}
    pages = []

    metadata = pdf.metadata or {}
    for key, value in metadata.items():
        if key not in PDF_METADATA_SKIP and value:
            pdfinfo[key] = value

    page_count = len(pdf)
    seen_pages = set()

    for page in PAGES:
        # Resolve negative entries to absolute page numbers so that short
        # documents, where "first pages" and "last pages" overlap, do not get
        # the same page extracted twice or an out-of-range page requested.
        page_number = page if page >= 0 else page_count + page
        if not 0 <= page_number < page_count or page_number in seen_pages:
            continue
        seen_pages.add(page_number)

        texts = []
        text = pdf[page_number].get_text(sort=True)
        # Use regular expression to split text into paragraphs
        # Delimiter: newline(s) followed by an upper case character
        paragraphs = regex.split(r'\n+(?=\p{Lu})', text, flags=re.UNICODE)
        long_paragraph_count = 0

        for paragraph in paragraphs:
            # collapse all runs of whitespace (including newlines) into single spaces
            paragraph = " ".join(paragraph.strip().split())

            if '.....' in paragraph or '. . . . .' in paragraph: # looks like a ToC entry, skip it
                continue
            elif len(paragraph) < THRESHOLD:  # short paragraph, keep it
                texts.append(paragraph)
            elif page_number in LONG_PARAGRAPH_PAGES and long_paragraph_count < LONG_PARAGRAPH_MAX:
                # allow some long paragraphs on the first two pages
                long_paragraph_count += 1
                texts.append(paragraph)
            # anything else is a long paragraph and is skipped

        text = '\n'.join(texts)
        if text:
            pages.append({"page": page_number, "text": text})

    return {"pdfinfo": pdfinfo, "pages": pages}

# Fetch the document, then serialise its extracted content for the LLM prompt.
pdf = download_and_open_pdf(url)
doc_json = json.dumps(extract_content(pdf))

# Report the payload size, a blank line, and finally the payload itself.
print(f"text length: {len(doc_json)} characters", "", doc_json, sep="\n")

text length: 1232 characters

{"pdfinfo": {"title": "Teko\u00e4lykahvit: Extracting metadata using LLMs"}, "pages": [{"page": 0, "text": "Extracting metadata from grey literature using large language models\nOsma Suominen\nTeko\u00e4lykahvit 1.11.2023\nPerustuu SWIB23-konferenssin salamaesitykseen 12.9.2023"}, {"page": 1, "text": "Grey literature? reports working papers government documents white papers preprints theses \u2026 semi-formal non-commercial\nPDFs published on the web \u2013 lots of them!\nImage made with DreamStudio (based on Stable Diffusion)\nPrompt: \"A big pile of papers, reports, documents, PDF \ufb01les, word documents, powerpoint slides, posters, articles and books forming a wave in the style of Hokusai\""}, {"page": 5, "text": "First 5 pages of text from PDF contributor/faculty: fi=School of\nFine-tuned\nWasaensia relation/numberinseries: 500"}, {"page": 6, "text": "Example of LLM extracted metadata\nDiff view: human vs. LLM generated"}, {"page": 7, "text": "What w

In [6]:
%%time

# submit the text to the LLM and display results

import pprint
pp = pprint.PrettyPrinter(indent=4)

SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous cataloguing of digital documents."
INSTRUCTION = "Extract metadata from this document. Return as JSON."
MAX_TOKENS = 1024  # upper bound on the length of the generated metadata record

message = f"{INSTRUCTION}\n\n{doc_json}"
response = llm_request(message, system_prompt=SYSTEM_PROMPT, max_tokens=MAX_TOKENS)

# llm_request returns None after printing the error when the server does not
# answer with HTTP 200; stop here with a clear message instead of letting
# json.loads fail with an unrelated-looking TypeError.
if response is None:
    raise RuntimeError("LLM request failed; see the error message printed above")

try:
    extracted_data = json.loads(response)
except json.JSONDecodeError:
    # e.g. the output was cut off at MAX_TOKENS; show what the model produced
    print("LLM did not return valid JSON:", response)
    raise

print()
pp.pprint(extracted_data)
print()


{   'creator': ['Suominen, Osma'],
    'language': 'eng',
    'publisher': ['University of Helsinki'],
    'title': 'Tekoälykahvit : Extracting metadata using large language models',
    'type_coar': 'research article',
    'year': '2023'}

CPU times: user 5.22 ms, sys: 0 ns, total: 5.22 ms
Wall time: 5.18 s
