In [None]:
!sudo apt update
%pip install -q -U python-dotenv
%pip install -q -U pdf2image
!sudo apt install --yes poppler-utils
%pip install -q -U openai
%pip install -q -U pytesseract
!sudo apt install --yes tesseract-ocr
%pip install -q -U Unidecode

In [1]:
from dotenv import load_dotenv
from enum import Enum
from openai import OpenAI
from pdf2image import convert_from_bytes, convert_from_path
from PIL import Image
from pydantic import BaseModel
from tqdm import tqdm
from typing import List, Literal, Optional, Union
from unidecode import unidecode
from urllib.parse import urlparse, urlunparse
import json
import os
import pytesseract
import re
import requests
import tempfile
import unicodedata
import urllib
import base64


In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Required credentials; a missing variable raises KeyError immediately.
OPENAI_API_KEY, WEIGHTS_AND_BIASES, HF_TOKEN = (
    os.environ[name] for name in ("OPENAI_API_KEY", "WEIGHTS_AND_BIASES", "HF_TOKEN")
)

In [3]:
# Root folder of the working dataset; PDFs and their page images live in sub-folders.
DATASET_PATH = "./documents"
PDF_PATH, IMAGE_PATH = (
    os.path.join(DATASET_PATH, sub_dir)
    for sub_dir in ("german_pdf_files", "german_img_files")
)

In [None]:
def _load_json(path, label):
    """Read a UTF-8 JSON file, print how many entries it holds, and return it.

    Args:
        path: Location of the JSON file.
        label: Human-readable name used in the printed summary line.

    Returns:
        The decoded JSON object.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    with open(path, "r", encoding="utf8") as json_file:
        loaded = json.load(json_file)
    print(f"Current number of {label}:", len(loaded))
    return loaded


# URL -> file name mappings and the results of earlier pipeline runs.
mappings_pdf = _load_json(os.path.join(PDF_PATH, "mapping.json"), "PDFs")
mappings_img = _load_json(os.path.join(IMAGE_PATH, "mapping.json"), "images")
extraction = _load_json(
    os.path.join(DATASET_PATH, "extraction.json"), "metadata extractions"
)
transcripts = _load_json(os.path.join(DATASET_PATH, "transcripts.json"), "transcripts")

# Exclude this document from the image mapping. pop() with a default keeps the
# cell re-runnable: `del` raised KeyError once the key was already gone.
mappings_img.pop("https://www.geo-iburg.de/Bergaufsicht.pdf", None)

# Download Scraped PDF Files


In [None]:
# Download the gzipped cc-provenance CSV from the digitalcorpora S3 bucket and
# decompress it into the working directory (shell escapes, run by IPython).
!wget https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/metadata/cc-provenance-20230303.csv.gz
!gunzip cc-provenance-20230303.csv.gz 

In [None]:
# Read the provenance CSV into memory; the first line is the header row.
with open("./cc-provenance-20230303.csv") as csv_file:
    lines = list(csv_file)
header = lines[0]
content = lines[1:]

# The third comma-separated column is taken as the URL.
# NOTE(review): a plain split breaks on quoted fields containing commas - confirm the file has none.
content = [line.split(",")[2] for line in content]


# Keep only URLs whose host is under the German ".de" top-level domain.
german_urls = []
for url in tqdm(content):
    try:
        parsed_url = urlparse(url)
        hostname = parsed_url.hostname
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. broken IPv6 literals).
        continue
    if hostname is None:
        continue
    # Match the TLD including its dot: endswith("de") also accepted hosts
    # such as "example.trade".
    if hostname.endswith(".de"):
        german_urls.append(url)

len(german_urls)

In [None]:
def _save_pdf_mapping():
    """Write the URL -> PDF file name mapping to ``mapping.json`` in PDF_PATH."""
    with open(os.path.join(PDF_PATH, "mapping.json"), "w") as json_file:
        json.dump(mappings_pdf, json_file)


download_count = 0

for url in tqdm(german_urls):
    if url in mappings_pdf:
        continue  # already downloaded in an earlier run

    # Build a local file name from the percent-decoded URL: "<host>_<basename>".
    # NOTE(review): two URLs with the same host and basename share one file name,
    # so the later download overwrites the earlier one.
    decoded_str = urllib.parse.unquote(url)
    try:
        parsed_url = urlparse(decoded_str)
        hostname = parsed_url.hostname
    except ValueError:
        continue
    if hostname is None:
        continue
    file_name = hostname + "_" + os.path.basename(parsed_url.path)
    if not file_name.endswith(".pdf"):
        continue

    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Network-level failure (timeout, DNS, TLS, ...): skip this URL.
        # A bare `except:` here also swallowed KeyboardInterrupt.
        continue

    if response.status_code != 200:
        continue

    with open(os.path.join(PDF_PATH, file_name), "wb") as file:
        file.write(response.content)
    mappings_pdf[url] = file_name
    download_count += 1

    # Checkpoint the mapping every 200 downloads
    if download_count % 200 == 0:
        _save_pdf_mapping()

# Final dump of mapping to JSON file
_save_pdf_mapping()

# PDFs to Images


In [None]:
# Render the first page of every not-yet-converted PDF as a JPEG.
for url, file_name in tqdm(mappings_pdf.items()):
    if url in mappings_img:
        continue
    pdf_path = unicodedata.normalize("NFC", os.path.join(PDF_PATH, file_name))
    # Swap only the extension: str.replace(".pdf", ".jpg") also rewrote any
    # ".pdf" earlier in the name (e.g. inside the host part).
    image_name = os.path.splitext(file_name)[0] + ".jpg"
    try:
        with tempfile.TemporaryDirectory() as path:
            image_data = convert_from_path(
                pdf_path, output_folder=path, fmt="jpeg", first_page=1, last_page=1
            )[0]
            # Save while the temporary folder still exists.
            # NOTE(review): presumably the page image is backed by a file there - confirm.
            image_data.save(os.path.join(IMAGE_PATH, image_name))
    except Exception:
        # Best effort: unreadable or empty PDFs are skipped, but Ctrl-C still works.
        continue

    mappings_img[url] = image_name
    # Persist after every image so an interrupted run loses no progress.
    with open(os.path.join(IMAGE_PATH, "mapping.json"), "w") as f:
        json.dump(mappings_img, f)

# Transcribe Images

In [None]:
def _save_transcripts():
    """Write the URL -> OCR transcript mapping to ``transcripts.json``."""
    with open(os.path.join(DATASET_PATH, "transcripts.json"), "w") as f:
        json.dump(transcripts, f)


# OCR every page image that has no transcript yet.
for url in tqdm(list(mappings_img.keys())):
    file_name = mappings_img[url]

    if url in transcripts:
        continue

    file_path = os.path.join(IMAGE_PATH, file_name)
    try:
        # The context manager closes the image file handle after OCR.
        with Image.open(file_path) as page_image:
            transcript = pytesseract.image_to_string(page_image)
    except Exception:
        # Best effort: skip images that cannot be opened or transcribed.
        continue
    transcripts[url] = transcript

    # Checkpoint whenever the total reaches a multiple of 100.
    if len(transcripts) % 100 == 0:
        _save_transcripts()

# Final dump: without it, everything after the last checkpoint was lost.
_save_transcripts()


# Extract Attribute Information from Images

In [65]:
# System prompt for the first (free-text) extraction pass: it asks the model to
# list attributes of the entities found in a transcript and to give a 1-10 score.
# The indentation inside the literal is part of the text sent to the model.
prompt = """You are an expert data analyst specializing in document analysis and information extraction.
            Your task is to carefully analyze a transcribed text from various types of documents (e.g., emails, letters, invoices, invitations)
            and extract all relevant attributes describing human or legal entities (Person, Organization, Institution, etc.) of the document.
                
            Here are EXAMPLES of attributes you should extract.
            This is not an exhaustive list, find more interesting attributes yourself.
            The attributes MUST directly describe the entity and must be directly extracted from the text.
            - Full Name (if available, separate first name and last name)
            - Complete Address (including street, city, state/province, postal code, country)
            - Phone Number(s) (specify if it's a landline, mobile, or fax)
            - Email Address
            - Any relevant identification numbers (e.g., Customer ID, Order Number, Invoice Number, License Key)
            - Company or Organization name (if applicable)
            - Job Title or Role (if mentioned)
            
            Remember, accuracy is crucial. If you're unsure about any piece of information, indicate your level of confidence or that the information is ambiguous.
            
            The main target of this analysis are documents that carry detailed information about entities e.g. invoices, business letters, etc.
            Give a score from 1 to 10 how good the document fits the quality criteria, 1 being very bad and 10 being very good.
            
            Please analyze the following transcribed text and provide your detailed extraction:
            """

In [62]:
# Schema for the structured attribute-extraction result. Documented with `#`
# comments rather than docstrings so the schema generated from the models stays
# exactly as declared.

# One key/value fact about an entity.
class Attribute(BaseModel):
    key: str
    value: str

# A named entity together with its extracted attributes.
# NOTE: the name `Entity` is rebound to a Union type in a later cell.
class Entity(BaseModel):
    name: str
    attributes: List[Attribute]

# Top-level result: all entities plus the integer score requested in `prompt`.
class Extraction(BaseModel):
    entities: List[Entity]
    score: int
    

In [None]:
from openai import OpenAI
# NOTE(review): the two llama_index names are not used in this cell; they are kept
# because a later cell calls help(FunctionCallingProgram) before importing it.
from llama_index.core.llms import ChatMessage
from llama_index.core.program import FunctionCallingProgram
from tqdm import tqdm

# Initialize the OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

# url -> structured extraction (plain dict form of `Extraction`)
attribute_list = {}

# System prompt for the second pass, which turns the free-text answer into the
# `Extraction` schema. Loop-invariant, so it is built once.
prompt_template_str = """You are an expert data analyst specializing in document analysis and information extraction.
    Your task is to carefully analyze a transcribed text from various types of documents (e.g., emails, letters, invoices, invitations)
    and extract all relevant attributes describing both the recipient and the sender of the document.
    """

# NOTE(review): the [11:12] slice limits the run to a single sample transcript.
for url in tqdm(list(transcripts.keys())[11:12]):
    transcript = transcripts[url]

    # Pass 1: free-text analysis of the transcript.
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": transcript}
    ]

    response = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=messages
    )

    chatgpt_response = response.choices[0].message.content

    # Pass 2: convert the free-text answer into the structured schema.
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {"role": "system", "content": prompt_template_str},
            {"role": "user", "content": chatgpt_response}
        ],
        response_format=Extraction,
    )

    output = completion.choices[0].message.parsed
    if output is None:
        # No parsed object came back (e.g. the model refused); skip instead of
        # failing with AttributeError on None.
        continue

    # model_dump() is the pydantic v2 replacement for the deprecated .dict().
    attribute_list[url] = output.model_dump()

# The file is rewritten from scratch with only the entries processed above.
with open(os.path.join(DATASET_PATH, "attributes.json"), "w") as f:
    json.dump(attribute_list, f)

In [None]:
# Show the extraction stored for the first processed URL.
url = list(attribute_list)[0]
attribute_list[url]


In [None]:
# Display the whole attribute_list dict as the cell output.
attribute_list

In [None]:
# Display the most recent `completion` object for inspection.
completion

In [None]:
%pip install --quiet --upgrade llama-index-core llama-index-readers-file llama-index-llms-ollama llama-index-embeddings-huggingface llama-index-llms-openai llama-index-embeddings-openai

In [18]:
from llama_index.llms.ollama import Ollama
# Alternative model that was tried:
#llm = Ollama(model="gemma2:27b")
# Ollama-backed LLM used by the llama_index experiment; request_timeout is set to 300.
llm = Ollama(model="llama3.1:70b", request_timeout=300)

In [None]:
# Scratch inspection cell. The original single line `help(program)chatgpt_response`
# was a SyntaxError; it is split into the two expressions it was made of.
# NOTE(review): `program` is only created in a later cell - run that cell first.
help(program)
chatgpt_response

In [8]:
from pydantic import RootModel, BaseModel




In [None]:
%pip install -U pydantic

In [None]:
# Print the built-in help text for FunctionCallingProgram.
help(FunctionCallingProgram)

In [None]:
#client = OpenAI(api_key=OPENAI_API_KEY)
from llama_index.core.llms import ChatMessage
from llama_index.core.program import FunctionCallingProgram
# Imported under an alias: importing it as `OpenAI` rebound the name of the
# official SDK client, so the later `client = OpenAI(api_key=...)` built the
# wrong kind of object.
from llama_index.llms.openai import OpenAI as LlamaIndexOpenAI

# Prompt template for the structured program; `{transcript}` is filled per call.
prompt_template_str = """You are an expert data analyst specializing in document analysis and information extraction.
    Your task is to carefully analyze a transcribed text from various types of documents (e.g., emails, letters, invoices, invitations)
    and extract all relevant attributes describing both the recipient and the sender of the document.
    Here is the transcribed text:
    {transcript}
    """

# Loop-invariant, so the program is built once instead of on every iteration.
program = FunctionCallingProgram.from_defaults(
    output_cls=Extraction,
    prompt_template_str=prompt_template_str,
    llm=llm
)

# NOTE(review): the [2001:2002] slice plus `break` limits this to one sample.
for url in tqdm(list(transcripts.keys())[2001:2002]):
    transcript = transcripts[url]

    messages = [ChatMessage(role="system", content=prompt), ChatMessage(role="user", content=transcript)]

    # Free-text answer of the local model.
    # NOTE(review): `response` is not fed into `program` below - confirm whether it should be.
    response = llm.chat(messages)

    #print(response)

    output = program(transcript=transcript)

    print(output)

    break


# Extract Metadata from Images


In [8]:
# Bind `OpenAI` to the official SDK class right here: an earlier cell imports a
# different class under the same name from llama_index.llms.openai, and the
# extraction loop below needs `client.chat.completions` of the SDK client.
from openai import OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)

In [9]:
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")

In [10]:
# Entity schema for the structured metadata extraction. Every Optional field has
# no default, so it must be present in the data but may be null.

# Contact fields shared by all entity kinds; `type` is narrowed by each subclass.
class BaseEntity(BaseModel):
    type: str
    email: Optional[List[str]]
    phone: Optional[List[str]]
    fax: Optional[List[str]]
    address: Optional[str]
    website: Optional[str]


# A natural person; only first_name is a non-nullable string.
class Person(BaseEntity):
    type: Literal["person"]
    first_name: str
    last_name: Optional[str]
    role: Optional[str]
    mobile: Optional[List[str]]


# A government body, identified by its department.
class Government(BaseEntity):
    type: Literal["government"]
    department: str
    state: Optional[str]
    district: Optional[str]
    city: Optional[str]


# A company with an optional sector.
class Company(BaseEntity):
    type: Literal["company"]
    name: str
    sector: Optional[str]


# An organisation; note the discriminator value is spelled "organization".
class Organisation(BaseEntity):
    type: Literal["organization"]
    name: str
    sector: Optional[str]


# A named group without further specific fields.
class Group(BaseEntity):
    type: Literal["group"]
    name: str


# Any of the entity kinds above. This rebinds the name `Entity`, which an
# earlier cell uses for a pydantic model class.
Entity = Union[Person, Government, Company, Organisation, Group]


# Closed set of document categories; member values are the (mostly German)
# strings stored in the extracted metadata. "sonstiges" is the catch-all.
class ContentType(str, Enum):
    presentation = "presentation"
    mail = "mail"
    newsletter = "newsletter"
    werbung = "werbung"
    einladung = "einladung"
    bewerbung = "bewerbung"
    ankündigung = "ankündigung"
    rechnung = "rechnung"
    brief = "brief"
    ausschreibung = "ausschreibung"
    nachrichten = "nachrichten"
    antrag = "antrag"
    angebot = "angebot"
    urkunde = "urkunde"
    sonstiges = "sonstiges"


# Structured metadata for one document; used as `response_format` in the
# extraction loop. Optional fields have no default: required but nullable.
class DocumentMetadata(BaseModel):
    title: str
    date: Optional[str]  # kept as free text, not parsed into a date type
    content_type: ContentType
    has_signature: bool
    main_author: Optional[Entity]
    other_authors: Optional[List[Entity]]
    logo_owners: Optional[List[str]]
    recipients: Optional[List[Entity]]

In [None]:
# Two-pass metadata extraction for every image that has no entry in `extraction`:
# pass 1 produces a free-text German analysis from transcript + image, pass 2
# formats that analysis into the DocumentMetadata schema.
for url in tqdm(list(mappings_img.keys())):

    if url in extraction:
        continue  # already processed in an earlier run

    file_name = mappings_img[url]

    file_path = os.path.join(IMAGE_PATH, file_name)

    try:
        # The context manager closes the image file handle after OCR.
        with Image.open(file_path) as page_image:
            transcript = pytesseract.image_to_string(page_image)
    except Exception:
        # Best effort: skip images that cannot be opened or transcribed.
        # (A bare `except:` here also swallowed KeyboardInterrupt.)
        continue

    base64_image = encode_image(file_path)

    response = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant. Your task is to extract metadata from documents. Structure your answer as clear as possible and be very precise and detailed in your answer.",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""
    Analysiere das folgende Dokument und extrahiere die wichtigsten Metadaten. Beantworte dabei die folgenden Punkte:

    0. Beschreibung des Dokuments (Inhalt, Form/Format, Auffälliges/Markantes, )

    1. Dokumententyp (wähle aus: Präsentation, E-Mail, Urkunde, Newsletter, Werbung, Einladung, Bewerbung, Ankündigung, Rechnung, Brief, Ausschreibung, Antrag, Sonstiges)

    2. Titel des Dokuments

    3. Datum des Dokuments

    4. Ist ein Logo vorhanden und deutet es auf eine Autorschaft hin?
    - Welche Firma / Person / Organisation ist Logoinhaber?

    5. Ist eine Unterschrift vorhanden, wenn ja von wem?

    6. Welche Entitäten (Person, Organisation, Behörde, ...) werden im Dokument genannt? In welchem Kontext werden sie genannt? Welche Rolle nehmen die Entitäten ein?

    7. Welche Entitäten könnten Autoren des Dokuments sein und warum? Welche dieser Entitäten werden explizit genannt?

    8. Welche Entitäten könnten Empfänger / Zielgruppe des Dokuments sein und warum? Welche dieser Entitäten werden explizit genannt?

    9. Liste alle Autoren oder Herausgeber.
        - Gib jede Entität (Person, Organisation, Behörde, ...) separat an
        - Füge Kontaktinformationen hinzu, falls verfügbar
        - Nenne ggf. die Verhältnisse der Entitäten zueinander

    10. Liste alle Empfänger.
        - Gib jede Entität (Person, Organisation, Behörde, ...) separat an
        - Füge Kontaktinformationen hinzu, falls verfügbar
        - Nenne ggf. die Verhältnisse der Entitäten zueinander

    11. Weitere relevante Metadaten des Dokuments

    Hier ist der Mitschrieb des zu analysierenden Dokuments:

    {transcript}

    Bitte antworte auf Deutsch und strukturiere deine Antwort entsprechend der obigen Punkte.
                        """,
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            },
        ],
    )

    unstructured_answer = response.choices[0].message.content

    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {
                "role": "system",
                "content": "Your task is to correctly format the given information. Integrate as much information as possible!",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"{unstructured_answer} \n\n {transcript}",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            },
        ],
        response_format=DocumentMetadata,
    )

    res = completion.choices[0].message.parsed
    if res is None:
        # No parsed object came back (e.g. the model refused). Skip the URL; it
        # stays absent from `extraction` and is retried on the next run.
        continue

    # mode="json" stores plain JSON types (the enum as its string value), so the
    # in-memory entry is identical to what is read back from extraction.json.
    # The deprecated .dict() kept Enum members in memory.
    res = res.model_dump(mode="json")

    extraction[url] = {
        "unstructured": unstructured_answer,
        "transcript": transcript,
        "metadata": res,
    }

    # Persist after every document so paid API results are never lost.
    with open(os.path.join(DATASET_PATH, "extraction.json"), "w", encoding="utf8") as f:
        json.dump(extraction, f)

# Experiments

### Create Dataset from Extraction

In [7]:
import shutil

# Export folder for the image dataset and its JSON-lines metadata file.
HF_DATASET_PATH = os.path.join(DATASET_PATH, "hf_dataset")
METADATA_PATH = os.path.join(HF_DATASET_PATH, "metadata.jsonl")

# Create the export folder if it is not there yet.
os.makedirs(HF_DATASET_PATH, exist_ok=True)

In [None]:
# Copy every extracted document image to "<index>.jpg" and write one metadata
# row per image. The file is opened once in "w" mode: the images are renumbered
# from 0 on every run, and appending duplicated all rows on each re-run.
with open(METADATA_PATH, "w", encoding="utf8") as metadata_file:
    for i, (url, data) in tqdm(enumerate(extraction.items()), total=len(extraction)):
        file_name = mappings_img[url]
        file_path = unicodedata.normalize("NFC", os.path.join(IMAGE_PATH, file_name))

        shutil.copyfile(file_path, os.path.join(HF_DATASET_PATH, f"{i}.jpg"))

        # Separate name so the loop variable `data` is not shadowed.
        record = {
            "file_name": f"{i}.jpg",
            "additional_features": data["metadata"],
        }
        metadata_file.write(json.dumps(record, ensure_ascii=False) + "\n")



In [None]:
from datasets import load_dataset
from huggingface_hub import login
# Authenticate against the Hugging Face Hub with the token read from the environment.
login(token=HF_TOKEN, new_session=True)

In [None]:
# Load the exported folder with the "imagefolder" dataset builder.
dataset = load_dataset("imagefolder", data_dir=HF_DATASET_PATH)

In [None]:
# Show the first example of the "train" split.
dataset["train"][0]

In [None]:
# Upload the dataset to the Hub repository "sodowo/doc_meta".
dataset.push_to_hub("sodowo/doc_meta")

### Create Donut Dataset

In [6]:
# Tag tokens discovered while serialising; filled as a side effect of json2token.
new_special_tokens = []
# Marker placed in front of every target sequence.
task_start_token = "<s>"
# End-of-sequence marker of the tokenizer.
eos_token = "</s>"


def json2token(
    obj, update_special_tokens_for_json_key: bool = True, sort_json_key: bool = True
):
    """
    Flatten a JSON-like object into a tagged token string.

    Dicts become ``<s_key>...</s_key>`` runs (keys in reverse-sorted order when
    ``sort_json_key`` is true, insertion order otherwise), lists are joined with
    ``<sep/>``, and every other value is rendered with ``str()`` - so ``None``
    becomes ``"None"``. A dict holding only ``"text_sequence"`` yields that value
    unchanged. With ``update_special_tokens_for_json_key`` the opening and closing
    tag of each key are appended to ``new_special_tokens`` if not already there.
    """
    kind = type(obj)

    if kind is list:
        return r"<sep/>".join(
            json2token(element, update_special_tokens_for_json_key, sort_json_key)
            for element in obj
        )

    if kind is not dict:
        text = str(obj)
        categorical = f"<{text}/>"
        # A leaf is replaced by its categorical token only if that token is registered.
        return categorical if categorical in new_special_tokens else text

    if len(obj) == 1 and "text_sequence" in obj:
        return obj["text_sequence"]

    ordered_keys = sorted(obj.keys(), reverse=True) if sort_json_key else obj.keys()

    output = ""
    for key in ordered_keys:
        open_tag = rf"<s_{key}>"
        close_tag = rf"</s_{key}>"
        if update_special_tokens_for_json_key:
            for tag in (open_tag, close_tag):
                if tag not in new_special_tokens:
                    new_special_tokens.append(tag)
        inner = json2token(obj[key], update_special_tokens_for_json_key, sort_json_key)
        output += open_tag + inner + close_tag
    return output

In [7]:
def filter_metadata(meta):
    """Tell whether a metadata record should be dropped from the dataset.

    Args:
        meta: Mapping with the keys ``main_author``, ``recipients``, ``date``,
            ``logo_owners`` and ``other_authors``.

    Returns:
        ``True`` if any of those fields is ``None``, otherwise ``False``
        (previously an implicit ``None``, which is equally falsy for callers).

    Raises:
        KeyError: If a required key is missing before a ``None`` field is found.
    """
    required_fields = (
        "main_author",
        "recipients",
        "date",
        "logo_owners",
        "other_authors",
    )
    # Identity check instead of `== None`; any() stops at the first missing field.
    return any(meta[field] is None for field in required_fields)

In [None]:
from transformers import DonutProcessor
import torchvision as tv
import torch
import torchvision.transforms as transforms
import gc
from datasets import Dataset
import os
import json

task_start_token = "<s>"
eos_token = "</s>"
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
processor.tokenizer.add_special_tokens(
    {"additional_special_tokens": [task_start_token, eos_token]}
)
# NOTE(review): the <s_key>/</s_key> tags that json2token collects in
# new_special_tokens are never added to the tokenizer here - confirm whether
# they should be registered before tokenizing.

# Reduce the image size
# NOTE(review): transforms.Resize reads a 2-element size as (height, width) -
# confirm that 1080x1920 is the intended orientation for document pages.
new_size = [1080, 1920]  # Half of the original size
processor.feature_extractor.size = new_size

# Define image transformations
transform = transforms.Compose([
    transforms.Resize(new_size),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create output directory
HF_DATASET_PATH_DONUT = os.path.join(DATASET_PATH, "hf_dataset_donut")
os.makedirs(HF_DATASET_PATH_DONUT, exist_ok=True)

dataset_info = []

# Write one JSON file per kept document with its pixel values and target token ids.
for i, (url, data) in tqdm(enumerate(extraction.items()), total=len(extraction)):
    item_path = os.path.join(HF_DATASET_PATH_DONUT, f"{i}.json")
    #if os.path.exists(item_path):
    #    continue

    file_name = mappings_img[url]
    file_path = unicodedata.normalize("NFC", os.path.join(IMAGE_PATH, file_name))

    metadata = data["metadata"]

    # Drop records with any missing (None) core field.
    if filter_metadata(metadata):
        continue

    metadata = task_start_token + json2token(metadata) + eos_token

    input_ids_metadata = (
        processor.tokenizer(
            metadata,
            add_special_tokens=False,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=1000,
        )["input_ids"]
        .squeeze(0)
    )

    # Padding positions are set to -100 in the target.
    metadata_target = input_ids_metadata.clone()
    metadata_target[metadata_target == processor.tokenizer.pad_token_id] = -100

    # Load and transform the image
    image = tv.io.read_image(file_path)
    image = transforms.ToPILImage()(image)
    pixel_values = transform(image)


    # TODO: Use tensordict?

    with open(item_path, "w") as f:
        json.dump({
            "pixel_values": pixel_values.to(torch.float16).tolist(),
            # Token ids are written at full integer width: casting to int16
            # wraps every id above 32767 and corrupts the targets, and JSON
            # integers gain nothing from a narrower dtype.
            "metadata": metadata_target.tolist()
        }, f)


In [None]:
from datasets import load_dataset
# Same folder the Donut export cell writes its per-item JSON files to.
HF_DATASET_PATH_DONUT = os.path.join(DATASET_PATH, "hf_dataset_donut")

# Load the exported files and have examples returned as torch tensors.
dataset = load_dataset(HF_DATASET_PATH_DONUT)
dataset = dataset.with_format("torch")
# Show the first example of the "train" split.
dataset["train"][0]

In [None]:
#import numpy as np
#from datasets import Dataset, load_from_disk
#from torch.utils.data import DataLoader
#data = np.random.rand(10_000)
#Dataset.from_dict({"data": data}).save_to_disk("my_dataset")
#ds = load_from_disk("my_dataset").with_format("torch")
#dataloader = DataLoader(ds, batch_size=32, num_workers=4)

In [None]:
# Upload the Donut dataset loaded above. The original used `ds`, a name that is
# only assigned inside commented-out code, so the cell raised NameError.
dataset.push_to_hub("sodowo/doc_meta_donut")

In [None]:
# Load the dataset by its Hub repository id "sodowo/doc_meta_donut".
dataset = load_dataset("sodowo/doc_meta_donut")


In [None]:
# Return examples as torch tensors and show the first "train" example.
dataset = dataset.with_format("torch")
dataset["train"][0]

# Examples

In [None]:
# Imported under an alias: `from IPython.display import Image` rebound the name
# `Image`, which the OCR cells use as PIL.Image (`Image.open`), so re-running
# them after this cell failed.
from IPython.display import Image as IPyImage

# Print the structured metadata of the first extracted document ...
url = list(extraction.keys())[0]
print("url:", url)
print("\n")
print("EXTRACTION:")
res = extraction[url]["metadata"]
print(json.dumps(res, indent=4, ensure_ascii=False))

# ... and render its page image as the cell output.
file_name = mappings_img[url]
file_path = os.path.join(IMAGE_PATH, file_name)
IPyImage(filename=file_path)