In [7]:
# Environment setup: refresh the apt index, then install the Python packages
# and system tools this notebook relies on. `%pip` (not `!pip`) is used so the
# packages land in the kernel's own environment.
!sudo apt update
%pip install -q -U python-dotenv
%pip install -q -U pdf2image
# System package installed alongside pdf2image (presumably its PDF rendering backend).
!sudo apt install --yes poppler-utils
%pip install -q -U openai
%pip install -q -U pytesseract
# System package installed alongside pytesseract (presumably the OCR engine it wraps).
!sudo apt install --yes tesseract-ocr
%pip install -q -U Unidecode
# NOTE(review): unlike the lines above, this install is not run with -q.
%pip install -U huggingface_hub

Get:1 file:/var/cuda-repo-ubuntu2204-12-4-local  InRelease [1572 B]
Get:2 file:/var/cudnn-local-repo-ubuntu2204-9.5.1  InRelease [1572 B]
Get:1 file:/var/cuda-repo-ubuntu2204-12-4-local  InRelease [1572 B]
Get:2 file:/var/cudnn-local-repo-ubuntu2204-9.5.1  InRelease [1572 B]
Hit:3 https://deb.nodesource.com/node_20.x nodistro InRelease                  [0m[33m
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease                         [0m[33m
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease               [0m
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease                 [0m
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease               [0m[33m
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease   [33m
Hit:8 https://packagecloud.io/github/git-lfs/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
4 packages can be upgraded. Run 'apt li

In [2]:
import base64
import csv
import json
import os
import tempfile
import unicodedata
import urllib
from enum import Enum
from itertools import islice
from typing import List, Literal, Optional, Union
from urllib.parse import urlparse

import pytesseract
import requests
from dotenv import load_dotenv
from openai import OpenAI
from pdf2image import convert_from_path
from PIL import Image
from pydantic import BaseModel
from tqdm import tqdm

In [8]:
# Pull variables from a local .env file into the process environment first,
# so the lookups below can see them.
load_dotenv()

# All three secrets are mandatory: a missing one raises KeyError right here
# instead of failing later in the middle of a long-running cell.
OPENAI_API_KEY, WEIGHTS_AND_BIASES, HF_TOKEN = (
    os.environ[name]
    for name in ("OPENAI_API_KEY", "WEIGHTS_AND_BIASES", "HF_TOKEN")
)

In [3]:
# Root folder holding the scraped documents and the JSON bookkeeping files.
DATASET_PATH = "./documents"

# Sub-folders of the dataset root for rendered page images and raw PDFs.
IMAGE_PATH = os.path.join(DATASET_PATH, "german_img_files")
PDF_PATH = os.path.join(DATASET_PATH, "german_pdf_files")

In [4]:
def _load_json_mapping(file_name, label):
    """Load one JSON bookkeeping file from DATASET_PATH and report its size.

    Args:
        file_name: Name of the JSON file inside DATASET_PATH.
        label: Human-readable noun used in the progress message.

    Returns:
        The decoded JSON object, or an empty dict when the file does not
        exist yet (first run, before any later cell has written it).
    """
    path = os.path.join(DATASET_PATH, file_name)
    try:
        with open(path, "r", encoding="utf8") as json_file:
            mapping = json.load(json_file)
    except FileNotFoundError:
        # The later cells create these files; starting empty lets the
        # notebook bootstrap from a fresh checkout.
        print(f"{path} not found, starting with an empty mapping")
        mapping = {}
    print(f"Current number of {label}:", len(mapping))
    return mapping


# url -> local PDF path
mappings_pdf = _load_json_mapping("pdf_mapping.json", "PDFs")

# url -> local image path
mappings_img = _load_json_mapping("img_mapping.json", "images")

# url -> {"unstructured", "transcript", "metadata"}
extraction = _load_json_mapping("extraction.json", "metadata extractions")

Current number of PDFs: 4702
Current number of images: 4702
Current number of metadata extractions: 4702


# Download Scraped PDF Files


In [6]:
!wget https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/metadata/cc-provenance-20230303.csv.gz
!gunzip cc-provenance-20230303.csv.gz 

--2024-11-02 10:46:48--  https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/metadata/cc-provenance-20230303.csv.gz
Resolving digitalcorpora.s3.amazonaws.com (digitalcorpora.s3.amazonaws.com)... 52.218.220.59, 52.92.130.73, 52.92.129.49, ...
Connecting to digitalcorpora.s3.amazonaws.com (digitalcorpora.s3.amazonaws.com)|52.218.220.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1293543097 (1.2G) [text/csv]
Saving to: ‘cc-provenance-20230303.csv.gz’


2024-11-02 10:48:01 (17.0 MB/s) - ‘cc-provenance-20230303.csv.gz’ saved [1293543097/1293543097]



In [6]:
# Only the first MAX_ROWS data rows of the provenance CSV are inspected.
MAX_ROWS = 100000
# Zero-based index of the CSV column that is used as the document URL.
URL_COLUMN = 2

# Stream the file instead of loading it completely: the archive is >1 GB
# compressed and only a small prefix of it is needed.
with open("./cc-provenance-20230303.csv", newline="", encoding="utf8") as csv_file:
    header = csv_file.readline()
    # csv.reader honours quoting, so fields containing commas do not shift
    # the URL column the way a plain str.split(",") would.
    content = [
        row[URL_COLUMN]
        for row in islice(csv.reader(csv_file), MAX_ROWS)
        if len(row) > URL_COLUMN
    ]


# filter all german urls
german_urls = []
for url in tqdm(content):
    try:
        hostname = urlparse(url).hostname
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. broken IPv6 literals).
        continue
    if hostname is None:
        continue
    # Match the ".de" TLD itself; a bare "de" suffix would also accept
    # unrelated TLDs such as ".trade".
    if hostname.endswith(".de"):
        german_urls.append(url)

len(german_urls)

100%|██████████| 100000/100000 [00:00<00:00, 206565.59it/s]


7239

In [None]:
# Download every German PDF that is not in `mappings_pdf` yet and record
# url -> local path. The mapping is flushed to disk every 200 downloads and
# once more when the loop ends (or is interrupted), so no finished download
# is lost from the bookkeeping file.
pdf_mapping_file = os.path.join(DATASET_PATH, "pdf_mapping.json")
download_count = 0

try:
    for url in tqdm(german_urls):
        if url in mappings_pdf:
            print(f"Already downloaded {url}")
            continue

        decoded_str = urllib.parse.unquote(url)
        parsed_url = urlparse(decoded_str)

        # Percent-decoding can leave a URL without a usable host.
        if parsed_url.hostname is None:
            continue

        # One directory per host (dots replaced); the file name keeps the
        # dotted host as a prefix.
        host = parsed_url.hostname.replace(".", "_")
        file_name = parsed_url.hostname + "_" + os.path.basename(parsed_url.path)

        if not file_name.endswith(".pdf"):
            continue

        pdf_path = os.path.join(PDF_PATH, host, file_name)

        try:
            response = requests.get(url, timeout=10)
        except (requests.RequestException, ValueError):
            # Best effort: unreachable or malformed URLs are skipped. A bare
            # `except:` would also swallow KeyboardInterrupt and make the
            # loop impossible to stop.
            continue

        if response.status_code != 200:
            continue

        os.makedirs(os.path.dirname(pdf_path), exist_ok=True)

        with open(pdf_path, "wb") as file:
            file.write(response.content)
        mappings_pdf[url] = pdf_path

        download_count += 1

        if download_count % 200 == 0:
            with open(pdf_mapping_file, "w", encoding="utf8") as json_file:
                json.dump(mappings_pdf, json_file)
finally:
    # Persist the downloads made since the last periodic save.
    if download_count % 200 != 0:
        with open(pdf_mapping_file, "w", encoding="utf8") as json_file:
            json.dump(mappings_pdf, json_file)

# PDFs to Images


In [5]:
# Render the first page of every downloaded PDF to a JPEG stored next to the
# PDF and record url -> image path. The mapping is rewritten after each
# conversion so an interrupted run can resume where it stopped.
for url, pdf_path in tqdm(mappings_pdf.items()):
    if url in mappings_img:
        print(f"Already converted {url}")
        continue

    img_path = pdf_path.replace(".pdf", ".jpg")

    with tempfile.TemporaryDirectory() as path:
        try:
            image_data = convert_from_path(
                pdf_path, output_folder=path, fmt="jpeg", first_page=1, last_page=1
            )[0]
        except Exception as exc:
            # Best effort: broken, encrypted or empty PDFs are skipped.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            print(f"Skipping {pdf_path}: {exc}")
            continue

        # Save while the temporary directory still exists: with an explicit
        # output_folder the returned image may still be backed by a file in it.
        image_data.save(img_path)
        image_data.close()

    mappings_img[url] = img_path

    with open(os.path.join(DATASET_PATH, "img_mapping.json"), "w", encoding="utf8") as f:
        json.dump(mappings_img, f)


# Extract Metadata from Images


In [8]:
# OpenAI client shared by both requests (free-text analysis and structured
# parsing) in the metadata extraction loop.
client = OpenAI(api_key=OPENAI_API_KEY)

In [9]:
def encode_image(image_path):
    """Return the file at ``image_path`` as base64 text.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's bytes, decoded to a ``str`` so it
        can be embedded in a ``data:`` URL.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

In [10]:
# Contact details shared by every entity kind (person, government, company,
# organisation, group). Documented with `#` comments rather than docstrings so
# the schema derived from these models stays exactly as it is.
class BaseEntity(BaseModel):
    type: str  # narrowed to a fixed Literal tag in each subclass
    email: Optional[List[str]]
    phone: Optional[List[str]]
    fax: Optional[List[str]]
    address: Optional[str]
    website: Optional[str]


# An individual person; adds name, role and mobile numbers to the shared
# contact fields of BaseEntity.
class Person(BaseEntity):
    type: Literal["person"]  # fixed tag identifying this entity kind
    first_name: str  # the only required name part
    last_name: Optional[str]
    role: Optional[str]
    mobile: Optional[List[str]]


# A government body, identified by its department plus optional location
# fields (state, district, city).
class Government(BaseEntity):
    type: Literal["government"]  # fixed tag identifying this entity kind
    department: str
    state: Optional[str]
    district: Optional[str]
    city: Optional[str]


# A company with a required name and an optional sector.
class Company(BaseEntity):
    type: Literal["company"]  # fixed tag identifying this entity kind
    name: str
    sector: Optional[str]


# An organisation with a required name and an optional sector; same fields as
# Company, distinguished only by the tag.
class Organisation(BaseEntity):
    # Note the spelling: the tag is "organization" while the class is "Organisation".
    type: Literal["organization"]
    name: str
    sector: Optional[str]


# A named group; carries only a name on top of the shared contact fields.
class Group(BaseEntity):
    type: Literal["group"]  # fixed tag identifying this entity kind
    name: str


# Any of the concrete entity kinds; used for authors and recipients in
# DocumentMetadata.
Entity = Union[Person, Government, Company, Organisation, Group]


# Closed set of document categories (mostly German labels). Each member's
# value equals its name.
# NOTE(review): "nachrichten" and "angebot" are defined here but are not among
# the document types listed in the extraction prompt — confirm whether the
# prompt should mention them.
class ContentType(str, Enum):
    presentation = "presentation"
    mail = "mail"
    newsletter = "newsletter"
    werbung = "werbung"
    einladung = "einladung"
    bewerbung = "bewerbung"
    ankündigung = "ankündigung"
    rechnung = "rechnung"
    brief = "brief"
    ausschreibung = "ausschreibung"
    nachrichten = "nachrichten"
    antrag = "antrag"
    angebot = "angebot"
    urkunde = "urkunde"
    sonstiges = "sonstiges"  # catch-all ("other")


# Structured metadata for one document page. This model is passed as
# `response_format` to the structured-output request in the extraction loop.
class DocumentMetadata(BaseModel):
    title: str
    date: Optional[str]  # kept as free text; no date parsing is done here
    content_type: ContentType
    has_signature: bool
    main_author: Optional[Entity]
    other_authors: Optional[List[Entity]]
    logo_owners: Optional[List[str]]  # plain names, not Entity objects
    recipients: Optional[List[Entity]]

In [None]:
# Two-step metadata extraction for every page image that has no entry in
# `extraction` yet:
#   1. OCR the image and ask the model for a free-text analysis (German prompt).
#   2. Ask the model to cast that analysis into the DocumentMetadata schema.
# `extraction` is written to disk after each document, so an interrupted run
# resumes without repeating paid API calls.
for url in tqdm(list(mappings_img.keys())):

    if url in extraction:
        continue

    file_path = mappings_img[url]

    try:
        # The context manager closes the image file handle after OCR.
        with Image.open(file_path) as page_image:
            transcript = pytesseract.image_to_string(page_image)
    except Exception as exc:
        # Best effort: unreadable images are skipped. `Exception` (not a bare
        # except) keeps Ctrl-C working.
        print(f"Skipping {url}: OCR failed ({exc})")
        continue

    base64_image = encode_image(file_path)

    response = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant. Your task is to extract metadata from documents. Structure your answer as clear as possible and be very precise and detailed in your answer.",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""
    Analysiere das folgende Dokument und extrahiere die wichtigsten Metadaten. Beantworte dabei die folgenden Punkte:

    0. Beschreibung des Dokuments (Inhalt, Form/Format, Auffälliges/Markantes, )

    1. Dokumententyp (wähle aus: Präsentation, E-Mail, Urkunde, Newsletter, Werbung, Einladung, Bewerbung, Ankündigung, Rechnung, Brief, Ausschreibung, Antrag, Sonstiges)

    2. Titel des Dokuments

    3. Datum des Dokuments

    4. Ist ein Logo vorhanden und deutet es auf eine Autorschaft hin?
    - Welche Firma / Person / Organisation ist Logoinhaber?

    5. Ist eine Unterschrift vorhanden, wenn ja von wem?

    6. Welche Entitäten (Person, Organisation, Behörde, ...) werden im Dokument genannt? In welchem Kontext werden sie genannt? Welche Rolle nehmen die Entitäten ein?

    7. Welche Entitäten könnten Autoren des Dokuments sein und warum? Welche dieser Entitäten werden explizit genannt?

    8. Welche Entitäten könnten Empfänger / Zielgruppe des Dokuments sein und warum? Welche dieser Entitäten werden explizit genannt?

    9. Liste alle Autoren oder Herausgeber.
        - Gib jede Entität (Person, Organisation, Behörde, ...) separat an
        - Füge Kontaktinformationen hinzu, falls verfügbar
        - Nenne ggf. die Verhältnisse der Entitäten zueinander

    10. Liste alle Empfänger.
        - Gib jede Entität (Person, Organisation, Behörde, ...) separat an
        - Füge Kontaktinformationen hinzu, falls verfügbar
        - Nenne ggf. die Verhältnisse der Entitäten zueinander

    11. Weitere relevante Metadaten des Dokuments

    Hier ist der Mitschrieb des zu analysierenden Dokuments:

    {transcript}

    Bitte antworte auf Deutsch und strukturiere deine Antwort entsprechend der obigen Punkte.
                        """,
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            },
        ],
    )

    unstructured_answer = response.choices[0].message.content

    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {
                "role": "system",
                "content": "Your task is to correctly format the given information. Integrate as much information as possible!",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"{unstructured_answer} \n\n {transcript}",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            },
        ],
        response_format=DocumentMetadata,
    )

    res = completion.choices[0].message.parsed

    # `parsed` is None when the model returned no schema-conforming answer
    # (e.g. a refusal); skip the document instead of crashing the whole run.
    if res is None:
        print(f"Skipping {url}: no structured metadata returned")
        continue

    # Prefer pydantic v2's model_dump(); fall back to the older dict().
    res = res.model_dump() if hasattr(res, "model_dump") else res.dict()

    extraction[url] = {
        "unstructured": unstructured_answer,
        "transcript": transcript,
        "metadata": res,
    }

    with open(os.path.join(DATASET_PATH, "extraction.json"), "w", encoding="utf8") as f:
        json.dump(extraction, f)

# Publishing

#### Archive Folder on HF

In [9]:
# Authenticate against the Hugging Face Hub with the token read from the
# environment in the configuration cell.
from huggingface_hub import login
login(token=HF_TOKEN, new_session=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
from huggingface_hub import HfApi
from huggingface_hub.utils import disable_progress_bars

# Silence the per-file progress bars before starting the upload.
disable_progress_bars()

# Push the whole local "documents" folder to the "sodowo/documents" dataset
# repository on the Hub.
api = HfApi()
api.upload_large_folder(
    repo_id="sodowo/documents",
    repo_type="dataset",
    folder_path="documents",
)

### Create HF Dataset from Extraction

In [17]:
import shutil

# Local staging folder for the Hub dataset and the JSON-lines metadata file
# that is written into it.
HF_DATASET_PATH = "hf_dataset"
METADATA_PATH = os.path.join(HF_DATASET_PATH, "metadata.jsonl")

# Create the staging folder up front; harmless when it already exists.
os.makedirs(HF_DATASET_PATH, exist_ok=True)

In [18]:
# Stage the dataset: copy each extracted page image to "<index>.jpg" and write
# one JSON line per image with its metadata.
# The metadata file is opened once in "w" mode so re-running this cell
# rewrites it instead of appending a duplicate set of records.
with open(METADATA_PATH, "w", encoding="utf8") as metadata_file:
    # Wrapping the items (not the enumerate) lets tqdm know the total.
    for i, (url, data) in enumerate(tqdm(extraction.items())):
        # NOTE(review): the conversion cell derives image paths from the PDF
        # path; confirm that joining them onto IMAGE_PATH resolves for
        # mappings created by this notebook.
        file_path = os.path.join(IMAGE_PATH, mappings_img[url])
        file_name = f"{i}.jpg"

        shutil.copyfile(file_path, os.path.join(HF_DATASET_PATH, file_name))

        record = {
            "file_name": file_name,
            "additional_features": data["metadata"],
        }
        metadata_file.write(json.dumps(record, ensure_ascii=False) + "\n")

4702it [00:32, 142.78it/s] 


In [19]:
from datasets import load_dataset

In [20]:
# Load the staged folder with the "imagefolder" builder, reading from
# HF_DATASET_PATH where the images and metadata.jsonl were written.
dataset = load_dataset("imagefolder", data_dir=HF_DATASET_PATH)

Resolving data files:   0%|          | 0/4703 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/4703 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [21]:
# Spot-check: display the first record of the train split.
dataset["train"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1969x1386>,
 'additional_features': {'title': 'Entschuldigung',
  'date': None,
  'content_type': 'sonstiges',
  'has_signature': False,
  'main_author': {'type': 'company',
   'email': None,
   'phone': None,
   'fax': None,
   'address': None,
   'website': None,
   'name': 'OLYMPIA Schnittmusterverlag',
   'sector': None,
   'first_name': None,
   'last_name': None,
   'role': None,
   'mobile': None,
   'department': None,
   'state': None,
   'district': None,
   'city': None},
  'other_authors': None,
  'logo_owners': ['OLYMPIA Schnittmusterverlag'],
  'recipients': None}}

In [22]:
# Publish the loaded dataset to the Hub repository "sodowo/doc_meta".
dataset.push_to_hub("sodowo/doc_meta")

Map:   0%|          | 0/1176 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1176 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1175 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1175 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/2.92k [00:00<?, ?B/s]