In [1]:
!sudo apt update
%pip install -q -U python-dotenv
%pip install -q -U pdf2image
!sudo apt install --yes poppler-utils
%pip install -q -U openai
%pip install -q -U pytesseract
!sudo apt install --yes tesseract-ocr
%pip install -q -U Unidecode
%pip install -U huggingface_hub

Get:1 file:/var/cuda-repo-ubuntu2204-12-4-local  InRelease [1572 B]
Get:2 file:/var/cudnn-local-repo-ubuntu2204-9.5.1  InRelease [1572 B]    [0m
Get:1 file:/var/cuda-repo-ubuntu2204-12-4-local  InRelease [1572 B]            [0m[33m
Get:2 file:/var/cudnn-local-repo-ubuntu2204-9.5.1  InRelease [1572 B]          [0m[33m
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease                         [0m
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]        [0m
Get:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]      [0m[33m
Get:6 https://deb.nodesource.com/node_20.x nodistro InRelease [12.1 kB]        [0m[33m[33m
Get:7 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]      [0m[33m
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2667 kB]m[33m[33m
Get:9 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1451 kB]3m
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates

In [1]:
from dotenv import load_dotenv
from enum import Enum
from openai import OpenAI
from pdf2image import convert_from_path
from PIL import Image
from pydantic import BaseModel
from tqdm import tqdm
from typing import List, Literal, Optional, Union
from urllib.parse import urlparse
import json
import os
import pytesseract
import requests
import tempfile
import unicodedata
import urllib
import base64

In [2]:
load_dotenv()

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
WEIGHTS_AND_BIASES = os.environ["WEIGHTS_AND_BIASES"]
HF_TOKEN = os.environ["HF_TOKEN"]

In [3]:
DATASET_PATH = "./documents"
PDF_PATH = os.path.join(DATASET_PATH, "german_pdf_files")
IMAGE_PATH = os.path.join(DATASET_PATH, "german_img_files")

In [4]:
# mappings from url to file path (relative to PDF_PATH)
# example:
# pdf_url = "https://www.example.com/file.pdf"
# pdf_path = os.path.join(PDF_PATH, mappings_pdf[pdf_url])
with open(os.path.join(DATASET_PATH, "pdf_mapping.json"), "r", encoding="utf8") as json_file:
    mappings_pdf = json.load(json_file)
    print("Current number of PDFs:", len(mappings_pdf))

# contains urls that:
# - were not available to download
# - were downloaded but not a valid pdf
# - contained some other error
with open(os.path.join(DATASET_PATH, "skipped_urls.json"), "r", encoding="utf8") as f:
    skipped_urls = json.load(f)
    print("Current number of skipped URLs:", len(skipped_urls))

with open(os.path.join(DATASET_PATH, "author_extraction.json"), "r", encoding="utf8") as f:
    author_extraction = json.load(f)
    num_author_extractions = 0
    for url in author_extraction:
        if author_extraction[url]["name"] is not None:
            num_author_extractions += 1
    print("Current number of author extractions:", num_author_extractions)

with open(os.path.join(DATASET_PATH, "img_mapping.json"), "r", encoding="utf8") as json_file:
    mappings_img = json.load(json_file)
    print("Current number of images:", len(mappings_img))

with open(os.path.join(DATASET_PATH, "extraction.json"), "r", encoding="utf8") as json_file:
    extraction = json.load(json_file)
    print("Current number of metadata extractions:", len(extraction))

Current number of PDFs: 19902
Current number of skipped URLs: 25275
Current number of author extractions: 2015
Current number of images: 4702
Current number of metadata extractions: 4702


# Download Scraped PDF Files


In [5]:
!wget https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/metadata/cc-provenance-20230303.csv.gz
!gunzip cc-provenance-20230303.csv.gz 

--2024-11-04 20:19:30--  https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/metadata/cc-provenance-20230303.csv.gz
Resolving digitalcorpora.s3.amazonaws.com (digitalcorpora.s3.amazonaws.com)... 52.92.207.113, 52.218.182.51, 52.92.153.201, ...
Connecting to digitalcorpora.s3.amazonaws.com (digitalcorpora.s3.amazonaws.com)|52.92.207.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1293543097 (1.2G) [text/csv]
Saving to: ‘cc-provenance-20230303.csv.gz’


2024-11-04 20:21:05 (13.2 MB/s) - ‘cc-provenance-20230303.csv.gz’ saved [1293543097/1293543097]

gzip: cc-provenance-20230303.csv already exists; do you wish to overwrite (y or n)? ^C


In [5]:
lines = []
with open("./cc-provenance-20230303.csv") as csv_file:
    for line in csv_file:
        lines.append(line)
header = lines[0]
content = lines[1:]

content = [line.split(",")[2] for line in content]


# filter all german urls
german_urls = []
for url in tqdm(content):
    parsed_url = urlparse(url)
    hostname = parsed_url.hostname
    if hostname is None:
        continue
    if hostname.endswith("de"):
        german_urls.append(url)

len(german_urls)

100%|██████████| 8410703/8410703 [00:38<00:00, 220999.86it/s]


590954

In [7]:
# clears up disk space by deleting files that were deemed unusable
# skipped urls are avoided by not being downloaded in the future

def delete_skipped_urls():
    deletions = 0
    pbar = tqdm(skipped_urls)
    for url in pbar:

        # pdf was not downloaded
        if url not in mappings_pdf:
            continue

        pdf_path = os.path.join(PDF_PATH, mappings_pdf[url])
        if os.path.exists(pdf_path):
            #remove file
            os.remove(pdf_path)
            # remove folder if empty
            deletions += 1
            if not os.listdir(os.path.dirname(pdf_path)):
                os.rmdir(os.path.dirname(pdf_path))

        pbar.set_postfix({"deletions": deletions, "current": url})

delete_skipped_urls()

100%|██████████| 25275/25275 [00:00<00:00, 34409.64it/s, deletions=0, current=https://simress.de/sites/default/files/Progress%20Report%20of%20Two%20SimRess%20Group%20Modelling%20Workshops.pdf]                                                                                                                                                                                                                                                                                                                                              


In [9]:
errors = 0
download_count = 0
existing_urls = 0

pbar = tqdm(german_urls)

for url in pbar:

    if url in mappings_pdf:
        existing_urls += 1
        continue


    decoded_str = urllib.parse.unquote(url)
    parsed_url = urlparse(decoded_str)
    
    host = parsed_url.hostname.replace(".", "_")
    file_name = parsed_url.hostname + "_" + os.path.basename(parsed_url.path)

    pdf_path = os.path.join(PDF_PATH, host, file_name)

    if url in skipped_urls:
        continue


    if not file_name.endswith(".pdf"):
        errors += 1
        skipped_urls.append(url)
        continue

    try:
        response = requests.get(url, timeout=10)
    except:
        errors += 1
        skipped_urls.append(url)
        continue

    if response.status_code == 200:
        os.makedirs(os.path.dirname(pdf_path), exist_ok=True)

        with open(pdf_path, "wb") as file:
            file.write(response.content)
            mappings_pdf[url] = os.path.join(host, file_name)

        download_count += 1

        if download_count % 200 == 0:
            with open(os.path.join(DATASET_PATH, "pdf_mapping.json"), "w", encoding="utf8") as json_file:
                json.dump(mappings_pdf, json_file)

    else:
        errors += 1
        skipped_urls.append(url)

    with open(os.path.join(DATASET_PATH, "skipped_urls.json"), "w", encoding="utf8") as f:
        json.dump(skipped_urls, f)

    pbar.set_postfix({"downloaded": download_count, "errors": errors, "existing": existing_urls, "total": download_count + errors + existing_urls})

  7%|▋         | 44077/590954 [00:22<40:11, 226.80it/s, downloaded=21, errors=1, existing=2e+4, total=2e+4]  

  7%|▋         | 44077/590954 [00:25<40:11, 226.80it/s, downloaded=24, errors=2, existing=2e+4, total=2e+4]

# PDFs to Images


In [5]:
for url, pdf_path in tqdm(mappings_pdf.items()):
    if url in mappings_img:
        continue

    img_path = pdf_path.replace(".pdf", ".jpg")

    try:
        with tempfile.TemporaryDirectory() as path:
            image_data = convert_from_path(
                pdf_path, output_folder=path, fmt="jpeg", first_page=1, last_page=1
            )[0]
    except:
        continue

    image_data.save(img_path)
    mappings_img[url] = img_path
            
    with open(os.path.join(DATASET_PATH, "img_mapping.json"), "w", encoding="utf8") as f:
        json.dump(mappings_img, f)


# Extract Author using URL

In [9]:
def delete_special_chars(text):
    return text.replace(" ", "").replace("-", "").replace(".", "").replace(",", "").replace("!", "").replace("?", "").replace("(", "").replace(")", "").replace("'", "").replace('"', "").replace(";", "").replace(":", "")

In [10]:
import pytesseract
from PIL import Image
from pdf2image import convert_from_path

def get_text_tesseract(pdf_path):
    images = convert_from_path(pdf_path)
    return pytesseract.image_to_string(images[0])

In [11]:
import json
import os
import urllib.parse
from tqdm import tqdm


files = list(mappings_pdf.values())
urls = list(mappings_pdf.keys())

num_matches = 0
for res in author_extraction:
    if author_extraction[res]["name"] != None:
        num_matches += 1


skipped = 0

pbar = tqdm(enumerate(files), total=len(files))
for idx, file in pbar:
    file = os.path.join("documents", "german_pdf_files", file)
    url = urls[idx]

    if url in author_extraction:
        continue

    if url in skipped_urls:
        continue

    #get document size
    doc_size = os.path.getsize(file)

    # avoid excessivly large files
    if doc_size > 10000000:
        skipped += 1
        skipped_urls.append(url)
        continue

    hostname = urllib.parse.urlparse(url).netloc.replace("www.", "").replace(".de", "").replace(".", " ").replace("-", " ").replace("www", " ").strip()

    pbar.set_postfix({"matches": num_matches, "skipped": skipped, "hostname": hostname})

    try:
        text = get_text_tesseract(file)
    except:
        skipped += 1
        skipped_urls.append(url)
        continue

    lines_stripped = delete_special_chars(text)

    # remove the url from the text => we dont just want to match if the url is in the document but if the name in the url naturally occurs in the text
    # => leads to far fewer matches but more expressive samples
    lines_stripped = lines_stripped.replace(urllib.parse.urlparse(url).netloc, "")

    # verify that the hostname is in the document => document probably belongs to the company which owns the url
    if "".join(hostname.split(" ")) in lines_stripped:
        author_extraction[url] = {"name": hostname, "transcript": text}

        with open(os.path.join(DATASET_PATH, "author_extraction.json"), "w", encoding="utf8") as f:
            json.dump(author_extraction, f)

        # tail -f names.txt
        with open("names.txt", "a") as f:
            f.write(hostname + "\n")

        num_matches += 1
    else:
        author_extraction[url] = {"name": None, "transcript": text}

    # save skipped urls
    with open(os.path.join(DATASET_PATH, "skipped_urls.json"), "w", encoding="utf8") as f:
        json.dump(skipped_urls, f)


 54%|█████▍    | 10788/19902 [00:45<01:35, 95.29it/s, matches=2015, skipped=0, hostname=gems spiesen elversberg]  

# Extract Metadata from Images


In [6]:
client = OpenAI(api_key=OPENAI_API_KEY)

In [7]:
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

In [8]:
class BaseEntity(BaseModel):
    type: str
    email: Optional[List[str]]
    phone: Optional[List[str]]
    fax: Optional[List[str]]
    address: Optional[str]
    website: Optional[str]


class Person(BaseEntity):
    type: Literal["person"]
    first_name: str
    last_name: Optional[str]
    role: Optional[str]
    mobile: Optional[List[str]]


class Government(BaseEntity):
    type: Literal["government"]
    department: str
    state: Optional[str]
    district: Optional[str]
    city: Optional[str]


class Company(BaseEntity):
    type: Literal["company"]
    name: str
    sector: Optional[str]


class Organisation(BaseEntity):
    type: Literal["organization"]
    name: str
    sector: Optional[str]


class Group(BaseEntity):
    type: Literal["group"]
    name: str


Entity = Union[Person, Government, Company, Organisation, Group]


class ContentType(str, Enum):
    presentation = "presentation"
    mail = "mail"
    newsletter = "newsletter"
    werbung = "werbung"
    einladung = "einladung"
    bewerbung = "bewerbung"
    ankündigung = "ankündigung"
    rechnung = "rechnung"
    brief = "brief"
    ausschreibung = "ausschreibung"
    nachrichten = "nachrichten"
    antrag = "antrag"
    angebot = "angebot"
    urkunde = "urkunde"
    sonstiges = "sonstiges"


class DocumentMetadata(BaseModel):
    title: str
    date: Optional[str]
    content_type: ContentType
    has_signature: bool
    main_author: Optional[Entity]
    other_authors: Optional[List[Entity]]
    logo_owners: Optional[List[str]]
    recipients: Optional[List[Entity]]

In [13]:
for url in tqdm(list(mappings_img.keys())):

    if url in extraction:
        continue

    file_path = mappings_img[url]

    try:
        transcript = pytesseract.image_to_string(Image.open(file_path))
    except:
        continue

    base64_image = encode_image(file_path)

    response = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant. Your task is to extract metadata from documents. Structure your answer as clear as possible and be very precise and detailed in your answer.",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""
    Analysiere das folgende Dokument und extrahiere die wichtigsten Metadaten. Beantworte dabei die folgenden Punkte:

    0. Beschreibung des Dokuments (Inhalt, Form/Format, Auffälliges/Markantes, )

    1. Dokumententyp (wähle aus: Präsentation, E-Mail, Urkunde, Newsletter, Werbung, Einladung, Bewerbung, Ankündigung, Rechnung, Brief, Ausschreibung, Antrag, Sonstiges)

    2. Titel des Dokuments

    3. Datum des Dokuments

    4. Ist ein Logo vorhanden und deutet es auf eine Autorschaft hin?
    - Welche Firma / Person / Organisation ist Logoinhaber?

    5. Ist eine Unterschrift vorhanden, wenn ja von wem?

    6. Welche Entitäten (Person, Organisation, Behörde, ...) werden im Dokument genannt? In welchem Kontext werden sie genannt? Welche Rolle nehmen die Entitäten ein?

    7. Welche Entitäten könnten Autoren des Dokuments sein und warum? Welche dieser Entitäten werden explizit genannt?

    8. Welche Entitäten könnten Empfänger / Zielgruppe des Dokuments sein und warum? Welche dieser Entitäten werden explizit genannt?

    9. Liste alle Autoren oder Herausgeber.
        - Gib jede Entität (Person, Organisation, Behörde, ...) separat an
        - Füge Kontaktinformationen hinzu, falls verfügbar
        - Nenne ggf. die Verhältnisse der Entitäten zueinander

    10. Liste alle Empfänger.
        - Gib jede Entität (Person, Organisation, Behörde, ...) separat an
        - Füge Kontaktinformationen hinzu, falls verfügbar
        - Nenne ggf. die Verhältnisse der Entitäten zueinander

    11. Weitere relevante Metadaten des Dokuments

    Hier ist der Mitschrieb des zu analysierenden Dokuments:

    {transcript}

    Bitte antworte auf Deutsch und strukturiere deine Antwort entsprechend der obigen Punkte.
                        """,
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            },
        ],
    )

    unstructured_answer = response.choices[0].message.content

    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {
                "role": "system",
                "content": "Your task is to correctly format the given information. Integrate as much information as possible!",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"{unstructured_answer} \n\n {transcript}",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            },
        ],
        response_format=DocumentMetadata,
    )

    res = completion.choices[0].message.parsed

    res = res.dict()

    extraction[url] = {
        "unstructured": unstructured_answer,
        "transcript": transcript,
        "metadata": res,
    }

    with open(os.path.join(DATASET_PATH, "extraction.json"), "w", encoding="utf8") as f:
        json.dump(extraction, f)

# Publishing

#### Archive Folder on HF

In [9]:
from huggingface_hub import login
login(token=HF_TOKEN, new_session=True)

In [None]:
from huggingface_hub import HfApi
from huggingface_hub.utils import disable_progress_bars

api = HfApi()

disable_progress_bars()
api.upload_large_folder(
        folder_path="documents",
        repo_id="sodowo/documents",
        repo_type="dataset"
)

### Create HF Dataset from Extraction

In [13]:
import shutil
HF_DATASET_PATH = "hf_dataset"
os.makedirs(HF_DATASET_PATH, exist_ok=True)
METADATA_PATH = os.path.join(HF_DATASET_PATH, "metadata.jsonl")

In [14]:
for i, (url, data) in tqdm(enumerate(extraction.items())):
    file_path = os.path.join(IMAGE_PATH, mappings_img[url])
    
    #TODO: fix this
    file_path = file_path.replace("./documents/german_img_files/./documents/german_pdf_files", "./documents/german_pdf_files")

    shutil.copyfile(file_path, os.path.join(HF_DATASET_PATH, f"{i}.jpg"))

    with open(METADATA_PATH, "a", encoding="utf8") as f:
        data = {
            "file_name": f"{i}.jpg",
            "additional_features": data["metadata"],
        }
        f.write(json.dumps(data, ensure_ascii=False) + "\n")

In [15]:
from datasets import load_dataset

In [9]:
dataset = load_dataset("imagefolder", data_dir=HF_DATASET_PATH)

In [10]:
dataset["train"][0]

In [10]:
dataset.push_to_hub("sodowo/doc_meta")