## This Notebook will create the database for the temporary project washing machine manuals

The intention of this notebook is to create a clean and best practice database structure, along with utilizing snowflakes AI functions.

The intended database structure is as follows: 

- **documents** (Stores metadata about each manual)  
  - `document_id` (Unique ID for each manual)  
  - `doc_name` (Document name)
  - `version` (Version or revision number)  
  - `relative_path` (Original PDF file path or S3 URL) 
  - `stage_name`  (snowflake stage name (source))
  - `size`  (size in bytes of the PDF document) 

- **sections** (Defines logical sections and subsections within each manual)  
  - `section_id` (Unique ID for the section)  
  - `manual_id` (Foreign key referencing `manuals`)  
  - `title` (Title or heading of the section)  
  - `order_num` (Numerical order of the section in the manual)  
  - `parent_section_id` (Optional FK for nested subsections)  

- **chunks small** (1024 characters, 64 overlap)
  - `chunk_id` (Unique ID for the chunk)  
  - `section_id` (Foreign key referencing `sections`)  
  - `chunk_text` (The text content of the chunk)  
  - `chunk_order` (Order of the chunk within the section)  
  - `embedding` (Vector for semantic search or embeddings)  

- **chunks large** (4096 characters, overlap 256)
  - `chunk_id` (Unique ID for the chunk)  
  - `section_id` (Foreign key referencing `sections`)  
  - `chunk_text` (The text content of the chunk)  
  - `chunk_order` (Order of the chunk within the section)  
  - `embedding` (Vector for semantic search or embeddings)  

- **images** (Stores references to images extracted from the manual)  
  - `image_id` (Unique ID for the image)  
  - `manual_id` (Foreign key referencing `manuals`)  
  - `page_number` 
  - `section_id` (Foreign key referencing `sections`)  
  - `order_num` (Display order within the section)  
  - `image_path` (S3 or web-accessible path to the image)  
  - `description`   



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keyring
import os 
import snowflake.connector as sf_connector # ( https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-connect)
from snowflake.connector.pandas_tools import write_pandas # (https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#write_pandas)
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.evaluation import load_evaluator
from collections import defaultdict

import numpy as np
from tqdm import tqdm
import time
import re
import json

from io import BytesIO
import fitz 
from shapely.geometry import box
from shapely.ops import unary_union
from PIL import Image, ImageDraw
import cv2

# Set max rows to display in pandas DataFrame 200
pd.set_option('display.max_rows', 200)

In [2]:
account_identifier = keyring.get_password('NC_Snowflake_Trial_Account_Name', 'account_identifier')
user_name = "EMHALDEMO1"
password = keyring.get_password('NC_Snowflake_Trial_User_Password', user_name)
database = "WASHING_MACHINE_MANUALS"
schema = "PUBLIC"

print("Account Identifier: ", account_identifier)
print("User Name: ", user_name)
print("Database: ", database)
print("Schema: ", schema)

try:
    connection_parameters = {
        "account_identifier": account_identifier,
        "user": user_name,
        "password": password,
        "role": "ACCOUNTADMIN",
        "warehouse": "COMPUTE_WH",
        "database": database,
        "schema": schema
    }
except:
        connection_parameters = {
        "account_identifier": account_identifier,
        "user": user_name,
        "password": password,
        "role": "ACCOUNTADMIN",
        "warehouse": "COMPUTE_WH",
        "database": "SNOWFLAKE",
        "schema": "CORTEX"
    }


# Connect to Snowflake
conn = sf_connector.connect(
    user=connection_parameters['user'],
    password=connection_parameters['password'],
    account=connection_parameters['account_identifier'],
    warehouse=connection_parameters['warehouse'],
    database=connection_parameters['database'],
    schema=connection_parameters['schema'],
    role=connection_parameters['role']
)

cursor = conn.cursor()
cursor.execute(f" CREATE DATABASE IF NOT EXISTS {database}; ")
cursor.execute(f" CREATE SCHEMA IF NOT EXISTS {database}.{schema}; ")
cursor.execute(f" USE DATABASE {database}; ")
cursor.execute(f" USE SCHEMA {schema}; ")


Account Identifier:  EPTJRCA-HWB83214
User Name:  EMHALDEMO1
Database:  WASHING_MACHINE_MANUALS
Schema:  PUBLIC


<snowflake.connector.cursor.SnowflakeCursor at 0x1f2b5275190>

## Create a stage for the PDF files with the code below
#### DO NOT RUN - unless you don't have the documents in the stage.

In [None]:
# Creating stage to dump PDF documents into
# cursor.execute(" create or replace stage docs ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE') DIRECTORY = ( ENABLE = true ); ")

<snowflake.connector.cursor.SnowflakeCursor at 0x1943df39760>

## Creating documents table

In [10]:
cursor.execute("""
    CREATE OR REPLACE TABLE DOCUMENTS (
    DOCUMENT_ID INT AUTOINCREMENT PRIMARY KEY,
    DOCUMENT_NAME STRING,
    DOC_VERSION STRING,
    FILE_PATH STRING NOT NULL,
    FILE_SIZE NUMBER,
    CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP()
);
""")

<snowflake.connector.cursor.SnowflakeCursor at 0x269b7eb5910>

# The section below focuses on creating chunks_large and chunks_small.

Different size chunks are good at different things - it could be a good idea to store both size, especially during testing

In [3]:
pdf_files_path = ".\\Washer_Manuals"
document_rows = []

for idx, filename in enumerate(os.listdir(pdf_files_path)):
    # Temporary filter to only process a set of PDF files
    if filename not in ["WGG254Z0GB.pdf", "WGA1420SIN.pdf","WAV28KH3GB.pdf"]:
        continue

    if filename.endswith(".pdf"):
        file_path = os.path.join(pdf_files_path, filename)
        print(f"Document number: {idx}  : {file_path}")
        file_size = os.path.getsize(file_path)
        
        document_rows.append({
            "DOCUMENT_NAME": filename,
            "FILE_PATH": file_path,
            "DOC_VERSION": "N/A",  # Placeholder, you can modify this logic as needed
            "FILE_SIZE": file_size
        })

documents_df = pd.DataFrame(document_rows)
print(documents_df.head())

Document number: 8  : .\Washer_Manuals\WAV28KH3GB.pdf
Document number: 11  : .\Washer_Manuals\WGA1420SIN.pdf
Document number: 13  : .\Washer_Manuals\WGG254Z0GB.pdf
    DOCUMENT_NAME                        FILE_PATH DOC_VERSION  FILE_SIZE
0  WAV28KH3GB.pdf  .\Washer_Manuals\WAV28KH3GB.pdf         N/A    5686613
1  WGA1420SIN.pdf  .\Washer_Manuals\WGA1420SIN.pdf         N/A    3247850
2  WGG254Z0GB.pdf  .\Washer_Manuals\WGG254Z0GB.pdf         N/A    3291555


In [4]:
success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df=documents_df,
    database =database,
    table_name="DOCUMENTS",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)

# Lets see the table
cursor.execute("""
    SELECT * 
    FROM DOCUMENTS;
""")

documents_df = cursor.fetch_pandas_all()
documents_df.head()

Unnamed: 0,DOCUMENT_ID,DOCUMENT_NAME,DOC_VERSION,FILE_PATH,FILE_SIZE,CREATED_AT
0,1,WAV28KH3GB.pdf,,.\Washer_Manuals\WAV28KH3GB.pdf,5686613,2025-04-22 01:52:03.390000-07:00
1,2,WGA1420SIN.pdf,,.\Washer_Manuals\WGA1420SIN.pdf,3247850,2025-04-22 01:52:03.390000-07:00
2,3,WGG254Z0GB.pdf,,.\Washer_Manuals\WGG254Z0GB.pdf,3291555,2025-04-22 01:52:03.390000-07:00
3,101,WAV28KH3GB.pdf,,.\Washer_Manuals\WAV28KH3GB.pdf,5686613,2025-04-25 04:44:31.675000-07:00
4,102,WGA1420SIN.pdf,,.\Washer_Manuals\WGA1420SIN.pdf,3247850,2025-04-25 04:44:31.675000-07:00


## Creating chunks tables with vector embeddings

To include page numbers, i decided to create the tables using pandas, and then uploading them to snowflake

Followed by that will be a query to crete a vector embeddings.

In [5]:
## Extracting section headers from the PDF files

def extract_text_chunks(file_path, manual_id, chunk_size=512, chunk_overlap=128):
    loader = PDFPlumberLoader(file_path)
    docs = loader.load()

    # Step 1: Combine all text across pages with page tracking
    all_text = ""
    page_map = []  # (char_index, page_number)

    for doc_page in docs:
        text = doc_page.page_content.strip().replace('\n', ' ')
        start_idx = len(all_text)
        all_text += text + " "  # Add space to separate pages
        end_idx = len(all_text)
        page_map.append((start_idx, end_idx, doc_page.metadata['page']))

    # Step 2: Create chunks with overlap, spanning across pages
    chunks = []
    chunk_order = []
    page_start_list = []
    page_end_list = []

    idx = 0
    chunk_idx = 0

    while idx < len(all_text):
        chunk = all_text[idx:idx + chunk_size]

        # Determine pages involved in this chunk
        chunk_start = idx
        chunk_end = idx + len(chunk)

        pages_in_chunk = [
            page_num
            for start, end, page_num in page_map
            if not (end <= chunk_start or start >= chunk_end)  # overlap condition
        ]

        page_start = min(pages_in_chunk) if pages_in_chunk else None
        page_end = max(pages_in_chunk) if pages_in_chunk else None

        chunks.append(chunk)
        page_start_list.append(page_start)
        page_end_list.append(page_end)
        chunk_order.append(chunk_idx)

        chunk_idx += 1
        idx += chunk_size - chunk_overlap

    # Step 3: Create DataFrame
    rows = [{
        'DOCUMENT_ID': manual_id,
        'PAGE_START_NUMBER': start,
        'PAGE_END_NUMBER': end,
        'CHUNK_TEXT': chunk,
        'CHUNK_ORDER': order
    } for chunk, start, end, order in zip(chunks, page_start_list, page_end_list, chunk_order)]

    df = pd.DataFrame(rows, columns=["DOCUMENT_ID", "PAGE_START_NUMBER", "PAGE_END_NUMBER", "CHUNK_TEXT", "CHUNK_ORDER"])
    return df


large_chunks_df = pd.DataFrame()
for row in tqdm(documents_df.iterrows(), total = len(documents_df)):
    manual_id = row[1]["DOCUMENT_ID"]
    file_path = os.path.join(pdf_files_path, row[1]["DOCUMENT_NAME"])

    tmp_chunked_df = extract_text_chunks(file_path = file_path, 
                        manual_id = manual_id,
                        chunk_size = 6000,#1024,
                        chunk_overlap = 128)  # Show first 5 chunks
    large_chunks_df = pd.concat([large_chunks_df, tmp_chunked_df], ignore_index=True)

large_chunks_df

  0%|          | 0/6 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [14]:
create_table_sql = """
CREATE OR REPLACE TABLE CHUNKS_LARGE (
    CHUNK_ID INT AUTOINCREMENT PRIMARY KEY,
    DOCUMENT_ID INT NOT NULL,
    PAGE_START_NUMBER INT,
    PAGE_END_NUMBER INT,
    CHUNK_ORDER INT,
    CHUNK_TEXT STRING NOT NULL,
    EMBEDDING VECTOR(FLOAT, 1024),
    CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),
    CONSTRAINT fk_document
        FOREIGN KEY (DOCUMENT_ID)
        REFERENCES DOCUMENTS(DOCUMENT_ID)
);
"""
cursor.execute(create_table_sql)

<snowflake.connector.cursor.SnowflakeCursor at 0x269b7eb5910>

In [15]:
success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df=large_chunks_df,
    database =database,
    table_name="CHUNKS_LARGE",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)

print(f"Success: {success}, Chunks: {nchunks}, Rows: {nrows}")

# Update the embeddings for the chunks in the CHUNKS_LARGE table
cursor.execute("""
    UPDATE CHUNKS_LARGE
    SET EMBEDDING = SNOWFLAKE.CORTEX.EMBED_TEXT_1024(
        'snowflake-arctic-embed-l-v2.0',
        CHUNK_TEXT
    )
    WHERE EMBEDDING IS NULL;
""")


Success: True, Chunks: 1, Rows: 40


<snowflake.connector.cursor.SnowflakeCursor at 0x269b7eb5910>

In [16]:
small_chunks_df = pd.DataFrame()
for row in tqdm(documents_df.iterrows(), total = len(documents_df)):
    manual_id = row[1]["DOCUMENT_ID"]
    tmp_chunked_df = extract_text_chunks(file_path = file_path, 
                        manual_id = manual_id,
                        chunk_size = 1024,
                        chunk_overlap = 64)  # Show first 5 chunks
    small_chunks_df = pd.concat([small_chunks_df, tmp_chunked_df], ignore_index=True)


create_table_sql = """
CREATE OR REPLACE TABLE CHUNKS_SMALL (
    CHUNK_ID INT AUTOINCREMENT PRIMARY KEY,
    DOCUMENT_ID INT NOT NULL,
    PAGE_START_NUMBER INT,
    PAGE_END_NUMBER INT,
    CHUNK_ORDER INT,
    CHUNK_TEXT STRING NOT NULL,
    EMBEDDING VECTOR(FLOAT, 1024),
    CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),
    CONSTRAINT fk_document
        FOREIGN KEY (DOCUMENT_ID)
        REFERENCES DOCUMENTS(DOCUMENT_ID)
);
"""
cursor.execute(create_table_sql)

success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df=small_chunks_df,
    database =database,
    table_name="CHUNKS_SMALL",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)

print(f"Success: {success}, Chunks: {nchunks}, Rows: {nrows}")

# Update the embeddings for the small chunks
cursor.execute("""
    UPDATE CHUNKS_SMALL
    SET EMBEDDING = SNOWFLAKE.CORTEX.EMBED_TEXT_1024(
        'snowflake-arctic-embed-l-v2.0',
        CHUNK_TEXT
    )
    WHERE EMBEDDING IS NULL;
""")


100%|██████████| 3/3 [00:23<00:00,  7.78s/it]


Success: True, Chunks: 1, Rows: 210


<snowflake.connector.cursor.SnowflakeCursor at 0x269b7eb5910>

## Creating sections table using LLM for TOC extraction

The function `extract_TOC` takes quite a while due to the chunk size and the model. This can be tampered with, but i found most consistent results with said model. I also think that larger chunks are better for this task, as the model can see context of the first few pages, and it also ensures that the table of contents is included in the first chunk.

In [None]:
def extract_TOC(text: str, model : str) -> str:
    prompt = (
    """
    I will provide a long string of text that most likely contains a table of contents, 
    although it may also include additional body text from a document. Your task is to carefully 
    extract only the table of contents and structure it as a JSON object in the following 
    format:
    {
      "Section": "<section name>",
      "Section Number": "<section name>",
      "Page": <page number>,
      "Sub Sections" : [{
        "Section": "<section name>",
        "Section Number": "<section name>",
        "Page": <page number>,
        "Sub Sections" : []}
      ],
    }    

    Guideines:
    - All keys in the json object must be either "Section", "Section Number", "Page", "Sub Sections".
    - "Section Number" must be represented as an integer or float - E.G: 1, 2, 5.3, 1,4, etc.
    - Ignore any text that is not part of the table of contents.
    - Ensure that sub-sections are nested appropriately under their parent section.
    - Page numbers should be extracted as integers, if possible.
    - Be tolerant of inconsistencies in formatting, spacing, or punctuation (e.g. dashes, colons, ellipses).
    - Do not include duplicate or repeated sections.
    - You should only consider items which are part of the table of contents, nothing before, nothing after.
    - "Section" must consist of words
    - "Section Number" must be represented as an integer or float - E.G: 1, 2, 5.3, 1,4, etc.
    - You must include a top level key value pair called "Section":"Table of contents".

    """
    f"Text:\n{text}"
    )
    start_time = time.time()
    result = cursor.execute(f"""
        SELECT SNOWFLAKE.CORTEX.COMPLETE('{model}', $$ {prompt} $$)
    """)
    print(f"Runtime in seconds: {time.time() - start_time:.4f}")

    return cursor.fetch_pandas_all().iloc[0,0]


# This example prints out section 4 of the first document of the database. mistral-large2 mistral-7b
# llm_output = extract_TOC(df_large_chunks.loc[0,"CHUNK"], model = 'mistral-7b')

# llm_output = extract_TOC(large_chunks_df.loc[0,"CHUNK_TEXT"], model = 'llama3.1-70b')
# llm_output

Runtime in seconds: 84.2520


'Here is the extracted table of contents in the requested JSON format:\n\n```\n{\n  "Section": "Table of contents",\n  "Section Number": "",\n  "Page": "",\n  "Sub Sections": [\n    {\n      "Section": "Safety",\n      "Section Number": "1",\n      "Page": "4",\n      "Sub Sections": [\n        {\n          "Section": "General information",\n          "Section Number": "1.1",\n          "Page": "4",\n          "Sub Sections": []\n        },\n        {\n          "Section": "Intended use",\n          "Section Number": "1.2",\n          "Page": "4",\n          "Sub Sections": []\n        },\n        {\n          "Section": "Restriction on user group",\n          "Section Number": "1.3",\n          "Page": "4",\n          "Sub Sections": []\n        },\n        {\n          "Section": "Safe installation",\n          "Section Number": "1.4",\n          "Page": "5",\n          "Sub Sections": []\n        },\n        {\n          "Section": "Safe use",\n          "Section Number": "1.5",\n  

In [None]:
def extract_json_from_llm_output(llm_output: str) -> dict:
    try:
        # Confirming that a JSON block is returned
        match = re.search(r"```\s*(\{.*?\})\s*```", llm_output, re.DOTALL)
        if not match:
            raise ValueError("No JSON code block found in the text.")

        # Extracting sub string (json string)
        raw_json = match.group(1)

        # Clean common JSON errors (e.g., trailing commas)
        cleaned_json = re.sub(r",\s*([\]}])", r"\1", raw_json)  # remove trailing commas before ] or }
        
        # Parse string to json
        parsed = json.loads(cleaned_json)
        return parsed
    
    except Exception as e:
        print("Failed to extract JSON:", e)
        return {}

        
# parsed_dict = extract_json_from_llm_output(llm_output)
# print(json.dumps(parsed_dict, indent=2))

In [None]:
def traverse_sections(node, parent_section=None):
    rows = []

    # Get info from the current node
    section = node.get("Section")
    section_number = node.get("Section Number")
    page = node.get("Page")

    # Add current node to the list
    evaluator = load_evaluator("string_distance")
    levenshtein_score_toc = evaluator.evaluate_strings(
    prediction=section,
    reference="Table of Contents",
    metric="levenshtein"
    )["score"]  # This will be a float between 0 and 1, where 0 means identical

    if levenshtein_score_toc > 0.1:  # if the levenshtein distance is very small its likely to match "Table of Contents"
        rows.append({
            "SECTION": section,
            "SECTION_NUMBER": section_number,
            "PAGE": page,
            "PARENT_SECTION_NUMBER": parent_section
        })

    # Recurse into each sub-section, if any
    for subsection in node.get("Sub Sections", []):
        rows.extend(traverse_sections(subsection, parent_section=section_number))

    return rows

# flat_rows = traverse_sections(parsed_dict)
# toc_df = pd.DataFrame(flat_rows)
# toc_df.head(10)


Unnamed: 0,SECTION,SECTION_NUMBER,PAGE,PARENT_SECTION_NUMBER
0,Safety,1.0,4,
1,General information,1.1,4,1.0
2,Intended use,1.2,4,1.0
3,Restriction on user group,1.3,4,1.0
4,Safe installation,1.4,5,1.0
5,Safe use,1.5,7,1.0
6,Safe cleaning and maintenance,1.6,9,1.0
7,Preventing material damage,2.0,10,
8,Environmental protection and saving energy,3.0,11,
9,Disposing of packaging,3.1,11,3.0


In [None]:
def create_TOC_table(documents_df, large_chunks_df, model ="llama3.1-70b"):
    df_list = []

    for row in tqdm(documents_df.iterrows(), total = len(documents_df)):
        manual_id = row[1]["DOCUMENT_ID"]
        file_path = os.path.join(pdf_files_path, row[1]["DOCUMENT_NAME"])
        first_chunk_of_doc = large_chunks_df.loc[large_chunks_df["DOCUMENT_ID"] == manual_id, "CHUNK_TEXT"].iloc[0]
        # print("First chunk:", first_chunk_of_doc)

        llm_output = extract_TOC(first_chunk_of_doc, model = model)
        parsed_dict = extract_json_from_llm_output(llm_output)
        flat_rows = traverse_sections(parsed_dict)
        local_toc_df = pd.DataFrame(flat_rows)
        local_toc_df["DOCUMENT_ID"] = manual_id
        df_list.append(local_toc_df)

    return pd.concat(df_list, ignore_index=True)
     
sections_df = create_TOC_table(documents_df, large_chunks_df, model ="llama3.1-70b")
sections_df

 33%|███▎      | 1/3 [01:33<03:06, 93.47s/it]

Runtime in seconds: 93.4563


 67%|██████▋   | 2/3 [02:38<01:17, 77.03s/it]

Runtime in seconds: 65.5171


100%|██████████| 3/3 [03:48<00:00, 76.24s/it]

Runtime in seconds: 69.7199





Unnamed: 0,SECTION,SECTION_NUMBER,PAGE,PARENT_SECTION_NUMBER,DOCUMENT_ID
0,Safety,1,4,,1
1,General information,1.1,4,1,1
2,Intended use,1.2,4,1,1
3,Restriction on user group,1.3,4,1,1
4,Safe installation,1.4,5,1,1
...,...,...,...,...,...
216,Disposing of old appliance,18.4,47,18,3
217,Customer Service,19,47,,3
218,Product number (E-Nr.) and production number (FD),19.1,47,19,3
219,Consumption values,20,48,,3


In [None]:
cursor.execute("""
    CREATE OR REPLACE TABLE SECTIONS (
    SECTION_ID INT AUTOINCREMENT PRIMARY KEY,
    DOCUMENT_ID INT NOT NULL,
    SECTION STRING NOT NULL,
    SECTION_NUMBER STRING NOT NULL,
    PARENT_SECTION_NUMBER STRING,
    PAGE INT,
    CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),
    CONSTRAINT fk_document
        FOREIGN KEY (DOCUMENT_ID)
        REFERENCES DOCUMENTS(DOCUMENT_ID)
);
""")


success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df=sections_df,
    database =database,
    table_name="SECTIONS",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)
print(f"Success: {success}, Chunks: {nchunks}, Rows: {nrows}")


Success: True, Chunks: 1, Rows: 221


In [7]:
# Lets see the table
cursor.execute("""
    SELECT * 
    FROM CHUNKS_LARGE;
""")

large_chunks_df = cursor.fetch_pandas_all()
large_chunks_df.head()


Unnamed: 0,CHUNK_ID,DOCUMENT_ID,PAGE_START_NUMBER,PAGE_END_NUMBER,CHUNK_ORDER,CHUNK_TEXT,EMBEDDING,CREATED_AT
0,1,1,0,3,0,Register your b M o ge s y n c t B e h f o r w...,"[0.048339844, 0.0158844, -0.015388489, -0.0038...",2025-04-22 01:53:38.352000-07:00
1,2,1,3,7,1,t. ¡ Up to an altitude of max. 4000 m above se...,"[0.038604736, 0.08459473, -0.023513794, -0.014...",2025-04-22 01:53:38.352000-07:00
2,3,1,7,10,2,on or lean against the appliance door. ▶ Do no...,"[0.074279785, 0.068359375, -0.0036392212, 0.02...",2025-04-22 01:53:38.352000-07:00
3,4,1,10,15,3,"ese instructions, your creases energy and wate...","[0.008338928, 0.00027441978, 0.044067383, 0.00...",2025-04-22 01:53:38.352000-07:00
4,5,1,15,22,4,ht. Water outlet connection types 4.6 Aligning...,"[0.052764893, 0.074157715, -0.033294678, 0.011...",2025-04-22 01:53:38.352000-07:00


In [8]:
# Lets see the table
cursor.execute("""
    SELECT * 
    FROM CHUNKS_SMALL;
""")

small_chunks_df = cursor.fetch_pandas_all()
small_chunks_df.head()


Unnamed: 0,CHUNK_ID,DOCUMENT_ID,PAGE_START_NUMBER,PAGE_END_NUMBER,CHUNK_ORDER,CHUNK_TEXT,EMBEDDING,CREATED_AT
0,1,1,0,1,0,Register your b M o ge s y n c t B e h f o r w...,"[0.032073975, 0.05307007, -0.021911621, -0.006...",2025-04-22 01:54:15.568000-07:00
1,2,1,1,1,1,..... 29 2Preventing material damage.... 10 13...,"[0.018035889, 0.035339355, -0.0013151169, 0.00...",2025-04-22 01:54:15.568000-07:00
2,3,1,1,2,2,moving the transit bolts...... 13 13.10 Cancel...,"[0.086364746, 0.014724731, 0.004234314, -0.003...",2025-04-22 01:54:15.568000-07:00
3,4,1,1,3,3,......... 20 2 en 15 Basic settings..............,"[0.029403687, -0.017074585, -0.033416748, 0.00...",2025-04-22 01:54:15.568000-07:00
4,5,1,3,3,4,e the following safety instructions. 1.1 Gener...,"[0.050964355, 0.06311035, -0.032409668, 0.0217...",2025-04-22 01:54:15.568000-07:00


In [6]:
# Lets see the table
cursor.execute("""
    SELECT * 
    FROM SECTIONS;
""")

sections_df = cursor.fetch_pandas_all()
sections_df.head()

sections_df


Unnamed: 0,SECTION_ID,DOCUMENT_ID,SECTION,SECTION_NUMBER,PARENT_SECTION_NUMBER,PAGE,CREATED_AT
0,1,1,Safety,1,,4,2025-04-22 03:14:00.711000-07:00
1,2,1,General information,1.1,1,4,2025-04-22 03:14:00.711000-07:00
2,3,1,Intended use,1.2,1,4,2025-04-22 03:14:00.711000-07:00
3,4,1,Restriction on user group,1.3,1,4,2025-04-22 03:14:00.711000-07:00
4,5,1,Safe installation,1.4,1,5,2025-04-22 03:14:00.711000-07:00
...,...,...,...,...,...,...,...
216,217,3,Disposing of old appliance,18.4,18,47,2025-04-22 03:14:00.711000-07:00
217,218,3,Customer Service,19,,47,2025-04-22 03:14:00.711000-07:00
218,219,3,Product number (E-Nr.) and production number (FD),19.1,19,47,2025-04-22 03:14:00.711000-07:00
219,220,3,Consumption values,20,,48,2025-04-22 03:14:00.711000-07:00


# Extracting images from the manual

This chosen method which appears to be more diverse across the manuals treats each page as an image. This is a good way to ensure that all images are extracted. 
The downside is that tables and other image like content will be extracted as images. Currently this is a feature not a bug. Adjusting the image extraction method is a task for the future when we have the real PDFs.

In [60]:
def render_pdf_to_images(pdf_path, zoom=2.0):
    doc = fitz.open(pdf_path)
    images = []
    for i, page in enumerate(doc):
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        img_data = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        images.append({
            "page_number": i + 1,
            "image": img_data
        })
    return images


def get_pdf_page_pixel_size(pdf_image):
    width, height = pdf_image.size
    return width * height


def detect_image_regions(page_image, buffer=0, min_size=70, max_size = 1000, threshold=240):
    image = np.array(page_image)
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Applying blur to reduce fine lines from tables
    _, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    regions = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > min_size and h > min_size:  # Skip tiny blocks (Maybe reconsider)
            regions.append([x - buffer, 
                            y - buffer, 
                            x + w + buffer, 
                            y + h + buffer])
            if w * h > max_size:
                regions.pop(-1)  
    return regions


def crop_regions_from_image(page_image, regions, output_dir, page_num, manual_id):
    os.makedirs(output_dir, exist_ok=True)
    saved_images = []

    for i, coords in enumerate(regions):
        x1, y1, x2, y2 = map(int, coords)
        cropped = page_image.crop((x1, y1, x2, y2))
        save_path = os.path.join(output_dir, f"doc_{manual_id}_page_{page_num}_img_{i+1}.png")
        cropped.save(save_path)
        saved_images.append({
            "page": page_num,
            "image_path": save_path,
            "coords": (x1, y1, x2, y2)
        })
    return saved_images



def add_region_to_page(page_image, regions, output_dir, page_num, pdf_path ,color=(0, 255, 0), alpha=50, save=True, verbose=0):
    os.makedirs(output_dir, exist_ok=True)

    # Convert to RGBA to allow transparency
    annotated = page_image.convert("RGBA")
    overlay = Image.new("RGBA", annotated.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(overlay)

    for coords in regions:
        x1, y1, x2, y2 = map(int, coords)
        draw.rectangle([x1, y1, x2, y2], outline=color + (alpha,), fill=color + (alpha,))

    # Combine original image with overlay
    combined = Image.alpha_composite(annotated, overlay)

    if save:
        save_path = os.path.join(output_dir, f"page_{page_num:03d}_with_regions_{color}.png")
        combined.convert("RGB").save(save_path)
        if verbose > 0:
            print(f"Saved page {page_num} with highlighted regions to {save_path}")

    return combined


def merge_overlapping_regions(regions, buffer=0):
    """
    Merges overlapping or intersecting regions.

    Args:
        regions (List[List[int]]): List of regions as [x1, y1, x2, y2].
        buffer (int): Optional buffer added to each region before checking overlaps.

    Returns:
        List[List[int]]: Merged list of non-overlapping regions.
    """
    from shapely.geometry import box
    from shapely.ops import unary_union

    # Convert to shapely boxes with optional buffer
    boxes = [box(x1 - buffer, y1 - buffer, x2 + buffer, y2 + buffer) for x1, y1, x2, y2 in regions]

    # Merge all overlapping boxes (A fix to a previous issues of diagrams being cropped into multiple images)
    merged = unary_union(boxes)

    # Ensure output is a list of boxes
    if merged.geom_type == 'Polygon':
        merged_boxes = [merged]
    else:
        merged_boxes = list(merged.geoms)

    # Convert back to [x1, y1, x2, y2] format (round to int)
    merged_regions = []
    for b in merged_boxes:
        x1, y1, x2, y2 = b.bounds
        merged_regions.append([int(x1), int(y1), int(x2), int(y2)])

    return merged_regions



# This is the main function to extract images from the PDF
def extract_images_from_pdf(pdf_path:str, manual_id:int, output_dir: str, verbose:int =0):
    rendered_pages = render_pdf_to_images(pdf_path)
    all_extracted = []

    for page_idx,page in enumerate(rendered_pages):
        page_num = page["page_number"] 
        image = page["image"]
        if verbose > 0:
            print(f"Processing page {page_num}...")

        # Detecting regions
        regions = detect_image_regions(image , buffer=2, min_size=70, 
                                        max_size=get_pdf_page_pixel_size(image) * 0.99)
        # Creates new regions by merging overlapping regions (this is a fix for cropped images  )
        new_regions = merge_overlapping_regions(regions, buffer=0)

        if verbose > 0:
            print(f"Found {len(new_regions)} image regions on page {page_num}")

        if not new_regions:
            if verbose > 0:
                print(f"No image regions found on page {page_num}")
            continue
        
        # Creates an image directory for each PDF file
        image_output_dir = pdf_path.split("/")[-1].replace(".pdf", "").replace("Washer_Manuals", output_dir)
        os.makedirs(image_output_dir, exist_ok=True)

        # Showing the pages with the masked regions 
        modified_image = add_region_to_page(image, new_regions, image_output_dir, page_num, pdf_path, color=(0, 0, 255), alpha=50, save = False)

        # OLD code 
        extracted = crop_regions_from_image(
            image, new_regions, output_dir=image_output_dir, page_num=page_num, manual_id=manual_id
        )
        all_extracted.extend(extracted)
    return all_extracted


for idx,row in tqdm(enumerate(documents_df.iterrows()), total = len(documents_df)):
    manual_id = row[1]["DOCUMENT_ID"]
    file_path = os.path.join(pdf_files_path, row[1]["DOCUMENT_NAME"])
    extract_images_from_pdf(file_path, manual_id, output_dir="Washer_Images", verbose = 0)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:06<00:00,  2.33s/it]


# Creating table for image references and metadata

Currently the images are matched to the sections using the page number, which is problematic if the end of section 4.3 is one the same page as the start of section 4.4. On the top of my head i'm not quite sure how to match the images to the sections accurately, but this method yields mostly correct results.

In [83]:
def extract_page_number_from_filename(filename):
    return filename.split("_")[3] if "_" in filename else None

def generate_image_table(documents_df, sections_df, image_dir):
    image_records = []

    # Loop over all subdirectories in image_dir
    for subfolder in os.listdir(image_dir):
        subfolder_path = os.path.join(image_dir, subfolder)
        
        if not os.path.isdir(subfolder_path):
            continue  # skip files
        
        # Match to document by DOCUMENT_NAME (strip extension if needed)
        matching_docs = documents_df[documents_df['DOCUMENT_NAME'].str.contains(subfolder, case=False)]
        if matching_docs.empty:
            print(f"No matching document for subfolder: {subfolder}")
            continue
        
        document_id = matching_docs.iloc[0]['DOCUMENT_ID']
        document_name = matching_docs.iloc[0]['DOCUMENT_NAME']
        
        # List all image files in subdirectory
        for image_file in os.listdir(subfolder_path):
            if not image_file.lower().endswith((".png")):
                continue
            
            image_path = os.path.join(subfolder_path, image_file)
            page_number = extract_page_number_from_filename(image_file)
            order_number = image_file.split("img_")[-1].strip(".png")

            image_size = os.path.getsize(image_path)
            image_width, image_height = Image.open(image_path).size
            
            # Try to match to a section (same document, closest PAGE <= image page)
            section_match = None
            if page_number is not None:
                matching_sections = sections_df[
                    (sections_df['DOCUMENT_ID'] == document_id) & 
                    (sections_df['PAGE'].astype(str) <= str(page_number))
                ]
                if not matching_sections.empty:
                    section_match = matching_sections.sort_values("PAGE", ascending=False).iloc[0]
            
            image_records.append({
                "DOCUMENT_ID": document_id,
                "SECTION_ID": section_match["SECTION_ID"] if section_match is not None else None,
                "SECTION_NUMBER": section_match["SECTION_NUMBER"] if section_match is not None else None,
                "PAGE": page_number,
                "IMG_ORDER": order_number,
                "IMAGE_FILE": image_file,
                "IMAGE_PATH": image_path,
                "IMAGE_SIZE": image_size,
                "IMAGE_WIDTH": image_width,
                "IMAGE_HEIGHT": image_height
            })

    return pd.DataFrame(image_records)


image_df = generate_image_table(documents_df, sections_df, ".\\Washer_Images")
image_df.head(5)

Unnamed: 0,DOCUMENT_ID,SECTION_ID,SECTION_NUMBER,PAGE,IMG_ORDER,IMAGE_FILE,IMAGE_PATH,IMAGE_SIZE,IMAGE_WIDTH,IMAGE_HEIGHT
0,1,15,4.2,13,1,doc_1_page_13_img_1.png,.\Washer_Images\WAV28KH3GB\doc_1_page_13_img_1...,27070,318,452
1,1,15,4.2,13,2,doc_1_page_13_img_2.png,.\Washer_Images\WAV28KH3GB\doc_1_page_13_img_2...,4549,113,152
2,1,15,4.2,13,3,doc_1_page_13_img_3.png,.\Washer_Images\WAV28KH3GB\doc_1_page_13_img_3...,9425,157,140
3,1,17,4.4,14,1,doc_1_page_14_img_1.png,.\Washer_Images\WAV28KH3GB\doc_1_page_14_img_1...,3428,166,121
4,1,17,4.4,14,2,doc_1_page_14_img_2.png,.\Washer_Images\WAV28KH3GB\doc_1_page_14_img_2...,4973,166,120


In [84]:
cursor.execute("""
    CREATE OR REPLACE TABLE IMAGES (
    IMAGE_ID INT AUTOINCREMENT PRIMARY KEY,
    SECTION_ID INT NOT NULL,
    DOCUMENT_ID INT NOT NULL,
    SECTION_NUMBER STRING NOT NULL,
    PAGE INT,
    IMG_ORDER INT,
    IMAGE_FILE STRING,
    IMAGE_PATH STRING,
    IMAGE_SIZE NUMBER,
    IMAGE_WIDTH NUMBER,
    IMAGE_HEIGHT NUMBER,
    CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),

    CONSTRAINT fk_document
        FOREIGN KEY (DOCUMENT_ID)
        REFERENCES DOCUMENTS(DOCUMENT_ID),
        
    CONSTRAINT fk_section
            FOREIGN KEY (SECTION_ID)
            REFERENCES SECTIONS(SECTION_ID)
);
""")


success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df=image_df,
    database =database,
    table_name="IMAGES",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)
print(f"Success: {success}, Chunks: {nchunks}, Rows: {nrows}")


Success: True, Chunks: 1, Rows: 194


In [11]:
# Lets see the table
cursor.execute("""
    SELECT * 
    FROM IMAGES;
""")

images_df = cursor.fetch_pandas_all()
images_df.head()

images_df.tail()

Unnamed: 0,IMAGE_ID,SECTION_ID,DOCUMENT_ID,SECTION_NUMBER,PAGE,IMG_ORDER,IMAGE_FILE,IMAGE_PATH,IMAGE_SIZE,IMAGE_WIDTH,IMAGE_HEIGHT,CREATED_AT
189,190,210,3,16.3,36,2,doc_3_page_36_img_2.png,.\Washer_Images\WGG254Z0GB\doc_3_page_36_img_2...,28987,328,237,2025-04-22 04:25:31.443000-07:00
190,191,210,3,16.3,36,3,doc_3_page_36_img_3.png,.\Washer_Images\WGG254Z0GB\doc_3_page_36_img_3...,15373,163,237,2025-04-22 04:25:31.443000-07:00
191,192,210,3,16.3,36,4,doc_3_page_36_img_4.png,.\Washer_Images\WGG254Z0GB\doc_3_page_36_img_4...,13176,164,237,2025-04-22 04:25:31.443000-07:00
192,193,216,3,18.3,46,1,doc_3_page_46_img_1.png,.\Washer_Images\WGG254Z0GB\doc_3_page_46_img_1...,16799,328,237,2025-04-22 04:25:31.443000-07:00
193,194,216,3,18.3,46,2,doc_3_page_46_img_2.png,.\Washer_Images\WGG254Z0GB\doc_3_page_46_img_2...,16547,328,237,2025-04-22 04:25:31.443000-07:00
