## This Notebook will create the database for the temporary project washing machine manuals

The intention of this notebook is to create a clean and best practice database structure, along with utilizing snowflakes AI functions.

The intended database structure is as follows: 

- **documents** (Stores metadata about each manual)  
  - `document_id` (Unique ID for each manual)  
  - `doc_name` (Document name)
  - `version` (Version or revision number)  
  - `relative_path` (Original PDF file path or S3 URL) 
  - `stage_name`  (snowflake stage name (source))
  - `size`  (size in bytes of the PDF document) 

- **sections** (Defines logical sections and subsections within each manual)  
  - `section_id` (Unique ID for the section)  
  - `manual_id` (Foreign key referencing `manuals`)  
  - `title` (Title or heading of the section)  
  - `order_num` (Numerical order of the section in the manual)  
  - `parent_section_id` (Optional FK for nested subsections)  

- **chunks small** (1024 characters, 64 overlap)
  - `chunk_id` (Unique ID for the chunk)  
  - `section_id` (Foreign key referencing `sections`)  
  - `chunk_text` (The text content of the chunk)  
  - `chunk_order` (Order of the chunk within the section)  
  - `embedding` (Vector for semantic search or embeddings)  

- **chunks large** (4096 characters, overlap 256)
  - `chunk_id` (Unique ID for the chunk)  
  - `section_id` (Foreign key referencing `sections`)  
  - `chunk_text` (The text content of the chunk)  
  - `chunk_order` (Order of the chunk within the section)  
  - `embedding` (Vector for semantic search or embeddings)  

- **images** (Stores references to images extracted from the manual)  
  - `image_id` (Unique ID for the image)  
  - `manual_id` (Foreign key referencing `manuals`)  
  - `page_number` 
  - `section_id` (Foreign key referencing `sections`)  
  - `order_num` (Display order within the section)  
  - `image_path` (S3 or web-accessible path to the image)  
  - `description`   



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keyring
import os 
import snowflake.connector as sf_connector # ( https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-connect)
from snowflake.connector.pandas_tools import write_pandas # (https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#write_pandas)
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.evaluation import load_evaluator
from collections import defaultdict
import PyPDF2
from PyPDF2 import PdfReader

from snowflake.core import Root
from snowflake.snowpark import Session
# from snowflake.snowpark.cortex import KnowledgeBase
from snowflake.snowpark.context import get_active_session

import numpy as np
from tqdm import tqdm
import time
import re
import json

from io import BytesIO
import fitz 
from shapely.geometry import box
from shapely.ops import unary_union
from PIL import Image, ImageDraw
import cv2

# Set max rows to display in pandas DataFrame 200
pd.set_option('display.max_rows', 200)

In [2]:
account_identifier = keyring.get_password('NC_Snowflake_Trial_Account_Name', 'account_identifier')
user_name = "JESPEREDSTROM"
password = keyring.get_password('NC_Snowflake_Trial_User_Password', user_name)
database = "WASHING_MACHINE_MANUALS"
schema = "PUBLIC"



print("Account Identifier: ", account_identifier)
print("User Name: ", user_name)
print("Database: ", database)
print("Schema: ", schema)

try:
    connection_parameters = {
        "account": account_identifier,
        "account_identifier": account_identifier,
        "user": user_name,
        "password": password,
        "role": "SYSADMIN",
        "warehouse": "COMPUTE_WH",
        "database": database,
        "schema": schema,
        
    }
except:
        connection_parameters = {
        "account": account_identifier,
        "account_identifier": account_identifier,
        "user": user_name,
        "password": password,
        "role": "SYSADMIN",
        "warehouse": "COMPUTE_WH",
        "database": "SNOWFLAKE",
        "schema": "CORTEX"
    }


# Connect to Snowflake
conn = sf_connector.connect(
    user=connection_parameters['user'],
    password=connection_parameters['password'],
    account=connection_parameters['account_identifier'],
    warehouse=connection_parameters['warehouse'],
    database=connection_parameters['database'],
    schema=connection_parameters['schema'],
    role=connection_parameters['role']
)

session = Session.builder.configs(connection_parameters).create()

cursor = conn.cursor()
cursor.execute(f" CREATE DATABASE IF NOT EXISTS {database}; ")
cursor.execute(f" CREATE SCHEMA IF NOT EXISTS {database}.{schema}; ")
cursor.execute(f" USE DATABASE {database}; ")
cursor.execute(f" USE SCHEMA {schema}; ")



Account Identifier:  TWHPJRA-OEB55075
User Name:  JESPEREDSTROM
Database:  WASHING_MACHINE_MANUALS
Schema:  PUBLIC


<snowflake.connector.cursor.SnowflakeCursor at 0x1abe6de2a80>

## Create a stage for the PDF files with the code below
#### DO NOT RUN - unless you don't have the documents in the stage.

In [3]:
# Creating stage to dump PDF documents into
# cursor.execute(" create or replace stage docs ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE') DIRECTORY = ( ENABLE = true ); ")

## Creating documents table

In [4]:
cursor.execute("""
    CREATE OR REPLACE TABLE DOCUMENTS (
    DOCUMENT_ID INT AUTOINCREMENT PRIMARY KEY,
    DOCUMENT_NAME STRING,
    DOC_VERSION STRING,
    FILE_PATH STRING NOT NULL,
    FILE_SIZE NUMBER,
    CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP()
);
""")

<snowflake.connector.cursor.SnowflakeCursor at 0x199c0cbbaa0>

# The section below focuses on creating chunks_large and chunks_small.

Different size chunks are good at different things - it could be a good idea to store both size, especially during testing

### Extract error codes

In [3]:
def extract_error_codes(text):
    # Match 'E:XX' followed by optional whitespace and '/-XX' parts
    patterns = [r'E:\d{2}(?:\s*/\s*-\d{2})*', r'E\d+']
    
    all_matches = []
    
    for pattern in patterns:
        matches = re.findall(pattern, text, re.DOTALL)
        cleaned = [re.sub(r'\s*/\s*', '/', match) for match in matches]
        all_matches.extend(cleaned)
    
    # Remove duplicates while preserving order
    unique_errors = list(dict.fromkeys(all_matches))
    error_codes = pd.DataFrame({'ERROR_CODES': unique_errors})
    error_codes["ERROR_ID"] = error_codes.index
    return error_codes

### Extract basic document data

In [6]:
def extract_document_and_errorcode_data(pdf_files_path = "Washer_Manuals"):
    """Method for extracting basic document data for files in a folder 
    Input:
        string: relative path to folder with documents
    Returns:
        pandas.dataframe: dataframe for the documents in the folder provided
        pandas.dataframe: dataframe for the error codes present in the documents
    """
    document = ""
    document_rows = []

    for idx, filename in enumerate(os.listdir(pdf_files_path)):
        # if filename not in ["k714wm14 service manual.pdf","WAV28KH3GB.pdf"]:
        #     continue
        if filename.endswith(".pdf"):
            file_path = os.path.join(pdf_files_path, filename)
        
        with open(file_path, 'rb') as file:
            reader = PdfReader(file)
            print(reader.__dict__)
            
            for page_number in range(len(reader.pages)):
                page = reader.pages[page_number]
                text = page.extract_text()
                document += text + "\n"
                # print(text)
                
                file_size = os.path.getsize(file_path)
            
        document_rows.append({
            "DOCUMENT_NAME": filename,
            "FILE_PATH": file_path,
            "DOC_VERSION": "N/A",  # Placeholder, you can modify this logic as needed
            "FILE_SIZE": file_size
        })

    documents_df = pd.DataFrame(document_rows)
    error_codes = extract_error_codes(document)
    return documents_df, error_codes

documents_df, error_codes_df = extract_document_and_errorcode_data()

{'strict': False, 'flattened_pages': None, 'resolved_objects': {(0, 468): {'/DecodeParms': {'/Columns': 4, '/Predictor': 12}, '/Filter': '/FlateDecode', '/ID': [b'\xed\xf8\x9fV\xe9\x88|\x03\x05\x02\x01J|(Z\x0e', 'âÍëÜùV\x02H—Y\x11_˝Ì‹\x15'], '/Index': [456, 25], '/Info': IndirectObject(455, 0, 1759877569552), '/Prev': 2240089, '/Root': IndirectObject(457, 0, 1759877569552), '/Size': 481, '/Type': '/XRef', '/W': [1, 2, 1]}, (0, 340): {'/DecodeParms': {'/Columns': 5, '/Predictor': 12}, '/Filter': '/FlateDecode', '/ID': [b'\xed\xf8\x9fV\xe9\x88|\x03\x05\x02\x01J|(Z\x0e', 'âÍëÜùV\x02H—Y\x11_˝Ì‹\x15'], '/Info': IndirectObject(455, 0, 1759877569552), '/Root': IndirectObject(457, 0, 1759877569552), '/Size': 456, '/Type': '/XRef', '/W': [1, 3, 1]}}, 'xref_index': 0, '_page_id2num': None, 'xref': {0: {456: 16, 457: 1197, 458: 1282, 459: 1521, 460: 2427, 461: 5075, 462: 7768, 463: 7910, 464: 18699, 465: 19014, 466: 42915, 467: 43124, 468: 116, 480: 482, 1: 51237, 2: 51472, 3: 53683, 4: 53877, 5:

In [40]:
display(documents_df)

Unnamed: 0,DOCUMENT_ID,DOCUMENT_NAME,DOC_VERSION,FILE_PATH,FILE_SIZE,CREATED_AT
0,1,k714wm14 service manual.pdf,,Washer_Manuals\k714wm14 service manual.pdf,2241315,2025-04-29 05:04:50.736000-07:00
1,2,mmo_87050793_1630397705_64_10689.pdf,,Washer_Manuals\mmo_87050793_1630397705_64_1068...,3219053,2025-04-29 05:04:50.736000-07:00
2,3,technical-manual-w11663204-revb.pdf,,Washer_Manuals\technical-manual-w11663204-revb...,17270389,2025-04-29 05:04:50.736000-07:00
3,4,WAK20160IN.pdf,,Washer_Manuals\WAK20160IN.pdf,5052759,2025-04-29 05:04:50.736000-07:00
4,5,WAN28258GB.pdf,,Washer_Manuals\WAN28258GB.pdf,3374759,2025-04-29 05:04:50.736000-07:00
5,6,WAN28282GC.pdf,,Washer_Manuals\WAN28282GC.pdf,3004904,2025-04-29 05:04:50.736000-07:00
6,7,Washing machine Top-loader C series.pdf,,Washer_Manuals\Washing machine Top-loader C se...,2056394,2025-04-29 05:04:50.736000-07:00
7,8,WAT24168IN.pdf,,Washer_Manuals\WAT24168IN.pdf,4721819,2025-04-29 05:04:50.736000-07:00
8,9,WAV28KH3GB.pdf,,Washer_Manuals\WAV28KH3GB.pdf,5686613,2025-04-29 05:04:50.736000-07:00
9,10,WFL2050.pdf,,Washer_Manuals\WFL2050.pdf,2749291,2025-04-29 05:04:50.736000-07:00


In [7]:
success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df=documents_df,
    database =database,
    table_name="DOCUMENTS",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)

# Lets see the table
cursor.execute("""
    SELECT * 
    FROM DOCUMENTS;
""")

documents_df = cursor.fetch_pandas_all()
documents_df.head()

Unnamed: 0,DOCUMENT_ID,DOCUMENT_NAME,DOC_VERSION,FILE_PATH,FILE_SIZE,CREATED_AT
0,1,k714wm14 service manual.pdf,,Washer_Manuals\k714wm14 service manual.pdf,2241315,2025-04-29 05:04:50.736000-07:00
1,2,mmo_87050793_1630397705_64_10689.pdf,,Washer_Manuals\mmo_87050793_1630397705_64_1068...,3219053,2025-04-29 05:04:50.736000-07:00
2,3,technical-manual-w11663204-revb.pdf,,Washer_Manuals\technical-manual-w11663204-revb...,17270389,2025-04-29 05:04:50.736000-07:00
3,4,WAK20160IN.pdf,,Washer_Manuals\WAK20160IN.pdf,5052759,2025-04-29 05:04:50.736000-07:00
4,5,WAN28258GB.pdf,,Washer_Manuals\WAN28258GB.pdf,3374759,2025-04-29 05:04:50.736000-07:00


## Creating chunks tables with vector embeddings

To include page numbers, i decided to create the tables using pandas, and then uploading them to snowflake

Followed by that will be a query to crete a vector embeddings.

### Extract text chunks using PdfReader

In [8]:
## Extracting section headers from the PDF files

def extract_text_chunks(file_path: str, manual_id: int, chunk_size: int = 512, chunk_overlap: int = 128):
    with open(file_path, 'rb') as file:
        pdf_object = PdfReader(file)
        print(pdf_object.__dict__)
        document = ""
        for page_number in range(len(pdf_object.pages)):
            page = pdf_object.pages[page_number]
            text = page.extract_text()
            document += text + "\n"
            # print(text)
    # print(file_path)
    # Step 1: Combine all text across pages with page tracking
    all_text = ""
    page_map = []  # (char_index, page_number)

    for page_number, doc_page in enumerate(pdf_object.pages):
        # print(doc_page)
        page = pdf_object.pages[page_number]
        text = page.extract_text()
        start_idx = len(all_text)
        all_text += text + " "  # Add space to separate pages
        end_idx = len(all_text)
        page_map.append((start_idx, end_idx, pdf_object.get_page_number(doc_page)))


    print("PAGE MAP: ", page_map)

    # Step 2: Create chunks with overlap, spanning across pages
    chunks = []
    chunk_order = []
    page_start_list = []
    page_end_list = []

    idx = 0
    chunk_idx = 0
    
    # print(extract_error_codes(all_text))

    while idx < len(all_text):
        chunk = all_text[idx:idx + chunk_size]

        # Determine pages involved in this chunk
        chunk_start = idx
        chunk_end = idx + len(chunk)

        pages_in_chunk = [
            page_num
            for start, end, page_num in page_map
            if not (end <= chunk_start or start >= chunk_end)  # overlap condition
        ]

        page_start = min(pages_in_chunk) if pages_in_chunk else None
        page_end = max(pages_in_chunk) if pages_in_chunk else None

        chunks.append(chunk)
        page_start_list.append(page_start)
        page_end_list.append(page_end)
        chunk_order.append(chunk_idx)

        chunk_idx += 1
        idx += chunk_size - chunk_overlap

    # Step 3: Create DataFrame
    rows = [{
        'DOCUMENT_ID': manual_id,
        'PAGE_START_NUMBER': start,
        'PAGE_END_NUMBER': end,
        'CHUNK_TEXT': chunk,
        'CHUNK_ORDER': order
    } for chunk, start, end, order in zip(chunks, page_start_list, page_end_list, chunk_order)]

    df = pd.DataFrame(rows, columns=["DOCUMENT_ID", "PAGE_START_NUMBER", "PAGE_END_NUMBER", "CHUNK_TEXT", "CHUNK_ORDER"])

    return df


# Text chunk mapper

In [9]:
def text_chunk_mapper(document_table, existing_chunk_table = None):
    # Prepare empty list to store mapping rows
    if extract_text_chunks:
        chunks = [existing_chunk_table]
    else:
        chunks = []

    for document_id, document_row_object in document_table.iterrows():
        file_path = document_row_object['FILE_PATH']
        chunks.append(extract_text_chunks(file_path = file_path, 
                                             manual_id = document_id,
                                            chunk_size = 1024,
                                            chunk_overlap = 128))
    df_chunk = pd.concat(chunks, axis=0, ignore_index=True)
    df_chunk["CHUNK_ID"] = df_chunk.index
    return df_chunk

### Create sql table in snowflake for large chunks

In [10]:

# large_chunks_df = pd.DataFrame()
# for row in tqdm(documents_df.iterrows(), total = len(documents_df)):
#     manual_id = row[1]["DOCUMENT_ID"]
#     file_path = os.path.join(pdf_files_path, row[1]["DOCUMENT_NAME"])

#     tmp_chunked_df = extract_text_chunks(file_path = file_path, 
#                         manual_id = manual_id,
#                         chunk_size = 6000,#1024,
#                         chunk_overlap = 128)  # Show first 5 chunks
#     large_chunks_df = pd.concat([large_chunks_df, tmp_chunked_df], ignore_index=True)

# # large_chunks_df
# create_table_sql = """
# CREATE OR REPLACE TABLE CHUNKS_LARGE (
#     CHUNK_ID INT AUTOINCREMENT PRIMARY KEY,
#     DOCUMENT_ID INT NOT NULL,
#     PAGE_START_NUMBER INT,
#     PAGE_END_NUMBER INT,
#     CHUNK_ORDER INT,
#     CHUNK_TEXT STRING NOT NULL,
#     EMBEDDING VECTOR(FLOAT, 1024),
#     CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),
#     CONSTRAINT fk_document
#         FOREIGN KEY (DOCUMENT_ID)
#         REFERENCES DOCUMENTS(DOCUMENT_ID)
# );
# """
# cursor.execute(create_table_sql)

In [11]:
# success, nchunks, nrows, output = write_pandas(
#     conn=conn,
#     df=large_chunks_df,
#     database =database,
#     table_name="CHUNKS_LARGE",
#     schema=schema,
#     auto_create_table=False,
#     overwrite=False
# )

# print(f"Success: {success}, Chunks: {nchunks}, Rows: {nrows}")

# # Update the embeddings for the chunks in the CHUNKS_LARGE table
# cursor.execute("""
#     UPDATE CHUNKS_LARGE
#     SET EMBEDDING = SNOWFLAKE.CORTEX.EMBED_TEXT_1024(
#         'snowflake-arctic-embed-l-v2.0',
#         CHUNK_TEXT
#     )
#     WHERE EMBEDDING IS NULL;
# """)


### Create sql table for small chunks

In [12]:
# small_chunks_df = pd.DataFrame()
# for document_id, document_row_object in document_table.iterrows():
#     file_path = document_row_object['FILE_PATH']
#     tmp_chunked_df = extract_text_chunks(file_path= file_path, 
#                         manual_id = manual_id,
#                         chunk_size = 1024,
#                         chunk_overlap = 64)  # Show first 5 chunks
#     small_chunks_df = pd.concat([small_chunks_df, tmp_chunked_df], ignore_index=True)
# tmp_chunked_df
small_chunks_df = text_chunk_mapper(documents_df)

create_table_sql = """
CREATE OR REPLACE TABLE CHUNKS_SMALL (
    CHUNK_ID INT AUTOINCREMENT PRIMARY KEY,
    DOCUMENT_ID INT NOT NULL,
    PAGE_START_NUMBER INT,
    PAGE_END_NUMBER INT,
    CHUNK_ORDER INT,
    CHUNK_TEXT STRING NOT NULL,
    EMBEDDING VECTOR(FLOAT, 1024),
    CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),
    CONSTRAINT fk_document
        FOREIGN KEY (DOCUMENT_ID)
        REFERENCES DOCUMENTS(DOCUMENT_ID)
);
"""
cursor.execute(create_table_sql)

success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df=small_chunks_df,
    database =database,
    table_name="CHUNKS_SMALL",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)

print(f"Success: {success}, Chunks: {nchunks}, Rows: {nrows}")

# Update the embeddings for the small chunks
cursor.execute("""
    UPDATE CHUNKS_SMALL
    SET EMBEDDING = SNOWFLAKE.CORTEX.EMBED_TEXT_1024(
        'snowflake-arctic-embed-l-v2.0',
        CHUNK_TEXT
    )
    WHERE EMBEDDING IS NULL;
""")


{'strict': False, 'flattened_pages': None, 'resolved_objects': {(0, 468): {'/DecodeParms': {'/Columns': 4, '/Predictor': 12}, '/Filter': '/FlateDecode', '/ID': [b'\xed\xf8\x9fV\xe9\x88|\x03\x05\x02\x01J|(Z\x0e', 'âÍëÜùV\x02H—Y\x11_˝Ì‹\x15'], '/Index': [456, 25], '/Info': IndirectObject(455, 0, 1759879196896), '/Prev': 2240089, '/Root': IndirectObject(457, 0, 1759879196896), '/Size': 481, '/Type': '/XRef', '/W': [1, 2, 1]}, (0, 340): {'/DecodeParms': {'/Columns': 5, '/Predictor': 12}, '/Filter': '/FlateDecode', '/ID': [b'\xed\xf8\x9fV\xe9\x88|\x03\x05\x02\x01J|(Z\x0e', 'âÍëÜùV\x02H—Y\x11_˝Ì‹\x15'], '/Info': IndirectObject(455, 0, 1759879196896), '/Root': IndirectObject(457, 0, 1759879196896), '/Size': 456, '/Type': '/XRef', '/W': [1, 3, 1]}}, 'xref_index': 0, '_page_id2num': None, 'xref': {0: {456: 16, 457: 1197, 458: 1282, 459: 1521, 460: 2427, 461: 5075, 462: 7768, 463: 7910, 464: 18699, 465: 19014, 466: 42915, 467: 43124, 468: 116, 480: 482, 1: 51237, 2: 51472, 3: 53683, 4: 53877, 5:

<snowflake.connector.cursor.SnowflakeCursor at 0x199c0cbbaa0>

# Create table for error codes

In [13]:

error_codes_df["ERROR_ID"] = error_codes_df.index
error_codes_df

Unnamed: 0,ERROR_CODES,ERROR_ID
0,E:17,0
1,E:18,1
2,E:23,2
3,E:36,3
4,E:30,4
5,E:60,5
6,E:32,6
7,E:35/-10,7
8,E:30/-80,8
9,E:36/-25/-26,9


In [14]:
create_table_sql = """
CREATE OR REPLACE TABLE ERROR_CODES (
    ERROR_ID INT AUTOINCREMENT PRIMARY KEY,
    ERROR_CODES STRING NOT NULL
);
"""
cursor.execute(create_table_sql)


success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df= error_codes_df,
    database =database,
    table_name="ERROR_CODES",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)

print(f"Success: {success}, Chunks: {nchunks}, Rows: {nrows}")

Success: True, Chunks: 1, Rows: 32


# Map errors to chunks where they are named

In [15]:
def map_error_codes_to_chunks(tmp_chunked_df, error_codes):
    # Prepare empty list to store mapping rows
    mappings = []

    for chunk_id, chunk_row in tmp_chunked_df.iterrows():
        chunk_text = chunk_row['CHUNK_TEXT']#.lower()
        for error_id, error_row in error_codes.iterrows():
            error_code = error_row['ERROR_CODES']
            if error_code in chunk_text.replace(" ", ""):
                mappings.append({
                    'CHUNK_ID': chunk_id,
                    'ERROR_ID': error_id
                })
                
    ERROR_CODE_MAPPING = pd.DataFrame(mappings)
    
    return ERROR_CODE_MAPPING

ERROR_CODE_MAPPING = map_error_codes_to_chunks(small_chunks_df, error_codes_df)


In [35]:
# print(ERROR_CODE_MAPPING.head())
display(ERROR_CODE_MAPPING)

Unnamed: 0,CHUNK_ID,ERROR_ID
0,74,15
1,75,15
2,75,16
3,75,17
4,75,18
5,76,15
6,76,16
7,76,17
8,76,18
9,77,15


## Create sql table for error mappings to small chunks

In [17]:
create_table_sql = """
CREATE OR REPLACE TABLE ERROR_MAPPING_SMALL_CHUNKS (
    CHUNK_ID INT NOT NULL,
    ERROR_ID INT NOT NULL,
    PRIMARY KEY (CHUNK_ID, ERROR_ID),
    FOREIGN KEY (CHUNK_ID) REFERENCES CHUNKS_SMALL(CHUNK_ID),
    FOREIGN KEY (ERROR_ID) REFERENCES ERROR_CODES(ERROR_ID)
);
"""
cursor.execute(create_table_sql)

success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df= ERROR_CODE_MAPPING,
    database = database,
    table_name="ERROR_MAPPING_SMALL_CHUNKS",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)

print(f"Success: {success}, Chunks: {nchunks}, Rows: {nrows}")

Success: True, Chunks: 1, Rows: 164


## Function for getting relevant chunks for error codes based on query 

In [4]:
def get_text_chunks_for_error_code(chunks_df, error_codes_df, mapping_df, prompt_error_codes_df):
    """
    Returns pandas table for text chunks that are associated with a specific error code.
    
    Parameters:
    -----------
    chunks_df : pandas.DataFrame
        DataFrame containing text chunks with 'CHUNK_ID' and 'CHUNK_TEXT' columns
    error_codes_df : pandas.DataFrame
        DataFrame containing error codes with 'ERROR_CODES' column
    mapping_df : pandas.DataFrame
        DataFrame mapping chunk IDs to error IDs with 'CHUNK_ID' and 'ERROR_ID' columns
    prompt_error_codes_df : pandas.DataFrame
        The error code to search for
        
    Returns:
    --------
    pandas.DataFrame
        Filtered DataFrame containing only text chunks associated with the error code
    """
    matching_errors = error_codes_df[error_codes_df['ERROR_CODES'].isin(prompt_error_codes_df['ERROR_CODES'])]

    # print(matching_errors)
    
    if matching_errors.empty:
        return pd.DataFrame()  # Return empty DataFrame if no matching error codes
    
    # Get the error IDs
    error_ids = matching_errors.index.tolist()
    
    # Step 2: Find chunk IDs associated with these error IDs
    matching_chunks = mapping_df[mapping_df['ERROR_ID'].isin(error_ids)]
    
    if matching_chunks.empty:
        return pd.DataFrame()  # Return empty DataFrame if no mappings found
    
    # Get the chunk IDs
    chunk_ids = matching_chunks['CHUNK_ID'].tolist()
    
    # Step 3: Return the text chunks with these IDs
    result_chunks = chunks_df[chunks_df['CHUNK_ID'].isin(chunk_ids)]
    
    return result_chunks

In [19]:
# error_codes = extract_error_codes("The error is E:30/-10 and E:17")
# error_list = error_codes["ERROR_CODES"].tolist()
# print(error_list)

## Create search service for error codes

In [20]:
create_error_code_search = """
create or replace CORTEX SEARCH SERVICE ERROR_SEARCH_SERVICE
ON CHUNK_TEXT
ATTRIBUTES CHUNK_ID
warehouse = COMPUTE_WH
TARGET_LAG = '1 minute'
as (
    select CHUNK_ID,
        CHUNK_TEXT,
        PAGE_START_NUMBER,
        PAGE_END_NUMBER,
        DOCUMENT_ID
    from CHUNKS_SMALL
);
"""
cursor.execute(create_error_code_search)   

<snowflake.connector.cursor.SnowflakeCursor at 0x199c0cbbaa0>

## Create chunk retrieval snowflake function

In [26]:
chunk_retrieval_function= """
CREATE OR REPLACE FUNCTION get_chunks_for_error_codes(error_codes_str STRING)
RETURNS TABLE (
  chunk_id NUMBER,       -- Match the actual type
  chunk_text STRING,      -- Or VARCHAR if you prefer
  Embedding VECTOR(FLOAT, 1024)
)
AS
$$
  SELECT tc.chunk_id, tc.chunk_text, tc.EMBEDDING
  FROM WASHING_MACHINE_MANUALS.PUBLIC.CHUNKS_SMALL tc
  JOIN (
    SELECT DISTINCT ecm.chunk_id
    FROM WASHING_MACHINE_MANUALS.PUBLIC.ERROR_MAPPING_SMALL_CHUNKS ecm
    JOIN (
      SELECT ec.error_id
      FROM WASHING_MACHINE_MANUALS.PUBLIC.ERROR_CODES ec
      JOIN TABLE(SPLIT_TO_TABLE(error_codes_str, ',')) ic
        ON ec.error_codes = ic.value
    ) matched_error_ids
    ON ecm.error_id = matched_error_ids.error_id
  ) matched_chunks
  ON tc.chunk_id = matched_chunks.chunk_id
$$;
"""
cursor.execute(chunk_retrieval_function)

<snowflake.connector.cursor.SnowflakeCursor at 0x199c0cbbaa0>

## Getting relevant chunks from snowflake

In [5]:
# from snowflake.core import Root
# session = get_active_session()
root = Root(session)
# svc = KnowledgeBase(session, "WASHING_MACHINE_MANUALS.PUBLIC.KB_NAME")
# svc = KnowledgeBase(session, "WASHING_MACHINE_MANUALS.PUBLIC.CHUNKS_SMALL")
svc = root.databases[database].schemas[schema].cortex_search_services["ERROR_SEARCH_SERVICE"]

query = "The error is E03408 and E:17"
def get_similar_chunks_search_service(query):

    NUM_CHUNKS = 10
    COLUMNS = ["CHUNK_TEXT", "CHUNK_ID"]

    error_codes = extract_error_codes(query)
    error_list = error_codes["ERROR_CODES"].tolist()

    # print("ERROR LIST: ", error_list)
    try:
        error_code_str = ",".join(error_list)
        query_str = f"SELECT chunk_id FROM TABLE(get_chunks_for_error_codes('{error_code_str}'))"
        cursor.execute(query_str)
        chunk_ids = [row[0] for row in cursor.fetchall()]
        
    finally:
        print(f"Success!")
        # cursor.close()
        # conn.close()
    # print("CHUNK IDS: ", chunk_ids, "\n")
    if not chunk_ids:
        print("No matching chunks found for those error codes.")
        # return {}

    # Step 3: Create vector search filter using chunk_ids
    filter_obj = {
        "@or": [{"@eq": {"CHUNK_ID": chunk_id}} for chunk_id in chunk_ids]
        }

    # print("\nFILTER OBJ: ", filter_obj)

    # Step 4: Perform similarity search using filter to retrieve relevant chunks
    response = svc.search(query, COLUMNS, filter=filter_obj, limit=NUM_CHUNKS)
    
    prompt_context = response.json()
    return response.json()

In [44]:
json_object = json.loads(str(response.json()))
json_formatted_str = json.dumps(json_object, indent=3)
print(json_formatted_str)

{
   "results": [
      {
         "CHUNK_TEXT": "laundry and \npush it closed again.\n\u25a0If necessary, switch the appliance off and on again; set the programme and make your individual settings; start the programme.\nE:17 \u25a0Turn the water tap on fully,\n\u25a0The supply hose is kinked/trapped,\n\u25a0The water pressure is too low.\u2013 Clean the filter in the water supply. ~ Page 26\n\u2013 If an additional water filter has been installed (depending on model) \n~ Page 35 , check for contaminants and clean if required ~ separate \ninstallation and cleaning instruction for water filter.\nE:18 \u25a0The detergent solution pump is blocked. Clean the detergent solution pump. ~ Page 25\n\u25a0The drain hose/waste pipe is blocked. Clean the drain hose at the siphon. \n~ Page 25\nE:23 Water in the base tub, appliance leaking. Turn off the water tap. Call the after-sales \nservice. ~ Page 31\n D The childproof lock is activated; to deactivate: ~ Page 20\n N  flashes Too much foam detec

C:\Users\jeed\AppData\Local\Temp\ipykernel_861220\3946670438.py:1: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  json_object = json.loads(str(response.json()))


In [41]:
result_df = get_text_chunks_for_error_code(small_chunks_df, error_codes_df, ERROR_CODE_MAPPING, extract_error_codes("The error is E03408 and E:17"))
result_df

Unnamed: 0,DOCUMENT_ID,PAGE_START_NUMBER,PAGE_END_NUMBER,CHUNK_TEXT,CHUNK_ORDER,CHUNK_ID
320,3,27,27,laundry and \npush it closed again.\n■If neces...,46,320
1066,12,0,1,Register yournew device on\nMyBosch now and\ng...,0,1066


In [None]:
cursor.close()
conn.close()
session.close()

### Create function for prompting an LLM with snowflake cortex

In [6]:
def create_prompt(myquestion):

    prompt_context = get_similar_chunks_search_service(myquestion)

    prompt = f"""
        You are an Washing machine expert assistance that extracs information from the CONTEXT provided
        between <context> and </context> tags.
        When ansering the question contained between <question> and </question> tags
        be concise and do not hallucinate. Please convert the answer to a simple step-by-step guide.
        If you don't have the information just say so.
        Only anwer the question if you can extract it from the CONTEXT provideed.
        
        Do not mention the CONTEXT used in your answer.

        <context>          
        {prompt_context}
        </context>
        <question>  
        {myquestion}
        </question>
        Answer: 
        """

    json_data = json.loads(prompt_context)

    # document_ids = set(item['DOCUMENT_ID'] for item in json_data['results'])
    return prompt
        
def complete(myquestion):

    prompt = create_prompt(myquestion)
    cmd = """
            select snowflake.cortex.complete(?, ?) as response
          """
    
    response = session.sql(cmd, params=["mistral-large2", prompt]).collect()
    response_text = response[0]['RESPONSE']
    # print("DF RESPONSE OBJECT: ", df_response)
    return response_text



In [68]:
result = complete("What sould I do in order to fix the errors E:60 and E:17")
print(result)

Success!


C:\Users\jeed\AppData\Local\Temp\ipykernel_861220\1635895992.py:43: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  prompt_context = response.json()
C:\Users\jeed\AppData\Local\Temp\ipykernel_861220\1635895992.py:44: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return response.json()


 To fix the errors E:60 and E:17, follow these steps:

**For E:60:**
1. Redistribute the laundry in the drum.

**For E:17:**
1. Turn the water tap on fully.
2. Check if the supply hose is kinked or trapped.
3. Ensure the water pressure is sufficient.
4. Clean the filter in the water supply (refer to Page 26).
5. If an additional water filter has been installed, check for contaminants and clean if required (refer to separate installation and cleaning instructions for the water filter).


## Creating sections table using LLM for TOC extraction

The function `extract_TOC` takes quite a while due to the chunk size and the model. This can be tampered with, but i found most consistent results with said model. I also think that larger chunks are better for this task, as the model can see context of the first few pages, and it also ensures that the table of contents is included in the first chunk.

In [None]:
def extract_TOC(text: str, model : str) -> str:
    prompt = (
    """
    I will provide a long string of text that most likely contains a table of contents, 
    although it may also include additional body text from a document. Your task is to carefully 
    extract only the table of contents and structure it as a JSON object in the following 
    format:
    {
      "Section": "<section name>",
      "Section Number": "<section name>",
      "Page": <page number>,
      "Sub Sections" : [{
        "Section": "<section name>",
        "Section Number": "<section name>",
        "Page": <page number>,
        "Sub Sections" : []}
      ],
    }    

    Guideines:
    - All keys in the json object must be either "Section", "Section Number", "Page", "Sub Sections".
    - "Section Number" must be represented as an integer or float - E.G: 1, 2, 5.3, 1,4, etc.
    - Ignore any text that is not part of the table of contents.
    - Ensure that sub-sections are nested appropriately under their parent section.
    - Page numbers should be extracted as integers, if possible.
    - Be tolerant of inconsistencies in formatting, spacing, or punctuation (e.g. dashes, colons, ellipses).
    - Do not include duplicate or repeated sections.
    - You should only consider items which are part of the table of contents, nothing before, nothing after.
    - "Section" must consist of words
    - "Section Number" must be represented as an integer or float - E.G: 1, 2, 5.3, 1,4, etc.
    - You must include a top level key value pair called "Section":"Table of contents".

    """
    f"Text:\n{text}"
    )
    start_time = time.time()
    result = cursor.execute(f"""
        SELECT SNOWFLAKE.CORTEX.COMPLETE('{model}', $$ {prompt} $$)
    """)
    print(f"Runtime in seconds: {time.time() - start_time:.4f}")

    return cursor.fetch_pandas_all().iloc[0,0]


# This example prints out section 4 of the first document of the database. mistral-large2 mistral-7b
# llm_output = extract_TOC(df_large_chunks.loc[0,"CHUNK"], model = 'mistral-7b')

# llm_output = extract_TOC(large_chunks_df.loc[0,"CHUNK_TEXT"], model = 'llama3.1-70b')
# llm_output

In [None]:
def extract_json_from_llm_output(llm_output: str) -> dict:
    try:
        # Confirming that a JSON block is returned
        match = re.search(r"```\s*(\{.*?\})\s*```", llm_output, re.DOTALL)
        if not match:
            raise ValueError("No JSON code block found in the text.")

        # Extracting sub string (json string)
        raw_json = match.group(1)

        # Clean common JSON errors (e.g., trailing commas)
        cleaned_json = re.sub(r",\s*([\]}])", r"\1", raw_json)  # remove trailing commas before ] or }
        
        # Parse string to json
        parsed = json.loads(cleaned_json)
        return parsed
    
    except Exception as e:
        print("Failed to extract JSON:", e)
        return {}

        
# parsed_dict = extract_json_from_llm_output(llm_output)
# print(json.dumps(parsed_dict, indent=2))

In [None]:
def traverse_sections(node, parent_section=None):
    rows = []

    # Get info from the current node
    section = node.get("Section")
    section_number = node.get("Section Number")
    page = node.get("Page")

    # Add current node to the list
    evaluator = load_evaluator("string_distance")
    levenshtein_score_toc = evaluator.evaluate_strings(
    prediction=section,
    reference="Table of Contents",
    metric="levenshtein"
    )["score"]  # This will be a float between 0 and 1, where 0 means identical

    if levenshtein_score_toc > 0.1:  # if the levenshtein distance is very small its likely to match "Table of Contents"
        rows.append({
            "SECTION": section,
            "SECTION_NUMBER": section_number,
            "PAGE": page,
            "PARENT_SECTION_NUMBER": parent_section
        })

    # Recurse into each sub-section, if any
    for subsection in node.get("Sub Sections", []):
        rows.extend(traverse_sections(subsection, parent_section=section_number))

    return rows

# flat_rows = traverse_sections(parsed_dict)
# toc_df = pd.DataFrame(flat_rows)
# toc_df.head(10)


In [None]:
def create_TOC_table(documents_df, large_chunks_df, model ="llama3.1-70b"):
    df_list = []

    for row in tqdm(documents_df.iterrows(), total = len(documents_df)):
        manual_id = row[1]["DOCUMENT_ID"]
        file_path = os.path.join(pdf_files_path, row[1]["DOCUMENT_NAME"])
        first_chunk_of_doc = large_chunks_df.loc[large_chunks_df["DOCUMENT_ID"] == manual_id, "CHUNK_TEXT"].iloc[0]
        # print("First chunk:", first_chunk_of_doc)

        llm_output = extract_TOC(first_chunk_of_doc, model = model)
        parsed_dict = extract_json_from_llm_output(llm_output)
        flat_rows = traverse_sections(parsed_dict)
        local_toc_df = pd.DataFrame(flat_rows)
        local_toc_df["DOCUMENT_ID"] = manual_id
        df_list.append(local_toc_df)

    return pd.concat(df_list, ignore_index=True)
     
sections_df = create_TOC_table(documents_df, large_chunks_df, model ="llama3.1-70b")
sections_df

  0%|          | 0/3 [01:27<?, ?it/s]

Runtime in seconds: 87.0377





ImportError: Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator.Please install it with `pip install rapidfuzz`.

In [None]:
cursor.execute("""
    CREATE OR REPLACE TABLE SECTIONS (
    SECTION_ID INT AUTOINCREMENT PRIMARY KEY,
    DOCUMENT_ID INT NOT NULL,
    SECTION STRING NOT NULL,
    SECTION_NUMBER STRING NOT NULL,
    PARENT_SECTION_NUMBER STRING,
    PAGE INT,
    CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),
    CONSTRAINT fk_document
        FOREIGN KEY (DOCUMENT_ID)
        REFERENCES DOCUMENTS(DOCUMENT_ID)
);
""")


success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df=sections_df,
    database =database,
    table_name="SECTIONS",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)
print(f"Success: {success}, Chunks: {nchunks}, Rows: {nrows}")


Success: True, Chunks: 1, Rows: 221


In [None]:
# Lets see the table
cursor.execute("""
    SELECT * 
    FROM CHUNKS_LARGE;
""")

large_chunks_df = cursor.fetch_pandas_all()
large_chunks_df.head()


Unnamed: 0,CHUNK_ID,DOCUMENT_ID,PAGE_START_NUMBER,PAGE_END_NUMBER,CHUNK_ORDER,CHUNK_TEXT,EMBEDDING,CREATED_AT
0,1,1,0,3,0,Register your b M o ge s y n c t B e h f o r w...,"[0.048339844, 0.0158844, -0.015388489, -0.0038...",2025-04-22 01:53:38.352000-07:00
1,2,1,3,7,1,t. ¡ Up to an altitude of max. 4000 m above se...,"[0.038604736, 0.08459473, -0.023513794, -0.014...",2025-04-22 01:53:38.352000-07:00
2,3,1,7,10,2,on or lean against the appliance door. ▶ Do no...,"[0.074279785, 0.068359375, -0.0036392212, 0.02...",2025-04-22 01:53:38.352000-07:00
3,4,1,10,15,3,"ese instructions, your creases energy and wate...","[0.008338928, 0.00027441978, 0.044067383, 0.00...",2025-04-22 01:53:38.352000-07:00
4,5,1,15,22,4,ht. Water outlet connection types 4.6 Aligning...,"[0.052764893, 0.074157715, -0.033294678, 0.011...",2025-04-22 01:53:38.352000-07:00


In [None]:
# Lets see the table
cursor.execute("""
    SELECT * 
    FROM CHUNKS_SMALL;
""")

small_chunks_df = cursor.fetch_pandas_all()
small_chunks_df.head()


Unnamed: 0,CHUNK_ID,DOCUMENT_ID,PAGE_START_NUMBER,PAGE_END_NUMBER,CHUNK_ORDER,CHUNK_TEXT,EMBEDDING,CREATED_AT
0,1,1,0,1,0,Register your b M o ge s y n c t B e h f o r w...,"[0.032073975, 0.05307007, -0.021911621, -0.006...",2025-04-22 01:54:15.568000-07:00
1,2,1,1,1,1,..... 29 2Preventing material damage.... 10 13...,"[0.018035889, 0.035339355, -0.0013151169, 0.00...",2025-04-22 01:54:15.568000-07:00
2,3,1,1,2,2,moving the transit bolts...... 13 13.10 Cancel...,"[0.086364746, 0.014724731, 0.004234314, -0.003...",2025-04-22 01:54:15.568000-07:00
3,4,1,1,3,3,......... 20 2 en 15 Basic settings..............,"[0.029403687, -0.017074585, -0.033416748, 0.00...",2025-04-22 01:54:15.568000-07:00
4,5,1,3,3,4,e the following safety instructions. 1.1 Gener...,"[0.050964355, 0.06311035, -0.032409668, 0.0217...",2025-04-22 01:54:15.568000-07:00


In [None]:
# Lets see the table
cursor.execute("""
    SELECT * 
    FROM SECTIONS;
""")

sections_df = cursor.fetch_pandas_all()
sections_df.head()

sections_df


Unnamed: 0,SECTION_ID,DOCUMENT_ID,SECTION,SECTION_NUMBER,PARENT_SECTION_NUMBER,PAGE,CREATED_AT
0,1,1,Safety,1,,4,2025-04-22 03:14:00.711000-07:00
1,2,1,General information,1.1,1,4,2025-04-22 03:14:00.711000-07:00
2,3,1,Intended use,1.2,1,4,2025-04-22 03:14:00.711000-07:00
3,4,1,Restriction on user group,1.3,1,4,2025-04-22 03:14:00.711000-07:00
4,5,1,Safe installation,1.4,1,5,2025-04-22 03:14:00.711000-07:00
...,...,...,...,...,...,...,...
216,217,3,Disposing of old appliance,18.4,18,47,2025-04-22 03:14:00.711000-07:00
217,218,3,Customer Service,19,,47,2025-04-22 03:14:00.711000-07:00
218,219,3,Product number (E-Nr.) and production number (FD),19.1,19,47,2025-04-22 03:14:00.711000-07:00
219,220,3,Consumption values,20,,48,2025-04-22 03:14:00.711000-07:00


# Extracting images from the manual

This chosen method which appears to be more diverse across the manuals treats each page as an image. This is a good way to ensure that all images are extracted. 
The downside is that tables and other image like content will be extracted as images. Currently this is a feature not a bug. Adjusting the image extraction method is a task for the future when we have the real PDFs.

In [None]:
def render_pdf_to_images(pdf_path, zoom=2.0):
    doc = fitz.open(pdf_path)
    images = []
    for i, page in enumerate(doc):
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        img_data = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        images.append({
            "page_number": i + 1,
            "image": img_data
        })
    return images


def get_pdf_page_pixel_size(pdf_image):
    width, height = pdf_image.size
    return width * height


def detect_image_regions(page_image, buffer=0, min_size=70, max_size = 1000, threshold=240):
    image = np.array(page_image)
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Applying blur to reduce fine lines from tables
    _, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    regions = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > min_size and h > min_size:  # Skip tiny blocks (Maybe reconsider)
            regions.append([x - buffer, 
                            y - buffer, 
                            x + w + buffer, 
                            y + h + buffer])
            if w * h > max_size:
                regions.pop(-1)  
    return regions


def crop_regions_from_image(page_image, regions, output_dir, page_num, manual_id):
    os.makedirs(output_dir, exist_ok=True)
    saved_images = []

    for i, coords in enumerate(regions):
        x1, y1, x2, y2 = map(int, coords)
        cropped = page_image.crop((x1, y1, x2, y2))
        save_path = os.path.join(output_dir, f"doc_{manual_id}_page_{page_num}_img_{i+1}.png")
        cropped.save(save_path)
        saved_images.append({
            "page": page_num,
            "image_path": save_path,
            "coords": (x1, y1, x2, y2)
        })
    return saved_images



def add_region_to_page(page_image, regions, output_dir, page_num, pdf_path ,color=(0, 255, 0), alpha=50, save=True, verbose=0):
    os.makedirs(output_dir, exist_ok=True)

    # Convert to RGBA to allow transparency
    annotated = page_image.convert("RGBA")
    overlay = Image.new("RGBA", annotated.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(overlay)

    for coords in regions:
        x1, y1, x2, y2 = map(int, coords)
        draw.rectangle([x1, y1, x2, y2], outline=color + (alpha,), fill=color + (alpha,))

    # Combine original image with overlay
    combined = Image.alpha_composite(annotated, overlay)

    if save:
        save_path = os.path.join(output_dir, f"page_{page_num:03d}_with_regions_{color}.png")
        combined.convert("RGB").save(save_path)
        if verbose > 0:
            print(f"Saved page {page_num} with highlighted regions to {save_path}")

    return combined


def merge_overlapping_regions(regions, buffer=0):
    """
    Merges overlapping or intersecting regions.

    Args:
        regions (List[List[int]]): List of regions as [x1, y1, x2, y2].
        buffer (int): Optional buffer added to each region before checking overlaps.

    Returns:
        List[List[int]]: Merged list of non-overlapping regions.
    """
    from shapely.geometry import box
    from shapely.ops import unary_union

    # Convert to shapely boxes with optional buffer
    boxes = [box(x1 - buffer, y1 - buffer, x2 + buffer, y2 + buffer) for x1, y1, x2, y2 in regions]

    # Merge all overlapping boxes (A fix to a previous issues of diagrams being cropped into multiple images)
    merged = unary_union(boxes)

    # Ensure output is a list of boxes
    if merged.geom_type == 'Polygon':
        merged_boxes = [merged]
    else:
        merged_boxes = list(merged.geoms)

    # Convert back to [x1, y1, x2, y2] format (round to int)
    merged_regions = []
    for b in merged_boxes:
        x1, y1, x2, y2 = b.bounds
        merged_regions.append([int(x1), int(y1), int(x2), int(y2)])

    return merged_regions



# This is the main function to extract images from the PDF
def extract_images_from_pdf(pdf_path:str, manual_id:int, output_dir: str, verbose:int =0):
    rendered_pages = render_pdf_to_images(pdf_path)
    all_extracted = []

    for page_idx,page in enumerate(rendered_pages):
        page_num = page["page_number"] 
        image = page["image"]
        if verbose > 0:
            print(f"Processing page {page_num}...")

        # Detecting regions
        regions = detect_image_regions(image , buffer=2, min_size=70, 
                                        max_size=get_pdf_page_pixel_size(image) * 0.99)
        # Creates new regions by merging overlapping regions (this is a fix for cropped images  )
        new_regions = merge_overlapping_regions(regions, buffer=0)

        if verbose > 0:
            print(f"Found {len(new_regions)} image regions on page {page_num}")

        if not new_regions:
            if verbose > 0:
                print(f"No image regions found on page {page_num}")
            continue
        
        # Creates an image directory for each PDF file
        image_output_dir = pdf_path.split("/")[-1].replace(".pdf", "").replace("Washer_Manuals", output_dir)
        os.makedirs(image_output_dir, exist_ok=True)

        # Showing the pages with the masked regions 
        modified_image = add_region_to_page(image, new_regions, image_output_dir, page_num, pdf_path, color=(0, 0, 255), alpha=50, save = False)

        # OLD code 
        extracted = crop_regions_from_image(
            image, new_regions, output_dir=image_output_dir, page_num=page_num, manual_id=manual_id
        )
        all_extracted.extend(extracted)
    return all_extracted


for idx,row in tqdm(enumerate(documents_df.iterrows()), total = len(documents_df)):
    manual_id = row[1]["DOCUMENT_ID"]
    file_path = os.path.join(pdf_files_path, row[1]["DOCUMENT_NAME"])
    extract_images_from_pdf(file_path, manual_id, output_dir="Washer_Images", verbose = 0)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:06<00:00,  2.33s/it]


# Creating table for image references and metadata

Currently the images are matched to the sections using the page number, which is problematic if the end of section 4.3 is one the same page as the start of section 4.4. On the top of my head i'm not quite sure how to match the images to the sections accurately, but this method yields mostly correct results.

In [None]:
def extract_page_number_from_filename(filename):
    return filename.split("_")[3] if "_" in filename else None

def generate_image_table(documents_df, sections_df, image_dir):
    image_records = []

    # Loop over all subdirectories in image_dir
    for subfolder in os.listdir(image_dir):
        subfolder_path = os.path.join(image_dir, subfolder)
        
        if not os.path.isdir(subfolder_path):
            continue  # skip files
        
        # Match to document by DOCUMENT_NAME (strip extension if needed)
        matching_docs = documents_df[documents_df['DOCUMENT_NAME'].str.contains(subfolder, case=False)]
        if matching_docs.empty:
            print(f"No matching document for subfolder: {subfolder}")
            continue
        
        document_id = matching_docs.iloc[0]['DOCUMENT_ID']
        document_name = matching_docs.iloc[0]['DOCUMENT_NAME']
        
        # List all image files in subdirectory
        for image_file in os.listdir(subfolder_path):
            if not image_file.lower().endswith((".png")):
                continue
            
            image_path = os.path.join(subfolder_path, image_file)
            page_number = extract_page_number_from_filename(image_file)
            order_number = image_file.split("img_")[-1].strip(".png")

            image_size = os.path.getsize(image_path)
            image_width, image_height = Image.open(image_path).size
            
            # Try to match to a section (same document, closest PAGE <= image page)
            section_match = None
            if page_number is not None:
                matching_sections = sections_df[
                    (sections_df['DOCUMENT_ID'] == document_id) & 
                    (sections_df['PAGE'].astype(str) <= str(page_number))
                ]
                if not matching_sections.empty:
                    section_match = matching_sections.sort_values("PAGE", ascending=False).iloc[0]
            
            image_records.append({
                "DOCUMENT_ID": document_id,
                "SECTION_ID": section_match["SECTION_ID"] if section_match is not None else None,
                "SECTION_NUMBER": section_match["SECTION_NUMBER"] if section_match is not None else None,
                "PAGE": page_number,
                "IMG_ORDER": order_number,
                "IMAGE_FILE": image_file,
                "IMAGE_PATH": image_path,
                "IMAGE_SIZE": image_size,
                "IMAGE_WIDTH": image_width,
                "IMAGE_HEIGHT": image_height
            })

    return pd.DataFrame(image_records)


image_df = generate_image_table(documents_df, sections_df, ".\\Washer_Images")
image_df.head(5)

Unnamed: 0,DOCUMENT_ID,SECTION_ID,SECTION_NUMBER,PAGE,IMG_ORDER,IMAGE_FILE,IMAGE_PATH,IMAGE_SIZE,IMAGE_WIDTH,IMAGE_HEIGHT
0,1,15,4.2,13,1,doc_1_page_13_img_1.png,.\Washer_Images\WAV28KH3GB\doc_1_page_13_img_1...,27070,318,452
1,1,15,4.2,13,2,doc_1_page_13_img_2.png,.\Washer_Images\WAV28KH3GB\doc_1_page_13_img_2...,4549,113,152
2,1,15,4.2,13,3,doc_1_page_13_img_3.png,.\Washer_Images\WAV28KH3GB\doc_1_page_13_img_3...,9425,157,140
3,1,17,4.4,14,1,doc_1_page_14_img_1.png,.\Washer_Images\WAV28KH3GB\doc_1_page_14_img_1...,3428,166,121
4,1,17,4.4,14,2,doc_1_page_14_img_2.png,.\Washer_Images\WAV28KH3GB\doc_1_page_14_img_2...,4973,166,120


In [None]:
cursor.execute("""
    CREATE OR REPLACE TABLE IMAGES (
    IMAGE_ID INT AUTOINCREMENT PRIMARY KEY,
    SECTION_ID INT NOT NULL,
    DOCUMENT_ID INT NOT NULL,
    SECTION_NUMBER STRING NOT NULL,
    PAGE INT,
    IMG_ORDER INT,
    IMAGE_FILE STRING,
    IMAGE_PATH STRING,
    IMAGE_SIZE NUMBER,
    IMAGE_WIDTH NUMBER,
    IMAGE_HEIGHT NUMBER,
    CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),

    CONSTRAINT fk_document
        FOREIGN KEY (DOCUMENT_ID)
        REFERENCES DOCUMENTS(DOCUMENT_ID),
        
    CONSTRAINT fk_section
            FOREIGN KEY (SECTION_ID)
            REFERENCES SECTIONS(SECTION_ID)
);
""")


success, nchunks, nrows, output = write_pandas(
    conn=conn,
    df=image_df,
    database =database,
    table_name="IMAGES",
    schema=schema,
    auto_create_table=False,
    overwrite=False
)
print(f"Success: {success}, Chunks: {nchunks}, Rows: {nrows}")


Success: True, Chunks: 1, Rows: 194


In [None]:
# Lets see the table
cursor.execute("""
    SELECT * 
    FROM IMAGES;
""")

images_df = cursor.fetch_pandas_all()
images_df.head()

images_df.tail()

Unnamed: 0,IMAGE_ID,SECTION_ID,DOCUMENT_ID,SECTION_NUMBER,PAGE,IMG_ORDER,IMAGE_FILE,IMAGE_PATH,IMAGE_SIZE,IMAGE_WIDTH,IMAGE_HEIGHT,CREATED_AT
189,190,210,3,16.3,36,2,doc_3_page_36_img_2.png,.\Washer_Images\WGG254Z0GB\doc_3_page_36_img_2...,28987,328,237,2025-04-22 04:25:31.443000-07:00
190,191,210,3,16.3,36,3,doc_3_page_36_img_3.png,.\Washer_Images\WGG254Z0GB\doc_3_page_36_img_3...,15373,163,237,2025-04-22 04:25:31.443000-07:00
191,192,210,3,16.3,36,4,doc_3_page_36_img_4.png,.\Washer_Images\WGG254Z0GB\doc_3_page_36_img_4...,13176,164,237,2025-04-22 04:25:31.443000-07:00
192,193,216,3,18.3,46,1,doc_3_page_46_img_1.png,.\Washer_Images\WGG254Z0GB\doc_3_page_46_img_1...,16799,328,237,2025-04-22 04:25:31.443000-07:00
193,194,216,3,18.3,46,2,doc_3_page_46_img_2.png,.\Washer_Images\WGG254Z0GB\doc_3_page_46_img_2...,16547,328,237,2025-04-22 04:25:31.443000-07:00
