## This section of the notebook creates the foundation of the database, with text chunks and vector embeddings

In [None]:
-- Make sure the working database exists, then switch into it.
CREATE DATABASE IF NOT EXISTS WASHING_MACHINE_MANUALS;
USE DATABASE WASHING_MACHINE_MANUALS;

-- Same for the schema that holds every object created below.
CREATE SCHEMA IF NOT EXISTS WASHING_MACHINE_MANUALS.PUBLIC;
USE SCHEMA PUBLIC;

In [None]:
-- Internal stage that the PDF documents are uploaded into.
-- IF NOT EXISTS (rather than OR REPLACE) so that re-running the notebook does
-- not drop the stage together with every manually uploaded file.
-- The directory table is enabled because later cells read DIRECTORY(@docs).
CREATE STAGE IF NOT EXISTS docs
    ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE')
    DIRECTORY = (ENABLE = TRUE);

In [None]:
-- Upload the documents to the @docs stage directly before running this cell (DO THIS MANUALLY).
-- Re-sync the stage's directory table so that DIRECTORY(@docs) in the cells
-- below sees the files that were just uploaded.
ALTER STAGE docs REFRESH;

-- Check that the files were uploaded
LS @docs;

In [None]:
-- Catalogue of the staged files: one row per file found in @docs.
CREATE OR REPLACE TABLE DOCUMENTS (
    DOCUMENT_ID   INT AUTOINCREMENT PRIMARY KEY,   -- surrogate key referenced by CHUNKS
    RELATIVE_PATH STRING NOT NULL,                 -- path of the file inside the stage
    FILE_URL      STRING,
    SIZE          NUMBER,
    STAGE_NAME    STRING DEFAULT '@docs',
    CREATED_AT    TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP()
);

-- Populate the catalogue from the stage's directory table; the columns that
-- are not listed here are filled in by their defaults.
INSERT INTO DOCUMENTS (RELATIVE_PATH, FILE_URL, SIZE)
    SELECT RELATIVE_PATH, FILE_URL, SIZE
    FROM DIRECTORY(@docs);

In [None]:
-- Sanity check: show what was registered from the stage.
SELECT * FROM DOCUMENTS;

In [None]:
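-- Optional: scale the warehouse up before the parsing/embedding cell below.
-- ALTER WAREHOUSE COMPUTE_WH SET WAREHOUSE_SIZE = '4X-Large'; -- Did not seem to have any effect on the run time; presumably the Cortex functions are not sped up by a larger warehouse. TODO: ask about this / confirm.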

In [None]:

-- Target table for the text chunks and their vector embeddings.
-- Each row is one chunk of one document (DOCUMENT_ID points back at DOCUMENTS).
CREATE OR REPLACE TABLE CHUNKS (
    CHUNK_ID    INT AUTOINCREMENT PRIMARY KEY,
    DOCUMENT_ID INT NOT NULL,
    CHUNK_INDEX INT,                    -- filled from the FLATTEN index of the splitter output
    CHUNK       STRING NOT NULL,
    EMBEDDING   VECTOR(FLOAT, 768),     -- 768 dimensions, matching EMBED_TEXT_768
    CREATED_AT  TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),
    CONSTRAINT fk_document FOREIGN KEY (DOCUMENT_ID) REFERENCES DOCUMENTS (DOCUMENT_ID)
);


-- Creates a temp table with the parsed text: one row per staged document,
-- holding the document's full raw text as a single (very long) string.
CREATE OR REPLACE TEMP TABLE parsed_text_table AS
SELECT
    relative_path,
    size,
    file_url,
    BUILD_SCOPED_FILE_URL(@docs, relative_path) AS scoped_file_url,
    -- PARSE_DOCUMENT returns an object of the form {"content": ..., "metadata": ...}.
    -- Keep only the text under "content": stringifying the whole object would put
    -- the JSON wrapper and the metadata into every chunk and embedding.
    TO_VARCHAR(
        SNOWFLAKE.CORTEX.PARSE_DOCUMENT(@docs, relative_path, {'mode': 'LAYOUT'}):content
    ) AS full_text
FROM DIRECTORY(@docs);


-- Split each parsed document into overlapping chunks, embed every chunk, and
-- store the result in CHUNKS, linked to its DOCUMENTS row through the file path.
INSERT INTO CHUNKS (DOCUMENT_ID, CHUNK_INDEX, CHUNK, EMBEDDING)
SELECT
    doc.DOCUMENT_ID,
    piece.index         AS CHUNK_INDEX,
    piece.value::STRING AS CHUNK,
    SNOWFLAKE.CORTEX.EMBED_TEXT_768(
        'snowflake-arctic-embed-m-v1.5',
        piece.value::STRING
    )                   AS EMBEDDING
FROM parsed_text_table AS parsed
JOIN DOCUMENTS AS doc
    ON parsed.RELATIVE_PATH = doc.RELATIVE_PATH
JOIN LATERAL FLATTEN(
    INPUT => SNOWFLAKE.CORTEX.SPLIT_TEXT_RECURSIVE_CHARACTER(
        parsed.full_text,
        'none',   -- format; 'markdown' is the alternative
        256,      -- chunk size
        32        -- overlap
    )
) AS piece
-- Documents that produced no text are skipped.
WHERE parsed.full_text IS NOT NULL;

-- Peek at a handful of the freshly inserted rows.
SELECT * FROM CHUNKS LIMIT 10;

In [None]:
-- Full contents of the chunk table (every column, every row).
SELECT *
FROM CHUNKS;

### This section will focus on classifying the sections of the document, using a sequence of LLM functions and logic

In [None]:
import pandas as pd
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col

# Reuse the existing Snowpark session if there is one, otherwise create it.
session = Session.builder.getOrCreate()

# Pull the chunks into pandas in document/reading order. Without an explicit sort
# the row order of the table scan is unspecified, so a later head(N) sample would
# be an arbitrary set of chunks instead of the opening chunks of the documents.
df_chunks = (
    session.table("CHUNKS")
    .sort("DOCUMENT_ID", "CHUNK_INDEX")
    .to_pandas()
)
df_chunks.head()

In [None]:
def _build_toc_prompt(text: str) -> str:
    """Return the yes/no Table-of-Contents classification prompt for ``text``."""
    return (
        "Determine whether the following text is part of a document's Table of Contents (TOC). "
        "Use the following rules to guide your decision:\n\n"
        "Consider the text to be part of the TOC **only if** it satisfies most of the following conditions:\n"
        "- It contains multiple lines or entries of newline characters.\n"
        "- The chunk contains a sequence of numbers (e.g., 1, 1.2, 2.3.4), possibly indicating section or subsection numbers.\n"
        "- This is followed by short, non-sentence fragments (i.e., not full grammatical sentences), typically a title or heading.\n"
        "- Ends with a page number, often preceded by dots or whitespace for alignment (e.g., '..... 12').\n"
        "- The tone is formal and lacks narrative or explanatory text.\n"
        # This bullet used to sit after the blank line and had no trailing newline,
        # which glued it onto the "Do **not** classify" heading.
        "- Sequences of numbers appear in the chunk of text, such as 2.1, 2.2, 2.3, or 5.3, 5.4, 5.5. "
        "These sequences are usually separated by some natural language.\n\n"
        "Do **not** classify the text as TOC if:\n"
        "- It mostly contains complete paragraphs or full sentences.\n"
        "- It does not contain numbered sections or page numbers.\n"
        "- It appears to be body content, such as an introduction, abstract, or explanation.\n\n"
        "Respond strictly with 'Yes' or 'No'.\n\n"
        f"Text:\n{text}"
    )


def classify_toc_chunk(text: str) -> str:
    """Ask the LLM whether a chunk of text belongs to a Table of Contents.

    Args:
        text: The chunk to classify.

    Returns:
        The model's reply with surrounding whitespace removed. The prompt asks
        for 'Yes' or 'No', but the reply is free text and is returned as-is.
        An empty string is returned if the query yields no (or a NULL) answer.
    """
    prompt = _build_toc_prompt(text)

    # Pass the prompt as a bind variable instead of splicing it into the SQL text:
    # a chunk that happens to contain "$$" would otherwise terminate the
    # dollar-quoted literal and break (or alter) the statement.
    rows = session.sql(
        "SELECT SNOWFLAKE.CORTEX.COMPLETE('snowflake-arctic', ?)",
        params=[prompt],
    ).collect()

    answer = rows[0][0] if rows else None
    return answer.strip() if answer is not None else ""

# Classify only a small sample for now (or filter by a specific DOCUMENT_ID):
# every row costs one LLM call.
# .copy() makes the sample an independent frame, so adding the label column
# below does not write into a slice of df_chunks (SettingWithCopyWarning).
df_chunks_sample = df_chunks.head(50).copy()
toc_labels = df_chunks_sample["CHUNK"].apply(classify_toc_chunk)
df_chunks_sample["TOC_LABEL"] = toc_labels
df_chunks_sample

In [None]:

# Keep only the sampled chunks that the model labelled as Table of Contents.
# The reply is free text, so match a case-insensitive leading "yes" rather than
# requiring the exact string 'Yes'.
is_toc_chunk = (
    df_chunks_sample["TOC_LABEL"].str.strip().str.lower().str.startswith("yes")
)
# reset_index so that label-based lookups such as .loc[0, "CHUNK"] hit the first TOC chunk.
df_toc_chunks = df_chunks_sample[is_toc_chunk].reset_index(drop=True)
df_toc_chunks.head()

In [None]:
def extract_section_names(text: str) -> list:
    """Ask Cortex EXTRACT_ANSWER for the section numbers and titles in a TOC chunk.

    Args:
        text: A chunk that is completely or partially part of a Table of
            Contents. Leading and trailing whitespace is stripped before it is
            sent to the model.

    Returns:
        The raw rows returned by ``collect()`` for the EXTRACT_ANSWER query.
        The answer is not parsed into Python objects here.
    """
    prompt = (
        """
        The following text is either completely or partially part of a Table of Contents. 
        Extract each section and subsection into a clean list of tuples with section number and section titles mentioned in the text. 
        For an example. the input text 'Further information and explanations are available online:\n  \nTable of contents\n|Safety \
        ..|4|Buttons|22|\n| :---: | :---: | :---: | :---: |\n|1.1 General information..|4|||\n|1.2 \
        Intended use....|4|Programmes...|24|\n|1.3 Restriction on user'
        Should return a list in the structure of: 
        [(1.), ()]
        Return the list as specified.\n\n
        """
    )

    # Bind both values instead of splicing them into the SQL text: the chunk used
    # to be wrapped in single quotes, so any apostrophe in it broke the statement.
    result = session.sql(
        "SELECT SNOWFLAKE.CORTEX.EXTRACT_ANSWER(?, ?)",
        params=[text.strip(), prompt],
    ).collect()

    # TODO: parse the answer into a Python list once the output format is settled.
    return result

# Try the extraction on the TOC chunk at index label 0 first.
first_toc_chunk = df_toc_chunks.loc[0, "CHUNK"]
extract_section_names(first_toc_chunk)

# Not enabled yet: run the extraction over every TOC chunk.
# section_lists = df_toc_chunks["CHUNK"].apply(extract_section_names)
# section_lists

