# Washing Machine Manual Notebook


### Step 1: Extract Text from PDFs Using Cortex

In [None]:
-- Step 1a: Declare the table that stores the text extracted from each manual.
-- Columns: the source file name, a page number, and the extracted text.
-- Note: the Step 1b cell runs CREATE OR REPLACE TABLE ... AS SELECT on this same
-- name, so this definition is superseded as soon as that cell executes.
CREATE OR REPLACE TABLE manuals_raw (
    file_name   VARCHAR,
    page_number INTEGER,
    full_text   VARCHAR
);


In [None]:
-- Step 1b: Run Cortex PARSE_DOCUMENT in LAYOUT mode on every file listed in the
-- directory table of stage @docs, and store the result serialized as text
-- (one output row per directory entry).
CREATE OR REPLACE TABLE manuals_raw AS
SELECT
    relative_path AS file_name,
    0 AS page_number,  -- placeholder value; no per-page split is performed here
    CAST(
        SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
            @docs,
            relative_path,
            {'mode': 'LAYOUT'}
        ) AS VARCHAR
    ) AS full_text
FROM DIRECTORY(@docs);


In [None]:
-- Step 1c: Extract the "content" field from the serialized parse result.
-- full_text holds JSON as text, so it is parsed back into a VARIANT before the
-- field is read and cast to a string.
CREATE OR REPLACE TABLE manuals_cleaned AS
SELECT
    r.file_name,
    CAST(PARSE_JSON(r.full_text)['content'] AS VARCHAR) AS content_text
FROM manuals_raw AS r;


### Step 2: Split Text into Chunks Using Cortex

In [None]:
-- Step 2: Split each manual's text into overlapping chunks, one row per chunk.
-- SPLIT_TEXT_RECURSIVE_CHARACTER is called in 'markdown' mode with a chunk size of
-- 3000 and an overlap of 200; FLATTEN expands the returned array into rows.
--
-- chunk_index is taken from FLATTEN's INDEX column (the 0-based position of the
-- element in the array), so numbering follows the order in which the splitter
-- emitted the chunks. Ordering by SEQ4() does not tie the number to that position.
-- Numbering restarts for every manuals_cleaned row; this matches per-file numbering
-- as long as each file_name appears once in manuals_cleaned.
--
-- NOTE(review): the embedding model applied to these chunks later may accept fewer
-- tokens than a 3000-character chunk contains — confirm against the model's limit.
CREATE OR REPLACE TABLE manuals_chunks AS
SELECT
    m.file_name,
    f.index AS chunk_index,
    f.value::STRING AS chunk_text
FROM manuals_cleaned AS m,
    LATERAL FLATTEN(
        INPUT => SNOWFLAKE.CORTEX.SPLIT_TEXT_RECURSIVE_CHARACTER(
            m.content_text,
            'markdown',
            3000,
            200
        )
    ) AS f;


In [None]:
-- Preview a few chunks. Columns are listed explicitly and the order is pinned so
-- the same five rows come back on every run.
SELECT
    file_name,
    chunk_index,
    chunk_text
FROM manuals_chunks
ORDER BY file_name, chunk_index
LIMIT 5;

### Step 3: Embed Each Chunk with Cortex

In [None]:
-- Step 3: Compute an embedding for every chunk with Cortex EMBED_TEXT_768 and
-- store it next to the chunk's file name, index and text.
CREATE OR REPLACE TABLE manuals_embeddings AS
SELECT
    c.file_name,
    c.chunk_index,
    c.chunk_text,
    SNOWFLAKE.CORTEX.EMBED_TEXT_768(
        'snowflake-arctic-embed-m-v1.5',  -- embedding model name
        c.chunk_text                      -- text to embed
    ) AS embedding
FROM manuals_chunks AS c;



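-- Embedding models that can be passed to EMBED_TEXT_768
-- (use the same model for the chunks and for the search query):
--   snowflake-arctic-embed-m-v1.5  (the one used above)
--   snowflake-arctic-embed-m
--   e5-base-v2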


In [None]:
-- Spot-check the embeddings table: show the first 200 characters of each chunk
-- next to its embedding for five rows.
SELECT
    e.file_name,
    e.chunk_index,
    SUBSTR(e.chunk_text, 1, 200) AS chunk_preview,
    e.embedding
FROM manuals_embeddings AS e
LIMIT 5;


### Step 4: Semantic Search (Build Search + Ask Questions)

In [None]:
-- Find the best-matching chunks for a natural-language question.
-- The question is embedded once, with the same model name used for the stored
-- chunk embeddings; every chunk is then scored by cosine similarity and the
-- five highest-scoring chunks are returned.
WITH question AS (
    SELECT
        SNOWFLAKE.CORTEX.EMBED_TEXT_768(
            'snowflake-arctic-embed-m-v1.5',
            'The washing machine is not draining. What should I do?'
        ) AS question_vec
)
SELECT
    e.file_name,
    e.chunk_index,
    e.chunk_text,
    VECTOR_COSINE_SIMILARITY(e.embedding, q.question_vec) AS similarity
FROM manuals_embeddings AS e
CROSS JOIN question AS q  -- single-row CTE: attaches the question vector to every chunk
ORDER BY similarity DESC
LIMIT 5;


### Step 5: Use CORTEX.COMPLETE() to Generate Answers from Retrieved Chunks

In [None]:
-- Ask the LLM to return the "Emergency release" section of each manual.
--   base_prompt  : per file, the lowest-indexed chunk whose text contains
--                  'emergency release' (case-insensitive match).
--   full_prompt  : the instruction text with that chunk appended.
--   raw_answers  : CORTEX.COMPLETE ('mistral-7b') called with a single user message.
--   final SELECT : reads choices[0]:messages from the response and turns literal
--                  backslash-n sequences into line breaks.
-- Files with no matching chunk produce no output row.
WITH base_prompt AS (
  SELECT 
    file_name,
    chunk_text
  FROM manuals_chunks
  WHERE LOWER(chunk_text) LIKE '%emergency release%'
  QUALIFY ROW_NUMBER() OVER (PARTITION BY file_name ORDER BY chunk_index) = 1
),

full_prompt AS (
  SELECT
    file_name,
    -- The column is named prompt_text so it no longer shares the CTE's name.
    'Search the following manual content and return the full section titled "Emergency release", including any relevant sub-sections or instructions. If the section is not present in this text, say "Not found." 

    Manual text:
    ' || chunk_text AS prompt_text
  FROM base_prompt
),

raw_answers AS (
  SELECT 
    file_name,
    SNOWFLAKE.CORTEX.COMPLETE(
      'mistral-7b',
      ARRAY_CONSTRUCT(
        OBJECT_CONSTRUCT('role', 'user', 'content', prompt_text)
      ),
      -- Option keys use the lowercase names documented for COMPLETE; object keys
      -- are case-sensitive, so uppercase names may not be recognized.
      OBJECT_CONSTRUCT(
        'temperature', 0.2,
        'max_tokens', 1024
      )
    ) AS section_answer
  FROM full_prompt
)

SELECT 
  file_name,
  REPLACE(section_answer:choices[0]:messages::STRING, '\\n', '\n') AS section_text
FROM raw_answers;


In [None]:
-- Ask the LLM to extract the "Table of contents" section of each manual.
--   base_prompt  : per file, the lowest-indexed chunk whose text contains
--                  'table of contents' (case-insensitive match).
--   full_prompt  : the instruction text with that chunk appended.
--   raw_answers  : CORTEX.COMPLETE ('mistral-7b') called with a single user message.
--   final SELECT : reads choices[0]:messages from the response and turns literal
--                  backslash-n sequences into line breaks.
-- Files with no matching chunk produce no output row.
WITH base_prompt AS (
  SELECT 
    file_name,
    chunk_text
  FROM manuals_chunks
  WHERE LOWER(chunk_text) LIKE '%table of contents%'
  QUALIFY ROW_NUMBER() OVER (PARTITION BY file_name ORDER BY chunk_index) = 1
),

full_prompt AS (
  SELECT
    file_name,
    -- The column is named prompt_text so it no longer shares the CTE's name.
    'Extract the complete "Table of contents" section from the following manual text. If no table of contents is present, say "Not found."

    Manual text:
    ' || chunk_text AS prompt_text
  FROM base_prompt
),

raw_answers AS (
  SELECT 
    file_name,
    SNOWFLAKE.CORTEX.COMPLETE(
      'mistral-7b',
      ARRAY_CONSTRUCT(
        OBJECT_CONSTRUCT('role', 'user', 'content', prompt_text)
      ),
      -- Option keys use the lowercase names documented for COMPLETE; object keys
      -- are case-sensitive, so uppercase names may not be recognized.
      OBJECT_CONSTRUCT(
        'temperature', 0.2,
        'max_tokens', 1024
      )
    ) AS section_answer
  FROM full_prompt
)

SELECT 
  file_name,
  REPLACE(section_answer:choices[0]:messages::STRING, '\\n', '\n') AS section_text
FROM raw_answers;
