In [36]:
from dotenv import load_dotenv
from snowflake.snowpark.session import Session
import os

# Run python-dotenv's loader before the SNOWFLAKE_* lookups below.
load_dotenv()

# Connection parameter name -> environment variable that supplies its value.
# os.getenv returns None for any variable that is not set.
_SNOWFLAKE_ENV_VARS = {
  "account": "SNOWFLAKE_ACCOUNT",
  "user": "SNOWFLAKE_USER",
  "password": "SNOWFLAKE_USER_PASSWORD",
  "role": "SNOWFLAKE_ROLE",
  "database": "SNOWFLAKE_DATABASE",
  "warehouse": "SNOWFLAKE_WAREHOUSE",
  "schema": "SNOWFLAKE_SCHEMA",
}
connection_params = {
  param: os.getenv(env_var) for param, env_var in _SNOWFLAKE_ENV_VARS.items()
}

# Snowpark session built from the parameters above; later cells pass it to
# Cortex calls.
snowpark_session = (
  Session.builder
  .configs(connection_params)
  .create()
)

In [None]:
from snowflake.cortex import Complete

# print(Complete("mistral-large2", "what does my child need to excel in math in the first grade?", session=snowpark_session))

In [None]:
import nest_asyncio
nest_asyncio.apply()

from llama_index.readers.github import GithubRepositoryReader, GithubClient

# GitHub credentials come from the environment; a missing GITHUB_TOKEN raises
# KeyError here rather than failing later inside the reader.
github_token = os.environ["GITHUB_TOKEN"]
github_client = GithubClient(github_token=github_token, verbose=False)

# Both filters are include-lists: only the marked_docs directory, only .md files.
include_only = GithubRepositoryReader.FilterType.INCLUDE
directory_filter = (["marked_docs"], include_only)
extension_filter = ([".md"], include_only)

reader = GithubRepositoryReader(
  github_client=github_client,
  owner="JamesKim-4811",
  repo="SnowflakeRAG",
  use_parser=False,
  verbose=True,
  filter_directories=directory_filter,
  filter_file_extensions=extension_filter,
)

# Load the matching files from the main branch.
documents = reader.load_data(branch="main")

In [44]:
# documents

[Document(id_='fe73feac87e94f4d38df659c46ada1ede8b50de5', embedding=None, metadata={'file_path': 'marked_docs/1st-math-clarification/1st-math-clarification.md', 'file_name': '1st-math-clarification.md', 'url': 'https://api.github.com/JamesKim-4811/SnowflakeRAG/blob/main/marked_docs/1st-math-clarification/1st-math-clarification.md'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='![](_page_0_Picture_0.jpeg)\n\n#### *1 st Grade Mathematics* ● Unpacked Contents\n\nFor the new Standard Course of Study that will be effective in all North Carolina schools in the 2017-18 School Year.\n\nThis document is designed to help North Carolina educators teach the 1 st Grade Mathematics Standard Course of Study. NCDPI staff are continually updating and improving these tools to better serve teachers and districts.\n\n# **What is the purpose of this

In [5]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser

# Embedding model handed to the semantic splitter below.
ARCTIC_EMBED_MODEL_NAME = "Snowflake/snowflake-arctic-embed-m"
embed_model = HuggingFaceEmbedding(ARCTIC_EMBED_MODEL_NAME)

# Node parser that later serves as the only ingestion-pipeline transformation.
splitter = SemanticSplitterNodeParser(
  embed_model=embed_model,
  buffer_size=1,
  breakpoint_percentile_threshold=85,
)

In [6]:
from llama_index.core.ingestion import IngestionPipeline

# Single-stage pipeline: the semantic splitter is the only transformation.
pipeline_steps = [splitter]
cortex_search_pipeline = IngestionPipeline(transformations=pipeline_steps)

# Run the pipeline over the loaded documents, showing progress bars.
results = cortex_search_pipeline.run(documents=documents, show_progress=True)

Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/719 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/717 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/578 [00:00<?, ?it/s]

In [52]:
# results[0]

TextNode(id_='c14d6a41-28e3-49a2-a294-5eaf636fb65a', embedding=None, metadata={'file_path': 'marked_docs/1st-math-clarification/1st-math-clarification.md', 'file_name': '1st-math-clarification.md', 'url': 'https://api.github.com/JamesKim-4811/SnowflakeRAG/blob/main/marked_docs/1st-math-clarification/1st-math-clarification.md'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='fe73feac87e94f4d38df659c46ada1ede8b50de5', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'marked_docs/1st-math-clarification/1st-math-clarification.md', 'file_name': '1st-math-clarification.md', 'url': 'https://api.github.com/JamesKim-4811/SnowflakeRAG/blob/main/marked_docs/1st-math-clarification/1st-math-clarification.md'}, hash='5110d1a63c14df6039935a1d21de3c28f5e8685e65d6864cd073d121fb015209'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='f3734973-107e-4f71-924d-b9409f7e7036', node_type=<ObjectType.TE

In [55]:
# print(results[0].metadata['url'])
# updated_url = results[0].metadata['url'].replace('api.', '').replace('.md', '.pdf')
# print(updated_url)

https://api.github.com/JamesKim-4811/SnowflakeRAG/blob/main/marked_docs/1st-math-clarification/1st-math-clarification.md
https://github.com/JamesKim-4811/SnowflakeRAG/blob/main/marked_docs/1st-math-clarification/1st-math-clarification.pdf


In [47]:
# from snowflake.cortex import Complete
# prompt = f"Given the name of the file between <file> and </file>, determine the intended grade. Use only one word. <file> {results[0].metadata['file_name']} </file>"
# response = Complete(model="mistral-large2", prompt=prompt, session=snowpark_session)
# print(response)

# # print(Complete("mistral-large2", 'Given the name of the file between <file> and </file> determine if it is related to math, science, or english. Use only one word <file> ' + results[0].metadata['file_name'] + '</file>'))

1st


In [102]:
# for curr in tqdm(results[0]):
#     print(curr[1])

0it [00:00, ?it/s]

c14d6a41-28e3-49a2-a294-5eaf636fb65a
None
{'file_path': 'marked_docs/1st-math-clarification/1st-math-clarification.md', 'file_name': '1st-math-clarification.md', 'url': 'https://api.github.com/JamesKim-4811/SnowflakeRAG/blob/main/marked_docs/1st-math-clarification/1st-math-clarification.md'}
[]
[]
{<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='fe73feac87e94f4d38df659c46ada1ede8b50de5', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'marked_docs/1st-math-clarification/1st-math-clarification.md', 'file_name': '1st-math-clarification.md', 'url': 'https://api.github.com/JamesKim-4811/SnowflakeRAG/blob/main/marked_docs/1st-math-clarification/1st-math-clarification.md'}, hash='5110d1a63c14df6039935a1d21de3c28f5e8685e65d6864cd073d121fb015209'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='f3734973-107e-4f71-924d-b9409f7e7036', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='2d85744e279d1a8b66e18884bb593f93a41b6b58d6e4e927c9862997cecd1bcd')}
{key}: {valu

In [85]:
# for curr in tqdm(results[0]):
#     print(curr[1])

0it [00:00, ?it/s]

c14d6a41-28e3-49a2-a294-5eaf636fb65a
None
{'file_path': 'marked_docs/1st-math-clarification/1st-math-clarification.md', 'file_name': '1st-math-clarification.md', 'url': 'https://api.github.com/JamesKim-4811/SnowflakeRAG/blob/main/marked_docs/1st-math-clarification/1st-math-clarification.md'}
[]
[]
{<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='fe73feac87e94f4d38df659c46ada1ede8b50de5', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'marked_docs/1st-math-clarification/1st-math-clarification.md', 'file_name': '1st-math-clarification.md', 'url': 'https://api.github.com/JamesKim-4811/SnowflakeRAG/blob/main/marked_docs/1st-math-clarification/1st-math-clarification.md'}, hash='5110d1a63c14df6039935a1d21de3c28f5e8685e65d6864cd073d121fb015209'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='f3734973-107e-4f71-924d-b9409f7e7036', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='2d85744e279d1a8b66e18884bb593f93a41b6b58d6e4e927c9862997cecd1bcd')}
{key}: {valu

In [103]:
import os
import snowflake.connector
from tqdm.auto import tqdm

# NOTE(review): the connection and cursor are left open (as before) in case
# later cells reuse them; close both once nothing else needs them.
snowflake_connector = snowflake.connector.connect(**connection_params)

cursor = snowflake_connector.cursor()
# (Re)create the target table; CREATE OR REPLACE discards rows from earlier runs,
# so re-running this cell does not duplicate chunks.
cursor.execute("""
    CREATE OR REPLACE TABLE education_docs(
        doc_text VARCHAR,
        GRADE VARCHAR,
        SUBJECT VARCHAR,
        DOCTYPE VARCHAR, 
        RELATIVE_PATH VARCHAR, 
        FILE_URL VARCHAR
    )
""")

# One-word classification prompts, in column order (GRADE, SUBJECT, DOCTYPE).
# `{file_name}` is filled in per source file.
_CLASSIFICATION_PROMPTS = (
    "Given the name of the file between <file> and </file>, determine the intended grade. Use only one word. <file> {file_name} </file>",
    "Given the name of the file between <file> and </file>, determine the intended school subject. Use only one word. <file> {file_name} </file>",
    "Given the name of the file between <file> and </file>, determine if it is the standards or unpacking. Use only one word. <file> {file_name} </file>",
)


def _classify_file(file_name):
    """Return the (grade, subject, doctype) LLM answers for one file name."""
    return tuple(
        Complete(
            model="mistral-large2",
            prompt=template.format(file_name=file_name),
            session=snowpark_session,
        )
        for template in _CLASSIFICATION_PROMPTS
    )


def _md_to_pdf(path):
    """Swap a trailing '.md' for '.pdf'; any other path is returned unchanged."""
    suffix = '.md'
    if path.endswith(suffix):
        return path[:-len(suffix)] + '.pdf'
    return path


def _github_pdf_url(api_url):
    """Turn the reader's api.github.com URL into the github.com URL of the PDF.

    Only the host prefix and the trailing extension are rewritten, so file or
    directory names that happen to contain 'api.' or '.md' are left intact.
    """
    api_prefix = 'https://api.github.com/'
    if api_url.startswith(api_prefix):
        api_url = 'https://github.com/' + api_url[len(api_prefix):]
    return _md_to_pdf(api_url)


# The prompts depend only on the file name, so every chunk of a file shares the
# same labels. Caching them per file means three LLM calls per file instead of
# three per chunk, and keeps the labels consistent across a file's chunks.
labels_by_file = {}

for curr in tqdm(results):
    file_name = curr.metadata['file_name']
    if file_name not in labels_by_file:
        labels_by_file[file_name] = _classify_file(file_name)
    grade, subject, doctype = labels_by_file[file_name]

    cursor.execute("""
        INSERT INTO education_docs 
        (doc_text, GRADE, SUBJECT, DOCTYPE, RELATIVE_PATH, FILE_URL)
        VALUES (%s, %s, %s, %s, %s, %s)
    """, (curr.text,
          grade,
          subject,
          doctype,
          _md_to_pdf(curr.metadata['file_path']),
          _github_pdf_url(curr.metadata['url'])
         ))


  0%|          | 0/306 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
import os
from snowflake.core import Root
from typing import List

class CortexSearchRetriever:
    """Look up text passages through a Snowflake Cortex Search service.

    The database, schema and service names are read from the
    SNOWFLAKE_DATABASE, SNOWFLAKE_SCHEMA and SNOWFLAKE_CORTEX_SEARCH_SERVICE
    environment variables on every call to `retrieve`.
    """

    def __init__(self, session: Session, limit_to_retrieve: int = 4):
        """Store the session and the maximum number of results per query."""
        self._session = session
        self._limit_to_retrieve = limit_to_retrieve

    def retrieve(self, query: str) -> List[str]:
        """Return the `doc_text` of each search hit for `query`.

        Returns an empty list when the service reports no results.
        Raises KeyError if one of the required environment variables is unset.
        """
        # Walk database -> schema -> search service one step at a time.
        root = Root(self._session)
        database = root.databases[os.environ["SNOWFLAKE_DATABASE"]]
        schema = database.schemas[os.environ["SNOWFLAKE_SCHEMA"]]
        search_service = schema.cortex_search_services[
            os.environ["SNOWFLAKE_CORTEX_SEARCH_SERVICE"]
        ]

        response = search_service.search(
            query=query,
            columns=["doc_text"],
            limit=self._limit_to_retrieve,
        )

        hits = response.results
        if not hits:
            return []
        return [hit["doc_text"] for hit in hits]

In [9]:
# retriever = CortexSearchRetriever(session=snowpark_session, limit_to_retrieve=4)

# retrieved_context = retriever.retrieve(query="what does my child need to excel in math in the first grade?")

# len(retrieved_context)

4

In [10]:
# retriever.retrieve(query="what does my child need to excel in math in the first grade?")

['|\n| 3. | Construct | Mathematically proficient students in First Grade continue to develop their ability to clearly express, explain, organize and consolidate |\n| viable |  | their math thinking using both verbal and written representations. Their understanding of grade appropriate vocabulary helps them to |\n|  | arguments and | construct viable arguments about mathematics. For example, when justifying why a particular shape isn\'t a square, a first grade student |\n| critique the |  | may hold up a picture of a rectangle, pointing to the various parts, and reason, "It can\'t be a square because, even though it has 4 sides |\n| reasoning of |  | and 4 angles, the sides aren\'t all the same size." In a classroom where risk-taking and varying perspectives are encouraged, |\n| others. |  | mathematically proficient students are willing and eager to share their ideas with others, consider other ideas proposed by classmates, |\n|  |  | and question ideas that don\'t seem to make sense.