In [110]:
from dotenv import load_dotenv
from snowflake.snowpark.session import Session
import os

# Load variables from a local .env file so the credential lookups below can
# find them in the process environment.
load_dotenv()

# Snowflake connection options, each taken from its own environment variable.
# os.getenv yields None for a variable that is not set.
connection_params = {
    option: os.getenv(env_var)
    for option, env_var in (
        ("account", "SNOWFLAKE_ACCOUNT"),
        ("user", "SNOWFLAKE_USER"),
        ("password", "SNOWFLAKE_USER_PASSWORD"),
        ("role", "SNOWFLAKE_ROLE"),
        ("database", "SNOWFLAKE_DATABASE"),
        ("warehouse", "SNOWFLAKE_WAREHOUSE"),
        ("schema", "SNOWFLAKE_SCHEMA"),
    )
}

# Snowpark session reused by later cells (Cortex Complete calls and the retriever).
snowpark_session = Session.builder.configs(connection_params).create()

In [111]:
from snowflake.cortex import Complete

# print(Complete("mistral-large2", "what does my child need to excel in math in the first grade?", session=snowpark_session))

In [125]:
import nest_asyncio
nest_asyncio.apply()

from llama_index.readers.github import GithubRepositoryReader, GithubClient

# The GitHub token must be present in the environment; a missing
# GITHUB_TOKEN raises KeyError on this line.
github_token = os.environ["GITHUB_TOKEN"]
github_client = GithubClient(github_token=github_token, verbose=False)

# Both filters below use INCLUDE mode, so only ".md" files under "docs" are loaded.
include = GithubRepositoryReader.FilterType.INCLUDE

reader = GithubRepositoryReader(
    github_client=github_client,
    owner="JamesKim-4811",
    repo="SnowflakeRAG",
    use_parser=False,
    verbose=True,
    filter_directories=(["docs"], include),
    filter_file_extensions=([".md"], include),
)

# Read the matching files from the "main" branch.
documents = reader.load_data(branch="main")

current path: 
tree data: GitTreeResponseModel(sha='906ac7416b1e488dc47a85571cf1cbc78d724b37', url='https://api.github.com/repos/JamesKim-4811/SnowflakeRAG/git/trees/906ac7416b1e488dc47a85571cf1cbc78d724b37', tree=[GitTreeResponseModel.GitTreeObject(path='.DS_Store', mode='100644', type='blob', sha='d72263bc78a601699fdcc45eeaa8d1d0d3cf3ab8', url='https://api.github.com/repos/JamesKim-4811/SnowflakeRAG/git/blobs/d72263bc78a601699fdcc45eeaa8d1d0d3cf3ab8', size=6148), GitTreeResponseModel.GitTreeObject(path='.devcontainer', mode='040000', type='tree', sha='436ec75543b9fe5a0d471f703bc19598a9647958', url='https://api.github.com/repos/JamesKim-4811/SnowflakeRAG/git/trees/436ec75543b9fe5a0d471f703bc19598a9647958', size=None), GitTreeResponseModel.GitTreeObject(path='.gitattributes', mode='100644', type='blob', sha='dfe0770424b2a19faf507a501ebfc23be8f54e7b', url='https://api.github.com/repos/JamesKim-4811/SnowflakeRAG/git/blobs/dfe0770424b2a19faf507a501ebfc23be8f54e7b', size=66), GitTreeRespon

In [113]:
# documents

In [126]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser

# Embedding model handed to the semantic splitter below.
embed_model = HuggingFaceEmbedding("Snowflake/snowflake-arctic-embed-m")

# Node parser that will split the loaded documents into chunks.
splitter = SemanticSplitterNodeParser(
    embed_model=embed_model,
    buffer_size=1,
    breakpoint_percentile_threshold=85,
)

In [127]:
from llama_index.core.ingestion import IngestionPipeline

# Single-stage ingestion pipeline: the semantic splitter is its only transformation.
cortex_search_pipeline = IngestionPipeline(transformations=[splitter])

# Run the pipeline over the loaded documents, showing progress bars while it works.
results = cortex_search_pipeline.run(documents=documents, show_progress=True)

Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/739 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/717 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/578 [00:00<?, ?it/s]

In [116]:
# results[0]

In [117]:
# print(results[0].metadata['url'])
# updated_url = results[0].metadata['url'].replace('api.', '').replace('.md', '.pdf')
# print(updated_url)

In [118]:
# from snowflake.cortex import Complete
# prompt = f"Given the name of the file between <file> and </file>, determine the intended grade. Use only one word. <file> {results[0].metadata['file_name']} </file>"
# response = Complete(model="mistral-large2", prompt=prompt, session=snowpark_session)
# print(response)

# # print(Complete("mistral-large2", 'Given the name of the file between <file> and </file> determine if it is related to math, science, or english. Use only one word <file> ' + results[0].metadata['file_name'] + '</file>'))

In [119]:
# for curr in tqdm(results[0]):
#     print(curr[1])

In [120]:
# for curr in tqdm(results[0]):
#     print(curr[1])

In [128]:
import os
import snowflake.connector
from tqdm.auto import tqdm

# Load every chunk in `results` into the education_docs table, tagging each row
# with a grade, subject and document type that the LLM derives from the chunk's
# source file name.
snowflake_connector = snowflake.connector.connect(**connection_params)
cursor = snowflake_connector.cursor()

# (grade, subject, doctype) keyed by file name. The prompts depend only on the
# file name, so each file is classified once and the answer is reused for all
# of its chunks. This avoids three LLM calls per chunk and guarantees that
# every chunk of a file carries the same labels.
labels_by_file = {}

try:
    # CREATE OR REPLACE drops any existing table, so re-running this cell
    # rebuilds the table from scratch instead of appending duplicates.
    cursor.execute("""
    CREATE OR REPLACE TABLE education_docs(
        doc_text VARCHAR,
        GRADE VARCHAR,
        SUBJECT VARCHAR,
        DOCTYPE VARCHAR, 
        RELATIVE_PATH VARCHAR, 
        FILE_URL VARCHAR
    )
""")

    for curr in tqdm(results):
        file_name = curr.metadata['file_name']

        if file_name not in labels_by_file:
            prompt1 = f"Given the name of the file between <file> and </file>, determine the intended grade. Use only one word. <file> {file_name} </file>"
            prompt2 = f"Given the name of the file between <file> and </file>, determine the intended school subject. Use only one word. <file> {file_name} </file>"
            prompt3 = f"Given the name of the file between <file> and </file>, determine if it is the standards or unpacking. Use only one word. <file> {file_name} </file>"
            labels_by_file[file_name] = tuple(
                Complete(model="mistral-large2", prompt=prompt, session=snowpark_session)
                for prompt in (prompt1, prompt2, prompt3)
            )

        grade, subject, doctype = labels_by_file[file_name]

        # Values are passed as bind parameters rather than formatted into the SQL.
        cursor.execute("""
        INSERT INTO education_docs 
        (doc_text, GRADE, SUBJECT, DOCTYPE, RELATIVE_PATH, FILE_URL)
        VALUES (%s, %s, %s, %s, %s, %s)
    """, (curr.text,
          grade,
          subject,
          doctype,
          # Store the path and URL with a .pdf extension in place of .md and
          # with "api." stripped from the URL (presumably so they point at the
          # source PDF on the regular GitHub host; confirm against the repo layout).
          curr.metadata['file_path'].replace('.md', '.pdf'),
          curr.metadata['url'].replace('api.', '').replace('.md', '.pdf')
         ))
finally:
    # Release the cursor and the connection even if an LLM call or an insert fails.
    cursor.close()
    snowflake_connector.close()


  0%|          | 0/309 [00:00<?, ?it/s]

In [129]:
import os
from snowflake.core import Root
from typing import List

class CortexSearchRetriever:
    """Retrieve document text from a Snowflake Cortex Search service.

    The service is looked up through the SNOWFLAKE_DATABASE, SNOWFLAKE_SCHEMA
    and SNOWFLAKE_CORTEX_SEARCH_SERVICE environment variables, which are read
    on every call to ``retrieve``.
    """

    def __init__(self, session: Session, limit_to_retrieve: int = 4):
        """Remember the session and the ``limit`` value sent with each search."""
        self._limit_to_retrieve = limit_to_retrieve
        self._session = session

    def retrieve(self, query: str) -> List[str]:
        """Run ``query`` against the search service.

        Returns the ``doc_text`` value of each result, or an empty list when
        the response carries no results. A missing environment variable
        raises KeyError.
        """
        database = Root(self._session).databases[os.environ["SNOWFLAKE_DATABASE"]]
        schema = database.schemas[os.environ["SNOWFLAKE_SCHEMA"]]
        search_service = schema.cortex_search_services[
            os.environ["SNOWFLAKE_CORTEX_SEARCH_SERVICE"]
        ]

        response = search_service.search(
            query=query,
            columns=["doc_text"],
            limit=self._limit_to_retrieve,
        )

        hits = response.results
        if not hits:
            return []
        return [hit["doc_text"] for hit in hits]

In [123]:
# retriever = CortexSearchRetriever(session=snowpark_session, limit_to_retrieve=4)

# retrieved_context = retriever.retrieve(query="what does my child need to excel in math in the first grade?")

# len(retrieved_context)

In [124]:
# retriever.retrieve(query="what does my child need to excel in math in the first grade?")