In [None]:
!pip install chromadb
!pip install PyPDF2
!pip install langchain
!pip install sentence-transformers

In [None]:
import chromadb
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Create the Chroma client (the rough equivalent of "create database").
client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when "sample_collection" already exists on the client.
collection = client.get_or_create_collection(
    name="sample_collection",
    metadata={"hnsw:space": "cosine"},  # cosine distance for the HNSW index
)

# Location of the PDF to ingest (Google Drive mount path as used in Colab).
file_path = '/content/drive/MyDrive/LLM_data/snowflake_container.pdf'

In [None]:
# Display the collections currently registered on the client.
client.list_collections()

[Collection(name=sample_collection)]

In [None]:
# Read the whole PDF into a single string, pages concatenated in order.
# The context manager closes the file even if PyPDF2 raises while parsing or
# extracting, which the manual open()/close() pair did not guarantee.
with open(file_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # `or ""` guards against a page whose extraction yields nothing;
    # join avoids rebuilding the string on every `text +=`.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [None]:
# Split the text into chunks of at most ~1000 characters with 100 characters
# of overlap between neighbouring chunks.
# `separators` must be a list of strings that are tried in order. The previous
# bare string "\n\n " was walked character by character (effectively
# ["\n", "\n", " "]), so the paragraph separator "\n\n" was never applied as a unit.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " "],
)
chunks = text_splitter.split_text(text)

In [None]:
# Pair every chunk with a sequential id of the form "snw_<index>".
documents_list = list(chunks)
ids_list = [f"snw_{position}" for position in range(len(chunks))]

In [None]:
# Store each chunk under its id in the collection (the INSERT step).
# No embeddings are passed here, so the collection computes them itself.
collection.add(ids=ids_list, documents=documents_list)

In [None]:
# Fetch every record in the collection (the equivalent of SELECT *).
# get() with no arguments applies no id filter. Passing an empty id list
# depended on it being treated as "no filter", which is not a documented
# contract and may be rejected by id validation — verify against the installed
# Chroma version.
collection.get()

{'ids': ['snw_0',
  'snw_1',
  'snw_10',
  'snw_11',
  'snw_12',
  'snw_13',
  'snw_14',
  'snw_15',
  'snw_16',
  'snw_17',
  'snw_18',
  'snw_19',
  'snw_2',
  'snw_20',
  'snw_21',
  'snw_22',
  'snw_23',
  'snw_3',
  'snw_4',
  'snw_5',
  'snw_6',
  'snw_7',
  'snw_8',
  'snw_9'],
 'embeddings': None,
 'metadatas': [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 'documents': ['PDFm yURL converts web pages and ev en full websites to PDF easily and quickly .\nSnowpark Container Services — A\nTech Primer\nCaleb Baechtold·Follow\nPublished inSnowflake·10 min read·Jul 7, 2023\n91 2\nUpdated 12/20/2023\nIntroduction\nAt our annual Summit 2023 user conference the last week of June, Snowflake\nannounced a new product feature, Snowpark Container Services. Snowpark\nContainer Services is a fully managed container offering that allows you to\neasily 

In [None]:
# Fetch a single record by id (the equivalent of SELECT ... WHERE id = 'snw_1').
# NOTE(review): the saved output is empty although 'snw_1' is inserted earlier;
# presumably the delete cell further down was executed before this one —
# confirm with Restart Kernel -> Run All.
collection.get(ids=["snw_1"])

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

In [None]:
# Similarity search over the stored documents for the text 'object type'.
# Optionally add where_document={"$contains": "object type"} to keep only
# documents that contain that substring.
collection.query(query_texts=['object type'])

In [None]:
# Remove one record by id. The id is passed as a list to match the
# list-of-ids form used by collection.get(ids=[...]) in this notebook.
collection.delete(ids=["snw_1"])

# Embeddings

In [None]:
# Second collection, which receives embeddings computed explicitly in this notebook.
# get_or_create_collection keeps the cell re-runnable: create_collection raises
# when "sample_collection_em" already exists on the client.
collection_em = client.get_or_create_collection(
    name="sample_collection_em",
    metadata={"hnsw:space": "cosine"},  # cosine distance for the HNSW index
)
client.list_collections()

[Collection(name=sample_collection_em), Collection(name=sample_collection)]

In [None]:
# Load the sentence-transformers model that this notebook uses to embed both
# the document chunks and the search query.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [None]:
# Build the parallel lists (documents, ids, vectors) for the embedding collection.
documents_list = list(chunks)
ids_list = [f"snw_{i}" for i in range(len(documents_list))]

# embed_documents is the document-side API and encodes all chunks in one
# batched call instead of one embed_query call per chunk.
embeddings_list = embeddings.embed_documents(documents_list)

In [None]:
# Store each chunk with its id and its precomputed embedding vector.
collection_em.add(ids=ids_list, documents=documents_list, embeddings=embeddings_list)

In [None]:
# Fetch every record in the embedding collection (the equivalent of SELECT *).
# get() with no arguments applies no id filter; an empty id list is not a
# documented way to request all records and may be rejected by id validation —
# verify against the installed Chroma version.
collection_em.get()

In [None]:
# Embed the search phrase with the same model used for the chunks so that the
# query vector and the stored vectors live in the same space.
query = "object types"
query_vector = embeddings.embed_query(query)

In [None]:
# Nearest-neighbour search using the precomputed query embedding rather than
# raw query text; the number of results is left at the library default.
collection_em.query(query_embeddings=query_vector)

In [None]:
# Drop the first collection by name.
# NOTE(review): the `collection` variable was created for this name, so it
# presumably should not be used after this cell — confirm.
client.delete_collection("sample_collection")

In [None]:
# List the collections that remain on the client.
client.list_collections()

[Collection(name=sample_collection_em)]