In [1]:
from langchain_community.document_loaders import TextLoader, PyPDFLoader, WebBaseLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
import hashlib
import os
import bs4

# Initialize embedding model
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


# Load and split the text
loader = WebBaseLoader(
    web_paths=("https://news.microsoft.com/source/features/ai/ai-agents-what-they-are-and-how-theyll-change-the-way-we-work/",),
)
documents = loader.load()

splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20
)
chunks = splitter.split_documents(documents)
# Reload the existing vectorstore
vectorstore = Chroma.from_documents(documents=chunks,
    persist_directory="webdocdb",
    embedding=embeddings
)

print("Total chunks:", len(chunks))
# OPTIONAL: Avoid duplicates by hashing content
def doc_hash(doc):
    return hashlib.md5(doc.page_content.encode('utf-8')).hexdigest()

# Build a set of hashes of already indexed documents
existing_texts = vectorstore.get(include=["documents"])["documents"]
print("Existing Chunk:",existing_texts)
existing_hashes = set(hashlib.md5(text.encode("utf-8")).hexdigest() for text in existing_texts)

print("\n\n\nexisting_hashes:",existing_hashes)

# Filter only new chunks
new_chunks = [doc for doc in chunks if doc_hash(doc) not in existing_hashes]
print("New Chunks:", new_chunks)
# Add new chunks (if any)
if new_chunks:
    vectorstore.add_documents(new_chunks)
    vectorstore.persist()
    print(f"✅ Added {len(new_chunks)} new chunk(s) to vectorstore.")
else:
    print("🟰 No new chunks found. Vectorstore is up to date.")


USER_AGENT environment variable not set, consider setting it to identify your requests.
  embeddings = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
Created a chunk of size 1537, which is longer than the specified 500
Created a chunk of size 1600, which is longer than the specified 500
Created a chunk of size 2676, which is longer than the specified 500
Created a chunk of size 2369, which is longer than the specified 500
Created a chunk of size 1520, which is longer than the specified 500
Created a chunk of size 2340, which is longer than the specified 500


Total chunks: 14
Existing Chunk: ["AI agents — what they are, and how they'll change the way we work - Source\n\n\n \n\n\nSkip to main content\n\nMicrosoft\n\nSource\n\n\nSource\n\n\n                            Source\n                        \n\n\n Home \n\nOur Company\n\n\nCompany News\n\n\nOfficial Microsoft Blog\n\n\nMicrosoft On The Issues\n\n\nEurope\n\n\nAsia\n\n\nLatin America\n\n\nIndia\n\n\nUK\n\n\nInclusion is Innovation\n\n\nConexiones\n\n \nAI\n\n\nInnovation\n\n\nDigital Transformation\n\n\nDiversity & Inclusion\n\n\nSustainability\n\n\nWork & Life\n\n\nSecurity\n\n\nUnlocked\n\nMore", 'Unlocked\n\nMore\n\n \n\n All Microsoft\n\n\nGlobal\n\n\nMicrosoft 365\n\n\nTeams\n\n\nCopilot\n\n\nWindows\n\n\nSurface\n\n\nXbox\n\n\nDeals\n\n\nSmall Business\n\n\nSupport\n\n\nSoftware\nSoftware\n\n\nWindows Apps\n\n\nAI\n\n\nOutlook\n\n\nOneDrive\n\n\nMicrosoft Teams\n\n\nOneNote\n\n\nMicrosoft Edge\n\n\nSkype\n\n\nPCs & Devices  \nPCs & Devices  \n\n\nComputers\n\n\nShop Xbox\n\n\nAc

In [2]:
vectorstore.similarity_search("what is agent")

[Document(metadata={'title': "AI agents — what they are, and how they'll change the way we work - Source", 'language': 'en-US', 'source': 'https://news.microsoft.com/source/features/ai/ai-agents-what-they-are-and-how-theyll-change-the-way-we-work/', 'description': 'AI agents take the power of generative AI a step further by working alongside you or even on your behalf, and they can be built and used by anyone.'}, page_content='What are agents, anyway?\nAn agent takes the power of generative AI a step further, because instead of just assisting you, agents can work alongside you or even on your behalf. Agents can do a range of things, from responding to questions to more complicated or multistep assignments. What sets them apart from a personal assistant is that they can be tailored to have a particular expertise.\nFor example, you could create an agent to know everything about your company’s product catalog so it can draft detailed responses to customer questions or automatically compil