# Store Vector Data

In [1]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key for the cells below
# comes from — confirm. `os` is not referenced in any visible cell.
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import chromadb

In [3]:
# Record the installed chromadb version alongside the results.
chromadb.__version__

'0.5.20'

In [4]:
# OpenAIEmbeddings is taken from langchain_openai, the same package the
# "Vector Store Retriever" section of this notebook imports it from; the
# legacy langchain.embeddings path is deprecated and emits a warning.
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

In [9]:
# Read the LangChain article from disk as UTF-8 text.
loader = TextLoader(
    file_path="data/langchain.txt",
    encoding="utf-8",
)
documents = loader.load()

In [10]:
# Build a splitter whose chunk size (500) is measured with a tiktoken encoder,
# then cut the loaded documents into chunks.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,
)
docs = text_splitter.split_documents(documents=documents)

In [11]:
# Number of chunks the splitter produced.
len(docs)

6

In [12]:
# Embedding client handed to Chroma both when indexing and when re-opening the store.
# NOTE(review): no arguments are passed, so the model and credentials presumably
# come from library defaults and the environment — confirm.
embedding_function = OpenAIEmbeddings()

  embedding_function = OpenAIEmbeddings()


In [13]:
# Embed the chunks and write them to an on-disk Chroma store.
# Each chunk gets a stable id of the form "<source path>:<chunk index>". Without
# explicit ids the wrapper generates random ones, so every re-run of this cell
# would append another copy of the same chunks; with stable ids a re-run
# overwrites the same rows instead.
# NOTE: a store created before this change still holds randomly-keyed rows —
# delete ./my_first_vecdb once to start clean.
chunk_ids = [f"{doc.metadata['source']}:{i}" for i, doc in enumerate(docs)]
db = Chroma.from_documents(
    docs,
    embedding_function,
    ids=chunk_ids,
    persist_directory='./my_first_vecdb',
)

In [14]:
# Chroma 0.4+ writes to persist_directory automatically, and the wrapper's
# persist() is deprecated there (it only emits a warning). Flush explicitly
# only when running against a legacy chromadb release.
chroma_major, chroma_minor = (int(part) for part in chromadb.__version__.split(".")[:2])
if (chroma_major, chroma_minor) < (0, 4):
    db.persist()

  db.persist()


# Load Embeddings from Disk

In [15]:
# Re-open the collection stored under ./my_first_vecdb, supplying the same
# embedding function that was used when the store was written.
db_connection = Chroma(
    embedding_function=embedding_function,
    persist_directory='./my_first_vecdb',
)

  db_connection= Chroma(persist_directory='./my_first_vecdb', embedding_function=embedding_function)


In [16]:
# Query text passed to similarity_search in the next cell (despite the name, it is a question, not a document).
new_doc = "How do chains in LangChain streamline complex tasks that require multiple steps or logic branches?"

In [17]:
# Look up the stored chunks most similar to the query text.
docs = db_connection.similarity_search(query=new_doc)

In [18]:
# Number of hits returned by the similarity search.
len(docs)

4

In [20]:
# Text of the first returned hit.
docs[0].page_content

'7. Chains and Flow Control\nLangChain’s Chain objects act like pipelines, linking multiple steps together. A typical chain might:\n\nTake user input.\nSplit or parse the text.\nEmbed the text to find relevant documents.\nPass those documents and user input into a prompt template for generation.\nReturn a refined answer to the user.\nMore advanced “agents” can branch in their logic. For example, an agent might decide to do sentiment analysis first, then use a summarization tool, and finally convert the result into a user-facing answer—making decisions on the fly based on intermediate outputs.\n\n8. Real-World Applications\nCustomer Support Chatbots: Integrate with existing knowledge bases (e.g., product manuals, FAQs), to answer complex user queries with correct references.\nDocument Analysis and Summaries: Researchers or analysts can summarize large collections of documents, retaining the ability to do deep dives and retrieval-based queries.\nCoding Assistants: Provide step-by-step so

# Add New Document

In [22]:
# Load the cricket article and split it into chunks (chunk size 500, measured
# with a tiktoken encoder).
loader = TextLoader("data/cricket.txt", encoding="utf8")
documents = loader.load()
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
docs = text_splitter.split_documents(documents)

# Add the new chunks to the store that already lives in ./my_first_vecdb:
# pointing from_documents at the same persist_directory extends the existing
# collection (the later "langsmith" query on `db` still returns langchain.txt).
# Stable "<source path>:<chunk index>" ids keep the cell idempotent — without
# them every re-run would append another copy of the same chunks under fresh
# random ids.
chunk_ids = [f"{doc.metadata['source']}:{i}" for i, doc in enumerate(docs)]
db = Chroma.from_documents(
    docs,
    embedding_function,
    ids=chunk_ids,
    persist_directory='./my_first_vecdb',
)

In [23]:
# Query the combined store with a cricket-related term and display the first hit.
docs = db.similarity_search(query="sachin")
docs[0]

Document(metadata={'source': 'data/cricket.txt'}, page_content='However, the future of cricket appears bright. With technological innovations refining the viewing experience and new markets embracing the sport, cricket evolves while retaining its historical charm. Enthusiasts point to increased inclusivity, better governance, and adaptive formats as steps toward continued global growth.\n\n---\n\n## Conclusion\n\nCricket’s enduring appeal lies in its unique combination of tradition, strategy, and passion. Born centuries ago in England, it has grown into a global phenomenon, captivating audiences with every swing of the bat and every bowl. The sport’s ability to adapt—evident in its multiple formats and technological advancements—ensures it remains dynamic and relevant. From tense Test matches that push players to their limits over five days, to enthralling T20 clashes decided in a matter of hours, cricket offers something for everyone.\n\nAt its heart, however, cricket remains a game s

In [24]:
# Query the same store with a LangChain-related term and display the first hit.
docs = db.similarity_search(query="langsmith")
docs[0]

Document(metadata={'source': 'data/langchain.txt'}, page_content='An In-Depth Look at LangChain and Its Role in Modern LLM Applications\n\nLangChain is a powerful framework designed to streamline the development of applications that leverage large language models (LLMs). Since the release of modern LLMs such as GPT-3.5, GPT-4, and other transformer-based architectures, developers and researchers have looked for ways to integrate these models into real-world applications. LangChain addresses this need by offering a cohesive set of tools and abstractions that enable efficient prompt management, memory handling, retrieval-augmented generation, and more.\n\n1. The Rise of Large Language Models\nOver the past few years, transformer-based models have dramatically improved natural language understanding and generation. Capable of composing coherent text, answering complex questions, and even reasoning about real-world scenarios, these models power a range of applications such as chatbots, que

# Vector Store Retriever

In [25]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [26]:
# Recreate the embedding client from the langchain_openai import in the cell above.
# NOTE(review): no arguments are passed, so the model and credentials presumably
# come from library defaults and the environment — confirm.
embedding_function = OpenAIEmbeddings()

In [27]:
# Open the persisted collection under ./my_first_vecdb for the retriever demo.
db_connection = Chroma(
    embedding_function=embedding_function,
    persist_directory='./my_first_vecdb',
)

In [34]:
# Retriever limited to k=3 results, filtered by a relevance-score threshold of 0.68.
retriever = db_connection.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.68, "k": 3},
)

docs = retriever.invoke(input="llm")
print(len(docs))
# The threshold can filter out every candidate; guard the lookup so an empty
# result displays nothing instead of raising IndexError.
docs[0] if docs else None

3


Document(metadata={'source': 'data/langchain.txt'}, page_content='An In-Depth Look at LangChain and Its Role in Modern LLM Applications\n\nLangChain is a powerful framework designed to streamline the development of applications that leverage large language models (LLMs). Since the release of modern LLMs such as GPT-3.5, GPT-4, and other transformer-based architectures, developers and researchers have looked for ways to integrate these models into real-world applications. LangChain addresses this need by offering a cohesive set of tools and abstractions that enable efficient prompt management, memory handling, retrieval-augmented generation, and more.\n\n1. The Rise of Large Language Models\nOver the past few years, transformer-based models have dramatically improved natural language understanding and generation. Capable of composing coherent text, answering complex questions, and even reasoning about real-world scenarios, these models power a range of applications such as chatbots, que

# Multi Query Retriever

In [35]:
from langchain_openai import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever

# Question that the LLM rewrites into several alternative search queries.
question="what is the cricket use?"
llm = ChatOpenAI(temperature=0)

# Build on `db_connection`, the handle this notebook re-opens from disk, rather
# than `db`, which only exists after the ingestion cells have been run. Both
# point at ./my_first_vecdb, so the retrieved data is the same.
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=db_connection.as_retriever(),
    llm=llm,
)

In [36]:
import logging

# Raise the multi-query retriever's logger to INFO so the generated query
# variants are printed below the cell.
logging.basicConfig()
multi_query_log = logging.getLogger('langchain.retrievers.multi_query')
multi_query_log.setLevel(logging.INFO)

# Run the retriever on the question defined in the previous cell.
unique_docs = retriever_from_llm.invoke(input=question)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. What are the various applications of cricket?', '2. How is cricket utilized in different contexts?', '3. In what ways is cricket commonly used?']


In [37]:
# Number of documents returned by the multi-query retriever.
len(unique_docs)

5

In [38]:
# print() is used so the chunk's newlines render as line breaks rather than "\n" escapes.
print(unique_docs[0].page_content)

---

## 7. Evolving Nature of the Sport

Cricket continues to evolve with the introduction of new formats, technology, and professional structures:

1. **Technology in Cricket**:  
   - Tools like Hawk-Eye, Snickometer, and Hot Spot help with more accurate umpiring decisions.  
   - The Decision Review System (DRS) allows players and umpires to rectify mistakes, reducing controversy and increasing fairness.

2. **Rise of T20 Leagues**:  
   - Domestic T20 leagues generate immense revenue and fan engagement, attracting global superstars.  
   - This commercial success often raises debates on how it influences international cricket schedules and player availability.

3. **Women’s Cricket**:  
   - Growing rapidly, especially with high-profile tournaments like the Women’s World Cup and T20 World Cup.  
   - Increased media coverage, sponsorship deals, and professional contracts have boosted opportunities for female cricketers worldwide.

4. **Inclusivity and Global Spread**:  
   - Emergi