# Retrieval
Retrieval is the centerpiece of our retrieval augmented generation (RAG) flow.

Let's get our vectorDB from before.

# VECTOR STORE RETRIEVAL

In [7]:
import os
import openai
import sys
sys.path.append('../.env')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [1]:
%pip install lark

Collecting lark
  Downloading lark-1.1.9-py3-none-any.whl.metadata (1.9 kB)
Downloading lark-1.1.9-py3-none-any.whl (111 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.7/111.7 kB[0m [31m748.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lark
Successfully installed lark-1.1.9
Note: you may need to restart the kernel to use updated packages.


# similarity search

In [8]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
persist_directory = '../docs/chroma/'

In [9]:
embedding = OpenAIEmbeddings()
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)

AttributeError: type object 'hnswlib.Index' has no attribute 'file_handle_count'

In [None]:
print(vectordb._collection.count())

In [7]:
texts = [
    """The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).""",
    """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.""",
]

In [28]:
smalldb = Chroma.from_texts(texts, embedding=embedding)

In [29]:
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [30]:
smalldb.similarity_search(question, k=2)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.')]

In [13]:
smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]

# Addressing Diversity: Maximum marginal relevance
Last class we introduced one problem: how to enforce diversity in the search results.

Maximum marginal relevance strives to achieve both relevance to the query and diversity among the results.

In [31]:
question = "what did they say about matlab?"
docs_ss = vectordb.similarity_search(question,k=3)

In [32]:
docs_ss[0].page_content[:100]

"Email\t(AIOU\temployees\tonly)\nFAQ's\t\nFinancial\tSupport\tScheme\t\nICMAP\tStudy\tMaterial\t\nJamia\tNama\t\nJobs\t"

In [33]:
docs_ss[1].page_content[:100]

"Email\t(AIOU\temployees\tonly)\nFAQ's\t\nFinancial\tSupport\tScheme\t\nICMAP\tStudy\tMaterial\t\nJamia\tNama\t\nJobs\t"

In [34]:
# difference in results with MMR

In [35]:
docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)

Number of requested results 20 is greater than number of elements in index 18, updating n_results = 18


In [36]:
docs_mmr[0].page_content[:100]

"Email\t(AIOU\temployees\tonly)\nFAQ's\t\nFinancial\tSupport\tScheme\t\nICMAP\tStudy\tMaterial\t\nJamia\tNama\t\nJobs\t"

In [37]:
docs_mmr[1].page_content[:100]

'university\tof\tAsia\tin\tdistance\teducation,\twhich\tprimarily\tfocuses\ton\tthe\teducational\tneeds\tof\tmasses'

# Addressing Specificity: working with metadata
In last lecture, we showed that a question about the third lecture can include results from other lectures as well.

To address this, many vectorstores support operations on metadata.

metadata provides context for each embedded chunk.

In [38]:
question = "what did they say about regression in the third lecture?"

In [None]:
# docs = vectordb.similarity_search(
#     question,
#     k=3,
#     filter={"source":"docs/cs229_lectures/MachineLearning-Lecture03.pdf"}
# )

In [None]:
# for d in docs:
#     print(d.metadata)

# Addressing Specificity: working with metadata using self-query retriever
But we have an interesting challenge: we often want to infer the metadata from the query itself.

To address this, we can use SelfQueryRetriever, which uses an LLM to extract:

The query string to use for vector search
A metadata filter to pass in as well
Most vector databases support metadata filters, so this doesn't require any new databases or indexes.

In [1]:
from langchain_openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [2]:
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

Note: The default model for OpenAI ("from langchain.llms import OpenAI") is text-davinci-003. Due to the deprication of OpenAI's model text-davinci-003 on 4 January 2024, you'll be using OpenAI's recommended replacement model gpt-3.5-turbo-instruct instead.

In [None]:
document_content_description = "Lecture notes"
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [44]:
question = "what did they say about regression in the third lecture?"

In [46]:
#You will receive a warning about predict_and_parse being deprecated the first time you executing the next line. This can be safely ignored.

docs = retriever.get_relevant_documents(question)

In [48]:
for d in docs:
    print(d.metadata)

# Additional tricks: compression
Another approach for improving the quality of retrieved docs is compression.

Information most relevant to a query may be buried in a document with a lot of irrelevant text.

Passing that full document through your application can lead to more expensive LLM calls and poorer responses.

Contextual compression is meant to fix this.

In [49]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [51]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))


In [52]:
# Wrap our vectorstore
llm = OpenAI(temperature=0, model="gpt-3.5-turbo-instruct")
compressor = LLMChainExtractor.from_llm(llm)

In [53]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

In [54]:
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Document 1:

Financial	Support	Scheme	
Workshop	Schedule	
Financial	Support	Scheme	
Workshop	Schedule	
Whats	Happening	News	&	Events
----------------------------------------------------------------------------------------------------
Document 2:

Financial	Support	Scheme	
Workshop	Schedule	
Financial	Support	Scheme	
Workshop	Schedule	
Whats	Happening	News	&	Events
----------------------------------------------------------------------------------------------------
Document 3:

Allama	Iqbal	Open	University,	the	largest	university	of	Asia	in	distance	education,	which
primarily	focuses	on	the	educational	needs	of	masses	by	providing	quality	education	at	their	doorsteps	all	over	the	country
----------------------------------------------------------------------------------------------------
Document 4:

Allama	Iqbal	Open	University,	the	largest	university	of	Asia	in	distance	education,	which
primarily	focuses	on	the	educational	needs	of	masses	by	providing	quality	education	at	their	doorstep

## Combining various techniques

In [55]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")
)

In [56]:
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Number of requested results 20 is greater than number of elements in index 18, updating n_results = 18


Document 1:

Financial	Support	Scheme	
Workshop	Schedule	
Financial	Support	Scheme	
Workshop	Schedule	
Whats	Happening	News	&	Events


# Other types of retrieval
It's worth noting that vectordb as not the only kind of tool to retrieve documents.

The LangChain retriever abstraction includes other ways to retrieve documents, such as TF-IDF or SVM.

In [1]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Load PDF
loader = PyPDFLoader("../docs/pdf/data.pdf")
pages = loader.load()
all_page_text=[p.page_content for p in pages]
joined_page_text=" ".join(all_page_text)

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)
splits = text_splitter.split_text(joined_page_text)


In [61]:
%pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m688.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [7]:
# Retrieve
svm_retriever = SVMRetriever.from_texts(splits,embedding)
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [8]:
question = "What are major topics for this class?"
docs_svm=svm_retriever.get_relevant_documents(question)
docs_svm[0]

  warn_deprecated(


Document(page_content="Email\t(AIOU\temployees\tonly)\nFAQ's\t\nFinancial\tSupport\tScheme\t\nICMAP\tStudy\tMaterial\t\nJamia\tNama\t\nJobs\t\nMIT\tOpen\tCourseware\t\nNews\t&\tViews\t\nOAS\t(For\nProgram\tCoordinators)\t\nOverseas\t\nPakistan\tCitizen's\tPortal\t\nPakistan\tInfographic\t\nQuality\tEnhancement\t\nRTI\t(Right\tTo\tInformation)\nRegional\tCampuses\t\nSwift\tCenter\t\nTender\tNotices\t\nHome\t\nApply\tOnline\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ

In [9]:
question = "what did they say about matlab?"
docs_tfidf=tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

Document(page_content="Email\t(AIOU\temployees\tonly)\nFAQ's\t\nFinancial\tSupport\tScheme\t\nICMAP\tStudy\tMaterial\t\nJamia\tNama\t\nJobs\t\nMIT\tOpen\tCourseware\t\nNews\t&\tViews\t\nOAS\t(For\nProgram\tCoordinators)\t\nOverseas\t\nPakistan\tCitizen's\tPortal\t\nPakistan\tInfographic\t\nQuality\tEnhancement\t\nRTI\t(Right\tTo\tInformation)\nRegional\tCampuses\t\nSwift\tCenter\t\nTender\tNotices\t\nHome\t\nApply\tOnline\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ