In [None]:
# https://github.com/JTasnim/LangChainChat/blob/main/VectorstoresEmbedding.ipynb

In [1]:
#############################################################
# # Vectorstores and Embeddings
# 
# Recall the overall workflow for 
#    Retrieval Augmented Generation (RAG):
#
# 1. Load documents 
# 2. Split the documents into small, 
#    semantically meaningful chunks
# 3. Create an index for each chunk by embeddings
#    - The index is created by embeddings which are 
#      numerical representations of text.
#    - Text with semantically similar content has similar 
#      vectors in this numeric space.
# 4. Store these indexes in a vector store for 
#    easy retrieval when answering questions
# 5. Search for the answer to a question.
#    - The question and the relevant chunks should
#      have similar embeddings
# 6. Edge Cases - Failure
#    - 2 types of failures in similarity search
#      + Diversity (Example)
#      + Specificity (Example)
#    - Solved by Advanced Retrieval
#############################################################


In [2]:
import os
import openai
import sys
# Add the directory two levels up to the module search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv(). The result is
# bound to `_` because nothing below uses it.
_ = load_dotenv(find_dotenv()) # read local .env file

# Must run after load_dotenv: the key is read from the process environment,
# and a missing OPENAI_API_KEY raises KeyError here.
openai.api_key  = os.environ['OPENAI_API_KEY']


# We just discussed `Document Loading` and `Splitting`.


# In[ ]:


In [3]:
from langchain.document_loaders import PyPDFLoader

#############################################################
# 1. Load PDF
#
# References of different loading:
# - PDF
# - Youtube
# - URL
# - Notion DB
#############################################################
# The same lecture is loaded twice on purpose so the data is messy:
# every chunk of it ends up in the index as a duplicate.
lecture_pdf = "docs/cs229_lectures/MachineLearning-Lecture01.pdf"
loaders = [PyPDFLoader(lecture_pdf), PyPDFLoader(lecture_pdf)]

# Collect the documents returned by every loader into one flat list.
docs = []
for loader in loaders:
    docs += loader.load()

In [4]:
#############################################################
# 2. Split the content to create chunks
#
# References
# - Document Splitting
#############################################################


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured for chunks of size 1500 with an overlap of 150
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)


# In[ ]:


# Split the loaded documents (both copies of the lecture) into chunks.
splits = text_splitter.split_documents(docs)


# In[ ]:


# Number of chunks produced; the recorded run shows 114.
len(splits)

114

In [6]:
#############################################################
# 3. Create an index for each chunk by embeddings
# 
# Let's take our splits and embed them.
#############################################################


In [59]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model; the same object is later passed to Chroma.from_documents.
embedding = OpenAIEmbeddings()
# In[ ]:
# Two sentences with nearly the same meaning and one unrelated sentence.
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"
# In[ ]:
# Turn each sentence into its embedding vector.
embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

# In[ ]:
import numpy as np
# In[ ]:
# numpy.dot(vector_a, vector_b, out = None) 
# returns the dot product of vectors a and b.
# The dot product is read here as a similarity score.
# Pair 1-2: the two sentences with similar meaning.
np.dot(embedding1, embedding2)
# In[ ]:
# Pair 1-3: dog sentence vs. the unrelated weather sentence.
np.dot(embedding1, embedding3)
# In[ ]:
# Pair 2-3: canine sentence vs. the unrelated weather sentence.
np.dot(embedding2, embedding3)


0.7590147680413902

In [8]:
#############################################################
# 4. Vectorstores
#############################################################


In [9]:
from langchain.vectorstores import Chroma


# In[ ]:


# Directory in which the Chroma index is stored on disk.
persist_directory = 'docs/chroma/'


# In[ ]:


# Remove old database files, if any, so the index is rebuilt from scratch.
# shutil.rmtree replaces the former `rm -rf` shell call: it does not need
# IPython's get_ipython() or an `rm` binary, and it targets persist_directory
# instead of a second hard-coded copy of the path. ignore_errors=True keeps
# the `-f` behaviour of not failing when the directory does not exist.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)


# In[ ]:


# Build the Chroma vector store from the chunks, using `embedding` to embed
# them and persist_directory as the on-disk location.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

# In[ ]:


# Count the stored entries via the private `_collection` attribute; the
# recorded run prints 114, the same as len(splits).
print(vectordb._collection.count())

114


In [10]:
#############################################################
# 5. Similarity Search
#############################################################


In [60]:
# In[ ]:


question = "is there an email i can ask for help"


# In[ ]:


# Ask the vector store for the 3 results most similar to the question.
# NOTE: this rebinds `docs`, which until now held the loaded PDF documents.
docs = vectordb.similarity_search(question,k=3)


# In[ ]:


# Number of results returned.
len(docs)


# In[ ]:


# Text of the top-ranked result.
docs[0].page_content


# Let's save this so we can use it later!


# In[ ]:


# Persist the store; a later cell reopens it from persist_directory.
vectordb.persist()

In [12]:
#############################################################
# 6. Edge Case - Failure modes
# 
# This seems great, and basic similarity 
# search will get you 80% of the way there 
# very easily. 
# 
# But there are some failure modes that can creep up. 
# 
# Here are some edge cases that can arise - we'll fix 
# them in the next class.
#############################################################


In [13]:
question = "what did they say about matlab?"


# In[ ]:


# Request 5 results; `docs` is rebound to this new result list.
docs = vectordb.similarity_search(question,k=5)

In [14]:
#############################################################
# 6.1 Edge Case 1 - Failure modes: Diversity
# 
# Notice that we're getting duplicate chunks 
# (because of the duplicate 
# `MachineLearning-Lecture01.pdf` in the index).
# 
# Semantic search fetches all similar documents, 
# but does not enforce diversity.
# 
# `docs[0]` and `docs[1]` are identical.
#############################################################


In [15]:
# In[ ]:


# Display the first result of the matlab search ...
docs[0]


# In[ ]:


# ... and the second one, for comparison with the first.
docs[1]


Document(metadata={'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}, page_content='those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people call it a free version of MATLAB, which it sort of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t seen MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to \nwrite codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your own home computer or something if you \ndon\'t have a MATLAB license, for the purposes of this class, there\'s also — [inaudible] \nwrite that down [inaudible] MATLAB — there\' s also a software package called Octave \nthat you can download for free off the Internet. And it has somewhat 

In [16]:
#############################################################
# 6.2 Edge Case 2 - Failure modes: Specificity
#
# We can see a new failure mode.
# 
# The question below asks a question about 
# the third lecture, 
# but includes results from other lectures 
# as well.
#############################################################

In [17]:
# In[ ]:


# The backslash continues the string literal, so the leading spaces of the
# second line are part of the question text.
question = "what did they say about regression \
  in the third lecture?"


# In[ ]:


# Plain similarity search with 5 results and no metadata filter.
docs = vectordb.similarity_search(question,k=5)


# In[ ]:


# Print each result's metadata (page and source in the recorded output).
for doc in docs:
    print(doc.metadata)


# In[ ]:




{'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}
{'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}
{'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}
{'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}
{'page': 13, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}


In [18]:
# Print the text of the fifth (last) result of the search above.
print(docs[4].page_content)

he says it in sort of a really touching, sincere way, and then he has this — you can see it 
in his eyes — he has this deep appreciation of the truth and beauty in the universe as 
revealed to him by the math he does.  
In this class, I'm not gonna do any truth and beauty. In this class, I'm gonna talk about 
learning theory to try to convey to you an understanding of how and why learning 
algorithms work so that we can apply these learning algorithms as effectively as possible.  
So, for example, it turns out you can prove surprisingly deep theorems on when you can 
guarantee that a learning algorithm will work, all right? So think about a learning


In [19]:
#############################################################
# Retrieval
# 
#  - Retrieval is the centerpiece of our retrieval 
#    augmented generation (RAG) flow. 
#    + Let's get our vectorDB from before.
#  - Vectorstore Retrieval by Similarity Search
#    + Could have 2 types of Edge Failures
#      - Diversity
#        + Solved by Maximum Marginal Relevance
#      - Specificity
#        + Solved by working with metadata using
#          - Self-Query Retriever
#          - Compression
# - Traditional approaches that do not use a vectorstore
#   + SVM Retrieval
#   + TF-IDF Retrieval
#############################################################

#############################################################
# Vectorstore retrieval
# 
#############################################################


In [61]:
#############################################################
# Similarity Search
#############################################################

# In[ ]:


from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
# Same directory the index was persisted to earlier in the notebook.
persist_directory = 'docs/chroma/'

# In[ ]:


# Open the store from persist_directory instead of rebuilding it with
# from_documents; `embedding` is supplied as the embedding function.
embedding = OpenAIEmbeddings()
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)


# In[ ]:


# Entry count read via the private `_collection` attribute; the recorded
# run prints 114.
print(vectordb._collection.count())


114


In [21]:
# In[ ]:


# Three short texts about the same mushroom: the first two both describe its
# large fruiting body, the third is about how poisonous it is.
# The trailing backslashes continue each literal, so the indentation of the
# following line is part of the stored text (visible in the recorded output).
texts = [
    """The Amanita phalloides has a large and \
       imposing epigeous (aboveground) fruiting \
       body (basidiocarp).""",
    """A mushroom with a large fruiting body is \
       the Amanita phalloides. Some varieties are \
       all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of \
       the most poisonous of all known mushrooms.""",
]

# In[ ]:


# Small separate store built from the raw strings; no persist_directory is
# passed here.
smalldb = Chroma.from_texts(texts, embedding=embedding)


# In[ ]:


question = "Tell me about all-white mushrooms with \
       large fruiting bodies"


# In[ ]:


# Plain similarity search, top 2 results.
smalldb.similarity_search(question, k=2)


# In[ ]:


# Same question through maximum marginal relevance: 2 results are returned.
# NOTE(review): fetch_k=3 is presumably the size of the candidate pool the
# 2 results are picked from; confirm against the LangChain API docs.
smalldb.max_marginal_relevance_search(question,k=2, 
       fetch_k=3)

[Document(metadata={}, page_content='A mushroom with a large fruiting body is        the Amanita phalloides. Some varieties are        all-white.'),
 Document(metadata={}, page_content='A. phalloides, a.k.a Death Cap, is one of        the most poisonous of all known mushrooms.')]

In [22]:
#############################################################
# Addressing Diversity: Maximum marginal relevance
# 
# Last class we introduced one problem: how to enforce 
# diversity in the search results.
#  
# `Maximum marginal relevance` strives to achieve 
# both relevance to the query *and diversity* 
# among the results.
#############################################################


In [23]:
# In[ ]:


question = "what did they say about matlab?"
# Baseline for the MMR comparison: plain similarity search, 3 results.
docs_ss = vectordb.similarity_search(question,k=3)


# In[ ]:


# First 100 characters of the top result ...
docs_ss[0].page_content[:100]

# In[ ]:

# ... and of the second result.
docs_ss[1].page_content[:100]

# In[ ]:

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people c'

In [24]:
#############################################################
# Note the difference in results with `MMR`.
#############################################################
# Same `question` as the similarity search above, now through MMR, 3 results.
docs_mmr = vectordb.max_marginal_relevance_search(
              question,k=3)


# In[ ]:


# First 100 characters of the top MMR result ...
docs_mmr[0].page_content[:100]


# In[ ]:


# ... and of the second MMR result.
docs_mmr[1].page_content[:100]

'into his office and he said, "Oh, professor, professor, thank you so much for your \nmachine learning'

In [25]:

#############################################################
# ### Addressing Specificity: working with metadata
# 
# In the last lecture, we showed that a question about 
# the third lecture can include results from other 
# lectures as well.
# 
# To address this, many vectorstores support 
# operations on `metadata`.
# 
# `metadata` provides context for each embedded chunk.
#############################################################


In [26]:
# In[ ]:


# The backslash continues the string literal, so the indentation of the
# second line is part of the question text.
question = "what did they say about regression \
            in the third lecture?"


# In[ ]:


# Similarity search restricted by a metadata filter on `source`.
# NOTE(review): the question mentions the third lecture, but the filter names
# Lecture01.pdf, the only file loaded in this notebook; confirm this is
# intended.
docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"source":
     "docs/cs229_lectures/MachineLearning-Lecture01.pdf"}
)


# In[ ]:


# Print the metadata of each filtered result.
for d in docs:
    print(d.metadata)

{'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}
{'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}
{'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}


In [27]:
#############################################################
# Addressing Specificity: working with metadata 
#                     using Self-Query Retriever
# 
# But we have an interesting challenge: we often 
# want to infer the metadata from the query itself.
# 
# To address this, we can use `SelfQueryRetriever`, 
# which uses an LLM to extract:
#  
# 1. The `query` string to use for vector search
# 2. A metadata filter to pass in as well
# 
# Most vector databases support metadata filters, 
# so this doesn't require any new databases or indexes.
############################################################# 


In [49]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Description of the metadata fields (`source`, `page`) that is handed to
# SelfQueryRetriever.from_llm below.
metadata_field_info = [

 # `source`: path of the lecture PDF. The description string is continued
 # with backslashes, so the next lines' indentation is part of its text.
 AttributeInfo(
   name="source",
   description="The lecture the chunk is from, should \
      be one of \
      `docs/cs229_lectures/MachineLearning-Lecture01.pdf`",
   type="string",
   ),

 # `page`: page number within the lecture.
 AttributeInfo(
   name="page",
   description="The page from the lecture",
   type="integer",
 ),

]


# Short description of what the stored documents contain.
document_content_description = "Lecture notes"
# LLM created with temperature=0.
llm = OpenAI(temperature=0)
# Self-query retriever over `vectordb`, built from the LLM, the content
# description and the metadata field descriptions.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)
question = "what did they say about regression in the third lecture?"


In [50]:
#############################################################
# You will receive a warning about predict_and_parse 
# being deprecated the first time you execute the 
# next line. This can be safely ignored.
#############################################################

# In[ ]:


# Run the self-query retriever on the question; `docs` is rebound to the
# returned documents.
docs = retriever.get_relevant_documents(question)


# In[ ]:


# Print the metadata of each returned document.
for d in docs:
    print(d.metadata)


# In[ ]:

In [None]:
#############################################################
# Additional tricks: compression
# 
# Another approach for improving the quality of 
# retrieved docs is compression.
# 
# Information most relevant to a query may be 
# buried in a document with a lot of irrelevant text. 
# 
# Passing that full document through your application 
# can lead to more expensive LLM calls and poorer 
# responses.
# 
# Contextual compression is meant to fix this. 
#############################################################


In [51]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor


# In[ ]:


def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Each entry is rendered as "Document <n>:" (numbering starts at 1),
    a blank line, and then the document's ``page_content``. Entries are
    separated by a line of 100 dashes.

    Args:
        docs: iterable of objects exposing a string ``page_content``.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, document in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + document.page_content)
    print(divider.join(sections))


In [53]:
#############################################################
# Wrap our vectorstore 
#############################################################
# LLM (temperature=0) that the extractor-based compressor is built from.
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)


# In[ ]:


# Wrap the vector store's default retriever so that its results are passed
# through `compressor`.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)


# In[ ]:


question = "what did they say about matlab?"
# Retrieve through the compression retriever and print the compressed texts.
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Document 1:

- those homeworks will be done in either MATLAB or in Octave
- I know some people call it a free version of MATLAB
- MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data
- it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms
- there's also a software package called Octave that you can download for free off the Internet
- it has somewhat fewer features than MATLAB, but it's free, and for the purposes of this class, it will work for just about everything
- once a colleague of mine at a different university, not at Stanford, actually teaches another machine learning course
----------------------------------------------------------------------------------------------------
Document 2:

- those homeworks will be done in either MATLAB or in Octave
- I know some people call it a free version of MATLAB
- MATLAB is

In [55]:
#############################################################
# Combining various techniques
#############################################################
# Same compressor as before, but the base retriever is created with
# search_type="mmr". This rebinds `compression_retriever`.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(
        search_type = "mmr")
)

# In[ ]:


question = "what did they say about matlab?"
# Retrieve through the MMR-backed compression retriever and print the texts.
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

# In[ ]:

Document 1:

- those homeworks will be done in either MATLAB or in Octave
- I know some people call it a free version of MATLAB
- MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data
- it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms
- there's also a software package called Octave that you can download for free off the Internet
- it has somewhat fewer features than MATLAB, but it's free, and for the purposes of this class, it will work for just about everything
- once a colleague of mine at a different university, not at Stanford, actually teaches another machine learning course
----------------------------------------------------------------------------------------------------
Document 2:

"Oh, it was the MATLAB."
----------------------------------------------------------------------------------------------------


In [58]:
#############################################################
# Other types of retrieval
# 
# Traditional approaches that do not use a vectorstore.
# It's worth noting that a vectordb is not the only 
#    kind of tool to retrieve documents. 
# 
# The `LangChain` retriever abstraction includes 
#    other ways to retrieve documents, such as 
#     - TF-IDF 
#     - SVM
#############################################################

from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# In[ ]:


#############################################################
# Load PDF
#############################################################
loader = PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture01.pdf")
pages = loader.load()

# Gather the text of every page, then glue the pages together with a
# single space so the splitter receives one continuous string.
all_page_text = []
for page in pages:
    all_page_text.append(page.page_content)
joined_page_text = " ".join(all_page_text)

#############################################################
# Split
#############################################################
splitter_options = {"chunk_size": 1500, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
# split_text is applied to the joined string rather than to documents.
splits = text_splitter.split_text(joined_page_text)


# In[ ]:


#############################################################
# Retrieve
#############################################################

#############################################################
# SVM Retriever
#############################################################
# Built from the text chunks plus the embedding model.
svm_retriever = SVMRetriever.from_texts(splits,embedding)

#############################################################
# TFIDF Retriever
#############################################################
# Built from the text chunks alone; no embedding model is passed.
tfidf_retriever = TFIDFRetriever.from_texts(splits)

# In[ ]:


#############################################################
# Retrieve with SVM Retriever
#############################################################
question = "What are major topics for this class?"
docs_svm=svm_retriever.get_relevant_documents(question)
# Display the first result returned by the SVM retriever.
docs_svm[0]


# In[ ]:


#############################################################
# Retrieve with TFIDF Retriever
#############################################################
question = "what did they say about matlab?"
docs_tfidf=tfidf_retriever.get_relevant_documents(question)
# Display the first result returned by the TF-IDF retriever.
docs_tfidf[0]


# In[ ]:
    


Document(metadata={}, page_content="yourselves. You can also come and talk to me or the TAs if you want to brainstorm ideas \nwith us.  \nOkay. So one more organizational question. I'm curious, how many of you know \nMATLAB? Wow, cool, quite a lot. Okay. So as part of the — act ually how many of you \nknow Octave or have used Octave? Oh, okay, much smaller number.  \nSo as part of this class, especially in the homeworks, we'll ask you to implement a few \nprograms, a few machine learning algorithms as part of the homeworks. And most of  those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people call it a free version of MATLAB, which it sort of is, sort of isn't.  \nSo I guess for those of you that haven't seen MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to \nwrite codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it's 