In [1]:
#############################################################
# # Vectorstores and Embeddings
# 
# Recall the overall workflow for 
#    Retrieval Augmented Generation (RAG):
#
# 1. Load documents 
# 2. Split the documents into small, 
#    semantically meaningful chunks
# 3. Create an index for each chunk by embeddings
#    - The index is created by embeddings which are 
#      numerical representations of text.
#    - Text with semantically similar content has similar 
#      vectors in this numeric space.
# 4. Store these index in a vector stores for 
#    easy retrieval when answering questions
# 5. Search answer of a question. 
#    - Both should have similar index
# 6. Edge Cases - Failure
#    - 2 types of failures in similarity search
#      + Diversity (Example)
#      + Specifity (Example)
#    - Solved by Advanced Retrieval
#############################################################


In [2]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']


# We just discussed `Document Loading` and `Splitting`.


# In[ ]:


In [3]:
from langchain.document_loaders import PyPDFLoader

#############################################################
# 1. Load PDF
#
# References of different loading:
# - PDF
# - Youtube
# - URL
# - Notion DB
#############################################################
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader(
      "docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf"),
    PyPDFLoader(
      "docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [4]:
#############################################################
# 2. Split the content to create chunks
#
# References
# - Document Splitting
#############################################################


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)


# In[ ]:


splits = text_splitter.split_documents(docs)


# In[ ]:


len(splits)

870

In [6]:
#############################################################
# 3. Create an index for each chunk by embeddings
# 
# Let's take our splits and embed them.
#############################################################


In [7]:
#############################################################
# 4. Vectorstores
#############################################################


In [46]:
from langchain.embeddings.openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()
# In[ ]:
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"
# In[ ]:
embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

# In[ ]:
import numpy as np



In [9]:
# In[ ]:
# numpy.dot(vector_a, vector_b, out = None) 
# returns the dot product of vectors a and b.
np.dot(embedding1, embedding2)


0.9630396460189721

In [10]:
# In[ ]:
np.dot(embedding1, embedding3)


0.7702742084408517

In [11]:
# In[ ]:
np.dot(embedding2, embedding3)

0.7590147680413902

In [12]:
from langchain.vectorstores import Chroma


# In[ ]:


persist_directory = 'docs/chroma/'


# In[ ]:


# remove old database files if any

get_ipython().system('rm -rf ./docs/chroma')  


# In[ ]:


vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

# In[ ]:


print(vectordb._collection.count())

870


In [13]:
#############################################################
# 5. Similarity Search
#############################################################


In [14]:
# In[ ]:


question = "is there an email i can ask for help"


# In[ ]:


docs = vectordb.similarity_search(question,k=3)


# In[ ]:


len(docs)


3

In [15]:
# In[ ]:


docs[0].page_content


'Library users can find help by using Ask-a-Librarian on the library website. To access the library \ncatalog, library patrons have two options: \n1) Using the computer in the library lobby whose home page is the catalog \n2) Access the catalog from the library’s website \nTo access the library’s electronic collection, library users have three options: \n1) Using the computer in the library lobby \n2) Access the e-library via the link on the student/faculty portal: \na. Go to: https://my.sfbu.edu/ \nb. Click e-Services tab, top right \nc. Select eLibrary > ProQuest or O’Reilly \n3) 24/7 access from anywhere is provided via EZProxy: \na. Go to: https://elib.sfbu.edu/login \nb. Enter your on-campus computer login information \nc. Click on “ProQuest Digital Library” or “O’Reilly for Higher Education \nMySFBU portal for Faculty and Students \nFaculty members use the Canvas LMS and MySFBU faculty portal as tools to help them manage their \nc\nourses online, including maintaining their stude

In [47]:
# Let's save this so we can use it later!


# In[ ]:


vectordb.persist()

In [17]:
#############################################################
# 6. Edge Case - Failure modes
# 
# This seems great, and basic similarity 
# search will get you 80% of the way there 
# very easily. 
# 
# But there are some failure modes that can creep up. 
# 
# Here are some edge cases that can arise - we'll fix 
# them in the next class.
#############################################################


In [18]:
question = "what did they say about departments?"


In [19]:
# In[ ]:

docs = vectordb.similarity_search(question,k=5)

In [20]:
#############################################################
# 6.1 Edge Case 1 - Failure modes: Diversity
# 
# Notice that we're getting duplicate chunks 
# (because of the duplicate 
# `MachineLearning-Lecture01.pdf` in the index).
# 
# Semantic search fetches all similar documents, 
# but does not enforce diversity.
# 
# `docs[0]` and `docs[1]` are indentical.
#############################################################


In [21]:
# In[ ]:


docs[0]


Document(metadata={'page': 24, 'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf'}, page_content="the preceding semester. \nMany degree program classes, especially graduate courses, are conducted on weekday evenings and \non Saturdays to allow both non-working students and working professionals to pursue their studies \nduring after-work hours. A number of degree courses are conducted on weekdays in the daytime. \nSince the Learning Resource Center is open during the day and on Saturday, full-time students may \nuse weekdays’ daytime to study, conduct research, do homework, practice hands-on exercises in \nthe labs or work on projects in the practicum labs, or engage in extracurricular activities. \nAdministrative personnel are available during office hours to assist students, faculty, and \nprospective applicants. \nAddress of Instruction \nThe address where the class sessions will be held is as follows: \nMain Campus: 161 Mission Falls Lane, Fremont, CA 94539 \n \n

In [22]:

# In[ ]:


docs[1]

Document(metadata={'page': 24, 'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf'}, page_content="the preceding semester. \nMany degree program classes, especially graduate courses, are conducted on weekday evenings and \non Saturdays to allow both non-working students and working professionals to pursue their studies \nduring after-work hours. A number of degree courses are conducted on weekdays in the daytime. \nSince the Learning Resource Center is open during the day and on Saturday, full-time students may \nuse weekdays’ daytime to study, conduct research, do homework, practice hands-on exercises in \nthe labs or work on projects in the practicum labs, or engage in extracurricular activities. \nAdministrative personnel are available during office hours to assist students, faculty, and \nprospective applicants. \nAddress of Instruction \nThe address where the class sessions will be held is as follows: \nMain Campus: 161 Mission Falls Lane, Fremont, CA 94539 \n \n

In [23]:
#############################################################
# 6.2 Edge Case 2 - Failure modes: Specifity
#
# We can see a new failure mode.
# 
# The question below asks a question about 
# the third lecture, 
# but includes results from other lectures 
# as well.
#############################################################

In [24]:
# In[ ]:


question = "what did they say about scholarship \
  for MSEE?"


In [25]:
# In[ ]:


docs = vectordb.similarity_search(question,k=5)


In [26]:
# In[ ]:


for doc in docs:
    print(doc.metadata)

{'page': 21, 'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf'}
{'page': 21, 'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf'}
{'page': 22, 'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf'}
{'page': 22, 'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf'}
{'page': 20, 'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf'}


In [27]:
print(docs[4].page_content)

required for completion. 
• Students are not eligible to receive any other SFBU academic scholarship unless 
students apply for, and are awarded, the Startup Scholars Scholarship, in which case 
the Startup Scholars Scholarship would replace this scholarship. 
• If students are unable to meet any of the terms, the tuition scholarship will be 
rescinded. 
• The university reserves the right to rescind a scholarship if it deems the decision to be 
in the best interest of the university.


In [28]:
#############################################################
# Retrieval
# 
#  - Retrieval is the centerpiece of our retrieval 
#    augmented generation (RAG) flow. 
#    + Let's get our vectorDB from before.
#  - Vectorstore Retrieval by Similarity Search
#    + Could have 2 types of Edge Failures
#      - Diversity
#        + Solved by Maximum Marginal Relevance
#      - Specifity 
#        + Solved by working with metadata using
#          - Self-Query Retriever
#          - Compression
# - Traditional approaches which does not use Vectorstore
#   + SVM Retrieval
#   + TF-IDF Retrieval
#############################################################

#############################################################
# Vectorstore retrieval
# 
#############################################################


In [48]:
#############################################################
# Similarity Search
#############################################################

# In[ ]:


from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
persist_directory = 'docs/chroma/'

# In[ ]:


embedding = OpenAIEmbeddings()
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)


# In[ ]:


print(vectordb._collection.count())


870


In [30]:
# In[ ]:


texts = [
    """The Amanita phalloides has a large and \
       imposing epigeous (aboveground) fruiting \
       body (basidiocarp).""",
    """A mushroom with a large fruiting body is \
       the Amanita phalloides. Some varieties are \
       all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of \
       the most poisonous of all known mushrooms.""",
]

# In[ ]:


smalldb = Chroma.from_texts(texts, embedding=embedding)


# In[ ]:


question = "Tell me about all-white mushrooms with \
       large fruiting bodies"


# In[ ]:


smalldb.similarity_search(question, k=2)


# In[ ]:


smalldb.max_marginal_relevance_search(question,k=2, 
       fetch_k=3)

[Document(metadata={}, page_content='A mushroom with a large fruiting body is        the Amanita phalloides. Some varieties are        all-white.'),
 Document(metadata={}, page_content='A. phalloides, a.k.a Death Cap, is one of        the most poisonous of all known mushrooms.')]

In [31]:
#############################################################
# Addressing Diversity: Maximum marginal relevance
# 
# Last class we introduced one problem: how to enforce 
# diversity in the search results.
#  
# `Maximum marginal relevance` strives to achieve 
# both relevance to the query *and diversity* 
# among the results.
#############################################################


In [32]:
# In[ ]:


question = "what did they say about matlab?"
docs_ss = vectordb.similarity_search(question,k=3)


# In[ ]:


docs_ss[0].page_content[:100]

# In[ ]:

docs_ss[1].page_content[:100]

# In[ ]:

'such as n-grams, Hidden Markov Models, text classifiers, and recurrent neural networks. \nPractical a'

In [33]:
#############################################################
# Note the difference in results with `MMR`.
#############################################################
docs_mmr = vectordb.max_marginal_relevance_search(
              question,k=3)


# In[ ]:


docs_mmr[0].page_content[:100]


# In[ ]:


docs_mmr[1].page_content[:100]

'program in C with UNIX/Linux system calls and other advanced topics such as the UNIX file \nsystem, p'

In [34]:

#############################################################
# ### Addressing Specificity: working with metadata
# 
# In last lecture, we showed that a question about 
# the third lecture can include results from other 
# lectures as well.
# 
# To address this, many vectorstores support 
# operations on `metadata`.
# 
# `metadata` provides context for each embedded chunk.
#############################################################


In [35]:
# In[ ]:


question = "what did they say about CPT \
            in the third trimester?"


# In[ ]:


docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"source":
     "docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf"}
)


# In[ ]:


for d in docs:
    print(d.metadata)

{'page': 22, 'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf'}
{'page': 22, 'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf'}
{'page': 169, 'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf'}


In [36]:
#############################################################
# Addressing Specificity: working with metadata 
#                     using Self-Query Retriever
# 
# But we have an interesting challenge: we often 
# want to infer the metadata from the query itself.
# 
# To address this, we can use `SelfQueryRetriever`, 
# which uses an LLM to extract:
#  
# 1. The `query` string to use for vector search
# 2. A metadata filter to pass in as well
# 
# Most vector databases support metadata filters, 
# so this doesn't require any new databases or indexes.
############################################################# 


In [49]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

metadata_field_info = [

 AttributeInfo(
   name="source",
   description="The lecture the chunk is from, should \
      be one of \
      `sfbu-2024-2025-university-catalog.pdf`",
   type="string",
   ),

 AttributeInfo(
   name="page",
   description="The page from the lecture",
   type="integer",
 ),

]


document_content_description = "Lecture notes"
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

question = "what did they say about IEEE?"


In [50]:
#############################################################
# You will receive a warning about predict_and_parse 
# being deprecated the first time you executing the 
# next line. This can be safely ignored.
#############################################################

# In[ ]:


docs = retriever.get_relevant_documents(question)


# In[ ]:


for d in docs:
    print(d.metadata)


# In[ ]:

In [39]:
#############################################################
# Additional tricks: compression
# 
# Another approach for improving the quality of 
# retrieved docs is compression.
# 
# Information most relevant to a query may be 
# buried in a document with a lot of irrelevant text. 
# 
# Passing that full document through your application 
# can lead to more expensive LLM calls and poorer 
# responses.
# 
# Contextual compression is meant to fix this. 
#############################################################


In [40]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor


# In[ ]:


def pretty_print_docs(docs):
  print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" 
   + d.page_content for i, d in enumerate(docs)]))


In [41]:
#############################################################
# Wrap our vectorstore 
#############################################################
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)


# In[ ]:


compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)


# In[ ]:


question = "what did they say about CPT?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Document 1:

CPT Curricular Practicum
----------------------------------------------------------------------------------------------------
Document 2:

CPT Curricular Practicum
----------------------------------------------------------------------------------------------------
Document 3:

- "Details of the qualifications are specified in the application process for the student."
- "The supervising staff is responsible for checking the students’ qualifications."
- "F-1 International students must observe additional rules required by the U.S. Immigration & Customs Enforcement on Curricular Practical Training (CPT)."
----------------------------------------------------------------------------------------------------
Document 4:

- "Details of the qualifications are specified in the application process for the student."
- "The supervising staff is responsible for checking the students’ qualifications."
- "F-1 International students must observe additional rules required by the U.S. Immigr

In [42]:
#############################################################
# Combining various techniques
#############################################################
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(
        search_type = "mmr")
)

# In[ ]:


question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

# In[ ]:

Document 1:

Typical control cases include service and product design choices, sales forecasting, scheduling, metrics for production/inventory control, statistical quality control, and logistical constraints. MGT460L Production and Operations Management Lab (1 credit hour) Designed to be taken with MGT460, during this hands-on lab course students will learn software-based techniques to solve various time, labor, material, forecasting, capacity, take control of the conversion process from input to outputs, and costs optimizations in classic production planning and operations scenarios. Students will be expected to develop their own mathematical models, transform their models into software-based implementations and then determine the optimized best fit business solution. Students should be comfortable with or refresh themselves on solving multivariate simultaneous equations before the first-class meeting. Students should be comfortable installing software on their machines and/or using c

In [43]:
#############################################################
# Other types of retrieval
# 
# Traditional approaches which does not use Vectorstore
# It's worth noting that vectordb as not the only 
#    kind of tool to retrieve documents. 
# 
# The `LangChain` retriever abstraction includes 
#    other ways to retrieve documents, such as 
#     - TF-IDF 
#     - SVM
#############################################################

from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# In[ ]:


#############################################################
# Load PDF
#############################################################
loader = PyPDFLoader(
  "docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf")
pages = loader.load()
all_page_text=[p.page_content for p in pages]
joined_page_text=" ".join(all_page_text)

    


In [44]:
#############################################################
# Split
#############################################################
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size = 1500,chunk_overlap = 150)
splits = text_splitter.split_text(joined_page_text)


# In[ ]:


#############################################################
# Retrieve
#############################################################

#############################################################
# SVM Retriever
#############################################################
svm_retriever = SVMRetriever.from_texts(splits,embedding)

#############################################################
# TFIDF Retriever
#############################################################
tfidf_retriever = TFIDFRetriever.from_texts(splits)

# In[ ]:


#############################################################
# Retrieve with SVM Retriever
#############################################################
question = "What are major topics for genAI class?"
docs_svm=svm_retriever.get_relevant_documents(question)
docs_svm[0]




Document(metadata={}, page_content='named entity recognition, part-of-speech tagging, text classification, machine translation, \nsentiment analysis, and language models. It also covers different models and algorithms, \nsuch as n-grams, Hidden Markov Models, text classifiers, and recurrent neural networks. \nPractical assignments and projects allow students to apply their knowledge to real-world   \n \n2024 – 2025 University Catalog  154 \napplications and use cases such as sentiment analysis, chatbot development, and search \nengine relevance. \nPrerequisite: DS500 \nDS565 Generative AI-Driven Intelligent Apps Development (3 credit hours) \nIn the fast-changing world of technology, the demand for intelligent applications powered \nby AI and ML is rapidly increasing. This course aims to provide students with the necessary \nexpertise to develop cutting-edge applications and harness the potential of generative AI \ntechnology. Intelligent apps using generative AI technology stand apart

In [45]:
# In[ ]:


#############################################################
# Retrieve with TFIDF Retriever
#############################################################
question = "what did they say about graduation?"
docs_tfidf=tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

Document(metadata={}, page_content='o NC = (No credit) The student did not pass a challenge examination. Prior to May \n1998 the grade NC might also be issued to a student taking an ESL course. \no U = (Unauthorized withdraw) The student did not withdraw from the course \nbut failed to meet attendance and course requirements. “U” grade equals “F” \ngrade. \no * = Co urse  has been repeated. \nGrade Point Average (GPA and CGPA) \nThe grade point average (GPA) is based on courses in which letter grades are earned. \nInstructors may add plus (+) or minus (-) options to letter grades in order to refine \nevaluation procedures. GPA may be calculated either based on semester, or cumulatively \n(CGPA). CGPA is calculated based on all courses and grades earned to meet a degree \nprogram’s graduation requirements. To compute the GPA or CGPA, divide the total \nnumber of grade points by the total number of credit hours attempted in courses \nreceiving letter grades. Use the following table for g