# **03. Vector Stores and Embeddings**

In [1]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [51]:
from langchain_community.document_loaders import PyPDFLoader

# Load PDF
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader("assets/1.Commerce-undergrad.pdf"),
    PyPDFLoader("assets/2.Commerce-postgrad.pdf"),
    PyPDFLoader("assets/3.Engineering-undergrad.pdf"),
    PyPDFLoader("assets/4.Engineering-postgrad.pdf"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [52]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [53]:
splits = text_splitter.split_documents(docs)

In [54]:
len(splits)

3152

## **Embeddings**

In [55]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [56]:
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

In [57]:
embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

In [58]:
import numpy as np

In [59]:
np.dot(embedding1, embedding2)

0.9630396460189721

In [60]:
np.dot(embedding1, embedding3)

0.7702742084408517

In [61]:
np.dot(embedding2, embedding3)

0.7590147680413902

## **Vector Stores**

In [62]:
! pip install chromadb # chromadb is lightweight & stored in memory



In [63]:
from langchain.vectorstores import Chroma

In [64]:
persist_directory = 'docs/chroma/'

In [65]:
!rm -rf .docs/chroma/  # remove old database files if any

In [66]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [67]:
print(vectordb._collection.count())

6087


## **Similarity Search**

In [80]:
question = "what 5 courses can i take if i am an engineering postgrad degree?"

In [81]:
# k is the number of documents to return
docs = vectordb.similarity_search(question, k=2)
len(docs)

2

In [82]:
# print the metadata of the documents that are returned
for doc in docs:
    print(doc.metadata)

{'page': 63, 'source': 'assets/3.Engineering-undergrad.pdf'}
{'page': 63, 'source': 'assets/3.Engineering-undergrad.pdf'}


In [83]:
print(docs[1].page_content)

PROGRAMMES OF STUDY    63 
even from within the EBE Faculty. We suggest that you choose something that interests you and you 
would like to explore and/or that you think may be useful once you graduate. Your choices will need 
to be checked and approved by a student advisor and must fit into your timetable.  
   
 
 
 
 
Bachelor of Science in Engineering in Mechanical Engineering 5 -
year curriculum  
BSc(Engineering)(Mechanical Engineering) [EB805MEC01]  
 
Programme Convener:  
Prof S Chung Kim Yuen, BSc(Eng) MSc(Eng) PhD Cape Town  
 
Students on the 5 -year curriculum take the same courses and credits as in the 4 -year curriculum, but 
the courses are spaced out over 5 years to allow more time for learning new concepts, grappling with 
assignments, asking questions, and obtaining feedback. The 5 -year curriculum is supported by 
ASPECT to ensure student success.  
  
All students are admitted into the 4 -year curriculum, and there are two opportunities in the first year 
to change

In [84]:
# persist the vector store to make sure we can use it in future sessions
vectordb.persist()