#############################################################
## Vectorstores and Embeddings
#
### Recall the overall workflow for
####    Retrieval Augmented Generation (RAG):
#
##### 1. Load documents
##### 2. Split the documents into small,
#####   semantically meaningful chunks
##### 3. Create an index for each chunk by embeddings
#####    - The index is created by embeddings which are
#####      numerical representations of text.
#####    - Text with semantically similar content has similar
#####      vectors in this numeric space.
##### 4. Store these indexes in a vector store for
#####    easy retrieval when answering questions
##### 5. Search for the answer to a question.
#####    - The question and the relevant chunks should have similar indexes
##### 6. Edge Cases - Failure
######    - 2 types of failures in similarity search
######      + Diversity (Example)
######      + Specificity (Example)
######    - Solved by Advanced Retrieval
#############################################################





In [None]:
!pip install python-dotenv
!pip install OpenAI
!pip install langchain
!pip install langchain_community

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting OpenAI
  Downloading openai-1.14.0-py3-none-any.whl (257 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.5/257.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from OpenAI)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->OpenAI)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->OpenAI)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
!pip install PyPDF
!pip install pydub
!pip install ffmpeg

Collecting PyPDF
  Downloading pypdf-4.1.0-py3-none-any.whl (286 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/286.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m143.4/286.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.1/286.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF
Successfully installed PyPDF-4.1.0
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ffmpeg
  Building wheel for ffmpeg (setup.py) ... [?25l[?25hdone
  Created wheel for ffmpeg: filename=ffmpeg-1.4-py3-none-any.whl size=6082 sha256=5a6c2c847075331fbd3784ba6fdc8

In [None]:
# Read the OpenAI key from the environment and prompt for it (without
# echoing) only when it is missing. Never paste the literal key into this
# cell: both the cell source and its echoed output are saved with the notebook.
# NOTE(review): a literal key was previously stored here in plain text -
# treat that key as compromised and revoke/rotate it.
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")


In [None]:
import os
import openai
import sys
# Add the directory two levels up to the module search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Pick up settings from a local .env file; the return value is not needed.
_ = load_dotenv(find_dotenv()) # read local .env file

# Hand the key to the openai module. Raises KeyError if OPENAI_API_KEY is
# not present in the environment at this point.
openai.api_key  = os.environ['OPENAI_API_KEY']


In [None]:
from langchain_community.document_loaders import PyPDFLoader

#############################################################
#### 1. Load PDF
#
#### References of different loading:
##### - PDF
##### - Youtube
##### - URL
##### - Notion DB
#############################################################


In [None]:
# The first lecture is listed twice ON PURPOSE: the duplicate simulates messy
# real-world data and is what produces the repeated chunks discussed in the
# "Failure modes: Diversity" section of this notebook.
lecture_paths = [
    "/content/MachineLearning-Lecture01 (1).pdf",
    "/content/MachineLearning-Lecture01 (1).pdf",  # intentional duplicate
    "/content/MachineLearning-Lecture02.pdf",
    "/content/MachineLearning-Lecture03.pdf",
]
loaders = [PyPDFLoader(path) for path in lecture_paths]

# Flatten the documents returned by every loader into a single list.
docs = [document for loader in loaders for document in loader.load()]


#############################################################
#### 2. Split the content to create chunks
#
#### References
### - Document Splitting
#############################################################


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut the loaded documents into chunks: each chunk is capped
# at `chunk_size`, and neighbouring chunks share `chunk_overlap` so context is
# not lost at chunk boundaries.
# NOTE(review): sizes are presumably measured in characters (default length
# function) - confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)


In [None]:
splits = text_splitter.split_documents(docs)

In [None]:
len(splits)

152

#############################################################
#### 3. Create an index for each chunk by embeddings
#
#### Let's take our splits and embed them.
#############################################################


In [None]:
from langchain_community.embeddings.openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()


In [None]:
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [None]:
embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

In [None]:
import numpy as np

In [None]:
# numpy.dot(vector_a, vector_b, out = None)
# returns the dot product of vectors a and b.
# The dot product is used here as the similarity score between two sentence
# embeddings: the higher the value, the closer the sentences.
# NOTE(review): a dot product equals cosine similarity only for unit-length
# vectors - presumably true for these OpenAI embeddings; confirm before
# comparing scores produced by other embedding models.
np.dot(embedding1, embedding2)


0.9631227500523626

In [None]:
np.dot(embedding1, embedding3)

0.7703257495981698

In [None]:
np.dot(embedding2, embedding3)

0.759162740110803

#############################################################
##### 4. Vectorstores
#############################################################



In [None]:
! pip install chromadb

Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl (525 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.110.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.28.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2

In [None]:
from langchain_community.vectorstores import Chroma

In [None]:
persist_directory = 'docs/chroma/'

##### remove old database files if any

In [None]:
# Start from a clean slate: delete any previously persisted Chroma database.
# shutil.rmtree replaces the shell call to `rm -rf` so the cell also works
# outside IPython and on platforms without `rm`. ignore_errors=True keeps the
# "no failure if the directory does not exist" behaviour of `rm -rf`.
# The directory removed is the same `persist_directory` the store is built in.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [None]:
# Build the Chroma vector store from the chunks in `splits`, using `embedding`
# to vectorize them and `persist_directory` as the on-disk location.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)


In [None]:
print(vectordb._collection.count())

152


#############################################################
##### 5. Similarity Search
#############################################################


In [None]:
question = "is there an email i can ask for help"

In [None]:
docs = vectordb.similarity_search(question,k=3)

In [None]:
len(docs)

3

In [None]:
docs[0].page_content

"cs229-qa@cs.stanford.edu. This goes to an acc ount that's read by all the TAs and me. So \nrather than sending us email individually, if you send email to this account, it will \nactually let us get back to you maximally quickly with answers to your questions.  \nIf you're asking questions about homework probl ems, please say in the subject line which \nassignment and which question the email refers to, since that will also help us to route \nyour question to the appropriate TA or to me  appropriately and get the response back to \nyou quickly.  \nLet's see. Skipping ahead — let's see — for homework, one midterm, one open and term \nproject. Notice on the honor code. So one thi ng that I think will help you to succeed and \ndo well in this class and even help you to enjoy this cla ss more is if you form a study \ngroup.  \nSo start looking around where you' re sitting now or at the end of class today, mingle a \nlittle bit and get to know your classmates. I strongly encourage you to f

#### Let's save this so we can use it later!

In [None]:
# NOTE(review): with chromadb >= 0.4 (0.4.24 is the version installed by this
# notebook) data is presumably written to persist_directory automatically and
# LangChain marks this call as deprecated - confirm, then drop it.
vectordb.persist()

#############################################################
# 6. Edge Case - Failure modes
#
# This seems great, and basic similarity
# search will get you 80% of the way there
# very easily.
#
# But there are some failure modes that can creep up.
#
# Here are some edge cases that can arise - we'll fix
# them in the next class.
#############################################################


In [None]:
question = "what did they say about matlab?"

In [None]:
docs = vectordb.similarity_search(question,k=5)


#############################################################
# 6.1 Edge Case 1 - Failure modes: Diversity
#
# Notice that we're getting duplicate chunks
# (because of the duplicate
# `MachineLearning-Lecture01.pdf` in the index).
#
# Semantic search fetches all similar documents,
# but does not enforce diversity.
#
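# With the duplicate indexed, `docs[0]` and `docs[1]` are identical.
# (The outputs recorded below differ because they were captured from a
# run in which the first lecture was loaded only once.)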
#############################################################


In [None]:
docs[0]

Document(page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people call it a free ve rsion of MATLAB, which it sort  of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t s een MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to  learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your  own home computer or something if you \ndon\'t have a MATLAB license, for the purposes of  this class, there\'s also — [inaudible] \nwrite that down [inaudible] MATLAB — there\' s also a software package called Octave \nthat you can download for free off the Internet. And it has somewhat fewer features than MATLAB, but it\'s free, and for the purposes of  this class,

In [None]:
docs[1]

Document(page_content='into his office and he said, "Oh, professo r, professor, thank you so much for your \nmachine learning class. I learned so much from it. There\'s this stuff that I learned in your \nclass, and I now use every day. And it\'s help ed me make lots of money, and here\'s a \npicture of my big house."  \nSo my friend was very excited. He said, "W ow. That\'s great. I\'m glad to hear this \nmachine learning stuff was actually useful. So what was it that you learned? Was it \nlogistic regression? Was it the PCA? Was it the data ne tworks? What was it that you \nlearned that was so helpful?" And the student said, "Oh, it was the MATLAB."  \nSo for those of you that don\'t know MATLAB yet, I hope you do learn it. It\'s not hard, \nand we\'ll actually have a short MATLAB tutori al in one of the discussion sections for \nthose of you that don\'t know it.  \nOkay. The very last piece of logistical th ing is the discussion s ections. So discussion \nsections will be taught by 

#############################################################
# 6.2 Edge Case 2 - Failure modes: Specificity
#
# We can see a new failure mode.
#
# The question below asks specifically about
# the third lecture,
# but the results include chunks from other
# lectures as well.
#############################################################



In [None]:
# Build the query with implicit string concatenation. A backslash
# continuation inside the literal would embed the next line's leading
# indentation in the query text sent to the vector store.
question = (
    "what did they say about regression "
    "in the third lecture?"
)

In [None]:
docs = vectordb.similarity_search(question,k=5)

In [None]:
for doc in docs:
    print(doc.metadata)


{'page': 0, 'source': '/content/MachineLearning-Lecture03.pdf'}
{'page': 2, 'source': '/content/MachineLearning-Lecture02.pdf'}
{'page': 14, 'source': '/content/MachineLearning-Lecture03.pdf'}
{'page': 0, 'source': '/content/MachineLearning-Lecture02.pdf'}
{'page': 6, 'source': '/content/MachineLearning-Lecture03.pdf'}


In [None]:
print(docs[4].page_content)

data sets as well. So don’t want to talk about  that. If you’re interested, look up the work 
of Andrew Moore on KD-trees. He, sort of, fi gured out ways to fit these models much 
more efficiently. That’s not something I want  to go into today. Okay? Let me move one. 
Let’s take more questions later.  
So, okay. So that’s locally weighted regres sion. Remember the outline I had, I guess, at 
the beginning of this lecture. What I want to do now is talk about a probabilistic interpretation of linear regres sion, all right? And in partic ular of the – it’ll be this 
probabilistic interpretati on that let’s us move on to talk  about logistic regression, which 
will be our first classification algorithm. So le t’s put aside locally weighted regression for 
now. We’ll just talk about ordinary unwei ghted linear regression. Let’s ask the question 
of why least squares, right? Of all the thi ngs we could optimize how do we come up with 
this criteria for minimizing the square of  the area betw