In [1]:
import os
import openai
import sys

from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv() into os.environ.
# The former sys.path.append('../.env') was removed: sys.path lists module
# import locations and is not used by load_dotenv/find_dotenv.
load_dotenv(find_dotenv())

# Raises KeyError right here if the key is not configured, instead of failing
# later on the first API call.
openai.api_key = os.environ['OPENAI_API_KEY']

In [9]:
from langchain.document_loaders import PyPDFLoader

# Load PDF
# The same file is listed twice on purpose to simulate messy, duplicated data.
pdf_paths = [
    "../docs/pdf/data.pdf",
    "../docs/pdf/data.pdf",
]
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Collect the documents from every loader into one flat list.
docs = []
for loader in loaders:
    docs += loader.load()

In [8]:
# Number of loaded Document objects (the same PDF is loaded twice above).
len(docs)

4

In [10]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

# chunk_size caps each chunk's length; chunk_overlap is how much consecutive
# chunks share so context is not cut off at the boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [11]:
# Break the loaded documents into overlapping chunks.
# NOTE: later cells rebind `docs` to search results; re-run the loading cell
# before re-running this one, otherwise the wrong objects get split.
splits = text_splitter.split_documents(docs)

In [7]:
# Number of chunks produced by the splitter.
len(splits)

18

# Embeddings

In [1]:
from langchain_openai import OpenAIEmbeddings
# Embedding client built with default settings (no model or key is passed explicitly here).
embedding = OpenAIEmbeddings()

In [11]:
# Two sentences with near-identical meaning and one unrelated sentence,
# used below to compare embedding similarity.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [12]:
# Embed each sentence with the same client, in order, so the vectors are comparable.
embedding1, embedding2, embedding3 = [
    embedding.embed_query(text) for text in (sentence1, sentence2, sentence3)
]

In [14]:
# Show the three raw embedding vectors, one per line.
print(embedding1, embedding2, embedding3, sep="\n")

[-0.027587823278315052, -0.005437984539486727, -0.02584463703150127, -0.03312054614004236, -0.027385715105887828, 0.02259826822546168, -0.010414908530055323, -0.008115922946421546, 0.0024931989346793915, -0.019667692274686416, 0.0006821165371333874, 0.02917942932713098, -0.0053748255027725785, 0.0007910656775592488, 0.00019648350800491784, 0.01405917931396018, 0.02991207284916351, -0.00123712570496483, 0.004187437661456227, -0.0039032222290731997, -0.011791773947175403, 0.0068780080619983905, 0.013010741074018187, -0.047192356981948604, -0.00238424979425353, 0.004730604539007597, 0.016939226358983508, -0.0002524383863307302, -0.025857269490769896, -0.01634553243832533, 0.026728862614176785, 0.0029937335905055634, -0.015448676259026321, -0.02475830327639853, 0.006003256823774343, -0.014968667952527817, 0.009707528995237476, -0.011532821570684495, -0.004481126506968159, -0.010806494278286276, -0.017103440878895114, 0.011122288996195736, 0.006915903577159136, -0.022017204901426998, -0.002

In [15]:
import numpy as np

In [16]:
# Dot product of the "dogs" and "canines" embeddings, used here as a similarity score.
np.dot(embedding1, embedding2)

0.9631227500523621

In [17]:
# Same score for the "dogs" sentence against the unrelated weather sentence.
np.dot(embedding1, embedding3)

0.7703257495981696

In [18]:
# Same score for the "canines" sentence against the unrelated weather sentence.
np.dot(embedding2, embedding3)

0.759162740110803

# Vector stores

In [19]:
%pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.2.1-py3-none-any.whl.metadata (4.3 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma-hnswlib-0.7.3.tar.gz (31 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.0-py3-none-any.whl.metadata (25 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.29.0-py3-none-any.whl.metadata (6.3 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.17.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting opentelemetry-api>=1.2.0

In [7]:
from langchain.vectorstores import Chroma

In [1]:
# Directory handed to Chroma as the on-disk location of the vector store.
persist_directory = '../docs/chroma'

In [2]:
!rm -rf ../docs/chrom

In [12]:
# Build the vector store from the chunks with the embedding client above,
# stored under persist_directory.
vectordb = Chroma.from_documents(documents=splits, embedding=embedding, persist_directory=persist_directory)

In [13]:
# Sanity check: number of stored entries (compare with len(splits)).
# NOTE(review): `_collection` is a private attribute and may change between versions.
print(vectordb._collection.count())

18


# Similarity search

In [14]:
# Query used for the first similarity search.
question = "is there an email i can ask for help"

In [15]:
# Retrieve 3 stored chunks for the question via similarity search.
# NOTE: this rebinds `docs`, which earlier held the loaded PDF documents.
docs = vectordb.similarity_search(question,k=3)

In [16]:
# Number of results returned (k=3 was requested).
len(docs)

3

In [17]:
# Raw text of the first result.
docs[0].page_content

"Email\t(AIOU\temployees\tonly)\nFAQ's\t\nFinancial\tSupport\tScheme\t\nICMAP\tStudy\tMaterial\t\nJamia\tNama\t\nJobs\t\nMIT\tOpen\tCourseware\t\nNews\t&\tViews\t\nOAS\t(For\nProgram\tCoordinators)\t\nOverseas\t\nPakistan\tCitizen's\tPortal\t\nPakistan\tInfographic\t\nQuality\tEnhancement\t\nRTI\t(Right\tTo\tInformation)\nRegional\tCampuses\t\nSwift\tCenter\t\nTender\tNotices\t\nHome\t\nApply\tOnline\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\

In [18]:
# Explicitly flush the store to persist_directory.
# NOTE(review): newer Chroma releases reportedly persist automatically, which
# would make this call redundant or deprecated — confirm for the installed version.
vectordb.persist()

# Failure Modes

In [19]:
# Query used to demonstrate the first failure mode.
question = "what did they say about matlab?"

In [20]:
# Retrieve 5 results this time; `docs` is rebound again.
docs = vectordb.similarity_search(question,k=5)

In [21]:
# Number of results returned (k=5 was requested).
len(docs)

5

In [22]:
# First result.
docs[0]

Document(page_content="Email\t(AIOU\temployees\tonly)\nFAQ's\t\nFinancial\tSupport\tScheme\t\nICMAP\tStudy\tMaterial\t\nJamia\tNama\t\nJobs\t\nMIT\tOpen\tCourseware\t\nNews\t&\tViews\t\nOAS\t(For\nProgram\tCoordinators)\t\nOverseas\t\nPakistan\tCitizen's\tPortal\t\nPakistan\tInfographic\t\nQuality\tEnhancement\t\nRTI\t(Right\tTo\tInformation)\nRegional\tCampuses\t\nSwift\tCenter\t\nTender\tNotices\t\nHome\t\nApply\tOnline\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ

In [23]:
# Second result; it appears to repeat the first one, which is expected because
# the same PDF was loaded twice.
docs[1]

Document(page_content="Email\t(AIOU\temployees\tonly)\nFAQ's\t\nFinancial\tSupport\tScheme\t\nICMAP\tStudy\tMaterial\t\nJamia\tNama\t\nJobs\t\nMIT\tOpen\tCourseware\t\nNews\t&\tViews\t\nOAS\t(For\nProgram\tCoordinators)\t\nOverseas\t\nPakistan\tCitizen's\tPortal\t\nPakistan\tInfographic\t\nQuality\tEnhancement\t\nRTI\t(Right\tTo\tInformation)\nRegional\tCampuses\t\nSwift\tCenter\t\nTender\tNotices\t\nHome\t\nApply\tOnline\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\t\nÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ\tÂ

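## Another failure mode

The next question refers to a specific part of the material ("the third lecture"); similarity search alone does not restrict results to that part.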

In [24]:
# Query that names a specific part of the material ("the third lecture").
question = "what did they say about regression in the third lecture?"

In [25]:
# Retrieve 5 results for the new question; `docs` is rebound again.
docs = vectordb.similarity_search(question,k=5)

In [26]:
# Show where each result came from (its metadata dict).
for doc in docs:
    print(doc.metadata)

{'page': 0, 'source': '../docs/pdf/data.pdf'}
{'page': 0, 'source': '../docs/pdf/data.pdf'}
{'page': 1, 'source': '../docs/pdf/data.pdf'}
{'page': 1, 'source': '../docs/pdf/data.pdf'}
{'page': 0, 'source': '../docs/pdf/data.pdf'}


In [27]:
# Print the text of the fifth result (index 4).
print(docs[4].page_content)

university	of	Asia	in	distance	education,	which	primarily	focuses	on	the	educational	needs	of	masses	by	providing	quality
education	at	their	doorsteps	all	over	the	country	and	by	blurring	spatial	and	temporal	boundaries.	Currently,	the	university
offers	unique	educational	opportunities	to	a	huge	proportion	of	1.4	million	students	in	a	diverse	and	dynamic	range	of	programs
starting	from	Matriculation	to	the	PhD	level.	
Prof.	Dr.	Nasir	Mahmood	
Faculty	of	Education	
"Dean	Message"	The	University
exclusively	focuses	on	enhancing	female	literacy	ratio	in	the	country	so	that	women	can	perform	their	vital	role	in	the	process
