In [1]:
import os
from urllib.parse import quote

from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
# Database password; indexing os.environ raises KeyError when the variable is
# missing, so a misconfigured environment fails here rather than at connect time.
PG_VECTOR_PWD = os.environ["PG_VECTOR_PWD"]

In [2]:
# Hugging Face embedding model handed to the vector store below.
model_embedding = HuggingFaceEmbeddings(model_name='multi-qa-mpnet-base-dot-v1')

# Percent-encode the password before embedding it in the URL: characters such as
# "@", "/", ":" or "%" would otherwise be read as URL delimiters and produce a
# malformed (or silently wrong) connection string.
encoded_password = quote(PG_VECTOR_PWD, safe="")
# NOTE: the server is addressed on port 5431, not PostgreSQL's default 5432.
connection = f"postgresql+psycopg://vector_user:{encoded_password}@localhost:5431/vector_db"
collection_name = "udlbook"

# Vector store for the "udlbook" collection, using the embedding model above.
vector_store = PGVector(
    embeddings=model_embedding,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Location of the book PDF, relative to the notebook's working directory.
pdf_path = "../files/UnderstandingDeepLearning_08_05_24_C.pdf"
loader = PyMuPDFLoader(pdf_path)

In [4]:
# Read the PDF through the loader; the result is a list of Document objects.
data = loader.load()

In [5]:
# Peek at the first three loaded documents (metadata + page text).
data[:3]

[Document(metadata={'source': 'UnderstandingDeepLearning_08_05_24_C.pdf', 'file_path': 'UnderstandingDeepLearning_08_05_24_C.pdf', 'page': 0, 'total_pages': 541, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref', 'producer': 'xdvipdfmx (20200315)', 'creationDate': 'D:20240805224447Z', 'modDate': "D:20240805184627-04'00'", 'trapped': ''}, page_content='Understanding Deep Learning\nSimon J.D. Prince\nAugust 5, 2024\nIf you enjoy this book, here are four ways you can help me:\n1. Spread the word via social media. Posts in languages other than English par-\nticularly welcome. Tag me on LinkedIn or X and I’ll probably say hi.\n2. Write me an Amazon review.\nPreferably positive, but all publicity is good\npublicity...\n3. Send me comments (see bottom of this page). I reply to everything eventually.\n4. Buy a copy. I took 18 months completely off work to write this book and ideally\nI’d like to make minimum wage (or better) for thi

In [6]:
# Spot-check a single document (index 51) to see what the extracted text looks like.
print(data[51])

page_content='38
3
Shallow neural networks
functions, which saturate (become close to zero) for large positive and large negative inputs.
However, the ReLU function has the disadvantage that its derivative is zero for negative inputs.
If all the training examples produce negative inputs to a given ReLU function, then we cannot
improve the parameters feeding into this ReLU during training. The gradient with respect to
the incoming weights is locally flat, so we cannot “walk downhill.” This is known as the dying
ReLU problem.
Many variations on the ReLU have been proposed to resolve this problem
(figure 3.13b), including (i) the leaky ReLU (Maas et al., 2013), which also has a linear output
for negative values with a smaller slope of 0.1, (ii) the parametric ReLU (He et al., 2015), which
treats the slope of the negative portion as an unknown parameter, and (iii) the concatenated
ReLU (Shang et al., 2016), which produces two outputs, one of which clips below zero (i.e., like
a typical ReL

In [19]:
# Number of Document objects produced by the loader.
len(data)

541

In [23]:
# Work on the tail of the book only: every document from index 500 onwards.
start_index = 500
documents = data[start_index:]

In [24]:
def clean_text(text):
    """Return ``text`` with every NUL character (U+0000) removed.

    All other characters are kept in their original order.
    (Presumably needed because the database rejects NUL bytes in text
    values - confirm against the storage backend.)
    """
    nul_free_pieces = text.split('\x00')
    return ''.join(nul_free_pieces)

# Strip NUL characters from every document before it is handed to the vector store.
for doc in documents:
    doc.page_content = clean_text(doc.page_content)
    # Clean string metadata values too. The attribute is read directly because a
    # `"metadata" in doc` membership test on a Document (which is not a dict) does
    # not check for the attribute, so guarding with it skipped this step entirely.
    metadata = getattr(doc, "metadata", None) or {}
    for key, value in metadata.items():
        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        if isinstance(value, str):
            metadata[key] = clean_text(value)

In [25]:
# Use each document's page number as its id in the store.
page_ids = [page.metadata["page"] for page in documents]
vector_store.add_documents(documents, ids=page_ids)

[500,
 501,
 502,
 503,
 504,
 505,
 506,
 507,
 508,
 509,
 510,
 511,
 512,
 513,
 514,
 515,
 516,
 517,
 518,
 519,
 520,
 521,
 522,
 523,
 524,
 525,
 526,
 527,
 528,
 529,
 530,
 531,
 532,
 533,
 534,
 535,
 536,
 537,
 538,
 539,
 540]

In [26]:
# Retrieve the two closest matches for a free-text question, with their scores.
query = "What is Machine Learning?"
similar = vector_store.similarity_search_with_score(query, k=2)

# Each hit is printed whole (it appears to be a (Document, score) pair),
# followed by a blank line to separate results.
for hit in similar:
    print(hit, end="\n\n")

(Document(metadata={'page': 15, 'title': '', 'author': '', 'format': 'PDF 1.5', 'source': 'UnderstandingDeepLearning_08_05_24_C.pdf', 'creator': 'LaTeX with hyperref', 'modDate': "D:20240805184627-04'00'", 'subject': '', 'trapped': '', 'keywords': '', 'producer': 'xdvipdfmx (20200315)', 'file_path': 'UnderstandingDeepLearning_08_05_24_C.pdf', 'total_pages': 541, 'creationDate': 'D:20240805224447Z'}, page_content='2\n1\nIntroduction\nFigure 1.1 Machine learning is an area\nof artificial intelligence that fits math-\nematical models to observed data.\nIt\ncan coarsely be divided into supervised\nlearning, unsupervised learning, and re-\ninforcement learning. Deep neural net-\nworks contribute to each of these areas.\n1.1.1\nRegression and classification problems\nFigure 1.2 depicts several regression and classification problems. In each case, there is a\nmeaningful real-world input (a sentence, a sound file, an image, etc.), and this is encoded\nas a vector of numbers. This vector forms 