In [1]:
import os
import requests
import shutil
from time import sleep
#import logging
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from dotenv import load_dotenv, find_dotenv
import warnings
import sys
# Silence every warning category for the rest of the kernel session.
warnings.filterwarnings("ignore")
# Add '../..' (resolved against the working directory) to the module search path.
sys.path.append('../..')
# Locate a .env file and load its variables into the process environment;
# the result is bound to `_` so it is not treated as a value to display.
dotenv_file = find_dotenv()
_ = load_dotenv(dotenv_file)
# Database password; a missing variable raises KeyError right here.
PG_VECTOR_PWD = os.environ["PG_VECTOR_PWD"]

In [2]:
# Read the source PDF into a list of LangChain documents.
pdf_path = "../files/UnderstandingDeepLearning_08_05_24_C.pdf"
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

In [None]:
# Sanity check: display the first document returned by the loader
# (presumably the first PDF page — TODO confirm against the loader's mode).
docs[0]

In [None]:
# Number of documents produced by the loader.
len(docs)

In [5]:
# Break the loaded documents into overlapping chunks for embedding:
# chunk_size=1000 with chunk_overlap=150 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
splits = text_splitter.split_documents(docs)

In [None]:
# Sanity check: display the first chunk produced by the splitter.
splits[0]

In [None]:
# Total number of chunks produced by the splitter.
len(splits)

In [8]:
def clean_text(text: str) -> str:
    """Return ``text`` with every NUL character (``\\x00``) removed.

    Args:
        text: The string to sanitize.

    Returns:
        A copy of ``text`` in which all NUL characters have been dropped;
        every other character is left untouched.
    """
    nul_char = '\x00'
    return text.replace(nul_char, '')

In [9]:
# Embedding model used to vectorize the chunks.
model_embedding = HuggingFaceEmbeddings(model_name='multi-qa-mpnet-base-dot-v1')

# Target collection and database connection URL.
# NOTE(review): the password is interpolated into the URL unescaped; if it can
# contain characters such as '@', ':' or '/', it would need URL-encoding — confirm.
collection_name = "udlbookb"
connection = f"postgresql+psycopg://vector_user:{PG_VECTOR_PWD}@localhost:5431/vector_db"

# Vector store bound to the collection above, with metadata kept as JSONB.
vector_store = PGVector(
    connection=connection,
    collection_name=collection_name,
    embeddings=model_embedding,
    use_jsonb=True,
)

In [None]:
# Re-check the number of chunks right before ingestion (same value as the earlier count).
len(splits)

In [None]:
batch_size = 50

# Sanitize and upload the chunks in batches of `batch_size`.
# NOTE: re-running this cell calls add_documents again for the same chunks.
for i in range(0, len(splits), batch_size):
    batch = splits[i:i + batch_size]
    print(i)  # progress: starting index of the current batch
    for doc in batch:
        # Strip NUL characters from the chunk text before it is stored.
        doc.page_content = clean_text(doc.page_content)
        # Strip them from string metadata values too. The previous guard
        # `"metadata" in doc` was always False: Document is a pydantic model,
        # and `in` iterates its (field_name, value) tuples, so the metadata
        # was never actually cleaned. Read the attribute directly instead.
        metadata = getattr(doc, "metadata", None)
        if metadata:
            # Only existing keys are reassigned, so iterating items() is safe.
            for key, value in metadata.items():
                if isinstance(value, str):
                    metadata[key] = clean_text(value)
    vector_store.add_documents(batch)