In [29]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.embeddings import Embeddings
from openai import OpenAI
from typing import List
from langchain_postgres import PGVector
from rich import print

# Load environment variables from a .env file (if one is found) so that the
# os.getenv("NVIDIA_API_KEY") and os.getenv("DB_URI") lookups further down can
# resolve. The call's return value is what the cell displays.
load_dotenv()

True

In [21]:
# Root folder holding the crawled .txt files. The original absolute path is
# kept as the default; set CRAWLER_DATA_DIR to run the notebook on another
# machine without editing this cell.
folder_path = os.getenv("CRAWLER_DATA_DIR", "D:\\ADK_pregnancy_agent\\crawler")

# os.walk() silently yields nothing for a missing directory, which would make
# the notebook "succeed" with zero documents. Fail loudly instead.
if not os.path.isdir(folder_path):
    raise FileNotFoundError(f"Crawler folder not found: {folder_path}")

# Get all .txt files recursively from folder and subfolders.
# Sorted so the document (and therefore chunk) order is the same on every run,
# independent of the order the filesystem happens to list entries in.
txt_files = sorted(
    os.path.join(root, file_name)
    for root, _dirs, file_names in os.walk(folder_path)
    for file_name in file_names
    if file_name.endswith(".txt")
)

all_pages = []  # Documents loaded from every file, in txt_files order

for txt_file_path in txt_files:
    loader = TextLoader(file_path=txt_file_path, encoding="utf-8")
    # lazy_load() yields the documents of this file one at a time
    all_pages.extend(loader.lazy_load())

# Output all collected pages
print(f"Total pages loaded: {len(all_pages)}")


In [22]:
# Splitting document

# Cut every loaded document into overlapping chunks for embedding.
# Newline is the only separator offered to the splitter.
# NOTE(review): with "\n" as the sole separator, a single line longer than
# chunk_size presumably cannot be split further - confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n"],
    chunk_size=800,
    chunk_overlap=300,
)

splits = text_splitter.split_documents(documents=all_pages)
# splits

In [23]:
# Number of chunks produced by the splitter (shown as the cell output).
len(splits)

2945

# Embedding model: BAAI/bge-m3

In [None]:
class NvidiaOpenAIEmbeddings_BGE_M3(Embeddings):
    """LangChain ``Embeddings`` adapter backed by an OpenAI-compatible endpoint.

    Requests are sent through the ``openai`` client's ``embeddings.create``
    call using the configured model.

    Args:
        api_key: API key handed to the ``OpenAI`` client.
        base_url: Base URL of the OpenAI-compatible service.
        model: Name of the embedding model to request. Defaults to
            ``"baai/bge-m3"``.
    """

    def __init__(self, api_key: str, base_url: str, model: str = "baai/bge-m3"):
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model

    def _embed(self, texts: List[str]) -> List[List[float]]:
        """Send one embeddings request for ``texts`` and return the vectors."""
        response = self.client.embeddings.create(
            input=texts,
            model=self.model,
            encoding_format="float",
            # Forwarded verbatim in the request body.
            # NOTE(review): "NONE" presumably makes the service reject
            # over-long inputs instead of truncating them - confirm against
            # the provider's API documentation.
            extra_body={"truncate": "NONE"},
        )
        return [data.embedding for data in response.data]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts.

        Args:
            texts: Texts to embed.

        Returns:
            One embedding per entry of the response, or an empty list when
            ``texts`` is empty.
        """
        # Nothing to embed: skip the network round trip (and the error an
        # empty ``input`` is likely to trigger on the server side).
        if not texts:
            return []
        return self._embed(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its vector."""
        return self._embed([text])[0]


# Embedding client pointed at NVIDIA's endpoint; the key is taken from the
# NVIDIA_API_KEY environment variable.
embedding_engine = NvidiaOpenAIEmbeddings_BGE_M3(
    model="baai/bge-m3",
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.getenv("NVIDIA_API_KEY"),
)

In [30]:
# Connection string for the Postgres database, read from the DB_URI
# environment variable.
SUPABASE_PG_CONN_URL = os.getenv("DB_URI")

# Vector store for the "pregnancy_bot" collection, embedding with the engine
# defined above.
vector_store = PGVector(
    connection=SUPABASE_PG_CONN_URL,
    collection_name="pregnancy_bot",
    embeddings=embedding_engine,
    use_jsonb=True,
)
print("PGVector Store is loaded.")

# push embedding to collection
# Documents are sent in fixed-size batches. A failing batch is reported and
# skipped so the remaining batches still get uploaded, but its number is
# recorded so the final summary shows whether the collection is incomplete.
# NOTE(review): re-running this cell presumably inserts the same chunks
# again (no ids are passed) - confirm before executing it twice.
BATCH_SIZE = 250
failed_batches = []

for start in range(0, len(splits), BATCH_SIZE):
    batch_no = start // BATCH_SIZE
    chunk = splits[start : start + BATCH_SIZE]
    try:
        # Add the chunk to the vector store
        vector_store.add_documents(documents=chunk)
        print(f"Chunk {batch_no} added successfully")
    except Exception as e:
        print(f"Error adding chunk {batch_no}: {e}")
        failed_batches.append(batch_no)

if failed_batches:
    print(
        f"{len(failed_batches)} chunk(s) failed and were NOT stored: "
        f"{failed_batches}"
    )

## Vector Retriever

In [31]:
# Similarity-search retriever over the vector store, configured with k=5.
vec_retriever = vector_store.as_retriever(
    search_kwargs={"k": 5},
    search_type="similarity",
)

In [33]:
# Smoke-test the retriever with a Bengali question (roughly: "What should one
# eat at week 11?") and print the metadata and text of every returned document.
docs = vec_retriever.invoke("১১ সপ্তাহে কি খেতে হয়?")

separator = "---------------------------------------------------------------\n"
for hit in docs:
    print(f"Document metadata: {hit.metadata}\n")
    print(f"Document Content: {hit.page_content}")
    print(separator)