In [None]:
import os
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
import json
import shutil
import chromadb

# Directory passed to Chroma as persist_directory when collections are saved.
CHROMA_PATH = "./docs/vectordb"

In [None]:
# Lookup table read once at cell execution; load_documents uses it to swap a
# PDF's file name (without extension) for the URL recorded under that name.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default.
with open('source.json', 'r', encoding='utf-8') as f:
    pdf_to_url_mapping = json.load(f)

def load_documents(source_docs):
    """Load the PDFs under ``source_docs`` and rewrite each item's source.

    Args:
        source_docs: Directory handed to ``PyPDFDirectoryLoader``.

    Returns:
        The list produced by the loader, with ``metadata['source']`` on every
        item replaced by the matching entry of ``pdf_to_url_mapping`` (keyed by
        the PDF file name without extension), or by that bare file name when
        the mapping has no entry for it.
    """
    pages = PyPDFDirectoryLoader(source_docs).load()

    for page in pages:
        original_source = page.metadata.get('source', '')
        # Reduce the recorded path to the file name without its extension.
        stem, _extension = os.path.splitext(os.path.basename(original_source))
        # Prefer the mapped URL; fall back to the stem itself.
        page.metadata['source'] = pdf_to_url_mapping.get(stem, stem)

    return pages

def split_text(documents):
    """Chunk ``documents`` with one fixed-size recursive character splitter.

    The splitter is configured with ``chunk_size=500`` and
    ``chunk_overlap=100``, both measured with ``len``, and
    ``add_start_index=True``.

    Args:
        documents: Documents to split.

    Returns:
        The result of ``split_documents`` for the whole input.
    """
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 100,
        "length_function": len,
        "add_start_index": True,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

def split_text2(documents):
    """Chunk each document with a chunk size chosen from its own length.

    Chunk size by ``len(doc.page_content)``:
        * under 1000 characters  -> 300
        * under 5000 characters  -> 500
        * otherwise              -> 700
    The overlap is always 20% of the chosen chunk size (truncated to an int).

    Args:
        documents: Iterable of objects exposing ``page_content``.

    Returns:
        A single flat list with the chunks of every document, in input order.
    """

    def _chunk_size_for(length):
        # Check the largest tier first; the thresholds match the tiers above.
        if length >= 5000:
            return 700
        if length >= 1000:
            return 500
        return 300

    chunks = []
    for document in documents:
        size = _chunk_size_for(len(document.page_content))

        # A fresh splitter per document, since the size can differ each time.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=size,
            chunk_overlap=int(size * 0.2),
            length_function=len,
            add_start_index=True,
        )
        chunks += splitter.split_documents([document])

    return chunks


def save_to_chroma(sectionName, splits):
    """Embed ``splits`` and store them in the ``sectionName`` collection.

    Args:
        sectionName: Name of the Chroma collection to write to.
        splits: Document chunks to embed with ``OpenAIEmbeddings``.

    Returns:
        The ``Chroma`` vector store returned by ``Chroma.from_documents``,
        persisted under ``CHROMA_PATH``.
    """
    # NOTE(review): nothing clears CHROMA_PATH before writing (an earlier
    # rmtree was disabled), so re-running presumably adds to what is already
    # stored in the collection -- confirm before re-ingesting.
    store = Chroma.from_documents(
        documents=splits,
        embedding=OpenAIEmbeddings(),
        collection_name=sectionName,
        persist_directory=CHROMA_PATH,
    )

    print(f"Saved {sectionName} to {CHROMA_PATH}")

    return store


In [None]:
# One entry per collection: the key is used as the Chroma collection name and
# "source_docs" is the directory of PDFs loaded for it.
sections = {
    "Living-in-Singapore": {"source_docs": "./docs/Living-in-Singapore"},
    "Working-in-Singapore": {"source_docs": "./docs/Working-in-Singapore"},
    "Health-and-Safety": {"source_docs": "./docs/Health-and-Safety"},
    "Legal": {"source_docs": "./docs/Legal"},
    "Financial": {"source_docs": "./docs/Financial"},
    "Work-Permit": {"source_docs": "./docs/Work-Permit"},
    "Salary-and-Wages": {"source_docs": "./docs/Salary-and-Wages"},
    "Help-and-Resources": {"source_docs": "./docs/Help-and-Resources"},
    "all": {"source_docs": "./docs/all"},
}

# Load, chunk, and persist every section in declaration order.
for sectionName, section in sections.items():
    documents = load_documents(section['source_docs'])
    splits = split_text(documents)
    vectordb = save_to_chroma(sectionName, splits)


In [None]:
def getRetriever(sectionName):
    """Build an MMR retriever over one persisted Chroma collection.

    Args:
        sectionName: Name of the collection to open inside the store at
            ``CHROMA_PATH``.

    Returns:
        A retriever created with ``as_retriever(search_type="mmr")`` that
        returns 5 documents per query.
    """
    embedding = OpenAIEmbeddings()

    # Open the same on-disk store that save_to_chroma writes to, using the
    # shared constant so the two paths cannot drift apart.
    persistent_client = chromadb.PersistentClient(path=CHROMA_PATH)
    vector_store_from_client = Chroma(
        client=persistent_client,
        collection_name=sectionName,
        embedding_function=embedding,
    )

    # The MMR diversity knob is named "lambda_mult" (per the LangChain docs,
    # 0 = maximum diversity, 1 = minimum). The previous key "lambda" is not a
    # recognised parameter, so the intended 0.3 never took effect.
    retriever = vector_store_from_client.as_retriever(
        search_type="mmr", search_kwargs={'k': 5, 'lambda_mult': 0.3}
    )

    return retriever


# Manual check: build a retriever over the "Legal" collection.
retriever = getRetriever("Legal")

# Kept as the cell's final expression so the notebook shows the returned documents.
retriever.invoke("What should I do if I receive a call or text message from an unfamiliar phone number who says they are from government agencies such as MOM or the police?")