In [1]:
import json
import os
import shutil

import chromadb
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# Directory handed to Chroma as `persist_directory` by save_to_chroma; every
# section's collection is written under this single path.
CHROMA_PATH = "./docs/vectordb"

In [2]:
# Lookup table used by load_documents: keys are PDF file names without the
# extension, values replace the page's 'source' metadata (URLs, per the
# comment in load_documents).
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default text encoding.
with open('source.json', 'r', encoding='utf-8') as f:
    pdf_to_url_mapping = json.load(f)

def load_documents(source_docs):
    """Load the PDFs under ``source_docs`` and relabel each document's source.

    Every document returned by ``PyPDFDirectoryLoader(source_docs).load()``
    has its ``metadata['source']`` overwritten in place: the original value is
    reduced to a file name without directory or extension, and that name is
    looked up in the module-level ``pdf_to_url_mapping``.

    Args:
        source_docs: Directory path passed to ``PyPDFDirectoryLoader``.

    Returns:
        The list produced by the loader, with updated ``source`` metadata.
    """
    loaded_pages = PyPDFDirectoryLoader(source_docs).load()

    for doc in loaded_pages:
        original_source = doc.metadata.get('source', '')
        # Strip the directory part and the extension to get the bare PDF name.
        stem, _extension = os.path.splitext(os.path.basename(original_source))
        # Prefer the mapped URL; fall back to the bare file name when unmapped.
        doc.metadata['source'] = pdf_to_url_mapping.get(stem, stem)

    return loaded_pages

def split_text(documents, chunk_size=300, chunk_overlap=100):
    """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Chunk size handed to the splitter, measured with ``len``.
            Defaults to 300, the value that used to be hard-coded.
        chunk_overlap: Overlap handed to the splitter, in the same unit.
            Defaults to 100, the value that used to be hard-coded.

    Returns:
        Whatever ``split_documents`` returns for ``documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Asks the splitter to record where each chunk starts in its source
        # text (stored in the chunk metadata per the LangChain docs).
        add_start_index=True,
    )

    return text_splitter.split_documents(documents)

def save_to_chroma(sectionName, splits):
    """Embed ``splits`` and store them in a Chroma collection under CHROMA_PATH.

    Args:
        sectionName: Used as the Chroma ``collection_name``.
        splits: Documents passed to ``Chroma.from_documents``.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.
    """
    # NOTE(review): CHROMA_PATH is not cleared before writing, so re-running
    # presumably adds to any existing collection of the same name - confirm
    # whether duplicates on re-run are acceptable.
    store = Chroma.from_documents(
        documents=splits,
        embedding=OpenAIEmbeddings(),
        collection_name=sectionName,
        persist_directory=CHROMA_PATH,
    )
    print(f"Saved {sectionName} to {CHROMA_PATH}")
    return store


In [3]:
# Each section maps to a folder of PDFs at ./docs/<section name>; the section
# name is also used as the Chroma collection name by save_to_chroma.
sections = {
    name: {"source_docs": f"./docs/{name}"}
    for name in (
        "Living-in-Singapore",
        "Working-in-Singapore",
        "Health-and-Safety",
        "Legal",
        "Financial",
        "Work-Permit",
        "Salary-and-Wages",
        "Help-and-Resources",
        "all",
    )
}

# Ingest every section in turn: load PDFs -> split into chunks -> persist.
# After the loop, documents/splits/vectordb hold the values from the last
# section only ("all").
for sectionName, section in sections.items():
    documents = load_documents(section['source_docs'])
    splits = split_text(documents)
    vectordb = save_to_chroma(sectionName, splits)


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


Saved Living-in-Singapore to ./docs/vectordb
Saved Working-in-Singapore to ./docs/vectordb
Saved Health-and-Safety to ./docs/vectordb
Saved Legal to ./docs/vectordb
Saved Financial to ./docs/vectordb
Saved Work-Permit to ./docs/vectordb
Saved Salary-and-Wages to ./docs/vectordb
Saved Help-and-Resources to ./docs/vectordb
Saved all to ./docs/vectordb


In [None]:
# Query-side setup: open a persisted Chroma collection and expose it as a
# retriever.
# NOTE(review): this path and collection name differ from what the ingestion
# loop writes (CHROMA_PATH, one collection per section name) - confirm that
# "./chroma_langchain_db" / "iqma_collection" is the store this cell should read.
persistent_client = chromadb.PersistentClient(path="./chroma_langchain_db")

# Defined in this cell so it runs on a fresh kernel (the name was previously
# undefined); same embedding class and default settings that save_to_chroma
# uses when indexing.
embeddings = OpenAIEmbeddings()

vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name="iqma_collection",
    embedding_function=embeddings,
)

# Maximal-marginal-relevance search configured with fetch_k=5 and k=1.
retriever = vector_store_from_client.as_retriever(
    search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}
)