In [1]:
import streamlit as st
import os
import logging
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
import ollama
from pdf2image import convert_from_path

In [2]:
logging.basicConfig(level=logging.INFO)

# Constants
DOC_PATH = "./doc/"  # directory scanned for source PDFs
MODEL_NAME = "llama3.2"  # chat model name handed to ChatOllama
EMBEDDING_MODEL = "nomic-embed-text"  # embedding model name (pulled via ollama, used by OllamaEmbeddings)
VECTOR_STORE_NAME = "mobitel-test-rag1"  # Chroma collection name
PERSIST_DIRECTORY = "./chroma_db"  # on-disk location of the Chroma store


def list_pdf_files(doc_dir):
    """Return the PDF files located directly inside ``doc_dir``.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    returned, so stray files such as ``.DS_Store`` never reach the PDF
    loader. Names are sorted so the ingestion order is the same on every run.

    Args:
        doc_dir: Directory to scan (not searched recursively).

    Returns:
        A list of ``os.path.join(doc_dir, name)`` paths. The list is empty
        when ``doc_dir`` does not exist; a warning is logged in that case
        instead of raising, because the directory is only needed when the
        vector database has to be built from scratch.
    """
    if not os.path.isdir(doc_dir):
        logging.warning(f"Document directory not found: {doc_dir}")
        return []
    return [
        os.path.join(doc_dir, name)
        for name in sorted(os.listdir(doc_dir))
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(doc_dir, name))
    ]


files = list_pdf_files(DOC_PATH)
# Module-level accumulator for the loaded documents; filled while the vector
# database is being built.
all_data = []

In [3]:
def ingest_pdf(doc_path):
    """Load one PDF from disk with ``UnstructuredPDFLoader``.

    Args:
        doc_path: Filesystem path of the PDF to read.

    Returns:
        Whatever ``loader.load()`` produces for the file, or ``None`` when
        nothing exists at ``doc_path`` (the failure is logged and also
        reported through ``st.error``).
    """
    # Guard clause: report the missing file and bail out early.
    if not os.path.exists(doc_path):
        logging.error(f"PDF file not found at path: {doc_path}")
        st.error("PDF file not found.")
        return None

    documents = UnstructuredPDFLoader(file_path=doc_path).load()
    logging.info("PDF loaded successfully.")
    return documents

In [4]:
def split_documents(documents, chunk_size=4000, chunk_overlap=1000):
    """Split documents into smaller, overlapping chunks.

    Args:
        documents: Documents to split, as returned by the PDF loader.
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 4000, the value previously hard-coded.
        chunk_overlap: Value forwarded as ``chunk_overlap``. Defaults to
            1000, the value previously hard-coded.

    Returns:
        The chunks produced by ``text_splitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)
    logging.info("Documents split into chunks.")
    return chunks

In [12]:
@st.cache_resource
def load_vector_db():
    """Load the persisted Chroma vector store, or build it from the PDFs.

    The embedding model ``EMBEDDING_MODEL`` is pulled through ``ollama`` on
    every call. When ``PERSIST_DIRECTORY`` already exists the collection
    ``VECTOR_STORE_NAME`` is opened from it. Otherwise every path in ``files``
    is ingested, the documents are split into chunks, and a new store is
    created in ``PERSIST_DIRECTORY``.

    Returns:
        The ``Chroma`` vector store.

    Raises:
        ValueError: If the store has to be built but no document could be
            loaded from any path in ``files``.
    """
    # Pull the embedding model if not already available
    ollama.pull(EMBEDDING_MODEL)

    embedding = OllamaEmbeddings(model=EMBEDDING_MODEL)

    if os.path.exists(PERSIST_DIRECTORY):
        vector_db = Chroma(
            embedding_function=embedding,
            collection_name=VECTOR_STORE_NAME,
            persist_directory=PERSIST_DIRECTORY,
        )
        logging.info("Loaded existing vector database.")
        return vector_db

    # Reset the shared accumulator so that re-running the build does not
    # index documents left over from a previous call a second time.
    all_data.clear()
    for pdf_path in files:
        data = ingest_pdf(pdf_path)
        if data is None:
            # ingest_pdf returns None (and logs the error) for a missing
            # file; skip it rather than crash on extend(None).
            continue
        all_data.extend(data)

    if not all_data:
        # Fail before anything is written to PERSIST_DIRECTORY; otherwise the
        # next call would take the "existing database" branch above.
        raise ValueError(
            f"No documents could be loaded from {DOC_PATH}; "
            "cannot build the vector database."
        )

    # Split the documents into chunks
    chunks = split_documents(all_data)

    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        collection_name=VECTOR_STORE_NAME,
        persist_directory=PERSIST_DIRECTORY,
    )
    # NOTE(review): newer Chroma releases reportedly persist automatically;
    # confirm whether this explicit call is still required.
    vector_db.persist()
    logging.info("Vector database created and persisted.")
    return vector_db

In [6]:
def create_retriever(vector_db, llm):
    """Build a ``MultiQueryRetriever`` on top of the given vector store.

    Args:
        vector_db: Vector store; its ``as_retriever()`` result is used as the
            underlying retriever.
        llm: Language model handed to ``MultiQueryRetriever.from_llm``
            together with the question-rewriting prompt.

    Returns:
        The retriever produced by ``MultiQueryRetriever.from_llm``.
    """
    # Prompt asking the model for five rephrasings of the user's question.
    rewrite_template = """You are an AI Radio Network Planning and Optimization model assistant. Your task is to generate five
        different versions of the given user question to retrieve relevant documents from
        a vector database. By generating multiple perspectives on the user question, your
        goal is to help the user overcome some of the limitations of the distance-based
        similarity search. Provide these alternative questions separated by newlines.
        Original question: {question}"""
    rewrite_prompt = PromptTemplate(
        template=rewrite_template,
        input_variables=["question"],
    )

    base_retriever = vector_db.as_retriever()
    multi_query_retriever = MultiQueryRetriever.from_llm(
        base_retriever, llm, prompt=rewrite_prompt
    )
    logging.info("Retriever created.")
    return multi_query_retriever


In [7]:
def create_chain(retriever, llm):
    """Build the retrieval-augmented question-answering chain.

    The chain receives the user's question, fetches documents through
    ``retriever``, fills the RAG prompt with the retrieved text and the
    question, sends it to ``llm`` and parses the reply with
    ``StrOutputParser``.

    Args:
        retriever: Retriever invoked with the question to produce the
            context documents.
        llm: Chat model that answers the filled-in prompt.

    Returns:
        The composed chain; call ``chain.invoke(question)`` to run it.
    """

    def format_docs(docs):
        """Join the text of the retrieved documents into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    # RAG prompt
    template = """Answer the question based ONLY on the following context:
    {context}
    Question: {question}
    """

    prompt = ChatPromptTemplate.from_template(template)

    chain = (
        # Pipe the retriever through format_docs so the prompt receives plain
        # document text rather than the repr of a list of Document objects.
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    logging.info("Chain created with preserved syntax.")
    return chain

In [8]:
# Chat model instance; passed to both create_retriever and create_chain below.
llm = ChatOllama(model=MODEL_NAME)

In [None]:
# Open the vector store from PERSIST_DIRECTORY if it exists; otherwise build it
# from the PDFs listed in `files`.
vector_db = load_vector_db()

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/pull "HTTP/1.1 200 OK"
INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.




INFO:root:PDF loaded successfully.
INFO:root:Documents split into chunks.
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [None]:
# Wrap the vector store in a multi-query retriever driven by `llm`.
retriever = create_retriever(vector_db, llm)

In [None]:
# Compose retriever, prompt, model and output parser into the RAG chain.
chain = create_chain(retriever, llm)

In [None]:
input = "What are the measurements to be taken to improve RRC setup success rate in a site."

In [None]:
response = chain.invoke(input)

In [None]:
# Bare expression: the notebook displays the chain's answer as this cell's output.
response