In [1]:
#from langchain_community.document_loaders.unstructured import UnstructuredPDFLoader
from langchain.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [None]:
# Initialize the DirectoryLoader for PDFs: every file under Data/ matching the
# "**/*.pdf" glob is handed to UnstructuredPDFLoader.
loader = DirectoryLoader(
    "Data/",
    glob="**/*.pdf",
    loader_cls=UnstructuredPDFLoader
)

# Building the loader does not tell us whether any PDFs exist, so testing the
# loader object itself cannot detect an empty directory. Load first and branch
# on the result; this also guarantees `books` is defined for later cells.
books = loader.load()

if books:
    # Output the number of documents loaded
    print(len(books))
else:
    print('Please upload PDF file')

In [None]:
# Preview the content of the first loaded PDF.
# Only the first PREVIEW_CHARS characters are shown so a whole document is not
# dumped into the notebook output, and an empty `books` list is reported
# instead of raising IndexError on books[0].
PREVIEW_CHARS = 1000

if books:
    print(books[0].page_content[:PREVIEW_CHARS])
else:
    print('Please upload PDF file')

In [None]:
# Shell escape: runs `ollama list` so the models referenced later in this
# notebook ("nomic-embed-text", "phi3.5:latest") can be checked against its output.
!ollama list

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings

# Chunking parameters, pulled out as named constants so they are easy to tune.
CHUNK_SIZE = 7500      # chunk_size handed to the splitter
CHUNK_OVERLAP = 100    # overlap kept between neighbouring chunks for context

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Break the loaded documents into chunks ready for embedding
all_splits = text_splitter.split_documents(books)

# Batch processing setup
#batch_size = 4  # You may adjust this based on your GPU capacity
#batches = [all_splits[i:i + batch_size] for i in range(0, len(all_splits), batch_size)]

In [None]:
# Add to vector database: embed every chunk with the "nomic-embed-text" model
# via Ollama and store the result in the "local-RAG" Chroma collection.
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

vector_db = Chroma.from_documents(
    collection_name="local-RAG",
    embedding=embedding_model,
    documents=all_splits,
)

# Process each batch and add to vectorstore
#for batch in batches:
    #vectorstore.add_documents(documents=batch)

# Retrieval

In [None]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
# Chat LLM served through Ollama; `model` holds the tag passed to ChatOllama.
model = "phi3.5:latest"
llm = ChatOllama(
    model=model,
)

In [None]:
# Prompt template used to ask the LLM for five rephrasings of the user's
# question; the single placeholder is {question}.
multi_query_template = """You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}"""

query_prompt = PromptTemplate(
    template=multi_query_template,
    input_variables=["question"],
)

In [None]:
# Wrap the vector store's retriever in a MultiQueryRetriever that uses `llm`
# together with `query_prompt` to produce the alternative queries.
base_retriever = vector_db.as_retriever()

retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=query_prompt,
)

# RAG prompt: restricts the answer to the supplied context.
# Placeholders: {context} and {question}.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [None]:
def format_docs(docs):
    """Join the text of retrieved documents into a single context string.

    Args:
        docs: Iterable of retrieved documents, each exposing `page_content`.

    Returns:
        The `page_content` of every document, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever returns document objects. Without the format_docs step the
# {context} slot of the prompt would be filled with the string form of that
# list rather than the plain passage text, so the documents are flattened
# to text before they reach the prompt.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the RAG chain on a sample question; the returned answer is the cell output.
question = "Fetch information about Almonds?"
chain.invoke(question)