# Ollama PDF RAG Notebook

## Import Libraries


In [1]:
# Imports
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Silence every Python warning for the rest of the session so that warning
# messages do not clutter cell output.
# NOTE(review): 'ignore' with no category hides all warnings, including ones
# that may matter — confirm this is intended.
import warnings
warnings.filterwarnings('ignore')

# Rich-output helpers; chat_with_pdf() uses them to render model answers as Markdown.
from IPython.display import display, Markdown

# Select the "python" protobuf implementation through an environment variable.
# NOTE(review): this assignment runs after the langchain imports above; if any
# of them already loaded protobuf, the setting may come too late — verify.
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

In [3]:
import torch

# Report CUDA visibility in one line: the device name is only looked up when
# PyTorch says a CUDA device is available.
print(
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is not available"
)


GPU is not available


In [4]:
# IPython shell escape: runs the `nvidia-smi` command in the system shell.
# The recorded output below shows the command was not found on this machine.
!nvidia-smi


'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


## Load PDF

In [2]:
# Location of the source PDF on the local disk
local_path = r"D:\DESK THING\Student-Guide-Module-1-Fundamentals-of-AI.pdf"

if not local_path:
    # No path configured: ask for a file instead of attempting a load
    print("Upload a PDF file")
else:
    # Any failure is reported as a message rather than raised, so `data` is
    # only defined when loading succeeds.
    try:
        loader = UnstructuredPDFLoader(file_path=local_path)
        data = loader.load()
        print(f"PDF loaded successfully: {local_path}")
    except FileNotFoundError:
        print(f"File not found: {local_path}")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    

pytesseract is not installed. Cannot use the ocr_only partitioning strategy. Falling back to partitioning with another strategy.
Falling back to partitioning with hi_res.


An error occurred: Unable to get page count. Is poppler installed and in PATH?


In [3]:
from PyPDF2 import PdfReader

# Step 1: Extract text from PDF
local_path = r"D:\DESK THING\Gmail - HDFC Personal Loan Address Verification Process -Reg.pdf"

# Read the PDF page by page. `or ""` keeps the join safe when a page yields no
# text (for example a page without a text layer).
reader = PdfReader(local_path)
page_texts = [page.extract_text() or "" for page in reader.pages]

# Join pages with a newline so the last word of one page is not fused with the
# first word of the next before the text is chunked.
text = "\n".join(page_texts)

# Only report success when something was actually extracted; an empty string
# would otherwise flow silently into the splitting and embedding cells.
if text.strip():
    print("PDF loaded and text extracted successfully.")
else:
    print("PDF loaded, but no extractable text was found.")

PDF loaded and text extracted successfully.


## Split text into chunks

In [4]:
# Chunk the extracted text with chunk_size=1000 and chunk_overlap=200, so
# neighbouring chunks share some content.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_text(text)
print(f"Text split into {len(chunks)} chunks")

Text split into 2 chunks


## Create vector database

In [5]:
# Embed every chunk with the "nomic-embed-text" Ollama model and index the
# vectors in a Chroma collection named "local-rag".
# NOTE(review): no persist directory is passed — presumably the collection
# lives only for this session; confirm.
vector_db = Chroma.from_texts(
    collection_name="local-rag",
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    texts=chunks,
)
print("Vector database created successfully")

Vector database created successfully


## Set up LLM and Retrieval

In [6]:
# Set up LLM and retrieval
# `local_model` is the model tag handed to ChatOllama; swap in whichever model you prefer.
# `llm` is reused below for query rewriting (MultiQueryRetriever) and for answering (chain).
local_model = "llama3.1:latest"  # or whichever model you prefer
llm = ChatOllama(model=local_model)

In [7]:
# Prompt that asks the LLM to rephrase the user's question into 2 alternative
# versions, one per line, to broaden what the similarity search can match.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate 2
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Multi-query retriever: wraps the vector store's default retriever and uses
# `llm` together with QUERY_PROMPT to produce the alternative questions.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

## Create chain

In [8]:
# Answering prompt: restricts the model to the retrieved context and then
# states the user's question. Built from adjacent literals, one per line.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [9]:
# Assemble the RAG pipeline:
#   1. the input question feeds the retriever ("context") and is also passed
#      through unchanged ("question"),
#   2. both values fill the answering prompt,
#   3. the LLM generates a reply,
#   4. the reply is parsed down to a plain string.
chain = (
    {
        "context": retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

## Chat with PDF

In [11]:
def chat_with_pdf(question):
    """
    Ask the RAG chain a question and render its answer as Markdown.

    The question is passed to ``chain.invoke``; the resulting text is wrapped
    in ``Markdown`` and shown with ``display``. The function returns whatever
    ``display`` returns.
    """
    answer = chain.invoke(question)
    rendered = Markdown(answer)
    return display(rendered)

In [12]:
# Example 1: ask for a high-level summary of the loaded PDF.
# NOTE(review): "small idea" in the question may be a typo for "main idea" —
# confirm before changing the prompt text.
chat_with_pdf("What is the small idea of this document?")

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


This document appears to be a request from Mukesh B to HDFC Bank for verification of his personal loan application, specifically for address verification.

In [None]:
# Example 2: targeted question about a "scammer agent".
# NOTE(review): the PDF read earlier in this notebook is a loan
# address-verification email, so this question looks like it was written for a
# different document — confirm it still applies.
chat_with_pdf("What is the purpose of the scammer agent?")

In [26]:
# Example 3: ask the model to explain the document's case study.
# NOTE(review): the saved output below describes a bank-transfer scam
# transcript and the execution count jumps to 26, which suggests it was
# produced with a different PDF loaded than the one read above — re-run on a
# fresh kernel to confirm.
chat_with_pdf("Can you explain the case study highlighted in the document?")

The case study presented in the document is a redacted transcript and an abridged action log for a bank transfer scam. The transcript proceeds as follows:

1. A scammer (Agent) initiates contact with a victim, claiming to be from Bank of America.
2. The victim provides their username and password to verify their identity.
3. The Agent navigates to the Bank of America login page and inputs the username and password, taking 6 actions (navigate, get_html, fill_element, fill_element, click_element, get_html).
4. After verifying the victim's account information, the Agent requests a two-factor authentication code from the registered device.
5. The victim provides the 2FA code.
6. The Agent fills out the 2FA code and proceeds to navigate to the transfer page.
7. The Agent searches for a recipient and transfers the money.

The action log shows that the Agent performs 20 actions to complete this scam, including filling out specific fields, clicking on buttons, navigating to specific websites, and searching for recipients.

It's worth noting that the document highlights the complexity of these interactions and the challenges faced by the agents in completing the scams. The authors also mention that transcription errors are a common cause of failures for many of the scams.

## Clean up (optional)

In [27]:
# Optional: drop the Chroma collection once you are finished querying.
# NOTE(review): presumably any cell that uses `retriever` or `chain` will fail
# after this until the "Create vector database" cell is re-run — confirm.
vector_db.delete_collection()
print("Vector database deleted successfully")

Vector database deleted successfully
