# INGESTING PDFS

In [2]:
from langchain_community.document_loaders import PyPDFLoader

In [3]:
# Load the source PDF into `data`, the list of documents that later cells read.
from pathlib import Path

PATH = r'C:\Users\Kola PC\Desktop\RAGging\sourcepdfs\Lawal, Kolawole Oluranti_C.V.pdf'

# Fail fast with a clear message. The previous `if PATH:` check only tested that
# the string was non-empty, and its `else` branch printed a notice but left
# `data` undefined, so the next cell crashed with an unrelated NameError.
if not PATH or not Path(PATH).is_file():
    raise FileNotFoundError(f'There is no file at: {PATH!r}')

loader = PyPDFLoader(PATH)
data = loader.load()

In [4]:
# Preview the text extracted from the first loaded document to confirm the PDF was read.
data[0].page_content

"Kolawole Lawal\nI develop, implement, deploy and maintain cost-effective software solutions to enterprise-\nscale problems.\nIbadan\nlawalkolawole902@gmail.com\n+234 707 571 2794\nA highly motivated individual willing and capable of seamlessly honing and integrating relevant set of\ntechnical and soft skills to leverage and exploit opportunities in dynamic situations in order to achieve\norganisational business and operational goals.\nWilling to relocate: Anywhere\nPersonal Details\nCurrently Employed: No\nDate of Birth: 2000-01-11\nHighest Level of Education: Master's\nIndustry: Administrative Assistance, Analytics, Business Operations, Customer Service, Hospitality\n& Tourism, IT Operations & Helpdesk, Information Design & Documentation, Information Technology,\nManagement, Project Management, Quality Assurance, Software Development\nNYSC Status: Completed\nWork Experience\nNIGHT SUPERVISOR AND AUDITOR\nBRAVA HOTEL-Ibadan\nSeptember 2024 to Present\n1. I developed and maintained sta

# VECTOR EMBEDDING

This step converts the ingested, human-readable document into numerical vector embeddings that the computer can compare and search.

In [7]:
# Download the nomic-embed-text model through the Ollama CLI; it is used for the embeddings below.
!ollama pull nomic-embed-text

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest 
pulling 970aa74c0a90... 100% ▕████████████████▏ 274 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling ce4a164fc046... 100% ▕████████████████▏   17 B                         
pulling 31df23ea7daa... 100% ▕████████████████▏  420 B                         
verifying sha256 digest 
writing manifest 
success [?25h


In [8]:
# List the models reported by the local Ollama installation to confirm the pull succeeded.
!ollama list

NAME                       ID              SIZE      MODIFIED       
nomic-embed-text:latest    0a109f422b47    274 MB    10 seconds ago    
llama3.2:latest            a80c4f17acd5    2.0 GB    12 days ago       


In [16]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [17]:
# Break the loaded documents into small, slightly overlapping chunks for embedding.
CHUNK_SIZE = 200     # value passed as the splitter's chunk_size
CHUNK_OVERLAP = 20   # value passed as the splitter's chunk_overlap

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = text_splitter.split_documents(data)

In [18]:
# Embed every chunk with the local Ollama embedding model and store the
# results in a Chroma collection named 'local-rag'.
embedding_model = OllamaEmbeddings(model='nomic-embed-text', show_progress=True)

vector_db = Chroma.from_documents(
    collection_name='local-rag',
    documents=chunks,
    embedding=embedding_model,
)

  embedding=OllamaEmbeddings(model = 'nomic-embed-text', show_progress=True),
OllamaEmbeddings: 100%|██████████| 57/57 [08:04<00:00,  8.49s/it]


# INFORMATION RETRIEVAL

In [19]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [20]:
# Chat model served by Ollama. The model must already be installed on this
# machine; run `ollama pull <model-name>` to download and install one.
local_model = 'llama3.2'
LLM = ChatOllama(model=local_model)

  LLM = ChatOllama(model = local_model)


In [21]:
# Instruction text asking the LLM for five alternative phrasings of the user's
# question; `{question}` is filled in when the prompt is formatted.
_MULTI_QUERY_TEMPLATE = """You are an AI language model assistant. Your task is to generate five different
    versions of the given user question to retrieve relevant documents from a vector database. By
    generating multiple perspective on the user question, your goal is to help the user overcome
    some of the limitations of the distance-based similarity search. Provide these alternative
    questions seperated by newlines.
    Original Question: {question}"""

QUERY_PROMPT = PromptTemplate(
    template=_MULTI_QUERY_TEMPLATE,
    input_variables=["question"],
)

In [22]:
# Retriever that uses the LLM with QUERY_PROMPT to generate alternative
# questions on top of the Chroma vector store's own retriever.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=LLM,
    prompt=QUERY_PROMPT,
)

# RAG prompt: the answer must come only from the supplied context.
template = """ Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [23]:
# Assemble the RAG pipeline: the retriever supplies `context`, the raw input is
# passed through as `question`, then prompt -> LLM -> plain-string output.
rag_inputs = {'context': retriever, 'question': RunnablePassthrough()}
chain = rag_inputs | prompt | LLM | StrOutputParser()

In [24]:
# Read the question interactively, then run it through the RAG chain;
# the chain's answer is displayed as the cell output.
question = input('Ask question here: ')
chain.invoke(question)

OllamaEmbeddings: 100%|██████████| 1/1 [00:07<00:00,  7.80s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.66s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.34s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.04s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.57s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.26s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:11<00:00, 11.58s/it]


'This appears to be a report or a document detailing the tasks and experiences of an intern (Data Analyst) during their internship at KPMG (Forage)-Remote from September 2023 to November 2023. The document outlines various projects, tasks, and responsibilities undertaken by the intern, including data analysis, web development, communication with stakeholders, and quality control.'