In [32]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate
import time

In [33]:
FILE_PATH = "sample.pdf"
DOC_CHUNK_SIZE = 800
DOC_CHUNK_OVERLAP = 150
INGESTION_MODEL = "qwen2.5:7b"
EMBEDDING_MODEL = "nomic-embed-text"
VECTORDB_DIR = "./chroma_store"
QUERY_MODEL = "llama3.1:8b"
QUERY_TEMPERATURE = 0.2

In [34]:
def normalize_metadata(meta: dict) -> dict:
    clean = {}
    for key, value in meta.items():
        if isinstance(value, list):
            clean[key] = ", ".join(map(str, value))
        elif isinstance(value, dict):
            clean[key] = str(value)
        else:
            clean[key] = value
    return clean


In [35]:
loader = PyPDFLoader(FILE_PATH)
docs = loader.load()

print("File loaded")

splitter = RecursiveCharacterTextSplitter(
    chunk_size=DOC_CHUNK_SIZE,
    chunk_overlap=DOC_CHUNK_OVERLAP
)
docs = splitter.split_documents(docs)

print("Docs created")
print("Num: ", len(docs))

File loaded
Docs created
Num:  4


In [36]:
metadata_llm = Ollama(
    model=INGESTION_MODEL,
    temperature=0
)

prompt = PromptTemplate(
    input_variables=["chunk", "prev", "next"],
    template="""
You are annotating a document chunk for retrieval.

Chunk:
{chunk}

Previous:
{prev}

Next:
{next}

Return valid JSON with:
summary
chunk_type
importance_score
main_topics
prev_relation
next_relation
"""
)

In [37]:
metadata_chain = prompt | metadata_llm | JsonOutputParser()

print("Creating metadata")

for i, doc in enumerate(docs):
    print("Working on doc", i + 1)
    start = time.time()
    meta = metadata_chain.invoke({
        "chunk": doc.page_content,
        "prev": docs[i-1].page_content if i > 0 else "None",
        "next": docs[i+1].page_content if i < len(docs)-1 else "None"
    })

    meta = normalize_metadata(meta)
    meta["position"] = i
    doc.metadata.update(meta)

    print("Time taken:", time.time() - start)

print("Metadata created")

Creating metadata
Working on doc 1
Time taken: 94.0237090587616
Working on doc 2
Time taken: 99.41127634048462
Working on doc 3
Time taken: 89.39200353622437
Working on doc 4
Time taken: 65.14144039154053
Metadata created


In [38]:
embeddings = OllamaEmbeddings(
    model=EMBEDDING_MODEL
)

vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=VECTORDB_DIR
)

print("Stored in vector DB")

vectordb.persist()
print("Ingestion complete")

Stored in vector DB
Ingestion complete


In [39]:
qa_llm = Ollama(
    model=QUERY_MODEL,
    temperature=QUERY_TEMPERATURE
)

vectordb = Chroma(
    persist_directory=VECTORDB_DIR,
    embedding_function=embeddings
)

retriever = vectordb.as_retriever(
    search_kwargs={
        "k": 6,
        "filter": {
            "importance_score": {"$gte": 0.3}
        }
    }
)

qa = RetrievalQA.from_chain_type(
    llm=qa_llm,
    retriever=retriever,
    return_source_documents=True
)


In [40]:
query = "What are Kartik's main skills?"

response = qa(query)
print(response["result"])

  response = qa(query)


Kartik's main skills appear to be:

1. Proficiency in Python
2. Familiarity with PyTorch
3. Experience with prompt engineering and LLM APIs (Large Language Model Application Programming Interfaces)
4. Knowledge of AI, including Generative AI projects
5. Certifications in:
	* IBM AI Developer
	* AWS AI Practitioner
	* AWS Cloud Practioner
	* Azure Certifications

He also has experience building and evaluating machine learning models using CNNs (Convolutional Neural Networks) and RNNs (Recurrent Neural Networks).


In [41]:
llm = Ollama(model="llama3.1:8b")
print(llm("Say exactly: Ollama is working"))

Ollama is working.
