In [1]:
from langchain import hub
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import textwrap
import os

In [2]:
loader = PyMuPDFLoader('./data/Szeliski_CVAABook_2ndEd.pdf')
documents = loader.load()
print(f"{len(documents)} pages of documentss loaded")

1232 pages of documentss loaded


In [3]:
text_splitter = RecursiveCharacterTextSplitter(
    # chunk_size=1200, # removed to chunk one page at a time
    # chunk_overlap=200,
    )
docs = text_splitter.split_documents(documents)

print(f'Created {len(docs)} chunks from {len(documents)} pages')


Created 1238 chunks from 1232 pages


In [6]:
# print(docs[0].metadata)
print(docs[200].page_content)

3.6 Geometric transformations
177
(a)
(b)
(c)
Figure 3.50
Line-based image warping (Beier and Neely 1992) © 1992 ACM: (a) distance
computation and position transfer; (b) rendering algorithm; (c) two intermediate warps used
for morphing.
segment correspondence speciﬁes a translation, rotation, and scaling, i.e., a similarity trans-
form (Table 3.3), for pixels in its vicinity, as shown in Figure 3.50a. Line segments inﬂuence
the overall displacement of the image using a weighting function that depends on the mini-
mum distance to the line segment (v in Figure 3.50a if u ∈[0, 1], else the shorter of the two
distances to P and Q).
One ﬁnal possibility for specifying displacement ﬁelds is to use a mesh speciﬁcally
adapted to the underlying image content, as shown in Figure 3.49d. Specifying such meshes
by hand can involve a fair amount of work; Gomes, Darsa et al. (1999) describe an interactive
system for doing this. Once the two meshes have been speciﬁed, intermediate warps can be
generat

In [8]:
embeddings = OllamaEmbeddings(model='nomic-embed-text')

In [9]:
# # # Create a Chroma database
# db = Chroma.from_documents(
#     documents=docs,
#     embedding=embeddings,
#     collection_name="szeliski_cv",
#     persist_directory="./chroma_db"
#     )

db = FAISS.from_documents(
    documents=docs,
    embedding=embeddings)

db.save_local('./faiss_db/', index_name='szeliski_cv')
# db.load_local('./faiss_db/', embeddings=embeddings)

In [10]:
db.load_local('./faiss_db/', embeddings=embeddings, index_name='szeliski_cv', allow_dangerous_deserialization=True)

<langchain_community.vectorstores.faiss.FAISS at 0x7ffabe66a920>

In [11]:
# Use similarity searching algorithm and return 3 most relevant documents.
retriever = db.as_retriever()

In [14]:
llm = Ollama(
    model = "llama3",
    # model = "phi3",
    # model = "gemma:2b",
    temperature=0
)


In [16]:
prompt_template = """
You need either to explain the concept or answer the question about Computer Vision. 
Be detailed, use simple words and examples in your explanations. If required, utilize the relevant information.
Also give source of information, along with page number which relates to retrieved content.
{context}

Question: {question}
Answer:"""


prompt = PromptTemplate(
    template = prompt_template, 
    input_variables = ["context", "question"]
)

In [17]:
retriever = db.as_retriever(search_kwargs = {"k": 3, "search_type" : "similarity"})

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever, 
    chain_type_kwargs = {"prompt": prompt},
    return_source_documents = True,
    verbose = False
)


In [18]:
### testing MMR search
question = "What is Laplacian pyramid blending?"
db.max_marginal_relevance_search(question, k = 2)


[Document(page_content='158\nComputer Vision: Algorithms and Applications, 2nd ed. (ﬁnal draft, Sept. 2021)\nspace:\n−\n=\nfrequency:\n−\n=\nlow-pass\nlower-pass\nFigure 3.34\nThe difference of two low-pass ﬁlters results in a band-pass ﬁlter. The dashed\nblue lines show the close ﬁt to a half-octave Laplacian of Gaussian.\nthe output of the L box directly to the subtraction in Figure 3.33). This variant has less\naliasing, since it avoids one downsampling and upsampling round-trip, but it is not self-\ninverting, since the Laplacian images are no longer adequate to reproduce the original image.\nAs with the Gaussian pyramid, the term Laplacian is a bit of a misnomer, since their\nband-pass images are really differences of (approximate) Gaussians, or DoGs,\nDoG{I; σ1, σ2} = Gσ1 ∗I −Gσ2 ∗I = (Gσ1 −Gσ2) ∗I.\n(3.70)\nA Laplacian of Gaussian (which we saw in (3.26)) is actually its second derivative,\nLoG{I; σ} = ∇2(Gσ ∗I) = (∇2Gσ) ∗I,\n(3.71)\nwhere\n∇2 = ∂2\n∂x2 + ∂2\n∂y2\n(3.72)\nis the

In [19]:
def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])
    
    sources_used = ' \n'.join(
        [
            source.metadata['title'].split('/')[-1][:-4]
            + ' - page: '
            + str(source.metadata['page']+1)
            for source in llm_response['source_documents']
        ]
    )
    
    ans = ans + '\n\nSources: \n' + sources_used
    return ans


In [20]:
import time

def llm_ans(query):
    start = time.time()
    
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)
    
    end = time.time()

    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans + time_elapsed_str


In [21]:
query = "How image stitching is normally used to composite partially overlapping photographs"
print(llm_ans(query))

According to the text, image stitching is normally used to composite partially overlapping photographs by:

1. Stitching the background scene to create a single sprite image that can be transmitted and used to re-create the background in each frame (Figure 8.7).
2. Removing moving foreground objects using median filtering or extracting them into a separate layer and later composing them back into the stitched panoramas.
3. Creating animated panoramic video textures by animating different portions of a panoramic scene with independently moving video loops.

These techniques are used to summarize and compress videos taken with a panning camera, allowing for efficient transmission and storage of large amounts of visual data.

Sources: 
Computer Vision: Algorithms and Applications, 2nd Edi - page: 1080 
Computer Vision: Algorithms and Applications, 2nd Edi - page: 548 
Computer Vision: Algorithms and Applications, 2nd Edi - page: 574

Time elapsed: 85 s


In [22]:
query = "What is category recognition in the bag of words (also known as bag of features or bag of keypoints)?"
print(llm_ans(query))

Category recognition in the bag of words (also known as bag of features or bag of keypoints) approach is a simple algorithm for recognizing categories of objects or images. It represents objects and images as unordered collections of feature descriptors, also known as visual words.

In this approach, the distribution (histogram) of visual words found in the query image is computed and compared to those found in the training images. This is done by extracting features at keypoints and then quantizing them to get a distribution over the learned visual words (feature cluster centers). The feature distribution histogram is used to learn a decision surface using a classification algorithm, such as a support vector machine.

The bag of words approach does not require geometric verification, unlike instance recognition, since individual instances of generic visual categories have relatively little spatial coherence to their features. This makes it a simple and efficient way to recognize categ

## Evalutation


In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain.chat_models import ChatOllama

# generator with openai models
generator_llm = llm
critic_llm = llm
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# generate testset
testset = generator.generate_with_langchain_docs(documents, test_size=10, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
