## Install required packages

In [None]:
%pip install ollama tavily-python requests pydantic pymupdf PyPDF2 openai 




In [29]:
import ollama
from tavily import TavilyClient
from typing import Dict, Callable
import re


## Pull the models to run them locally

In [8]:
# pull model
ollama.pull('gemma3:4b')

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [9]:
ollama.pull('llava:latest')

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [10]:
ollama.pull('qwen3:4b')

ProgressResponse(status='success', completed=None, total=None, digest=None)

### To verify the LLMs that we have currently downloaded

In [11]:
!ollama list

NAME            ID              SIZE      MODIFIED      
qwen3:4b        2bfd38a7daaf    2.6 GB    2 seconds ago    
llava:latest    8dd30f6b0cb1    4.7 GB    3 seconds ago    
gemma3:4b       a2af6cc3eb7f    3.3 GB    5 seconds ago    


(Optional) You can delete them with '! ollama rm <name_of_model>'

In [30]:
!ollama rm llama3.2-vision:latest

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hdeleted 'llama3.2-vision:latest'


### Test the LLMs

First off, we test the ability of Gemma to describe and read an image

In [12]:
# interact with model (locally)
stream = ollama.chat(
    model='gemma3:4b',
    messages=[{
        'role': 'user',
        'content': "Can you describe this image?",
        'images': ['/Users/jacopocirica/Desktop/AI Bootcamp/Screenshot 2025-06-12 at 16.33.02.png']
    }],
    stream=True,
)

for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)

Okay, let’s break down this diagram illustrating Causal Language Modeling pretraining.

**Overall Concept:**

The image represents a simplified model used for causal language modeling. Causal language models are designed to predict the next word in a sequence, considering only the words that came before it. This is in contrast to autoregressive models which consider the entire sequence.

**Components and Flow:**

1.  **Input Prompt:**  The input prompt - in this case, "is" - is represented as a series of colored blocks (PAD - padding).  The diagram is illustrating how the model receives the input sequence.

2.  **LLM (Large Language Model):** This is the core component. It's depicted as a large square grid.  This represents the layers of the neural network responsible for processing the input sequence.

3.  **Hidden States:**  The circles within the LLM represent "hidden states." These are the intermediate representations that the model creates at each step during the processing of the

Then, we try Qwen for the function calling

In [13]:
# Define the function

def get_url(topic: str):
    tavily_client = TavilyClient(api_key="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")
    response = tavily_client.search(query=f"Tell me the main paper about this topic:{topic}",include_domains=["arxiv.org/abs/"], search_depth= "advanced")
    return response['results'][0]['url']

# Call the function

get_url("Transformer Architecture")


'https://arxiv.org/abs/2311.17633'

Qwen calls the function

In [14]:
available_functions: Dict[str, Callable] = {
    "get_url": get_url
}
response = ollama.chat(
    model='qwen3:4b',
    messages=[{
        'role': 'user',
        'content': "Can you tell me the main paper about this topic: Mamba Architecture?"
    }],
    tools=[get_url]
)

if response.message.tool_calls:
    for tool_call in response.message.tool_calls:
        if tool_call.function.name == "get_url":
            print("Topic: ", tool_call.function.arguments['topic'])
            print("Tool call: ", tool_call.function.name)
            url=available_functions[tool_call.function.name](tool_call.function.arguments['topic'])
            print("URL: ", url)


Topic:  Mamba Architecture
Tool call:  get_url
URL:  https://arxiv.org/abs/2502.07161


## Function to download the PDF

In [15]:
import requests
import os

def download_arxiv_pdf(arxiv_url: str, destination_folder: str, filename: str = None):
    """
    Downloads a PDF from an arXiv URL and saves it to a specified folder.

    Args:
        arxiv_url (str): Direct link to the arXiv PDF (e.g., https://arxiv.org/pdf/2311.17633).
        destination_folder (str): Local folder path to save the file.
        filename (str, optional): Name to save the file as (e.g., 'paper.pdf').
                                  If None, it uses the arXiv ID as the filename.

    Returns:
        str: Full path to the saved file.
    """
    # Extract default filename from URL if not provided
    if filename is None:
        filename = arxiv_url.strip("/").split("/")[-1] + ".pdf"

    os.makedirs(destination_folder, exist_ok=True)
    destination_path = os.path.join(destination_folder, filename)

    try:
        response = requests.get(arxiv_url)
        response.raise_for_status()  # Raise error for bad responses
        with open(destination_path, "wb") as f:
            f.write(response.content)
        print(f"✅ PDF downloaded to: {destination_path}")
        return destination_path
    except Exception as e:
        print(f"❌ Error downloading file: {e}")
        return None


In [16]:
download_arxiv_pdf("https://arxiv.org/pdf/2502.07161", "./PDFs")

✅ PDF downloaded to: ./PDFs/2502.07161.pdf


'./PDFs/2502.07161.pdf'

## Function to convert each page of PDF into an image

In [17]:
import fitz
import sys
import os
import glob

# pdf_path: the folder of all the PDF files
# saved_path: the path of the saved page images
def convert_pdf_to_image(pdf_path, pdf_file, saved_path):

    if not os.path.exists(saved_path):
        os.mkdir(saved_path)
    else:
        files = glob.glob('saved_path/*')
        for f in files:
            os.remove(f)

    try:
        fitz.TOOLS.mupdf_warnings()  # empty the problem message container
        doc = fitz.open(pdf_path + "/" + pdf_file)
        warnings = fitz.TOOLS.mupdf_warnings()
        if warnings:
            print(warnings)
            raise RuntimeError()

        for page in doc:  # iterate through the pages
            pix = page.get_pixmap()  # render page to an image
            pix.save(saved_path + "/" + f"{pdf_file[:-4]}-{page.number}.png")  # store image as a PNG
        return

    except:
        print("error when opening the pdf file {}".format(pdf_file))
        return None

In [18]:
convert_pdf_to_image("./PDFs", "2502.07161.pdf", "./PDFs/images_v1")

## Function to extract the text from the PDF

In [19]:
def extract_text(filepath):
    pdf=fitz.open(filepath)
    text="".join([page.get_text() for page in pdf])

    return text

In [20]:
extract_text("./PDFs/2502.07161.pdf")

'A Survey on Mamba Architecture for Vision\nApplications\nFady Ibrahim\nDepartment of Computer Science\nToronto Metropolitan University\nToronto, Canada\nf1ibrahim@torontomu.ca\nGuangjun Liu\nDepartment of Aerospace Engineering\nToronto Metropolitan University\nToronto, Canada\ngjliu@torontomu.ca\nGuanghui Wang\nDepartment of Computer Science\nToronto Metropolitan University\nToronto, Canada\nwangcs@torontomu.ca\nAbstract—Transformers have become foundational for visual\ntasks such as object detection, semantic segmentation, and\nvideo understanding, but their quadratic complexity in attention\nmechanisms presents scalability challenges. To address these\nlimitations, the Mamba architecture utilizes state-space models\n(SSMs) for linear scalability, efficient processing, and improved\ncontextual awareness. This paper investigates Mamba architec-\nture for visual domain applications and its recent advancements,\nincluding Vision Mamba (ViM) and VideoMamba, which in-\ntroduce bidirection

## Vision Model to interpret and store the figures and table in the PDF

### First we create a Pydantic schema

In [21]:
import json
from pydantic import BaseModel, Field, field_validator, ValidationError
from typing import Literal
# Step 1: Define our data contract
class Extract(BaseModel):
    type: Literal["table", "image"] = Field(..., description="Type of element to extract")
    page: str = Field(..., description="Number of the page to extract from")
    number: str = Field(..., description="Number of the element to extract")
    description: str = Field(..., description="Description of the element to extract")
    headline: str = Field(None, description="Title of the element to extract")
    subheadline: str = Field(None, description="Subtitle of the element to extract")
    text: str = Field(None, description="Text of the element to extract")

print("Product schema:")
print(json.dumps(Extract.model_json_schema(), indent=2))

Product schema:
{
  "properties": {
    "type": {
      "description": "Type of element to extract",
      "enum": [
        "table",
        "image"
      ],
      "title": "Type",
      "type": "string"
    },
    "page": {
      "description": "Number of the page to extract from",
      "title": "Page",
      "type": "string"
    },
    "number": {
      "description": "Number of the element to extract",
      "title": "Number",
      "type": "string"
    },
    "description": {
      "description": "Description of the element to extract",
      "title": "Description",
      "type": "string"
    },
    "headline": {
      "default": null,
      "description": "Title of the element to extract",
      "title": "Headline",
      "type": "string"
    },
    "subheadline": {
      "default": null,
      "description": "Subtitle of the element to extract",
      "title": "Subheadline",
      "type": "string"
    },
    "text": {
      "default": null,
      "description": "Text of the ele

In [22]:
from pydantic import BaseModel, Field
from typing import Literal, List, Optional

class Extract(BaseModel):
    type: Literal["table", "image"] = Field(..., description="Type of element to extract, a table or a figure")
    page: int = Field(..., description="Number of the page to extract from")
    number: int = Field(..., description="Number of the element to extract")
    description: str = Field(..., description="Description of the element to extract")
    headline: Optional[str] = Field(None, description="Title of the element to extract")
    subheadline: Optional[str] = Field(None, description="Subtitle of the element to extract")
    text: Optional[str] = Field(None, description="Text of the element to extract")

class ExtractResponse(BaseModel):
    elements: List[Extract] = Field(..., description="List of tables or images extracted from the page")

print("ExtractResponse schema:")
print(json.dumps(ExtractResponse.model_json_schema(), indent=2))


ExtractResponse schema:
{
  "$defs": {
    "Extract": {
      "properties": {
        "type": {
          "description": "Type of element to extract, a table or a figure",
          "enum": [
            "table",
            "image"
          ],
          "title": "Type",
          "type": "string"
        },
        "page": {
          "description": "Number of the page to extract from",
          "title": "Page",
          "type": "integer"
        },
        "number": {
          "description": "Number of the element to extract",
          "title": "Number",
          "type": "integer"
        },
        "description": {
          "description": "Description of the element to extract",
          "title": "Description",
          "type": "string"
        },
        "headline": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Titl

In [23]:
# interact with model (locally)
response = ollama.chat(
    model='gemma3:4b',
    messages=[{
        'role': 'user',
        'content': "Can you find and describe all tables and images on this page?",
        'images': ['/Users/jacopocirica/Desktop/AI Bootcamp/PDFs/images_v1/2502.07161-0.png']
    }],
    stream=False,
    format=ExtractResponse.model_json_schema(),
)


final = ExtractResponse.model_validate_json(response.message.content)
print(final)


elements=[Extract(type='table', page=3, number=1, description='This table lists the key innovations and characteristics of the Mamba architecture. It outlines core concepts like its design choices, quantization features, and performance metrics. The table is organized around the fundamental components of Mamba: its key design aspects, its use of implicit convolutions, and its optimization techniques.', headline=None, subheadline=None, text=None), Extract(type='table', page=4, number=1, description="This table compares Mamba and existing models (ViM, VideoMamba) based on various metrics like computational cost, model size, and performance on specific tasks. It highlights Mamba's advantages, particularly in terms of computational efficiency and model size compared to Transformers.", headline=None, subheadline=None, text=None), Extract(type='image', page=1, number=1, description='This figure shows the Mamba architecture, depicting the implicit convolution layers, the gating mechanism, and

### Router

In [25]:
class BinaryResponse(BaseModel):
    answer: Literal["Yes", "No"] = Field(..., description="Whether the image contains Tables or Figures")

In [26]:
response = ollama.chat(
    model='llava:latest',
    messages=[{
        'role': 'user',
        'content': "Are there Figures (charts or diagram) or tables? Please answer only with 'Yes' or 'No'.",
        'images': ['/Users/jacopocirica/Desktop/AI Bootcamp/PDFs/images_v1/2502.07161-3.png']
    }],
    stream=False,
    format=BinaryResponse.model_json_schema(),
)

final = BinaryResponse.model_validate_json(response.message.content)
print(final.answer)


No


In [27]:
def tableandimage(page: str):
    response = ollama.chat(
    model='llava:latest',
    messages=[{
        'role': 'user',
        'content': "Are there Figures (charts or diagram) or tables? Please answer only with 'Yes' or 'No'.",
        'images': [page]
    }],
    stream=False,
    format=BinaryResponse.model_json_schema(),
)

    final = BinaryResponse.model_validate_json(response.message.content)

    return final.answer


### Combine Router with Table/Figure extraction

In [26]:
results = {}

image_dir = "/Users/jacopocirica/Desktop/AI Bootcamp/PDFs/images_v1"
image_files = sorted([f for f in os.listdir(image_dir) if f.endswith(".png")])

for img_file in image_files:
    page_number = os.path.splitext(img_file)[0].split("-")[-1]  # get page number from filename
    image_path = os.path.join(image_dir, img_file)

    if tableandimage(image_path) != "Yes":
        print(f"Skipping page {page_number} (no table or image)")
        continue  # Skip this page if there's nothing to extract

    try:
        response = ollama.chat(
            model='gemma3:4b',
            messages=[{
                'role': 'user',
                'content': "Can you find and describe all tables and images on this page?",
                'images': [image_path]
            }],
            stream=False,
            format=ExtractResponse.model_json_schema(),
        )

        parsed = ExtractResponse.model_validate_json(response.message.content)
        results[page_number] = [e.model_dump() for e in parsed.elements]

    except Exception as e:
        print(f"Failed on page {page_number}: {e}")
        results[page_number] = []

# --- Final output ---
print(json.dumps(results, indent=2))


Skipping page 0 (no table or image)
Skipping page 5 (no table or image)
Skipping page 6 (no table or image)
Skipping page 9 (no table or image)
{
  "1": [
    {
      "type": "table",
      "page": 1,
      "number": 1,
      "description": "Mamba Block Architecture with Selective State Space Model:  This table shows a diagram of the Mamba architecture, which utilizes a selective state space model. The diagram illustrates the key components including the input, state, and output layers, with emphasis on the selective state space modeling. It is used to show how information is integrated for a more efficient hidden state model.",
      "headline": null,
      "subheadline": null,
      "text": null
    },
    {
      "type": "table",
      "page": 1,
      "number": 2,
      "description": "Bi-Directional Selective Scanning Mechanism: This table depicts a diagram illustrating the mechanism of the Bi-Directional Selective Scanning Mechanism. It shows the sequential processing flow of inf

## Let's put all toghether 

In [39]:
### Step 1: User query is converted into a keyword search and the pdf is downloaded
#### 1.1 function to get the url from the keyword
def get_url(topic: str):
    tavily_client = TavilyClient(api_key="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")
    response = tavily_client.search(query=f"Tell me the main paper about this topic:{topic}",include_domains=["arxiv.org/abs/"], search_depth= "advanced")
    return response['results'][0]['url']
#### 1.2 utility function
def extract_arxiv_id(url: str) -> str:
    return url.rstrip("/").split("/")[-1] + ".pdf"

#### 1.3 function to download the pdf

def download_arxiv_pdf(arxiv_url: str, destination_folder: str, filename: str = None):
    """
    Downloads a PDF from an arXiv URL and saves it to a specified folder.

    Args:
        arxiv_url (str): Direct link to the arXiv PDF (e.g., https://arxiv.org/pdf/2311.17633).
        destination_folder (str): Local folder path to save the file.
        filename (str, optional): Name to save the file as (e.g., 'paper.pdf').
                                  If None, it uses the arXiv ID as the filename.

    Returns:
        str: Full path to the saved file.
    """
    # Extract default filename from URL if not provided
    if filename is None:
        filename = arxiv_url.strip("/").split("/")[-1] + ".pdf"

    os.makedirs(destination_folder, exist_ok=True)
    destination_path = os.path.join(destination_folder, filename)

    try:
        response = requests.get(arxiv_url)
        response.raise_for_status()  # Raise error for bad responses
        with open(destination_path, "wb") as f:
            f.write(response.content)
        print(f"✅ PDF downloaded to: {destination_path}")
        return destination_path
    except Exception as e:
        print(f"❌ Error downloading file: {e}")
        return None
def convert_to_pdf_url(url: str) -> str:
    return url.replace("abs", "pdf").replace("html", "pdf")
   
def identify_keywords_download_pdf(user_query: str):
    available_functions: Dict[str, Callable] = {
    "get_url": get_url}

    response = ollama.chat(
        model='qwen3:4b',
        messages=[{
        'role': 'user',
        'content': f"Can you tell me the main paper about this topic: {user_query}",
        }],
        tools=[get_url])
    if response.message.tool_calls:
        for tool_call in response.message.tool_calls:
            if tool_call.function.name == "get_url":
                print("Topic: ", tool_call.function.arguments['topic'])
                print("Tool call: ", tool_call.function.name)
                url=available_functions[tool_call.function.name](tool_call.function.arguments['topic'])
                url_pdf=convert_to_pdf_url(url)
                print("Downloading URL: ", url_pdf)
                download_arxiv_pdf(url_pdf, "./PDFs")

                return extract_arxiv_id(url_pdf)
    else:
        print("No URL found")
        return None
             
### Step 2 convert each PDFs to image, extract images and text 

#### Step 2.1 Extract text
def extract_text(pdf_path):
    pdf=fitz.open(pdf_path)
    text="".join([page.get_text() for page in pdf])

    return text

####Step 2.2 convert each page to image


# pdf_path: the folder of all the PDF files
# saved_path: the path of the saved page images
def convert_pdf_to_image(pdf_path, pdf_file, saved_path):

    if not os.path.exists(saved_path):
        os.mkdir(saved_path)
    else:
        files = glob.glob('saved_path/*')
        for f in files:
            os.remove(f)

    try:
        fitz.TOOLS.mupdf_warnings()  # empty the problem message container
        doc = fitz.open(pdf_path + "/" + pdf_file)
        warnings = fitz.TOOLS.mupdf_warnings()
        if warnings:
            print(warnings)
            raise RuntimeError()

        for page in doc:  # iterate through the pages
            pix = page.get_pixmap()  # render page to an image
            pix.save(saved_path + "/" + f"{pdf_file[:-4]}-{page.number}.png")  # store image as a PNG
        return

    except:
        print("error when opening the pdf file {}".format(pdf_file))
        return None


#Step 2.3 Decide if the image contains figures or tables
def tableandimage(page: str):
    response = ollama.chat(
    model='llava:latest',
    messages=[{
        'role': 'user',
        'content': "Are there Figures (charts or diagram) or tables? Please answer only with 'Yes' or 'No'.",
        'images': [page]
    }],
    stream=False,
    format=BinaryResponse.model_json_schema(),
)

    final = BinaryResponse.model_validate_json(response.message.content)

    return final.answer

#Step 2.4 Extract the description of each table or image
def extract_tableandimage_content(image_dir: str):
    results={}
    image_files = sorted([f for f in os.listdir(image_dir) if f.endswith(".png")])
    for img_file in image_files:
        page_number = os.path.splitext(img_file)[0].split("-")[-1]  # get page number from filename
        image_path = os.path.join(image_dir, img_file)

        if tableandimage(image_path) != "Yes":
            print(f"Skipping page {page_number} (no table or image)")
            continue  # Skip this page if there's nothing to extract

        try:
            response = ollama.chat(
            model='gemma3:4b',
            messages=[{
                'role': 'user',
                'content': "Can you find and describe all tables and images on this page?",
                'images': [image_path]
            }],
            stream=False,
            format=ExtractResponse.model_json_schema(),
        )

            parsed = ExtractResponse.model_validate_json(response.message.content)
            results[page_number] = [e.model_dump() for e in parsed.elements]

        except Exception as e:
            print(f"Failed on page {page_number}: {e}")
            results[page_number] = []

    return results


####Step 2.5 Put togheter PDF text and PDF figures and tables
def extract_combine_images_text(user_query, pdf_path, saved_path):
    pdf_file=identify_keywords_download_pdf(user_query)
    file_path=pdf_path+"/"+pdf_file
    text_pdf=extract_text(file_path)
    convert_pdf_to_image(pdf_path, pdf_file, saved_path)
    figures_image_content=extract_tableandimage_content(saved_path)
    figures_image_content= str(figures_image_content)
    content_pdf = f"PDF content: {text_pdf}. DESCRITPION FIGURES AND TABLES: {figures_image_content}"
    

    return content_pdf

###Step 3: Conversate with the PDF
#### Step 3.1 Edit the response
def remove_think_block(response_str):
    return re.sub(r'<think>.*?</think>', '', response_str, flags=re.DOTALL).strip()
    

def reply_user(query: str, content_pdf: str)->str:
    system_prompt=f"""# Role and Objective

You are a research assistant. Your task is to read technical articles and reply user questions. You will have the PDF content

# Instructions

Review the article provided by the user and generate a response to user query

# Guidelines

- Write in a neutral and academic tone.
- Use simple, precise language to ensure clarity for a broad audience.
- Keep the responses concise (150-300 words) unless otherwise specified.
- Assume the audience has general technical knowledge but may not be familiar with the specific field of the paper.
- Please limit bullets under key terms, key findings, and further reading, sections to the 3-5 most essential.
"""

    user_prompt=f"""
Given the content of an academic paper, reply the the user query
#PDF Content:
{content_pdf}
#User query:
{query}
"""
    response = ollama.chat(
    model='qwen3:4b',
    messages=[{
        'role': 'system',
        'content': system_prompt
    },{
        'role': 'user',
        'content': user_prompt
    }])
    final_response= remove_think_block(response.message.content)
    return final_response



    





            
    






## Test it

In [None]:
def chat_about_paper():
    # Step 1: Get the keyword and extract the PDF content
    keyword = input("Enter the paper that you want to look up: ")
    content = extract_combine_images_text(keyword, "./PDFs", "./PDFs/images_v2")
    print("Content loaded. You can now ask anything about the paper. Type 'exit' to end the chat.\n")

    # Step 2: Loop for querying the paper content
    while True:
        query = input("Your question: ")
        if query.lower().strip() == "exit":
            print("Exiting chat. Goodbye!")
            break

        response = reply_user(query, content)
        
        # Optional: clean output to remove <think> block
        print("\nAnswer:")
        print(extract_final_answer(response))  # using the function from previous message
        print("\n---\n")

# Now you can call this function to start the interactive session
chat_about_paper()


Topic:  Mamba Architecture
Tool call:  get_url
Downloading URL:  https://arxiv.org/pdf/2502.07161
✅ PDF downloaded to: ./PDFs/2502.07161.pdf
Skipping page 3 (no table or image)
Skipping page 5 (no table or image)
Skipping page 8 (no table or image)
Skipping page 9 (no table or image)
Content loaded. You can now ask anything about the paper. Type 'exit' to end the chat.


Answer:
The main topic of the paper is **the comparison and evaluation of various computer vision models** (including MAMBA, VideoMamba, YOLO, Transformers, and others) across different tasks such as **image classification, semantic segmentation, object detection, and video understanding**. The paper analyzes their performance metrics (e.g., FLOPs, accuracy, computational efficiency) to determine their suitability for specific applications, such as resource-constrained environments versus complex scenes. It highlights the trade-offs between computational efficiency, accuracy, and scalability of these models.

---


Ans