In [17]:
from langchain_chroma import Chroma
import fitz
import torch
from langchain_community.document_loaders.parsers import PyMuPDFParser
from langchain_core.documents.base import Blob
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import CLIPProcessor,CLIPModel
from PIL import Image
import base64
import io
import camelot
import pdfplumber
import pandas as pd

In [18]:
file_path="../data/sample/Nvidia/nvidia_2023.pdf"

model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

splitter=RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
docs=fitz.open(file_path)

In [19]:
model.eval()  

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [20]:
def embed_text(text):
    inputs=processor(
        text=text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=77
    )
    with torch.no_grad():
        features=model.get_text_features(**inputs)

        #normalize embeddings
        features=features/features.norm(dim=-1,keepdim=True)
        return features.squeeze().numpy()

In [21]:
def embed_image(image_data):
    if isinstance(image_data,str):
        image=Image.Open(image_data).convert("RGB")
    else: 
        image=image_data

    inputs=processor(images=image,return_tensors='pt')

    with torch.no_grad():
        features=model.get_image_features(**inputs)

        features=features/features.norm(dim=-1,keepdim=True)

        return features.squeeze().numpy()

In [22]:
## FUNCTION TO EXTRACT TABLEDS from the page or the doc

def extract_tables_from_page(file_path: str, page_num: int):
    """Extract *only real tables* from a given PDF page using Camelot or pdfplumber.
    Returns a list of extracted tables, or an empty list if none found."""
    
    tables_data = []

    def is_valid_table(df):
        """Heuristic filter to detect real tables (not paragraphs)."""
        # Must have at least 2 rows and 2 columns
        if df.shape[0] < 2 or df.shape[1] < 2:
            return False
        
        # Reject if there's only one long text cell (likely paragraph)
        avg_len = df.applymap(lambda x: len(str(x)) if x else 0).mean().mean()
        if avg_len > 100:  # large cell content => not a table
            return False
        
        # Reject empty or NaN-heavy tables
        if df.isnull().mean().mean() > 0.8:
            return False
        
        return True

    # 1️⃣ Camelot lattice
    try:
        tables = camelot.read_pdf(file_path, pages=str(page_num + 1), flavor='lattice')
        if tables:
            for idx, table in enumerate(tables):
                df = table.df
                if is_valid_table(df):
                    tables_data.append({
                        "table_num": idx,
                        "content": df.to_markdown(index=True),
                        "method": "camelot-lattice"
                    })
                
            if tables_data:
                return tables_data
    except Exception as e:
        print(f"Camelot lattice extraction failed on page {page_num}: {e}")

    # 2️⃣ Camelot stream
    try:
        tables = camelot.read_pdf(file_path, pages=str(page_num + 1), flavor='stream')
        if tables:
            for idx, table in enumerate(tables):
                df = table.df
                if is_valid_table(df):
                    tables_data.append({
                        "table_num": idx,
                        "content": df.to_markdown(index=True),
                        "method": "camelot-stream"
                    })
            if tables_data:
                return tables_data
    except Exception as e:
        print(f"Camelot stream extraction failed on page {page_num}: {e}")

    # 3️⃣ pdfplumber
    try:
        with pdfplumber.open(file_path) as pdf:
            page = pdf.pages[page_num]
            tables = page.extract_tables()
            if tables:
                for idx, table in enumerate(tables):
                    df = pd.DataFrame(table)
                    if is_valid_table(df):
                        tables_data.append({
                            "table_num": idx,
                            "content": df.to_markdown(index=True),
                            "method": "pdfplumber"
                        })
                if tables_data:
                    return tables_data
    except Exception as e:
        print(f"pdfplumber extraction failed on page {page_num}: {e}")

    return []


In [23]:
import os
from dotenv import load_dotenv

load_dotenv()

def generate_table_summary(table_text: str):
    """
    Uses Gemini (via LangChain) to create a short natural-language summary of the table.
    You can plug this function into your table extraction pipeline.
    """
    try:
        from langchain_google_genai import ChatGoogleGenerativeAI
        from langchain.prompts import ChatPromptTemplate

        api_key = os.getenv("GEMINI_API_KEY")

        if not api_key:
            raise ValueError("Missing GEMINI_API_KEY in environment variables.")

        llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro", temperature=0.3, google_api_key=api_key)

        # Prompt template
        prompt = ChatPromptTemplate.from_template(
            """You are a data summarizer. Summarize the key content of the following table in one or two sentences. 
            Focus on what the table represents, not exact numbers.

            Table:
            {table_text}
            """
        )

        # Combine prompt + LLM
        chain = prompt | llm

        # Run the chain
        summary = chain.invoke({"table_text": table_text}).content.strip()
        return summary

    except Exception as e:
        print(f"[WARN] Table summary generation failed: {e}")
        return "A table containing structured data extracted from the document."


In [24]:
print(docs.page_count)
print(docs[0].get_text())
all_docs=[]
all_embeddings=[]
image_data_store={}

169
Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
____________________________________________________________________________________________
FORM 10-K
☒
ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
    For the fiscal year ended January 29, 2023
OR
☐
TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
Commission file number: 0-23985
 
 
NVIDIA CORPORATION
(Exact name of registrant as specified in its charter)
Delaware
94-3177549
(State or other jurisdiction of
(I.R.S. Employer
Incorporation or Organization)
Identification No.)
2788 San Tomas Expressway
Santa Clara, California 95051
(408) 486-2000
(Address, including zip code, and telephone number, including area code, of principal executive offices)
Securities registered pursuant to Section 12(b) of the Act:
Title of each class
Trading Symbol(s)
Name of each exchange on which registered
Common Stock, $0.001 par valu

In [25]:
for i,page in enumerate(docs):
    
    ### processing text cotent of the page
    text=page.get_text()
    if text.strip():
        temp_doc=Document(page_content=text,metadata={"page":i,"type":"text"})
        text_chunks=splitter.split_documents([temp_doc])

        for chunk in text_chunks:
            all_docs.append(chunk)
            embedding=embed_text(chunk.page_content)
            all_embeddings.append(embedding)
    

    #### processing image content of the page

    for img_i,img in enumerate(page.get_images(full=True)):
        try:
            xref=img[0]
            base_image=docs.extract_image(xref)
            image_bytes=base_image['image']

            ###convert to PIL Image
            pil_image=Image.open(io.BytesIO(image_bytes)).convert('RGB')

            # pil_image.show()

            image_id=f"page: {i}_img_{img_i}"

            #save the image bytes in base64 format for mdoel use
            buffered=io.BytesIO()
            pil_image.save(buffered,format="PNG")
            img_base64=base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id]=img_base64

            #now create embeddings of image and store

            all_embeddings.append(embed_image(pil_image))
            img_doc=Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page":i,"type":"image","image_id":image_id}
            )
            all_docs.append(img_doc)
        except Exception as e:
            print(f"Error processing image {img_i} on page {i}: {e}")
            continue  
    
    ### EXTRACTING TAABLE FOR THE PAGE
    tables=extract_tables_from_page(docs,i)
    if tables:
        for table in tables:
            table_summary=generate_table_summary(table['content'])
            table_doc=Document(
                page_content=f"This table was extracted from page {i}. "
                             f"Summary: {table_summary}\n\n"
                             f"Table data:\n{table['content']}",
                metadata={
                    "page":i,
                    "type":"table",
                    "method":table['method'],
                    "table_num":table['table_num'],   
                }
            )
            #creating embedding for the table
            table_embed=embed_text(table_doc.page_content)
            all_docs.append(table_doc)
            all_embeddings.append(table_embed)
            

docs.close()        

Camelot lattice extraction failed on page 0: 'Document' object has no attribute 'seek'
Camelot stream extraction failed on page 0: 'Document' object has no attribute 'seek'
pdfplumber extraction failed on page 0: 'Document' object has no attribute 'seek'
Camelot lattice extraction failed on page 1: 'Document' object has no attribute 'seek'
Camelot stream extraction failed on page 1: 'Document' object has no attribute 'seek'
pdfplumber extraction failed on page 1: 'Document' object has no attribute 'seek'
Camelot lattice extraction failed on page 2: 'Document' object has no attribute 'seek'
Camelot stream extraction failed on page 2: 'Document' object has no attribute 'seek'
pdfplumber extraction failed on page 2: 'Document' object has no attribute 'seek'
Camelot lattice extraction failed on page 3: 'Document' object has no attribute 'seek'
Camelot stream extraction failed on page 3: 'Document' object has no attribute 'seek'
pdfplumber extraction failed on page 3: 'Document' object has 

In [26]:
all_docs[0]

Document(metadata={'page': 0, 'type': 'text'}, page_content='Table of Contents\nUNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C.\xa020549\n____________________________________________________________________________________________\nFORM 10-K\n☒\nANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\xa0\xa0\xa0\xa0For the fiscal year ended January\xa029, 2023\nOR\n☐\nTRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nCommission file number: 0-23985\n\xa0\n\xa0\nNVIDIA CORPORATION\n(Exact name of registrant as specified in its charter)\nDelaware\n94-3177549\n(State or other jurisdiction of\n(I.R.S. Employer\nIncorporation or Organization)\nIdentification No.)\n2788 San Tomas Expressway\nSanta Clara, California\xa095051\n(408) 486-2000\n(Address, including zip code, and telephone number, including area code, of principal executive offices)\nSecurities registered pursuant to Section 12(b) of the Act

In [27]:
import numpy as np
embeddings_array=np.array(all_embeddings)
embeddings_array

array([[-1.2875976e-02,  4.8772685e-02, -3.0231319e-02, ...,
        -6.5450622e-03, -7.6667597e-03,  7.3045469e-04],
       [ 1.8559901e-02, -3.1148756e-02, -4.7518387e-03, ...,
         4.4267699e-02, -1.5061229e-02, -4.5306623e-02],
       [ 9.4747823e-03, -6.3045239e-03,  5.4866103e-03, ...,
        -1.1381921e-02,  1.1250130e-02, -2.3404704e-02],
       ...,
       [-4.1294522e-03,  4.0898877e-03, -1.3361252e-02, ...,
         3.6599703e-02, -4.6057776e-03, -5.0448384e-02],
       [-1.9561185e-03, -8.9585978e-05,  4.2568678e-03, ...,
        -3.4495432e-02, -5.4750638e-03, -1.3920547e-02],
       [ 3.8375720e-02, -3.5405692e-03, -2.5180852e-02, ...,
         4.8641521e-02,  8.3123548e-03, -2.4033880e-02]], dtype=float32)

In [28]:
(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='Table of Contents\nUNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C.\xa020549\n____________________________________________________________________________________________\nFORM 10-K\n☒\nANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\xa0\xa0\xa0\xa0For the fiscal year ended January\xa029, 2023\nOR\n☐\nTRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nCommission file number: 0-23985\n\xa0\n\xa0\nNVIDIA CORPORATION\n(Exact name of registrant as specified in its charter)\nDelaware\n94-3177549\n(State or other jurisdiction of\n(I.R.S. Employer\nIncorporation or Organization)\nIdentification No.)\n2788 San Tomas Expressway\nSanta Clara, California\xa095051\n(408) 486-2000\n(Address, including zip code, and telephone number, including area code, of principal executive offices)\nSecurities registered pursuant to Section 12(b) of the A

In [29]:
import faiss
from langchain_community.vectorstores import FAISS

In [30]:
vector_store=FAISS.from_embeddings(
    text_embeddings=[(doc.page_content,emb) for doc,emb in zip(all_docs,embeddings_array)],
    embedding=None,
    metadatas=[doc.metadata for doc in all_docs]    
)


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [31]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x294ba135ad0>

In [32]:
vector_store.save_local("ReportVectorDB")

In [33]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x294ba135ad0>

In [34]:
import json
with open("image_data_store.json","w") as f:
    json.dump(image_data_store,f)

In [39]:
#creating method to retrieve text related to query

def docs_retrieval_based_query(query:str,k:int):
    """"Unified retrival using clip embedding model"""
    query_embedding=embed_text(query)

    retrived_docs=vector_store.asimilarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )
    retrived_docs

In [35]:
def multimodel_query_message(query: str, retrieved_docs: list[Document]):
    """Create a complete message to pass to the model for a particular query."""

    context = [
        {
            "type": "text",
            "text": f"Question: {query}\n\nContext:\n"
        }
    ]

    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
    table_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "table"]

    # Add text docs
    if text_docs:
        text_context = "\n\n".join([
            f"[Page: {doc.metadata.get('page')}] : {doc.page_content}" for doc in text_docs
        ])
        context.append({
            "type": "text",
            "text": f"Text excerpts: {text_context}"
        })

    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            context.append({
                "type": "text",
                "text": f"[Image from page {doc.metadata.get('page')}]"
            })
            context.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    # Add tables with optional summaries
    for doc in table_docs:
        table_content = doc.page_content
        context.append({
            "type": "text",
            "text": f"[Table from page {doc.metadata.get('page')} - method: {doc.metadata.get('method')}] Content: {table_content}"
        })

        
    context.append({
        "type": "text",
        "text": "Please generate a response to the user's query based on the context above. "
                "If the context is insufficient, reply that there is not enough information to answer the query."
    })

    return context
