In [None]:
# Install required packages
!pip install -q pdfplumber langchain faiss-cpu sentence-transformers transformers google-generativeai

# Import libraries
import io
import os
from getpass import getpass

import faiss
import google.generativeai as genai
import numpy as np
import pdfplumber
from google.colab import files
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer


In [None]:

# Upload PDF
uploaded = files.upload()

# Set Gemini API Key.
# The key must never be stored in the notebook itself: it is read from the
# GOOGLE_API_KEY environment variable when already set, otherwise it is
# prompted for without echoing to the cell output.
# NOTE: a key that was previously committed in this cell should be treated as
# compromised and revoked in the Google AI Studio / Cloud console.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Gemini API key: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel("models/gemini-2.5-pro")  # Avoid 404 by using correct model

# Step 1: Extract text and tables from PDF
def _format_table(table, page_number):
    """Render one extracted table as tab-separated text, or None if it is empty.

    `table` is a list of rows, each a list of cells; None cells become "".
    The first row is treated like any other line of the output.
    """
    rows = [["" if cell is None else str(cell) for cell in row] for row in table]
    # A table whose cells are all blank carries no information worth indexing.
    if not any(cell.strip() for row in rows for cell in row):
        return None
    lines = [f"Table from Page {page_number}:"]
    lines.extend("\t".join(row) for row in rows)
    return "\n".join(lines) + "\n"


def extract_text_and_tables(file_path):
    """Extract per-page text and tables from a PDF as a flat list of strings.

    Args:
        file_path: Passed straight to `pdfplumber.open`.

    Returns:
        A list of chunks in page order. For each page, the stripped page text
        (prefixed with "Text from Page N:") comes first, followed by one
        tab-separated chunk per table (prefixed with "Table from Page N:").
        Pages with no text (or only whitespace) and tables with no non-blank
        cells contribute nothing, so the result never contains empty chunks.
    """
    all_chunks = []
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Strip before testing so whitespace-only pages are skipped too.
            text = (page.extract_text() or "").strip()
            if text:
                all_chunks.append(f"Text from Page {page_number}:\n{text}")

            for table in page.extract_tables():
                if not table:
                    continue
                table_chunk = _format_table(table, page_number)
                if table_chunk is not None:
                    all_chunks.append(table_chunk)
    return all_chunks


# Fail with a clear message if the upload dialog was cancelled.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose a PDF.")

# Use the first uploaded file. On a name collision Colab saves the upload under
# a renamed path (e.g. "report (1).pdf"), so the dict key is not guaranteed to
# be the path that was just written. Parsing the uploaded bytes directly avoids
# reading a stale file of the same name left over from an earlier upload.
pdf_path = next(iter(uploaded))
chunks = extract_text_and_tables(io.BytesIO(uploaded[pdf_path]))



Saving Meta’s Q1 2024 Financial Report.pdf to Meta’s Q1 2024 Financial Report (1).pdf


In [None]:
# Step 2: Embed and store in FAISS
# Without chunks there is nothing to embed, and reading the embedding
# dimension below would fail with an unhelpful IndexError.
if not chunks:
    raise ValueError("No text or tables were extracted from the PDF; nothing to index.")

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# FAISS expects a C-contiguous float32 matrix; make that explicit instead of
# relying on the encoder's default output type.
embeddings = np.ascontiguousarray(embed_model.encode(chunks), dtype="float32")
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # exact (brute-force) L2 search
index.add(embeddings)

# Step 3: Retrieve relevant chunks for a query
def retrieve_top_chunks(query, k=3):
    """Return up to `k` chunks closest to `query`, nearest first.

    Args:
        query: Question text; embedded with the module-level `embed_model`.
        k: Maximum number of chunks to return.

    Returns:
        A list of strings taken from the module-level `chunks`. It holds fewer
        than `k` items when the index contains fewer than `k` vectors, and is
        empty when `k` is not positive or the index is empty.
    """
    # FAISS pads the result with -1 when fewer than k neighbours exist; used as
    # a list index, -1 would silently return the LAST chunk. Clamp k and drop
    # any id that is not a valid position in `chunks`.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    query_embedding = np.asarray(embed_model.encode([query]), dtype="float32")
    _, indices = index.search(query_embedding, k)
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Step 4: Ask Gemini for the answer
def gemini_answer(query, context_chunks):
    """Ask the module-level Gemini `model` to answer `query` from the chunks.

    The chunks are joined with newlines and placed in a fixed
    financial-analyst prompt ahead of the question. Returns the response
    text; if generating or reading the response raises, the exception is not
    propagated and a string starting with "❌ Error: " is returned instead.
    """
    context = "\n".join(context_chunks)
    prompt = f"""You're a helpful financial analyst assistant.

Based on the following extracted context from a financial report (text and tables), answer the question clearly:

Context:
{context}

Question:
{query}
"""
    try:
        reply = model.generate_content(prompt)
        # Reading .text stays inside the try so a failure here is also reported.
        answer = reply.text
    except Exception as err:
        return f"❌ Error: {str(err)}"
    else:
        return answer

# Step 5: Main interface to ask questions
def ask_query(query, top_k=3):
    """Retrieve context for `query`, print it, then print Gemini's answer.

    Calls `retrieve_top_chunks` with `k=top_k`, prints each returned chunk
    under a numbered header, and finally prints the result of
    `gemini_answer` for the same chunks. Returns None.
    """
    retrieved = retrieve_top_chunks(query, k=top_k)

    print(f"\n🔍 Top {top_k} retrieved chunks:\n")
    for position, chunk_text in enumerate(retrieved, start=1):
        print(f"--- Chunk {position} ---\n{chunk_text}\n")

    print("💡 Gemini Answer:")
    answer = gemini_answer(query, retrieved)
    print(answer)





🔍 Top 3 retrieved chunks:

--- Chunk 1 ---
Text from Page 6:
META PLATFORMS, INC.
CONDENSED CONSOLIDATED BALANCE SHEETS
(In millions)
(Unaudited)
March 31, 2024 December 31, 2023
Assets
Current assets:
Cash and cash equivalents $ 32,307 $ 41,862
Marketable securities 25,813 23,541
Accounts receivable, net 13,430 16,169
Prepaid expenses and other current assets 3,780 3,793
Total current assets 75,330 85,365
Non-marketable equity securities 6,218 6,141
Property and equipment, net 98,908 96,587
Operating lease right-of-use assets 13,555 13,294
Goodwill 20,654 20,654
Other assets 8,179 7,582
Total assets $ 222,844 $ 229,623
Liabilities and stockholders' equity
Current liabilities:
Accounts payable $ 3,785 $ 4,849
Operating lease liabilities, current 1,676 1,623
Accrued expenses and other current liabilities 22,640 25,488
Total current liabilities 28,101 31,960
Operating lease liabilities, non-current 17,570 17,226
Long-term debt 18,387 18,385
Long-term income taxes 7,795 7,514
Other liabi

In [None]:
# ✅ Example usage
# Alternative questions for the same report; uncomment one to run it instead.
# ask_query("What was Meta’s revenue in Q1 2024?")
# ask_query("In META PLATFORMS, INC. CONDENSED CONSOLIDATED BALANCE SHEETS, What was Goodwill in March 31, 2024?")
# ask_query("In META PLATFORMS, INC. CONDENSED CONSOLIDATED BALANCE SHEETS, What was Total liabilities in March 31, 2024?")
# ask_query("In META PLATFORMS, INC. CONDENSED CONSOLIDATED BALANCE SHEETS, What was Total stockholders' equity in March 31, 2024?")
# Active example: uses the default top_k of 3, prints the retrieved chunks and then Gemini's answer.
ask_query("what was Net cash used in financing activities in 2024")


🔍 Top 3 retrieved chunks:

--- Chunk 1 ---
Text from Page 8:
META PLATFORMS, INC.
CONDENSED CONSOLIDATED STATEMENTS OF CASH FLOWS
(In millions)
(Unaudited)
Three Months Ended March 31,
2024 2023
Supplemental cash flow data
Cash paid for income taxes, net $ 630 $ 405
Cash paid for interest, net of amounts capitalized $ 121 $ 182
Non-cash investing and financing activities:
Property and equipment in accounts payable and accrued expenses and
other current liabilities $ 4,217 $ 4,466
Acquisition of businesses in accrued expenses and other current liabilities
and other liabilities $ 116 $ 263
8

--- Chunk 2 ---
Table from Page 8:
Supplemental cash flow data
Cash paid for income taxes, net $ 630 $ 405
Cash paid for interest, net of amounts capitalized $ 121 $ 182
Non-cash investing and financing activities:
Property and equipment in accounts payable and accrued expenses and
other current liabilities $ 4,217 $ 4,466
Acquisition of businesses in accrued expenses and other current liabilities
