In [None]:
!pip install -q pdfplumber langchain faiss-cpu sentence-transformers transformers google-generativeai


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Import libraries
import getpass
import os

import faiss
import google.generativeai as genai
import numpy as np
import pdfplumber
from google.colab import files
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [None]:
# Upload PDF
# The result of files.upload() is stored in `uploaded`; a later cell takes its
# first key as the path of the PDF to process (presumably a mapping of
# filename -> file content; confirm against the google.colab docs).
uploaded = files.upload()  # e.g., upload "Meta’s Q1 2024 Financial Report.pdf"


Saving Meta’s Q1 2024 Financial Report.pdf to Meta’s Q1 2024 Financial Report.pdf


In [None]:
# Set Gemini API Key
# The key must never be stored in the notebook itself. It is read from the
# GOOGLE_API_KEY environment variable; if that is unset or empty, the user is
# asked for it through a hidden prompt so it does not appear in the cell output.
# NOTE: any key that was previously saved in this cell should be treated as
# compromised and revoked/rotated.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel("models/gemini-2.5-pro")

In [None]:
# Step 1: Extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the concatenated text of all pages of the PDF at ``file_path``.

    Each page's extracted text is followed by a newline. Pages for which
    ``extract_text()`` returns a falsy value (``None`` or an empty string)
    contribute nothing to the result.
    """
    with pdfplumber.open(file_path) as document:
        # Extract lazily, page by page, while the PDF is still open.
        extracted = (page.extract_text() for page in document.pages)
        return "".join(content + "\n" for content in extracted if content)

# Use the first uploaded file as the PDF to process. Fail early with a clear
# message when nothing was uploaded, instead of an opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; run the upload cell and select a PDF.")
pdf_path = next(iter(uploaded))
pdf_text = extract_text_from_pdf(pdf_path)

In [None]:
# Step 2: Split the extracted text into overlapping chunks.
# Separators are tried in the listed order: blank line, newline, period, space.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", " "],
)
chunks = text_splitter.split_text(pdf_text)


In [None]:
# Step 3: Embed and store in FAISS
# Without any chunks there is nothing to index, and reading the embedding
# dimension below would fail with an opaque IndexError, so stop here with an
# explicit message instead.
if not chunks:
    raise ValueError(
        "No text chunks to embed; no text could be extracted from the PDF."
    )

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Make the float32 dtype explicit for FAISS; np.asarray avoids an extra copy
# when the encoder already returns a float32 array.
embeddings = np.asarray(embed_model.encode(chunks), dtype="float32")
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [None]:
# Step 4: Retrieve relevant chunks for a query
def retrieve_top_chunks(query, k=3):
    """Return up to ``k`` chunks whose embeddings are closest to ``query``.

    Args:
        query: The question text to embed and search for.
        k: Maximum number of chunks to return.

    Returns:
        A list of chunk strings in the order returned by the index search.
        The list is shorter than ``k`` when fewer chunks are indexed, and
        empty when ``k`` is not positive or there are no chunks.
    """
    # Never ask the index for more neighbours than there are chunks.
    k = min(k, len(chunks))
    if k <= 0:
        return []
    query_embedding = np.asarray(embed_model.encode([query]), dtype="float32")
    _distances, indices = index.search(query_embedding, k)
    # FAISS reports missing neighbours as -1; indexing `chunks` with -1 would
    # silently return the last chunk, so drop those entries.
    return [chunks[i] for i in indices[0] if i >= 0]

In [None]:
# Step 5: Ask Gemini for the answer
def gemini_answer(query, context_chunks):
    """Ask the Gemini model to answer ``query`` using ``context_chunks``.

    The chunks are joined with newlines and embedded, together with the
    question, in a single prompt. Returns the text of the model's response;
    if anything raises while generating or reading the response, an error
    string is returned instead of propagating the exception.
    """
    context = "\n".join(context_chunks)
    prompt = f"""Based on the following financial report context, answer the question:

Context:
{context}

Question:
{query}
"""
    try:
        return model.generate_content(prompt).text
    except Exception as err:
        return f"❌ Error: {str(err)}"

In [None]:
# 🔎 Step 6: Main interface to ask questions
def ask_query(query, top_k=3):
    """Retrieve chunks for ``query``, print them, then print Gemini's answer.

    Args:
        query: The question to ask.
        top_k: Number of chunks requested from ``retrieve_top_chunks``.
    """
    retrieved = retrieve_top_chunks(query, k=top_k)
    print(f"\n🔍 Top {top_k} retrieved chunks:\n")
    for position, passage in enumerate(retrieved, start=1):
        print(f"--- Chunk {position} ---\n{passage}\n")
    print("💡 Gemini Answer:")
    answer = gemini_answer(query, retrieved)
    print(answer)


In [None]:
# ✅ Example Usage
# Only the uncommented call below is executed. The commented-out calls are
# alternative sample questions; uncomment one to run it.
# ask_query("What was Meta’s revenue in Q1 2024?")
# ask_query("What was Family daily active people (DAP) 2024?")
ask_query("In META PLATFORMS, INC. CONDENSED CONSOLIDATED BALANCE SHEETS, What was Operating lease right-of-use assets in March 31, 2024?")
# ask_query("What was Headcount of March 31, 2024?")
# ask_query("What was Total revenue and revenue on a constant currency basis in 2024?")


🔍 Top 3 retrieved chunks:

--- Chunk 1 ---
5
META PLATFORMS, INC.
CONDENSED CONSOLIDATED BALANCE SHEETS
(In millions)
(Unaudited)
March 31, 2024 December 31, 2023
Assets
Current assets:
Cash and cash equivalents $ 32,307 $ 41,862
Marketable securities 25,813 23,541
Accounts receivable, net 13,430 16,169
Prepaid expenses and other current assets 3,780 3,793
Total current assets 75,330 85,365
Non-marketable equity securities 6,218 6,141
Property and equipment, net 98,908 96,587
Operating lease right-of-use assets 13,555 13,294

--- Chunk 2 ---
Property and equipment, net 98,908 96,587
Operating lease right-of-use assets 13,555 13,294
Goodwill 20,654 20,654
Other assets 8,179 7,582
Total assets $ 222,844 $ 229,623
Liabilities and stockholders' equity
Current liabilities:
Accounts payable $ 3,785 $ 4,849
Operating lease liabilities, current 1,676 1,623
Accrued expenses and other current liabilities 22,640 25,488
Total current liabilities 28,101 31,960
Operating lease liabilities, non-curr