In [None]:
!pip install fitz PyMuPDF pinecone-client langchain openai


Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pinecone-client
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdflib>=5.0.0 (from nipype->fit

In [7]:
# ⬇️ Install required packages (quiet mode): Gemini SDK, PyMuPDF, Pinecone client, LangChain
!pip install -q google-generativeai PyMuPDF pinecone-client langchain

# ⬇️ Import libraries
import os
import time
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
import google.generativeai as genai
from google.colab import files
from pinecone import Pinecone, ServerlessSpec

# ⬇️ Load API keys without embedding them in the notebook
# Hard-coded keys leak to anyone who can read the notebook or its history, so each key
# is read from the environment, falling back to a hidden interactive prompt.
# NOTE(review): any key that was previously stored in this cell should be revoked/rotated.
import getpass

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass.getpass("Gemini API key: ")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
INDEX_NAME = "pdf-context-index"

# ⬇️ Authenticate the Gemini SDK
genai.configure(api_key=GEMINI_API_KEY)

# ⬇️ Connect to Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Provision the index only when it is not already listed for this account.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=768,  # Gemini embeddings
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the new index reports that it is ready.
    while True:
        if pc.describe_index(INDEX_NAME).status['ready']:
            break
        time.sleep(1)

# Handle used by the helper functions for upserts and queries.
index = pc.Index(INDEX_NAME)

# ⬇️ Helper functions
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF to open with PyMuPDF (``fitz.open``).

    Returns:
        str: Each page's ``get_text()`` result joined with no separator.
    """
    # The context manager closes the document even if a page fails to extract,
    # so the underlying file handle is not leaked.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)

def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split text into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Value passed to the splitter as ``chunk_size`` (default 500).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap`` (default 50).

    Returns:
        The list produced by the splitter's ``split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

def get_embedding(text, task_type="retrieval_document"):
    """Embed text with Gemini's ``models/embedding-001`` model.

    Args:
        text: The content to embed.
        task_type: Task type forwarded to ``genai.embed_content``. Defaults to
            ``"retrieval_document"`` (the previous fixed value); pass
            ``"retrieval_query"`` when embedding a search query.

    Returns:
        The value stored under the ``'embedding'`` key of the API response.
    """
    res = genai.embed_content(
        model="models/embedding-001",
        content=text,
        task_type=task_type,
    )
    return res['embedding']

def index_pdf(file_path, batch_size=100):
    """Extract, chunk, embed and upsert one PDF into the Pinecone index.

    Each chunk is stored under the id ``"<file_name>_<chunk number>"`` with the
    chunk text and the file name as metadata.

    Args:
        file_path: Path of the PDF to index.
        batch_size: Maximum number of vectors sent per ``index.upsert`` call
            (default 100). Pinecone limits the size of a single upsert request,
            so a large PDF must not be sent in one call.

    Returns:
        None. Progress is reported with ``print``.
    """
    file_name = os.path.basename(file_path)
    chunks = chunk_text(extract_text_from_pdf(file_path))

    # A PDF without extractable text yields no chunks; upserting an empty list
    # would be rejected, so report it and stop.
    if not chunks:
        print(f"⚠️ No extractable text in {file_name}; nothing indexed.")
        return

    vectors = [
        (f"{file_name}_{i}", get_embedding(chunk), {"text": chunk, "file_name": file_name})
        for i, chunk in enumerate(chunks)
    ]

    # Send the vectors in bounded batches rather than one unbounded request.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])
    print(f"✅ Indexed: {file_name}")

def search_pdf_by_context(query, top_k=5):
    """Print the PDFs whose indexed chunks best match a free-text query.

    Args:
        query: Text describing what the user remembers about the PDF.
        top_k: Number of nearest chunks requested from Pinecone (default 5).
            Several chunks may belong to the same file, so fewer than ``top_k``
            files can be printed.

    Returns:
        None. One line per distinct file is printed, with a snippet of up to
        200 characters taken from that file's best-matching chunk.
    """
    if not query or not query.strip():
        print("❌ Empty query; nothing to search for.")
        return

    # get_embedding() defaults to the document task type, so the query is
    # embedded here directly with the matching "retrieval_query" task type.
    query_emb = genai.embed_content(
        model="models/embedding-001",
        content=query,
        task_type="retrieval_query",
    )['embedding']
    result = index.query(vector=query_emb, top_k=top_k, include_metadata=True)

    print("\n🔍 Top Matching PDFs:")
    seen = set()
    for match in result['matches']:
        file = match['metadata']['file_name']
        # Only the first (best-ranked) chunk of each file is reported.
        if file in seen:
            continue
        seen.add(file)
        snippet = match['metadata']['text'][:200].replace("\n", " ") + "..."
        print(f"📄 {file} — Snippet: {snippet}")

    if not seen:
        print("No matching PDFs found.")

# ⬇️ Collect PDFs from the user
# Every 'upload' command opens the Colab file picker and merges its result into
# uploaded_files; 'exit' ends the loop; anything else re-prompts.
uploaded_files = {}
print("📤 Upload your PDFs one by one. Type 'exit' when done.\n")
while True:
    action = input("Type 'upload' to upload a PDF or 'exit' to finish uploading: ").strip().lower()
    if action == 'upload':
        uploaded = files.upload()
        uploaded_files.update(uploaded)
        continue
    if action != 'exit':
        print("❌ Invalid input. Please type 'upload' or 'exit'.")
        continue
    break

# ⬇️ Index each uploaded PDF (the dict keys are passed to index_pdf as file paths)
for path in uploaded_files:
    index_pdf(path)

# ⬇️ Ask what the user remembers and print the best-matching PDFs
query = input("\n📝 Enter the context of the PDF you remember: ")
search_pdf_by_context(query)


📤 Upload your PDFs one by one. Type 'exit' when done.

Type 'upload' to upload a PDF or 'exit' to finish uploading: upload


Saving PDF1.pdf to PDF1.pdf
Saving PDF2.pdf to PDF2.pdf
Saving PDF3.pdf to PDF3.pdf
Type 'upload' to upload a PDF or 'exit' to finish uploading: exit
✅ Indexed: PDF1.pdf
✅ Indexed: PDF2.pdf
✅ Indexed: PDF3.pdf

📝 Enter the context of the PDF you remember: amrita

🔍 Top Matching PDFs:
📄 PDF1.pdf — Snippet: serene and eco-conscious campuses spread across Amritapuri, Bengaluru, Coimbatore, Chennai, Kochi, and Mysuru, the institution provides a nurturing environment for students to grow intellectually and ...
📄 PDF3.pdf — Snippet: ֺֺ PDF 3: The World of Artificial Intelligence and Its Transformative Power Artificial Intelligence, commonly known as AI, is one of the most revolutionary technologies of our time. It refers to the a...


In [2]:
# Uninstall the separately published `fitz` package, then upgrade PyMuPDF,
# which is the package this notebook imports under the name `fitz`.
!pip uninstall -y fitz
!pip install --upgrade pymupdf


[0mCollecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [3]:
# Swap the `pinecone-client` distribution for the `pinecone` distribution.
!pip uninstall -y pinecone-client
!pip install -U pinecone


[0mCollecting pinecone
  Downloading pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone-6.0.2-py3-none-any.whl (421 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.9/421.9 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone
Successfully installed pinecone-6.0.2 pinecone-plugin-interface-0.0.7


In [None]:
# Swap the `pinecone-client` distribution for the `pinecone` distribution.
# NOTE(review): these commands are identical to the In [3] cell above; this
# unexecuted cell looks like a leftover and is a candidate for removal.
!pip uninstall -y pinecone-client
!pip install -U pinecone




In [4]:
# 📦 Install necessary packages
# Removes `pinecone-client` first, then installs/upgrades pinecone, PyMuPDF and the Gemini SDK.
!pip uninstall -y pinecone-client
!pip install -U pinecone pymupdf google-generativeai


