# 📄 PDF Upload + Chunking + Embedding to Pinecone
Enhanced with best practices:
- Stable unique IDs
- Metadata in Pinecone
- Namespace support
- Optional retrieval test
- Avoid reuploading if not needed

In [None]:
# import sys
# !{sys.executable} -m pip install tiktoken pymupdf openai pinecone python-dotenv tqdm

In [2]:
import os
import fitz  # PyMuPDF
import hashlib
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
from typing import List

In [3]:
# Load variables from a local .env file (python-dotenv), then read the
# credentials this notebook needs from the process environment.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")

# Fail fast with an explicit exception rather than `assert`: assertions are
# stripped when Python runs with -O, and the message now names exactly which
# variables are unset or empty.
# NOTE(review): PINECONE_ENV is required here but is not used by anything
# visible in this notebook -- confirm whether it is still needed.
_required_env = {
    "OPENAI_API_KEY": OPENAI_API_KEY,
    "PINECONE_API_KEY": PINECONE_API_KEY,
    "PINECONE_ENV": PINECONE_ENV,
}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise RuntimeError(
        f"❌ Missing environment variables: {', '.join(_missing_env)}"
    )

# API clients shared by the cells below.
client = OpenAI(api_key=OPENAI_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY)

# Handle to the Pinecone index that stores the embedded chunks.
index_name = "funding-search"
index = pc.Index(index_name)

In [4]:
# 📥 Path of the PDF to process -- edit this line to point at your own file.
# NOTE(review): this is an absolute path on one developer's machine; it must
# be changed before the notebook can run anywhere else.
pdf_path = "/Users/kiranmulawad/AI-Funding/openai_model/sample_user_profile.pdf"

In [5]:
# 📄 Extract text from PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of each page, in document order, separated by a single
        newline character.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction raises; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

# Run the extraction and show the first 500 characters as a sanity check.
text = extract_text_from_pdf(pdf_path)
print("✅ Sample extracted text:", text[:500], sep="\n")

✅ Sample extracted text:
Company Name: RoboAI Solutions
Industry: Artificial Intelligence, Robotics
Location: Rhineland-Palatinate, Germany
Company Description:
RoboAI Solutions is a startup focused on the intersection of artificial intelligence and robotics. We
are currently in the early research and prototyping phase for developing intelligent control systems
for industrial robots.
Goals:
- Advance AI-based robotic systems for automation
- Collaborate with academic institutions
- Apply for regional and national fundin


In [6]:
# ✂️ Chunk text using tiktoken
import tiktoken

def chunk_text(text: str, max_tokens: int = 300) -> List[str]:
    """Split text into sentence-aligned chunks of at most ``max_tokens`` tokens.

    The text is split on the literal delimiter ". " and the pieces are packed
    greedily, in order, into chunks whose ``cl100k_base`` token count stays
    within ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk, measured with the ``cl100k_base``
            encoding.

    Returns:
        The chunks in document order, each stripped of surrounding
        whitespace. Empty or whitespace-only chunks are never returned, so
        empty input yields an empty list. A single sentence that exceeds
        ``max_tokens`` on its own is kept whole as one oversized chunk
        rather than being cut mid-sentence.
    """
    enc = tiktoken.get_encoding("cl100k_base")

    # Restore the ". " delimiter only on pieces that were actually followed
    # by it. The final piece keeps its original ending, so text that already
    # ends with "." no longer gains a second period.
    pieces = text.split(". ")
    sentences = [piece + ". " for piece in pieces[:-1]]
    sentences.append(pieces[-1])

    chunks: List[str] = []
    current = ""
    for sentence in sentences:
        tentative = current + sentence
        if len(enc.encode(tentative)) <= max_tokens:
            current = tentative
            continue
        # Budget exceeded: flush what has accumulated so far. The guard
        # prevents emitting an empty chunk when the very first sentence is
        # already over budget and nothing has accumulated yet.
        if current.strip():
            chunks.append(current.strip())
        current = sentence

    if current.strip():
        chunks.append(current.strip())
    return chunks

# Chunk the extracted text with the default token budget and report how many
# chunks resulted.
chunks = chunk_text(text)
print(f"✅ Total chunks created: {len(chunks)}")

✅ Total chunks created: 1


In [9]:
# Preview the first 150 characters of each chunk next to its index.
for i, chunk in enumerate(chunks):
    print(f"Chunk {i}: {chunk[:150]}")

Chunk 0: Company Name: RoboAI Solutions
Industry: Artificial Intelligence, Robotics
Location: Rhineland-Palatinate, Germany
Company Description:
RoboAI Solutio


In [None]:
# 🚀 Upload to Pinecone (run only if this PDF has not been uploaded yet)
# ⚠️ Keep this cell commented out if you have already uploaded this PDF!

# namespace = "pdf-upload"
# prefix = hashlib.md5(pdf_path.encode()).hexdigest()[:8]  # Stable prefix for IDs
# print(f"🔐 PDF upload prefix (ID base): {prefix}")

# for i, chunk in enumerate(tqdm(chunks, desc="Embedding + Uploading chunks")):
#     embedding = client.embeddings.create(
#         input=[chunk], model="text-embedding-3-small"
#     ).data[0].embedding

#     vector_id = f"{prefix}-chunk-{i}"
#     metadata = {
#         "source": os.path.basename(pdf_path),
#         "chunk_index": i,
#         "text": chunk
#     }
#     index.upsert([(vector_id, embedding, metadata)], namespace=namespace)

# print("✅ All chunks embedded and uploaded to Pinecone.")

In [7]:
# Pinecone namespace used for the PDF chunks; it matches the value in the
# (commented-out) upload cell.
namespace = "pdf-upload"

In [8]:
# Check how many vectors exist in the PDF upload namespace.
# describe_index_stats() reports statistics for the whole index (every
# namespace), so the count for `namespace` is read from the response rather
# than passing a hard-coded `namespace=` argument, which did not narrow the
# result and duplicated the `namespace` variable.
stats = index.describe_index_stats()
print(stats)

# A namespace with no vectors is absent from the response, so default to 0.
namespace_stats = stats.namespaces.get(namespace)
vector_count = namespace_stats.vector_count if namespace_stats else 0
print(f"Vectors in namespace '{namespace}': {vector_count}")

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'openai-v3': {'vector_count': 79},
                'pdf-upload': {'vector_count': 1}},
 'total_vector_count': 80,
 'vector_type': 'dense'}
