In [None]:
# %%
# Install packages if needed (run once)
# The pip module is invoked through `sys.executable` (the interpreter running
# this kernel) rather than through a bare `pip` command.
import sys
!{sys.executable} -m pip install -q openai pinecone python-dotenv pandas tqdm tenacity

In [7]:
# %%
import os
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
from openai import OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt
import hashlib

In [8]:
# %%
# Read credentials from the environment (populated from .env by load_dotenv)
load_dotenv()
OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENV = (
    os.getenv(variable)
    for variable in ("OPENAI_API_KEY", "PINECONE_API_KEY", "PINECONE_ENV")
)

# Fail fast, in the same order as the variables above, if anything is missing or empty.
assert OPENAI_API_KEY, "❌ OPENAI_API_KEY not found in .env"
assert PINECONE_API_KEY, "❌ PINECONE_API_KEY not found in .env"
assert PINECONE_ENV, "❌ PINECONE_ENV not found in .env"


In [9]:
# %%
# Set up the OpenAI and Pinecone clients
client = OpenAI(api_key=OPENAI_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "funding-search"
dimension = 1536  # OpenAI embedding size for text-embedding-3-small

# Only create the index when Pinecone does not already list one with this name
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    # PINECONE_ENV is passed through as the serverless region
    serverless_spec = ServerlessSpec(cloud="aws", region=PINECONE_ENV)
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used by later cells for upserts
index = pc.Index(index_name)


In [10]:
# %%
# Load funding dataset
# The CSV location can be overridden with the FUNDING_DATA_CSV environment
# variable (which may also be set in .env), so the notebook is not tied to one
# machine's absolute path. Without the override, the original path is used.
DATA_CSV_PATH = os.getenv(
    "FUNDING_DATA_CSV",
    "/Users/kiranmulawad/AI-Funding/2_preprocessing/data/merged_funding_data.csv",
)
df = pd.read_csv(DATA_CSV_PATH)


In [11]:
# %%
# Build semantic_corpus from multiple fields
def build_semantic_corpus(row):
    """Join a row's descriptive fields into a single ". "-separated string.

    Args:
        row: Any object with a dict-style ``get(key, default)`` method, such as
            a pandas Series (one DataFrame row) or a plain dict.

    Returns:
        The string forms of name, description, domain, eligibility, amount,
        location and procedure (in that order), joined with ". ". Fields that
        are missing, null, or blank after stripping whitespace are left out;
        if nothing remains, the result is an empty string.
    """
    field_names = (
        "name",
        "description",
        "domain",
        "eligibility",
        "amount",
        "location",
        "procedure",
    )

    parts = []
    for field_name in field_names:
        value = row.get(field_name, "")
        if pd.isna(value):
            continue
        rendered = str(value)
        # The blank check uses the stripped text, but the original
        # (unstripped) text is what goes into the corpus.
        if rendered.strip():
            parts.append(rendered)
    return ". ".join(parts)

# Build one combined text per row; this column is the input that later gets embedded.
df["semantic_corpus"] = df.apply(build_semantic_corpus, axis=1)


In [12]:
# %%
# Generate stable hash-based IDs from URL
def hash_id(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8.

    Args:
        text: The string to hash.

    Returns:
        A 32-character lowercase hex string; identical input gives an
        identical result.
    """
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

# Rows without a URL used to share the single placeholder "no-url", so they all
# hashed to the same ID and overwrote one another when upserted. Fall back to
# the row's own semantic_corpus text instead, so such rows get an ID derived
# from their content. Rows that do have a URL keep exactly the same ID as before.
# NOTE(review): rows that share a URL still share an ID — confirm that
# de-duplicating by URL is intended.
id_source = df["url"].fillna("no-url:" + df["semantic_corpus"].astype(str))
df["id"] = id_source.apply(hash_id)


In [13]:
# %%
# Embed text with retry for rate limits
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_embedding(text):
    """Return the ``text-embedding-3-small`` embedding for a single text.

    The call is wrapped by the ``retry`` decorator above: it stops after
    6 attempts and waits a randomized exponential interval (bounded by the
    1 and 60 passed as min/max) between attempts.

    Args:
        text: The string to embed.

    Returns:
        The embedding of the first (and only) item in the API response.
    """
    result = client.embeddings.create(
        model="text-embedding-3-small",
        input=[text],
    )
    # Exactly one input was sent, so the result of interest is the first item.
    first_item = result.data[0]
    return first_item.embedding


In [14]:
# %%
# Upsert embeddings to Pinecone in batches
batch_size = 32
namespace = "openai-v3"  # Optional versioning

# Columns stored next to each vector as metadata.
metadata_columns = [
    "name", "description", "domain", "eligibility", "location",
    "amount", "procedure", "contact", "deadline", "url"
]

# The OpenAI embeddings endpoint rejects an empty input string, so one row with
# a blank semantic_corpus would make get_embedding fail (after all its retries)
# and abort the whole upload. Leave such rows out and report how many were skipped.
has_text = df["semantic_corpus"].fillna("").astype(str).str.strip() != ""
rows_to_embed = df[has_text]
skipped_rows = len(df) - len(rows_to_embed)
if skipped_rows:
    print(f"⚠️ Skipping {skipped_rows} row(s) with an empty semantic_corpus")

for i in tqdm(range(0, len(rows_to_embed), batch_size), desc="Uploading embeddings to Pinecone"):
    batch = rows_to_embed.iloc[i:i+batch_size]
    texts = batch["semantic_corpus"].tolist()
    ids = batch["id"].tolist()
    # One embedding request per text; get_embedding handles retries.
    embeddings = [get_embedding(text) for text in texts]

    # Missing metadata values are replaced with "" before conversion to dicts.
    metadata = batch[metadata_columns].fillna("").to_dict(orient="records")

    # Each vector is an (id, values, metadata) tuple.
    vectors = list(zip(ids, embeddings, metadata))

    index.upsert(vectors=vectors, namespace=namespace)

print("✅ All OpenAI embeddings uploaded to Pinecone under namespace:", namespace)


Uploading embeddings to Pinecone: 100%|██████████| 3/3 [00:34<00:00, 11.44s/it]

✅ All OpenAI embeddings uploaded to Pinecone under namespace: openai-v3



