In [3]:
# %%
# Install required packages (run only once).
# pip is invoked through `sys.executable` so that it runs under the same
# Python interpreter as this kernel; `-q` keeps pip's output quiet.
# NOTE(review): no versions are pinned here — consider pinning for reproducibility.
import sys
!{sys.executable} -m pip install -q sentence-transformers pinecone python-dotenv pandas tqdm

In [12]:
# %%
# Imports
import os
import hashlib
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [None]:
# %%
# Configuration: merge variables from a local .env file into the process
# environment, then read the Pinecone credential from it.
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Stop right here if the key is missing or empty, before any client is built.
assert PINECONE_API_KEY, "❌ PINECONE_API_KEY not found in .env"

In [14]:
# %%
# Load funding dataset
# The CSV location can be overridden with the FUNDING_DATA_CSV environment
# variable, so the notebook is not tied to one machine's directory layout.
# When the variable is unset or empty, the original local path is used.
FUNDING_DATA_CSV = os.getenv("FUNDING_DATA_CSV") or (
    "/Users/kiranmulawad/AI-Funding/2_preprocessing/data/merged_funding_data.csv"
)
df = pd.read_csv(FUNDING_DATA_CSV)

In [15]:
# %%
# Build semantic_corpus with labeled structure
def build_semantic_corpus(row):
    """Render one funding record as a single labeled text string.

    Every field is written as ``"<field> = <value>"`` and the parts are joined
    with ``". "``. Absent keys and missing values (``None``, ``NaN``,
    ``pd.NA``) are rendered as an empty value, so the literal words
    "nan" / "None" never end up in the text.

    Args:
        row: Record exposing ``.get(key, default)`` - for example a
            ``pandas.Series`` produced by ``DataFrame.apply(..., axis=1)``
            or a plain ``dict``.

    Returns:
        str: The labeled text, with leading/trailing ``"."`` and space
        characters stripped.
    """
    fields = (
        "name", "description", "domain", "eligibility", "amount",
        "location", "procedure", "contact", "deadline",
    )

    def _value_text(field):
        value = row.get(field, "")
        # Empty CSV cells arrive as NaN; formatting them directly would put
        # the string "nan" into the corpus. pd.isna is only applied to
        # scalars because it returns an array for list-like input.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        return value

    return ". ".join(
        f"{field} = {_value_text(field)}" for field in fields
    ).strip(". ")

# axis=1 calls build_semantic_corpus once per row; the resulting string is
# stored in a new "semantic_corpus" column.
df["semantic_corpus"] = df.apply(build_semantic_corpus, axis=1)

In [16]:
# %%
# Generate stable hash IDs from URLs
def hash_id(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8.

    The digest serves as a deterministic identifier: the same input string
    always produces the same 32-character hex string.
    """
    hasher = hashlib.md5()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()

# Rows with a URL are identified by the hash of that URL. Rows without one
# fall back to hashing their own semantic_corpus text: filling every gap with
# a single shared placeholder would give all of those rows the same ID, and
# records upserted under the same ID replace one another in the index.
# NOTE(review): rows that share the same URL still share an ID — confirm that
# URLs are unique in the dataset.
id_source = df["url"].fillna(df["semantic_corpus"])
df["id"] = id_source.apply(hash_id)

In [17]:
# %%
# Embedding model (BGE-small), loaded by its model identifier.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [22]:
# Index settings used by the cells that follow.
index_name = "funding-search-bge"
dimension = 384
namespace = "open-source-v1"

# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when no index with this name is listed yet, so that
# re-running the cell does not attempt to create it a second time.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used for all subsequent operations on the index.
index = pc.Index(index_name)

In [23]:
# %%
# Embed the corpus and upsert it to Pinecone, one slice of rows at a time.
batch_size = 32

# Columns attached to every vector as metadata.
metadata_columns = [
    "id", "name", "description", "domain", "eligibility", "location",
    "amount", "procedure", "contact", "deadline", "url"
]

batch_starts = range(0, len(df), batch_size)
for start in tqdm(batch_starts, desc="Uploading embeddings to Pinecone"):
    chunk = df.iloc[start:start + batch_size]

    # Encode this slice's corpus texts into plain Python lists of floats.
    corpus_texts = chunk["semantic_corpus"].fillna("").tolist()
    chunk_embeddings = model.encode(corpus_texts, show_progress_bar=False).tolist()

    # One metadata dict per row, with missing values replaced by "".
    chunk_metadata = chunk[metadata_columns].fillna("").to_dict(orient="records")

    # Each record is an (id, embedding, metadata) tuple.
    records = [
        (vector_id, embedding, meta)
        for vector_id, embedding, meta in zip(
            chunk["id"].tolist(), chunk_embeddings, chunk_metadata
        )
    ]
    index.upsert(vectors=records, namespace=namespace)

print(f"✅ All embeddings uploaded to Pinecone under namespace: {namespace}")

Uploading embeddings to Pinecone: 100%|██████████| 3/3 [00:06<00:00,  2.01s/it]

✅ All embeddings uploaded to Pinecone under namespace: open-source-v1



