In [1]:
# 1_upload_embeddings.ipynb

# %%
# Install necessary packages (run only if not already installed)
!pip install -q sentence-transformers pinecone-client python-dotenv pandas tqdm


In [2]:
# %%
# Import required libraries
import hashlib
import os

import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


In [3]:

# %%
# Read the Pinecone settings from the environment (load_dotenv() pulls in the .env file first)
load_dotenv()
PINECONE_API_KEY, PINECONE_ENV = (
    os.getenv(var_name) for var_name in ("PINECONE_API_KEY", "PINECONE_ENV")
)
# Stop right here when either setting is missing or empty
assert all((PINECONE_API_KEY, PINECONE_ENV)), "Pinecone API key or environment not found in .env!"


In [5]:
# %%
# Load the merged funding data CSV.
# The location can be overridden with the FUNDING_DATA_CSV environment variable so the
# notebook is not tied to a single machine; when it is unset, the original absolute
# path is used as before.
DATA_CSV_PATH = os.getenv(
    "FUNDING_DATA_CSV",
    "/Users/kiranmulawad/AI-Funding/2_preprocessing/data/merged_funding_data.csv",
)
df = pd.read_csv(DATA_CSV_PATH)


In [6]:
# %%
# Build a semantic_corpus column by combining relevant fields for semantic search
def build_corpus(row):
    """Combine a record's descriptive fields into one text for semantic search.

    ``row`` is any object exposing ``.get(key, default)`` (a DataFrame row or a dict).
    Fields are read in a fixed order. Values that are missing according to pandas,
    or blank once stripped, are left out. The remaining values are converted with
    ``str`` (not stripped) and joined with ". ".
    """
    column_order = ("name", "description", "domain", "eligibility", "amount", "location", "procedure")
    parts = []
    for column in column_order:
        value = row.get(column, "")
        if pd.isna(value):
            continue
        text = str(value)
        if not text.strip():
            continue
        parts.append(text)
    return ". ".join(parts)

# Compute the combined text row by row, then store it as the semantic_corpus column
corpus_by_row = df.apply(build_corpus, axis="columns")
df["semantic_corpus"] = corpus_by_row


In [7]:
# %%
# Load the sentence-embedding model (MiniLM: fast and good for semantic search)
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")


In [8]:
# %%
# Pinecone settings: the target index name and the vector size it must be created with
index_name = "funding-search"
dimension = 384  # Dimension for MiniLM-L6-v2 embeddings
# Client authenticated with the API key loaded from the environment
pc = Pinecone(api_key=PINECONE_API_KEY)


In [9]:
# Make sure the target index exists, creating it only when its name is not listed yet
existing_index_names = pc.list_indexes().names()
index_missing = index_name not in existing_index_names
if index_missing:
    serverless_spec = ServerlessSpec(cloud="aws", region=PINECONE_ENV)
    pc.create_index(name=index_name, dimension=dimension, metric="cosine", spec=serverless_spec)
# Handle to the index, used by the upload step
index = pc.Index(index_name)


In [10]:
# %%
# Batch upload embeddings and metadata to Pinecone
batch_size = 100  # Adjust batch size for your memory/connection
# Metadata fields stored next to each vector for later retrieval
metadata_columns = ["name", "description", "eligibility", "amount", "domain", "location", "procedure", "url", "source"]

for i in tqdm(range(0, len(df), batch_size), desc="Uploading to Pinecone"):
    batch = df.iloc[i:i+batch_size]
    # Encode the semantic_corpus column to get embeddings
    embeddings = model.encode(batch["semantic_corpus"].tolist(), show_progress_bar=False).tolist()
    # Derive the ID from a SHA-256 digest of the URL. The built-in hash() is salted per
    # interpreter process for strings, so it would give the same URL a different ID on
    # every kernel restart and re-running the upload would create duplicates instead of
    # overwriting. A content digest stays identical across runs.
    # NOTE: vectors stored by earlier runs under hash()-based IDs are not touched here.
    ids = [
        f"id-{hashlib.sha256(str(url).encode('utf-8')).hexdigest()}"
        for url in batch["url"]
    ]
    # Missing values are replaced with "" before the rows are turned into metadata dicts
    metadata = batch[metadata_columns].fillna("").to_dict(orient="records")
    # Prepare (id, vector, metadata) tuples for upsert
    to_upsert = list(zip(ids, embeddings, metadata))
    # Upload to Pinecone
    index.upsert(vectors=to_upsert)

print("✅ All embeddings uploaded to Pinecone.")


Uploading to Pinecone: 100%|██████████| 1/1 [00:02<00:00,  2.81s/it]

✅ All embeddings uploaded to Pinecone.



