In [None]:
# import sys
# !{sys.executable} -m pip install -q openai pinecone python-dotenv pandas tqdm tenacity tiktoken


In [5]:
import hashlib
import os

import pandas as pd
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone
from pinecone import PodSpec
from tenacity import retry, wait_random_exponential, stop_after_attempt
from tqdm import tqdm

In [6]:
# Load variables from a .env file into the process environment so later cells
# can read the API keys from it (pass an explicit path to load_dotenv if your
# .env is not in the current working directory).
load_dotenv()

True

In [7]:
# Read the service credentials from the environment and fail fast if any is absent
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")

for _var_name, _var_value in (
    ("OPENAI_API_KEY", OPENAI_API_KEY),
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("PINECONE_ENV", PINECONE_ENV),
):
    assert _var_value, f"Missing {_var_name}"

In [None]:
# Create the Pinecone client first; every index operation below goes through it
pc = Pinecone(api_key=PINECONE_API_KEY)

# Index parameters
index_name = "funding-search"
dimension = 1536  # For text-embedding-3-small

# Names of the indexes that already exist in this project
existing_indexes = pc.list_indexes().names()

if index_name not in existing_indexes:
    # create_index on the Pinecone client takes no `environment` keyword; the
    # deployment target is passed as a spec object. PodSpec keeps the
    # pod-environment meaning that PINECONE_ENV had in the original call.
    # NOTE(review): serverless projects need
    # ServerlessSpec(cloud=..., region=...) instead - confirm which applies.
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=PodSpec(environment=PINECONE_ENV),
    )

In [9]:
# Get a handle to the index; later cells use `index` to upsert vectors and read stats
index = pc.Index(index_name)

In [10]:
# Load the merged funding dataset.
# The location can be overridden with the FUNDING_DATA_PATH environment variable
# (e.g. in .env) so the notebook is not tied to one machine; the original
# absolute path remains the default.
DATA_PATH = os.getenv(
    "FUNDING_DATA_PATH",
    "/Users/kiranmulawad/AI-Funding/2_preprocessing/data/merged_funding_data.csv",
)
df = pd.read_csv(DATA_PATH)


In [11]:
# Build semantic_corpus field combining info for embedding
def build_semantic_corpus(row):
    """Join a record's populated funding fields into one text string for embedding.

    Args:
        row: Mapping-like record supporting ``.get`` (a DataFrame row or a dict).

    Returns:
        str: ``"field = value"`` fragments joined by ``". "``, with leading and
        trailing dots/spaces stripped. Empty string when no field is populated.
    """
    fields = (
        "name", "description", "domain", "eligibility", "amount",
        "location", "procedure", "contact", "deadline",
    )
    parts = []
    for field in fields:
        val = row.get(field, "")
        # Missing CSV cells arrive as NaN, which is truthy; without this check
        # the corpus would contain fragments such as "description = nan".
        if val is None or (pd.api.types.is_scalar(val) and pd.isna(val)):
            continue
        if val:
            parts.append(f"{field} = {val}")
    return ". ".join(parts).strip(". ")

# Apply the builder to every row (row-wise) and store the text in a new column
df["semantic_corpus"] = df.apply(build_semantic_corpus, axis="columns")

In [17]:
# Preview the first five combined text entries
corpus_preview = df[["semantic_corpus"]].head()
print(corpus_preview)

                                     semantic_corpus
0  name = FORTIS 1st Open Call. description = FOR...
1  name = MASTER 2nd Open Call. description = MAS...
2  name = PEDVolution Open Call. description = PE...
3  name = SMURF 2nd Open Call. description = The ...
4  name = GUARDIANS 1st Open Call. description = ...


In [12]:
# Count tokens per row so inputs above the embedding limit (~8192 tokens) are flagged
max_token_limit = 8192
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")


def _count_tokens(txt):
    """Return the number of tokens the tokenizer produces for ``txt``."""
    return len(tokenizer.encode(txt))


df["token_count"] = df["semantic_corpus"].apply(_count_tokens)
too_long = df.loc[df["token_count"] > max_token_limit]
if len(too_long) > 0:
    print(f"⚠️ WARNING: {len(too_long)} rows exceed {max_token_limit} tokens and may cause API errors.")

In [13]:
# Stable vector IDs: the same input text always hashes to the same ID
def hash_id(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

# Hash each row's URL into its vector ID. Rows without a URL fall back to their
# semantic_corpus text instead of a shared "no-url" placeholder, which would give
# every such row the same ID and make them overwrite one another on upsert.
id_source = df["url"].fillna(df["semantic_corpus"])
df["id"] = id_source.apply(hash_id)

# Duplicate IDs collapse into a single vector in Pinecone, so surface them here.
duplicate_ids = int(df["id"].duplicated().sum())
if duplicate_ids:
    print(f"⚠️ WARNING: {duplicate_ids} rows share an ID with an earlier row and will overwrite it on upsert.")


In [14]:
# OpenAI client shared by all embedding requests
client = OpenAI(api_key=OPENAI_API_KEY)


@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=60))
def get_embedding(text: str):
    """Embed a single text with the ``text-embedding-3-small`` model.

    Failed calls are retried with randomized exponential waits (configured
    with min=1, max=60), stopping after six attempts.

    Args:
        text: The text to embed.

    Returns:
        The embedding of the first (and only) item in the API response.
    """
    result = client.embeddings.create(
        model="text-embedding-3-small",
        input=[text],
    )
    first_item = result.data[0]
    return first_item.embedding


In [15]:
# Embed every row and push the vectors to Pinecone, one batch at a time
batch_size = 32
namespace = "openai-v3"  # Target namespace inside the Pinecone index

# Columns stored alongside each vector as metadata
metadata_fields = [
    "id", "name", "description", "domain", "eligibility", "location",
    "amount", "procedure", "contact", "deadline", "url", "source",
]

batch_starts = range(0, len(df), batch_size)
for start in tqdm(batch_starts, desc="Uploading embeddings to Pinecone"):
    chunk = df.iloc[start:start + batch_size]

    chunk_texts = chunk["semantic_corpus"].fillna("").tolist()
    chunk_ids = chunk["id"].tolist()
    # One embedding request per text
    chunk_embeddings = []
    for doc in chunk_texts:
        chunk_embeddings.append(get_embedding(doc))

    # Missing metadata values are replaced with empty strings before upload
    chunk_metadata = chunk[metadata_fields].fillna("").to_dict(orient="records")

    # Each vector is an (id, values, metadata) tuple
    index.upsert(
        vectors=list(zip(chunk_ids, chunk_embeddings, chunk_metadata)),
        namespace=namespace,
    )

print(f"✅ Uploaded {len(df)} embeddings to Pinecone index '{index_name}' under namespace '{namespace}'")

Uploading embeddings to Pinecone:   0%|          | 0/3 [00:00<?, ?it/s]

Uploading embeddings to Pinecone: 100%|██████████| 3/3 [00:29<00:00,  9.69s/it]

✅ Uploaded 79 embeddings to Pinecone index 'funding-search' under namespace 'openai-v3'





In [16]:
# Fetch and print the index statistics to verify the upload
# (e.g. the vector count per namespace)
stats = index.describe_index_stats()
print(stats)

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'openai-v3': {'vector_count': 79}},
 'total_vector_count': 79,
 'vector_type': 'dense'}
