# 10 - Ingest: load CSV → embeddings → upsert vectors to Pinecone


In [1]:
# Load CSV and inspect
import pandas as pd, os
from dotenv import load_dotenv
load_dotenv()
CSV_PATH = os.getenv("CSV_PATH","./data/niva_dataset1.csv")
df = pd.read_csv(CSV_PATH)
print("Rows:", len(df))
display(df.head(5))


Rows: 4920


Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [2]:
# Choose a text column (adjust if your column name differs)
text_col = df.columns[0]
texts = df[text_col].astype(str).tolist()
print("Using column:", text_col)


Using column: itching


In [3]:
# Create embeddings (small test set first)
from sentence_transformers import SentenceTransformer
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
N = min(len(texts), 1000)   # start with up to 1k rows
subset_texts = texts[:N]
embs = embed_model.encode(subset_texts, show_progress_bar=True)
print("Encoded", len(embs), "vectors. Dim:", len(embs[0]))


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 32/32 [00:01<00:00, 26.02it/s]

Encoded 1000 vectors. Dim: 384





In [4]:
# Initialize Pinecone and create index if missing (new SDK)
from pinecone import Pinecone, ServerlessSpec
import os
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
INDEX_NAME = os.getenv("PINECONE_INDEX","medical-knowledge")
dim = len(embs[0])
if INDEX_NAME not in [i["name"] for i in pc.list_indexes()]:
    print("Creating index", INDEX_NAME, "dim", dim)
    pc.create_index(name=INDEX_NAME, dimension=dim, metric="cosine",
                    spec=ServerlessSpec(cloud="aws", region=os.getenv("PINECONE_ENV")))
index = pc.Index(INDEX_NAME)
print("Index ready:", INDEX_NAME)



Index ready: medical-knowledge


In [5]:
# Check embedding dim and index name
print("Embedding dim:", len(embs[0]))      # from your previous embedding cell
import os
print("Pinecone index name:", os.getenv("PINECONE_INDEX"))


Embedding dim: 384
Pinecone index name: medical-knowledge


In [6]:
# Upsert in batches
batch_size = 128
def upsert_batch(start):
    vecs = []
    for i in range(start, min(start+batch_size, len(subset_texts))):
        vecs.append((f"id-{i}", embs[i].tolist(), {"text": subset_texts[i]}))
    index.upsert(vectors=vecs)

for i in range(0, len(subset_texts), batch_size):
    upsert_batch(i)
    print("Upserted", i, "to", min(i+batch_size, len(subset_texts)))
print("Upsert finished for", len(subset_texts), "items")



Upserted 0 to 128
Upserted 128 to 256
Upserted 256 to 384
Upserted 384 to 512
Upserted 512 to 640
Upserted 640 to 768
Upserted 768 to 896
Upserted 896 to 1000
Upsert finished for 1000 items


In [7]:
# Quick verify: query the index for a sample phrase
q = "fever and cough"
qvec = embed_model.encode(q).tolist()
res = index.query(vector=qvec, top_k=3, include_metadata=True)
for m in res.get("matches", []):
    print(m["id"], round(m["score"],3), m["metadata"]["text"][:200])


id-993 0.106 0
id-997 0.106 0
id-994 0.106 0
