In [5]:
import os
import uuid

import pandas as pd
import pinecone
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

In [None]:
# Create the Pinecone client.
# The API key is read from the environment so the secret is never stored in
# (or committed with) the notebook itself.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this notebook"
    )
pc = Pinecone(api_key=api_key)

In [7]:
# Name of the Pinecone index this notebook creates (if missing) and then opens.
index_name = "learning-buddy"

In [8]:
# Create the serverless index only when no index with this name is listed yet,
# so re-running the cell does not attempt to create it a second time.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-west-2")
    pc.create_index(
        name=index_name,
        dimension=384,  # Matches all-MiniLM-L6-v2
        metric="cosine",
        spec=serverless_spec,
    )

In [9]:
# Handle to the index; the upsert and query cells below go through this object.
index = pc.Index(index_name)

In [10]:
# Embedding model used for both the CSV rows and the query text.
# The index is created with dimension=384 to match this model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [11]:
# Load the source CSV into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
csv_file = "study_courses.csv"
df = pd.read_csv(csv_file)

In [12]:
# Column whose text gets embedded; stop right away if the CSV does not have it.
text_column = "description"
has_text_column = text_column in df.columns
if not has_text_column:
    raise ValueError(f"Column '{text_column}' not found in CSV")

In [13]:
# Build the (id, embedding, metadata) tuples that are later upserted.
# The rows are walked once to collect IDs, texts and metadata; the texts are
# then embedded with a single batched encode() call rather than one model call
# per row.
row_ids = []
row_texts = []
row_metadata = []
for _, row in df.iterrows():
    # Convert to string to handle non-string data
    text = str(row[text_column])

    # Use the row's own ID when it has one. A missing "id" column *or* an empty
    # cell (NaN) falls back to a fresh UUID; stringifying NaN would give every
    # such row the same ID "nan", and they would overwrite each other on upsert.
    raw_id = row.get("id")
    if raw_id is None or pd.isna(raw_id):
        item_id = str(uuid.uuid4())
    else:
        item_id = str(raw_id)

    # Metadata: every column except the text column, stringified, plus the
    # original text under the "text" key.
    metadata = {col: str(row[col]) for col in df.columns if col != text_column}
    metadata["text"] = text

    row_ids.append(item_id)
    row_texts.append(text)
    row_metadata.append(metadata)

# encode() on a list returns one vector per text; tolist() turns the numpy
# array into plain nested lists. Skip the model call when there are no rows.
embeddings = model.encode(row_texts).tolist() if row_texts else []

items_to_upsert = list(zip(row_ids, embeddings, row_metadata))

In [14]:
# Upsert the prepared vectors in fixed-size batches, reporting progress.
batch_size = 100
# Ceiling division, so an item count that is an exact multiple of batch_size
# is not reported as having one extra batch.
total_batches = -(-len(items_to_upsert) // batch_size)
batch_starts = range(0, len(items_to_upsert), batch_size)
for batch_number, start in enumerate(batch_starts, start=1):
    batch = items_to_upsert[start:start + batch_size]
    index.upsert(vectors=batch)
    print(f"Upserted batch {batch_number} of {total_batches}")

Upserted batch 1 of 2
Upserted batch 2 of 2


In [15]:
# Embed the question with the same model and ask the index for the top two
# matches, including the metadata stored with each vector.
query_text = "Is there a course related to Stochastic Learning"
query_embedding = model.encode(query_text).tolist()
results = index.query(
    vector=query_embedding,
    top_k=2,
    include_metadata=True,
)


In [16]:
# Print one line per match: its ID, its score, and the text kept in its metadata.
print("\nQuery Results:")
matches = results["matches"]
for match in matches:
    print(f"ID: {match['id']}, Score: {match['score']}, Text: {match['metadata']['text']}")


Query Results:
ID: 3, Score: 0.439509213, Text: Machine Learning teaches models like regression, decision trees, and neural networks
ID: 64, Score: 0.417834163, Text: Courses in Semantics include labs on prior distributions
