In [6]:
!pip install sentence-transformers
!pip uninstall -y pinecone-client
!pip install pinecone


Found existing installation: pinecone-client 6.0.0
Uninstalling pinecone-client-6.0.0:
  Successfully uninstalled pinecone-client-6.0.0
Collecting pinecone
  Downloading pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Downloading pinecone-6.0.2-py3-none-any.whl (421 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.9/421.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pinecone
Successfully installed pinecone-6.0.2


In [2]:
!wget https://s3-geospatial.s3.us-west-2.amazonaws.com/medium_data.csv

--2025-04-11 09:11:44--  https://s3-geospatial.s3.us-west-2.amazonaws.com/medium_data.csv
Resolving s3-geospatial.s3.us-west-2.amazonaws.com (s3-geospatial.s3.us-west-2.amazonaws.com)... 3.5.76.197, 52.92.241.50, 52.92.131.178, ...
Connecting to s3-geospatial.s3.us-west-2.amazonaws.com (s3-geospatial.s3.us-west-2.amazonaws.com)|3.5.76.197|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 599580 (586K) [text/csv]
Saving to: ‘medium_data.csv’


2025-04-11 09:11:45 (1.62 MB/s) - ‘medium_data.csv’ saved [599580/599580]



In [3]:
import pandas as pd

# Load the articles dataset fetched by the download cell.
df = pd.read_csv("medium_data.csv")

# Fill missing values BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which fillna('') has nothing left to replace and
# "nan" ends up inside the text that gets embedded.
df['title'] = df['title'].fillna('').astype(str)
df['subtitle'] = df['subtitle'].fillna('').astype(str)

# Text stored under metadata['title'] is "<title> <subtitle>"; strip() avoids a
# dangling space when either part is empty.
combined_text = (df['title'] + " " + df['subtitle']).str.strip()
df['metadata'] = combined_text.map(lambda text: {'title': text})

# Record ids are the row index rendered as strings.
df['id'] = df.index.map(str)

df[['id', 'metadata']].head()


Unnamed: 0,id,metadata
0,0,{'title': 'Not All Rainbows and Sunshine: The ...
1,1,{'title': 'Ethics in AI: Potential Root Causes...
2,2,"{'title': 'Python Tuple, The Whole Truth and O..."
3,3,{'title': 'Dates and Subqueries in SQL Working...
4,4,{'title': 'Temporal Differences with Python: F...


In [4]:
from google.colab import userdata

# Credentials are read from Colab's secrets store rather than hardcoded here.
# The API key is mandatory, so a missing secret is allowed to raise.
API_KEY = userdata.get('pinecone_key')

# The environment secret is optional. userdata.get raises SecretNotFoundError
# for an absent secret instead of returning a falsy value, so `or` alone would
# never reach the default; `or` is kept to also cover an empty secret value.
DEFAULT_ENV = 'us-east-1-aws'
try:
    ENV = userdata.get('pinecone_env') or DEFAULT_ENV
except userdata.SecretNotFoundError:
    ENV = DEFAULT_ENV


In [15]:
from pinecone import Pinecone, ServerlessSpec
import os

# Serverless placement: the region is ENV with every "-aws" occurrence removed.
cloud = "aws"
region = ENV.replace("-aws", "")

index_name = "semantic-search-fast-v2"
pc = Pinecone(api_key=API_KEY)

# Create the index only when no index with this name is listed yet, so the
# cell can be re-run without attempting a duplicate creation.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud=cloud, region=region)
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=serverless_spec,
    )
    print(f" Index '{index_name}' created with 384 dimensions.")
else:
    print(f"Index '{index_name}' already exists. (Skipping creation)")


 Index 'semantic-search-fast-v2' created with 384 dimensions.


In [16]:
from sentence_transformers import SentenceTransformer

# Embedding model. NOTE(review): the index is created with dimension=384, which
# must equal this model's output size — confirm if the model name changes.
model = SentenceTransformer('all-MiniLM-L6-v2')

# The text to embed is the combined title/subtitle stored in each metadata dict.
text_data = [m['title'] for m in df['metadata']]

# Generate embeddings and convert them to plain nested Python lists.
embeddings = model.encode(text_data).tolist()

# Prepare records for Pinecone. Zipping the columns yields the same
# id / vector / metadata triples as indexing df.iloc[i] per row, without
# materialising a full row Series twice for every record.
records = [
    {'id': record_id, 'values': vector, 'metadata': metadata}
    for record_id, vector, metadata in zip(df['id'], embeddings, df['metadata'])
]

In [17]:

# Reuse `index_name` from the index-creation cell instead of repeating the
# literal, so the index that is created and the index that is written to
# cannot drift apart.
index = pc.Index(index_name)

# Upload in fixed-size slices so each upsert request stays small.
BATCH_SIZE = 100
for start in range(0, len(records), BATCH_SIZE):
    index.upsert(vectors=records[start:start + BATCH_SIZE])

print(f" Uploaded {len(records)} vectors to Pinecone index '{index_name}'.")

 Uploaded 2498 vectors to Pinecone index 'semantic-search-fast-v2'.


In [18]:
# Free-text question to search the indexed articles with.
query = "how to build a data pipeline using Airflow"

# Encode the question with the same model that produced the stored vectors.
query_vec = model.encode(query).tolist()

# Fetch the five closest matches together with their stored metadata.
results = index.query(vector=query_vec, top_k=5, include_metadata=True)

# One blank line, then the score and title, per hit.
for hit in results['matches']:
    print(f"\n Score: {hit['score']:.4f}\n Title: {hit['metadata']['title']}")



 Score: 0.7390
 Title: Building Pipelines In Apache Airflow - For Beginners A quick and simple demo for running DAGs on…

 Score: 0.5956
 Title: Data pipeline design patterns Choosing the right architecture with examples

 Score: 0.5673
 Title: 5 Fantastic Data Pipeline Orchestration Tools For R Explore Excellent Options for Data Pipeline…

 Score: 0.5610
 Title: End-to-End ML Pipelines with MLflow: Tracking, Projects & Serving A Definitive Guide to Advanced Use…

 Score: 0.5610
 Title: End-to-End ML Pipelines with MLflow: Tracking, Projects & Serving A Definitive Guide to Advanced Use…
