<a href="https://colab.research.google.com/github/Muntasir2179/vector-database-learning/blob/main/VD_Pinecone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and importing dependencies

In [1]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-3.0.1-py3-none-any.whl (201 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/201.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/201.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.0/201.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pinecone-client
Successfully installed pinecone-client-3.0.1


In [2]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
    api_key="af8ce84a-ab95-401c-ba72-b5249afdb37a"
)

# Accessing Indexes

In [3]:
pc.list_indexes()

{'indexes': [{'dimension': 3,
              'host': 'stock-prices-reg6nin.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'stock-prices',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [4]:
# delete index if it exists
if "index_name" in pc.list_indexes():
  pc.delete_index(name="index_name")

In [5]:
pc.describe_index('stock-prices')

{'dimension': 3,
 'host': 'stock-prices-reg6nin.svc.gcp-starter.pinecone.io',
 'metric': 'cosine',
 'name': 'stock-prices',
 'spec': {'pod': {'environment': 'gcp-starter',
                  'pod_type': 'starter',
                  'pods': 1,
                  'replicas': 1,
                  'shards': 1}},
 'status': {'ready': True, 'state': 'Ready'}}

# Inserting data into the index

* upsert() - It updates the data if it exists and insert the data if not exists.

In [6]:
index = pc.Index(name="stock-prices")

In [7]:
index.upsert(
  vectors=[
    {"id": "A", "values": [0.1, 0.1, 0.1], "metadata":{"stock_name": "Tesla", "sector": "automobile"}},
    {"id": "B", "values": [0.2, 0.2, 0.2]},
    {"id": "C", "values": [0.3, 0.3, 0.3]},
    {"id": "D", "values": [0.4, 0.4, 0.4]}
  ]
)

{'upserted_count': 4}

In [8]:
index.describe_index_stats()

{'dimension': 3,
 'index_fullness': 5e-05,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [9]:
import pandas as pd

data = {
    'id': ['C', 'E'],
    'vector': [[0.4, 0.4, 0.4], [1., 3., 4.]]
}

df = pd.DataFrame(data)
df

Unnamed: 0,id,vector
0,C,"[0.4, 0.4, 0.4]"
1,E,"[1.0, 3.0, 4.0]"


In [10]:
index.upsert(vectors=(zip(df.id, df.vector)))

{'upserted_count': 2}

In [11]:
index.describe_index_stats()

{'dimension': 3,
 'index_fullness': 5e-05,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

# Query the data

> It uses cosine similarith to find the nearest vector.

In [19]:
response = index.query(
    vector=[0.4, 0.4, 0.3],  # query vector
    top_k=1,  # the number of similar results we want to see
    include_values=True,  # by default it is false, it returns the vector as a response with maximum similarity
    include_metadata=True,
    filter={
        "sector": "automobile"
    }
)

In [20]:
response

{'matches': [{'id': 'A',
              'metadata': {'sector': 'automobile', 'stock_name': 'Tesla'},
              'score': 0.991126597,
              'values': [0.1, 0.1, 0.1]}],
 'namespace': '',
 'usage': {'read_units': 6}}

# Upserting data

In [21]:
pc.delete_index(name="stock-prices")
pc.list_indexes()

{'indexes': []}

> Manually created an index named "prices" in the Pinecone

In [24]:
pc.describe_index('prices')

{'dimension': 64,
 'host': 'prices-reg6nin.svc.gcp-starter.pinecone.io',
 'metric': 'cosine',
 'name': 'prices',
 'spec': {'pod': {'environment': 'gcp-starter',
                  'pod_type': 'starter',
                  'pods': 1,
                  'replicas': 1,
                  'shards': 1}},
 'status': {'ready': True, 'state': 'Ready'}}

In [25]:
import random

ids = ['a', 'b', 'c', 'd']
vectors = [[random.random() for _ in range(64)] for vec in range(4)]

ids, vectors

(['a', 'b', 'c', 'd'],
 [[0.8803878124144144,
   0.1586749025552069,
   0.2211769082974503,
   0.14000759419411624,
   0.4533031055278026,
   0.4781178003361357,
   0.85399869335948,
   0.37242689872571366,
   0.6357751952642251,
   0.9295821883008992,
   0.5856329729671373,
   0.10953589535388308,
   0.17886450587613922,
   0.8204560211577442,
   0.09328441506440543,
   0.8246393753821802,
   0.4016701560082727,
   0.04344156210662142,
   0.002700532485554019,
   0.9346211961973108,
   0.8088042841046542,
   0.9959387096580437,
   0.9130995306706018,
   0.9089346653718225,
   0.4729504750875497,
   0.998536481993486,
   0.7896941200082584,
   0.6235117205711002,
   0.9821218008036651,
   0.40935571668552906,
   0.41692135953175824,
   0.5259929180261936,
   0.7056863910335753,
   0.5017631733891517,
   0.3420537924639917,
   0.42476959785554125,
   0.5170459223267561,
   0.03442501571039713,
   0.8803034461157849,
   0.12856157887464237,
   0.9844815273510794,
   0.1028322355813257,
 

In [26]:
index = pc.Index(name='prices')

In [27]:
index.upsert(zip(ids, vectors))

{'upserted_count': 4}

In [28]:
index.describe_index_stats()

{'dimension': 64,
 'index_fullness': 4e-05,
 'namespaces': {'': {'vector_count': 4}},
 'total_vector_count': 4}

## Inserting data in batches over several requests

In order to insert data parallely:

https://docs.pinecone.io/docs/upsert-data#send-upserts-in-parallel

In [29]:
import itertools

vector_dimension = 64
vector_count = 1000

In [33]:
# (id, vector)
example_data_generator = map(
    lambda i:
    (f'id-{i}', [random.random() for _ in range(vector_dimension)]), range(vector_count)
)

# helper function to handle chunking of pairs
def chunks(iterable, batch_size=100):
  it = iter(iterable)
  chunk = tuple(itertools.islice(it, batch_size))

  while chunk:
    yield chunk
    chunk = tuple(itertools.islice(it, batch_size))

In [35]:
for chunk in chunks(example_data_generator):
  index.upsert(vectors=chunk)

In [36]:
index.describe_index_stats()

{'dimension': 64,
 'index_fullness': 0.00904,
 'namespaces': {'': {'vector_count': 904}},
 'total_vector_count': 904}