<a href="https://colab.research.google.com/github/Muntasir2179/vector-database-learning/blob/main/VD_Pinecone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and importing dependencies

In [1]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-3.0.1-py3-none-any.whl (201 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.0/201.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pinecone-client
Successfully installed pinecone-client-3.0.1


In [2]:
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# Read the API key from the PINECONE_API_KEY environment variable, falling back
# to an interactive prompt, so the secret never lands in the notebook source.
# Any key that was ever committed in plain text should be treated as compromised
# and revoked.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
)

# Accessing Indexes

In [3]:
pc.list_indexes()

{'indexes': [{'dimension': 64,
              'host': 'prices-reg6nin.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'prices',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [4]:
# delete index if it exists
# list_indexes() holds index descriptions, not bare names, so a string is never
# "in" it; membership has to be checked against the list returned by .names().
if "index_name" in pc.list_indexes().names():
  pc.delete_index(name="index_name")

In [None]:
pc.describe_index('stock-prices')

# Inserting data into the index

* upsert() - Updates the record if its ID already exists, and inserts it otherwise.

In [None]:
index = pc.Index(name="stock-prices")

In [None]:
# Four 3-dimensional records; only "A" carries metadata.
index.upsert(
  vectors=[
    {"id": "A", "values": [0.1] * 3, "metadata": {"stock_name": "Tesla", "sector": "automobile"}},
    *({"id": vec_id, "values": [component] * 3}
      for vec_id, component in (("B", 0.2), ("C", 0.3), ("D", 0.4))),
  ]
)

{'upserted_count': 4}

In [None]:
index.describe_index_stats()

{'dimension': 3,
 'index_fullness': 5e-05,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [None]:
import pandas as pd

# Two records keyed by id, each with a 3-component vector.
data = dict(
    id=['C', 'E'],
    vector=[[0.4, 0.4, 0.4], [1., 3., 4.]],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,id,vector
0,C,"[0.4, 0.4, 0.4]"
1,E,"[1.0, 3.0, 4.0]"


In [None]:
# Pair each id with its vector column-wise and send them as (id, vector) tuples.
index.upsert(vectors=zip(df["id"], df["vector"]))

{'upserted_count': 2}

In [None]:
index.describe_index_stats()

{'dimension': 3,
 'index_fullness': 5e-05,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

# Query the data

> The query uses cosine similarity (the metric this index is configured with) to find the nearest vectors.

In [None]:
# Nearest-neighbour lookup restricted to records whose metadata has sector == "automobile".
response = index.query(
    filter={"sector": "automobile"},
    include_metadata=True,  # attach each match's stored metadata
    include_values=True,  # off by default; when on, each match carries its stored vector
    top_k=1,  # how many of the most similar results to return
    vector=[0.4, 0.4, 0.3],  # query vector
)

In [None]:
response

{'matches': [{'id': 'A',
              'metadata': {'sector': 'automobile', 'stock_name': 'Tesla'},
              'score': 0.991126597,
              'values': [0.1, 0.1, 0.1]}],
 'namespace': '',
 'usage': {'read_units': 6}}

# Upserting data

In [None]:
# Drop the "stock-prices" index, then list the indexes that remain.
pc.delete_index(name="stock-prices")
pc.list_indexes()

> The "prices" index was created manually in the Pinecone console.

In [7]:
pc.describe_index('prices')

{'dimension': 64,
 'host': 'prices-reg6nin.svc.gcp-starter.pinecone.io',
 'metric': 'cosine',
 'name': 'prices',
 'spec': {'pod': {'environment': 'gcp-starter',
                  'pod_type': 'starter',
                  'pods': 1,
                  'replicas': 1,
                  'shards': 1}},
 'status': {'ready': True, 'state': 'Ready'}}

In [None]:
import random

# Four ids, each paired with a random 64-component vector (values in [0, 1)).
ids = list('abcd')
vectors = [[random.random() for _ in range(64)] for _ in ids]

ids, vectors

In [9]:
index = pc.Index(name='prices')

In [None]:
# Send the (id, vector) pairs built above in a single request.
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 4}

In [10]:
index.describe_index_stats()

{'dimension': 64,
 'index_fullness': 0.00904,
 'namespaces': {'': {'vector_count': 904}},
 'total_vector_count': 904}

## Inserting data in batches over several requests

To insert data in parallel, see:

https://docs.pinecone.io/docs/upsert-data#send-upserts-in-parallel

In [None]:
import itertools

# Number of components in each generated vector.
vector_dimension = 64
# How many (id, vector) pairs to generate.
vector_count = 1000

In [None]:
# Lazily produces (id, vector) pairs: ids run "id-0", "id-1", ... and each vector
# holds vector_dimension random floats. It is single-pass: once consumed it
# yields nothing further.
example_data_generator = (
    (f'id-{i}', [random.random() for _ in range(vector_dimension)])
    for i in range(vector_count)
)

# helper function to handle chunking of pairs
def chunks(iterable, batch_size=100):
  """Yield successive tuples of up to `batch_size` items drawn from `iterable`.

  The last tuple may be shorter than `batch_size`; an empty iterable yields nothing.
  """
  source = iter(iterable)
  # iter(callable, sentinel) keeps pulling slices until an empty tuple comes
  # back, i.e. until the source is exhausted.
  yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())

In [None]:
# Send the generated pairs one batch per request (chunks() defaults to 100 items).
# example_data_generator is a single-pass iterator, so re-running this cell
# without recreating it upserts nothing.
for chunk in chunks(example_data_generator):
  index.upsert(vectors=chunk)

In [None]:
index.describe_index_stats()

{'dimension': 64,
 'index_fullness': 0.00904,
 'namespaces': {'': {'vector_count': 904}},
 'total_vector_count': 904}

# Vector IDs must be strings

In [18]:
import random

# Demonstration: an integer id is rejected by the client. The error is caught
# and printed so the notebook still runs top to bottom, and the list is not
# named `id` so the builtin id() stays usable.
int_ids = [1]  # deliberately not a string
random_vector = [[random.random() for _ in range(64)]]

try:
  index.upsert(zip(int_ids, random_vector))
except Exception as err:  # expected here: PineconeApiTypeError
  print(f"{type(err).__name__}: {err}")

PineconeApiTypeError: Invalid type for variable 'id'. Required value type is str and passed type was int at ['id']

In [19]:
import random

# Upsert with a string id, which the client accepts. The list is named
# `str_ids` rather than `id` so the builtin id() is not shadowed for the rest
# of the session.
str_ids = ['1']
random_vector = [[random.random() for _ in range(64)]]

index.upsert(zip(str_ids, random_vector))

{'upserted_count': 1}

# Embeddings - Sentence Transformer

In [21]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transf

In [23]:
# Drop the "prices" index.
pc.delete_index(name='prices')

In [25]:
from sentence_transformers import SentenceTransformer
import torch

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [27]:
# loading the model from huggingface model hub
# NOTE(review): the published checkpoint id is spelled "all-MiniLM-L6-v2";
# confirm the casing used below resolves everywhere (e.g. case-sensitive caches).
model = SentenceTransformer(model_name_or_path="all-MiniLm-L6-v2", device=device)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [32]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [33]:
# created "embedding-transformer" manually
pc.list_indexes()

{'indexes': [{'dimension': 384,
              'host': 'embedding-transformer-reg6nin.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'embedding-transformer',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [34]:
# Encode a single sentence; the result is a 2-D array with one embedding row per input string.
text_embds = model.encode(['Apple is leading the techonological progress in recent time.'])
text_embds

array([[ 4.06453349e-02, -2.30671335e-02,  7.24701062e-02,
        -5.29509522e-02,  2.79339254e-02, -2.44009076e-03,
        -1.84074752e-02,  2.60139033e-02, -2.33547739e-03,
         1.09420232e-02,  2.08288524e-02,  7.57119209e-02,
        -2.20431876e-03,  1.29308552e-02,  3.25933215e-03,
        -4.85509122e-03, -1.79759189e-02, -1.05627403e-01,
        -3.46856602e-02, -6.88787401e-02, -1.81724783e-02,
         3.98154855e-02,  4.74057868e-02,  7.66553683e-03,
         6.12883419e-02,  8.12241733e-02, -7.75113760e-04,
        -9.23549235e-02,  6.37516205e-04, -4.88603546e-04,
        -7.63772428e-02,  4.00515087e-02,  2.91253738e-02,
         5.11594266e-02, -6.09511659e-02, -5.24287559e-02,
         7.97616318e-02,  2.37959325e-02,  2.14159545e-02,
        -7.07309544e-02, -6.36180565e-02, -4.12112661e-02,
         2.56526144e-03,  1.12548769e-01,  2.31901165e-02,
         3.51442839e-03,  2.62660254e-02, -6.99062943e-02,
        -3.75706889e-02,  4.59561087e-02, -5.44566810e-0