**Vector databases** are purpose-built to handle the unique structure of **vector embeddings**. They **index vectors** for **easy search and retrieval** by comparing values and finding those that are most similar to one another.

In [29]:
!pip install sentence-transformers pinecone-client -q

In [30]:
import pandas as pd

In [40]:
# Source: Quora Question Pairs dataset
# https://www.kaggle.com/datasets/quora/question-pairs-dataset
# Only the first 500 rows are read (presumably to keep the demo quick).
data = pd.read_csv(
    "/content/questions.csv",
    nrows=500,
)
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [32]:
# Number of rows currently held in `data`.
# NOTE(review): the load cell passes nrows=500, so a fresh run yields at most
# 500 here; the saved output appears to come from an earlier, unrestricted load.
data.shape[0]

404351

In [41]:
# Preview the first few question pairs labelled as duplicates (is_duplicate == 1).
duplicate_mask = data['is_duplicate'].eq(1)
data.loc[duplicate_mask].head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [34]:
import torch
from sentence_transformers import SentenceTransformer

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on a runtime without CUDA (a hard-coded 'cuda' fails there).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sentence-embedding model used for both the stored questions and the queries.
model = SentenceTransformer('all-mpnet-base-v2', device=device)

In [42]:
# Encode one sample sentence and show the length of the resulting embedding.
# NOTE(review): the Pinecone index presumably has to be created with this same
# dimension - confirm against the index configuration.
sample_sentence = "This is sentence"
embeding = model.encode(sample_sentence)
len(embeding)

768

In [46]:
import os
from getpass import getpass

import pinecone

# Take the API key from the PINECONE_API_KEY environment variable, or prompt for
# it interactively, so the secret is never stored in the notebook source.
pinecone_api_key = os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')

pinecone.init(
	api_key=pinecone_api_key,
	environment='gcp-starter'
)

# Handle to the index named 'semantic'.
# NOTE(review): this notebook never creates the index, so it is assumed to exist
# already with a dimension matching the embedding model - TODO confirm.
index = pinecone.Index('semantic')

**How vectors are stored in Pinecone**

In [37]:
# Each record sent to Pinecone is a tuple; a batch is a list of them: [(id, vector, metadata), ...]

In [47]:
# Embed `question1` for every row and upsert the records into Pinecone in
# fixed-size batches. Each record is an (id, vector, metadata) tuple.
#
# Slicing the frame by BATCH_SIZE guarantees that the final, possibly shorter
# batch is sent too. The previous flush condition compared the pending batch
# length with len(data), which never matches for a trailing partial batch once
# the frame holds more than one batch, so those rows were silently dropped.
BATCH_SIZE = 50

for batch_start in range(0, len(data), BATCH_SIZE):
    batch = data.iloc[batch_start:batch_start + BATCH_SIZE]

    # One encode call per batch instead of one per row.
    batch_vectors = model.encode(batch['question1'].tolist()).tolist()

    question_list = [
        (
            str(row_id),  # ids are passed to Pinecone as strings
            vector,
            {
                'is_duplicate': int(is_duplicate),
                'question1': question,
            },
        )
        for row_id, is_duplicate, question, vector in zip(
            batch['id'], batch['is_duplicate'], batch['question1'], batch_vectors
        )
    ]
    index.upsert(vectors=question_list)

# Leave the name bound to an empty list once everything has been sent.
question_list = []

In [48]:
# Semantic search: embed the query text and ask the index for its two best matches.
query = "How do I prepare for civil service?"
xq = model.encode([query]).tolist()  # one-element batch, converted to a nested list
result = index.query(
    xq,
    top_k=2,
    includeMetadata=True,  # return each match's stored metadata as well
)
result

{'matches': [{'id': '38',
              'metadata': {'is_duplicate': 1.0,
                           'question1': 'How do we prepare for UPSC?'},
              'score': 0.616592407,
              'values': []},
             {'id': '29',
              'metadata': {'is_duplicate': 1.0,
                           'question1': 'How should I prepare for CA final '
                                        'law?'},
              'score': 0.611808121,
              'values': []}],
 'namespace': ''}

In [49]:
# Second search example, using the same encode-then-query steps with new text.
# NOTE(review): "levergae" looks like a typo for "leverage"; it is left as-is so
# the saved output still corresponds to this exact query string.
query = "How to levergae internet for business"
xq = model.encode([query]).tolist()  # nested list holding the single query vector
result = index.query(
    xq,
    top_k=2,
    includeMetadata=True,
)
result

{'matches': [{'id': '78',
              'metadata': {'is_duplicate': 0.0,
                           'question1': 'How can I make money through the '
                                        'Internet?'},
              'score': 0.473412395,
              'values': []},
             {'id': '300',
              'metadata': {'is_duplicate': 0.0,
                           'question1': 'How should I start small business '
                                        'effectively?'},
              'score': 0.453923494,
              'values': []}],
 'namespace': ''}