In [1]:
from pinecone import Pinecone,ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
import time

In [2]:
# Getting up the openai key file on local pc
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()

# Retrieve the API key from the environment
pc_api_key = os.getenv("PINECONE_KEY")

In [3]:
# Initialize client
pc = Pinecone(api_key=pc_api_key)

In [4]:
# list existing indexes
pc.list_indexes()

[
    {
        "name": "hybrid-search-langchain-pinecone",
        "dimension": 384,
        "metric": "dotproduct",
        "host": "hybrid-search-langchain-pinecone-3eg6zed.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "deletion_protection": "disabled"
    },
    {
        "name": "genai-serverless",
        "dimension": 1024,
        "metric": "cosine",
        "host": "genai-serverless-3eg6zed.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "deletion_protection": "disabled"
    }
]

In [5]:
index_name = "genai-serverless"  #Allowed characters are lowercase letters, numbers, and hyphens and the name may not begin or end with hyphens. Maximum length is 45 characters.
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

#if index_name exists delete it
if index_name in existing_indexes:
    pc.delete_index(index_name)
    while index_name in existing_indexes:
        existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
        time.sleep(1)
        
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

In [6]:
host = pc.describe_index(index_name)['host']
index = pc.Index(index_name,host=host)

In [7]:
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en",encode_kwargs={'normalize_embeddings':True})

In [8]:
# List of sentences to add to Pinecone
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming the world.",
    "The sky is clear and the sun is shining.",
    "Machine learning models require a lot of data.",
    "Natural language processing is a fascinating field.",
    "There is a bear in the town square."
]

page_numbers = [1,2,3,4,5,6,7,8,9,10]

In [9]:
# Convert the sentences to embeddings
embeddings_list = embeddings.embed_documents(sentences)
len(embeddings_list[0])


1024

In [10]:
# Prepare the data for Pinecone with metadata
vectors = [
    {
        "id": str(i),
        "values": embedding,
        "metadata": {"sentence": sentence, "page_number": page_numbers[i]}
    }
    for i, (embedding, sentence) in enumerate(zip(embeddings_list, sentences))
]


In [11]:
vectors

[{'id': '0',
  'values': [-0.05134882777929306,
   0.003944123163819313,
   0.000425247591920197,
   -0.004382328130304813,
   -0.06313794106245041,
   -0.020360829308629036,
   -0.020125361159443855,
   0.0034135342575609684,
   0.04519397392868996,
   0.04413892328739166,
   0.04295136034488678,
   0.022044993937015533,
   0.02655239775776863,
   -0.012788763269782066,
   0.011266466230154037,
   0.03475988656282425,
   -0.03237318620085716,
   -0.032693807035684586,
   -0.029884064570069313,
   0.014361388050019741,
   0.03672785684466362,
   -0.02685377188026905,
   -0.059346526861190796,
   -0.0027741906233131886,
   0.017337733879685402,
   0.02450679987668991,
   0.023505983874201775,
   -0.030779575929045677,
   0.065408855676651,
   0.050638843327760696,
   0.029802901670336723,
   -0.005086053628474474,
   0.012845730409026146,
   -0.053968366235494614,
   -0.011869185604155064,
   -0.006404159590601921,
   0.0007347920327447355,
   -0.031000537797808647,
   0.005260278470814

In [12]:
# Upsert the vectors to Pinecone
index.upsert(vectors)

# Show the updated database
print("Embeddings added to Pinecone successfully.")

Embeddings added to Pinecone successfully.
Current vectors in the index:
{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
