In [8]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
import os 
from pinecone import Pinecone,ServerlessSpec
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone_text.sparse import BM25Encoder



In [1]:
api_key = ""

In [3]:
index_name = "hybrid-search-langchain-pinecone"

pc = Pinecone(api_key=api_key)

#create a index 
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        dimension= 384,
        metric='dotproduct',
        spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 


    )

In [4]:
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x1bf72e17dc0>

In [6]:
#vector embeddings and sparse matrix 

embeddings = HuggingFaceEmbeddings()

In [7]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [9]:
bm25_encoder = BM25Encoder().default()

In [10]:
sentences = [
    "This is a sentence about AI.",
    "Another sentence about AI.",
    "AI is transforming the world.",
    "AI is making computers smarter.",
    "AI is enabling new industries.",
    "AI is changing the way we live and work.",
    "AI is creating jobs.",
    "AI is making us more connected.",
    "AI is changing the way we learn and grow.",
    "AI is making us more creative.",
    "AI is making us better at solving problems.",
    "AI is making us more resilient.",
    "AI is making us more empathetic.",
]

In [11]:
bm25_encoder.fit(sentences)

  0%|          | 0/13 [00:00<?, ?it/s]

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x1bf67106f50>

In [12]:
#store the values to the json file 

bm25_encoder.dump("bm25_values.json")

In [13]:
retriever = PineconeHybridSearchRetriever(embeddings=embeddings,sparse_encoder=bm25_encoder,index=index)

In [14]:
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x000001BF67106F50>, index=<pinecone.data.index.Index object at 0x000001BF72E17DC0>)

In [21]:
retriever.add_texts([]
)

0it [00:00, ?it/s]

In [22]:
retriever.invoke("Explain about AI")

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 18 Jul 2024 12:48:12 GMT', 'Content-Type': 'application/json', 'Content-Length': '102', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '55', 'x-pinecone-request-id': '8427900658331817532', 'x-envoy-upstream-service-time': '55', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Vector dimension 768 does not match the dimension of the index 384","details":[]}
