### Hybrid Search

In [None]:
from langchain_community.retrievers import PineconeHybridSearchRetriever  

In [2]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment so the
# os.environ / os.getenv lookups in later cells can find the API keys.
load_dotenv()

True

In [None]:
# Read the Pinecone API key from the environment. Indexing os.environ (rather
# than os.getenv) raises KeyError right here if the variable is missing.
api_key = os.environ["PINECONE_API_KEY"]

In [10]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Name of the Pinecone index used throughout this notebook.
index_name = "hybrid-search"

# Pinecone client, authenticated with the API key read from the environment.
pc = Pinecone(api_key=api_key)

# Create the index only when it is not there yet, so re-running the cell
# does not attempt to create it a second time.
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # size of the dense vector (the embedding model below reports 384)
        metric="dotproduct",  # metric chosen for the sparse values
        spec=serverless_spec,
    )

In [11]:
# Get a handle to the index by name; the retriever below is built on top of it.
index = pc.Index(index_name)
# Bare expression so the notebook displays the index object.
index

<pinecone.grpc.index_grpc.GRPCIndex at 0x28346925c30>

In [14]:
# Dense vector embeddings (the sparse BM25 encoder is set up in a later cell).

# os.getenv returns None when HF_TOKEN is not defined, and assigning None into
# os.environ raises TypeError (environment values must be str). Only re-export
# the token when it is actually present, so the cell also runs without one.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token

from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformer model used for the dense side of the hybrid search.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Bare expression so the notebook displays the loaded model configuration.
embeddings



HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [22]:
from pinecone_text.sparse import BM25Encoder

# Small demo corpus used to fit the sparse encoder.
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The lazy dog is brown",
    "The fox is brown",
]

# Start from the library's default BM25 encoder, then fit it on the corpus.
bm25 = BM25Encoder.default()
bm25.fit(corpus)

# Write the fitted encoder values to a JSON file.
bm25.dump("bm25_values.json")

100%|██████████| 3/3 [00:00<00:00, 1500.29it/s]


In [24]:
# Hybrid retriever: dense embeddings + BM25 sparse encoder over the Pinecone index.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25,
    index=index,
)
# Bare expression so the notebook displays the retriever configuration.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x0000028367296350>, index=<pinecone.grpc.index_grpc.GRPCIndex object at 0x0000028346925C30>)

In [25]:
# Index the same sentences the BM25 encoder was fitted on. Reusing `corpus`
# instead of repeating the list literal keeps the fitted texts and the
# indexed texts from drifting apart when one of them is edited.
retriever.add_texts(corpus)

100%|██████████| 1/1 [00:02<00:00,  2.38s/it]


In [26]:
# Query the hybrid retriever; the cell output lists the matching Documents,
# each with its match score under metadata['score'].
retriever.invoke("what was the colour of the dog?")

[Document(metadata={'score': 0.32993686}, page_content='The lazy dog is brown'),
 Document(metadata={'score': 0.26332238}, page_content='The quick brown fox jumps over the lazy dog'),
 Document(metadata={'score': 0.22421938}, page_content='The fox is brown')]

In [27]:
# Second query against the same index; the cell output lists the matching
# Documents with their scores.
retriever.invoke("who jumped over the dog?")

[Document(metadata={'score': 0.4352481}, page_content='The quick brown fox jumps over the lazy dog'),
 Document(metadata={'score': 0.20621812}, page_content='The lazy dog is brown'),
 Document(metadata={'score': 0.113595754}, page_content='The fox is brown')]