In [2]:
!pip install --upgrade --quiet pinecone-client pinecone-text pinecone-notebooks sentence-transformers

In [3]:
import os
from getpass import getpass

# Credentials must not live in the notebook source: read the Pinecone key from
# the PINECONE_API_KEY environment variable, and fall back to an interactive
# prompt (input is not echoed or stored in the notebook) when it is not set.
# NOTE: the key that used to be hard-coded on this line has been exposed and
# should be revoked/rotated in the Pinecone console.
api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

In [4]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [5]:
import os
from pinecone import Pinecone, ServerlessSpec

# Name of the Pinecone index that the following cells create and query.
index_name = "hybrid-search-experiment"

# Pinecone client built from the API key defined in an earlier cell.
pc = Pinecone(api_key=api_key)

# Show the client object as the cell output.
pc

<pinecone.control.pinecone.Pinecone at 0x31d2e9e50>

In [6]:
# Create the index only when it does not exist yet, so this cell can be
# re-run without attempting to create it a second time.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # the Hugging Face embedding model used here outputs 384-dimensional vectors
        metric="dotproduct",  # sparse values are supported only with the dotproduct metric
        spec=serverless_spec,
    )


# List the index names again to confirm the index is present.
print(pc.list_indexes().names())

['hybrid-search-experiment']


In [7]:
# Handle to the index named above; it is passed to the retriever later on.
pinecone_index = pc.Index(index_name)

# Show the handle as the cell output.
pinecone_index

<pinecone.data.index.Index at 0x31d2e9f70>

In [9]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# os.environ accepts only strings: assigning the result of os.getenv() directly
# raises TypeError when HUGGINGFACE_KEY is not set (os.getenv returns None).
# Copy the value only when it is actually configured.
huggingface_key = os.getenv("HUGGINGFACE_KEY")
if huggingface_key is not None:
    os.environ["HS_KEY"] = huggingface_key
# NOTE(review): nothing else in this notebook reads "HS_KEY" -- confirm whether
# a different variable name (e.g. the Hugging Face token variable) was intended.

from langchain_huggingface import HuggingFaceEmbeddings

# Dense embedding model; its output below reports a word embedding dimension of 384.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embeddings



HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [14]:
from pinecone_text.sparse import BM25Encoder

# `default` is a static factory on BM25Encoder, so call it on the class rather
# than constructing a throwaway instance first. The encoder it returns is the
# sparse encoder used by the retriever below.
bm25_encoder = BM25Encoder.default()
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x321e8ee80>

In [19]:
import nltk
# Fetch the NLTK 'punkt_tab' resource before fitting the encoder.
nltk.download('punkt_tab')

# Small example corpus; the same sentences are added to the index later.
sentences = [
    "In 2023 I will be 30 years old",
    "In 2021 I was in Paris",
    "In 2022 I visited the Eiffel Tower",
]

# Fit the BM25 encoder on the example corpus.
bm25_encoder.fit(sentences)

# Write the fitted values to disk, then read them back from the same file.
bm25_values_path = "bm25_values.json"
bm25_encoder.dump(bm25_values_path)
bm25_encoder.load(bm25_values_path)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/higinosilva/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
100%|██████████| 3/3 [00:00<00:00, 3021.11it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x321e8ee80>

In [20]:
# Hybrid retriever wiring together the dense embeddings, the BM25 sparse
# encoder and the Pinecone index created above.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=pinecone_index,
)
retriever


PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x321e8ee80>, index=<pinecone.data.index.Index object at 0x31d2e9f70>)

In [21]:
# Add the example sentences to the index through the retriever.
retriever.add_texts(texts=sentences)

100%|██████████| 1/1 [00:05<00:00,  5.17s/it]


In [22]:
# Query the retriever; the returned documents carry a score in their metadata.
query = "Where was I in 2021?"
retriever.invoke(query)

[Document(metadata={'score': 0.631594062}, page_content='In 2021 I was in Paris'),
 Document(metadata={'score': 0.225767761}, page_content='In 2023 I will be 30 years old'),
 Document(metadata={'score': 0.21348311}, page_content='In 2022 I visited the Eiffel Tower')]