In [1]:
!pip install --upgrade --quiet pinecone-client pinecone-text pinecone-notebooks

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for wget (setup.py) ... [?25l[?25hdone


In [2]:
import os
from getpass import getpass

# Never store the Pinecone key in the notebook itself: take it from the
# PINECONE_API_KEY environment variable, and prompt (without echo) only when
# that variable is missing or empty.
api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

In [4]:
pip install langchain langchain_community pinecone-client

Collecting langchain_community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.2-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading

In [5]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

### Analogy

Think of the index like a library catalog:

- **Books** = documents or items you want to search.
- **Book summaries** = embeddings or vectors (representing the content of the books).
- **Catalog** = the index, which organizes the books and their summaries for quick lookup.

When you search for a topic, the catalog helps you find relevant books efficiently.

### In Summary

Creating an index is essential because:

- It organizes your data (vectors + metadata).
- It enables fast, scalable, and accurate searches.
- It supports advanced use cases like semantic search, hybrid search, and recommendations.

In [6]:
import os
from pinecone import Pinecone, ServerlessSpec
index_name = "hybrid-search-langchain-pinecone"

# Initialize the Pinecone client with the key defined in an earlier cell.
pc = Pinecone(api_key=api_key)

# Create the index only when it does not exist yet.
# `list_indexes()` yields index description objects rather than plain strings, so a
# membership test against it never matches a name; compare against `.names()` instead.
# Without this, re-running the cell tries to create an index that already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the dense embedding size; the Hugging Face model used
        # in this notebook produces 384-dimensional vectors.
        dimension=384,
        # Pinecone's sparse-dense (hybrid) queries are documented to need dot product.
        metric="dotproduct",
        # ServerlessSpec: the index runs in serverless mode
        # (i.e., automatically scaled based on usage).
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [7]:
# Get a handle to the index by name; this is the object later passed to the retriever.
index = pc.Index(index_name)
# Bare last expression: displays the handle's repr as the cell output.
index

<pinecone.data.index.Index at 0x7c9f6e5352d0>

In [16]:
# Dense vector embeddings (the semantic half of hybrid search).
import os
from getpass import getpass

# `langchain.embeddings` is a deprecated re-export; import from langchain_community,
# which this notebook already installs and uses for the retriever.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Keep the Hugging Face token out of the notebook: use the value already present in
# the environment, and prompt (without echo) only when it is missing or empty.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Bare last expression: displays the configured embedding model as the cell output.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [17]:
from pinecone_text.sparse import BM25Encoder

# `default()` is a factory that returns a ready-made encoder, so call it on the class
# instead of building a throwaway instance first.
# NOTE(review): per the pinecone-text docs the default parameters are pre-fitted on the
# MS MARCO passages corpus (BM25 statistics, not plain TF-IDF) — confirm for the installed version.
bm25_encoder = BM25Encoder.default()
# Bare last expression: displays the encoder's repr as the cell output.
bm25_encoder

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7c9e45abc970>

In [18]:
# Toy corpus of three sentences used to fit the BM25 encoder in a later cell.
# NOTE: the name is misspelled ("senetences"), but a later cell references it
# under this exact spelling, so it is kept as-is here.
senetences = [
    "Artificial intelligence is transforming industries.",
    "Pinecone provides vector database solutions.",
    "Hybrid search combines semantic and keyword search."
]

In [20]:
import nltk
# Download the NLTK "punkt_tab" resource; the call's return value (True in the
# recorded output) is displayed as the cell result.
# NOTE(review): presumably needed by the BM25 encoder's tokenizer when fitting — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [27]:
# Fit the BM25 encoder on the example sentences.
bm25_encoder.fit(senetences)

# Use ONE path for both writing and reading the fitted parameters. The earlier version
# dumped to a relative path but loaded from "/content/...", which only lines up when
# the working directory happens to be /content (e.g. on Colab).
bm25_params_path = "bm25_values.json"

# Store the fitted values in a JSON file.
bm25_encoder.dump(bm25_params_path)

# Load them back into a fresh BM25Encoder object.
bm25_encoder = BM25Encoder().load(bm25_params_path)


  0%|          | 0/3 [00:00<?, ?it/s]

In [28]:
# Wire the three pieces built in earlier cells into one hybrid retriever.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,        # dense embedding model
    sparse_encoder=bm25_encoder,  # BM25 sparse encoder
    index=index,                  # Pinecone index handle
)

In [29]:
# Display the retriever's repr to confirm which embeddings, sparse encoder and index it holds.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x7c9e420399c0>, index=<pinecone.data.index.Index object at 0x7c9f6e5352d0>)

In [30]:
# Index the same sentences the BM25 encoder was fitted on. Reusing the list defined
# in an earlier cell (instead of a pasted copy) keeps the fitted corpus and the
# indexed corpus from drifting apart when one of them is edited.
retriever.add_texts(texts=senetences)

  0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
# Query the retriever; in the recorded output the sentence about hybrid search
# comes back first, with the highest score.
retriever.invoke("What is Hybrid Search?")

[Document(metadata={'score': 0.603588581}, page_content='Hybrid search combines semantic and keyword search.'),
 Document(metadata={'score': 0.117051736}, page_content='Pinecone provides vector database solutions.'),
 Document(metadata={'score': 0.0879552066}, page_content='Artificial intelligence is transforming industries.')]

In [32]:
# Query with the abbreviation "AI", which does not appear in any stored sentence;
# in the recorded output the "Artificial intelligence ..." sentence still ranks first.
retriever.invoke("What is AI?")

[Document(metadata={'score': 0.284713715}, page_content='Artificial intelligence is transforming industries.'),
 Document(metadata={'score': 0.0795862824}, page_content='Hybrid search combines semantic and keyword search.'),
 Document(metadata={'score': 0.0588818714}, page_content='Pinecone provides vector database solutions.')]