In [1]:
import os 
from dotenv import load_dotenv
load_dotenv

<function dotenv.main.load_dotenv(dotenv_path: Union[str, ForwardRef('os.PathLike[str]'), NoneType] = None, stream: Optional[IO[str]] = None, verbose: bool = False, override: bool = False, interpolate: bool = True, encoding: Optional[str] = 'utf-8') -> bool>

In [14]:
pinecone_db=os.getenv('pinecone_db')
HF_TOKEN=os.getenv('HF_TOKEN')

## Creating DB

In [6]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import Pinecone,ServerlessSpec

index_name='hybrid-search-langchain-pinecone'
pc=Pinecone(api_key=pinecone_db)

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric='dotproduct',
        spec=ServerlessSpec(cloud='aws',region="us-east-1"),
    )


In [7]:
index=pc.Index(index_name)
index

  from .autonotebook import tqdm as notebook_tqdm


<pinecone.db_data.index.Index at 0x1a3bdf69810>

In [15]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model='all-MiniLM-L6-v2')
embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [16]:
from pinecone_text.sparse import BM25Encoder

bm25_encoder=BM25Encoder().default()
bm25_encoder

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x1a38d726a90>

In [17]:
sentences=[
    "The sun was shining brightly over the quiet village.",
    "She quickly finished her homework before dinner.",
    "They decided to take a long walk along the river."
]

In [None]:
# TFIDF
bm25_encoder.fit(sentences)
bm25_encoder.dump("bm25_values.json")
bm25_encoder=BM25Encoder().load("bm25_values.json")


100%|██████████| 3/3 [00:00<00:00, 73.54it/s]


In [21]:
retriever=PineconeHybridSearchRetriever(embeddings=embeddings,sparse_encoder=bm25_encoder,index=index)
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x000001A38DBF5590>, index=<pinecone.db_data.index.Index object at 0x000001A3BDF69810>)

In [23]:
retriever.add_texts(["The sun was shining brightly over the quiet village.",
    "She quickly finished her homework before dinner.",
    "They decided to take a long walk along the river."])

100%|██████████| 1/1 [00:02<00:00,  2.18s/it]


In [30]:
retriever.invoke("who decided to take a walk")

[Document(metadata={'score': 0.517222524}, page_content='They decided to take a long walk along the river.'),
 Document(metadata={'score': 0.0809991583}, page_content='The sun was shining brightly over the quiet village.'),
 Document(metadata={'score': 0.0613835752}, page_content='She quickly finished her homework before dinner.')]