In [1]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_huggingface.embeddings import  HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from langchain_huggingface.chat_models import ChatHuggingFace
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any), then read the two credentials
# used later in this notebook. Each value is None when its variable is not set.
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

In [2]:
# Dense embedding model used for the semantic half of the hybrid search.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Display the configured embedding object as the cell output.
embedding

  from .autonotebook import tqdm as notebook_tqdm


HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [3]:
# Fail fast when the token is missing. Wrapping the value in str() would turn a
# missing token (None) into the literal string "None" and send that as the
# credential, which only surfaces later as an opaque authentication error.
if not HUGGINGFACE_TOKEN:
    raise ValueError(
        "HUGGINGFACE_TOKEN is not set; define it in the environment or in the .env file."
    )

# Hosted text-generation endpoint for the Mistral 7B instruct model.
# do_sample=False turns sampling off; repetition_penalty slightly above 1
# discourages repeated tokens.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    repetition_penalty=1.03,
    huggingfacehub_api_token=HUGGINGFACE_TOKEN,
)

# Chat-model wrapper around the raw text-generation endpoint.
chat = ChatHuggingFace(llm=llm, verbose=True)

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader

# Directory holding the source PDFs, relative to the notebook's working directory.
BOOKS_DIR = "../books"

# Check the directory up front so a wrong path is reported here rather than
# showing up later as an empty corpus.
if not os.path.isdir(BOOKS_DIR):
    raise FileNotFoundError(f"Directory {BOOKS_DIR} does not exist.")

loader = PyPDFDirectoryLoader(BOOKS_DIR)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Keep the raw pages and the split chunks under separate names so each stage
# can be inspected; `docs` (the chunks) is what later cells consume.
pages = loader.load()
if not pages:
    raise ValueError(f"No PDF pages were loaded from {BOOKS_DIR}.")

docs = text_splitter.split_documents(pages)
print(len(docs))

316


In [19]:
from pinecone import Pinecone, ServerlessSpec

INDEX_NAME = "hybrid-search"
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the serverless index only when it is not already listed, so re-running
# this cell does not attempt a duplicate creation.
existing_index_names = pc.list_indexes().names()
if INDEX_NAME not in existing_index_names:
    # NOTE(review): dimension=384 presumably matches the output size of the
    # embedding model configured earlier, and dotproduct is presumably chosen
    # for hybrid (sparse + dense) queries - confirm against the model card and
    # the Pinecone docs.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=INDEX_NAME,
        dimension=384,
        metric="dotproduct",
        spec=serverless_spec,
    )


In [20]:
# Handle to the Pinecone index. Despite the variable's name, this is an Index
# object rather than a string; the retriever cell refers to it by this name.
index_name = pc.Index(name=INDEX_NAME)
index_name

<pinecone.data.index.Index at 0x2619db11ee0>

In [9]:
from pinecone_text.sparse import BM25Encoder

# `default` is a factory on the class that builds its own encoder, so call it on
# the class instead of constructing a throwaway instance first.
bm25encoder = BM25Encoder.default()
bm25encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x2619e3a0c80>

In [13]:
# Download the NLTK "punkt_tab" resource into the local nltk_data directory.
# Presumably required by the BM25 encoder's tokenizer when it is fitted later -
# TODO confirm. The recorded run displays True as the call's result.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\Fine
[nltk_data]     Gallery\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [14]:
# File used to persist the fitted BM25 parameters.
BM25_PARAMS_FILE = "bm25_values.json"

# The sparse encoder works on plain strings, so keep only each chunk's text.
corpus = [chunk.page_content for chunk in docs]

# Fit the encoder on this corpus.
bm25encoder.fit(corpus)

# Persist the fitted parameters, then rebuild the encoder from that file so the
# saved state is the one actually used from here on.
bm25encoder.dump(BM25_PARAMS_FILE)
bm25encoder = BM25Encoder().load(BM25_PARAMS_FILE)

100%|██████████| 316/316 [00:03<00:00, 90.52it/s] 


In [21]:
# Hybrid retriever combining the dense embedding model and the BM25 sparse
# encoder over the Pinecone index handle.
retriever = PineconeHybridSearchRetriever(
    embeddings=embedding,
    sparse_encoder=bm25encoder,
    index=index_name,
)

# Display the configured retriever as the cell output.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x000002619F3AD2B0>, index=<pinecone.data.index.Index object at 0x000002619DB11EE0>)

In [22]:
# Push every chunk text in `corpus` through the retriever into the Pinecone index.
# Presumably each text is encoded with both the dense and sparse encoders before
# being upserted - confirm against the retriever docs. This is a slow network
# step: the recorded run shows 10 progress steps taking about 77 seconds.
retriever.add_texts(corpus)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:17<00:00,  7.72s/it]


In [23]:
# Smoke-test retrieval with a question that appears verbatim as a heading in the
# indexed text (see the recorded result).
question = "WHAT IS THE AMERICAN COMMUNITY SURVEY?"
retriever.invoke(question)

[Document(metadata={'score': 0.731581748}, page_content='from the ACS and how they differ \nfrom those based on the CPS \nASEC, SIPP, and SAIPE, refer \nto the information and survey \ncomparisons at <www.census.gov/\ntopics/income-poverty/poverty/\nguidance/data-sources.html>.\nWHAT IS THE AMERICAN COMMUNITY SURVEY?\nThe American Community Survey (ACS) is a nationwide survey \ndesigned to provide communities with reliable and timely \ndemographic, social, economic, and housing data for the nation, \nstates, congressional districts, counties, places, and other localities \nevery year. It has an annual sample size of about 3.5 million \naddresses across the United States and Puerto Rico and includes \nboth housing units and group quarters (e.g., nursing facilities and \nprisons).1 The ACS is conducted in every county throughout the \nnation and every municipio in Puerto Rico, where it is called the \nPuerto Rico Community Survey. Beginning in 2006, ACS 1-year data \nhave been released a