In [3]:
corpus = [
"Python is a versatile programming language used for web development, data science, and artificial intelligence.",
"Artificial intelligence is transforming healthcare by enabling faster diagnoses and personalized treatments.",
"Machine learning techniques, such as deep learning, have revolutionized image recognition and language processing.",
"Web development often involves Python alongside HTML, CSS, and JavaScript.",
"The healthcare industry benefits from AI, robotics, and machine learning.",
"FAISS is a library for fast similarity search and clustering of dense vectors.",
"BM25 is used for keyword-based retrieval in information systems.",

]

In [2]:
query = "AI in healthcare and web development"


In [4]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings

In [6]:
bm25_retriever = BM25Retriever.from_texts(
    corpus, metadatas=[{"source":"bm25"}] * len(corpus)
)

In [8]:
bm25_retriever.k = 2

In [9]:
bm25_retriever.invoke(query)

[Document(metadata={'source': 'bm25'}, page_content='Web development often involves Python alongside HTML, CSS, and JavaScript.'),
 Document(metadata={'source': 'bm25'}, page_content='BM25 is used for keyword-based retrieval in information systems.')]

In [11]:
embedding = OllamaEmbeddings( model="phi3")

In [13]:
faiss_vectorstore = FAISS.from_texts(
    texts=corpus, embedding=embedding, metadatas=[{"source": "faiss"}]* len(corpus)
)

In [14]:
fais_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k":2})

In [15]:
fais_retriever.invoke(query)

[Document(metadata={'source': 'faiss'}, page_content='The healthcare industry benefits from AI, robotics, and machine learning.'),
 Document(metadata={'source': 'faiss'}, page_content='BM25 is used for keyword-based retrieval in information systems.')]

In [17]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, fais_retriever], weights=[0.5,0.5]
)

In [18]:
docs = ensemble_retriever.invoke(query)

In [19]:
docs

[Document(metadata={'source': 'bm25'}, page_content='BM25 is used for keyword-based retrieval in information systems.'),
 Document(metadata={'source': 'bm25'}, page_content='Web development often involves Python alongside HTML, CSS, and JavaScript.'),
 Document(metadata={'source': 'faiss'}, page_content='The healthcare industry benefits from AI, robotics, and machine learning.')]