In [38]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever

Load data from a web page using a `WebBaseLoader`

In [43]:
# Point the loader at the Arsenal F.C. Wikipedia article.
loader = WebBaseLoader(
    'https://en.wikipedia.org/wiki/Arsenal_F.C.'
)

# Fetch the page; the result is bound to `docs` for the cells that follow.
docs = loader.load()

The result is a list of `Document` objects.

In [44]:
# Report how many Document objects the loader produced.
print(f'Loaded webpage into {len(docs)} documents')

Loaded webpage into 1 documents


In this case, there is just one document in the array.   
What are its properties?

In [45]:
# Inspect the field names of the first (and, here, only) loaded document.
docs[0].dict().keys()

dict_keys(['page_content', 'metadata', 'type'])

We need to split the web page into smaller chunks, so the retriever can return only the relevant parts of the page.

In [57]:
# No arguments are passed, so the splitter runs with its default settings.
text_splitter = RecursiveCharacterTextSplitter()
# NOTE(review): `docs` is both the input and the output here, so the unsplit
# page is no longer reachable afterwards and re-running this cell re-splits
# the already-split chunks.
docs = text_splitter.split_documents(docs)
# Report how many chunks the page was split into.
print(f'Split into {len(docs)} documents')

Split into 36 documents


In [58]:
# Preview only the first few chunks; displaying the whole list would dump the
# text of every chunk into the cell output.
docs[:3]

[Document(page_content="Arsenal F.C. - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\n Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1History\n\n\n\nToggle History subsection\n\n\n\n\n\n1.11886–1912: from Dial Square to Arsenal\n\n\n\n\n\n\n\n1.21912–1925: Bank of England club\n\n\n\n\n\n\n\n1.31925–1934

But we can do all of this more simply, by loading and splitting in a single call:

In [60]:
# One-step alternative: fetch the page and split it in a single call instead
# of calling load() and split_documents() separately.
docs = loader.load_and_split(
    text_splitter=RecursiveCharacterTextSplitter())

# Count the chunks held in `docs`, the variable assigned just above, so the
# cell also works on a freshly restarted kernel.
print(f'Split into {len(docs)} docs')

Split into 36 docs


# Semantic search

In [62]:
# Embed the split chunks held in `docs` and index them in a FAISS vector store.
# NOTE(review): OpenAIEmbeddings presumably needs OpenAI credentials configured
# in the environment - confirm before running on a fresh machine.
vectorstore = FAISS.from_documents(docs, OpenAIEmbeddings())

In [64]:
# Expose the FAISS index through the retriever interface; no search options
# are passed, so the vector store's defaults apply.
semantic_retriever = vectorstore.as_retriever()

In [73]:
# The question that each retriever in this notebook is asked.
query = 'Who is Thierry Henry?'

# Run the query through the FAISS-backed retriever and display the hits.
# NOTE(review): the BM25 and hybrid cells rebind `top_docs` as well, so its
# contents depend on which retriever cell ran last.
top_docs = semantic_retriever.invoke(query)
top_docs


[Document(page_content='Executive Vice-chair\n\nTim Lewis\n\n\nDirector\n\nLord Harris of Peckham\n\nStatistics and records\nFurther information: List of Arsenal F.C. records and statistics\nThierry Henry is Arsenal\'s record goalscorer, with 228 goals in all competitions.[247]\nArsenal\'s tally of 13 League Championships is the third highest in English football, after Manchester United (20) and Liverpool (19),[248]\nand they were the first club to reach a seventh and an eighth League Championship. As of June 2020, they are one of seven teams, the others being Manchester United, Blackburn Rovers, Chelsea, Manchester City, Leicester City and Liverpool, to have won the Premier League since its formation in 1992.[249]\nThey hold the highest number of FA Cup trophies, with 14.[250] The club is one of only six clubs to have won the FA Cup twice in succession, in 2002 and 2003, and 2014 and 2015.[251]\nArsenal have achieved three League and FA Cup "Doubles" (in 1971, 1998 and 2002), a feat o

In [70]:
# Print the text of the first document currently held in `top_docs`.
# NOTE(review): `top_docs` is reassigned by the other retriever cells, so run
# this immediately after the semantic-search cell.
print(top_docs[0].page_content)

Executive Vice-chair

Tim Lewis


Director

Lord Harris of Peckham

Statistics and records
Further information: List of Arsenal F.C. records and statistics
Thierry Henry is Arsenal's record goalscorer, with 228 goals in all competitions.[247]
Arsenal's tally of 13 League Championships is the third highest in English football, after Manchester United (20) and Liverpool (19),[248]
and they were the first club to reach a seventh and an eighth League Championship. As of June 2020, they are one of seven teams, the others being Manchester United, Blackburn Rovers, Chelsea, Manchester City, Leicester City and Liverpool, to have won the Premier League since its formation in 1992.[249]
They hold the highest number of FA Cup trophies, with 14.[250] The club is one of only six clubs to have won the FA Cup twice in succession, in 2002 and 2003, and 2014 and 2015.[251]
Arsenal have achieved three League and FA Cup "Doubles" (in 1971, 1998 and 2002), a feat only previously achieved by Manchester Uni

# BM25 search

In [67]:
# Build a BM25 retriever directly from the split documents; no embedding
# model is passed to this constructor.
bm25_retriever = BM25Retriever.from_documents(docs)

In [74]:
# Ask the BM25 retriever the same `query` that was used for semantic search.
# NOTE(review): this rebinds `top_docs`, replacing the semantic-search results.
top_docs = bm25_retriever.invoke(query)
top_docs

[Document(page_content='Executive Vice-chair\n\nTim Lewis\n\n\nDirector\n\nLord Harris of Peckham\n\nStatistics and records\nFurther information: List of Arsenal F.C. records and statistics\nThierry Henry is Arsenal\'s record goalscorer, with 228 goals in all competitions.[247]\nArsenal\'s tally of 13 League Championships is the third highest in English football, after Manchester United (20) and Liverpool (19),[248]\nand they were the first club to reach a seventh and an eighth League Championship. As of June 2020, they are one of seven teams, the others being Manchester United, Blackburn Rovers, Chelsea, Manchester City, Leicester City and Liverpool, to have won the Premier League since its formation in 1992.[249]\nThey hold the highest number of FA Cup trophies, with 14.[250] The club is one of only six clubs to have won the FA Cup twice in succession, in 2002 and 2003, and 2014 and 2015.[251]\nArsenal have achieved three League and FA Cup "Doubles" (in 1971, 1998 and 2002), a feat o

In [72]:
# Print the text of the first document currently held in `top_docs`.
# NOTE(review): `top_docs` is reassigned by the other retriever cells, so run
# this immediately after the BM25 cell.
print(top_docs[0].page_content)

Executive Vice-chair

Tim Lewis


Director

Lord Harris of Peckham

Statistics and records
Further information: List of Arsenal F.C. records and statistics
Thierry Henry is Arsenal's record goalscorer, with 228 goals in all competitions.[247]
Arsenal's tally of 13 League Championships is the third highest in English football, after Manchester United (20) and Liverpool (19),[248]
and they were the first club to reach a seventh and an eighth League Championship. As of June 2020, they are one of seven teams, the others being Manchester United, Blackburn Rovers, Chelsea, Manchester City, Leicester City and Liverpool, to have won the Premier League since its formation in 1992.[249]
They hold the highest number of FA Cup trophies, with 14.[250] The club is one of only six clubs to have won the FA Cup twice in succession, in 2002 and 2003, and 2014 and 2015.[251]
Arsenal have achieved three League and FA Cup "Doubles" (in 1971, 1998 and 2002), a feat only previously achieved by Manchester Uni

# Hybrid search

In [77]:
# Build the hybrid retriever from the two retrievers defined earlier.
# Each weight is listed in the same position as the retriever it applies to.
ensemble_retriever = EnsembleRetriever(
    retrievers=[
        bm25_retriever,      # BM25 retriever built from `docs`
        semantic_retriever,  # FAISS-backed retriever
    ],
    weights=[
        0.4,  # bm25_retriever
        0.6,  # semantic_retriever
    ],
)

# Run the shared query through the combined retriever and display the hits.
top_docs = ensemble_retriever.invoke(query)
top_docs

[Document(page_content='Executive Vice-chair\n\nTim Lewis\n\n\nDirector\n\nLord Harris of Peckham\n\nStatistics and records\nFurther information: List of Arsenal F.C. records and statistics\nThierry Henry is Arsenal\'s record goalscorer, with 228 goals in all competitions.[247]\nArsenal\'s tally of 13 League Championships is the third highest in English football, after Manchester United (20) and Liverpool (19),[248]\nand they were the first club to reach a seventh and an eighth League Championship. As of June 2020, they are one of seven teams, the others being Manchester United, Blackburn Rovers, Chelsea, Manchester City, Leicester City and Liverpool, to have won the Premier League since its formation in 1992.[249]\nThey hold the highest number of FA Cup trophies, with 14.[250] The club is one of only six clubs to have won the FA Cup twice in succession, in 2002 and 2003, and 2014 and 2015.[251]\nArsenal have achieved three League and FA Cup "Doubles" (in 1971, 1998 and 2002), a feat o