In [1]:
from langchain_community.llms import Ollama
from langchain_chroma import Chroma
from langchain import hub
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.callbacks.manager import CallbackManager
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_core.callbacks import StreamingStdOutCallbackHandler
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

loader = WebBaseLoader("https://en.wikipedia.org/wiki/2023%E2%80%9324_Premier_League")
data = loader.load()
print("Web page loaded successfully")

Web page loaded successfully


In [2]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_documents(data)
print(f"Split {len(splits)} chunks")

Split 84 chunks


In [3]:
print("Storing into ChromaDB...")
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_documents(splits, embedding_function)
print("Done storing into ChromaDB.")

Storing into ChromaDB...


  from .autonotebook import tqdm as notebook_tqdm


Done storing into ChromaDB.


In [4]:
# # query it
# query = "Who won the premier league ?"
# docs = db.similarity_search(query)

# # print results
# print(docs[0].page_content)

In [5]:
# Retrieve and generate using the relevant snippets of the blog.
retriever = db.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

llm = Ollama(
    model="mistral",
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    base_url="http://localhost:11434"
)
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [6]:
rag_chain.invoke("Who won the premier league title in the 2024/2025 season ? The answer MUST be accurate. If you don't know the answer, just says so.")

 Manchester City won the premier league title in the 2023-24 season. [Refer to context: "Manchester City wins record fourth consecutive Premier League title, the first men's team in English league history to achieve this feat".]

' Manchester City won the premier league title in the 2023-24 season. [Refer to context: "Manchester City wins record fourth consecutive Premier League title, the first men\'s team in English league history to achieve this feat".]'