In [1]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

Defaulting to user installation because normal site-packages is not writeable


In [21]:
import bs4
from langchain import hub
from langchain.text_splitter import  RecursiveCharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.llms import Ollama

In [10]:
loader = WebBaseLoader(
    web_paths = ("https://whc.unesco.org/en/list/120/",),
    bs_kwargs = dict(
        parse_only= bs4.SoupStrainer(
            class_ = ("col-12 col-lg-8 mb-4 mb-lg-0")
        )
    ),
)

In [11]:
docs = loader.load()

In [12]:
docs

[Document(page_content="\n\n\n\n\n Sagarmatha National Park\nSagarmatha is an exceptional area with dramatic mountains, glaciers and deep valleys, dominated by Mount Everest, the highest peak in the world (8,848 m). Several rare species, such as the snow leopard and the lesser panda, are found in the park. The presence of the Sherpas, with their unique culture, adds further interest to this site.\nDescription is available under license CC-BY-SA IGO 3.0\n\n\n\n Parc national de Sagarmatha\nDans un paysage de montagnes grandioses où culmine le plus haut sommet du monde, l'Everest (8 848 m), de glaciers et de vallées profondes, le parc abrite des espèces rares, comme le léopard des neiges et le petit panda. La présence des Sherpas, qui y ont développé une culture originale, ajoute à l'intérêt du site.\nDescription is available under license CC-BY-SA IGO 3.0\n\n\n\nمنتزه ساغارماتا الوطني\nيقع هذا المنتزه في طبيعةٍ تتألَّف من جبالٍ عظيمةٍ حيث تطل أعلى القمم في العالم، قمة الايفيرست (8848 م)

In [13]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 100,
    chunk_overlap = 20
)
splits = text_splitter.split_documents(docs)

In [14]:
embeddings = OllamaEmbeddings(model = "llama2")

In [15]:
embeddings

OllamaEmbeddings(base_url='http://localhost:11434', model='llama2', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)

In [16]:
db = Chroma.from_documents(splits[:1], embedding = embeddings)

In [28]:
splits[:1]

[Document(page_content='Sagarmatha National Park', metadata={'source': 'https://whc.unesco.org/en/list/120/'})]

In [18]:
retriever = db.as_retriever()

In [19]:
prompt = hub.pull("rlm/rag-prompt")

In [22]:
llm = Ollama(model = "llama2")

In [23]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [26]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [29]:
rag_chain.invoke("what is Sagarmatha National Par?")

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


'Sagarmatha National Park, also known as the Everest National Park, is a protected area located in the Himalayas of Nepal. It is home to Mount Everest, the highest peak in the world, and several other mountain peaks. The park was established in 1976 to preserve the natural beauty and biodiversity of the region, and it is known for its stunning landscapes, unique wildlife, and cultural significance.'