In [None]:
import os

In [None]:
#!pip install langchain_community langchainhub chromadb langchain langchain_openai

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
import json

def scrape_and_cache(url, cache_filepath):
  """Return the documents for ``url``, backed by a JSON cache file.

  If ``cache_filepath`` exists, the documents are rebuilt from it and no
  network request is made. Otherwise the page is loaded with
  ``WebBaseLoader``, written to ``cache_filepath`` as a JSON list of
  ``{"page_content": ..., "metadata": ...}`` objects, and returned.

  Args:
    url: Address of the page to scrape when no cache file exists.
    cache_filepath: Path of the JSON cache file. May be a bare filename
      (cache in the current directory) or include directories, which are
      created on demand.

  Returns:
    A list of ``Document`` objects.
  """
  if os.path.exists(cache_filepath):
    # The cache is always written as UTF-8 (see below), so read it the same
    # way instead of relying on the platform's default encoding.
    with open(cache_filepath, 'r', encoding='utf-8') as f:
      cached_data = json.load(f)
    docs = [
        Document(page_content=item['page_content'], metadata=item['metadata'])
        for item in cached_data
    ]
    print("cached Data is found")
    return docs

  print(f"Caching not found, Scraping from: {url}")
  loader = WebBaseLoader(web_paths=[url])
  docs = loader.load()

  # os.path.dirname() is '' for a bare filename and os.makedirs('') raises,
  # so only create the parent directory when there is one.
  parent_dir = os.path.dirname(cache_filepath)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  serializable_docs = [
      {'page_content': doc.page_content, 'metadata': doc.metadata}
      for doc in docs
  ]

  # ensure_ascii=False emits raw non-ASCII characters, so the file must be
  # opened with an encoding that can represent them on every platform.
  with open(cache_filepath, 'w', encoding='utf-8') as f:
    json.dump(serializable_docs, f, indent=4, ensure_ascii=False)
  print(f"scraped data is cached to: {cache_filepath}")
  return docs


# Page to scrape and the on-disk location of its JSON cache.
web_path = "https://nixos.org/manual/nixpkgs/stable/"
cache_dir = "scraped_data_cache"
cache_file = os.path.join(cache_dir, "nix_docs.json")
# Served from cache_file when it exists; otherwise web_path is scraped and the cache is written.
docs = scrape_and_cache(web_path, cache_file)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    # Chunk size of 1000 with an overlap of 200, both measured by
    # length_function (len, i.e. characters).
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
splits = text_splitter.split_documents(docs)
print("the length of the split is", len(splits))
# Uncomment to print every chunk (verbose):
#for i in splits:
#  print(i)

In [None]:
# Add the chunks to a Chroma vector store, embedding them with OpenAI embeddings.
from langchain_openai import OpenAIEmbeddings
# from langchain_ollama import OllamaEmbeddings
# Import Chroma from langchain_community (already a dependency of this notebook);
# the `langchain.vectorstores` path is a deprecated re-export.
from langchain_community.vectorstores import Chroma

vectorstore_nix = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

In [None]:
# Number of entries in the underlying Chroma collection
# (`_collection` is a private attribute of the vector store wrapper).
print(vectorstore_nix._collection.count())
#print(vectorstore_nix._collection.get())

In [None]:
# Inspect how one stored chunk maps to its OpenAI embedding vector.
# No explicit ids are passed to Chroma.from_documents, so they are generated
# per run; an id copied from an earlier run matches nothing on a fresh kernel.
# Fetch the first stored entry instead of a hard-coded id.
print("collection 1", vectorstore_nix._collection.get(limit=1, include=["embeddings", "documents"]))

In [None]:
# RAG pipeline
# Expose the vector store as a retriever; it is passed to
# create_history_aware_retriever in a later cell.

retriever = vectorstore_nix.as_retriever()

In [None]:
# Augmentation
# Fetch the documents from the vector DB and send them, together with the question,
# to the LLM as context.

# Prompt from the LangChain hub:
# https://smith.langchain.com/hub/rlm/rag-prompt?organizationId=05726ff1-dd0c-4484-9c9c-cc8927681d12

from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [None]:
# Show the prompt template pulled from the hub.
# NOTE(review): `prompt` is only printed in the cells visible here; the chains
# below are built from `contextualize_q_prompt` and `qa_prompt` instead.
print(prompt)

In [None]:
# Set up the LLM
from langchain_openai import ChatOpenAI
# from langchain_ollama import ChatOllama
llm = ChatOpenAI(model="gpt-3.5-turbo") # chat model used by both the question-rewriting and the answering chain below
from langchain_core.runnables import RunnablePassthrough # passes its input through unchanged; NOTE(review): not used in the cells visible here
from langchain_core.output_parsers import StrOutputParser # reduces the LLM output to its text content; NOTE(review): not used in the cells visible here

In [None]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

In [None]:
from langchain.chains import create_history_aware_retriever, create_retrieval_chain

### Contextualize question ###
# System prompt asking the LLM to rewrite the latest user question as a
# standalone question. The trailing backslashes are line continuations, so the
# prompt is a single line at runtime.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

# Message order: system instructions, then the prior turns ("chat_history"),
# then the new user message ("input").
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Retriever built from the LLM, the base retriever and the rewriting prompt.
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain
### Answer question ###
# System prompt for the answering step. The trailing backslashes join the
# instruction lines; the blank line leaves one newline before {context},
# the placeholder for the retrieved documents.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
# Message order: system instructions with context, then the prior turns
# ("chat_history"), then the new user message ("input").
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Chain that combines the LLM with qa_prompt to answer from the supplied documents.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

In [None]:
from langchain_core.chat_history import BaseChatMessageHistory
# Full RAG chain: the history-aware retriever combined with the answering chain.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


### Statefully manage chat history ###
# In-memory map from session_id to that session's ChatMessageHistory;
# it lives only as long as this kernel.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Histories are kept in the module-level ``store`` dict, so repeated calls
    with the same id return the same object.
    """
    try:
        return store[session_id]
    except KeyError:
        history = ChatMessageHistory()
        store[session_id] = history
        return history


# Wrap rag_chain with per-session message history looked up through
# get_session_history. The user message is read from the "input" key, prior
# turns are supplied under "chat_history", and the "answer" key of the result
# is the chain's output message.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [None]:
# `store`, `get_session_history` and `conversational_rag_chain` are defined once,
# in the previous cell (together with `rag_chain`, which this cell needed anyway).
# The verbatim copy that used to live here was removed: re-running it rebound
# `store` to an empty dict, silently discarding every session's chat history,
# and redefined `get_session_history` on top of the earlier definition.

In [None]:
# Ask one question in session "abc123" and print only the answer text.
session_config = {"configurable": {"session_id": "abc123"}}
question = {"input": "what are the best supported platforms?"}
response = conversational_rag_chain.invoke(question, config=session_config)
print(response["answer"])