In [1]:
from langchain.vectorstores import Chroma
from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatOllama
from langchain.schema import SystemMessage,HumanMessage,AIMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import GPT4AllEmbeddings,OllamaEmbeddings
from langchain.chains import RetrievalQA





In [2]:
#Loading Data and creating Vector
loader=WebBaseLoader("https://en.wikipedia.org/wiki/Assam")
data=loader.load()

text_splits=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=300)
all_splits=text_splits.split_documents(data)
print(len(all_splits))
vectorstore = Chroma.from_documents(documents=all_splits,
                                        embedding=OllamaEmbeddings(model="mistral"),persist_directory="./chroma_db")


250


In [3]:
# Creating ChatModel using Ollama
chat_model = ChatOllama(base_url="http://172.17.10.68:11434", model="mistral")

In [4]:
# Prompt making function
def augment_prompt(query: str)->str:
    # get top 3 results from knowledge base. You can play with k to tune the answers, it also depend on how your knowledge was stored.ie Chuck Size etc
    results = vectorstore.similarity_search(query, k=2)  # result also return the source documents as metadata
    # get the text from the results
    source_knowledge = "\n".join([x.page_content for x in results])
    # feed source_knowledge to construct a augmented prompt

    # Can try to play around the initial part prompt
    # Using the contexts below, answer the query. OR
    # Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    augmented_prompt = f"""Use the following pieces of information to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Contexts:
    {source_knowledge}
    Query: {query}"""
    return augmented_prompt

In [5]:
# Initial Conversation (1st query)
messages=[SystemMessage(content="You helpful AI which only answers from information provided")]
#messages=[]
query="Assam is located in which part of India?"
prompt = HumanMessage(
    content=augment_prompt(query)
)
messages.append(prompt)

res = chat_model(messages)
messages.append(res)
res.content


  warn_deprecated(


' Assam is located in the northeastern part of India. It shares its borders with several countries including Bangladesh and Bhutan, as well as Indian states like West Bengal, Bihar, and Arunachal Pradesh.'

In [6]:
# 2nd query
query="What is the population?"
prompt = HumanMessage(
    content=augment_prompt(query)
)
messages.append(prompt)

res = chat_model(messages)
messages.append(res)
res.content

' I cannot provide an answer with the given context as it does not mention the population of Assam or any specific data related to it.'

In [7]:
res

AIMessage(content=' I cannot provide an answer with the given context as it does not mention the population of Assam or any specific data related to it.')

## Only RAG, No Chat

In [8]:
#Loading Data and creating Vector
loader=WebBaseLoader("https://en.wikipedia.org/wiki/Assam")
data=loader.load()

text_splits=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=300)
all_splits=text_splits.split_documents(data)
print(len(all_splits))
vectorstore = Chroma.from_documents(documents=all_splits,
                                        embedding=OllamaEmbeddings(model="mistral"),persist_directory="./chroma_db")


250


In [9]:
# Creating Query Model
query_model = Ollama(base_url="http://172.17.10.68:11434", model="mistral")


# Get a default QA prompt template from langchain hub https://smith.langchain.com/hub
from langchain import hub
QA_CHAIN_PROMPT = hub.pull("rlm/rag-prompt-mistral") #rlm/rag-prompt-mistral "rlm/rag-prompt-llama"

In [10]:
#Setup RetrievalQA from langchain
qa_chain = RetrievalQA.from_chain_type(
        query_model,
        retriever=vectorstore.as_retriever(),
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},

    )

In [11]:
# Ask a question
question = f"Assam is located in which part of India?"
result = qa_chain({"query": question})
result

  warn_deprecated(


{'query': 'Assam is located in which part of India?',
 'result': ' Assam is a region located in northeastern India. Historically, it was home to the Ahom kingdom which was weakened and annexed by external forces due to internal political rivalries. Presently, Assam has significant population of indigenous tribes including the Karbi, and has seen demands for autonomous statehood or even a separate state.'}

In [12]:
result["result"]

' Assam is a region located in northeastern India. Historically, it was home to the Ahom kingdom which was weakened and annexed by external forces due to internal political rivalries. Presently, Assam has significant population of indigenous tribes including the Karbi, and has seen demands for autonomous statehood or even a separate state.'

# Testing ChromaDB

In [14]:
#Loading Data and creating Vector
loader=WebBaseLoader("https://example.com/")
data=loader.load()

text_splits=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=300)
all_splits=text_splits.split_documents(data)
print(len(all_splits))
vectorstore = Chroma.from_documents(documents=all_splits,
                                        embedding=OllamaEmbeddings(model="mistral"), persist_directory="./chroma_db")


1


In [15]:
# Clearing the collections
for collection in vectorstore._client.list_collections():
    ids = collection.get()['ids']
    print('REMOVE %s document(s) from %s collection' % (str(len(ids)), collection.name))
    if len(ids): collection.delete(ids)


REMOVE 751 document(s) from langchain collection
REMOVE 120 document(s) from ff1d0b1a-8a8e-4659-a5f4-4b3c30b516fc collection
REMOVE 191 document(s) from 22235f9b-301e-4371-9abb-e157b1eefaed collection
REMOVE 191 document(s) from 4613a3f1-5073-4388-affc-55881469788a collection
REMOVE 191 document(s) from b48a2be0-f4a0-4df2-ad03-603322775bdd collection
REMOVE 120 document(s) from bbad8923-5a7b-4757-82f1-a2802f223781 collection
REMOVE 191 document(s) from 3c44d672-061a-4225-a017-a74908be1443 collection
REMOVE 641 document(s) from 4f024bb4-0159-4dae-b097-dcdfff168611 collection
REMOVE 98 document(s) from a19eb05c-a40a-48ed-9e1e-9df2acc4a9b1 collection
REMOVE 191 document(s) from c5797a36-0a1e-44eb-98a6-618ded9872e5 collection
REMOVE 213 document(s) from 5470305f-7cd1-43c0-9f4c-cda7164083d0 collection
REMOVE 98 document(s) from 6f2d7985-70f9-47e8-b1e7-b957de26ec7a collection
REMOVE 1 document(s) from aaf8271f-7145-4b7d-9c15-e2777de9ae5d collection


In [16]:
import shutil
def delete_chromba_db():
    directory_path="chroma_db"
    try:
        # Use shutil.rmtree to delete the directory and its contents recursively
        shutil.rmtree(directory_path)
        print(f"Directory '{directory_path}' successfully deleted.")
    except Exception as e:
        print(f"Error deleting directory '{directory_path}': {e}")

In [17]:
from langchain import hub
QA_CHAIN_PROMPT = hub.pull("rlm/rag-prompt-mistral") #rlm/rag-prompt-mistral
QA_CHAIN_PROMPT

PromptTemplate(input_variables=['context', 'question'], template="<s> [INST] You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. [/INST] </s> \n[INST] Question: {question} \nContext: {context} \nAnswer: [/INST]")

## embbeding with Ollamaembedings

In [None]:
#Loading Data and creating Vector
loader=WebBaseLoader("https://en.wikipedia.org/wiki/Assam")
data=loader.load()

text_splits=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=300)
all_splits=text_splits.split_documents(data)
print(len(all_splits))
vectorstore = Chroma.from_documents(documents=all_splits,
                                        embedding=OllamaEmbeddings(model="mistral"), persist_directory="./chroma_db")
