# RAG Chat model

## Install dependencies

In [1]:
import IPython
! pip install langchain huggingface_hub langchain_community chroma tiktoken sentence_transformers chromadb langchain_huggingface
# !pip install flash-attn --no-build-isolation
IPython.display.clear_output()

In [2]:
! unzip -q "/content/drive/MyDrive/Colab Notebooks/Chatbot/db.zip"

In [1]:
import os

import torch
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_huggingface import HuggingFacePipeline
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

## Embedding model

In [2]:
# Hugging Face checkpoint name handed to the embedding wrapper below.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Embedding function later passed to the Chroma vector store.
# NOTE(review): the cell output echoes this line, which looks like a warning whose
# text was not captured. Check whether `langchain.embeddings.HuggingFaceEmbeddings`
# is deprecated in favour of the `langchain_huggingface` package.
embeddings = HuggingFaceEmbeddings(model_name=model_name)

  embeddings = HuggingFaceEmbeddings(model_name=model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## LLM model

### Flan-t5

In [3]:
# Checkpoint to load, and the pipeline device index: 0 when CUDA is available, else -1.
model_path = "google/flan-t5-xl"
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Weights are loaded in float16 when running on device 0, float32 otherwise.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if device == 0 else torch.float32,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Text-to-text generation pipeline; each call generates at most 128 new tokens.
text_generator = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    device=device,
)

# Wrap the transformers pipeline so LangChain chains can use it as their LLM.
llm = HuggingFacePipeline(pipeline=text_generator)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Load the vector DB

In [4]:
# Relative path of the persisted vector store: the `chroma_db_with_metadata` folder inside `db`.
persistent_directory = os.path.join("db", "chroma_db_with_metadata")

In [5]:
# Vector store backed by `persistent_directory`, using `embeddings` as its embedding function.
# NOTE(review): the cell output echoes this line, which looks like a warning whose
# text was not captured. Check whether this `Chroma` import path is deprecated.
db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

  db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)


## Create a retriever for querying the vector store
* `search_type` specifies the type of search (e.g., similarity)
* `search_kwargs` contains additional arguments for the search (e.g., number of results to return)

In [6]:
# Similarity-search retriever over the vector store, configured with k=3.
retriever = db.as_retriever(search_type="similarity", search_kwargs=dict(k=3))

## Contextualize question prompt
* This system prompt tells the LLM to use the chat history to reformulate the latest question into a standalone question.

In [7]:
# System instruction for the question-rewriting step: turn the latest user
# message into a standalone question, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question which might "
    "reference context in the chat history, formulate a standalone "
    "question which can be understood without the chat history. "
    "Do NOT answer the question, just reformulate it if needed "
    "and otherwise return it as is."
)

# Prompt for contextualizing questions: the system instruction first, then the
# chat history, then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

## Create a history-aware retriever
* This uses the LLM to help reformulate the question based on chat history

In [8]:

# Retriever that uses the LLM and the contextualizing prompt together with the
# base retriever, so follow-up questions are looked up with chat history in mind.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)


## Test history aware retriever

In [9]:
# Manual smoke test: read one question from the user and run it through the
# history-aware retriever with an empty chat history.
chat_history = []
query = input("You: ")
response = history_aware_retriever.invoke(
    dict(input=query, chat_history=chat_history)
)

You: I also just finishsed romeo and juliet. How did she die?


In [10]:
# Display what the history-aware retriever returned for the test query.
response

[Document(metadata={'source': 'romeo_and_juliet.txt'}, page_content='FRIAR LAWRENCE.\nI hear some noise. Lady, come from that nest\nOf death, contagion, and unnatural sleep.\nA greater power than we can contradict\nHath thwarted our intents. Come, come away.\nThy husband in thy bosom there lies dead;\nAnd Paris too. Come, I’ll dispose of thee\nAmong a sisterhood of holy nuns.\nStay not to question, for the watch is coming.\nCome, go, good Juliet. I dare no longer stay.\n\nJULIET.\nGo, get thee hence, for I will not away.\n\n [_Exit Friar Lawrence._]\n\nWhat’s here? A cup clos’d in my true love’s hand?\nPoison, I see, hath been his timeless end.\nO churl. Drink all, and left no friendly drop\nTo help me after? I will kiss thy lips.\nHaply some poison yet doth hang on them,\nTo make me die with a restorative.\n\n [_Kisses him._]\n\nThy lips are warm!\n\nFIRST WATCH.\n[_Within._] Lead, boy. Which way?\n\nJULIET.\nYea, noise? Then I’ll be brief. O happy dagger.\n\n [_Snatching Romeo’s dagg

## Answer question prompt
* This system prompt tells the LLM to give concise answers based on the retrieved context, and to say so when it does not know the answer.

In [11]:
# System instruction for the answering step. The trailing {context} placeholder
# is where the retrieved context goes.
qa_system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)


## Create a prompt template for answering questions

In [12]:
# Answering prompt: the QA system instruction first, then the chat history,
# then the user's question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

## Create a chain to combine documents for question answering
* `create_stuff_documents_chain` feeds all retrieved context into the LLM

In [13]:
# Chain that passes the retrieved documents to the LLM through the QA prompt.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

## Create a retrieval chain that combines the history-aware retriever and the question answering chain

In [14]:
# Full RAG chain: history-aware retrieval feeding the question-answering chain.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

## Chat loop to simulate a continual conversation

In [15]:
# Interactive chat loop: read a question, run it through the RAG chain together
# with the conversation so far, print the answer, and record both turns.
print("Start chatting with the AI! Type 'exit' to end the conversation.")
chat_history = []  # Conversation so far: alternating HumanMessage / AIMessage turns
while True:
    query = input("You: ")
    # Ignore surrounding whitespace so inputs like "exit " still end the chat.
    if query.strip().lower() == "exit":
        break
    # Nothing to retrieve or answer for a blank line; ask again.
    if not query.strip():
        continue
    # Process the user's query through the retrieval chain
    result = rag_chain.invoke({"input": query, "chat_history": chat_history})
    # Display the AI's response
    print(f"AI: {result['answer']}")
    # Record the model's reply as an AIMessage (assistant turn). Storing it as a
    # SystemMessage would replay earlier answers as system instructions on the next turn.
    chat_history.extend(
        [
            HumanMessage(content=query),
            AIMessage(content=result["answer"]),
        ]
    )

Start chatting with the AI! Type 'exit' to end the conversation.
You: I also just finishsed romeo and juliet. How did she die?


Token indices sequence length is longer than the specified maximum sequence length for this model (993 > 512). Running this sequence through the model will result in indexing errors


AI: stabs herself
You: who is brandon again?
AI: not enough information
You: exit


In [16]:
# !pip install transformers

# import transformers
# from langchain.chains import RetrievalQA
# from langchain.chat_models import ChatOpenAI
# from langchain.document_loaders import TextLoader
# from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.indexes import VectorstoreIndexCreator
# from langchain.llms import HuggingFacePipeline
# from langchain.prompts import PromptTemplate
# from langchain.schema import (
#     HumanMessage,
#     SystemMessage,
# )
# from langchain.vectorstores import FAISS
# from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline


# def create_stuff_documents_chain(llm, qa_prompt):
#     # Load the pre-trained model and tokenizer
#     model_name = "distilbert-base-uncased-distilled-squad"
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForQuestionAnswering.from_pretrained(model_name)

#     # Create the pipeline
#     nlp = pipeline("question-answering", model=model, tokenizer=tokenizer)
#     hf_llm = HuggingFacePipeline(pipeline=nlp)

#     chain = RetrievalQA.from_chain_type(
#         llm=hf_llm,
#         chain_type="stuff",  # Use "stuff" chain type to insert context into the prompt
#         retriever=None,  # We will use history_aware_retriever in the next step
#         chain_type_kwargs={"prompt": qa_prompt},
#     )
#     return chain


# def create_retrieval_chain(retriever, question_answer_chain):
#     chain = RetrievalQA.from_chain_type(
#         llm=question_answer_chain.llm,
#         chain_type="stuff",
#         retriever=retriever,
#         chain_type_kwargs={"prompt": question_answer_chain.prompt},
#     )
#     return chain


# # Assuming qa_prompt and history_aware_retriever are defined elsewhere
# # ... (Your existing code for loading documents, creating retriever, qa_prompt, etc.)

# # Initialize the chat model and create the chains
# llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
# question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
# rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

# print("Start chatting with the AI! Type 'exit' to end the conversation.")
# chat_history = []  # Collect chat history here (a sequence of messages)
# while True:
#     query = input("You: ")
#     if query.lower() == "exit":
#         break
#     # Process the user's query through the retrieval chain

#     # This part needs modification to fit the expected input format
#     # Assuming 'input' is the query and chat_history provides context:
#     result = rag_chain({"query": query, "chat_history": chat_history})
#     # Display the AI's response
#     print(f"AI: {result['result']}")  # Access the result with 'result' key
#     # Update the chat history
#     chat_history.append(HumanMessage(content=query))
