In [1]:
import os
import uuid
import zipfile
from getpass import getpass
from typing import List

import chromadb
import gdown
import pandas as pd
from chromadb.config import Settings
from langchain import hub
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain_text_splitters import CharacterTextSplitter

This is a storybook-style notebook whose purpose is to present one way of creating a custom RAG (Retrieval-Augmented Generation) framework. The code from this notebook is also present in the Python files used by the llm_app. Here, you can follow the thought process behind creating the RAG more clearly.

### Data preparation

I chose the data from the following site: [Blog Corpus](https://www.kaggle.com/datasets/rtatman/blog-authorship-corpus/data). <br>
This is a dataset of different blogs written on or before 2004, with each blog being the work of a single user. <br>
The dataset originally contains 681,288 posts, but for the LLM I took only 10,000 posts because of computational resource limitations.

I stored this filtered dataset on google drive here: [Blog Corpus Filtered](https://drive.google.com/file/d/1KB6gCv2aTc1DOBF1RoEVhqFHfBL4_ZMn/view?usp=sharing). <br>
Now, we can download and extract the data using this function:

In [12]:
def download_and_extract_data(
    url: str = 'https://drive.google.com/uc?id=1KB6gCv2aTc1DOBF1RoEVhqFHfBL4_ZMn',
    archive_path: str = './data.zip',
    extract_dir: str = '.',
) -> None:
    """Download the filtered blog corpus archive and unpack it.

    Args:
        url: Download URL passed to ``gdown.download``.
        archive_path: Local path the zip archive is written to.
        extract_dir: Directory the archive contents are extracted into.

    Raises:
        RuntimeError: If ``gdown.download`` reports no downloaded file.
    """
    downloaded = gdown.download(url, archive_path, quiet=False)
    if downloaded is None:
        # NOTE(review): some gdown releases return None on failure instead of raising;
        # fail here with a clear message rather than with a confusing zipfile error below.
        raise RuntimeError(f'Could not download the dataset archive from {url}')

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)


download_and_extract_data()

In [14]:
# Read the extracted CSV into a DataFrame and preview its first rows
df = pd.read_csv("blogtext_small.csv", sep=",")
df.head()

Unnamed: 0.1,Unnamed: 0,id,gender,age,topic,sign,date,text
0,0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


Next, as is usual in a RAG pipeline, the data is split into chunks:

In [None]:
def load_and_split_documents(df: pd.DataFrame, chunk_size: int = 128, chunk_overlap: int = 32) -> List[Document]:
    """Convert the DataFrame into documents and split them into chunks.

    Args:
        df: DataFrame handed to ``DataFrameLoader``.
        chunk_size: Forwarded to ``CharacterTextSplitter``.
        chunk_overlap: Forwarded to ``CharacterTextSplitter``.

    Returns:
        The documents produced by ``CharacterTextSplitter.split_documents``.

    NOTE(review): ``CharacterTextSplitter`` presumably splits only on its separator,
    so texts that do not contain it may come back longer than ``chunk_size`` —
    confirm against the LangChain text-splitter documentation.
    """
    row_documents = DataFrameLoader(df).load()
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(row_documents)


docs = load_and_split_documents(df=df)

With the data split, we can load it into our vector database (I chose chromadb). To do so, we need to connect to the chromadb Docker container, create or get the desired collection, and fill the collection with our data.

In [2]:
# Connection settings for the chromadb vector store, reused by every cell that creates a client
CHROMADB_HOST = 'host.docker.internal' # chromadb is running as a docker container
CHROMADB_PORT = 8000
# allow_reset=True: the population cell calls client.reset() — presumably this flag is what
# permits that call; confirm against the chromadb documentation.
CHROMADB_SETTINGS = Settings(allow_reset=True)

In [None]:
# Init chromadb client and collection
client = chromadb.HttpClient(host=CHROMADB_HOST, port=CHROMADB_PORT, settings=CHROMADB_SETTINGS)
client.reset()  # resets the database (destructive: everything stored so far is dropped)
collection = client.get_or_create_collection("blog_collection")

# Add the data to the collection in batches instead of one request per document.
# Each batch carries parallel lists: one fresh id, one metadata dict and one text per chunk.
ADD_BATCH_SIZE = 1000
for batch_start in range(0, len(docs), ADD_BATCH_SIZE):
    batch = docs[batch_start:batch_start + ADD_BATCH_SIZE]
    collection.add(
        ids=[str(uuid.uuid1()) for _ in batch],
        metadatas=[doc.metadata for doc in batch],
        documents=[doc.page_content for doc in batch],
    )

The data is now set and stored. We can search it whenever we want.

### LLM response generation

To use an open-source Hugging Face LLM from the cloud, you need to provide your access token.
To get an access token you need to go to Hugging Face website and perform the following:

    1. Register or Login.
    2. Create a User Access or API token in your Hugging Face profile settings.

In the profile settings, under Access Tokens, you should see a token hf_xxxxx (old tokens are api_XXXXXXXX or api_org_XXXXXXX). If you do not submit your API token when sending requests to the API, you will not be able to run inference on your private models.

When you have the token, run the following cell and enter the token in the input field:


In [3]:
# Ask for the token interactively so it never appears in the notebook source,
# then expose it to the rest of the session through the environment.
HUGGINGFACEHUB_API_TOKEN = getpass()
os.environ.update({"HUGGINGFACEHUB_API_TOKEN": HUGGINGFACEHUB_API_TOKEN})

 ········


Chromadb is already populated with the documents. Now, we have to connect to the database to access those documents.

In [4]:
# Re-read the CSV so the example below can be previewed from this section
df = pd.read_csv("blogtext_small.csv", sep=",")
df.head()

Unnamed: 0.1,Unnamed: 0,id,gender,age,topic,sign,date,text
0,0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


Let's take one example from the dataframe and try to examine the LLM performance on information extraction and question answering.

In [5]:
# Show the text of the post at positional index 9 (the example used in the questions below)
df["text"].iloc[9]

"             I surf the English news sites a lot looking for tidbits on Korea and how foreigners (like me) view the 'Hermit Kingdom' but also as a way to keep up with this fast-moving place.  Sometimes, though, one needs to check the veracity of the figures put in the papers...especially the local ones.  Here are two examples of how the English version of the Korea Times and that of the JoongAng Ilbo (Daily).  The first is pretty straightforward.   urlLink Korea Times  said that 249 people were arrested for forging Korean passports, but  urlLink JoongAng Ilbo  says just 114 were accused.  Huh?  Another one:  urlLink JoongAng Ilbo  said that S&P is positive on Korean banks (a good thing), while the  urlLink Korea Times  said that S&P was a tad worried about the bad loans that banks extended to small and medium-sized firms.  I have no idea why the simple facts seem to be presented so differently...it can't simply be translation, can it?         "

Let's ask the model the following question: How many people did rlLink Korea Times say were arrested for forging Korean passports? <br>
From the text we see the answer is 249.

If we want the model to understand the question and the documents, we need to initialize an embedding function. I chose the following model:

In [6]:
%%capture
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

Now we can connect to the chromadb, set its embedding function for retrieval and create the retriever:

In [7]:
# Connect to the chromadb server; unlike the population cell, this does not call client.reset()
client = chromadb.HttpClient(host=CHROMADB_HOST, port=CHROMADB_PORT, settings=CHROMADB_SETTINGS)

In [10]:
# Connect to chromadb and wrap the populated collection as a LangChain vector store
client = chromadb.HttpClient(host=CHROMADB_HOST, port=CHROMADB_PORT, settings=CHROMADB_SETTINGS)
db = Chroma(
    client=client,
    # Must be the collection the population cell fills ("blog_collection"); any other
    # name would point at a collection this notebook never writes to.
    # NOTE(review): the collection is populated through the raw chromadb client, so its
    # stored vectors come from chromadb's own embedding function — confirm it is
    # compatible with `embedding_function` used for queries here.
    collection_name="blog_collection",
    embedding_function=embedding_function,
)

# Retriever with the vector store's default search settings
retriever = db.as_retriever()

Before we generate the LLM's output response, let's see if the retriever works. It should return relevant documents for the question.

In [11]:
# `get_relevant_documents` is deprecated (it emits a deprecation warning); retrievers are
# Runnables, so `invoke` is the supported way to fetch the relevant documents.
retriever.invoke('How many people did rlLink Korea Times say were arrested for forging Korean passports?')

  warn_deprecated(


[Document(metadata={'Unnamed: 0': 9, 'age': 33, 'date': '09,June,2004', 'gender': 'male', 'id': 3581210, 'sign': 'Aquarius', 'topic': 'InvestmentBanking'}, page_content="I surf the English news sites a lot looking for tidbits on Korea and how foreigners (like me) view the 'Hermit Kingdom' but also as a way to keep up with this fast-moving place.  Sometimes, though, one needs to check the veracity of the figures put in the papers...especially the local ones.  Here are two examples of how the English version of the Korea Times and that of the JoongAng Ilbo (Daily).  The first is pretty straightforward.   urlLink Korea Times  said that 249 people were arrested for forging Korean passports, but  urlLink JoongAng Ilbo  says just 114 were accused.  Huh?  Another one:  urlLink JoongAng Ilbo  said that S&P is positive on Korean banks (a good thing), while the  urlLink Korea Times  said that S&P was a tad worried about the bad loans that banks extended to small and medium-sized firms.  I have n

And it does! We see that the first retrieved document is the document we saw earlier (the one at row index 9 in our data).

Now we can perform model inference. For the inference we need:
<ul>
    <li> question </li>
    <li> context </li>
    <li> prompt </li>
    <li> LLM model </li>
</ul>

Question: "How many people did rlLink Korea Times say were arrested for forging Korean passports?" <br>
Context represents the knowledge from the chromadb, i.e. the retriever. <br>
Prompt is the part that combines the question with the context to represent the input to the model. Here we use a popular prompt template which looks like this: <br>
<blockquote>
"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question} 

Context: {context} 

Answer:"
</blockquote> <br>

This is a general prompt template which should work well for our example. You can find more about it here: [rlm/rag-prompt](https://smith.langchain.com/hub/rlm/rag-prompt?organizationId=f8b0bb50-1c01-5bf0-864a-1fcba128b633) <br>
And for the LLM, we use a Hugging Face model <b> mistralai/Mistral-7B-Instruct-v0.2 </b>.

In [12]:
# Fetch the "rlm/rag-prompt" template (the one quoted above) via the LangChain hub
prompt = hub.pull("rlm/rag-prompt")

In [13]:
# Hugging Face repository of the model used for generation
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Endpoint-backed LLM for that repository, sampled with temperature 0.5
llm = HuggingFaceEndpoint(temperature=0.5, repo_id=repo_id)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [14]:
def format_docs(retrieved_docs: List[Document]) -> str:
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt would receive the stringified list of Document
    objects (repr, metadata and all) instead of just their text.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# The question is sent to the retriever (whose documents become the context) and is also
# passed through unchanged as the question; the filled prompt goes to the LLM and the
# output is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("How many people did rlLink Korea Times say were arrested for forging Korean passports?") # 249 is the answer

' The Korea Times reported that 249 people were arrested for forging Korean passports.'

It is also possible to use chat history with the llm. The following code was taken from the [langchain chat history](https://python.langchain.com/v0.1/docs/use_cases/question_answering/chat_history/). <br>
Below I use chat history with my example.

In [15]:
# System prompt asking the LLM to rewrite the latest question so it stands on its own
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message layout: system instructions, then the prior conversation, then the new user input
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Retriever built from the LLM, the base retriever and the contextualizing prompt
history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)

In [17]:
# System prompt for answering; the retrieved documents are substituted for {context}
qa_system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n{context}"
)

# Message layout: system instructions (with context), prior conversation, new user input
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Chain that feeds the retrieved documents into the QA prompt and calls the LLM
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# Full pipeline: history-aware retrieval followed by the answering chain
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [22]:
# Conversation so far, passed to the chain on every call
chat_history = []

question = "How many people did rlLink Korea Times say were arrested for forging Korean passports?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
# Record the reply as an AIMessage: a bare string placed in the history is coerced to a
# human message, which would misattribute the model's answer to the user.
chat_history.extend([HumanMessage(content=question), AIMessage(content=ai_msg_1["answer"])])

print(ai_msg_1["answer"])

# Follow-up that only makes sense with the first exchange available as history
second_question = "Can you just repeat the number?"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print(ai_msg_2["answer"])


Assistant: The Korea Times reported that 249 people were arrested for forging Korean passports.

Assistant: Yes, 249 people were arrested.


We can see that the model gave the same answer (249), even though the second question provided no context about what was being asked other than a reference to the previous question.