# RAG without using Langchain


## Packages loading & import

In [None]:
!pip install langchain
!pip install langchain_community
!pip install langchain_huggingface
!pip install langchain_text_splitters
!pip install langchain_chroma
# !pip install pyserini
!pip install rank-bm25
!pip install huggingface_hub
!pip install tiktoken



In [None]:
import os
import json
import bs4
import nltk
import torch
import pickle
import re
import numpy as np

# from pyserini.index import IndexWriter
# from pyserini.search import SimpleSearcher
from numpy.linalg import norm
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

from langchain_community.llms import Ollama
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import JinaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.docstore.document import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

from tqdm import tqdm

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Fetch the NLTK tokenizer resources. They are presumably what word_tokenize
# (imported above) needs at call time — TODO confirm which one your NLTK version uses.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Hugging face login
- Please request access to the model first: https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
- If you haven't been granted access to this model, you can use another LLM that doesn't require an access request.
- You must save the HF token; otherwise you will need to regenerate the token every time.

In [None]:
from huggingface_hub import login

# Paste your Hugging Face access token here; do not commit a real token.
hf_token = "" # @param {type:"string"}
login(token=hf_token, add_to_git_credential=True)

In [None]:
!huggingface-cli whoami

## Database preparing

### Loading data from website

Other Documents Loader: https://python.langchain.com/docs/integrations/document_loaders/

In [None]:
# Load, chunk and index the contents of the blog.
# Fetch the article and keep only the HTML elements whose CSS class matches
# one of the entries below; the rest of the page is dropped while parsing.
article_url = "https://www.linkedin.com/pulse/deep-dive-retrieval-augmented-generation-rag-working-tejas-bankar-q9erf"
content_filter = bs4.SoupStrainer(
    # class_=("post-content", "post-title", "post-header")
    class_=("core-section-container relative my-3", "article-main__content")
)
loader = WebBaseLoader(
    web_paths=(article_url,),
    bs_kwargs={"parse_only": content_filter},
)
docs = loader.load()

# Only the first loaded document is used in the rest of the notebook.
doc_content = docs[0].page_content
print(doc_content)
print(type(doc_content))


Deep Dive into Retrieval Augmented Generation (RAG) - Architecture & Working of Naive and Advanced RAG Framework.











                      Report this article
                    
    




 





              
          
        Tejas Bankar
      
      
          



 
            
        Tejas Bankar
      
          

              
            AI Consultant @ EY | GenAI | Machine Learning | Deep Learning | NLP | Data Science | Python | Continuous Learner | Ex-TCS
                
            


                
          Published Mar 25, 2024
      
              
 


            + Follow
          





Retrieval Augmented Generation is becoming key framework for industries and GenAI practitioners to built LLM powered applications. It has lot of potential to leverage LLMs optimally and efficiently to build end to end GenAI applications like multi functional chatbots, search engines and many more.



In this article, I have explained in detail about what is Retrieval A

### Preprocessing

In [None]:
# Embedding model and its tokenizer, both loaded by model id.
emb_model_id = "jinaai/jina-embeddings-v2-base-en"
emb_tokenizer = AutoTokenizer.from_pretrained(emb_model_id)
emb_model = AutoModel.from_pretrained(
    emb_model_id,
    trust_remote_code=True,  # trust_remote_code is needed to use the encode method
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

configuration_bert.py:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py:   0%|          | 0.00/97.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/275M [00:00<?, ?B/s]

In [None]:
def data_preprocessing(text):
    """
    Cleans raw scraped text so it can be chunked and embedded.

    Steps, in order: turn newlines into spaces, drop URLs and HTML tags,
    collapse repeated punctuation, drop characters outside a small
    whitelist, then normalise whitespace.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: Cleaned text with single spaces between words and no leading
        or trailing whitespace.
    """
    # Newlines become spaces first so that a tag split over two lines can
    # still be matched by the tag pattern below ('.' does not match '\n').
    text = text.replace("\n", " ")

    # Remove URL,HTML tags
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)

    # Remove excessive punctuation (e.g., "!!!" -> "!")
    text = re.sub(r'([.,!?])\1+', r'\1', text)

    # Remove special characters (keep alphanumeric and basic punctuation)
    text = re.sub(r'[^a-zA-Z0-9\s.,!?\'"-]', '', text)

    # Collapse whitespace last: every removal above can leave a gap behind,
    # so doing this earlier would let double spaces survive.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [None]:
# Clean the scraped article text before it is split into chunks.
doc_content = data_preprocessing(doc_content)

NameError: name 'doc_content' is not defined

In [None]:
# Split the cleaned article into overlapping chunks using the embedding
# model's tokenizer (chunk size 100, overlap 20).
text_splitter = TokenTextSplitter.from_huggingface_tokenizer(emb_tokenizer, chunk_size=100, chunk_overlap=20)
textobj_db = text_splitter.create_documents([doc_content])

# Sanity check: number of chunks and a preview of the first three.
chunk_count = len(textobj_db)
print(chunk_count)
print(textobj_db[:3])

### Build up the database
- Including text_db, dense_db and sparse_db.
- In real-world scenarios, we often deal with databases containing thousands or even millions of chunks. Therefore, efficient saving and loading processes are essential.

In [None]:
# On-disk locations of the three databases built below.
text_db_path = "./text_db.json"  # list of {"id", "text"} records
vector_db_path = "./vector_db.json"  # same records plus their "vector"
bm25_db_path = "./bm25_tokenized_corpus.pkl"  # pickle written by save_bm25_index (the BM25 object itself)

In [None]:
# Build up text database & vector database
# Each chunk gets a sequential integer id. The loop variable is deliberately
# not called `id`, which would shadow the builtin for the rest of the notebook.
text_db = [
    {"id": chunk_id, "text": text_obj.page_content}
    for chunk_id, text_obj in enumerate(textobj_db)
]

# .tolist() converts the embedding into a plain list so json.dump can write it.
vector_db = [
    {
        "id": record["id"],
        "text": record["text"],
        "vector": emb_model.encode(record["text"]).tolist(),
    }
    for record in tqdm(text_db)
]

# Save text_db & vector_db to reuse
with open(text_db_path, "w", encoding="utf-8") as f:
    json.dump(text_db, f)
with open(vector_db_path, "w", encoding="utf-8") as f:
    json.dump(vector_db, f)


In [None]:
def load_text_db(file_path):
    """
    Reads the text database back from a JSON file.

    Args:
        file_path (str): Path of the JSON file to read.

    Returns:
        The decoded JSON content of the file.
    """
    with open(file_path, "r") as db_file:
        return json.loads(db_file.read())

## Retriever
- In this exercise, we only use a hybrid method that combines plain cosine similarity (cos_sim) as the dense retriever with BM25 as the sparse retriever.

### Dense retriever

In [None]:
# Dense
def cos_sim(a, b):
    """
    Computes the cosine similarity of two vectors.

    Args:
        a: First vector (NumPy array or plain sequence of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        norms, or 0.0 when either vector has zero norm.
    """
    # Accept plain lists too (e.g. vectors read straight from a JSON file).
    a = np.asarray(a)
    b = np.asarray(b)

    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction; 0.0 avoids a NaN score, which
        # would make sorting documents by score unreliable.
        return 0.0
    return (a @ b.T) / denominator
def dense_ranker(query, vector_db, model):
    """
    Orders every database entry by cosine similarity to the query.

    Args:
        query: Query passed to ``model.encode``.
        vector_db: Iterable of dicts with "id" and "vector" keys; "text" is
            optional and reported as None when absent.
        model: Object exposing an ``encode`` method.

    Returns:
        list[dict]: One {"id", "text", "score"} dict per entry, highest
        score first.
    """
    # The query is encoded once and compared against every stored vector.
    query_vector = model.encode(query)

    ranked_docs = []
    for entry in vector_db:
        ranked_docs.append({
            "id": entry["id"],
            "text": entry.get("text", None),
            "score": cos_sim(query_vector, entry["vector"]),
        })

    # Best match first.
    ranked_docs.sort(key=lambda item: item["score"], reverse=True)
    return ranked_docs

def load_vector_db(file_path):
    """
    Reads the vector database back from a JSON file.

    Args:
        file_path (str): Path of the JSON file to read.

    Returns:
        list[dict]: One {"id", "text", "vector"} dict per stored record,
        with "vector" converted to a NumPy array.
    """
    with open(file_path, "r") as db_file:
        records = json.load(db_file)

    vector_db = []
    for record in records:
        vector_db.append({
            "id": record["id"],
            "text": record["text"],
            # JSON stores the vector as a list; turn it back into an array.
            "vector": np.array(record["vector"]),
        })
    return vector_db


### Sparse retriever

In [None]:
# Sparse
def preprocess_texts(texts):
    """
    Lower-cases each document and splits it into tokens with word_tokenize.

    Args:
        texts: Iterable of document strings.

    Returns:
        list: One token list per input document, in input order.
    """
    tokenized = []
    for document in texts:
        tokenized.append(word_tokenize(document.lower()))
    return tokenized

def build_bm25_index(text_db):
    """
    Builds a BM25 index over the "text" field of every record.

    Args:
        text_db: Iterable of dicts that each carry a "text" key.

    Returns:
        tuple: (BM25Okapi index, tokenized corpus the index was built from).
    """
    # Tokenize with the same helper that the rest of the sparse pipeline uses.
    tokenized_corpus = preprocess_texts([record["text"] for record in text_db])
    return BM25Okapi(tokenized_corpus), tokenized_corpus

def save_bm25_index(bm25, file_path):
    """
    Pickles ``bm25`` into the file at ``file_path``.

    Args:
        bm25: Object to serialise.
        file_path (str): Destination file, opened in binary write mode.
    """
    with open(file_path, 'wb') as out_file:
        out_file.write(pickle.dumps(bm25))

def load_bm25_index(file_path):
    """
    Unpickles and returns the object stored at ``file_path``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    you created yourself.

    Args:
        file_path (str): Pickle file to read, opened in binary mode.

    Returns:
        The unpickled object.
    """
    with open(file_path, 'rb') as in_file:
        return pickle.loads(in_file.read())

def bm25_ranker(query, bm25, text_db):
    """
    Orders every record of ``text_db`` by its BM25 score for the query.

    Args:
        query (str): Query text; it is lower-cased and tokenized here.
        bm25: Index exposing ``get_scores``.
        text_db: Sequence of dicts with "id" and "text" keys. Scores are
            matched to records by position.

    Returns:
        list[dict]: One {"id", "text", "score"} dict per record, highest
        score first.
    """
    query_tokens = word_tokenize(query.lower())
    scores = bm25.get_scores(query_tokens)

    # scores[position] belongs to the record at the same position in text_db.
    ranked_docs = [
        {"id": doc["id"], "text": doc["text"], "score": scores[position]}
        for position, doc in enumerate(text_db)
    ]
    ranked_docs.sort(key=lambda item: item["score"], reverse=True)
    return ranked_docs


In [None]:
# Build the BM25 index from the text records saved on disk
text_db = load_text_db(text_db_path)
bm25, tokenized_corpus = build_bm25_index(text_db)

# Persist the BM25 object so the retriever can load it later
save_bm25_index(bm25, bm25_db_path)

### Hybrid retriever (RRF)

In [None]:
def hybrid_ranker_rrf(dense_ranked_docs, sparse_ranked_docs, k=60, k_bm=60):
    """
    Combines dense and BM25 ranking results using Reciprocal Rank Fusion (RRF).

    Every document scores ``1 / (k + dense_rank) + 1 / (k_sparse + sparse_rank)``
    with 1-based ranks. A document missing from one list is ranked one place
    after the last entry of that list.

    Args:
        dense_ranked_docs: List of dicts with "id" and "text", best first.
        sparse_ranked_docs: List of dicts with "id" and "text", best first.
        k: RRF constant for the dense ranking.
        k_bm: RRF constant for the sparse ranking. When left at 60 the sparse
            side uses ``k`` instead (existing behaviour, kept for
            compatibility); any other value is used as given.

    Returns:
        list[dict]: One {"id", "text", "score"} dict per distinct id, highest
        score first. Ties keep first-seen order: ids from the dense list,
        then ids that appear only in the sparse list.
    """
    # Create dictionaries for quick look-up of ranks
    dense_ranks = {doc["id"]: rank for rank, doc in enumerate(dense_ranked_docs, start=1)}
    sparse_ranks = {doc["id"]: rank for rank, doc in enumerate(sparse_ranked_docs, start=1)}

    # id -> text, preferring the dense list. Doing the lookup here instead of
    # with next(..., next(...)) avoids evaluating the sparse fallback eagerly,
    # which raised StopIteration for documents present only in the dense list.
    texts = {}
    for doc in (*dense_ranked_docs, *sparse_ranked_docs):
        texts.setdefault(doc["id"], doc["text"])

    # Rank assigned to a document that is absent from one of the lists
    missing_dense_rank = len(dense_ranks) + 1
    missing_sparse_rank = len(sparse_ranks) + 1
    sparse_k = k if k_bm == 60 else k_bm

    # RRF score formula: 1 / (k + rank), summed over both rankings
    rrf_scores = {
        doc_id: (1 / (k + dense_ranks.get(doc_id, missing_dense_rank)))
        + (1 / (sparse_k + sparse_ranks.get(doc_id, missing_sparse_rank)))
        for doc_id in texts
    }

    # sorted() is stable, so equal scores keep the first-seen order of `texts`
    ranked = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)
    return [
        {"id": doc_id, "text": texts[doc_id], "score": rrf_score}
        for doc_id, rrf_score in ranked
    ]

In [None]:
def personal_retriever(query, text_db_path, vector_db_path, bm25_db_path, emb_model, topk=3, k_bm=60):
    """
    Retrieves the best ``topk`` docs for a query with the hybrid (RRF) method.

    Args:
        query: Query text.
        text_db_path: Path passed to ``load_text_db``.
        vector_db_path: Path passed to ``load_vector_db``.
        bm25_db_path: Path passed to ``load_bm25_index``.
        emb_model: Embedding model handed to ``dense_ranker``.
        topk: Number of fused results to return.
        k_bm: Forwarded to ``hybrid_ranker_rrf`` as its ``k_bm`` argument.

    Returns:
        list[dict]: The first ``topk`` entries of the fused ranking.
    """
    # All three databases are read from disk on every call.
    text_db = load_text_db(text_db_path)
    vector_db = load_vector_db(vector_db_path)
    bm25 = load_bm25_index(bm25_db_path)

    # The previous hard-coded `topk = 3` here silently overrode the caller's
    # value; the parameter is now honoured.
    dense_ranked_docs = dense_ranker(query=query, vector_db=vector_db, model=emb_model)
    sparse_ranked_docs = bm25_ranker(query=query, bm25=bm25, text_db=text_db)

    # Fuse both rankings with hybrid_ranker_rrf
    hybrid_ranked_docs = hybrid_ranker_rrf(
        dense_ranked_docs=dense_ranked_docs,
        sparse_ranked_docs=sparse_ranked_docs,
        k_bm=k_bm,
    )
    return hybrid_ranked_docs[:topk]

## Retrieve the reference text from the database based on the query

In [None]:
# Question used for the retrieval and generation cells below.
query = "What is the Self-Reflection?"

In [None]:
# Retrieve the three best-matching chunks for the query
retrieved_docs = personal_retriever(
    query=query,
    text_db_path=text_db_path,
    vector_db_path=vector_db_path,
    bm25_db_path=bm25_db_path,
    emb_model=emb_model,
    topk=3,
)
print(retrieved_docs)


BM25 tokenized corpus loaded from ./bm25_tokenized_corpus.pkl.
[{'id': 39, 'text': ' output. To evaluate RAG we can use metrics like answer relevancy, faithfulness for generation and context recall, precision for retrieval. RAGAs RAG Assessment RAGAs is the one of the framework which is used to evaluate RAG systems. It is simply one shot prompt technique which uses 4 prompt templates for 4 different metrics. For generation part it uses answer relevancy and answer faithfulness as metrics and for retrieval part it uses context precision and context recall as metric. It uses', 'score': 0.031754032258064516}, {'id': 25, 'text': ' 5-10 . Query Transformation Query transformation is a method of improving quality of user query by restructuring it to improve retrieval quality. It includes techniques like, decomposing main query into multiple sub queries, step-back prompting, query rewriting Multi Query Retrieval Sub Query Decomposition If query is complex and having multiple context then, retr

## Reader to generate the answer by RALM

### LLM model loading

In [None]:
# Initialize the Llama-3.2-1B model
llm_model_id = "meta-llama/Llama-3.2-1B-Instruct"

# Tokenizer for the same checkpoint
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
print("Tokenizer loaded successfully.")

# Load options: float16 weights, device placement delegated via device_map="auto"
llm_load_options = {
    "return_dict": True,
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
}
llm_model = AutoModelForCausalLM.from_pretrained(llm_model_id, **llm_load_options)
print("LLM Model initialized successfully.")

Tokenizer loaded successfully.


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

LLM Model initialized successfully.


In [None]:
# Pick the first GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# NOTE(review): the model is loaded with device_map="auto", so it may already
# be placed on a device — confirm that this explicit move is still wanted.
llm_model.to(device)
print(f"LLM Model moved to {device}.")

LLM Model moved to cuda:0.


### Prompt Setting

- System prompts define a model’s role, behavior, and task scope, ensuring
tailored and consistent interactions.
- For the prompt setting example, you could check out the langchain hub website: https://smith.langchain.com/hub.

In [None]:
def format_docs(docs):
    """Joins the "text" field of every doc, separated by a blank line."""
    texts = [doc["text"] for doc in docs]
    return "\n\n".join(texts)

In [None]:
# Prompt wording adapted from the LangChain hub prompt "rlm/rag-prompt"
system_prompt = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
"""

# Per-question template; {question} and {context} are str.format placeholders
user_prompt = """
Question: {question}
Context: {context}
Answer:
"""

# Both parts are concatenated into one plain string, with the retrieved
# chunks (joined by format_docs) inserted as the context.
input_prompt = (system_prompt + user_prompt).format(question=query, context=format_docs(retrieved_docs))
# print(input_prompt)

In [None]:
# Tokenize the prompt to prepare it for model input; return_tensors="pt"
# requests PyTorch tensors.
inputs = llm_tokenizer(input_prompt, return_tensors="pt")

### Generate the Results

In [None]:
# Move the tokenized prompt to the model's device and generate up to 300 new
# tokens; the EOS token id is reused as the padding token id.
model_inputs = inputs.to(device)
outputs = llm_model.generate(
    **model_inputs,
    pad_token_id=llm_tokenizer.eos_token_id,
    max_new_tokens=300,
)

# The whole first sequence is decoded, so the printed text begins with the prompt.
output_text = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Output text:", output_text)

Output text: 
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: What is the Self-Reflection?
Context:  output. To evaluate RAG we can use metrics like answer relevancy, faithfulness for generation and context recall, precision for retrieval. RAGAs RAG Assessment RAGAs is the one of the framework which is used to evaluate RAG systems. It is simply one shot prompt technique which uses 4 prompt templates for 4 different metrics. For generation part it uses answer relevancy and answer faithfulness as metrics and for retrieval part it uses context precision and context recall as metric. It uses

 5-10. Query Transformation Query transformation is a method of improving quality of user query by restructuring it to improve retrieval quality. It includes techniques like, decomposing main query into 

In [None]:
# Baseline: the same question, but with no retrieved context in the prompt
no_rag_template = """
You are an assistant for question-answering tasks. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Answer:
"""
input_prompt = no_rag_template.format(question=query)
inputs = llm_tokenizer(input_prompt, return_tensors="pt")
outputs = llm_model.generate(
    **inputs.to(device),
    pad_token_id=llm_tokenizer.eos_token_id,
    max_new_tokens=300,
)
output_text = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Output text:", output_text)

Output text: 
You are an assistant for question-answering tasks. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: What is the Self-Reflection?
Answer:
I don't know. Self-reflection is a process of examining one's thoughts, feelings, and behaviors to gain insight and understanding. It involves recognizing areas for improvement and developing personal growth.


# Cat-facts - Retriever Evaluation

In [None]:
import json
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import AutoModel
from helper_functions import load_text_db, build_bm25_index, save_bm25_index, personal_retriever

In [None]:
!wget https://huggingface.co/ngxson/demo_simple_rag_py/resolve/main/cat-facts.txt

--2024-12-04 13:48:07--  https://huggingface.co/ngxson/demo_simple_rag_py/resolve/main/cat-facts.txt
Resolving huggingface.co (huggingface.co)... 13.35.210.114, 13.35.210.66, 13.35.210.61, ...
Connecting to huggingface.co (huggingface.co)|13.35.210.114|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22657 (22K) [text/plain]
Saving to: ‘cat-facts.txt.2’


2024-12-04 13:48:07 (320 MB/s) - ‘cat-facts.txt.2’ saved [22657/22657]



In [None]:
# One reference text per line. The file contains typographic quotes
# (’ “ ”), so decode it explicitly as UTF-8 instead of relying on the
# platform default encoding.
with open("cat-facts.txt", "r", encoding="utf-8") as f:
    refs = f.read().splitlines()

In [None]:
# Show the first five reference lines
preview = refs[:5]
for fact in preview:
    print(fact)

On average, cats spend 2/3 of every day sleeping. That means a nine-year-old cat has been awake for only three years of its life.
Unlike dogs, cats do not have a sweet tooth. Scientists believe this is due to a mutation in a key taste receptor.
When a cat chases its prey, it keeps its head level. Dogs and humans bob their heads up and down.
The technical term for a cat’s hairball is a “bezoar.”
A group of cats is called a “clowder.”


In [None]:
# On-disk locations of the cat-facts databases; these rebind the path
# variables used by the retriever calls below.
text_db_path = "./cats_text_db.json"  # list of {"id", "text"} records
vector_db_path = "./cats_vector_db.json"  # same records plus their "vector"
bm25_db_path = "./cats_bm25_tokenized_corpus.pkl"  # pickle written by save_bm25_index

In [None]:
# Embedding model and tokenizer used to encode the cat facts
emb_model_id = "jinaai/jina-embeddings-v2-base-en"
emb_tokenizer = AutoTokenizer.from_pretrained(emb_model_id)
# trust_remote_code is needed to use the encode method
emb_model = AutoModel.from_pretrained(emb_model_id, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Build up text database & vector database
# Each reference line gets a sequential integer id. The loop variable is
# deliberately not called `id`, which would shadow the builtin for the rest
# of the notebook.
text_db = [
    {"id": ref_id, "text": ref_text}
    for ref_id, ref_text in enumerate(refs)
]

# .tolist() converts the embedding into a plain list so json.dump can write it.
vector_db = [
    {
        "id": record["id"],
        "text": record["text"],
        "vector": emb_model.encode(record["text"]).tolist(),
    }
    for record in tqdm(text_db)
]

# Save text_db & vector_db to reuse
with open(text_db_path, "w", encoding="utf-8") as f:
    json.dump(text_db, f)
with open(vector_db_path, "w", encoding="utf-8") as f:
    json.dump(vector_db, f)

100%|██████████| 150/150 [00:32<00:00,  4.60it/s]


In [None]:
# Evaluation questions; queries[i] is paired with golden_chunks[i].
queries = [
    "How much of a day do cats spend sleeping on average?",
    "What is the technical term for a cat's hairball?",
    "What do scientists believe caused cats to lose their sweet tooth?",
    "What is the top speed a cat can travel over short distances?",
    "What is the name of the organ in a cat's mouth that helps it smell?",
    "Which wildcat is considered the ancestor of all domestic cats?",
    "What is the group term for cats?",
    "How many different sounds can cats make?",
    "What is the name of the first cat in space?",
    "How many toes does a cat have on its back paws?"
]
# Expected chunk for each query. The evaluation compares it with the retrieved
# text for exact equality (ignoring case), so every character matters.
golden_chunks = [
    "On average, cats spend 2/3 of every day sleeping. That means a nine-year-old cat has been awake for only three years of its life.",
    "The technical term for a cat’s hairball is a “bezoar.”",
    "Unlike dogs, cats do not have a sweet tooth. Scientists believe this is due to a mutation in a key taste receptor.",
    "A cat can travel at a top speed of approximately 31 mph (49 km) over a short distance.",
    "Besides smelling with their nose, cats can smell with an additional organ called the Jacobson’s organ, located in the upper surface of the mouth.",
    "The ancestor of all domestic cats is the African Wild Cat which still exists today.",
    "A group of cats is called a “clowder.”",
    "Cats make about 100 different sounds. Dogs make only about 10.",
    "The first cat in space was a French cat named Felicette (a.k.a. “Astrocat”) In 1963, France blasted the cat into outer space. Electrodes implanted in her brains sent neurological signals back to Earth. She survived the trip.",
    "Cats have five toes on each front paw, but only four toes on each back paw.",
]


In [None]:
# Build the BM25 index from the cat-facts records saved on disk
text_db = load_text_db(text_db_path)
bm25, tokenized_corpus = build_bm25_index(text_db)

# Persist the BM25 object so the retriever can load it later
save_bm25_index(bm25, bm25_db_path)

In [None]:
# Retrieval quality over all evaluation queries.
recall = 0
precision = 0

for i, query in enumerate(queries):
    retrieved_docs = personal_retriever(
        query=query,
        text_db_path=text_db_path,
        vector_db_path=vector_db_path,
        bm25_db_path=bm25_db_path,
        emb_model=emb_model,
        topk=3,
        k_bm=60,
    )
    golden = golden_chunks[i].lower()
    for j, retrieved_doc in enumerate(retrieved_docs):
        if golden == retrieved_doc["text"].lower():
            recall += 1
            # NOTE: this adds the reciprocal rank of the hit (1, 1/2, 1/3),
            # so the value reported as "Precision@3" is rank-weighted.
            precision += 1/(j+1)
            break

recall_final = recall / len(queries)
precision_final = precision / len(queries)

# Both scores are 0 when no golden chunk is retrieved; report F1 = 0 in that
# case instead of dividing by zero.
score_sum = recall_final + precision_final
f1_final = 2*recall_final*precision_final/score_sum if score_sum else 0.0

print(f"Recall@3: {recall_final}")
print(f"Precision@3: {precision_final}")
print(f"F1 score: {f1_final}")

Recall@3: 1.0
Precision@3: 1.0
F1 score: 1.0
