# Semantic similarity and RAG

---

S.Yu. Papulin (papulin.study@yandex.ru)

### Contents

- [Semantic Similarity](#Semantic-Similarity)
    - [Sentence Embedding with BERT](#Sentence-Embedding-with-BERT)
    - [Pretrained for inference](#Pretrained-for-inference)
    - [Sentence Transformer](#Sentence-Transformer)
- [RAG using `llamaIndex`](#RAG-using-llamaIndex)
    - [Basics](#Basics)
    - [Hybrid Search](#Hybrid-Search)
    - [Function Calling](#Function-Calling)
- [Sources](#Sources)

In [None]:
import numpy as np
import tensorflow as tf

from transformers import AutoTokenizer, TFAutoModel

from sklearn.metrics.pairwise import cosine_similarity

## Semantic Similarity

### Sentence Embedding with BERT

In [None]:
# Pretrained model name
CHECKPOINT = "bert-base-uncased"

In [None]:
# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
tokenizer

In [None]:
# ~400MB
model = TFAutoModel.from_pretrained(CHECKPOINT)
model

In [None]:
model.summary()

In [None]:
QUERY = "Sport is your way to be stronger."

SENTS = [
    "Technology drives our world to success.",
    "Regular exercise strengthens the spirit.",
    "Sport is the worst thing that was created.",
    "Recent stock data shows that marker is on the down trend.",
    "London is the capital of Great Britain.",
    "Moscow is the capital of Russia.",
    "Lyon is the capital of France."
]

In [None]:
def tokenize_function(input_text):
    """Tokenize text with the notebook-level ``tokenizer``.

    Args:
        input_text: Text (or list of texts) forwarded unchanged to the
            tokenizer.

    Returns:
        Whatever the tokenizer returns when asked for TensorFlow tensors
        (``return_tensors='tf'``), with every sequence padded or truncated
        to exactly 60 tokens.
    """
    # Fixed-length encoding so query and document batches share one shape.
    encode_options = {
        "padding": 'max_length',
        "max_length": 60,
        "truncation": True,
        "return_tensors": 'tf',
    }
    return tokenizer(input_text, **encode_options)

In [None]:
q_ids = tokenize_function(QUERY)
q_ids

In [None]:
D_ids = tokenize_function(SENTS)
D_ids

**Pooler Output**

In [None]:
model.predict(q_ids).pooler_output.shape

In [None]:
model.bert(D_ids).pooler_output.shape

In [None]:
def get_embedding_pooler(inputs):
    """Embed text using the model's ``pooler_output``.

    Args:
        inputs: Text (or list of texts) passed to ``tokenize_function``.

    Returns:
        The ``pooler_output`` attribute of ``model.predict`` for the
        tokenized inputs.
    """
    encoded = tokenize_function(inputs)
    prediction = model.predict(encoded)
    return prediction.pooler_output

In [None]:
q_embed = get_embedding_pooler(QUERY)
q_embed.shape

In [None]:
D_embed = get_embedding_pooler(SENTS)
D_embed.shape

In [None]:
sims = cosine_similarity(q_embed, D_embed)
sims

In [None]:
def print_by_similarity(query, docs, similatities):
    """Print documents ranked by descending similarity to a query.

    Args:
        query: The query text, echoed in the header line.
        docs: Sequence of documents, indexable by integer position.
        similatities: One score per document. Accepts a NumPy array, a plain
            Python sequence, or a tensor-like object exposing ``detach()``
            (presumably a torch tensor; it is moved to host memory first).
            The parameter name keeps its original spelling so keyword
            callers continue to work.

    Returns:
        None. Output is written to stdout.
    """
    scores = similatities
    if hasattr(scores, "detach"):
        # Tensor-like input: detach from any graph and copy to CPU memory.
        scores = scores.detach().cpu().numpy()
    # A plain list does not support unary minus, so normalise to an array.
    scores = np.asarray(scores, dtype=float)

    # Stable sort keeps the original document order for tied scores.
    ranking = np.argsort(-scores, kind="stable")

    print(f"Query: {query}")
    print("Results:")
    for rank, index in enumerate(ranking, start=1):
        position = int(index)
        print(f"{rank}. {scores[position]:.3f} -> {docs[position]}")

In [None]:
print_by_similarity(QUERY, SENTS, sims[0])

**Mean Output**

In [None]:
def get_embedding_mean(inputs):
    """Embed text as the plain mean of the last hidden layer.

    Args:
        inputs: Text (or list of texts) passed to ``tokenize_function``.

    Returns:
        A tensor obtained by averaging ``last_hidden_state`` over axis 1.
        No attention mask is applied, so padded positions take part in
        the average.
    """
    encoded = tokenize_function(inputs)
    hidden_states = model.predict(encoded).last_hidden_state
    return tf.reduce_mean(hidden_states, axis=1)

In [None]:
q_embed = get_embedding_mean(QUERY)
D_embed = get_embedding_mean(SENTS)
sims = cosine_similarity(q_embed, D_embed)
print_by_similarity(QUERY, SENTS, sims[0])

**Masked Mean Output**

In [None]:
def apply_masks(outputs, masks):
    """Zero out the positions of ``outputs`` whose mask value is 0.

    Args:
        outputs: Hidden states; assumed to be (batch, seq, hidden) —
            TODO confirm against callers.
        masks: Attention mask; assumed to be (batch, seq) with 0 marking
            positions to drop.

    Returns:
        A tensor shaped like ``outputs`` where masked positions are replaced
        by zeros and all other values are left unchanged.
    """
    # Add a trailing axis so the mask broadcasts across the hidden dimension.
    keep = tf.expand_dims(masks, axis=-1) != 0
    return tf.where(keep, outputs, tf.zeros_like(outputs))


def get_embedding_mean_masked(inputs):
    """Embed text as the mean of the last hidden layer over real tokens only.

    The hidden states of padded positions are zeroed via ``apply_masks`` and
    the sum is divided by the number of unmasked tokens per sequence. The
    earlier version divided by the padded sequence length, which shrank the
    vectors of short sentences; cosine-similarity rankings are unaffected
    because cosine similarity is scale-invariant.

    Args:
        inputs: Text (or list of texts) passed to ``tokenize_function``.

    Returns:
        A tensor with one embedding per input sequence.
    """
    encoded = tokenize_function(inputs)
    hidden_states = model.predict(encoded).last_hidden_state
    masks = encoded['attention_mask']

    summed = tf.reduce_sum(apply_masks(hidden_states, masks), axis=1)
    # Number of real (unmasked) tokens per sequence, kept 2-D for broadcasting.
    token_counts = tf.reduce_sum(
        tf.cast(masks, summed.dtype), axis=1, keepdims=True
    )
    # Guard against division by zero for a fully masked row.
    return summed / tf.maximum(token_counts, 1.0)


In [None]:
q_embed = get_embedding_mean_masked(QUERY)
D_embed = get_embedding_mean_masked(SENTS)
sims = cosine_similarity(q_embed, D_embed)
print_by_similarity(QUERY, SENTS, sims[0])

### Pretrained for inference

In [None]:
# Note: PyTorch must be installed to load PyTorch weights into a TensorFlow model (`from_pt=True`)
# %pip install torch --index-url https://download.pytorch.org/whl/cpu

In [None]:
CHECKPOINT = "textattack/bert-base-uncased-snli"

In [None]:
# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
tokenizer

In [None]:
# ~1GB
model = TFAutoModel.from_pretrained(CHECKPOINT, from_pt=True)
model

In [None]:
# Embedding with pooler
q_embed = get_embedding_pooler(QUERY)
D_embed = get_embedding_pooler(SENTS)
sims = cosine_similarity(q_embed, D_embed)
print_by_similarity(QUERY, SENTS, sims[0])

In [None]:
# Embedding as mean of last hidden layer
q_embed = get_embedding_mean(QUERY)
D_embed = get_embedding_mean(SENTS)
sims = cosine_similarity(q_embed, D_embed)
print_by_similarity(QUERY, SENTS, sims[0])

In [None]:
# Embedding as mean of last hidden layer with mask
q_embed = get_embedding_mean_masked(QUERY)
D_embed = get_embedding_mean_masked(SENTS)
sims = cosine_similarity(q_embed, D_embed)
print_by_similarity(QUERY, SENTS, sims[0])

### Sentence Transformer

In [None]:
# %pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

#### Model `all-MiniLM-L6-v2`

In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")
model

In [None]:
q_embed = model.encode(QUERY)
D_embed = model.encode(SENTS)

In [None]:
sims = model.similarity(q_embed, D_embed)
sims

In [None]:
print_by_similarity(QUERY, SENTS, sims[0])

In [None]:
del model

#### Model `multilingual-e5-large-instruct`

**Similarity**

In [None]:
model = SentenceTransformer("intfloat/multilingual-e5-large-instruct")
model

In [None]:
q_embed = model.encode(QUERY)
D_embed = model.encode(SENTS)
sims = model.similarity(q_embed, D_embed)
print_by_similarity(QUERY, SENTS, sims[0])

**Instruction**

In [None]:
def get_detailed_instruct(task_description, query):
    """Build an instruction-style prompt for an instruct embedding model.

    Args:
        task_description: Text placed after the ``Instruct:`` label.
        query: Text placed after the ``Query:`` label on the second line.

    Returns:
        A two-line string: ``Instruct: <task_description>`` followed by
        ``Query: <query>``.
    """
    template = 'Instruct: {}\nQuery: {}'
    return template.format(task_description, query)

In [None]:
task = 'Given a web search query, retrieve relevant passages that answer the query'

In [None]:
queries = [
    "What makes you stronger?",
    "What is tha capital of Great Britain?",
    "What is tha capital of France?",
    "What is tha capital of Germany?"
]

prompts = [get_detailed_instruct(task, query) for query in queries]

In [None]:
Q_embed = model.encode(prompts)
D_embed = model.encode(SENTS)

In [None]:
sims = model.similarity(Q_embed, D_embed)
sims

In [None]:
print_by_similarity(queries[0], SENTS, sims[0])

In [None]:
print_by_similarity(queries[1], SENTS, sims[1])

In [None]:
print_by_similarity(queries[2], SENTS, sims[2])

In [None]:
print_by_similarity(queries[3], SENTS, sims[3])

## RAG using `llamaIndex`

In [None]:
# %pip install \
# llama-index-core \
# llama-index-readers-file \
# llama-index-readers-string-iterable \
# llama-index-llms-ollama \
# llama-index-embeddings-huggingface \
# llama-index-llms-huggingface \
# llama-index-llms-deepseek \
# llama-index-retrievers-bm25

In [None]:
# Readers
from llama_index.readers.string_iterable import StringIterableReader
from llama_index.core import SimpleDirectoryReader

# Model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.llms.ollama import Ollama

# Index
from llama_index.core import VectorStoreIndex
from llama_index.core.vector_stores import VectorStoreQuery

# Retriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever

<center>
<img src="https://docs.llamaindex.ai/en/stable/_static/getting_started/basic_rag.png" width="70%"/>
</center>

#### Retrieval-Augmented Generation (RAG)

<center>
<img src="https://docs.llamaindex.ai/en/stable/_static/getting_started/stages.png" width="70%"/>
</center>

#### Stages within RAG

### Basics

**Documents**

In [None]:
# load documents
documents = StringIterableReader().load_data(
    texts=SENTS
)
documents[0]

**Embedding and vector storage**

In [None]:
# embedding model (~133M)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
embed_model

In [None]:
# create index
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
    show_progress=True
)
type(index)

⚠️ **Warning.** We use in-memory storage here. For more complex tasks, use a dedicated vector store such as `Qdrant`.

In [None]:
type(index.docstore)

In [None]:
# match document id and node id (chunks of document)
index.docstore.get_all_ref_doc_info()

In [None]:
# some node id
node_id = list(index.docstore.docs.keys())[0]

In [None]:
# node detail
index.docstore.get_node(node_id)

In [None]:
# match nodes and documents
index.vector_store.data.text_id_to_ref_doc_id

In [None]:
# embeddings (384) of document chunk
index.vector_store.data.embedding_dict[node_id][:10]

**Retrieval. Semantic similarity**

In [None]:
# QUERY = "What is tha capital of Great Britain?"
QUERY = "Sport is your way to be stronger."

q_embed = embed_model.get_query_embedding(QUERY)

In [None]:
vector_store_query = VectorStoreQuery(
    query_embedding=q_embed, 
    similarity_top_k=7, 
    mode="default"
)
result = index.vector_store.query(query=vector_store_query)
result

In [None]:
node_texts = [index.docstore.get_node(node_id).text for node_id in result.ids]

print_by_similarity(QUERY, node_texts, np.array(result.similarities))

**Generator with LLM**

In [None]:
def load_tiny_llama_llm(checkpoint="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """Create a ``HuggingFaceLLM`` for a TinyLlama chat checkpoint.

    The original note gives the size as roughly 2.24G.

    Args:
        checkpoint: Model identifier, used for both the model and its
            tokenizer.

    Returns:
        The configured ``HuggingFaceLLM`` instance.
    """
    llm_options = dict(
        model_name=checkpoint,
        tokenizer_name=checkpoint,
        system_prompt="You are a helpful assistant",
        context_window=2048,
    )
    return HuggingFaceLLM(**llm_options)


def load_phi_llm(checkpoint="microsoft/phi-3-mini-4k-instruct"):
    """Create a ``HuggingFaceLLM`` for a Phi-3 instruct checkpoint.

    The original note gives the size as roughly 8G.

    Args:
        checkpoint: Model identifier, used for both the model and its
            tokenizer.

    Returns:
        The configured ``HuggingFaceLLM`` instance, sampling with
        temperature 0.3 / top-p 0.95 and stopping on the tokenizer's
        end-of-sequence token.
    """
    from transformers import AutoTokenizer

    # Optional chat-style counterpart, kept for reference:
    # def message_to_prompt(messages):
    #     return f"<|user|>\n{messages[-1].content}<|end|>\n<|assistant|>"

    def completion_to_prompt(completion):
        # Wrap the raw completion text in the user/assistant tags.
        return "<|user|>\n{}<|end|>\n<|assistant|>".format(completion)

    # The tokenizer is loaded only to look up eos_token_id for the stop list.
    eos_token_id = AutoTokenizer.from_pretrained(
        checkpoint,
        trust_remote_code=True,
    ).eos_token_id

    sampling_options = {
        "temperature": 0.3,
        "do_sample": True,
        "top_p": 0.95,
        "early_stopping": True,
        # "use_cache": False,
    }
    loading_options = {
        "torch_dtype": "auto",
        # "trust_remote_code": True,
        # "low_cpu_mem_usage": True,
    }

    return HuggingFaceLLM(
        model_name=checkpoint,
        tokenizer_name=checkpoint,
        # messages_to_prompt=message_to_prompt,
        completion_to_prompt=completion_to_prompt,
        context_window=2048,
        device_map="auto",
        stopping_ids=[eos_token_id],
        generate_kwargs=sampling_options,
        model_kwargs=loading_options,
    )


def generate_as_stream(llm, prompt):
    """Print a streamed completion to stdout as it arrives.

    Args:
        llm: Object whose ``stream_complete(prompt)`` yields chunks that
            expose a ``delta`` attribute.
        prompt: Prompt forwarded to ``stream_complete``.

    Returns:
        None. Each chunk's ``delta`` is printed without a trailing newline
        and flushed immediately.
    """
    stream = llm.stream_complete(prompt)
    for piece in stream:
        text = piece.delta
        print(text, end="", flush=True)


In [None]:
# llm = load_phi_llm()
llm = load_tiny_llama_llm()
llm.metadata

In [None]:
QUERY = "What is the capital of France?"

# stream output of complete
generate_as_stream(llm, QUERY)

In [None]:
# entire output of complete
response = llm.complete(QUERY)
print(response)

In [None]:
from llama_index.core.llms import ChatMessage

response = llm.chat([ChatMessage(role="user", content=QUERY)])
print(response)

In [None]:
# Option 1
# `RetrieverQueryEngine.from_args` does not forward `similarity_top_k` to the
# retriever, so the number of retrieved nodes is set on the retriever itself.
vector_retriever = index.as_retriever(similarity_top_k=3)
query_engine = RetrieverQueryEngine.from_args(
    retriever=vector_retriever,
    llm=llm,
)

In [None]:
# Option 2 (recommended)
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=3
)

In [None]:
QUERY = "What is a way to be stronger?"
# QUERY = "What is tha capital of Great Britain?"
# QUERY = "What is tha capital of Germany?"
# QUERY = "What is tha capital of France?"
# QUERY = "Do you think the capital of France is Lyon or Paris?"

In [None]:
response = query_engine.query(QUERY)
response

In [None]:
print(response)

### Hybrid Search

In [None]:
from llama_index.core.response.notebook_utils import display_source_node

**Keyword Search using BM25 Retriever**

In [None]:
QUERY = "Do you think the capital of France is Lyon or Paris?"

In [None]:
bm25_retriever = BM25Retriever.from_defaults(
    # index=index, 
    docstore=index.docstore,
    similarity_top_k=5
)

In [None]:
nodes = bm25_retriever.retrieve(QUERY)
nodes[:1]

In [None]:
for node in nodes:
    display_source_node(node, source_length=5000)

In [None]:
# Combine with LLM
# NOTE(review): `similarity_top_k` here looks like it is absorbed by
# `from_args(**kwargs)` and not applied; the retriever's own setting
# (similarity_top_k=5 on `bm25_retriever`) presumably decides how many nodes
# reach the LLM — confirm against the llama-index API.
query_engine = RetrieverQueryEngine.from_args(
    llm=llm,
    retriever=bm25_retriever, 
    similarity_top_k=3
)

In [None]:
print(query_engine.query(QUERY))

**Hybrid**

In [None]:
hybrid_retriever = QueryFusionRetriever(
    llm=llm,
    retrievers=[vector_retriever, bm25_retriever],
    similarity_top_k=3,
    num_queries=1,
    retriever_weights=[0.7, 0.3],
    # mode="reciprocal_rerank"
)

In [None]:
nodes = hybrid_retriever.retrieve(QUERY)
for node in nodes:
    display_source_node(node, source_length=5000)

In [None]:
# Build a query engine on top of the hybrid (vector + BM25) retriever.
# NOTE(review): `similarity_top_k` here looks like it is absorbed by
# `from_args(**kwargs)` and not applied; the value given to
# `hybrid_retriever` presumably controls the number of nodes — confirm.
query_engine = RetrieverQueryEngine.from_args(
    llm=llm,
    retriever=hybrid_retriever, 
    similarity_top_k=3
)

In [None]:
print(query_engine.query(QUERY))

### Function Calling

In [None]:
from llama_index.core.tools import FunctionTool
from llama_index.core.agent.workflow import ReActAgent, FunctionAgent, ToolCallResult
from llama_index.core.workflow import Context

⚠️ **Warning.** Small LLMs are a poor fit for this task: with them, the code below does not call the tools. Use a more capable model.

In [None]:
def load_deepseek_llm():
    """Create a DeepSeek chat LLM client.

    Environment variables are loaded with ``load_dotenv()`` first; the API
    key is then read from ``DEEPSEEK_API_KEY`` (``None`` if it is not set).

    Returns:
        A ``DeepSeek`` instance for the ``deepseek-chat`` model.
    """
    import os

    from dotenv import load_dotenv
    from llama_index.llms.deepseek import DeepSeek

    load_dotenv()
    api_key = os.getenv("DEEPSEEK_API_KEY")

    client_options = {
        "model": "deepseek-chat",
        "api_key": api_key,
        "system_prompt": "You are a helpful assistant.",
    }
    return DeepSeek(**client_options)

In [None]:
llm = load_deepseek_llm()
llm

In [None]:
llm.metadata

**Basics**

In [None]:
def get_weather(region: str) -> int:
    """Return the weather reading for ``region``.

    This is a stub for the tool-calling demo: ``region`` is not consulted
    and the same value (20) is returned for every call.
    """
    stub_reading = 20
    return stub_reading

In [None]:
weather_tool = FunctionTool.from_defaults(
    fn=get_weather,
    name="get_weather",
    description="Get weather condition in specified region",
)

`ReActAgent`

In [None]:
agent = ReActAgent(
    tools=[weather_tool], 
    llm=llm
)

# Create a context to store the conversation history/session state
# ctx = Context(agent)

In [None]:
response = await agent.run("What is the weather in Paris?")
response

In [None]:
print(response)

In [None]:
print(response.tool_calls)

`FunctionAgent`

In [None]:
llm.is_function_calling_model

In [None]:
agent = FunctionAgent(
    tools=[weather_tool],
    llm=llm
)

In [None]:
response = await agent.run("What is the weather in Paris?")
response

In [None]:
print(response)

**Combining with search**

In [None]:
def search(query: str) -> str:
    """Answer ``query`` with the notebook-level ``query_engine``.

    Args:
        query: Question forwarded to ``query_engine.query``.

    Returns:
        The engine's response converted to ``str``.
    """
    answer = query_engine.query(query)
    return str(answer)

In [None]:
search_tool = FunctionTool.from_defaults(
    fn=search,
    name="search",
    description="Search for information based on user predefined context. Represent the query as a question."
)

In [None]:
agent = ReActAgent(tools=[weather_tool, search_tool], llm=llm)

In [None]:
QUERY = "What is the weather in Lyon, and what is the capital of France?"
# QUERY = "Do you think the capital of France is Lyon or Paris?"

In [None]:
response = await agent.run(QUERY)
print(response)

In [None]:
print(response.tool_calls)

## Sources

- [Building an LLM application](https://docs.llamaindex.ai/en/stable/understanding/)
- [Local Embeddings with HuggingFace](https://docs.llamaindex.ai/en/stable/examples/embeddings/huggingface/)
- [Hugging Face LLMs](https://docs.llamaindex.ai/en/stable/examples/llm/huggingface/)