In [1]:
!wget https://huggingface.co/ngxson/demo_simple_rag_py/resolve/main/cat-facts.txt

--2025-04-18 19:43:22--  https://huggingface.co/ngxson/demo_simple_rag_py/resolve/main/cat-facts.txt
Resolving huggingface.co (huggingface.co)... 13.35.202.34, 13.35.202.97, 13.35.202.40, ...
Connecting to huggingface.co (huggingface.co)|13.35.202.34|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22657 (22K) [text/plain]
Saving to: ‘cat-facts.txt.1’


2025-04-18 19:43:22 (148 MB/s) - ‘cat-facts.txt.1’ saved [22657/22657]



In [2]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import torch
import torch.nn.functional as F
import numpy as np

# Model names
EMBEDDING_MODEL_NAME = 'BAAI/bge-large-zh-v1.5'
LANGUAGE_MODEL_NAME = 'HuggingFaceTB/SmolLM2-1.7B-Instruct'

# Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(LANGUAGE_MODEL_NAME)
language_model = AutoModelForCausalLM.from_pretrained(LANGUAGE_MODEL_NAME).to(device)

In [3]:
# Retrieval
def retrieve(query, top_n=3):
    query_embedding = torch.tensor(
        embedding_model.encode(query, normalize_embeddings=True), dtype=torch.float32
    )
    similarities = F.cosine_similarity(VECTOR_DB, query_embedding.unsqueeze(0))
    top_indices = similarities.topk(top_n).indices.tolist()
    return [(CHUNKS[i], similarities[i].item()) for i in top_indices]

In [4]:
# Load dataset
with open('cat-facts.txt', 'r') as file:
    dataset = [line.strip() for line in file if line.strip()]
    print(f'Loaded {len(dataset)} entries')

# Build vector DB
CHUNKS = []
embeddings = []

print("Encoding chunks:")
for chunk in tqdm(dataset):
    embedding = embedding_model.encode(chunk, normalize_embeddings=True)
    embeddings.append(embedding)
    CHUNKS.append(chunk)

VECTOR_DB = torch.tensor(np.array(embeddings), dtype=torch.float32)

Loaded 150 entries
Encoding chunks:


100%|██████████| 150/150 [00:04<00:00, 34.19it/s]


In [5]:
# Chat
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve(input_query)

print('Retrieved knowledge:')
for chunk, similarity in retrieved_knowledge:
    print(f' - (similarity: {similarity:.2f}) {chunk}')

context = '\n'.join([f'- {chunk}' for chunk, _ in retrieved_knowledge])
system_prompt = "You are a helpful chatbot.\nUse only the following context to answer the user's question:\n" + context

# Chat-style formatting
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": input_query},
]

input_text = tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate
outputs = language_model.generate(
    inputs,
    max_new_tokens=300,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print('\nChatbot response:\n')
print(response.split(input_query)[-1].strip())

Ask me a question: How many years do cats live?


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Retrieved knowledge:
 - (similarity: 0.72) When well treated, a cat can live twenty or more years but the average life span of a domestic cat is 14 years.
 - (similarity: 0.68) On average, cats spend 2/3 of every day sleeping. That means a nine-year-old cat has been awake for only three years of its life.
 - (similarity: 0.68) Neutering a cat extends its life span by two or three years.

Chatbot response:

assistant
The average life span of a domestic cat is 14 years. However, the life span of a cat can vary depending on factors such as the cat's breed, lifestyle, and overall health.


In [6]:
system_prompt = "You are a helpful chatbot.\nUse only the following context to answer the user's question, do not add any additional information, only constrain yourself to those provided:\n" + context

# Chat-style formatting
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": input_query},
]

input_text = tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate
outputs = language_model.generate(
    inputs,
    max_new_tokens=300,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print('\nChatbot response:\n')
print(response.split(input_query)[-1].strip())


Chatbot response:

assistant
On average, a cat can live between 14 to 20 years, but the average life span of a domestic cat is 14 years.


In [7]:
system_prompt = "You are a helpful chatbot.\nUse only the following context to answer the user's question, do not add any additional information, only constrain yourself to those provided, but be reasonable:\n" + context

# Chat-style formatting
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": input_query},
]

input_text = tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate
outputs = language_model.generate(
    inputs,
    max_new_tokens=300,
    do_sample=False
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print('\nChatbot response:\n')
print(response.split(input_query)[-1].strip())


Chatbot response:

assistant
On average, cats live 14 years.
