In [1]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from transformers import pipeline
from typing import List, Dict

# Sentence-embedding model used to vectorise incoming questions.
# NOTE(review): presumably the same model that produced the vectors stored in
# the "complaints" collection — confirm against the indexing notebook, since a
# mismatch would make similarity search meaningless.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Instantiated once at cell level so every retrieve_chunks() call reuses it.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)


# -----------------------------
# Load Vector Store
# -----------------------------
def load_vector_store(
    persist_dir: str = "vector_store",
    collection_name: str = "complaints",
):
    """Open the on-disk Chroma store and return one of its collections.

    Args:
        persist_dir: Directory in which the Chroma database was persisted.
        collection_name: Name of the collection to fetch. Defaults to
            "complaints".

    Returns:
        The Chroma collection object returned by ``client.get_collection``.

    Raises:
        Whatever ``get_collection`` raises when the collection does not exist
        in ``persist_dir``.
    """
    # Passing only `persist_directory` through Settings yields an in-memory
    # client, so the persisted collection would never be found. Chroma >= 0.4
    # exposes PersistentClient for on-disk stores; older releases need the
    # duckdb+parquet implementation selected explicitly.
    if hasattr(chromadb, "PersistentClient"):
        client = chromadb.PersistentClient(path=persist_dir)
    else:
        client = chromadb.Client(
            Settings(
                chroma_db_impl="duckdb+parquet",
                persist_directory=persist_dir,
            )
        )
    return client.get_collection(name=collection_name)


# -----------------------------
# Retriever
# -----------------------------
def retrieve_chunks(
    question: str,
    collection,
    k: int = 5
) -> List[Dict]:
    """Return the ``k`` stored chunks closest to ``question``.

    The question is embedded with the module-level ``embedding_model`` and the
    resulting vector is sent to ``collection.query``.

    Args:
        question: Natural-language query text.
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``.
        k: Number of results requested from the collection.

    Returns:
        A list of dicts, each with a ``"text"`` key (the stored document) and a
        ``"metadata"`` key (its associated metadata), in the order returned by
        the collection.
    """
    question_vector = embedding_model.encode(question).tolist()
    hits = collection.query(query_embeddings=[question_vector], n_results=k)

    # Only one query vector is submitted, so the results of interest sit at
    # index 0 of each per-query list.
    return [
        {"text": document, "metadata": hits["metadatas"][0][position]}
        for position, document in enumerate(hits["documents"][0])
    ]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [2]:
# Prompt sent to the LLM. It is filled in with str.format(), so the only
# placeholders are {context} (the retrieved complaint text) and {question};
# any literal braces added to this template later must be doubled.
# The template deliberately ends with "Answer:" so the model's completion
# continues from that marker.
PROMPT_TEMPLATE = """
You are a financial analyst assistant for CrediTrust Financial.

Your task is to answer questions about customer complaints using ONLY the information
provided in the context below. Do not use prior knowledge.
If the context does not contain enough information, say:
"I do not have enough information from customer complaints to answer this question."

Context:
{context}

Question:
{question}

Answer:
"""


In [3]:
# -----------------------------
# Load LLM
# -----------------------------
# Text-generation pipeline shared by every generate_answer() call.
# `do_sample=True` is required for `temperature` to have any effect: without
# it transformers decodes greedily, ignores the temperature and emits a
# warning about the unused sampling flag.
generator = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.2",
    max_new_tokens=300,
    do_sample=True,
    temperature=0.3
)


def generate_answer(question: str, retrieved_chunks: List[Dict]) -> str:
    """Generate an answer to ``question`` grounded in the retrieved chunks.

    Args:
        question: The user's question.
        retrieved_chunks: Dicts that each carry the chunk text under the
            ``"text"`` key; the texts are joined with blank lines to form the
            prompt context.

    Returns:
        The model's reply with the echoed prompt removed and surrounding
        whitespace stripped.
    """
    context = "\n\n".join(chunk["text"] for chunk in retrieved_chunks)
    prompt = PROMPT_TEMPLATE.format(context=context, question=question)

    response = generator(prompt)[0]["generated_text"]

    # The pipeline echoes the prompt ahead of the completion. Remove it by
    # prefix rather than by splitting on "Answer:", otherwise a reply that
    # itself contains "Answer:" would be cut down to its last fragment.
    if response.startswith(prompt):
        return response[len(prompt):].strip()

    # Fallback when the prompt is not echoed verbatim: keep the original
    # marker-based extraction.
    return response.split("Answer:")[-1].strip()


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Device set to use cpu


In [4]:
def rag_pipeline(question: str, k: int = 5, collection=None):
    """Answer ``question`` end to end: retrieve chunks, then generate.

    Args:
        question: The user's question.
        k: Number of chunks to retrieve.
        collection: Optional, already-opened vector-store collection. When
            omitted, the store is opened via ``load_vector_store()`` on each
            call; pass one in to reuse a single handle across many questions.

    Returns:
        A ``(answer, retrieved_chunks)`` tuple: the generated answer string
        and the list of chunk dicts that were supplied as context.
    """
    if collection is None:
        collection = load_vector_store()

    retrieved_chunks = retrieve_chunks(question, collection, k)
    answer = generate_answer(question, retrieved_chunks)

    return answer, retrieved_chunks