In [4]:
# Nested-loop demo: print each outer value i, followed by every inner value j.
for i in range(5):
    print(i)
    for j in range(5):
        # Nothing follows this print in the loop body, so no explicit
        # `continue` is needed to move on to the next j.
        print("j=", j)

0
j= 0
j= 1
j= 2
j= 3
j= 4
1
j= 0
j= 1
j= 2
j= 3
j= 4
2
j= 0
j= 1
j= 2
j= 3
j= 4
3
j= 0
j= 1
j= 2
j= 3
j= 4
4
j= 0
j= 1
j= 2
j= 3
j= 4


<div style="background-color: #d54f2b;padding: 1em; color: white;">
<b>Part III</b>: Build Retrieval and Generation Pipeline
</div>

- Load the embedding model

In [1]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Load the Hugging Face embedding model identified by "manu/bge-m3-custom-fr".
# It is used below to turn the query text into an embedding vector.
embed_model = HuggingFaceEmbedding(model_name="manu/bge-m3-custom-fr")

-  Generate a Query Embedding

In [2]:
# Alternative question to try:
#query_str = "Que dit le règlement financier de l'UM6P ?"
query_str = "Comment contacter le carrier center ?"
# Encode the question as an embedding vector with the model loaded above.
query_embedding = embed_model.get_query_embedding(query_str)

- Query the Vector Database

In [3]:
from llama_index.core.vector_stores import VectorStoreQuery

# Retrieval mode handed to the vector store query.
# Other values to experiment with: "sparse", "hybrid".
query_mode = "default"

# Bundle the query vector with the search settings; only the two closest
# matches are requested.
vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding,
    similarity_top_k=2,
    mode=query_mode,
)

In [4]:
from llama_index.vector_stores.postgres import PGVectorStore

import os

# DB Parameters
# Each setting is read from an environment variable so that real credentials
# never have to be written into the notebook. The fallbacks are the values of
# the local development database, so the notebook still runs unchanged when
# no variable is set.
db_name = os.environ.get("RAG_DB_NAME", "rag_vector_db")
host = os.environ.get("RAG_DB_HOST", "localhost")
password = os.environ.get("RAG_DB_PASSWORD", "rag_password")
port = os.environ.get("RAG_DB_PORT", "5433")
user = os.environ.get("RAG_DB_USER", "rag_user")

# Connect to the Postgres table that holds the document embeddings.
vector_store = PGVectorStore.from_params(
    database=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
    table_name="rag_paper_fr",
    # NOTE(review): presumably the vector size produced by the embedding
    # model used at indexing time — confirm before changing the model.
    embed_dim=1024,
)

# Run the similarity search prepared in the previous cell.
query_result = vector_store.query(vector_store_query)

In [5]:
# Ids of the nodes returned by the vector store query.
query_result.ids

['0d538711-cf99-46b1-af72-1aff9f98474b',
 'c4e5c5b5-9134-4c25-a84f-76c89ab4aff9']

In [6]:
# Similarity scores of the returned nodes, in the same order as the ids.
query_result.similarities

[0.62223181459782, 0.5314870793513788]

In [7]:
# Full node objects (text and metadata) returned by the query.
query_result.nodes

[TextNode(id_='0d538711-cf99-46b1-af72-1aff9f98474b', embedding=None, metadata={'total_pages': 2, 'file_path': 'documents/UM6P-Phone_contact.pdf', 'source': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='62\nCONTACTS\xa0:\nAdresse email\nTéléphone\nCareer Center\nComplexe sportif \nFacilities\nHealth Center\nHelpdesk informatique\nLanguage Lab\nLearning Center / \nBibliothèque\nMahir center\nBenguerir : Career.center@um6p.ma \nRabat : SCALE\nPole.sport@um6p.ma pour le \ncampus de Benguerir\nPole.sportcr@um6p.ma pour le \ncampus de Rabat\nHébergement : housingrequest@um6p.ma\nRestauration : cateringrequest@um6p.ma\nConsultation à distance\nAstreinte Health Center 7j/7\net 24H/24\nhealth.center@um6p.ma \nBenguérir : helpdesk@um6p.ma \nAstreinte 7j/7 et 24H/24\nIT Support RABAT:\nit-support-rabat@um6p.ma\nlanguagelab@um6p.ma \nBenguérir : lc@um6p.ma\nPortail : https://learningcenter.um6p.ma\nBureaux des aides documentalistes :\nBureau 1\nBure

In [8]:
# Source document of the first returned node.
query_result.nodes[0].metadata['file_path']

'documents/UM6P-Phone_contact.pdf'

- **Augmented generation**

Now that we have retrieved a context that is likely to contain the answer to the query, we need an LLM: the context is inserted into a prompt so that the model generates its response from it. Here we use Llama 2.

In [9]:
from llama_index.llms.llama_cpp import LlamaCPP
# Q4_0 GGUF build of Llama-2-13B-chat, hosted on Hugging Face.
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # No local file is given, so the model is located through model_url.
    model_path=None,
    model_url=model_url,
    # Generation settings: low temperature, answers capped at 256 new tokens.
    temperature=0.1,
    max_new_tokens=256,
    # Kept below the 4096-token context length reported in the model metadata.
    context_window=3900,
    # Extra keyword arguments handed to LlamaCPP for generation and for model
    # set-up (presumably forwarded to llama.cpp — confirm in the LlamaCPP docs).
    generate_kwargs=dict(),
    model_kwargs=dict(n_gpu_layers=1),
    # Print the loader and timing logs.
    verbose=True,
)




llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/genereux/Library/Caches/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:           

- Parse Result into a Set of Nodes

In [10]:
from llama_index.core.schema import NodeWithScore
from typing import Optional

# Pair every retrieved node with its similarity score. When the store
# returned no similarities at all, each node gets a score of None.
similarities = query_result.similarities
nodes_with_scores = [
    NodeWithScore(
        node=node,
        score=None if similarities is None else similarities[position],
    )
    for position, node in enumerate(query_result.nodes)
]

- Put into a Retriever

In [11]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List


class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store.

    The query text is embedded with ``embed_model`` and the resulting vector
    is used to search ``vector_store`` for the closest nodes.

    Args:
        vector_store: Store that is searched on every retrieval.
        embed_model: Object exposing ``get_query_embedding(str)``; used to
            embed the query text.
        query_mode: Mode forwarded to ``VectorStoreQuery``.
        similarity_top_k: Number of nodes requested from the store.
    """

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Keep the store, the embedding model and the search settings."""
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Embed the query, search the store and return scored nodes.

        Args:
            query_bundle: Query whose ``query_str`` is embedded and searched.

        Returns:
            One ``NodeWithScore`` per node returned by the store. The score is
            the matching similarity, or ``None`` when the store returned no
            similarities.
        """
        # Use the collaborators given to the constructor, not whatever
        # notebook-level variables happen to be named `embed_model` and
        # `vector_store`; otherwise the constructor arguments are ignored.
        query_embedding = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = self._vector_store.query(vector_store_query)

        similarities = query_result.similarities
        # Treat a missing node list as "no results" instead of failing.
        nodes = query_result.nodes or []
        return [
            NodeWithScore(
                node=node,
                score=None if similarities is None else similarities[index],
            )
            for index, node in enumerate(nodes)
        ]

In [12]:
# Build the custom retriever around the store and the embedding model,
# asking for the two closest nodes per query.
retriever = VectorDBRetriever(
    vector_store=vector_store,
    embed_model=embed_model,
    query_mode="default",
    similarity_top_k=2,
)

In [13]:
from llama_index.core.query_engine import RetrieverQueryEngine
# Combine the retriever with the LLM: retrieved nodes become the context
# from which the LLM writes the answer.
query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

In [14]:
# Ask the question end to end (retrieval + generation).
# Note: in the recorded run this call took about 18.6 minutes
# (total time = 1118646.41 ms in the llama timings printed below).
query_str = "Comment contacter le carrier center ?"
response = query_engine.query(query_str)


llama_print_timings:        load time =   19076.97 ms
llama_print_timings:      sample time =       3.54 ms /    79 runs   (    0.04 ms per token, 22316.38 tokens per second)
llama_print_timings: prompt eval time =   34495.13 ms /   749 tokens (   46.05 ms per token,    21.71 tokens per second)
llama_print_timings:        eval time = 1083920.35 ms /    78 runs   (13896.41 ms per token,     0.07 tokens per second)
llama_print_timings:       total time = 1118646.41 ms /   827 tokens


In [15]:
# Show the generated answer; print() applies str() to the response itself.
print(response)


Le carrier center est accessible par téléphone au 05 25 07 27 00 et par email à career.center@um6p.ma. Vous pouvez également contacter le helpdesk informatique à l'adresse helpdesk@um6p.ma pour obtenir des informations supplémentaires.


In [20]:
#print(response.source_nodes[0].get_content())

In [None]:
#END