## Build RAG with Milvus

In [1]:
from pymilvus import MilvusClient
from pymilvus import CollectionSchema, FieldSchema, DataType


import pandas as pd
import tqdm as tqdm
import numpy as np


from FlagEmbedding import BGEM3FlagModel


### Dataset Articles

In [37]:
df_articles = pd.read_csv("articles.csv")

In [7]:
# Créer un dictionnaire avec chaque catégorie de code comme clé
partitions = {code: df_articles[df_articles['code'] == code] for code in df_articles['code'].unique()}

# Accéder à une partition spécifique
code_judiciaire_df = partitions['Code Judiciaire']


### Model Embeddings

In [8]:
bge_m3 = BGEM3FlagModel('BAAI/bge-m3',  
                       use_fp16=True, 
                       device='cuda')

embeddings_dim = 1024

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


### Create Collection

In [39]:
articles_collection.drop_collection(
    collection_name="articles_collection"
)

In [40]:
articles_collection = MilvusClient(uri="http://localhost:19530")

collection_name = 'articles_collection'

id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True)
text_field = FieldSchema(name="article", dtype=DataType.VARCHAR, max_length=65535)
reference_field = FieldSchema(name="reference", dtype=DataType.VARCHAR, max_length=1000)
embedding_field = FieldSchema(name="embedding_articles", dtype=DataType.FLOAT16_VECTOR, dim=embeddings_dim)

schema = CollectionSchema(fields=[id_field, text_field, reference_field, embedding_field], description="collection d'articles de loi")

articles_collection.create_collection(collection_name = collection_name, schema=schema)

In [125]:
#  set the indexing
index_params = MilvusClient.prepare_index_params()

# Add an index on the vector field.
index_params.add_index(
    field_name="embedding_articles",
    metric_type="COSINE",
    index_type="FLAT",
    index_name="vector_index",
    params={}   #128 clusters units, 8 cluster search
)

articles_collection.create_index(
    collection_name="articles_collection",
    index_params=index_params,
    sync=False # Whether to wait for index creation to complete before returning. Defaults to True.
)


RPC error: [create_index], <MilvusException: (code=65535, message=CreateIndex failed: at most one distinct index is allowed per field)>, <Time:{'RPC start': '2024-10-21 19:16:08.720584', 'RPC error': '2024-10-21 19:16:08.744994'}>
Failed to create an index on collection: articles_collection


MilvusException: <MilvusException: (code=65535, message=CreateIndex failed: at most one distinct index is allowed per field)>

Partitions creation

In [43]:
import re

def normalize_partition_name(name):
    # Remplacer les espaces et les tirets par des underscores
    name = re.sub(r'\s+|-', '_', name)
    # Supprimer les accents et caractères spéciaux
    name = re.sub(r'[^a-zA-Z0-9_]', '', name)
    return name

# Appliquer cette fonction aux noms des codes
df_articles['normalized_code'] = df_articles['code'].apply(normalize_partition_name)


In [44]:
codes

array(['Code_Bruxellois_de_lAir_du_Climat_et_de_la_Matrise_de_lEnergie',
       'Code_Bruxellois_de_lAmnagement_du_Territoire',
       'Code_Bruxellois_du_Logement', 'Code_Civil', 'Code_Consulaire',
       'Code_Electoral', 'Code_Electoral_Communal_Bruxellois',
       'Code_Ferroviaire', 'Code_Forestier', 'Code_Judiciaire',
       'Code_Pnal', 'Code_Pnal_Militaire', 'Code_Pnal_Social',
       'Code_Rural',
       'Code_Rglementaire_Wallon_de_lAction_sociale_et_de_la_Sant',
       'Code_Wallon_de_lAction_sociale_et_de_la_Sant',
       'Code_Wallon_de_lAgriculture',
       'Code_Wallon_de_lEnseignement_Fondamental_et_de_lEnseignement_Secondaire',
       'Code_Wallon_de_lEnvironnement',
       'Code_Wallon_de_lHabitation_Durable',
       'Code_Wallon_du_Bien_tre_des_animaux',
       'Code_Wallon_du_Dveloppement_Territorial',
       'Code_dInstruction_Criminelle', 'Code_de_Droit_Economique',
       'Code_de_Droit_International_Priv',
       'Code_de_lEau_intgr_au_Code_Wallon_de_lEnvironnem

In [45]:
codes = df_articles['normalized_code'].unique()

for code in codes:
    articles_collection.create_partition(collection_name="articles_collection",partition_name=code)

##### Encode articles texts

In [25]:
# max token length articles:  7800
# max sequence input : 8k  -> not need to chunck 

#encode the articles texts

import numpy as np
from tqdm import tqdm

# Define batch size
batch_size = 8

# List of articles
articles_list = df_articles['article'].tolist()

# Initialize an empty list to store encoded vectors
encoded_vectors = []

# Use tqdm to track the progress of the loop
for i in tqdm(range(0, len(articles_list), batch_size), desc="Encoding articles"):
    # Select batch of articles
    batch_articles = articles_list[i:i + batch_size]
    
    # Encode the batch
    batch_encoded = bge_m3.encode(batch_articles,
                                  batch_size=batch_size,
                                  max_length=8*1024
                                 )["dense_vecs"]
    
    # Append the batch result to the list of encoded vectors
    encoded_vectors.append(batch_encoded)

# Concatenate all the batches into a single array
embed_articles = np.concatenate(encoded_vectors, axis=0)


Encoding articles: 100%|██████████| 2830/2830 [1:32:40<00:00,  1.96s/it]  


In [46]:
def generate_embedding(article):
    embedding = bge_m3.encode([article], batch_size=12, max_length=8*1024)["dense_vecs"]
    return embedding[0]


In [74]:
print(embed_articles.shape)
embed_articles[1]

(22633, 1024)


array([-0.00119  ,  0.04907  , -0.0461   , ..., -0.0003068,  0.01453  ,
       -0.01723  ], dtype=float16)

In [47]:
import json

# List to store embeddings
embeddings_list = []

# Iterate over each row of the dataframe to create embeddings
for _, row in tqdm.tqdm(df_articles.iterrows(), desc="Encoding articles"):
    embeddings_articles = generate_embedding(row['article'])
    embedding_record = {
        "id": row['id'],
        "embedding": embeddings_articles.tolist()
    }
    embeddings_list.append(embedding_record)

# Save the embeddings to a JSON file
with open('embeddings.json', 'w', encoding='utf-8') as f:
    json.dump(embeddings_list, f, indent=4, ensure_ascii=False)


Encoding articles: 22633it [27:15, 13.84it/s]


In [49]:
# Load the embeddings from the JSON file
with open('embeddings.json', 'r', encoding='utf-8') as f:
    loaded_embeddings = json.load(f)

In [52]:
# Create a dictionary to store entities by partition
partitioned_entities = {}

# Iterate over each row of the dataframe to create entities
for _, row in tqdm.tqdm(df_articles.iterrows(), desc="Creating entities with partitions"):
    partition = row['normalized_code']
    
    # Find the corresponding embedding
    embedding = next(e['embedding'] for e in loaded_embeddings if e['id'] == row['id'])
    
    # Create an entity with the embedding
    entity = {
        "id": row['id'],
        "article": row['article'],
        "reference": row['reference'],
        "embedding_articles": np.array(embedding, dtype=np.float16)  #en np.array de float16 pour respecter les exigences de Milvus
    }
    
    # Add entity to the list corresponding to the partition
    if partition not in partitioned_entities:
        partitioned_entities[partition] = []
    partitioned_entities[partition].append(entity)

Creating entities with partitions: 22633it [11:25, 33.01it/s]


In [53]:
for partition, entities in partitioned_entities.items():
    print(f"Insertion des entités pour la partition: {partition}")
    
    try:
        articles_collection.insert(data=entities,collection_name="articles_collection",partition_name=partition)
    except Exception as e:
        print(f"Erreur lors de l'insertion pour la partition {partition}: {e}")


Insertion des entités pour la partition: Code_Bruxellois_de_lAir_du_Climat_et_de_la_Matrise_de_lEnergie
Insertion des entités pour la partition: Code_Bruxellois_de_lAmnagement_du_Territoire
Insertion des entités pour la partition: Code_Bruxellois_du_Logement
Insertion des entités pour la partition: Code_Civil
Insertion des entités pour la partition: Code_Consulaire
Insertion des entités pour la partition: Code_Electoral
Insertion des entités pour la partition: Code_Electoral_Communal_Bruxellois
Insertion des entités pour la partition: Code_Ferroviaire
Insertion des entités pour la partition: Code_Forestier
Insertion des entités pour la partition: Code_Judiciaire
Insertion des entités pour la partition: Code_Pnal
Insertion des entités pour la partition: Code_Pnal_Militaire
Insertion des entités pour la partition: Code_Pnal_Social
Insertion des entités pour la partition: Code_Rural
Insertion des entités pour la partition: Code_Rglementaire_Wallon_de_lAction_sociale_et_de_la_Sant
Insertio

### Teste requete 

In [54]:
res = articles_collection.list_partitions(collection_name="articles_collection")
print(res)

['_default', 'Code_Bruxellois_de_lAir_du_Climat_et_de_la_Matrise_de_lEnergie', 'Code_Bruxellois_de_lAmnagement_du_Territoire', 'Code_Bruxellois_du_Logement', 'Code_Civil', 'Code_Consulaire', 'Code_Electoral', 'Code_Electoral_Communal_Bruxellois', 'Code_Ferroviaire', 'Code_Forestier', 'Code_Judiciaire', 'Code_Pnal', 'Code_Pnal_Militaire', 'Code_Pnal_Social', 'Code_Rural', 'Code_Rglementaire_Wallon_de_lAction_sociale_et_de_la_Sant', 'Code_Wallon_de_lAction_sociale_et_de_la_Sant', 'Code_Wallon_de_lAgriculture', 'Code_Wallon_de_lEnseignement_Fondamental_et_de_lEnseignement_Secondaire', 'Code_Wallon_de_lEnvironnement', 'Code_Wallon_de_lHabitation_Durable', 'Code_Wallon_du_Bien_tre_des_animaux', 'Code_Wallon_du_Dveloppement_Territorial', 'Code_dInstruction_Criminelle', 'Code_de_Droit_Economique', 'Code_de_Droit_International_Priv', 'Code_de_lEau_intgr_au_Code_Wallon_de_lEnvironnement', 'Code_de_la_Dmocratie_Locale_et_de_la_Dcentralisation', 'Code_de_la_Fonction_Publique_Wallonne', 'Code_de_l

In [82]:
query_vectors =  generate_embedding(df_questions['question'].iloc[0])

query_vectors

array([-0.03473,  0.00834, -0.03464, ..., -0.0139 , -0.08057,  0.02516],
      dtype=float16)

In [118]:
all_collections = articles_collection.list_collections()
print(all_collections)


['articles_collection']


In [109]:
# Effectuer une requête uniquement dans une partition spécifique (par exemple, Code Civil)
articles_collection.load_partitions(collection_name = "articles_collection",
                                    partition_names=["Code_du_Bien_tre_au_Travail", "Code_des_Socits_et_des_Associations"])

search_results = articles_collection.search(
    collection_name="articles_collection",
    data = [query_vectors],  # Vecteurs de requête
    partition_names=["Code_du_Bien_tre_au_Travail", "Code_des_Socits_et_des_Associations"],  # Rechercher uniquement dans la partition "Code Civil"
    limit=3,  # Nombre de résultats à retourner
    search_params={"metric_type": "COSINE", "params":{"nlist": 128, "nprobe": 8}},
    output_fields= ['id']
)

In [111]:
# Extract entity ids into a list
entity_ids = [result["entity"]["id"] for result in search_results[0]]
entity_ids

[22233, 22176, 21110]

In [110]:
formatted_result = json.dumps(search_results[0], indent=3, ensure_ascii=False)
print(formatted_result)


[
   {
      "id": 22233,
      "distance": 0.6237211227416992,
      "entity": {
         "id": 22233
      }
   },
   {
      "id": 22176,
      "distance": 0.5701108574867249,
      "entity": {
         "id": 22176
      }
   },
   {
      "id": 21110,
      "distance": 0.5691672563552856,
      "entity": {
         "id": 21110
      }
   }
]


# Test similarity Search

In [58]:
df_questions = pd.read_csv("questions_train.csv")

In [66]:
res[1:]

['Code_Bruxellois_de_lAir_du_Climat_et_de_la_Matrise_de_lEnergie',
 'Code_Bruxellois_de_lAmnagement_du_Territoire',
 'Code_Bruxellois_du_Logement',
 'Code_Civil',
 'Code_Consulaire',
 'Code_Electoral',
 'Code_Electoral_Communal_Bruxellois',
 'Code_Ferroviaire',
 'Code_Forestier',
 'Code_Judiciaire',
 'Code_Pnal',
 'Code_Pnal_Militaire',
 'Code_Pnal_Social',
 'Code_Rural',
 'Code_Rglementaire_Wallon_de_lAction_sociale_et_de_la_Sant',
 'Code_Wallon_de_lAction_sociale_et_de_la_Sant',
 'Code_Wallon_de_lAgriculture',
 'Code_Wallon_de_lEnseignement_Fondamental_et_de_lEnseignement_Secondaire',
 'Code_Wallon_de_lEnvironnement',
 'Code_Wallon_de_lHabitation_Durable',
 'Code_Wallon_du_Bien_tre_des_animaux',
 'Code_Wallon_du_Dveloppement_Territorial',
 'Code_dInstruction_Criminelle',
 'Code_de_Droit_Economique',
 'Code_de_Droit_International_Priv',
 'Code_de_lEau_intgr_au_Code_Wallon_de_lEnvironnement',
 'Code_de_la_Dmocratie_Locale_et_de_la_Dcentralisation',
 'Code_de_la_Fonction_Publique_Wallon

#### compute Metrics

In [112]:
import time
import tqdm

def retrieve_ids(articles_collection, df_questions, metric_type="COSINE", nlist=128, nprobe=8, nbre_items_returned=10):
    # Load specific partitions
    partitions = res[1:]  # All partitions except 'default'
    articles_collection.load_partitions(
        collection_name="articles_collection",
        partition_names=partitions
    )
    
    # Initialize variables for metrics
    all_predictions = []
    total_time = 0
    
    # Perform the search for each question
    for idx, question in tqdm.tqdm(enumerate(df_questions['question']), desc="Performing Similarity Search..."):
        query_vector = generate_embedding(question)  # Generate the query vector for the question
        start_time = time.time()  # Start timing the search
        
        # Perform the search on the articles collection
        search_results = articles_collection.search(
            collection_name="articles_collection",
            data=[query_vector],  # Query vectors
            partition_names=partitions,  # Specific partitions
            limit=nbre_items_returned,  # Number of results to return
            search_params={"metric_type": metric_type, "params": {"nlist": nlist, "nprobe": nprobe}},  # Search parameters
            output_fields=['id']  # Output fields (ids in this case)
        )
        
        total_time += time.time() - start_time  # Calculate time taken for the search
        
        # Extract ids from the search results
        result_ids = [result['entity']['id'] for result in search_results[0]]  # Extract 'id' from the 'entity' field
        all_predictions.append(result_ids)  # Store all the returned ids for each query
    
    avg_time_per_query = total_time / len(df_questions['question'])  # Calculate the average time per query
    
    # Return the predictions (all ids) and average time per query
    return {"predictions": all_predictions, "avg_time_per_query": avg_time_per_query}


In [123]:
all_predictions =retrieve_ids(articles_collection, df_questions, metric_type="COSINE", nlist=65536, nprobe=128, nbre_items_returned=10)

Performing Similarity Search...: 0it [00:00, ?it/s]

Performing Similarity Search...: 886it [00:40, 21.91it/s]


In [124]:
# save the predictions
with open('predictions_2.json', 'w') as f:
    json.dump(all_predictions, f)

In [115]:
all_predictions

{'predictions': [[22233,
   7024,
   7026,
   18554,
   7008,
   22176,
   21110,
   22177,
   22178,
   22181],
  [8104, 7250, 1073, 7256, 7255, 5729, 10203, 8270, 758, 7252],
  [1094, 1102, 1121, 1112, 1145, 1123, 1125, 1124, 1103, 1111],
  [12153, 12124, 12155, 12012, 12676, 12031, 11258, 10829, 611, 12152],
  [21123, 21119, 21120, 18707, 8629, 8641, 8650, 9157, 10152, 21115],
  [2501, 2500, 838, 13, 30, 19, 212, 4381, 14, 171],
  [2335, 1676, 2334, 1655, 1203, 1670, 16329, 2227, 15884, 1820],
  [1156, 1067, 1423, 1201, 1150, 1060, 5517, 5516, 1234, 7654],
  [16808, 16806, 16985, 16813, 16811, 18348, 17189, 7036, 16812, 7037],
  [752, 859, 856, 684, 611, 2514, 876, 2486, 857, 861],
  [7742, 1001, 5170, 4684, 15757, 5336, 5169, 13500, 5480, 5597],
  [18762, 18768, 18770, 1007, 18767, 2920, 18758, 18769, 18766, 1254],
  [2769, 13117, 2114, 13118, 2117, 13520, 13741, 13119, 19740, 2772],
  [12249, 12268, 12025, 12198, 13045, 12208, 12153, 12156, 12124, 12212],
  [5984, 5985, 5956, 5991