In [8]:
import pandas as pd
import numpy as np
import re
import json
from tqdm.autonotebook import tqdm
from typing import Dict, List, Any


from pymilvus import CollectionSchema, FieldSchema, DataType, MilvusClient

In [2]:
# Initialize Milvus client
client = MilvusClient(uri="http://localhost:19530")

In [3]:
client.list_collections()

['articles_collection_L2',
 'articles_collection_IP',
 'articles_collection_COSINE']

In [4]:
def benchmark_metric_types(df_articles, partitioned_entities, metric_types, embeddings_dim):
    # Dictionary to store the created collections
    collections = {}

    # Iterate over different metric types (L2, IP, COSINE)
    for metric_type in metric_types:
        collection_name = f"articles_collectionPartition_{metric_type}"

        # Check if the collection already exists and drop it if necessary
        if client.has_collection(collection_name):
            print(f"Collection {collection_name} already exists. Dropping the collection...\n")
            client.drop_collection(collection_name)
        
        # Define the fields of the collection
        id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True)
        text_field = FieldSchema(name="article", dtype=DataType.VARCHAR, max_length=65535)
        reference_field = FieldSchema(name="reference", dtype=DataType.VARCHAR, max_length=1000)
        embedding_field = FieldSchema(name="embedding_articles", dtype=DataType.FLOAT16_VECTOR, dim=embeddings_dim)

        # Define the schema
        schema = CollectionSchema(fields=[id_field, text_field, reference_field, embedding_field], description=f"Collection for {metric_type} benchmark")
        
        # Create the collection
        client.create_collection(collection_name=collection_name, schema=schema)
        
        # Create partitions based on unique codes in df_articles
        codes = df_articles['normalized_code'].unique()
        for code in codes:
            client.create_partition(collection_name=collection_name, partition_name=code)
        
        # Insert entities into the partitions before creating the index
        for partition, entities in partitioned_entities.items():
            print(f"Inserting entities into partition: {partition}")
            try:
                client.insert(data=entities, collection_name=collection_name, partition_name=partition)
            except Exception as e:
                print(f"Error during insertion for partition {partition}: {e}")
        
        # Create the index with the metric type (only using FLAT index)
        index_params = MilvusClient.prepare_index_params()
        index_params.add_index(
            field_name="embedding_articles",
            metric_type=metric_type,  # Change only the metric type (L2, IP, COSINE)
            index_type="FLAT",  # Always use FLAT index
            index_name="vector_index",
            params={}  # No additional parameters needed for FLAT index
        )
        
        # Create the index on the collection after inserting the entities
        try:
            client.create_index(
                collection_name=collection_name,
                index_params=index_params,
                sync=True  # Wait for index creation to complete
            )
        except Exception as e:
            print(f"Failed to create an index on collection: {collection_name}")
            print(e)
            continue  # Skip this index and continue with the next one

        # Store the created collection in the dictionary
        collections[metric_type] = collection_name

    print("\n\nBenchmark completed for all metric types.")

    # Return the dictionary containing the collections
    return collections


In [5]:
df_articles = pd.read_csv("articles.csv")


def normalize_partition_name(name):
    # Remplacer les espaces et les tirets par des underscores
    name = re.sub(r'\s+|-', '_', name)
    # Supprimer les accents et caractères spéciaux
    name = re.sub(r'[^a-zA-Z0-9_]', '', name)
    return name

# Appliquer cette fonction aux noms des codes
df_articles['normalized_code'] = df_articles['code'].apply(normalize_partition_name)

In [14]:
# Load the embeddings from the JSON file
with open('embeddings.json', 'r', encoding='utf-8') as f:
    loaded_embeddings = json.load(f)

In [9]:
def load_embeddings_from_file(filepath: str) -> Dict[int, np.ndarray]:
    # Load the npz file
    data = np.load(filepath)
    
    # Convert arrays back to dictionary
    embeddings_dict = {
        int(id_): emb for id_, emb in zip(data['ids'], data['embeddings'])
    }
    
    print(f"Successfully loaded {len(embeddings_dict)} embeddings")
    
    return embeddings_dict
        

In [10]:
embeddings = load_embeddings_from_file('embeddings_bel.npz')


Successfully loaded 22633 embeddings


In [15]:
# Create a dictionary to store entities by partition
partitioned_entities = {}

# Iterate over each row of the dataframe to create entities
for _, row in tqdm(df_articles.iterrows(), desc="Creating entities with partitions"):
    partition = row['normalized_code']
    
    # Find the corresponding embedding
    embedding = next(e['embedding'] for e in loaded_embeddings if e['id'] == row['id'])
    
    # Create an entity with the embedding
    entity = {
        "id": row['id'],
        "article": row['article'],
        "reference": row['reference'],
        "embedding_articles": np.array(embedding, dtype=np.float16)  #en np.array de float16 pour respecter les exigences de Milvus
    }
    
    # Add entity to the list corresponding to the partition
    if partition not in partitioned_entities:
        partitioned_entities[partition] = []
    partitioned_entities[partition].append(entity)

Creating entities with partitions: 22633it [09:43, 38.81it/s]


In [11]:
def process_embeddings_in_chunks_and_partitions(
    df_articles: pd.DataFrame,
    loaded_embeddings: Dict[int, np.ndarray],
    chunk_size: int = 1000
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Process embeddings in chunks and organize them into partitions.
    
    Args:
        df_articles: DataFrame containing article information
        loaded_embeddings: Dictionary mapping article IDs to their embeddings
        chunk_size: Number of rows to process in each chunk
    
    Returns:
        Dictionary of processed entities organized by partitions
    """
    # Initialize dictionary for partitioned entities
    partitioned_entities = {}

    # Calculate total number of chunks
    total_chunks = (len(df_articles) + chunk_size - 1) // chunk_size

    # Process data in chunks with progress bar
    with tqdm(total=len(df_articles), desc="Processing entities") as pbar:
        for chunk_start in range(0, len(df_articles), chunk_size):
            # Get chunk of dataframe
            chunk_end = min(chunk_start + chunk_size, len(df_articles))
            df_chunk = df_articles.iloc[chunk_start:chunk_end]

            # Process chunk
            for _, row in df_chunk.iterrows():
                partition = row['normalized_code']
                embedding = loaded_embeddings.get(row['id'])
                if embedding is not None:
                    entity = {
                        "id": row['id'],
                        "article": row['article'],
                        "reference": row['reference'],
                        "embedding_articles": np.array(embedding, dtype=np.float16)  # np.array of float16 to meet Milvus requirements
                    }
                    if partition not in partitioned_entities:
                        partitioned_entities[partition] = []
                    partitioned_entities[partition].append(entity)

            # Update progress bar
            pbar.update(len(df_chunk))

    print(f"Processed entities into {len(partitioned_entities)} partitions.")
    return partitioned_entities


In [12]:
partitioned_entities = process_embeddings_in_chunks_and_partitions(df_articles, embeddings, chunk_size=100)

Processing entities:   0%|          | 0/22633 [00:00<?, ?it/s]

Processed entities into 34 partitions.


In [13]:
metric_types = ["L2", "IP", "COSINE"]
index_types = ["FLAT"]
embeddings_dim = 1024  

collections = benchmark_metric_types(df_articles, partitioned_entities, metric_types, embeddings_dim)

Inserting entities into partition: Code_Bruxellois_de_lAir_du_Climat_et_de_la_Matrise_de_lEnergie
Inserting entities into partition: Code_Bruxellois_de_lAmnagement_du_Territoire
Inserting entities into partition: Code_Bruxellois_du_Logement
Inserting entities into partition: Code_Civil
Inserting entities into partition: Code_Consulaire
Inserting entities into partition: Code_Electoral
Inserting entities into partition: Code_Electoral_Communal_Bruxellois
Inserting entities into partition: Code_Ferroviaire
Inserting entities into partition: Code_Forestier
Inserting entities into partition: Code_Judiciaire
Inserting entities into partition: Code_Pnal
Inserting entities into partition: Code_Pnal_Militaire
Inserting entities into partition: Code_Pnal_Social
Inserting entities into partition: Code_Rural
Inserting entities into partition: Code_Rglementaire_Wallon_de_lAction_sociale_et_de_la_Sant
Inserting entities into partition: Code_Wallon_de_lAction_sociale_et_de_la_Sant
Inserting entities

In [14]:
collections

{'L2': 'articles_collectionPartition_L2',
 'IP': 'articles_collectionPartition_IP',
 'COSINE': 'articles_collectionPartition_COSINE'}

### Test Search

In [15]:
df_questions = pd.read_csv("questions_train.csv")

In [16]:
from sentence_transformers import SentenceTransformer

model_bel = SentenceTransformer('Lajavaness/bilingual-embedding-large', trust_remote_code=True, device='cuda')

In [25]:
from FlagEmbedding import BGEM3FlagModel

bge_m3 = BGEM3FlagModel('BAAI/bge-m3',  
                       use_fp16=True, 
                       device='cuda')

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


In [26]:
def generate_embedding(article):
    embedding = bge_m3.encode([article], batch_size=12, max_length=8*1024)["dense_vecs"]
    return embedding[0]

In [17]:
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import Union, List
import torch

def generate_embedding(
    article: Union[str, List[str]],
    model: SentenceTransformer,
    batch_size: int = 32,
    max_length: int = 512,
    device: str = None
) -> np.ndarray:

    # Set device
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # Ensure model is on correct device
    model = model.to(device)
    
    # Convert single article to list if necessary
    if isinstance(article, str):
        articles = [article]
    else:
        articles = article
    
    # Generate embeddings
    with torch.no_grad():
        embeddings = model.encode(
            articles,
            batch_size=batch_size,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,  # Normalize for cosine similarity
            max_length=max_length,
            device=device
        )

    # Convert embeddings to float16
    embeddings = embeddings.astype(np.float16)
    
    # Return single embedding if input was single article
    if isinstance(article, str):
        return embeddings[0]
    
    return embeddings

In [18]:
df_questions = pd.read_csv('questions_train.csv')

#concat df_questions['question'] + df_questions['extra_description']

df_questions['complet_question'] = df_questions['question'] + df_questions['extra_description']

In [19]:
query_vector = generate_embedding(
    article= df_questions['complet_question'].iloc[0],
    model=model_bel,
    batch_size=32
)

In [20]:
query_vector

array([-0.03096 , -0.004353,  0.03195 , ..., -0.0366  , -0.02815 ,
       -0.0423  ], dtype=float16)

In [21]:
res = client.list_partitions(collection_name="articles_collectionPartition_L2")
print(res)

['_default', 'Code_Bruxellois_de_lAir_du_Climat_et_de_la_Matrise_de_lEnergie', 'Code_Bruxellois_de_lAmnagement_du_Territoire', 'Code_Bruxellois_du_Logement', 'Code_Civil', 'Code_Consulaire', 'Code_Electoral', 'Code_Electoral_Communal_Bruxellois', 'Code_Ferroviaire', 'Code_Forestier', 'Code_Judiciaire', 'Code_Pnal', 'Code_Pnal_Militaire', 'Code_Pnal_Social', 'Code_Rural', 'Code_Rglementaire_Wallon_de_lAction_sociale_et_de_la_Sant', 'Code_Wallon_de_lAction_sociale_et_de_la_Sant', 'Code_Wallon_de_lAgriculture', 'Code_Wallon_de_lEnseignement_Fondamental_et_de_lEnseignement_Secondaire', 'Code_Wallon_de_lEnvironnement', 'Code_Wallon_de_lHabitation_Durable', 'Code_Wallon_du_Bien_tre_des_animaux', 'Code_Wallon_du_Dveloppement_Territorial', 'Code_dInstruction_Criminelle', 'Code_de_Droit_Economique', 'Code_de_Droit_International_Priv', 'Code_de_lEau_intgr_au_Code_Wallon_de_lEnvironnement', 'Code_de_la_Dmocratie_Locale_et_de_la_Dcentralisation', 'Code_de_la_Fonction_Publique_Wallonne', 'Code_de_l

In [41]:
# Partition sur laquelle effectuer la recherche
# partitions = res[1:]  # Liste des partitions à charger

partitions = ["Code_du_Bien_tre_au_Travail", "Code_des_Socits_et_des_Associations"]

# Vecteur de la question à tester
query_vector = generate_embedding(df_questions['question'].iloc[0])

# Paramètres de recherche uniquement pour FLAT, en variant les métriques
search_params = {
    'L2': {"metric_type": "L2", "params": {}},      # Distance euclidienne
    'COSINE': {"metric_type": "COSINE", "params": {}},  # Distance cosinus
    'IP': {"metric_type": "IP", "params": {}},      # Produit scalaire
}

# Résultats de performance
performance_results = {}

# Boucle pour tester chaque métrique
for metric_type, collection_name in collections.items():
    print(f"Testing collection: {collection_name} with metric type: {metric_type}")

    # Charger les partitions pour la collection
    client.load_partitions(collection_name=collection_name, partition_names=partitions)

    # Récupérer les informations sur l'index
    index_info = client.describe_index(collection_name=collection_name, index_name="vector_index")
    print(f"Index info for {metric_type}: {index_info}")

    # Effectuer la recherche avec la métrique correspondante
    search_results = client.search(
        collection_name=collection_name,
        data=[query_vector],
        partition_names=partitions,
        limit=3,
        search_params=search_params[metric_type],  # Utiliser les paramètres de la métrique en cours
        output_fields=['id', 'reference']
    )

    # Formater et afficher les résultats
    formatted_result = json.dumps(search_results[0], indent=3, ensure_ascii=False)
    print(f"Results for {metric_type}:\n{formatted_result}")
    
    # Enregistrer les résultats dans le dictionnaire
    performance_results[metric_type] = search_results


Testing collection: articles_collection_L2 with metric type: L2
Index info for L2: {'index_type': 'FLAT', 'metric_type': 'L2', 'field_name': 'embedding_articles', 'index_name': 'vector_index', 'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0, 'state': 'Finished'}
Results for L2:
[
   {
      "id": 22233,
      "distance": 0.7523195743560791,
      "entity": {
         "id": 22233,
         "reference": "Art. X.5-9, Code du Bien-être au Travail (Livre X, Titre 5)"
      }
   },
   {
      "id": 22176,
      "distance": 0.8595061302185059,
      "entity": {
         "id": 22176,
         "reference": "Art. X.1-1, Code du Bien-être au Travail (Livre X, Titre 1er)"
      }
   },
   {
      "id": 21110,
      "distance": 0.8613051772117615,
      "entity": {
         "id": 21110,
         "reference": "Art. I.4-68, Code du Bien-être au Travail (Livre Ier, Titre 4, Chapitre V, Section 6)"
      }
   }
]
Testing collection: articles_collection_IP with metric type: IP
Index info for

In [43]:
labels = df_questions['article_ids'].iloc[0]
print(f"Labels: {labels} \n")


# Affichage des résultats pour chaque index
for index_type, result in performance_results.items():
    list_ids = [result["entity"]["id"] for result in result[0]]
    list_prods = [result["distance"] for result in result[0]]
    print(f"metric Type: {index_type} - results: {list_ids} - distances: {list_prods}")

    print('\n')


Labels: 22225,22226,22227,22228,22229,22230,22231,22232,22233,22234 

metric Type: L2 - results: [22233, 22176, 21110] - distances: [0.7523195743560791, 0.8595061302185059, 0.8613051772117615]


metric Type: IP - results: [22233, 22176, 21110] - distances: [0.6235238313674927, 0.5699306726455688, 0.5689291954040527]


metric Type: COSINE - results: [22233, 22176, 21110] - distances: [0.6237211227416992, 0.5701108574867249, 0.5691672563552856]




----------

In [24]:
# Partition sur laquelle effectuer la recherche
# partitions = res[1:]  # Liste des partitions à charger

partitions = ["Code_du_Bien_tre_au_Travail", "Code_des_Socits_et_des_Associations"]

# Paramètres de recherche uniquement pour FLAT, en variant les métriques
search_params = {
    'L2': {"metric_type": "L2", "params": {}},      # Distance euclidienne
    'COSINE': {"metric_type": "COSINE", "params": {}},  # Distance cosinus
    'IP': {"metric_type": "IP", "params": {}},      # Produit scalaire
}

# Résultats de performance
performance_results = {}

# Boucle pour tester chaque métrique
for metric_type, collection_name in collections.items():
    print(f"Testing collection: {collection_name} with metric type: {metric_type}")

    # Charger les partitions pour la collection
    client.load_partitions(collection_name=collection_name, partition_names=partitions)

    # Récupérer les informations sur l'index
    index_info = client.describe_index(collection_name=collection_name, index_name="vector_index")
    print(f"Index info for {metric_type}: {index_info}")

    # Effectuer la recherche avec la métrique correspondante
    search_results = client.search(
        collection_name=collection_name,
        data=[query_vector],
        partition_names=partitions,
        limit=3,
        search_params=search_params[metric_type],  # Utiliser les paramètres de la métrique en cours
        output_fields=['id', 'reference']
    )

    # Formater et afficher les résultats
    formatted_result = json.dumps(search_results[0], indent=3, ensure_ascii=False)
    print(f"Results for {metric_type}:\n{formatted_result}")
    
    # Enregistrer les résultats dans le dictionnaire
    performance_results[metric_type] = search_results



Testing collection: articles_collectionPartition_L2 with metric type: L2
Index info for L2: {'index_type': 'FLAT', 'metric_type': 'L2', 'field_name': 'embedding_articles', 'index_name': 'vector_index', 'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0, 'state': 'Finished'}
Results for L2:
[
   {
      "id": 21092,
      "distance": 0.8355220556259155,
      "entity": {
         "id": 21092,
         "reference": "Art. I.4-50, Code du Bien-être au Travail (Livre Ier, Titre 4, Chapitre V, Section 2)"
      }
   },
   {
      "id": 22225,
      "distance": 0.8465765714645386,
      "entity": {
         "id": 22225,
         "reference": "Art. X.5-1, Code du Bien-être au Travail (Livre X, Titre 5)"
      }
   },
   {
      "id": 21111,
      "distance": 0.9035404324531555,
      "entity": {
         "id": 21111,
         "reference": "Art. I.4-69, Code du Bien-être au Travail (Livre Ier, Titre 4, Chapitre V, Section 7)"
      }
   }
]
Testing collection: articles_collectionPartit