# 🚀 Requête WIKIDATA

## 🔨 Construction de l'environnement nécessaire et configuration

### Installation des modules

In [None]:
%pip install SPARQLWrapper tqdm pandas

print("‚¨áÔ∏è Installation termin√©e !")

### Configuration

In [4]:
# üîß CONFIGURATION ET IMPORTS
import json
import csv
import time
import re
import os
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm
import pandas as pd

# Configuration constants shared by the cells below.
WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"  # SPARQL endpoint handed to SPARQLWrapper
RATE_LIMIT_DELAY = 2.0  # base pause passed to time.sleep after each query and between retries
BATCH_SIZE = 10  # execute_batch_queries adds one extra pause every BATCH_SIZE queries
MAX_RETRIES = 5  # default number of attempts in execute_sparql_query
REQUEST_TIMEOUT = 60  # value given to SPARQLWrapper.setTimeout (presumably seconds)
ENRICHMENT_BATCH_SIZE = 15  # NOTE(review): not referenced by any visible cell

# Output directory: created up front so the export cells can write into it.
output_dir = "./output"
os.makedirs(output_dir, exist_ok=True)

# Echo the effective configuration.
print("üöÄ CONFIGURATION TERMIN√âE")
print(f"üìÅ Dossier de sortie: {output_dir}")
print(f"‚è±Ô∏è  Rate limit: {RATE_LIMIT_DELAY}s entre requ√™tes")
print(f"üì¶ Taille des batches: {BATCH_SIZE}")

üöÄ CONFIGURATION TERMIN√âE
üìÅ Dossier de sortie: ./output
‚è±Ô∏è  Rate limit: 2.0s entre requ√™tes
üì¶ Taille des batches: 10


In [5]:
def create_sparql_client():
    """
    Build a SPARQLWrapper client bound to the Wikidata endpoint.

    The client returns JSON and uses REQUEST_TIMEOUT as its timeout.

    :return: Configured SPARQLWrapper instance
    """
    client = SPARQLWrapper(WIKIDATA_ENDPOINT)
    client.setTimeout(REQUEST_TIMEOUT)
    client.setReturnFormat(JSON)
    return client

def execute_sparql_query(query, max_retries=MAX_RETRIES):
    """
    Run one SPARQL query with retries and a fixed pause after each success.

    :param query: SPARQL query text to execute
    :param max_retries: Maximum number of attempts before giving up
    :return: The "bindings" list of the JSON response, or an empty list when
             every attempt failed
    """
    client = create_sparql_client()
    attempt = 0
    while attempt < max_retries:
        try:
            client.setQuery(query)
            payload = client.query().convert()
            # Pause after a successful call to respect the rate limit.
            time.sleep(RATE_LIMIT_DELAY)
            return payload["results"]["bindings"]
        except Exception as e:
            print(f"‚ö†Ô∏è  Tentative {attempt + 1}/{max_retries} √©chou√©e: {e}...")
            if attempt >= max_retries - 1:
                print(f"‚ùå Requ√™te √©chou√©e apr√®s {max_retries} tentatives")
                return []
            # Wait longer after each failure: 2x, 3x, 4x... the base delay.
            time.sleep(RATE_LIMIT_DELAY * (attempt + 2))
        attempt += 1
    # Only reached when max_retries is zero or negative.
    return []

def clean_entity_id(entity_uri):
    """
    Return the last path segment of an entity URI.

    :param entity_uri: Entity URI (e.g. "http://www.wikidata.org/entity/Q42")
    :return: Entity ID (e.g. "Q42"); the input unchanged when it contains no
             "/"; an empty string when the input is empty or None
    """
    if not entity_uri:
        return ""
    if "/" not in entity_uri:
        return entity_uri
    return entity_uri.rsplit("/", 1)[-1]

def execute_batch_queries(queries, description="Requ√™tes"):
    """
    Run several SPARQL queries one after another and merge their results.

    :param queries: List of SPARQL queries to execute
    :param description: Label shown on the tqdm progress bar
    :return: Single list holding the results of every query, in order
    """
    all_results = []
    progress = tqdm(queries, desc=description)
    for position, query in enumerate(progress, start=1):
        all_results += execute_sparql_query(query)
        if position % BATCH_SIZE:
            continue
        # A full batch has been sent: add one extra pause.
        time.sleep(RATE_LIMIT_DELAY)
    return all_results


print("üîó FONCTIONS CONFIGUR√âES")

üîó FONCTIONS CONFIGUR√âES AVEC DIAGNOSTIC AVANC√â


## 🚧 Construction de la requête


### Rechercher une entité par nom

In [None]:
# üéØ IDENTIFICATION DE L'ENTIT√â

def find_aeronautics_entity():
    """
    Ask the user for an entity name and look it up on Wikidata by label.

    The name is matched case-insensitively, as a regular expression, against
    French and English labels. Only the first of at most five results is used.

    :return: Tuple (entity ID, label, description), or (None, None, None)
             when nothing was typed or no entity matched
    """
    entity_name = input("Entrez le nom de l'entit√© √† rechercher (ex: aeronautics) : ").strip()
    if not entity_name:
        print("‚ùå Aucun nom d'entit√© fourni.")
        return None, None, None

    # Two escaping layers are needed. re.escape() makes the text a literal
    # regex, but the pattern then sits inside a SPARQL string literal, where a
    # lone backslash sequence such as "\." is invalid and a double quote would
    # end the literal. So backslashes are doubled and quotes escaped as well.
    label_pattern = re.escape(entity_name).replace("\\", "\\\\").replace('"', '\\"')

    aeronautics_query = f"""
  SELECT DISTINCT ?item ?itemLabel ?itemDescription WHERE {{
    ?item rdfs:label ?label .
    FILTER(LANG(?label) = "fr" || LANG(?label) = "en") .
    FILTER(REGEX(?label, "{label_pattern}", "i"))
    SERVICE wikibase:label {{ 
      bd:serviceParam wikibase:language "fr,en" . 
    }}
  }}
  LIMIT 5
  """

    print(f"üîç Recherche de l'entit√© principale '{entity_name}'...")
    results = execute_sparql_query(aeronautics_query)

    if not results:
        print("‚ùå Entit√© non trouv√©e")
        return None, None, None

    # Keep the first match only.
    entity = results[0]
    entity_id = clean_entity_id(entity["item"]["value"])
    entity_label = entity["itemLabel"]["value"]
    # The description binding is absent when the item has none.
    entity_desc = entity.get("itemDescription", {}).get("value", "")

    print(f"‚úÖ Entit√© trouv√©e: {entity_label} ({entity_id})")
    print(f"üìÑ Description: {entity_desc}")

    return entity_id, entity_label, entity_desc


# Identify the main entity
aeronautics_entity, aeronautics_label, aeronautics_desc = find_aeronautics_entity()

### CELLULE DE TEST : 
```sql
SELECT ?item ?itemLabel ?parent 
WHERE {
  ?item wdt:P31 ?parent.
  FILTER(CONTAINS(LCASE(?parent), "a√©ronautique")).
  SERVICE wikibase:label { bd:serviceParam wikibase:language "fr". }
}

LIMIT 10
```


In [None]:
# üéØ IDENTIFICATION DE L'ENTIT√â

def find_aeronautics_entity():
    """
    Ask the user for a term and return one item whose class label contains it.

    The query looks for an item whose "instance of" (P31) parent has a French
    label containing the typed text, ignoring case.

    :return: Tuple (entity ID, label, description), or (None, None, None)
             when nothing was typed or no entity matched
    """
    entity_name = input("Entrez le nom de l'entit√© √† rechercher (ex: aeronautics) : ").strip()
    if not entity_name:
        print("‚ùå Aucun nom d'entit√© fourni.")
        return None, None, None

    # The query compares against LCASE(?parentLabel), so the needle must be
    # lowercased too or any capitalised input never matches. Backslashes and
    # double quotes are escaped so the text stays inside the SPARQL literal.
    needle = entity_name.lower().replace("\\", "\\\\").replace('"', '\\"')

    aeronautics_query = f"""
  SELECT ?item ?itemLabel ?parent ?itemDescription
WHERE {{
  ?item wdt:P31 ?parent.
  ?parent rdfs:label ?parentLabel.
  FILTER(LANG(?parentLabel) = "fr").
  FILTER(CONTAINS(LCASE(?parentLabel), "{needle}")).
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr". }}
}}

LIMIT 1
"""

    print(f"üîç Recherche de l'entit√© principale '{entity_name}'...")
    results = execute_sparql_query(aeronautics_query)

    if not results:
        print("‚ùå Entit√© non trouv√©e")
        return None, None, None

    entity = results[0]
    entity_id = clean_entity_id(entity["item"]["value"])
    entity_label = entity["itemLabel"]["value"]
    # The description binding is absent when the item has none.
    entity_desc = entity.get("itemDescription", {}).get("value", "")

    print(f"‚úÖ Entit√© trouv√©e: {entity_label} ({entity_id})")
    print(f"üìÑ Description: {entity_desc}")

    return entity_id, entity_label, entity_desc


# Identify the main entity
aeronautics_entity, aeronautics_label, aeronautics_desc = find_aeronautics_entity()

üîç Recherche de l'entit√© principale 'a√©ronautique'...
‚ö†Ô∏è  Tentative 1/5 √©chou√©e: The read operation timed out...
‚ö†Ô∏è  Tentative 2/5 √©chou√©e: The read operation timed out...
‚ö†Ô∏è  Tentative 3/5 √©chou√©e: The read operation timed out...


### Requ√™tes SPARQL

In [None]:
def build_aeronautics_extraction_queries():
    """
    Build the SPARQL extraction queries, keyed by category name.

    Every query selects the same columns: ?item, ?itemLabel, ?itemDescription
    (French, else English, else any other language), ?parent (first bound of
    several candidate relations), ?parentLabel and ?synonyms_fr (French
    skos:altLabel values joined into one string).

    :return: Dict mapping a category name to its SPARQL query text
    """
    queries = {
        # Items reachable from wd:Q936518 through P31/P279/P361/P452/P749 paths.
        "manufacturers": """
        SELECT DISTINCT ?item 

                ?itemLabel
               (COALESCE(?itemDescription_fr, ?itemDescription_en, ?itemDescription_any, "") AS ?itemDescription)
               ?parent
               ?parentLabel
               (GROUP_CONCAT(DISTINCT ?synonym_fr; separator=",") AS ?synonyms_fr)
        WHERE {
          # Tous les √©quipements d'aviation (instances ou sous-classes ou parties)
          ?item wdt:P31*/wdt:P279*/wdt:P361*/wdt:P452*/wdt:P749* wd:Q936518 .
        
          # On cherche le parent imm√©diat selon diff√©rentes relations
            OPTIONAL { ?item wdt:P361 ?partOf . }
            OPTIONAL { ?item wdt:P279 ?subclassOf . }
            OPTIONAL { ?item wdt:P31 ?instanceOf . }
            OPTIONAL { ?item wdt:P452 ?secteur .}
            OPTIONAL { ?item wdt:P176 ?constructeur.}
            OPTIONAL { ?item wdt:P749 ?constructeur.}
            
            BIND(COALESCE(?partOf, ?subclassOf, ?instanceOf, ?secteur, ?constructeur) AS ?parent)
        
          # Description de l'item (fr > en > autre)
          OPTIONAL { ?item schema:description ?itemDescription_fr . FILTER(LANG(?itemDescription_fr) = "fr") }
          OPTIONAL { ?item schema:description ?itemDescription_en . FILTER(LANG(?itemDescription_en) = "en") }
          OPTIONAL { ?item schema:description ?itemDescription_any .
                     FILTER(LANG(?itemDescription_any) != "fr" && LANG(?itemDescription_any) != "en") }

          # Synonymes en fran√ßais (P1709 est "synonymes exacts")
          OPTIONAL { ?item skos:altLabel ?synonym_fr . FILTER(LANG(?synonym_fr) = "fr") }
        
          SERVICE wikibase:label {
            bd:serviceParam wikibase:language "fr,en,[AUTO_LANGUAGE]"
          }
        }
        GROUP BY ?item ?itemLabel ?itemDescription_fr ?itemDescription_en ?itemDescription_any ?parent ?parentLabel
        """,

        # Instances of wd:Q11436 or of any of its subclasses.
        "aircraft_models": """
        SELECT DISTINCT ?item 
               ?itemLabel
               (COALESCE(?itemDescription_fr, ?itemDescription_en, ?itemDescription_any, "") AS ?itemDescription)
               ?parent
               ?parentLabel
               (GROUP_CONCAT(DISTINCT ?synonym_fr; separator=" , ") AS ?synonyms_fr)
        WHERE {
          # Tous les mod√®les d'avions (instances ou sous-classes ou parties)
          ?item wdt:P31/wdt:P279* wd:Q11436 .
          
        
          # On cherche le parent imm√©diat selon diff√©rentes relations         
            OPTIONAL { ?item wdt:P179 ?series .}
            OPTIONAL { ?item wdt:P176 ?constructeur.}
            OPTIONAL { ?item wdt:P31 ?instanceOf . } 
            OPTIONAL { ?item wdt:P361 ?partOf . }
            OPTIONAL { ?item wdt:P279 ?subclassOf . }
                       
            BIND(COALESCE(?partOf, ?subclassOf, ?instanceOf, ?series, ?constructeur) AS ?parent)
        
          # Description de l'item (fr > en > autre)
          OPTIONAL { ?item schema:description ?itemDescription_fr . FILTER(LANG(?itemDescription_fr) = "fr") }
          OPTIONAL { ?item schema:description ?itemDescription_en . FILTER(LANG(?itemDescription_en) = "en") }
          OPTIONAL { ?item schema:description ?itemDescription_any .
                     FILTER(LANG(?itemDescription_any) != "fr" && LANG(?itemDescription_any) != "en") }

          # Synonymes en fran√ßais
          OPTIONAL { ?item skos:altLabel ?synonym_fr . FILTER(LANG(?synonym_fr) = "fr") }
        
          SERVICE wikibase:label {
            bd:serviceParam wikibase:language "fr,en,[AUTO_LANGUAGE]"
          }
        }
        GROUP BY ?item ?itemLabel ?itemDescription_fr ?itemDescription_en ?itemDescription_any ?parent ?parentLabel
        """,

        # Items reachable from wd:Q16693356 through P31/P279/P361 paths.
        "aircraft_components": """
        SELECT DISTINCT ?item 
               ?itemLabel
               (COALESCE(?itemDescription_fr, ?itemDescription_en, ?itemDescription_any, "") AS ?itemDescription)
               ?parent
               ?parentLabel
               (GROUP_CONCAT(DISTINCT ?synonym_fr; separator=" , ") AS ?synonyms_fr)
        WHERE {
          # Tous les √©quipements d'aviation (instances ou sous-classes ou parties)
          ?item wdt:P31*/wdt:P279*/wdt:P361* wd:Q16693356 .
        
          # On cherche le parent imm√©diat selon diff√©rentes relations
            OPTIONAL { ?item wdt:P361 ?partOf . }
            OPTIONAL { ?item wdt:P279 ?subclassOf . }
            OPTIONAL { ?item wdt:P31 ?instanceOf . }
            OPTIONAL { ?item wdt:P452 ?secteur .}
            OPTIONAL { ?item wdt:P176 ?constructeur.}
            
            BIND(COALESCE(?partOf, ?subclassOf, ?instanceOf, ?secteur, ?constructeur) AS ?parent)
        
          # Description de l'item (fr > en > autre)
          OPTIONAL { ?item schema:description ?itemDescription_fr . FILTER(LANG(?itemDescription_fr) = "fr") }
          OPTIONAL { ?item schema:description ?itemDescription_en . FILTER(LANG(?itemDescription_en) = "en") }
          OPTIONAL { ?item schema:description ?itemDescription_any .
                     FILTER(LANG(?itemDescription_any) != "fr" && LANG(?itemDescription_any) != "en") }

          # Synonymes en fran√ßais
          OPTIONAL { ?item skos:altLabel ?synonym_fr . FILTER(LANG(?synonym_fr) = "fr") }
        
          SERVICE wikibase:label {
            bd:serviceParam wikibase:language "fr,en,[AUTO_LANGUAGE]"
          }
        }
        GROUP BY ?item ?itemLabel ?itemDescription_fr ?itemDescription_en ?itemDescription_any ?parent ?parentLabel
        """,

        # Items linked through P425 (zero or more hops) to one of the listed domains.
        "aeronautic_profession": """
        SELECT DISTINCT ?item 
               ?itemLabel
               (COALESCE(?itemDescription_fr, ?itemDescription_en, ?itemDescription_any, "") AS ?itemDescription)
               ?parent
               ?parentLabel
               (GROUP_CONCAT(DISTINCT ?synonym_fr; separator=" , ") AS ?synonyms_fr)
        WHERE {
        ?item wdt:P425* ?domaine.
        VALUES ?domaine { wd:Q765633 wd:Q906438 wd:Q1434048 wd:Q206814 wd:Q627716 wd:Q221395 wd:Q22719}.  
        
          # On cherche le parent imm√©diat selon diff√©rentes relations
            OPTIONAL { ?item wdt:P361 ?partOf . }
            OPTIONAL { ?item wdt:P279 ?subclassOf . }
            OPTIONAL { ?item wdt:P31 ?instanceOf . }
            OPTIONAL { ?item wdt:P452 ?secteur .}
            OPTIONAL { ?item wdt:P176 ?constructeur.}
            OPTIONAL { ?item wdt:P749 ?constructeur.}
            
            BIND(COALESCE(?partOf, ?subclassOf, ?instanceOf, ?secteur, ?constructeur, ?domaine) AS ?parent) # Attention √† coalesce pour √©viter les doublons
        
          # Description de l'item (fr > en > autre)
          OPTIONAL { ?item schema:description ?itemDescription_fr . FILTER(LANG(?itemDescription_fr) = "fr") }
          OPTIONAL { ?item schema:description ?itemDescription_en . FILTER(LANG(?itemDescription_en) = "en") }
          OPTIONAL { ?item schema:description ?itemDescription_any .
                     FILTER(LANG(?itemDescription_any) != "fr" && LANG(?itemDescription_any) != "en") }

          # Synonymes en fran√ßais
          OPTIONAL { ?item skos:altLabel ?synonym_fr . FILTER(LANG(?synonym_fr) = "fr") }
        
          SERVICE wikibase:label {
            bd:serviceParam wikibase:language "fr,en,[AUTO_LANGUAGE]"
          }
        }
        GROUP BY ?item ?itemLabel ?itemDescription_fr ?itemDescription_en ?itemDescription_any ?parent ?parentLabel
        """
    }
    return queries


print("‚úÖ Requ√™tes d√©finies (avec synonymes fran√ßais)")

## 🔎 Lancer la recherche

In [None]:
def extract_all_aeronautics_data():
    """
    Run every extraction query and return all bindings in one list.

    Each binding dict is tagged in place with a "source_category" key holding
    the name of the query that produced it.

    :return: List of result bindings from all categories
    """
    print("üèóÔ∏è EXTRACTION HI√âRARCHIQUE EXHAUSTIVE")
    print("=" * 50)

    all_results = []
    for category, query in build_aeronautics_extraction_queries().items():
        print(f"\nüîç Extraction: {category}")
        results = execute_sparql_query(query)

        # Record which query each row came from.
        for binding in results:
            binding.update(source_category=category)

        all_results += results
        print(f"‚úÖ {len(results)} entit√©s trouv√©es pour {category}")

    print(f"\nüéØ TOTAL: {len(all_results)} entit√©s extraites")
    return all_results


raw_aeronautics_data = extract_all_aeronautics_data()

### Aperçu

In [None]:
print(raw_aeronautics_data[:5])  # show a preview of the first five bindings

# Dump the raw, unprocessed bindings to <output_dir>/raw.json.
json_filename = f"raw.json"
json_filepath = os.path.join(output_dir, json_filename)

# ensure_ascii=False keeps accented characters readable in the file.
with open(json_filepath, 'w', encoding='utf-8') as jsonfile:
    json.dump(raw_aeronautics_data, jsonfile, ensure_ascii=False, indent=2)

## 📁 Export

### Construction du fichier

In [None]:
# üèóÔ∏è CONSTRUCTION DE LA HI√âRARCHIE FINALE

# def get_id_from_uri(uri):
#     # Ex: "http://www.wikidata.org/entity/Q105557" ‚Üí "Q105557"
#     return uri.split("/")[-1] if uri else ""

def build_final_hierarchy(raw_aeronautics_data):
    """
    Flatten raw SPARQL bindings into thesaurus rows.

    Each row has the keys ID, Terme, ID_TG, TG, Def, EP and TA, filled from the
    item, itemLabel, parent, parentLabel, itemDescription, synonyms_fr and
    source_category entries of the binding. Any missing value becomes "".

    :param raw_aeronautics_data: List of SPARQL result bindings
    :return: List of row dicts, one per binding, in the same order
    """
    print("üèóÔ∏è CONSTRUCTION DE LA HI√âRARCHIE FINALE")
    print("="*45)

    def binding_value(entry, key):
        # SPARQL JSON bindings look like {"value": ...}; absent ones give "".
        return entry.get(key, {}).get("value", "")

    hierarchy = [
        {
            "ID": clean_entity_id(binding_value(entry, "item")),
            "Terme": binding_value(entry, "itemLabel"),
            "ID_TG": clean_entity_id(binding_value(entry, "parent")),
            "TG": binding_value(entry, "parentLabel"),
            "Def": binding_value(entry, "itemDescription"),
            "EP": binding_value(entry, "synonyms_fr"),
            # Default to "" like every other field. The previous {} default
            # was written to the CSV as "{}" and cannot be used as a dict key.
            "TA": entry.get("source_category", ""),
        }
        for entry in raw_aeronautics_data
    ]

    print(f"‚úÖ Hi√©rarchie construite: {len(hierarchy)} entr√©es totales")
    return hierarchy


final_thesaurus = build_final_hierarchy(raw_aeronautics_data)
print(f"üéØ Th√©saurus final: {len(final_thesaurus)} entr√©es")

### Export du fichier

In [None]:
# üíæ EXPORT FINAL UNIQUE - CSV Occidental European Format (semicolon separated)

import os
import csv
import json
from datetime import datetime

def export_final_thesaurus(thesaurus_data):
    """
    Write the thesaurus to a timestamped CSV file and a timestamped JSON file.

    The CSV is semicolon-separated, encoded as UTF-8 with BOM, and sorted by
    ID. The JSON wraps the rows with a metadata block and the statistics
    computed by analyze_thesaurus_statistics.

    :param thesaurus_data: List of thesaurus row dicts
    :return: Tuple (CSV path, JSON path, statistics dict)
    """
    print("üíæ EXPORT FINAL UNIQUE")
    print("=" * 25)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    fieldnames = ['ID', 'Terme', 'ID_TG', 'TG', 'Def', 'EP', 'TA']

    # --- CSV export ---
    csv_filename = f"thesaurus_aeronautique_FINAL_{timestamp}.csv"
    csv_filepath = os.path.join(output_dir, csv_filename)

    print(f"üìÑ Export CSV: {csv_filename}")
    with open(csv_filepath, 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=fieldnames,
            delimiter=';',
            quoting=csv.QUOTE_MINIMAL,
        )
        writer.writeheader()
        ordered_entries = sorted(thesaurus_data, key=lambda x: x["ID"])
        for entry in ordered_entries:
            # Start from a blank row so every column exists, then fill in the
            # stringified values (None becomes an empty string).
            row = dict.fromkeys(fieldnames, '')
            for key, value in entry.items():
                row[key] = '' if value is None else str(value)
            writer.writerow(row)

    # --- JSON export with metadata ---
    json_filename = f"thesaurus_aeronautique_FINAL_{timestamp}.json"
    json_filepath = os.path.join(output_dir, json_filename)

    stats = analyze_thesaurus_statistics(thesaurus_data)

    metadata = {
        "title": "Th√©saurus A√©ronautique Final - Wikidata",
        "description": "Th√©saurus exhaustif avec hi√©rarchie et parents imm√©diats",
        "version": "1.0-FINAL",
        "created": timestamp,
        "source": "Wikidata SPARQL optimis√©",
        "total_entries": len(thesaurus_data),
        "extraction_method": "multi-query_hierarchical",
        "parent_detection": "automatic_wikidata_relations",
        "multilingual_support": True,
        "format": "structured_hierarchical_thesaurus",
    }
    json_data = {"metadata": metadata, "statistics": stats, "data": thesaurus_data}

    print(f"üìÑ Export JSON: {json_filename}")
    with open(json_filepath, 'w', encoding='utf-8') as jsonfile:
        json.dump(json_data, jsonfile, ensure_ascii=False, indent=2)

    return csv_filepath, json_filepath, stats

def analyze_thesaurus_statistics(thesaurus_data):
    """
    Compute summary statistics over thesaurus rows.

    The thesaurus column names are read ("TA" for the category, "EP" for
    synonyms, "Def" for the description). The older key names "category",
    "synonyms" and "description" are still accepted. "relation_type" and
    "lang" are counted as "unknown" when a row does not carry them, and
    "hierarchy_depth" is always reported as 0 because it is not computed.

    :param thesaurus_data: List of thesaurus row dicts
    :return: Dict with total_entries, categories, relation_types, languages,
             hierarchy_depth, entries_with_synonyms, entries_with_descriptions
    """
    stats = {
        "total_entries": len(thesaurus_data),
        "categories": {},
        "relation_types": {},
        "languages": {},
        "hierarchy_depth": 0,
        "entries_with_synonyms": 0,
        "entries_with_descriptions": 0
    }

    def bump(counter, key):
        counter[key] = counter.get(key, 0) + 1

    for entry in thesaurus_data:
        # Category: legacy key first, then the thesaurus column. An empty
        # value (including a stray {}) is counted as "unknown".
        category = entry.get("category") or entry.get("TA") or "unknown"
        bump(stats["categories"], category)

        bump(stats["relation_types"], entry.get("relation_type", "unknown"))
        bump(stats["languages"], entry.get("lang", "unknown"))

        # Enrichment counters: a non-empty value under either key counts.
        if entry.get("synonyms") or entry.get("EP"):
            stats["entries_with_synonyms"] += 1
        if entry.get("description") or entry.get("Def"):
            stats["entries_with_descriptions"] += 1

    return stats

def display_final_summary(stats, csv_file, json_file):
    """
    Print a human-readable recap of the generated thesaurus.

    :param stats: Statistics dict with the keys total_entries,
                  entries_with_synonyms, entries_with_descriptions,
                  categories, relation_types and languages
    :param csv_file: Path of the exported CSV (only its base name is shown)
    :param json_file: Path of the exported JSON (only its base name is shown)
    """
    def ranked(counter):
        # (name, count) pairs, highest count first.
        return sorted(counter.items(), key=lambda pair: pair[1], reverse=True)

    print("\nüéØ R√âSUM√â FINAL DU TH√âSAURUS A√âRONAUTIQUE")
    print("=" * 50)

    # Overall counters
    print(f"üìä STATISTIQUES G√âN√âRALES:")
    print(f"   ‚Ä¢ Total d'entr√©es: {stats['total_entries']}")
    print(f"   ‚Ä¢ Entr√©es avec synonymes: {stats['entries_with_synonyms']}")
    print(f"   ‚Ä¢ Entr√©es avec descriptions: {stats['entries_with_descriptions']}")

    # Share of each category
    print(f"\nüìÇ R√âPARTITION PAR CAT√âGORIE:")
    for category, count in ranked(stats["categories"]):
        percentage = (count / stats['total_entries']) * 100
        print(f"   ‚Ä¢ {category}: {count} ({percentage:.1f}%)")

    print(f"\nüîó TYPES DE RELATIONS:")
    for rel_type, count in ranked(stats["relation_types"]):
        print(f"   ‚Ä¢ {rel_type}: {count}")

    # Languages are listed in insertion order, not sorted.
    print(f"\nüåê LANGUES:")
    for lang, count in stats["languages"].items():
        print(f"   ‚Ä¢ {lang}: {count}")

    print(f"\nüìÅ FICHIERS G√âN√âR√âS:")
    for line in (
        f"   ‚úÖ CSV: {os.path.basename(csv_file)}",
        f"   ‚úÖ JSON: {os.path.basename(json_file)}",
    ):
        print(line)

    print(f"\nüèÜ MISSION ACCOMPLIE !")
    print(f" {stats['total_entries']} entr√©es de th√©saurus")

# Export and final summary: run only when the thesaurus holds at least one
# entry; csv_file is reused by the de-duplication cell further down.
if final_thesaurus:
    csv_file, json_file, statistics = export_final_thesaurus(final_thesaurus)
    display_final_summary(statistics, csv_file, json_file)
else:
    print("‚ùå Aucun th√©saurus √† exporter")

### Nettoyage des doublons

In [None]:
# Read back the CSV at csv_file; every column is loaded as text and missing
# cells become empty strings.
df = pd.read_csv(csv_file, sep=';', dtype=str).fillna('')

# Merge the distinct non-blank values of a column into one " | "-separated string
def concat_unique(series):
    """
    Join the distinct, stripped, non-empty values in sorted order with " | ".

    :param series: Iterable of strings
    :return: Joined string, or '' when no non-blank value is present
    """
    distinct = set()
    for raw in series:
        cleaned = raw.strip()
        if cleaned != '':
            distinct.add(cleaned)
    # Joining an empty collection gives '', the "no value" result.
    return " | ".join(sorted(distinct))

# Collapse rows that share the same 'ID': every other column is reduced with
# concat_unique, so differing values are kept side by side.
df_clean = df.groupby('ID', as_index=False).agg(concat_unique)

# Overwrite the exported CSV in place (same path, semicolon separator, UTF-8 with BOM).
df_clean.to_csv(csv_file, sep=';', index=False, encoding='utf-8-sig')

print(f"‚úÖ CSV nettoy√© et export√© sous {csv_file}")

