ÉTAPE 2 — SPARQL FIXÉ (Syntaxe owlready2 correcte)


In [6]:
from owlready2 import get_ontology, default_world
import pickle
import re
from collections import defaultdict

In [2]:
# Progress banner shown before the ontology is loaded and queried.
print("üöÄ Initialisation SPARQL...")

üöÄ Initialisation SPARQL...


In [3]:
# Load the ontology from "hp.owl", then report how many classes it exposes.
# The classes are counted by streaming the iterator instead of building a list.
onto = get_ontology("hp.owl").load()
print(f"üîπ Ontologie: {sum(1 for _ in onto.classes()):,} classes")


üîπ Ontologie: 32,044 classes


SPARQL 1: Classes HP_ (FIX: result[0])

In [4]:
# SPARQL query text selecting every distinct owl:Class whose IRI string starts
# with "http://purl.obolibrary.org/obo/HP_". The query is only defined here;
# it is executed in a later cell.
# The rdf: prefix is declared but not referenced in the query body.
print("\nüîç SPARQL 1: Classes HP_...")
hp_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT DISTINCT ?class WHERE {
  ?class a owl:Class .
  FILTER(STRSTARTS(STR(?class), "http://purl.obolibrary.org/obo/HP_"))
}
"""




üîç SPARQL 1: Classes HP_...


In [7]:
# Run the HP_ class query on the default world and materialize every row.
hp_results = [*default_world.sparql(hp_query)]
# Rows are read by position, not by variable name: the first (and only
# selected) column of each row is the class.
hp_classes = [first for first, *_ in hp_results]
print(f"‚úÖ SPARQL 1: {len(hp_classes)} classes HP_")

‚úÖ SPARQL 1: 19903 classes HP_


SPARQL 2: Labels + Comments (FIX syntaxe) 

In [8]:
# SPARQL query returning, for every owl:Class whose IRI starts with the HP_
# prefix, the class plus its rdfs:label and rdfs:comment. Both are OPTIONAL,
# so either column may be unbound in a row.
# NOTE(review): under standard SPARQL semantics a class with several labels or
# comments yields several rows — confirm the downstream mapping's handling of
# duplicates is acceptable.
# NOTE(review): OBO ontologies such as HPO usually store the textual definition
# under obo:IAO_0000115 rather than rdfs:comment; if that holds for this file,
# the "comment" column misses most definitions — TODO confirm against hp.owl.
print("\nüîç SPARQL 2: Labels + Comments...")
data_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT ?class ?label ?comment WHERE {
  ?class a owl:Class .
  OPTIONAL { ?class rdfs:label ?label }
  OPTIONAL { ?class rdfs:comment ?comment }
  FILTER(STRSTARTS(STR(?class), "http://purl.obolibrary.org/obo/HP_"))
}
"""
# Execute immediately and keep all rows in memory for the mapping step.
results = list(default_world.sparql(data_query))



üîç SPARQL 2: Labels + Comments...


Mapping → dict (FIX: result[0], result[1], result[2])

In [9]:
# Index the SPARQL rows by code: the text after the last "HP_" in str(cls).
# When several rows share a code, the last row processed replaces earlier ones.
# NOTE(review): the "_" -> ":" replacement runs after the "HP_" prefix has been
# split off, so the stored code carries no "HP:" prefix — confirm this is intended.
hp_data = {}
for result in results:
    cls, label, comment = result[:3]
    cls_name = str(cls).rsplit("HP_", 1)[-1].replace("_", ":")
    hp_data[cls_name] = {
        "class": cls,
        # A falsy label falls back to the code itself.
        "label": cls_name if not label else str(label),
        # A falsy comment is stored as an empty string.
        "comment": "" if not comment else str(comment),
    }
print(f"‚úÖ SPARQL 2: {len(hp_data)} avec labels/comments")


‚úÖ SPARQL 2: 19903 avec labels/comments


NETTOYAGE + GÉNÉRATION (NO SPARQL leaf detection)

In [10]:
def clean_text(text, max_len=200):
    """Normalize free text to a single line and cap its length.

    Semicolons are turned into commas, every run of whitespace (newlines
    included) collapses to one space, and leading/trailing whitespace is
    removed.

    Args:
        text: Value to clean. Anything that is not a non-blank ``str``
            yields an empty string.
        max_len: Number of characters kept before ``"..."`` is appended.
            Expected to be a non-negative integer; pass ``None`` to disable
            truncation. Defaults to 200.

    Returns:
        The cleaned (and possibly truncated) string, or ``""`` for blank or
        non-string input.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    # r'\s+' already matches newlines, so no separate '\n' replacement is needed.
    cleaned = re.sub(r'\s+', ' ', text.replace(';', ',')).strip()
    if max_len is not None and len(cleaned) > max_len:
        return cleaned[:max_len] + "..."
    return cleaned


In [11]:
print("\nüß© G√©n√©ration chunks SPARQL...")
# Accumulators filled by the generation loop: chunk texts, their metadata
# records and their ids, plus integer counters that default to 0.
documents = []
metadatas = []
ids = []
stats = defaultdict(int)


üß© G√©n√©ration chunks SPARQL...


In [12]:


# Build one text chunk, one metadata record and one id per entry of hp_data.
# The accumulators are emptied first so that re-running this cell rebuilds
# them instead of appending a second copy of every chunk.
documents.clear()
metadatas.clear()
ids.clear()
stats.clear()

for cls_name, data in hp_data.items():
    label = data["label"]
    raw_comment = data["comment"]
    has_comment = bool(raw_comment.strip())

    # Use the cleaned comment as definition; fall back to a generic sentence
    # when the comment is blank.
    definition = clean_text(raw_comment)
    if not definition:
        definition = f"Sympt√¥me clinique document√© (HP:{cls_name})"

    chunk = f"""[Sympt√¥me] : {label}
D√©finition : {definition}
Code HPO : {cls_name}
Source : Ontologie HPO (v2025)"""

    # Leaf detection through the owlready2 class object (not SPARQL): a class
    # with no subclasses is a leaf. Failures stay best-effort (treated as leaf)
    # but are counted, and only Exception is caught so that a kernel interrupt
    # (KeyboardInterrupt) is no longer swallowed.
    try:
        is_leaf = not list(data["class"].subclasses())
    except Exception:
        is_leaf = True
        stats["leaf_check_failed"] += 1

    documents.append(chunk)
    metadatas.append({
        "entity_id": str(data["class"]),
        "label": label,
        "type": "Symptom",
        "code_hpo": cls_name,
        "is_leaf": is_leaf,
        "has_comment": has_comment,
        "source": "sparql"
    })
    ids.append(cls_name)

    stats["total"] += 1
    if not has_comment:
        stats["no_comment"] += 1
    if is_leaf:
        stats["leaves"] += 1


SAUVEGARDE

In [13]:
print("üíæ Sauvegarde hpo_chunks_sparql.pkl...")
# Persist the three accumulators and a plain-dict copy of the counters in a
# single pickle file (keys: documents, metadatas, ids, stats).
with open("hpo_chunks_sparql.pkl", "wb") as f:
    pickle.dump(
        dict(documents=documents, metadatas=metadatas, ids=ids, stats=dict(stats)),
        f,
    )

üíæ Sauvegarde hpo_chunks_sparql.pkl...


RAPPORT

In [14]:
# Summary of the generation run: chunk count, output file, share of leaf
# classes, and number of entries whose comment is non-blank.
n_total = stats["total"]
# Guard the percentage: when no chunk was generated the total is 0 and the
# division would raise ZeroDivisionError.
leaf_pct = stats["leaves"] / n_total * 100 if n_total else 0.0
print(f"\nüéâ ‚úÖ {len(documents):,} CHUNKS SPARQL G√âN√âR√âS!")
print("üìÅ hpo_chunks_sparql.pkl")
print(f"üçÉ Feuilles: {stats['leaves']:,} ({leaf_pct:.1f}%)")
print(f"üí¨ Avec defs: {n_total-stats['no_comment']:,}")



üéâ ‚úÖ 19,903 CHUNKS SPARQL G√âN√âR√âS!
üìÅ hpo_chunks_sparql.pkl
üçÉ Feuilles: 14,032 (70.5%)
üí¨ Avec defs: 4,452


ÉCHANTILLONS

In [15]:
print("\nüîç 3 √âCHANTILLONS SPARQL:")
# Preview at most the first three chunks, labelled by their is_leaf flag.
# Each preview is cut to 300 characters and always followed by "...".
for i, doc in enumerate(documents[:3]):
    kind = 'Feuille' if metadatas[i]['is_leaf'] else 'Interm√©diaire'
    print(f"\n--- Chunk {i+1} ({kind}) ---")
    print(f"{doc[:300]}...")
    print("-" * 60)




üîç 3 √âCHANTILLONS SPARQL:

--- Chunk 1 (Interm√©diaire) ---
[Sympt√¥me] : All
D√©finition : Root of all terms in the Human Phenotype Ontology.
Code HPO : 0000001
Source : Ontologie HPO (v2025)...
------------------------------------------------------------

--- Chunk 2 (Interm√©diaire) ---
[Sympt√¥me] : Abnormality of body height
D√©finition : Sympt√¥me clinique document√© (HP:0000002)
Code HPO : 0000002
Source : Ontologie HPO (v2025)...
------------------------------------------------------------

--- Chunk 3 (Interm√©diaire) ---
[Sympt√¥me] : Growth abnormality
D√©finition : Sympt√¥me clinique document√© (HP:0001507)
Code HPO : 0001507
Source : Ontologie HPO (v2025)...
------------------------------------------------------------
