In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
# SentenceTransformer instances already loaded, keyed by model name, so that
# repeated calls (e.g. re-running a notebook cell) do not reload the weights.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for *model_name*, loading it on first use."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODELS[model_name]


def analyse_universelle(resultats_json: dict):
    """Embed, cluster and plot the rows of a SPARQL-style JSON result.

    Each entry of ``resultats_json['results']['bindings']`` is flattened to a
    ``{variable: value}`` row. The values of a row are joined into one string,
    embedded with the ``all-MiniLM-L6-v2`` sentence-transformer, clustered with
    KMeans (at most 4 clusters) and projected to 2D with PCA for plotting.

    Args:
        resultats_json: Mapping holding the rows under ``results`` ->
            ``bindings``; every cell is a mapping with a ``value`` key.

    Returns:
        ``(df, fig)`` where ``df`` holds the original columns plus
        ``text_for_embedding``, ``cluster``, ``x`` and ``y``, and ``fig`` is the
        Plotly scatter figure. When there are no rows (including when the
        ``results``/``bindings`` keys are absent), returns
        ``("Aucune donnée", None)`` instead.

    NOTE: a binding variable literally named ``x``, ``y`` or ``cluster`` is
    overwritten by the derived columns of the same name.
    """
    # Tolerate payloads without 'results'/'bindings' and treat them as "no rows"
    # rather than failing with a KeyError.
    results = (resultats_json or {}).get('results') or {}
    bindings = results.get('bindings') or []
    records = [{var: cell['value'] for var, cell in row.items()} for row in bindings]

    df = pd.DataFrame(records)
    if df.empty:
        return "Aucune donnée", None

    # Rows may lack some variables, which pandas fills with NaN. Skip those
    # cells so the literal string "nan" does not pollute the embedded text.
    df['text_for_embedding'] = df.apply(
        lambda row: " ".join(str(value) for value in row if pd.notna(value)),
        axis=1,
    )

    # Embeddings
    model = _get_embedding_model('all-MiniLM-L6-v2')
    embeddings = model.encode(df['text_for_embedding'].tolist())

    # Clustering: never ask for more clusters than there are rows.
    n_clusters = min(4, len(df))
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    # Cast labels to str so Plotly colours them as discrete categories.
    df['cluster'] = kmeans.fit_predict(embeddings).astype(str)

    # 2D projection. PCA cannot extract 2 components from a single sample,
    # so a lone row is simply placed at the origin.
    if len(df) < 2:
        df['x'], df['y'] = 0.0, 0.0
    else:
        coords = PCA(n_components=2).fit_transform(embeddings)
        df['x'], df['y'] = coords[:, 0], coords[:, 1]

    # Pick a "name"/"label"-like column for the point labels, falling back to
    # the first column of the frame.
    colonnes_nom = [c for c in df.columns if c.lower() in ['name', 'label', 'artiste', 'nom']]
    nom_affichage = colonnes_nom[0] if colonnes_nom else df.columns[0]

    # Show everything on hover except the plot coordinates and the raw text.
    hover_columns = [c for c in df.columns if c not in ('x', 'y', 'text_for_embedding')]

    fig = px.scatter(
        df, x='x', y='y',
        color='cluster',
        text=nom_affichage,  # draw the name next to each point
        hover_data=hover_columns,
        title="Analyse Sémantique : Regroupement par proximité de profil",
        template='plotly_dark',
    )

    # Move the text above the marker so it does not sit on top of the point.
    fig.update_traces(textposition='top center')

    return df, fig

In [13]:
# 1. Minimal simulated SPARQL result.
# Each row carries only a name, a date and a place (no descriptive sentence).
_minimal_rows = [
    # Renaissance group
    ("Leonardo da Vinci", "1452-04-15", "Florence"),
    ("Michelangelo", "1475-03-06", "Florence"),
    ("Raphael", "1483-04-06", "Urbino"),

    # Contemporary / tech group
    ("Steve Jobs", "1955-02-24", "San Francisco"),
    ("Bill Gates", "1955-10-28", "Seattle"),
    ("Mark Zuckerberg", "1984-05-14", "White Plains"),
    ("Elon Musk", "1971-06-28", "Pretoria"),

    # Classical music group
    ("Ludwig van Beethoven", "1770-12-17", "Bonn"),
    ("Johann Sebastian Bach", "1685-03-31", "Eisenach"),
    ("Wolfgang Amadeus Mozart", "1756-01-27", "Salzburg"),

    # Isolated entries to probe sensitivity
    ("Galileo Galilei", "1564-02-15", "Pisa"),
    ("Tim Berners-Lee", "1955-06-08", "London"),
]

# Wrap every (label, date, city) triple in the {"value": ...} cell structure
# expected under results -> bindings.
mock_minimal_results = {
    "results": {
        "bindings": [
            {
                "label": {"value": label},
                "date": {"value": date},
                "city": {"value": city},
            }
            for label, date, city in _minimal_rows
        ]
    }
}

# 2. Run the analysis.
# The function concatenates each row, e.g. "Leonardo da Vinci 1452-04-15 Florence".
df_min, fig_min = analyse_universelle(mock_minimal_results)

# 3. Display: override the default title, then render the figure.
fig_min.update_layout(
    title="Clustering sur données brutes (Noms, Dates, Lieux)"
).show()


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.

