# TP3 - Population d'un graphe RDF à partir de texte

## 1. Configuration et Imports

In [None]:
import time
import warnings
from urllib.parse import quote

import pandas as pd
import requests
from rdflib import Graph, URIRef, Namespace
from transformers import pipeline

# DBpedia Spotlight configuration
# Endpoint queried by get_dbpedia_uri; confidence/support are that function's defaults.
SPOTLIGHT_ENDPOINT = "https://api.dbpedia-spotlight.org/en/annotate"
SPOTLIGHT_CONFIDENCE = 0.5
SPOTLIGHT_SUPPORT = 10
API_DELAY = 0.2  # delay (passed to time.sleep) between API calls to avoid rate limiting

# REBEL configuration (generation arguments forwarded to the extraction pipeline)
REBEL_MAX_LENGTH = 512
REBEL_NUM_BEAMS = 3

# Namespace for local entities/relations (used for predicates and for entities
# that could not be linked to DBpedia)
EX = Namespace("http://example.org/")

2025-12-14 22:02:15.051530: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-14 22:02:15.082266: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-14 22:02:15.703664: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


## 2. Fonctions utilitaires

In [2]:
def extract_triplets(text: str) -> list[dict]:
    """Parse REBEL's linearized output into (head, type, tail) triplets.

    The expected layout is
    ``<triplet> head <subj> tail <obj> relation [<subj> tail <obj> relation ...]``,
    repeated for every head entity. ``<s>``, ``</s>`` and ``<pad>`` are removed
    before parsing.

    Only complete triplets (non-empty head, relation and tail) are returned,
    including those closed by a following ``<triplet>``/``<subj>`` marker, so
    malformed generations never produce entries with an empty field.

    Args:
        text: Decoded model output, special tokens included.

    Returns:
        A list of dicts with the keys ``'head'``, ``'type'`` and ``'tail'``,
        in the order the triplets appear in ``text``.
    """
    for filler in ("<s>", "<pad>", "</s>"):
        text = text.replace(filler, "")
    # Surround the structural markers with spaces so that a marker glued to a
    # neighbouring word (e.g. "Hinton<subj>") is still seen as its own token.
    for marker in ("<triplet>", "<subj>", "<obj>"):
        text = text.replace(marker, f" {marker} ")

    triplets: list[dict] = []
    head_tokens: list[str] = []
    tail_tokens: list[str] = []
    type_tokens: list[str] = []
    current = None  # token list currently being filled; None before the first marker

    def flush() -> None:
        """Record the triplet under construction if all three parts are present."""
        head = " ".join(head_tokens)
        relation = " ".join(type_tokens)
        tail = " ".join(tail_tokens)
        if head and relation and tail:
            triplets.append({'head': head, 'type': relation, 'tail': tail})

    for token in text.split():
        if token == "<triplet>":
            # A new head starts: close the pending triplet and forget everything,
            # so a stale tail/relation cannot leak into the next triplet.
            flush()
            head_tokens.clear()
            tail_tokens.clear()
            type_tokens.clear()
            current = head_tokens
        elif token == "<subj>":
            # Same head, new tail: close the pending triplet, keep the head.
            flush()
            tail_tokens.clear()
            type_tokens.clear()
            current = tail_tokens
        elif token == "<obj>":
            type_tokens.clear()
            current = type_tokens
        elif current is not None:
            current.append(token)

    flush()
    return triplets


def get_dbpedia_uri(entity: str, confidence: float = SPOTLIGHT_CONFIDENCE, support: int = SPOTLIGHT_SUPPORT) -> str | None:
    """Ask DBpedia Spotlight for the URI that best matches ``entity``.

    ``entity`` is sent as the text to annotate to ``SPOTLIGHT_ENDPOINT``; among
    the resources returned, the one with the highest ``@similarityScore`` is
    selected and its ``@URI`` returned.

    The lookup is best-effort: network errors, non-200 responses and malformed
    payloads emit a warning and yield ``None`` instead of raising, so callers
    can fall back to a local URI.

    NOTE(review): the best resource may cover only part of ``entity``; the
    notebook output shows "Everest Hinton" resolving to
    http://dbpedia.org/resource/Mount_Everest. Callers needing precision should
    validate the result.

    Args:
        entity: Surface text of the entity to link.
        confidence: Value sent as Spotlight's ``confidence`` parameter.
        support: Value sent as Spotlight's ``support`` parameter.

    Returns:
        The URI of the best-scoring resource, or ``None`` when nothing was
        found or the lookup failed.
    """
    if not entity or not entity.strip():
        # Nothing to annotate: skip the API round-trip entirely.
        return None

    params = {"text": entity, "confidence": confidence, "support": support}
    headers = {"Accept": "application/json"}

    # Throttle every call (API_DELAY) to stay below the public endpoint's rate limit.
    time.sleep(API_DELAY)
    try:
        response = requests.get(SPOTLIGHT_ENDPOINT, params=params, headers=headers, timeout=15)
    except requests.RequestException as exc:
        warnings.warn(f"DBpedia Spotlight request failed for {entity!r}: {exc}")
        return None

    if response.status_code != 200:
        warnings.warn(f"DBpedia Spotlight returned HTTP {response.status_code} for {entity!r}")
        return None

    try:
        resources = response.json().get('Resources') or []
        if not resources:
            # No candidate resource in the response.
            return None
        best = max(resources, key=lambda res: float(res.get('@similarityScore', '0')))
        return best.get('@URI')
    except (ValueError, TypeError, AttributeError) as exc:
        # Body is not JSON, or does not have the expected dict/list structure.
        warnings.warn(f"Unexpected DBpedia Spotlight payload for {entity!r}: {exc}")
        return None


# Characters that must not appear raw in a URI reference; a URIRef containing
# one of them cannot be serialized to Turtle/N3 by rdflib.
_URI_UNSAFE_CHARS = '<>" {}|\\^`'


def _local_uri(name: str) -> URIRef:
    """Mint a URI for ``name`` in the local ``EX`` namespace (spaces -> underscores)."""
    slug = name.replace(" ", "_")
    # Percent-encode only the characters that would make the URI invalid, so
    # names that were already fine keep exactly the same URI as before.
    safe_slug = "".join(quote(ch, safe="") if ch in _URI_UNSAFE_CHARS else ch for ch in slug)
    return EX[safe_slug]


def build_rdf_graph(triplets: list[dict], verbose: bool = True) -> Graph:
    """Build an RDF graph from the extracted triplets.

    Every distinct head/tail is resolved once through ``get_dbpedia_uri``;
    entities without a DBpedia match get a URI in the local ``EX`` namespace.
    Predicates always live in ``EX`` (spaces and hyphens become underscores).

    Args:
        triplets: Dicts with the keys ``'head'``, ``'type'`` and ``'tail'``.
        verbose: When True, print the URI chosen for each entity.

    Returns:
        A graph holding one RDF triple per distinct (subject, predicate, object).
    """
    g = Graph()

    # Unique entities, sorted so the lookup order and the log are reproducible
    # from one run to the next (set iteration order is not).
    entities = sorted({t['head'] for t in triplets} | {t['tail'] for t in triplets})

    if verbose:
        print(f"Recherche d'URIs pour {len(entities)} entités...")

    # Resolve URIs (DBpedia, or local fallback)
    uri_cache = {}
    for entity in entities:
        uri = get_dbpedia_uri(entity)
        if uri:
            uri_cache[entity] = URIRef(uri)
            if verbose:
                print(f"  {entity} -> {uri}")
        else:
            local_uri = _local_uri(entity)
            uri_cache[entity] = local_uri
            if verbose:
                print(f"  {entity} -> {local_uri} (local)")

    # Add the triples to the graph
    for t in triplets:
        subject = uri_cache[t['head']]
        obj = uri_cache[t['tail']]
        predicate = _local_uri(t['type'].replace("-", "_"))
        g.add((subject, predicate, obj))

    return g

## 3. Chargement des données

In [3]:
# Read the whole source text in one go.
with open("texte_a_analyser.txt", mode="r", encoding="utf-8") as f:
    text_source = f.read()

# One block per non-blank line: trim every line, then drop the empty ones.
blocs = list(filter(None, map(str.strip, text_source.split('\n'))))

print(f"Texte découpé en {len(blocs)} blocs (paragraphes).")
print(f"Premier bloc : {blocs[0][:100]}...")

Texte découpé en 9 blocs (paragraphes).
Premier bloc : Geoffrey Everest Hinton (born 6 December 1947) is an English Canadian cognitive psychologist and com...


## 4. Extraction des triplets avec REBEL

In [4]:
# Load the REBEL model (~1.6 GB download on the first run)
triplet_extractor = pipeline(
    task='text2text-generation',
    model='Babelscape/rebel-large',
    tokenizer='Babelscape/rebel-large',
)

Device set to use cuda:0


In [5]:
all_triplets = []

for i, bloc in enumerate(blocs):
    # Run REBEL and keep the raw token ids rather than the decoded text
    generated = triplet_extractor(
        bloc,
        max_length=REBEL_MAX_LENGTH,
        num_beams=REBEL_NUM_BEAMS,
        return_tensors=True,
        return_text=False,
    )
    out = generated[0]["generated_token_ids"]

    # Decode ourselves so the special tokens stay in the text for the parser
    decoded = triplet_extractor.tokenizer.batch_decode([out], skip_special_tokens=False)
    raw_output = decoded[0]

    # Parse the linearized output and accumulate the triplets
    triplets = extract_triplets(raw_output)
    all_triplets += triplets
    print(f"Bloc {i+1}/{len(blocs)} : {len(triplets)} triplets")

print(f"\nTotal : {len(all_triplets)} triplets extraits")

Bloc 1/9 : 5 triplets
Bloc 2/9 : 2 triplets
Bloc 3/9 : 1 triplets
Bloc 4/9 : 2 triplets
Bloc 5/9 : 1 triplets
Bloc 6/9 : 1 triplets
Bloc 7/9 : 2 triplets
Bloc 8/9 : 1 triplets
Bloc 9/9 : 2 triplets

Total : 17 triplets extraits


In [6]:
# Preview of the extracted triplets (one row per triplet: head, type, tail)
pd.DataFrame(all_triplets)

Unnamed: 0,head,type,tail
0,Everest Hinton,date of birth,6 December 1947
1,Everest Hinton,occupation,computer scientist
2,Everest Hinton,field of work,artificial neural network
3,Everest Hinton,employer,University of Toronto
4,artificial neural network,part of,cognitive psychologist
5,AlexNet,discoverer or inventor,Alex Krizhevsky
6,Alex Krizhevsky,notable work,AlexNet
7,Christopher Longuet-Higgins,employer,University of Edinburgh
8,Google,subsidiary,DNNresearch Inc.
9,DNNresearch Inc.,parent organization,Google


## 5. Analyse des paramètres DBpedia Spotlight (optionnel)

Cette section teste différentes valeurs de `confidence` et `support` pour évaluer leur impact sur le linking.

In [7]:
def evaluate_spotlight_params(entities: list[str], param_grid: list[tuple]) -> pd.DataFrame:
    """Measure how Spotlight's parameters affect the entity-linking rate.

    For each ``(confidence, support)`` pair, every entity is looked up with
    ``get_dbpedia_uri`` and counted as linked when the returned URI contains
    ``dbpedia.org/resource/``.

    Args:
        entities: Entity surface forms to look up.
        param_grid: ``(confidence, support)`` pairs to evaluate.

    Returns:
        One row per pair with the columns ``confidence``, ``support``,
        ``entities_tested``, ``linked_dbpedia`` and ``linked_pct`` (a fraction
        in [0, 1], 0.0 when ``entities`` is empty), sorted by
        ``linked_dbpedia`` in descending order. Ties keep the order of
        ``param_grid``. An empty ``param_grid`` yields an empty frame with the
        same columns.
    """
    columns = ["confidence", "support", "entities_tested", "linked_dbpedia", "linked_pct"]
    total = len(entities)
    rows = []

    for conf, supp in param_grid:
        linked = 0
        for entity in entities:
            uri = get_dbpedia_uri(entity, conf, supp)
            if uri and "dbpedia.org/resource/" in uri:
                linked += 1

        rows.append({
            "confidence": conf,
            "support": supp,
            "entities_tested": total,
            "linked_dbpedia": linked,
            # Guard against an empty entity list (would divide by zero).
            "linked_pct": linked / total if total else 0.0,
        })

    # Explicit columns keep sort_values working when param_grid is empty;
    # mergesort is stable, so tied rows stay in param_grid order.
    return pd.DataFrame(rows, columns=columns).sort_values(
        "linked_dbpedia", ascending=False, kind="mergesort"
    )

In [8]:
# Unique entities to test (every head and tail, de-duplicated and sorted)
entities_to_test = sorted({t[role] for t in all_triplets for role in ("head", "tail")})

# Parameter grid: (confidence, support) pairs
param_grid = [(0.20, 0), (0.35, 20), (0.50, 10), (0.70, 50)]

df_params = evaluate_spotlight_params(entities_to_test, param_grid)
df_params

Unnamed: 0,confidence,support,entities_tested,linked_dbpedia,linked_pct
0,0.2,0,23,23,1.0
1,0.35,20,23,16,0.695652
2,0.5,10,23,16,0.695652
3,0.7,50,23,13,0.565217


In [9]:
# LaTeX export for the report
latex_table = df_params.to_latex(
    index=False,
    float_format="%.2f",
    caption="Impact des paramètres confidence et support sur la couverture DBpedia Spotlight.",
    label="tab:spotlight_params",
)
print(latex_table)

\begin{table}
\caption{Impact des paramètres confidence et support sur la couverture DBpedia Spotlight.}
\label{tab:spotlight_params}
\begin{tabular}{rrrrr}
\toprule
confidence & support & entities_tested & linked_dbpedia & linked_pct \\
\midrule
0.20 & 0 & 23 & 23 & 1.00 \\
0.35 & 20 & 23 & 16 & 0.70 \\
0.50 & 10 & 23 & 16 & 0.70 \\
0.70 & 50 & 23 & 13 & 0.57 \\
\bottomrule
\end{tabular}
\end{table}



## 6. Construction du graphe RDF

In [10]:
# Build the RDF graph from every extracted triplet, printing the URI chosen for each entity.
graph = build_rdf_graph(all_triplets, verbose=True)

Recherche d'URIs pour 23 entités...
  Terry Sejnowski -> http://dbpedia.org/resource/Terry_Sejnowski
  Richard Zemel -> http://example.org/Richard_Zemel (local)
  Alex Krizhevsky -> http://example.org/Alex_Krizhevsky (local)
  computer scientist -> http://dbpedia.org/resource/Computer_scientist
  machine learning -> http://dbpedia.org/resource/Machine_learning
  research papers -> http://example.org/research_papers (local)
  University of Toronto -> http://dbpedia.org/resource/Toronto
  open-access -> http://dbpedia.org/resource/Open_access
  cognitive psychologist -> http://dbpedia.org/resource/Cognitive_psychology
  Christopher Longuet-Higgins -> http://dbpedia.org/resource/Christopher_Longuet-Higgins
  neural network -> http://dbpedia.org/resource/Neural_network
  Google -> http://dbpedia.org/resource/Google
  Everest Hinton -> http://dbpedia.org/resource/Mount_Everest
  David E. Rumelhart -> http://dbpedia.org/resource/David_Rumelhart
  DNNresearch Inc. -> http://example.org/DNNres

In [11]:
# Write the graph to disk in Turtle syntax
graph.serialize(destination="graph.ttl", format="turtle")
print("Graphe exporté dans graph.ttl")

Graphe exporté dans graph.ttl


In [12]:
# Display the graph: print its Turtle serialization
print(graph.serialize(format='turtle'))

@prefix ns1: <http://example.org/> .

<http://dbpedia.org/resource/Boltzmann_machine> ns1:discoverer_or_inventor <http://dbpedia.org/resource/Terry_Sejnowski>,
        ns1:David_Ackley .

<http://dbpedia.org/resource/Christopher_Longuet-Higgins> ns1:employer <http://dbpedia.org/resource/Edinburgh> .

<http://dbpedia.org/resource/David_Rumelhart> ns1:educated_at <http://dbpedia.org/resource/Carnegie_Mellon_University> .

<http://dbpedia.org/resource/Mount_Everest> ns1:date_of_birth ns1:6_December_1947 ;
    ns1:employer <http://dbpedia.org/resource/Toronto> ;
    ns1:field_of_work <http://dbpedia.org/resource/Neural_network> ;
    ns1:occupation <http://dbpedia.org/resource/Computer_scientist> .

<http://dbpedia.org/resource/Open_access> ns1:subclass_of ns1:research_papers .

<http://dbpedia.org/resource/Google> ns1:subsidiary <http://example.org/DNNresearch_Inc.> .

<http://dbpedia.org/resource/Neural_network> ns1:part_of <http://dbpedia.org/resource/Cognitive_psychology> ;
    ns1:use

In [13]:
import pandas as pd
from collections import Counter

# --- Quick stats ---
# Collect the distinct subjects, predicates and objects in a single pass.
subjects, preds, objects = set(), set(), set()
for s, p, o in graph:
    subjects.add(s)
    preds.add(p)
    objects.add(o)
nodes = subjects.union(objects)


def is_dbpedia(u):
    """Return True when the node's URI is a DBpedia resource URI."""
    return str(u).startswith("http://dbpedia.org/resource/")


def is_local(u):
    """Return True when the node's URI lives in the local example.org namespace."""
    return str(u).startswith("http://example.org/")


stats = {
    "triples": len(graph),
    "nodes": len(nodes),
    "predicates": len(preds),
    "dbpedia_nodes": sum(1 for n in nodes if is_dbpedia(n)),
    "local_nodes": sum(1 for n in nodes if is_local(n)),
}
df_stats = pd.DataFrame([stats])

latex_stats = df_stats.to_latex(
    index=False,
    caption="Statistiques du graphe RDF généré.",
    label="tab:graph_stats",
)
print(latex_stats)

# --- Naive centrality (degree), to discuss the structure of the graph ---
# Each triple contributes one to its subject and one to its object.
deg = Counter()
for s, p, o in graph:
    deg.update((s, o))

top_nodes = deg.most_common(8)
df_deg = pd.DataFrame([(str(node), degree) for node, degree in top_nodes], columns=["node", "degree"])
df_deg


\begin{table}
\caption{Statistiques du graphe RDF généré.}
\label{tab:graph_stats}
\begin{tabular}{rrrrr}
\toprule
triples & nodes & predicates & dbpedia_nodes & local_nodes \\
\midrule
17 & 22 & 14 & 14 & 8 \\
\bottomrule
\end{tabular}
\end{table}



Unnamed: 0,node,degree
0,http://dbpedia.org/resource/Mount_Everest,4
1,http://dbpedia.org/resource/Neural_network,3
2,http://dbpedia.org/resource/Boltzmann_machine,2
3,http://example.org/Alex_Krizhevsky,2
4,http://example.org/AlexNet,2
5,http://example.org/Richard_Zemel,2
6,http://example.org/Brendan_Frey,2
7,http://example.org/DNNresearch_Inc.,2
