In [None]:
# Import

from transformers import pipeline

import requests
import time
from rdflib import Graph, URIRef, Namespace

import re
from datetime import datetime
from rdflib import Literal
from rdflib.namespace import XSD

In [None]:
# Import du texte

with open("texte_a_analyser.txt", "r", encoding="utf-8") as f:
    text_source = f.read()

# Découpage : on sépare par les sauts de ligne (\n) et on enlève les vides
blocs = [b.strip() for b in text_source.split('\n') if b.strip()]

print(f"Texte découpé avec succès en {len(blocs)} blocs (paragraphes).")
print(f"Exemple du bloc 1 : {blocs[0]}")

Texte découpé avec succès en 9 blocs (paragraphes).
Exemple du bloc 1 : Geoffrey Everest Hinton (born 6 December 1947) is an English Canadian cognitive psychologist and computer scientist, most noted for his work on artificial neural networks. Since 2013 he divides his time working for Google (Google Brain) and the University of Toronto. In 2017, he cofounded and became the Chief Scientific Advisor of the Vector Institute in Toronto.


In [13]:
# REBEL

# 1. Chargement du pipeline REBEL
# Le premier lancement téléchargera le modèle (~1.6 Go)
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')

# 2. Fonction minimale pour parser la sortie (String -> Liste de Dictionnaires)
def extract_triplets(text):
    triplets = []
    relation, subject, object_ = '', '', ''
    text = text.strip()
    current = 'x'
    # on enlève juste <s>, <pad>, </s> mais on garde <triplet>, <subj>, <obj>
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
    return triplets


# 3. Exécution sur les blocs
print("Démarrage de l'extraction REBEL...\n")
all_triplets = []

for i, bloc in enumerate(blocs):
    # 1) génération : on veut les ids, pas le texte nettoyé
    out = triplet_extractor(
        bloc,
        max_length=512,
        num_beams=3,
        return_tensors=True,
        return_text=False
    )[0]["generated_token_ids"]

    # 2) décodage manuel en gardant les tokens spéciaux
    raw_output = triplet_extractor.tokenizer.batch_decode(
        [out],
        skip_special_tokens=False
    )[0]

    # Debug si tu veux voir la chaîne brute
    print(f"RAW OUTPUT BLOC {i+1} : {repr(raw_output)}")

    # 3) extraction des triplets
    triplets = extract_triplets(raw_output)
    all_triplets.extend(triplets)

    print(f"Bloc {i+1} : {len(triplets)} triplets extraits.")

print(f"\nTerminé ! Total de triplets trouvés : {len(all_triplets)}")


Device set to use cuda:0


Démarrage de l'extraction REBEL...

RAW OUTPUT BLOC 1 : '<s><triplet> Everest Hinton <subj> 6 December 1947 <obj> date of birth <subj> computer scientist <obj> occupation <subj> artificial neural network <obj> field of work <subj> University of Toronto <obj> employer <triplet> artificial neural network <subj> cognitive psychologist <obj> part of</s>'
Bloc 1 : 5 triplets extraits.
RAW OUTPUT BLOC 2 : '<s><triplet> AlexNet <subj> Alex Krizhevsky <obj> discoverer or inventor <triplet> Alex Krizhevsky <subj> AlexNet <obj> notable work</s>'
Bloc 2 : 2 triplets extraits.
RAW OUTPUT BLOC 3 : '<s><triplet> Christopher Longuet-Higgins <subj> University of Edinburgh <obj> employer</s>'
Bloc 3 : 1 triplets extraits.
RAW OUTPUT BLOC 4 : '<s><triplet> Google <subj> DNNresearch Inc. <obj> subsidiary <triplet> DNNresearch Inc. <subj> Google <obj> parent organization</s>'
Bloc 4 : 2 triplets extraits.
RAW OUTPUT BLOC 5 : '<s><triplet> neural network <subj> machine learning <obj> use</s>'
Bloc 5 : 1 tr

In [14]:
def get_dbpedia_uri(text_entity, confidence=0.5, support=20):
    api_url = "https://api.dbpedia-spotlight.org/en/annotate"
    params = {
        "text": text_entity,
        "confidence": confidence,
        "support": support
    }
    headers = {"Accept": "application/json"}
    
    try:
        time.sleep(0.2) 
        response = requests.get(api_url, params=params, headers=headers)
        if response.status_code == 200:
            data = response.json()
            if 'Resources' in data:
                return data['Resources'][0]['@URI']
    except Exception as e:
        print(e)
    
    return None

In [None]:
def to_literal_if_date_or_year(text: str):
    t = text.strip()

    # "1947" / "1978"
    if re.fullmatch(r"\d{4}", t):
        return Literal(t, datatype=XSD.gYear)

    # "6 December 1947"
    try:
        dt = datetime.strptime(t, "%d %B %Y").date()
        return Literal(dt.isoformat(), datatype=XSD.date)
    except Exception:
        return None

In [15]:
g = Graph()
EX = Namespace("http://example.org/")
cache = {}

all_names = set()
for t in all_triplets:
    all_names.add(t['head'])
    all_names.add(t['tail'])

print(f"Recherche URIs pour {len(all_names)} entités...")
for nom in all_names:
    uri = get_dbpedia_uri(nom)
    if uri:
        print(f"{nom} -> {uri}")
        cache[nom] = URIRef(uri)
    else:
        # fallback
        clean_nom = nom.replace(" ", "_")
        local_uri = EX[clean_nom]
        print(f"{nom} -> {local_uri} (Local)")
        cache[nom] = local_uri

print("Ajout des triplets...")
for t in all_triplets:
    s = cache[t['head']]
    o = cache[t['tail']]
    
    # local relation
    rel_clean = t['type'].replace(" ", "_")
    p = EX[rel_clean]
    
    g.add((s, p, o))

print("\n" + "="*40)
print(" RÉSULTAT FINAL (.ttl) ")
print("="*40)
print(g.serialize(format='turtle'))

Recherche URIs pour 23 entités...
neural network -> http://dbpedia.org/resource/Neural_network
Terry Sejnowski -> http://dbpedia.org/resource/Terry_Sejnowski
Carnegie Mellon University -> http://dbpedia.org/resource/Carnegie_Mellon_University
University of Toronto -> http://dbpedia.org/resource/Toronto
David Ackley -> http://example.org/David_Ackley (Local)
AlexNet -> http://example.org/AlexNet (Local)
cognitive psychologist -> http://dbpedia.org/resource/Cognitive_psychology
Alex Krizhevsky -> http://example.org/Alex_Krizhevsky (Local)
Christopher Longuet-Higgins -> http://dbpedia.org/resource/Christopher_Longuet-Higgins
DNNresearch Inc. -> http://example.org/DNNresearch_Inc. (Local)
artificial neural network -> http://dbpedia.org/resource/Neural_network
Google -> http://dbpedia.org/resource/Google
David E. Rumelhart -> http://dbpedia.org/resource/David_Rumelhart
machine learning -> http://dbpedia.org/resource/Machine_learning
6 December 1947 -> http://example.org/6_December_1947 (Loc

In [16]:
# test diff values for conf, support
def test(entities_list):
    scenarios = [
        (0.3, 0),    
        (0.5, 20),  
        (0.8, 100)  
    ]
    sample_entities = list(set(entities_list))[:4] 
    
    for ent in sample_entities:
        for conf, supp in scenarios:
            uri = get_dbpedia_uri(ent, conf, supp)
            res = uri.split('/')[-1] if uri else None
            print(f"{ent}: {uri}")

In [17]:
# build graph
def build_final_graph(triplets):
    g = Graph()
    EX = Namespace("http://example.org/") 
    
    FINAL_CONF = 0.5
    FINAL_SUPP = 20
    
    noms_uniques = set()
    for t in triplets:
        noms_uniques.add(t['head'])
        noms_uniques.add(t['tail'])
        
    test(list(noms_uniques))
        
    uri_cache = {}
    
    for nom in noms_uniques:
        uri = get_dbpedia_uri(nom, FINAL_CONF, FINAL_SUPP)
        if uri:
            uri_cache[nom] = URIRef(uri)
        else:
            clean = nom.replace(" ", "_")
            uri_cache[nom] = EX[clean]

    for t in triplets:
        s = uri_cache[t['head']]
        o = uri_cache[t['tail']]
        
        rel_clean = t['type'].replace(" ", "_").replace("-", "_") # to get local relationships
        p = EX[rel_clean]
        
        g.add((s, p, o))
        
    return g

graph_final = build_final_graph(all_triplets)
graph_final.serialize("graph.ttl", format="turtle")

# print(graph_final.serialize(format='turtle'))

open-access: http://dbpedia.org/resource/Open_access
open-access: http://dbpedia.org/resource/Open_access
open-access: None
neural network: http://dbpedia.org/resource/Neural_network
neural network: http://dbpedia.org/resource/Neural_network
neural network: http://dbpedia.org/resource/Neural_network
Terry Sejnowski: http://dbpedia.org/resource/Terry_Sejnowski
Terry Sejnowski: http://dbpedia.org/resource/Terry_Sejnowski
Terry Sejnowski: None
Carnegie Mellon University: http://dbpedia.org/resource/Carnegie_Mellon_University
Carnegie Mellon University: http://dbpedia.org/resource/Carnegie_Mellon_University
Carnegie Mellon University: http://dbpedia.org/resource/Carnegie_Mellon_University


<Graph identifier=N04dadb4fe85846228573e685afe06758 (<class 'rdflib.graph.Graph'>)>

In [18]:
import time
import pandas as pd
import requests

# 1) Entités uniques à tester (head+tail)
entities = sorted({t["head"] for t in all_triplets} | {t["tail"] for t in all_triplets})

# 2) Petite grille de tests (à ajuster si vous voulez)
param_grid = [
    (0.20, 0),
    (0.35, 20),
    (0.50, 10),
    (0.70, 50),
]

# 3) Pour éviter de spammer l'API : on teste sur un petit échantillon stable
N = min(25, len(entities))
entities_sample = entities[:N]

ENDPOINT = "https://api.dbpedia-spotlight.org/en/annotate"

def spotlight_link(mention: str, confidence: float, support: int):
    """Renvoie une URI DBpedia (string) ou None."""
    try:
        r = requests.get(
            ENDPOINT,
            params={"text": mention, "confidence": confidence, "support": support},
            headers={"Accept": "application/json"},
            timeout=15,
        )
        if r.status_code != 200:
            return None
        data = r.json()
        resources = data.get("Resources", [])
        if not resources:
            return None
        # on prend la ressource la plus "similaire" si dispo
        best = max(resources, key=lambda x: float(x.get("@similarityScore", "0")))
        return best.get("@URI")
    except Exception:
        return None

rows = []
examples = []  # quelques exemples concrets (utile pour le texte du rapport)
for conf, supp in param_grid:
    linked = 0
    local_examples = []
    for e in entities_sample:
        uri = spotlight_link(e, conf, supp)
        time.sleep(0.2)  # évite les 403 si beaucoup d'appels
        ok = uri is not None and "dbpedia.org/resource/" in uri
        linked += int(ok)
        if len(local_examples) < 3:  # garde 2-3 exemples
            local_examples.append((e, uri))
    rows.append({
        "confidence": conf,
        "support": supp,
        "entities_tested": len(entities_sample),
        "linked_dbpedia": linked,
        "linked_pct": linked / len(entities_sample)
    })
    examples.append({"confidence": conf, "support": supp, "examples": local_examples})

df_params = pd.DataFrame(rows).sort_values(["linked_dbpedia", "confidence"], ascending=[False, True])
df_params

# Export (optionnel) + génération LaTeX prête à coller dans le rapport
df_params.to_csv("spotlight_grid_results.csv", index=False)

print(df_params.to_latex(
    index=False,
    float_format="%.2f",
    caption="Impact des paramètres \\texttt{confidence} et \\texttt{support} sur la couverture de DBpedia Spotlight (échantillon d’entités).",
    label="tab:spotlight_params"
))

# Bonus: affiche quelques exemples pour illustrer une erreur/ambiguïté
examples


\begin{table}
\caption{Impact des paramètres \texttt{confidence} et \texttt{support} sur la couverture de DBpedia Spotlight (échantillon d’entités).}
\label{tab:spotlight_params}
\begin{tabular}{rrrrr}
\toprule
confidence & support & entities_tested & linked_dbpedia & linked_pct \\
\midrule
0.20 & 0 & 23 & 23 & 1.00 \\
0.35 & 20 & 23 & 16 & 0.70 \\
0.50 & 10 & 23 & 16 & 0.70 \\
0.70 & 50 & 23 & 13 & 0.57 \\
\bottomrule
\end{tabular}
\end{table}



[{'confidence': 0.2,
  'support': 0,
  'examples': [('6 December 1947', 'http://dbpedia.org/resource/December'),
   ('Alex Krizhevsky', 'http://dbpedia.org/resource/Alex_Krizhevsky'),
   ('AlexNet', 'http://dbpedia.org/resource/AlexNet')]},
 {'confidence': 0.35,
  'support': 20,
  'examples': [('6 December 1947', None),
   ('Alex Krizhevsky', None),
   ('AlexNet', None)]},
 {'confidence': 0.5,
  'support': 10,
  'examples': [('6 December 1947', None),
   ('Alex Krizhevsky', None),
   ('AlexNet', 'http://dbpedia.org/resource/AlexNet')]},
 {'confidence': 0.7,
  'support': 50,
  'examples': [('6 December 1947', None),
   ('Alex Krizhevsky', None),
   ('AlexNet', None)]}]