In [1]:
pip install transformers torch rdflib requests

Collecting transformers
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting torch
  Using cached torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting rdflib
  Using cached rdflib-7.5.0-py3-none-any.whl.metadata (12 kB)
Collecting requests
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers)
  Using cached numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp312-cp

In [2]:
import requests
import time
from rdflib import Graph, URIRef, Namespace

In [1]:
with open("texte_a_analyser.txt", "r", encoding="utf-8") as f:
    text_source = f.read()

# Découpage : on sépare par les doubles sauts de ligne (\n\n) et on enlève les vides
blocs = [b.strip() for b in text_source.split('\n') if b.strip()]

print(f"Texte découpé avec succès en {len(blocs)} blocs (paragraphes).")
print(f"Exemple du bloc 1 : {blocs[0]}")

# REBEL

from transformers import pipeline
import pprint

# 1. Chargement du pipeline REBEL
# Le premier lancement téléchargera le modèle (~1.6 Go)
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')

# 2. Fonction minimale pour parser la sortie (String -> Liste de Dictionnaires)
def extract_triplets(text):
    triplets = []
    relation, subject, object_ = '', '', ''
    text = text.strip()
    current = 'x'
    # on enlève juste <s>, <pad>, </s> mais on garde <triplet>, <subj>, <obj>
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
    return triplets


# 3. Exécution sur les blocs
print("Démarrage de l'extraction REBEL...\n")
all_triplets = []

for i, bloc in enumerate(blocs):
    # 1) génération : on veut les ids, pas le texte nettoyé
    out = triplet_extractor(
        bloc,
        max_length=512,
        num_beams=3,
        return_tensors=True,
        return_text=False
    )[0]["generated_token_ids"]

    # 2) décodage manuel en gardant les tokens spéciaux
    raw_output = triplet_extractor.tokenizer.batch_decode(
        [out],
        skip_special_tokens=False
    )[0]

    # Debug si tu veux voir la chaîne brute
    print(f"RAW OUTPUT BLOC {i+1} : {repr(raw_output)}")

    # 3) extraction des triplets
    triplets = extract_triplets(raw_output)
    all_triplets.extend(triplets)

    print(f"Bloc {i+1} : {len(triplets)} triplets extraits.")

print(f"\nTerminé ! Total de triplets trouvés : {len(all_triplets)}")
pprint.pprint(all_triplets)


  from .autonotebook import tqdm as notebook_tqdm


Texte découpé avec succès en 9 blocs (paragraphes).
Exemple du bloc 1 : Geoffrey Everest Hinton (born 6 December 1947) is an English Canadian cognitive psychologist and computer scientist, most noted for his work on artificial neural networks. Since 2013 he divides his time working for Google (Google Brain) and the University of Toronto. In 2017, he cofounded and became the Chief Scientific Advisor of the Vector Institute in Toronto.


Device set to use cuda:0


Démarrage de l'extraction REBEL...

RAW OUTPUT BLOC 1 : '<s><triplet> Everest Hinton <subj> 6 December 1947 <obj> date of birth <subj> computer scientist <obj> occupation <subj> artificial neural network <obj> field of work <subj> University of Toronto <obj> employer <triplet> artificial neural network <subj> cognitive psychologist <obj> part of</s>'
Bloc 1 : 5 triplets extraits.
RAW OUTPUT BLOC 2 : '<s><triplet> AlexNet <subj> Alex Krizhevsky <obj> discoverer or inventor <triplet> Alex Krizhevsky <subj> AlexNet <obj> notable work</s>'
Bloc 2 : 2 triplets extraits.
RAW OUTPUT BLOC 3 : '<s><triplet> Christopher Longuet-Higgins <subj> University of Edinburgh <obj> employer</s>'
Bloc 3 : 1 triplets extraits.
RAW OUTPUT BLOC 4 : '<s><triplet> Google <subj> DNNresearch Inc. <obj> subsidiary <triplet> DNNresearch Inc. <subj> Google <obj> parent organization</s>'
Bloc 4 : 2 triplets extraits.
RAW OUTPUT BLOC 5 : '<s><triplet> neural network <subj> machine learning <obj> use</s>'
Bloc 5 : 1 tr

In [2]:
def get_dbpedia_uri(text_entity, confidence=0.5, support=20):
    api_url = "https://api.dbpedia-spotlight.org/en/annotate"
    params = {
        "text": text_entity,
        "confidence": confidence,
        "support": support
    }
    headers = {"Accept": "application/json"}
    
    try:
        time.sleep(0.2) 
        response = requests.get(api_url, params=params, headers=headers)
        if response.status_code == 200:
            data = response.json()
            if 'Resources' in data:
                return data['Resources'][0]['@URI']
    except Exception as e:
        print(e)
    
    return None

In [3]:
# test diff values for conf, support
def test(entities_list):
    scenarios = [
        (0.3, 0),    
        (0.5, 20),  
        (0.8, 100)  
    ]
    sample_entities = list(set(entities_list))[:4] 
    
    for ent in sample_entities:
        for conf, supp in scenarios:
            uri = get_dbpedia_uri(ent, conf, supp)
            res = uri.split('/')[-1] if uri else None
            print(f"{ent}: {uri}")

In [4]:
# build graph
def build_final_graph(triplets):
    g = Graph()
    EX = Namespace("http://example.org/") 
    
    FINAL_CONF = 0.5
    FINAL_SUPP = 20
    
    noms_uniques = set()
    for t in triplets:
        noms_uniques.add(t['head'])
        noms_uniques.add(t['tail'])
        
    test(list(noms_uniques))
        
    uri_cache = {}
    
    for nom in noms_uniques:
        uri = get_dbpedia_uri(nom, FINAL_CONF, FINAL_SUPP)
        if uri:
            uri_cache[nom] = URIRef(uri)
        else:
            clean = nom.replace(" ", "_")
            uri_cache[nom] = EX[clean]

    for t in triplets:
        s = uri_cache[t['head']]
        o = uri_cache[t['tail']]
        
        rel_clean = t['type'].replace(" ", "_").replace("-", "_") # to get local relationships
        p = EX[rel_clean]
        
        g.add((s, p, o))
        
    return g



In [5]:
# just to try
triplets_input = [
    {'head': 'Geoffrey Hinton', 'type': 'works at', 'tail': 'Google'},
    {'head': 'Geoffrey Hinton', 'type': 'born in', 'tail': '1947'},
    {'head': 'AlexNet', 'type': 'designed by', 'tail': 'Alex Krizhevsky'},
    {'head': 'University of Toronto', 'type': 'employer', 'tail': 'Geoffrey Hinton'},
    {'head': 'Vector Institute', 'type': 'located in', 'tail': 'Toronto'},
    {'head': 'Deep Learning', 'type': 'field of work', 'tail': 'Yann LeCun'}
]

In [None]:
unique_names = list(set([t['head'] for t in all_triplets] + [t['tail'] for t in all_triplets]))

In [8]:
graph_final = build_final_graph(triplets_input)

University of Toronto: http://dbpedia.org/resource/University
University of Toronto: http://dbpedia.org/resource/Toronto
University of Toronto: http://dbpedia.org/resource/Toronto
Vector Institute: http://dbpedia.org/resource/State_Research_Center_of_Virology_and_Biotechnology_VECTOR
Vector Institute: http://dbpedia.org/resource/State_Research_Center_of_Virology_and_Biotechnology_VECTOR
Vector Institute: None
Geoffrey Hinton: http://dbpedia.org/resource/Geoffrey_Hinton
Geoffrey Hinton: http://dbpedia.org/resource/Geoffrey_Hinton
Geoffrey Hinton: http://dbpedia.org/resource/Geoffrey_Hinton
Toronto: http://dbpedia.org/resource/Toronto
Toronto: http://dbpedia.org/resource/Toronto
Toronto: http://dbpedia.org/resource/Toronto


In [9]:
print(graph_final.serialize(format='turtle'))

@prefix ns1: <http://example.org/> .

<http://dbpedia.org/resource/Deep_learning> ns1:field_of_work <http://dbpedia.org/resource/Yann_LeCun> .

<http://dbpedia.org/resource/State_Research_Center_of_Virology_and_Biotechnology_VECTOR> ns1:located_in <http://dbpedia.org/resource/Toronto> .

ns1:AlexNet ns1:designed_by ns1:Alex_Krizhevsky .

<http://dbpedia.org/resource/Geoffrey_Hinton> ns1:born_in ns1:1947 ;
    ns1:works_at <http://dbpedia.org/resource/Google> .

<http://dbpedia.org/resource/Toronto> ns1:employer <http://dbpedia.org/resource/Geoffrey_Hinton> .


