# Text2Graph -

In [1]:
!pip install transformers
!pip install requests
!pip install sparqlwrapper
!pip install rdflib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
text = "Punta Cana is a resort town in the municipality of Higuey, in La Altagracia Province, the eastern most province of the Dominican Republic"

## Rebel -> Relation Extraction By End-to-end Language generation


In [17]:
from transformers import pipeline

triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')
# We need to use the tokenizer manually since we need special tokens.
extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])
#print(extracted_text[0])


In [18]:
extracted_text[0]

'<s><triplet> Punta Cana <subj> La Altagracia Province <obj> located in the administrative territorial entity <subj> Dominican Republic <obj> country <triplet> Higuey <subj> La Altagracia Province <obj> located in the administrative territorial entity <subj> Dominican Republic <obj> country <triplet> La Altagracia Province <subj> Dominican Republic <obj> country <triplet> Dominican Republic <subj> La Altagracia Province <obj> contains administrative territorial entity</s>'

Function to parse the generated text and extract the triplets

In [19]:
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets

In [43]:
extracted_text

['<s><triplet> Punta Cana <subj> La Altagracia Province <obj> located in the administrative territorial entity <subj> Dominican Republic <obj> country <triplet> Higuey <subj> La Altagracia Province <obj> located in the administrative territorial entity <subj> Dominican Republic <obj> country <triplet> La Altagracia Province <subj> Dominican Republic <obj> country <triplet> Dominican Republic <subj> La Altagracia Province <obj> contains administrative territorial entity</s>']

In [44]:
extracted_triplets = extract_triplets(extracted_text[0])

In [45]:
print(extracted_triplets)

[{'head': 'Punta Cana', 'type': 'located in the administrative territorial entity', 'tail': 'La Altagracia Province'}, {'head': 'Punta Cana', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'Higuey', 'type': 'located in the administrative territorial entity', 'tail': 'La Altagracia Province'}, {'head': 'Higuey', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'La Altagracia Province', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'Dominican Republic', 'type': 'contains administrative territorial entity', 'tail': 'La Altagracia Province'}]


## DB Pedia Spotlight

In [23]:
import requests

url ="https://api.dbpedia-spotlight.org/en/annotate"
headers={'accept':'application/json'}

DBPedia request

In [24]:
resp = requests.get(url, headers=headers, params={"text": text})
#resp.content

Link REBEL extracted entities and DBPedia

In [25]:
# convert json response to dict
data = resp.json()
data

{'@text': 'Punta Cana is a resort town in the municipality of Higuey, in La Altagracia Province, the eastern most province of the Dominican Republic',
 '@confidence': '0.5',
 '@support': '0',
 '@types': '',
 '@sparql': '',
 '@policy': 'whitelist',
 'Resources': [{'@URI': 'http://dbpedia.org/resource/Punta_Cana',
   '@support': '304',
   '@types': 'Wikidata:Q486972,Schema:Place,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Settlement',
   '@surfaceForm': 'Punta Cana',
   '@offset': '0',
   '@similarityScore': '0.999999999949921',
   '@percentageOfSecondRank': '5.007240534660874E-11'},
  {'@URI': 'http://dbpedia.org/resource/Municipality',
   '@support': '25116',
   '@types': '',
   '@surfaceForm': 'municipality',
   '@offset': '35',
   '@similarityScore': '0.9998466212392793',
   '@percentageOfSecondRank': '6.390964934336406E-5'},
  {'@URI': 'http://dbpedia.org/resource/Higüey,_Dominican_Republic',
   '@support': '66',
   '@types': 'Wikidata:Q486972,Schema:Place,DBpedia:

In [26]:
# linking REBEL and DBPedia information
Graph_dict = extracted_triplets
for resource in data["Resources"]:
  for triplet in Graph_dict:
    if resource["@surfaceForm"] == triplet["head"]:
      triplet["head"] = resource
    elif resource["@surfaceForm"] == triplet["tail"]:
      triplet["tail"] = resource

print(Graph_dict)

[{'head': {'@URI': 'http://dbpedia.org/resource/Punta_Cana', '@support': '304', '@types': 'Wikidata:Q486972,Schema:Place,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Settlement', '@surfaceForm': 'Punta Cana', '@offset': '0', '@similarityScore': '0.999999999949921', '@percentageOfSecondRank': '5.007240534660874E-11'}, 'type': 'located in the administrative territorial entity', 'tail': {'@URI': 'http://dbpedia.org/resource/La_Altagracia_Province', '@support': '88', '@types': 'Wikidata:Q3455524,Schema:Place,Schema:AdministrativeArea,DBpedia:Region,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:AdministrativeRegion', '@surfaceForm': 'La Altagracia Province', '@offset': '62', '@similarityScore': '1.0', '@percentageOfSecondRank': '0.0'}}, {'head': {'@URI': 'http://dbpedia.org/resource/Punta_Cana', '@support': '304', '@types': 'Wikidata:Q486972,Schema:Place,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Settlement', '@surfaceForm': 'Punta Cana', 

## Wikidata - property linking
Using Sparqlwrapper to perform Sparql queries

In [27]:
from SPARQLWrapper import SPARQLWrapper, JSON

def SparqlQuery(text):
  sparql = SPARQLWrapper(
      "https://query.wikidata.org/sparql"
  )
  sparql.setReturnFormat(JSON)

  query1 = "SELECT ?p\nWHERE\n{\n?p rdfs:label "
  query2 = "@en.\n}"
  sparql.setQuery(query1 + '"' + text + '"' + query2)
  try:
    ret = sparql.queryAndConvert()
    #print(ret)

    # works for "p" values extracted from wikidata
    uri = ret["results"]["bindings"][0]["p"]["value"]

   #for r in ret["results"]["bindings"]:
    #    print(r)

    info = {
        "uri" : uri,
        "label" : text
    }

    return info

  except Exception as e:
    print(e)

    return 0

In [28]:
# Test SPARQL Query
text1 = "located in the administrative territorial entity"
prop = SparqlQuery(text1)
print(prop)

{'uri': 'http://www.wikidata.org/entity/P131', 'label': 'located in the administrative territorial entity'}


In [29]:
# link Graph with Wikidata properties
for triplet in Graph_dict:
  #print(triplet)
  rebelProperty = triplet["type"]
  print(rebelProperty)
  relation = SparqlQuery(rebelProperty)
  if relation == 0:
    # ??REMOVE the triplets were the wikidata Property is not found??
    print("ERROR! FIX SPARQL query function")
    print(triplet)
    Graph_dict.remove(triplet)
    print("Triplet removed")
  else:
    #print(relation)
    triplet["type"] = relation
    #print(triplet)


located in the administrative territorial entity
country
located in the administrative territorial entity
country
country
contains administrative territorial entity
list index out of range
ERROR! FIX SPARQL query function
{'head': {'@URI': 'http://dbpedia.org/resource/Dominican_Republic', '@support': '17140', '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country', '@surfaceForm': 'Dominican Republic', '@offset': '119', '@similarityScore': '0.9999999999989768', '@percentageOfSecondRank': '6.99467264687938E-13'}, 'type': 'contains administrative territorial entity', 'tail': {'@URI': 'http://dbpedia.org/resource/La_Altagracia_Province', '@support': '88', '@types': 'Wikidata:Q3455524,Schema:Place,Schema:AdministrativeArea,DBpedia:Region,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:AdministrativeRegion', '@surfaceForm': 'La Altagracia Province', '@offset': '62', '@similarityScore': '1.0', '@percentageOfSe

Triplets Obtained

In [30]:
Graph_dict

[{'head': {'@URI': 'http://dbpedia.org/resource/Punta_Cana',
   '@support': '304',
   '@types': 'Wikidata:Q486972,Schema:Place,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Settlement',
   '@surfaceForm': 'Punta Cana',
   '@offset': '0',
   '@similarityScore': '0.999999999949921',
   '@percentageOfSecondRank': '5.007240534660874E-11'},
  'type': {'uri': 'http://www.wikidata.org/entity/P131',
   'label': 'located in the administrative territorial entity'},
  'tail': {'@URI': 'http://dbpedia.org/resource/La_Altagracia_Province',
   '@support': '88',
   '@types': 'Wikidata:Q3455524,Schema:Place,Schema:AdministrativeArea,DBpedia:Region,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:AdministrativeRegion',
   '@surfaceForm': 'La Altagracia Province',
   '@offset': '62',
   '@similarityScore': '1.0',
   '@percentageOfSecondRank': '0.0'}},
 {'head': {'@URI': 'http://dbpedia.org/resource/Punta_Cana',
   '@support': '304',
   '@types': 'Wikidata:Q486972,Schema:Plac

## Convert Dictionary to RDF, Turtle or XML

Build Graph with RDFLib

In [31]:
#print(Graph[0]["type"])

#print(Graph[0]["tail"]["@surfaceForm"])

wikidata = "http://www.wikidata.org/entity/"
schema = "https://schema.org/"
dbpedia = "https://dbpedia.org/ontology/"

#print(Graph_dict[0])

#def replaceUriPrefix(node):
  #types_head = node["head"]["@types"]
  #types_head = types_head.replace("Wikidata:", wikidata)
  #types_head = types_head.replace("Schema:", schema)
  #types_head = types_head.replace("DBpedia:" , dbpedia)
  #types_head = types_head.split(",")

  #types_tail = node["tail"]["@types"]
  #types_tail = types_tail.replace("Wikidata:", wikidata)
  #types_tail = types_tail.replace("Schema:", schema)
  #types_tail = types_tail.replace("DBpedia:" , dbpedia)
  #types_tail = types_tail.split(",")


def replaceUriPrefix(types):
  # heads replacements
  types = types.split(",")
  wiki_flag = 0
  schema_flag = 0
  dbpedia_flag = 0
  new_types = []
  for ht in types:
    if "Wikidata" in ht and wiki_flag == 0:
      new_types.append(ht.replace("Wikidata:", wikidata))
      wiki_flag = 1
    elif "Schema:" in ht and schema_flag == 0:
      new_types.append(ht.replace("Schema:", schema))
      schema_flag = 1
    elif "DBpedia:" in ht and dbpedia_flag == 0:
      new_types.append(ht.replace("DBpedia:" , dbpedia))
      dbpedia_flag = 1

  return new_types

  #print(node["head"]["@types"])
  #types_head = node["head"]["@types"]
  #types_head = types_head.replace("Wikidata:", wikidata)
  #types_head = types_head.replace("Schema:", schema)
  #types_head = types_head.replace("DBpedia:" , dbpedia)
  #types_head = types_head.split(",")
  #print(types_head)

  # tails replacements
  #print(node["tail"]["@types"])
  #types_tail = node["tail"]["@types"]
  #types_tail = types_tail.replace("Wikidata:", wikidata)
  #types_tail = types_tail.replace("Schema:", schema)
  #types_tail = types_tail.replace("DBpedia:" , dbpedia)
  #types_tail = types_tail.split(",")
  #print(types_tail)


  #return types_head, types_tail

In [None]:
from rdflib import Graph, URIRef, Literal, BNode
from rdflib import Graph
from rdflib.namespace import RDFS, RDF
# Create a Graph
g = Graph()

wikidata = "http://www.wikidata.org/entity/"
schema = "https://schema.org/"
dbpedia = "https://dbpedia.org/ontology/"

g.bind("wikidata", wikidata)
g.bind ("dbo" , dbpedia)
g.bind ("dbr" , "http://dbpedia.org/resource")



# Create an RDF URI node to use as the subject for multiple triples
sdllod = URIRef("http://example.org/sdllod#")

for node in Graph_dict:

  types_head = replaceUriPrefix(node["head"]["@types"])
  types_tail = replaceUriPrefix(node["tail"]["@types"])
  print(types_head, types_tail)

  head_uri = URIRef(node["head"]["@URI"])
  print('head_uri', head_uri)
  head_label = Literal(node["head"]["@surfaceForm"])
  print('head_label', head_label)

  tail_uri = URIRef(node["tail"]["@URI"])
  print('tail_uri', tail_uri)
  tail_label = Literal(node["tail"]["@surfaceForm"])
  print('tail_label', tail_label)

  relation_uri = URIRef(node["type"]["uri"])
  print('relation_uri', relation_uri)
  relation_label = Literal(node["type"]["label"])
  print('relation_label', relation_label)

  # general relation
  g.add((head_uri, relation_uri , tail_uri))

  #head
  g.add((head_uri, RDFS.label, head_label))
  for tp in types_head:
    print(tp)
    tp = URIRef(tp)
    g.add((head_uri, RDF.type, tp))

  #tail
  g.add((tail_uri, RDFS.label, tail_label))

  for tp in types_tail:
    tp = URIRef(tp)
    g.add((head_uri, RDF.type, tp))

  # relation label
  g.add((relation_uri, RDFS.label, relation_label))






In [40]:
for stmt in g:
  print(stmt)

(rdflib.term.URIRef('http://dbpedia.org/resource/La_Altagracia_Province'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.wikidata.org/entity/Q6256'))
(rdflib.term.URIRef('http://dbpedia.org/resource/La_Altagracia_Province'), rdflib.term.URIRef('http://www.wikidata.org/entity/P17'), rdflib.term.URIRef('http://dbpedia.org/resource/Dominican_Republic'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Higüey,_Dominican_Republic'), rdflib.term.URIRef('http://www.wikidata.org/entity/P131'), rdflib.term.URIRef('http://dbpedia.org/resource/La_Altagracia_Province'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Higüey,_Dominican_Republic'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('https://schema.org/Place'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Higüey,_Dominican_Republic'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.

In [41]:
print(g.serialize(format="turtle"))
output = g.serialize(destination="punta_cana.ttl", format="ttl")

@prefix dbo: <https://dbpedia.org/ontology/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <https://schema.org/> .
@prefix wikidata: <http://www.wikidata.org/entity/> .

<http://dbpedia.org/resource/Higüey,_Dominican_Republic> a wikidata:Q3455524,
        wikidata:Q486972,
        wikidata:Q6256,
        dbo:PopulatedPlace,
        dbo:Region,
        schema:Place ;
    rdfs:label "Higuey" ;
    wikidata:P131 <http://dbpedia.org/resource/La_Altagracia_Province> ;
    wikidata:P17 <http://dbpedia.org/resource/Dominican_Republic> .

<http://dbpedia.org/resource/Punta_Cana> a wikidata:Q3455524,
        wikidata:Q486972,
        wikidata:Q6256,
        dbo:PopulatedPlace,
        dbo:Region,
        schema:Place ;
    rdfs:label "Punta Cana" ;
    wikidata:P131 <http://dbpedia.org/resource/La_Altagracia_Province> ;
    wikidata:P17 <http://dbpedia.org/resource/Dominican_Republic> .

wikidata:P131 rdfs:label "located in the administrative territorial entity" .

w