# Text2Graph

In [23]:
!pip install transformers
!pip install requests
!pip install sparqlwrapper
!pip install rdflib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [149]:
#text = "Punta Cana is a resort town in the municipality of Higuey, in La Altagracia Province, the eastern most province of the Dominican Republic"
text = "Zagreb is the capital of Croatia, a beautiful country from Eastern Europe."

## Rebel -> Relation Extraction By End-to-end Language generation


In [150]:
from transformers import pipeline

triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')
extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])

In [151]:
extracted_text[0]

'<s><triplet> Zagreb <subj> capital <obj> instance of <subj> Croatia <obj> country <triplet> Croatia <subj> Zagreb <obj> capital</s>'

Function to parse the generated text and extract the triplets

In [152]:
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets

In [153]:
extracted_triplets = extract_triplets(extracted_text[0])

In [154]:
print(extracted_triplets)

[{'head': 'Zagreb', 'type': 'instance of', 'tail': 'capital'}, {'head': 'Zagreb', 'type': 'country', 'tail': 'Croatia'}, {'head': 'Croatia', 'type': 'capital', 'tail': 'Zagreb'}]


## DB Pedia Spotlight

In [155]:
import requests

url ="https://api.dbpedia-spotlight.org/en/annotate"
headers={'accept':'application/json'}

DBPedia request

In [156]:
resp = requests.get(url, headers=headers, params={"text": text})

Link REBEL extracted entities and DBPedia

In [157]:
# convert json response to dict
data = resp.json()
data

{'@text': 'Zagreb is the capital of Croatia, a beautiful country from\xa0Eastern Europe.',
 '@confidence': '0.5',
 '@support': '0',
 '@types': '',
 '@sparql': '',
 '@policy': 'whitelist',
 'Resources': [{'@URI': 'http://dbpedia.org/resource/Zagreb',
   '@support': '17279',
   '@types': 'Wikidata:Q515,Wikidata:Q486972,Schema:Place,Schema:City,DBpedia:Settlement,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:City',
   '@surfaceForm': 'Zagreb',
   '@offset': '0',
   '@similarityScore': '0.9514172491266707',
   '@percentageOfSecondRank': '0.0404535610038901'},
  {'@URI': 'http://dbpedia.org/resource/Croatia',
   '@support': '37175',
   '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country',
   '@surfaceForm': 'Croatia',
   '@offset': '25',
   '@similarityScore': '0.9836082105363896',
   '@percentageOfSecondRank': '0.007004428583582536'},
  {'@URI': 'http://dbpedia.org/resource/Eastern_Europe',
   '@support

In [164]:
# linking REBEL and DBPedia information
Graph_dict = extracted_triplets
for resource in data["Resources"]:
  for triplet in Graph_dict:
    if resource["@surfaceForm"] == triplet["head"]:
      triplet["head"] = resource
    elif resource["@surfaceForm"] == triplet["tail"]:
      triplet["tail"] = resource

print(Graph_dict)

[{'head': {'@URI': 'http://dbpedia.org/resource/Zagreb', '@support': '17279', '@types': 'Wikidata:Q515,Wikidata:Q486972,Schema:Place,Schema:City,DBpedia:Settlement,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:City', '@surfaceForm': 'Zagreb', '@offset': '0', '@similarityScore': '0.9514172491266707', '@percentageOfSecondRank': '0.0404535610038901'}, 'type': 'instance of', 'tail': 'capital'}, {'head': {'@URI': 'http://dbpedia.org/resource/Zagreb', '@support': '17279', '@types': 'Wikidata:Q515,Wikidata:Q486972,Schema:Place,Schema:City,DBpedia:Settlement,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:City', '@surfaceForm': 'Zagreb', '@offset': '0', '@similarityScore': '0.9514172491266707', '@percentageOfSecondRank': '0.0404535610038901'}, 'type': 'country', 'tail': {'@URI': 'http://dbpedia.org/resource/Croatia', '@support': '37175', '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country', '@

In [165]:
# remove nodes whithout DPpedia correspondences
Graph_aux = Graph_dict
Graph_dict = []
for node in Graph_aux:
  if "@URI" in node["head"] and "@URI" in node["tail"]:
    Graph_dict.append(node)

print(Graph_dict)

[{'head': {'@URI': 'http://dbpedia.org/resource/Zagreb', '@support': '17279', '@types': 'Wikidata:Q515,Wikidata:Q486972,Schema:Place,Schema:City,DBpedia:Settlement,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:City', '@surfaceForm': 'Zagreb', '@offset': '0', '@similarityScore': '0.9514172491266707', '@percentageOfSecondRank': '0.0404535610038901'}, 'type': 'country', 'tail': {'@URI': 'http://dbpedia.org/resource/Croatia', '@support': '37175', '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country', '@surfaceForm': 'Croatia', '@offset': '25', '@similarityScore': '0.9836082105363896', '@percentageOfSecondRank': '0.007004428583582536'}}, {'head': {'@URI': 'http://dbpedia.org/resource/Croatia', '@support': '37175', '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country', '@surfaceForm': 'Croatia', '@offset': '25', '@similarityScore': '0.9

## Wikidata - property linking
Using Sparqlwrapper to perform Sparql queries

In [166]:
from SPARQLWrapper import SPARQLWrapper, JSON

def SparqlQuery(text):
  sparql = SPARQLWrapper(
      "https://query.wikidata.org/sparql"
  )
  sparql.setReturnFormat(JSON)

  query1 = "SELECT ?p\nWHERE\n{\n?p rdfs:label "
  query2 = "@en.\n}"
  sparql.setQuery(query1 + '"' + text + '"' + query2)
  try:
    ret = sparql.queryAndConvert()

    # works for "p" values extracted from wikidata
    uri = ret["results"]["bindings"][0]["p"]["value"]

    info = {
        "uri" : uri,
        "label" : text
    }

    return info

  except Exception as e:
    print(e)

    return 0

In [167]:
# Simple test of SPARQL Query
text1 = "located in the administrative territorial entity"
prop = SparqlQuery(text1)
print(prop)

{'uri': 'http://www.wikidata.org/entity/P131', 'label': 'located in the administrative territorial entity'}


In [168]:
# link Graph with Wikidata properties
for triplet in Graph_dict:
  rebelProperty = triplet["type"]
  relation = SparqlQuery(rebelProperty)
  if relation == 0:
    # REMOVE the triplets were the wikidata Property is not found
    print("ERROR! FIX SPARQL query function")
    print(triplet)
    Graph_dict.remove(triplet)
    print("Triplet removed")
  else:
    triplet["type"] = relation

Triplets Obtained

In [169]:
Graph_dict

[{'head': {'@URI': 'http://dbpedia.org/resource/Zagreb',
   '@support': '17279',
   '@types': 'Wikidata:Q515,Wikidata:Q486972,Schema:Place,Schema:City,DBpedia:Settlement,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:City',
   '@surfaceForm': 'Zagreb',
   '@offset': '0',
   '@similarityScore': '0.9514172491266707',
   '@percentageOfSecondRank': '0.0404535610038901'},
  'type': {'uri': 'http://www.wikidata.org/entity/P17', 'label': 'country'},
  'tail': {'@URI': 'http://dbpedia.org/resource/Croatia',
   '@support': '37175',
   '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country',
   '@surfaceForm': 'Croatia',
   '@offset': '25',
   '@similarityScore': '0.9836082105363896',
   '@percentageOfSecondRank': '0.007004428583582536'}},
 {'head': {'@URI': 'http://dbpedia.org/resource/Croatia',
   '@support': '37175',
   '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,

## Convert Dictionary to RDF using RDFLIB

In [170]:
wikidata = "http://www.wikidata.org/entity/"
schema = "https://schema.org/"
dbpedia = "https://dbpedia.org/ontology/"


def replaceUriPrefix(types):
  types = types.split(",")
  wiki_flag = 0
  schema_flag = 0
  dbpedia_flag = 0
  new_types = []
  for ht in types:
    if "Wikidata" in ht and wiki_flag == 0:
      new_types.append(ht.replace("Wikidata:", wikidata))
      wiki_flag = 1
    elif "Schema:" in ht and schema_flag == 0:
      new_types.append(ht.replace("Schema:", schema))
      schema_flag = 1
    elif "DBpedia:" in ht and dbpedia_flag == 0:
      new_types.append(ht.replace("DBpedia:" , dbpedia))
      dbpedia_flag = 1

  return new_types

In [171]:
from rdflib import Graph, URIRef, Literal, BNode
from rdflib import Graph
from rdflib.namespace import RDFS, RDF
# Create a Graph
g = Graph()

wikidata = "http://www.wikidata.org/entity/"
schema = "https://schema.org/"
dbpedia = "https://dbpedia.org/ontology/"

g.bind("wikidata", wikidata)
g.bind ("dbo" , dbpedia)
g.bind ("dbr" , "http://dbpedia.org/resource")

# Create an RDF URI node to use as the subject for multiple triples
sdllod = URIRef("http://example.org/sdllod#")

for node in Graph_dict:

  types_head = replaceUriPrefix(node["head"]["@types"])
  types_tail = replaceUriPrefix(node["tail"]["@types"])

  head_uri = URIRef(node["head"]["@URI"])
  head_label = Literal(node["head"]["@surfaceForm"])

  tail_uri = URIRef(node["tail"]["@URI"])
  tail_label = Literal(node["tail"]["@surfaceForm"])

  relation_uri = URIRef(node["type"]["uri"])
  relation_label = Literal(node["type"]["label"])

  # general relation
  g.add((head_uri, relation_uri , tail_uri))

  #head
  g.add((head_uri, RDFS.label, head_label))
  for tp in types_head:
    tp = URIRef(tp)
    g.add((head_uri, RDF.type, tp))

  #tail
  g.add((tail_uri, RDFS.label, tail_label))

  for tp in types_tail:
    tp = URIRef(tp)
    g.add((head_uri, RDF.type, tp))

  # relation label
  g.add((relation_uri, RDFS.label, relation_label))

In [1]:
#for stmt in g:
#  print(stmt)

In [172]:
print(g.serialize(format="turtle"))
#output = g.serialize(destination="punta_cana.ttl", format="ttl")

@prefix dbo: <https://dbpedia.org/ontology/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <https://schema.org/> .
@prefix wikidata: <http://www.wikidata.org/entity/> .

wikidata:P17 rdfs:label "country" .

wikidata:P36 rdfs:label "capital" .

<http://dbpedia.org/resource/Croatia> a wikidata:Q515,
        wikidata:Q6256,
        dbo:PopulatedPlace,
        dbo:Settlement,
        schema:Place ;
    rdfs:label "Croatia" ;
    wikidata:P36 <http://dbpedia.org/resource/Zagreb> .

<http://dbpedia.org/resource/Zagreb> a wikidata:Q515,
        wikidata:Q6256,
        dbo:PopulatedPlace,
        dbo:Settlement,
        schema:Place ;
    rdfs:label "Zagreb" ;
    wikidata:P17 <http://dbpedia.org/resource/Croatia> .


