In [1]:
text = """The Semantic Web, sometimes known as Web 3.0 (not to be confused with Web3), is an extension of the World Wide Web through standards set by the World Wide Web 
Consortium (W3C). The goal of the Semantic Web is to make Internet data machine-readable.
To enable the encoding of semantics with the data, technologies such as Resource Description Framework (RDF) and Web Ontology Language (OWL) are used. 
These technologies are used to formally represent metadata. For example, ontology can describe concepts, relationships between entities, and categories of things. 
These embedded semantics offer significant advantages such as reasoning over data and operating with heterogeneous data sources."""

In [2]:
import requests

result = requests.post("http://localhost:2223/rest/annotate", data="text=Barack Obama was the president of the United States", headers={"Accept": "application/json"})

In [3]:
result

<Response [200]>

In [4]:
import json
json.loads(result.text)

{'@text': 'Barack Obama was the president of the United States',
 '@confidence': '0.5',
 '@support': '0',
 '@types': '',
 '@sparql': '',
 '@policy': 'whitelist',
 'Resources': [{'@URI': 'http://dbpedia.org/resource/Barack_Obama',
   '@support': '28423',
   '@types': 'Http://xmlns.com/foaf/0.1/Person,Wikidata:Q82955,Wikidata:Q729,Wikidata:Q5,Wikidata:Q215627,Wikidata:Q19088,DUL:NaturalPerson,Schema:Person,DBpedia:Species,DBpedia:Person,DBpedia:Eukaryote,DBpedia:Animal,DBpedia:Politician',
   '@surfaceForm': 'Barack Obama',
   '@offset': '0',
   '@similarityScore': '0.99996873173175',
   '@percentageOfSecondRank': '1.485152359637468E-5'},
  {'@URI': 'http://dbpedia.org/resource/United_States',
   '@support': '468823',
   '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country',
   '@surfaceForm': 'United States',
   '@offset': '38',
   '@similarityScore': '0.999784144780593',
   '@percentageOfSecondRank': '1.361731517236

In [5]:
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(text)

len(sentences)

6

In [8]:
results = []
len_sen = 0
for sen in sentences:
    results.append((requests.post("http://localhost:2223/rest/annotate", data="text=" + sen, headers={"Accept": "application/json"}), len_sen))
    len_sen += len(sen)

In [10]:
results[3][0].text

'{"@text":"These technologies are used to formally represent metadata.","@confidence":"0.5","@support":"0","@types":"","@sparql":"","@policy":"whitelist","Resources":[{"@URI":"http://dbpedia.org/resource/Metadata","@support":"4697","@types":"","@surfaceForm":"metadata","@offset":"50","@similarityScore":"0.9999931626148548","@percentageOfSecondRank":"3.5285671842322437E-6"}]}'

In [11]:
text[50+results[3][1]:58+results[3][1]]

'ent meta'

In [14]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

sen_offsets = []

for start, end in PunktSentenceTokenizer().span_tokenize(text):
    sen_offsets.append((start, end))

In [15]:
results_2 = []

for offset in sen_offsets:
    results_2.append((requests.post("http://localhost:2223/rest/annotate", data="text=" + text[offset[0]:offset[1]], headers={"Accept": "application/json"}), offset[0]))

In [16]:
results_2[3]

(<Response [200]>, 402)

In [17]:
results_2[1][0].text

'{"@text":"The goal of the Semantic Web is to make Internet data machine-readable.","@confidence":"0.5","@support":"0","@types":"","@sparql":"","@policy":"whitelist","Resources":[{"@URI":"http://dbpedia.org/resource/Semantic_Web","@support":"949","@types":"","@surfaceForm":"Semantic Web","@offset":"16","@similarityScore":"0.9999999999997442","@percentageOfSecondRank":"2.508075182659647E-13"},{"@URI":"http://dbpedia.org/resource/Internet","@support":"31775","@types":"","@surfaceForm":"Internet","@offset":"40","@similarityScore":"0.9998837214983619","@percentageOfSecondRank":"4.836831675667598E-5"},{"@URI":"http://dbpedia.org/resource/Machine-readable_data","@support":"139","@types":"","@surfaceForm":"machine-readable","@offset":"54","@similarityScore":"0.9999425826464632","@percentageOfSecondRank":"4.5397267845539746E-5"}]}'

In [24]:
outputs = [(json.loads(result[0].text), result[1]) for result in results_2 if result[0].text != ""]

In [25]:
outputs[0]

({'@text': 'The Semantic Web, sometimes known as Web 3.0 (not to be confused with Web3), is an extension of the World Wide Web through standards set by the World Wide Web \nConsortium (W3C).',
  '@confidence': '0.5',
  '@support': '0',
  '@types': '',
  '@sparql': '',
  '@policy': 'whitelist',
  'Resources': [{'@URI': 'http://dbpedia.org/resource/Semantic_Web',
    '@support': '949',
    '@types': '',
    '@surfaceForm': 'Semantic Web',
    '@offset': '4',
    '@similarityScore': '1.0',
    '@percentageOfSecondRank': '1.882965518287145E-19'},
   {'@URI': 'http://dbpedia.org/resource/Semantic_Web',
    '@support': '949',
    '@types': '',
    '@surfaceForm': 'Web 3.0',
    '@offset': '37',
    '@similarityScore': '0.9999999943622981',
    '@percentageOfSecondRank': '5.636783315961742E-9'},
   {'@URI': 'http://dbpedia.org/resource/Web3',
    '@support': '144',
    '@types': '',
    '@surfaceForm': 'Web3',
    '@offset': '70',
    '@similarityScore': '0.9999999999946283',
    '@percentage

In [27]:
sp_offsets = []
for output in outputs:
    if 'Resources' in output[0].keys():
        for res in output[0]['Resources']:
            sp_offsets.append((
                res['@surfaceForm'],
                int(res['@offset']) + output[1],
                int(res['@offset']) + output[1] + len(res['@surfaceForm']),
                res['@URI']
            ))

In [28]:
sp_offsets[3]

('World Wide Web', 100, 114, 'http://dbpedia.org/resource/World_Wide_Web')

In [29]:
text[100:114]

'World Wide Web'