Skip to content

Commit

Permalink
Added dbpedia_spotlight() single task
Browse files Browse the repository at this point in the history
  • Loading branch information
aolieman authored and larsmans committed May 2, 2014
1 parent 0813fd1 commit 1d43bc7
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 3 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ gensim
langid>=1.1.4dev
librabbitmq
nltk
pyspotlight
scikit-learn>=0.13
setuptools>=1.3.2
weighwords
unidecode
weighwords
58 changes: 58 additions & 0 deletions xtas/tasks/single.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from urllib2 import urlopen

import nltk
import spotlight

from .es import fetch
from ..core import app
Expand Down Expand Up @@ -256,3 +257,60 @@ def frog(doc, output='raw'):
if output == 'tokens':
return list(result)
return frog_to_saf(result)


@app.task
def dbpedia_spotlight(doc, lang='en', conf=0.5, supp=0, api_url=None):
    """Run text through a DBpedia Spotlight instance.

    Calls the DBpedia Spotlight instance to perform entity linking and
    returns the names/links it has found.

    Parameters
    ----------
    doc : document
        Document (or fetchable reference) whose text is annotated.
    lang : string
        Two-letter language code used to pick a public endpoint.
        Ignored when api_url is given.
    conf : float
        Minimum confidence for the Spotlight disambiguator.
    supp : int
        Minimum support (number of inlinks) for candidate resources.
    api_url : string, optional
        Base URL of a custom Spotlight REST endpoint; overrides lang.

    Returns
    -------
    Either a list of annotation dicts (each with its 'resource' value
    normalized to a list of candidates), or, on a client error, a dict
    of the form {'error': message}.

    See http://spotlight.dbpedia.org/ for details.
    This task uses a Python client for DBp Spotlight:
    https://github.com/aolieman/pyspotlight
    """
    text = fetch(doc)

    endpoints_by_language = {
        'en': "http://spotlight.sztaki.hu:2222/rest",
        'de': "http://spotlight.sztaki.hu:2226/rest",
        'nl': "http://spotlight.sztaki.hu:2232/rest",
        'fr': "http://spotlight.sztaki.hu:2225/rest",
        'it': "http://spotlight.sztaki.hu:2230/rest",
        'ru': "http://spotlight.sztaki.hu:2227/rest",
        'es': "http://spotlight.sztaki.hu:2231/rest",
        'pt': "http://spotlight.sztaki.hu:2228/rest",
        'hu': "http://spotlight.sztaki.hu:2229/rest",
        'tr': "http://spotlight.sztaki.hu:2235/rest"
    }

    if lang not in endpoints_by_language and not api_url:
        raise ValueError("Not a valid language code: %r" % lang)

    if api_url is None:
        api_url = endpoints_by_language[lang]

    api_url += "/candidates"

    try:
        spotlight_resp = spotlight.candidates(
            api_url, text,
            confidence=conf,
            support=supp,
            spotter='Default'
        )
    except (spotlight.SpotlightException, TypeError) as e:
        # str(e) rather than e.message: BaseException.message was
        # deprecated in Python 2.6 and removed in Python 3.
        return {'error': str(e)}

    # Return a list of annotation dictionaries
    annotations = []
    for annotation in spotlight_resp:
        # Ignore annotations without disambiguation candidates
        if u'resource' in annotation:
            # Always return a list of resources, also for single candidates
            if isinstance(annotation[u'resource'], dict):
                annotation[u'resource'] = [annotation[u'resource']]
            annotations.append(annotation)

    return annotations
34 changes: 32 additions & 2 deletions xtas/tests/test_single.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# coding: utf-8

from nose.tools import assert_equal, assert_in, assert_less, assert_true
from nose.tools import assert_equal, assert_in, assert_less, assert_true, assert_greater

from xtas.tasks import (guess_language, morphy, movie_review_polarity,
stanford_ner_tag, sentiwords_tag, tokenize)
stanford_ner_tag, sentiwords_tag, tokenize,
dbpedia_spotlight)


def test_langid():
Expand Down Expand Up @@ -62,3 +63,32 @@ def test_stanford_ner():
# Stanford doesn't pick up "Academy Award". This is not our fault.
# (XXX devise a better test.)
assert_equal(names, [("Philip Seymour Hoffman", "PERSON")])


def test_dbpedia_spotlight():
    """Annotate an English and a Dutch sentence and check the results.

    Requires network access to the public sztaki.hu Spotlight endpoints.
    """
    en_text = u"Will the efforts of artists like Moby help to preserve the Arctic?"
    nl_text = u"Ik kan me iets herrinneren over de burgemeester van Amstelveen \
en het achterwerk van M\xe1xima. Verder was Koningsdag een zwart gat."

    en_annotations = dbpedia_spotlight(en_text, lang='en')
    nl_annotations = dbpedia_spotlight(nl_text, lang='nl')

    # Expect `Arctic` and `Moby` to be found in en_text
    assert_equal(len(en_annotations), 2)
    for ann in en_annotations:
        assert_in(ann['name'], {'Arctic', 'Moby'})
        # The disambiguation candidates should be of type list
        assert_true(isinstance(ann['resource'], list))
        # In this case, the top candidate's uri == the name
        assert_equal(ann['name'], ann['resource'][0]['uri'])

    # Expect {"burgemeester", "Amstelveen", u"M\xe1xima",
    # "Koningsdag", "zwart gat"} to be found in nl_text
    assert_equal(len(nl_annotations), 5)
    sf_set = set(ann['name'] for ann in nl_annotations)
    assert_equal(sf_set, {u"burgemeester", u"Amstelveen", u"M\xe1xima",
                          u"Koningsdag", u"zwart gat"})
    # Bug fix: iterate the Dutch annotations here (was en_annotations,
    # a copy-paste slip that re-checked the English results).
    for ann in nl_annotations:
        # The disambiguation candidates should be of type list
        assert_true(isinstance(ann['resource'], list))
        # There should be at least one candidate.
        # Bug fix: compare the list's *length* to 0; comparing the list
        # itself to an int is a TypeError on Python 3 and meaningless on 2.
        assert_greater(len(ann['resource']), 0)

0 comments on commit 1d43bc7

Please sign in to comment.