Skip to content

Commit

Permalink
Added dbpedia_spotlight() single task
Browse files Browse the repository at this point in the history
  • Loading branch information
aolieman authored and larsmans committed May 2, 2014
1 parent 0813fd1 commit 1d43bc7
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 3 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ gensim
langid>=1.1.4dev
librabbitmq
nltk
pyspotlight
scikit-learn>=0.13
setuptools>=1.3.2
weighwords
unidecode
weighwords
58 changes: 58 additions & 0 deletions xtas/tasks/single.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from urllib2 import urlopen

import nltk
import spotlight

from .es import fetch
from ..core import app
Expand Down Expand Up @@ -256,3 +257,60 @@ def frog(doc, output='raw'):
if output == 'tokens':
return list(result)
return frog_to_saf(result)


@app.task
def dbpedia_spotlight(doc, lang='en', conf=0.5, supp=0, api_url=None):
    """Run text through a DBpedia Spotlight instance.

    Calls the DBpedia Spotlight instance to perform entity linking and
    returns the names/links it has found.

    Parameters
    ----------
    doc : document
        Document (or fetchable reference) whose text is annotated.
    lang : string
        Two-letter language code used to pick a public endpoint.
        Ignored when api_url is given.
    conf : float
        Minimum confidence for the Spotlight disambiguator.
    supp : int
        Minimum support (number of inlinks) for candidate resources.
    api_url : string, optional
        Base URL of a custom Spotlight REST endpoint; overrides lang.

    Returns
    -------
    Either a list of annotation dicts (each with its 'resource' value
    normalized to a list of candidates), or, on a client error, a dict
    of the form {'error': message}.

    See http://spotlight.dbpedia.org/ for details.
    This task uses a Python client for DBp Spotlight:
    https://github.com/aolieman/pyspotlight
    """
    text = fetch(doc)

    endpoints_by_language = {
        'en': "http://spotlight.sztaki.hu:2222/rest",
        'de': "http://spotlight.sztaki.hu:2226/rest",
        'nl': "http://spotlight.sztaki.hu:2232/rest",
        'fr': "http://spotlight.sztaki.hu:2225/rest",
        'it': "http://spotlight.sztaki.hu:2230/rest",
        'ru': "http://spotlight.sztaki.hu:2227/rest",
        'es': "http://spotlight.sztaki.hu:2231/rest",
        'pt': "http://spotlight.sztaki.hu:2228/rest",
        'hu': "http://spotlight.sztaki.hu:2229/rest",
        'tr': "http://spotlight.sztaki.hu:2235/rest"
    }

    if lang not in endpoints_by_language and not api_url:
        raise ValueError("Not a valid language code: %r" % lang)

    if api_url is None:
        api_url = endpoints_by_language[lang]

    api_url += "/candidates"

    try:
        spotlight_resp = spotlight.candidates(
            api_url, text,
            confidence=conf,
            support=supp,
            spotter='Default'
        )
    except (spotlight.SpotlightException, TypeError) as e:
        # str(e) rather than e.message: BaseException.message was
        # deprecated in Python 2.6 and removed in Python 3.
        return {'error': str(e)}

    # Return a list of annotation dictionaries
    annotations = []
    for annotation in spotlight_resp:
        # Ignore annotations without disambiguation candidates
        if u'resource' in annotation:
            # Always return a list of resources, also for single candidates
            if isinstance(annotation[u'resource'], dict):
                annotation[u'resource'] = [annotation[u'resource']]
            annotations.append(annotation)

    return annotations
34 changes: 32 additions & 2 deletions xtas/tests/test_single.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# coding: utf-8

from nose.tools import assert_equal, assert_in, assert_less, assert_true
from nose.tools import assert_equal, assert_in, assert_less, assert_true, assert_greater

from xtas.tasks import (guess_language, morphy, movie_review_polarity,
stanford_ner_tag, sentiwords_tag, tokenize)
stanford_ner_tag, sentiwords_tag, tokenize,
dbpedia_spotlight)


def test_langid():
Expand Down Expand Up @@ -62,3 +63,32 @@ def test_stanford_ner():
# Stanford doesn't pick up "Academy Award". This is not our fault.
# (XXX devise a better test.)
assert_equal(names, [("Philip Seymour Hoffman", "PERSON")])


def test_dbpedia_spotlight():
    """Annotate an English and a Dutch sentence and check the results.

    Requires network access to the public sztaki.hu Spotlight endpoints.
    """
    en_text = u"Will the efforts of artists like Moby help to preserve the Arctic?"
    nl_text = u"Ik kan me iets herrinneren over de burgemeester van Amstelveen \
en het achterwerk van M\xe1xima. Verder was Koningsdag een zwart gat."

    en_annotations = dbpedia_spotlight(en_text, lang='en')
    nl_annotations = dbpedia_spotlight(nl_text, lang='nl')

    # Expect `Arctic` and `Moby` to be found in en_text
    assert_equal(len(en_annotations), 2)
    for ann in en_annotations:
        assert_in(ann['name'], {'Arctic', 'Moby'})
        # The disambiguation candidates should be of type list
        assert_true(isinstance(ann['resource'], list))
        # In this case, the top candidate's uri == the name
        assert_equal(ann['name'], ann['resource'][0]['uri'])

    # Expect {"burgemeester", "Amstelveen", u"M\xe1xima",
    # "Koningsdag", "zwart gat"} to be found in nl_text
    assert_equal(len(nl_annotations), 5)
    sf_set = set(ann['name'] for ann in nl_annotations)
    assert_equal(sf_set, {u"burgemeester", u"Amstelveen", u"M\xe1xima",
                          u"Koningsdag", u"zwart gat"})
    # Bug fix: iterate the Dutch annotations here (was en_annotations,
    # a copy-paste slip that re-checked the English results).
    for ann in nl_annotations:
        # The disambiguation candidates should be of type list
        assert_true(isinstance(ann['resource'], list))
        # There should be at least one candidate.
        # Bug fix: compare the list's *length* to 0; comparing the list
        # itself to an int is a TypeError on Python 3 and meaningless on 2.
        assert_greater(len(ann['resource']), 0)

0 comments on commit 1d43bc7

Please sign in to comment.