# TEXTUAL ANNOTATION

In [1]:
import requests # For REST calls
import json # for modelling objects in the JSON format

Now open the file config.json, which contains the key required for making REST requests to the SoBigData server

In [21]:
# Read the configuration file in read-only text mode. The relative path
# "../config.json" points one directory above the current working directory.
with open("../config.json", 'r') as json_file:
    # Parse the whole file as a single JSON object.
    config = json.load(json_file)

# The d4science key that will be used for the REST calls.
KEY = config['d4science_KEY']

In [3]:
# URL that the TagME annotation requests are posted to.
TAGME_ENDPOINT = "https://tagme.d4science.org/tagme/tag"
# Language value sent in the "lang" field of each request payload.
LANG = "en" # Also works in Italian and German

Now create the function that will "wrap" the REST call. It needs a textual input

In [4]:
def query_tagme(text, timeout=60):
    """Annotate `text` with TagME and return the decoded JSON response.

    Parameters:
        text: the text to annotate.
        timeout: seconds to wait for the server before giving up; forwarded
            to `requests.post`. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The JSON body of the response, decoded with `r.json()`.

    Raises:
        Exception: if the server answers with a status code other than 200.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    payload = {"text": text, "gcube-token": KEY, "lang": LANG}
    # Issue a POST HTTP request; the payload travels form-encoded in the body.
    r = requests.post(TAGME_ENDPOINT, data=payload, timeout=timeout)
    if r.status_code != 200:
        # this means something went wrong with the query
        raise Exception("Error on text: {}\n{}".format(text, r.text))
    return r.json()

And now we display the result for a simple textual query. The interesting part, for us, is under the key _annotations_.
This will be a list of annotations containing the following fields:
- **spot (string)**: how the anchor appears in the text.
- **start (int)**: the index of the starting character of the anchor.
- **end (int)**: the index of the ending character of the anchor.
- **link_probability (float ∈ [0, 1])**: number of times that the spot is an anchor in Wikipedia / number of occurrences of the spot in Wikipedia.
- **rho (float ∈ [0, 1])**: semantic coherence of the entity with respect to the query.
- **id (int)**: the Wikipedia identifier of the page (the page can be reached at `https://en.wikipedia.org/?curid=<id>`).
- **title (string)**: title of the Wikipedia page.

In [8]:
# A short sentence to annotate.
short_text = "Italy will not be competing in the 2022 world cup"
# Send it to TagME; `resp` holds the decoded JSON response.
resp = query_tagme(short_text)
# Leaving the value as the last expression makes the notebook display it.
resp

{'test': '5',
 'annotations': [{'spot': 'Italy',
   'start': 0,
   'link_probability': 0.4437723457813263,
   'rho': 0.4525856375694275,
   'end': 5,
   'id': 362466,
   'title': 'Italy national football team'},
  {'spot': 'will',
   'start': 6,
   'link_probability': 0.0036389119923114777,
   'rho': 0.06729841977357864,
   'end': 10,
   'id': 32828260,
   'title': 'Will (2011 film)'},
  {'spot': '2022 world cup',
   'start': 35,
   'link_probability': 0.3492063581943512,
   'rho': 0.3398236632347107,
   'end': 49,
   'id': 17742072,
   'title': '2022 FIFA World Cup'}],
 'time': 35,
 'api': 'tag',
 'lang': 'en',
 'timestamp': '2023-01-02T08:56:34'}

## Handle longer texts / filtering noisy annotations

TagME has been designed for handling short texts, but we also have a way to obtain competitive results on longer ones. 
This requires modifying the window of spots that are checked by TagME when doing disambiguation.

Now open a new text file with a slightly longer text and annotate it with TagME

In [5]:
# Open the longer sample text in read-only mode; the path is relative to the current working directory.
with open("../data/Leonardo.txt", 'r') as long_file:
    # the text is not a json object, it is just plain text, so read it whole with read()
    # NOTE(review): no encoding is passed to open(), so the platform default is used — confirm it matches the file.
    text = long_file.read()
# Display the loaded text.
text

'Leonardo da Vinci was an Italian Renaissance polymath whose areas of interest included invention, painting, sculpting, architecture, science, music, mathematics, engineering, literature, anatomy, geology, astronomy, botany, writing, history, and cartography. \nHe has been variously called the father of palaeontology, ichnology, and architecture, and is widely considered one of the greatest painters of all time. Leonardo is revered for his technological ingenuity. He conceptualised flying machines, a type of armoured fighting vehicle, concentrated solar power, an adding machine, and the double hull.'

Now we will change the tagging function we made before, by adding an optional boolean parameter. If true, this means that the text is long, otherwise it is short and we can avoid changing the window.

In [6]:
def query_tagme(text, long_text=False, timeout=60):
    """Annotate `text` with TagME and return the decoded JSON response.

    Parameters:
        text: the text to annotate.
        long_text: when truthy, the "long_text" request parameter is set to 5
            (the window size); when falsy the parameter is not sent at all.
        timeout: seconds to wait for the server before giving up; forwarded
            to `requests.post`. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The JSON body of the response, decoded with `r.json()`.

    Raises:
        Exception: if the server answers with a status code other than 200.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    payload = {"text": text, "gcube-token": KEY, "lang": LANG}
    if long_text:
        # long_text is False by default; if enabled by the user, we set the window size to 5
        payload["long_text"] = 5
    # The payload travels form-encoded in the body of the POST request.
    r = requests.post(TAGME_ENDPOINT, data=payload, timeout=timeout)
    if r.status_code != 200:
        # Any non-200 answer is treated as a failed query.
        raise Exception("Error on text: {}\n{}".format(text, r.text))
    return r.json()

But how do we deal with noisy annotations? TagME gives us a "content relevance" score in the form of the **Rho-score**.
We can filter the lowest ranked annotations on relevancy to remove noise. A common threshold for this task is 0.3.

In [7]:
# Try changing the min_rho parameter and see how it impacts the returned entities
def get_tagme_entities(tagme_response, min_rho=0.3):
    """Return the page titles of the annotations that pass the rho filter.

    Parameters:
        tagme_response: mapping with an "annotations" entry; every annotation
            is read through its "rho" key and, when present, its "title" key.
        min_rho: threshold; only annotations with rho strictly greater than
            this value are kept.

    Returns:
        A list of titles in the order of the annotations. Annotations that
        have no "title" key are skipped.
    """
    titles = []
    for annotation in tagme_response["annotations"]:
        # Discard annotations whose rho is not strictly above the threshold.
        if not annotation["rho"] > min_rho:
            continue
        # Only annotations that carry a title contribute to the result.
        if "title" in annotation:
            titles.append(annotation["title"])
    return titles

Now see which entities _disappear_ when filtering

In [8]:
print("BEFORE FILTERING")
# Annotate the long text with the long_text option switched on.
resp = query_tagme(text, long_text=True) 
# Titles of all returned annotations, in response order; annotations without a "title" key are skipped.
before_filtering = [a["title"] for a in resp['annotations'] if "title" in a]
before_filtering

BEFORE FILTERING


['Leonardo da Vinci',
 'Leonardo da Vinci',
 'Italian Renaissance',
 'Polymath',
 'Attention',
 'Invention',
 'Painting',
 'Sculpture',
 'Architecture',
 'Science',
 'Music and mathematics',
 'Engineering',
 'Literature',
 'Anatomy',
 'Geology',
 'Astronomy',
 'Botany',
 'Writing',
 'History',
 'Cartography',
 'Clergy',
 'Paleontology',
 'Ichnology',
 'Architecture',
 'Neoplatonism',
 'Greatest!',
 'Painting',
 'Time (magazine)',
 'Leonardo da Vinci',
 'Canonization',
 'Technology',
 'Ingenuity',
 'Concept',
 'Flying Machines s.r.o.',
 'Granite',
 'Stellar classification',
 'Armoured fighting vehicle',
 'Concentrated solar power',
 'Adding machine',
 'Double hull']

In [9]:
print("AFTER FILTERING")
# Filter the response stored in `resp` with the default min_rho threshold (0.3).
after_filtering = get_tagme_entities(resp)
after_filtering

AFTER FILTERING


['Leonardo da Vinci',
 'Leonardo da Vinci',
 'Italian Renaissance',
 'Polymath',
 'Music and mathematics',
 'Geology',
 'Astronomy',
 'Botany',
 'Cartography',
 'Paleontology',
 'Ichnology',
 'Armoured fighting vehicle',
 'Concentrated solar power',
 'Adding machine']

In [10]:
print("The annotations that were filtered out are:")
# Keep the titles from before_filtering that do not occur anywhere in after_filtering.
# NOTE(review): the comparison is by title, so a title that still occurs in after_filtering
# is never listed here, even if one of its duplicate annotations was removed by the rho filter.
[a for a in before_filtering if a not in after_filtering]

The annotations that were filtered out are:


['Attention',
 'Invention',
 'Painting',
 'Sculpture',
 'Architecture',
 'Science',
 'Engineering',
 'Literature',
 'Anatomy',
 'Writing',
 'History',
 'Clergy',
 'Architecture',
 'Neoplatonism',
 'Greatest!',
 'Painting',
 'Time (magazine)',
 'Canonization',
 'Technology',
 'Ingenuity',
 'Concept',
 'Flying Machines s.r.o.',
 'Granite',
 'Stellar classification',
 'Double hull']

# TRY OTHER ANNOTATORS: SWAT

TagME is not the only available annotator. There are several more, each one with its own strengths and weaknesses.
Most of them are listed on [this page](https://sobigdata.d4science.org/web/tagme/service-overview) of the SoBigData Infrastructure.

**SWAT** is specifically a salient entity linker, which works best on long, well-constructed texts.
The fields returned are:
- **salience_class (int)**: 1 if the entity is deemed salient, 0 otherwise
- **salience_score (float ∈ [0, 1])**: the saliency of the entity in the text (similar to the rho-score in TagME)
- **spans (list)**: list of the occurrences of this entity in the text; each occurrence is described by:
    - *start (int)*: the index of the starting character of the anchor
    - *end (int)*: the index of the ending character of the anchor
- **wiki_id (int)**: the Wikipedia identifier of the page
- **wiki_title (string)**: title of the Wikipedia page

In [12]:
# URL of the SWAT annotator on the SoBigData Infrastructure; SWAT requests are posted here
SWAT_ENDPOINT = "https://swat.d4science.org/salience"

# SWAT also requires a title of the content
def query_swat(title, content, timeout=60):
    """Annotate a titled document with SWAT and return its annotations.

    Parameters:
        title: title of the document.
        content: body text of the document.
        timeout: seconds to wait for the server before giving up; forwarded
            to `requests.post`. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The value stored under the "annotations" key of the JSON response.

    Raises:
        Exception: if the server answers with a status code other than 200.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # The document is sent as a JSON string in the request body, while the
    # key goes in the query string.
    document = json.dumps({"title": title, "content": content})
    r = requests.post(SWAT_ENDPOINT, data=document, params={'gcube-token': KEY}, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Error on article titled: {}\n{}".format(title, r.text))
    return r.json()["annotations"]

# Annotate the long text and display only the first 7 annotations.
query_swat("Leonardo da Vinci", text)[:7]

[{'salience_class': 1.0,
  'salience_score': 0.9471508264541626,
  'spans': [{'end': 17, 'start': 0}, {'end': 422, 'start': 414}],
  'wiki_id': 18079,
  'wiki_title': 'Leonardo_da_Vinci'},
 {'salience_class': 1.0,
  'salience_score': 0.5190669894218445,
  'spans': [{'end': 32, 'start': 25}],
  'wiki_id': 14532,
  'wiki_title': 'Italy'},
 {'salience_class': 1.0,
  'salience_score': 0.5682003498077393,
  'spans': [{'end': 44, 'start': 33}],
  'wiki_id': 25532,
  'wiki_title': 'Renaissance'},
 {'salience_class': 0.0,
  'salience_score': 0.4803982079029083,
  'spans': [{'end': 65, 'start': 60}],
  'wiki_id': 9630,
  'wiki_title': 'Ecology'},
 {'salience_class': 0.0,
  'salience_score': 0.35197311639785767,
  'spans': [{'end': 77, 'start': 69}],
  'wiki_id': 146738,
  'wiki_title': 'Interest'},
 {'salience_class': 0.0,
  'salience_score': 0.42167073488235474,
  'spans': [{'end': 96, 'start': 87}],
  'wiki_id': 44312,
  'wiki_title': 'Invention'},
 {'salience_class': 1.0,
  'salience_score':

# RELATEDNESS
Ok but now that I have entities, how do I deal with them? I need to know which are similar and which are not.
If we have no way of "dealing with the entities", how do we unlock their full potential? How is this method more powerful than dealing with generic words as tokens?

There are several ways in which we can obtain the relatedness of couples of entities.
The main one that is shown in this notebook is by querying TagME itself. TagME has an internal relatedness computation framework, so I can ask TagME itself how close two entities are to one another. This metric is computed directly on the Wikipedia Knowledge Graph.

In [13]:
# URL that the relatedness requests are posted to
ENDPOINT_RELATEDNESS = "https://tagme.d4science.org/tagme/rel"

# In case I need efficiency I can do batch queries of 100 couples per HTTP call
def query_relatedness(e1, e2, timeout=60):
    """Ask TagME for the relatedness of two entities given by title.

    Parameters:
        e1: title of the first entity.
        e2: title of the second entity.
        timeout: seconds to wait for the server before giving up; forwarded
            to `requests.post`. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The JSON body of the response, decoded with `r.json()`.

    Raises:
        Exception: if the server answers with a status code other than 200.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # Entities require underscores in place of the spaces. The single space separates entity one from entity two
    tt = e1.replace(" ", "_") + " " + e2.replace(" ", "_")
    payload = {"tt": tt, "gcube-token": KEY, "lang": LANG}
    r = requests.post(ENDPOINT_RELATEDNESS, data=payload, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Error on relatedness computation: {}\n{}".format(tt, r.text))
    return r.json()

Now let's test the relatedness of three entities. 
Two are closely related to one another (biology and biotechnology).
The last one is completely out of context.

In [14]:
# Query the relatedness of the three possible pairs of entities.
first = query_relatedness("Biology", "Biotechnology")
second = query_relatedness("Barack Obama", "Biotechnology")
thirds = query_relatedness("Barack Obama", "Biology")
# Print only the 'result' field of each response, in query order.
for response in (first, second, thirds):
    print(response['result'])

[{'couple': 'Biology Biotechnology', 'rel': 0.6070536971092224}]
[{'couple': 'Barack_Obama Biotechnology', 'rel': 0.23863035440444946}]
[{'couple': 'Barack_Obama Biology', 'rel': 0.16491788625717163}]


# WIKIPEDIA2VEC
This next section requires some additional setup and loading a large model file. I will just show you how it works; there is no need for you to execute it.

In [16]:
import json
from wikipedia2vec import Wikipedia2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Path of the model file passed to Wikipedia2Vec.load below.
# NOTE(review): this is an absolute path on a local D: drive — adjust it before running on another machine.
MODEL_FILE = r"D:\wiki\enwiki_20180420_100d.pkl"

# Load the model into `wiki2vec`, which the helper functions below query.
# NOTE(review): the .pkl extension suggests a pickle-based format — only load model files from a trusted source.
wiki2vec = Wikipedia2Vec.load(MODEL_FILE)

In [17]:
def get_entity_vector(e):
    """Return the embedding that the loaded model holds for entity `e`.

    Parameters:
        e: title of the entity to look up.

    Returns:
        Whatever `wiki2vec.get_entity_vector(e)` returns for that title.

    Raises:
        Exception: if the lookup fails; the original error is attached as
            the cause.
    """
    try:
        emb = wiki2vec.get_entity_vector(e)
    except Exception as err:
        # Catch Exception instead of using a bare except, so that
        # KeyboardInterrupt and SystemExit are not turned into a misleading
        # "not found" error. The original error is chained explicitly.
        raise Exception("Entity vector {} not found\n".format(e)) from err
    return emb

def similarity(v1, v2):
    """Return the cosine similarity between two vectors as a single value.

    Parameters:
        v1: first vector (anything `np.array` accepts).
        v2: second vector (anything `np.array` accepts).
    """
    # Each vector is reshaped into a one-row matrix before being handed to
    # cosine_similarity.
    row_a, row_b = (np.array(vec).reshape(1, -1) for vec in (v1, v2))
    pairwise = cosine_similarity(row_a, row_b)
    # With one row on each side, the value of interest is the entry at [0][0].
    return pairwise[0][0]

In [18]:
# Pair each entity title with its embedding, so the title stays attached to the vector.
v1 = ("Barack Obama", get_entity_vector("Barack Obama"))
v2 = ("Biology", get_entity_vector("Biology"))
v3 = ("Biotechnology", get_entity_vector("Biotechnology"))
# Display the first (title, vector) pair.
v1

('Barack Obama',
 memmap([-1.7208770e-01,  6.8366393e-02, -2.2114021e-01,  1.8625689e-01,
         -1.6881050e-01, -1.0273114e+00, -1.7403726e-01,  6.5132761e-01,
         -1.1579229e+00, -1.1234688e-02, -6.3258129e-01, -3.6312324e-01,
          1.8983840e+00, -1.0301901e+00, -6.0025400e-01, -7.8052205e-01,
          2.5215951e-01, -2.7664509e-01, -5.1084882e-01,  1.2842940e-01,
          6.3571817e-01,  8.1280574e-02, -1.2716837e+00,  4.2106557e-01,
         -2.0525175e-01,  9.8252594e-02,  1.3547261e-01, -5.7749820e-01,
          2.9801649e-01,  1.4131395e+00, -7.3676556e-01, -1.0151949e+00,
         -1.1703007e-01,  1.2873930e+00, -2.9190008e-02, -2.9518047e-01,
          1.4377959e-01,  3.0795303e-01,  1.4484618e+00,  1.7310138e-01,
          1.4899757e-03, -8.5064566e-01, -1.7244501e-01, -7.6880109e-01,
         -9.8339975e-01,  3.2403290e-01, -6.4912087e-01,  3.7392426e-01,
         -3.1706643e-01,  3.8999528e-01,  7.0731849e-03,  4.8704663e-01,
          8.8876688e-01, -8.457925

In [19]:
from itertools import combinations

print("======================================================================")
# Compare every unordered pair of the three (title, vector) tuples.
for (title_a, vec_a), (title_b, vec_b) in combinations([v1, v2, v3], 2):
    score = similarity(vec_a, vec_b)
    print("Cosine similarity between {} and {} is {:.2f}".format(title_a, title_b, score))

Cosine similarity between Barack Obama and Biology is 0.19
Cosine similarity between Barack Obama and Biotechnology is 0.16
Cosine similarity between Biology and Biotechnology is 0.52
