# TEXTUAL ANNOTATION

In [1]:
import requests
import json

In [2]:
# TagMe annotation endpoint and the language code used in the TagMe requests.
TAGME_ENDPOINT = "https://tagme.d4science.org/tagme/tag"
LANG = "en"

# The service token is not written in the notebook: it is read from config.json.
with open("config.json") as json_file:
    config = json.load(json_file)
# The handle is only needed while parsing; the lookup happens once it is closed.
KEY = config["d4science_KEY"]

In [3]:
def query_tagme(text, timeout=30):
    """Annotate ``text`` with the TagMe service.

    Sends a POST request to ``TAGME_ENDPOINT`` whose form fields are the
    text itself, the ``gcube-token`` taken from ``KEY`` and the language
    code ``LANG``.

    Parameters
    ----------
    text : str
        The text to annotate.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.post``.
        ``requests`` applies no timeout by default, so without it a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    Exception
        If the HTTP status code is not 200. The message contains the
        submitted text followed by the response body.
    """
    payload = {"text": text, "gcube-token": KEY, "lang": LANG}
    r = requests.post(TAGME_ENDPOINT, data=payload, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Error on text: {}\n{}".format(text, r.text))
    return r.json()

In [4]:
# Try the annotator on one short sentence; the bare `resp` on the last line
# makes the notebook display the full response.
resp = query_tagme("Italy will not be competing in the 2022 world cup")
resp

{'test': '5',
 'annotations': [{'spot': 'Italy',
   'start': 0,
   'link_probability': 0.4437723457813263,
   'rho': 0.4525856375694275,
   'end': 5,
   'id': 362466,
   'title': 'Italy national football team'},
  {'spot': 'will',
   'start': 6,
   'link_probability': 0.0036389119923114777,
   'rho': 0.06729841977357864,
   'end': 10,
   'id': 32828260,
   'title': 'Will (2011 film)'},
  {'spot': '2022 world cup',
   'start': 35,
   'link_probability': 0.3492063581943512,
   'rho': 0.3398236632347107,
   'end': 49,
   'id': 17742072,
   'title': '2022 FIFA World Cup'}],
 'time': 26,
 'api': 'tag',
 'lang': 'en',
 'timestamp': '2022-11-25T11:56:03'}

## Handle longer texts / filtering noisy annotations

In [5]:
# Read the whole sample document into a single string.
with open("Leonardo.txt") as long_file:
    text = long_file.read()
# Bare expression so the notebook echoes what was loaded.
text

'Leonardo da Vinci was an Italian Renaissance polymath whose areas of interest included invention, painting, sculpting, architecture, science, music, mathematics, engineering, literature, anatomy, geology, astronomy, botany, writing, history, and cartography. \nHe has been variously called the father of palaeontology, ichnology, and architecture, and is widely considered one of the greatest painters of all time. Leonardo is revered for his technological ingenuity. He conceptualised flying machines, a type of armoured fighting vehicle, concentrated solar power, an adding machine, and the double hull.'

In [6]:
def query_tagme(text, long_text=False, timeout=30):
    """Annotate ``text`` with the TagMe service, optionally in long-text mode.

    Sends a POST request to ``TAGME_ENDPOINT`` whose form fields are the
    text itself, the ``gcube-token`` taken from ``KEY`` and the language
    code ``LANG``.

    Parameters
    ----------
    text : str
        The text to annotate.
    long_text : bool, optional
        When truthy, the extra form field ``long_text=5`` is added to the
        request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.post``.
        ``requests`` applies no timeout by default, so without it a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    Exception
        If the HTTP status code is not 200. The message contains the
        submitted text followed by the response body.
    """
    payload = {"text": text, "gcube-token": KEY, "lang": LANG}
    if long_text:
        # NOTE(review): 5 is presumably the context-window size TagMe uses
        # for long documents -- confirm against the TagMe API documentation.
        payload["long_text"] = 5
    r = requests.post(TAGME_ENDPOINT, data=payload, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Error on text: {}\n{}".format(text, r.text))
    return r.json()

In [7]:
def get_tagme_entities(tagme_response, min_rho=0.3):
    """Return the titles of the annotations whose ``rho`` exceeds ``min_rho``.

    Parameters
    ----------
    tagme_response : dict
        A response holding a list of annotation dicts under ``"annotations"``.
    min_rho : float, optional
        Exclusive lower bound on an annotation's ``"rho"`` value.

    Returns
    -------
    list
        The ``"title"`` of every annotation that passes the threshold, in
        response order. Annotations without a ``"title"`` key are skipped.
    """
    titles = []
    for annotation in tagme_response["annotations"]:
        # The rho test comes first, exactly as in a filter-then-project pass.
        if annotation["rho"] > min_rho and "title" in annotation:
            titles.append(annotation["title"])
    return titles

In [10]:
# Show the effect of the rho threshold: first every annotated title, then
# only the ones kept by get_tagme_entities (default min_rho).
print("BEFORE FILTERING")
resp = query_tagme(text, long_text=True)
# Unfiltered view: every annotation that carries a title.
print([a["title"] for a in resp['annotations'] if "title" in a])
print("==========================================================")
print("AFTER FILTERING")
# Last expression of the cell, so the notebook displays the filtered list.
get_tagme_entities(resp)

BEFORE FILTERING
['Leonardo da Vinci', 'Leonardo da Vinci', 'Italian Renaissance', 'Polymath', 'Attention', 'Invention', 'Painting', 'Sculpture', 'Architecture', 'Science', 'Music and mathematics', 'Engineering', 'Literature', 'Anatomy', 'Geology', 'Astronomy', 'Botany', 'Writing', 'History', 'Cartography', 'Clergy', 'Paleontology', 'Ichnology', 'Architecture', 'Neoplatonism', 'Greatest!', 'Painting', 'Time (magazine)', 'Leonardo da Vinci', 'Canonization', 'Technology', 'Ingenuity', 'Concept', 'Flying Machines s.r.o.', 'Granite', 'Stellar classification', 'Armoured fighting vehicle', 'Concentrated solar power', 'Adding machine', 'Double hull']
AFTER FILTERING


['Leonardo da Vinci',
 'Leonardo da Vinci',
 'Italian Renaissance',
 'Polymath',
 'Music and mathematics',
 'Geology',
 'Astronomy',
 'Botany',
 'Cartography',
 'Paleontology',
 'Ichnology',
 'Armoured fighting vehicle',
 'Concentrated solar power',
 'Adding machine']

# TRY OTHER ANNOTATORS: SWAT

In [11]:
SWAT_ENDPOINT = "https://swat.d4science.org/salience"


def query_swat(title, content, timeout=30):
    """Send a titled document to the SWAT service and return its annotations.

    The document is serialised as a JSON object with the keys ``"title"``
    and ``"content"`` and sent as the request body. The ``gcube-token``
    taken from ``KEY`` travels as a query-string parameter.

    Parameters
    ----------
    title : str
        Title of the document.
    content : str
        Body text of the document.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.post``.
        ``requests`` applies no timeout by default, so without it a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    The value stored under ``"annotations"`` in the decoded JSON response.

    Raises
    ------
    Exception
        If the HTTP status code is not 200. The message contains the
        document title followed by the response body.
    """
    document = json.dumps({"title": title, "content": content})
    r = requests.post(
        SWAT_ENDPOINT,
        data=document,
        params={'gcube-token': KEY},
        timeout=timeout,
    )
    if r.status_code != 200:
        raise Exception("Error on article titled: {}\n{}".format(title, r.text))
    return r.json()["annotations"]

# Annotate the document held in `text`; as the last expression of the cell,
# the returned annotations are displayed by the notebook.
query_swat("Leonardo da Vinci", text)

[{'salience_class': 1.0,
  'salience_score': 0.9471508264541626,
  'spans': [{'end': 17, 'start': 0}, {'end': 422, 'start': 414}],
  'wiki_id': 18079,
  'wiki_title': 'Leonardo_da_Vinci'},
 {'salience_class': 1.0,
  'salience_score': 0.5190669894218445,
  'spans': [{'end': 32, 'start': 25}],
  'wiki_id': 14532,
  'wiki_title': 'Italy'},
 {'salience_class': 1.0,
  'salience_score': 0.5682003498077393,
  'spans': [{'end': 44, 'start': 33}],
  'wiki_id': 25532,
  'wiki_title': 'Renaissance'},
 {'salience_class': 0.0,
  'salience_score': 0.4803982079029083,
  'spans': [{'end': 65, 'start': 60}],
  'wiki_id': 9630,
  'wiki_title': 'Ecology'},
 {'salience_class': 0.0,
  'salience_score': 0.35197311639785767,
  'spans': [{'end': 77, 'start': 69}],
  'wiki_id': 146738,
  'wiki_title': 'Interest'},
 {'salience_class': 0.0,
  'salience_score': 0.42167073488235474,
  'spans': [{'end': 96, 'start': 87}],
  'wiki_id': 44312,
  'wiki_title': 'Invention'},
 {'salience_class': 1.0,
  'salience_score':

# RELATEDNESS
OK, but now that I have the entities, how do I work with them? I need to know which ones are related to each other and which are not.

In [14]:
ENDPOINT_RELATEDNESS = "https://tagme.d4science.org/tagme/rel"


def query_relatedness(e1, e2, timeout=30):
    """Ask the TagMe relatedness endpoint to score a pair of entity titles.

    Spaces inside each title are replaced with underscores and the two
    titles are joined with a single space to form the ``tt`` form field.
    The request also carries the ``gcube-token`` taken from ``KEY`` and the
    language code ``LANG``.

    Parameters
    ----------
    e1, e2 : str
        The two entity titles to compare.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.post``.
        ``requests`` applies no timeout by default, so without it a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    Exception
        If the HTTP status code is not 200. The message contains the ``tt``
        value followed by the response body.
    """
    tt = e1.replace(" ", "_") + " " + e2.replace(" ", "_")
    payload = {"tt": tt, "gcube-token": KEY, "lang": LANG}
    r = requests.post(ENDPOINT_RELATEDNESS, data=payload, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Error on relatedness computation: {}\n{}".format(tt, r.text))
    return r.json()

# Query two entity pairs that share one member ("Biotechnology") and print
# the 'result' field of each response so the scores can be compared.
first = query_relatedness("Biology", "Biotechnology")
second = query_relatedness("Barack Obama", "Biotechnology")
print(first['result'])
print(second['result'])

[{'couple': 'Biology Biotechnology', 'rel': 0.6070536971092224}]
[{'couple': 'Barack_Obama Biotechnology', 'rel': 0.23863035440444946}]


# WIKIPEDIA2VEC
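OK, but I need the relatedness of tens of thousands of items, so I cannot rely on slow remote queries. Instead, I load an embedding model from a local file and compute similarities between entity vectors directly.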

In [15]:
import json
from wikipedia2vec import Wikipedia2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# The location of the Wikipedia2Vec model file is read from config.json.
with open("config.json") as json_file:
    config = json.load(json_file)
# The handle is only needed while parsing; the lookup happens once it is closed.
MODEL_FILE = config["path_to_wiki2vec_model"]

# Load the model once; the cells below query it through `wiki2vec`.
wiki2vec = Wikipedia2Vec.load(MODEL_FILE)

In [16]:
def get_entity_vector(e):
    """Return the embedding that the loaded ``wiki2vec`` model holds for ``e``.

    Parameters
    ----------
    e : str
        Entity title passed to ``wiki2vec.get_entity_vector``.

    Returns
    -------
    Whatever ``wiki2vec.get_entity_vector`` returns for the entity.

    Raises
    ------
    Exception
        If the lookup fails. The original error is attached as
        ``__cause__`` so the real reason is not lost.
    """
    try:
        emb = wiki2vec.get_entity_vector(e)
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not converted into a misleading "not found" error.
    except Exception as err:
        raise Exception("Entity vector {} not found\n".format(e)) from err
    return emb

def similarity(v1, v2):
    """Return the cosine similarity between two vectors as a single value.

    Each input is copied into a NumPy array and reshaped to one row, which is
    the 2-D layout ``cosine_similarity`` is called with here. The only entry
    of the resulting 1x1 matrix is returned.
    """
    rows = [np.array(vec).reshape(1, -1) for vec in (v1, v2)]
    return cosine_similarity(*rows)[0][0]

In [17]:
# Pair each entity title with its embedding so the label travels with the vector.
v1 = ("Barack Obama", get_entity_vector("Barack Obama"))
v2 = ("Biology", get_entity_vector("Biology"))
v3 = ("Biotechnology", get_entity_vector("Biotechnology"))
# Display the first (title, vector) pair.
v1

('Barack Obama',
 memmap([-1.7208770e-01,  6.8366393e-02, -2.2114021e-01,  1.8625689e-01,
         -1.6881050e-01, -1.0273114e+00, -1.7403726e-01,  6.5132761e-01,
         -1.1579229e+00, -1.1234688e-02, -6.3258129e-01, -3.6312324e-01,
          1.8983840e+00, -1.0301901e+00, -6.0025400e-01, -7.8052205e-01,
          2.5215951e-01, -2.7664509e-01, -5.1084882e-01,  1.2842940e-01,
          6.3571817e-01,  8.1280574e-02, -1.2716837e+00,  4.2106557e-01,
         -2.0525175e-01,  9.8252594e-02,  1.3547261e-01, -5.7749820e-01,
          2.9801649e-01,  1.4131395e+00, -7.3676556e-01, -1.0151949e+00,
         -1.1703007e-01,  1.2873930e+00, -2.9190008e-02, -2.9518047e-01,
          1.4377959e-01,  3.0795303e-01,  1.4484618e+00,  1.7310138e-01,
          1.4899757e-03, -8.5064566e-01, -1.7244501e-01, -7.6880109e-01,
         -9.8339975e-01,  3.2403290e-01, -6.4912087e-01,  3.7392426e-01,
         -3.1706643e-01,  3.8999528e-01,  7.0731849e-03,  4.8704663e-01,
          8.8876688e-01, -8.457925

In [18]:
print("======================================================================")
from itertools import combinations
# Score every unordered pair drawn from the three (title, vector) tuples.
for x, y in combinations([v1, v2, v3], 2):
    # x[0] / y[0] are the titles, x[1] / y[1] the corresponding vectors.
    print("Cosine similarity between {} and {} is {:.2f}".format(x[0], y[0], similarity(x[1], y[1])))

Cosine similarity between Barack Obama and Biology is 0.19
Cosine similarity between Barack Obama and Biotechnology is 0.16
Cosine similarity between Biology and Biotechnology is 0.52
