In [1]:
import os

import spacy
import rdflib
from sklearn.metrics import pairwise_distances

In [16]:
!pip install rdflib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting rdflib
  Using cached rdflib-6.2.0-py3-none-any.whl (500 kB)
Collecting isodate
  Using cached isodate-0.6.1-py2.py3-none-any.whl (41 kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.2.0


In [15]:
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Using cached pydantic-1.8.2-py3-none-any.whl (126 kB)
Installing collected packages: pydantic, en-core-web-sm
  Attempting uninstall: pydantic
    Found existing installation: pydantic 1.10.2
    Uninstalling pydantic-1.10.2:
      Successfully uninstalled pydantic-1.10.2
Successfully installed en-core-web-sm-3.3.0 pydantic-1.8.2
[38;5;2m

# Test the entity linkers from spaCy-derived projects

## Open Tapioca: Precision is too low, though it does pick up predicates or relations

In [2]:
!pip install spacyopentapioca

Collecting spacyopentapioca
  Downloading spacyopentapioca-0.1.6-py3-none-any.whl (7.4 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pydantic, spacyopentapioca
  Attempting uninstall: pydantic
    Found existing installation: pydantic 1.10.2
    Uninstalling pydantic-1.10.2:
      Successfully uninstalled pydantic-1.10.2
Successfully installed pydantic-1.8.2 spacyopentapioca-0.1.6


In [8]:
# https://github.com/UB-Mannheim/spacyopentapioca
import spacy

# Transformer English pipeline with the OpenTapioca component added to it.
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe('opentapioca')

doc = nlp('who is the director of Batman 1989')

# One tuple per entity: surface text, KB id, label, description and linker score.
for span in doc.ents:
    entity_row = (
        span.text,
        span.kb_id_,
        span.label_,
        span._.description,
        span._.score,
    )
    print(entity_row)

('director', 'Q2526255', 'PERSON', 'occupation of a person who directs a film', 0.896437634369846)
('Batman', '', 'WORK_OF_ART', 'province of Turkey', -0.29949270098223224)


## Spacy fishing: Good to detect movie entities from wikidata

In [3]:
!pip install spacyfishing

Collecting spacyfishing
  Downloading spacyfishing-0.1.8-py3-none-any.whl (24 kB)
Installing collected packages: spacyfishing
Successfully installed spacyfishing-0.1.8


In [3]:
# https://github.com/Lucaterre/spacyfishing
import spacy

# Small English pipeline with the entity-fishing component added to it.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('entityfishing')

text = 'who is the director of the film Batman 1989'
doc = nlp(text)

# One tuple per entity: text, NER label, then the three linker fields.
# NOTE(review): the recorded output shows None for kb_qid, url_wikidata and nerd_score
# on every entity, so the linking step apparently returned nothing in that run — verify.
for ent in doc.ents:
    entity_row = (
        ent.text,
        ent.label_,
        ent._.kb_qid,
        ent._.url_wikidata,
        ent._.nerd_score,
    )
    print(entity_row)

('Batman', 'WORK_OF_ART', None, None, None)
('1989', 'DATE', None, None, None)


## Spacy DBPedia Spotlight: It works well to pull stuff from DBPEDIA

Could be useful to pull descriptions of a detected entity and answer the questions with a QA model

In [None]:
!pip uninstall spacy-dbpedia-spotlight -Y

In [1]:
!pip install spacy-dbpedia-spotlight

Collecting spacy-dbpedia-spotlight
  Using cached spacy_dbpedia_spotlight-0.2.5-py3-none-any.whl
Collecting loguru
  Using cached loguru-0.6.0-py3-none-any.whl (58 kB)
Installing collected packages: loguru, spacy-dbpedia-spotlight
Successfully installed loguru-0.6.0 spacy-dbpedia-spotlight-0.2.5


In [5]:
import spacy
import spacy_dbpedia_spotlight

# https://github.com/MartinoMensio/spacy-dbpedia-spotlight
# load your model as usual
nlp = spacy.load('en_core_web_sm')
# add the pipeline stage
nlp.add_pipe('dbpedia_spotlight')
# get the document
# Alternative example sentence (it used to be processed and then immediately
# overwritten, which only cost an extra pipeline run):
#   'The president of USA is calling Boris Johnson to decide what to do about coronavirus'
doc = nlp('who is the director of the film Batman 1989')

# see the entities
print('Entities', [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
# inspect the raw data from DBpedia spotlight
# Guarded: indexing doc.ents[0] raises IndexError when no entity was found.
if doc.ents:
    print(doc.ents[0]._.dbpedia_raw_result['@types'])
else:
    print('No entities found')

Entities [('Batman', 'DBPEDIA_ENT', 'http://dbpedia.org/resource/Batman')]
Wikidata:Q95074,Wikidata:Q24229398,DUL:Agent,DBpedia:FictionalCharacter,DBpedia:Agent,DBpedia:ComicsCharacter


## Spacy entity linker

In [None]:
!pip install spacy-entity-linker

In [None]:
!python -m spacy_entity_linker "download_knowledge_base"

In [6]:
# initialize language model
# https://github.com/egerber/spaCy-entity-linker
import spacy
nlp = spacy.load("en_core_web_trf")

# add pipeline (declared through entry_points in setup.py)
nlp.add_pipe("entityLinker", last=True)

# Alternative example sentence (it used to be processed and then immediately
# overwritten, which only cost an extra run of the transformer pipeline):
#   "I watched the Pirates of the Caribbean last silvester, which was directed by Leo Dicaprio"
doc = nlp("who is the director of the film Batman 1989")

# returns all entities in the whole document
all_linked_entities = doc._.linkedEntities
# iterates over sentences and prints linked entities
for sent in doc.sents:
    sent._.linkedEntities.pretty_print()

<EntityElement: https://www.wikidata.org/wiki/Q2526255 film director             occupation of a person who directs a film         >
<EntityElement: https://www.wikidata.org/wiki/Q11424 film                      sequence of images that give the impression of movement>
<EntityElement: https://www.wikidata.org/wiki/Q2695156 Batman                    fictional character, a comic book superhero created by artist Bob Kane and writer Bill Finger>


# Just out of curiosity

# NER

In [2]:

import spacy

# Transformer English pipeline; the NER cells that follow call this `nlp` object.
nlp = spacy.load(name="en_core_web_trf")

In [12]:

# Sentence with a film title, person names with lower-cased surnames, and language names.
review_text = "The Matrix Revolutions was such an amazing movie, Keanu reves was great in it. Pierce brosnan was not in it, as wasn't Juan bermeo. The main language of the movie was english, though you also see some english people speaking in korean "
doc = nlp(review_text)

# Print each recognised entity with its position and predicted label.
for position, entity in enumerate(doc.ents):
    print(position, entity.text, entity.label_)

0 The Matrix Revolutions WORK_OF_ART
1 Keanu reves PERSON
2 Pierce brosnan PERSON
3 Juan bermeo PERSON
4 english LANGUAGE
5 english LANGUAGE
6 korean LANGUAGE


In [20]:

# Sentence mixing real and fictional places with company names, some lower-cased.
places_text = "The movie was filmed in Colombia, in pitalito specifically. They even have some scenes in the magdalena river, it looks like the white city of Minas Tirith. GOT takes places in Westeros and Essos. The Rand Corporation and Fox News suck, though fox searchlight makes some nice films. CNN, Weinstein Company, A24 "
doc = nlp(places_text)

# Print each recognised entity with its position and predicted label.
found_entities = doc.ents
for position in range(len(found_entities)):
    entity = found_entities[position]
    print(position, entity.text, entity.label_)

0 Colombia GPE
1 pitalito GPE
2 the magdalena river LOC
3 Minas Tirith GPE
4 GOT WORK_OF_ART
5 Westeros LOC
6 Essos GPE
7 The Rand Corporation ORG
8 Fox News ORG
9 fox searchlight WORK_OF_ART
10 CNN ORG
11 Weinstein Company ORG
12 A24 ORG


In [22]:

# Sentence with a historical event, a country and art movements.
# Per the recorded output, "dadaism" and "cubism" were not tagged as entities.
doc = nlp(
    "The Civil War was such a hard period. Spain suffered greatly and it stifled the dadaism movement, cubism and the roaring twenties"
)

# Print each recognised entity with its position and predicted label.
for position, entity in enumerate(doc.ents):
    print(position, entity.text, entity.label_)

0 The Civil War EVENT
1 Spain GPE
2 the roaring twenties DATE


In [9]:

# Sentence with a misspelled, lower-cased film title and an age rating.
# Per the recorded output, the title was tagged as EVENT and "PG13" was not tagged.
rating_text = "Capatain america civil war has an is rated PG13, meaning it is directed at teenagers"
doc = nlp(rating_text)

# Print each recognised entity with its position and predicted label.
position = 0
for entity in doc.ents:
    print(position, entity.text, entity.label_)
    position += 1

0 Capatain america civil war EVENT


## Language detection

In [14]:
#https://github.com/nickdavidhaynes/spacy-cld
!pip install spacy_cld

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting spacy_cld
  Downloading spacy_cld-0.1.0.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting spacy<3.0.0,>=2.0.0
  Downloading spacy-2.3.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pycld2>=0.31
  Downloading pycld2-0.41.tar.gz (41.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting thinc<7.5.0,>=7.4.1
  Downloading thinc-7.4.6-cp

In [None]:
# Language detection with spacy_cld: adds a LanguageDetector to the pipeline and reads
# the detected languages and their scores from the Doc extensions.
# NOTE(review): this cell has no execution count, and the spacy_cld install log shows the
# package requiring spacy<3.0.0. The 'en' shortcut name and passing a component instance
# to add_pipe look like spaCy 2.x conventions — confirm which spaCy version this is
# meant to run under before relying on it.
import spacy
from spacy_cld import LanguageDetector

nlp = spacy.load('en')
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)
doc = nlp('This is some English text.')

# Only the last bare expression of a cell is displayed; the trailing comments are the
# author's expected values, not recorded output.
doc._.languages  # ['en']
doc._.language_scores['en']  # 0.96

## Concepcy

In [11]:
!pip install concepcy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
# Access all the 'RelatedTo' relations from the Doc
# Access all the 'RelatedTo' relations from the Doc
# NOTE(review): `doc` is not created in any visible cell with a ConcepCy component in
# the pipeline — presumably it comes from a cell that was removed; confirm before a
# fresh Restart & Run All.
related_by_word = doc._.relatedto
for word in related_by_word:
    relations = related_by_word[word]
    print(f'Word: {word} {relations}')

Word: company [{'start': {'id': '/c/en/company', 'label': 'company', 'language': 'en', 'term': '/c/en/company', '@type': 'Node'}, 'end': {'id': '/c/en/business', 'label': 'business', 'language': 'en', 'term': '/c/en/business', '@type': 'Node'}, 'relation': 'RelatedTo', 'text': '[[company]] is related to [[business]]', 'weight': 6.424017434596516}, {'start': {'id': '/c/en/company', 'label': 'company', 'language': 'en', 'term': '/c/en/company', '@type': 'Node'}, 'end': {'id': '/c/en/corporation', 'label': 'corporation', 'language': 'en', 'term': '/c/en/corporation', '@type': 'Node'}, 'relation': 'RelatedTo', 'text': '[[company]] is related to [[corporation]]', 'weight': 4.432155231938521}, {'start': {'id': '/c/en/company', 'label': 'company', 'language': 'en', 'term': '/c/en/company', '@type': 'Node'}, 'end': {'id': '/c/en/organization', 'label': 'organization', 'language': 'en', 'term': '/c/en/organization', '@type': 'Node'}, 'relation': 'RelatedTo', 'text': '[[company]] is related to [

In [12]:
# Access the 'RelatedTo' relations word by word
# Access the 'RelatedTo' relations word by word
# NOTE(review): `doc` is not created in any visible cell with a ConcepCy component in
# the pipeline — presumably it comes from a cell that was removed; confirm before a
# fresh Restart & Run All.
for token in doc:
    token_relations = token._.relatedto
    print(f'Word: {token} {token_relations}')

Word: company [{'start': {'id': '/c/en/company', 'label': 'company', 'language': 'en', 'term': '/c/en/company', '@type': 'Node'}, 'end': {'id': '/c/en/business', 'label': 'business', 'language': 'en', 'term': '/c/en/business', '@type': 'Node'}, 'relation': 'RelatedTo', 'text': '[[company]] is related to [[business]]', 'weight': 6.424017434596516}, {'start': {'id': '/c/en/company', 'label': 'company', 'language': 'en', 'term': '/c/en/company', '@type': 'Node'}, 'end': {'id': '/c/en/corporation', 'label': 'corporation', 'language': 'en', 'term': '/c/en/corporation', '@type': 'Node'}, 'relation': 'RelatedTo', 'text': '[[company]] is related to [[corporation]]', 'weight': 4.432155231938521}, {'start': {'id': '/c/en/company', 'label': 'company', 'language': 'en', 'term': '/c/en/company', '@type': 'Node'}, 'end': {'id': '/c/en/organization', 'label': 'organization', 'language': 'en', 'term': '/c/en/organization', '@type': 'Node'}, 'relation': 'RelatedTo', 'text': '[[company]] is related to [