In [1]:
import spacy
from spacy.matcher import Matcher
from spacy import displacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
with open("wiki_pratica.txt", "r") as file:
    text = file.read()

In [4]:
text

'The Pointer Sisters are an American female vocal group from Oakland, California, who achieved mainstream success during the 1970s and 1980s. They have had a repertoire with many genres, they have sold around 50 million records throughout their career included. The Pointer Sisters have won three Grammy Awards and received a star on the Hollywood Walk of Fame in 1994. The group had 13 US top 20 hits between 1973 and 1987.\n\nThe group had its origins when sisters June and Bonnie Pointer began performing in clubs in 1969 as "Pointers Au Pair". The line-up grew to a trio when sister Anita joined them. Their record deal with Atlantic Records produced several unsuccessful singles. The trio grew to a quartet when sister Ruth joined in December 1972. They then signed with Blue Thumb Records, recorded their debut album and, with their new label, began seeing more success, winning a Grammy Award in 1975 for Best Country Vocal Performance for "Fairytale" (1974). Bonnie left the group in 1977 to 

In [5]:
doc = nlp(text)

In [6]:
doc

The Pointer Sisters are an American female vocal group from Oakland, California, who achieved mainstream success during the 1970s and 1980s. They have had a repertoire with many genres, they have sold around 50 million records throughout their career included. The Pointer Sisters have won three Grammy Awards and received a star on the Hollywood Walk of Fame in 1994. The group had 13 US top 20 hits between 1973 and 1987.

The group had its origins when sisters June and Bonnie Pointer began performing in clubs in 1969 as "Pointers Au Pair". The line-up grew to a trio when sister Anita joined them. Their record deal with Atlantic Records produced several unsuccessful singles. The trio grew to a quartet when sister Ruth joined in December 1972. They then signed with Blue Thumb Records, recorded their debut album and, with their new label, began seeing more success, winning a Grammy Award in 1975 for Best Country Vocal Performance for "Fairytale" (1974). Bonnie left the group in 1977 to com

In [7]:
sentence = list(doc.sents)[0]

In [8]:
sentence

The Pointer Sisters are an American female vocal group from Oakland, California, who achieved mainstream success during the 1970s and 1980s.

In [9]:
for tok in doc[:10]:
    print(tok.text, tok.pos_, tok.dep_)

The DET det
Pointer PROPN compound
Sisters PROPN nsubj
are AUX ROOT
an DET det
American ADJ amod
female ADJ amod
vocal ADJ amod
group NOUN attr
from ADP prep


In [10]:
for ent in doc.ents[:10]:
    print(ent.text, ent.label_)

The Pointer Sisters ORG
American NORP
Oakland GPE
California GPE
the 1970s DATE
1980s DATE
around 50 million CARDINAL
The Pointer Sisters ORG
three CARDINAL
Grammy Awards WORK_OF_ART


In [11]:
# Vamos procurar no texto palavras que são consideradas "WORKS_OF_ART"
matcher = Matcher(nlp.vocab)
pattern = [{"ENT_TYPE": "WORK_OF_ART"}]
matcher.add("WORK_OF_ART_PATTERN", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text, span.label_)

Grammy 
Awards 
Pointers 
Au 
Pair 
a 
Grammy 
Award 
Love 
Sisters 
Are 


In [12]:
displacy.render(doc, style="ent")  # Visualização das entidades nomeadas

In [13]:
matcher = Matcher(nlp.vocab)
pattern = [{"ENT_TYPE": "WORK_OF_ART", "OP": "+"}]
matcher.add("WORK_OF_ART_PATTERN", [pattern], greedy="LONGEST")
matches = matcher(doc)
for match in matches:
    print(match, doc[match[1]:match[2]])

(12030173109389004859, 99, 102) Pointers Au Pair
(12030173109389004859, 168, 171) a Grammy Award
(12030173109389004859, 51, 53) Grammy Awards
(12030173109389004859, 382, 384) Sisters Are
(12030173109389004859, 259, 260) Love


In [14]:
# Vamos procurar pela seguinte estrutura: Nome Próprio + and + Nome Próprio
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"},
           {"LOWER": "and"},
           {"POS": "PROPN", "OP": "+"}]
matcher.add("PERSON_AND_PERSON", [pattern], greedy="LONGEST")
matches = matcher(doc)
for match in matches:
    print(match, doc[match[1]:match[2]])

(13555721475752795348, 87, 91) June and Bonnie Pointer


In [15]:
# Faremos agora a busca a partir de um REGEX, onde buscaremos por datas no formato textual "January 2020"
import re
pattern = re.compile(r"\s(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}")
for match in re.finditer(pattern, text):
    print(match.group(), match.span())

 December 1972 (735, 749)
 April 2004 (1591, 1602)
 April 2006 (1636, 1647)
 December 2016 (2179, 2193)
 December 2017 (2286, 2300)


In [16]:
# Criaremos então um componente customizado que irá buscar por essas datas e marcá-las como entidades do tipo DATE
from spacy.language import Language

@Language.component("date_cust_component")
def date_component(doc):
    pattern = re.compile(r"\s(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}")
    new_ents = list(doc.ents)
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        span = doc.char_span(start, end, label="DATE")
        if span is not None:
            new_ents.append(span)
    doc.ents = new_ents
    return doc
nlp.add_pipe("date_cust_component", after="ner")
doc = nlp(text)
for ent in doc.ents:
    if ent.label_ == "DATE":
        print(ent.text, ent.label_)

the 1970s DATE
1980s DATE
1994 DATE
between 1973 and 1987 DATE
June DATE
1969 DATE
December 1972 DATE
1975 DATE
1974 DATE
1977 DATE
1978 DATE
the 1980s DATE
June DATE
1984 DATE
1979 DATE
1980 DATE
1981 DATE
1984 DATE
1985 DATE
June DATE
several years DATE
April 2004 DATE
April 2006 DATE
2005 DATE
Between 2009 and 2015 DATE
2015 DATE
December 2016 DATE
December 2017 DATE


In [17]:
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc',
    'pos_acc',
    'tag_micro_p',
    'tag_micro_r',
    'tag_micro_f'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'date_cust_component': {'assigns': [],
   'requires'

In [19]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [20]:
nlp2 = spacy.load("en_core_web_md")

In [21]:
import numpy as np
palavra = "singer"

ms = nlp2.vocab.vectors.most_similar(
    np.asarray([nlp2.vocab.vectors[nlp2.vocab.strings[palavra]]]), n=10)
words = [nlp2.vocab.strings[w] for w in ms[0][0]]
distances = ms[2]
print(words)

['decathlete', 'SPRINTER', 'letterwinner', 'medallists', 'triathletes', 'Racers', 'GYMNASIUM', 'tennis', 'RINK', 'tlc']


In [22]:
displacy.render(doc, style="ent")  # Visualização das entidades nomeadas

In [23]:
doc_md = nlp2(text)
doc1 = list(doc_md.sents)[0]
doc2 = list(doc_md.sents)[1]

In [24]:
doc1, doc2

(The Pointer Sisters are an American female vocal group from Oakland, California, who achieved mainstream success during the 1970s and 1980s.,
 They have had a repertoire with many genres, they have sold around 50 million records throughout their career included.)

In [25]:
print(doc1.similarity(doc2))  # Similaridade entre sentenças

0.879902720451355
