In [1]:
import spacy
from spacy.matcher import Matcher

In [2]:
# Create a blank English pipeline object; the following cells call it on raw text.
nlp = spacy.blank("en")

In [3]:
# Sample text containing two percentages; named separately so the pipeline
# call itself stays short.
percent_text = (
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)
doc = nlp(percent_text)

In [4]:
# Report every number-like token that is immediately followed by a "%" token.
for token in doc:
    if not token.like_num:
        continue
    next_index = token.i + 1
    # A number-like token at the very end of the doc has no successor; skip it
    # instead of letting doc[next_index] raise IndexError.
    if next_index < len(doc) and doc[next_index].text == "%":
        print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


In [5]:
# Download the small English pipeline only when it is not installed yet, so
# re-running the notebook does not trigger a download on every run.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
# Run the loaded pipeline on a short sentence; the next cell inspects its tokens.
doc = nlp("She ate the pizza")

In [7]:
# For each token, print its text, part-of-speech tag, dependency label and the
# text of its syntactic head, separated by single spaces.
for tok in doc:
    print(f"{tok.text} {tok.pos_} {tok.dep_} {tok.head.text}")

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [8]:
# Process a sentence, then list each predicted entity with its label.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Apple ORG
U.K. GPE
$1 billion MONEY


In [9]:
# Create a Matcher initialized with the current pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [10]:
# Two consecutive tokens whose exact text is "iPhone" and then "X".
pattern = [{"TEXT": word} for word in ("iPhone", "X")]
matcher.add("IPHONE_PATTERN", [pattern])

In [11]:
# Process some text
doc = nlp("Upcoming iPhone X release date leaked")

# Call the matcher on the doc; the next cell unpacks each result as
# (match_id, start, end)
matches = matcher(doc)

In [12]:
# Each match unpacks to (match_id, start, end); slice the doc with the two
# token indices and print the matched text.
for _match_id, start, end in matches:
    print(doc[start:end].text)

iPhone X


In [13]:
# Token pattern: a digit token, then "fifa", "world", "cup" (each compared via
# the LOWER attribute), then a punctuation token.
# NOTE(review): no visible cell registers this pattern with a matcher.
pattern = [
    {"IS_DIGIT": True},
    *({"LOWER": word} for word in ("fifa", "world", "cup")),
    {"IS_PUNCT": True},
]

In [15]:
# Process the sample sentence and echo its text back.
doc = nlp("2018 FIFA World Cup: France won!")
print(doc.text)

2018 FIFA World Cup: France won!


In [19]:
# Import the Doc and Span classes
from spacy.tokens import Doc, Span

nlp = spacy.blank("en")

# Each word paired with a flag saying whether a space follows it
tokens_with_space = [("Hello", True), ("world", False), ("!", False)]
words = [text for text, _ in tokens_with_space]
spaces = [has_space for _, has_space in tokens_with_space]

# Create a doc manually from the words and their spacing
doc = Doc(nlp.vocab, words=words, spaces=spaces)


In [23]:
# Create a span manually from start index 0 and end index 2
span = Span(doc, 0, 2)

# Create a span with a label
span_with_label = Span(doc, 0, 2, label="GREETING")

# Add the labelled span to doc.ents
doc.ents = [span_with_label]
print(doc.ents)

(Hello world,)


In [24]:
import spacy

# Import the Doc and Span classes
from spacy.tokens import Doc, Span

nlp = spacy.blank("pt")

# The words and, for each one, whether a space follows it
words = ["Eu", "adoro", "David", "Bowie"]
spaces = [True, True, True, False]

# Create a Doc from the words and their spacing, then show its text
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Create a span for "David Bowie" with the label "PERSON" and store it as the
# doc's entities
span = Span(doc, 2, 4, label="PERSON")
doc.ents = [span]

# Print the text and label of every entity
entity_pairs = []
for ent in doc.ents:
    entity_pairs.append((ent.text, ent.label_))
print(entity_pairs)

Eu adoro David Bowie
[('David Bowie', 'PERSON')]


In [25]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

# Report every proper noun that is directly followed by a verb.
for token in doc:
    # Check whether the current token is a proper noun.
    if token.pos_ != "PROPN":
        continue
    next_index = token.i + 1
    # Check whether the next token is a verb. A proper noun at the very end of
    # the doc has no next token, so guard the index instead of letting
    # doc[next_index] raise IndexError.
    if next_index < len(doc) and doc[next_index].pos_ == "VERB":
        print("Found proper noun before a verb:", token.text)

Found proper noun before a verb: Berlin


In [27]:
# Load the larger pipeline with vectors, downloading it only when it is not
# installed yet so that re-running the notebook does not download it again.
if not spacy.util.is_package("en_core_web_md"):
    spacy.cli.download("en_core_web_md")
nlp = spacy.load("en_core_web_md")

# Compare two documents
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
0.8698332283318978


In [28]:
# Compare two tokens of the same doc, picked by index (2 and 4)
doc = nlp("I like pizza and pasta")
token1, token2 = doc[2], doc[4]
print(token1.similarity(token2))

0.685019850730896


In [31]:
# Compare a document with a token
doc = nlp("I like pizza")
# Take the token at index 0 of a separately processed text
token = nlp("soap")[0]

print(doc.similarity(token))

0.1821369691957915


In [32]:
# Compare a span (the doc slice [2:5]) with a document
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")

print(span.similarity(doc))

0.47190033157126826


In [33]:
# Load a larger pipeline with vectors
nlp = spacy.load("en_core_web_md")

doc = nlp("I have a banana")
# Access the vector through the token.vector attribute (token at index 3)
print(doc[3].vector)

[ 0.20778  -2.4151    0.36605   2.0139   -0.23752  -3.1952   -0.2952
  1.2272   -3.4129   -0.54969   0.32634  -1.0813    0.55626   1.5195
  0.97797  -3.1816   -0.37207  -0.86093   2.1509   -4.0845    0.035405
  3.5702   -0.79413  -1.7025   -1.6371   -3.198    -1.9387    0.91166
  0.85409   1.8039   -1.103    -2.5274    1.6365   -0.82082   1.0278
 -1.705     1.5511   -0.95633  -1.4702   -1.865    -0.19324  -0.49123
  2.2361    2.2119    3.6654    1.7943   -0.20601   1.5483   -1.3964
 -0.50819   2.1288   -2.332     1.3539   -2.1917    1.8923    0.28472
  0.54285   1.2309    0.26027   1.9542    1.1739   -0.40348   3.2028
  0.75381  -2.7179   -1.3587   -1.1965   -2.0923    2.2855   -0.3058
 -0.63174   0.70083   0.16899   1.2325    0.97006  -0.23356  -2.094
 -1.737     3.6075   -1.511    -0.9135    0.53878   0.49268   0.44751
  0.6315    1.4963    4.1725    2.1961   -1.2409    0.4214    2.9678
  1.841     3.0133   -4.4652    0.96521  -0.29787   4.3386   -1.2527
 -1.7734   -3.5637   -0.20035

In [50]:
from spacy.matcher import Matcher

# Initialize with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Patterns are lists of dictionaries describing the tokens; operators ("OP")
# can determine how often a token is matched
token_patterns = {
    "LOVE_CATS": [{"LEMMA": "love", "POS": "VERB"}, {"LOWER": "cats"}],
    "VERY_HAPPY": [{"TEXT": "very", "OP": "+"}, {"TEXT": "happy"}],
}
for rule_name, pattern in token_patterns.items():
    matcher.add(rule_name, [pattern])

# Calling the matcher on a doc returns a list of (match_id, start, end) tuples
doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)

print(f'Texto original: "{doc}"')
print('Matches encontrados: ')
for _match_id, start, end in matches:
    print(f'\t{doc[start:end]}')

Texto original: "I love cats and I'm very very happy"
Matches encontrados: 
	love cats
	very happy
	very very happy


In [51]:
# Match "golden retriever" (case-insensitively, via LOWER) and describe each
# match together with its syntactic context.
matcher = Matcher(nlp.vocab)
matcher.add("DOG", [[{"LOWER": "golden"}, {"LOWER": "retriever"}]])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span:", span.text)
    # Get the span's root token and that root's head token
    print("Root token:", span.root.text)
    print("Root head token:", span.root.head.text)
    # Get the previous token and its part-of-speech tag. A match that starts
    # at index 0 has no previous token; doc[-1] would silently wrap around to
    # the last token, so that case is skipped.
    if start > 0:
        previous_token = doc[start - 1]
        print("Previous token:", previous_token.text, previous_token.pos_)

Matched span: Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET


In [52]:
from spacy.matcher import PhraseMatcher

# The phrase pattern is itself a processed Doc, registered under the "DOG" key
matcher = PhraseMatcher(nlp.vocab)
pattern = nlp("Golden Retriever")
matcher.add("DOG", [pattern])

doc = nlp("I have a Golden Retriever")

# Iterate over the matches and print the text of each matched span
for _match_id, start, end in matcher(doc):
    print("Matched span:", doc[start:end].text)

Matched span: Golden Retriever
