<a href="https://colab.research.google.com/github/HeleneFabia/nlp-exploration/blob/main/notebooks/nlp_with_spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP with SpaCy

In [None]:
!python -m spacy download en_core_web_sm

## 1. Finding words, phrases, names and concepts


In [1]:
import spacy

In [2]:
# Blank English pipeline (as opposed to the trained package loaded with
# spacy.load further down); calling nlp(text) on it returns a Doc.
nlp = spacy.blank("en")  # nlp object

In [16]:
# Running the pipeline on a string produces a Doc; its .text is a plain str.
doc = nlp("Hello world!")
print(type(doc), type(doc.text))

# An integer index selects a single token, a slice selects a span
# (end index exclusive).
token, span = doc[1], doc[1:3]
print(token.text, span.text, sep="\n")

<class 'spacy.tokens.doc.Doc'> <class 'str'>
world
world!


In [18]:
doc = nlp("It costs €5.")

# Show, one list per attribute, the token index, the raw text and three
# boolean lexical flags for every token in the document.
for attr in ("i", "text", "is_alpha", "is_punct", "like_num"):
    print([getattr(token, attr) for token in doc])

[0, 1, 2, 3, 4]
['It', 'costs', '€', '5', '.']
[True, True, False, False, False]
[False, False, False, False, True]
[False, False, False, True, False]


In [23]:
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Print every number-like token that is immediately followed by a "%" token.
for token in doc:
    next_idx = token.i + 1
    # Bounds check first: if the final token of the text is number-like it has
    # no successor, and indexing doc[len(doc)] would raise IndexError.
    if token.like_num and next_idx < len(doc) and doc[next_idx].text == "%":
        print(doc[token.i : next_idx + 1])

60%
4%


In [36]:
# Switch from the blank pipeline to the trained package so that tokens carry
# part-of-speech tags and dependency labels.
nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza.")

# For each token: its coarse POS tag, its dependency label, and the text of
# the head token it attaches to.
for token in doc:
    pos, dep, head = token.pos_, token.dep_, token.head.text
    print(f"{token.text}\t --> Part-of-speech tag: {pos} \t| Dependencies: {dep}, {head}")

She	 --> Part-of-speech tag: PRON 	| Dependencies: nsubj, ate
ate	 --> Part-of-speech tag: VERB 	| Dependencies: ROOT, ate
the	 --> Part-of-speech tag: DET 	| Dependencies: det, pizza
pizza	 --> Part-of-speech tag: NOUN 	| Dependencies: dobj, ate
.	 --> Part-of-speech tag: PUNCT 	| Dependencies: punct, ate


In [39]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# doc.ents holds the named-entity spans; spacy.explain turns a label code
# into a short human-readable description.
for entity in doc.ents:
    label = entity.label_
    print(f"{entity.text} --> {label} ({spacy.explain(label)})")

Apple --> ORG (Companies, agencies, institutions, etc.)
U.K. --> GPE (Countries, cities, states)
$1 billion --> MONEY (Monetary values, including unit)


In [49]:
from spacy.matcher import Matcher  # rule-based, token-level matching (spaCy's alternative to regex)

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# A pattern is a list of dicts, one dict per token to match. Common keys:
#   "TEXT"  -- exact token text
#   "LOWER" -- lowercase form of the token text (case-insensitive match)
#   "LEMMA" -- base form (e.g. "LEMMA": "buy" matches any form of "buy")
#   "POS"   -- part-of-speech tag (e.g. "POS": "NOUN" matches any noun)
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

doc = nlp("Upcoming iPhone X release date leaked.")
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; start/end are token indices
# that slice the matched span out of the doc.
for match_id, start, end in matches:
    print(f"Match ID: {match_id}, Start Idx: {start}, End Idx: {end}")
    matched_span = doc[start:end]
    print(matched_span.text)

Match ID: 9528407286733565721, Start Idx: 1, End Idx: 3
iPhone X


In [48]:
doc = nlp("2018 FIFA World Cup: France won!")

# One dict per token: a digit token, the three words matched
# case-insensitively via their lowercase form, then a punctuation token.
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True},
]

# Use a fresh matcher instead of adding to the one built in another cell:
# rules added to a shared matcher accumulate across cells (and across
# re-runs of the same cell), so the result would depend on execution order.
matcher = Matcher(nlp.vocab)
matcher.add("FIFA", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

2018 FIFA World Cup:


In [51]:
doc = nlp("I loved dogs but now I love cats more.")

# Any verb form whose lemma is "love", directly followed by a noun.
pattern = [{"LEMMA": "love", "POS": "VERB"}, {"POS": "NOUN"}]

# Use a fresh matcher instead of adding to the one built in another cell:
# rules added to a shared matcher accumulate across cells (and across
# re-runs of the same cell), so the result would depend on execution order.
matcher = Matcher(nlp.vocab)
matcher.add("LOVE", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

loved dogs
love cats


In [52]:
doc = nlp("I bought a smartphone. Now I'm buying apps.")

# Any form of "buy", an optional determiner, then a noun.
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # determiner is optional
    {"POS": "NOUN"},
]
# Operators / quantifiers for a token pattern:
#   "OP": "!" -- negate: match 0 times
#   "OP": "?" -- optional: match 0 or 1 times
#   "OP": "+" -- match 1 or more times
#   "OP": "*" -- match 0 or more times

# Use a fresh matcher instead of adding to the one built in another cell:
# rules added to a shared matcher accumulate across cells (and across
# re-runs of the same cell), so the result would depend on execution order.
matcher = Matcher(nlp.vocab)
matcher.add("BUY", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

bought a smartphone
buying apps


## 2. Large-scale data analysis