<a href="https://colab.research.google.com/github/HeleneFabia/nlp-exploration/blob/main/notebooks/nlp_with_spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP with spaCy

In [None]:
# Fetch the pipeline packages used further down: en_core_web_sm is loaded via
# spacy.load() and en_core_web_md is imported as a module.
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

In [None]:
# NOTE(review): en_core_web_lg is not loaded anywhere in the visible part of
# this notebook -- confirm a later section needs it before keeping this download.
!python -m spacy download en_core_web_lg

## 1. Finding words, phrases, names and concepts


In [1]:
import spacy

In [2]:
# Create a blank "en" pipeline; the cells below call it on raw text to get Doc objects.
nlp = spacy.blank("en")  # nlp object

In [16]:
# Calling the nlp object on a string tokenizes it and returns a Doc.
doc = nlp("Hello world!")
doc_type, text_type = type(doc), type(doc.text)
print(doc_type, text_type)

# Integer indexing picks out a single token ...
token = doc[1]
print(token.text)

# ... while slicing gives a span of tokens (the end index is exclusive).
span = doc[1:3]
print(span.text)

<class 'spacy.tokens.doc.Doc'> <class 'str'>
world
world!


In [18]:
doc = nlp("It costs €5.")

# Print one list per token attribute: index, text, then three boolean flags.
for attr in ("i", "text", "is_alpha", "is_punct", "like_num"):
    print([getattr(token, attr) for token in doc])

[0, 1, 2, 3, 4]
['It', 'costs', '€', '5', '.']
[True, True, False, False, False]
[False, False, False, False, True]
[False, False, False, True, False]


In [23]:
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Find number-like tokens that are directly followed by a percent sign.
# Pairing every token with its successor via zip() stops before the last
# token, so a text that ends in a number cannot raise IndexError (unlike an
# unguarded doc[token.i + 1] lookup).
for token, next_token in zip(doc, doc[1:]):
    if token.like_num and next_token.text == "%":
        # Slice from the number up to and including the "%" token.
        print(doc[token.i : next_token.i + 1])

60%
4%


In [36]:
# Replace the blank pipeline with a trained pipeline package so that tokens
# carry part-of-speech tags and dependency labels.
nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza.")

# One line per token: text, POS tag, dependency label and syntactic head.
for token in doc:
    print(
        f"{token.text}\t --> Part-of-speech tag: {token.pos_} "
        f"\t| Dependencies: {token.dep_}, {token.head.text}"
    )

She	 --> Part-of-speech tag: PRON 	| Dependencies: nsubj, ate
ate	 --> Part-of-speech tag: VERB 	| Dependencies: ROOT, ate
the	 --> Part-of-speech tag: DET 	| Dependencies: det, pizza
pizza	 --> Part-of-speech tag: NOUN 	| Dependencies: dobj, ate
.	 --> Part-of-speech tag: PUNCT 	| Dependencies: punct, ate


In [39]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# doc.ents holds the named entities; spacy.explain() turns a label into a
# short human-readable description.
for ent in doc.ents:
    label = ent.label_
    description = spacy.explain(label)
    print(f"{ent.text} --> {label} ({description})")

Apple --> ORG (Companies, agencies, institutions, etc.)
U.K. --> GPE (Countries, cities, states)
$1 billion --> MONEY (Monetary values, including unit)


In [49]:
from spacy.matcher import Matcher # rule-based matching (spacy's alternative to regex)

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# A pattern is a list of dicts, one dict per token to match.
# Commonly used keys:
#   "TEXT"  -- exact token text
#   "LOWER" -- lowercase form of the token text (case-insensitive match)
#   "LEMMA" -- lemma (e.g. "LEMMA": "buy" matches any form of "buy")
#   "POS"   -- part-of-speech (e.g. "POS": "NOUN" matches any noun)
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

doc = nlp("Upcoming iPhone X release date leaked.")
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; start and end are token
# indices that can be used to slice the doc.
for match_id, start, end in matches:
    print(f"Match ID: {match_id}, Start Idx: {start}, End Idx: {end}")
    matched_span = doc[start:end]
    print(matched_span.text)

Match ID: 9528407286733565721, Start Idx: 1, End Idx: 3
iPhone X


In [48]:
doc = nlp("2018 FIFA World Cup: France won!")

# A digit token, then "fifa world cup" in any casing, then a punctuation token.
fifa_words = ("fifa", "world", "cup")
pattern = (
    [{"IS_DIGIT": True}]
    + [{"LOWER": word} for word in fifa_words]
    + [{"IS_PUNCT": True}]
)
matcher.add("FIFA", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

2018 FIFA World Cup:


In [51]:
doc = nlp("I loved dogs but now I love cats more.")

# A verb whose lemma is "love", immediately followed by a noun.
love_verb = {"LEMMA": "love", "POS": "VERB"}
any_noun = {"POS": "NOUN"}
pattern = [love_verb, any_noun]
matcher.add("LOVE", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

loved dogs
love cats


In [52]:
doc = nlp("I bought a smartphone. Now I'm buying apps.")

# Any form of "buy", an optional determiner, then a noun.
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # determiner is optional
    {"POS": "NOUN"},
]
# Operators ("OP") control how often a token pattern may match:
#   "!" -- negate: match exactly 0 times
#   "?" -- optional: match 0 or 1 times
#   "+" -- match 1 or more times
#   "*" -- match 0 or more times
matcher.add("BUY", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

bought a smartphone
buying apps


## 2. Large-scale data analysis

In [58]:
# The vocab stores data shared across multiple documents. Strings are saved
# as hash values, and nlp.vocab.strings looks them up in both directions.
string_store = nlp.vocab.strings
coffee_hash = string_store["coffee"]  # string -> hash
coffee_string = string_store[coffee_hash]  # hash -> string

print(coffee_hash, coffee_string)

3197928453018144401 coffee


In [63]:
doc = nlp("I love tea.")
nlp.vocab.strings.add("coffee")  # register "coffee" in the string store

tea_hash = nlp.vocab.strings["tea"]
coffee_hash = nlp.vocab.strings["coffee"]

# Round-trip string -> hash -> string through the pipeline's store, then
# repeat the "tea" lookup through the doc's vocab, which gives the same values.
lookups = (
    (nlp.vocab.strings, "tea", tea_hash),
    (nlp.vocab.strings, "coffee", coffee_hash),
    (doc.vocab.strings, "tea", tea_hash),
)
for store, word, word_hash in lookups:
    print(store[word], store[word_hash])

6041671307218480733 tea
3197928453018144401 coffee
6041671307218480733 tea


In [62]:
# How the pieces fit together:
#   * a doc consists of tokens
#   * each token is stored in the vocab as a lexeme, represented by a hash value
#   * the hash value can be used to obtain the string value again
lexeme = nlp.vocab["tea"]  # a lexeme is one entry in the vocab

lexeme_text = lexeme.text  # string
lexeme_hash = lexeme.orth  # hash value
print(lexeme_text, lexeme_hash, lexeme.is_alpha)

tea 6041671307218480733 True


In [75]:
from spacy.tokens import Doc, Span

nlp = spacy.blank("en")

# Build a Doc by hand: every word is paired with a flag saying whether a
# space comes after it.
words = ["Hello", "world", "!"]
spaces = [True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# A Span is created from the doc plus a start and an (exclusive) end index.
start, end = 0, 2
span = Span(doc, start, end)
print(span.text)

# The same slice again, this time carrying a label.
span_with_label = Span(doc, start, end, label="GREETING")
print(span_with_label.text, span_with_label.label_)

# Set the labelled span as the doc's entities and show them.
doc.ents = [span_with_label]
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)

Hello world!
Hello world
Hello world GREETING
[('Hello world', 'GREETING')]


In [79]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

# Get all tokens and part-of-speech tags.
# NOTE(review): neither list is read by the loop below; they are kept in case
# a later cell uses them.
token_texts = [token.text for token in doc]
pos_tags = [token.pos_ for token in doc]

# Report every proper noun that is directly followed by a verb. Pairing each
# token with its successor via zip() stops before the last token, so a text
# ending in a proper noun cannot raise IndexError (unlike an unguarded
# doc[token.i + 1] lookup).
for token, next_token in zip(doc, doc[1:]):
    if token.pos_ == "PROPN" and next_token.pos_ == "VERB":
        print("Found proper noun before a verb:", token.text)

Found proper noun before a verb: Berlin


In [None]:
import en_core_web_md
# Replace the pipeline with en_core_web_md; the cells that follow call
# .similarity() and .vector on the docs and tokens it produces.
nlp = en_core_web_md.load()

In [92]:
# Similarity is determined via word vectors (multi-dimensional representations
# of words), which are created from lots of text with algorithms such as
# Word2Vec.

# Doc vs. doc
doc1 = nlp("I like fast food.")
doc2 = nlp("I like pizza.")
doc_similarity = doc1.similarity(doc2)
print(doc_similarity)

# Token vs. token: "pizza" (index 2) and "pasta" (index 4)
doc = nlp("I like pizza and pasta")
token1, token2 = doc[2], doc[4]
token_similarity = token1.similarity(token2)
print(token_similarity)

0.9009145331610278
0.7369546


In [100]:
doc = nlp("I have a banana.")

# Token 3 is "banana"; inspect the dimensionality of its word vector.
banana_vector = doc[3].vector
print("Shape of word vector for 'banana':", banana_vector.shape)

Shape of word vector for 'banana': (300,)


In [101]:
# Caveat: what counts as "similar" depends on the use case. These two
# sentences express opposite opinions yet still receive a high score.
doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")

similarity_score = doc1.similarity(doc2)
print(similarity_score)

0.9501447503553421


In [109]:
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# "amazon" in any casing, followed by a title-cased proper noun
pattern1 = [{"LOWER": "amazon"}, {"POS": "PROPN", "IS_TITLE": True}]
# "ad", "-", "free" in any casing, followed by a noun
pattern2 = [{"LOWER": part} for part in ("ad", "-", "free")] + [{"POS": "NOUN"}]

# Start from a fresh matcher so only these two patterns are registered.
matcher = Matcher(nlp.vocab)
for pattern_key, token_pattern in (("PATTERN1", pattern1), ("PATTERN2", pattern2)):
    matcher.add(pattern_key, [token_pattern])

for match_id, start, end in matcher(doc):
    # match_id is a hash; the string store maps it back to the pattern name.
    pattern_name = doc.vocab.strings[match_id]
    print(pattern_name, doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


In [112]:
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
doc = nlp("Italy and Spain are popular tourist destinations in Europe.")

COUNTRIES = ["Austria", "Australia", "Belgium", "Italy", "Spain", "Thailand"]

# Keep COUNTRIES as the list of country names and store the processed
# patterns under a separate name. nlp.pipe() yields its results lazily, so
# they are materialized into a list here: the patterns can then be inspected
# or reused, and COUNTRIES never changes type or ends up exhausted.
country_patterns = list(nlp.pipe(COUNTRIES))

matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", country_patterns)

# Each match is (match_id, start, end); slice the doc to get the matched span.
print([doc[start:end] for match_id, start, end in matcher(doc)])

[Italy, Spain]


## 3. Processing Pipelines