# Chapter 2: Large scale data analysis with Spacy

- https://course.spacy.io/chapter2

# 1)- Data Structures

In [1]:
# import key modules related to text data
import spacy
from spacy.lang.en import English

In [2]:

# Create the nlp object
nlp = English()

In [3]:
doc = nlp("I love coffee")
print('hash value:', nlp.vocab.strings['coffee'])

hash value: 3197928453018144401


In [4]:
print('string value:', nlp.vocab.strings[3197928453018144401])

string value: coffee


In [5]:
doc = nlp("I love coffee")
lexeme = nlp.vocab['coffee']

In [6]:
# Print the lexical attributes
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


- Word text: lexeme.text and lexeme.orth (the hash)
- Lexical attributes like lexeme.is_alpha
- Not context-dependent part-of-speech tags, dependencies or entity labels

### Part-1:Strings to hash

In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I have a cat")

# Look up the hash for the word "cat"
cat_hash = nlp.vocab.strings['cat']
print(cat_hash)

# Look up the cat_hash to get the string
cat_string =nlp.vocab.strings[cat_hash]
print(cat_string)

5439657043933447811
cat


### Part-2

In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("David Bowie is a PERSON")

# Look up the hash for the string label "PERSON"
person_hash = nlp.vocab.strings['PERSON']
print(person_hash)

# Look up the person_hash to get the string
person_string = nlp.vocab.strings[person_hash]
print(person_string)

380
PERSON


# 2)- Data Structures (2) : Doc , span and Token

In [9]:
# Create an nlp object
from spacy.lang.en import English
nlp = English()

# Import the Doc class
from spacy.tokens import Doc

# The words and spaces to create the doc from
words = ['Hello', 'world', '!']
spaces = [False, True, False]

# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)

In [10]:
print(doc)

Helloworld !


### Span Object

In [11]:
# Import the Doc and Span classes
from spacy.tokens import Doc, Span

# The words and spaces to create the doc from
words = ['Hello', 'world', '!']
spaces = [True, False, False]

# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)

print(doc)

Hello world!


In [12]:
# Create a span manually
span = Span(doc, 0, 2)

print(span)

Hello world


In [13]:
# Create a span with a label
span_with_label = Span(doc, 0, 2, label="GREETING")

print(span_with_label)

Hello world


In [14]:

# Add span to the doc.ents
doc.ents = [span_with_label]

print(span_with_label)

Hello world


### Part 1:

In [15]:
# Import the Doc class
from spacy.tokens import Doc

# Desired text: "spaCy is cool!"
words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]

# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

spaCy is cool!


### Part 2:

- Import the Doc from spacy.tokens.
- Create a Doc from the words and spaces. Don’t forget to pass in the vocab!

In [16]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Import the Doc class
from spacy.tokens import Doc

# Desired text: "Go, get started!"
words = ["Go", ",", "get", "started", "!"]
spaces = [False, True, True, False, False]

# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Go, get started!


### Part 3

- Import the Doc from spacy.tokens.
- Complete the words and spaces to match the desired text and create a doc.

In [17]:
# Import the Doc class
from spacy.tokens import Doc

# Desired text: "Oh, really?!"
words = ["Oh", ",", "really", "?", "!"]
spaces = [False, True, False, False, False]

# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Oh, really?!


# 3)- Docs, spans and entities from scratch


- Use the Doc class directly to create a doc from the words and spaces.
- Create a Span for “David Bowie” from the doc and assign it the label "PERSON".
- Overwrite the doc.ents with a list of one entity, the “David Bowie” span.

In [18]:
from spacy.lang.en import English

nlp = English()

# Import the Doc and Span classes
from spacy.tokens import Doc, Span

words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

# Create a doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Add the span to the doc's entities
doc.ents = [span]

# Print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])

I like David Bowie
David Bowie PERSON
[('David Bowie', 'PERSON')]


# 4)- Get specific POS from text

- Write code so as to analyze a text and collect all proper nouns that are followed by a verb.

In [19]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin is a nice city")

# Iterate over the tokens
for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == "PROPN":
        # Check if the next token is a verb
        if doc[token.i + 1].pos_ == "VERB":
            print("Found proper noun before a verb:", token.text)

# 5)- Word vectors and semantic similarity

### Inspecting word vector

In [20]:
import spacy

# Load the en_core_web_md model
nlp = spacy.load("en_core_web_sm")

# Process a text
doc = nlp("Two bananas in pyjamas")

# Get the vector for the token "bananas"
bananas_vector = doc[1].vector
print(bananas_vector)

[-2.2387033   0.6563978  -1.3915625  -2.0158877  -2.6757379   0.6049806
 -4.9210434   2.6724901  -0.5421836  -0.8809319   2.6590538  -3.5114794
  5.2498507  -1.2459521   0.87745774 -1.4979573  -2.1457722  -1.1067222
 -2.1124492  -1.3459014   1.1430457  -0.8469293   3.5861008   2.6726484
 -2.3294182   2.974951   -0.02186409 -0.6708566   0.02608067  4.3714023
 -5.7176323  -0.4561702   1.0577139  -2.1087644  -1.9045851   2.221695
  0.6667701   2.1882637  -2.8947964   3.4675171   1.1178943   1.1830177
 -0.02594751  1.6294907   5.041691    0.4304657   0.04791719  0.5513984
 -0.36579067 -3.9674122   0.625882   -1.8514493   2.196616    2.5510483
  5.348059   -0.4365843   0.2364122  -0.630581   -1.3520204  -0.53069663
 -1.686786   -2.9399247   3.8511808   1.1052506  -2.691662    0.39372563
  0.34798118 -4.6369677   2.3882837   0.8316821  -1.1286397  -4.356789
 -1.0592571  -1.6973286  -0.05940416 -1.4840398  -0.2224785   1.9230247
 -0.56688464  2.5266325  -1.8940659  -0.01945698 -0.8910403   3.

# 6)- Comparing similarity 

### Part 1:

- Use the doc.similarity method to compare doc1 to doc2 and print the result.

In [21]:
#! python -m spacy download en_core_web_md

In [22]:
import spacy

nlp = spacy.load("en_core_web_md")

doc1 = nlp("It's a warm summer day")
doc2 = nlp("It's sunny outside")

# Get the similarity of doc1 and doc2
similarity = doc1.similarity(doc2)
print(similarity)

0.8789265574516525


### Part 2

- Use the token.similarity method to compare token1 to token2 and print the result.

In [23]:
import spacy

nlp = spacy.load("en_core_web_md")

doc = nlp("TV and books")
token1, token2 = doc[0], doc[2]

# Get the similarity of the tokens "TV" and "books"
similarity = token1.similarity(token2)
print(similarity)

0.2232533


### Part 3

- Create spans for “great restaurant”/“really nice bar”.
- Use span.similarity to compare them and print the result.

In [24]:
import spacy

nlp = spacy.load("en_core_web_md")

doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

# Create spans for "great restaurant" and "really nice bar"
span1 = doc[3:5]
span2 = doc[12:15]

# Get the similarity of the spans
similarity = span1.similarity(span2)
print(similarity)

0.7517392


# 7)-Combining models and rules

### a)- Rule based matching

In [25]:
# Initialize with the shared vocab
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Patterns are lists of dictionaries describing the tokens
pattern = [{'LEMMA': 'love', 'POS': 'VERB'}, {'LOWER': 'cats'}]
matcher.add('LOVE_CATS', None, pattern)

# Operators can specify how often a token should be matched
pattern = [{'TEXT': 'very', 'OP': '+'}, {'TEXT': 'happy'}]
matcher.add('VERY_HAPPY', None, pattern)

# Calling matcher on doc returns list of (match_id, start, end) tuples
doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)

In [26]:
matches

[(9137535031263442622, 1, 3),
 (2447047934687575526, 7, 9),
 (2447047934687575526, 6, 9)]

### b)-statistical predictions

In [27]:
matcher = Matcher(nlp.vocab)
matcher.add('DOG', None, [{'LOWER': 'golden'}, {'LOWER': 'retriever'}])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print('Matched span:', span.text)
    # Get the span's root token and root head token
    print('Root token:', span.root.text)
    print('Root head token:', span.root.head.text)
    # Get the previous token and its POS tag
    print('Previous token:', doc[start - 1].text, doc[start - 1].pos_)

Matched span: Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET


In [28]:
spacy.explain("DET")

'determiner'

### c)-Efficient phrase matching

In [29]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

pattern = nlp("Golden Retriever")
matcher.add('DOG', None, pattern)
doc = nlp("I have a Golden Retriever")

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Get the matched span
    span = doc[start:end]
    print('Matched span:', span.text)

Matched span: Golden Retriever


### d)- Extract key words from text

- Amazon

- ad-free

In [30]:
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad"}, {"TEXT": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
