# Chapter 2: Large-scale data analysis with spaCy

- https://course.spacy.io/chapter2

# 1)- Data Structures

In [0]:
# import key modules related to text data
import spacy
from spacy.lang.en import English

In [0]:

# Create the nlp object: a blank English pipeline built from the English language class
nlp = English()

In [3]:
# Run the text through the pipeline, then ask the vocab's string store
# for the hash that corresponds to the string "coffee".
doc = nlp("I love coffee")
coffee_hash = nlp.vocab.strings['coffee']
print('hash value:', coffee_hash)

hash value: 3197928453018144401


In [4]:
# Reverse lookup: index the string store with the hash to get the text back.
coffee_string = nlp.vocab.strings[3197928453018144401]
print('string value:', coffee_string)

string value: coffee


In [0]:
# Process the text, then fetch the vocab entry (lexeme) for the word "coffee"
doc = nlp("I love coffee")
lexeme = nlp.vocab['coffee']

In [6]:
# Collect the lexeme's text, its hash (orth) and its is_alpha flag,
# then print them on one line separated by spaces.
lexical_attributes = (lexeme.text, lexeme.orth, lexeme.is_alpha)
print(*lexical_attributes)

coffee 3197928453018144401 True


- Word text: lexeme.text and lexeme.orth (the hash)
- Lexical attributes like lexeme.is_alpha
- Not context-dependent part-of-speech tags, dependencies or entity labels

### Part 1: Strings to hash

In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I have a cat")

# Shorthand for the vocab's string store, used for both lookups below
string_store = nlp.vocab.strings

# String -> hash: look up the hash for the word "cat"
cat_hash = string_store['cat']
print(cat_hash)

# Hash -> string: feed the hash back in to recover the word
cat_string = string_store[cat_hash]
print(cat_string)

5439657043933447811
cat


### Part-2

In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("David Bowie is a PERSON")

# Shorthand for the vocab's string store, used for both lookups below
string_store = nlp.vocab.strings

# String -> hash: look up the hash for the label string "PERSON"
person_hash = string_store['PERSON']
print(person_hash)

# Hash -> string: feed the hash back in to recover the label
person_string = string_store[person_hash]
print(person_string)

380
PERSON


# 2)- Data Structures (2): Doc, Span and Token

In [0]:
# Everything this cell needs: the English language class and the Doc container
from spacy.lang.en import English
from spacy.tokens import Doc

# Blank English pipeline; its vocab is shared with the Doc built below
nlp = English()

# Token texts, and for each token whether a space follows it.
# NOTE: with these flags there is no space after "Hello" and one after
# "world", so the doc renders as "Helloworld !".
words = ['Hello', 'world', '!']
spaces = [False, True, False]

# Build the Doc directly from the tokens instead of running the tokenizer
doc = Doc(nlp.vocab, words=words, spaces=spaces)

In [10]:
# Show the text of the manually constructed doc
print(doc)

Helloworld !


### Span Object

In [11]:
# Doc and Span container classes
from spacy.tokens import Doc, Span

# (token text, followed-by-space) pairs spelling out "Hello world!"
token_specs = [('Hello', True), ('world', False), ('!', False)]
words = [text for text, _ in token_specs]
spaces = [has_space for _, has_space in token_specs]

# Assemble the doc by hand from the token texts and whitespace flags
doc = Doc(nlp.vocab, words=words, spaces=spaces)

print(doc)

Hello world!


In [12]:
# Token boundaries of the span: starts at token 0, stops before token 2
span_start, span_end = 0, 2

# Build the span by hand from the doc and the two boundaries
span = Span(doc, span_start, span_end)

print(span)

Hello world


In [13]:
# Same token range as before (tokens 0 up to, but excluding, 2),
# this time tagged with a custom label.
greeting_label = "GREETING"
span_with_label = Span(doc, 0, 2, label=greeting_label)

print(span_with_label)

Hello world


In [14]:

# Overwrite the doc's entities with a list holding only the labelled span
doc.ents = [span_with_label]

# Prints the span itself (not doc.ents), so the output is the span's text
print(span_with_label)

Hello world


### Part 1:

In [15]:
# Doc container class
from spacy.tokens import Doc

# Desired text: "spaCy is cool!"
# Each pair is (token text, whether a space follows the token).
token_specs = [("spaCy", True), ("is", True), ("cool", False), ("!", False)]
words = [text for text, _ in token_specs]
spaces = [has_space for _, has_space in token_specs]

# Build the Doc from the token texts and whitespace flags, then show its text
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

spaCy is cool!


### Part 2:

- Import the Doc from spacy.tokens.
- Create a Doc from the words and spaces. Don’t forget to pass in the vocab!

In [17]:
import spacy
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")

# Desired text: "Go, get started!"
# Each pair is (token text, whether a space follows the token).
token_specs = [
    ("Go", False),
    (",", True),
    ("get", True),
    ("started", False),
    ("!", False),
]
# Split the pairs into the two parallel lists that Doc expects
words, spaces = map(list, zip(*token_specs))

# Build the Doc from the token texts and whitespace flags, then show its text
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Go, get started!


### Part 3

- Import the Doc from spacy.tokens.
- Complete the words and spaces to match the desired text and create a doc.

In [18]:
# Doc container class
from spacy.tokens import Doc

# Desired text: "Oh, really?!"
# Only the comma is followed by a space; every other token is not.
words = ["Oh", ",", "really", "?", "!"]
spaces = [word == "," for word in words]

# Build the Doc from the token texts and whitespace flags, then show its text
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Oh, really?!


# 3)- Docs, spans and entities from scratch


- Use the Doc class directly to create a doc from the words and spaces.
- Create a Span for “David Bowie” from the doc and assign it the label "PERSON".
- Overwrite the doc.ents with a list of one entity, the “David Bowie” span.

In [19]:
from spacy.lang.en import English
from spacy.tokens import Doc, Span

# Blank English pipeline; only its vocab is used below
nlp = English()

# (token text, followed-by-space) pairs for "I like David Bowie"
token_specs = [("I", True), ("like", True), ("David", True), ("Bowie", False)]
words = [text for text, _ in token_specs]
spaces = [has_space for _, has_space in token_specs]

# Build the doc by hand and show its text
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# "David Bowie" is tokens 2 and 3 (end index 4 is exclusive); label it "PERSON"
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Replace the doc's entities with just this span
doc.ents = [span]

# List every entity as a (text, label) pair
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)

I like David Bowie
David Bowie PERSON
[('David Bowie', 'PERSON')]


# 4)- Get specific POS from text

- Write code that analyzes a text and collects all proper nouns that are followed by a verb.

In [21]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin is a nice city")

# Report every proper noun that is immediately followed by a verb.
for token in doc:
    # Only proper nouns are of interest
    if token.pos_ != "PROPN":
        continue
    next_index = token.i + 1
    # Guard against the proper noun being the last token in the doc:
    # indexing doc[len(doc)] would raise an IndexError.
    if next_index < len(doc) and doc[next_index].pos_ == "VERB":
        print("Found proper noun before a verb:", token.text)

Found proper noun before a verb: Berlin


# 5)- Word vectors and semantic similarity

### Inspecting word vector

In [23]:
import spacy

# Load the en_core_web_sm model
# NOTE(review): the original comment named en_core_web_md, but the code loads
# en_core_web_sm. The small model presumably does not ship real word vectors,
# so confirm which model is intended for this vector inspection.
nlp = spacy.load("en_core_web_sm")

# Process a text
doc = nlp("Two bananas in pyjamas")

# Get the vector for the token "bananas" (the second token, index 1)
bananas_vector = doc[1].vector
print(bananas_vector)

[ 2.1561384   0.6859281  -1.8234854   0.4145496  -0.886605    5.0773377
  0.28650832  3.6156225  -2.627604    5.01052     2.6055033   5.4986916
 -0.82726336 -2.4128723  -1.5714562   0.67344356 -1.1230624   3.017315
  3.4531426   2.6312394  -2.3144596   2.0717711  -0.5736556  -0.5199362
 -0.4892068   1.4417053   1.1748309   3.291245    2.7368522   2.1909308
  2.4100504  -1.5442916  -0.81270695 -1.7967525  -2.4401696   0.96489155
 -5.071314    2.4865592  -1.1760099   1.0010973  -1.8218107   6.159581
  5.876448   -1.9877293   6.579393    1.0499439  -1.5798447  -4.1203165
 -0.17076118 -4.819325   -2.1152763  -4.640588    1.5844907  -3.2757292
  2.1921952  -0.47692332 -1.8678508   1.0092752   0.7716696  -0.37776387
  0.07058215 -0.18511617  5.209738   -3.002555   -1.8404679   4.089005
 -2.0230193   1.0394226  -1.7199193   1.0383378   0.23976706 -0.67239416
  1.3192352  -0.33726573  0.21724188 -0.5032941   0.26279616 -0.58214176
 -3.0981517  -4.9684753  -3.2268834  -4.5933228  -3.0618596  -0

# 6)- Comparing similarity 

### Part 1:

- Use the doc.similarity method to compare doc1 to doc2 and print the result.

In [30]:
# Download the en_core_web_md model that the similarity cells below load
! python -m spacy download en_core_web_md

Collecting en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K     |████████████████████████████████| 95.4MB 1.1MB/s 
[?25hBuilding wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.1.0-cp36-none-any.whl size=97126237 sha256=aa047a2953f18d2963a2d66d6de0d72fa2c0da79ea6abd85c32a3326f3af534c
  Stored in directory: /tmp/pip-ephem-wheel-cache-avv1m0x3/wheels/c1/2c/5f/fd7f3ec336bf97b0809c86264d2831c5dfb00fc2e239d1bb01
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [0]:
import spacy

nlp = spacy.load("en_core_web_md")

# The two texts to compare, processed in the same order as before
doc1, doc2 = nlp("It's a warm summer day"), nlp("It's sunny outside")

# Doc-level similarity score of doc1 against doc2
similarity = doc1.similarity(doc2)
print(similarity)

### Part 2

- Use the token.similarity method to compare token1 to token2 and print the result.

In [0]:
import spacy

nlp = spacy.load("en_core_web_md")

doc = nlp("TV and books")

# First and third tokens of the doc: "TV" and "books"
token1 = doc[0]
token2 = doc[2]

# Token-level similarity score of "TV" against "books"
similarity = token1.similarity(token2)
print(similarity)

### Part 3

- Create spans for “great restaurant”/“really nice bar”.
- Use span.similarity to compare them and print the result.

In [0]:
import spacy

nlp = spacy.load("en_core_web_md")

text = "This was a great restaurant. Afterwards, we went to a really nice bar."
doc = nlp(text)

# Token slices intended to cover "great restaurant" and "really nice bar"
span1, span2 = doc[3:5], doc[12:15]

# Span-level similarity score of the first span against the second
similarity = span1.similarity(span2)
print(similarity)