# spacy
### 1. spacy is object oriented.
### 2.It provides most efficient nlp algorithm for a given task .
### 3.Hence if you are care about the end result ,go with spacy.
### 4.Prefixed setting library.
### 5.Perfect for developers.


## Chapter 1:          Finding words, phrases, names and concepts

 ## loading spacy

In [1]:
# Importing required libraries
import warnings
warnings.filterwarnings('ignore')
import nltk
import spacy

In [2]:
# creating blank nlp object and processing the text

nlp =spacy.blank('en')
# Doc object
doc =nlp('Hello Murthy')

for token in doc:
    print(token)  
    print(type(token))
    print(token.text)
    print(type(token.text))

Hello
<class 'spacy.tokens.token.Token'>
Hello
<class 'str'>
Murthy
<class 'spacy.tokens.token.Token'>
Murthy
<class 'str'>


In [3]:
# token indexing
doc = nlp('this is murthy.I love Statistics.')
indicies =[token.i for token in doc]
print(indicies)
# printing tokens
token_3 =doc[2]
print(token_3)

# Slicing of doc
tokens =doc[2:6] # "." also consedered as one of tokens
#print(tokens[2:6:2])  # throws an error
print(tokens)
print(list(doc)[1:5:2])


[0, 1, 2, 3, 4, 5, 6, 7]
murthy
murthy.I love
[is, .]


## lexical Attributes

In [4]:
 
doc =nlp("Apple is the first U.S.company to reach a $1 trillion.")
print("token and index:",dict(zip([token for token in doc],[token.i for token in doc])))
print("*******************************************************************************************************************************")
print("check alpha:",dict(zip([token.text for token in doc],[token.is_alpha for token in doc])))
print("*******************************************************************************************************************************")
# check whether doc has alpha or not
print(dict(zip([token.text for token in doc],[token.is_alpha for token in doc])))
print("*******************************************************************************************************************************")
# check whether doc has punctuation or not
print(dict(zip([token.text for token in doc],[token.is_punct for token in doc])))

print("*******************************************************************************************************************************")
#check whether doc has numeric or not
print(dict(zip([token.text for token in doc],[token.like_num for token in doc])))

# like_url & like_mail

token and index: {Apple: 0, is: 1, the: 2, first: 3, U.S.company: 4, to: 5, reach: 6, a: 7, $: 8, 1: 9, trillion: 10, .: 11}
*******************************************************************************************************************************
check alpha: {'Apple': True, 'is': True, 'the': True, 'first': True, 'U.S.company': False, 'to': True, 'reach': True, 'a': True, '$': False, '1': False, 'trillion': True, '.': False}
*******************************************************************************************************************************
{'Apple': True, 'is': True, 'the': True, 'first': True, 'U.S.company': False, 'to': True, 'reach': True, 'a': True, '$': False, '1': False, 'trillion': True, '.': False}
*******************************************************************************************************************************
{'Apple': False, 'is': False, 'the': False, 'first': False, 'U.S.company': False, 'to': False, 'reach': False, 'a': False, '$': False, '1'

In [5]:
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


## Trained pipelines



## Loading pipelines

In [6]:
# Load the "en_core_web_sm" pipeline
nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Print the document text
print(doc.text)

It’s official: Apple is the first U.S. public company to reach a $1 trillion market value


## Predicting linguistic annotations

In [7]:
nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

It          PRON      nsubj     
’s          VERB      ROOT      
official    ADJ       acomp     
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [8]:
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Iterate over the doc.ents and print the entity text and label_ attribute.
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text,":", ent.label_)

Apple : ORG
first : ORDINAL
U.S. : GPE
$1 trillion : MONEY


In [9]:
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)
# Get the span for "iPhone X"
iphone_x = doc[1:3] 
# Print the span text
print("Missing entity:", iphone_x.text)


Apple ORG
Missing entity: iPhone X


## Predicting name entities

In [10]:
nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X"
iphone_x = doc[1:3]

# Print the span text
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


## Rule based matching

### 1. Using the matcher 

In [11]:
# Import the Matcher
from spacy.matcher import Matcher

# Load a pipeline and create the nlp object
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

# Process some text
doc = nlp("Upcoming iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

In [12]:

# Import the Matcher
from spacy.matcher import Matcher

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

# Process some text
doc = nlp("Upcoming iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)
# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print("matched span is:",matched_span.text)

matched span is: iPhone X


### 2.Using Matcher

In [13]:
pattern = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "NOUN"}
]
matcher.add("pattern", [pattern])

# Process some text
doc = nlp("I loved dogs but now I love cats more.")
# Call the matcher on the doc
matches = matcher(doc)
# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print("matched span is:",matched_span.text)

matched span is: loved dogs
matched span is: love cats


In [14]:
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times
    {"POS": "NOUN"}
]
matcher.add("pattern", [pattern])
# Process some text
doc = nlp("I bought a smartphone. Now I'm buying apps.")
# Call the matcher on the doc
matches = matcher(doc)
# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print("matched span is:",matched_span.text)

matched span is: bought a smartphone
matched span is: buying apps


In [15]:
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Add the pattern to the matcher
matcher.add("IPHONE_X_PATTERN", [pattern])

# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X']


In [16]:
doc = nlp("Features of the app include a beautiful design, smart search, automatic "
          "labels and optional voice responses.")

# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text,)

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses


In [17]:
text ="green fund is a corporation"

In [18]:
D =nlp(text)

In [19]:
# Construction via add_pipe
ruler = nlp.add_pipe("entity_ruler")
# Construction from class
from spacy.pipeline import EntityRuler
ruler = EntityRuler(nlp, overwrite_ents=False)


In [20]:
ptrn =[{'LOWER':'green'},{"LOWER":"fund"},{"ORTH":"is"},{"LOWER":"corportaion"}]

In [21]:
ruler.add_patterns(ptrn)

KeyError: 'pattern'