In [191]:
import spacy

In [192]:
nlp = spacy.load('en_core_web_sm')

In [4]:
doc = nlp(u"the quick brown fox jumped over the lazy dog's back.")

In [5]:
print(doc.text)

the quick brown fox jumped over the lazy dog's back.


In [6]:
print(doc[4].pos_)  # part of speach

VERB


In [7]:
print(doc[4].tag_)  # tag

VBD


In [8]:
print(spacy.explain(doc[4].tag_))  # tag desription

verb, past tense


In [9]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

the        DET        DT         determiner
quick      ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [10]:
doc2 = nlp(u"I m reading books on nlp")

In [11]:
word = doc2[2]

In [12]:
word.text

'reading'

In [13]:
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

reading    VERB       VBG        verb, gerund or present participle


In [14]:
doc3 = nlp(u"i read a book in the nlp")
word = doc3[1]

In [15]:
word.text
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBD        verb, past tense


In [16]:
doc4 = nlp(u"the quick brown fox jumped over the lazy dog's back.")

In [17]:
# attrs -> attributes , itgive dictionary of part of speach counr, code
POS_counts = doc.count_by(spacy.attrs.POS)

In [18]:
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [19]:
doc4.vocab[84].text  # to get the value of the previous number

'ADJ'

In [20]:
for k, v in sorted(POS_counts.items()):
    # 84 -> POS, ADJ -> POS.description, 3-> no. of. times repeated
    print(f"{k :{10}}. {doc4.vocab[k].text:{5}} {v}")

        84. ADJ   3
        85. ADP   1
        90. DET   2
        92. NOUN  3
        94. PART  1
        97. PUNCT 1
       100. VERB  1


## visualizing POS

In [21]:
import spacy

In [22]:
nlp = spacy.load('en_core_web_sm')

In [23]:
doc = nlp(u"the quick brown for jumped over tht lazy dog")

In [24]:
from spacy import displacy

In [25]:
displacy.render(doc, style='dep', jupyter=True)

In [26]:
options = {'distance': 110, 'compact': 'True', 'colour': 'yellow',
           'bg': '#09a3d5', 'font': 'Times'}  # to edit the previoous diagram

In [27]:
displacy.render(doc, style='dep', jupyter=True, options=options)

## Named Entity Recognition (NER)

#### "Jim bought 300 shares of Acme Corp. in 2006"
#### JIM -> Person
#### Acme Corp -> Organisatiom
#### 2006 -> Time 

In [28]:
import spacy

In [29]:
nlp = spacy.load('en_core_web_sm')

In [30]:
# displacy basic entity Info
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - '+ent.label_+' - ' +
                  str(spacy.explain(ent.label_)))
    else:
        print('No entities found')  # if no entity

In [31]:
doc = nlp(u"Hi how are you?")

In [32]:
show_ents(doc)

No entities found


In [33]:
doc2 = nlp(u"May i go to washington, DC next May to see the Washington Monument?")

In [34]:
show_ents(doc2)

DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [35]:
doc3 = nlp(u"Can i please have 500 dollars of Microsoft stock??")
for i in doc3:
    doc4 = i.title()
print(doc4)

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'title'

In [36]:
show_ents(doc3)

500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


### To add the single word which is not available in ents, eg. Tesla, Microsoft etc

In [37]:
doc = nlp(u"Tesla to build a U.K. factory for $6 million")

In [38]:
doc.ents

(U.K., $6 million)

In [39]:
show_ents(doc)  # Tesla is not showing as Org

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [40]:
from spacy.tokens import Span

In [41]:
ORG = doc.vocab.strings[u"ORG"]

In [42]:
ORG  # gets the hash value for "org"

383

In [43]:
# (take the document in which the word is present ie."doc"), (0,1 -> is thw word Tesla)
new_ent = Span(doc, 0, 1, label=ORG)
# (label -> label to given to the word tesla)

In [44]:
# appending the new entity to the doc.ents
doc.ents = list(doc.ents) + [new_ent]

In [45]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


### To add the multiple word which is not available in ents, eg. vacume cleaner etc.

In [46]:
doc = nlp(u"Our company created a brand new vacuum cleaner."
          u"This new vaccum-cleaner is the best in show")

In [47]:
show_ents(doc)

No entities found


In [48]:
# to match the list of things to be matched
from spacy.matcher import PhraseMatcher

In [49]:
matcher = PhraseMatcher(nlp.vocab)

In [50]:
phrse_list = ['vacuum cleaner', 'vaccum-cleaner']  # list of this to be matched

In [61]:
pharse_patterns = [nlp(text) for text in phrse_list]
pharse_patterns

[vacuum cleaner, vaccum-cleaner]

In [52]:
matcher.add('newproduct', None, *pharse_patterns)  # name fot he matched item

In [53]:
found_matches = matcher(doc)

In [54]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [55]:
from spacy.tokens import span

In [56]:
ped = doc.vocab.strings[u"PRODUCT"]

In [57]:
ped

386

In [58]:
new_ent = [Span(doc, match[1], match[2], label=ped) for match in found_matches]

In [59]:
doc.ents = list(doc.ents) + new_ent

In [60]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vaccum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [109]:
from spacy import displacy
from spacy.tokens import span

In [116]:
doc = nlp(u"Over the last quarter Apple sold nearly thousand iPod for a profit of $6 million."
          u"By constrast, Sony sold 8 thousand Walkman music players.")

In [123]:
# We get the two sentence togther!
displacy.render(doc, style='ent', jupyter=True)

In [124]:
# Hence to seperate the lines we do!!
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [125]:
# Colouring , select specific
colour = {'ORG': 'red', 'Product': 'Blue'}
options = {'ents': ['ORG', 'Product'], 'colors': colour}

In [126]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True, options=options)

In [127]:
# Colouring , select specific
colour = {'ORG': 'radial-gradient(yellow, red)', 'Product': 'Blue'}
options = {'ents': ['ORG', 'Product'], 'colors': colour}

In [128]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True, options=options)

In [129]:
# Colouring , select specific
colour = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)',
          'Product': 'linear-gradient(180deg, orange,  red)'}
options = {'ents': ['ORG', 'Product'], 'colors': colour}

In [130]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True, options=options)

## Sentence Segentation

In [131]:
import spacy

In [133]:
nlp = spacy.load('en_core_web_sm')

In [136]:
doc = nlp(u"This is 1st sentence. This is 2nd sentence. This is 3rd sentence")

In [137]:
for i in doc.sents:
    print(i)

This is 1st sentence.
This is 2nd sentence.
This is 3rd sentence


In [139]:
# If we try to tke specific snetence, this will fail!
# doc.sents[0]
doc[0]

This

In [142]:
# To grab the sentences,
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [170]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right thing." -Kaushik K')

In [171]:
doc.text

'"Management is doing the right things; leadership is doing the right thing." -Kaushik K'

In [172]:
for i in doc.sents:
    print(i)
    print('\n')

"Management is doing the right things; leadership is doing the right thing."


-Kaushik


K




In [173]:
# ADD A SEGMENTATION RULE
def set_custom_boundaries(doc):
    for token in doc:
        print(token)
        print(token.i)

In [174]:
set_custom_boundaries(doc)

"
0
Management
1
is
2
doing
3
the
4
right
5
things
6
;
7
leadership
8
is
9
doing
10
the
11
right
12
thing
13
.
14
"
15
-Kaushik
16
K
17


In [180]:
# ADD A SEGMENTATION RULE
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            # when it finds ';' , the very next word is the starting of the sentence
            doc[token.i+1].is_sent_start = True
    return doc

In [181]:
doc[:-1]

"Management is doing the right things; leadership is doing the right thing." -Kaushik

In [194]:
nlp.add_pipe(set_custom_boundaries)

nlp.pipe_names

ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <function set_custom_boundaries at 0x0000015390C54B80> (name: 'None').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.

In [183]:
# CHANGE SEGMENTATION RULE