In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm') #loading model

In [3]:
doc = nlp(u'Tesla is lookung at buying $6 million') #u --> unicode

In [4]:
for token in doc:
    print(token.text, token.pos_, token.dep_) #pos_ --> Parts Of Speach #dep syntactic dependency

Tesla PROPN nsubjpass
is AUX auxpass
lookung VERB ROOT
at ADP prep
buying VERB pcomp
$ SYM quantmod
6 NUM compound
million NUM dobj


In [5]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x10fa92beb80>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x10fa92a0400>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x10fa8fbdfa0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x10fa8fc0280>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x10fa92b2300>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x10fa933da00>)]

In [6]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [7]:
doc2 = nlp(u"tesla isn't looking      into its startups anymore.")

In [8]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

tesla NOUN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
      SPACE dobj
into ADP prep
its PRON poss
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [9]:
doc2[2]

n't

In [10]:
doc2[2].pos_

'PART'

In [11]:
doc2[2].dep_

'neg'

In [12]:
type(doc2)

spacy.tokens.doc.Doc

In [13]:
doc2[2:5]

n't looking      

In [14]:
hi = nlp(u"hello. my name is kaushik. and i stay in mangalore")

In [15]:
for sentence in hi.sents:
    print(sentence)

hello.
my name is kaushik.
and i stay in mangalore


In [16]:
print(hi[1])
hi[1].is_sent_start

.


False

In [17]:
print(hi[2])
hi[2].is_sent_start

my


True

In [18]:
doc3 = nlp(u"apple to build a new branch in hong kong !")

In [19]:
for token in doc3:
    print(token.text)

apple
to
build
a
new
branch
in
hong
kong
!


In [20]:
for token in doc3:
    print(token.text, end="")

appletobuildanewbranchinhongkong!

In [21]:
for token in doc3:
    print(token.text, end=" ")

apple to build a new branch in hong kong ! 

In [22]:
for token in doc3:
    print(token.text, end="|")

apple|to|build|a|new|branch|in|hong|kong|!|

In [23]:
for token in doc3:
    print(token.text, end=" | ")

apple | to | build | a | new | branch | in | hong | kong | ! | 

In [24]:
doc4 = nlp(u"Apple to build a new branch in Hong Kong for $6 millions!")

In [25]:
for entity in doc4.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print("")

Apple
ORG
Companies, agencies, institutions, etc.

Hong Kong
GPE
Countries, cities, states

$6 millions
MONEY
Monetary values, including unit



# Tokens Visualization

In [26]:
import spacy
from spacy import displacy #displacy inbuilt visualizer
nlp = spacy.load('en_core_web_sm') #loading model

In [27]:
doc = nlp(u"apple is going to build a U.K factory for $6 millions")

In [28]:
displacy.render(doc, style='dep', jupyter=True, options={'distance':110}) #we use jupyter=True because we are 
                                                                                #using jypyter notebook
                            #'dep' --> DEPENDENCY

### "Entity Recogniser"

In [29]:
doc2 = nlp("Apple sold nearly 20 thousand iPhone for the profit of $5 millon")
print(doc2)

Apple sold nearly 20 thousand iPhone for the profit of $5 millon


In [30]:
displacy.render(doc2,style='ent', jupyter=True) #'ent' --> Entity

In [31]:
doc3 = nlp(f"this is a sentence")

In [None]:
displacy.serve(doc3,style='ent') #if we are using it in '.py' script




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



## Stemming

Eg --> caresses ->caress
        ponies -> poni
        cats -> cat
        relational -> relate
        agreed -> agree

In [1]:
import nltk

In [2]:
from nltk.stem.porter import PorterStemmer

In [3]:
p_stemmer = PorterStemmer()

In [6]:
words = ['run', 'runner', 'running', 'runs', 'fairly']

workng


In [8]:
for word in words:
    print(word + '---->' + p_stemmer.stem(word))
# p_stemmer.stem(word)

run---->run
runner---->runner
running---->run
runs---->run
fairly---->fairli


In [9]:
from nltk.stem.snowball import SnowballStemmer #SnowballStemmer --> better version of stemming!!

In [10]:
s_stemmer = SnowballStemmer(language='english')

In [11]:
for word in words:
    print(word + '---->' + s_stemmer.stem(word))

run---->run
runner---->runner
running---->run
runs---->run
fairly---->fair


## Lemmatization

####  eg : nt --> not

In [12]:
import spacy

In [14]:
nlp = spacy.load('en_core_web_sm')

In [16]:
doc5 = nlp(f"i am a runner running in a race i loveto run since i ran today")

In [18]:
for token in doc5:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

i 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
i 	 PRON 	 4690420944186131903 	 I
loveto 	 NOUN 	 14826795900270463915 	 loveto
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
i 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [28]:
for token in doc5:
    print(f"{token.text:{10}}{token.pos_:{10}}{token.lemma:{60}}{token.lemma_:{30}}")

i         PRON                                               4690420944186131903I                             
am        AUX                                               10382539506755952630be                            
a         DET                                               11901859001352538922a                             
runner    NOUN                                              12640964157389618806runner                        
running   VERB                                              12767647472892411841run                           
in        ADP                                                3002984154512732771in                            
a         DET                                               11901859001352538922a                             
race      NOUN                                               8048469955494714898race                          
i         PRON                                               4690420944186131903I                             
l

## StopWords
### Eg: A, The, And etc

In [29]:
print(nlp.Defaults.stop_words)

{'say', 'she', 'forty', 'along', 'should', 'he', 'name', "'re", 'somewhere', "n't", 'though', 'over', 'since', 'hers', 'only', 'yours', 'down', 'beforehand', 'ourselves', 'into', 'if', '’ll', 'so', 'when', 'around', "'s", 'moreover', 'less', 'hundred', 'himself', 'please', 'but', 'amount', 'being', 'n’t', 'serious', 'least', 'often', 'already', 'becoming', 'themselves', '’d', "'d", 'whoever', 'mostly', 'seeming', 'nowhere', 'seem', 'thereupon', 'us', 'top', 'thus', 'anyone', 'four', 'her', 'bottom', 'few', 'further', 'for', 'some', 'whereas', 'hereby', 'sixty', 'such', 'former', 'could', 'even', 'him', 'against', 'whenever', 'one', 'go', 'anyway', 'nobody', 'beside', 'as', 'before', 'yourselves', 'except', 'mine', 'together', 'may', 'through', 'up', 'back', 'everyone', 'made', 'although', 'between', 'seemed', 'using', 'move', 're', 'about', 'his', 'everything', 'too', 'within', 'and', '’m', 'another', 'might', 'where', 'done', 'unless', 'twelve', 'last', 'latter', 'yet', 'very', 'an', 

In [30]:
len(nlp.Defaults.stop_words)

326

In [32]:
nlp.vocab['is'].is_stop #to check if it is a stop word

True

In [34]:
#to a add our own word
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True

In [37]:
nlp.vocab['btw'].is_stop
len(nlp.Defaults.stop_words) #our lrngth is now chamgednlp.vocab['btw'].is_stop

True

In [42]:
#to a remove a word from list
nlp.Defaults.stop_words.remove('elsewhere')
nlp.vocab['elsewhere'].is_stop = False

KeyError: 'elsewhere'

In [43]:
nlp.vocab['elsewhere'].is_stop
len(nlp.Defaults.stop_words) #our lrngth is now chamged

326

## Phase matching and Vocabulary

In [12]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

In [18]:
# solarpower
pattern1 = [{'LOWER':'solarpower'}]
# solar-power
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
# solarpower
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]

In [22]:
matcher.add('SolarPower',[pattern1,pattern2,pattern3])

In [38]:
doc6 = nlp(f"The Solar Power industry continuous to grow a solarpower incrases. Solar-Power is amazing")

In [39]:
found_match = matcher(doc6)

In [40]:
print(found_match)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [44]:
# to print out the words
for match_id, start, end in found_match:
    span = doc6[start:end] #start -> start of the string , end -> end of the string
    print(match_id, start, end,span.text)

8656102463236116519 1 3 Solar Power
8656102463236116519 8 9 solarpower
8656102463236116519 11 14 Solar-Power


In [47]:
# remove the pattern
matcher.remove('SolarPower')

In [59]:
# solarpower
pattern4 = [{'LOWER':'solarpower'}]
# solar-power
pattern5 = [{'LOWER':'solar'},{'IS_PUNCT':True, 'OP':'*'},{'LOWER':'power'}] #op -> optional parameter -> means, 'OP':'*'
                                                                                #tells, there might 0 or more '-'

In [60]:
matcher.add('SolarPower',[pattern4,pattern5])

In [61]:
doc7 = nlp(f"Solar--power is solarpower taaa!!")
found_match1 = matcher(doc7)
print(found_match1)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [62]:
for match_id, start, end in found_match1:
    span = doc6[start:end]
    print(match_id, start, end,span.text)

8656102463236116519 0 3 The Solar Power
8656102463236116519 4 5 continuous


## Phase Matching

In [64]:
from spacy.matcher import PhraseMatcher

In [66]:
matcher = PhraseMatcher(nlp.vocab)

In [94]:
with open('hello.txt') as f:
    doc8 = nlp(f.read()) #read the txt file

In [68]:
print(doc8)

By default, the matcher will only return the matches and not do anything else, 
like merge entities or assign labels. This is all up to you and can be defined individually for each pattern, 
by passing in a callback function as the on_match argument on add(). This is useful, 
because it lets you write entirely custom and pattern-specific logic. For example, 
you might want to merge some patterns into one token, while adding entity labels for other pattern types. 
You shouldnâ€™t have to create different matchers for each of those processes


In [95]:
pharses_list = ['adding', 'matches', 'custom', 'logic'] #give the list of items you want to match

In [96]:
pharses_patterns = [nlp(text) for text in pharses_list] # we are making each pharse_list to pharse document

In [97]:
print(pharses_patterns)

[adding, matches, custom, logic]


In [98]:
type(pharses_patterns)

list

In [99]:
type(pharses_patterns[0])

spacy.tokens.doc.Doc

In [100]:
matcher.add('RandomMatcher',[*pharses_patterns]) #to insert all word from list. ie.pharses_patterns[0], pharses_patterns[1]
                                                    #pharses_patterns[2], pharses_patterns[3]

In [101]:
matchs_found = matcher(doc8)

In [102]:
print(matchs_found)

[(17350695310500694472, 9, 10), (17350695310500694472, 66, 67), (17350695310500694472, 71, 72), (17350695310500694472, 89, 90)]


In [28]:
for match_id, start, end in matchs_found:
    span = doc8[start:end]
    print(match_id, start, end,span.text)

NameError: name 'matchs_found' is not defined

In [5]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [6]:
with open('hello.txt') as f:
    doc8 = nlp(f.read())

In [8]:
doc8[:36]

By default, the matcher will only return the matches and not do anything else, 
like merge entities or assign labels. This is all up to you and can be defined individually for

In [9]:
len(doc8)

112

In [15]:
doc_sentences = [sent for sent in doc8.sents]# to check how many sentences are present in the txt file

In [17]:
len(doc_sentences)

6

In [24]:
doc_sentences[1].text #used to call sentsnce line by line

'This is all up to you and can be defined individually for each pattern, \nby passing in a callback function as the on_match argument on add().'

In [27]:
for token in doc_sentences[1]:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.dep_:{10}} {token.lemma_:{10}}")

This       DET        nsubj      this      
is         AUX        ROOT       be        
all        DET        advmod     all       
up         ADP        prep       up        
to         ADP        prep       to        
you        PRON       pobj       you       
and        CCONJ      cc         and       
can        AUX        aux        can       
be         AUX        auxpass    be        
defined    VERB       conj       define    
individually ADV        advmod     individually
for        ADP        prep       for       
each       DET        det        each      
pattern    NOUN       pobj       pattern   
,          PUNCT      punct      ,         

          SPACE      punct      
         
by         ADP        prep       by        
passing    VERB       pcomp      pass      
in         ADP        prt        in        
a          DET        det        a         
callback   NOUN       compound   callback  
function   NOUN       dobj       function  
as         ADP        prep  

In [29]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [31]:
pattern = [{'LOWER':'return'},{'IS_SPACE':True,'OP':'*'} ,{'LOWER':'the'}]

In [32]:
matcher.add('Swimming',[pattern])

In [33]:
found_matches = matcher(doc8)

In [34]:
print(found_matches)

[(12881893835109366681, 7, 9)]


In [36]:
for match_id, start,end in found_matches:
    span = doc8[start:end]
    print(match_id, start,end , span.text)

12881893835109366681 7 9 return the


## print the surrounding text of found match


In [37]:
def surrounding(doc, start, end):
    print(doc[start-5:end+5])

In [38]:
surrounding(doc8, 7, 8)

, the matcher will only return the matches and not do


## print entire sentence that contsains the found match

In [42]:
for sentence in doc_sentences:
    if found_matches[0][1] < sentence.end:
        print(sentence)
        break

By default, the matcher will only return the matches and not do anything else, 
like merge entities or assign labels.
