In [1]:
#!conda install -c conda-forge spacy

In [2]:
#!python -m spacy download en_core_web_sm

In [3]:
import spacy
nlp= spacy.load('en_core_web_sm')

In [4]:
#create a doc
doc = nlp('Tesla is looking at buying U.S. startup for $6 million')
print(doc[0])
#print each token separately
for token in doc:
    print(token.text,token.pos_,token.dep_)

Tesla
Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [5]:
#nlp.pipeline
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
#also count spaces if given two or more time.
doc2= nlp("Tesla isn't   looking into startup anymore.")
for token in doc2:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
   SPACE dep
looking VERB ROOT
into ADP prep
startup NOUN pobj
anymore ADV advmod
. PUNCT punct


In [7]:
# part of speech
doc2[0].pos_

'PROPN'

In [8]:
#dependencies
doc2[0].dep_
spacy.explain(doc2[0].dep_)# explain is used to write full form

'nominal subject'

In [9]:
#also count spaces if given two or more time.
doc3= nlp("Tesla isn't   looking into startup anymore.I'm not going to invest init.")
for token in doc3:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
   SPACE dep
looking VERB ROOT
into ADP prep
startup NOUN pobj
anymore ADV advmod
. PUNCT punct
I'm PRON nsubj
not PART neg
going VERB ROOT
to PART aux
invest VERB xcomp
init NOUN dobj
. PUNCT punct


In [10]:
spacy.explain(doc3[9].pos_) # explain used to write full form

'pronoun'

In [11]:
print(doc3[0].is_alpha) #is the token an alpha character
print(doc3[1].text) #the original word text
print(doc3[5].lemma_) # shorten word
print(doc3[6].pos_) #part of speech
print(doc3[8].tag) #the detailed part of speech tag
print(doc3[10].shape_) #is the word cap,small
print(doc3[2].is_stop) #is this stop word


True
is
into
NOUN
12646065887601541794
xxx
True


# SPAN

# when token is slice it is called span.

In [12]:
doc4=nlp("This is first sentence. This is second sentence. This is third sentence.")
for sent in doc4.sents:
    print(sent)

This is first sentence.
This is second sentence.
This is third sentence.


In [13]:
doc4[5].is_sent_start

True

# Tokenization

In [14]:
mystring='"we\'re moving to L.A.!"'
print(mystring)

"we're moving to L.A.!"


In [15]:
doc5=nlp(mystring)
for token in doc5:
    print(token.text, end='|')

"|we|'re|moving|to|L.A.|!|"|

# prefixes,suffix,infixes


In [16]:
doc6=nlp("we're here to help! send snail-mail, email support@outside.com or visit us at https://www.outside.com!")
for t in doc6:
    print(t)

we
're
here
to
help
!
send
snail
-
mail
,
email
support@outside.com
or
visit
us
at
https://www.outside.com
!


In [17]:
doc7=nlp(u"Let's visit st. louis in the U.s next year +92310_5849538")
for t in doc7:
    print(t)

Let
's
visit
st
.
louis
in
the
U.s
next
year
+92310_5849538


# Named entities

In [18]:
doc8=nlp(u"Apple to build a hong kong factory for $6million.The apple is healthy fruit.Mango is more sweeter then apple. ")
for token in doc8:
    print(token.text,end='|')
print('\n--')
for ent in doc8.ents:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))

Apple|to|build|a|hong|kong|factory|for|$|6million|.|The|apple|is|healthy|fruit|.|Mango|is|more|sweeter|then|apple|.|
--
Apple - ORG - Companies, agencies, institutions, etc.
hong kong - GPE - Countries, cities, states
6million - MONEY - Monetary values, including unit
Mango - LOC - Non-GPE locations, mountain ranges, bodies of water


# Noun chunks

In [19]:
doc9=nlp("Autonomous cars shift insurance liability towards manufacturers ")
for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


In [20]:


import PyPDF2 as pdf
import spacy
nlp= spacy.load('en_core_web_sm')
f=open('Business_Proposal.pdf','rb')
#files reading object
pdf_reader=pdf.PdfReader(f)
pdf_reader.pages
page_one=pdf_reader.pages[0]
page_one_text=page_one.extract_text()
f=nlp(page_one_text)
noun_chunks = list(f.noun_chunks)
with open('noun_chunks.txt', 'a') as f:
            for chunk in noun_chunks:
                 f.write(chunk.text + '\n')

In [21]:
from spacy import displacy

In [31]:
doc=nlp(u"Apple to build a U.K. factory for $6million.")
displacy.render(doc, style="ent", jupyter=True)

In [25]:
doc=nlp(u"Apple to build a U.K. factory for $6million.")
displacy.serve(doc, style="dep",  options={'distance':80})#serve is used to view it on web by writing localhost:5000




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [17/Oct/2023 15:09:10] "GET / HTTP/1.1" 200 7535
127.0.0.1 - - [17/Oct/2023 15:09:10] "GET /favicon.ico HTTP/1.1" 200 7535


Shutting down server on port 5000.


In [39]:
import nltk
from nltk.stem.snowball import SnowballStemmer

s_stemmer = SnowballStemmer(language='english')
words = ['run', 'runner','running', 'ran', 'runs', 'easily', 'fairly']
for word in words:
    print(word+' ---> ' +s_stemmer.stem(word))

run ---> run
runner ---> runner
running ---> run
ran ---> ran
runs ---> run
easily ---> easili
fairly ---> fair


In [41]:
import nltk

from nltk.stem.porter import *

p_stemmer = PorterStemmer()

words = ['run', 'runner','running', 'ran', 'runs', 'easily','fairly']
for word in words:
    print(word+' ---> ' +p_stemmer.stem(word))

run ---> run
runner ---> runner
running ---> run
ran ---> ran
runs ---> run
easily ---> easili
fairly ---> fairli


In [42]:
print('Porter Stemmer:')
for word in words:
    print(word+'---->' +p_stemmer.stem(word))

Porter Stemmer:
run---->run
runner---->runner
running---->run
ran---->ran
runs---->run
easily---->easili
fairly---->fairli


In [43]:
print('Porter2 Stemmer:')
for word in words:
    print(word+'---->' +s_stemmer.stem(word))

Porter2 Stemmer:
run---->run
runner---->runner
running---->run
ran---->ran
runs---->run
easily---->easili
fairly---->fair


In [45]:
phrase="i will meeting him tommor at his places"
for word in phrase.split():
    print(word+'---->' +p_stemmer.stem(word))

i---->i
will---->will
meeting---->meet
him---->him
tommor---->tommor
at---->at
his---->hi
places---->place


In [49]:
doc11=nlp("i am a runner running in a race because i love to run since i ran today")
for token in doc11:
    print(token.text,"\t",token.pos_,'\t','\t',token.lemma_)

i 	 PRON 	 	 I
am 	 AUX 	 	 be
a 	 DET 	 	 a
runner 	 NOUN 	 	 runner
running 	 VERB 	 	 run
in 	 ADP 	 	 in
a 	 DET 	 	 a
race 	 NOUN 	 	 race
because 	 SCONJ 	 	 because
i 	 PRON 	 	 I
love 	 VERB 	 	 love
to 	 PART 	 	 to
run 	 VERB 	 	 run
since 	 SCONJ 	 	 since
i 	 PRON 	 	 I
ran 	 VERB 	 	 run
today 	 NOUN 	 	 today


In [50]:
def show_lemmas (text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [51]:
doc12=nlp("i saw eighteen mice today")
show_lemmas(doc12)

i            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today


In [52]:
doc13=nlp("i an meeting him tomorrow at the meeting")
show_lemmas(doc13)

i            PRON   4690420944186131903    I
an           DET    15099054000809333061   an
meeting      NOUN   14798207169164081740   meeting
him          PRON   1655312771067108281    he
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting
