In [1]:
import spacy

In [2]:
# Load spaCy's small English model; the returned pipeline object is bound to `nlp`
# and is what every later cell calls to process text.
nlp = spacy.load('en_core_web_sm') 

In [4]:
# Calling nlp on a string runs the pipeline and returns a Doc object that holds
# the processed text, already split into tokens.
# The u'' prefix marks a unicode string literal (redundant on Python 3, where
# every str is unicode).
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [10]:
# One output row per token: its text, part-of-speech name and dependency label.
# token.pos is the integer ID of the part of speech; token.pos_ is its readable name.
# token.dep_ is the name of the token's syntactic dependency relation.
for token in doc:
    print(' '.join((token.text, token.pos_, token.dep_)))

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [11]:
# Two sentences passed as one string are processed into a single Doc;
# each token is then listed with its part-of-speech name and dependency label.
my_sent = nlp(u'I do not feel like going out today. I am very busy learning spacy')
for token in my_sent:
    print(' '.join((token.text, token.pos_, token.dep_)))

I PRON nsubj
do VERB aux
not ADV neg
feel VERB ROOT
like ADP prep
going VERB pcomp
out PART prt
today NOUN npadvmod
. PUNCT punct
I PRON nsubj
am VERB ROOT
very ADV advmod
busy ADJ acomp
learning VERB xcomp
spacy ADJ dobj


In [12]:
nlp.pipeline  # the processing pipeline as a list of (name, component) pairs

[('tagger', <spacy.pipeline.Tagger at 0x2c1eb69a348>),
 ('parser', <spacy.pipeline.DependencyParser at 0x2c1eb8930a8>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x2c1eb893648>)]

In [15]:
nlp.pipe_names  # just the names of the pipeline components, without the component objects

['tagger', 'parser', 'ner']

In [16]:
# The first step in processing any text is tokenization: splitting it into tokens (words and punctuation marks).


In [20]:
# The run of extra spaces is deliberate: it comes out as a separate whitespace
# token (the SPACE row in the token listing of the next cell).
doc2 = nlp(u"Tesla isn't        looking into startup anymore.")

In [25]:
# token.lemma_ is the base (dictionary) form of the word, e.g. "is" -> "be".
# Each row shows: text, lemma, part-of-speech name, dependency label.
for token in doc2:
    print(' '.join((token.text, token.lemma_, token.pos_, token.dep_)))

Tesla tesla PROPN nsubj
is be VERB aux
n't not ADV neg
                SPACE 
looking look VERB ROOT
into into ADP prep
startup startup NOUN pobj
anymore anymore ADV advmod
. . PUNCT punct


In [26]:
# A Doc can be indexed like a list: doc2[i] is its i-th token (here the first one).
doc2[0].pos_

'PROPN'

In [27]:
# A longer text, written as adjacent string literals that Python joins into a
# single string before it is passed to nlp.
doc3 = nlp(
    u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    u'the phrase "Life is what happens to us while we are making other plans" was written by '
    u"cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [29]:
# A Span is a slice of a larger Doc: doc[start:end].
# Tokens 16 through 29 of doc3 cover the quotation, including both quote marks.
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [30]:
# Slicing a Doc yields a Span object rather than another Doc.
type(life_quote)

spacy.tokens.span.Span

In [31]:
# Three short sentences in one string (the last has no closing period),
# used below to show sentence segmentation.
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence")

In [33]:
# doc.sents iterates over the sentences spaCy detected in the Doc;
# each one is printed on its own line.
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence


In [47]:
doc4[6]  # token 6 is the "This" that opens the second sentence; append .is_sent_start to inspect its sentence-start flag

This