In [10]:
# Spacy library is being used
import spacy

In [11]:
# Load the pre-trained English pipeline. The package must already be installed,
# e.g. via `python -m spacy download en_core_web_sm`.
nlp = spacy.load('en_core_web_sm')  # "sm" = the small variant of the model

In [12]:
# Run the text through the pipeline; the result is a Doc made up of tokens.
doc = nlp('Tesla is looking at buying U.S. startup for $6 million')

* The code above parses the entire string into separate components for us.
* After parsing, each component is called a token.

In [13]:
for token in doc:
    # token.text -> the raw text of the token
    # token.pos_ -> its part-of-speech label (e.g. PROPN, VERB, NUM)
    print(token.text, token.pos_)

Tesla PROPN
is AUX
looking VERB
at ADP
buying VERB
U.S. PROPN
startup NOUN
for ADP
$ SYM
6 NUM
million NUM


In [14]:
# The processing pipeline as a list of (name, component) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x204a9e5a9e0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x204a9e5ada0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x204aa9b2570>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x204acd9cfc0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x204acddfdc0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x204aa9b26c0>)]

In [9]:
# Just the names of the pipeline components.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [15]:
# TOKENIZATION - splitting the text into its individual components (tokens).
# Note that the contraction "isn't" does not stay a single token.
doc2 = nlp("Tesla isn't looking into startups anymore")

In [17]:
# Calling nlp(...) returns a spaCy Doc object.
type(doc2)

spacy.tokens.doc.Doc

In [20]:
# For every token show its text, part-of-speech label and syntactic
# dependency label (e.g. nsubj, ROOT).
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod


In [29]:
print(doc2[0])      # indexing a Doc returns the token at that position
print(doc2[0].pos_)    # attributes can be read directly off an indexed token; pos_ is the coarse part of speech
print(doc2[0].dep_)    # syntactic dependency label
print(doc2[0].tag_)    # fine-grained part-of-speech tag
print(doc2[3].lemma_)   # lemma (base form) of the token: "looking" -> "look"

Tesla
PROPN
nsubj
NNP
look


SPANS - a span is a slice of the document object, written as doc[start:stop] (the token at index `stop` is not included).

In [30]:
# Slice out the tokens at indices 0-3 (the stop index 4 is excluded).
new_var = doc2[0:4]

In [31]:
# Display the sliced tokens.
new_var

Tesla isn't looking

In [32]:
# A slice of a Doc is a Span, not a Doc.
type(new_var)

spacy.tokens.span.Span

In [33]:
# For comparison: the full document is still a Doc.
type(doc2)

spacy.tokens.doc.Doc

In [34]:
# A multi-sentence text, used below to pull out the individual sentences.
doc3 = nlp("This is the first sentence. This is the second sentence. This is the third sentence.")

In [35]:
# doc3.sents yields the sentences of the document one at a time.
for sentence in doc3.sents:
    print(sentence)

This is the first sentence.
This is the second sentence.
This is the third sentence.
