# Container objects
1. Container objects hold the properties (annotations) of the text.
2. They are the logical representations of the text:
   1. Doc
   2. Token
   3. Span

In [5]:
import spacy
#? load the "en_core_web_md" pipeline by name; the returned `nlp` object is called on
#? raw text in the cells below to produce Doc objects.
nlp = spacy.load("en_core_web_md")

In [3]:
#? list the public container classes exported by spacy.tokens.
spacy.tokens.__all__

['Doc', 'Token', 'Span', 'SpanGroup', 'DocBin', 'MorphAnalysis']

# Doc

The `Doc` object represents the text.

In [6]:
#? calling the pipeline on a string returns a Doc; .text gives back the string.
doc = nlp("Apply the pipeline to some text.")
doc.text

'Apply the pipeline to some text.'

In [7]:
#? check the class of the object returned by the pipeline.
type(doc)

spacy.tokens.doc.Doc

## token

In [8]:
#? tokens are the building blocks of the Doc object; iterating over a Doc yields them in order.
for token in doc:
    print(token.text)
#? `token` still refers to the last item of the loop here, so this shows the Token class.
type(token)

Apply
the
pipeline
to
some
text
.


spacy.tokens.token.Token

## number of tokens

In [9]:
#? the length of a Doc is its number of tokens (the final "." counts as one), not characters.
len(doc)

7

## sentence

In [10]:
doc = nlp("This is sentence. This is second sentence.")
#? doc.sents is a generator, not a list, so displaying it only shows the generator object.
doc.sents

<generator at 0x1c0d92c9300>

In [11]:
#? materialise the generator into a list to see the individual sentences.
sentences = list(doc.sents)
sentences

[This is sentence., This is second sentence.]

## named entity

In [14]:
doc = nlp("I went to New Delhi with XYZ.")
doc.ents
#? doc.ents returns a tuple of Span objects, one per named entity found in the text.

(New Delhi, XYZ)

In [16]:
#? each entity in doc.ents is a Span object.
type(doc.ents[0])

spacy.tokens.span.Span

## noun_chunks

In [17]:
doc = nlp("Germany's biggest publisher warns AI could 'replace' journalism")
#? doc.noun_chunks is a generator as well, so it has to be consumed to see the chunks.
doc.noun_chunks

<generator at 0x1c0d92c9bc0>

In [18]:
#? materialise the generator to see the noun phrases.
list(doc.noun_chunks)

[Germany's biggest publisher, AI, journalism]

In [19]:
#? language code of the Doc; the trailing underscore selects the string form of the attribute.
doc.lang_

'en'

## to_json

In [23]:
doc = nlp("I love cake.")
#? export the text plus entity, sentence and per-token annotations as a plain dict.
doc.to_json()

{'text': 'I love cake.',
 'ents': [],
 'sents': [{'start': 0, 'end': 12}],
 'tokens': [{'id': 0,
   'start': 0,
   'end': 1,
   'tag': 'PRP',
   'pos': 'PRON',
   'morph': 'Case=Nom|Number=Sing|Person=1|PronType=Prs',
   'lemma': 'I',
   'dep': 'nsubj',
   'head': 1},
  {'id': 1,
   'start': 2,
   'end': 6,
   'tag': 'VBP',
   'pos': 'VERB',
   'morph': 'Tense=Pres|VerbForm=Fin',
   'lemma': 'love',
   'dep': 'ROOT',
   'head': 1},
  {'id': 2,
   'start': 7,
   'end': 11,
   'tag': 'NN',
   'pos': 'NOUN',
   'morph': 'Number=Sing',
   'lemma': 'cake',
   'dep': 'dobj',
   'head': 1},
  {'id': 3,
   'start': 11,
   'end': 12,
   'tag': '.',
   'pos': 'PUNCT',
   'morph': 'PunctType=Peri',
   'lemma': '.',
   'dep': 'punct',
   'head': 1}]}

## `__init__`

In [29]:
from spacy.tokens import Doc

#? first way to get a Doc: call the loaded pipeline on a string.
doc1 = nlp("One way to create doc object")
doc1

One way to create doc object

In [30]:
#? second way: construct a Doc directly from the pipeline's vocab and a list of words.
#? no `spaces` argument is given, so every word is followed by a space
#? (hence the trailing space when doc2 is displayed).
doc2 = Doc(vocab=nlp.vocab,
           words=["another","way","to","create","doc","object"])

In [31]:
#? display the manually constructed Doc.
doc2

another way to create doc object 

## set_extension

We can register a custom attribute on the `Doc` class with `Doc.set_extension`; it is then accessible on every doc through `doc._.<name>`.

In [32]:
#? getter extension: evaluated on access, True when the text mentions one of the listed cities.
city_getter = lambda doc: any(city in doc.text for city in ("Chennai","New Delhi"))
#? force=True overwrites an already registered "has_city" extension, so re-running
#? this cell in the same kernel does not raise an "extension already exists" error.
Doc.set_extension(name="has_city",
                  getter=city_getter,
                  force=True)

In [33]:
doc = nlp("I went to Chennai")
#? custom attributes are read through the `._` namespace of the Doc.
doc._.has_city

True

## get_extension

In [34]:
#? returns the (default, method, getter, setter) tuple registered under this name;
#? only the getter slot is filled because the extension was registered with a getter.
Doc.get_extension("has_city")

(None, None, <function __main__.<lambda>(doc)>, None)

## has_extension

In [35]:
#? check whether an extension with this name is registered on Doc.
Doc.has_extension("has_city")

True

## remove_extension

In [36]:
#? unregister the extension; the removed (default, method, getter, setter) tuple is returned.
Doc.remove_extension("has_city")

(None, None, <function __main__.<lambda>(doc)>, None)

In [39]:
#? after removal the extension is no longer registered.
Doc.has_extension("has_city")

False

## char_span
Create a `Span` object from character offsets into the text.

In [47]:
doc = nlp("Apply the pipeline to some text")
#? characters 10-18 (end exclusive) cover exactly the token "pipeline".
#? if the character indices don't map onto token boundaries, char_span returns None
#? (and span.text below would then raise AttributeError).
span = doc.char_span(start_idx=10,end_idx=18,label="main")
span.text

'pipeline'

## set_ents

Set the named entities in the document.

In [48]:
from spacy.tokens import Span

In [49]:
doc = nlp("Mr. Doc went to Chennai")
#? Span(doc, 0, 2) covers tokens 0 and 1 (end exclusive), i.e. "Mr. Doc".
#? with set_ents' default settings, tokens outside the given spans are marked as
#? not being part of any entity, so only this span is listed afterwards.
doc.set_ents([Span(doc,0,2,label="PERSON")])
doc.ents

(Mr. Doc,)

## similarity

In [50]:
#? compare two Docs; the result is a float where higher means more similar.
cake = nlp("I like cake.")
apples = nlp("I like apples.")
apples.similarity(cake)


0.9564221234381862

## count_by

In [52]:
from spacy.attrs import ORTH

doc = nlp("apple apple orange banana")
#? count tokens grouped by the given attribute; the dict keys are the attribute's
#? integer IDs (resolve them to strings with doc.vocab.strings[key]).
doc.count_by(attr_id=ORTH)

{8566208034543834098: 2, 2208928596161743350: 1, 2525716904149915114: 1}

## has_annotation

Check whether the `Doc` contains annotations for a given token attribute.

In [54]:
doc = nlp("This is a test.")
#? True here: the pipeline has set the DEP attribute on the tokens of this Doc.
doc.has_annotation("DEP")

True

In [61]:
#? list the names defined in spacy.attrs, skipping those that start with "FLA"
#? or end with a double underscore.
[name for name in dir(spacy.attrs)
 if not name.startswith("FLA") and not name.endswith("__")]

['',
 'DEP',
 'ENT_ID',
 'ENT_IOB',
 'ENT_KB_ID',
 'ENT_TYPE',
 'Errors',
 'HEAD',
 'ID',
 'IDS',
 'IDX',
 'IOB_STRINGS',
 'IS_ALPHA',
 'IS_ASCII',
 'IS_BRACKET',
 'IS_CURRENCY',
 'IS_DIGIT',
 'IS_LEFT_PUNCT',
 'IS_LOWER',
 'IS_OOV_DEPRECATED',
 'IS_PUNCT',
 'IS_QUOTE',
 'IS_RIGHT_PUNCT',
 'IS_SPACE',
 'IS_STOP',
 'IS_TITLE',
 'IS_UPPER',
 'LANG',
 'LEMMA',
 'LENGTH',
 'LIKE_EMAIL',
 'LIKE_NUM',
 'LIKE_URL',
 'LOWER',
 'MORPH',
 'NAMES',
 'NORM',
 'ORTH',
 'POS',
 'PREFIX',
 'SENT_START',
 'SHAPE',
 'SPACY',
 'SUFFIX',
 'TAG',
 'intify_attr',
 'intify_attrs',
 'key',
 'value']

## to_array

In [64]:
doc = nlp("This is a test.")
#? export one attribute per token as a numpy array of integer IDs.
doc.to_array("POS")

array([95, 87, 90, 92, 97], dtype=uint64)

In [65]:
#? string-valued attributes such as LEMMA come back as 64-bit integer IDs, not text.
doc.to_array("LEMMA")

array([ 1995909169258310477, 10382539506755952630, 11901859001352538922,
        1618900948208871284, 12646065887601541794], dtype=uint64)

In [67]:
#? `.pos_` (trailing underscore) gives the string label; `.pos` would return the
#? integer ID instead (95 for this token, as in the to_array("POS") output).
doc[0].pos_

'PRON'