# 1) Sentence Segmentation Basics

In [0]:
# Import spaCy
import spacy

In [0]:
# Load spaCy's English pipeline 'en_core_web_sm'; the resulting `nlp` object is reused by the cells below
nlp = spacy.load(name='en_core_web_sm')

In [0]:
# Run the pipeline on a short text made of three period-terminated sentences
doc = nlp("This is a sentence. This is second sentence. This is last sentence.")

In [4]:
# Walk through the sentences of the document and print the text of each one
for sent in doc.sents:
    print(sent.text)

# Note: doc.sents is a generator, so it can be iterated over but not indexed

This is a sentence.
This is second sentence.
This is last sentence.


In [5]:
# doc.sents is a generator: it produces the sentences one by one instead of
# holding them in memory, so trying to grab an individual sentence by index
# raises TypeError: 'generator' object is not subscriptable.
# The error is caught and printed so that the demonstration stays visible
# without stopping a "Restart & Run All" at this cell.
try:
    doc.sents[0]
except TypeError as err:
    print("TypeError:", err)

TypeError: ignored

In [6]:
# Grab a single token: unlike doc.sents, the Doc itself can be indexed directly
doc[0]

This

In [0]:
# Individual tokens can be grabbed by index because the Doc holds its tokens in memory;
# sentences come from a generator, so wrap doc.sents in list() before indexing them

In [7]:
# Turn the generator into a list so that the first sentence can be indexed
list(doc.sents)[0]

This is a sentence.

In [8]:
type(list(doc.sents)[0])
# each sentence is a spaCy Span object, not a plain Python string

spacy.tokens.span.Span

# 2) Adding Rules

We can add our own rules for sentence segmentation, but they have to be added to the pipeline before the `Doc` object is created,

because that is where the segmentation into sentences takes place.

In [0]:
def set_custom_boundaries(doc):
    """Mark the token that follows every ';' as the start of a new sentence.

    Meant to be used as a pipeline component: it takes the document, sets
    ``is_sent_start`` in place on the relevant tokens, and returns the very
    same document object.
    """
    # The final token is left out of the scan: nothing follows it, so there
    # is no "next token" to flag.
    for token in doc[:-1]:
        if token.text != ';':
            continue
        following = doc[token.i + 1]
        following.is_sent_start = True
    return doc

# after a semicolon, the next token is marked as the start of a new sentence

# the for loop goes through every token except the last one because the rule
# marks the token at "token.i+1" as the start of a new sentence; for the last
# token in the document (doc_2) there is no following token to mark

In [0]:
# Insert the custom rule ahead of the parser so that sentence starts are set
# before parsing happens. The membership check keeps this cell safe to re-run:
# adding a component whose name is already in the pipeline raises an error.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before='parser')

In [11]:
nlp.pipe_names
# the new rule 'set_custom_boundaries' now shows up in the pipeline, right before 'parser';
# 'tagger', 'parser' and 'ner' were already there

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [0]:
# The new rule has to run before a document is parsed, so create a new Doc object;
# this text uses semicolons instead of periods between the sentences
doc_2 = nlp("This is a sentence; This is second sentence; This is last sentence.")

In [13]:
# Print the sentences of the new document to check the custom rule
for sent in doc_2.sents:
    print(sent)

# the new rule works: the text is now split into sentences at each semicolon

This is a sentence;
This is second sentence;
This is last sentence.
