# 1) Sentence Segmentation Basics

In [None]:
# official documentation
# https://spacy.io/usage/linguistic-features/#sbd

In [None]:
# Import spaCy
import spacy

In [None]:
# load the English language library
nlp = spacy.load(name="en_core_web_sm")

In [None]:
document = nlp("This is a sentence. This is second sentence. This is last sentence.")

In [None]:
for sentence in document.sents:
    print(sentence.text)

# note that doc.sents is a generator

This is a sentence.
This is second sentence.
This is last sentence.


In [None]:
# doc.sents is a generator: it produces the sentences one by one instead of
# holding them in memory, so it cannot be indexed directly.
# The TypeError is the point of this cell, so catch and print it instead of
# letting it stop a "Restart Kernel -> Run All".
try:
    document.sents[0]
except TypeError as error:
    # prints: TypeError: 'generator' object is not subscriptable
    print(f"TypeError: {error}")

TypeError: 'generator' object is not subscriptable

In [None]:
# grab a token
document[0]

This

In [None]:
# individual tokens can be grabbed by indexing the doc object directly (see the cell above);
# to grab individual sentences, first turn the doc.sents generator into a list
list(document.sents)[0]

This is a sentence.

In [None]:
# these are spaCy Span objects, not normal strings
type(list(document.sents)[0])

spacy.tokens.span.Span

In [None]:
# now put U.K. to see if it only breaks the sentence on dots or not
documentTwo = nlp("This is a sentence. This is second U.K. sentence. This is last sentence.")

In [None]:
for sentence in documentTwo.sents:
    print(sentence.text)

# even with U.K. inside the second sentence,
# spaCy is able to detect that its dots are not an indication of a sentence ending

This is a sentence.
This is second U.K. sentence.
This is last sentence.


# 2) Adding Rules

We can add our own rules for sentence segmentation, but they have to be added before the creation of doc object

because that is where the parsing of the segments starts

In [None]:
documentThree = nlp("This is a sentence; This is second sentence; This is last sentence.")

In [None]:
for sentence in documentThree.sents:
    print(sentence.text)

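# this is expected to show the whole string as a single sentence,
# because by default spaCy does not treat ';' as a sentence boundary
# NOTE(review): the saved output below already shows three sentences, presumably because
# this cell was last run after the custom component had been added - re-run on a fresh kernel to confirm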

This is a sentence;
This is second sentence;
This is last sentence.


In [None]:
# now, let's set custom rules
# first import language from spacy
from spacy.language import Language

In [None]:
# now define custom rule function

@Language.component("setCustomBoundaries")
def setCustomBoundaries(document):
    """Mark the token that follows every ';' as the start of a new sentence.

    Registered under the component name "setCustomBoundaries".

    Args:
        document: the doc object whose tokens are inspected.

    Returns:
        The same document, with is_sent_start set to True on each token
        that comes directly after a ';' token.
    """
    # Stop one position short of the end: the last token has no successor to flag.
    for position in range(len(document) - 1):
        if document[position].text != ';':
            continue
        # is_sent_start indicates whether the token at this index begins a sentence
        document[position + 1].is_sent_start = True
    return document

# after a semicolon, the next token is the start of a new sentence

# we pass all the tokens through the for loop except the last one because
# we mark "token.i+1" as the start of the new sentence,
# so "token.i+1" already reaches the last token in the document (documentThree)

In [None]:
# now add the custom rule function to the pipeline, placed before the parser
# guard against re-running this cell: add_pipe raises ValueError [E007]
# when a component with the same name already exists in the pipeline
if "setCustomBoundaries" not in nlp.pipe_names:
    nlp.add_pipe("setCustomBoundaries", before="parser")

ValueError: [E007] 'setCustomBoundaries' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'setCustomBoundaries', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
nlp.pipe_names
# the new rule is added, that is 'setCustomBoundaries'
# tagger, parser and ner were already there

['tok2vec',
 'tagger',
 'setCustomBoundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [None]:
# now, with the new custom rule added,
# spaCy can treat ';' as a sentence boundary

# but for that we need to create the doc object again

documentThree = nlp("This is a sentence; This is second sentence; This is last sentence.")

In [None]:
# now following code will separate the sentences correctly

for sentence in documentThree.sents:
    print(sentence.text)

This is a sentence;
This is second sentence;
This is last sentence.
