In [1]:
import spacy
# Load the 'en_core_web_sm' model as the shared `nlp` pipeline object
nlp = spacy.load('en_core_web_sm')

In [2]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence')

# doc.sents is a generator: sentences are produced one at a time instead of being stored in memory
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence


In [4]:
# Integer indexing on the doc returns a single token (here, the first one)
doc[0]

This

In [6]:
# Rebind `doc` to a quotation that contains a ';' and a trailing attribution
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

# Show the raw text of the processed document
doc.text

'"Management is doing the right things; leadership is doing the right things." - Peter Drucker'

In [7]:
# With the default rules spaCy detects only 2 sentences here: the quotation and the attribution
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."


- Peter Drucker




In [8]:
# Slice from the start up to, but not including, the last token
doc[:-1]

"Management is doing the right things; leadership is doing the right things." - Peter

In [22]:
# Each token has its own Index
def index_check(doc):
    """Print one ``token | index`` line for every token in ``doc``.

    Args:
        doc: iterable of tokens; each token must expose an ``i`` attribute
            holding its position.

    Returns:
        None. The lines are written to standard output.
    """
    for token in doc:
        line = f"{token} | {token.i}"
        print(line)
# Run the check on the current `doc`
index_check(doc)

" | 0
Management | 1
is | 2
doing | 3
the | 4
right | 5
things | 6
; | 7
leadership | 8
is | 9
doing | 10
the | 11
right | 12
things | 13
. | 14
" | 15
- | 16
Peter | 17
Drucker | 18


### Add our Own Rule & Add it to the NLP Pipeline

In [23]:
# ADD A SEGMENTATION RULE
# Rule: the token right after a ';' is marked as the start of a new sentence
def set_custom_boundaries(doc):
    """Flag the token that follows each ';' as a sentence start.

    Args:
        doc: sliceable, indexable token sequence; tokens expose ``text`` and
            ``i`` and accept assignment to ``is_sent_start``.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # The final token is left out of the scan: nothing follows it, so
    # doc[i + 1] would be out of range.
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

In [24]:
# Insert the custom rule into the NLP pipeline so that it runs before the parser
# NOTE(review): passing the function object itself to add_pipe appears to be the spaCy v2 form — confirm against the installed version
nlp.add_pipe(set_custom_boundaries, before='parser')

# List the pipeline stages to check where the rule was placed
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [25]:
# Process the same quotation again, now with the custom boundary rule in the pipeline
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [26]:
# Notice that spaCy now starts a new sentence right after the ';'
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things.
" - Peter Drucker


### Changing Default Segmentation Rules

In [27]:
# Reload the original spaCy model to discard the custom rule added above
nlp = spacy.load('en_core_web_sm')

In [28]:
mystring = u"This is a sentence. This is another. \n\nThis is a \nthird sentence"

print(mystring)
# Poetry-style example: each line, rather than each full stop (.), should be treated as one sentence

This is a sentence. This is another. 

This is a 
third sentence


In [29]:
doc = nlp(mystring)

# With the default rules spaCy separates the sentences at each full stop
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another. 


This is a 
third sentence


In [30]:
from spacy.pipeline import SentenceSegmenter

In [31]:
# * Need to Revisit this Function *
def split_on_newlines(doc):
    """Yield sentence spans of ``doc``, breaking after each newline token.

    A token whose text starts with a newline character closes the current
    sentence. The newline token stays at the end of that sentence, and the
    next non-whitespace token opens the following one.

    Args:
        doc: token sequence supporting iteration, slicing and ``len``; each
            token must expose ``text``, ``i`` and ``is_space``.

    Yields:
        ``doc[start:end]`` slices, in order, that together cover every token.
        Nothing is yielded for an empty ``doc``.
    """
    start = 0
    seen_newline = False

    for word in doc:
        # Wait for a non-whitespace token before cutting, so a sentence never
        # begins with leftover whitespace that followed the newline.
        if seen_newline and not word.is_space:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'):
            seen_newline = True

    # Emit the trailing sentence; the guard avoids an empty span for an empty doc.
    if start < len(doc):
        yield doc[start:]

In [32]:
# Wrap the newline strategy in a SentenceSegmenter component
# NOTE(review): SentenceSegmenter looks like a spaCy v2 class — confirm it exists in the installed version
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

# Add the segmenter to the freshly reloaded pipeline
nlp.add_pipe(sbd)

In [33]:
doc = nlp(mystring)

# Notice that spaCy now splits sentences at newlines instead of at full stops
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another. 


This is a 

third sentence
