In [36]:
import spacy
# Load the 'en_core_web_sm' pipeline once; the cells below process text through `nlp`.
nlp = spacy.load('en_core_web_sm')


In [37]:
# Three sentences, each closed by a full stop.
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

# Print every sentence of the processed Doc on its own line.
for sentence in doc.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [38]:
# doc.sents is wrapped in list() so that a single sentence can be picked by index.
list(doc.sents)[0]

This is the first sentence.

In [39]:
# A quotation whose two clauses are joined by a semicolon rather than a full stop.
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [40]:
# Display the text of the Doc that was just created.
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [41]:
# Print each sentence followed by extra blank lines so the boundaries stand out.
for sentence in doc.sents:
    print(sentence)
    print('\n')

"


Management is doing the right things; leadership is doing the right things.


" -Peter Drucker




In [42]:
def set_custom_boundaries(doc):
    """Mark the token after every semicolon as the start of a sentence.

    The last token is left out of the scan (``doc[:-1]``) so that the
    "next token" lookup always stays inside the document.

    Args:
        doc: the document to annotate; it is modified in place.

    Returns:
        The same ``doc`` object that was passed in.
    """
    # Positions of all semicolons that still have a token after them.
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

In [43]:
# Run the custom rule before the parser so the sentence starts it sets are
# already in place when the parser runs.
# add_pipe() raises ValueError when a component with the same name is already
# in the pipeline, so on a re-run of this cell the existing entry is swapped
# in place instead (which also picks up an edited function definition).
if 'set_custom_boundaries' in nlp.pipe_names:
    nlp.replace_pipe('set_custom_boundaries', set_custom_boundaries)
else:
    nlp.add_pipe(set_custom_boundaries, before='parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [44]:
# The same slice set_custom_boundaries loops over: everything but the last token.
# NOTE: `doc` here is still the Doc created before the custom component was added.
doc[:-1]

"Management is doing the right things; leadership is doing the right things." -Peter

In [45]:
# Process the same quotation again, now that set_custom_boundaries is in the pipeline.
doc4 = nlp('"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [46]:
# List the sentences produced with the semicolon rule active.
for sentence in doc4.sents:
    print(sentence)


"
Management is doing the right things;
leadership is doing the right things.
" -Peter Drucker


In [47]:
nlp = spacy.load('en_core_web_sm')  # fresh pipeline, without the custom component

mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

# Baseline: segment the text with the freshly loaded, unmodified pipeline.
doc = nlp(mystring)

# Show the tokens of each sentence so the newline tokens are visible.
for sentence in doc.sents:
    print([tok.text for tok in sentence])



['This', 'is', 'a', 'sentence', '.']
['This', 'is', 'another', '.', '\n\n']
['This', 'is', 'a', '\n', 'third', 'sentence', '.']


In [48]:
from spacy.pipeline import SentenceSegmenter

In [49]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, breaking after newline tokens.

    A token whose text starts with ``'\\n'`` closes the current slice: the
    slice is emitted when the *following* token is reached, so the newline
    token stays at the end of the slice it closes. Whatever remains after the
    loop is always yielded as the final slice.

    Note: the token that directly follows a newline token only triggers the
    pending split; it is not itself inspected for a leading newline.

    Args:
        doc: the document to split; must support slicing, and its tokens must
            expose ``.text`` and ``.i``.

    Yields:
        ``doc[start:end]`` slices covering the whole document in order.
    """
    span_begin = 0
    break_pending = False

    for token in doc:
        if not break_pending:
            # No split waiting: just remember whether this token is a newline.
            if token.text.startswith('\n'):
                break_pending = True
            continue
        # The previous token was a newline: close the slice before this token.
        yield doc[span_begin:token.i]
        span_begin = token.i
        break_pending = False
    yield doc[span_begin:]

In [50]:
# Build a SentenceSegmenter from this pipeline's vocab, using split_on_newlines as its strategy.
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [51]:
# add_pipe() raises ValueError when a component with the same name is already
# in the pipeline, so on a re-run of this cell the existing segmenter is
# swapped for the current `sbd` instead of being added a second time.
if sbd.name in nlp.pipe_names:
    nlp.replace_pipe(sbd.name, sbd)
else:
    nlp.add_pipe(sbd)

In [52]:
# Process mystring again, this time with the newline segmenter in the pipeline.
doc = nlp(mystring)

In [53]:
# Print the sentences produced by the newline-based segmentation.
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another.


This is a 

third sentence.
