In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [4]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [5]:
# We can look up tokens by index (doc[0]), but doc.sents[0] raises a TypeError,
# because doc.sents is a generator rather than a list, and generators cannot be indexed.
# To access sentences by position, first build a list with list(doc.sents), then index it.

In [6]:
sentences = list(doc.sents)

In [7]:
sentences

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [8]:
sentences[1]

This is another sentence.

In [9]:
type(sentences)

list

In [10]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [11]:
# In the examples above, doc.sents separated the sentences at each full stop.
# A custom dataset may instead need sentences separated at a ';' or a ':'.
# For that we need to write our own sentence separator, i.e. a custom segmentation rule.

In [12]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [13]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [14]:
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker




In [15]:
#ADD A SEGMENTATION RULE


In [16]:
def set_custom_boundaries(doc):
    """Mark the token following every ';' as the start of a new sentence.

    The final token is never inspected (``doc[:-1]`` drops it), because no
    token follows it that could be marked.

    Args:
        doc: Document whose tokens expose ``.text``, ``.i`` (the token's
            index in ``doc``) and a writable ``.is_sent_start``.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # Indices of all semicolons, ignoring one in the last position.
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

In [17]:
# Register the custom boundary component so that it runs ahead of the parser.
nlp.add_pipe(set_custom_boundaries, before = 'parser')
# Show the pipeline order to confirm where the component was inserted.
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [18]:
doc[:-1]

"Management is doing the right things; leadership is doing the right things." -Peter

In [19]:
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [20]:
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
-Peter Drucker


In [21]:
#CHANGE SEGMENTATION RULES

In [22]:
nlp = spacy.load('en_core_web_sm')

In [23]:
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

In [24]:
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [25]:
doc = nlp(mystring)

In [26]:
for sent in doc.sents:
    print(sent)

This is a sentence.
This is another.


This is a 
third sentence.


In [27]:
#to change the rules

from spacy.pipeline import SentenceSegmenter

In [28]:
def split_on_newlines(doc):
    """Split ``doc`` into sentence spans, breaking after newline tokens.

    A token whose text starts with a newline character ends the current
    sentence; the newline token itself stays attached to the sentence it
    terminates. Several newline tokens in a row count as a single break, so
    the next sentence always begins on a non-newline token.

    Args:
        doc: The document to segment. It must support iteration over tokens
            exposing ``.text`` and ``.i``, as well as slicing and ``len()``.

    Yields:
        Slices of ``doc``, one per sentence, in document order. Nothing is
        yielded for an empty ``doc``.
    """
    start = 0
    seen_newline = False

    for token in doc:
        # Test every token, so a newline that directly follows another
        # newline is still recognised instead of opening a new sentence.
        is_newline = token.text.startswith('\n')
        if seen_newline and not is_newline:
            # First regular token after a break: emit everything before it.
            yield doc[start:token.i]
            start = token.i
            seen_newline = False
        elif is_newline:
            seen_newline = True

    # Emit the trailing sentence, but never an empty span for an empty doc.
    if start < len(doc):
        yield doc[start:]
    

In [29]:
# Build a SentenceSegmenter that uses split_on_newlines as its splitting strategy.
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [30]:
nlp.add_pipe(sbd)

In [31]:
#we have to re-process the string so that the new Doc is produced by the updated pipeline
doc = nlp(mystring)

In [32]:
for sent in doc.sents:
    print(sent)

This is a sentence. This is another.


This is a 

third sentence.
