In [19]:
import spacy
# Load the 'en_core_web_sm' pipeline once; the cells below reuse it as `nlp`.
nlp = spacy.load('en_core_web_sm')

In [20]:
# Three-sentence sample text used to demonstrate the default sentence segmentation.
doc = nlp('This is the first sentence. This is another sentence. This is the last sentence.')

In [21]:
# Walk over the sentence spans of `doc` and print one per line.
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [22]:
# doc.sents is materialised into a list here so the first sentence can be picked by index.
list(doc.sents)[0]

This is the first sentence.

In [23]:
# Quotation containing a semicolon; the recorded output of the loop below shows
# that the default rules do not start a new sentence after ';'.
doc = nlp('"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [24]:
# Display the text of the Doc as a plain string.
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [25]:
# Print every sentence followed by two blank lines so the boundaries stand out.
# sep='\n' puts a line break between the sentence and the '\n' argument, and
# print's own line ending supplies the third one.
for sent in doc.sents:
    print(sent, '\n', sep='\n')

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker



In [26]:
# ADD A SEGMENTATION RULE: start a new sentence after every semicolon

In [27]:
from spacy.language import Language

In [28]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token after every ';' as the start of a new sentence.

    Args:
        doc: the Doc being processed by the pipeline.

    Returns:
        The same Doc, with ``is_sent_start`` set to True on each token that
        directly follows a semicolon token.
    """
    # Stop one short of the end so that position + 1 is always a valid index.
    for position in range(len(doc) - 1):
        if doc[position].text == ';':
            doc[position + 1].is_sent_start = True
    return doc

In [29]:
# Register the custom rule ahead of the parser. The membership check keeps the
# cell re-runnable: calling add_pipe again with a name that is already in the
# pipeline raises an error.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("set_custom_boundaries", before="parser")
# Show the resulting component order.
nlp.pipe_names

['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [30]:
# Inspect the slice that set_custom_boundaries loops over: every token of `doc` except the last one.
doc[:-1]

"Management is doing the right things; leadership is doing the right things." -Peter

In [31]:
# Process the same quotation again, this time through the pipeline that
# contains the custom semicolon rule.
doc4 = nlp('"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [32]:
# Print the sentences of doc4; the recorded output shows an additional break after the ';'.
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
-Peter Drucker


In [33]:
# CHANGE SEGMENTATION RULES: split on newlines instead of the default boundaries

In [34]:
# Rebind `nlp` to a freshly loaded pipeline, so the 'set_custom_boundaries'
# component added to the previous `nlp` object is not part of it.
nlp = spacy.load('en_core_web_sm')

In [35]:
# Sample text with a blank line after the second sentence and a single line
# break inside the third one.
mystring = (
    "This is a sentence. This is another.\n\n"
    "This is a \n"
    "third sentence."
)

In [36]:
# Print the text so the embedded line breaks are rendered rather than shown as escapes.
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [37]:
# Process the multi-line text with the freshly loaded pipeline.
doc = nlp(mystring)

In [38]:
# Print the sentences found for the multi-line text; in the recorded output
# "This is a \nthird sentence." stays together as one sentence.
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.


This is a 
third sentence.


In [42]:
# SentenceSegmenter could not be found in the installed spaCy version, so Sentencizer is imported instead.
from spacy.pipeline import Sentencizer # I didn't find SentenceSegmenter

In [44]:
@Language.component("split_on_newlines")
def split_on_newlines(doc):
    """Start a new sentence at the token that follows a newline token.

    A function registered with ``@Language.component`` has to take the Doc and
    return it. This version records the boundaries on the tokens through
    ``is_sent_start`` instead of yielding spans, so it can be added to the
    pipeline, e.g. ``nlp.add_pipe("split_on_newlines", before="parser")``.

    Every token gets an explicit value: True for the first token and for each
    token directly after a token whose text starts with a newline, False for
    all others, so these are the only boundaries the component marks.

    Args:
        doc: the Doc being processed by the pipeline.

    Returns:
        The same Doc with ``is_sent_start`` set on every token.
    """
    seen_newline = False

    for word in doc:
        if seen_newline:
            # The previous token was a newline: this token opens a sentence.
            word.is_sent_start = True
            seen_newline = False
        else:
            # Only the very first token opens a sentence without a preceding newline.
            word.is_sent_start = word.i == 0
            if word.text.startswith('\n'):
                seen_newline = True

    return doc

In [47]:
# Because of a spaCy version difference, the function above could not be used here; the built-in sentencizer is used instead.

In [48]:
# Configure the built-in sentencizer with "\n" as its only sentence-ending
# character and place it before the parser.
punct_marks = ["\n"]
config = {"punct_chars": punct_marks}
# Guarded so that re-running the cell does not try to add a second
# 'sentencizer' (add_pipe raises when the name is already in the pipeline).
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe('sentencizer', config=config, before='parser')
# Display the component, as the bare add_pipe call did.
nlp.get_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x21131d86e00>

In [51]:
# Re-run the text through the pipeline that now contains the newline sentencizer.
doc = nlp(mystring)
# NOTE(review): the recorded output is consistent with a single split, after
# the lone "\n"; the "\n\n" run does not appear to act as a boundary,
# presumably because it is not equal to the configured "\n" — TODO confirm.
for s in doc.sents:
    print(s)

This is a sentence. This is another.

This is a 

third sentence.
