# Sentence Segmentation or Boundary Detection
Deciding where sentences begin and end

In [1]:
# For warnings
import warnings
warnings.filterwarnings('ignore')
import spacy
nlp = spacy.load('en')

# Get some basic concept

a) If it's a period, it ends a sentence.
(b) If the preceding token is in the hand-compiled list of abbreviations, then it doesn't end a sentence.
(c) If the next token is capitalized, then it ends a sentence.

In [2]:
# Manual or Custom Based
def mycustom_boundary(docx):
    for token in docx[:-1]:
        if token.text == '...':
            docx[token.i+1].is_sent_start = True
    return docx

In [3]:
# Adding the rule before parsing
nlp.add_pipe(mycustom_boundary,before='parser')

In [4]:
mydoc = nlp(u"This is my first sentence...the last comment was so cuul... what if...? this is the last sentence")

In [5]:
for sentence in mydoc.sents:
    print(sentence.text)

This is my first sentence...
the last comment was so cuul...
what if...
?
this is the last sentence


In [6]:
def mycustom_boundary2(docx):
    for token in docx[:-1]:
        if token.text == '---':
            docx[token.i+1].is_sent_start = True
    return docx

In [7]:
nlp2 = spacy.load('en')

In [8]:
# Adding the rule before parsing
nlp2.add_pipe(mycustom_boundary2,before='parser')

In [9]:
mydoc2 = nlp2(u"Last year was great---this year 2018-05-22 will be so cuul. when was your birthday? ---this is the last sentence")

In [10]:
for sentence in mydoc2.sents:
    print(sentence.text)

Last year was great---
this year 2018-05-22 will be so cuul.
when was your birthday?
---this is the last sentence


In [11]:
# Removing the parsing
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.pipes.DependencyParser at 0x1ebaac4b0a8>)

In [12]:
nlp = spacy.load('en')

In [13]:
mydoc3 = nlp(u"This is my first sentence...the last comment was so cuul... what if...? this is the last sentence")

In [14]:
for sentence in mydoc3.sents:
    print(sentence.text)

This is my first sentence...
the last comment was so cuul...
what if...?
this is the last sentence


# Custome Rule Based

In [15]:
from spacy.lang.en import English
from spacy.pipeline import SentenceSegmenter

In [16]:
def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline and not word.is_space:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text == '\n':
            seen_newline = True
    if start < len(doc):
        yield doc[start:len(doc)]

In [17]:
def split_on_tab(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline and not word.is_space:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text == '\t':
            seen_newline = True
    if start < len(doc):
        yield doc[start:len(doc)]

In [18]:
nlp = English()  # just the language with no model
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)

In [19]:
doc = nlp(u"This is a great sentence\n\nThis is another comment\nAnd more")
for sent in doc.sents:
    print(sent.text)

This is a great sentence

This is another comment

And more


In [20]:
doc = nlp(u"This is a great sentence\n\nThis is another comment\nAnd more")
for sent in doc.sents:
    print([token.text for token in sent])

['This', 'is', 'a', 'great', 'sentence', '\n\n', 'This', 'is', 'another', 'comment', '\n']
['And', 'more']


In [21]:
nlp_tab = English()  # just the language with no model
sbd_tab = SentenceSegmenter(nlp.vocab, strategy=split_on_tab)
nlp_tab.add_pipe(sbd_tab)

In [22]:
## Spliting on Tabs
doc_tab = nlp_tab(u"This is a great sentence\t This is another\t comment\t And more")

In [23]:
for sent in doc_tab.sents:
    print(sent.text)

This is a great sentence	 This is another	 comment	 And more
