In [1]:
import csv

import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.matcher import Matcher
from spacy.tokens import Span, Token
from spacy import displacy
from spacy.pipeline import EntityRuler
from pathlib import Path
from spacy.util import filter_spans
from adept.components.registry import ComponentsRegistry
from adept.preprocess import Preprocess
from adept.postprocess import Postproccess
from adept.config import TRAINING_DIR
from adept.traits import Traits
from adept.fields import Fields
from adept.utils.helpers import token_get_ent

from adept.utils.expand import ExpandSpan
from adept.tasks.patterns.trait import TraitPatternsTask   

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the trained 'adept' pipeline stored under the project's TRAINING_DIR.
nlp = spacy.load(TRAINING_DIR / 'adept')

# Preprocess comes from adept.preprocess; it is constructed once with no arguments.
preprocess = Preprocess()

In [3]:
# Show the (name, component) pairs that make up the loaded pipeline.
nlp.pipeline

[('transformer',
  <spacy_transformers.pipeline_component.Transformer at 0x13be14d00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x13bb9c6d0>),
 ('sentencizer',
  <adept.components.sentencizer.SentencizerComponent at 0x13bdb7820>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x13be149a0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x13bb9c7b0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x13be26580>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x13be07b40>)]

In [4]:
# cardinal_ents = [e for e in doc.ents if e.label_ == 'CARDINAL']

  
# # We use the dependency parse to find nummod noun, that's also an entity     
# for cardinal_ent in cardinal_ents:
    
#     print(cardinal_ent)
#     print(cardinal_ent.sent)
#     # root = cardinal_ent.root
#     # print(root)
#     # ent = token_get_ent(root.head, ['PART', 'TRAIT'])     
#     # print(ent)
    
    
# token = cardinal_ent.sent[12]

# next_token = doc[token.i + 1]    

# next_token.shape_

In [5]:
class SentencizerComponent:

    """
    Pipeline component that sets sentence starts from semicolons and periods.

    Every token except the first gets its ``is_sent_start`` assigned based on
    the token immediately before it:

    * previous token is ";"                               -> True
    * previous token is "." and shape starts with "X"     -> True
    * previous token is "." and shape starts with "d"     -> None
    * anything else                                       -> False

    Non-boundary tokens are marked explicitly as well: if only the tokens
    after semicolons were flagged, the default parser would split
    incorrectly.
    """

    def __init__(self, nlp: Language):
        # Kept on the instance; nothing in this class reads it back.
        self.nlp = nlp

    def __call__(self, doc):
        """
        Assign ``is_sent_start`` on each token that has a predecessor.

        The doc is modified in place and the same object is returned.
        """
        for current in doc[:-1]:
            follower = doc[current.i + 1]
            follower.is_sent_start = self._sent_start_after(doc, current, follower)
        return doc

    def _sent_start_after(self, doc, token, next_token):
        """Return the ``is_sent_start`` value for ``next_token``, which follows ``token``."""
        if self._is_semicolon(doc, token):
            return True
        if not self._is_period(doc, token):
            return False

        shape = next_token.shape_
        if shape.startswith('X'):
            # Period followed by a capital: new sentence.
            return True
        if shape.startswith('d'):
            # Period followed by a number: possibly a new sentence, so the
            # value is left as None for the default parse to evaluate.
            return None
        return False

    @staticmethod
    def _is_semicolon(doc, token):
        """True when the token's text is exactly ";" (``doc`` is not used)."""
        return token.text == ";"

    @staticmethod
    def _is_period(doc, token):
        """True when the token's text is exactly "." (``doc`` is not used)."""
        return token.text == "."
    
    
    
# sent = SentencizerComponent(nlp)
# sent(doc)



In [6]:
@Language.factory("sent")
def create_discrete_traits_component(nlp: Language, name: str):
    """
    Factory registered under the name "sent"; builds a SentencizerComponent.

    ``nlp`` is passed to the component's constructor; ``name`` is accepted
    but not used here.

    NOTE(review): the function name mentions discrete traits although it
    returns a sentencizer - presumably a copy-paste leftover; confirm
    before renaming.
    """
    sentencizer = SentencizerComponent(nlp)
    return sentencizer


# Replace the pipeline's "sentencizer" component with one created by the "sent" factory.
nlp.replace_pipe("sentencizer", "sent")



<__main__.SentencizerComponent at 0x104286e30>

In [9]:
# Sample description covering the boundary cases handled by the sentencizer:
# a semicolon, periods followed by numbers, and a period followed by a capital.
text = (
    "Herbs to 40-100.4 cm tall, annual, much branched; "
    "2 ovaries. "
    "56 stamenoids. "
    "Seed volume is about 2 cm³. "
    "2n=23,34"
)

# Run the full pipeline over the sample text.
doc = nlp(text)



DOT
dd
NONE
DOT
Xxxx


In [10]:
# Print every sentence of the doc, each followed by a '--' separator line.
for sent in doc.sents:
    print(sent, '--', sep='\n')

Herbs to 40-100.4 cm tall, annual, much branched;
--
2 ovaries.
--
56 stamenoids.
--
Seed volume is about 2 cm³. 2n=23,34
--
