# Custom Components

In [3]:
import spacy

# Load the "en_core_web_sm" pipeline; the following cells use it as `nlp`.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Run the pipeline over a sample sentence that mentions a place and a person.
doc = nlp("Britain is a place,Mary is a Doctor")

In [5]:
# Show each recognised entity next to its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

Britain GPE
Mary PERSON


In [6]:
from spacy.language import Language

In [7]:
@Language.component("remove_gpe")
def remove_gpe(doc):
    """Pipeline component that drops every entity labelled ``GPE`` from ``doc``.

    Args:
        doc: The spaCy ``Doc`` being processed; its ``ents`` are replaced.

    Returns:
        The same ``doc``, with ``doc.ents`` holding only the non-GPE entities.
    """
    # Build the filtered list in one pass and assign it to the attribute
    # `doc.ents` (not a free variable), so nothing is removed from a list
    # while it is being iterated.
    doc.ents = [ent for ent in doc.ents if ent.label_ != "GPE"]
    return doc

In [8]:
# Add the component registered as "remove_gpe" to the pipeline; the returned
# component is what the cell displays.
nlp.add_pipe("remove_gpe")

<function __main__.remove_gpe(doc)>

In [9]:
# Summarise what each pipeline component assigns, requires and scores.
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'remove_gpe': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  

In [10]:
# Save the pipeline to disk under data/new_en_core_web_sm.
# NOTE(review): presumably the "data/" directory must already exist, and
# "remove_gpe" must be registered again before this pipeline is loaded — confirm.
nlp.to_disk("data/new_en_core_web_sm")

# RegEx

In [11]:
import re

In [12]:
# Day-then-month dates such as "2 February".
# NOTE: in `(\d){1,2}` the quantifier is outside the group, so group 2 keeps
# only the last digit matched ('4' for "14 August"); group 1 holds the full date.
pattern = r"((\d){1,2} (January|February|March|April|May|June|July|August|September|October|November|December))"

text = "This is a date 2 February. Another date would be 14 August."
matches = re.compile(pattern).findall(text)
print(matches)

[('2 February', '2', 'February'), ('14 August', '4', 'August')]


In [13]:
# "February 2" puts the month first, which `pattern` does not cover, so only
# the second date is reported.
text = "This is a date February 2. Another date would be 14 August."
matches = re.compile(pattern).findall(text)
print(matches)


[('14 August', '4', 'August')]


In [15]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("KEEP CALM because TOGETHER We Rock !")

# For every token print the integer ID and string form of its lemma, tag and
# POS on one line, then the verbatim token text on the next.
for token in doc:
    attributes = (
        token.text,
        token.lemma, token.lemma_,
        token.tag, token.tag_,
        token.pos, token.pos_,
    )
    print(*attributes)
    print(token.orth_)

KEEP 9099225972875567996 keep 14200088355797579614 VB 100 VERB
KEEP
CALM 15236377870166270878 calm 10554686591937588953 JJ 84 ADJ
CALM
because 16950148841647037698 because 1292078113972184607 IN 98 SCONJ
because
TOGETHER 12060003407050460571 together 164681854541413346 RB 86 ADV
TOGETHER
We 16064069575701507746 we 13656873538139661788 PRP 95 PRON
We
Rock 3237817430745561104 Rock 15794550382381185553 NNP 96 PROPN
Rock
! 17494803046312582752 ! 12646065887601541794 . 97 PUNCT
!


In [16]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One row per token: text, lemma, POS, tag, dependency label, shape, and the
# is_alpha / is_stop flags.
for token in doc:
    row = [
        token.text, token.lemma_, token.pos_, token.tag_,
        token.dep_, token.shape_, token.is_alpha, token.is_stop,
    ]
    print(*row)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup VERB VBD dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


# Extract Multi-Word Token

In [54]:
import re

text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

# "Paul" followed by a capitalised word; a bare "Paul" does not match.
pattern = r"Paul [A-Z]\w+"

matches = re.compile(pattern).finditer(text)

for found in matches:
    print(found)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [84]:
import re
import spacy
from spacy.tokens import Span

In [85]:
# Create a blank English pipeline and run it over `text` from the earlier cell.
nlp = spacy.blank("en")
doc = nlp(text)

In [86]:
# Copy the doc's current entities into a list so later cells can append to it.
original_ents = list(doc.ents)

In [89]:
# Collect token offsets and text for every regex match in the doc.
mwt_ents = []
for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    # Skip matches for which char_span gives no Span.
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))
print(mwt_ents)

# Turn each match into a labelled Span and add it to the running entity list.
for start, end, name in mwt_ents:
    original_ents.append(Span(doc, start, end, label='Person'))

# Assign to the attribute `doc.ents`; binding a plain variable called
# `doc_ents` would leave the Doc's entities untouched.
doc.ents = original_ents
for ent in doc.ents:
    print(ent.text, ent.label_)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]
Paul Newman Person
Paul Hollywood Person


In [88]:
from spacy.util import filter_spans

# Pass the collected spans through filter_spans, then set the result as the
# doc's entities and list them.
filtered = filter_spans(original_ents)
doc.ents = filtered
for entity in doc.ents:
    print(entity.text, entity.label_)

Paul Newman Person
Paul Hollywood Person
