# A jupyter notebook for simple NER

Named Entity Recognition (NER) in email string using spaCy's NER statistics module en_core_web_lg

In [24]:
import sys
#!{sys.executable} -m pip install spacy # install to jupyter notebook path
#!{sys.executable} -m spacy download en_core_web_lg # Scapy NER statistics model (~85% accuracy) 

In [25]:
import re
import spacy

print("starting program....")

nlp = spacy.load('en_core_web_lg') # load EN NER statistical models
print("finished loading...")






starting program....
finished loading...


In [26]:
string = """
Hi James ,

Thank you for meeting with me in London.

Sorry for not getting back to you sooner. I hate the delay but we will have to live with it.

Did Claire send you the Barclays spreadsheet yesterday? It looks like it is very similar to the HSBC one we saw on Friday. Claire's email address is claire@sender.com and her telephone is  07816237458 if you need to get in contact with her.

Speak soon

Chris
"""

In [27]:
def extract_phone_numbers(string):
    r = re.compile(r'0(\d{2}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = r.findall(string)
    return [re.sub(r'\D', '', number) for number in phone_numbers]


def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)


def spacy_ner(document):
    persons = []
    print("running...")
    doc = nlp(document)
    for ent in doc:
        if ent.pos_ == "PROPN" and ent.ent_type_ == "PERSON":
            persons.append(ent.text)
           
    return persons



def subject(document):
    doc = nlp(document)
    sub_toks = [tok for tok in doc if (tok.dep_ == "nsubj")]
    return sub_toks

In [28]:
numbers = extract_phone_numbers(string)
emails = extract_email_addresses(string)
names = spacy_ner(string)
print(set(names))
print(set(numbers))
print(set(emails))

running...
{'Chris', 'Claire', 'James'}
{'781623745'}
{'claire@sender.com'}
