<a href="https://colab.research.google.com/github/JonNData/Python-Skills/blob/master/Guide_to_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Named entity recognition (NER) is one of the first steps in information extraction: it locates named entities in text and classifies them into predefined categories such as persons, organizations, locations, time expressions, quantities, monetary values, and percentages.

In [0]:
#NLTK:

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [0]:
 # Fetch the 'punkt' resource into the local nltk_data directory (see the log below).
 # NOTE(review): this line starts with a stray space; the cell ran here, but a
 # plain-Python export of the notebook would reject the unexpected indent.
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
nltk.download('averaged_perceptron_tagger')  # tagger model package; presumably what pos_tag relies on - confirm

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
# Sample sentence used by the examples that follow.
doc = (
    'European authorities fined Google a record $5.1 billion on Wednesday '
    'for abusing its power in the mobile phone market and ordered the '
    'company to alter its practices'
)

In [0]:
# Apply word tokenization and part-of-speech tagging to the sentence:

def preprocess(sent):
    """Tokenize a raw sentence and tag every token with its part of speech.

    Args:
        sent: The text to process.

    Returns:
        Whatever ``nltk.pos_tag`` yields for the tokens that
        ``nltk.word_tokenize`` extracts from ``sent``.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

# Tokenize and tag the sample sentence, then show the resulting (word, tag) pairs.
sent = preprocess(doc)
print(sent)

[('European', 'JJ'), ('authorities', 'NNS'), ('fined', 'VBD'), ('Google', 'NNP'), ('a', 'DT'), ('record', 'NN'), ('$', '$'), ('5.1', 'CD'), ('billion', 'CD'), ('on', 'IN'), ('Wednesday', 'NNP'), ('for', 'IN'), ('abusing', 'VBG'), ('its', 'PRP$'), ('power', 'NN'), ('in', 'IN'), ('the', 'DT'), ('mobile', 'JJ'), ('phone', 'NN'), ('market', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), ('the', 'DT'), ('company', 'NN'), ('to', 'TO'), ('alter', 'VB'), ('its', 'PRP$'), ('practices', 'NNS')]


In [0]:
# You get a list of tuples, each pairing a word from the sentence with its part-of-speech tag.

In [0]:
# Now implement noun-phrase chunking to identify named entities, using a regular expression made of rules that indicate how sentences should be chunked:

# The chunk pattern consists of one rule: a noun phrase, NP, should be formed whenever the chunker finds an optional determiner, DT, followed by any number of
# adjectives, JJ, and then a noun, NN.

In [0]:
# Chunk grammar: NP = optional determiner, any number of adjectives, then one noun.
pattern = "NP: {<DT>?<JJ>*<NN>}"

In [0]:
# Using this pattern, build a chunk parser and test it on the tagged sentence.

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)  # `sent` is the list of (word, tag) pairs printed earlier
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [0]:
#the output is a tree with S as the first level (denoting a sentence) 

In [0]:
# With nltk.ne_chunk(), you can recognize named entities using a classifier that adds category labels such as PERSON and GPE.

In [0]:
nltk.download('maxent_ne_chunker')  # NE chunker model package; presumably what ne_chunk relies on - confirm

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [0]:
nltk.download('words')  # 'words' package; presumably also needed by the NE chunker - confirm

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [0]:
from nltk.chunk import ne_chunk
# Same pipeline as before, one step per line: tokenize, POS-tag, then NE-chunk.
doc_tokens = word_tokenize(doc)
doc_tagged = pos_tag(doc_tokens)
ne_tree = ne_chunk(doc_tagged)
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [0]:
# Google is misclassified as PERSON here; it should be an organization.

In [0]:
# spaCy's named entity recognition (label reference: https://spacy.io/api/annotation#section-named-entities)

In [0]:
#entity:
import spacy 
from spacy import displacy
from collections import Counter
import en_core_web_sm 
# Load the small English model; the resulting `nlp` pipeline is what the spaCy cells below call.
nlp = en_core_web_sm.load()

In [0]:
# Run the spaCy pipeline on the same sentence the NLTK cells used.
# The result gets its own name so that `doc` keeps holding the raw string:
# rebinding `doc` to a spaCy object would break re-running the NLTK cells above,
# which pass `doc` to the tokenizer as text.
spacy_doc = nlp(doc)
print([(ent.text, ent.label_) for ent in spacy_doc.ents])

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [0]:
# Now Google is recognized as an organization (ORG), and every entity in this output is labeled correctly.

In [0]:
#extracting named entities from an article : 

from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its visible text as a single line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs collapsed into one space.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly rather than silently extracting entities from an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose content is not part of the article text.
    for tag in soup(["script", "style", "aside"]):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the article text, run it through the spaCy pipeline, and count the entities found.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)


155

In [0]:
# The 155 entities fall into 8 distinct label types:
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'CARDINAL': 3,
         'DATE': 23,
         'GPE': 9,
         'LOC': 1,
         'NORP': 2,
         'ORDINAL': 1,
         'ORG': 39,
         'PERSON': 77})

In [0]:
# The three most frequently mentioned entities:
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Strzok', 29), ('F.B.I.', 19), ('Trump', 13)]

In [0]:
# Pick one sentence (index 20) to examine more closely:
sentences = list(article.sents)
print(sentences[20])

A spokeswoman for the F.B.I. did not respond to a message seeking comment about why Mr. Strzok was dismissed rather than demoted.


In [0]:
# Render the chosen sentence inline with displaCy, highlighting its entities:

displacy.render(
    nlp(str(sentences[20])),
    style='ent',
    jupyter=True,
)


In [0]:
# displaCy dependency view: draws the same sentence together with its dependency arcs.
displacy.render(
    nlp(str(sentences[20])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [0]:
# Text, part of speech and lemma of each token, skipping stop words and punctuation:
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[20]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('spokeswoman', 'NOUN', 'spokeswoman'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('respond', 'VERB', 'respond'),
 ('message', 'NOUN', 'message'),
 ('seeking', 'VERB', 'seek'),
 ('comment', 'NOUN', 'comment'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('dismissed', 'VERB', 'dismiss'),
 ('demoted', 'VERB', 'demote')]

In [0]:
# Map each entity found in the sentence to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{'F.B.I.': 'ORG', 'Strzok': 'PERSON'}

In [0]:
# Token-level view of the same sentence: every token with its IOB tag and entity type.
# In the output below, F.B.I. is tagged ORG and Strzok is tagged PERSON.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[20]])

[(A, 'O', ''), (spokeswoman, 'O', ''), (for, 'O', ''), (the, 'O', ''), (F.B.I., 'B', 'ORG'), (did, 'O', ''), (not, 'O', ''), (respond, 'O', ''), (to, 'O', ''), (a, 'O', ''), (message, 'O', ''), (seeking, 'O', ''), (comment, 'O', ''), (about, 'O', ''), (why, 'O', ''), (Mr., 'O', ''), (Strzok, 'B', 'PERSON'), (was, 'O', ''), (dismissed, 'O', ''), (rather, 'O', ''), (than, 'O', ''), (demoted, 'O', ''), (., 'O', '')]


In [0]:
# Highlight the entities of the whole article with displaCy:
displacy.render(
    article,
    style='ent',
    jupyter=True,
)