In [45]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ibzcl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ibzcl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [46]:
example_sentence = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [47]:
def preprocess(sent):
    """Tokenize a raw sentence and attach a part-of-speech tag to each token.

    Args:
        sent: The sentence as a plain string.

    Returns:
        The result of ``nltk.pos_tag`` on the tokens: a list of
        ``(token, tag)`` pairs in sentence order.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [48]:
preprocessed_sent = preprocess(example_sentence)
print(preprocessed_sent)

[('European', 'JJ'), ('authorities', 'NNS'), ('fined', 'VBD'), ('Google', 'NNP'), ('a', 'DT'), ('record', 'NN'), ('$', '$'), ('5.1', 'CD'), ('billion', 'CD'), ('on', 'IN'), ('Wednesday', 'NNP'), ('for', 'IN'), ('abusing', 'VBG'), ('its', 'PRP$'), ('power', 'NN'), ('in', 'IN'), ('the', 'DT'), ('mobile', 'JJ'), ('phone', 'NN'), ('market', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), ('the', 'DT'), ('company', 'NN'), ('to', 'TO'), ('alter', 'VB'), ('its', 'PRP$'), ('practices', 'NNS')]


## Part Of Speech Mappings:

### Nouns:
* NN: Common noun (e.g., book, chair, happiness)
* NNP: Proper noun (e.g., Alice, London, Eiffel Tower)
* NNS: Plural noun (e.g., books, chairs, ideas)

### Pronouns:
* PRP: Personal pronoun (e.g., I, you, he, she, it, we, they)
* PRP$: Possessive pronoun used before a noun (e.g., my, your, his, her, its, our, their); the standalone forms mine, yours, hers, ours, theirs are tagged PRP
* WP: Wh-pronoun (e.g., who, whom, what). Note that VBZ is a verb tag (3rd person singular present, e.g., rains, walks), not a pronoun tag; it is listed under Verbs below

### Verbs:
* VB: Base form of a verb (e.g., talk, walk, sleep)
* VBD: Past tense verb (e.g., talked, walked, slept)
* VBG: Present participle (verb ending in "-ing") (e.g., talking, walking, sleeping)
* VBP: Present tense, non-3rd person singular verb (e.g., talk, walk, sleep)
* VBN: Past participle (e.g., taken, eaten, written). An "-ing" form that functions as a noun (e.g., singing, dancing, eating) is tagged VBG or NN; there is no separate `VBP-ing` tag
* VBZ: 3rd person singular present tense verb (e.g., talks, walks, sleeps)

### Adjectives:
* JJ: Adjective (e.g., big, red, happy)
* JJR: Comparative adjective (e.g., bigger, redder, happier)
* JJS: Superlative adjective (e.g., biggest, reddest, happiest)

### Adverbs:
* RB: Adverb (e.g., very, quickly, always)
* RBR: Comparative adverb (e.g., more quickly, less often)
* RBS: Superlative adverb (e.g., most quickly, least often)

### Prepositions:
* IN: Preposition or subordinating conjunction (e.g., of, in, on, for, at, because); note that "to" gets its own tag, TO

### Conjunctions:
* CC: Coordinating conjunction (e.g., and, but, or, for, so, yet)

### Determiners:
* DT: Determiner (e.g., the, a, an, this, that, some, any)
* WDT: Wh-determiner (e.g., which, that, what, whatever); "who" is tagged WP and "whose" is tagged WP$

### Other Common Tags:
* CD: Cardinal number (e.g., one, two, three)
* MD: Modal verb (e.g., can, could, may, might, must, shall, should, will, would)
* POS: Possessive marker ('s)


Note that these are Penn Treebank tags, which NLTK's `pos_tag` uses by default; requesting a different tagset (e.g., `tagset='universal'`) yields different labels.


In [49]:
# Chunk-grammar rule: build an NP (noun phrase) chunk from an optional
# determiner (DT), followed by any number of adjectives (JJ), followed by a
# noun tagged NN. Other noun tags (NNS, NNP) are not matched by this rule.
pattern = 'NP: {<DT>?<JJ>*<NN>}'


In [50]:
# Build a chunk parser from the grammar above.
cp = nltk.RegexpParser(pattern)
# Apply it to the POS-tagged sentence; the printed tree shows matched spans as
# NP subtrees while every other token stays directly under the root S.
cs = cp.parse(preprocessed_sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [51]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples:
# B-NP marks the first token of an NP chunk, I-NP a following token of the
# same chunk, and O a token outside any chunk.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [52]:
from nltk import ne_chunk
# Fetch the 'maxent_ne_chunker' and 'words' resources before calling ne_chunk
# (presumably the data ne_chunk relies on; the download log reports them as
# already up-to-date when they are installed).
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Tokenize and POS-tag the sentence, then let ne_chunk wrap the tokens it
# recognises as named entities in labelled subtrees (e.g. GPE, PERSON).
ne_tree = ne_chunk(pos_tag(word_tokenize(example_sentence)))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\ibzcl\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ibzcl\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


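NLTK's chunker labels "Google" as PERSON and does not mark the money amount or the date as entities, so I will use spaCy's NER model next for comparison.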

In [53]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# If `import en_core_web_sm` fails after installing spaCy, download the model
# first: python -m spacy download en_core_web_sm

nlp = en_core_web_sm.load()

In [54]:
# Run the spaCy pipeline on the sentence and list every detected entity span
# together with its label.
doc = nlp(example_sentence)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [55]:
# Per-token view of the same document: (token, IOB code, entity type).
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

# Meaning of the ent_iob_ codes:
# B: the token begins an entity
# I: the token is inside an entity
# O: the token is outside any entity
# "": no entity tag is set

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [56]:
import spacy
from bs4 import BeautifulSoup
import requests, re

#spacy model for English
nlp = spacy.load('en_core_web_sm')

def url_to_string(URL, timeout=10):
    """Download a web page and return its visible text as a single string.

    The page is parsed with BeautifulSoup's ``html.parser``. ``<script>``,
    ``<style>`` and ``<aside>`` elements are removed before the text is
    extracted, and every run of newlines/tabs is collapsed to one space.

    Args:
        URL: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on an unresponsive
            host.

    Returns:
        The page text with each run of newline/tab characters replaced by a
        single space.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    res = requests.get(URL, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than silently analysing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # Drop elements whose content is not part of the readable article text.
    for tag in soup(["script", "style", "aside"]):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Download the article and reduce it to plain text. This needs network access,
# and the live page may change, so the counts shown below may differ on a
# later run.
url = 'https://www.dawn.com/news/1813028/the-day-pakistan-voted-in-defiance-and-hope'
# NOTE(review): the name `ny_bb` does not describe its content; it holds the
# plain text of the Dawn article fetched from `url`.
ny_bb = url_to_string(url)

# Run the full spaCy pipeline over the article text.
article = nlp(ny_bb)

# Count the entity spans spaCy detected in the whole article.
num_entities = len(article.ents)
print("Number of entities detected:", num_entities)

Number of entities detected: 163


In [57]:
# Tally how many entities of each type spaCy found in the article.
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'PERSON': 37,
         'DATE': 35,
         'GPE': 26,
         'ORG': 24,
         'CARDINAL': 12,
         'ORDINAL': 9,
         'TIME': 8,
         'NORP': 7,
         'FAC': 2,
         'PRODUCT': 2,
         'WORK_OF_ART': 1})

In [58]:
# The three entity strings that occur most often in the article.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('Pakistan', 11), ('PTI', 9), ('first', 6)]

In [59]:
# Materialise the article's sentence spans so they can be indexed, then show one.
sentences = list(article.sents)
print(sentences[20])

Not that there weren’t other problems — the election commission put up its usual trash-fire of a performance, while the government helpfully cut off cellphone signals for the whole day.


In [60]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [61]:
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})

In [62]:
# Content words of the sentence as (surface form, coarse POS, lemma), skipping
# stop words and punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[20]))
    if not (token.is_stop or token.pos_ == 'PUNCT')
]


[('problems', 'NOUN', 'problem'),
 ('election', 'NOUN', 'election'),
 ('commission', 'NOUN', 'commission'),
 ('usual', 'ADJ', 'usual'),
 ('trash', 'NOUN', 'trash'),
 ('fire', 'NOUN', 'fire'),
 ('performance', 'NOUN', 'performance'),
 ('government', 'NOUN', 'government'),
 ('helpfully', 'ADV', 'helpfully'),
 ('cut', 'VERB', 'cut'),
 ('cellphone', 'NOUN', 'cellphone'),
 ('signals', 'NOUN', 'signal'),
 ('day', 'NOUN', 'day')]

In [63]:
# Map each entity found in the re-parsed sentence to its label.
{str(entity): entity.label_ for entity in nlp(str(sentences[20])).ents}

{'the whole day': 'DATE'}

In [64]:
# Token-level IOB code and entity type for the sentence span taken from the article.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[20]])

[(Not, 'O', ''), (that, 'O', ''), (there, 'O', ''), (were, 'O', ''), (n’t, 'O', ''), (other, 'O', ''), (problems, 'O', ''), (—, 'O', ''), (the, 'O', ''), (election, 'O', ''), (commission, 'O', ''), (put, 'O', ''), (up, 'O', ''), (its, 'O', ''), (usual, 'O', ''), (trash, 'O', ''), (-, 'O', ''), (fire, 'O', ''), (of, 'O', ''), (a, 'O', ''), (performance, 'O', ''), (,, 'O', ''), (while, 'O', ''), (the, 'O', ''), (government, 'O', ''), (helpfully, 'O', ''), (cut, 'O', ''), (off, 'O', ''), (cellphone, 'O', ''), (signals, 'O', ''), (for, 'O', ''), (the, 'B', 'DATE'), (whole, 'I', 'DATE'), (day, 'I', 'DATE'), (., 'O', '')]


In [65]:
#colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
#options = {"ents": ["ORG"], "colors": colors}
displacy.render(article, jupyter=True, style='ent')

In [None]:
# displacy.serve(article,style='ent')