In [22]:

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

In [40]:
# Sample news sentence used as input for the NLTK experiments below. Written as
# adjacent string literals (joined by Python at compile time) for readability.
sentence = (
    'Due to ongoing floods at Silchar City (Assam), it has been decided by '
    'the competent authority to postpone Chartered Accountants Foundation '
    'Examination, Paper – 1 [Principles and Practice of Accounting] and '
    'Paper - 2 [Business Laws & Business Correspondence and Reporting] '
    'scheduled to be held on 24th and 26th June 2022 respectively in '
    'Silchar (Assam) Examination Centre only," the ICAI said in an '
    'official notification.'
)


In [41]:
# Named-entity recognition with NLTK, one step at a time:
# split into word tokens, tag each token with its part of speech, then chunk.
tokens = word_tokenize(sentence)
tagged_tokens = pos_tag(tokens)
ne_tree = ne_chunk(tagged_tokens)

In [42]:

# Print the chunk tree: recognised entities appear as labelled subtrees
# (e.g. GPE, ORGANIZATION, PERSON); every other token stays a word/TAG leaf.
print(ne_tree)

(S
  Due/JJ
  to/TO
  ongoing/VBG
  floods/NNS
  at/IN
  (FACILITY Silchar/NNP City/NNP)
  (/(
  (ORGANIZATION Assam/NNP)
  )/)
  ,/,
  it/PRP
  has/VBZ
  been/VBN
  decided/VBN
  by/IN
  the/DT
  competent/JJ
  authority/NN
  to/TO
  postpone/VB
  (PERSON Chartered/NNP Accountants/NNP)
  Foundation/NNP
  Examination/NNP
  ,/,
  (PERSON Paper/NNP)
  –/VBZ
  1/CD
  [/NN
  Principles/NNS
  and/CC
  (ORGANIZATION Practice/NNP)
  of/IN
  Accounting/NNP
  ]/NNP
  and/CC
  (PERSON Paper/NNP)
  -/:
  2/CD
  [/NN
  (ORGANIZATION Business/NNP Laws/NNP)
  &/CC
  (ORGANIZATION Business/NNP Correspondence/NNP)
  and/CC
  Reporting/NNP
  ]/NNP
  scheduled/VBD
  to/TO
  be/VB
  held/VBN
  on/IN
  24th/CD
  and/CC
  26th/CD
  June/NNP
  2022/CD
  respectively/RB
  in/IN
  (GPE Silchar/NNP)
  (/(
  (ORGANIZATION Assam/NNP)
  )/)
  Examination/NNP
  Centre/NNP
  only/RB
  ,/,
  ''/''
  the/DT
  (ORGANIZATION ICAI/NNP)
  said/VBD
  in/IN
  an/DT
  official/JJ
  notification/NN
  ./.)


In [43]:
# Same text as `sentence` (defined in In [40]); alias it instead of repeating
# the long literal so the two copies cannot drift apart.
ex = sentence


In [44]:
def preprocess(sent):
    """Tokenize a raw sentence and tag every token with its part of speech.

    Args:
        sent: The sentence text to process.

    Returns:
        The output of ``nltk.pos_tag`` for the word tokens of ``sent``
        (a list of ``(token, tag)`` pairs).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [45]:
# Tokenize and POS-tag the example sentence; the bare `sent` on the last line
# echoes the resulting list of (token, tag) pairs as the cell output.
sent = preprocess(ex)
sent

[('Due', 'JJ'),
 ('to', 'TO'),
 ('ongoing', 'VBG'),
 ('floods', 'NNS'),
 ('at', 'IN'),
 ('Silchar', 'NNP'),
 ('City', 'NNP'),
 ('(', '('),
 ('Assam', 'NNP'),
 (')', ')'),
 (',', ','),
 ('it', 'PRP'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('decided', 'VBN'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('competent', 'JJ'),
 ('authority', 'NN'),
 ('to', 'TO'),
 ('postpone', 'VB'),
 ('Chartered', 'NNP'),
 ('Accountants', 'NNP'),
 ('Foundation', 'NNP'),
 ('Examination', 'NNP'),
 (',', ','),
 ('Paper', 'NNP'),
 ('–', 'VBZ'),
 ('1', 'CD'),
 ('[', 'NN'),
 ('Principles', 'NNS'),
 ('and', 'CC'),
 ('Practice', 'NNP'),
 ('of', 'IN'),
 ('Accounting', 'NNP'),
 (']', 'NNP'),
 ('and', 'CC'),
 ('Paper', 'NNP'),
 ('-', ':'),
 ('2', 'CD'),
 ('[', 'NN'),
 ('Business', 'NNP'),
 ('Laws', 'NNP'),
 ('&', 'CC'),
 ('Business', 'NNP'),
 ('Correspondence', 'NNP'),
 ('and', 'CC'),
 ('Reporting', 'NNP'),
 (']', 'NNP'),
 ('scheduled', 'VBD'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('held', 'VBN'),
 ('on', 'IN'),
 ('24th', 'CD'),
 ('and',

In [46]:

# Chunk grammar for a simple noun phrase (NP): an optional determiner (<DT>?),
# then any number of adjectives (<JJ>*), ending in exactly one <NN> token.
# Tokens tagged NNP or NNS do not match <NN>, so they are left outside NP chunks.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [47]:
# Build a regular-expression chunker from the NP grammar and apply it to the
# POS-tagged sentence; `cs` is the resulting tree containing NP subtrees.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  Due/JJ
  to/TO
  ongoing/VBG
  floods/NNS
  at/IN
  Silchar/NNP
  City/NNP
  (/(
  Assam/NNP
  )/)
  ,/,
  it/PRP
  has/VBZ
  been/VBN
  decided/VBN
  by/IN
  (NP the/DT competent/JJ authority/NN)
  to/TO
  postpone/VB
  Chartered/NNP
  Accountants/NNP
  Foundation/NNP
  Examination/NNP
  ,/,
  Paper/NNP
  –/VBZ
  1/CD
  (NP [/NN)
  Principles/NNS
  and/CC
  Practice/NNP
  of/IN
  Accounting/NNP
  ]/NNP
  and/CC
  Paper/NNP
  -/:
  2/CD
  (NP [/NN)
  Business/NNP
  Laws/NNP
  &/CC
  Business/NNP
  Correspondence/NNP
  and/CC
  Reporting/NNP
  ]/NNP
  scheduled/VBD
  to/TO
  be/VB
  held/VBN
  on/IN
  24th/CD
  and/CC
  26th/CD
  June/NNP
  2022/CD
  respectively/RB
  in/IN
  Silchar/NNP
  (/(
  Assam/NNP
  )/)
  Examination/NNP
  Centre/NNP
  only/RB
  ,/,
  ''/''
  the/DT
  ICAI/NNP
  said/VBD
  in/IN
  (NP an/DT official/JJ notification/NN)
  ./.)


In [48]:
# Same chunking as the previous cell (a second parser built from `pattern` and
# applied to `sent`), but the tree is drawn instead of printed.
# NOTE(review): Tree.draw() presumably opens a separate GUI window rather than
# rendering inline — confirm it works in the target environment.
NPChunker = nltk.RegexpParser(pattern) 
result = NPChunker.parse(sent)
result.draw()

In [49]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Flatten the chunk tree into (word, POS tag, chunk tag) triples. In the chunk
# tag, B-NP marks the first token of a noun phrase, I-NP a following token of
# the same phrase, and O a token outside any chunk.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('Due', 'JJ', 'O'),
 ('to', 'TO', 'O'),
 ('ongoing', 'VBG', 'O'),
 ('floods', 'NNS', 'O'),
 ('at', 'IN', 'O'),
 ('Silchar', 'NNP', 'O'),
 ('City', 'NNP', 'O'),
 ('(', '(', 'O'),
 ('Assam', 'NNP', 'O'),
 (')', ')', 'O'),
 (',', ',', 'O'),
 ('it', 'PRP', 'O'),
 ('has', 'VBZ', 'O'),
 ('been', 'VBN', 'O'),
 ('decided', 'VBN', 'O'),
 ('by', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('competent', 'JJ', 'I-NP'),
 ('authority', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('postpone', 'VB', 'O'),
 ('Chartered', 'NNP', 'O'),
 ('Accountants', 'NNP', 'O'),
 ('Foundation', 'NNP', 'O'),
 ('Examination', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Paper', 'NNP', 'O'),
 ('–', 'VBZ', 'O'),
 ('1', 'CD', 'O'),
 ('[', 'NN', 'B-NP'),
 ('Principles', 'NNS', 'O'),
 ('and', 'CC', 'O'),
 ('Practice', 'NNP', 'O'),
 ('of', 'IN', 'O'),
 ('Accounting', 'NNP', 'O'),
 (']', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('Paper', 'NNP', 'O'),
 ('-', ':', 'O'),
 ('2', 'CD', 'O'),
 ('[', 'NN', 'B-NP'),
 ('Business', 'NNP', 'O'),
 ('Laws', 'NNP', 'O'),

In [34]:

# source .env/bin/activate
# pip install -U pip setuptools wheel
# pip install -U spacy

^C

Note: you may need to restart the kernel to use updated packages.


In [35]:
# conda update -n base -c defaults conda

^C

Note: you may need to restart the kernel to use updated packages.


In [50]:
# spaCy setup: load the small English pipeline as `nlp`.
# NOTE(review): this cell imports `en_core_web_sm` before the cell that installs
# it (In [51]); on a fresh environment the install has to run first.
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [51]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\medgini\anaconda3

  added / updated specs:
    - spacy


The following NEW packages will be INSTALLED:

  python_abi         conda-forge/win-64::python_abi-3.9-2_cp39

The following packages will be UPDATED:

  conda              pkgs/main::conda-4.13.0-py39haa95532_0 --> conda-forge::conda-4.13.0-py39hcbf5309_1


Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3

In [38]:
# conda install -c conda-forge spacy
# python -m spacy download en_core_web_sm


^C
Note: you may need to restart the kernel to use updated packages.


In [52]:
# Repeat of the spaCy import/load cell (identical to In [50]), re-run after the
# model package was installed; `nlp` is the loaded en_core_web_sm pipeline.
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [53]:
# Run the spaCy pipeline on the same text held in `sentence` (In [40]) instead
# of repeating the long literal a third time, then list every detected entity
# span together with its label.
doc = nlp(sentence)
print([(ent.text, ent.label_) for ent in doc.ents])

[('Silchar City', 'GPE'), ('Chartered Accountants Foundation Examination', 'ORG'), ('Practice of Accounting', 'ORG'), ('24th and', 'DATE'), ('Silchar (Assam', 'ORG'), ('ICAI', 'ORG')]


In [54]:
# Per-token view of the entity annotation: the IOB flag ('B' starts an entity,
# 'I' continues it, 'O' is outside) plus the entity type ('' when there is none).
token_tags = [(token, token.ent_iob_, token.ent_type_) for token in doc]
print(token_tags)

[(Due, 'O', ''), (to, 'O', ''), (ongoing, 'O', ''), (floods, 'O', ''), (at, 'O', ''), (Silchar, 'B', 'GPE'), (City, 'I', 'GPE'), ((, 'O', ''), (Assam, 'O', ''), (), 'O', ''), (,, 'O', ''), (it, 'O', ''), (has, 'O', ''), (been, 'O', ''), (decided, 'O', ''), (by, 'O', ''), (the, 'O', ''), (competent, 'O', ''), (authority, 'O', ''), (to, 'O', ''), (postpone, 'O', ''), (Chartered, 'B', 'ORG'), (Accountants, 'I', 'ORG'), (Foundation, 'I', 'ORG'), (Examination, 'I', 'ORG'), (,, 'O', ''), (Paper, 'O', ''), (–, 'O', ''), (1, 'O', ''), ([, 'O', ''), (Principles, 'O', ''), (and, 'O', ''), (Practice, 'B', 'ORG'), (of, 'I', 'ORG'), (Accounting, 'I', 'ORG'), (], 'O', ''), (and, 'O', ''), (Paper, 'O', ''), (-, 'O', ''), (2, 'O', ''), ([, 'O', ''), (Business, 'O', ''), (Laws, 'O', ''), (&, 'O', ''), (Business, 'O', ''), (Correspondence, 'O', ''), (and, 'O', ''), (Reporting, 'O', ''), (], 'O', ''), (scheduled, 'O', ''), (to, 'O', ''), (be, 'O', ''), (held, 'O', ''), (on, 'O', ''), (24th, 'B', 'DATE'),

In [55]:
from bs4 import BeautifulSoup
import requests
import re

In [67]:
# `url_to_string` was called here without being defined anywhere in the
# notebook, so this cell raised NameError on a fresh kernel. It is defined here
# using only the modules imported in the previous cell.
def url_to_string(url):
    """Download a web page and return its visible text as a single string.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs collapsed into one space.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly instead of running NER over an error page.
    response.raise_for_status()
    # 'html.parser' ships with Python, so no extra parser package is required.
    soup = BeautifulSoup(response.text, 'html.parser')
    # Strip non-content elements so code and styling do not pollute the text.
    for element in soup(['script', 'style', 'aside']):
        element.extract()
    return ' '.join(re.split(r'[\n\t]+', soup.get_text()))


# Fetch the article text, run the spaCy pipeline on it and count the entities.
ny_bb = url_to_string('https://www.medicalnewstoday.com/articles/early-alzheimers-diagnosis-possible-in-a-single-mri-scan-using-new-algorithm#High-accuracy')
article = nlp(ny_bb)
len(article.ents)

67

In [68]:
# Tally how many entities of each label were found in the article.
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'PERSON': 17,
         'DATE': 3,
         'ORG': 24,
         'PERCENT': 8,
         'ORDINAL': 2,
         'CARDINAL': 6,
         'WORK_OF_ART': 1,
         'QUANTITY': 1,
         'PRODUCT': 1,
         'GPE': 4})

In [69]:
# The three entity strings that occur most often in the article.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('CSF', 3), ('MNT', 3), ('Katharine Lang', 2)]

In [70]:
# Materialize the article's sentence segmentation as a list so individual
# sentences can be picked by index, then show one of them.
sentences = list(article.sents)
print(sentences[20])

These patients had either early or late-stage Alzheimer’s and were compared with healthy controls and patients with other neurological conditions.


In [75]:
# Highlight the named entities of one article sentence inline. The sentence
# text is passed through `nlp` again on its own, so it is analysed without the
# surrounding article context.
displacy.render(nlp(str(sentences[50])), jupyter=True, style='ent')

In [72]:
# Draw the dependency parse of sentence 20 (the one printed in In [70]).
# 'distance' is passed through as a displaCy rendering option
# (presumably the spacing between tokens — confirm against the displaCy docs).
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})

In [73]:
# Render entity highlighting over the string form of the article's entity tuple.
# NOTE(review): `str(article.ents)` is the text of the whole tuple, which is then
# re-analysed by `nlp`, so the labels shown may differ from those counted from
# `article.ents` earlier. If the goal is to highlight the full article, passing
# `article` itself to displacy.render is probably what was intended — confirm.
displacy.render(nlp(str(article.ents)), jupyter=True, style='ent')