

Information Extraction: 
1. Part-of-Speech Tagging
2. Chunking
3. Chinking
4. Named Entity Recognition
5. Relation Extraction



1. Part-of-Speech Tagging


In [188]:
import nltk
import string
import re

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.tokenize.regexp import WhitespaceTokenizer
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def pos_tagging(text):
 word_tokens = word_tokenize(text)
 return pos_tag(word_tokens) 
  
pos_tagging('You just gave me a scare')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('You', 'PRP'),
 ('just', 'RB'),
 ('gave', 'VBD'),
 ('me', 'PRP'),
 ('a', 'DT'),
 ('scare', 'NN')]

In [189]:
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [190]:
nltk.help.upenn_tagset('NN')


NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


2. Chunking

In [191]:
locs = [('Omnicom', 'IN', 'New York'),
...         ('DDB Needham', 'IN', 'New York'),
...         ('Kaplan Thaler Group', 'IN', 'New York'),
...         ('BBDO South', 'IN', 'Atlanta'),
...         ('Georgia-Pacific', 'IN', 'Atlanta')]
query = [e1 for (e1, rel, e2) in locs if e2=='Atlanta']
print(query)

def ie_preprocess(document):
...    sentences = nltk.sent_tokenize(document) 
...    sentences = [nltk.word_tokenize(sent) for sent in sentences]
...    sentences = [nltk.pos_tag(sent) for sent in sentences] 

sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
... ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)


['BBDO South', 'Georgia-Pacific']
(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


3. Chinking


In [192]:
grammar = r"""
  NP: {<DT|PP\$>?<JJ>*<NN>}  
      {<NNP>+}                
"""
cp = nltk.RegexpParser(grammar)
sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"), 
                 ("her", "PP$"), ("long", "JJ"), ("golden", "JJ"), ("hair", "NN")]
print(cp.parse(sentence))

(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))


In [193]:
nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
grammar = "NP: {<NN><NN>}  # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))

(S (NP money/NN market/NN) fund/NN)


4. Named Entity Recognition

In [194]:
import nltk
nltk.download('treebank')
nltk.download('maxent_ne_chunker')
sent = nltk.corpus.treebank.tagged_sents()
print(nltk.ne_chunk(sent, binary=True))
(S
 The/DT
  (NE U.S./NNP)
  is/VBZ
  one/CD
  
  according/VBG
  to/TO
  (NE Brooke/NNP T./NNP Mossman/NNP)
)

SyntaxError: ignored

Relation Extraction

In [None]:
import nltk
nltk.download('ieer')
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
 for rel in nltk.sem.extract_rels('ORG', 'LOC', doc,
                         corpus='ieer', pattern = IN):
  print(nltk.sem.rtuple(rel))

In [None]:
from nltk.corpus import conll2002
nltk.download('conll2002')
vnv = """
(
is/V|    # 3rd sing present and
was/V|   # past forms of the verb zijn ('be')
werd/V|  # and also present
wordt/V  # past of worden ('become)
)
 .*       # followed by anything
van/Prep # followed by van ('of')
"""

VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
 for rel in nltk.sem.extract_rels('PER', 'ORG', doc,
                                corpus='conll2002', pattern=VAN):
  print(nltk.sem.clause(rel, relsym="VAN"))