# Information Extraction

## IE Tasks
- named entity recognition
- relation extraction
- event extraction
- temporal expression normalization
- template filling

## IE Architecture
![](http://www.nltk.org/images/ie-architecture.png)

In [1]:
import nltk, re, pprint

def ie_preprocess(document):
    """Split raw text into sentences, tokenize each one, and POS-tag it.

    Args:
        document: The raw text to process, as a single string.

    Returns:
        A list with one entry per sentence; each entry is the result of
        ``nltk.pos_tag`` applied to that sentence's word tokens.
    """
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    # The original computed this value but discarded it (implicitly returning
    # None); hand the tagged sentences back so callers can actually use them.
    return sentences

## Chunking

- One of the most useful sources of information for NP-chunking is part-of-speech tags
- The code below contains a simple regex chunker

In [2]:
# A POS-tagged example sentence: a list of (word, tag) pairs.
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]

# Chunk grammar: an NP is an optional DT tag, any number of JJ tags, and then
# one or more tags that start with "NN".
grammar = "NP: {<DT>?<JJ>*<NN.*>+}"

# Build a regular-expression chunk parser from the grammar, parse the tagged
# sentence, and print the resulting tree.
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence) 
print(result) 


(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


### Chinking
- Chinking is the process of removing a sequence of tokens from a chunk.

In [3]:
# Two-rule NP grammar: the first rule ({...}) chunks every tag, the second
# rule (}...{) chinks runs of VBD and IN tags back out of those chunks.
grammar = r"""
  NP:
    {<.*>+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
  """
# Same POS-tagged example sentence as (word, tag) pairs.
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
       ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]
# Parse with the chunk-then-chink grammar and print the tree.
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


### Representing chunks

- IOB tags have become the standard way to represent chunk structures in files
- Example:

```
We PRP B-NP
saw VBD O
the DT B-NP
yellow JJ I-NP
dog NN I-NP
```

## Developing and Evaluating Chunkers

### RegEx Chunkers

In [4]:
from nltk.corpus import conll2000
# Load the chunked sentences from the conll2000 'test.txt' file, keeping only
# NP chunks.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
# Chunk any run of one or more tags whose first character is C, D, J, N or P.
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
# Print the parser's score against the gold-standard test sentences.
print(cp.accuracy(test_sents))

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


### Unigram Chunker

In [5]:
class UnigramChunker(nltk.ChunkParserI):
    """Chunker that picks each token's chunk tag with a unigram tagger over POS tags."""

    def __init__(self, train_sents):
        """Train the underlying unigram tagger.

        Each training tree is flattened with ``nltk.chunk.tree2conlltags``
        and reduced to (POS tag, chunk tag) pairs, so the tagger learns a
        mapping from POS tags to chunk tags.
        """
        training_pairs = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            training_pairs.append([(pos, chunk) for _word, pos, chunk in triples])
        self.tagger = nltk.UnigramTagger(training_pairs)

    def parse(self, sentence):
        """Chunk a sentence given as (word, POS tag) pairs and return a tree."""
        # The tagger is fed the POS tags only; the words are re-attached below.
        pos_sequence = [pos for _word, pos in sentence]
        predicted = self.tagger.tag(pos_sequence)
        conlltags = [
            (word, pos, chunk)
            for (word, pos), (_pos, chunk) in zip(sentence, predicted)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

# Load the NP-only chunked sentences from the conll2000 'test.txt' and
# 'train.txt' files.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
# Train on the training sentences and print the score on the test sentences.
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.accuracy(test_sents))
# Collect every distinct POS tag found in the training sentences and print the
# chunk tag the trained tagger assigns to each one.
postags = sorted(set(pos for sent in train_sents for (word,pos) in sent.leaves()))
print(unigram_chunker.tagger.tag(postags))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%
[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]


### Bigram Chunker

In [6]:
class BigramChunker(nltk.ChunkParserI):
    """Chunker that picks each token's chunk tag with a bigram tagger over POS tags."""

    def __init__(self, train_sents):
        """Train the underlying bigram tagger.

        Each training tree is flattened with ``nltk.chunk.tree2conlltags``
        and reduced to (POS tag, chunk tag) pairs before being handed to
        ``nltk.BigramTagger``.
        """
        training_pairs = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            training_pairs.append([(pos, chunk) for _word, pos, chunk in triples])
        self.tagger = nltk.BigramTagger(training_pairs)

    def parse(self, sentence):
        """Chunk a sentence given as (word, POS tag) pairs and return a tree."""
        # The tagger is fed the POS tags only; the words are re-attached below.
        pos_sequence = [pos for _word, pos in sentence]
        predicted = self.tagger.tag(pos_sequence)
        conlltags = [
            (word, pos, chunk)
            for (word, pos), (_pos, chunk) in zip(sentence, predicted)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

# Load the NP-only chunked sentences from the conll2000 'test.txt' and
# 'train.txt' files.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
# Train on the training sentences and print the score on the test sentences.
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.accuracy(test_sents))
# Tag the sorted list of distinct training POS tags as if it were a sentence,
# so each prediction here is made in the context of the tag before it.
postags = sorted(set(pos for sent in train_sents for (word,pos) in sent.leaves()))
print(bigram_chunker.tagger.tag(postags))

ChunkParse score:
    IOB Accuracy:  93.3%%
    Precision:     82.3%%
    Recall:        86.8%%
    F-Measure:     84.5%%
[('#', 'B-NP'), ('$', 'I-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'B-NP'), ('DT', 'I-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'B-NP'), ('JJR', 'I-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'B-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'I-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'I-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', None), ('SYM', None), ('TO', None), ('UH', None), ('VB', None), ('VBD', None), ('VBG', None), ('VBN', None), ('VBP', None), ('VBZ', None), ('WDT', None), ('WP', None), ('WP$', None), ('WRB', None), ('``', None)]


### Classifier-Based Chunkers

*Note: In this example, I removed the megam algorithm since installation can be challenging.*

In [8]:
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Classifier-based tagger that labels the tokens of a sentence left to right.

    Each token's feature set is built by ``npchunk_features`` from the
    sentence, the token position, and the tags assigned so far.
    """

    def __init__(self, train_sents):
        """Train a maximum-entropy classifier on labelled sentences.

        Args:
            train_sents: Iterable of sentences, each a sequence of
                (token, tag) pairs. The tokens are whatever
                ``npchunk_features`` expects as sentence elements.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            # Gold tags seen so far in this sentence, passed to the feature
            # extractor exactly as predicted tags are at tagging time.
            history = []
            for i, (_token, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.MaxentClassifier.train(train_set, trace=0)

    def tag(self, sentence):
        """Assign a tag to every token of ``sentence``.

        Args:
            sentence: Sequence of untagged tokens.

        Returns:
            A list of (token, tag) pairs, one per input token.
        """
        history = []
        for i, _token in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            history.append(self.classifier.classify(featureset))
        # Materialise the pairs: under Python 3 a bare zip() is a one-shot
        # iterator, which callers cannot index, measure, or iterate twice.
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser that delegates tagging to a ConsecutiveNPChunkTagger."""

    def __init__(self, train_sents):
        """Convert the training trees to ((word, pos), chunk) pairs and train the tagger."""
        tagged_sents = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            tagged_sents.append(
                [((word, pos), chunk) for word, pos, chunk in triples]
            )
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        """Chunk a sentence of (word, pos) pairs and return the resulting tree."""
        conlltags = []
        # Flatten each ((word, pos), chunk) pair into a CoNLL-style triple.
        for (word, pos), chunk in self.tagger.tag(sentence):
            conlltags.append((word, pos, chunk))
        return nltk.chunk.conlltags2tree(conlltags)
    
def npchunk_features(sentence, i, history):
    """Build the feature dictionary for the token at position ``i``.

    ``sentence`` is a sequence of (word, pos) pairs. ``history`` is accepted
    for the tagger's calling convention but is not consulted here. Positions
    before the first token and after the last are represented by the
    "<START>" and "<END>" placeholders.
    """
    word, pos = sentence[i]
    prevword, prevpos = ("<START>", "<START>") if i == 0 else sentence[i-1]
    at_end = i == len(sentence) - 1
    nextword, nextpos = ("<END>", "<END>") if at_end else sentence[i+1]
    features = {
        "pos": pos,
        "word": word,
        "prevpos": prevpos,
        "nextpos": nextpos,
        "prevpos+pos": "%s+%s" % (prevpos, pos),
        "pos+nextpos": "%s+%s" % (pos, nextpos),
        "tags-since-dt": tags_since_dt(sentence, i),
    }
    return features

def tags_since_dt(sentence, i):
    """Summarise the POS tags seen before position ``i`` since the last 'DT'.

    Looks only at ``sentence[:i]``. Returns the distinct tags that follow the
    most recent 'DT' tag (or all of them when there is no 'DT'), sorted and
    joined with '+'. The result is an empty string when there are none.
    """
    preceding = [pos for _word, pos in sentence[:i]]
    # Index of the most recent determiner, or -1 when none has been seen.
    last_dt = max(
        (idx for idx, pos in enumerate(preceding) if pos == 'DT'),
        default=-1,
    )
    return '+'.join(sorted(set(preceding[last_dt + 1:])))

# Train the classifier-based chunker and print its score. train_sents and
# test_sents must already be defined (they are loaded in the earlier cells).
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.accuracy(test_sents))

ChunkParse score:
    IOB Accuracy:  96.0%%
    Precision:     88.3%%
    Recall:        91.1%%
    F-Measure:     89.7%%


## Named Entity Recognition

- Named entities are noun phrases that commonly include people, places, and organizations.

![](http://cs.lewisu.edu/~howardcy/nlp/named-entity-types.png)

- Named entity recognition can be broken down into two sub-tasks:
	- identifying the boundaries of the named entity
	- identifying its type
- NLTK provides a classifier that has already been trained to recognize named entities, accessed with the function nltk.ne_chunk(). 

In [9]:
import nltk

# Short example sentence that is tokenized, tagged and NE-chunked below.
sentence = "Ray Klump teaches at Lewis University in Romeoville"
# NOTE(review): sentence1 is assigned here but not used in this cell.
sentence1 = "Citing high fuel prices, United Airlines said Friday it has increased fares by $6 per round trip on flights to some cities also served by lower-cost carriers. American Airlines, a unit of  AMR Corp., immediately matched the move, spokesman Tim Wagner said. United, a unit of UAL Corp., said the increase took effect [TIME Thursday] and applies to most routes where it competes against discount carriers, such as Chicago to Dallas and Denver to San Francisco."
# Pipeline: word tokens -> POS-tagged tokens -> named-entity chunk tree.
sentence_tokenized = nltk.word_tokenize(sentence) 
sentence_tagged = nltk.pos_tag(sentence_tokenized) 
print(nltk.ne_chunk(sentence_tagged)) 

(S
  (PERSON Ray/NNP)
  (ORGANIZATION Klump/NNP)
  teaches/NNS
  at/IN
  (ORGANIZATION Lewis/NNP University/NNP)
  in/IN
  (GPE Romeoville/NNP))


In [10]:
import nltk

# Multi-sentence example text.
document = "Citing high fuel prices, United Airlines said Friday it has increased fares by $6 per round trip on flights to some cities also served by lower-cost carriers. American Airlines, a unit of  AMR Corp., immediately matched the move, spokesman Tim Wagner said. United, a unit of UAL Corp., said the increase took effect [TIME Thursday] and applies to most routes where it competes against discount carriers, such as Chicago to Dallas and Denver to San Francisco."
# Split into sentences, tokenize each sentence into words, then POS-tag each
# tokenized sentence.
sentences = nltk.sent_tokenize(document)
sentences = [nltk.word_tokenize(sent) for sent in sentences]
sentences = [nltk.pos_tag(sent) for sent in sentences]
# Run the named-entity chunker on every tagged sentence and print each tree.
for sentence in sentences:
    print(nltk.ne_chunk(sentence)) 

(S
  Citing/VBG
  high/JJ
  fuel/NN
  prices/NNS
  ,/,
  (GPE United/NNP Airlines/NNPS)
  said/VBD
  Friday/NNP
  it/PRP
  has/VBZ
  increased/VBN
  fares/NNS
  by/IN
  $/$
  6/CD
  per/IN
  round/NN
  trip/NN
  on/IN
  flights/NNS
  to/TO
  some/DT
  cities/NNS
  also/RB
  served/VBN
  by/IN
  lower-cost/JJ
  carriers/NNS
  ./.)
(S
  (GPE American/NNP)
  (ORGANIZATION Airlines/NNPS)
  ,/,
  a/DT
  unit/NN
  of/IN
  (ORGANIZATION AMR/NNP Corp./NNP)
  ,/,
  immediately/RB
  matched/VBD
  the/DT
  move/NN
  ,/,
  spokesman/NN
  (PERSON Tim/NNP Wagner/NNP)
  said/VBD
  ./.)
(S
  (GPE United/NNP)
  ,/,
  a/DT
  unit/NN
  of/IN
  (ORGANIZATION UAL/NNP Corp./NNP)
  ,/,
  said/VBD
  the/DT
  increase/NN
  took/VBD
  effect/NN
  [/NN
  (ORGANIZATION TIME/NNP)
  Thursday/NNP
  ]/NNP
  and/CC
  applies/NNS
  to/TO
  most/JJS
  routes/NNS
  where/WRB
  it/PRP
  competes/VBZ
  against/IN
  discount/NN
  carriers/NNS
  ,/,
  such/JJ
  as/IN
  (GPE Chicago/NNP)
  to/TO
  (GPE Dallas/NNP)
  and/CC
  

## Relation Extraction
- After we've identified named entities we then want to extract the relations that exist between them
- One way to do this is using a regular expression

In [11]:
import nltk
import nltk.corpus
import re

# Pattern for the text between two entities: it must contain the whole word
# "in", and the negative lookahead rejects an "in" that is followed by text
# containing "ing".
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
# For each parsed document in the ieer file 'NYT_19980315', extract ORG/LOC
# entity pairs whose connecting text matches the pattern, and print each
# relation in rtuple form.
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):
        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
