### CoNLL Chunking Corpus

This notebook contains a standard process for building chunkers using the CoNLL chunking corpus from NLTK. 


In [5]:
import nltk
from nltk.corpus import conll2000
import numpy as np
import pandas as pd


In [87]:
# read the CoNLL-2000 corpus in IOB format: each sentence is a list of
# (word, POS tag, chunk tag) triples
train_iob = conll2000.iob_sents('train.txt')
test_iob = conll2000.iob_sents('test.txt')

In [69]:
# number of sentences in the training data (8936 in the recorded output)
print(len(train_iob))

8936


In [71]:
# sample IOB-tagged sentence: a list of (word, POS tag, chunk tag) triples,
# where B-XX opens a chunk of type XX, I-XX continues it and O is outside any chunk
train_iob[0]

[('Confidence', 'NN', 'B-NP'),
 ('in', 'IN', 'B-PP'),
 ('the', 'DT', 'B-NP'),
 ('pound', 'NN', 'I-NP'),
 ('is', 'VBZ', 'B-VP'),
 ('widely', 'RB', 'I-VP'),
 ('expected', 'VBN', 'I-VP'),
 ('to', 'TO', 'I-VP'),
 ('take', 'VB', 'I-VP'),
 ('another', 'DT', 'B-NP'),
 ('sharp', 'JJ', 'I-NP'),
 ('dive', 'NN', 'I-NP'),
 ('if', 'IN', 'B-SBAR'),
 ('trade', 'NN', 'B-NP'),
 ('figures', 'NNS', 'I-NP'),
 ('for', 'IN', 'B-PP'),
 ('September', 'NNP', 'B-NP'),
 (',', ',', 'O'),
 ('due', 'JJ', 'B-ADJP'),
 ('for', 'IN', 'B-PP'),
 ('release', 'NN', 'B-NP'),
 ('tomorrow', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('fail', 'VB', 'B-VP'),
 ('to', 'TO', 'I-VP'),
 ('show', 'VB', 'I-VP'),
 ('a', 'DT', 'B-NP'),
 ('substantial', 'JJ', 'I-NP'),
 ('improvement', 'NN', 'I-NP'),
 ('from', 'IN', 'B-PP'),
 ('July', 'NNP', 'B-NP'),
 ('and', 'CC', 'I-NP'),
 ('August', 'NNP', 'I-NP'),
 ("'s", 'POS', 'B-NP'),
 ('near-record', 'JJ', 'I-NP'),
 ('deficits', 'NNS', 'I-NP'),
 ('.', '.', 'O')]

### Distribution of Chunks

Let's have a quick look at the distribution of chunks.

In [88]:
# flatten the corpus into one list of chunk tags per split
# (the chunk tag is the third field of every (word, POS tag, chunk tag) triple)
train_chunk_tags = [chunk for sent in train_iob for _word, _pos, chunk in sent]
test_chunk_tags = [chunk for sent in test_iob for _word, _pos, chunk in sent]

In [79]:
# distinct chunk tags (IOB format) in the training data; the recorded run shows 22.
# NOTE: the original cell read an undefined name `chunk_tags`, which raises
# NameError on a fresh kernel; `train_chunk_tags` is the list built in the
# previous cell (presumably the list the recorded output was computed from).
chunk_set = set(train_chunk_tags)
print(len(chunk_set))
print(chunk_set)

22
{'B-ADJP', 'I-PRT', 'I-ADJP', 'B-NP', 'I-ADVP', 'I-CONJP', 'I-NP', 'B-VP', 'I-SBAR', 'I-UCP', 'B-UCP', 'B-ADVP', 'B-PRT', 'I-VP', 'O', 'I-INTJ', 'B-SBAR', 'B-CONJP', 'B-LST', 'I-PP', 'B-INTJ', 'B-PP'}


### Conversion from IOB to Tree Format

In [117]:
# conlltags2tree turns a list of (word, POS tag, chunk tag) triples into a tree
# in which each B-/I- run becomes one labelled subtree
print(nltk.chunk.conlltags2tree(train_iob[0]))

(S
  (NP Confidence/NN)
  (PP in/IN)
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  (SBAR if/IN)
  (NP trade/NN figures/NNS)
  (PP for/IN)
  (NP September/NNP)
  ,/,
  (ADJP due/JJ)
  (PP for/IN)
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  (VP fail/VB to/TO show/VB)
  (NP a/DT substantial/JJ improvement/NN)
  (PP from/IN)
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near-record/JJ deficits/NNS)
  ./.)


### Tree Format

In [122]:
# the corpus reader can also return the sentences directly as chunk trees
train_sents = conll2000.chunked_sents('train.txt')
test_sents = conll2000.chunked_sents('test.txt')

In [73]:
# each chunked training sentence is an nltk.tree.Tree
type(train_sents[0])

nltk.tree.Tree

In [74]:
# sample chunked sentence
# only three chunk labels appear in this tree: NP, VP and PP;
# tokens from other chunk types (e.g. 'if', tagged B-SBAR, and 'due', tagged
# B-ADJP in the IOB data) show up as plain, unchunked tokens
print(train_sents[0])

(S
  (NP Confidence/NN)
  (PP in/IN)
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  (NP trade/NN figures/NNS)
  (PP for/IN)
  (NP September/NNP)
  ,/,
  due/JJ
  (PP for/IN)
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  (VP fail/VB to/TO show/VB)
  (NP a/DT substantial/JJ improvement/NN)
  (PP from/IN)
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near-record/JJ deficits/NNS)
  ./.)


In [80]:
# the same sentence in IOB format; unlike the tree printed by the previous
# cell, it still carries the SBAR and ADJP chunk tags
train_iob[0]

[('Confidence', 'NN', 'B-NP'),
 ('in', 'IN', 'B-PP'),
 ('the', 'DT', 'B-NP'),
 ('pound', 'NN', 'I-NP'),
 ('is', 'VBZ', 'B-VP'),
 ('widely', 'RB', 'I-VP'),
 ('expected', 'VBN', 'I-VP'),
 ('to', 'TO', 'I-VP'),
 ('take', 'VB', 'I-VP'),
 ('another', 'DT', 'B-NP'),
 ('sharp', 'JJ', 'I-NP'),
 ('dive', 'NN', 'I-NP'),
 ('if', 'IN', 'B-SBAR'),
 ('trade', 'NN', 'B-NP'),
 ('figures', 'NNS', 'I-NP'),
 ('for', 'IN', 'B-PP'),
 ('September', 'NNP', 'B-NP'),
 (',', ',', 'O'),
 ('due', 'JJ', 'B-ADJP'),
 ('for', 'IN', 'B-PP'),
 ('release', 'NN', 'B-NP'),
 ('tomorrow', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('fail', 'VB', 'B-VP'),
 ('to', 'TO', 'I-VP'),
 ('show', 'VB', 'I-VP'),
 ('a', 'DT', 'B-NP'),
 ('substantial', 'JJ', 'I-NP'),
 ('improvement', 'NN', 'I-NP'),
 ('from', 'IN', 'B-PP'),
 ('July', 'NNP', 'B-NP'),
 ('and', 'CC', 'I-NP'),
 ('August', 'NNP', 'I-NP'),
 ("'s", 'POS', 'B-NP'),
 ('near-record', 'JJ', 'I-NP'),
 ('deficits', 'NNS', 'I-NP'),
 ('.', '.', 'O')]

### Conversion from Tree to IOB Format

The following method can convert from tree to IOB format.

In [112]:
# tree -> list of (word, POS tag, chunk tag) triples;
# tokens that are not inside any subtree of the tree come out tagged 'O'
nltk.chunk.tree2conlltags(train_sents[0])

[('Confidence', 'NN', 'B-NP'),
 ('in', 'IN', 'B-PP'),
 ('the', 'DT', 'B-NP'),
 ('pound', 'NN', 'I-NP'),
 ('is', 'VBZ', 'B-VP'),
 ('widely', 'RB', 'I-VP'),
 ('expected', 'VBN', 'I-VP'),
 ('to', 'TO', 'I-VP'),
 ('take', 'VB', 'I-VP'),
 ('another', 'DT', 'B-NP'),
 ('sharp', 'JJ', 'I-NP'),
 ('dive', 'NN', 'I-NP'),
 ('if', 'IN', 'O'),
 ('trade', 'NN', 'B-NP'),
 ('figures', 'NNS', 'I-NP'),
 ('for', 'IN', 'B-PP'),
 ('September', 'NNP', 'B-NP'),
 (',', ',', 'O'),
 ('due', 'JJ', 'O'),
 ('for', 'IN', 'B-PP'),
 ('release', 'NN', 'B-NP'),
 ('tomorrow', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('fail', 'VB', 'B-VP'),
 ('to', 'TO', 'I-VP'),
 ('show', 'VB', 'I-VP'),
 ('a', 'DT', 'B-NP'),
 ('substantial', 'JJ', 'I-NP'),
 ('improvement', 'NN', 'I-NP'),
 ('from', 'IN', 'B-PP'),
 ('July', 'NNP', 'B-NP'),
 ('and', 'CC', 'I-NP'),
 ('August', 'NNP', 'I-NP'),
 ("'s", 'POS', 'B-NP'),
 ('near-record', 'JJ', 'I-NP'),
 ('deficits', 'NNS', 'I-NP'),
 ('.', '.', 'O')]

## Creating Chunkers

### Dummy Parser

In [81]:
# dummy parser: the grammar string is empty, so it defines no chunk rules and
# every token is left outside a chunk (tagged 'O')
cp = nltk.RegexpParser('')
# chunk trees of the test split, used as the reference when evaluating parsers
test_sents = conll2000.chunked_sents('test.txt')

In [82]:
# sample test sentence (reference chunk tree)
print(test_sents[0])

(S
  (NP Rockwell/NNP International/NNP Corp./NNP)
  (NP 's/POS Tulsa/NNP unit/NN)
  (VP said/VBD)
  (NP it/PRP)
  (VP signed/VBD)
  (NP a/DT tentative/JJ agreement/NN)
  (VP extending/VBG)
  (NP its/PRP$ contract/NN)
  (PP with/IN)
  (NP Boeing/NNP Co./NNP)
  (VP to/TO provide/VB)
  (NP structural/JJ parts/NNS)
  (PP for/IN)
  (NP Boeing/NNP)
  (NP 's/POS 747/CD jetliners/NNS)
  ./.)


In [83]:
# parse the test sentences one at a time.
# RegexpParser.parse works on a single sentence, not on a whole corpus. The
# original call passed the full list of gold trees, which only "worked" because
# the empty grammar has no stages and returned its input (the gold chunks) unchanged.
# Each sentence is reduced to its (word, POS tag) leaves first so that the
# parser never sees the reference chunks.
test_parsed = [cp.parse(sent.leaves()) for sent in test_sents]

In [84]:
# evaluate the parser against the reference chunk trees in test_sents
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  17.8%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


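An IOB accuracy of 17.8% means that about 17.8% of the words in the test data are tagged 'O', i.e. they belong to none of the three chunk types (NP, VP, PP). Precision, recall and F-measure are all zero because the dummy parser never proposes a chunk.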

In [107]:
# NP parser: any run of one or more consecutive POS tags beginning with
# C, D, J, N or P is tagged as an NP chunk
cp = nltk.RegexpParser(r'''
NP: {<[CDJNP].*>+}
''')

# evaluate against the reference chunk trees
print(cp.evaluate(test_sents))


ChunkParse score:
    IOB Accuracy:  62.5%%
    Precision:     70.6%%
    Recall:        38.5%%
    F-Measure:     49.8%%


Let's also add verb phrases, again using naive logic: any sequence of one or more consecutive POS tags that start with a V is tagged as a VP chunk.

In [108]:
# NP and VP parser:
#   NP - one or more consecutive POS tags beginning with C, D, J, N or P
#   VP - one or more consecutive POS tags beginning with V
cp = nltk.RegexpParser(r'''
NP: {<[CDJNP].*>+} 
VP: {<[V].*>+}''')

# evaluate against the reference chunk trees
print(cp.evaluate(test_sents))


ChunkParse score:
    IOB Accuracy:  71.6%%
    Precision:     67.1%%
    Recall:        53.2%%
    F-Measure:     59.3%%


## Unigram Chunker

Let's now try a simple unigram chunker: for each word, it looks only at the word's part-of-speech tag and assigns the chunk tag (NP, VP or PP chunk) that is most likely for that POS tag in the training data.

In [140]:
# unigram tagger

from nltk import ChunkParserI

class UnigramChunker(ChunkParserI):
    """Chunker that assigns each token a chunk tag based only on its POS tag.

    A unigram tagger is trained on (POS tag, chunk tag) pairs, so the POS tag
    plays the role of the "word" and the chunk tag the role of the "tag".
    """

    def __init__(self, train_sents):
        """Train the underlying unigram tagger.

        train_sents: iterable of chunk trees; each is converted to
        (word, POS tag, chunk tag) triples and the word is dropped.
        """
        pos_chunk_pairs = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            pos_chunk_pairs.append([(pos, chunk) for _word, pos, chunk in triples])
        self.tagger = nltk.UnigramTagger(pos_chunk_pairs)

    def parse(self, sentence):
        """Chunk one sentence given as a sequence of (word, POS tag) pairs.

        Returns the chunk tree built from the predicted chunk tags.
        """
        # tag the POS sequence; the tagger yields (POS tag, chunk tag) pairs
        pos_sequence = [pos for _word, pos in sentence]
        predicted = self.tagger.tag(pos_sequence)

        # re-attach the words and convert the triples back to a tree
        conll_triples = [
            (word, pos, chunk)
            for (word, pos), (_pos, chunk) in zip(sentence, predicted)
        ]
        return nltk.chunk.conlltags2tree(conll_triples)
        

Let's use the UnigramChunker to tag the conll sentences. Note that the sentences have to be passed in the tree format.

In [141]:
# read the sentences in tree format (UnigramChunker expects chunk trees)
train_sents = conll2000.chunked_sents('train.txt')
test_sents = conll2000.chunked_sents('test.txt')

# train the unigram chunker on the training trees and score it on the test trees
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  86.5%%
    Precision:     74.3%%
    Recall:        86.4%%
    F-Measure:     79.9%%


The unigram chunker does much better than the previous ones. The rest of the chunkers are applied to the ATIS dataset.