In [4]:
from nltk import word_tokenize, pos_tag, ne_chunk,download
import os
import collections
import string
from nltk.stem.snowball import SnowballStemmer
import pickle
from collections import Iterable
from nltk.tag import  ClassifierBasedTagger
from nltk.chunk import ChunkParserI
from nltk.chunk import conlltags2tree, tree2conlltags

The following functions helps us parse the file and create the chunker.
<br>
-__read_gmb__ :reads the directory that the files exists. Checks and includes only the words with the required tags and dismisses the other words.
<br>
-__transform__ :Insert I and B on tags (inner of a segment and beginning of segment).
<br>

In [10]:
ner_tags=collections.Counter()

def read_gmb(corpus_root):
    print 'About to read from file in ',corpus_root
    #Move in every folder of the directory.
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            #Keep only the .tags files.
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    file_content = file_handle.read().decode('utf-8').strip()
                    #Sentences are separated by two newlines.
                    annotated_sentences = file_content.split('\n\n')
                    for annotated_sentence in annotated_sentences:
                        #Words are separated by a newline.
                        annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]
                        standard_form_tokens = []

                        for idx, annotated_token in enumerate(annotated_tokens):
                            #Each word is  separated via tab by its annotation
                            annotations = annotated_token.split('\t')
                            word, tag, ner = annotations[0], annotations[1], annotations[3]
                            #As the exercises tells,keep only the most important-primary categories.
                            #Case where there is an interesting notation ,split the subcategories(geo-nam->geo) and kee only the 
                            #bigger categories.
                            if ner != 'O':
                                ner = ner.split('-')[0]
                                ner_tags[ner] += 1
                            standard_form_tokens.append((word, tag, ner))
                        b_i_ = transform(standard_form_tokens)
                        # The naive bayes classifier version must get as input ((word,tag),iob_notation)
                        yield [((w, t), iob) for w, t, iob in b_i_]

In [6]:
def transform(annotated_sentence):
    #[(w1, t1, iob1), ...] with not proper format that would be transformed to proper(B- will be added on the beginning of the IOB notation 
    #I_ will be added on the inner segment elements.)
    not_ = []
    #For each word we meet in a sentence
    for idx, annotated_token in enumerate(annotated_sentence):
        t, wrd, category = annotated_token
        if category != 'O': #O are the objects annotated as outer to any chunk.
            #if previous item notation of the segment  is B-,then the following item should be inner and add I- to the beginning
            if annotated_sentence[idx - 1][2] == category:
                category = "I-" + category
            else:         #Otherwise it is the B- case where the item is in the beginning of a segment 
                category = "B-" + category
        not_.append((t, wrd, category))
    return not_

class NamedEntityChunker is child class of ChunkParse. It has two fields: 
<br>
__feature_detector__ : used for identifying the features.It takes a word and gets valuable information such as the previous,next words etc.
<br>
__tagger__ : Naive-Bayes Classifier used to predict the different sequences.

In [7]:
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        #The function features that will be used for each instance to return its features.
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(train=train_sents,feature_detector=features,**kwargs)
 
    def parse(self, tagged_sent):
        #Call the classifier for the input and keep the result in chunks variable.
        chunks = self.tagger.tag(tagged_sent)
        #Perform the opposite than the previous procedure: ((word,tag),notation)->(word,tag,notation)
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        return conlltags2tree(iob_triplets)

In [40]:
def repr_(word):
    str_=''
    for x in word:
        if x.isupper()==True:
            str_=str_+'X'
        else:
            str_=str_+'x'
        return str_

def features(tokens, index, history):
    #Features contain different properties tha may be useful to be known
    # init the stemmer
    stemmer = SnowballStemmer('english')
    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)
    index += 2 
    word, pos = tokens[index] #Actual word
    prevword, prevpos = tokens[index - 1]  #Previous word/Previous index
    prevprevword, prevprevpos = tokens[index - 2] #Prev-previous word,prev-previous index
    nextword, nextpos = tokens[index + 1] #Next word,next index
    nextnextword, nextnextpos = tokens[index + 2] #Next-next word,next-next index
    prefix1=word[0] #The first letter of the word
    suffix1=word[-1] #The last letter of the world
    previob = history[index - 1]  #The previous IOB tag assigned.
    allascii = all([True for c in word if c in string.ascii_lowercase]) #All lowercase
    repr_w=repr_(word) #representation of current,previous and next words. Representation is of form X for capital
                        #x for lowercase.
    prev_repr=repr_(prevword)
    next_repr=repr_(nextword)
   
    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,
        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,
        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,
        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
        'prev-iob': previob,
        'prefix':prefix1,
        'suffix':suffix1,
        'representation':repr_w,
        'prev-repr':prev_repr,
        'next-repr':next_repr
    }

In [14]:
root='/media/sf_mine/gmb-2.2.0/'
reader = read_gmb(root)
data = list(reader)
print ner_tags

About to read from file in  /media/sf_mine/gmb-2.2.0/
Counter({u'geo': 116776, u'org': 96188, u'per': 88508, u'tim': 69578, u'gpe': 41360, u'art': 1734, u'eve': 1418, u'nat': 600})


In [41]:
tr= data[:int(len(data) * 0.8)] #80 percent of the dataset will be the training sample
te = data[int(len(data) * 0.8):]#20 percent of the dataset will be from the test sample
 
print "Train instances = %s" % len(tr) 
print "Test instances  = %s" % len(te) 
chunker = NamedEntityChunker(tr[:2000])

Train instances = 49608
Test instances  = 12402


In [42]:
score = chunker.evaluate([conlltags2tree([(w, t, iob) for (w, t), iob in iobs]) for iobs in te[:500]])
print score.accuracy()  

0.925805578121
