In [1]:
from itertools import chain
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
import pycrfsuite
import io
from __future__ import print_function


data_fname = 'training-data-set-1.txt'

In [2]:
def loadfile(fname):
    
    def iterfile(fname):
        with io.open(fname,'r',encoding='utf8') as f:
            for line in f:
                yield line

    l = []
    l_temp = []
    
    for line in iterfile(fname):
        if line != '\n':
            try:
                token,label = line.strip().split()
                if label == '0':
                    label = 'O'
            except ValueError:
                print(idx)
            l_temp.append((token,label))
        else:
            l.append(l_temp)
            l_temp = []
            
    return l


def word2features(sent, i):
    
    word=sent[i][0]
    
    features = {
        'bias': 1.0,
        'word.lower': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper': word.isupper(),
        'word.istitle': word.istitle(),
        'word.isdigit': word.isdigit(),
        'BOS': False,
        'EOS': False,
        '-1:word.lower': '<pad>',
        '-1:word.istitle': False,
        '-1:word.isupper': False,
        '+1:word.lower': '<pad>',
        '+1:word.istitle': False,
        '+1:word.isupper': False
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower': word1.lower(),
            '-1:word.istitle': word1.istitle(),
            '-1:word.isupper': word1.isupper(),
        })
    else:
        features['BOS']= True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0] 
        features.update({
            '+1:word.lower': word1.lower(),
            '+1:word.istitle': word1.istitle(),
            '+1:word.isupper': word1.isupper(),
        })
    else:
        features['EOS']= True
        
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token,label in sent]

def sent2tokens(sent):
    return[token for token,label in sent]

def tran_test_split(data,per=0.2):
    split = int(len(data)*(1-per))
    train_sents = data[:split]
    test_sents = data[split:]
    return train_sents,test_sents

def data2feats(sents):
    X = [sent2features(s) for s in sents]
    y = [sent2labels(s) for s in sents]
    return X,y

def bio_classification_report(y_true, y_pred):
    
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )
    

## Feature Extraction

In [3]:
data = loadfile(data_fname)
data[:2]

[[(u'3', u'B_amount'),
  (u'Liter', u'B_unit'),
  (u'frische', u'B_productName'),
  (u'Bio', u'I_productName'),
  (u'Vollmilch', u'I_productName'),
  (u'mit', 'O'),
  (u'1.5%', u'B_productAttribute'),
  (u'Fett', u'I_productAttribute'),
  (u'von', 'O'),
  (u'Berchtesgadener', u'B_brand'),
  (u'Land', u'I_brand')],
 [(u'2', u'B_amount'),
  (u'Liter', u'B_unit'),
  (u'frische', u'B_productName'),
  (u'Bio', u'I_productName'),
  (u'Vollmilch', u'I_productName'),
  (u'mit', 'O'),
  (u'3.8%', u'B_productAttribute'),
  (u'Fett', u'I_productAttribute')]]

In [4]:
%%time
train_sents,test_sents = tran_test_split(data, per=0.2)
X_train,y_train = data2feats(train_sents)
X_test,y_test = data2feats(test_sents)

print('Train Sents: \t%i\n' %len(X_train) +
      'Train Tokens: \t%i\n' %sum([len(x) for x in X_train]) +
      '- - - - - - -\n'
      'Test Sents: \t%i\n' %len(X_test) +
      'Test Tokens: \t%i\n' %sum([len(x) for x in X_test])
     )

Train Sents: 	107
Train Tokens: 	535
- - - - - - -
Test Sents: 	27
Test Tokens: 	133

Wall time: 6 ms


In [5]:
X_train[0][0]

{'+1:word.istitle': True,
 '+1:word.isupper': False,
 '+1:word.lower': u'liter',
 '-1:word.istitle': False,
 '-1:word.isupper': False,
 '-1:word.lower': '<pad>',
 'BOS': True,
 'EOS': False,
 'bias': 1.0,
 'word.isdigit': True,
 'word.istitle': False,
 'word.isupper': False,
 'word.lower': u'3',
 'word[-2:]': u'3',
 'word[-3:]': u'3'}

## Training

In [6]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

Wall time: 13 ms


In [7]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 100,  # epoches

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [8]:
%%time
trainer.train('UG_NER_1203_1st.model')

Wall time: 166 ms


## Testing

In [9]:
tagger = pycrfsuite.Tagger()
tagger.open('UG_NER_1203_1st.model')

<contextlib.closing at 0x7632a90>

In [10]:
example = test_sents[0]

print('Sent:\t\t' + ' '.join(sent2tokens(example)) + '\n')
print('Predict:\t' + ' '.join(tagger.tag(example)))
print('Labels:\t\t' + ' '.join(sent2labels(example)))

Sent:		eine Tüte Chio Tortillas Nacho Cheese

Predict:	B_amount B_unit B_brand I_brand B_productName I_productName
Labels:		B_amount B_unit B_brand B_productName I_productName B_productAttribute


In [11]:
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]

Wall time: 4 ms


In [12]:
print(bio_classification_report(y_test, y_pred))

                    precision    recall  f1-score   support

          B_amount       0.97      1.00      0.98        28
           B_brand       0.50      0.89      0.64         9
B_productAttribute       0.33      0.25      0.29         8
     B_productName       0.57      0.53      0.55        30
            B_unit       0.72      0.95      0.82        22
           I_brand       0.67      0.50      0.57         4
I_productAttribute       0.00      0.00      0.00         1
     I_productName       0.56      0.29      0.38        17

       avg / total       0.70      0.69      0.68       133

