In [1]:
from sklearn.svm import SVC
import nltk
from nltk.corpus.reader import ConllCorpusReader

train = ConllCorpusReader('CoNLL-2003', 'eng.train', ['words', 'pos', 'ignore', 'chunk'])
test = ConllCorpusReader('CoNLL-2003', 'eng.testa', ['words', 'pos', 'ignore', 'chunk'])


In [2]:
from collections import defaultdict

def load_gazetteer_dict():
    with open('./gazetteer.txt') as f:
        lines = f.readlines()
        lines = [i[:-1] for i in lines]
        g_dict = defaultdict(set)
        for line in lines:
            tag, word = line.split()[0], (' ').join(line.split()[1:])
            g_dict[tag].add(word) #stores a set of words for each tag
    
    # print ('gazetteer dict sample: ',g_dict.keys())
    return g_dict
g_dict = load_gazetteer_dict()
g_dict.keys()

dict_keys(['LOC', 'MISC', 'ORG', 'PER'])

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
import re
from copy import copy, deepcopy

def word2features(i, wordseq):
    wi = wordseq[i]
    features = defaultdict(lambda x: -1)
    if wi == '<START>' or wi == '<STOP>':
        features.update({
            wi: True
        })
        return features
    
    features.update({
        # 'Wi': wi,
        'Wiaslower': wi.lower(),
        'iswialpha': wi.isalpha(),
        'iswititle': wi.istitle(),
        'iswiupper': wi.isupper(),
        'iswilower': wi.islower(),
        'iswidigit': wi.isdigit(),
        'iswinumeric': wi.isnumeric(),
        'Wishape': len(wi),
        # 'ishyphens': re.match('-',wi) == None,

    })
    if i>1:
        wiminus1 = wordseq[i-1]
        features.update({
            # 'Wi-1': wiminus1,
            'iswiminus1title': wiminus1.istitle(),
            'iswiminus1upper': wiminus1.isupper(),
            'iswiminus1lower': wiminus1.islower(),
            'Wi-1aslower': wiminus1.lower(),
        })
    elif i==1:
        features.update({
            'BOS':True,
        })
    if i<len(wordseq)-2:
        wiplus1 = wordseq[i+1]
        features.update({
            # 'Wi+1': wiplus1,
            'iswiplus1title': wiplus1.istitle(),
            'iswiplus1upper': wiplus1.isupper(),
            'iswiplus1lower': wiplus1.islower(),
            'Wi+1aslower': wiplus1.lower(),
        })
    elif i==len(wordseq)-2:
        features.update({
            'EOS':True,
        })
    if wi != '.':
        gaz = False
        for k in g_dict.keys():
            if wi in g_dict[k]:
                gaz = True
                features.update({
                    'gaztag-'+str(k): 1,
                })
        features.update({'gaz': gaz})
    else : features.update({'gaz': -1})
    return features

def sent2features(sentence):
    assert isinstance(sentence, list) and isinstance(sentence[0], str), '`sentence` should be list of words as str'
    xs = [None]*len(sentence)
    for i in range(len(sentence)):
        xs[i] = word2features(i, sentence)
    return xs

dummyfeature = [word2features(i,['<START>','aaa','China','Matthew','Jordans','AaA9','123','<STOP>']) for i in range(8)]
d2v = DictVectorizer(sparse=False)
d2v.fit(dummyfeature)
save_transform = None
red2v = DictVectorizer(sparse=False)
def feats2vects(features, test=False, refit=True):
    if not refit and not test: tX = d2v.transform(features)
    elif refit and not test:
        tX = red2v.fit_transform(features)
    # print(tX.shape)
    elif test:
        tX = red2v.transform(features)
    return tX

# def sent2featvects(sentence):
#     xs = sent2features(sentence)
#     tx = feats2vects(xs)
#     # print(tx.shape)
#     return tx



all_labels = ['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O', '<START>', '<STOP>']                 
le = LabelEncoder()
le.fit(all_labels)
def ylabel_encode_decode(y, todo='encode'):
    if todo == 'encode':
        ty = le.transform(y)
    elif todo == 'decode':
        ty = le.inverse_transform(y)
    # print(ty.shape)
    return ty


In [4]:
import numpy as np
from itertools import chain
def iob_sents2Xy(iob_sents, test=False) :
    Xs = list(chain.from_iterable([['<START>']+[w for w, _, _ in wseq]+['<STOP>'] for wseq in iob_sents]))
    y = list(chain.from_iterable([['<START>']+[e for _, _, e in wseq]+['<STOP>'] for wseq in iob_sents]))
    X = sent2features(Xs)
    X = feats2vects(X, test=test)
        # for wseq in iob_sents:
        #     xi = []
        #     y.append('<START>')
        #     xi.extend(sent2featvects(['<START>']+[w for w, _, _ in wseq]+['<STOP>']))
        #     y.extend([entity for w, _, entity in wseq])
        #     y.append('<STOP>')
        #     X.extend(xi)
    return np.array(X), np.array(y)

In [5]:
m = 1000
Xtrain, ytrain = iob_sents2Xy(train.iob_sents()[:m])
Xtrain.shape, ytrain.shape

((13653, 9531), (13653,))

In [6]:
svmclassifier = SVC()

svmclassifier.fit(Xtrain,ytrain)

Xtest, ytest = iob_sents2Xy(train.iob_sents()[m:m+100], test=True)

predictions = svmclassifier.predict(Xtest)

print(f'{(predictions == ytest).sum()} correct out of {ytest.shape[0]} entities. '
      f'Accuracy = {(predictions == ytest).sum()/ytest.shape[0]}')
print(predictions[:10], ytest[:10])

1073 correct out of 1185 entities. Accuracy = 0.9054852320675105
['<START>' 'O' 'O' 'B-ORG' 'O' 'O' 'O' 'O' 'O' 'O'] ['<START>' 'O' 'O' 'B-ORG' 'O' 'O' 'O' 'O' 'O' 'O']


In [7]:
red2v.inverse_transform(Xtest[:3])
# d2v.inverse_transform(d2v.transform(dummyfeature))

[{'<START>': 1.0},
 {'Wi+1aslower=-': 1.0,
  'Wiaslower=soccer': 1.0,
  'Wishape': 6.0,
  'iswialpha': 1.0,
  'iswiupper': 1.0},
 {'Wi-1aslower=soccer': 1.0,
  'Wiaslower=-': 1.0,
  'Wishape': 1.0,
  'iswiminus1upper': 1.0,
  'iswiplus1upper': 1.0}]

In [8]:
train.iob_sents()[m]

[('SOCCER', 'NN', 'O'),
 ('-', ':', 'O'),
 ('UEFA', 'NNP', 'B-ORG'),
 ('REWARDS', 'NNPS', 'O'),
 ('THREE', 'CD', 'O'),
 ('COUNTRIES', 'NNS', 'O'),
 ('FOR', 'IN', 'O'),
 ('FAIR', 'NNP', 'O'),
 ('PLAY', 'NNP', 'O'),
 ('.', '.', 'O')]