In [5]:
from sklearn.svm import SVC
import nltk
from nltk.corpus.reader import ConllCorpusReader
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np
from itertools import chain
import re
from copy import copy, deepcopy

train = ConllCorpusReader('CoNLL-2003', 'eng.train', ['words', 'pos', 'ignore', 'chunk'])
test = ConllCorpusReader('CoNLL-2003', 'eng.testa', ['words', 'pos', 'ignore', 'chunk'])

def load_gazetteer_dict():
    with open('gazetteer.txt') as f:
        lines = f.readlines()
        lines = [i.strip() for i in lines]
        g_dict = defaultdict(set)
        for line in lines:
            tag, word = line.split()[0], (' ').join(line.split()[1:])
            g_dict[tag].add(word)
    return g_dict

g_dict = load_gazetteer_dict()

# Function to generate word-level features
def word2features(i, wordseq):
    wi = wordseq[i]
    features = defaultdict(lambda: -1)

    if wi == '<START>' or wi == '<STOP>':
        features.update({
            wi: True
        })
        return features

    features.update({
        'Wiaslower': wi.lower(),
        'iswialpha': wi.isalpha(),
        'iswititle': wi.istitle(),
        'iswiupper': wi.isupper(),
        'iswilower': wi.islower(),
        'iswidigit': wi.isdigit(),
        'iswinumeric': wi.isnumeric(),
        'Wishape': len(wi),
    })

    if i > 1:
        wiminus1 = wordseq[i - 1]
        features.update({
            'iswiminus1title': wiminus1.istitle(),
            'iswiminus1upper': wiminus1.isupper(),
            'iswiminus1lower': wiminus1.islower(),
            'Wi-1aslower': wiminus1.lower(),
        })
    elif i == 1:
        features.update({
            'BOS': True,
        })

    if i < len(wordseq) - 2:
        wiplus1 = wordseq[i + 1]
        features.update({
            'iswiplus1title': wiplus1.istitle(),
            'iswiplus1upper': wiplus1.isupper(),
            'iswiplus1lower': wiplus1.islower(),
            'Wi+1aslower': wiplus1.lower(),
        })
    elif i == len(wordseq) - 2:
        features.update({
            'EOS': True,
        })

    if wi != '.' or not wi.isnumeric():
        gaz = False
        for k in g_dict.keys():
            if wi in g_dict[k]:
                gaz = True
                features.update({
                    'gaztag-' + str(k): 1,
                })
        features.update({'gaz': gaz})
    else:
        features.update({'gaz': -1})

    return features

# Function to convert sentence into features
def sent2features(sentence):
    assert isinstance(sentence, list) and isinstance(sentence[0], str), '`sentence` should be list of words as str'
    assert sentence[0]=='<START>' and sentence[-1]=='<STOP>' , '`sentence` should have <START> and <STOP> tags'
    xs = [None] * len(sentence)
    for i in range(len(sentence)):
        xs[i] = word2features(i, sentence)
    return xs

# Label Encoder to return 1 for B- and I- tags, 0 otherwise
def ylabel_encode_decode(y, todo='encode'):
    if todo == 'encode':
        ty = [1 if label.startswith('B-') or label.startswith('I-') else 0 for label in y]
    return ty

# Function to vectorize the features and labels from sentences in IOB format
def iob_sents2Xy(iob_sents, test=False):
    Xs = list(chain.from_iterable([['<START>'] + [w for w, _, _ in wseq] + ['<STOP>'] for wseq in iob_sents]))
    y = list(chain.from_iterable([['<START>'] + [e for _, _, e in wseq] + ['<STOP>'] for wseq in iob_sents]))

    y = ylabel_encode_decode(y)

    X = sent2features(Xs)
    X = feats2vects(X, test=test)

    return np.array(X), np.array(y)

d2v = DictVectorizer(sparse=False)

# Function to transform features into vectors
def feats2vects(features, test=False):
    if not test:
        return d2v.fit_transform(features)
    else:
        return d2v.transform(features)

m = 1000

# Prepare training data
Xtrain, ytrain = iob_sents2Xy(train.iob_sents()[:m])

svmclassifier = SVC()
svmclassifier.fit(Xtrain, ytrain)

Xtest, ytest = iob_sents2Xy(train.iob_sents()[m:m + 100], test=True)

predictions = svmclassifier.predict(Xtest)

print(f'{(predictions == ytest).sum()} correct out of {ytest.shape[0]} entities. '
      f'Accuracy = {(predictions == ytest).sum() / ytest.shape[0]}')

print(predictions[:10], ytest[:10])

1153 correct out of 1185 entities. Accuracy = 0.9729957805907173
[0 0 0 1 1 0 1 0 0 0] [0 0 0 1 0 0 0 0 0 0]


In [12]:
k = 20
print(predictions[:k],'\n', ytest[:k],'\n',[[w for w, _, _ in s] for s in train.iob_sents()[m:m+k]])

[0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1] 
 [0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1] 
 [['SOCCER', '-', 'UEFA', 'REWARDS', 'THREE', 'COUNTRIES', 'FOR', 'FAIR', 'PLAY', '.'], ['GENEVA', '1996-08-22'], ['Norway', ',', 'England', 'and', 'Sweden', 'were', 'rewarded', 'for', 'their', 'fair', 'play', 'on', 'Thursday', 'with', 'an', 'additional', 'place', 'in', 'the', '1997-98', 'UEFA', 'Cup', 'competition', '.'], ['Norway', 'headed', 'the', 'UEFA', 'Fair', 'Play', 'rankings', 'for', '1995-96', 'with', '8.62', 'points', ',', 'ahead', 'of', 'England', 'with', '8.61', 'and', 'Sweden', '8.57', '.'], ['The', 'rankings', 'are', 'based', 'on', 'a', 'formula', 'that', 'takes', 'into', 'account', 'many', 'factors', 'including', 'red', 'and', 'yellow', 'cards', ',', 'and', 'coaching', 'and', 'spectators', "'", 'behaviour', 'at', 'matches', 'played', 'at', 'an', 'international', 'level', 'by', 'clubs', 'and', 'national', 'teams', '.'], ['Only', 'the', 'top', 'three', 'countries', 'are', 'allocated', '