In [1]:
from lxml import etree
import pycrfsuite
import csv

In [2]:
data_file = 'data/ssj500k-sl.TEI/ssj500k-sl.body.xml'
data_split = 0.8

#### Classes representing sentences and words within sentences

In [3]:
class Sentence():
    def __init__(self, sentence):
        self.words = []
        for part in sentence.getchildren():
            if part.tag == '{http://www.tei-c.org/ns/1.0}seg':
                named_entity = part.get('subtype')
                for ne in part.getchildren():
                    self.words.append(Word(ne, named_entity))
            elif part.tag == '{http://www.tei-c.org/ns/1.0}w':
                self.words.append(Word(part))
        
class Word():
    def __init__(self, word, ne=None):
        self.properties = {}
        for prop in word.keys():
            self.properties[prop] = word.get(prop)
        if ne == None:
            self.properties['named_entity'] = 'none'
        else:
            self.properties['named_entity'] = ne
        self.properties['text'] = word.text

#### We use lxml to parse input data and save it into our classes

In [4]:
sentences = []

for event, element in etree.iterparse(data_file, events=('end', 'start'), encoding='utf-8'):
    if element.tag == '{http://www.tei-c.org/ns/1.0}s' and event == 'start':
        sentences.append(Sentence(element))
    
sentences = sentences[:int(len(sentences) / 8) * 3]
    
sentences = list(filter(lambda x: len(x.words) > 0, sentences))

#### Helper functions for generating features from words and their relationships within sentences

In [6]:
def features_from_sentence(sentence):
    return [features_from_word(sentence, i) for i in range(len(sentence.words))]

def labels_from_sentence(sentence):
    return [label_from_word(sentence, i) for i in range(len(sentence.words))]

def features_from_word(sentence, i):
    word = sentence.words[i]
    text = word.properties['text']
    if text != None:
        features = [
            'word_to_lowercase=' + text.lower(),
            'last_3_chars=' + text[-3:],
            'last_2_chars=' + text[-2:],
            'word_all_uppercase=%s' % (text.isupper()),
            'word_is_title_case=%s' % (text.istitle()),
            'word_all_digits=%s' % (text.isdigit())
        ]
    else:
        features = []
    if 'ana' in word.properties:
        features.extend([
            'word_ana=%s' % (word.properties['ana'])
        ])
    
    if i > 0:
        prev_word = sentence.words[i-1]
        text = prev_word.properties['text']
        if text != None:
            features.extend([
                'prev_word_to_lowercase=' + text.lower(),
                'prev_word_is_title_case=%s' % (text.istitle()),
                'prev_word_all_uppercase=%s' % (text.isupper())
            ])
        if 'ana' in prev_word.properties:
            features.extend([
                'word_ana=%s' % (prev_word.properties['ana'])
            ])
    else:
        features.append('Start')
    
    if i < len(sentence.words) - 1:
        next_word = sentence.words[i + 1]
        text = next_word.properties['text']
        if text != None:
            features.extend([
                'next_word_to_lowercase=' + text.lower(),
                'next_word_is_title_case=%s' % (text.istitle()),
                'next_word_all_uppercase=%s' % (text.isupper())
            ])
        if 'ana' in next_word.properties:
            features.extend([
                'word_ana=%s' % (next_word.properties['ana'])
            ])
    else:
        features.append('End')
        
    return features

def label_from_word(sentence, i):
    word = sentence.words[i]
    return word.properties['named_entity']

In [None]:
msd_features = len(list(filter(lambda sen: len(list(filter(lambda word: 'msd' in word.properties, sen.words))) > 0, sentences)))
ana_features = len(list(filter(lambda sen: len(list(filter(lambda word: 'ana' in word.properties, sen.words))) > 0, sentences)))
ne_features = len(list(filter(lambda sen: len(list(filter(lambda word: word.properties['named_entity'] != 'none', sen.words))) > 0, sentences)))

(len(sentences), msd_features, ana_features, ne_features)

In [7]:
split_index = int(len(sentences) * data_split)
X_train = [features_from_sentence(s) for s in sentences[:split_index]]
Y_train = [labels_from_sentence(s) for s in sentences[:split_index]]

X_test = [features_from_sentence(s) for s in sentences[split_index:]]
Y_test = [labels_from_sentence(s) for s in sentences[split_index:]]

In [9]:
sentences = []

In [10]:
trainer = pycrfsuite.Trainer(verbose=False)

for x, y in zip(X_train, Y_train):
    trainer.append(x, y)

In [11]:
trainer.set_params({
    'c1': 1.0,
    'c2': 1e-3,
    'max_iterations': 50,
    'feature.possible_transitions': True
})

In [12]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [13]:
trainer.train('models/crf_model.crfsuite')

In [14]:
trainer.logparser.iterations[-1]

{'num': 50,
 'scores': {},
 'loss': 6717.305201,
 'feature_norm': 52.27461,
 'error_norm': 713.910368,
 'active_features': 4439,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.096}

In [15]:
tagger = pycrfsuite.Tagger()
tagger.open('models/crf_model.crfsuite')

<contextlib.closing at 0x3f38cf40>

In [16]:
predictions = []

for ex in X_test:
    predictions.append(tagger.tag(ex))

In [18]:
with open('data/predictions.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(predictions)

In [19]:
with open('data/true_values.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(Y_test)