In [1]:
import numpy as np
from pystruct.datasets import load_letters
letters = load_letters()
X, y, folds = letters['data'], letters['labels'], letters['folds']
# we convert the lists to object arrays, as that makes slicing much more
# convenient
X, y = np.array(X), np.array(y)

In [2]:
def convert_letter_to_dict(letter):
    d = {}
    for ind, i in enumerate(letter):
        d["p_"+str(ind)] = i 
    return d

In [3]:
def word2features(word, i):
    letter = word[i]
    features = {
    #     'bias',
    'letter': convert_letter_to_dict(letter)
    }
    if i < len(word)-1:
        letter = word[i+1]
        features.update({
        '+1:letter': str(convert_letter_to_dict(letter))
        })
    return features

In [4]:
def create_word_features(data):
    return [word2features(data, i) for i in range(len(data))]
X_features = [create_word_features(word) for word in X]

In [5]:
X_features = np.array(X_features)

In [6]:
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=True)

In [7]:
X_train, X_test = X_features[folds == 1], X_features[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]

In [8]:
y_tr = []
for y_i in y_train:
    z = []
    for i in y_i:
        z.append(str(i))
    y_tr.append(z)
    
y_te = []
for y_i in y_test:
    z = []
    for i in y_i:
        z.append(str(i))
    y_te.append(z)

In [9]:
cnt=0
a=0
for xseq, yseq in zip(X_train, y_tr):
#     print(xseq)
#     print(yseq)
    cnt +=1
    ystr = [str(i) for i in yseq]
    if(len(ystr)!=len(xseq)):
       print(cnt)
       continue
    a+=1
    trainer.append(xseq, ystr)

In [10]:
trainer.set_params({
 'c1': 0.10, 
 'c2': 1e-3,
 'max_iterations': 60,
 'feature.possible_transitions': True
})
trainer.train('ocr.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 8661
Seconds required: 0.058

L-BFGS optimization
c1: 0.100000
c2: 0.001000
num_memories: 6
max_iterations: 60
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 14984.138788
Feature norm: 1.000000
Error norm: 2061.068660
Active features: 8648
Line search trials: 1
Line search step: 0.000317
Seconds required for this iteration: 0.100

***** Iteration #2 *****
Loss: 11915.842670
Feature norm: 3.311380
Error norm: 1972.717443
Active features: 8394
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.050

***** Iteration #3 *****
Loss: 9874.478508
Feature norm: 5.262990
Error norm: 1720.714647
Active features: 8475
Line search trials: 1
Line search step: 1.000000
Seconds required for this ite

In [11]:
tagger = pycrfsuite.Tagger()
tagger.open('ocr.crfsuite')

<contextlib.closing at 0x7f55bd2349b0>

In [12]:
def calc_acc(data, y):
    tot=0.0
    cor=0.0
    for i,d in enumerate(data):
        prediction = tagger.tag(d)
        cor += np.sum(np.array(y)[i]==np.array(prediction))
        tot += len(y[i])
    return (cor/tot)

In [13]:
print("Train acc:", calc_acc(X_train, y_tr))
print("Test acc:", calc_acc(X_test, y_te))

Train acc: 0.999255813953
Test acc: 0.783975030464


In [14]:
1/27

0.037037037037037035

In [15]:
X_train[0][0].keys()

dict_keys(['letter', '+1:letter'])

In [16]:
y_tr

[['14', '12', '12', '0', '13', '3', '8', '13', '6'],
 ['14', '12', '12', '0', '13', '3', '8', '13', '6'],
 ['14', '12', '12', '0', '13', '3', '8', '13', '6'],
 ['14', '12', '12', '0', '13', '3', '8', '13', '6'],
 ['14', '12', '12', '0', '13', '3', '8', '13', '6'],
 ['14', '12', '12', '0', '13', '3', '8', '13', '6'],
 ['14', '12', '12', '0', '13', '3', '8', '13', '6'],
 ['14', '12', '12', '0', '13', '3', '8', '13', '6'],
 ['14', '12', '12', '0', '13', '3', '8', '13', '6'],
 ['14', '12', '12', '0', '13', '3', '8', '13', '6'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', '0', '2', '4', '18'],
 ['12', '1', '17', 