In [None]:
from sklearn import svm
import numpy as np
import matplotlib.pyplot as plt
import sys
import re

In [None]:
def l2i(a):
    """Return the zero-based alphabet index of a single character ('a' -> 0, 'z' -> 25)."""
    base = ord('a')
    return ord(a) - base
def i2l(i):
    """Inverse of l2i: turn a non-negative index into a letter (0 -> 'a').

    Any negative index is rendered as the placeholder '_'.
    """
    return chr(i + ord('a')) if i >= 0 else '_'
def iors(s):
    """'Int or string': return int(s) when s parses as an integer, else s unchanged."""
    try:
        parsed = int(s)
    except ValueError:
        # Not an integer literal: hand the original value back untouched.
        parsed = s
    return parsed

In [None]:
# Read the entire dataset into lists or list of lists
def read_OCR(filename, n_features):
    """Read a tab-separated OCR letter file into a dict of parallel lists.

    Every non-empty line is expected to hold, in this order: id, label letter,
    next_id, word_id, position, fold, followed by the feature values. Empty
    fields (consecutive tabs) are dropped before parsing; blank lines are
    skipped.

    Args:
        filename: Path of the tab-separated data file.
        n_features: Expected number of feature values per line. A line with a
            different count is still stored, but its feature count is printed
            so the mismatch is visible.

    Returns:
        dict with the keys 'ids', 'labels', 'next_ids', 'word_ids',
        'positions', 'folds' (lists with one int per line), 'features' (one
        list of values per line) and 'labelDic' (integer label -> number of
        occurrences).
    """
    dataset = {
        'ids': [],
        'labels': [],
        'labelDic': {},  # To profile the distribution of labels
        'next_ids': [],
        'word_ids': [],
        'positions': [],
        'folds': [],
        'features': [],
    }
    label_counts = dataset['labelDic']

    # The context manager guarantees the file is closed, even if a line fails to parse.
    with open(filename) as F:
        for str_line in F:
            # Drop empty fields first, then convert integer-looking fields to int.
            line0 = [iors(field) for field in str_line.strip().split('\t') if field]
            if not line0:
                continue  # blank line (e.g. a trailing newline at the end of the file)

            dataset['ids'].append(int(line0.pop(0)))
            # The label is converted into an integer ('a' => 0, 'z' => 25)
            label = l2i(line0.pop(0))
            dataset['labels'].append(label)
            label_counts[label] = label_counts.get(label, 0) + 1

            dataset['next_ids'].append(int(line0.pop(0)))
            dataset['word_ids'].append(int(line0.pop(0)))
            dataset['positions'].append(int(line0.pop(0)))
            dataset['folds'].append(int(line0.pop(0)))
            if len(line0) != n_features:  # Sanity check of the length
                print(len(line0))
            # Whatever is left after the six metadata fields is the feature vector.
            dataset['features'].append(line0)

    return dataset

In [None]:
class AutoContext(object):
    """Auto-context letter classifier built from a cascade of linear SVMs.

    The input matrix holds one letter per row. Columns 0-5 are metadata
    (column 1 is the integer label, column 2 is the next-letter id with -1
    marking the last letter of a word, column 5 is the fold); every remaining
    column is a feature. Rows are grouped into words. Each training iteration
    fits a LinearSVC on the raw features concatenated with the previous
    iteration's class confidences of the neighbouring letters of the same word.

    Attributes:
        train_data: list of per-word float arrays (features, label in the last
            column) used for fitting. It is deliberately NOT called ``train``:
            an instance attribute of that name would shadow the train() method
            and make ``obj.train()`` fail with "'list' object is not callable".
        test: list of per-word arrays used for evaluation (same layout).
        N: number of rows in the input matrix.
        dtr: number of feature columns.
        Ntr, Ntst: number of letters in the training / test words.
        models: list of (fitted LinearSVC, training matrix) tuples, one per
            iteration of the most recent train() call.
        test_labels: labels of the test letters, filled in by svm_predict().
    """

    def __init__(self, unstructured_data, n_classes, n_iter, w_size, split):
        """Store the hyper-parameters and split the data into words.

        Args:
            unstructured_data: 2-D array, one letter per row (layout above).
            n_classes: number of label classes.
            n_iter: number of auto-context iterations run by train().
            w_size: number of neighbours used on each side as context.
            split: 1000, 2500 or 4000 selects a predefined fold assignment;
                any other value keeps the defaults of prep_ocr_data().
        """
        self.Nu, self.du = unstructured_data.shape
        self.unstructured_data = unstructured_data
        self.n_classes = n_classes
        self.num_iterations = n_iter
        self.window_size = w_size
        self.forest = None
        self.models = []
        self.split = split

        self._load_splits()
        self.test_labels = np.zeros((self.Ntst))

    def _load_splits(self):
        """Run prep_ocr_data() and store its results on the instance."""
        (self.train_data, self.test, self.N,
         self.dtr, self.Ntr, self.Ntst) = self.prep_ocr_data()

    def prep_ocr_data(self, fold=(0, 1), test_fold=9):
        """Group the letter rows into words and assign them to train / test.

        A word ends at the row whose column 2 equals -1; the fold stored in
        that row decides where the whole word goes. Words whose fold is in
        neither list are dropped, as are trailing rows that are not closed by
        an end-of-word marker.

        Args:
            fold: fold id or iterable of fold ids used for training.
            test_fold: fold id or iterable of fold ids used for testing.
                Both arguments are overridden when self.split is 1000, 2500
                or 4000.

        Returns:
            (train, test, Nt, dt, Ntr, Ntst): the two lists of per-word float
            arrays (features followed by the label), the number of input rows,
            the number of feature columns, and the number of letters in the
            training and in the test words.
        """
        if isinstance(fold, (int, np.integer)):
            fold = [fold]
        if isinstance(test_fold, (int, np.integer)):
            test_fold = [test_fold]

        if self.split == 1000:
            fold = [0, 5]
            test_fold = [2, 3, 4, 6, 7, 8]
        elif self.split == 2500:
            fold = [0, 1, 2, 3]
            test_fold = [4, 5, 8, 9]
        elif self.split == 4000:
            fold = [0, 1, 2, 3, 4, 5]
            test_fold = [6]

        data = self.unstructured_data
        Nt, du = data.shape
        dt = du - 6  # everything after the six metadata columns is a feature

        # Build the (features, label) rows once and slice words out of them,
        # instead of growing a temporary array with np.vstack for every letter.
        samples = np.hstack((data[:, 6:], data[:, 1:2])).astype(float)

        train = []
        test = []
        Ntr = 0
        Ntst = 0
        word_start = 0
        for i in range(Nt):
            if data[i, 2] != -1:
                continue  # still inside the current word
            word = samples[word_start:i + 1].copy()
            word_fold = data[i, 5]
            if word_fold in fold:
                train.append(word)
                Ntr += word.shape[0]
            elif word_fold in test_fold:
                test.append(word)
                Ntst += word.shape[0]
            word_start = i + 1

        return train, test, Nt, dt, Ntr, Ntst

    def train(self):
        """Fit one LinearSVC per iteration on features plus neighbour context.

        The context of the first iteration is all zeros; afterwards it is the
        confidence matrix produced by the previous model on the training set.
        Any models from an earlier call are discarded, because the context is
        restarted from zero and old models would no longer match it.

        Returns:
            (accuracy1, accuracy2, confidence): two lists with one training
            accuracy per iteration, as returned by svm_inference(), and the
            confidence matrix produced by the last model.
        """
        if getattr(self, 'train_data', None) is None:
            self._load_splits()

        self.models = []
        confidence = np.zeros((self.Ntr, self.n_classes))
        accuracy1 = []
        accuracy2 = []
        n_columns = self.dtr + self.n_classes * self.window_size * 2

        for i in range(self.num_iterations):
            print('Iteration number ' + str(i+1) + ' out of ' + str(self.num_iterations))
            W = np.zeros((self.Ntr, n_columns))  # design matrix: X + neighbour confidences
            Y = np.zeros(self.Ntr)               # labels, in the same row order as W

            curr_line = 0
            for word in self.train_data:
                word_len = word.shape[0]
                rows = slice(curr_line, curr_line + word_len)

                W[rows, :self.dtr] = word[:, :self.dtr]
                W[rows, self.dtr:] = self.extend_context(confidence[rows, :])
                Y[rows] = word[:, -1]
                curr_line += word_len

            # Build model
            svm_class = svm.LinearSVC(multi_class='crammer_singer', random_state=42)
            svm_class.fit(W, Y)
            self.models.append((svm_class, W))

            # Re-score the training set: this yields the accuracies of this
            # iteration and the context for the next one.
            acc1, acc2, confidence = self.svm_inference(self.train_data, confidence, svm_class)
            accuracy1.append(acc1)
            accuracy2.append(acc2)

        return accuracy1, accuracy2, confidence

    def svm_inference(self, data, confidence, svm, norm=True, in_test=False):
        """Score every word with one fitted model and compute new confidences.

        Args:
            data: list of per-word arrays (features, label in the last column).
            confidence: confidence matrix of the previous iteration, one row
                per letter in the order of ``data``.
            svm: fitted classifier offering decision_function() and score().
                The name shadows the imported ``svm`` module inside this
                method; it is kept for backward compatibility.
            norm: when True, squash the decision values with a sigmoid.
            in_test: when True, copy the labels into self.test_labels.

        Returns:
            (acc1, acc2, conf_new): acc1 weights each word's score by its
            length (i.e. the fraction of correctly classified letters), acc2
            is the unweighted mean of the per-word scores, conf_new is the new
            confidence matrix with the shape of ``confidence``.
        """
        n_columns = self.dtr + self.n_classes * self.window_size * 2
        conf_new = np.zeros(confidence.shape)
        acc1 = 0
        acc2 = 0
        total1 = 0
        total2 = 0

        cur_line = 0
        for word in data:
            word_len = word.shape[0]
            rows = slice(cur_line, cur_line + word_len)
            Y = word[:, -1]

            if in_test:
                self.test_labels[rows] = Y

            W_prime = np.zeros((word_len, n_columns))
            W_prime[:, :self.dtr] = word[:, :self.dtr]
            W_prime[:, self.dtr:] = self.extend_context(confidence[rows, :])

            conf = svm.decision_function(W_prime)   # Confidence measures of predictions
            if norm:
                conf = (1 + np.exp(-1*conf))**-1    # Sigmoid function --> Normalization

            conf_new[rows, :] = conf
            cur_line += word_len

            # Calculate accuracy
            total1 += word_len
            total2 += 1
            subtask_acc = svm.score(W_prime, Y)
            acc2 += subtask_acc
            acc1 += subtask_acc * word_len

        return acc1/total1, acc2/total2, conf_new

    def svm_predict(self, test_data=None):
        """Run the trained cascade of models over the test words.

        Args:
            test_data: list of per-word arrays; defaults to self.test.

        Returns:
            (accuracy1, accuracy2, confidence): one accuracy pair per model,
            as returned by svm_inference(), and the final confidence matrix.
        """
        if test_data is None:
            test_data = self.test

        # Size the buffers from the data actually scored, so a custom
        # test_data with a different number of letters also works.
        n_rows = sum(word.shape[0] for word in test_data)
        if self.test_labels.shape[0] != n_rows:
            self.test_labels = np.zeros(n_rows)
        confidence = np.zeros((n_rows, self.n_classes))

        accuracy1 = []
        accuracy2 = []
        for curr_model, _ in self.models:
            acc1, acc2, confidence = self.svm_inference(test_data, confidence, curr_model, True, True)
            accuracy1.append(acc1)
            accuracy2.append(acc2)

        return accuracy1, accuracy2, confidence

    def extend_context(self, conf, window_size=None, n_classes=None):
        """Build the neighbour-context features for the letters of one word.

        For letter i the result row holds the confidences of the letters at
        offsets -window_size..-1 followed by those at +1..+window_size; the
        letter's own confidences are not included. Neighbours that fall
        outside the word are left as zeros.

        Args:
            conf: (word_len, >= n_classes) confidence matrix of one word.
            window_size: neighbours per side; defaults to self.window_size.
            n_classes: confidences per neighbour; defaults to self.n_classes.

        Returns:
            Array of shape (word_len, 2 * window_size * n_classes).
        """
        if window_size is None:
            window_size = self.window_size
        if n_classes is None:
            n_classes = self.n_classes

        word_len = conf.shape[0]
        W = np.zeros((word_len, 2*window_size*n_classes))
        # The range is inclusive of +window_size: stopping one short would
        # leave the last context slot permanently empty.
        for w in range(-window_size, window_size + 1):
            if w == 0 or abs(w) >= word_len:
                continue  # own position, or no letter has a neighbour that far away
            slot = window_size + w if w < 0 else window_size + w - 1
            cols = slice(slot*n_classes, (slot + 1)*n_classes)
            if w < 0:
                # Letters -w .. end have a neighbour |w| positions to the left.
                W[-w:, cols] = conf[:word_len + w, :n_classes]
            else:
                # Letters 0 .. word_len-w-1 have a neighbour w positions to the right.
                W[:word_len - w, cols] = conf[w:, :n_classes]

        return W

In [None]:
# Parse the raw OCR file, then flatten it into a single 2-D array with one row per letter.
dataset1 = read_OCR('../letter.data', 128)

# Column order: id, label, next_id, word_id, position, fold, then the feature values.
letter_data = np.hstack(
    [np.array(dataset1[key]).reshape(-1, 1)
     for key in ('ids', 'labels', 'next_ids', 'word_ids', 'positions', 'folds')]
    + [np.array(dataset1['features'])]
)

In [None]:
# Sanity check: one row per line of the data file, six metadata columns plus the feature columns.
letter_data.shape

In [None]:
# 1000/4000 Split
# Sweep the context window size with a fixed number of auto-context iterations
# and record the test accuracies of every iteration.

j = 4  # j: number of iterations
# Row 0 is an all-zero placeholder so np.vstack has something to stack onto;
# it is sliced off again when the results are written to disk.
test_accuracies1 = np.zeros((1, j))
test_accuracies2 = np.zeros((1, j))

for i in (1, 2, 3):  # i: window size

    print('Creating AutoContext object, prepping OCR dataset')
    ac = AutoContext(letter_data, n_classes=26, n_iter=j, w_size=i, split=1000)
    # print(ac.train[1].shape)  # sanity check
    # print(ac.Ntr, ac.dtr)

    print('Training Strategy 2: SVM-based Auto Context')
    tr_accuracy1, tr_accuracy2, conf = ac.train()
    # NOTE(review): svm_inference returns the length-weighted (per-letter)
    # accuracy first and the mean of the per-word scores second -- confirm
    # that the order named in the labels below is the intended one.
    print('Training accuracy (by word and letter):', tr_accuracy1, tr_accuracy2, sep='\n')

    print('Testing Strategy 2')
    ts_accuracy1, ts_accuracy2, conf = ac.svm_predict()
    print('Testing accuracy (by word and by letter):', ts_accuracy1, ts_accuracy2, sep='\n')
    test_accuracies1 = np.vstack([test_accuracies1, ts_accuracy1])
    test_accuracies2 = np.vstack([test_accuracies2, ts_accuracy2])

print(test_accuracies1, test_accuracies2, sep='\n')

# One row per window size, one column per iteration (placeholder row dropped).
np.savetxt('accuracies1', test_accuracies1[1:], fmt='%.5f')
np.savetxt('accuracies2', test_accuracies2[1:], fmt='%.5f')

In [None]:
# 2500/2500 Split
# Sweep the context window size with a fixed number of auto-context iterations
# and record the training and test accuracies of every iteration.

j = 4  # j: number of iterations
# Row 0 of each array is an all-zero placeholder so np.vstack has something to
# stack onto; it is sliced off again when the results are written to disk.
train_accuracies = np.zeros((1, j))
test_accuracies1 = np.zeros((1, j))
test_accuracies2 = np.zeros((1, j))

for i in (1, 2, 3):  # i: window size

    print('Creating AutoContext object, prepping OCR dataset')
    ac = AutoContext(letter_data, n_classes=26, n_iter=j, w_size=i, split=2500)
    # print(ac.train[1].shape)  # sanity check
    # print(ac.Ntr, ac.dtr)

    print('Training Strategy 2: SVM-based Auto Context')
    tr_accuracy1, tr_accuracy2, conf = ac.train()
    # NOTE(review): svm_inference returns the length-weighted (per-letter)
    # accuracy first and the mean of the per-word scores second -- confirm
    # that the order named in the labels below, and the choice of
    # tr_accuracy2 as "token accuracy", are the intended ones.
    print('Training accuracy (by word and letter):', tr_accuracy1, tr_accuracy2, sep='\n')
    train_accuracies = np.vstack([train_accuracies, tr_accuracy2])  # token accuracy (per the author)

    print('Testing Strategy 2')
    ts_accuracy1, ts_accuracy2, conf = ac.svm_predict()
    print('Testing accuracy (by word and by letter):', ts_accuracy1, ts_accuracy2, sep='\n')
    test_accuracies1 = np.vstack([test_accuracies1, ts_accuracy1])
    test_accuracies2 = np.vstack([test_accuracies2, ts_accuracy2])

print(test_accuracies1, test_accuracies2, sep='\n')
#
# One row per window size, one column per iteration (placeholder row dropped).
np.savetxt('accuracies1_55', test_accuracies1[1:], fmt='%.5f')
np.savetxt('accuracies2_55', test_accuracies2[1:], fmt='%.5f')
np.savetxt('tr_accuracies_55', train_accuracies[1:], fmt='%.5f')

In [None]:
# 4000/1000 Split
# Sweep the context window size with a fixed number of auto-context iterations
# and record the training and test accuracies of every iteration.

j = 4  # j: number of iterations
# Row 0 of each array is an all-zero placeholder so np.vstack has something to
# stack onto; it is sliced off again when the results are written to disk.
train_accuracies = np.zeros((1, j))
test_accuracies1 = np.zeros((1, j))
test_accuracies2 = np.zeros((1, j))

for i in (1, 2, 3):  # i: window size

    print('Creating AutoContext object, prepping OCR dataset')
    ac = AutoContext(letter_data, n_classes=26, n_iter=j, w_size=i, split=4000)
    # print(ac.train[1].shape)  # sanity check
    # print(ac.Ntr, ac.dtr)

    print('Training Strategy 2: SVM-based Auto Context')
    tr_accuracy1, tr_accuracy2, conf = ac.train()
    # NOTE(review): svm_inference returns the length-weighted (per-letter)
    # accuracy first and the mean of the per-word scores second -- confirm
    # that the order named in the labels below, and the choice of
    # tr_accuracy2 as "token accuracy", are the intended ones.
    print('Training accuracy (by word and letter):', tr_accuracy1, tr_accuracy2, sep='\n')
    train_accuracies = np.vstack([train_accuracies, tr_accuracy2])  # token accuracy (per the author)

    print('Testing Strategy 2')
    ts_accuracy1, ts_accuracy2, conf = ac.svm_predict()
    print('Testing accuracy (by word and by letter):', ts_accuracy1, ts_accuracy2, sep='\n')
    test_accuracies1 = np.vstack([test_accuracies1, ts_accuracy1])
    test_accuracies2 = np.vstack([test_accuracies2, ts_accuracy2])

print(test_accuracies1, test_accuracies2, sep='\n')
#
# One row per window size, one column per iteration (placeholder row dropped).
np.savetxt('accuracies1_82', test_accuracies1[1:], fmt='%.5f')
np.savetxt('accuracies2_82', test_accuracies2[1:], fmt='%.5f')
np.savetxt('tr_accuracies_82', train_accuracies[1:], fmt='%.5f')