In [2]:
import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
%matplotlib inline

In [302]:
# Number of examples; presumably the row count to load from the OCR file -- TODO confirm.
N_EXAMPLES = 5000
# Number of feature values per example (the feature arrays shown below have a last axis of 128).
N_FEATURES = 128

In [3]:
def atoi(a):
    """Return the offset of character ``a`` from 'a' ('a' -> 0, 'b' -> 1, ...)."""
    base = ord('a')
    return ord(a) - base
def itoa(i):
    """Return the character ``i`` positions after 'a' (0 -> 'a', 1 -> 'b', ...)."""
    code_point = ord('a') + i
    return chr(code_point)

def iors(s):
    """Return ``s`` converted to int when possible, otherwise ``s`` unchanged.

    Only ``ValueError`` from ``int()`` is treated as "not an integer"; any
    other exception propagates to the caller.
    """
    try:
        as_int = int(s)
    except ValueError:
        # Not parseable as an integer: hand the original value back.
        return s
    return as_int

In [4]:
def read_OCR(filename, n_examples, n_features):
    """Read up to ``n_examples`` rows from a tab-separated OCR data file.

    Every non-blank line is split on tabs (empty fields are dropped) and its
    fields are consumed in this order: id, letter label, next id, word id,
    position, fold, then the feature values.

    Args:
        filename: Path of the file to read.
        n_examples: Maximum number of rows to load; also the length of every
            returned array.
        n_features: Number of feature values expected per row.

    Returns:
        dict of numpy arrays keyed by 'ids', 'labels', 'next_ids',
        'word_ids', 'positions', 'folds' (integer arrays of length
        ``n_examples``) and 'features' (float array of shape
        ``(n_examples, n_features)``). Rows that the file does not provide
        stay zero-filled.

    Raises:
        IndexError: If a row carries more than ``n_features`` feature values
            or fewer than the six leading fields.
    """
    dataset = {}
    for key in ('ids', 'labels', 'next_ids', 'word_ids', 'positions', 'folds'):
        dataset[key] = np.zeros(n_examples, dtype=int)
    dataset['features'] = np.zeros([n_examples, n_features])

    i = 0
    # The context manager closes the file even if a malformed row raises.
    with open(filename) as F:
        # Iterate lazily instead of materialising every line with readlines().
        for str_line in F:
            if i >= n_examples:
                break

            line0 = [iors(tok) for tok in str_line.strip().split('\t') if tok]
            if not line0:
                continue  # skip blank lines instead of failing on pop()

            dataset['ids'][i] = line0.pop(0)
            dataset['labels'][i] = atoi(line0.pop(0))
            dataset['next_ids'][i] = line0.pop(0)
            dataset['word_ids'][i] = line0.pop(0)
            dataset['positions'][i] = line0.pop(0)
            dataset['folds'][i] = line0.pop(0)

            # Sanity check against the requested width (was hard-coded to 128).
            if len(line0) != n_features:
                print(len(line0))

            for j, v in enumerate(line0):
                dataset['features'][i][j] = v
            i += 1

    return dataset

In [5]:
# read_OCR requires the example count and the feature width in addition to the path.
ocr = read_OCR('letter.data', N_EXAMPLES, N_FEATURES)

In [255]:
def chop_idxs(ocr, window = 2, start = 0, stop = None):
    """Group consecutive example indices into fixed-size chops.

    Indices ``start`` .. ``stop - 1`` are collected ``window`` at a time.
    A chop is closed early and right-padded with '_' when the current
    example's ``ocr['next_ids']`` entry is -1 (presumably the last letter of
    a word -- confirm against the data format) or when the last index before
    ``stop`` is reached.

    Args:
        ocr: Mapping providing at least 'ids' and 'next_ids' sequences.
        window: Number of slots per chop; must be at least 1.
        start: First example index to include.
        stop: One past the last example index; defaults to ``len(ocr['ids'])``.

    Returns:
        numpy array built from the list of chops. When any chop is padded,
        the integers and '_' markers are mixed and numpy stores every entry
        as a string; with no padding the array holds integers.

    Raises:
        ValueError: If ``window`` is smaller than 1 (the original loop never
            terminated in that case).
    """
    if window < 1:
        raise ValueError("window must be at least 1, got {!r}".format(window))
    if stop is None:
        stop = len(ocr['ids'])

    chops = []
    chop = []
    for i in range(start, stop):
        chop.append(i)
        # Close the chop at a next-id of -1 or at the end of the range.
        if ocr['next_ids'][i] == -1 or i == stop - 1:
            chop.extend(['_'] * (window - len(chop)))
        if len(chop) == window:
            chops.append(chop)
            chop = []
    return np.array(chops)

In [283]:
# Chop example indices 0..17 into windows of 4 slots.
chops = chop_idxs(ocr, window = 4, start = 0, stop = 18)

In [297]:
def chops_to_str(ocr, chops):
    """Translate chops of example indices into letters.

    Every slot that is not the '_' padding marker is looked up in
    ``ocr['labels']`` and converted with ``itoa``; padding slots stay '_'.
    Returns the result as a numpy array with one row per chop.
    """
    rows = []
    for chop in chops:
        letters = []
        for idx in chop:
            if idx != '_':
                letters.append(itoa(ocr['labels'][int(idx)]))
            else:
                letters.append('_')
        rows.append(letters)
    return np.array(rows)

In [298]:
# Show the letters of each chop; '_' marks a padding slot.
chops_to_str(ocr, chops)

array([['o', 'm', 'm', 'a'],
       ['n', 'd', 'i', 'n'],
       ['g', '_', '_', '_'],
       ['o', 'm', 'm', 'a'],
       ['n', 'd', 'i', 'n'],
       ['g', '_', '_', '_']],
      dtype='<U1')

In [295]:
def chops_to_labels(ocr, chops, pad_label=26):
    """Translate chops of example indices into integer labels.

    Args:
        ocr: Mapping providing a 'labels' sequence indexed by example index.
        chops: Iterable of chops; each slot is an example index or the '_'
            padding marker.
        pad_label: Label emitted for padding slots. Defaults to 26, the
            value that was previously hard-coded.

    Returns:
        numpy array with one row of labels per chop.
    """
    labels = ocr['labels']
    return np.array([
        [labels[int(idx)] if idx != '_' else pad_label for idx in chop]
        for chop in chops
    ])

In [296]:
# Show the integer labels of each chop; 26 marks a padding slot.
chops_to_labels(ocr, chops)

array([[14, 12, 12,  0],
       [13,  3,  8, 13],
       [ 6, 26, 26, 26],
       [14, 12, 12,  0],
       [13,  3,  8, 13],
       [ 6, 26, 26, 26]])

In [299]:
def chops_to_features(ocr, chops):
    """Translate chops of example indices into stacked feature vectors.

    Args:
        ocr: Mapping providing a 2-D 'features' array with one row per
            example.
        chops: Iterable of chops; each slot is an example index or the '_'
            padding marker.

    Returns:
        numpy array with one row per chop and one feature vector per slot;
        padding slots hold an all-zero vector.
    """
    features = ocr['features']
    # Size the padding from the data itself rather than from a notebook-level
    # constant, so the rows always stack even if the data was loaded with a
    # different feature width.
    padding = np.zeros(np.shape(features)[1])
    return np.array([
        [features[int(idx)] if idx != '_' else padding for idx in chop]
        for chop in chops
    ])

In [303]:
# Stack the feature vectors for every chop slot; padding slots become zero vectors.
features = chops_to_features(ocr, chops)

In [304]:
# Axes: (chops, slots per chop, feature values per slot).
features.shape

(6, 4, 128)