In [1]:
# collect training data
from utils import get_training_data
raw_training_data = get_training_data()
training_data = [map(int, data.split(',')) for id, data in raw_training_data]
training_data[0] # show example of first entry

[1,
 3,
 13,
 87,
 1053,
 28576,
 2141733,
 508147108,
 402135275365,
 1073376057490373,
 9700385489355970183L,
 298434346895322960005291L,
 31479360095907908092817694945L,
 11474377948948020660089085281068730L]

In [135]:
# flatten training data in preparation for vectorization
def flat(data_array):
    index = 1
    flat_array = []
    for data in data_array:
        flat_array.extend(data)
        flat_array.append(-index)
        index += 1
    return flat_array

In [136]:
# compress data to remove negatives
from utils import Compressor

values = set()
for data in training_data:
    values.update(data)
    
compressor = Compressor(values)

In [178]:
# get flat training data for suffix arrray
# note: sequence is reversed
flat_training_data = flat([map(compressor.compress, data[::-1]) for data in training_data])

In [185]:
%%time
# vectorize reversed data using suffix array
from progvar.strings.suffix import SuffixArray
suffix_array = SuffixArray(flat_training_data)

CPU times: user 1min 5s, sys: 176 ms, total: 1min 5s
Wall time: 1min 5s


In [186]:
# mark indices that are not last
mark = [0] * len(suffix_array)
for i in xrange(len(flat_training_data) - 1):
    if flat_training_data[i] >= 0 and flat_training_data[i + 1] >= 0:
        mark[suffix_array.position[i + 1]] = 1

# accumulate sum for range query
for i in xrange(1, len(mark)):
    mark[i] += mark[i - 1]
    
def range_sum_query(a, b):
    if b < a: return 0
    else: return mark[b] - (mark[a - 1] if a >= 0 else 0)

In [240]:
# define a radix search function to narrow the range of predictions
# returns a pair (L, R) indicating the the narrowest range of possible sequences
def radix_search(sequence):
    data = map(compressor.compress, sequence)[::-1] # reverse data
    # cut of None's
    for i in xrange(len(data)):
        if data[i] is None:
            data = data[:i]
            break
    if not data: return None
    L, R = 0, len(suffix_array) - 1
    for radix in xrange(len(data)):
        # find the lower bound => L
        l, r = L, R
        while l < r:
            m = ((l + r) >> 1)
            ch = suffix_array.text[suffix_array[m] + radix]
            if ch < data[radix]:
                l = m + 1
            else:
                r = m
        nL = l
        # find the upper bound => R
        l, r = nL, R
        while l < r:
            m = ((l + r) >> 1) + 1
            ch = suffix_array.text[suffix_array[m] + radix]
            if ch > data[radix]:
                r = m - 1
            else:
                l = m
        nR = l
        if range_sum_query(nL, nR) == 0:
            if radix == 0:
                return None # not in training data
            break
        L, R = nL, nR
    # prediction is between L and R
    predictions = []
    for i in xrange(L, R + 1):
        if range_sum_query(i, i):
            sai = suffix_array[i]
            predictions.append(suffix_array.text[sai - 1])
    return map(compressor.decompress, predictions)
    # return map(compressor.decompress, predictions)

In [255]:
# test: fibonacci
from utils import mode
mode(radix_search([8, 13, 21]))

[34]

In [256]:
# gather test data
from utils import get_test_data
raw_test_data = get_test_data()

In [259]:
test_data = [map(int, data.split(',')) for id, data in raw_test_data[1:]]
test_data[0]

[1,
 1,
 5,
 11,
 35,
 93,
 269,
 747,
 2115,
 5933,
 16717,
 47003,
 132291,
 372157,
 1047181,
 2946251,
 8289731,
 23323853,
 65624397,
 184640891,
 519507267,
 1461688413,
 4112616845,
 11571284395,
 32557042499,
 91602704493,
 257733967693]

In [295]:
# have a heuristic to recursively predict a sequence not in the training set
def delta(array):
    return [y - x for x, y in zip(array[:-1], array[1:])]

def recursive_predict(array):
    if len(array) == 0: return 0
    rs = radix_search(array)
    if rs is None: return array[-1] + recursive_predict(delta(array))
    return rs[0]

In [296]:
# try first data
print 'depth one:', radix_search(test_data[0])
print 'depth recursive:', recursive_predict(test_data[0])

depth one: None
depth recursive: 731271829377


In [303]:
%%time
# try 1000
results = []
for data in test_data[:1000]:
    results.append(recursive_predict(data))
print results[:10]

[731271829377, 32, 427, 13, 1440, 6340257784102197, 103044825589401482985824L, 137188181530537831056059441L, 40478370666294985929991L, 11531]
CPU times: user 1.06 s, sys: 4 ms, total: 1.07 s
Wall time: 1.07 s


In [300]:
%%time
# try 10000
results = []
for data in test_data[:10000]:
    results.append(recursive_predict(data))

CPU times: user 9.42 s, sys: 24 ms, total: 9.44 s
Wall time: 9.42 s


In [304]:
%%time
# try 1000 non-recursive
results = []
for data in test_data[:1000]:
    results.append(radix_search(data))
print results[:10]

[None, [32], [427], [13], [1440], None, None, None, None, [11531]]
CPU times: user 440 ms, sys: 0 ns, total: 440 ms
Wall time: 439 ms


In [306]:
%%time
# count the percentage of accuracy of radix search
ave = 0
for data in test_data:
    if radix_search(data):
        ave += 1
print 'Success:', 1.0 * ave / len(test_data)

Success: 0.584782685078
CPU times: user 1min 7s, sys: 124 ms, total: 1min 7s
Wall time: 1min 7s
