# Suffix Array Algorithms Analysis


This notebook benchmarks several algorithms that build the [suffix array](https://en.wikipedia.org/wiki/Suffix_array) data structure, listed below with their time complexities.

1. **Brute Force** - *O(n^2)*
2. **Radix Sort** - *O(n log^2 n)*
3. **Counting Sort** - *O(n log n)*
4. **DC3 Algorithm** - *O(kn)*

Conclusion summary:
- In the special case of a single-letter alphabet (letters == 1), Brute Force is the fastest algorithm at every text length tested.
- For (letters >= 2), Brute Force's average running time tends to decrease as the number of letters increases.
- Brute Force is **best** for small cases (length <= 100) in terms of average running time.
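- DC3 is **best** for medium cases (100 <= length <= 1000) with small alphabets (2 <= letters < 10).
- Brute Force is **best** for medium cases (100 <= length <= 1000) with larger alphabets (letters > 10); at letters == 10 the two are effectively tied (DC3 375 ms vs. Brute Force 376 ms).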
- DC3 is **best** for all large test cases (length > 1000) with (letters >= 2).
- Counting Sort has the same best case and worst case *O(n log n)* complexity.
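- Despite its worse asymptotic bound, Radix Sort is faster than Counting Sort in practice for larger alphabets (letters >= 10 at every length tested); Counting Sort is faster when letters == 1.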


In [2]:
from algorithm.strings.suffix import SuffixArray
from algorithm.html import Table
from time import time

In [15]:
# Testing
# Shared benchmark state, kept at module level so every cell sees it.
# statistics: one row (a dict) per algorithm per test run, appended by
#             perform_algorithm() and rendered as HTML at the end of the notebook.
# trial_text: the texts used by the current run, rebound by generate_text().
statistics = Table()
trial_text = list()

def generate_text(trials=10, text_length=1000, letters=26):
    print 'Trials:', trials
    print 'Text length:', text_length
    print 'Letters:', letters
    print ''
    from algorithm.strings.utils import random_text
    global trial_text
    trial_text = [random_text(length=text_length, letters=letters) for i in xrange(trials)]
    
# Per-run scratch state; test() resets both before each benchmark run.
# best:    (total elapsed seconds, algorithm name) pairs, sorted to rank algorithms.
# results: algorithm name -> list of SuffixArray results, one per trial text.
best, results = [], {}

def perform_algorithm(algorithm):
    """Time one algorithm over every text in the global ``trial_text``.

    For each text, the call ``SuffixArray(text, algo=algorithm)`` is timed
    and its result is stored. Three module-level containers are mutated:

    * ``results[algorithm]`` - replaced by the list of per-text results.
    * ``statistics`` - gains one dict with the keys ``'algorithm'``,
      ``'trials'``, ``'total time'``, ``'time per trial'``, ``'letters'``
      (distinct characters seen across all texts) and ``'length'``
      (mean text length, floored to an integer).
    * ``best`` - gains ``(total elapsed seconds, algorithm)``.

    With an empty ``trial_text`` the per-trial averages are reported as
    zero instead of raising ``ZeroDivisionError``.
    """
    # default_timer is the most precise wall-clock timer timeit knows for
    # the platform; time.time() can be too coarse for sub-millisecond runs.
    from timeit import default_timer

    built = []
    results[algorithm] = built

    time_elapsed = 0.0
    total_text_length = 0
    total_letters = set()

    for text in trial_text:
        # Only the constructor call is inside the timed window.
        time_start = default_timer()
        result = SuffixArray(text, algo=algorithm)
        time_elapsed += default_timer() - time_start

        # Text statistics are gathered outside the timed window.
        total_text_length += len(text)
        total_letters.update(text)

        built.append(result)

    trial_count = len(trial_text)
    data = {
        'algorithm': algorithm,
        'trials': trial_count,
        'total time': time_elapsed,
        'time per trial': time_elapsed / trial_count if trial_count else 0.0,
        'letters': len(total_letters),
        # Explicit floor division keeps the integer mean on any Python version.
        'length': total_text_length // trial_count if trial_count else 0,
    }

    statistics.append(data)
    best.append((time_elapsed, algorithm))

def test(trials=10, text_length=10000, letters=5, exclude=()):
    """Run one benchmark round and print the algorithms ranked by total time.

    Steps:
      1. ``generate_text`` refills the shared trial texts (and prints the
         run parameters).
      2. ``results`` and ``best`` are reset. ``statistics`` is NOT reset,
         so rows accumulate across calls.
      3. Every name in ``SuffixArray.algorithms`` that is not in ``exclude``
         is passed to ``perform_algorithm``.
      4. Each pair of algorithms has its result lists compared with ``!=``;
         a warning is printed for every pair that differs.
      5. One ``"<algorithm>: <total ms>"`` line is printed per algorithm,
         fastest first.

    Parameters:
      trials      -- number of texts to generate.
      text_length -- forwarded unchanged to ``generate_text``; the notebook
                     passes either an int or a ``(min, max)`` tuple.
      letters     -- forwarded unchanged to ``generate_text``.
      exclude     -- container of algorithm names to skip.
    """
    from itertools import combinations

    global results, best
    generate_text(trials=trials, text_length=text_length, letters=letters)
    results.clear()
    best = []

    for algorithm in SuffixArray.algorithms:
        if algorithm in exclude:
            continue
        perform_algorithm(algorithm)

    # combinations() yields each unordered pair once, in the same order as
    # a nested "i < j" index loop.
    for algo1, algo2 in combinations(list(results), 2):
        if results[algo1] != results[algo2]:
            print("WARNING! %s and %s have different suffix arrays" % (algo1, algo2))

    # Tuples sort by elapsed time first, so the fastest algorithm prints first.
    best.sort()
    print('\n'.join("%s: %.4f ms" % (algo, t * 1000.0) for t, algo in best))

## Small Test Cases (n <= 100)

In [16]:
test(trials=1000, text_length=(1, 100), letters=1)

Trials: 1000
Text length: (1, 100)
Letters: 1

brute: 74.9996 ms
default: 76.0000 ms
counting sort: 361.0003 ms
dc3: 457.0000 ms
radix sort: 489.9993 ms


In [17]:
test(trials=1000, text_length=(1, 100), letters=2)

Trials: 1000
Text length: (1, 100)
Letters: 2

brute: 293.9999 ms
default: 295.0003 ms
counting sort: 369.9999 ms
dc3: 389.9999 ms
radix sort: 515.0001 ms


In [18]:
test(trials=1000, text_length=(1, 100), letters=5)

Trials: 1000
Text length: (1, 100)
Letters: 5

brute: 216.0001 ms
default: 216.0003 ms
radix sort: 360.0001 ms
counting sort: 360.9998 ms
dc3: 374.9998 ms


In [19]:
test(trials=1000, text_length=(1, 100), letters=10)

Trials: 1000
Text length: (1, 100)
Letters: 10

brute: 192.9998 ms
default: 207.0003 ms
radix sort: 305.0001 ms
dc3: 315.0003 ms
counting sort: 350.0001 ms


In [20]:
test(trials=1000, text_length=(1, 100), letters=26)

Trials: 1000
Text length: (1, 100)
Letters: 26

brute: 184.0000 ms
default: 197.9997 ms
radix sort: 255.0004 ms
dc3: 255.9996 ms
counting sort: 341.9998 ms


In [21]:
test(trials=1000, text_length=(1, 100), letters=50)

Trials: 1000
Text length: (1, 100)
Letters: 50

brute: 175.0002 ms
default: 185.0004 ms
radix sort: 219.9998 ms
dc3: 258.9996 ms
counting sort: 341.9998 ms


## Medium Test Cases (100 <= n <= 1000)

In [22]:
test(trials=100, text_length=(100, 1000), letters=1)

Trials: 100
Text length: (100, 1000)
Letters: 1

brute: 64.9998 ms
default: 69.0000 ms
dc3: 485.9998 ms
counting sort: 550.9999 ms
radix sort: 653.9998 ms


In [23]:
test(trials=100, text_length=(100, 1000), letters=2)

Trials: 100
Text length: (100, 1000)
Letters: 2

dc3: 395.9999 ms
default: 396.0001 ms
counting sort: 507.9999 ms
brute: 573.0002 ms
radix sort: 701.9997 ms


In [24]:
test(trials=100, text_length=(100, 1000), letters=5)

Trials: 100
Text length: (100, 1000)
Letters: 5

dc3: 358.0003 ms
default: 378.0003 ms
brute: 418.0000 ms
counting sort: 541.9998 ms
radix sort: 623.9998 ms


In [25]:
test(trials=100, text_length=(100, 1000), letters=10)

Trials: 100
Text length: (100, 1000)
Letters: 10

default: 363.0004 ms
dc3: 375.0000 ms
brute: 376.0002 ms
radix sort: 530.9999 ms
counting sort: 534.9996 ms


In [26]:
test(trials=100, text_length=(100, 1000), letters=26)

Trials: 100
Text length: (100, 1000)
Letters: 26

brute: 288.0001 ms
default: 291.9998 ms
dc3: 338.0003 ms
radix sort: 398.0000 ms
counting sort: 472.9998 ms


In [27]:
test(trials=100, text_length=(100, 1000), letters=50)

Trials: 100
Text length: (100, 1000)
Letters: 50

brute: 295.9998 ms
default: 305.0003 ms
dc3: 306.0002 ms
radix sort: 379.0002 ms
counting sort: 515.0001 ms


In [28]:
## Large Test Cases (1000 <= n <= 10000)

In [29]:
test(trials=10, text_length=(1000, 10000), letters=1)

Trials: 10
Text length: (1000, 10000)
Letters: 1

brute: 68.0001 ms
default: 70.0002 ms
dc3: 534.0002 ms
counting sort: 750.0000 ms
radix sort: 966.0001 ms


In [30]:
test(trials=10, text_length=(1000, 10000), letters=2)

Trials: 10
Text length: (1000, 10000)
Letters: 2

dc3: 578.9998 ms
default: 579.0002 ms
counting sort: 1031.0001 ms
radix sort: 1295.0003 ms
brute: 1561.9998 ms


In [31]:
test(trials=10, text_length=(1000, 10000), letters=5)

Trials: 10
Text length: (1000, 10000)
Letters: 5

default: 419.9998 ms
dc3: 436.9998 ms
brute: 671.0000 ms
counting sort: 779.0000 ms
radix sort: 825.0003 ms


In [32]:
test(trials=10, text_length=(1000, 10000), letters=10)

Trials: 10
Text length: (1000, 10000)
Letters: 10

dc3: 368.9997 ms
default: 376.9999 ms
brute: 540.0000 ms
radix sort: 756.0000 ms
counting sort: 806.9999 ms


In [33]:
test(trials=10, text_length=(1000, 10000), letters=26)

Trials: 10
Text length: (1000, 10000)
Letters: 26

dc3: 302.9997 ms
default: 313.0000 ms
brute: 368.0000 ms
radix sort: 488.0002 ms
counting sort: 632.0002 ms


In [34]:
test(trials=10, text_length=(1000, 10000), letters=50)

Trials: 10
Text length: (1000, 10000)
Letters: 50

default: 211.0002 ms
dc3: 217.0000 ms
brute: 227.0000 ms
radix sort: 303.0000 ms
counting sort: 418.0000 ms


## Extra Large Test Case (n = 100000)

In [35]:
test(trials=3, text_length=100000, letters=1)

Trials: 3
Text length: 100000
Letters: 1

brute: 369.0002 ms
default: 372.9999 ms
dc3: 2907.0001 ms
counting sort: 5013.9999 ms
radix sort: 7915.9999 ms


In [36]:
test(trials=3, text_length=100000, letters=2)

Trials: 3
Text length: 100000
Letters: 2

default: 3063.9999 ms
dc3: 3076.0000 ms
radix sort: 6363.0002 ms
counting sort: 9188.0000 ms
brute: 9605.9999 ms


In [37]:
test(trials=3, text_length=100000, letters=5)

Trials: 3
Text length: 100000
Letters: 5

dc3: 3154.0000 ms
default: 3178.0000 ms
brute: 5453.0001 ms
radix sort: 5634.0001 ms
counting sort: 9700.0000 ms


In [38]:
test(trials=3, text_length=100000, letters=10)

Trials: 3
Text length: 100000
Letters: 10

dc3: 3082.0003 ms
default: 3102.9999 ms
brute: 4460.9997 ms
radix sort: 5652.9999 ms
counting sort: 10471.0000 ms


In [39]:
test(trials=3, text_length=100000, letters=26)

Trials: 3
Text length: 100000
Letters: 26

dc3: 2614.0001 ms
default: 2908.0002 ms
brute: 3822.0000 ms
radix sort: 4804.9998 ms
counting sort: 9937.0003 ms


In [40]:
test(trials=3, text_length=100000, letters=50)

Trials: 3
Text length: 100000
Letters: 50

dc3: 2883.0001 ms
default: 2887.9998 ms
brute: 3555.0001 ms
radix sort: 4475.0001 ms
counting sort: 9759.0001 ms


In [41]:
from IPython.display import HTML, display

# Order the accumulated rows for the report: runs with the most trials first,
# then by ascending letter count, then fastest total time first.
statistics.sort(
    key=lambda row: (-row['trials'], row['letters'], row['total time'])
)
display(HTML(statistics.to_html()))

0,1,2,3,4,5
letters,algorithm,total time,trials,time per trial,length
1,brute,0.0749995708466,1000,7.49995708466e-05,49
1,default,0.0759999752045,1000,7.59999752045e-05,49
1,counting sort,0.361000299454,1000,0.000361000299454,49
1,dc3,0.457000017166,1000,0.000457000017166,49
1,radix sort,0.489999294281,1000,0.000489999294281,49
2,brute,0.293999910355,1000,0.000293999910355,50
2,default,0.295000314713,1000,0.000295000314713,50
2,counting sort,0.369999885559,1000,0.000369999885559,50
2,dc3,0.389999866486,1000,0.000389999866486,50
