## Custom Collapsing

In [1]:
import os
import sys
import numpy as np
import pandas as pd

from src import pdf
from collections import Counter

In [2]:
def print_counts(samples):
    """
    Print one "value => count" line for each distinct value in `samples`,
    with the values listed in ascending sorted order.
    """
    tally = Counter(samples)
    for value in sorted(tally):
        print('{0} => {1}'.format(value, tally[value]))

In [3]:
def collapse_unordered(samples, other_key, rebin_threshold=100):
    """
    Merge one under-populated bin of unordered samples into a catch-all bin.

    The samples are counted per distinct value. The first value (in order of
    first appearance) whose count is below `rebin_threshold` is relabelled as
    `other_key`. At most one bin is merged per call, so callers loop until the
    returned flag is False to collapse every small bin.

    `other_key` itself is never chosen as the bin to move: relabelling it to
    itself would change nothing yet report a change, which would make such a
    caller loop forever.

    Args:
        samples: sequence of hashable sample values (it is iterated twice).
        other_key: value that absorbs small bins; must already occur in
            `samples`, otherwise a message is printed and nothing is changed.
        rebin_threshold: bins with fewer samples than this are merged.

    Returns:
        A `(samples, changed)` tuple. When nothing was merged this is the
        original `samples` object and False; otherwise a new list with the
        small bin relabelled, and True.
    """

    # bin the data and get the counts for each value
    ctr = Counter(samples)

    if other_key not in ctr:
        print('collapse_unordered: other_key "{0}" not found in sample array'.format(other_key))
        return samples, False

    # Private sentinel so that a sample value of None can still be selected.
    not_found = object()

    # first bin (other than the catch-all itself) with a count below the threshold
    move_key = next(
        (k for k, v in ctr.items() if k != other_key and v < rebin_threshold),
        not_found,
    )

    if move_key is not_found:
        # nothing to do
        return samples, False

    # move the below-threshold key's counts to the other_key
    print('\tcollapse_unordered: change key {0} to key {1}'.format(move_key, other_key))
    new_samples = [other_key if s == move_key else s for s in samples]
    return new_samples, True

In [4]:
def collapse_ordered(samples, rebin_threshold=100):
    """
    Merge the least-populated bin of ordered samples into an adjacent bin.

    The samples are counted per distinct value and the values are sorted. If
    the smallest count is below `rebin_threshold`, that bin (the first one in
    sorted order on ties) is relabelled as one of its neighbours in sorted
    order: the neighbour with the smaller count, preferring the left one when
    both counts are equal, or the only neighbour for a bin at either end.
    At most one bin is merged per call.

    Args:
        samples: sequence of hashable, mutually orderable sample values
            (it is iterated twice).
        rebin_threshold: the smallest bin is merged only if its count is
            below this value.

    Returns:
        A `(samples, changed)` tuple. When nothing was merged (empty input,
        every bin at or above the threshold, or only a single bin) this is the
        original `samples` object and False; otherwise a new list with the
        smallest bin relabelled, and True.
    """

    # bin the data and get the counts for each value
    ctr = Counter(samples)
    if not ctr:
        # empty input: there is no bin to inspect
        return samples, False

    # sort the keys in order and get counts in the same order
    sorted_keys = sorted(ctr)
    counts = [ctr[k] for k in sorted_keys]

    # find the index of the (first) bin with the smallest count
    min_count = min(counts)
    if min_count >= rebin_threshold:
        # no need to rebin
        return samples, False
    old_index = counts.index(min_count)

    # collapse the bin with the minimum count into an adjacent bin
    left_index = old_index - 1
    right_index = old_index + 1
    has_left = left_index >= 0
    has_right = right_index < len(counts)

    if has_left and has_right:
        # merge into the less populated neighbour; ties go left
        if counts[left_index] > counts[right_index]:
            new_index = right_index
        else:
            new_index = left_index
    elif has_right:
        new_index = right_index
    elif has_left:
        new_index = left_index
    else:
        # a single bin has no neighbour to merge into
        return samples, False

    old_key = sorted_keys[old_index]
    new_key = sorted_keys[new_index]
    print('\tcollapse_ordered: change key {0} to key {1}'.format(old_key, new_key))

    new_samples = [new_key if s == old_key else s for s in samples]
    return new_samples, True

In [5]:
# Demo input: five distinct values; 3 is the rarest, followed by 1.
samples = [5,5,5,5,5,2,2,2,2,2,3,4,4,4,4,4,1,3,1,1,1]
# Alternative input with the same shape, shifted to the values 5..9:
#samples = [9,9,9,9,9,8,8,8,8,8,7,6,6,6,6,6,5,6,5,5,5]
sample_count = len(samples)

ctr = Counter(samples)

# ---- ordered collapsing: one pass, merging the smallest bin into a neighbour ----
print('Before (ordered): ')
print_counts(samples)
print()

ordered_result, changed = collapse_ordered(samples, rebin_threshold=3)
# collapsing only relabels samples, so the total count must be preserved
assert len(ordered_result) == sample_count

print('After (ordered): ')
print_counts(ordered_result)
print()


# ---- unordered collapsing: repeat until no bin is merged into key 4 any more ----
print('Before (unordered): ')
print_counts(samples)
print()

# Seed the loop so the first pass always runs, then keep going while passes merge bins.
unordered_result, changed = samples, True
while changed:
    unordered_result, changed = collapse_unordered(unordered_result, other_key=4, rebin_threshold=5)
    assert len(unordered_result) == sample_count

print('After (unordered): ')
print_counts(unordered_result)
print()

Before (ordered): 
1 => 4
2 => 5
3 => 2
4 => 5
5 => 5

	collapse_ordered: change key 3 to key 2
After (ordered): 
1 => 4
2 => 7
4 => 5
5 => 5

Before (unordered): 
1 => 4
2 => 5
3 => 2
4 => 5
5 => 5

	collapse_unordered: change key 3 to key 4
	collapse_unordered: change key 1 to key 4
After (unordered): 
2 => 5
4 => 11
5 => 5

