In [10]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler 
from math import ceil, pi
from itertools import combinations
from scipy.spatial import ConvexHull

In [4]:
raw_data = pd.read_csv('club_zips.csv').sort_values('club_no')
raw_data.head()

Unnamed: 0,club_no,zip,lat,long
135,437,75237,32.6623,-96.8742
64,713,75206,32.8335,-96.7717
5,718,71101,32.5072,-93.7444
175,967,75024-3607,33.0838,-96.8128
146,1165,75201-6318,32.7877,-96.7999


In [5]:
with open('../data/loc.pickle', 'rb') as f:  
    loc = pickle.load(f)

In [7]:
loc[:5]

[array([-96.8742,  32.6623], dtype=float32),
 array([-96.7717,  32.8335], dtype=float32),
 array([-93.7444,  32.5072], dtype=float32),
 array([-96.8128,  33.0838], dtype=float32),
 array([-96.7999,  32.7877], dtype=float32)]

In [17]:
# Create district plus areas problem variable
problem = len(loc) + ceil(len(loc) / 5) - 1
clubs_with_areas = np.random.randint(problem, size=problem)

In [18]:
clubs_with_areas[:10]

array([110, 246,  18,   7, 229,  16,  16, 232,  57, 231])

In [19]:
clubs_without_areas = np.random.randint(len(loc), size=len(loc))

In [20]:
clubs_without_areas[:10]

array([115, 131, 178, 204, 111,  54, 117,  51, 165, 135])

Test whether iterating through separators is faster than using the 04-1 group areas which makes it groups of 5 and wouldn't have to be optimized.

In [21]:
def get_areas(club_list):
        areas_list = []
        area = []
        for i in club_list:            
            if not is_separator_index(i):
                area.append(i)            
            else:
                areas_list.append(area)
                print(area)
                area = []  # Reset area list
        # Collect any remaining area
        if area or is_separator_index(i):
            areas_list.append(area)
        return areas_list

def is_separator_index(index):
    # check if the index is larger than the number of the participating locations:
    return index >= len(loc) - (ceil(len(loc)/5) - 1)

Since 221 % 5 = 1, use the grouping formula for that.

In [26]:
def group_areas(clubs):
    for i in range(0, len(clubs)-16, 5):
        yield clubs[i:i + 5]
    for i in range(len(clubs)-16, len(clubs), 4):
        yield clubs[i:i + 4]

def areas_list(clubs=clubs):
    return list(group_areas(list(clubs)))

In [29]:
%timeit list(group_areas(clubs_without_areas))

23.6 µs ± 113 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [23]:
%timeit get_areas(clubs_with_areas)
# 11.4 ms ± 332 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# This function also creates areas with random numbers of members including empty lists

[110]
[18, 7]
[16, 16]
[57]
[]
[]
[]
[136]
[12]
[]
[]
[86]
[]
[87]
[130, 107, 87, 165]
[85]
[119]
[]
[]
[]
[]
[]
[171, 133, 103, 32]
[26]
[123]
[]
[165]
[95, 150, 38]
[27]
[132, 136, 3]
[]
[130, 106, 39, 83, 163]
[86, 113, 113, 124]
[82, 86, 105]
[]
[16, 63, 146]
[55, 122, 172]
[]
[]
[17, 158, 151, 171]
[]
[]
[67, 145]
[135]
[]
[]
[147, 145, 136]
[40, 52]
[99, 117]
[123]
[]
[13, 67, 83]
[]
[]
[36, 122]
[14, 34, 20]
[62]
[100]
[]
[91, 11, 163, 139, 84, 57, 69]
[]
[]
[]
[68, 97, 61, 74, 55, 1]
[47]
[127, 65, 56, 92]
[164, 73, 105, 125, 165]
[76]
[]
[161, 132]
[155, 106]
[]
[112, 144, 173, 56, 117, 29, 35, 124]
[27]
[]
[144, 70, 4, 60, 159, 8]
[74, 60, 59, 140]
[146, 136, 8, 122, 153, 169]
[72, 36]
[174, 78, 27, 95, 144, 143]
[]
[70, 120, 35]
[]
[]
[9, 53, 26, 108, 49, 19, 137]
[]
[]
[161, 5, 128, 18, 151]
[20, 40, 17, 77, 54, 28]
[]
[30, 75, 35]
[64, 96, 3, 64, 11]
[176, 67, 105]
[]
[120]
[110]
[18, 7]
[16, 16]
[57]
[]
[]
[]
[136]
[12]
[]
[]
[86]
[]
[87]
[130, 107, 87, 165]
[85]
[119]
[]

In [48]:
"""
These functions break up a list of clubs into
lists of area lists based on the number of clubs
at the end that end up with four instead of five.
The function at the end makes the selection.
:param clubs: The list of club indices.
:returns: The function for that number of clubs.
"""
# For districts with 0 or 4 clubs left
def districts_with_zero_left(clubs):
    for i in range(0, len(clubs), 5): 
        yield clubs[i:i + 5]
        
# For districts with 3 left
def districts_with_three_left(clubs):
    for i in range(0, len(clubs)-8, 5):
        yield clubs[i:i + 5]
    for i in range(len(clubs)-8, len(clubs), 4):
        yield clubs[i:i + 4]
        
# For districts with 2 left
def districts_with_two_left(clubs):
    for i in range(0, len(clubs)-12, 5):
        yield clubs[i:i + 5]
    for i in range(len(clubs)-12, len(clubs), 4):
        yield clubs[i:i + 4]
                
# For districts with 1 left
def districts_with_one_left(clubs):
    for i in range(0, len(clubs)-16, 5):
        yield clubs[i:i + 5]
    for i in range(len(clubs)-16, len(clubs), 4):
        yield clubs[i:i + 4]

def select_grouping_function(clubs_size):
    leftover = clubs_size % 5
    if leftover in [0, 4]:
        return districts_with_zero_left
    elif leftover == 3:
        return districts_with_three_left
    elif leftover == 2:
        return districts_with_two_left
    else:
        return districts_with_one_left

In [49]:
selected = select_grouping_function(len(clubs_without_areas))

In [134]:
selected

<function __main__.districts_with_one_left(clubs)>

In [137]:
list(selected(clubs_without_areas))[-5:-1] # last 4 areas

[array([121, 160,   4, 220, 137]),
 array([184, 148,  82, 206]),
 array([186,  49, 176,  96]),
 array([ 7, 12, 43,  1])]

In [54]:
len(list(selected(clubs_without_areas)))

45

In [74]:
np.random.normal(0, 0.00009)

0.00010787276138932763

In [59]:
loc[:5]

[array([-96.8742,  32.6623], dtype=float32),
 array([-96.7717,  32.8335], dtype=float32),
 array([-93.7444,  32.5072], dtype=float32),
 array([-96.8128,  33.0838], dtype=float32),
 array([-96.7999,  32.7877], dtype=float32)]

In [116]:
new_loc = raw_data.copy().reset_index(drop=True)
new_loc['zip'] = [i[:5] for i in new_loc.zip]
new_loc['long'] += np.random.normal(0, 0.0009, len(new_loc))
new_loc['lat'] += np.random.normal(0, 0.0009, len(new_loc))

In [117]:
new_loc.head()

Unnamed: 0,club_no,zip,lat,long
0,437,75237,32.662103,-96.875141
1,713,75206,32.833057,-96.76983
2,718,71101,32.505312,-93.744276
3,967,75024,33.083214,-96.81167
4,1165,75201,32.787634,-96.800328


In [None]:
# Don't seem to need these after all

def districts_compactness(self, district_lists):
    ''' 
    Computes max and avg compactness scores. Best is 1.
    :param district_list: a list of districts in a generation
    :returns: the best (max) and average compactness scores
    '''
    max_compactness = 0
    district_scores = []
    for district in district_lists:
        compactness = self.area_average_compactness(
                        self.get_areas(district))
        district_scores.append(compactness)
        max_compactness = max(max_compactness, compactness)
        avg_compactness = sum(district_scores)/len(district_scores)
    return max_compactness, avg_compactness

def districts_quality(self, district_lists):
    ''' 
    Computes min and avg quality scores. Quality is based on
    distance from the ideal so we want to minimize it.
    :param district_list: a list of districts in a generation
    :returns: the best (min) and average quality scores
    '''
    min_quality = 1
    district_scores = []
    for district in district_lists:
        quality = self.area_average_quality(
                        self.get_areas(district))
        district_scores.append(quality)
        min_quality = min(min_quality, quality)
        avg_quality = sum(district_scores)/len(district_scores)
    return min_quality, avg_quality