## Results

Analysis of the AllOfUs results.

In [1]:
import os
import re
import sys
import numpy as np
import pandas as pd
from collections import namedtuple, Counter

from src import raking

In [2]:
# Directory containing the csv result files (results exported from a SQL-style table).
RESULT_DIR = os.path.join('results', 'aou_2020_imputed')

# The 17 states with more than 1000 AllOfUs samples, as two-letter abbreviations
# (listed alphabetically; the set itself is unordered).
STATES1 = {
    'AL', 'AZ', 'CA', 'CT', 'FL', 'GA',
    'IL', 'LA', 'MA', 'MI', 'MS', 'NY',
    'PA', 'SC', 'TN', 'TX', 'WI',
}

#### Load the raking models for six variables

In [3]:
# Give every raking model a compact shorthand made only of its pairwise
# interaction terms in [ij] notation, then build the forward map
# (shorthand -> model index) and its inverse (model index -> shorthand).
#   e.g. [[0], [1], [2], [3], [4,5]] -> '[45]'; a model without any pair -> '[]'
# NOTE(review): assumes str(model) renders each pair as '[i, j]' with
# single-digit variable indices - confirm against raking.MODELS_6.
model_to_index = {}
for model_index, raking_model in enumerate(raking.MODELS_6):
    # findall returns one (first, second) tuple per pairwise term
    digit_pairs = re.findall(r'\[(?P<first>\d), (?P<second>\d)\]', str(raking_model))
    shorthand = ''.join('[{0}{1}]'.format(first, second) for first, second in digit_pairs)
    # an empty join means the model has no interaction terms
    model_to_index[shorthand or '[]'] = model_index

index_to_model = {model_index: shorthand for shorthand, model_index in model_to_index.items()}
print('Loaded {0} models'.format(len(index_to_model)))
# To list the mapping, iterate sorted(index_to_model) and print each
# (index, shorthand) pair, e.g. with '\t{0:2d} : {1}'.format(index, shorthand).

Loaded 61 models


#### Load the data for all 50 states and DC

In [4]:
# Parse every result file in RESULT_DIR into a list of StateTup records and
# store the list in state_data under the upper-cased two-letter state
# abbreviation taken from the file name.
state_data = {}
StateTup = namedtuple('StateTup', ['max_wt', 'min_cell', 'score', 'model'])

STATES = set()
# os.listdir returns names in arbitrary order; sorting them makes the insertion
# order of state_data (and anything derived from it) reproducible
for f in sorted(os.listdir(RESULT_DIR)):
    # assume the two-letter state abbrev is found between the first pair of underscores
    match = re.search(r'_(?P<state>[a-z][a-z])_', f, re.IGNORECASE)
    assert match, 'no two-letter state abbreviation found in file name {0!r}'.format(f)
    state_abbrev = match.group('state').upper()
    STATES.add(state_abbrev)
    filepath = os.path.join(RESULT_DIR, f)
    state_list = []
    with open(filepath) as infile:
        for i, line in enumerate(infile):
            if 0 == i:
                # the first line is never parsed (presumably the csv header row)
                continue
            text = line.strip()
            if not text:
                # tolerate blank lines, e.g. an extra newline at the end of the
                # file, which would otherwise break the 4-way unpack below
                continue
            max_wt, min_cell, score, model = text.split(',')
            # convert values to float and int; otherwise sorts will be wrong
            tup = StateTup(float(max_wt), int(min_cell), float(score), model)
            state_list.append(tup)
    state_data[state_abbrev] = state_list

print('Loaded data for {0} states'.format(len(STATES)))

# all remaining states, i.e. every loaded state that is not listed in STATES1
STATES2 = STATES - STATES1

Loaded data for 51 states


In [5]:
# Print each group of states with its size, abbreviations in alphabetical order.
for heading, state_group in (
    ('States with greater than 1k samples: ({0} total)', STATES1),
    ('All other states: ({0} total)', STATES2),
):
    print(heading.format(len(state_group)))
    print('\t{0}'.format(sorted(state_group)))

# Sanity checks: the two groups must not overlap, and together they must
# account for exactly the set of states that were loaded.
states_check = STATES1.union(STATES2)
assert STATES1.isdisjoint(STATES2)
assert states_check == STATES

States with greater than 1k samples: (17 total)
	['AL', 'AZ', 'CA', 'CT', 'FL', 'GA', 'IL', 'LA', 'MA', 'MI', 'MS', 'NY', 'PA', 'SC', 'TN', 'TX', 'WI']
All other states: (34 total)
	['AK', 'AR', 'CO', 'DC', 'DE', 'HI', 'IA', 'ID', 'IN', 'KS', 'KY', 'MD', 'ME', 'MN', 'MO', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'OH', 'OK', 'OR', 'RI', 'SD', 'UT', 'VA', 'VT', 'WA', 'WV', 'WY']


#### Top N models sorted by weight and score

In [6]:
def get_interactions(model_list):
    """Collect every bracketed digit group found in the given model strings.

    Args:
        model_list: iterable of model strings such as '[02][35]'.

    Returns:
        A flat list of the matched substrings with their brackets (e.g. '[02]'),
        in order of appearance. Repeats are kept, and a string without any
        digit group (such as '[]') contributes nothing.
    """
    bracketed_digits = re.compile(r'\[(\d+)\]')
    return [
        hit.group()  # the whole match, brackets included
        for model in model_list
        for hit in bracketed_digits.finditer(model)
    ]

In [7]:
def count_items(item_list):
    """Tally the items, print the tallies, and return them.

    One line is printed per distinct item, in decreasing order of count; items
    with equal counts keep the order in which they were first seen.

    Args:
        item_list: iterable of hashable items (strings in this notebook).

    Returns:
        The collections.Counter holding the tallies.
    """
    tally = Counter(item_list)
    # most_common() orders by decreasing count and keeps first-seen order on ties
    for entry, occurrences in tally.most_common():
        print('{0:>12} : {1}'.format(entry, occurrences))
    return tally

In [8]:
def _lowest_models(data_list, field, N):
    """Return the models of the N records in data_list with the smallest `field` value."""
    # sorted() is stable, so records with equal values keep their original order
    ranked = sorted(data_list, key=lambda tup: getattr(tup, field))
    return [tup.model for tup in ranked[:N]]


def _report_ranking(header, models):
    """Print the model tallies and the interaction tallies for one ranking."""
    print(header)
    print('\nModels: ')
    model_counts = count_items(models)
    # every selected model must be accounted for in the tally
    assert len(models) == sum(model_counts.values())

    interactions = get_interactions(models)
    print('\nInteractions: ')
    interaction_counts = count_items(interactions)
    assert len(interactions) == sum(interaction_counts.values())


def top_n(state_dict, N=5):
    """Summarize the models selected per state, by max weight and by score.

    For every state, the N records with the smallest max_wt and the N records
    with the smallest score are selected. The models of the selected records are
    pooled over all states, and for each of the two rankings the function prints
    how often each model, and each interaction within those models, occurs.

    Args:
        state_dict: dict mapping a state abbreviation to a list of records
            exposing the attributes max_wt, score and model.
        N: number of records selected per state for each ranking.

    Returns:
        None; the summary is printed.
    """
    models_by_weight = []
    models_by_score = []
    for data_list in state_dict.values():
        models_by_weight.extend(_lowest_models(data_list, 'max_wt', N))
        models_by_score.extend(_lowest_models(data_list, 'score', N))

    unique_by_weight = len(set(models_by_weight))
    unique_by_score = len(set(models_by_score))
    print('Found {0} unique models sorting by weight, {1} unique models sorting by score'.format(unique_by_weight,
                                                                                                 unique_by_score))

    _report_ranking('** Ranking by weight: **', models_by_weight)
    _report_ranking('\n** Ranking by score: **', models_by_score)

In [9]:
# Run the top-N summary for each group of states and for all states together.
# The states are visited in sorted order: count_items prints equal counts in
# first-seen order, which follows the order in which states are iterated, and
# the iteration order of a set of strings can differ between interpreter runs.
print('STATES1: ')
state_dict_1 = {k: state_data[k] for k in sorted(STATES1)}
top_n(state_dict_1)

print('\nSTATES2: ')
state_dict_2 = {k: state_data[k] for k in sorted(STATES2)}
top_n(state_dict_2)

print('\nALL STATES: ')
top_n({k: state_data[k] for k in sorted(state_data)})

STATES1: 
Found 28 unique models sorting by weight, 31 unique models sorting by score
** Ranking by weight: **

Models: 
        [35] : 9
        [45] : 7
          [] : 6
        [25] : 6
    [02][35] : 5
    [01][25] : 5
        [15] : 5
        [23] : 4
        [02] : 4
    [02][45] : 4
    [01][23] : 3
[01][23][45] : 3
    [02][15] : 3
        [24] : 2
        [01] : 2
    [01][24] : 2
    [01][45] : 2
    [03][14] : 1
        [05] : 1
    [05][12] : 1
    [05][23] : 1
    [01][35] : 1
[01][24][35] : 1
        [12] : 1
[03][15][24] : 1
    [03][24] : 1
    [03][15] : 1
        [03] : 1

Interactions: 
        [01] : 19
        [35] : 16
        [45] : 16
        [02] : 16
        [23] : 11
        [25] : 11
        [15] : 10
        [24] : 7
        [03] : 5
        [05] : 3
        [12] : 2
        [14] : 1

** Ranking by score: **

Models: 
        [35] : 11
        [23] : 6
    [02][35] : 6
        [15] : 5
          [] : 4
        [45] : 4
    [02][15] : 4
        [02] : 4
    

In [10]:
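# NOTE(review): this whole cell is a commented-out earlier draft that tallied the
# first TOP_N rows of each result file; it appears to be superseded by top_n()
# above - confirm and delete if it is no longer needed.
# TOP_N = 5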

# model_list = []
# interaction_list = []
# for f in os.listdir(RESULT_DIR):
#     filepath = os.path.join(RESULT_DIR, f)
#     with open(filepath) as infile:
#         #print(filepath)
#         for i, line in enumerate(infile):
#             if 0 == i:
#                 continue
#             text = line.strip()
#             #print('[{0}]: {1}'.format(i, text))
#             max_wt, min_cell, score, model = text.split(',')
#             # get the model
#             #print('\t\tmodel: {0}'.format(model))
#             model_list.append(model)
#             # get the interactions
#             iterator = re.finditer(r'\[(\d+)\]', model)
#             for match in iterator:
#                 interaction = match.group()
#                 #print('\t\tinteraction: {0}'.format(interaction))
#                 interaction_list.append(interaction)
#             if i == TOP_N:
#                 break
            
# print('Found {0} models'.format(len(model_list)))
            
# print('\nModels: ')
# model_ctr = Counter(model_list)
# model_ctr_tuples = [(v,k) for k,v in model_ctr.items()]
# for tup in sorted(model_ctr_tuples, key=lambda x: x[0], reverse=True):
#     count = tup[0]
#     k = tup[1]
#     print('{0:>12} : {1}'.format(k, count))
    
# print('\nInteractions: ')
# interaction_ctr = Counter(interaction_list)
# interaction_ctr_tuples = [(v,k) for k,v in interaction_ctr.items()]
# for tup in sorted(interaction_ctr_tuples, key=lambda x: x[0], reverse=True):
#     count = tup[0]
#     k = tup[1]
#     print('{0:>12} : {1}'.format(k, count))