This notebook contains the code for comparing the base ensembles, and for measuring the autocorrelation, effective sample size, and redundancy of the other ensembles.

In [None]:
import pandas as pd
import numpy as np
import json
from itertools import combinations
import re

from knobs_functions import *

In [2]:
# Echo the lists and dictionaries this notebook relies on. They enter the namespace through the
# star-import of knobs_functions (the original note attributes them to fetch.py -- TODO confirm).
print(
    f'base_list = {base_list}',
    f'ensemble_list = {ensemble_list}',
    f'state_chamber_list = {state_chamber_list}',
    f'primary_score_list = {primary_score_list}',
    f'secondary_score_list = {secondary_score_list}',
    f'num_seats_dict = {num_seats_dict}',
    f'ensemble_name_dict = {ensemble_name_dict}',
    sep='\n',
)

base_list = ['base0', 'base1', 'base2', 'base3', 'base4']
ensemble_list = ['pop_minus', 'pop_plus', 'distpair', 'ust', 'distpair_ust', 'reversible', 'county25', 'county50', 'county75', 'county100']
state_chamber_list = [('FL', 'congress'), ('FL', 'upper'), ('FL', 'lower'), ('IL', 'congress'), ('IL', 'upper'), ('IL', 'lower'), ('MI', 'congress'), ('MI', 'upper'), ('MI', 'lower'), ('NC', 'congress'), ('NC', 'upper'), ('NC', 'lower'), ('NY', 'congress'), ('NY', 'upper'), ('NY', 'lower'), ('OH', 'congress'), ('OH', 'upper'), ('OH', 'lower'), ('WI', 'congress'), ('WI', 'upper'), ('WI', 'lower')]
primary_score_list = ['Reock', 'Polsby-Popper', 'cut edges', 'Dem seats', 'efficiency gap', 'mean-median', 'partisan bias', 'competitive districts', 'average margin', 'MMD black', 'MMD hispanic', 'MMD coalition', 'county splits', 'counties split']
secondary_score_list = ['efficiency_gap_statewide', 'estimated_seats', 'disproportionality', 'pr_deviation', 'proportionality', 'efficiency_gap', 'seats_b

In [3]:
# Load Todd's redundancy data: the file holds one JSON record per line, and
# one record per state-chamber-ensemble_type combination.
with open('redundancy3.jsonl', mode='r', encoding='utf-8') as file:
    data = list(map(json.loads, file))

In [5]:
def fetch_redundancy_data(state, chamber, ensemble_type):
    """
    Fetches the redundancy data for a given state, chamber, and ensemble type (from 'redundancy3.jsonl').

    The record is looked up in the module-level `data` list by its 'file' entry, which is
    rebuilt here from the three arguments.

    Parameters:
    - state: state abbreviation as it appears in the record's file name (e.g. 'FL').
    - chamber: 'congress' selects the population tolerances 0.01 / 0.005 / 0.015;
      any other value selects 0.05 / 0.025 / 0.075.
    - ensemble_type: one of the keys of the ensemble table below ('base0', ..., 'county100').

    Returns a dictionary mapping frequencies to counts, ordered by decreasing frequency.
    For example, {2: 1759, 1: 238439} means that 1759 districts appeared twice in the ensemble
    and 238439 districts appeared once.

    Raises:
    - KeyError if ensemble_type is not a key of the ensemble table.
    - IndexError if no loaded record has the expected 'file' entry.
    """
    if chamber == 'congress':
        pop0, pop_minus, pop_plus = '0.01', '0.005', '0.015'
    else:
        pop0, pop_minus, pop_plus = '0.05', '0.025', '0.075'

    type0 = 'cut-edges-rmst'
    type1 = 'cut-edges-region-aware'
    county0 = '0.0'

    # Maps the notebook's ensemble names to the parameter snippet used in the file names.
    ensemble_dict = {
        'base0' : f'T{pop0}_S{county0}_R0_V{type0}',
        'base1' : f'T{pop0}_S{county0}_R1_V{type0}',
        'base2' : f'T{pop0}_S{county0}_R2_V{type0}',
        'base3' : f'T{pop0}_S{county0}_R3_V{type0}',
        'base4' : f'T{pop0}_S{county0}_R4_V{type0}',
        'pop_minus' : f'T{pop_minus}_S{county0}_R0_V{type0}',
        'pop_plus' : f'T{pop_plus}_S{county0}_R0_V{type0}',
        'ust' : f'T{pop0}_S{county0}_R0_Vcut-edges-ust',
        'distpair' : f'T{pop0}_S{county0}_R0_Vdistrict-pairs-rmst',
        'distpair_ust' : f'T{pop0}_S{county0}_R0_Vdistrict-pairs-ust',
        'reversible' : f'T{pop0}_S{county0}_R0_Vreversible',
        # The county weights are written as literal text so the key never depends on float formatting.
        'county25' : f'T{pop0}_S0.25_R0_V{type1}',
        'county50' : f'T{pop0}_S0.5_R0_V{type1}',
        'county75' : f'T{pop0}_S0.75_R0_V{type1}',
        'county100' : f'T{pop0}_S1.0_R0_V{type1}',
    }
    snippet = ensemble_dict[ensemble_type]

    # This string is only a lookup key matched against each record's 'file' entry; nothing is opened here.
    filename = f'/Users/todd/ensembles/{state}_{chamber}/{state}_{chamber}_{snippet}/{state}_{chamber}_{snippet}_ensemble.jsonl.xz'

    # Take the first matching record, and fail with a message that names the missing key.
    record = next((entry for entry in data if entry['file'] == filename), None)
    if record is None:
        raise IndexError(f'no redundancy record found for file {filename!r}')

    # Keys are converted from str to int, then ordered from largest to smallest.
    frequencies = {int(freq): count for freq, count in record['district_frequencies'].items()}
    return {freq: frequencies[freq] for freq in sorted(frequencies, reverse=True)}

In [6]:
def autocorr(a):
    """
    Returns the lag-1 autocorrelation of the given array: the correlation coefficient
    between the array without its last entry and the array without its first entry.
    """
    leading, lagged = a[:-1], a[1:]
    correlation_matrix = np.corrcoef(leading, lagged)
    return correlation_matrix[0, 1]

In [None]:
def summary_table(score_list=None, measurement = 'autocorr'):
    """
    Returns a dataframe showing (for each state-chamber pair and each ensemble type) the following measurement:
    - measurement = 'autocorr' : the max (over the scores in score_list) of the absolute lag-1 autocorrelation of the score array
    - measurement = 'ESS' : the effective sample size (ESS) approximated from that max autocorrelation via the AR(1) model,
      n*(1 - rho)/(1 + rho), where n is the length of the last score array fetched.
    - measurement = 'max_repeats': the max number of times a district appears in the ensemble.
    - measurement = 'mean_repeats': the mean number of times a district appears in the ensemble.

    score_list must be a non-empty sequence of score names for 'autocorr' and 'ESS'; it is ignored for the
    two repeat measurements, which use only the district-frequency data.

    Rows are labelled '<state> <chamber>' and columns are 'base0' followed by ensemble_list.

    Raises ValueError if measurement is not one of the four names above, or if score_list is missing or
    empty when the measurement needs it.
    """
    valid_measurements = ('autocorr', 'ESS', 'max_repeats', 'mean_repeats')
    if measurement not in valid_measurements:
        # Without this check an unrecognized name silently produced the ESS table.
        raise ValueError(f'measurement must be one of {valid_measurements}, got {measurement!r}')
    uses_scores = measurement in ('autocorr', 'ESS')
    if uses_scores and (score_list is None or len(score_list) == 0):
        raise ValueError(f"a non-empty score_list is required when measurement = '{measurement}'")

    column_list = ['base0'] + ensemble_list
    index_list = [f'{a[0]} {a[1]}' for a in state_chamber_list]
    df = pd.DataFrame(columns = column_list, index = index_list)
    for state, chamber in state_chamber_list:
        for ensemble in column_list:
            if uses_scores:
                arrays = [fetch_score_array(state, chamber, ensemble, score) for score in score_list]
                # NOTE: a NaN autocorrelation (e.g. from a constant score array) makes max() order-dependent,
                # so callers should leave such scores out of score_list.
                maxa = max(np.abs(autocorr(a)) for a in arrays)
                if measurement == 'autocorr':
                    answer = maxa
                else:
                    n = len(arrays[-1])
                    answer = n*(1 - maxa)/(1 + maxa)
            else:
                # Y maps "number of appearances" -> "number of districts with that many appearances".
                # The statistics are taken from the counts directly, without building one entry per district.
                Y = fetch_redundancy_data(state, chamber, ensemble)
                if measurement == 'max_repeats':
                    answer = max(freq for freq, count in Y.items() if count > 0)
                else:
                    num_districts = sum(count for count in Y.values() if count > 0)
                    total_appearances = sum(freq * count for freq, count in Y.items() if count > 0)
                    answer = total_appearances / num_districts if num_districts else float('nan')
            df.loc[f'{state} {chamber}', ensemble] = answer
    df = df.apply(pd.to_numeric)
    return df

In [8]:
# Use a restricted list of primary scores here.  The MMD scores are constant at zero for certain
# states and chambers, which leaves the lag-1 autocorrelation undefined (np.corrcoef gives NaN
# for a zero-variance array).

my_short_score_list = [
    'Reock',
    'Polsby-Popper',
    'cut edges',
    'Dem seats',
    'efficiency gap',
    'mean-median',
    'partisan bias',
    'competitive districts',
    'average margin',
]
my_score_list = [*my_short_score_list, 'counties split', 'county splits']

In [9]:
# df1: max (over my_score_list) absolute lag-1 autocorrelation per cell, rounded to 2 decimals.
# df2: the corresponding AR(1) effective-sample-size estimate, rounded to whole numbers.
df1 = summary_table(measurement='autocorr', score_list=my_score_list).round(2)
df2 = summary_table(measurement='ESS', score_list=my_score_list).round().astype(int)

In [17]:
# Redundancy tables.  The repeat measurements are computed from the district-frequency data only,
# so the score_list argument has no effect on these two calls.
df3 = summary_table(measurement='max_repeats', score_list=my_short_score_list).round(2)
df4 = summary_table(measurement='mean_repeats', score_list=my_short_score_list).round(2)

In [20]:
# Row labels for the LaTeX tables: '<state> <chamber>' -> '<state> <number of seats>'.
state_chamber_size_dict = dict(
    (f'{st} {ch}', f'{st} {num_seats_dict[(st, ch)]}')
    for st, ch in state_chamber_list
)

def overlay_dataframes(df1, df2, filename):
    """
    Writes a single LaTeX table in which every cell stacks the matching values of two dataframes:
    df1 = the dataframe shown on top, in blue (e.g., autocorrelation values)
    df2 = the dataframe shown underneath, in red (e.g., ESS values)
    filename = where the LaTeX table is written

    df2 is read at every row/column label of df1.  Before writing, columns are renamed with
    ensemble_name_dict and rows with state_chamber_size_dict.  Nothing is returned.
    Side effect: the pandas option display.max_colwidth is set to None and stays that way.
    """
    pd.set_option("display.max_colwidth", None)  # Prevent column truncation
    top_color, bottom_color = "blue", "red"

    def stacked_cell(upper, lower):
        # One \makecell holding the two colored values separated by a LaTeX line break.
        return f"\\makecell{{\\textcolor{{{top_color}}}{{{upper}}} \\\\ \\textcolor{{{bottom_color}}}{{{lower}}}}}"

    stacked = pd.DataFrame(index=df1.index, columns=df1.columns)
    for row_label in df1.index:
        for col_label in df1.columns:
            stacked.loc[row_label, col_label] = stacked_cell(
                df1.loc[row_label, col_label],
                df2.loc[row_label, col_label],
            )
    stacked = stacked.rename(columns=ensemble_name_dict, index=state_chamber_size_dict)
    # escape=False keeps the LaTeX commands in the cells as they are.
    stacked.to_latex(filename, escape=False)

In [21]:
# Write the stacked autocorrelation (df1, on top) / ESS (df2, underneath) table as LaTeX.
overlay_dataframes(df1, df2, 'latex tables/autocorr_table.tex')

  combined.to_latex(filename, escape=False)


In [22]:
# Write the stacked max-repeats (df3, on top) / mean-repeats (df4, underneath) table as LaTeX.
overlay_dataframes(df3, df4, 'latex tables/redundancy_table.tex')

  combined.to_latex(filename, escape=False)


In [None]:
# Sensitivity check: recompute the ESS table with the county-split scores left out.
df2_short = summary_table(measurement='ESS', score_list=my_short_score_list).round().astype(int)

In [10]:
# Display the ESS table computed from my_short_score_list (no county-split scores).
df2_short

Unnamed: 0,base0,pop_minus,pop_plus,distpair,ust,distpair_ust,reversible,county25,county50,county75,county100
FL congress,17780,17864,17893,16990,18064,18418,8,17453,15316,15036,14295
FL upper,18587,17999,18794,18424,19106,18472,40,18846,16318,15558,15970
FL lower,16525,16610,16610,16493,17420,17637,8,16263,15852,15563,15762
IL congress,19493,19564,19311,19549,19654,19722,38,19479,18912,18967,18763
IL upper,18014,17934,17888,18565,18801,19224,22,18046,16582,16145,14663
IL lower,16341,16506,14951,16118,17512,17515,7,16356,15795,15479,15325
MI congress,19467,19570,19442,19363,19517,19410,65,19543,19271,18811,18988
MI upper,19273,19174,19088,19114,19594,19384,46,18870,17956,16558,16750
MI lower,17232,17026,17775,17771,18593,18366,11,16601,15461,14504,15023
NC congress,19615,19502,19551,19210,19216,19287,33,19432,19730,19475,19717


Next, check how close the base ensembles are to one another, pair by pair.

In [48]:
def summarize_multistart(score_list = primary_score_list, dist = 'ks', combine_method = 'max'):
    """
    Compares every pair of base ensembles (all 2-element combinations of base_list) on each score.

    Returns a dataframe with one row per state-chamber pair (labelled '<state> <chamber>') and one
    column per score in score_list.  Each cell combines the pairwise values of the chosen statistic:
    - dist = 'ks' : the first value returned by ks_test
    - dist = 't' : the first value returned by t_test
    - dist = 'gelman' : the value returned by gelman_rubin_rhat
    - combine_method = 'max' : the largest pairwise value
    - combine_method = 'mean' : the mean of the pairwise values
    - combine_method = 'list' : an array with all pairwise values

    The cells are stored as objects; convert with pd.to_numeric if numeric columns are needed.

    Raises ValueError if dist or combine_method is not one of the names above.
    """
    valid_dists = ('ks', 't', 'gelman')
    valid_combine_methods = ('max', 'mean', 'list')
    if dist not in valid_dists:
        raise ValueError(f'dist must be one of {valid_dists}, got {dist!r}')
    if combine_method not in valid_combine_methods:
        # Without this check an unrecognized name silently returned a frame with no values filled in.
        raise ValueError(f'combine_method must be one of {valid_combine_methods}, got {combine_method!r}')

    column_list = score_list
    index_list = [f'{a[0]} {a[1]}' for a in state_chamber_list]
    df = pd.DataFrame(columns = column_list, index = index_list)

    for state, chamber in state_chamber_list:
        row_label = f'{state} {chamber}'
        for score in score_list:
            # Fetch each base ensemble's array once per score instead of once per pair.
            # NOTE(review): assumes fetch_score_array returns the same data on repeated calls -- confirm.
            arrays = {ensemble: fetch_score_array(state, chamber, ensemble, score) for ensemble in base_list}
            ls = []
            for ensemble1, ensemble2 in combinations(base_list,2):
                a1 = arrays[ensemble1]
                a0 = arrays[ensemble2]
                # Only the requested statistic is computed; the argument order of each helper is unchanged.
                # NOTE(review): if t_test returns a signed statistic, 'max' may not be the largest magnitude -- confirm.
                if dist == 'ks':
                    statistic, _, _ = ks_test(a0, a1)
                elif dist == 't':
                    statistic, _ = t_test(a1, a0)
                else:
                    statistic = gelman_rubin_rhat(a1, a0)
                ls.append(statistic)
            if combine_method == 'max':
                df.loc[row_label, score] = max(ls)
            elif combine_method == 'mean':
                df.loc[row_label, score] = np.mean(ls)
            else:
                # .at addresses exactly one cell, so the whole array is stored as that cell's value.
                df.at[row_label, score] = np.array(ls)
    return df

In [49]:
# NOTE: this rebinds my_score_list.  Compared with the list used for the autocorrelation tables,
# 'counties split' is no longer included.
my_score_list = [
    'Reock',
    'Polsby-Popper',
    'cut edges',
    'Dem seats',
    'efficiency gap',
    'mean-median',
    'partisan bias',
    'competitive districts',
    'average margin',
    'county splits',
]
df_multi = summarize_multistart(score_list=my_score_list)

In [50]:
# Convert every column to a numeric dtype; entries that cannot be parsed become NaN.
df_multi = df_multi.apply(lambda column: pd.to_numeric(column, errors='coerce'))
df_multi.round(2)  # Shown rounded to 2 decimal places; df_multi itself keeps full precision

Unnamed: 0,Reock,Polsby-Popper,cut edges,Dem seats,efficiency gap,mean-median,partisan bias,competitive districts,average margin,county splits
FL congress,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
FL upper,0.02,0.01,0.01,0.01,0.01,0.02,0.01,0.01,0.02,0.01
FL lower,0.02,0.01,0.01,0.01,0.02,0.01,0.02,0.01,0.01,0.01
IL congress,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.01,0.01
IL upper,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
IL lower,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02
MI congress,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
MI upper,0.02,0.02,0.02,0.01,0.02,0.01,0.01,0.01,0.02,0.01
MI lower,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.01,0.01,0.01
NC congress,0.01,0.01,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01


In [51]:
# Largest value anywhere in the table: the per-column maxima first, then the maximum of those.
largest_value = df_multi.max(axis=0).max()
largest_value

0.022150000000000003

In [52]:
def stack_string(a):
    """
    Turns a score name into a LaTeX label whose words are stacked on separate lines.

    The name is split on runs of spaces and/or hyphens (the separators themselves are dropped).
    A name with a single part is returned unchanged; otherwise all parts are joined with LaTeX
    line breaks inside a \\makecell, e.g. 'cut edges' -> '\\makecell{cut \\\\ edges}'.
    Every part is kept, so names with three or more words are not cut short.
    """
    parts = re.split(r"[ -]+", a)
    if len(parts) == 1:
        return parts[0]
    # Joined outside the f-string: backslashes inside f-string expressions need Python 3.12+.
    stacked = ' \\\\ '.join(parts)
    return f'\\makecell{{{stacked}}}'
    
# Column labels: each score name in my_score_list mapped to its stacked LaTeX form.
col_dict = dict(zip(my_score_list, map(stack_string, my_score_list)))

# Relabel df_multi in place: rows via state_chamber_size_dict, columns via col_dict.
df_multi.rename(columns=col_dict, index=state_chamber_size_dict, inplace=True)

In [53]:
# Write the rounded table as LaTeX; escape=False keeps the \makecell column labels as LaTeX commands.
df_multi.round(2).to_latex('latex tables/multistart.tex', escape=False)

  df_multi.round(2).to_latex('latex tables/multistart.tex', escape=False)
