This notebook generates tables and graphs to compare ensemble pairs

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import json
from itertools import combinations

from knobs_functions import *

In [4]:
# Build the lookup dictionaries from which later dataframes will be created.
# Every dictionary is keyed by (state, chamber, ensemble, score) and stores one
# measurement of how close that ensemble is to the base0 ensemble.

mean_diff_dict, KS_stat_dict, KS_pvalue_dict, T_pvalue_dict = {}, {}, {}, {}

# Both lists are loop-invariant, so they are assembled once up front.
all_scores = primary_score_list + secondary_score_list
compared_ensembles = ['base1', 'base2', 'base3', 'base4'] + ensemble_list

for state, chamber in state_chamber_list:
    for score in all_scores:
        # The base0 sample is the reference every other ensemble is compared against.
        base_scores = fetch_score_array(state, chamber, 'base0', score)
        for ensemble in compared_ensembles:
            ensemble_scores = fetch_score_array(state, chamber, ensemble, score)
            key = (state, chamber, ensemble, score)

            KS_stat, KS_pvalue, KS_sign = ks_test(base_scores, ensemble_scores)
            _, T_pvalue = t_test(ensemble_scores, base_scores)

            mean_diff_dict[key] = np.mean(ensemble_scores) - np.mean(base_scores)
            # The stored KS statistic is signed: the statistic times the sign reported by ks_test.
            KS_stat_dict[key] = KS_stat * KS_sign
            KS_pvalue_dict[key] = KS_pvalue
            T_pvalue_dict[key] = T_pvalue

  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


In [55]:
def mean_diff_table(score, pvalue = None, latex_filename = None, rounding = 2):
    """
    Returns a dataframe showing (for each state-chamber pair and each ensemble type)
    the mean-difference between that ensemble and the base0 ensemble with respect to the given score.

    Rows are the state-chamber pairs of state_chamber_list followed by an 'AVERAGE' row;
    columns are 'base1' followed by the ensembles of ensemble_list. The 'AVERAGE' row is the
    column mean of the (already rounded) cells, rounded again to the same precision.

    Args:
        score: use any valid score, or use 'Dem share' for 'Dem seats' divided by the number of seats.
        pvalue: significance threshold. Cells whose t-test p-value (from T_pvalue_dict) is below it
            are marked: prefixed with '*' in the returned dataframe and set in bold in the latex table.
            Set to None if you don't want marks.
        latex_filename: if set, the table is also saved to this path as a latex table, with the
            columns renamed through ensemble_name_dict.
        rounding: number of decimals kept in every cell, including the 'AVERAGE' row.

    Returns:
        An object-dtype dataframe whose cells are floats, or '*'-prefixed strings for marked cells.
    """

    row_labels = [f'{state} {chamber}' for state, chamber in state_chamber_list]
    index_list = row_labels + ['AVERAGE']
    columns_list = ['base1'] + ensemble_list

    # Keep the numbers in a float frame while computing: rounding and averaging silently
    # do nothing (or fail, depending on the pandas version) on object-dtype data.
    values = pd.DataFrame(np.nan, index=index_list, columns=columns_list, dtype=float)
    # True where the value is marked as statistically significant.
    marks = pd.DataFrame(False, index=index_list, columns=columns_list)

    effective_score = 'Dem seats' if score == 'Dem share' else score
    for state, chamber in state_chamber_list:
        row = f'{state} {chamber}'
        denom = num_seats_dict[(state, chamber)] if score == 'Dem share' else 1
        for ensemble in columns_list:
            key = (state, chamber, ensemble, effective_score)
            # Honour the requested precision (this used to be hard-coded to 2 decimals).
            values.loc[row, ensemble] = np.round(mean_diff_dict[key] / denom, rounding)
            if pvalue is not None:
                marks.loc[row, ensemble] = bool(T_pvalue_dict[key] < pvalue)
    values.loc['AVERAGE'] = values.loc[row_labels].mean().round(rounding)

    # Switch to object dtype only now, so marked cells can hold strings next to plain numbers.
    df = values.astype(object)
    df_latex = df.copy()

    # combine the values and markings into dataframes to return and for Latex
    for row in row_labels:
        for ensemble in columns_list:
            if marks.loc[row, ensemble]:
                cell = df.loc[row, ensemble]
                df_latex.loc[row, ensemble] = f'\\textbf{{{cell}}}'
                df.loc[row, ensemble] = f'*{cell}'

    if latex_filename is not None:
        df_latex = df_latex.rename(columns=ensemble_name_dict)
        df_latex.to_latex(latex_filename, escape=False)
    return df

In [56]:
# Mean-difference table for the 'Dem seats' score. Cells whose t-test p-value is below .001
# come back prefixed with '*'; the call also writes the table to mean_diff_table.tex.
ds = mean_diff_table('Dem seats', pvalue=.001, latex_filename='mean_diff_table.tex')
ds

  df_latex.to_latex(latex_filename, escape=False)


Unnamed: 0,base1,pop_minus,pop_plus,distpair,ust,distpair_ust,reversible,county25,county50,county75,county100
FL congress,0.0,0.01,0.0,*-0.18,*-0.11,*-0.29,*-0.83,*0.42,*0.59,*0.56,*0.55
FL upper,0.02,-0.01,0.02,*-0.09,-0.02,*-0.14,*-0.57,*0.07,0.04,*0.05,*0.1
FL lower,0.01,0.03,0.03,0.01,-0.0,-0.06,*1.69,*-0.12,*-0.21,*-0.09,*0.07
IL congress,0.0,-0.01,-0.0,*0.03,0.01,*0.04,*-0.05,*-0.06,*-0.23,*-0.23,*-0.21
IL upper,-0.01,-0.03,0.04,-0.01,*0.1,*0.09,*0.37,-0.03,0.0,*-0.07,*-0.15
IL lower,0.01,0.03,0.0,-0.05,*0.08,0.01,*-0.67,*-0.14,*-0.14,*-0.22,*-0.35
MI congress,-0.0,0.0,0.0,*0.05,*0.06,*0.09,*0.24,*-0.04,*-0.16,*-0.26,*-0.27
MI upper,-0.01,-0.0,-0.03,*0.07,0.03,*0.18,0.03,0.01,*0.11,*0.13,*0.14
MI lower,-0.03,-0.02,0.01,0.03,-0.06,0.04,*0.27,*0.23,*0.46,*0.53,*0.48
NC congress,0.01,0.01,-0.0,*-0.03,*-0.05,*-0.08,*0.42,*-0.04,*-0.07,*-0.08,*-0.09


In [57]:
# Same table for the 'competitive districts' score, marked at the .001 level
# and also written to competitive_table.tex.
ds2 = mean_diff_table('competitive districts', pvalue=.001, latex_filename='competitive_table.tex')
ds2

  df_latex.to_latex(latex_filename, escape=False)


Unnamed: 0,base1,pop_minus,pop_plus,distpair,ust,distpair_ust,reversible,county25,county50,county75,county100
FL congress,-0.02,-0.04,-0.01,*0.46,*0.3,*0.88,*0.38,-0.03,*-0.09,*-0.11,*-0.1
FL upper,-0.02,-0.02,*-0.06,0.03,0.05,*0.16,*-0.18,*-0.14,*-0.31,*-0.37,*-0.43
FL lower,0.05,0.09,-0.0,*0.11,*0.14,*0.19,*0.12,*0.45,*0.39,*0.28,*0.44
IL congress,-0.0,-0.02,-0.01,*0.04,0.02,*0.06,*0.15,*-0.16,0.01,*0.06,*0.05
IL upper,-0.04,*-0.12,*0.1,*0.08,*0.07,*0.21,*-0.1,*-0.07,*0.13,*0.26,*0.37
IL lower,0.0,-0.02,0.05,*0.25,*0.1,*0.34,*0.71,*0.23,*0.78,*1.43,*1.65
MI congress,0.0,-0.01,-0.0,*-0.07,*-0.06,*-0.15,-0.02,0.01,*0.07,*0.12,*0.1
MI upper,-0.02,0.03,-0.04,*0.35,0.01,*0.43,*1.02,*-0.1,*-0.08,*-0.07,-0.06
MI lower,0.02,-0.02,-0.03,-0.08,-0.07,-0.08,*-0.19,*0.34,*0.63,*0.76,*0.76
NC congress,-0.01,0.0,0.01,-0.01,*0.05,0.04,*-0.2,0.0,*-0.08,*-0.14,*-0.16


In [5]:
def make_table(my_score_list = primary_score_list, my_ensemble_list= ensemble_list, my_state_chamber_list=state_chamber_list,
               based_on = 'ks_p', threshold = .001, display_extremes = True, transpose = True):
    """
    Create a dataframe whose columns are the scores and whose rows are the ensembles.
    Each cell contains a string of +, -, or 0, one for each state-chamber, to compare that ensemble's score to that of the base0 ensemble.
    if based_on == 'ks', the symbol indicates whether the absolute KS statistic is above the threshold.
    if based_on == 'ks_p', it indicates whether the KS pvalue is below the threshold.
    if based_on == 't_p', it indicates whether the t-test pvalue is below the threshold.
    The sign (+ or -) is that of the signed KS statistic, or of the mean difference if based_on == 't_p'.
    if display_extremes is True, the cell will also contain a dictionary with the state-chamber with
    the largest and smallest KS-statistic (or mean-difference statistics if based_on='t_p'),
    taken over the state-chambers that received a + or -.

    It returns the transpose if transpose is True.

    Raises:
        ValueError: if based_on is not one of 'ks', 'ks_p' or 't_p'.
    """
    # Fail early on an unknown mode; otherwise the symbol would never be assigned
    # and the loop below would die with an unrelated unbound-variable error.
    if based_on not in ('ks', 'ks_p', 't_p'):
        raise ValueError(f"based_on must be 'ks', 'ks_p' or 't_p', got {based_on!r}")
    # Both KS modes take sign and extremes from the signed KS statistic; 't_p' uses the mean difference.
    use_ks_stat = based_on in ('ks', 'ks_p')

    df = pd.DataFrame('', columns=my_score_list, index=my_ensemble_list, dtype=object)
    for score in my_score_list:
        for ensemble in my_ensemble_list:
            string_of_symbols = ''
            min_stat = 0
            max_stat = 0
            max_state_chamber = ''
            min_state_chamber = ''
            for state, chamber in my_state_chamber_list:
                key = (state, chamber, ensemble, score)
                md = mean_diff_dict[key]
                ks = KS_stat_dict[key]
                ks_p = KS_pvalue_dict[key]
                t_p = T_pvalue_dict[key]

                if based_on == 'ks':
                    significant = np.abs(ks) > threshold
                elif based_on == 'ks_p':
                    significant = ks_p < threshold
                else:
                    significant = t_p < threshold

                if not significant:
                    string_of_symbols += '0'
                    continue

                stat = ks if use_ks_stat else md
                string_of_symbols += '+' if stat > 0 else '-'

                # Only significant state-chambers compete for the extremes.
                if stat > max_stat:
                    max_stat = stat
                    max_state_chamber = f'{state}_{chamber}'
                if stat < min_stat:
                    min_stat = stat
                    min_state_chamber = f'{state}_{chamber}'

            cell = string_of_symbols
            if display_extremes:
                D = dict()
                if max_stat > 0:
                    D[max_state_chamber] = np.round(max_stat,3)
                if min_stat < 0:
                    D[min_state_chamber] = np.round(min_stat,3)
                if len(D) > 0:
                    cell = f'{string_of_symbols}{D}'
            df.at[ensemble, score] = cell

    if transpose:
        df = df.transpose()
    return df

In [6]:
# test that the other 4 base ensembles are all close to base0:
# each character of a cell stands for one state-chamber, and '0' means the KS p-value
# was not below the .001 threshold.
test = make_table(my_ensemble_list=['base1', 'base2', 'base3', 'base4'],
                    based_on = 'ks_p', threshold = .001, display_extremes=False)
test

Unnamed: 0,base1,base2,base3,base4
Reock,0,0,0,00-000000000000000000
Polsby-Popper,0,0,0,000000000000000000000
cut edges,0,0,0,000000000000000000000
Dem seats,0,0,0,000000000000000000000
efficiency gap,0,0,0,000000000000000000000
mean-median,0,0,0,000000000000000000000
partisan bias,0,0,0,000000000000000000000
competitive districts,0,0,0,000000000000000000000
average margin,0,0,0,000000000000000000000
MMD black,0,0,0,000000000000000000000


In [10]:
# Named groups of scores.
compact_score_list = ['Reock', 'Polsby-Popper', 'cut edges']
partisan_score_list = [
    'Dem seats', 'efficiency gap', 'mean-median',
    'partisan bias', 'competitive districts', 'average margin',
]
MMD_score_list = ['MMD black', 'MMD hispanic', 'MMD coalition']
county_score_list = ['county splits', 'counties split']

# Named groups of ensembles.
base_ensemble_list = ['base1', 'base2', 'base3', 'base4']
pop_ensemble_list = ['pop_minus', 'pop_plus']
recom_ensemble_list = ['distpair', 'ust', 'distpair_ust', 'reversible']
county_ensemble_list = ['county25', 'county50', 'county75', 'county100']

# All non-base ensembles, in the order: pop, recom, county.
ensemble_list = pop_ensemble_list + recom_ensemble_list + county_ensemble_list

In [13]:
# Compare the two population ensembles (pop_minus, pop_plus) to base0 using make_table's defaults.
T = make_table(my_ensemble_list=pop_ensemble_list)
# Uncomment to export the table:
#T.to_csv('pop_knob.csv')
T

Unnamed: 0,pop_minus,pop_plus
Reock,"00000+0-000000000000+{'IL_lower': 0.025, 'MI_u...",0000000+0000000000000{'MI_upper': 0.025}
Polsby-Popper,0--0000-000-0--000000{'MI_upper': -0.083},0+000+0++0+000+000000{'MI_upper': 0.047}
cut edges,0++0++0++00+0++00+000{'MI_upper': 0.056},0--0--0--0-00--0000--{'FL_lower': -0.035}
Dem seats,00000000000000000000+{'WI_lower': 0.02},000000000000000000000
efficiency gap,"0000+00+00-000000000-{'IL_upper': 0.029, 'WI_l...","000+-00+00000000000-0{'MI_upper': 0.023, 'IL_u..."
mean-median,0000+0000000000000000{'IL_upper': 0.021},000000000000000000000
partisan bias,0000000000000--000000{'NY_lower': -0.026},00000-000000000000000{'IL_lower': -0.021}
competitive districts,0000-0000000000000000{'IL_upper': -0.024},0000+0000000000000000{'IL_upper': 0.021}
average margin,00-000000000000000000{'FL_lower': -0.023},"0+00000000000--000000{'FL_upper': 0.029, 'NY_u..."
MMD black,000000000000000000000,000000000000000000000
