This notebook contains old code that we ended up cutting out and not using in the final project.

In [None]:
# Outlier distance was another metric we considered for measuring the distance between two distributions.

def outlier_distance(X, Y, cutoff=5):
    """
    Measure how much each sample spills past the tails of the other sample.

    For the right tail, it takes the fraction of X strictly above the (100 - cutoff)th
    percentile of Y and the fraction of Y strictly above the (100 - cutoff)th percentile
    of X, and keeps the larger. The left tail is handled the same way with the cutoff-th
    percentile and "strictly below". The result is the sum of the two tail values, so it
    lies between 0 and 2.

    Args:
        X, Y: array-likes of values (converted with np.asarray).
        cutoff: size of each tail in percent (default 5).
    """
    sample_x, sample_y = np.asarray(X), np.asarray(Y)
    upper_pct = 100 - cutoff

    def tail_fractions(sample, reference):
        # Fractions of `sample` lying strictly beyond the upper / lower tail thresholds of `reference`.
        above = np.mean(sample > np.percentile(reference, upper_pct))
        below = np.mean(sample < np.percentile(reference, cutoff))
        return above, below

    x_above, x_below = tail_fractions(sample_x, sample_y)
    y_above, y_below = tail_fractions(sample_y, sample_x)

    right_tail = max(x_above, y_above)
    left_tail = max(x_below, y_below)
    return right_tail + left_tail

def outlier_table(score, cutoff, my_ensemble_list = ['base1']+ ensemble_list, latex_filename = None, rounding = 2):
    """
    Returns a dataframe showing (for each state-chamber pair and each ensemble type)
    the outlier distance between that ensemble and the base0 ensemble with respect to the given score.
    The final 'AVERAGE' row holds the column means of the per-pair values.
    if latex_filename is set, it will also save the dataframe as a latex table.

    Args:
        score: name of the score, passed through to fetch_score_array.
        cutoff: tail size in percent, passed through to outlier_distance.
        my_ensemble_list: ensembles to compare against base0 (one column each).
        latex_filename: if not None, path the latex table is written to. In the latex version
            the columns are renamed via ensemble_name_dict and each row label becomes
            '<state> <num_seats_dict[(state, chamber)]>'.
        rounding: number of decimals kept for every cell.
    """

    index_list = [f'{state} {chamber}' for state, chamber in state_chamber_list] + ['AVERAGE']
    df = pd.DataFrame(columns = my_ensemble_list, index = index_list)

    for state, chamber in state_chamber_list:
        # The base0 reference only depends on the state-chamber pair, so fetch it once per row
        # rather than once per ensemble.
        base_scores = fetch_score_array(state, chamber, 'base0', score)
        for ensemble in my_ensemble_list:
            ensemble_scores = fetch_score_array(state, chamber, ensemble, score)
            out_dist = outlier_distance(ensemble_scores, base_scores, cutoff=cutoff)
            df.loc[f'{state} {chamber}', ensemble] = np.round(out_dist, rounding)
    # The 'AVERAGE' row is still all-NaN here, so it does not contribute to the column means.
    df.loc['AVERAGE'] = df.mean().round(rounding)
    df = df.round(rounding)

    if latex_filename is not None:
        # The row labels for the latex table are only needed when a file is actually written.
        state_chamber_size_dict = {f'{state} {chamber}': f'{state} {num_seats_dict[(state, chamber)]}'
                                   for state, chamber in state_chamber_list}
        # rename returns a new frame, so the returned df keeps its original labels.
        df_latex = df.rename(columns=ensemble_name_dict, index=state_chamber_size_dict)
        df_latex.to_latex(latex_filename, escape=False)
    return df

In [None]:
# Here is a slower but more flexible version of Ordered_seats_table that works with an arbitrary competitive window.

def Ordered_seats_table(competitive_window = .05, metric = 'mean_diff', combine_method = 'max',
                          plot_threshold = None):
    """
    Returns a dataframe showing (for each state-chamber pair and each ensemble type) 
    the "distance" between the ordered seats plots of the ensemble and the base0 ensemble.
    It only considers seats that are competitive for at least one of the two ensembles being compared,
    (which means that the mean dem share is within competitive_window of 0.5 for at least one of the two ensembles).
    The final 'AVERAGE' row holds the column means over the state-chamber pairs.
    Args:
        competitive_window: the window around 0.5 that defines a competitive seat (default .05)
        metric: 'mean_diff' or 'ks' to measure, for each seat, the distance between the two histograms 
        combine_method: 'max', 'mean', 'sum' (default 'max')
        plot_threshold: if not None, it plots an ordered-seats plot for each cell of the returned dataframe past the threshold.
    Raises:
        ValueError: if metric or combine_method is not one of the supported values.
    Note:
        If no seat is competitive for a given cell, the cell is 0 for combine_method='sum'
        and NaN for 'mean' and 'max' (there is nothing to average or maximize).
    """
    # Validate the options up front so a typo fails immediately rather than deep inside the loops.
    if metric not in ('mean_diff', 'ks'):
        raise ValueError('metric must be mean_diff or ks')
    if combine_method not in ('sum', 'mean', 'max'):
        raise ValueError('combine_method must be sum, mean or max')

    index_list = [f'{state} {chamber}' for state, chamber in state_chamber_list] + ['AVERAGE']
    columns_list = ['base1']+ ensemble_list
    df = pd.DataFrame(columns = columns_list, index = index_list)
    for state, chamber in state_chamber_list:
        # The base0 array and its per-seat means do not depend on the ensemble being compared,
        # so compute them once per state-chamber pair.
        X0 = fetch_score_array(state, chamber, 'base0', 'by_district')
        num_seats = X0.shape[1]
        base_means = [np.mean(X0[:, i]) for i in range(num_seats)]
        for ensemble in columns_list:
            X1 = fetch_score_array(state, chamber, ensemble, 'by_district')
            ls = []
            for i in range(num_seats):
                mean0 = base_means[i]
                mean1 = np.mean(X1[:, i])
                if abs(mean0 - 0.5) < competitive_window or abs(mean1 - 0.5) < competitive_window:
                    if metric == 'mean_diff':
                        closeness = abs(mean1 - mean0)
                    else:
                        closeness, _, __ = ks_test(X1[:, i], X0[:, i])
                    ls.append(closeness)

            if not ls:
                # No competitive seats: max() of an empty list would raise, so report "no value".
                combined_closeness = 0 if combine_method == 'sum' else np.nan
            elif combine_method == 'sum':
                combined_closeness = sum(ls)
            elif combine_method == 'mean':
                combined_closeness = np.mean(ls)
            else:
                combined_closeness = max(ls)

            df.loc[f'{state} {chamber}', ensemble] = combined_closeness
            # A NaN cell never exceeds the threshold, so it is never plotted.
            if plot_threshold is not None and combined_closeness > plot_threshold:
                box_plot(state, chamber, [ensemble, 'base0'], competitive_window=competitive_window)
    df = df.apply(pd.to_numeric)
    # The 'AVERAGE' row is still all-NaN here and is skipped by mean(), so this is the
    # mean over the state-chamber rows only.
    df.loc['AVERAGE'] = df.mean()
    return df

In [None]:
# This function helped us hunt for patterns that were significant and consistent across many state-chamber pairs.

def significance_table(my_score_list = primary_score_list, my_ensemble_list= ensemble_list, my_state_chamber_list=state_chamber_list,
               based_on = 'ks_p', threshold = .001, display_extremes = True, transpose = True):
    """
    Create a dataframe whose columns are the scores and whose rows are the ensembles.
    Each cell contains a string of +, -, or 0, one for each state-chamber, to compare that ensemble's score to that of the base0 ensemble.
    if based_on == 'ks', the symbol indicates whether the absolute KS statistic is above the threshold.
    if based_on == 'ks_p', it indicates whether the KS pvalue is below the threshold.
    if based_on == 't_p', it indicates whether the t-test pvalue is below the threshold.
    The sign (+ or -) is the sign of the KS statistic (or of the mean difference if based_on='t_p').
    if display_extremes is True, the cell will also contain a dictionary with the state-chamber with 
    the largest positive and the most negative KS-statistic (or mean-difference statistics if based_on='t_p'),
    taken over the state-chambers that were marked + or -.

    It returns the transpose if transpose is True.

    Raises:
        ValueError: if based_on is not 'ks', 'ks_p' or 't_p'.
    """
    # Fail early on a bad option; otherwise the symbol for a state-chamber would be undefined.
    if based_on not in ('ks', 'ks_p', 't_p'):
        raise ValueError('based_on must be ks, ks_p or t_p')

    df = pd.DataFrame(columns=my_score_list, index=my_ensemble_list).fillna('')
    for score in my_score_list:
        for ensemble in my_ensemble_list:
            string_of_symbols = ''
            min_stat = 0
            max_stat = 0
            max_state_chamber = ''
            min_state_chamber = ''
            for state, chamber in my_state_chamber_list:
                md = mean_diff_dict[state, chamber, ensemble, score]
                ks = KS_stat_dict[state, chamber, ensemble, score]
                ks_p = KS_pvalue_dict[state, chamber, ensemble, score]
                t_p = T_pvalue_dict[state, chamber, ensemble, score]

                if based_on == 'ks':
                    significant = np.abs(ks) > threshold
                elif based_on == 't_p':
                    significant = t_p < threshold
                else:
                    significant = ks_p < threshold

                # The statistic that gives the sign of the symbol and is tracked for the extremes.
                stat = md if based_on == 't_p' else ks

                if not significant:
                    string_of_symbols += '0'
                    continue
                string_of_symbols += '+' if stat > 0 else '-'

                if stat > max_stat:
                    max_stat = stat
                    max_state_chamber = f'{state}_{chamber}'
                if stat < min_stat:
                    min_stat = stat
                    min_state_chamber = f'{state}_{chamber}'

            cell = string_of_symbols
            if display_extremes:
                D = dict()
                # Convert to plain floats so the dictionary prints as e.g. 0.123 on every
                # numpy version (numpy 2 prints its own scalars as np.float64(0.123)).
                if max_stat > 0:
                    D[max_state_chamber] = float(np.round(max_stat,3))
                if min_stat < 0:
                    D[min_state_chamber] = float(np.round(min_stat,3))
                if len(D) > 0:
                    cell = f'{string_of_symbols}{D}'
            df.at[ensemble, score] = cell

    if transpose:
        df = df.transpose()
    return df