In [1]:
import os, glob, json

import re, math

import pprint

import numpy as np
import pandas as pd
import matplotlib 
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
%matplotlib inline

# Notebook-wide display configuration.
# Allow up to 1000 columns to be shown before pandas truncates wide frames.
pd.options.display.max_columns = 1000
# pd.options.display.precision = 1
# Render every float with exactly one decimal place in displayed frames.
pd.options.display.float_format = lambda x: '%.1f' % x

import seaborn as sns
# Default figure size (width, height in inches) for the plots drawn below;
# the style is set after sns.set() so that it is applied on top of it.
sns.set(rc={"figure.figsize": (6, 0.75)})
sns.set_style("white")

from dist_tools import *

In [2]:
# Keys identifying each analysis: eleven states by postal code, plus two
# variant analyses ("NC_race", "TX_senate").
pol_states = [
    'FL', 'IL', 'LA', 'MD', 'MN', 'NC', 'PA', 'TN', 'TX', 'VA', 'WI',
    "NC_race",
    "TX_senate",
]

# Short method key -> label used in the tables below.  Insertion order is
# significant: later cells iterate over this dictionary.
method_full = {
    "axis_ratio"  : "Axis Ratio",
    "dist_a"      : "Areal Distance",
    "dist_p"      : "Population Distance",
    "dyn_radius"  : "Dynamic Radius",
    "ehrenburg"   : "Inscribed Circles",
    "exchange"    : "Exchange",
    "harm_radius" : "Harmonic Radius",
    "hull_a"      : "Hull Area",
    "hull_p"      : "Hull Population",
    "inertia_a"   : "Inertia Area",
    "inertia_p"   : "Inertia Population",
    "mean_radius" : "Mean Radius",
    "polsby"      : "Isoperimeter Quotient",
    "power"       : "Power Diagram",
    "reock"       : "Circumscribing Circles",
    "rohrbach"    : "Distance to Perimeter",
    "split"       : "Split-Line",
    "path_frac"   : "Path Fraction",
    # Enacted maps, keyed by the Congress they were used for.
    "107"         : "107th Congress",
    "111"         : "111th Congress",
    "114"         : "114th Congress",
}

In [3]:
# Refresh the cached state table from the database when possible, then load
# the per-state seat counts and names from the CSV cache.
#
# `cen_con` is not defined in this notebook (presumably it comes from
# `dist_tools` when a database is available -- TODO confirm).  Without it the
# query fails and the existing data/states.csv is used as-is.
try: # Removing dependence on database

    seats = pd.read_sql("select usps, fips, name, epsg, seats from states where fips < 57 order by usps;", con = cen_con)
    seats.to_csv("data/states.csv", index = False)

except Exception:
    # Deliberate best effort: any failure to reach the database falls back to
    # the cached CSV.  `Exception` (rather than a bare `except`) so that
    # KeyboardInterrupt / SystemExit still propagate.
    pass

# Parse the cache once and derive both lookups (keyed by USPS code) from it.
state_table = pd.read_csv("data/states.csv", index_col = "usps")
seats = state_table["seats"].to_dict()
state_names = state_table["name"].to_dict()

## Load the data

In [4]:
# Accumulators, indexed [state][method]:
#   rf -> flat list of the "RepFrac" values of every retained map
#   ds -> {election key: ["DemSeats" value of each retained map]}
rf = {s : {m : [] for m in method_full} for s in pol_states}
ds = {s : {m : {} for m in method_full} for s in pol_states}

jdir = "/media/jsaxon/brobdingnag/data/c4_redux/"
for s in pol_states:

    is_texas = "TX" in s

    # One JSON document per line, one line per simulated map.
    with open(jdir + "/{}_redux.json".format(s.lower())) as fi:
        for line in fi:

            j = json.loads(line)

            # The method key is the second "/"-separated field of the UID.
            m = j["UID"].split("/")[1]

            # Largest population deviation tolerated for this state/method;
            # the limits are looser for Texas keys and for "axis_ratio".
            if is_texas: max_dev = 0.10 if m == "axis_ratio" else 0.05
            else:        max_dev = 0.05 if m == "axis_ratio" else 0.021

            if j["PopulationDeviation"] > max_dev: continue

            rf[s][m].extend(j["RepFrac"])

            for ey, es in j["DemSeats"].items():
                ds[s][m].setdefault(ey, []).append(es)

## Vote Share

In [5]:
# For each state in ST, plot the Republican vote-share distribution of every
# simulated method and tabulate the expected number of competitive seats
# (vote share within 0.025 of 0.5), alongside a placeholder for the figure.
competitive_dfs = []
ST = ["IL", "MD", "NC", "TX"]

for s in ST:

    print(s, end = " ")

    competitive = []
    for m in rf[s]:

        # Keys containing "1" are the enacted maps (107/111/114): skip them.
        if "1" in m: continue

        plot_share(rf, s, m, seats[s], True, True)

        shares = rf[s][m]
        n_close = sum(1 for v in shares if math.fabs(v - 0.5) < 0.025)

        # Fraction of competitive district results, scaled to the state's seats.
        competitive.append([m, (seats[s] / len(shares)) * n_close,
                            "XZcomp_hist/{}_{}ZX".format(s.lower(), m)])

    cdf = pd.DataFrame(competitive, columns = ["Method", "# Comp.", "Figure"])
    cdf = cdf.replace({"Method" : method_full}).set_index(["Method"])

    competitive_dfs.append(cdf)

# Side-by-side per-state frames, relabelled as (state, column) pairs.
cdf = pd.concat(competitive_dfs, axis = 1)
cdf.columns = pd.MultiIndex.from_product([ST, ["# Comp.", "Rep. Vote Share"]])

IL MD NC TX 

In [6]:
# Write the table to disk first (the CSV keeps the "Method" index label),
# then drop the label and order the rows by method name for display.
cdf.to_csv("data/competitive.csv")

cdf.index.name = None
cdf = cdf.sort_index()

cdf

Unnamed: 0_level_0,IL,IL,MD,MD,NC,NC,TX,TX
Unnamed: 0_level_1,# Comp.,Rep. Vote Share,# Comp.,Rep. Vote Share,# Comp.,Rep. Vote Share,# Comp.,Rep. Vote Share
Areal Distance,1.9,XZcomp_hist/il_dist_aZX,0.3,XZcomp_hist/md_dist_aZX,2.2,XZcomp_hist/nc_dist_aZX,3.0,XZcomp_hist/tx_dist_aZX
Axis Ratio,1.5,XZcomp_hist/il_axis_ratioZX,0.2,XZcomp_hist/md_axis_ratioZX,2.9,XZcomp_hist/nc_axis_ratioZX,3.0,XZcomp_hist/tx_axis_ratioZX
Circumscribing Circles,1.7,XZcomp_hist/il_reockZX,0.3,XZcomp_hist/md_reockZX,2.3,XZcomp_hist/nc_reockZX,2.8,XZcomp_hist/tx_reockZX
Distance to Perimeter,1.7,XZcomp_hist/il_rohrbachZX,0.3,XZcomp_hist/md_rohrbachZX,2.3,XZcomp_hist/nc_rohrbachZX,3.0,XZcomp_hist/tx_rohrbachZX
Dynamic Radius,1.9,XZcomp_hist/il_dyn_radiusZX,0.4,XZcomp_hist/md_dyn_radiusZX,2.3,XZcomp_hist/nc_dyn_radiusZX,3.0,XZcomp_hist/tx_dyn_radiusZX
Exchange,1.8,XZcomp_hist/il_exchangeZX,0.3,XZcomp_hist/md_exchangeZX,2.3,XZcomp_hist/nc_exchangeZX,2.9,XZcomp_hist/tx_exchangeZX
Harmonic Radius,1.7,XZcomp_hist/il_harm_radiusZX,0.3,XZcomp_hist/md_harm_radiusZX,2.3,XZcomp_hist/nc_harm_radiusZX,2.9,XZcomp_hist/tx_harm_radiusZX
Hull Area,1.6,XZcomp_hist/il_hull_aZX,0.3,XZcomp_hist/md_hull_aZX,2.6,XZcomp_hist/nc_hull_aZX,2.9,XZcomp_hist/tx_hull_aZX
Hull Population,1.7,XZcomp_hist/il_hull_pZX,0.4,XZcomp_hist/md_hull_pZX,2.4,XZcomp_hist/nc_hull_pZX,3.1,XZcomp_hist/tx_hull_pZX
Inertia Area,2.0,XZcomp_hist/il_inertia_aZX,0.3,XZcomp_hist/md_inertia_aZX,2.3,XZcomp_hist/nc_inertia_aZX,2.9,XZcomp_hist/tx_inertia_aZX


In [7]:
# Render the competitiveness table as LaTeX and post-process the markup:
# state headers, font sizes, figure includes, rules, caption and label.
#
# All LaTeX backslashes are written as "\\" (or inside raw strings) so that no
# literal relies on Python passing an invalid escape sequence through.
tex = cdf.to_latex(na_rep = "", column_format = "l" + "c" * 2 * len(ST),
                   multicolumn_format = "c")

caption = """
The vote shares accruing to Republicans are plotted for
  all districts of each map, and for all available elections,
  leading to one distribution for each state and method.
The consistency in the shapes of the distributions across methods
  suggests that the many methods do not differ in their treatment of the two parties.
The different shapes for the four states shows the impact of
  political geography on partisan representation.
Republican vote shares in excess of 0.5 correspond to Republican wins;
  the integral up to 0.5 corresponds to the Democratic seat share,
  as shown for Pennsylvania in Figure~\\ref{tab:PA_seats}.
The part of the distribution close to 0.5 are competitive races.
To the left of each distribution,
  I tabulate the number of competitive races, calculated as the integral of the vote share 
  distribution between 0.475 and 0.525.
As for seat shares, the level of competitiveness is quite consistent across measures.
"""


# Two-letter state header cells become two-column headers.  The backslashes in
# the replacement template are doubled: since Python 3.7 an unknown escape
# such as "\m" or "\s" in a re.sub() template raises re.error.
tex = re.sub(r" *([A-Y]{2}) \& *", r" \\multicolumn{2}{c}{ \\selectfont \1} ", tex)

tex = tex.replace("Comp", "\\fontsize{8.5}{12}\\selectfont Comp")
tex = tex.replace("Rep", "\\fontsize{8.5}{12}\\selectfont Rep")

# Expand postal codes to full state names (done before the caption is added).
for k, v in state_names.items():
    tex = tex.replace(k, v)

# "XZ...ZX" placeholders mark figure paths; undo to_latex's underscore escaping.
tex = tex.replace("XZ", "\\includegraphics[width=7em]{")
tex = tex.replace("\\_", "_")
tex = tex.replace("ZX", "}")

# Plain substring replacement: "None" is a literal, so no regex is needed.
tex = tex.replace("None", "\\includegraphics[width=7em]{mini_hist/blank}")

# Swap the booktabs rules for plain \hline rules.
tex = tex.replace("toprule", "hline \\hline")
tex = tex.replace("\\midrule", "\\hline")
tex = tex.replace("\\bottomrule", " \\hline \\hline")

tex = tex + "\\caption{" + caption + "}"
tex = tex + "\\label{tab:competitiveness}"

tex = "\n\\begin{table}\n\\renewcommand{\\arraystretch}{1.3}\n " + tex + "\n\\end{table}\n "

# Raise the text cells of the Split-Line row.  The decimal point is escaped so
# that only numbers of the form d.d are wrapped.
tex = re.sub(r"(Split-Line)(.*)([0-9]\.[0-9])(.*)([0-9]\.[0-9])(.*)([0-9]\.[0-9])(.*)([0-9]\.[0-9])",
             r"\\raisebox{0.7em}{\1} \2 \\raisebox{0.7em}{\3} \4 \\raisebox{0.7em}{\5} \6 \\raisebox{0.7em}{\7} \8 \\raisebox{0.5em}{\9}", tex)

with open("tex/competitiveness_table.tex", "w") as o:
    o.write(tex)

# Seat Shares

## Start by retrieving the seat share (merging the votes) for the enacted maps.

In [8]:
# Seat shares of the enacted maps: read from the cached CSV (active branch),
# or recompute them with cdmap_seats (disabled branch kept for reference).
if True:

    enacted = pd.read_csv("data/enacted_seat_shares.csv")

    # One row per (state, enacted map, election year); each value is stored
    # as a one-element list, in the same layout as the simulated maps.
    for state, method, year, n_seats in zip(enacted.State, enacted.Method,
                                            enacted.Year, enacted.Seats):
        ds[state][str(method)][str(year)] = [n_seats]

else:

    for s in pol_states:

        usps = s.split("_")[0]

        # Only the senate analysis passes a vote file.
        vote_file = s if "senate" in s else None

        print(s, ":", end = " ")
        for ssn in [107, 111, 114]:

            print(ssn, end = " ")

            # Scale each returned share by the state's seat count.
            for y, dshare in cdmap_seats(ssn, usps, vote_file).items():
                ds[s][str(ssn)][str(y)] = [dshare * seats[usps]]

        print()
        

In [9]:
# Draw one small seat histogram per (state, election year, method), saving each
# with and without x tick labels, and record the mean seat count in df_list.
sns.set(rc={"figure.figsize": (2.5, 0.5)})
sns.set_style("white")

df_list = []

for s in pol_states:

    print(s, end = " ")

    # Every election key for which at least one method has data in this state.
    years = {y for k in ds[s] for y in ds[s][k]}

    for y in years:

        # Seat counts pooled over all methods fix a common x range for the year.
        pooled = [v for m in method_full if y in ds[s][m] for v in ds[s][m][y]]
        min_seats, max_seats = int(min(pooled)), int(max(pooled))
        bins = np.arange(min_seats-0.5, max_seats+0.6)

        for m in method_full:

            if y not in ds[s][m]: continue

            f, ax = plt.subplots(1, sharex=True, sharey=True)

            dseats = sorted(ds[s][m][y])
            n_maps = len(dseats)

            if n_maps > 1:

                # Integer seat values spanned by the central 10%-90% of maps.
                central = set(dseats[round(n_maps*0.1):round(n_maps*0.9)])
                seats90 = list(np.arange(min(central), max(central)+0.1))

                ## Silence the seaborn/matplotlib warning about kde keyword deprecation.
                with warnings.catch_warnings():

                    warnings.simplefilter("ignore")

                    # Each sample list is doubled (with weights of 0.5) because of
                    # a seaborn bug: https://stackoverflow.com/questions/42404074/
                    sns.distplot(seats90 * 2, ax = ax, bins = bins, kde = False,
                                 hist_kws = {"alpha" : 0.1, "color" : "black",
                                             "weights" : [0.5] * len(seats90) * 2})

                    sns.distplot(dseats * 2, ax = ax, bins = bins, norm_hist = True, kde = False,
                                 hist_kws={"alpha" : 1.0, "color" : "#4DAFFF",
                                           "weights" : [0.5] * len(dseats) * 2})

            if dseats:

                # Vertical line at the mean: red for keys starting with "1"
                # (the enacted maps), black otherwise.
                avg_seats = sum(dseats) / len(dseats)
                line_color = "r" if m[0] == "1" else "k"
                ax.plot([avg_seats, avg_seats], [0, 1], linewidth = 3, linestyle = "solid",
                        c = line_color)

                df_list.append([s, m, y, avg_seats])

            sns.despine(left = True)
            ax.set_xlim(bins[0], bins[-1])
            ax.set_ylim(0, 1)
            ax.set_yticks([])

            # First save the version with x tick labels ...
            ax.set_xticks(np.arange(min_seats, max_seats + 0.1))
            f.savefig("mini_hist/{}_{}_{}_ax.pdf".format(s, y, m), bbox_inches='tight', pad_inches=0)

            # ... then the version without.
            ax.set_xticks([])
            f.savefig("mini_hist/{}_{}_{}.pdf".format(s, y, m), bbox_inches='tight', pad_inches=0)

            plt.close('all')


FL IL LA MD MN NC PA TN TX VA WI NC_race TX_senate 

In [10]:
# Build one LaTeX table per state: rows are methods, and each election year
# contributes a mean seat count plus a mini-histogram figure.
df = pd.DataFrame(data = df_list, columns = ["State", "Method", "Year", "Seats"])
df["Year"] = df["Year"].astype(int)
df.sort_values(by = ["State", "Method", "Year"], inplace = True)

# Pivot to columns of (State, Year, "Seats" | "file"); "file" holds an
# "XZ...ZX" placeholder that is expanded to \includegraphics below.
test = df.set_index(["Method", "State", "Year"])
test["file"] = test.index.map(lambda x : "XZ{}_{}_{}ZX".format(x[1], x[2], x[0]))
test = test.unstack(level = [1, 2])
test = test.reorder_levels([1, 2, 0], axis=1)
test.index = pd.Series(test.index).map(method_full)
test.sort_index(level = [0, 1], axis=1, inplace=True)
test.sort_index(inplace = True)


# str.format template: "{{" / "}}" are literal braces, "{}" takes the state
# name and then its seat count.  LaTeX backslashes are escaped explicitly.
caption = """\\caption{{Votes from presidential elections in {} are aggregated from precinct-level returns, into maps simulated with each algorithm or compactness metric. 
             The seats expected to accrue to Democrats (mean across maps) are displayed numerically as well as by a solid black line.
             The normalized distribution of seats per metric/algorithm is shown in blue and the 10-90\\% range of possible seats is highlighted in gray.
             The same re-aggregation is performed for enacted maps used for the 107th, 111th, and 114th Congresses and shown in red.
             Since reapportionment shifts the number of seats per state,
               the entries for the 107th and 111th Congresses are the Democratic share,
               times the {} assigned after the 2010 Census.
             }}"""

for s in pol_states:

    print(s, end = " ")

    usps = s.split("_")[0]

    years = sorted(list(df[df.State == s].Year.unique()))
    if s == "TX": years = [2004, 2008, 2012, 2016]
    if s == "TX_presidential": years = [2004, 2008, 2012, 2016]
    if s == "TX_senate": years = [2008, 2012, 2014, 2018]


    # Methods with no figure for a year get the blank placeholder.  A single
    # .loc assignment is used because chained indexing
    # (test[s][y]["file"][mask] = ...) writes to a temporary copy.
    for y in years:
        file_col = (s, y, "file")
        test.loc[test[file_col].isnull(), file_col] = "XZblankZX"

    table = test[s][years].to_latex(column_format = "l" + " rm{7em}" * len(years))
    table = table.replace("XZ", "\\includegraphics[width=7em]{mini_hist/")
    table = table.replace("\\_", "_")
    table = table.replace("ZX", "}")

    # Replace the "Seats & file" header pairs, left to right, with one
    # two-column header per year.  A callable replacement is used so that the
    # LaTeX backslash is not parsed as a regex-template escape (an unknown
    # escape such as "\m" raises re.error since Python 3.7).
    contest = "US Senate" if "senate" in s else "Presidential"
    for y in years:
        header = "\\multicolumn{{2}}{{c}}{{{} {}}}".format(y, contest)
        table = re.sub(r"Seats & *file", lambda match, header = header: header, table, count = 1)

    # Drop the remaining header rows and fill empty cells.
    table = re.sub("Year.*", "", table)
    table = re.sub("Method.*", "", table)
    table = table.replace("nan", "")
    table = table.replace("None", "\\includegraphics[width=7em]{mini_hist/blank}")

    # Swap the booktabs rules for \hline rules, and add a rule before the
    # "Areal Distance" row.
    table = table.replace("toprule", "hline \\hline \\\\")
    table = table.replace("\\midrule", "\\\\ \\hline \\\\")
    table = table.replace("bottomrule", "hline \\hline")
    table = table.replace("Areal Distance", "\\\\ \\hline \\\\ \nAreal Distance")

    # Point these rows at the "_ax" figure variants (those saved with x ticks).
    table = table.replace("split", "split_ax")
    if "race" in s: table = table.replace("power", "power_ax")

    table = table + caption.format(state_names[usps], seats[usps])
    table = table + "\\label{{tab:{}_seats}}".format(s)
    table = "\n\\begin{table}\n\\renewcommand{\\arraystretch}{0.7}\n " + table + "\n\\end{table}\n "

    with open("tex/{}_table.tex".format(s), "w") as o: o.write(table)

# Cache the rows whose method key contains "1" (the enacted maps).
df[df.Method.str.contains("1")].reset_index(drop = True).to_csv("data/enacted_seat_shares.csv", index = False)

FL IL LA MD MN NC PA TN TX VA WI NC_race TX_senate 

In [11]:
# Display the Texas columns of the assembled table for four election years.
tx_years = [2004, 2008, 2012, 2016]
test["TX"][tx_years]

Year,2004,2004,2008,2008,2012,2012,2016,2016
Unnamed: 0_level_1,Seats,file,Seats,file,Seats,file,Seats,file
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
107th Congress,10.8,XZTX_2004_107ZX,14.4,XZTX_2008_107ZX,13.2,XZTX_2012_107ZX,15.6,XZTX_2016_107ZX
111th Congress,7.9,XZTX_2004_111ZX,12.4,XZTX_2008_111ZX,11.2,XZTX_2012_111ZX,15.8,XZTX_2016_111ZX
114th Congress,8.0,XZTX_2004_114ZX,12.0,XZTX_2008_114ZX,11.0,XZTX_2012_114ZX,14.0,XZTX_2016_114ZX
Areal Distance,8.3,XZTX_2004_dist_aZX,12.2,XZTX_2008_dist_aZX,11.6,XZTX_2012_dist_aZX,15.0,XZTX_2016_dist_aZX
Axis Ratio,8.0,XZTX_2004_axis_ratioZX,12.4,XZTX_2008_axis_ratioZX,11.6,XZTX_2012_axis_ratioZX,14.9,XZTX_2016_axis_ratioZX
Circumscribing Circles,8.2,XZTX_2004_reockZX,12.4,XZTX_2008_reockZX,11.6,XZTX_2012_reockZX,15.0,XZTX_2016_reockZX
Distance to Perimeter,8.3,XZTX_2004_rohrbachZX,12.3,XZTX_2008_rohrbachZX,11.6,XZTX_2012_rohrbachZX,14.6,XZTX_2016_rohrbachZX
Dynamic Radius,8.2,XZTX_2004_dyn_radiusZX,12.6,XZTX_2008_dyn_radiusZX,11.9,XZTX_2012_dyn_radiusZX,15.4,XZTX_2016_dyn_radiusZX
Exchange,8.2,XZTX_2004_exchangeZX,12.6,XZTX_2008_exchangeZX,11.9,XZTX_2012_exchangeZX,15.2,XZTX_2016_exchangeZX
Harmonic Radius,8.2,XZTX_2004_harm_radiusZX,12.4,XZTX_2008_harm_radiusZX,11.8,XZTX_2012_harm_radiusZX,15.1,XZTX_2016_harm_radiusZX


In [12]:
# Tabulate the number of retained simulated maps per state and method.
N = []
for s in ds:
    for m in ds[s]:

        # Skip the enacted maps (keys containing "1") and the split-line method.
        if "1" in m: continue
        if "split" in m: continue

        # The count is the length of the first election's list (presumably
        # every election lists the same maps -- TODO confirm).  A method with
        # no retained maps has no elections and is reported as 0 rather than
        # raising IndexError.
        first_year_seats = next(iter(ds[s][m].values()), [])
        N.append([s, m, len(first_year_seats)])

df = pd.DataFrame(N, columns = ["State", "Method", "N"]).set_index(["State", "Method"]).unstack(0)

# Keep only the state level of the ("N", state) column labels.
# get_level_values() returns the label of each column in column order, whereas
# `.levels[1]` returns the sorted unique values and only matches by coincidence.
df.columns = df.columns.get_level_values(1)
df.index.name = None
df.columns.name = None
df.index = pd.Series(df.index).map(method_full)
df.sort_index(inplace = True)

df

Unnamed: 0,FL,IL,LA,MD,MN,NC,NC_race,PA,TN,TX,TX_senate,VA,WI
Areal Distance,1520,1611,1597,1589,1614,1603,726,1618,1547,1442,1442,1332,1579
Axis Ratio,29,411,1618,1610,1587,1448,349,594,1597,133,133,1500,1551
Circumscribing Circles,133,892,1385,1275,1362,1057,405,636,947,104,104,936,1228
Distance to Perimeter,425,1534,1598,1603,1531,1601,669,1592,1576,48,48,1597,1586
Dynamic Radius,1355,1609,1613,1619,1620,1618,767,1620,1619,1506,1506,1620,1613
Exchange,173,1388,1618,1620,1610,1615,657,1599,1617,740,740,1601,1618
Harmonic Radius,551,1550,1619,1620,1620,1615,1168,1601,1616,1121,1121,1619,1610
Hull Area,195,1005,1554,1572,1523,1465,706,899,1302,11,11,1049,1342
Hull Population,906,1615,1616,1604,1619,1620,763,1620,1614,1115,1115,1616,1618
Inertia Area,581,1596,1530,1597,1618,1616,761,1605,1181,1301,1301,1472,1562
