## 2018-2022 Nationwide election results, by precinct

This pulls nationwide precinct level results from 2018, 2020, and 2022 from the
MIT Election Science Lab, from several different files:
- 2018 State: https://doi.org/10.7910/DVN/ZFXEJU
- 2018 US Senate: https://doi.org/10.7910/DVN/DGNAFS
- 2018 US House: https://doi.org/10.7910/DVN/IVIXLK
- 2020 State: https://doi.org/10.7910/DVN/OKL2K1
- 2020 US House: https://doi.org/10.7910/DVN/VLGF2M
- 2020 US Senate: https://doi.org/10.7910/DVN/ER9XTV
- 2020 President: https://doi.org/10.7910/DVN/JXPREB
- 2022 All Levels: https://github.com/MEDSL/2022-elections-official/tree/ad541ea94cb6697c0375a8cffe8e91b60372f50b/individual_states
   not published on dataverse yet, download from github.

I tried to use [State Precinct-Level Returns 2016](https://doi.org/10.7910/DVN/GSZG1O),
but this is in a different format from the later ones, and can't be parsed by duckdb:
https://github.com/duckdb/duckdb/issues/10929

I tried for a bit to use the [programmatic API](https://guides.dataverse.org/en/5.5/api/dataaccess.html#download-by-dataset-api)
to download the files, but it wasn't working, so I just gave up.

It also does some additional cleanup of that raw data, eg
- normalize date formats and fips codes.
- normalize "APPROVE" and "FOR" to a simple "YES"
- fill in NULLs when it seems correct to do so

Apart from removing exact duplicate rows, I don't drop any rows or otherwise lose information.
    

In [1]:
from pathlib import Path

import ibis
from ibis import _
from election_data import tests

# Turn on ibis interactive mode so that expressions left as the last line of a
# cell are evaluated and rendered as results (as in the outputs below).
ibis.options.interactive = True

In [2]:
def parse_date(s):
    """Turn a string expression holding a date into a ``date`` expression.

    Some raw dates are written "mm/dd/yy" or "mm/dd/yyyy" instead of
    "yyyy-mm-dd": https://github.com/MEDSL/2022-elections-official/issues/20
    Those are rewritten to "20yy-mm-dd" before the cast, so only years in the
    2000s are handled. Strings without a slash-separated date are cast as-is.
    """
    slash_date = r"(\d\d?)/(\d\d?)/(20)?(\d\d)"
    # \1 = month, \2 = day, \4 = last two digits of the year
    iso_date = r"20\4-\1-\2"
    return s.re_replace(slash_date, iso_date).cast("date")


# Sanity checks: slash-separated dates with 2- or 4-digit years are normalized,
# and ISO dates pass through unchanged.
# (Loop names are prefixed so they cannot clobber the ibis `_` placeholder.)
for _raw, _expected in [
    ("01/02/20", "2020-01-02"),
    ("11/8/22", "2022-11-08"),
    ("01/02/2020", "2020-01-02"),
    ("2020-01-02", "2020-01-02"),
]:
    assert parse_date(ibis.literal(_raw)).cast(str).execute() == _expected


def parse_fips5(s):
    """Normalize a FIPS string expression to a 5-character code.

    The placeholders "NAN" and "NA" become NULL, guarding against
    https://github.com/MEDSL/2022-elections-official/issues/21
    The remaining values go through an int round-trip, are truncated to their
    first five characters, and four-digit results are left-padded with "0".

    NOTE(review): the int round-trip drops leading zeros, so a zero-led code
    longer than five digits would be truncated to the wrong five characters.
    Confirm whether such values occur in the raw data.
    """
    without_placeholders = s.nullif("NAN").nullif("NA")
    digits = without_placeholders.cast(int).cast(str)
    first_five = digits[:5]
    # restore the leading zero that the int cast removed from 5-digit codes
    return first_five.re_replace(r"^(\d{4})$", r"0\1")


# Sanity checks: short codes are zero-padded, long codes are truncated...
for _raw, _expected in [
    ("1234", "01234"),
    ("12345", "12345"),
    ("12345678", "12345"),
]:
    assert parse_fips5(ibis.literal(_raw)).execute() == _expected

# ...and placeholders / real NULLs all come out as NULL.
for _null_like in [
    ibis.literal("NA"),
    ibis.literal("NAN"),
    ibis.literal(None, type=str),
]:
    assert parse_fips5(_null_like).execute() is None

In [3]:
# Target column names and types for the cleaned table.
# `to_schema` selects exactly these columns (in this order) and casts to these
# types, so raw columns that are not listed here are dropped.
schema = {
    "year": "int16",
    "date": "date",
    "state_po": "string",
    # All of the following are duplicate info from state_po that just take up space.
    # You can look these up pretty easily if you need them.
    # "state": "string",
    # "state_fips": "string",
    # "state_cen": "string",
    # "state_ic": "string",
    "county_name": "string",
    "county_fips": "string",
    "jurisdiction_name": "string",
    "jurisdiction_fips": "string",
    "district": "string",
    "office": "string",
    "magnitude": "int",  # the number of seats available for this office
    "special": "boolean",
    "stage": "string",
    "precinct": "string",
    "writein": "boolean",
    "candidate": "string",
    "party_detailed": "string",
    "party_simplified": "string",
    "mode": "string",  # how the votes were cast, eg "EARLY", "ABSENTEE", etc
    # can't cast to int because some rows contain "*":
    # https://github.com/MEDSL/2022-elections-official/blob/ad541ea94cb6697c0375a8cffe8e91b60372f50b/README.md
    # 'Per the Nevada Secretary of State, "*" in the Votes column indicates low
    # turnout and is hidden to protect voter privacy.'
    "votes": "string",
    # "dataverse": "string", # one of {"STATE", "PRESIDENT", "SENATE", "HOUSE", None}
    "readme_check": "boolean",
}


def to_schema(t):
    """Project a raw table onto the columns in ``schema`` and cast their types.

    The date and both FIPS columns are cleaned with ``parse_date`` and
    ``parse_fips5`` before the final cast.
    """
    selected = t[schema.keys()]
    cleaned = selected.mutate(
        date=parse_date(selected.date),
        county_fips=parse_fips5(selected.county_fips),
        jurisdiction_fips=parse_fips5(selected.jurisdiction_fips),
    )
    return cleaned.cast(schema)


def read_raw(p):
    """Load one raw CSV at path ``p`` and coerce it to the shared schema.

    Every column is read as a string (``all_varchar=True``); ``to_schema``
    does the parsing and casting afterwards.
    """
    # The quote character is set explicitly to get around
    # https://github.com/duckdb/duckdb/issues/11838
    raw = ibis.read_csv(p, all_varchar=True, quote='"')
    return to_schema(raw)

In [4]:
import zipfile

# Root folder for the downloaded inputs and the cleaned output; the path is
# relative, so it is resolved against the current working directory.
DATA_DIR = Path("../data")


def unzip_csvs(zip_path: Path, dest: Path) -> Path:
    """Extract every ``.csv`` member of a zip archive into ``dest``.

    Members that do not end in ".csv" are skipped. Extracted members keep
    their path inside the archive, so a CSV stored in a subfolder of the zip
    lands in the same subfolder under ``dest``.

    Args:
        zip_path: The zip archive to read.
        dest: Directory to extract into.

    Returns:
        ``dest``, as promised by the return annotation (the function
        previously fell off the end and returned ``None``).
    """
    with zipfile.ZipFile(zip_path, "r") as z:
        csvs = [f for f in z.namelist() if f.endswith(".csv")]
        z.extractall(dest, csvs)
    return dest


def get_github_csvs() -> list[Path]:
    """Return the sorted per-state 2022 CSV paths, unzipping them if needed.

    The zips in ``DATA_DIR / "github_2022"`` are extracted only when the
    ``csvs`` subfolder does not exist yet. Exactly 51 CSVs are expected.
    """
    csv_dir = DATA_DIR / "github_2022/csvs"
    already_extracted = csv_dir.exists()
    if not already_extracted:
        for archive in (DATA_DIR / "github_2022").glob("*.zip"):
            unzip_csvs(archive, csv_dir)
    found = sorted(csv_dir.glob("*.csv"))
    assert len(found) == 51, found  # 50 states + DC
    return found


def read_csvs():
    """Read every raw CSV (2022 GitHub files first, then dataverse) into one table.

    Each file is loaded with ``read_raw`` and the per-file tables are combined
    with ``ibis.union``.
    """
    dataverse_dirs = [
        "dataverse_2018_state",
        "dataverse_2018_us_house",
        "dataverse_2018_us_senate",
        "dataverse_2020_state",
        "dataverse_2020_us_house",
        "dataverse_2020_us_senate",
        "dataverse_2020_president",
    ]
    paths = list(get_github_csvs())
    for dirname in dataverse_dirs:
        paths.extend((DATA_DIR / dirname).glob("*.csv"))
    return ibis.union(*[read_raw(p) for p in paths])


# Load everything, remove exact duplicate rows, and cache the result so the
# CSVs are not re-read by every later cell.
t = read_csvs().distinct().cache()
print(t.count())
t

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌──────────┐
│ [1;36m34263636[0m │
└──────────┘


## Fixup Candidates

In [5]:
# Look at the 100 most frequent candidate values to spot spellings worth normalizing.
t.candidate.topk(100).preview(max_rows=100)

In [6]:
# Canonical pseudo-candidate label -> the raw spellings that collapse into it.
# NOTE(review): the notebook intro mentions normalizing "APPROVE" as well, but
# there is no such key here — confirm whether it is needed.
_cand_variants = {
    "YES": ["FOR"],
    "NO": ["AGAINST"],
    "UNDERVOTES": ["UNDER VOTES"],
    "OVERVOTES": ["OVER VOTES"],
    "WRITEIN": [
        "WRITE-IN",
        "WRITE-IN VOTES",
        "WRITE IN",
        "WRITEIN VOTES",
    ],
    "OTHER": [
        "NONE OF THESE CANDIDATES",
        "OTHER WRITE-INS",
    ],
    "BLANK": [
        "BLANK BALLOTS",
        "TIMES BLANK VOTED",
        "BLANK VOTES",
        "BLANKS",
        "BLANK/VOID",
    ],
    "SCATTERING": [
        "SCATTERING",
        "SCATTERED VOTES",
    ],
    "FEDERAL": ["FEDERAL BALLOTS"],
    "REJECTED": ["NOT QUALIFIED"],
}
# Flatten to {raw spelling: canonical label} for `substitute`.
cand_mapping = {
    variant: canon
    for canon, variants in _cand_variants.items()
    for variant in variants
}
t = t.mutate(candidate=t.candidate.substitute(cand_mapping))

## Fixup Office

In [7]:
# Canonical office name -> every spelling of that office to fold into it.
office_mapping = {
    "US PRESIDENT": [
        "US PRESIDENT",
    ],
    "US SENATE": [
        "US SENATE",
    ],
    "US HOUSE": [
        "US HOUSE",
        "REPRESENTATIVE IN CONGRESS",
        "DELEGATE TO THE U.S. HOUSE OF REPRESENTATIVE",
        "DELEGATE TO THE US HOUSE",
        "DELEGATE TO THE HOUSE OF REPRESENTATIVES",
    ],
    "GOVERNOR": [
        "GOVERNOR",
        "GOVERNOR/LIEUTENANT GOVERNOR",
        "GOVERNOR AND LIEUTENANT GOVERNOR",
    ],
    "LIEUTENANT GOVERNOR": [
        "LIEUTENANT GOVERNOR",
    ],
    # NOTE(review): "STATE ATTORNEY" may denote a local prosecutor rather than
    # the attorney general in some states — confirm this fold is intended.
    "ATTORNEY GENERAL": [
        "ATTORNEY GENERAL",
        "STATE ATTORNEY",
        "STATE ATTORNEY GENERAL",  # 39 States elected an Attorney General in this period
    ],
    "SECRETARY OF STATE": [
        "SECRETARY OF STATE",  # 32 States elected a Secretary of State in this period
    ],
    "STATE TREASURER": ["STATE TREASURER", "TREASURER"],
    "AUDITOR OF STATE": ["STATE AUDITOR", "AUDITOR OF STATE"],
    "STATE SENATE": [
        "STATE SENATE",
        "STATE SENATOR",
    ],
    "STATE HOUSE": [
        "STATE HOUSE",
        "STATE REPRESENTATIVE",
        "HOUSE OF DELEGATES",
    ],
}
# Invert to {raw spelling: canonical name}, the shape `substitute` expects.
office_mapping_flat = {
    appearance: canon
    for canon, appearances in office_mapping.items()
    for appearance in appearances
}
t = t.mutate(office=t.office.substitute(office_mapping_flat))

## Fixup District

In [8]:
# Offices treated as statewide contests in the following cell.
is_statewide = _.office.isin(["GOVERNOR", "US SENATE", "US PRESIDENT"])

# Upper-case and trim the district, and turn the literal text "NULL" into a real NULL.
t = t.mutate(
    district=_.district.upper().strip().nullif("NULL").name("district"),
)

# Inspect which district values the statewide offices currently carry.
(
    t.filter(is_statewide)
    .group_by("office", "district")
    .agg(n=_.count())
    .order_by(_.office, _.n.asc())
    .preview(max_rows=40)
)

In [9]:
# Statewide offices get the district "STATEWIDE"; afterwards any "AT-LARGE"
# text in the district is also replaced with "STATEWIDE".
t = t.mutate(
    district=is_statewide.ifelse("STATEWIDE", _.district).replace(
        "AT-LARGE", "STATEWIDE"
    ),
)
t.district.topk(20)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [10]:
# DC and AK US House rows have a NULL district but should be statewide.
is_dc_congress = ibis.and_(_.state_po == "DC", _.office == "US HOUSE")
is_ak_congress = ibis.and_(_.state_po == "AK", _.office == "US HOUSE")
is_statewide = ibis.or_(is_dc_congress, is_ak_congress)
t = t.mutate(district=is_statewide.ifelse("STATEWIDE", _.district))

## Fix Party Detailed

In [11]:
# "INDEPENDENT",
# "INDEPENDENCE",
# "NONPARTISAN",
# "NO PARTY AFFILIATION",
# "NO AFFILIATION",
# "INDEPENDENT AMERICAN PARTY",
# "INDEPENDENT AMERICAN",
# "NO PARTY",
# "INDEPENDENCE-ALLIANCE",
# "UNENROLLED",
# "INDEPENDENT PARTY",
# "INDEPENDENCE ALLIANCE",
# "NON-PARTISAN DELAWARE",
# "INDEPENDENT NOMINATION",
# "UNAFFILIATED",
# "INDEPENDENT PARTY OF OREGON",
# "NO PARTY PREFERENCE",
# "UNENROLLED INDEPENDENT",
# "COMMON SENSE INDEPENDENT",
# "INDEPENDENT FOR MAINE",
# "A TRUE INDEPENDENT",
# Collapse the different spellings of "no party" into a single label.
party_detailed_mapping = {
    "NO PARTY AFFILIATION": "UNAFFILIATED",
    "NO AFFILIATION": "UNAFFILIATED",
    "NO PARTY": "UNAFFILIATED",
}
# Upper-case and trim both party columns, and turn empty strings into NULL.
t = t.mutate(
    party_simplified=t.party_simplified.upper().strip().nullif(""),
    # Substitute with the party mapping defined just above. This used to pass
    # the candidate mapping (FOR -> YES, ...) instead, which left the party
    # mapping unused. Re-run the unmatched-party check below after this change.
    party_detailed=t.party_detailed.upper()
    .strip()
    .nullif("")
    .substitute(party_detailed_mapping),
)
t = t.cache()
t

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [12]:
from election_data import PartySimplified


def _regular(party):
    """Build the ``party_map`` entry for a party that needs no change.

    The condition matches rows whose simplified and detailed party both equal
    ``party``; the result keeps ``party`` for both columns.
    """
    both_match = ibis.and_(_.party_simplified == party, _.party_detailed == party)
    label = ibis.literal(party)
    return both_match, (label, label)


# Member names of PartySimplified: the set of allowed simplified party labels.
simples = PartySimplified.__members__.keys()

# Ordered list of (condition, (new party_simplified, new party_detailed)) rules.
# NOTE(review): the list is later fed to `cases`, so rule order presumably
# matters (first match wins) — confirm against election_data.util.cases.
party_map = [
    # Rows where both columns already hold the same standard label stay as-is.
    *[_regular(p) for p in simples],
    # Both columns blank -> both NULL.
    (
        ibis.and_(
            _.party_simplified.re_search(r"^\s*$"), _.party_detailed.re_search(r"^\s*$")
        ),
        (None, None),
    ),
    # The next rules pin specific (simplified, detailed) pairs to
    # simplified="OTHER" while keeping the detailed name.
    (
        ibis.and_(
            _.party_simplified == "DEMOCRAT", _.party_detailed == "WORKING FAMILIES"
        ),
        (ibis.literal("OTHER"), ibis.literal("WORKING FAMILIES")),
    ),
    (
        ibis.and_(
            _.party_simplified == "WORKING CLASS", _.party_detailed == "WORKING CLASS"
        ),
        (ibis.literal("OTHER"), ibis.literal("WORKING CLASS")),
    ),
    (
        ibis.and_(
            _.party_simplified == "WORKING CLASS PARTY",
            _.party_detailed == "WORKING CLASS PARTY",
        ),
        (ibis.literal("OTHER"), ibis.literal("WORKING CLASS PARTY")),
    ),
    (
        ibis.and_(
            _.party_simplified == "UNITED CITIZEN", _.party_detailed == "UNITED CITIZEN"
        ),
        (ibis.literal("OTHER"), ibis.literal("UNITED CITIZEN")),
    ),
    (
        ibis.and_(
            _.party_simplified == "CONSTITUTION", _.party_detailed == "CONSTITUTION"
        ),
        (ibis.literal("OTHER"), ibis.literal("CONSTITUTION")),
    ),
    # Already "OTHER" with a non-standard detailed name: keep both.
    (
        ibis.and_(_.party_simplified == "OTHER", _.party_detailed.notin(simples)),
        (ibis.literal("OTHER"), _.party_detailed),
    ),
    # Detailed name is itself a standard label: use it for both columns.
    (
        _.party_detailed.isin(simples),
        (_.party_detailed, _.party_detailed),
    ),
]


# Every (party_simplified, party_detailed) combination must be covered by some
# rule in party_map; list the uncovered combinations and fail if there are any.
# (The comprehension avoids reusing `_`, which is the ibis placeholder here.)
conditions = [rule[0] for rule in party_map]
not_matched = t.filter(~ibis.or_(*conditions))
not_matched_vc = (
    not_matched.select("party_simplified", "party_detailed").value_counts().cache()
)
assert (not_matched_vc.count() == 0).execute()
not_matched_vc

In [13]:
from election_data.tests import test_party
from election_data.util import cases


# Build one expression per output column from the same ordered rules:
# element 0 of each result pair feeds party_simplified, element 1 party_detailed.
simplified = cases(*[(cond, result[0]) for cond, result in party_map])
detailed = cases(*[(cond, result[1]) for cond, result in party_map])
t = t.mutate(party_simplified=simplified, party_detailed=detailed).cache()
test_party(t)
t

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

## Fix Mode

In [16]:
# Look at the 30 most frequent mode values to spot spellings worth normalizing.
t.mode.topk(30).preview(max_rows=30)

In [17]:
# Every early/advanced voting spelling collapses to the single label "EARLY".
mode_mapping = dict.fromkeys(
    ["EARLY VOTE", "EARLY VOTING", "ADVANCED VOTING", "ADVANCED"],
    "EARLY",
)
t = t.mutate(mode=_.mode.substitute(mode_mapping))

In [18]:
# Which states have the most rows with a NULL mode?
t.filter(
    _.mode.isnull(),
).state_po.topk(10)

In [19]:
def add_n_rows_per_cand(t):
    """Add ``n_per_candidate``: how many rows share this row's contest keys.

    Rows are grouped by year, state, office, district, county, precinct and
    candidate. The result is ordered with the largest counts first.
    """
    keys = [
        _.year,
        _.state_po,
        _.office,
        _.district,
        _.county_name,
        _.precinct,
        _.candidate,
    ]
    counted = t.group_by(*keys).mutate(n_per_candidate=_.count())
    return counted.order_by(_.n_per_candidate.desc())


# Show the NULL-mode rows together with their per-candidate row counts.
add_n_rows_per_cand(t).filter(_.mode.isnull())

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [20]:
# The mode=NULL rows above all look like they should be "TOTAL": each has only
# one row per candidate per precinct. If a candidate had several rows in a
# precinct, one might be the TOTAL and the others something else, but that is
# not the case here. So only fill in rows that are alone in their group.
fixable = _.mode.isnull() & (_.n_per_candidate == 1)
t = add_n_rows_per_cand(t)
t = t.mutate(mode=fixable.ifelse("TOTAL", _.mode))
t = t.drop("n_per_candidate")
t = t.cache()
# Check which states, if any, still have NULL modes.
t.filter(_.mode.isnull()).state_po.topk(10)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

## Fixup Writein

In [21]:
# A row whose candidate is the generic "WRITEIN" label is always flagged as a
# write-in; every other row keeps its existing writein value.
is_writein_candidate = _.candidate == "WRITEIN"
t = t.mutate(writein=is_writein_candidate.ifelse(True, _.writein))
tests.test_candidate(t)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

## Save to file

In [22]:
# Sort the rows, both so it's more user friendly
# and because it might compress better.
sort_keys = [
    _.year,
    _.state_po,
    _.office,
    _.county_name,
    _.district,
    _.precinct,
    _.candidate,
    _.mode,
]
t = t.order_by(*sort_keys).cache()
t

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [24]:
# Run the full election_data test suite against the cleaned table before saving.
tests.test_all(t)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [25]:
# Write the cleaned, sorted table to a single parquet file in the data folder.
t.to_parquet(DATA_DIR / "cleaned.parquet")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))