# Clean VA results for odd-numbered years

See [this MEDSL script](https://github.com/MEDSL/replication-scripts/blob/634832bc7c2df2c0a2238bc16781d07d581d3a3a/va2018.py) for previous work,
but that script isn't reproducible as it is.

In [1]:
import ibis
from pathlib import Path
import ibis_widget
import election_data as ed
from ibis import _
from ibis.expr import types as ir


# NOTE(review): presumably this registers the `.widget()` method that is called
# on tables further down — confirm against the ibis_widget package.
ibis_widget.install()
# Interactive mode: expressions are executed when they are displayed, rather
# than showing the unevaluated expression.
ibis.options.interactive = True


In [2]:
# MEDSL's previously cleaned results serve as the reference for how each
# column should look, because their replication scripts are insufficient.
medsl = ibis.read_parquet("../data/cleaned_medsl.parquet")
# Virginia-only slice, used for the column-by-column comparisons below.
medsl_va = medsl.filter(medsl.state_po == "VA")
medsl_va

In [3]:
# see https://apps.elections.virginia.gov/SBE_CSV/ELECTIONS/ELECTIONRESULTS/
# There are other races available, such as primaries.
urls = {
    2017: "https://apps.elections.virginia.gov/SBE_CSV/ELECTIONS/ELECTIONRESULTS/2017/2017%20November%20General.csv",
    2019: "https://apps.elections.virginia.gov/SBE_CSV/ELECTIONS/ELECTIONRESULTS/2019/2019%20November%20General.csv",
    2021: "https://apps.elections.virginia.gov/SBE_CSV/ELECTIONS/ELECTIONRESULTS/2021/2021%20November%20General%20.csv",
    # 2023 isn't available yet :( https://apps.elections.virginia.gov/SBE_CSV/ELECTIONS/ELECTIONRESULTS/2023/
}
paths = {year: Path(f"../data/va/{year}.csv") for year in urls}
for year, url in urls.items():
    path = paths[year]
    path.parent.mkdir(parents=True, exist_ok=True)
    if not path.exists():
        print(f"Downloading {year} results to {path}")
        !curl -o {path} {url}

In [4]:
# One table per downloaded CSV, each tagged with the year it came from.
raws = [
    ibis.read_csv(csv_path).mutate(year=election_year)
    for election_year, csv_path in paths.items()
]
# Stack all years into a single table.
raw = ibis.union(*raws)
raw.widget()

┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
┃[1m [0m[1m__row_id[0m[1m [0m┃[1m [0m[1mCandidateUid[0m[1m                          [0m[1m [0m┃[1m [0m[1mFirstName[0m[1m [0m┃[1m [0m[1mMiddleName[0m[1m [0m┃[1m [0m[1mLastName[0m[1m      [0m[1m [0m┃[1m [0m[1mSuffix[0m[1m [0m┃[1m [0m[1mTOTAL_VOTES[0m[1m [0m┃[1m [0m[1mParty[0m[1m      [0m[1m [0m┃[1m [0m[1mWriteInVote[0m[1m [0m┃[1m [0m[1mLocalityUid[0m[1m                           [0m[1m [0m┃[1m [0m[1mLocalityCode

## Easy cleaning

In [5]:
# Only general elections are expected; fail loudly if anything else slipped in.
assert (raw.ElectionType == "General").all().execute()

# Drop the election descriptors and the opaque upstream identifiers.
t = raw.drop(
    "ElectionType",
    "ElectionName",
    "CandidateUid",
    "LocalityUid",
    "LocalityCode",
    "PrecinctUid",
    "DistrictUid",
    "OfficeUid",
    "ElectionUid",
)
t = t.mutate(
    # In the larger dataset, the vote counts can be the string "*",
    # so we need the votes column to be a string to match that.
    votes=t.TOTAL_VOTES.cast(str),
    mode=ibis.literal("TOTAL"),
    date=t.ElectionDate.date(),
    readme_check=ibis.literal(False),
    special=ibis.literal(False),  # NOTE: probably wrong for some rows
    stage=ibis.literal(ed.Stage.GEN),
).drop("TOTAL_VOTES", "ElectionDate")

# Each source file should contribute exactly one election date.
assert t.date.nunique().execute() == len(urls)
t = t.cache()
t.widget()

┏━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┓
┃[1m [0m[1m__row_id[0m[1m [0m┃[1m [0m[1mFirstName[0m[1m [0m┃[1m [0m[1mMiddleName[0m[1m [0m┃[1m [0m[1mLastName[0m[1m      [0m[1m [0m┃[1m [0m[1mSuffix[0m[1m [0m┃[1m [0m[1mParty[0m[1m      [0m[1m [0m┃[1m [0m[1mWriteInVote[0m[1m [0m┃[1m [0m[1mLocalityName[0m[1m   [0m[1m [0m┃[1m [0m[1mPrecinctName[0m[1m  [0m[1m [0m┃[1m [0m[1mDistrictType[0m[1m      [0m[1m [0m┃[1m [0m[1mDistrictName[0m[1m [0m┃[1m [0m[1mOfficeTitle[0m[1m                    [0m[1m [0m┃[1m [0m[1myear[0m[1m [0m[1m [0m┃[1m [0m[1mvotes[0m[1m [0m[1m [0m┃[1m [0m[1mmode[0m[1m  [0m[1m [0m┃[1m [0m[1mdate[0m[1m      [0m[1m [0m┃[1m [0m[1mreadme_check[0m[

## fix county and county_fips

In [6]:
# NOTE(review): ed.util.compare is a project helper; presumably it contrasts
# the values of our LocalityName column with MEDSL's county_name so we can see
# what normalization is needed — confirm against its definition.
ed.util.compare(t.LocalityName, medsl_va.county_name)

In [7]:
def make_county_fips_lookup() -> ibis.Table:
    """Return the distinct (state_po, county_name, county_fips) rows of ``medsl``.

    Reads the notebook-level ``medsl`` table.
    """
    lookup_columns = ("state_po", "county_name", "county_fips")
    return medsl.select(*lookup_columns).distinct()


def add_county_fips(t: ibis.Table) -> ibis.Table:
    """Left-join the MEDSL county lookup onto ``t`` to add ``county_fips``.

    ``t`` must already have ``state_po`` and ``county_name`` columns. The
    ``*_right`` copies of the join keys are dropped from the result. Rows
    with no match in the lookup are kept.

    NOTE(review): if the lookup holds more than one ``county_fips`` for the
    same (state_po, county_name), this join would duplicate rows of ``t`` —
    confirm the key is unique upstream.
    """
    join_keys = ["state_po", "county_name"]
    joined = t.left_join(make_county_fips_lookup(), join_keys)
    return joined.drop(*(f"{key}_right" for key in join_keys))


# Normalize locality names: upper-case, spell out " & " as " AND ", and
# remove the " COUNTY" text.
t = t.mutate(
    state_po=ibis.literal("VA"),
    county_name=t.LocalityName.upper().replace(" & ", " AND ").replace(" COUNTY", ""),
)
t = t.drop("LocalityName")
t = add_county_fips(t)
# Every county must have picked up a FIPS code from the lookup.
assert t.county_fips.notnull().all().execute()

## Fix jurisdiction and jurisdiction_fips

It looks like, in the upstream data, the jurisdiction is always the same as
the county, so just reuse the county values.

Be warned: the upstream FIPS codes for jurisdictions appear to be unreliable.
For example, the "DISTRICT 12" jurisdiction in Alaska has a FIPS code of "02012",
which isn't a [real FIPS code](https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt).
FIPS codes really only go down to the county level. This DISTRICT 12
probably refers to state HD 12, even though it is filed under the
US HOUSE office. The cell below checks that, for Virginia, the upstream county and jurisdiction columns agree:

In [8]:
# Confirm that, for Virginia, MEDSL's jurisdiction columns simply repeat the
# county columns ...
assert (medsl_va["county_name"] == medsl_va["jurisdiction_name"]).all().execute()
assert (medsl_va["county_fips"] == medsl_va["jurisdiction_fips"]).all().execute()
# ... and then do the same for our rows.
t = t.mutate(jurisdiction_name=t.county_name, jurisdiction_fips=t.county_fips)


## Add candidate and writein

In [9]:
# Build "FIRST MIDDLE LAST SUFFIX" from the name parts. Missing parts become
# empty strings, whitespace runs are collapsed, and the result is stripped and
# upper-cased.
name_columns = ("FirstName", "MiddleName", "LastName", "Suffix")
full_name = t[name_columns[0]].fill_null("")
for name_column in name_columns[1:]:
    full_name = full_name + " " + t[name_column].fill_null("")
full_name = full_name.re_replace(r"\s+", " ").strip().upper()
t = t.mutate(candidate=full_name).drop(*name_columns)

# The WriteInVote flag and the "WRITE IN VOTES" placeholder name must agree
# with each other, checked in both directions.
WRITE_IN_PLACEHOLDER = "WRITE IN VOTES"
assert (t.filter(t.WriteInVote != 0).candidate == WRITE_IN_PLACEHOLDER).all().execute()
assert (t.filter(t.WriteInVote == 0).candidate != WRITE_IN_PLACEHOLDER).all().execute()
assert (t.filter(t.candidate == WRITE_IN_PLACEHOLDER).WriteInVote == 1).all().execute()
assert (t.filter(t.candidate != WRITE_IN_PLACEHOLDER).WriteInVote == 0).all().execute()

# Replace the flag with a boolean column and relabel write-in rows "WRITEIN".
t = t.mutate(
    writein=_.WriteInVote == 1,
    candidate=(_.WriteInVote == 1).ifelse("WRITEIN", _.candidate),
)
t = t.drop("WriteInVote")
t.widget()

┏━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓
┃[1m [0m[1m__row_id[0m[1m [0m┃[1m [0m[1mParty[0m[1m      [0m[1m [0m┃[1m [0m[1mPrecinctName[0m[1m           [0m[1m [0m┃[1m [0m[1mDistrictType[0m[1m      [0m[1m [0m┃[1m [0m[1mDistrictName[0m[1m [0m┃[1m [0m[1mOfficeTitle[0m[1m                                                    [0m[1m [0m┃[1m [0m[1myear[0m[1m [0m[1m [0m┃[1m [0m[1mvotes[0m[1m [0m[1m [0m┃[1m [0m[1mmode[0m[1m  [0m[1m [0m┃[1m [0m[1mdate[0m[1m      [0m[1m [0m┃[1m [0m[1mreadme_check[0m[1m [0m┃[1m [0m[1mspecial[0m[1m [0m┃[1m [0m[1mstage[0m[1m [0m[1m [0m┃[1m [0m[1mstate_po[0m[1m [0m┃

## Fix Party

In [10]:
# A Party value of "Write-In" does not mark a write-in *candidate*:
# their name actually appeared on the ballot, but they chose to
# use the write-in party.
# So no row with Party == "Write-In" may have the writein flag set.
assert (~t.filter(_.Party == "Write-In").writein).all().execute()

In [11]:
# Raw Party value -> (party_simplified, party_detailed).
party_mapping = {
    "Democratic": ("DEMOCRAT", "DEMOCRAT"),
    "Republican": ("REPUBLICAN", "REPUBLICAN"),
    "Libertarian": ("LIBERTARIAN", "LIBERTARIAN"),
    "Liberation": ("OTHER", "LIBERATION"),
    "Green": ("GREEN", "GREEN"),
    "Independent": ("INDEPENDENT", "INDEPENDENT"),
    "Write-In": ("OTHER", None),
}
# Guard against party labels that the mapping does not cover.
assert t.Party.isin(list(party_mapping)).all().execute()
simplified = ed.util.cases(
    *[
        (t.Party == raw_party, simple_name)
        for raw_party, (simple_name, _detail_name) in party_mapping.items()
    ]
)
detailed = ed.util.cases(
    *[
        (t.Party == raw_party, detail_name)
        for raw_party, (_simple_name, detail_name) in party_mapping.items()
    ]
)
t = t.mutate(party_simplified=simplified, party_detailed=detailed)
t = t.drop("Party")
ed.tests.test_party(t)

## Precinct check

It looks like there isn't anything crazy going on here. Just normalize the strings and call it good.

In [12]:
# NOTE(review): ed.util.compare is a project helper; presumably it contrasts
# our PrecinctName values with MEDSL's precinct values — confirm.
ed.util.compare(t.PrecinctName, medsl_va.precinct)

In [13]:
# Normalize precinct names: upper-case, strip, and spell out " & " as " AND ".
t = t.mutate(precinct=t.PrecinctName.upper().strip().replace(" & ", " AND "))
t = t.drop("PrecinctName")

## Drop District Type

In [14]:
# Pin the set of DistrictType values (None included) seen in the data, so that
# a new kind of district is noticed before the column is discarded.
expected_district_types = {
    None,
    "Congressional",
    "Election",
    "House of Delegates",
    "State Senate",
    "Super",
    "Town",
}
observed_district_types = set(
    t.select("DistrictType").distinct().DistrictType.execute()
)
assert observed_district_types == expected_district_types
t = t.drop("DistrictType")

## Fix office

In [15]:
def convert_office(office_title: ir.StringValue) -> ir.StringValue:
    """Map raw office titles onto ``ed.Office`` values where a rule exists.

    The title is upper-cased and stripped. Exact-match rules are listed before
    the substring rules, and all of them are handed to ``ed.util.cases`` in
    that order. Titles that match no rule fall back to the normalized title.

    This executes a query as a sanity check.

    Raises:
        AssertionError: if the target of any rule never shows up in the
            executed result; the message lists the missing values.
    """
    normalized = office_title.upper().strip()
    exact_rules = {
        "MEMBER HOUSE OF DELEGATES": ed.Office.STATE_HOUSE,
        "MEMBER SENATE OF VIRGINIA": ed.Office.STATE_SENATE,
        "GOVERNOR": ed.Office.GOVERNOR,
        "LIEUTENANT GOVERNOR": ed.Office.LIEUTENANT_GOVERNOR,
        "ATTORNEY GENERAL": ed.Office.ATTORNEY_GENERAL,
    }
    substring_rules = {
        "SCHOOL BOARD": ed.Office.SCHOOL_BOARD,
        "MAYOR": ed.Office.MAYOR,
    }
    rules = [(normalized == title, office) for title, office in exact_rules.items()]
    rules += [
        (normalized.contains(fragment), office)
        for fragment, office in substring_rules.items()
    ]
    converted = ed.util.cases(*rules, else_=normalized)

    # Each rule is expected to fire at least once on the data.
    expected = {str(office) for _condition, office in rules}
    observed = set(converted.name("x").value_counts().x.execute())
    missing = expected - observed
    assert not missing, f"Missing {missing}"
    return converted


# Replace the raw OfficeTitle column with the converted office column.
t = t.mutate(office=convert_office(t.OfficeTitle))
t = t.drop("OfficeTitle")

## fix district

In [16]:
def fix_district(district_name: ir.StringValue) -> ir.StringValue:
    """Zero-pad short numeric district names to three digits.

    A value consisting only of optional leading zeros followed by one to three
    digits is rewritten to exactly three digits, e.g. "1" -> "001",
    "10" -> "010", "100" -> "100". Any other value is returned unchanged.
    """
    is_short_number = district_name.re_search(r"^0*\d{1,3}$")
    # Prepend plenty of zeros, then keep only the last three characters.
    zero_padded = district_name.re_replace(r"^0*(\d{1,3})$", r"0000000\1").right(3)
    return is_short_number.ifelse(zero_padded, district_name)


# Spot-check fix_district on literals: short numbers get padded, everything
# else passes through untouched.
_district_examples = {
    "": "",
    "0": "000",
    "1": "001",
    "10": "010",
    "100": "100",
    "1000": "1000",
    "10000": "10000",
    "foo": "foo",
    "foo 10": "foo 10",
}
for _given, _wanted in _district_examples.items():
    assert fix_district(ibis.literal(_given)).execute() == _wanted

In [17]:
# NOTE(review): ed.util.compare is a project helper; presumably it contrasts
# our normalized district values with MEDSL's district values — confirm.
ed.util.compare(fix_district(t.DistrictName), medsl_va.district)

In [18]:
# Replace the raw DistrictName column with the normalized district column.
t = t.mutate(district=fix_district(t.DistrictName))
t = t.drop("DistrictName")

## Fudge the Magnitude

We don't know the magnitude for most of these, so set it to 1 for the offices we recognize (those in `ed.Office`) and to null everywhere else.

In [19]:
# Magnitude is 1 where the office is a member of ed.Office, and null otherwise.
t = t.mutate(magnitude=t.office.isin(ed.Office).ifelse(1, None))
t

In [20]:
# Cast to the shared results schema, then select by that same schema.
t = t.cast(ed.RESULTS_SCHEMA)
t = t.select(ed.RESULTS_SCHEMA)
ed.tests.test_schema(t)

In [25]:
# Append the cleaned Virginia rows to the MEDSL data, conform the result to
# the shared schema, and write the combined dataset.
together = (
    ibis.union(medsl, t)
    .cast(ed.RESULTS_SCHEMA)
    .select(ed.RESULTS_SCHEMA)
)
together.to_parquet("../data/cleaned.parquet")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))