Requirements Setting and Import Packages

In [None]:
%%writefile requirements.txt
# pandas.read_excel needs an engine per file format: xlrd for .xls, openpyxl for .xlsx.
# requests is imported by the notebook's import cell.
pandas==2.1.4
openpyxl==3.1.2
xlrd==2.0.1
requests==2.31.0

In [None]:
# %pip installs into the environment of the running kernel; `!pip` runs whichever
# pip is first on the shell PATH, which may belong to a different environment.
%pip install -r requirements.txt

In [4]:
import pandas as pd
import requests

Data Download

Import Data into Pandas Dataframes

In [16]:
# Download the statement-of-vote ballot measure workbooks for the 2018, 2020 and
# 2022 general elections. The file names differ per year (.xls vs .xlsx,
# "ballot-measures" vs "ballotmeasures"), so each URL is spelled out in full.
(
    ballot_measures_2018_df,
    ballot_measures_2020_df,
    ballot_measures_2022_df,
) = (
    pd.read_excel(workbook_url)
    for workbook_url in (
        "https://elections.cdn.sos.ca.gov/sov/2018-general/sov/csv-ballot-measures.xls",
        "https://elections.cdn.sos.ca.gov/sov/2020-general/sov/csv-ballot-measures.xlsx",
        "https://elections.cdn.sos.ca.gov/sov/2022-general/sov/csv-ballotmeasures.xlsx",
    )
)

In [23]:
# Inspect the raw 2018 frame (column names, dtypes, non-null counts) before preprocessing.
ballot_measures_2018_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ELECTION_DATE         649 non-null    object
 1   ELECTION_NAME         649 non-null    object
 2   COUNTY_ID             649 non-null    int64 
 3   COUNTY_NAME           649 non-null    object
 4   BALLOT_MEASURE_ID     649 non-null    int64 
 5   BALLOT_MEASURE_NAME   649 non-null    object
 6   BALLOT_MEASURE_TITLE  649 non-null    object
 7   YES_COUNT             649 non-null    int64 
 8   NO_COUNT              649 non-null    int64 
dtypes: int64(4), object(5)
memory usage: 45.8+ KB


In [5]:
# Download the 15-day report-of-registration county workbooks for the 2018, 2020
# and 2022 general elections (note the 2022 folder is named "15day-general-2022").
voter_reg_2018_df, voter_reg_2020_df, voter_reg_2022_df = map(
    pd.read_excel,
    (
        "https://elections.cdn.sos.ca.gov/ror/15day-gen-2018/county.xlsx",
        "https://elections.cdn.sos.ca.gov/ror/15day-gen-2020/county.xlsx",
        "https://elections.cdn.sos.ca.gov/ror/15day-general-2022/county.xlsx",
    ),
)

In [40]:
# NOTE: the decennial census CSV load is disabled. The census demographics cell
# further down still expects census_demo_df, so re-enable this line before running it.
# census_demo_df = pd.read_csv("Census Demographics/DECENNIALDP2020.DP1-2024-09-18T002341.csv")

# Forward slashes keep the relative path valid on every OS; the previous
# backslash form only resolved on Windows and contained the invalid escape "\d".
demo_df_new = pd.read_excel(
    "Census Demographics/datapile_-_headline_datasets_-_current.xlsx",
    sheet_name="People",
)

Preprocessing Functions - Census Demos

In [9]:
"""
    Census Demographic Functions
        
        Cleaning
            clean_cols_transpose: cleans and transposes column names to remove 'california' and 'county' to match county name conversion
            rename_rows_cols: renames rows and columns, columns are stripped of whitespace, rows are cleaned of 'Percent' text
"""


def clean_cols_transpose(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    cleans and transposes column names to remove 'california' and 'county' to match county name conversion
    arguments: census demo dataframe with a 'Label (Grouping)' column and string column names
    returns: new dataframe holding only the 'Percent' columns, transposed so that each
             cleaned column name becomes a row; df_in itself is left unmodified
    """
    # set_index/dropna return new frames, so the caller's raw data survives and the
    # function can be re-run on the same input without a KeyError.
    labelled = df_in.set_index("Label (Grouping)").dropna(axis=0, how="all")
    labelled.columns = [
        name.replace("California!!", " ").replace("County,", "")
        for name in labelled.columns
    ]
    percent_only = labelled.filter(like="Percent", axis=1).copy()
    return percent_only.T


def rename_rows_cols(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    renames rows and columns, columns are stripped of whitespace, rows are cleaned of 'Percent' text
    arguments: census demo dataframe after clean_cols_transpose
    returns: census demo dataframe with clean column names
    """
    df_in.columns = [x.lstrip() + " %" for x in df_in.columns]
    df_in.reset_index(inplace=True)
    df_in["index"] = df_in["index"].apply(lambda x: x.replace("Percent", ""))
    df_in = df_in.map(
        lambda x: float(x.replace("%", "")) if "X" not in x and " " not in x else x
    )
    df_in = df_in[df_in["index"] != " "].copy()
    df_in.rename(columns={"index": "county name"}, inplace=True)
    return df_in.drop(columns=["Total population %"])


def extract_rename_cols(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    keeps the county, population, median income, race and age columns and renames 'County' to 'county name'
    arguments: additional demographics dataframe (the 'People' sheet)
    returns: new dataframe with only the selected columns; df_in itself is left unmodified
    """
    # Look the Race / Age columns up on the argument itself rather than on the
    # notebook-level demo_df_new, so the function works for any frame passed in.
    col_names = (
        ["County", "Population (January 2023)", "Median Household Income (2021)"]
        + list(df_in.filter(like="Race").columns)
        + list(df_in.filter(like="Age:").columns)
    )
    return df_in[col_names].rename(columns={"County": "county name"})


def get_percentage_total_demo(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    converts the demographic count columns into shares of the county population
    arguments: demographics dataframe whose first three columns are identifiers/totals
               (as produced by extract_rename_cols) and which has a
               'Population (January 2023)' column
    returns: new dataframe where every column from the fourth onwards is divided by
             'Population (January 2023)'; df_in itself is left unmodified
    """
    df_out = df_in.copy()
    # Columns 0-2 are skipped by position; everything after them is treated as a count.
    share_cols = df_out.columns[3:]
    df_out[share_cols] = df_out[share_cols].div(
        df_out["Population (January 2023)"], axis=0
    )
    return df_out

Preprocessing Functions - Ballot Measures

In [26]:
""" 
Ballot Measures Functions

    Cleaning
        rename_ballot_columns: lower cases column names, replaces underscores with spaces, replaces 'votes' with 'count'
        select_relevant_columns: selects the relevant columns from the full dataframe
        select_prop_data: selects rows of data relevant to the Dialysis propositions
        str_to_int: replaces comma and converts the string to int
        convert_vote_counts: applies the str_to_int function to the vote count columns
        get_vote_percent: calculates the total vote count, percent yes vote, and percent no vote for each county
        spec_year_cols: adds in the year to relevant columns for later query use

    Calling
        preprocess_ballot_data: aggregation of the function calls for preprocessing of the ballot measures data

    Merging
        merge_ballot_data: merges the ballot measures data from 3 years on the county name columns
"""


def rename_ballot_columns(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    lower cases column names, replaces underscores with spaces, replaces 'votes' with 'count'
    (e.g. 'YES_COUNT' and 'YES_VOTES' both become 'yes count')
    arguments: ballot measures dataframe raw data
    returns: new ballot measures dataframe with clean column names; df_in itself is left unmodified
    """
    # DataFrame.rename returns a new frame, so the raw download keeps its original headers.
    return df_in.rename(
        columns=lambda name: name.lower().replace("_", " ").replace("votes", "count")
    )


def select_relevant_columns(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    keeps only the columns needed for the analysis
    arguments: ballot measures dataframe with cleaned column names
    returns: copy of the dataframe restricted to date, county, measure name and the two vote counts
    """
    relevant = [
        "election date",
        "county name",
        "ballot measure name",
        "yes count",
        "no count",
    ]
    subset = df_in.loc[:, relevant]
    return subset.copy()


def select_prop_data(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    selects rows of data relevant to the Dialysis propositions
    arguments: ballot measures dataframe with 'election date' and 'ballot measure name' columns
    returns: copy of the rows belonging to the proposition mapped to the election year
    raises: ValueError if the dataframe is empty or its election year has no mapped proposition
    """
    # election year -> name of the proposition to keep for that year
    props_by_year = {
        "2018": "Proposition 8",
        "2020": "Proposition 23",
        "2022": "Proposition 29",
    }
    if df_in.empty:
        raise ValueError("cannot select proposition rows from an empty dataframe")

    # The year is read from the first row only; every row of one file is assumed
    # to share the same election date.
    first_date = df_in["election date"].astype(str).iloc[0]
    for year, proposition in props_by_year.items():
        if year in first_date:
            return df_in[df_in["ballot measure name"] == proposition].copy()

    # Previously this fell through and returned None, which only surfaced later
    # as an unrelated TypeError in the next preprocessing step.
    raise ValueError(f"no proposition is mapped to election date {first_date!r}")


def str_to_int(row):
    """
    turns a (possibly comma-formatted) number into an int
    arguments: single cell value, e.g. "1,234", 1234.0 or 1234
    returns: the value as int (any fractional part is truncated)
    """
    without_commas = str(row).replace(",", "")
    # go through float first so values such as "1234.0" are accepted
    as_float = float(without_commas)
    return int(as_float)


def convert_vote_counts(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    converts the yes / no vote count columns to int via str_to_int
    arguments: ballot measures dataframe with 'yes count' and 'no count' columns
    returns: the same dataframe object, with both columns converted in place
    """
    for vote_col in ("yes count", "no count"):
        df_in[vote_col] = df_in[vote_col].apply(str_to_int)
    return df_in


def get_vote_percent(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    adds the total vote count and the yes / no shares of that total for each row
    arguments: ballot measures dataframe with numeric 'yes count' and 'no count' columns
    returns: the same dataframe object with 'total count', 'yes perc' and 'no perc' added
    """
    yes_votes = df_in["yes count"]
    no_votes = df_in["no count"]
    total_votes = yes_votes + no_votes

    df_in["total count"] = total_votes
    df_in["yes perc"] = yes_votes / total_votes
    df_in["no perc"] = no_votes / total_votes
    return df_in


def spec_year_cols(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    tags the value columns with the election year so the yearly frames can be merged side by side
    arguments: ballot measures dataframe with 'election date' and 'ballot measure name' columns
    returns: the same dataframe object; for a 2018, 2020 or 2022 election every column
             except the county name gets ' <year>' appended, and the election date and
             ballot measure name columns are dropped
    """
    untouched = {"county name", "ballot measure name", "election date"}
    # the year is taken from the first distinct election date only
    first_date = df_in["election date"].astype(str).unique()[0]

    for year in ("2018", "2020", "2022"):
        if year in first_date:
            df_in.columns = [
                col if col in untouched else col + " " + year for col in df_in.columns
            ]
            break

    # both helper columns are removed whether or not a year matched
    df_in.drop(columns=["election date", "ballot measure name"], inplace=True)
    return df_in


def preprocess_ballot_data(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    runs the full ballot measures preprocessing pipeline on one year's raw data
    arguments: ballot measures dataframe raw data
    returns: per-county dataframe with year-tagged vote counts and vote shares
    """
    return (
        df_in.pipe(rename_ballot_columns)
        .pipe(select_relevant_columns)
        .pipe(select_prop_data)
        .pipe(convert_vote_counts)
        .pipe(get_vote_percent)
        .pipe(spec_year_cols)
    )


def merge_ballot_data(df_list_in: list[pd.DataFrame]) -> pd.DataFrame:
    """
    merges per-year dataframes on the county name column
    arguments: non-empty list of dataframes that each have a 'county name' column
               (any number of frames, merged left to right in list order)
    returns: merged dataframe containing only rows without any missing value, i.e.
             counties that are present and complete in every input frame
    raises: ValueError if the list is empty
    """
    if not df_list_in:
        raise ValueError("merge_ballot_data needs at least one dataframe")

    # Fold the list instead of indexing [0], [1], [2], so two frames no longer
    # raise IndexError and a fourth frame is no longer silently ignored.
    df_merged = df_list_in[0]
    for df_next in df_list_in[1:]:
        df_merged = df_merged.merge(df_next, on="county name", how="outer")

    # The outer merge introduces NaN for counties missing from a frame; dropna
    # removes those rows (integer columns that held NaN stay float afterwards).
    return df_merged.dropna().copy()

Preprocessing Functions - Voter Registration

In [6]:
""" 
Voter Registration Functions

    Cleaning
        drop_na_percent: drops fully empty rows and the summary stat rows called percent and state total
        rename_clean_cols: lower cases column names, renames the county column to be county name
        assign_year_cols: adds in the year marker on each column for easier queries later on
        get_percentage_total: calculates the percentage of each registration part from the total number of registrations
        convert_reg_counts: applies the str_to_int function to all voter registration count columns

    Calling
        preprocess_voter_reg: applies preprocessing functions to the voter registration data
"""


def drop_na_percent(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    removes completely empty rows and the 'Percent' / 'State Total' summary rows
    arguments: voter registration dataframe with a 'County' column
    returns: new voter registration dataframe with a fresh 0..n-1 index
    """
    summary_labels = ["Percent", "State Total"]
    non_empty = df_in.dropna(axis=0, how="all")
    is_summary_row = non_empty["County"].isin(summary_labels)
    counties_only = non_empty.loc[~is_summary_row].copy()
    return counties_only.reset_index(drop=True)


def rename_clean_cols(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    lower cases every column name and calls the county column 'county name'
    arguments: voter registration dataframe
    returns: the same dataframe object with its column names replaced in place
    """
    lowered = [str.lower(header) for header in df_in.columns]
    df_in.columns = [
        "county name" if header == "county" else header for header in lowered
    ]
    return df_in


def assign_year_cols(df_in: pd.DataFrame, year_in: str) -> pd.DataFrame:
    """
    adds in the year marker on each column for easier queries later on
    arguments: voter registration dataframe, year to be added to col names
               (any year works, e.g. "2018"; it is appended as ' <year>')
    returns: new voter registration dataframe where every column except
             'county name' carries the year suffix; df_in itself is left unmodified
    """
    suffix = f" {year_in}"
    # One rename covers every year; previously only "2018", "2020" and "2022"
    # were handled and any other value silently returned None.
    return df_in.rename(
        columns=lambda name: name if name == "county name" else f"{name}{suffix}"
    )


def get_percentage_total(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    calculates the percentage of each registration part from the total number of registrations
    arguments: voter registration dataframe with 'county name', 'eligible' and
               'total registered' columns plus one count column per party
    returns: new voter registration dataframe where every party column is divided by
             'total registered'; df_in itself is left unmodified
    """
    # Identify the non-party columns by name instead of assuming they are the
    # first three, so the result does not depend on column order.
    not_a_share = {"county name", "eligible", "total registered"}
    df_out = df_in.copy()
    share_cols = [col for col in df_out.columns if col not in not_a_share]
    if share_cols:
        df_out[share_cols] = df_out[share_cols].div(
            df_out["total registered"], axis=0
        )
    return df_out


def convert_reg_counts(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    converts every voter registration count column to int via str_to_int
    arguments: voter registration dataframe with the raw (capitalised) column headers
    returns: the same dataframe object, with newlines removed from its headers and
             the count columns converted in place
    """
    count_columns = (
        "Eligible",
        "Total Registered",
        "Democratic",
        "Republican",
        "American Independent",
        "Green",
        "Libertarian",
        "Peace and Freedom",
        "Unknown",
        "Other",
        "No Party Preference",
    )
    # strip line breaks out of the headers first so the names above can match
    df_in.columns = [header.replace("\n", "") for header in df_in.columns]
    for column in count_columns:
        df_in[column] = df_in[column].apply(str_to_int)
    return df_in


def preprocess_voter_reg(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    runs the full voter registration preprocessing pipeline on one year's raw data
    arguments: voter registration dataframe raw
    returns: voter registration dataframe clean
    """
    return (
        df_in.pipe(drop_na_percent)
        .pipe(convert_reg_counts)
        .pipe(rename_clean_cols)
        .pipe(get_percentage_total)
    )

Function Calls - Voter Registration

In [None]:
# Preprocess each year's registration data and tag its columns with the year.
# The raw frames are replaced by their cleaned versions under the same names.
voter_reg_2018_df, voter_reg_2020_df, voter_reg_2022_df = (
    assign_year_cols(preprocess_voter_reg(raw_frame), year)
    for raw_frame, year in (
        (voter_reg_2018_df, "2018"),
        (voter_reg_2020_df, "2020"),
        (voter_reg_2022_df, "2022"),
    )
)

In [13]:
# merge_ballot_data only joins on "county name", so it is reused here for the
# voter registration frames; it also drops every row with a missing value.
clean_voter_data = merge_ballot_data(
    [voter_reg_2018_df, voter_reg_2020_df, voter_reg_2022_df]
)
clean_voter_data.head()

Unnamed: 0,county name,eligible 2018,total registered 2018,democratic 2018,republican 2018,american independent 2018,green 2018,libertarian 2018,peace and freedom 2018,unknown 2018,...,total registered 2022,democratic 2022,republican 2022,american independent 2022,green 2022,libertarian 2022,peace and freedom 2022,unknown 2022,other 2022,no party preference 2022
0,Alameda,1089154,881491,0.556651,0.110469,0.01862,0.007602,0.005286,0.003049,3.4e-05,...,931130,0.603668,0.108446,0.02322,0.006044,0.006821,0.004661,2.8e-05,0.006107,0.241006
1,Alpine,939,758,0.411609,0.270449,0.032982,0.006596,0.007916,0.002639,0.0,...,915,0.424044,0.236066,0.054645,0.008743,0.019672,0.003279,0.002186,0.003279,0.248087
2,Amador,27117,22305,0.287962,0.439901,0.042233,0.004573,0.013226,0.002735,0.000269,...,25954,0.272212,0.472027,0.054982,0.0042,0.019881,0.003121,0.001233,0.004932,0.167412
3,Butte,171771,122741,0.349052,0.341817,0.034552,0.007626,0.011259,0.00312,0.002623,...,123935,0.35024,0.357946,0.047767,0.006124,0.016436,0.005108,0.00723,0.00848,0.20067
4,Calaveras,36101,29591,0.273698,0.41445,0.045453,0.006353,0.015106,0.003177,0.00294,...,32172,0.265106,0.461799,0.058187,0.004942,0.018961,0.00401,0.00373,0.009449,0.173816


Function Calls - Ballot Measures

In [27]:
# Frames are merged in the order listed (2020, 2018, 2022), which is also the
# left-to-right order of the year-tagged columns in the merged result.
clean_ballot_data = merge_ballot_data(
    [
        preprocess_ballot_data(ballot_measures_2020_df),
        preprocess_ballot_data(ballot_measures_2018_df),
        preprocess_ballot_data(ballot_measures_2022_df),
    ]
)

In [28]:
# Preview the first rows of the merged per-county ballot results.
clean_ballot_data.head()

Unnamed: 0,county name,yes count 2020,no count 2020,total count 2020,yes perc 2020,no perc 2020,yes count 2018,no count 2018,total count 2018,yes perc 2018,no perc 2018,yes count 2022,no count 2022,total count 2022,yes perc 2022,no perc 2022
0,Alameda,329873.0,413277.0,743150.0,0.443885,0.556115,275550,280735,556285,0.49534,0.50466,182697.0,290746.0,473443.0,0.38589,0.61411
1,Alpine,344.0,369.0,713.0,0.482468,0.517532,298,281,579,0.51468,0.48532,272.0,317.0,589.0,0.4618,0.5382
2,Amador,4615.0,17149.0,21764.0,0.212047,0.787953,5256,11775,17031,0.308614,0.691386,3590.0,14569.0,18159.0,0.197698,0.802302
3,Butte,29338.0,70174.0,99512.0,0.294819,0.705181,30908,55394,86302,0.358138,0.641862,16478.0,54015.0,70493.0,0.233754,0.766246
4,Calaveras,5538.0,20896.0,26434.0,0.209503,0.790497,6688,14224,20912,0.319816,0.680184,4280.0,16440.0,20720.0,0.206564,0.793436


Function Calls - Census Demographics

In [None]:
# NOTE(review): the only visible definition of census_demo_df is a read_csv line
# that is commented out in the data-loading cell, so this raises NameError on a
# fresh kernel unless that load is restored — confirm whether this cell is still needed.
rename_rows_cols(clean_cols_transpose(census_demo_df)).head()

Function Calls - Additional Demographics Data

In [None]:
# Trim the 'People' sheet to the columns of interest, convert the race / age
# counts into population shares, then persist the cleaned table as CSV.
demo_df_new = demo_df_new.pipe(extract_rename_cols).pipe(get_percentage_total_demo)
demo_df_new.to_csv("Additional Demographics Data Clean.csv", index=False)