Requirements Setup and Package Imports

In [23]:
%%writefile requirements.txt
pandas==2.1.4

Writing requirements.txt


In [None]:
!pip install -r requirements.txt

In [1]:
import pandas as pd

Load Data into pandas DataFrames

In [2]:
# Read the November ballot measure results for each election year into its own
# raw dataframe; the relative paths are resolved against the current working directory.
ballot_measures_2018_df = pd.read_csv('Ballot Measures/California Ballot Measures Nov 2018.csv')
ballot_measures_2020_df = pd.read_csv('Ballot Measures/California Ballot Measures Nov 2020.csv')
ballot_measures_2022_df = pd.read_csv('Ballot Measures/California Ballot Measures Nov 2022.csv')

Preprocessing Functions

In [4]:
def rename_ballot_columns(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        lower cases column names, replaces underscores with spaces, and replaces 'votes' with 'count'
        (e.g. 'Yes_Votes' becomes 'yes count'); the input dataframe is left unmodified
        arguments: ballot measures dataframe raw data
        returns: new ballot measures dataframe with clean column names
    '''
    def clean_name(name) -> str:
        # str() guards against non-string column labels
        return str(name).lower().replace('_', ' ').replace('votes', 'count')

    # rename() returns a new frame, so the raw dataframe loaded from disk keeps
    # its original column names no matter how often the pipeline is re-run
    return df_in.rename(columns=clean_name)

def select_relevant_columns(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        keeps only the columns needed for the analysis
        arguments: ballot measures dataframe with cleaned column names
        returns: copy of the dataframe restricted to the election date, county name,
                 ballot measure name, yes count and no count columns
    '''
    wanted_columns = [
        'election date',
        'county name',
        'ballot measure name',
        'yes count',
        'no count',
    ]
    subset = df_in.loc[:, wanted_columns]
    return subset.copy()

def select_prop_data(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        selects rows of data relevant to the Dialysis propositions
        arguments: ballot measures dataframe data
        returns: copy of the ballot measures dataframe with relevant proposition rows
        raises: ValueError if the dataframe has no rows or its election year is not
                one of 2018, 2020 or 2022
    '''
    # proposition number used in each supported election year
    proposition_by_year = {
        '2018': 'Proposition 8',
        '2020': 'Proposition 23',
        '2022': 'Proposition 29',
    }

    election_dates = df_in['election date'].unique()
    if len(election_dates) == 0:
        raise ValueError('cannot determine the election year: the dataframe has no rows')

    # the year is taken from the first distinct election date in the frame
    first_date = str(election_dates[0])
    for year, proposition in proposition_by_year.items():
        if year in first_date:
            return df_in[df_in['ballot measure name'] == proposition].copy()

    # fail loudly instead of returning None and breaking a later pipeline step
    raise ValueError(f'unsupported election date {first_date!r}: '
                     f'expected a year in {sorted(proposition_by_year)}')
    
def str_to_int(row):
    '''
        strips thousands separators and converts a single vote count value to int
        arguments: one cell value from a vote count column - a string such as '1,234',
                   or a value that is already numeric
        returns: the value as a python int
    '''
    if isinstance(row, str):
        return int(row.replace(',', ''))
    # a column without thousands separators is already numeric, so there is nothing to strip
    return int(row)
    
def convert_vote_counts(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        converts the yes and no vote count columns to integers with str_to_int
        arguments: ballot measures dataframe (its count columns are overwritten in place)
        returns: the same ballot measures dataframe
    '''
    for count_col in ('yes count', 'no count'):
        df_in[count_col] = df_in[count_col].apply(str_to_int)
    return df_in
    
def get_vote_percent(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        adds the total vote count and the yes / no share of that total for each row
        arguments: ballot measures dataframe with numeric yes count and no count columns
                   (the new columns are added to it in place)
        returns: the same ballot measures dataframe with total count, yes perc and no perc columns
    '''
    yes_votes = df_in['yes count']
    no_votes = df_in['no count']
    all_votes = yes_votes + no_votes

    df_in['total count'] = all_votes
    # the "perc" columns hold fractions of the total, not values scaled to 100
    df_in['yes perc'] = yes_votes / all_votes
    df_in['no perc'] = no_votes / all_votes
    return df_in

def spec_year_cols(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        adds in the year to relevant columns for later query use and drops the
        election date and ballot measure name columns
        arguments: ballot measures dataframe (left unmodified)
        returns: new ballot measures dataframe whose columns, other than county name,
                 carry the election year as a suffix (e.g. 'yes count 2020')
        raises: ValueError if the dataframe has no rows or its election year is not
                one of 2018, 2020 or 2022
    '''
    supported_years = ('2018', '2020', '2022')
    # these columns identify the row rather than hold a per-year value
    unsuffixed_cols = ('county name', 'ballot measure name', 'election date')

    election_dates = df_in['election date'].unique()
    if len(election_dates) == 0:
        raise ValueError('cannot determine the election year: the dataframe has no rows')

    # the year is taken from the first distinct election date in the frame
    first_date = str(election_dates[0])
    year = next((y for y in supported_years if y in first_date), None)
    if year is None:
        # unsuffixed value columns would collide once the yearly frames are merged
        raise ValueError(f'unsupported election date {first_date!r}: '
                         f'expected a year in {list(supported_years)}')

    renamed = df_in.rename(
        columns=lambda col: col if col in unsuffixed_cols else f'{col} {year}'
    )
    return renamed.drop(columns=['election date', 'ballot measure name'])

def preprocess_ballot_data(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        runs every preprocessing step on one year of ballot measures data, in order
        arguments: ballot measures dataframe raw data
        returns: preprocessed ballot measures dataframe
    '''
    # each step takes the dataframe produced by the previous one
    steps = (
        rename_ballot_columns,
        select_relevant_columns,
        select_prop_data,
        convert_vote_counts,
        get_vote_percent,
        spec_year_cols,
    )
    processed = df_in
    for step in steps:
        processed = step(processed)
    return processed

def merge_ballot_data(df_list_in: list[pd.DataFrame]) -> pd.DataFrame:
    '''
        merges the ballot measures data from any number of years on the county name column
        arguments: non-empty list of ballot measure dataframes, each with a county name column
        returns: merged ballot measure dataframe; rows with a missing value in any
                 column (e.g. a county absent from one of the years) are dropped
        raises: ValueError if the list is empty
    '''
    if not df_list_in:
        raise ValueError('merge_ballot_data needs at least one dataframe')

    df_merged = df_list_in[0]
    # fold the remaining frames in one at a time, in list order
    for df_next in df_list_in[1:]:
        df_merged = df_merged.merge(df_next, on='county name', how='outer')
    return df_merged.dropna().copy()

In [5]:
# Preprocess each year's raw frame, then merge them on county name.
# The 2020, 2018, 2022 order determines the column order of the merged frame.
clean_ballot_data = merge_ballot_data([
    preprocess_ballot_data(raw_df)
    for raw_df in (ballot_measures_2020_df, ballot_measures_2018_df, ballot_measures_2022_df)
])

In [12]:
# Preview the first rows of the merged dataframe
clean_ballot_data.head()

Unnamed: 0,county name,yes count 2020,no count 2020,total count 2020,yes perc 2020,no perc 2020,yes count 2018,no count 2018,total count 2018,yes perc 2018,no perc 2018,yes count 2022,no count 2022,total count 2022,yes perc 2022,no perc 2022
0,Alameda,329873.0,413277.0,743150.0,0.443885,0.556115,275550,280735,556285,0.49534,0.50466,182697.0,290746.0,473443.0,0.38589,0.61411
1,Alpine,344.0,369.0,713.0,0.482468,0.517532,298,281,579,0.51468,0.48532,272.0,317.0,589.0,0.4618,0.5382
2,Amador,4615.0,17149.0,21764.0,0.212047,0.787953,5256,11775,17031,0.308614,0.691386,3590.0,14569.0,18159.0,0.197698,0.802302
3,Butte,29338.0,70174.0,99512.0,0.294819,0.705181,30908,55394,86302,0.358138,0.641862,16478.0,54015.0,70493.0,0.233754,0.766246
4,Calaveras,5538.0,20896.0,26434.0,0.209503,0.790497,6688,14224,20912,0.319816,0.680184,4280.0,16440.0,20720.0,0.206564,0.793436
